R Help 10: Regression Pitfalls

Galton peas (nonconstant variance and weighted least squares)

  • Load the galton data.
  • Fit an ordinary least squares (OLS) simple linear regression model of Progeny vs Parent.
  • Fit a weighted least squares (WLS) model using weights = 1/SD^2.
  • Create a scatterplot of the data with a regression line for each model.
galton <- read.table("~/path-to-data/galton.txt", header=T)
attach(galton)

model.1 <- lm(Progeny ~ Parent)
summary(model.1)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept) 0.127029   0.006993  18.164 9.29e-06 ***
# Parent      0.210000   0.038614   5.438  0.00285 ** 

model.2 <- lm(Progeny ~ Parent, weights=1/SD^2)
summary(model.2)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept) 0.127964   0.006811  18.787 7.87e-06 ***
# Parent      0.204801   0.038155   5.368  0.00302 ** 

plot(x=Parent, y=Progeny, ylim=c(0.158,0.174),
     panel.last = c(lines(sort(Parent), fitted(model.1)[order(Parent)], col="blue"),
                    lines(sort(Parent), fitted(model.2)[order(Parent)], col="red")))
legend("topleft", col=c("blue","red"), lty=1,
       inset=0.02, legend=c("OLS", "WLS"))

detach(galton)
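
As a sanity check on what the weights argument is doing, the WLS coefficients can be reproduced directly from the weighted normal equations, b = (X'WX)^(-1) X'Wy with W = diag(1/SD^2). A minimal sketch, assuming the galton data frame read in above is still in the workspace:

# Reproduce the WLS fit from the weighted normal equations
X <- cbind(1, galton$Parent)   # design matrix with an intercept column
W <- diag(1/galton$SD^2)       # diagonal weight matrix
y <- galton$Progeny
solve(t(X) %*% W %*% X, t(X) %*% W %*% y)
# should match coef(model.2): intercept approximately 0.1280, slope approximately 0.2048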

Computer-assisted learning (nonconstant variance and weighted least squares)

  • Load the ca_learning data.
  • Create a scatterplot of the data.
  • Fit an OLS model.
  • Plot the OLS residuals vs num.responses.
  • Plot the absolute OLS residuals vs num.responses.
  • Calculate fitted values from a regression of absolute residuals vs num.responses.
  • Fit a WLS model using weights = 1/(fitted values)^2.
  • Create a scatterplot of the data with a regression line for each model.
  • Plot the WLS standardized residuals vs num.responses.
ca_learning <- read.table("~/path-to-data/ca_learning_new.txt", header=T)
attach(ca_learning)

plot(x=num.responses, y=cost)

model.1 <- lm(cost ~ num.responses)
summary(model.1)
#               Estimate Std. Error t value Pr(>|t|)    
# (Intercept)    19.4727     5.5162   3.530  0.00545 ** 
# num.responses   3.2689     0.3651   8.955 4.33e-06 ***
# ---
# Residual standard error: 4.598 on 10 degrees of freedom
# Multiple R-squared:  0.8891,  Adjusted R-squared:  0.878 
# F-statistic: 80.19 on 1 and 10 DF,  p-value: 4.33e-06

plot(num.responses, residuals(model.1))
plot(num.responses, abs(residuals(model.1)))

wts <- 1/fitted(lm(abs(residuals(model.1)) ~ num.responses))^2

model.2 <- lm(cost ~ num.responses, weights=wts)
summary(model.2)
#               Estimate Std. Error t value Pr(>|t|)    
# (Intercept)    17.3006     4.8277   3.584  0.00498 ** 
# num.responses   3.4211     0.3703   9.238 3.27e-06 ***
# ---
# Residual standard error: 1.159 on 10 degrees of freedom
# Multiple R-squared:  0.8951,  Adjusted R-squared:  0.8846 
# F-statistic: 85.35 on 1 and 10 DF,  p-value: 3.269e-06

plot(x=num.responses, y=cost, ylim=c(50,95),
     panel.last = c(lines(sort(num.responses), fitted(model.1)[order(num.responses)], col="blue"),
                    lines(sort(num.responses), fitted(model.2)[order(num.responses)], col="red")))
legend("topleft", col=c("blue","red"), lty=1,
       inset=0.02, legend=c("OLS", "WLS"))

plot(num.responses, rstandard(model.2))

detach(ca_learning)
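
The two-step weighting used here (regress the absolute OLS residuals on the predictor, then take 1/fitted^2 as the weights) can be wrapped in a small helper for reuse; the function name varwts below is chosen purely for illustration:

# Estimate 1/variance weights by modeling |OLS residuals| as a linear
# function of x and squaring the fitted values of that regression
varwts <- function(ols.fit, x) {
  sd.fit <- fitted(lm(abs(residuals(ols.fit)) ~ x))
  1/sd.fit^2
}
# e.g. varwts(model.1, ca_learning$num.responses) reproduces wts above;
# the Home price example below uses fitted(model.1) in place of the predictor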

Market share (nonconstant variance and weighted least squares)

  • Load the marketshare data.
  • Fit an OLS model.
  • Plot the OLS residuals vs fitted values with points marked by Discount.
  • Use the tapply function to calculate the residual variance for Discount=0 and Discount=1.
  • Fit a WLS model using weights = 1/variance for Discount=0 and Discount=1.
  • Plot the WLS standardized residuals vs fitted values.
marketshare <- read.table("~/path-to-data/marketshare.txt", header=T)
attach(marketshare)

model.1 <- lm(MarketShare ~ Price + P1 + P2)
summary(model.1)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)  3.19592    0.35616   8.973 3.00e-10 ***
# Price       -0.33358    0.15226  -2.191   0.0359 *  
# P1           0.30808    0.06412   4.804 3.51e-05 ***
# P2           0.48431    0.05541   8.740 5.49e-10 ***

plot(fitted(model.1), residuals(model.1), col=Discount+1)
vars <- tapply(residuals(model.1), Discount, var)
#          0          1 
# 0.01052324 0.02680546 

wts <- Discount/vars[2] + (1-Discount)/vars[1]

model.2 <- lm(MarketShare ~ Price + P1 + P2, weights=wts)
summary(model.2)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)  3.17437    0.35671   8.899 3.63e-10 ***
# Price       -0.32432    0.15291  -2.121   0.0418 *  
# P1           0.30834    0.06575   4.689 4.89e-05 ***
# P2           0.48419    0.05422   8.930 3.35e-10 ***

plot(fitted(model.2), rstandard(model.2), col=Discount+1)

detach(marketshare)
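
The weight construction above leans on Discount being a 0/1 indicator; an equivalent, perhaps more explicit, version uses ifelse (wts.alt is just an illustrative name):

# Same group weights as Discount/vars[2] + (1-Discount)/vars[1]
wts.alt <- ifelse(marketshare$Discount == 1, 1/vars[2], 1/vars[1])
all.equal(as.numeric(wts.alt), as.numeric(wts))  # should be TRUE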

Home price (nonconstant variance and weighted least squares)

  • Load the realestate data.
  • Calculate log transformations of the variables.
  • Fit an OLS model.
  • Plot the OLS residuals vs fitted values.
  • Calculate fitted values from a regression of absolute residuals vs fitted values.
  • Fit a WLS model using weights = 1/(fitted values)^2.
  • Plot the WLS standardized residuals vs fitted values.
realestate <- read.table("~/path-to-data/realestate.txt", header=T)
attach(realestate)

logY <- log(SalePrice)
logX1 <- log(SqFeet)
logX2 <- log(Lot)

model.1 <- lm(logY ~ logX1 + logX2)
summary(model.1)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)  4.25485    0.07353  57.864  < 2e-16 ***
# logX1        1.22141    0.03371  36.234  < 2e-16 ***
# logX2        0.10595    0.02394   4.425 1.18e-05 ***

plot(fitted(model.1), residuals(model.1))

wts <- 1/fitted(lm(abs(residuals(model.1)) ~ fitted(model.1)))^2

model.2 <- lm(logY ~ logX1 + logX2, weights=wts)
summary(model.2)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)  4.35189    0.06330  68.755  < 2e-16 ***
# logX1        1.20150    0.03332  36.065  < 2e-16 ***
# logX2        0.07924    0.02152   3.682 0.000255 ***

plot(fitted(model.2), rstandard(model.2))

detach(realestate)

Google stock (autoregression model)

  • Use the read.zoo function in the zoo package to load the google_stock data in time series format.
  • Create a time series plot of the data.
  • Load the google_stock data in the usual way using read.table.
  • Use the ts function to convert the price variable to a time series.
  • Create a plot of partial autocorrelations of price.
  • Calculate a lag-1 price variable (note that the lag argument of the lag function is -1, not +1).
  • Create a scatterplot of price vs lag1price.
  • Use the ts.intersect function to create a dataframe containing price and lag1price.
  • Fit a simple linear regression model of price vs lag1price (a first-order autoregression model).
library(zoo)
google.ts <- read.zoo("~/path-to-data/google_stock.txt", format="%m/%d/%Y",
                      header=T)
plot(google.ts)

google <- read.table("~/path-to-data/google_stock.txt", header=T)
attach(google)

price.ts <- ts(price)
pacf(price.ts)

lag1price <- lag(price.ts, -1)
plot(price.ts ~ lag1price, xy.labels=F)

lagdata <- ts.intersect(price.ts, lag1price, dframe=T)
summary(lm(price.ts ~ lag1price, data=lagdata))

detach(google)
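
Note that lag(x, -1) shifts a series one step later in time, so the value observed at time t-1 carries time index t; ts.intersect then keeps only the time points where both series are defined. A toy illustration of the alignment:

# lag(z, -1) pairs each value with the previous one once the series are intersected
z <- ts(c(10, 20, 30, 40))
ts.intersect(z, lag(z, -1))
# times 2..4 pair z with its previous value: (20,10), (30,20), (40,30)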

Earthquakes (autoregression model)

  • Load the earthquakes data.
  • Create a time series plot of the data.
  • Use the ts function to convert the Quakes variable to a time series.
  • Create a plot of partial autocorrelations of Quakes.
  • Calculate lag-1, lag-2, and lag-3 Quakes variables.
  • Use the ts.intersect function to create a dataframe containing Quakes and the three lag variables.
  • Fit a multiple linear regression model of Quakes versus the three lag variables (a third-order autoregression model).
earthquakes <- read.table("~/path-to-data/earthquakes.txt", header=T)
attach(earthquakes)

plot(Year, Quakes, type="b")

Quakes.ts <- ts(Quakes)
pacf(Quakes.ts)

lag1Quakes <- lag(Quakes.ts, -1)
lag2Quakes <- lag(Quakes.ts, -2)
lag3Quakes <- lag(Quakes.ts, -3)

lagdata <- ts.intersect(Quakes.ts, lag1Quakes, lag2Quakes, lag3Quakes, dframe=T)
summary(lm(Quakes.ts ~ lag1Quakes + lag2Quakes + lag3Quakes, data=lagdata))
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)  6.44916    1.78646   3.610 0.000496 ***
# lag1Quakes   0.16424    0.10063   1.632 0.106049    
# lag2Quakes   0.07125    0.10128   0.703 0.483517    
# lag3Quakes   0.26928    0.09783   2.753 0.007110 ** 
# ---
# Residual standard error: 3.851 on 93 degrees of freedom
# Multiple R-squared:  0.1388,  Adjusted R-squared:  0.111 
# F-statistic: 4.997 on 3 and 93 DF,  p-value: 0.002942

detach(earthquakes)

Blaisdell company (regression with autoregressive errors)

  • Load the blaisdell data.
  • Fit a simple linear regression model of comsales vs indsales.
  • Use the dwt function in the car package to conduct the Durbin-Watson test on the residuals.
  • Conduct the Ljung-Box test on the residuals.
  • Perform the Cochrane-Orcutt procedure to transform the variables.
  • Forecast comsales for period 21 when indsales are projected to be $175.3 million.
  • Perform the Hildreth-Lu procedure to transform the variables.
  • Perform the first differences procedure to transform the variables.
blaisdell <- read.table("~/path-to-data/blaisdell.txt", header=T)
attach(blaisdell)

model.1 <- lm(comsales ~ indsales)
summary(model.1)
#              Estimate Std. Error t value Pr(>|t|)    
# (Intercept) -1.454750   0.214146  -6.793 2.31e-06 ***
# indsales     0.176283   0.001445 122.017  < 2e-16 ***

# Durbin-Watson Test
library(car)
dwt(model.1)
# lag Autocorrelation D-W Statistic p-value
#   1       0.6260046     0.7347256       0
# Alternative hypothesis: rho != 0

# Ljung-Box Q Test
Box.test(residuals(model.1), lag = 1, type = "Ljung")
# Box-Ljung test
# data:  residuals(model.1)
# X-squared = 9.0752, df = 1, p-value = 0.002591

# Cochrane-Orcutt Procedure
res.ts <- ts(residuals(model.1))
lag1res <- lag(res.ts, -1)
lagdata1 <- ts.intersect(res.ts, lag1res)
acp <- coef(lm(res.ts ~ lag1res -1, data=lagdata1)) # 0.6311636
y.ts <- ts(comsales)
x.ts <- ts(indsales)
lag1y <- lag(y.ts, -1)
lag1x <- lag(x.ts, -1)
y.co <- y.ts-acp*lag1y
x.co <- x.ts-acp*lag1x
model.2 <- lm(y.co ~ x.co)
summary(model.2)
#              Estimate Std. Error t value Pr(>|t|)    
# (Intercept) -0.394111   0.167230  -2.357   0.0307 *  
# x.co         0.173758   0.002957  58.767   <2e-16 ***

dwt(model.2)
# lag Autocorrelation D-W Statistic p-value
#   1       0.1473569      1.650248   0.306
# Alternative hypothesis: rho != 0

b0 <- coef(model.2)[1]/(1-acp) # -1.068524
sqrt(vcov(model.2)[1,1])/(1-acp) # se = 0.4533986
b1 <- coef(model.2)[2] # 0.1737583

fit.20 <- b0+b1*indsales[20] # 28.76577
res.20 <- comsales[20]-fit.20 # 0.01422919
fit.21 <- b0+b1*175.3 # 29.3913
forecast.21 <- fit.21+acp*res.20 # 29.40028

# Hildreth-Lu Procedure
sse <- vector()
for(i in 1:90){ # grid search for rho over 0.10, 0.11, ..., 0.99
  y.hl <- y.ts-(0.09+0.01*i)*lag1y
  x.hl <- x.ts-(0.09+0.01*i)*lag1x
  sse[i] <- sum(residuals(lm(y.hl ~ x.hl))^2)
}
acp <- 0.09+0.01*which.min(sse) # rho minimizing SSE: 0.96
y.hl <- y.ts-acp*lag1y
x.hl <- x.ts-acp*lag1x
model.3 <- lm(y.hl ~ x.hl)
summary(model.3)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)  0.07117    0.05797   1.228    0.236    
# x.hl         0.16045    0.00684  23.458 2.18e-14 ***

dwt(model.3)
# lag Autocorrelation D-W Statistic p-value
#   1        0.116145      1.725439   0.548
# Alternative hypothesis: rho != 0

coef(model.3)[1]/(1-acp) # 1.77933
sqrt(vcov(model.3)[1,1])/(1-acp) # 1.449373

# First Differences Procedure

y.fd <- y.ts-lag1y
x.fd <- x.ts-lag1x
dwt(lm(y.fd ~ x.fd))
# lag Autocorrelation D-W Statistic p-value
#   1       0.1160548      1.748834    0.62
# Alternative hypothesis: rho != 0

model.4 <- lm(y.fd ~ x.fd -1)
summary(model.4)
#      Estimate Std. Error t value Pr(>|t|)    
# x.fd 0.168488   0.005096   33.06   <2e-16 ***

mean(comsales)-coef(model.4)[1]*mean(indsales) # -0.3040052

detach(blaisdell)
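
The Cochrane-Orcutt step above (regress the lag-1 residuals through the origin to estimate rho, then fit y_t - rho*y_(t-1) on x_t - rho*x_(t-1)) can be collected into one helper for simple regressions with AR(1) errors; co.transform is an illustrative name, not a standard function:

# One Cochrane-Orcutt transformation step for a simple linear regression
co.transform <- function(y, x) {
  res <- residuals(lm(y ~ x))
  n <- length(res)
  rho <- sum(res[-1] * res[-n]) / sum(res[-n]^2)  # lag-1 regression through the origin
  list(rho = rho,
       y.co = y[-1] - rho * y[-n],
       x.co = x[-1] - rho * x[-n])
}
# e.g. tr <- co.transform(blaisdell$comsales, blaisdell$indsales)
# gives tr$rho close to acp (about 0.631), and lm(tr$y.co ~ tr$x.co)
# reproduces the coefficients of model.2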

Metal fabricator and vendor employees (regression with autoregressive errors)

  • Load the employee data.
  • Fit a simple linear regression model of metal vs vendor.
  • Create a scatterplot of the data with a regression line.
  • Create a scatterplot of the residuals vs time order.
  • Create a plot of partial autocorrelations of the residuals.
  • Use the dwt function in the car package to conduct the Durbin-Watson test on the residuals.
  • Perform the Cochrane-Orcutt procedure to transform the variables.
employee <- read.table("~/path-to-data/employee.txt", header=T)
attach(employee)

model.1 <- lm(metal ~ vendor)
summary(model.1)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept) 2.847911   3.299962   0.863    0.392    
# vendor      0.122442   0.009423  12.994   <2e-16 ***

plot(x=vendor, y=metal,
     panel.last = lines(sort(vendor), fitted(model.1)[order(vendor)]))

plot(x=time, y=residuals(model.1), type="b",
     panel.last = abline(h=0, lty=2))

pacf(residuals(model.1))

# Durbin-Watson Test
dwt(model.1)
# lag Autocorrelation D-W Statistic p-value
#   1        0.772038     0.3592396       0
# Alternative hypothesis: rho != 0

# Cochrane-Orcutt Procedure
res.ts <- ts(residuals(model.1))
lag1res <- lag(res.ts, -1)
lagdata1 <- ts.intersect(res.ts, lag1res)
acp <- coef(lm(res.ts ~ lag1res -1, data=lagdata1)) # 0.831385
y.ts <- ts(metal)
x.ts <- ts(vendor)
lag1y <- lag(y.ts, -1)
lag1x <- lag(x.ts, -1)
y.co <- y.ts-acp*lag1y
x.co <- x.ts-acp*lag1x
model.2 <- lm(y.co ~ x.co)
summary(model.2)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)  4.87560    0.78655   6.199 6.78e-08 ***
# x.co         0.04795    0.01300   3.688 0.000505 ***

coef(model.2)[1]/(1-acp) # 28.91557
sqrt(vcov(model.2)[1,1])/(1-acp) # se = 4.664789

detach(employee)

Blood pressure (multicollinearity)

  • Load the bloodpress data.
  • Create scatterplot matrices of the data.
  • Calculate correlations between the variables.
bloodpress <- read.table("~/path-to-data/bloodpress.txt", header=T)
attach(bloodpress)

pairs(bloodpress[,c(2:5)])
pairs(bloodpress[,c(2,6:8)])

round(cor(bloodpress[,c(2:8)]),3)
#           BP   Age Weight   BSA   Dur Pulse Stress
# BP     1.000 0.659  0.950 0.866 0.293 0.721  0.164
# Age    0.659 1.000  0.407 0.378 0.344 0.619  0.368
# Weight 0.950 0.407  1.000 0.875 0.201 0.659  0.034
# BSA    0.866 0.378  0.875 1.000 0.131 0.465  0.018
# Dur    0.293 0.344  0.201 0.131 1.000 0.402  0.312
# Pulse  0.721 0.619  0.659 0.465 0.402 1.000  0.506
# Stress 0.164 0.368  0.034 0.018 0.312 0.506  1.000

detach(bloodpress)

Uncorrelated predictors (no multicollinearity)

  • Load the uncorrpreds data.
  • Create a scatterplot matrix of the data.
  • Calculate the correlation between the predictors.
  • Fit a simple linear regression model of y vs x1.
  • Fit a simple linear regression model of y vs x2.
  • Fit a multiple linear regression model of y vs x1 + x2.
  • Fit a multiple linear regression model of y vs x2 + x1.
  • Use the scatter3d function in the car package to create a 3D scatterplot of the data with the fitted plane for a multiple linear regression model of y vs x1 + x2.
uncorrpreds <- read.table("~/path-to-data/uncorrpreds.txt", header=T)
attach(uncorrpreds)

pairs(uncorrpreds)

cor(x1,x2) # 0

model.1 <- lm(y ~ x1)
summary(model.1)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)   49.500      4.655  10.634 4.07e-05 ***
# x1            -1.000      1.472  -0.679    0.522    
anova(model.1)
#           Df Sum Sq Mean Sq F value Pr(>F)
# x1         1      8   8.000  0.4615 0.5222
# Residuals  6    104  17.333               

model.2 <- lm(y ~ x2)
summary(model.2)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)   57.000      8.213   6.940 0.000444 ***
# x2            -1.750      1.350  -1.296 0.242545    
anova(model.2)
#           Df Sum Sq Mean Sq F value Pr(>F)
# x2         1   24.5  24.500    1.68 0.2425
# Residuals  6   87.5  14.583               

model.12 <- lm(y ~ x1 + x2)
summary(model.12)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)   60.000      9.562   6.275  0.00151 **
# x1            -1.000      1.410  -0.709  0.50982   
# x2            -1.750      1.410  -1.241  0.26954   
anova(model.12)
#           Df Sum Sq Mean Sq F value Pr(>F)
# x1         1    8.0     8.0  0.5031 0.5098
# x2         1   24.5    24.5  1.5409 0.2695
# Residuals  5   79.5    15.9               

model.21 <- lm(y ~ x2 + x1)
summary(model.21)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)   60.000      9.562   6.275  0.00151 **
# x2            -1.750      1.410  -1.241  0.26954   
# x1            -1.000      1.410  -0.709  0.50982   
anova(model.21)
#           Df Sum Sq Mean Sq F value Pr(>F)
# x2         1   24.5    24.5  1.5409 0.2695
# x1         1    8.0     8.0  0.5031 0.5098
# Residuals  5   79.5    15.9               

# library(car)
scatter3d(y ~ x1 + x2)

detach(uncorrpreds)
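
Fitting the model in both orders makes the point of this example: with cor(x1, x2) = 0, each predictor's sequential sum of squares is the same no matter when it enters, i.e. SSR(x1) = SSR(x1|x2) = 8 and SSR(x2) = SSR(x2|x1) = 24.5. A quick check pulling those values out of the two anova tables:

# Sequential sums of squares do not depend on entry order here
anova(model.12)["x1", "Sum Sq"]  # SSR(x1)      = 8
anova(model.21)["x1", "Sum Sq"]  # SSR(x1 | x2) = 8
anova(model.12)["x2", "Sum Sq"]  # SSR(x2 | x1) = 24.5
anova(model.21)["x2", "Sum Sq"]  # SSR(x2)      = 24.5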

Blood pressure (predictors with almost no multicollinearity)

  • Load the bloodpress data.
  • Create a scatterplot matrix of the data.
  • Fit a simple linear regression model of BP vs Stress.
  • Fit a simple linear regression model of BP vs BSA.
  • Fit a multiple linear regression model of BP vs Stress + BSA.
  • Fit a multiple linear regression model of BP vs BSA + Stress.
  • Use the scatter3d function in the car package to create a 3D scatterplot of the data with the fitted plane for a multiple linear regression model of BP vs Stress + BSA.
attach(bloodpress)

pairs(bloodpress[,c(2,5,8)])

model.1 <- lm(BP ~ Stress)
summary(model.1)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept) 112.71997    2.19345  51.389   <2e-16 ***
# Stress        0.02399    0.03404   0.705     0.49    
anova(model.1)
#           Df Sum Sq Mean Sq F value Pr(>F)
# Stress     1  15.04  15.044  0.4969 0.4899
# Residuals 18 544.96  30.275               

model.2 <- lm(BP ~ BSA)
summary(model.2)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)   45.183      9.392   4.811  0.00014 ***
# BSA           34.443      4.690   7.343 8.11e-07 ***
anova(model.2)
#           Df Sum Sq Mean Sq F value Pr(>F)
# BSA        1 419.86  419.86  53.927 8.114e-07 ***
# Residuals 18 140.14    7.79                      

model.12 <- lm(BP ~ Stress + BSA)
summary(model.12)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept) 44.24452    9.26104   4.777 0.000175 ***
# Stress       0.02166    0.01697   1.277 0.218924    
# BSA         34.33423    4.61110   7.446 9.56e-07 ***
anova(model.12)
#           Df Sum Sq Mean Sq F value Pr(>F)
# Stress     1  15.04   15.04  1.9998    0.1754    
# BSA        1 417.07  417.07 55.4430 9.561e-07 ***
# Residuals 17 127.88    7.52                      

model.21 <- lm(BP ~ BSA + Stress)
summary(model.21)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept) 44.24452    9.26104   4.777 0.000175 ***
# BSA         34.33423    4.61110   7.446 9.56e-07 ***
# Stress       0.02166    0.01697   1.277 0.218924    
anova(model.21)
#           Df Sum Sq Mean Sq F value Pr(>F)
# BSA        1 419.86  419.86 55.8132 9.149e-07 ***
# Stress     1  12.26   12.26  1.6296    0.2189    
# Residuals 17 127.88    7.52                      

scatter3d(BP ~ Stress + BSA)

detach(bloodpress)

Blood pressure (predictors with high multicollinearity)

  • Load the bloodpress data.
  • Create a scatterplot matrix of the data.
  • Fit a simple linear regression model of BP vs Weight.
  • Fit a simple linear regression model of BP vs BSA.
  • Fit a multiple linear regression model of BP vs Weight + BSA.
  • Fit a multiple linear regression model of BP vs BSA + Weight.
  • Use the scatter3d function in the car package to create a 3D scatterplot of the data with the fitted plane for a multiple linear regression model of BP vs Weight + BSA.
  • Predict BP for Weight=92 and BSA=2 for the two simple linear regression models and the multiple linear regression model.
attach(bloodpress)

pairs(bloodpress[,c(2,5,4)])

model.1 <- lm(BP ~ Weight)
summary(model.1)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)  2.20531    8.66333   0.255    0.802    
# Weight       1.20093    0.09297  12.917 1.53e-10 ***
anova(model.1)
#           Df Sum Sq Mean Sq F value Pr(>F)
# Weight     1 505.47  505.47  166.86 1.528e-10 ***
# Residuals 18  54.53    3.03                      

model.2 <- lm(BP ~ BSA)
summary(model.2)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)   45.183      9.392   4.811  0.00014 ***
# BSA           34.443      4.690   7.343 8.11e-07 ***
anova(model.2)
#           Df Sum Sq Mean Sq F value Pr(>F)
# BSA        1 419.86  419.86  53.927 8.114e-07 ***
# Residuals 18 140.14    7.79                      

model.12 <- lm(BP ~ Weight + BSA)
summary(model.12)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)   5.6534     9.3925   0.602    0.555    
# Weight        1.0387     0.1927   5.392 4.87e-05 ***
# BSA           5.8313     6.0627   0.962    0.350    
anova(model.12)
#           Df Sum Sq Mean Sq F value Pr(>F)
# Weight     1 505.47  505.47 166.1648 3.341e-10 ***
# BSA        1   2.81    2.81   0.9251    0.3496    
# Residuals 17  51.71    3.04                       

model.21 <- lm(BP ~ BSA + Weight)
summary(model.21)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)   5.6534     9.3925   0.602    0.555    
# BSA           5.8313     6.0627   0.962    0.350    
# Weight        1.0387     0.1927   5.392 4.87e-05 ***
anova(model.21)
#           Df Sum Sq Mean Sq F value Pr(>F)
# BSA        1 419.86  419.86 138.021 1.391e-09 ***
# Weight     1  88.43   88.43  29.069 4.871e-05 ***
# Residuals 17  51.71    3.04                      

scatter3d(BP ~ Weight + BSA)

predict(model.1, interval="prediction",
        newdata=data.frame(Weight=92))
#       fit     lwr     upr
# 1 112.691 108.938 116.444

predict(model.2, interval="prediction",
        newdata=data.frame(BSA=2))
#        fit      lwr      upr
# 1 114.0689 108.0619 120.0758

predict(model.12, interval="prediction",
        newdata=data.frame(Weight=92, BSA=2))
#        fit      lwr      upr
# 1 112.8794 109.0801 116.6787

detach(bloodpress)

Poverty and teen birth rate (high multicollinearity)

  • Load the poverty data and remove the District of Columbia.
  • Create a scatterplot matrix of the data.
  • Fit a simple linear regression model of PovPct vs Brth15to17.
  • Fit a simple linear regression model of PovPct vs Brth18to19.
  • Fit a multiple linear regression model of PovPct vs Brth15to17 + Brth18to19.
poverty <- read.table("~/path-to-data/poverty.txt", header=T)
poverty <- poverty[poverty$Location!="District_of_Columbia",]
attach(poverty)

pairs(poverty[,c(2:4)])

model.1 <- lm(PovPct ~ Brth15to17)
summary(model.1)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)   4.4871     1.3181   3.404  0.00135 ** 
# Brth15to17    0.3872     0.0572   6.768 1.67e-08 ***
# ---
# Residual standard error: 2.982 on 48 degrees of freedom
# Multiple R-squared:  0.4883,  Adjusted R-squared:  0.4777 
# F-statistic: 45.81 on 1 and 48 DF,  p-value: 1.666e-08

model.2 <- lm(PovPct ~ Brth18to19)
summary(model.2)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)  3.05279    1.83169   1.667    0.102    
# Brth18to19   0.13842    0.02482   5.576 1.11e-06 ***
# ---
# Residual standard error: 3.248 on 48 degrees of freedom
# Multiple R-squared:  0.3931,  Adjusted R-squared:  0.3805 
# F-statistic: 31.09 on 1 and 48 DF,  p-value: 1.106e-06

model.12 <- lm(PovPct ~ Brth15to17 + Brth18to19)
summary(model.12)
#             Estimate Std. Error t value Pr(>|t|)   
# (Intercept)  6.43963    1.95904   3.287  0.00192 **
# Brth15to17   0.63235    0.19178   3.297  0.00186 **
# Brth18to19  -0.10227    0.07642  -1.338  0.18724   
# ---
# Residual standard error: 2.958 on 47 degrees of freedom
# Multiple R-squared:  0.5071,  Adjusted R-squared:  0.4861 
# F-statistic: 24.18 on 2 and 47 DF,  p-value: 6.017e-08

detach(poverty)

Blood pressure (high multicollinearity)

  • Load the bloodpress data.
  • Fit a multiple linear regression model of BP vs Age + Weight + BSA + Dur + Pulse + Stress.
  • Use the vif function in the car package to calculate variance inflation factors.
  • Fit a multiple linear regression model of Weight vs Age + BSA + Dur + Pulse + Stress and confirm the VIF value for Weight as 1/(1-R^2) for this model.
  • Fit a multiple linear regression model of BP vs Age + Weight + Dur + Stress.
  • Use the vif function in the car package to calculate variance inflation factors.
attach(bloodpress)

model.1 <- lm(BP ~ Age + Weight + BSA + Dur + Pulse + Stress)
summary(model.1)
#               Estimate Std. Error t value Pr(>|t|)    
# (Intercept) -12.870476   2.556650  -5.034 0.000229 ***
# Age           0.703259   0.049606  14.177 2.76e-09 ***
# Weight        0.969920   0.063108  15.369 1.02e-09 ***
# BSA           3.776491   1.580151   2.390 0.032694 *  
# Dur           0.068383   0.048441   1.412 0.181534    
# Pulse        -0.084485   0.051609  -1.637 0.125594    
# Stress        0.005572   0.003412   1.633 0.126491    
# ---
# Residual standard error: 0.4072 on 13 degrees of freedom
# Multiple R-squared:  0.9962,  Adjusted R-squared:  0.9944 
# F-statistic: 560.6 on 6 and 13 DF,  p-value: 6.395e-15

# library(car)
vif(model.1)
#      Age   Weight      BSA      Dur    Pulse   Stress 
# 1.762807 8.417035 5.328751 1.237309 4.413575 1.834845 

model.2 <- lm(Weight ~ Age + BSA + Dur + Pulse + Stress)
summary(model.2)
# Residual standard error: 1.725 on 14 degrees of freedom
# Multiple R-squared:  0.8812,  Adjusted R-squared:  0.8388 
# F-statistic: 20.77 on 5 and 14 DF,  p-value: 5.046e-06

1/(1-summary(model.2)$r.squared) # 8.417035

model.3 <- lm(BP ~ Age + Weight + Dur + Stress)
summary(model.3)
#               Estimate Std. Error t value Pr(>|t|)    
# (Intercept) -15.869829   3.195296  -4.967 0.000169 ***
# Age           0.683741   0.061195  11.173 1.14e-08 ***
# Weight        1.034128   0.032672  31.652 3.76e-15 ***
# Dur           0.039889   0.064486   0.619 0.545485    
# Stress        0.002184   0.003794   0.576 0.573304    
# ---
# Residual standard error: 0.5505 on 15 degrees of freedom
# Multiple R-squared:  0.9919,  Adjusted R-squared:  0.9897 
# F-statistic: 458.3 on 4 and 15 DF,  p-value: 1.764e-15

vif(model.3)
#      Age   Weight      Dur   Stress 
# 1.468245 1.234653 1.200060 1.241117 

detach(bloodpress)
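
As the Weight calculation above shows, each variance inflation factor is 1/(1-R^2) from regressing that predictor on all of the others. A small sketch that computes every VIF this way without the car package (vif.by.hand is an illustrative name):

# VIF_j = 1/(1 - R^2_j), where R^2_j comes from regressing predictor j
# on the remaining predictors
vif.by.hand <- function(fit) {
  X <- model.matrix(fit)[, -1, drop = FALSE]  # drop the intercept column
  vifs <- sapply(seq_len(ncol(X)),
                 function(j) 1/(1 - summary(lm(X[, j] ~ X[, -j]))$r.squared))
  setNames(vifs, colnames(X))
}
# vif.by.hand(model.1) should agree with vif(model.1) above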

Allen Cognitive Level study (reducing data-based multicollinearity)

  • Load the sampled allentestn23 data.
  • Create a scatterplot matrix of the data.
  • Calculate the correlation between Vocab and Abstract.
  • Fit a multiple linear regression model of ACL vs SDMT + Vocab + Abstract.
  • Use the vif function in the car package to calculate variance inflation factors.
  • Repeat for the full allentest data.
allentestn23 <- read.table("~/path-to-data/allentestn23.txt", header=T)
attach(allentestn23)

pairs(allentestn23[,2:5])

cor(Vocab, Abstract) # 0.9897771

model.1 <- lm(ACL ~ SDMT + Vocab + Abstract)
summary(model.1)
#             Estimate Std. Error t value Pr(>|t|)  
# (Intercept)  3.74711    1.34237   2.791   0.0116 *
# SDMT         0.02326    0.01273   1.827   0.0834 .
# Vocab        0.02825    0.15239   0.185   0.8549  
# Abstract    -0.01379    0.10055  -0.137   0.8924  
# ---
# Residual standard error: 0.7344 on 19 degrees of freedom
# Multiple R-squared:  0.2645,  Adjusted R-squared:  0.1484 
# F-statistic: 2.278 on 3 and 19 DF,  p-value: 0.1124

vif(model.1)
#     SDMT     Vocab  Abstract 
# 1.726185 49.286239 50.603085 

detach(allentestn23)

allentest <- read.table("~/path-to-data/allentest.txt", header=T)
attach(allentest)

pairs(allentest[,2:5])

cor(Vocab, Abstract) # 0.6978405

model.1 <- lm(ACL ~ SDMT + Vocab + Abstract)
summary(model.1)
#              Estimate Std. Error t value Pr(>|t|)    
# (Intercept)  3.946347   0.338069  11.673  < 2e-16 ***
# SDMT         0.027404   0.007168   3.823 0.000298 ***
# Vocab       -0.017397   0.018077  -0.962 0.339428    
# Abstract     0.012182   0.011585   1.051 0.296926    
# ---
# Residual standard error: 0.6878 on 65 degrees of freedom
# Multiple R-squared:  0.2857,  Adjusted R-squared:  0.2528 
# F-statistic: 8.668 on 3 and 65 DF,  p-value: 6.414e-05

vif(model.1)
#     SDMT    Vocab Abstract 
# 1.609662 2.093297 2.167428 

detach(allentest)

Exercise and immunity (reducing structural multicollinearity)

  • Load the exerimmun data.
  • Create a scatterplot of igg vs oxygen.
  • Calculate an oxygen-squared variable named oxygensq.
  • Fit a quadratic regression model of igg vs oxygen + oxygensq.
  • Add a quadratic regression line to the scatterplot.
  • Use the vif function in the car package to calculate variance inflation factors.
  • Create a scatterplot of oxygensq vs oxygen and calculate the correlation.
  • Calculate a centered oxygen variable named oxcent and an oxcent-squared variable named oxcentsq.
  • Fit a quadratic regression model of igg vs oxcent + oxcentsq.
  • Use the vif function in the car package to calculate variance inflation factors.
  • Create a scatterplot of igg vs oxcent with the quadratic regression line added.
  • Fit a simple linear regression model of igg vs oxcent.
  • Confirm the equivalence of the original quadratic and centered quadratic models by transforming the regression parameter estimates.
  • Create a residual vs fits plot for the centered quadratic model.
  • Create a normal probability plot of the residuals for the centered quadratic model.
  • Predict igg for oxygen = 70 using the centered quadratic model.
exerimmun <- read.table("~/path-to-data/exerimmun.txt", header=T)
attach(exerimmun)

plot(oxygen, igg)

oxygensq <- oxygen^2

model.1 <- lm(igg ~ oxygen + oxygensq)

plot(x=oxygen, y=igg,
     panel.last = lines(sort(oxygen), fitted(model.1)[order(oxygen)]))

summary(model.1)
#               Estimate Std. Error t value Pr(>|t|)    
# (Intercept) -1464.4042   411.4012  -3.560  0.00140 ** 
# oxygen         88.3071    16.4735   5.361 1.16e-05 ***
# oxygensq       -0.5362     0.1582  -3.390  0.00217 ** 
# ---
# Residual standard error: 106.4 on 27 degrees of freedom
# Multiple R-squared:  0.9377,  Adjusted R-squared:  0.9331 
# F-statistic: 203.2 on 2 and 27 DF,  p-value: < 2.2e-16

vif(model.1)
#   oxygen oxygensq 
# 99.94261 99.94261 

plot(oxygen, oxygensq)
cor(oxygen, oxygensq) # 0.9949846

oxcent <- oxygen-mean(oxygen)
oxcentsq <- oxcent^2

plot(oxcent, oxcentsq)
cor(oxcent, oxcentsq) # 0.2195179

model.2 <- lm(igg ~ oxcent + oxcentsq)
summary(model.2)
#              Estimate Std. Error t value Pr(>|t|)    
# (Intercept) 1632.1962    29.3486   55.61  < 2e-16 ***
# oxcent        33.9995     1.6890   20.13  < 2e-16 ***
# oxcentsq      -0.5362     0.1582   -3.39  0.00217 ** 
# ---
# Residual standard error: 106.4 on 27 degrees of freedom
# Multiple R-squared:  0.9377,  Adjusted R-squared:  0.9331 
# F-statistic: 203.2 on 2 and 27 DF,  p-value: < 2.2e-16

vif(model.2)
#   oxcent oxcentsq 
# 1.050628 1.050628 

plot(x=oxcent, y=igg,
     panel.last = lines(sort(oxcent), fitted(model.2)[order(oxcent)]))

model.3 <- lm(igg ~ oxcent)
summary(model.3)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept) 1557.633     22.782   68.37  < 2e-16 ***
# oxcent        32.743      1.932   16.95 2.97e-16 ***

coef(model.2)[1]-coef(model.2)[2]*mean(oxygen)+coef(model.2)[3]*mean(oxygen)^2 # -1464.404
coef(model.2)[2]-2*coef(model.2)[3]*mean(oxygen) # 88.3071
coef(model.2)[3] # -0.5362473

coef(model.1)
#   (Intercept)        oxygen      oxygensq 
# -1464.4042284    88.3070970    -0.5362473 
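
Since centering oxygen merely reparameterizes the quadratic model, the two fits are the same model in different coordinates; besides matching the back-transformed coefficients above, the fitted values agree to numerical precision:

# The centered and uncentered quadratic fits are equivalent
all.equal(as.numeric(fitted(model.1)), as.numeric(fitted(model.2)))  # TRUE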

plot(x=fitted(model.2), y=residuals(model.2),
     panel.last = abline(h=0, lty=2))

qqnorm(residuals(model.2), main="", datax=TRUE)
qqline(residuals(model.2), datax=TRUE)

predict(model.2, interval="prediction",
        newdata=data.frame(oxcent=70-mean(oxygen), oxcentsq=(70-mean(oxygen))^2))
#        fit      lwr      upr
# 1 2089.481 1849.931 2329.031

detach(exerimmun)