R Help 10: Regression Pitfalls

Galton peas (nonconstant variance and weighted least squares)

  • Load the galton data.
  • Fit an ordinary least squares (OLS) simple linear regression model of Progeny vs Parent.
  • Fit a weighted least squares (WLS) model using weights = 1/SD^2.
  • Create a scatterplot of the data with a regression line for each model.
galton <- read.table("~/path-to-data/galton.txt", header=T)
attach(galton)

model.1 <- lm(Progeny ~ Parent)
summary(model.1)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept) 0.127029   0.006993  18.164 9.29e-06 ***
# Parent      0.210000   0.038614   5.438  0.00285 ** 

model.2 <- lm(Progeny ~ Parent, weights=1/SD^2)
summary(model.2)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept) 0.127964   0.006811  18.787 7.87e-06 ***
# Parent      0.204801   0.038155   5.368  0.00302 ** 

plot(x=Parent, y=Progeny, ylim=c(0.158,0.174),
     panel.last = c(lines(sort(Parent), fitted(model.1)[order(Parent)], col="blue"),
                    lines(sort(Parent), fitted(model.2)[order(Parent)], col="red")))
legend("topleft", col=c("blue","red"), lty=1,
       inset=0.02, legend=c("OLS", "WLS"))

detach(galton)
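
As a sanity check on what the weights argument is doing, the WLS coefficients can be reproduced directly from the weighted normal equations, b = (X'WX)^(-1) X'Wy with W = diag(1/SD^2). A minimal sketch, assuming the galton data frame read in above is still in the workspace:

# Reproduce the WLS fit from the weighted normal equations
X <- cbind(1, galton$Parent)   # design matrix with an intercept column
W <- diag(1/galton$SD^2)       # diagonal weight matrix
y <- galton$Progeny
solve(t(X) %*% W %*% X, t(X) %*% W %*% y)
# should match coef(model.2): intercept approximately 0.1280, slope approximately 0.2048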

Computer-assisted learning (nonconstant variance and weighted least squares)

  • Load the ca_learning data.
  • Create a scatterplot of the data.
  • Fit an OLS model.
  • Plot the OLS residuals vs num.responses.
  • Plot the absolute OLS residuals vs num.responses.
  • Calculate fitted values from a regression of absolute residuals vs num.responses.
  • Fit a WLS model using weights = 1/(fitted values)^2.
  • Create a scatterplot of the data with a regression line for each model.
  • Plot the WLS standardized residuals vs num.responses.
ca_learning <- read.table("~/path-to-data/ca_learning_new.txt", header=T)
attach(ca_learning)

plot(x=num.responses, y=cost)

model.1 <- lm(cost ~ num.responses)
summary(model.1)
#               Estimate Std. Error t value Pr(>|t|)    
# (Intercept)    19.4727     5.5162   3.530  0.00545 ** 
# num.responses   3.2689     0.3651   8.955 4.33e-06 ***
# ---
# Residual standard error: 4.598 on 10 degrees of freedom
# Multiple R-squared:  0.8891,  Adjusted R-squared:  0.878 
# F-statistic: 80.19 on 1 and 10 DF,  p-value: 4.33e-06

plot(num.responses, residuals(model.1))
plot(num.responses, abs(residuals(model.1)))

wts <- 1/fitted(lm(abs(residuals(model.1)) ~ num.responses))^2

model.2 <- lm(cost ~ num.responses, weights=wts)
summary(model.2)
#               Estimate Std. Error t value Pr(>|t|)    
# (Intercept)    17.3006     4.8277   3.584  0.00498 ** 
# num.responses   3.4211     0.3703   9.238 3.27e-06 ***
# ---
# Residual standard error: 1.159 on 10 degrees of freedom
# Multiple R-squared:  0.8951,  Adjusted R-squared:  0.8846 
# F-statistic: 85.35 on 1 and 10 DF,  p-value: 3.269e-06

plot(x=num.responses, y=cost, ylim=c(50,95),
     panel.last = c(lines(sort(num.responses), fitted(model.1)[order(num.responses)], col="blue"),
                    lines(sort(num.responses), fitted(model.2)[order(num.responses)], col="red")))
legend("topleft", col=c("blue","red"), lty=1,
       inset=0.02, legend=c("OLS", "WLS"))

plot(num.responses, rstandard(model.2))

detach(ca_learning)
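
The two-step weighting used here (regress the absolute OLS residuals on the predictor, then take 1/fitted^2 as the weights) can be wrapped in a small helper for reuse; the function name varwts below is chosen purely for illustration:

# Estimate 1/variance weights by modeling |OLS residuals| as a linear
# function of x and squaring the fitted values of that regression
varwts <- function(ols.fit, x) {
  sd.fit <- fitted(lm(abs(residuals(ols.fit)) ~ x))
  1/sd.fit^2
}
# e.g. varwts(model.1, ca_learning$num.responses) reproduces wts above;
# the Home price example below uses fitted(model.1) in place of the predictor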

Market share (nonconstant variance and weighted least squares)

  • Load the marketshare data.
  • Fit an OLS model.
  • Plot the OLS residuals vs fitted values with points marked by Discount.
  • Use the tapply function to calculate the residual variance for Discount=0 and Discount=1.
  • Fit a WLS model using weights = 1/variance for Discount=0 and Discount=1.
  • Plot the WLS standardized residuals vs fitted values.
marketshare <- read.table("~/path-to-data/marketshare.txt", header=T)
attach(marketshare)

model.1 <- lm(MarketShare ~ Price + P1 + P2)
summary(model.1)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)  3.19592    0.35616   8.973 3.00e-10 ***
# Price       -0.33358    0.15226  -2.191   0.0359 *  
# P1           0.30808    0.06412   4.804 3.51e-05 ***
# P2           0.48431    0.05541   8.740 5.49e-10 ***

plot(fitted(model.1), residuals(model.1), col=Discount+1)
vars <- tapply(residuals(model.1), Discount, var)
#          0          1 
# 0.01052324 0.02680546 

wts <- Discount/vars[2] + (1-Discount)/vars[1]

model.2 <- lm(MarketShare ~ Price + P1 + P2, weights=wts)
summary(model.2)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)  3.17437    0.35671   8.899 3.63e-10 ***
# Price       -0.32432    0.15291  -2.121   0.0418 *  
# P1           0.30834    0.06575   4.689 4.89e-05 ***
# P2           0.48419    0.05422   8.930 3.35e-10 ***

plot(fitted(model.2), rstandard(model.2), col=Discount+1)

detach(marketshare)
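
The weight construction above leans on Discount being a 0/1 indicator; an equivalent, perhaps more explicit, version uses ifelse (wts.alt is just an illustrative name):

# Same group weights as Discount/vars[2] + (1-Discount)/vars[1]
wts.alt <- ifelse(marketshare$Discount == 1, 1/vars[2], 1/vars[1])
all.equal(as.numeric(wts.alt), as.numeric(wts))  # should be TRUE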

Home price (nonconstant variance and weighted least squares)

  • Load the realestate data.
  • Calculate log transformations of the variables.
  • Fit an OLS model.
  • Plot the OLS residuals vs fitted values.
  • Calculate fitted values from a regression of absolute residuals vs fitted values.
  • Fit a WLS model using weights = 1/(fitted values)^2.
  • Plot the WLS standardized residuals vs fitted values.
realestate <- read.table("~/path-to-data/realestate.txt", header=T)
attach(realestate)

logY <- log(SalePrice)
logX1 <- log(SqFeet)
logX2 <- log(Lot)

model.1 <- lm(logY ~ logX1 + logX2)
summary(model.1)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)  4.25485    0.07353  57.864  < 2e-16 ***
# logX1        1.22141    0.03371  36.234  < 2e-16 ***
# logX2        0.10595    0.02394   4.425 1.18e-05 ***

plot(fitted(model.1), residuals(model.1))

wts <- 1/fitted(lm(abs(residuals(model.1)) ~ fitted(model.1)))^2

model.2 <- lm(logY ~ logX1 + logX2, weights=wts)
summary(model.2)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)  4.35189    0.06330  68.755  < 2e-16 ***
# logX1        1.20150    0.03332  36.065  < 2e-16 ***
# logX2        0.07924    0.02152   3.682 0.000255 ***

plot(fitted(model.2), rstandard(model.2))

detach(realestate)

Google stock (autoregression model)

  • Use the read.zoo function in the zoo package to load the google_stock data in time series format.
  • Create a time series plot of the data.
  • Load the google_stock data in the usual way using read.table.
  • Use the ts function to convert the price variable to a time series.
  • Create a plot of partial autocorrelations of price.
  • Calculate a lag-1 price variable (note that the lag argument of the lag function is -1, not +1).
  • Create a scatterplot of price vs lag1price.
  • Use the ts.intersect function to create a dataframe containing price and lag1price.
  • Fit a simple linear regression model of price vs lag1price (a first-order autoregression model).
library(zoo)
google.ts <- read.zoo("~/path-to-data/google_stock.txt", format="%m/%d/%Y",
                      header=T)
plot(google.ts)

google <- read.table("~/path-to-data/google_stock.txt", header=T)
attach(google)

price.ts <- ts(price)
pacf(price.ts)

lag1price <- lag(price.ts, -1)
plot(price.ts ~ lag1price, xy.labels=F)

lagdata <- ts.intersect(price.ts, lag1price, dframe=T)
summary(lm(price.ts ~ lag1price, data=lagdata))

detach(google)
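
Note that lag(x, -1) shifts a series one step later in time, so the value observed at time t-1 carries time index t; ts.intersect then keeps only the time points where both series are defined. A toy illustration of the alignment:

# lag(z, -1) pairs each value with the previous one once the series are intersected
z <- ts(c(10, 20, 30, 40))
ts.intersect(z, lag(z, -1))
# times 2..4 pair z with its previous value: (20,10), (30,20), (40,30)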

Earthquakes (autoregression model)

  • Load the earthquakes data.
  • Create a time series plot of the data.
  • Use the ts function to convert the Quakes variable to a time series.
  • Create a plot of partial autocorrelations of Quakes.
  • Calculate lag-1, lag-2, and lag-3 Quakes variables.
  • Use the ts.intersect function to create a dataframe containing Quakes and the three lag variables.
  • Fit a multiple linear regression model of Quakes versus the three lag variables (a third-order autoregression model).
earthquakes <- read.table("~/path-to-data/earthquakes.txt", header=T)
attach(earthquakes)

plot(Year, Quakes, type="b")

Quakes.ts <- ts(Quakes)
pacf(Quakes.ts)

lag1Quakes <- lag(Quakes.ts, -1)
lag2Quakes <- lag(Quakes.ts, -2)
lag3Quakes <- lag(Quakes.ts, -3)

lagdata <- ts.intersect(Quakes.ts, lag1Quakes, lag2Quakes, lag3Quakes, dframe=T)
summary(lm(Quakes.ts ~ lag1Quakes + lag2Quakes + lag3Quakes, data=lagdata))
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)  6.44916    1.78646   3.610 0.000496 ***
# lag1Quakes   0.16424    0.10063   1.632 0.106049    
# lag2Quakes   0.07125    0.10128   0.703 0.483517    
# lag3Quakes   0.26928    0.09783   2.753 0.007110 ** 
# ---
# Residual standard error: 3.851 on 93 degrees of freedom
# Multiple R-squared:  0.1388,  Adjusted R-squared:  0.111 
# F-statistic: 4.997 on 3 and 93 DF,  p-value: 0.002942

detach(earthquakes)

Blaisdell company (regression with autoregressive errors)

  • Load the blaisdell data.
  • Fit a simple linear regression model of comsales vs indsales.
  • Use the dwt function in the car package to conduct the Durbin-Watson test on the residuals.
  • Conduct the Ljung-Box test on the residuals.
  • Perform the Cochrane-Orcutt procedure to transform the variables.
  • Forecast comsales for period 21 when indsales are projected to be $175.3 million.
  • Perform the Hildreth-Lu procedure to transform the variables.
  • Perform the first differences procedure to transform the variables.
blaisdell <- read.table("~/path-to-data/blaisdell.txt", header=T)
attach(blaisdell)

model.1 <- lm(comsales ~ indsales)
summary(model.1)
#              Estimate Std. Error t value Pr(>|t|)    
# (Intercept) -1.454750   0.214146  -6.793 2.31e-06 ***
# indsales     0.176283   0.001445 122.017  < 2e-16 ***

# Durbin-Watson Test
library(car)
dwt(model.1)
# lag Autocorrelation D-W Statistic p-value
#   1       0.6260046     0.7347256       0
# Alternative hypothesis: rho != 0

# Ljung-Box Q Test
Box.test(residuals(model.1), lag = 1, type = "Ljung")
# Box-Ljung test
# data:  residuals(model.1)
# X-squared = 9.0752, df = 1, p-value = 0.002591

# Cochrane-Orcutt Procedure
res.ts <- ts(residuals(model.1))
lag1res <- lag(res.ts, -1)
lagdata1 <- ts.intersect(res.ts, lag1res)
acp <- coef(lm(res.ts ~ lag1res -1, data=lagdata1)) # 0.6311636
y.ts <- ts(comsales)
x.ts <- ts(indsales)
lag1y <- lag(y.ts, -1)
lag1x <- lag(x.ts, -1)
y.co <- y.ts-acp*lag1y
x.co <- x.ts-acp*lag1x
model.2 <- lm(y.co ~ x.co)
summary(model.2)
#              Estimate Std. Error t value Pr(>|t|)    
# (Intercept) -0.394111   0.167230  -2.357   0.0307 *  
# x.co         0.173758   0.002957  58.767   <2e-16 ***

dwt(model.2)
# lag Autocorrelation D-W Statistic p-value
#   1       0.1473569      1.650248   0.306
# Alternative hypothesis: rho != 0

b0 <- coef(model.2)[1]/(1-acp) # -1.068524
sqrt(vcov(model.2)[1,1])/(1-acp) # se = 0.4533986
b1 <- coef(model.2)[2] # 0.1737583

fit.20 <- b0+b1*indsales[20] # 28.76577
res.20 <- comsales[20]-fit.20 # 0.01422919
fit.21 <- b0+b1*175.3 # 29.3913
forecast.21 <- fit.21+acp*res.20 # 29.40028

# Hildreth-Lu Procedure
sse <- vector()
for(i in 1:90){ # grid search for rho over 0.10, 0.11, ..., 0.99
  y.hl <- y.ts-(0.09+0.01*i)*lag1y
  x.hl <- x.ts-(0.09+0.01*i)*lag1x
  sse[i] <- sum(residuals(lm(y.hl ~ x.hl))^2)
}
acp <- 0.09+0.01*which.min(sse) # rho minimizing SSE: 0.96
y.hl <- y.ts-acp*lag1y
x.hl <- x.ts-acp*lag1x
model.3 <- lm(y.hl ~ x.hl)
summary(model.3)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)  0.07117    0.05797   1.228    0.236    
# x.hl         0.16045    0.00684  23.458 2.18e-14 ***

dwt(model.3)
# lag Autocorrelation D-W Statistic p-value
#   1        0.116145      1.725439   0.548
# Alternative hypothesis: rho != 0

coef(model.3)[1]/(1-acp) # 1.77933
sqrt(vcov(model.3)[1,1])/(1-acp) # 1.449373

# First Differences Procedure

y.fd <- y.ts-lag1y
x.fd <- x.ts-lag1x
dwt(lm(y.fd ~ x.fd))
# lag Autocorrelation D-W Statistic p-value
#   1       0.1160548      1.748834    0.62
# Alternative hypothesis: rho != 0

model.4 <- lm(y.fd ~ x.fd -1)
summary(model.4)
#      Estimate Std. Error t value Pr(>|t|)    
# x.fd 0.168488   0.005096   33.06   <2e-16 ***

mean(comsales)-coef(model.4)[1]*mean(indsales) # -0.3040052

detach(blaisdell)
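
The Cochrane-Orcutt step above (regress the lag-1 residuals through the origin to estimate rho, then fit y_t - rho*y_(t-1) on x_t - rho*x_(t-1)) can be collected into one helper for simple regressions with AR(1) errors; co.transform is an illustrative name, not a standard function:

# One Cochrane-Orcutt transformation step for a simple linear regression
co.transform <- function(y, x) {
  res <- residuals(lm(y ~ x))
  n <- length(res)
  rho <- sum(res[-1] * res[-n]) / sum(res[-n]^2)  # lag-1 regression through the origin
  list(rho = rho,
       y.co = y[-1] - rho * y[-n],
       x.co = x[-1] - rho * x[-n])
}
# e.g. tr <- co.transform(blaisdell$comsales, blaisdell$indsales)
# gives tr$rho close to acp (about 0.631), and lm(tr$y.co ~ tr$x.co)
# reproduces the coefficients of model.2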

Metal fabricator and vendor employees (regression with autoregressive errors)

  • Load the employee data.
  • Fit a simple linear regression model of metal vs vendor.
  • Create a scatterplot of the data with a regression line.
  • Create a scatterplot of the residuals vs time order.
  • Create a plot of partial autocorrelations of the residuals.
  • Use the dwt function in the car package to conduct the Durbin-Watson test on the residuals.
  • Perform the Cochrane-Orcutt procedure to transform the variables.
employee <- read.table("~/path-to-data/employee.txt", header=T)
attach(employee)

model.1 <- lm(metal ~ vendor)
summary(model.1)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept) 2.847911   3.299962   0.863    0.392    
# vendor      0.122442   0.009423  12.994   <2e-16 ***

plot(x=vendor, y=metal,
     panel.last = lines(sort(vendor), fitted(model.1)[order(vendor)]))

plot(x=time, y=residuals(model.1), type="b",
     panel.last = abline(h=0, lty=2))

pacf(residuals(model.1))

# Durbin-Watson Test
dwt(model.1)
# lag Autocorrelation D-W Statistic p-value
#   1        0.772038     0.3592396       0
# Alternative hypothesis: rho != 0

# Cochrane-Orcutt Procedure
res.ts <- ts(residuals(model.1))
lag1res <- lag(res.ts, -1)
lagdata1 <- ts.intersect(res.ts, lag1res)
acp <- coef(lm(res.ts ~ lag1res -1, data=lagdata1)) # 0.831385
y.ts <- ts(metal)
x.ts <- ts(vendor)
lag1y <- lag(y.ts, -1)
lag1x <- lag(x.ts, -1)
y.co <- y.ts-acp*lag1y
x.co <- x.ts-acp*lag1x
model.2 <- lm(y.co ~ x.co)
summary(model.2)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)  4.87560    0.78655   6.199 6.78e-08 ***
# x.co         0.04795    0.01300   3.688 0.000505 ***

coef(model.2)[1]/(1-acp) # 28.91557
sqrt(vcov(model.2)[1,1])/(1-acp) # se = 4.664789

detach(employee)

Blood pressure (multicollinearity)

  • Load the bloodpress data.
  • Create scatterplot matrices of the data.
  • Calculate correlations between the variables.
bloodpress <- read.table("~/path-to-data/bloodpress.txt", header=T)
attach(bloodpress)

pairs(bloodpress[,c(2:5)])
pairs(bloodpress[,c(2,6:8)])

round(cor(bloodpress[,c(2:8)]),3)
#           BP   Age Weight   BSA   Dur Pulse Stress
# BP     1.000 0.659  0.950 0.866 0.293 0.721  0.164
# Age    0.659 1.000  0.407 0.378 0.344 0.619  0.368
# Weight 0.950 0.407  1.000 0.875 0.201 0.659  0.034
# BSA    0.866 0.378  0.875 1.000 0.131 0.465  0.018
# Dur    0.293 0.344  0.201 0.131 1.000 0.402  0.312
# Pulse  0.721 0.619  0.659 0.465 0.402 1.000  0.506
# Stress 0.164 0.368  0.034 0.018 0.312 0.506  1.000

detach(bloodpress)

Uncorrelated predictors (no multicollinearity)

  • Load the uncorrpreds data.
  • Create a scatterplot matrix of the data.
  • Calculate the correlation between the predictors.
  • Fit a simple linear regression model of y vs x1.
  • Fit a simple linear regression model of y vs x2.
  • Fit a multiple linear regression model of y vs x1 + x2.
  • Fit a multiple linear regression model of y vs x2 + x1.
  • Use the scatter3d function in the car package to create a 3D scatterplot of the data with the fitted plane for a multiple linear regression model of y vs x1 + x2.
uncorrpreds <- read.table("~/path-to-data/uncorrpreds.txt", header=T)
attach(uncorrpreds)

pairs(uncorrpreds)

cor(x1,x2) # 0

model.1 <- lm(y ~ x1)
summary(model.1)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)   49.500      4.655  10.634 4.07e-05 ***
# x1            -1.000      1.472  -0.679    0.522    
anova(model.1)
#           Df Sum Sq Mean Sq F value Pr(>F)
# x1         1      8   8.000  0.4615 0.5222
# Residuals  6    104  17.333               

model.2 <- lm(y ~ x2)
summary(model.2)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)   57.000      8.213   6.940 0.000444 ***
# x2            -1.750      1.350  -1.296 0.242545    
anova(model.2)
#           Df Sum Sq Mean Sq F value Pr(>F)
# x2         1   24.5  24.500    1.68 0.2425
# Residuals  6   87.5  14.583               

model.12 <- lm(y ~ x1 + x2)
summary(model.12)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)   60.000      9.562   6.275  0.00151 **
# x1            -1.000      1.410  -0.709  0.50982   
# x2            -1.750      1.410  -1.241  0.26954   
anova(model.12)
#           Df Sum Sq Mean Sq F value Pr(>F)
# x1         1    8.0     8.0  0.5031 0.5098
# x2         1   24.5    24.5  1.5409 0.2695
# Residuals  5   79.5    15.9               

model.21 <- lm(y ~ x2 + x1)
summary(model.21)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)   60.000      9.562   6.275  0.00151 **
# x2            -1.750      1.410  -1.241  0.26954   
# x1            -1.000      1.410  -0.709  0.50982   
anova(model.21)
#           Df Sum Sq Mean Sq F value Pr(>F)
# x2         1   24.5    24.5  1.5409 0.2695
# x1         1    8.0     8.0  0.5031 0.5098
# Residuals  5   79.5    15.9               

# library(car)
scatter3d(y ~ x1 + x2)

detach(uncorrpreds)
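
Fitting the model in both orders makes the point of this example: with cor(x1, x2) = 0, each predictor's sequential sum of squares is the same no matter when it enters, i.e. SSR(x1) = SSR(x1|x2) = 8 and SSR(x2) = SSR(x2|x1) = 24.5. A quick check pulling those values out of the two anova tables:

# Sequential sums of squares do not depend on entry order here
anova(model.12)["x1", "Sum Sq"]  # SSR(x1)      = 8
anova(model.21)["x1", "Sum Sq"]  # SSR(x1 | x2) = 8
anova(model.12)["x2", "Sum Sq"]  # SSR(x2 | x1) = 24.5
anova(model.21)["x2", "Sum Sq"]  # SSR(x2)      = 24.5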

Blood pressure (predictors with almost no multicollinearity)

  • Load the bloodpress data.
  • Create a scatterplot matrix of the data.
  • Fit a simple linear regression model of BP vs Stress.
  • Fit a simple linear regression model of BP vs BSA.
  • Fit a multiple linear regression model of BP vs Stress + BSA.
  • Fit a multiple linear regression model of BP vs BSA + Stress.
  • Use the scatter3d function in the car package to create a 3D scatterplot of the data with the fitted plane for a multiple linear regression model of BP vs Stress + BSA.
attach(bloodpress)

pairs(bloodpress[,c(2,5,8)])

model.1 <- lm(BP ~ Stress)
summary(model.1)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept) 112.71997    2.19345  51.389   <2e-16 ***
# Stress        0.02399    0.03404   0.705     0.49    
anova(model.1)
#           Df Sum Sq Mean Sq F value Pr(>F)
# Stress     1  15.04  15.044  0.4969 0.4899
# Residuals 18 544.96  30.275               

model.2 <- lm(BP ~ BSA)
summary(model.2)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)   45.183      9.392   4.811  0.00014 ***
# BSA           34.443      4.690   7.343 8.11e-07 ***
anova(model.2)
#           Df Sum Sq Mean Sq F value Pr(>F)
# BSA        1 419.86  419.86  53.927 8.114e-07 ***
# Residuals 18 140.14    7.79                      

model.12 <- lm(BP ~ Stress + BSA)
summary(model.12)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept) 44.24452    9.26104   4.777 0.000175 ***
# Stress       0.02166    0.01697   1.277 0.218924    
# BSA         34.33423    4.61110   7.446 9.56e-07 ***
anova(model.12)
#           Df Sum Sq Mean Sq F value Pr(>F)
# Stress     1  15.04   15.04  1.9998    0.1754    
# BSA        1 417.07  417.07 55.4430 9.561e-07 ***
# Residuals 17 127.88    7.52                      

model.21 <- lm(BP ~ BSA + Stress)
summary(model.21)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept) 44.24452    9.26104   4.777 0.000175 ***
# BSA         34.33423    4.61110   7.446 9.56e-07 ***
# Stress       0.02166    0.01697   1.277 0.218924    
anova(model.21)
#           Df Sum Sq Mean Sq F value Pr(>F)
# BSA        1 419.86  419.86 55.8132 9.149e-07 ***
# Stress     1  12.26   12.26  1.6296    0.2189    
# Residuals 17 127.88    7.52                      

scatter3d(BP ~ Stress + BSA)

detach(bloodpress)

Blood pressure (predictors with high multicollinearity)

  • Load the bloodpress data.
  • Create a scatterplot matrix of the data.
  • Fit a simple linear regression model of BP vs Weight.
  • Fit a simple linear regression model of BP vs BSA.
  • Fit a multiple linear regression model of BP vs Weight + BSA.
  • Fit a multiple linear regression model of BP vs BSA + Weight.
  • Use the scatter3d function in the car package to create a 3D scatterplot of the data with the fitted plane for a multiple linear regression model of BP vs Weight + BSA.
  • Predict BP for Weight=92 and BSA=2 for the two simple linear regression models and the multiple linear regression model.
attach(bloodpress)

pairs(bloodpress[,c(2,5,4)])

model.1 <- lm(BP ~ Weight)
summary(model.1)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)  2.20531    8.66333   0.255    0.802    
# Weight       1.20093    0.09297  12.917 1.53e-10 ***
anova(model.1)
#           Df Sum Sq Mean Sq F value Pr(>F)
# Weight     1 505.47  505.47  166.86 1.528e-10 ***
# Residuals 18  54.53    3.03                      

model.2 <- lm(BP ~ BSA)
summary(model.2)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)   45.183      9.392   4.811  0.00014 ***
# BSA           34.443      4.690   7.343 8.11e-07 ***
anova(model.2)
#           Df Sum Sq Mean Sq F value Pr(>F)
# BSA        1 419.86  419.86  53.927 8.114e-07 ***
# Residuals 18 140.14    7.79                      

model.12 <- lm(BP ~ Weight + BSA)
summary(model.12)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)   5.6534     9.3925   0.602    0.555    
# Weight        1.0387     0.1927   5.392 4.87e-05 ***
# BSA           5.8313     6.0627   0.962    0.350    
anova(model.12)
#           Df Sum Sq Mean Sq F value Pr(>F)
# Weight     1 505.47  505.47 166.1648 3.341e-10 ***
# BSA        1   2.81    2.81   0.9251    0.3496    
# Residuals 17  51.71    3.04                       

model.21 <- lm(BP ~ BSA + Weight)
summary(model.21)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)   5.6534     9.3925   0.602    0.555    
# BSA           5.8313     6.0627   0.962    0.350    
# Weight        1.0387     0.1927   5.392 4.87e-05 ***
anova(model.21)
#           Df Sum Sq Mean Sq F value Pr(>F)
# BSA        1 419.86  419.86 138.021 1.391e-09 ***
# Weight     1  88.43   88.43  29.069 4.871e-05 ***
# Residuals 17  51.71    3.04                      

scatter3d(BP ~ Weight + BSA)

predict(model.1, interval="prediction",
        newdata=data.frame(Weight=92))
#       fit     lwr     upr
# 1 112.691 108.938 116.444

predict(model.2, interval="prediction",
        newdata=data.frame(BSA=2))
#        fit      lwr      upr
# 1 114.0689 108.0619 120.0758

predict(model.12, interval="prediction",
        newdata=data.frame(Weight=92, BSA=2))
#        fit      lwr      upr
# 1 112.8794 109.0801 116.6787

detach(bloodpress)

Poverty and teen birth rate (high multicollinearity)

  • Load the poverty data and remove the District of Columbia.
  • Create a scatterplot matrix of the data.
  • Fit a simple linear regression model of PovPct vs Brth15to17.
  • Fit a simple linear regression model of PovPct vs Brth18to19.
  • Fit a multiple linear regression model of PovPct vs Brth15to17 + Brth18to19.
poverty <- read.table("~/path-to-data/poverty.txt", header=T)
poverty <- poverty[poverty$Location!="District_of_Columbia",]
attach(poverty)

pairs(poverty[,c(2:4)])

model.1 <- lm(PovPct ~ Brth15to17)
summary(model.1)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)   4.4871     1.3181   3.404  0.00135 ** 
# Brth15to17    0.3872     0.0572   6.768 1.67e-08 ***
# ---
# Residual standard error: 2.982 on 48 degrees of freedom
# Multiple R-squared:  0.4883,  Adjusted R-squared:  0.4777 
# F-statistic: 45.81 on 1 and 48 DF,  p-value: 1.666e-08

model.2 <- lm(PovPct ~ Brth18to19)
summary(model.2)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)  3.05279    1.83169   1.667    0.102    
# Brth18to19   0.13842    0.02482   5.576 1.11e-06 ***
# ---
# Residual standard error: 3.248 on 48 degrees of freedom
# Multiple R-squared:  0.3931,  Adjusted R-squared:  0.3805 
# F-statistic: 31.09 on 1 and 48 DF,  p-value: 1.106e-06

model.12 <- lm(PovPct ~ Brth15to17 + Brth18to19)
summary(model.12)
#             Estimate Std. Error t value Pr(>|t|)   
# (Intercept)  6.43963    1.95904   3.287  0.00192 **
# Brth15to17   0.63235    0.19178   3.297  0.00186 **
# Brth18to19  -0.10227    0.07642  -1.338  0.18724   
# ---
# Residual standard error: 2.958 on 47 degrees of freedom
# Multiple R-squared:  0.5071,  Adjusted R-squared:  0.4861 
# F-statistic: 24.18 on 2 and 47 DF,  p-value: 6.017e-08

detach(poverty)

Blood pressure (high multicollinearity)

  • Load the bloodpress data.
  • Fit a multiple linear regression model of BP vs Age + Weight + BSA + Dur + Pulse + Stress.
  • Use the vif function in the car package to calculate variance inflation factors.
  • Fit a multiple linear regression model of Weight vs Age + BSA + Dur + Pulse + Stress and confirm the VIF value for Weight as 1/(1-R^2) for this model.
  • Fit a multiple linear regression model of BP vs Age + Weight + Dur + Stress.
  • Use the vif function in the car package to calculate variance inflation factors.
attach(bloodpress)

model.1 <- lm(BP ~ Age + Weight + BSA + Dur + Pulse + Stress)
summary(model.1)
#               Estimate Std. Error t value Pr(>|t|)    
# (Intercept) -12.870476   2.556650  -5.034 0.000229 ***
# Age           0.703259   0.049606  14.177 2.76e-09 ***
# Weight        0.969920   0.063108  15.369 1.02e-09 ***
# BSA           3.776491   1.580151   2.390 0.032694 *  
# Dur           0.068383   0.048441   1.412 0.181534    
# Pulse        -0.084485   0.051609  -1.637 0.125594    
# Stress        0.005572   0.003412   1.633 0.126491    
# ---
# Residual standard error: 0.4072 on 13 degrees of freedom
# Multiple R-squared:  0.9962,  Adjusted R-squared:  0.9944 
# F-statistic: 560.6 on 6 and 13 DF,  p-value: 6.395e-15

# library(car)
vif(model.1)
#      Age   Weight      BSA      Dur    Pulse   Stress 
# 1.762807 8.417035 5.328751 1.237309 4.413575 1.834845 

model.2 <- lm(Weight ~ Age + BSA + Dur + Pulse + Stress)
summary(model.2)
# Residual standard error: 1.725 on 14 degrees of freedom
# Multiple R-squared:  0.8812,  Adjusted R-squared:  0.8388 
# F-statistic: 20.77 on 5 and 14 DF,  p-value: 5.046e-06

1/(1-summary(model.2)$r.squared) # 8.417035

model.3 <- lm(BP ~ Age + Weight + Dur + Stress)
summary(model.3)
#               Estimate Std. Error t value Pr(>|t|)    
# (Intercept) -15.869829   3.195296  -4.967 0.000169 ***
# Age           0.683741   0.061195  11.173 1.14e-08 ***
# Weight        1.034128   0.032672  31.652 3.76e-15 ***
# Dur           0.039889   0.064486   0.619 0.545485    
# Stress        0.002184   0.003794   0.576 0.573304    
# ---
# Residual standard error: 0.5505 on 15 degrees of freedom
# Multiple R-squared:  0.9919,  Adjusted R-squared:  0.9897 
# F-statistic: 458.3 on 4 and 15 DF,  p-value: 1.764e-15

vif(model.3)
#      Age   Weight      Dur   Stress 
# 1.468245 1.234653 1.200060 1.241117 

detach(bloodpress)
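
As the Weight calculation above shows, each variance inflation factor is 1/(1-R^2) from regressing that predictor on all of the others. A small sketch that computes every VIF this way without the car package (vif.by.hand is an illustrative name):

# VIF_j = 1/(1 - R^2_j), where R^2_j comes from regressing predictor j
# on the remaining predictors
vif.by.hand <- function(fit) {
  X <- model.matrix(fit)[, -1, drop = FALSE]  # drop the intercept column
  vifs <- sapply(seq_len(ncol(X)),
                 function(j) 1/(1 - summary(lm(X[, j] ~ X[, -j]))$r.squared))
  setNames(vifs, colnames(X))
}
# vif.by.hand(model.1) should agree with vif(model.1) above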

Allen Cognitive Level study (reducing data-based multicollinearity)

  • Load the sampled allentestn23 data.
  • Create a scatterplot matrix of the data.
  • Calculate the correlation between Vocab and Abstract.
  • Fit a multiple linear regression model of ACL vs SDMT + Vocab + Abstract.
  • Use the vif function in the car package to calculate variance inflation factors.
  • Repeat for the full allentest data.
allentestn23 <- read.table("~/path-to-data/allentestn23.txt", header=T)
attach(allentestn23)

pairs(allentestn23[,2:5])

cor(Vocab, Abstract) # 0.9897771

model.1 <- lm(ACL ~ SDMT + Vocab + Abstract)
summary(model.1)
#             Estimate Std. Error t value Pr(>|t|)  
# (Intercept)  3.74711    1.34237   2.791   0.0116 *
# SDMT         0.02326    0.01273   1.827   0.0834 .
# Vocab        0.02825    0.15239   0.185   0.8549  
# Abstract    -0.01379    0.10055  -0.137   0.8924  
# ---
# Residual standard error: 0.7344 on 19 degrees of freedom
# Multiple R-squared:  0.2645,  Adjusted R-squared:  0.1484 
# F-statistic: 2.278 on 3 and 19 DF,  p-value: 0.1124

vif(model.1)
#     SDMT     Vocab  Abstract 
# 1.726185 49.286239 50.603085 

detach(allentestn23)

allentest <- read.table("~/path-to-data/allentest.txt", header=T)
attach(allentest)

pairs(allentest[,2:5])

cor(Vocab, Abstract) # 0.6978405

model.1 <- lm(ACL ~ SDMT + Vocab + Abstract)
summary(model.1)
#              Estimate Std. Error t value Pr(>|t|)    
# (Intercept)  3.946347   0.338069  11.673  < 2e-16 ***
# SDMT         0.027404   0.007168   3.823 0.000298 ***
# Vocab       -0.017397   0.018077  -0.962 0.339428    
# Abstract     0.012182   0.011585   1.051 0.296926    
# ---
# Residual standard error: 0.6878 on 65 degrees of freedom
# Multiple R-squared:  0.2857,  Adjusted R-squared:  0.2528 
# F-statistic: 8.668 on 3 and 65 DF,  p-value: 6.414e-05

vif(model.1)
#     SDMT    Vocab Abstract 
# 1.609662 2.093297 2.167428 

detach(allentest)

Exercise and immunity (reducing structural multicollinearity)

  • Load the exerimmun data.
  • Create a scatterplot of igg vs oxygen.
  • Calculate an oxygen-squared variable named oxygensq.
  • Fit a quadratic regression model of igg vs oxygen + oxygensq.
  • Add a quadratic regression line to the scatterplot.
  • Use the vif function in the car package to calculate variance inflation factors.
  • Create a scatterplot of oxygensq vs oxygen and calculate the correlation.
  • Calculate a centered oxygen variable named oxcent and an oxcent-squared variable named oxcentsq.
  • Fit a quadratic regression model of igg vs oxcent + oxcentsq.
  • Use the vif function in the car package to calculate variance inflation factors.
  • Create a scatterplot of igg vs oxcent with the quadratic regression line added.
  • Fit a simple linear regression model of igg vs oxcent.
  • Confirm the equivalence of the original quadratic and centered quadratic models by transforming the regression parameter estimates.
  • Create a residual vs fits plot for the centered quadratic model.
  • Create a normal probability plot of the residuals for the centered quadratic model.
  • Predict igg for oxygen = 70 using the centered quadratic model.
exerimmun <- read.table("~/path-to-data/exerimmun.txt", header=T)
attach(exerimmun)

plot(oxygen, igg)

oxygensq <- oxygen^2

model.1 <- lm(igg ~ oxygen + oxygensq)

plot(x=oxygen, y=igg,
     panel.last = lines(sort(oxygen), fitted(model.1)[order(oxygen)]))

summary(model.1)
#               Estimate Std. Error t value Pr(>|t|)    
# (Intercept) -1464.4042   411.4012  -3.560  0.00140 ** 
# oxygen         88.3071    16.4735   5.361 1.16e-05 ***
# oxygensq       -0.5362     0.1582  -3.390  0.00217 ** 
# ---
# Residual standard error: 106.4 on 27 degrees of freedom
# Multiple R-squared:  0.9377,  Adjusted R-squared:  0.9331 
# F-statistic: 203.2 on 2 and 27 DF,  p-value: < 2.2e-16

vif(model.1)
#   oxygen oxygensq 
# 99.94261 99.94261 

plot(oxygen, oxygensq)
cor(oxygen, oxygensq) # 0.9949846

oxcent <- oxygen-mean(oxygen)
oxcentsq <- oxcent^2

plot(oxcent, oxcentsq)
cor(oxcent, oxcentsq) # 0.2195179

model.2 <- lm(igg ~ oxcent + oxcentsq)
summary(model.2)
#              Estimate Std. Error t value Pr(>|t|)    
# (Intercept) 1632.1962    29.3486   55.61  < 2e-16 ***
# oxcent        33.9995     1.6890   20.13  < 2e-16 ***
# oxcentsq      -0.5362     0.1582   -3.39  0.00217 ** 
# ---
# Residual standard error: 106.4 on 27 degrees of freedom
# Multiple R-squared:  0.9377,  Adjusted R-squared:  0.9331 
# F-statistic: 203.2 on 2 and 27 DF,  p-value: < 2.2e-16

vif(model.2)
#   oxcent oxcentsq 
# 1.050628 1.050628 

plot(x=oxcent, y=igg,
     panel.last = lines(sort(oxcent), fitted(model.2)[order(oxcent)]))

model.3 <- lm(igg ~ oxcent)
summary(model.3)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept) 1557.633     22.782   68.37  < 2e-16 ***
# oxcent        32.743      1.932   16.95 2.97e-16 ***

coef(model.2)[1]-coef(model.2)[2]*mean(oxygen)+coef(model.2)[3]*mean(oxygen)^2 # -1464.404
coef(model.2)[2]-2*coef(model.2)[3]*mean(oxygen) # 88.3071
coef(model.2)[3] # -0.5362473

coef(model.1)
#   (Intercept)        oxygen      oxygensq 
# -1464.4042284    88.3070970    -0.5362473 
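
Since centering oxygen merely reparameterizes the quadratic model, the two fits are the same model in different coordinates; besides matching the back-transformed coefficients above, the fitted values agree to numerical precision:

# The centered and uncentered quadratic fits are equivalent
all.equal(as.numeric(fitted(model.1)), as.numeric(fitted(model.2)))  # TRUE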

plot(x=fitted(model.2), y=residuals(model.2),
     panel.last = abline(h=0, lty=2))

qqnorm(residuals(model.2), main="", datax=TRUE)
qqline(residuals(model.2), datax=TRUE)

predict(model.2, interval="prediction",
        newdata=data.frame(oxcent=70-mean(oxygen), oxcentsq=(70-mean(oxygen))^2))
#        fit      lwr      upr
# 1 2089.481 1849.931 2329.031

detach(exerimmun)