R Help 12: Multicollinearity

Help

Blood pressure (multicollinearity)

  • Load the bloodpress data.
  • Create scatterplot matrices of the data.
  • Calculate correlations between the variables.
bloodpress <- read.table("~/path-to-data/bloodpress.txt", header=T)
attach(bloodpress)

pairs(bloodpress[,c(2:5)])
pairs(bloodpress[,c(2,6:8)])

round(cor(bloodpress[,c(2:8)]),3)
#           BP   Age Weight   BSA   Dur Pulse Stress
# BP     1.000 0.659  0.950 0.866 0.293 0.721  0.164
# Age    0.659 1.000  0.407 0.378 0.344 0.619  0.368
# Weight 0.950 0.407  1.000 0.875 0.201 0.659  0.034
# BSA    0.866 0.378  0.875 1.000 0.131 0.465  0.018
# Dur    0.293 0.344  0.201 0.131 1.000 0.402  0.312
# Pulse  0.721 0.619  0.659 0.465 0.402 1.000  0.506
# Stress 0.164 0.368  0.034 0.018 0.312 0.506  1.000

detach(bloodpress)

Uncorrelated predictors (no multicollinearity)

  • Load the uncorrpreds data.
  • Create a scatterplot matrix of the data.
  • Calculate the correlation between the predictors.
  • Fit a simple linear regression model of y vs \(x_1\).
  • Fit a simple linear regression model of y vs \(x_2\).
  • Fit a multiple linear regression model of y vs \(x_1\) + \(x_2\).
  • Fit a multiple linear regression model of y vs \(x_2\) + \(x_1\).
  • Use the scatter3d function in the car package to create a 3D scatterplot of the data with the fitted plane for a multiple linear regression model of y vs \(x_1\) + \(x_2\).
uncorrpreds <- read.table("~/path-to-data/uncorrpreds.txt", header=T)
attach(uncorrpreds)

pairs(uncorrpreds)

cor(x1,x2) # 0

model.1 <- lm(y ~ x1)
summary(model.1)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)   49.500      4.655  10.634 4.07e-05 ***
# x1            -1.000      1.472  -0.679    0.522    
anova(model.1)
#           Df Sum Sq Mean Sq F value Pr(>F)
# x1         1      8   8.000  0.4615 0.5222
# Residuals  6    104  17.333               

model.2 <- lm(y ~ x2)
summary(model.2)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)   57.000      8.213   6.940 0.000444 ***
# x2            -1.750      1.350  -1.296 0.242545    
anova(model.2)
#           Df Sum Sq Mean Sq F value Pr(>F)
# x2         1   24.5  24.500    1.68 0.2425
# Residuals  6   87.5  14.583               

model.12 <- lm(y ~ x1 + x2)
summary(model.12)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)   60.000      9.562   6.275  0.00151 **
# x1            -1.000      1.410  -0.709  0.50982   
# x2            -1.750      1.410  -1.241  0.26954   
anova(model.12)
#           Df Sum Sq Mean Sq F value Pr(>F)
# x1         1    8.0     8.0  0.5031 0.5098
# x2         1   24.5    24.5  1.5409 0.2695
# Residuals  5   79.5    15.9               

model.21 <- lm(y ~ x2 + x1)
summary(model.21)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)   60.000      9.562   6.275  0.00151 **
# x2            -1.750      1.410  -1.241  0.26954   
# x1            -1.000      1.410  -0.709  0.50982   
anova(model.21)
#           Df Sum Sq Mean Sq F value Pr(>F)
# x2         1   24.5    24.5  1.5409 0.2695
# x1         1    8.0     8.0  0.5031 0.5098
# Residuals  5   79.5    15.9               

# library(car)
scatter3d(y ~ x1 + x2)

detach(uncorrpreds)

Blood pressure (predictors with almost no multicollinearity)

  • Load the bloodpress data.
  • Create a scatterplot matrix of the data.
  • Fit a simple linear regression model of BP vs Stress.
  • Fit a simple linear regression model of BP vs BSA.
  • Fit a multiple linear regression model of BP vs Stress + BSA.
  • Fit a multiple linear regression model of BP vs BSA + Stress.
  • Use the scatter3d function in the car package to create a 3D scatterplot of the data with the fitted plane for a multiple linear regression model of BP vs Stress + BSA.
attach(bloodpress)

pairs(bloodpress[,c(2,5,8)])

model.1 <- lm(BP ~ Stress)
summary(model.1)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept) 112.71997    2.19345  51.389   <2e-16 ***
# Stress        0.02399    0.03404   0.705     0.49    
anova(model.1)
#           Df Sum Sq Mean Sq F value Pr(>F)
# Stress     1  15.04  15.044  0.4969 0.4899
# Residuals 18 544.96  30.275               

model.2 <- lm(BP ~ BSA)
summary(model.2)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)   45.183      9.392   4.811  0.00014 ***
# BSA           34.443      4.690   7.343 8.11e-07 ***
anova(model.2)
#           Df Sum Sq Mean Sq F value Pr(>F)
# BSA        1 419.86  419.86  53.927 8.114e-07 ***
# Residuals 18 140.14    7.79                      

model.12 <- lm(BP ~ Stress + BSA)
summary(model.12)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept) 44.24452    9.26104   4.777 0.000175 ***
# Stress       0.02166    0.01697   1.277 0.218924    
# BSA         34.33423    4.61110   7.446 9.56e-07 ***
anova(model.12)
#           Df Sum Sq Mean Sq F value Pr(>F)
# Stress     1  15.04   15.04  1.9998    0.1754    
# BSA        1 417.07  417.07 55.4430 9.561e-07 ***
# Residuals 17 127.88    7.52                      

model.21 <- lm(BP ~ BSA + Stress)
summary(model.21)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept) 44.24452    9.26104   4.777 0.000175 ***
# BSA         34.33423    4.61110   7.446 9.56e-07 ***
# Stress       0.02166    0.01697   1.277 0.218924    
anova(model.21)
#           Df Sum Sq Mean Sq F value Pr(>F)
# BSA        1 419.86  419.86 55.8132 9.149e-07 ***
# Stress     1  12.26   12.26  1.6296    0.2189    
# Residuals 17 127.88    7.52                      

scatter3d(BP ~ Stress + BSA)

detach(bloodpress)

Blood pressure (predictors with high multicollinearity)

  • Load the bloodpress data.
  • Create a scatterplot matrix of the data.
  • Fit a simple linear regression model of BP vs Weight.
  • Fit a simple linear regression model of BP vs BSA.
  • Fit a multiple linear regression model of BP vs Weight + BSA.
  • Fit a multiple linear regression model of BP vs BSA + Weight.
  • Use the scatter3d function in the car package to create a 3D scatterplot of the data with the fitted plane for a multiple linear regression model of BP vs Weight + BSA.
  • Predict BP for Weight=92 and BSA=2 for the two simple linear regression models and the multiple linear regression model.
attach(bloodpress)

pairs(bloodpress[,c(2,5,4)])

model.1 <- lm(BP ~ Weight)
summary(model.1)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)  2.20531    8.66333   0.255    0.802    
# Weight       1.20093    0.09297  12.917 1.53e-10 ***
anova(model.1)
#           Df Sum Sq Mean Sq F value Pr(>F)
# Weight     1 505.47  505.47  166.86 1.528e-10 ***
# Residuals 18  54.53    3.03                      

model.2 <- lm(BP ~ BSA)
summary(model.2)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)   45.183      9.392   4.811  0.00014 ***
# BSA           34.443      4.690   7.343 8.11e-07 ***
anova(model.2)
#           Df Sum Sq Mean Sq F value Pr(>F)
# BSA        1 419.86  419.86  53.927 8.114e-07 ***
# Residuals 18 140.14    7.79                      

model.12 <- lm(BP ~ Weight + BSA)
summary(model.12)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)   5.6534     9.3925   0.602    0.555    
# Weight        1.0387     0.1927   5.392 4.87e-05 ***
# BSA           5.8313     6.0627   0.962    0.350    
anova(model.12)
#           Df Sum Sq Mean Sq F value Pr(>F)
# Weight     1 505.47  505.47 166.1648 3.341e-10 ***
# BSA        1   2.81    2.81   0.9251    0.3496    
# Residuals 17  51.71    3.04                       

model.21 <- lm(BP ~ BSA + Weight)
summary(model.21)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)   5.6534     9.3925   0.602    0.555    
# BSA           5.8313     6.0627   0.962    0.350    
# Weight        1.0387     0.1927   5.392 4.87e-05 ***
anova(model.21)
#           Df Sum Sq Mean Sq F value Pr(>F)
# BSA        1 419.86  419.86 138.021 1.391e-09 ***
# Weight     1  88.43   88.43  29.069 4.871e-05 ***
# Residuals 17  51.71    3.04                      

scatter3d(BP ~ Weight + BSA)

predict(model.1, interval="prediction",
        newdata=data.frame(Weight=92))
#       fit     lwr     upr
# 1 112.691 108.938 116.444

predict(model.2, interval="prediction",
        newdata=data.frame(BSA=2))
#        fit      lwr      upr
# 1 114.0689 108.0619 120.0758

predict(model.12, interval="prediction",
        newdata=data.frame(Weight=92, BSA=2))
#        fit      lwr      upr
# 1 112.8794 109.0801 116.6787

detach(bloodpress)

Poverty and teen birth rate (high multicollinearity)

  • Load the poverty data and remove the District of Columbia.
  • Create a scatterplot matrix of the data.
  • Fit a simple linear regression model of PovPct vs Brth15to17.
  • Fit a simple linear regression model of PovPct vs Brth18to19.
  • Fit a multiple linear regression model of PovPct vs Brth15to17 + Brth18to19.
poverty <- read.table("~/path-to-data/poverty.txt", header=T)
poverty <- poverty[poverty$Location!="District_of_Columbia",]
attach(poverty)

pairs(poverty[,c(2:4)])

model.1 <- lm(PovPct ~ Brth15to17)
summary(model.1)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)   4.4871     1.3181   3.404  0.00135 ** 
# Brth15to17    0.3872     0.0572   6.768 1.67e-08 ***
# ---
# Residual standard error: 2.982 on 48 degrees of freedom
# Multiple R-squared:  0.4883,  Adjusted R-squared:  0.4777 
# F-statistic: 45.81 on 1 and 48 DF,  p-value: 1.666e-08

model.2 <- lm(PovPct ~ Brth18to19)
summary(model.2)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)  3.05279    1.83169   1.667    0.102    
# Brth18to19   0.13842    0.02482   5.576 1.11e-06 ***
# ---
# Residual standard error: 3.248 on 48 degrees of freedom
# Multiple R-squared:  0.3931,  Adjusted R-squared:  0.3805 
# F-statistic: 31.09 on 1 and 48 DF,  p-value: 1.106e-06

model.12 <- lm(PovPct ~ Brth15to17 + Brth18to19)
summary(model.12)
#             Estimate Std. Error t value Pr(>|t|)   
# (Intercept)  6.43963    1.95904   3.287  0.00192 **
# Brth15to17   0.63235    0.19178   3.297  0.00186 **
# Brth18to19  -0.10227    0.07642  -1.338  0.18724   
# ---
# Residual standard error: 2.958 on 47 degrees of freedom
# Multiple R-squared:  0.5071,  Adjusted R-squared:  0.4861 
# F-statistic: 24.18 on 2 and 47 DF,  p-value: 6.017e-08

detach(poverty)

Blood pressure (high multicollinearity)

  • Load the bloodpress data.
  • Fit a multiple linear regression model of BP vs Age + Weight + BSA + Dur + Pulse + Stress.
  • Use the vif function in the car package to calculate variance inflation factors.
  • Fit a multiple linear regression model of Weight vs Age + BSA + Dur + Pulse + Stress and confirm the VIF value for Weight as 1/(1-\(R^2\)) for this model.
  • Fit a multiple linear regression model of BP vs Age + Weight + Dur + Stress.
  • Use the vif function in the car package to calculate variance inflation factors.
attach(bloodpress)

model.1 <- lm(BP ~ Age + Weight + BSA + Dur + Pulse + Stress)
summary(model.1)
#               Estimate Std. Error t value Pr(>|t|)    
# (Intercept) -12.870476   2.556650  -5.034 0.000229 ***
# Age           0.703259   0.049606  14.177 2.76e-09 ***
# Weight        0.969920   0.063108  15.369 1.02e-09 ***
# BSA           3.776491   1.580151   2.390 0.032694 *  
# Dur           0.068383   0.048441   1.412 0.181534    
# Pulse        -0.084485   0.051609  -1.637 0.125594    
# Stress        0.005572   0.003412   1.633 0.126491    
# ---
# Residual standard error: 0.4072 on 13 degrees of freedom
# Multiple R-squared:  0.9962,  Adjusted R-squared:  0.9944 
# F-statistic: 560.6 on 6 and 13 DF,  p-value: 6.395e-15

# library(car)
vif(model.1)
#      Age   Weight      BSA      Dur    Pulse   Stress 
# 1.762807 8.417035 5.328751 1.237309 4.413575 1.834845 

model.2 <- lm(Weight ~ Age + BSA + Dur + Pulse + Stress)
summary(model.2)
# Residual standard error: 1.725 on 14 degrees of freedom
# Multiple R-squared:  0.8812,  Adjusted R-squared:  0.8388 
# F-statistic: 20.77 on 5 and 14 DF,  p-value: 5.046e-06

1/(1-summary(model.2)$r.squared) # 8.417035

model.3 <- lm(BP ~ Age + Weight + Dur + Stress)
summary(model.3)
#               Estimate Std. Error t value Pr(>|t|)    
# (Intercept) -15.869829   3.195296  -4.967 0.000169 ***
# Age           0.683741   0.061195  11.173 1.14e-08 ***
# Weight        1.034128   0.032672  31.652 3.76e-15 ***
# Dur           0.039889   0.064486   0.619 0.545485    
# Stress        0.002184   0.003794   0.576 0.573304    
# ---
# Residual standard error: 0.5505 on 15 degrees of freedom
# Multiple R-squared:  0.9919,  Adjusted R-squared:  0.9897 
# F-statistic: 458.3 on 4 and 15 DF,  p-value: 1.764e-15

vif(model.3)
#      Age   Weight      Dur   Stress 
# 1.468245 1.234653 1.200060 1.241117 

detach(bloodpress)

Allen Cognitive Level study (reducing data-based multicollinearity)

  • Load the sampled allentestn23 data.
  • Create a scatterplot matrix of the data.
  • Calculate the correlation between Vocab and Abstract.
  • Fit a multiple linear regression model of ACL vs SDMT + Vocab + Abstract.
  • Use the vif function in the car package to calculate variance inflation factors.
  • Repeat for the full allentest data.
allentestn23 <- read.table("~/path-to-data/allentestn23.txt", header=T)
attach(allentestn23)

pairs(allentestn23[,2:5])

cor(Vocab, Abstract) # 0.9897771

model.1 <- lm(ACL ~ SDMT + Vocab + Abstract)
summary(model.1)
#             Estimate Std. Error t value Pr(>|t|)  
# (Intercept)  3.74711    1.34237   2.791   0.0116 *
# SDMT         0.02326    0.01273   1.827   0.0834 .
# Vocab        0.02825    0.15239   0.185   0.8549  
# Abstract    -0.01379    0.10055  -0.137   0.8924  
# ---
# Residual standard error: 0.7344 on 19 degrees of freedom
# Multiple R-squared:  0.2645,  Adjusted R-squared:  0.1484 
# F-statistic: 2.278 on 3 and 19 DF,  p-value: 0.1124

vif(model.1)
#     SDMT     Vocab  Abstract 
# 1.726185 49.286239 50.603085 

detach(allentestn23)

allentest <- read.table("~/path-to-data/allentest.txt", header=T)
attach(allentest)

pairs(allentest[,2:5])

cor(Vocab, Abstract) # 0.6978405

model.1 <- lm(ACL ~ SDMT + Vocab + Abstract)
summary(model.1)
#              Estimate Std. Error t value Pr(>|t|)    
# (Intercept)  3.946347   0.338069  11.673  < 2e-16 ***
# SDMT         0.027404   0.007168   3.823 0.000298 ***
# Vocab       -0.017397   0.018077  -0.962 0.339428    
# Abstract     0.012182   0.011585   1.051 0.296926    
# ---
# Residual standard error: 0.6878 on 65 degrees of freedom
# Multiple R-squared:  0.2857,  Adjusted R-squared:  0.2528 
# F-statistic: 8.668 on 3 and 65 DF,  p-value: 6.414e-05

vif(model.1)
#     SDMT    Vocab Abstract 
# 1.609662 2.093297 2.167428 

detach(allentest)

Exercise and immunity (reducing structural multicollinearity)

  • Load the exerimmun data.
  • Create a scatterplot of igg vs oxygen.
  • Calculate an oxygen-squared variable named oxygensq.
  • Fit a quadratic regression model of igg vs oxygen + oxygensq.
  • Add a quadratic regression line to the scatterplot.
  • Use the vif function in the car package to calculate variance inflation factors.
  • Create a scatterplot of oxygensq vs oxygen and calculate the correlation.
  • Calculate a centered oxygen variable named oxcent and an oxcent-squared variable named oxcentsq.
  • Fit a quadratic regression model of igg vs oxcent + oxcentsq.
  • Use the vif function in the car package to calculate variance inflation factors.
  • Create a scatterplot of igg vs oxcent with the quadratic regression line added.
  • Fit a simple linear regression model of igg vs oxcent.
  • Confirm the equivalence of the original quadratic and centered quadratic models by transforming the regression parameter estimates.
  • Create a residual vs fits plot for the centered quadratic model.
  • Create a normal probability plot of the residuals for the centered quadratic model.
  • Predict igg for oxygen = 70 using the centered quadratic model.
exerimmun <- read.table("~/path-to-data/exerimmun.txt", header=T)
attach(exerimmun)

plot(oxygen, igg)

oxygensq <- oxygen^2

model.1 <- lm(igg ~ oxygen + oxygensq)

plot(x=oxygen, y=igg,
     panel.last = lines(sort(oxygen), fitted(model.1)[order(oxygen)]))

summary(model.1)
#               Estimate Std. Error t value Pr(>|t|)    
# (Intercept) -1464.4042   411.4012  -3.560  0.00140 ** 
# oxygen         88.3071    16.4735   5.361 1.16e-05 ***
# oxygensq       -0.5362     0.1582  -3.390  0.00217 ** 
# ---
# Residual standard error: 106.4 on 27 degrees of freedom
# Multiple R-squared:  0.9377,  Adjusted R-squared:  0.9331 
# F-statistic: 203.2 on 2 and 27 DF,  p-value: < 2.2e-16

vif(model.1)
#   oxygen oxygensq 
# 99.94261 99.94261 

plot(oxygen, oxygensq)
cor(oxygen, oxygensq) # 0.9949846

oxcent <- oxygen-mean(oxygen)
oxcentsq <- oxcent^2

plot(oxcent, oxcentsq)
cor(oxcent, oxcentsq) # 0.2195179

model.2 <- lm(igg ~ oxcent + oxcentsq)
summary(model.2)
#              Estimate Std. Error t value Pr(>|t|)    
# (Intercept) 1632.1962    29.3486   55.61  < 2e-16 ***
# oxcent        33.9995     1.6890   20.13  < 2e-16 ***
# oxcentsq      -0.5362     0.1582   -3.39  0.00217 ** 
# ---
# Residual standard error: 106.4 on 27 degrees of freedom
# Multiple R-squared:  0.9377,  Adjusted R-squared:  0.9331 
# F-statistic: 203.2 on 2 and 27 DF,  p-value: < 2.2e-16

vif(model.2)
#   oxcent oxcentsq 
# 1.050628 1.050628 

plot(x=oxcent, y=igg,
     panel.last = lines(sort(oxcent), fitted(model.2)[order(oxcent)]))

model.3 <- lm(igg ~ oxcent)
summary(model.3)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept) 1557.633     22.782   68.37  < 2e-16 ***
# oxcent        32.743      1.932   16.95 2.97e-16 ***

coef(model.2)[1]-coef(model.2)[2]*mean(oxygen)+coef(model.2)[3]*mean(oxygen)^2 # -1464.404
coef(model.2)[2]-2*coef(model.2)[3]*mean(oxygen) # 88.3071
coef(model.2)[3] # -0.5362473

coef(model.1)
#   (Intercept)        oxygen      oxygensq 
# -1464.4042284    88.3070970    -0.5362473 

plot(x=fitted(model.2), y=residuals(model.2),
     panel.last = abline(h=0, lty=2))

qqnorm(residuals(model.2), main="", datax=TRUE)
qqline(residuals(model.2), datax=TRUE)

predict(model.2, interval="prediction",
        newdata=data.frame(oxcent=70-mean(oxygen), oxcentsq=(70-mean(oxygen))^2))
#        fit      lwr      upr
# 1 2089.481 1849.931 2329.031

detach(exerimmun)