Software Help 12

Software Help 12

   

The next two pages cover the Minitab and R commands for the procedures in this lesson.

Below is a zip file that contains all the data sets used in this lesson:

STAT501_Lesson12.zip

  • allentest.txt
  • allentestn23.txt
  • bloodpress.txt
  • cement.txt
  • exerimmun.txt
  • poverty.txt
  • uncorrelated.txt
  • uncorrpreds.txt

Minitab Help 12: Multicollinearity

Minitab Help 12: Multicollinearity

Minitab®

Blood pressure (multicollinearity)

Uncorrelated predictors (no multicollinearity)

Blood pressure (predictors with almost no multicollinearity)

Blood pressure (predictors with high multicollinearity)

Poverty and teen birth rate (high multicollinearity)

Blood pressure (high multicollinearity)

Allen Cognitive Level study (reducing data-based multicollinearity)

Exercise and immunity (reducing structural multicollinearity)


R Help 12: Multicollinearity

R Help 12: Multicollinearity

R Help

Blood pressure (multicollinearity)

  • Load the bloodpress data.
  • Create scatterplot matrices of the data.
  • Calculate correlations between the variables.
bloodpress <- read.table("~/path-to-data/bloodpress.txt", header=T)
attach(bloodpress)

pairs(bloodpress[,c(2:5)])
pairs(bloodpress[,c(2,6:8)])

round(cor(bloodpress[,c(2:8)]),3)
#           BP   Age Weight   BSA   Dur Pulse Stress
# BP     1.000 0.659  0.950 0.866 0.293 0.721  0.164
# Age    0.659 1.000  0.407 0.378 0.344 0.619  0.368
# Weight 0.950 0.407  1.000 0.875 0.201 0.659  0.034
# BSA    0.866 0.378  0.875 1.000 0.131 0.465  0.018
# Dur    0.293 0.344  0.201 0.131 1.000 0.402  0.312
# Pulse  0.721 0.619  0.659 0.465 0.402 1.000  0.506
# Stress 0.164 0.368  0.034 0.018 0.312 0.506  1.000

detach(bloodpress)

Uncorrelated predictors (no multicollinearity)

  • Load the uncorrpreds data.
  • Create a scatterplot matrix of the data.
  • Calculate the correlation between the predictors.
  • Fit a simple linear regression model of y vs \(x_1\).
  • Fit a simple linear regression model of y vs \(x_2\).
  • Fit a multiple linear regression model of y vs \(x_1\) + \(x_2\).
  • Fit a multiple linear regression model of y vs \(x_2\) + \(x_1\).
  • Use the scatter3d function in the car package to create a 3D scatterplot of the data with the fitted plane for a multiple linear regression model of y vs \(x_1\) + \(x_2\).
uncorrpreds <- read.table("~/path-to-data/uncorrpreds.txt", header=T)
attach(uncorrpreds)

pairs(uncorrpreds)

cor(x1,x2) # 0

model.1 <- lm(y ~ x1)
summary(model.1)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)   49.500      4.655  10.634 4.07e-05 ***
# x1            -1.000      1.472  -0.679    0.522    
anova(model.1)
#           Df Sum Sq Mean Sq F value Pr(>F)
# x1         1      8   8.000  0.4615 0.5222
# Residuals  6    104  17.333               

model.2 <- lm(y ~ x2)
summary(model.2)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)   57.000      8.213   6.940 0.000444 ***
# x2            -1.750      1.350  -1.296 0.242545    
anova(model.2)
#           Df Sum Sq Mean Sq F value Pr(>F)
# x2         1   24.5  24.500    1.68 0.2425
# Residuals  6   87.5  14.583               

model.12 <- lm(y ~ x1 + x2)
summary(model.12)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)   60.000      9.562   6.275  0.00151 **
# x1            -1.000      1.410  -0.709  0.50982   
# x2            -1.750      1.410  -1.241  0.26954   
anova(model.12)
#           Df Sum Sq Mean Sq F value Pr(>F)
# x1         1    8.0     8.0  0.5031 0.5098
# x2         1   24.5    24.5  1.5409 0.2695
# Residuals  5   79.5    15.9               

model.21 <- lm(y ~ x2 + x1)
summary(model.21)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)   60.000      9.562   6.275  0.00151 **
# x2            -1.750      1.410  -1.241  0.26954   
# x1            -1.000      1.410  -0.709  0.50982   
anova(model.21)
#           Df Sum Sq Mean Sq F value Pr(>F)
# x2         1   24.5    24.5  1.5409 0.2695
# x1         1    8.0     8.0  0.5031 0.5098
# Residuals  5   79.5    15.9               

# library(car)
scatter3d(y ~ x1 + x2)

detach(uncorrpreds)

Blood pressure (predictors with almost no multicollinearity)

  • Load the bloodpress data.
  • Create a scatterplot matrix of the data.
  • Fit a simple linear regression model of BP vs Stress.
  • Fit a simple linear regression model of BP vs BSA.
  • Fit a multiple linear regression model of BP vs Stress + BSA.
  • Fit a multiple linear regression model of BP vs BSA + Stress.
  • Use the scatter3d function in the car package to create a 3D scatterplot of the data with the fitted plane for a multiple linear regression model of BP vs Stress + BSA.
attach(bloodpress)

pairs(bloodpress[,c(2,5,8)])

model.1 <- lm(BP ~ Stress)
summary(model.1)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept) 112.71997    2.19345  51.389   <2e-16 ***
# Stress        0.02399    0.03404   0.705     0.49    
anova(model.1)
#           Df Sum Sq Mean Sq F value Pr(>F)
# Stress     1  15.04  15.044  0.4969 0.4899
# Residuals 18 544.96  30.275               

model.2 <- lm(BP ~ BSA)
summary(model.2)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)   45.183      9.392   4.811  0.00014 ***
# BSA           34.443      4.690   7.343 8.11e-07 ***
anova(model.2)
#           Df Sum Sq Mean Sq F value Pr(>F)
# BSA        1 419.86  419.86  53.927 8.114e-07 ***
# Residuals 18 140.14    7.79                      

model.12 <- lm(BP ~ Stress + BSA)
summary(model.12)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept) 44.24452    9.26104   4.777 0.000175 ***
# Stress       0.02166    0.01697   1.277 0.218924    
# BSA         34.33423    4.61110   7.446 9.56e-07 ***
anova(model.12)
#           Df Sum Sq Mean Sq F value Pr(>F)
# Stress     1  15.04   15.04  1.9998    0.1754    
# BSA        1 417.07  417.07 55.4430 9.561e-07 ***
# Residuals 17 127.88    7.52                      

model.21 <- lm(BP ~ BSA + Stress)
summary(model.21)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept) 44.24452    9.26104   4.777 0.000175 ***
# BSA         34.33423    4.61110   7.446 9.56e-07 ***
# Stress       0.02166    0.01697   1.277 0.218924    
anova(model.21)
#           Df Sum Sq Mean Sq F value Pr(>F)
# BSA        1 419.86  419.86 55.8132 9.149e-07 ***
# Stress     1  12.26   12.26  1.6296    0.2189    
# Residuals 17 127.88    7.52                      

scatter3d(BP ~ Stress + BSA)

detach(bloodpress)

Blood pressure (predictors with high multicollinearity)

  • Load the bloodpress data.
  • Create a scatterplot matrix of the data.
  • Fit a simple linear regression model of BP vs Weight.
  • Fit a simple linear regression model of BP vs BSA.
  • Fit a multiple linear regression model of BP vs Weight + BSA.
  • Fit a multiple linear regression model of BP vs BSA + Weight.
  • Use the scatter3d function in the car package to create a 3D scatterplot of the data with the fitted plane for a multiple linear regression model of BP vs Weight + BSA.
  • Predict BP for Weight=92 and BSA=2 for the two simple linear regression models and the multiple linear regression model.
attach(bloodpress)

pairs(bloodpress[,c(2,5,4)])

model.1 <- lm(BP ~ Weight)
summary(model.1)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)  2.20531    8.66333   0.255    0.802    
# Weight       1.20093    0.09297  12.917 1.53e-10 ***
anova(model.1)
#           Df Sum Sq Mean Sq F value Pr(>F)
# Weight     1 505.47  505.47  166.86 1.528e-10 ***
# Residuals 18  54.53    3.03                      

model.2 <- lm(BP ~ BSA)
summary(model.2)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)   45.183      9.392   4.811  0.00014 ***
# BSA           34.443      4.690   7.343 8.11e-07 ***
anova(model.2)
#           Df Sum Sq Mean Sq F value Pr(>F)
# BSA        1 419.86  419.86  53.927 8.114e-07 ***
# Residuals 18 140.14    7.79                      

model.12 <- lm(BP ~ Weight + BSA)
summary(model.12)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)   5.6534     9.3925   0.602    0.555    
# Weight        1.0387     0.1927   5.392 4.87e-05 ***
# BSA           5.8313     6.0627   0.962    0.350    
anova(model.12)
#           Df Sum Sq Mean Sq F value Pr(>F)
# Weight     1 505.47  505.47 166.1648 3.341e-10 ***
# BSA        1   2.81    2.81   0.9251    0.3496    
# Residuals 17  51.71    3.04                       

model.21 <- lm(BP ~ BSA + Weight)
summary(model.21)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)   5.6534     9.3925   0.602    0.555    
# BSA           5.8313     6.0627   0.962    0.350    
# Weight        1.0387     0.1927   5.392 4.87e-05 ***
anova(model.21)
#           Df Sum Sq Mean Sq F value Pr(>F)
# BSA        1 419.86  419.86 138.021 1.391e-09 ***
# Weight     1  88.43   88.43  29.069 4.871e-05 ***
# Residuals 17  51.71    3.04                      

scatter3d(BP ~ Weight + BSA)

predict(model.1, interval="prediction",
        newdata=data.frame(Weight=92))
#       fit     lwr     upr
# 1 112.691 108.938 116.444

predict(model.2, interval="prediction",
        newdata=data.frame(BSA=2))
#        fit      lwr      upr
# 1 114.0689 108.0619 120.0758

predict(model.12, interval="prediction",
        newdata=data.frame(Weight=92, BSA=2))
#        fit      lwr      upr
# 1 112.8794 109.0801 116.6787

detach(bloodpress)

Poverty and teen birth rate (high multicollinearity)

  • Load the poverty data and remove the District of Columbia.
  • Create a scatterplot matrix of the data.
  • Fit a simple linear regression model of PovPct vs Brth15to17.
  • Fit a simple linear regression model of PovPct vs Brth18to19.
  • Fit a multiple linear regression model of PovPct vs Brth15to17 + Brth18to19.
poverty <- read.table("~/path-to-data/poverty.txt", header=T)
poverty <- poverty[poverty$Location!="District_of_Columbia",]
attach(poverty)

pairs(poverty[,c(2:4)])

model.1 <- lm(PovPct ~ Brth15to17)
summary(model.1)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)   4.4871     1.3181   3.404  0.00135 ** 
# Brth15to17    0.3872     0.0572   6.768 1.67e-08 ***
# ---
# Residual standard error: 2.982 on 48 degrees of freedom
# Multiple R-squared:  0.4883,  Adjusted R-squared:  0.4777 
# F-statistic: 45.81 on 1 and 48 DF,  p-value: 1.666e-08

model.2 <- lm(PovPct ~ Brth18to19)
summary(model.2)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)  3.05279    1.83169   1.667    0.102    
# Brth18to19   0.13842    0.02482   5.576 1.11e-06 ***
# ---
# Residual standard error: 3.248 on 48 degrees of freedom
# Multiple R-squared:  0.3931,  Adjusted R-squared:  0.3805 
# F-statistic: 31.09 on 1 and 48 DF,  p-value: 1.106e-06

model.12 <- lm(PovPct ~ Brth15to17 + Brth18to19)
summary(model.12)
#             Estimate Std. Error t value Pr(>|t|)   
# (Intercept)  6.43963    1.95904   3.287  0.00192 **
# Brth15to17   0.63235    0.19178   3.297  0.00186 **
# Brth18to19  -0.10227    0.07642  -1.338  0.18724   
# ---
# Residual standard error: 2.958 on 47 degrees of freedom
# Multiple R-squared:  0.5071,  Adjusted R-squared:  0.4861 
# F-statistic: 24.18 on 2 and 47 DF,  p-value: 6.017e-08

detach(poverty)

Blood pressure (high multicollinearity)

  • Load the bloodpress data.
  • Fit a multiple linear regression model of BP vs Age + Weight + BSA + Dur + Pulse + Stress.
  • Use the vif function in the car package to calculate variance inflation factors.
  • Fit a multiple linear regression model of Weight vs Age + BSA + Dur + Pulse + Stress and confirm the VIF value for Weight as 1/(1-\(R^2\)) for this model.
  • Fit a multiple linear regression model of BP vs Age + Weight + Dur + Stress.
  • Use the vif function in the car package to calculate variance inflation factors.
attach(bloodpress)

model.1 <- lm(BP ~ Age + Weight + BSA + Dur + Pulse + Stress)
summary(model.1)
#               Estimate Std. Error t value Pr(>|t|)    
# (Intercept) -12.870476   2.556650  -5.034 0.000229 ***
# Age           0.703259   0.049606  14.177 2.76e-09 ***
# Weight        0.969920   0.063108  15.369 1.02e-09 ***
# BSA           3.776491   1.580151   2.390 0.032694 *  
# Dur           0.068383   0.048441   1.412 0.181534    
# Pulse        -0.084485   0.051609  -1.637 0.125594    
# Stress        0.005572   0.003412   1.633 0.126491    
# ---
# Residual standard error: 0.4072 on 13 degrees of freedom
# Multiple R-squared:  0.9962,  Adjusted R-squared:  0.9944 
# F-statistic: 560.6 on 6 and 13 DF,  p-value: 6.395e-15

# library(car)
vif(model.1)
#      Age   Weight      BSA      Dur    Pulse   Stress 
# 1.762807 8.417035 5.328751 1.237309 4.413575 1.834845 

model.2 <- lm(Weight ~ Age + BSA + Dur + Pulse + Stress)
summary(model.2)
# Residual standard error: 1.725 on 14 degrees of freedom
# Multiple R-squared:  0.8812,  Adjusted R-squared:  0.8388 
# F-statistic: 20.77 on 5 and 14 DF,  p-value: 5.046e-06

1/(1-summary(model.2)$r.squared) # 8.417035

model.3 <- lm(BP ~ Age + Weight + Dur + Stress)
summary(model.3)
#               Estimate Std. Error t value Pr(>|t|)    
# (Intercept) -15.869829   3.195296  -4.967 0.000169 ***
# Age           0.683741   0.061195  11.173 1.14e-08 ***
# Weight        1.034128   0.032672  31.652 3.76e-15 ***
# Dur           0.039889   0.064486   0.619 0.545485    
# Stress        0.002184   0.003794   0.576 0.573304    
# ---
# Residual standard error: 0.5505 on 15 degrees of freedom
# Multiple R-squared:  0.9919,  Adjusted R-squared:  0.9897 
# F-statistic: 458.3 on 4 and 15 DF,  p-value: 1.764e-15

vif(model.3)
#      Age   Weight      Dur   Stress 
# 1.468245 1.234653 1.200060 1.241117 

detach(bloodpress)

Allen Cognitive Level study (reducing data-based multicollinearity)

  • Load the sampled allentestn23 data.
  • Create a scatterplot matrix of the data.
  • Calculate the correlation between Vocab and Abstract.
  • Fit a multiple linear regression model of ACL vs SDMT + Vocab + Abstract.
  • Use the vif function in the car package to calculate variance inflation factors.
  • Repeat for the full allentest data.
allentestn23 <- read.table("~/path-to-data/allentestn23.txt", header=T)
attach(allentestn23)

pairs(allentestn23[,2:5])

cor(Vocab, Abstract) # 0.9897771

model.1 <- lm(ACL ~ SDMT + Vocab + Abstract)
summary(model.1)
#             Estimate Std. Error t value Pr(>|t|)  
# (Intercept)  3.74711    1.34237   2.791   0.0116 *
# SDMT         0.02326    0.01273   1.827   0.0834 .
# Vocab        0.02825    0.15239   0.185   0.8549  
# Abstract    -0.01379    0.10055  -0.137   0.8924  
# ---
# Residual standard error: 0.7344 on 19 degrees of freedom
# Multiple R-squared:  0.2645,  Adjusted R-squared:  0.1484 
# F-statistic: 2.278 on 3 and 19 DF,  p-value: 0.1124

vif(model.1)
#     SDMT     Vocab  Abstract 
# 1.726185 49.286239 50.603085 

detach(allentestn23)

allentest <- read.table("~/path-to-data/allentest.txt", header=T)
attach(allentest)

pairs(allentest[,2:5])

cor(Vocab, Abstract) # 0.6978405

model.1 <- lm(ACL ~ SDMT + Vocab + Abstract)
summary(model.1)
#              Estimate Std. Error t value Pr(>|t|)    
# (Intercept)  3.946347   0.338069  11.673  < 2e-16 ***
# SDMT         0.027404   0.007168   3.823 0.000298 ***
# Vocab       -0.017397   0.018077  -0.962 0.339428    
# Abstract     0.012182   0.011585   1.051 0.296926    
# ---
# Residual standard error: 0.6878 on 65 degrees of freedom
# Multiple R-squared:  0.2857,  Adjusted R-squared:  0.2528 
# F-statistic: 8.668 on 3 and 65 DF,  p-value: 6.414e-05

vif(model.1)
#     SDMT    Vocab Abstract 
# 1.609662 2.093297 2.167428 

detach(allentest)

Exercise and immunity (reducing structural multicollinearity)

  • Load the exerimmun data.
  • Create a scatterplot of igg vs oxygen.
  • Calculate an oxygen-squared variable named oxygensq.
  • Fit a quadratic regression model of igg vs oxygen + oxygensq.
  • Add a quadratic regression line to the scatterplot.
  • Use the vif function in the car package to calculate variance inflation factors.
  • Create a scatterplot of oxygensq vs oxygen and calculate the correlation.
  • Calculate a centered oxygen variable named oxcent and an oxcent-squared variable named oxcentsq.
  • Fit a quadratic regression model of igg vs oxcent + oxcentsq.
  • Use the vif function in the car package to calculate variance inflation factors.
  • Create a scatterplot of igg vs oxcent with the quadratic regression line added.
  • Fit a simple linear regression model of igg vs oxcent.
  • Confirm the equivalence of the original quadratic and centered quadratic models by transforming the regression parameter estimates.
  • Create a residual vs fits plot for the centered quadratic model.
  • Create a normal probability plot of the residuals for the centered quadratic model.
  • Predict igg for oxygen = 70 using the centered quadratic model.
exerimmun <- read.table("~/path-to-data/exerimmun.txt", header=T)
attach(exerimmun)

plot(oxygen, igg)

oxygensq <- oxygen^2

model.1 <- lm(igg ~ oxygen + oxygensq)

plot(x=oxygen, y=igg,
     panel.last = lines(sort(oxygen), fitted(model.1)[order(oxygen)]))

summary(model.1)
#               Estimate Std. Error t value Pr(>|t|)    
# (Intercept) -1464.4042   411.4012  -3.560  0.00140 ** 
# oxygen         88.3071    16.4735   5.361 1.16e-05 ***
# oxygensq       -0.5362     0.1582  -3.390  0.00217 ** 
# ---
# Residual standard error: 106.4 on 27 degrees of freedom
# Multiple R-squared:  0.9377,  Adjusted R-squared:  0.9331 
# F-statistic: 203.2 on 2 and 27 DF,  p-value: < 2.2e-16

vif(model.1)
#   oxygen oxygensq 
# 99.94261 99.94261 

plot(oxygen, oxygensq)
cor(oxygen, oxygensq) # 0.9949846

oxcent <- oxygen-mean(oxygen)
oxcentsq <- oxcent^2

plot(oxcent, oxcentsq)
cor(oxcent, oxcentsq) # 0.2195179

model.2 <- lm(igg ~ oxcent + oxcentsq)
summary(model.2)
#              Estimate Std. Error t value Pr(>|t|)    
# (Intercept) 1632.1962    29.3486   55.61  < 2e-16 ***
# oxcent        33.9995     1.6890   20.13  < 2e-16 ***
# oxcentsq      -0.5362     0.1582   -3.39  0.00217 ** 
# ---
# Residual standard error: 106.4 on 27 degrees of freedom
# Multiple R-squared:  0.9377,  Adjusted R-squared:  0.9331 
# F-statistic: 203.2 on 2 and 27 DF,  p-value: < 2.2e-16

vif(model.2)
#   oxcent oxcentsq 
# 1.050628 1.050628 

plot(x=oxcent, y=igg,
     panel.last = lines(sort(oxcent), fitted(model.2)[order(oxcent)]))

model.3 <- lm(igg ~ oxcent)
summary(model.3)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept) 1557.633     22.782   68.37  < 2e-16 ***
# oxcent        32.743      1.932   16.95 2.97e-16 ***

coef(model.2)[1]-coef(model.2)[2]*mean(oxygen)+coef(model.2)[3]*mean(oxygen)^2 # -1464.404
coef(model.2)[2]-2*coef(model.2)[3]*mean(oxygen) # 88.3071
coef(model.2)[3] # -0.5362473

coef(model.1)
#   (Intercept)        oxygen      oxygensq 
# -1464.4042284    88.3070970    -0.5362473 

plot(x=fitted(model.2), y=residuals(model.2),
     panel.last = abline(h=0, lty=2))

qqnorm(residuals(model.2), main="", datax=TRUE)
qqline(residuals(model.2), datax=TRUE)

predict(model.2, interval="prediction",
        newdata=data.frame(oxcent=70-mean(oxygen), oxcentsq=(70-mean(oxygen))^2))
#        fit      lwr      upr
# 1 2089.481 1849.931 2329.031

detach(exerimmun)

Legend
[1]Link
Has Tooltip/Popover
 Toggleable Visibility