Blood pressure (multicollinearity)
- Load the bloodpress data.
- Create scatterplot matrices of the data.
- Calculate correlations between the variables.
bloodpress <- read.table("~/path-to-data/bloodpress.txt", header=T)
attach(bloodpress)
pairs(bloodpress[,c(2:5)])
pairs(bloodpress[,c(2,6:8)])
round(cor(bloodpress[,c(2:8)]),3)
# BP Age Weight BSA Dur Pulse Stress
# BP 1.000 0.659 0.950 0.866 0.293 0.721 0.164
# Age 0.659 1.000 0.407 0.378 0.344 0.619 0.368
# Weight 0.950 0.407 1.000 0.875 0.201 0.659 0.034
# BSA 0.866 0.378 0.875 1.000 0.131 0.465 0.018
# Dur 0.293 0.344 0.201 0.131 1.000 0.402 0.312
# Pulse 0.721 0.619 0.659 0.465 0.402 1.000 0.506
# Stress 0.164 0.368 0.034 0.018 0.312 0.506 1.000
detach(bloodpress)
Uncorrelated predictors (no multicollinearity)
- Load the uncorrpreds data.
- Create a scatterplot matrix of the data.
- Calculate the correlation between the predictors.
- Fit a simple linear regression model of y vs \(x_1\).
- Fit a simple linear regression model of y vs \(x_2\).
- Fit a multiple linear regression model of y vs \(x_1\) + \(x_2\).
- Fit a multiple linear regression model of y vs \(x_2\) + \(x_1\).
- Use the
scatter3d
function in the car
package to create a 3D scatterplot of the data with the fitted plane for a multiple linear regression model of y vs \(x_1\) + \(x_2\).
uncorrpreds <- read.table("~/path-to-data/uncorrpreds.txt", header=T)
attach(uncorrpreds)
pairs(uncorrpreds)
cor(x1,x2) # 0
model.1 <- lm(y ~ x1)
summary(model.1)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 49.500 4.655 10.634 4.07e-05 ***
# x1 -1.000 1.472 -0.679 0.522
anova(model.1)
# Df Sum Sq Mean Sq F value Pr(>F)
# x1 1 8 8.000 0.4615 0.5222
# Residuals 6 104 17.333
model.2 <- lm(y ~ x2)
summary(model.2)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 57.000 8.213 6.940 0.000444 ***
# x2 -1.750 1.350 -1.296 0.242545
anova(model.2)
# Df Sum Sq Mean Sq F value Pr(>F)
# x2 1 24.5 24.500 1.68 0.2425
# Residuals 6 87.5 14.583
model.12 <- lm(y ~ x1 + x2)
summary(model.12)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 60.000 9.562 6.275 0.00151 **
# x1 -1.000 1.410 -0.709 0.50982
# x2 -1.750 1.410 -1.241 0.26954
anova(model.12)
# Df Sum Sq Mean Sq F value Pr(>F)
# x1 1 8.0 8.0 0.5031 0.5098
# x2 1 24.5 24.5 1.5409 0.2695
# Residuals 5 79.5 15.9
model.21 <- lm(y ~ x2 + x1)
summary(model.21)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 60.000 9.562 6.275 0.00151 **
# x2 -1.750 1.410 -1.241 0.26954
# x1 -1.000 1.410 -0.709 0.50982
anova(model.21)
# Df Sum Sq Mean Sq F value Pr(>F)
# x2 1 24.5 24.5 1.5409 0.2695
# x1 1 8.0 8.0 0.5031 0.5098
# Residuals 5 79.5 15.9
# library(car)
scatter3d(y ~ x1 + x2)
detach(uncorrpreds)
Blood pressure (predictors with almost no multicollinearity)
- Load the bloodpress data.
- Create a scatterplot matrix of the data.
- Fit a simple linear regression model of BP vs Stress.
- Fit a simple linear regression model of BP vs BSA.
- Fit a multiple linear regression model of BP vs Stress + BSA.
- Fit a multiple linear regression model of BP vs BSA + Stress.
- Use the
scatter3d
function in the car
package to create a 3D scatterplot of the data with the fitted plane for a multiple linear regression model of BP vs Stress + BSA.
attach(bloodpress)
pairs(bloodpress[,c(2,5,8)])
model.1 <- lm(BP ~ Stress)
summary(model.1)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 112.71997 2.19345 51.389 <2e-16 ***
# Stress 0.02399 0.03404 0.705 0.49
anova(model.1)
# Df Sum Sq Mean Sq F value Pr(>F)
# Stress 1 15.04 15.044 0.4969 0.4899
# Residuals 18 544.96 30.275
model.2 <- lm(BP ~ BSA)
summary(model.2)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 45.183 9.392 4.811 0.00014 ***
# BSA 34.443 4.690 7.343 8.11e-07 ***
anova(model.2)
# Df Sum Sq Mean Sq F value Pr(>F)
# BSA 1 419.86 419.86 53.927 8.114e-07 ***
# Residuals 18 140.14 7.79
model.12 <- lm(BP ~ Stress + BSA)
summary(model.12)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 44.24452 9.26104 4.777 0.000175 ***
# Stress 0.02166 0.01697 1.277 0.218924
# BSA 34.33423 4.61110 7.446 9.56e-07 ***
anova(model.12)
# Df Sum Sq Mean Sq F value Pr(>F)
# Stress 1 15.04 15.04 1.9998 0.1754
# BSA 1 417.07 417.07 55.4430 9.561e-07 ***
# Residuals 17 127.88 7.52
model.21 <- lm(BP ~ BSA + Stress)
summary(model.21)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 44.24452 9.26104 4.777 0.000175 ***
# BSA 34.33423 4.61110 7.446 9.56e-07 ***
# Stress 0.02166 0.01697 1.277 0.218924
anova(model.21)
# Df Sum Sq Mean Sq F value Pr(>F)
# BSA 1 419.86 419.86 55.8132 9.149e-07 ***
# Stress 1 12.26 12.26 1.6296 0.2189
# Residuals 17 127.88 7.52
scatter3d(BP ~ Stress + BSA)
detach(bloodpress)
Blood pressure (predictors with high multicollinearity)
- Load the bloodpress data.
- Create a scatterplot matrix of the data.
- Fit a simple linear regression model of BP vs Weight.
- Fit a simple linear regression model of BP vs BSA.
- Fit a multiple linear regression model of BP vs Weight + BSA.
- Fit a multiple linear regression model of BP vs BSA + Weight.
- Use the
scatter3d
function in the car
package to create a 3D scatterplot of the data with the fitted plane for a multiple linear regression model of BP vs Weight + BSA.
- Predict BP for Weight=92 and BSA=2 for the two simple linear regression models and the multiple linear regression model.
attach(bloodpress)
pairs(bloodpress[,c(2,5,4)])
model.1 <- lm(BP ~ Weight)
summary(model.1)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 2.20531 8.66333 0.255 0.802
# Weight 1.20093 0.09297 12.917 1.53e-10 ***
anova(model.1)
# Df Sum Sq Mean Sq F value Pr(>F)
# Weight 1 505.47 505.47 166.86 1.528e-10 ***
# Residuals 18 54.53 3.03
model.2 <- lm(BP ~ BSA)
summary(model.2)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 45.183 9.392 4.811 0.00014 ***
# BSA 34.443 4.690 7.343 8.11e-07 ***
anova(model.2)
# Df Sum Sq Mean Sq F value Pr(>F)
# BSA 1 419.86 419.86 53.927 8.114e-07 ***
# Residuals 18 140.14 7.79
model.12 <- lm(BP ~ Weight + BSA)
summary(model.12)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 5.6534 9.3925 0.602 0.555
# Weight 1.0387 0.1927 5.392 4.87e-05 ***
# BSA 5.8313 6.0627 0.962 0.350
anova(model.12)
# Df Sum Sq Mean Sq F value Pr(>F)
# Weight 1 505.47 505.47 166.1648 3.341e-10 ***
# BSA 1 2.81 2.81 0.9251 0.3496
# Residuals 17 51.71 3.04
model.21 <- lm(BP ~ BSA + Weight)
summary(model.21)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 5.6534 9.3925 0.602 0.555
# BSA 5.8313 6.0627 0.962 0.350
# Weight 1.0387 0.1927 5.392 4.87e-05 ***
anova(model.21)
# Df Sum Sq Mean Sq F value Pr(>F)
# BSA 1 419.86 419.86 138.021 1.391e-09 ***
# Weight 1 88.43 88.43 29.069 4.871e-05 ***
# Residuals 17 51.71 3.04
scatter3d(BP ~ Weight + BSA)
predict(model.1, interval="prediction",
newdata=data.frame(Weight=92))
# fit lwr upr
# 1 112.691 108.938 116.444
predict(model.2, interval="prediction",
newdata=data.frame(BSA=2))
# fit lwr upr
# 1 114.0689 108.0619 120.0758
predict(model.12, interval="prediction",
newdata=data.frame(Weight=92, BSA=2))
# fit lwr upr
# 1 112.8794 109.0801 116.6787
detach(bloodpress)
Poverty and teen birth rate (high multicollinearity)
- Load the poverty data and remove the District of Columbia.
- Create a scatterplot matrix of the data.
- Fit a simple linear regression model of PovPct vs Brth15to17.
- Fit a simple linear regression model of PovPct vs Brth18to19.
- Fit a multiple linear regression model of PovPct vs Brth15to17 + Brth18to19.
poverty <- read.table("~/path-to-data/poverty.txt", header=T)
poverty <- poverty[poverty$Location!="District_of_Columbia",]
attach(poverty)
pairs(poverty[,c(2:4)])
model.1 <- lm(PovPct ~ Brth15to17)
summary(model.1)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 4.4871 1.3181 3.404 0.00135 **
# Brth15to17 0.3872 0.0572 6.768 1.67e-08 ***
# ---
# Residual standard error: 2.982 on 48 degrees of freedom
# Multiple R-squared: 0.4883, Adjusted R-squared: 0.4777
# F-statistic: 45.81 on 1 and 48 DF, p-value: 1.666e-08
model.2 <- lm(PovPct ~ Brth18to19)
summary(model.2)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 3.05279 1.83169 1.667 0.102
# Brth18to19 0.13842 0.02482 5.576 1.11e-06 ***
# ---
# Residual standard error: 3.248 on 48 degrees of freedom
# Multiple R-squared: 0.3931, Adjusted R-squared: 0.3805
# F-statistic: 31.09 on 1 and 48 DF, p-value: 1.106e-06
model.12 <- lm(PovPct ~ Brth15to17 + Brth18to19)
summary(model.12)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 6.43963 1.95904 3.287 0.00192 **
# Brth15to17 0.63235 0.19178 3.297 0.00186 **
# Brth18to19 -0.10227 0.07642 -1.338 0.18724
# ---
# Residual standard error: 2.958 on 47 degrees of freedom
# Multiple R-squared: 0.5071, Adjusted R-squared: 0.4861
# F-statistic: 24.18 on 2 and 47 DF, p-value: 6.017e-08
detach(poverty)
Blood pressure (high multicollinearity)
- Load the bloodpress data.
- Fit a multiple linear regression model of BP vs Age + Weight + BSA + Dur + Pulse + Stress.
- Use the
vif
function in the car
package to calculate variance inflation factors.
- Fit a multiple linear regression model of Weight vs Age + BSA + Dur + Pulse + Stress and confirm the VIF value for Weight as 1/(1-\(R^2\)) for this model.
- Fit a multiple linear regression model of BP vs Age + Weight + Dur + Stress.
- Use the
vif
function in the car
package to calculate variance inflation factors.
attach(bloodpress)
model.1 <- lm(BP ~ Age + Weight + BSA + Dur + Pulse + Stress)
summary(model.1)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) -12.870476 2.556650 -5.034 0.000229 ***
# Age 0.703259 0.049606 14.177 2.76e-09 ***
# Weight 0.969920 0.063108 15.369 1.02e-09 ***
# BSA 3.776491 1.580151 2.390 0.032694 *
# Dur 0.068383 0.048441 1.412 0.181534
# Pulse -0.084485 0.051609 -1.637 0.125594
# Stress 0.005572 0.003412 1.633 0.126491
# ---
# Residual standard error: 0.4072 on 13 degrees of freedom
# Multiple R-squared: 0.9962, Adjusted R-squared: 0.9944
# F-statistic: 560.6 on 6 and 13 DF, p-value: 6.395e-15
# library(car)
vif(model.1)
# Age Weight BSA Dur Pulse Stress
# 1.762807 8.417035 5.328751 1.237309 4.413575 1.834845
model.2 <- lm(Weight ~ Age + BSA + Dur + Pulse + Stress)
summary(model.2)
# Residual standard error: 1.725 on 14 degrees of freedom
# Multiple R-squared: 0.8812, Adjusted R-squared: 0.8388
# F-statistic: 20.77 on 5 and 14 DF, p-value: 5.046e-06
1/(1-summary(model.2)$r.squared) # 8.417035
model.3 <- lm(BP ~ Age + Weight + Dur + Stress)
summary(model.3)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) -15.869829 3.195296 -4.967 0.000169 ***
# Age 0.683741 0.061195 11.173 1.14e-08 ***
# Weight 1.034128 0.032672 31.652 3.76e-15 ***
# Dur 0.039889 0.064486 0.619 0.545485
# Stress 0.002184 0.003794 0.576 0.573304
# ---
# Residual standard error: 0.5505 on 15 degrees of freedom
# Multiple R-squared: 0.9919, Adjusted R-squared: 0.9897
# F-statistic: 458.3 on 4 and 15 DF, p-value: 1.764e-15
vif(model.3)
# Age Weight Dur Stress
# 1.468245 1.234653 1.200060 1.241117
detach(bloodpress)
Allen Cognitive Level study (reducing data-based multicollinearity)
- Load the sampled allentestn23 data.
- Create a scatterplot matrix of the data.
- Calculate the correlation between Vocab and Abstract.
- Fit a multiple linear regression model of ACL vs SDMT + Vocab + Abstract.
- Use the
vif
function in the car
package to calculate variance inflation factors.
- Repeat for the full allentest data.
allentestn23 <- read.table("~/path-to-data/allentestn23.txt", header=T)
attach(allentestn23)
pairs(allentestn23[,2:5])
cor(Vocab, Abstract) # 0.9897771
model.1 <- lm(ACL ~ SDMT + Vocab + Abstract)
summary(model.1)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 3.74711 1.34237 2.791 0.0116 *
# SDMT 0.02326 0.01273 1.827 0.0834 .
# Vocab 0.02825 0.15239 0.185 0.8549
# Abstract -0.01379 0.10055 -0.137 0.8924
# ---
# Residual standard error: 0.7344 on 19 degrees of freedom
# Multiple R-squared: 0.2645, Adjusted R-squared: 0.1484
# F-statistic: 2.278 on 3 and 19 DF, p-value: 0.1124
vif(model.1)
# SDMT Vocab Abstract
# 1.726185 49.286239 50.603085
detach(allentestn23)
allentest <- read.table("~/path-to-data/allentest.txt", header=T)
attach(allentest)
pairs(allentest[,2:5])
cor(Vocab, Abstract) # 0.6978405
model.1 <- lm(ACL ~ SDMT + Vocab + Abstract)
summary(model.1)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 3.946347 0.338069 11.673 < 2e-16 ***
# SDMT 0.027404 0.007168 3.823 0.000298 ***
# Vocab -0.017397 0.018077 -0.962 0.339428
# Abstract 0.012182 0.011585 1.051 0.296926
# ---
# Residual standard error: 0.6878 on 65 degrees of freedom
# Multiple R-squared: 0.2857, Adjusted R-squared: 0.2528
# F-statistic: 8.668 on 3 and 65 DF, p-value: 6.414e-05
vif(model.1)
# SDMT Vocab Abstract
# 1.609662 2.093297 2.167428
detach(allentest)
Exercise and immunity (reducing structural multicollinearity)
- Load the exerimmun data.
- Create a scatterplot of igg vs oxygen.
- Calculate an oxygen-squared variable named oxygensq.
- Fit a quadratic regression model of igg vs oxygen + oxygensq.
- Add a quadratic regression line to the scatterplot.
- Use the
vif
function in the car
package to calculate variance inflation factors.
- Create a scatterplot of oxygensq vs oxygen and calculate the correlation.
- Calculate a centered oxygen variable named oxcent and an oxcent-squared variable named oxcentsq.
- Fit a quadratic regression model of igg vs oxcent + oxcentsq.
- Use the
vif
function in the car
package to calculate variance inflation factors.
- Create a scatterplot of igg vs oxcent with the quadratic regression line added.
- Fit a simple linear regression model of igg vs oxcent.
- Confirm the equivalence of the original quadratic and centered quadratic models by transforming the regression parameter estimates.
- Create a residual vs fits plot for the centered quadratic model.
- Create a normal probability plot of the residuals for the centered quadratic model.
- Predict igg for oxygen = 70 using the centered quadratic model.
exerimmun <- read.table("~/path-to-data/exerimmun.txt", header=T)
attach(exerimmun)
plot(oxygen, igg)
oxygensq <- oxygen^2
model.1 <- lm(igg ~ oxygen + oxygensq)
plot(x=oxygen, y=igg,
panel.last = lines(sort(oxygen), fitted(model.1)[order(oxygen)]))
summary(model.1)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) -1464.4042 411.4012 -3.560 0.00140 **
# oxygen 88.3071 16.4735 5.361 1.16e-05 ***
# oxygensq -0.5362 0.1582 -3.390 0.00217 **
# ---
# Residual standard error: 106.4 on 27 degrees of freedom
# Multiple R-squared: 0.9377, Adjusted R-squared: 0.9331
# F-statistic: 203.2 on 2 and 27 DF, p-value: < 2.2e-16
vif(model.1)
# oxygen oxygensq
# 99.94261 99.94261
plot(oxygen, oxygensq)
cor(oxygen, oxygensq) # 0.9949846
oxcent <- oxygen-mean(oxygen)
oxcentsq <- oxcent^2
plot(oxcent, oxcentsq)
cor(oxcent, oxcentsq) # 0.2195179
model.2 <- lm(igg ~ oxcent + oxcentsq)
summary(model.2)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 1632.1962 29.3486 55.61 < 2e-16 ***
# oxcent 33.9995 1.6890 20.13 < 2e-16 ***
# oxcentsq -0.5362 0.1582 -3.39 0.00217 **
# ---
# Residual standard error: 106.4 on 27 degrees of freedom
# Multiple R-squared: 0.9377, Adjusted R-squared: 0.9331
# F-statistic: 203.2 on 2 and 27 DF, p-value: < 2.2e-16
vif(model.2)
# oxcent oxcentsq
# 1.050628 1.050628
plot(x=oxcent, y=igg,
panel.last = lines(sort(oxcent), fitted(model.2)[order(oxcent)]))
model.3 <- lm(igg ~ oxcent)
summary(model.3)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 1557.633 22.782 68.37 < 2e-16 ***
# oxcent 32.743 1.932 16.95 2.97e-16 ***
coef(model.2)[1]-coef(model.2)[2]*mean(oxygen)+coef(model.2)[3]*mean(oxygen)^2 # -1464.404
coef(model.2)[2]-2*coef(model.2)[3]*mean(oxygen) # 88.3071
coef(model.2)[3] # -0.5362473
coef(model.1)
# (Intercept) oxygen oxygensq
# -1464.4042284 88.3070970 -0.5362473
plot(x=fitted(model.2), y=residuals(model.2),
panel.last = abline(h=0, lty=2))
qqnorm(residuals(model.2), main="", datax=TRUE)
qqline(residuals(model.2), datax=TRUE)
predict(model.2, interval="prediction",
newdata=data.frame(oxcent=70-mean(oxygen), oxcentsq=(70-mean(oxygen))^2))
# fit lwr upr
# 1 2089.481 1849.931 2329.031
detach(exerimmun)