Galton peas (nonconstant variance and weighted least squares)
- Load the galton data.
- Fit an ordinary least squares (OLS) simple linear regression model of Progeny vs Parent.
- Fit a weighted least squares (WLS) model using weights = 1/SD^2.
- Create a scatterplot of the data with a regression line for each model.
# Galton peas: compare an OLS fit of Progeny vs Parent with a WLS fit
# that weights each observation by the inverse variance (1/SD^2).
galton <- read.table("~/path-to-data/galton.txt", header=TRUE)
attach(galton)
# Ordinary least squares fit.
model.1 <- lm(Progeny ~ Parent)
summary(model.1)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 0.127029 0.006993 18.164 9.29e-06 ***
# Parent 0.210000 0.038614 5.438 0.00285 **
# Weighted least squares: SD is a column of galton (visible via attach),
# so weights = 1/SD^2 down-weights the more variable observations.
model.2 <- lm(Progeny ~ Parent, weights=1/SD^2)
summary(model.2)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 0.127964 0.006811 18.787 7.87e-06 ***
# Parent 0.204801 0.038155 5.368 0.00302 **
# Scatterplot with both fitted lines: OLS in blue, WLS in red.
# Sorting by Parent keeps each fitted line drawn left-to-right.
plot(x=Parent, y=Progeny, ylim=c(0.158,0.174),
panel.last = c(lines(sort(Parent), fitted(model.1)[order(Parent)], col="blue"),
lines(sort(Parent), fitted(model.2)[order(Parent)], col="red")))
legend("topleft", col=c("blue","red"), lty=1,
inset=0.02, legend=c("OLS", "WLS"))
detach(galton)
Computer-assisted learning (nonconstant variance and weighted least squares)
- Load the ca_learning data.
- Create a scatterplot of the data.
- Fit an OLS model.
- Plot the OLS residuals vs num.responses.
- Plot the absolute OLS residuals vs num.responses.
- Calculate fitted values from a regression of absolute residuals vs num.responses.
- Fit a WLS model using weights = 1/(fitted values)^2.
- Create a scatterplot of the data with a regression line for each model.
- Plot the WLS standardized residuals vs num.responses.
# Computer-assisted learning: detect nonconstant variance in an OLS fit
# of cost vs num.responses, then refit by WLS with estimated weights.
ca_learning <- read.table("~/path-to-data/ca_learning_new.txt", header=TRUE)
attach(ca_learning)
plot(x=num.responses, y=cost)
# OLS fit.
model.1 <- lm(cost ~ num.responses)
summary(model.1)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 19.4727 5.5162 3.530 0.00545 **
# num.responses 3.2689 0.3651 8.955 4.33e-06 ***
# ---
# Residual standard error: 4.598 on 10 degrees of freedom
# Multiple R-squared: 0.8891, Adjusted R-squared: 0.878
# F-statistic: 80.19 on 1 and 10 DF, p-value: 4.33e-06
# Diagnostic plots: raw and absolute residuals vs the predictor.
plot(num.responses, residuals(model.1))
plot(num.responses, abs(residuals(model.1)))
# Model the error standard deviation by regressing |residuals| on
# num.responses; weights are the inverse squared fitted values.
wts <- 1/fitted(lm(abs(residuals(model.1)) ~ num.responses))^2
model.2 <- lm(cost ~ num.responses, weights=wts)
summary(model.2)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 17.3006 4.8277 3.584 0.00498 **
# num.responses 3.4211 0.3703 9.238 3.27e-06 ***
# ---
# Residual standard error: 1.159 on 10 degrees of freedom
# Multiple R-squared: 0.8951, Adjusted R-squared: 0.8846
# F-statistic: 85.35 on 1 and 10 DF, p-value: 3.269e-06
# Scatterplot with OLS (blue) and WLS (red) fitted lines.
plot(x=num.responses, y=cost, ylim=c(50,95),
panel.last = c(lines(sort(num.responses), fitted(model.1)[order(num.responses)], col="blue"),
lines(sort(num.responses), fitted(model.2)[order(num.responses)], col="red")))
legend("topleft", col=c("blue","red"), lty=1,
inset=0.02, legend=c("OLS", "WLS"))
# Standardized WLS residuals should now show roughly constant spread.
plot(num.responses, rstandard(model.2))
detach(ca_learning)
Market share (nonconstant variance and weighted least squares)
- Load the marketshare data.
- Fit an OLS model.
- Plot the OLS residuals vs fitted values with points marked by Discount.
- Use the tapply function to calculate the residual variance for Discount=0 and Discount=1.
- Fit a WLS model using weights = 1/variance for Discount=0 and Discount=1.
- Plot the WLS standardized residuals vs fitted values.
# Market share: variance differs between the Discount=0 and Discount=1
# groups, so refit by WLS with one weight per group (1/group variance).
marketshare <- read.table("~/path-to-data/marketshare.txt", header=TRUE)
attach(marketshare)
model.1 <- lm(MarketShare ~ Price + P1 + P2)
summary(model.1)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 3.19592 0.35616 8.973 3.00e-10 ***
# Price -0.33358 0.15226 -2.191 0.0359 *
# P1 0.30808 0.06412 4.804 3.51e-05 ***
# P2 0.48431 0.05541 8.740 5.49e-10 ***
# Residuals vs fitted, colored by Discount group (col 1 = black, 2 = red).
plot(fitted(model.1), residuals(model.1), col=Discount+1)
# Per-group residual variance via tapply.
vars <- tapply(residuals(model.1), Discount, var)
# 0 1
# 0.01052324 0.02680546
# Each observation gets the reciprocal of its own group's variance:
# Discount is a 0/1 indicator, so exactly one term is nonzero per row.
wts <- Discount/vars[2] + (1-Discount)/vars[1]
model.2 <- lm(MarketShare ~ Price + P1 + P2, weights=wts)
summary(model.2)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 3.17437 0.35671 8.899 3.63e-10 ***
# Price -0.32432 0.15291 -2.121 0.0418 *
# P1 0.30834 0.06575 4.689 4.89e-05 ***
# P2 0.48419 0.05422 8.930 3.35e-10 ***
plot(fitted(model.2), rstandard(model.2), col=Discount+1)
detach(marketshare)
Home price (nonconstant variance and weighted least squares)
- Load the realestate data.
- Calculate log transformations of the variables.
- Fit an OLS model.
- Plot the OLS residuals vs fitted values.
- Calculate fitted values from a regression of absolute residuals vs fitted values.
- Fit a WLS model using weights = 1/(fitted values)^2.
- Plot the WLS standardized residuals vs fitted values.
# Home price: log-transform the variables, then correct remaining
# nonconstant variance with WLS using estimated weights.
realestate <- read.table("~/path-to-data/realestate.txt", header=TRUE)
attach(realestate)
# Log transformations of response and both predictors.
logY <- log(SalePrice)
logX1 <- log(SqFeet)
logX2 <- log(Lot)
model.1 <- lm(logY ~ logX1 + logX2)
summary(model.1)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 4.25485 0.07353 57.864 < 2e-16 ***
# logX1 1.22141 0.03371 36.234 < 2e-16 ***
# logX2 0.10595 0.02394 4.425 1.18e-05 ***
plot(fitted(model.1), residuals(model.1))
# Estimate the error SD by regressing |residuals| on the fitted values;
# weights are the inverse squared fitted values of that regression.
wts <- 1/fitted(lm(abs(residuals(model.1)) ~ fitted(model.1)))^2
model.2 <- lm(logY ~ logX1 + logX2, weights=wts)
summary(model.2)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 4.35189 0.06330 68.755 < 2e-16 ***
# logX1 1.20150 0.03332 36.065 < 2e-16 ***
# logX2 0.07924 0.02152 3.682 0.000255 ***
plot(fitted(model.2), rstandard(model.2))
detach(realestate)
Google stock (autoregression model)
- Use the read.zoo function in the zoo package to load the google_stock data in time series format.
- Create a time series plot of the data.
- Load the google_stock data in the usual way using read.table.
- Use the ts function to convert the price variable to a time series.
- Create a plot of partial autocorrelations of price.
- Calculate a lag-1 price variable (note that the lag argument for the function is -1, not +1).
- Create a scatterplot of price vs lag1price.
- Use the ts.intersect function to create a dataframe containing price and lag1price.
- Fit a simple linear regression model of price vs lag1price (a first-order autoregression model).
# Google stock: first-order autoregression of price on its lag-1 value.
library(zoo)
# read.zoo parses the date column and returns the series in time order.
google.ts <- read.zoo("~/path-to-data/google_stock.txt", format="%m/%d/%Y",
header=TRUE)
plot(google.ts)
# Reload as a plain data frame for the regression workflow.
google <- read.table("~/path-to-data/google_stock.txt", header=TRUE)
attach(google)
price.ts <- ts(price)
# Partial autocorrelations of price.
pacf(price.ts)
# stats::lag with k = -1 shifts the series so each value is paired with
# its predecessor (note: -1, not +1).
lag1price <- lag(price.ts, -1)
plot(price.ts ~ lag1price, xy.labels=FALSE)
# ts.intersect aligns the two series on their common time points.
lagdata <- ts.intersect(price.ts, lag1price, dframe=TRUE)
summary(lm(price.ts ~ lag1price, data=lagdata))
detach(google)
Earthquakes (autoregression model)
- Load the earthquakes data.
- Create a time series plot of the data.
- Use the ts function to convert the Quakes variable to a time series.
- Create a plot of partial autocorrelations of Quakes.
- Calculate lag-1, lag-2, and lag-3 Quakes variables.
- Use the ts.intersect function to create a dataframe containing Quakes and the three lag variables.
- Fit a multiple linear regression model of Quakes versus the three lag variables (a third-order autoregression model).
# Earthquakes: third-order autoregression of Quakes on its first three lags.
earthquakes <- read.table("~/path-to-data/earthquakes.txt", header=TRUE)
attach(earthquakes)
plot(Year, Quakes, type="b")
Quakes.ts <- ts(Quakes)
# Partial autocorrelations suggest how many lags to include.
pacf(Quakes.ts)
# stats::lag with negative k pairs each value with earlier observations.
lag1Quakes <- lag(Quakes.ts, -1)
lag2Quakes <- lag(Quakes.ts, -2)
lag3Quakes <- lag(Quakes.ts, -3)
# Align all four series on their common time points.
lagdata <- ts.intersect(Quakes.ts, lag1Quakes, lag2Quakes, lag3Quakes, dframe=TRUE)
summary(lm(Quakes.ts ~ lag1Quakes + lag2Quakes + lag3Quakes, data=lagdata))
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 6.44916 1.78646 3.610 0.000496 ***
# lag1Quakes 0.16424 0.10063 1.632 0.106049
# lag2Quakes 0.07125 0.10128 0.703 0.483517
# lag3Quakes 0.26928 0.09783 2.753 0.007110 **
# ---
# Residual standard error: 3.851 on 93 degrees of freedom
# Multiple R-squared: 0.1388, Adjusted R-squared: 0.111
# F-statistic: 4.997 on 3 and 93 DF, p-value: 0.002942
detach(earthquakes)
Blaisdell company (regression with autoregressive errors)
- Load the blaisdell data.
- Fit a simple linear regression model of comsales vs indsales.
- Use the dwt function in the car package to conduct the Durbin-Watson test on the residuals.
- Conduct the Ljung-Box test on the residuals.
- Perform the Cochrane-Orcutt procedure to transform the variables.
- Forecast comsales for period 21 when indsales are projected to be $175.3 million.
- Perform the Hildreth-Lu procedure to transform the variables.
- Perform the first differences procedure to transform the variables.
# Blaisdell company: regression with autoregressive errors. Fit OLS, test
# for autocorrelation (Durbin-Watson, Ljung-Box), then compare three
# remedies: Cochrane-Orcutt, Hildreth-Lu, and first differences.
blaisdell <- read.table("~/path-to-data/blaisdell.txt", header=TRUE)
attach(blaisdell)
model.1 <- lm(comsales ~ indsales)
summary(model.1)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) -1.454750 0.214146 -6.793 2.31e-06 ***
# indsales 0.176283 0.001445 122.017 < 2e-16 ***
# Durbin-Watson Test
library(car)
dwt(model.1)
# lag Autocorrelation D-W Statistic p-value
# 1 0.6260046 0.7347256 0
# Alternative hypothesis: rho != 0
# Ljung-Box Q Test
Box.test(residuals(model.1), lag = 1, type = "Ljung")
# Box-Ljung test
# data: residuals(model.1)
# X-squared = 9.0752, df = 1, p-value = 0.002591
# Cochrane-Orcutt Procedure
# Step 1: estimate rho by regressing residuals on lag-1 residuals
# (no intercept).
res.ts <- ts(residuals(model.1))
lag1res <- lag(res.ts, -1)
lagdata1 <- ts.intersect(res.ts, lag1res)
acp <- coef(lm(res.ts ~ lag1res -1, data=lagdata1)) # 0.6311636
# Step 2: transform both variables as v_t - rho * v_{t-1} and refit.
y.ts <- ts(comsales)
x.ts <- ts(indsales)
lag1y <- lag(y.ts, -1)
lag1x <- lag(x.ts, -1)
y.co <- y.ts-acp*lag1y
x.co <- x.ts-acp*lag1x
model.2 <- lm(y.co ~ x.co)
summary(model.2)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) -0.394111 0.167230 -2.357 0.0307 *
# x.co 0.173758 0.002957 58.767 <2e-16 ***
dwt(model.2)
# lag Autocorrelation D-W Statistic p-value
# 1 0.1473569 1.650248 0.306
# Alternative hypothesis: rho != 0
# Back-transform the intercept (and its SE) to the original scale;
# the slope carries over unchanged.
b0 <- coef(model.2)[1]/(1-acp) # -1.068524
sqrt(vcov(model.2)[1,1])/(1-acp) # se = 0.4533986
b1 <- coef(model.2)[2] # 0.1737583
# Forecast period 21 (indsales projected at 175.3), adjusting the
# trend forecast by rho times the period-20 residual.
fit.20 <- b0+b1*indsales[20] # 28.76577
res.20 <- comsales[20]-fit.20 # 0.01422919
fit.21 <- b0+b1*175.3 # 29.3913
forecast.21 <- fit.21+acp*res.20 # 29.40028
# Hildreth-Lu Procedure: grid search rho over 0.10, 0.11, ..., 0.99,
# picking the transformation that minimizes SSE. Preallocate sse rather
# than growing it inside the loop.
sse <- numeric(90)
for(i in seq_len(90)){
  rho <- 0.09+0.01*i
  y.hl <- y.ts-rho*lag1y
  x.hl <- x.ts-rho*lag1x
  sse[i] <- sum(residuals(lm(y.hl ~ x.hl))^2)
}
acp <- 0.09+0.01*which.min(sse) # 0.96
y.hl <- y.ts-acp*lag1y
x.hl <- x.ts-acp*lag1x
model.3 <- lm(y.hl ~ x.hl)
summary(model.3)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 0.07117 0.05797 1.228 0.236
# x.hl 0.16045 0.00684 23.458 2.18e-14 ***
dwt(model.3)
# lag Autocorrelation D-W Statistic p-value
# 1 0.116145 1.725439 0.548
# Alternative hypothesis: rho != 0
# Intercept (and SE) back on the original scale.
coef(model.3)[1]/(1-acp) # 1.77933
sqrt(vcov(model.3)[1,1])/(1-acp) # 1.449373
# First Differences Procedure (i.e. the transformation with rho = 1).
y.fd <- y.ts-lag1y
x.fd <- x.ts-lag1x
dwt(lm(y.fd ~ x.fd))
# lag Autocorrelation D-W Statistic p-value
# 1 0.1160548 1.748834 0.62
# Alternative hypothesis: rho != 0
# Fit through the origin, then recover the original-scale intercept
# from the variable means.
model.4 <- lm(y.fd ~ x.fd -1)
summary(model.4)
# Estimate Std. Error t value Pr(>|t|)
# x.fd 0.168488 0.005096 33.06 <2e-16 ***
mean(comsales)-coef(model.4)[1]*mean(indsales) # -0.3040052
detach(blaisdell)
Metal fabricator and vendor employees (regression with autoregressive errors)
- Load the employee data.
- Fit a simple linear regression model of metal vs vendor.
- Create a scatterplot of the data with a regression line.
- Create a scatterplot of the residuals vs time order.
- Create a plot of partial autocorrelations of the residuals.
- Use the dwt function in the car package to conduct the Durbin-Watson test on the residuals.
- Perform the Cochrane-Orcutt procedure to transform the variables.
# Metal fabricator vs vendor employees: OLS fit, autocorrelation
# diagnostics, then the Cochrane-Orcutt transformation.
employee <- read.table("~/path-to-data/employee.txt", header=TRUE)
attach(employee)
model.1 <- lm(metal ~ vendor)
summary(model.1)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 2.847911 3.299962 0.863 0.392
# vendor 0.122442 0.009423 12.994 <2e-16 ***
plot(x=vendor, y=metal,
panel.last = lines(sort(vendor), fitted(model.1)[order(vendor)]))
# Residuals in time order; `time` is presumably a column of employee
# made visible by attach (base R also has stats::time) -- confirm
# against the data file's header.
plot(x=time, y=residuals(model.1), type="b",
panel.last = abline(h=0, lty=2))
pacf(residuals(model.1))
# Durbin-Watson Test (dwt is in the car package, loaded earlier).
dwt(model.1)
# lag Autocorrelation D-W Statistic p-value
# 1 0.772038 0.3592396 0
# Alternative hypothesis: rho != 0
# Cochrane-Orcutt Procedure
# Step 1: estimate rho from a no-intercept regression of residuals on
# their lag-1 values.
res.ts <- ts(residuals(model.1))
lag1res <- lag(res.ts, -1)
lagdata1 <- ts.intersect(res.ts, lag1res)
acp <- coef(lm(res.ts ~ lag1res -1, data=lagdata1)) # 0.831385
# Step 2: transform both variables as v_t - rho * v_{t-1} and refit.
y.ts <- ts(metal)
x.ts <- ts(vendor)
lag1y <- lag(y.ts, -1)
lag1x <- lag(x.ts, -1)
y.co <- y.ts-acp*lag1y
x.co <- x.ts-acp*lag1x
model.2 <- lm(y.co ~ x.co)
summary(model.2)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 4.87560 0.78655 6.199 6.78e-08 ***
# x.co 0.04795 0.01300 3.688 0.000505 ***
# Back-transform the intercept (and SE) to the original scale.
coef(model.2)[1]/(1-acp) # 28.91557
sqrt(vcov(model.2)[1,1])/(1-acp) # se = 4.664789
detach(employee)
Blood pressure (multicollinearity)
- Load the bloodpress data.
- Create scatterplot matrices of the data.
- Calculate correlations between the variables.
# Blood pressure: scatterplot matrices and the correlation matrix used
# throughout the multicollinearity examples below.
bloodpress <- read.table("~/path-to-data/bloodpress.txt", header=TRUE)
attach(bloodpress)
# Columns 2:8 are BP, Age, Weight, BSA, Dur, Pulse, Stress (see the
# correlation matrix printed below); split the matrix plots in two.
pairs(bloodpress[,c(2:5)])
pairs(bloodpress[,c(2,6:8)])
round(cor(bloodpress[,c(2:8)]),3)
# BP Age Weight BSA Dur Pulse Stress
# BP 1.000 0.659 0.950 0.866 0.293 0.721 0.164
# Age 0.659 1.000 0.407 0.378 0.344 0.619 0.368
# Weight 0.950 0.407 1.000 0.875 0.201 0.659 0.034
# BSA 0.866 0.378 0.875 1.000 0.131 0.465 0.018
# Dur 0.293 0.344 0.201 0.131 1.000 0.402 0.312
# Pulse 0.721 0.619 0.659 0.465 0.402 1.000 0.506
# Stress 0.164 0.368 0.034 0.018 0.312 0.506 1.000
detach(bloodpress)
Uncorrelated predictors (no multicollinearity)
- Load the uncorrpreds data.
- Create a scatterplot matrix of the data.
- Calculate the correlation between the predictors.
- Fit a simple linear regression model of y vs x1.
- Fit a simple linear regression model of y vs x2.
- Fit a multiple linear regression model of y vs x1 + x2.
- Fit a multiple linear regression model of y vs x2 + x1.
- Use the scatter3d function in the car package to create a 3D scatterplot of the data with the fitted plane for a multiple linear regression model of y vs x1 + x2.
# Uncorrelated predictors: with cor(x1, x2) = 0, each predictor's
# estimate and sequential sum of squares is identical whether it is fit
# alone or with the other, in either order.
uncorrpreds <- read.table("~/path-to-data/uncorrpreds.txt", header=TRUE)
attach(uncorrpreds)
pairs(uncorrpreds)
cor(x1,x2) # 0
model.1 <- lm(y ~ x1)
summary(model.1)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 49.500 4.655 10.634 4.07e-05 ***
# x1 -1.000 1.472 -0.679 0.522
anova(model.1)
# Df Sum Sq Mean Sq F value Pr(>F)
# x1 1 8 8.000 0.4615 0.5222
# Residuals 6 104 17.333
model.2 <- lm(y ~ x2)
summary(model.2)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 57.000 8.213 6.940 0.000444 ***
# x2 -1.750 1.350 -1.296 0.242545
anova(model.2)
# Df Sum Sq Mean Sq F value Pr(>F)
# x2 1 24.5 24.500 1.68 0.2425
# Residuals 6 87.5 14.583
# Both predictors, x1 entered first.
model.12 <- lm(y ~ x1 + x2)
summary(model.12)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 60.000 9.562 6.275 0.00151 **
# x1 -1.000 1.410 -0.709 0.50982
# x2 -1.750 1.410 -1.241 0.26954
anova(model.12)
# Df Sum Sq Mean Sq F value Pr(>F)
# x1 1 8.0 8.0 0.5031 0.5098
# x2 1 24.5 24.5 1.5409 0.2695
# Residuals 5 79.5 15.9
# Both predictors, x2 entered first: same estimates, same sequential
# sums of squares, just reordered.
model.21 <- lm(y ~ x2 + x1)
summary(model.21)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 60.000 9.562 6.275 0.00151 **
# x2 -1.750 1.410 -1.241 0.26954
# x1 -1.000 1.410 -0.709 0.50982
anova(model.21)
# Df Sum Sq Mean Sq F value Pr(>F)
# x2 1 24.5 24.5 1.5409 0.2695
# x1 1 8.0 8.0 0.5031 0.5098
# Residuals 5 79.5 15.9
# scatter3d is in the car package (loaded earlier in this script).
# library(car)
scatter3d(y ~ x1 + x2)
detach(uncorrpreds)
Blood pressure (predictors with almost no multicollinearity)
- Load the bloodpress data.
- Create a scatterplot matrix of the data.
- Fit a simple linear regression model of BP vs Stress.
- Fit a simple linear regression model of BP vs BSA.
- Fit a multiple linear regression model of BP vs Stress + BSA.
- Fit a multiple linear regression model of BP vs BSA + Stress.
- Use the scatter3d function in the car package to create a 3D scatterplot of the data with the fitted plane for a multiple linear regression model of BP vs Stress + BSA.
# Blood pressure with nearly uncorrelated predictors (Stress and BSA).
# Note in the outputs below that each predictor's estimate and standard
# error barely change between the one-predictor and two-predictor fits,
# and the sequential (Type I) sums of squares depend only slightly on
# entry order. Assumes bloodpress was loaded earlier via read.table.
attach(bloodpress)
# Columns 2, 5, 8 are BP, BSA, Stress (per the correlation matrix
# printed earlier in this script).
pairs(bloodpress[,c(2,5,8)])
model.1 <- lm(BP ~ Stress)
summary(model.1)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 112.71997 2.19345 51.389 <2e-16 ***
# Stress 0.02399 0.03404 0.705 0.49
anova(model.1)
# Df Sum Sq Mean Sq F value Pr(>F)
# Stress 1 15.04 15.044 0.4969 0.4899
# Residuals 18 544.96 30.275
model.2 <- lm(BP ~ BSA)
summary(model.2)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 45.183 9.392 4.811 0.00014 ***
# BSA 34.443 4.690 7.343 8.11e-07 ***
anova(model.2)
# Df Sum Sq Mean Sq F value Pr(>F)
# BSA 1 419.86 419.86 53.927 8.114e-07 ***
# Residuals 18 140.14 7.79
# Both predictors, Stress entered first.
model.12 <- lm(BP ~ Stress + BSA)
summary(model.12)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 44.24452 9.26104 4.777 0.000175 ***
# Stress 0.02166 0.01697 1.277 0.218924
# BSA 34.33423 4.61110 7.446 9.56e-07 ***
anova(model.12)
# Df Sum Sq Mean Sq F value Pr(>F)
# Stress 1 15.04 15.04 1.9998 0.1754
# BSA 1 417.07 417.07 55.4430 9.561e-07 ***
# Residuals 17 127.88 7.52
# Same two predictors, BSA entered first: identical estimates; only the
# sequential sums of squares shift (slightly).
model.21 <- lm(BP ~ BSA + Stress)
summary(model.21)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 44.24452 9.26104 4.777 0.000175 ***
# BSA 34.33423 4.61110 7.446 9.56e-07 ***
# Stress 0.02166 0.01697 1.277 0.218924
anova(model.21)
# Df Sum Sq Mean Sq F value Pr(>F)
# BSA 1 419.86 419.86 55.8132 9.149e-07 ***
# Stress 1 12.26 12.26 1.6296 0.2189
# Residuals 17 127.88 7.52
# scatter3d is in the car package (loaded earlier in this script).
scatter3d(BP ~ Stress + BSA)
detach(bloodpress)
Blood pressure (predictors with high multicollinearity)
- Load the bloodpress data.
- Create a scatterplot matrix of the data.
- Fit a simple linear regression model of BP vs Weight.
- Fit a simple linear regression model of BP vs BSA.
- Fit a multiple linear regression model of BP vs Weight + BSA.
- Fit a multiple linear regression model of BP vs BSA + Weight.
- Use the scatter3d function in the car package to create a 3D scatterplot of the data with the fitted plane for a multiple linear regression model of BP vs Weight + BSA.
- Predict BP for Weight=92 and BSA=2 for the two simple linear regression models and the multiple linear regression model.
# Blood pressure with highly correlated predictors (Weight and BSA).
# Note in the outputs below how estimates and standard errors change
# substantially between the one- and two-predictor fits, yet predictions
# at a typical point remain similar across models. Assumes bloodpress
# was loaded earlier via read.table.
attach(bloodpress)
# Columns 2, 5, 4 are BP, BSA, Weight (per the correlation matrix
# printed earlier in this script).
pairs(bloodpress[,c(2,5,4)])
model.1 <- lm(BP ~ Weight)
summary(model.1)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 2.20531 8.66333 0.255 0.802
# Weight 1.20093 0.09297 12.917 1.53e-10 ***
anova(model.1)
# Df Sum Sq Mean Sq F value Pr(>F)
# Weight 1 505.47 505.47 166.86 1.528e-10 ***
# Residuals 18 54.53 3.03
model.2 <- lm(BP ~ BSA)
summary(model.2)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 45.183 9.392 4.811 0.00014 ***
# BSA 34.443 4.690 7.343 8.11e-07 ***
anova(model.2)
# Df Sum Sq Mean Sq F value Pr(>F)
# BSA 1 419.86 419.86 53.927 8.114e-07 ***
# Residuals 18 140.14 7.79
# Both predictors, Weight entered first: BSA's estimate collapses from
# 34.4 to 5.8 once Weight is in the model.
model.12 <- lm(BP ~ Weight + BSA)
summary(model.12)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 5.6534 9.3925 0.602 0.555
# Weight 1.0387 0.1927 5.392 4.87e-05 ***
# BSA 5.8313 6.0627 0.962 0.350
anova(model.12)
# Df Sum Sq Mean Sq F value Pr(>F)
# Weight 1 505.47 505.47 166.1648 3.341e-10 ***
# BSA 1 2.81 2.81 0.9251 0.3496
# Residuals 17 51.71 3.04
# Same predictors, BSA entered first: identical estimates, but the
# sequential sums of squares differ markedly from model.12's.
model.21 <- lm(BP ~ BSA + Weight)
summary(model.21)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 5.6534 9.3925 0.602 0.555
# BSA 5.8313 6.0627 0.962 0.350
# Weight 1.0387 0.1927 5.392 4.87e-05 ***
anova(model.21)
# Df Sum Sq Mean Sq F value Pr(>F)
# BSA 1 419.86 419.86 138.021 1.391e-09 ***
# Weight 1 88.43 88.43 29.069 4.871e-05 ***
# Residuals 17 51.71 3.04
# scatter3d is in the car package (loaded earlier in this script).
scatter3d(BP ~ Weight + BSA)
# Prediction intervals at Weight=92 / BSA=2 from each model: the point
# predictions are close despite the very different coefficients.
predict(model.1, interval="prediction",
newdata=data.frame(Weight=92))
# fit lwr upr
# 1 112.691 108.938 116.444
predict(model.2, interval="prediction",
newdata=data.frame(BSA=2))
# fit lwr upr
# 1 114.0689 108.0619 120.0758
predict(model.12, interval="prediction",
newdata=data.frame(Weight=92, BSA=2))
# fit lwr upr
# 1 112.8794 109.0801 116.6787
detach(bloodpress)
Poverty and teen birth rate (high multicollinearity)
- Load the poverty data and remove the District of Columbia.
- Create a scatterplot matrix of the data.
- Fit a simple linear regression model of PovPct vs Brth15to17.
- Fit a simple linear regression model of PovPct vs Brth18to19.
- Fit a multiple linear regression model of PovPct vs Brth15to17 + Brth18to19.
# Poverty vs teen birth rates: the two birth-rate predictors are highly
# correlated, producing a counterintuitive negative Brth18to19 estimate
# in the joint model.
poverty <- read.table("~/path-to-data/poverty.txt", header=TRUE)
# Drop the District of Columbia outlier before analysis.
poverty <- poverty[poverty$Location!="District_of_Columbia",]
attach(poverty)
pairs(poverty[,c(2:4)])
model.1 <- lm(PovPct ~ Brth15to17)
summary(model.1)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 4.4871 1.3181 3.404 0.00135 **
# Brth15to17 0.3872 0.0572 6.768 1.67e-08 ***
# ---
# Residual standard error: 2.982 on 48 degrees of freedom
# Multiple R-squared: 0.4883, Adjusted R-squared: 0.4777
# F-statistic: 45.81 on 1 and 48 DF, p-value: 1.666e-08
model.2 <- lm(PovPct ~ Brth18to19)
summary(model.2)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 3.05279 1.83169 1.667 0.102
# Brth18to19 0.13842 0.02482 5.576 1.11e-06 ***
# ---
# Residual standard error: 3.248 on 48 degrees of freedom
# Multiple R-squared: 0.3931, Adjusted R-squared: 0.3805
# F-statistic: 31.09 on 1 and 48 DF, p-value: 1.106e-06
# Joint model: Brth18to19's slope flips sign (0.138 alone vs -0.102
# here), a hallmark of collinear predictors.
model.12 <- lm(PovPct ~ Brth15to17 + Brth18to19)
summary(model.12)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 6.43963 1.95904 3.287 0.00192 **
# Brth15to17 0.63235 0.19178 3.297 0.00186 **
# Brth18to19 -0.10227 0.07642 -1.338 0.18724
# ---
# Residual standard error: 2.958 on 47 degrees of freedom
# Multiple R-squared: 0.5071, Adjusted R-squared: 0.4861
# F-statistic: 24.18 on 2 and 47 DF, p-value: 6.017e-08
detach(poverty)
Blood pressure (high multicollinearity)
- Load the bloodpress data.
- Fit a multiple linear regression model of BP vs Age + Weight + BSA + Dur + Pulse + Stress.
- Use the vif function in the car package to calculate variance inflation factors.
- Fit a multiple linear regression model of Weight vs Age + BSA + Dur + Pulse + Stress and confirm the VIF value for Weight as 1/(1-R^2) for this model.
- Fit a multiple linear regression model of BP vs Age + Weight + Dur + Stress.
- Use the vif function in the car package to calculate variance inflation factors.
# Blood pressure: variance inflation factors for the full model, a
# by-hand confirmation of Weight's VIF, and a reduced model with low
# VIFs. Assumes bloodpress was loaded earlier via read.table.
attach(bloodpress)
model.1 <- lm(BP ~ Age + Weight + BSA + Dur + Pulse + Stress)
summary(model.1)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) -12.870476 2.556650 -5.034 0.000229 ***
# Age 0.703259 0.049606 14.177 2.76e-09 ***
# Weight 0.969920 0.063108 15.369 1.02e-09 ***
# BSA 3.776491 1.580151 2.390 0.032694 *
# Dur 0.068383 0.048441 1.412 0.181534
# Pulse -0.084485 0.051609 -1.637 0.125594
# Stress 0.005572 0.003412 1.633 0.126491
# ---
# Residual standard error: 0.4072 on 13 degrees of freedom
# Multiple R-squared: 0.9962, Adjusted R-squared: 0.9944
# F-statistic: 560.6 on 6 and 13 DF, p-value: 6.395e-15
# vif is in the car package (loaded earlier in this script).
# library(car)
vif(model.1)
# Age Weight BSA Dur Pulse Stress
# 1.762807 8.417035 5.328751 1.237309 4.413575 1.834845
# Confirm Weight's VIF by hand: regress Weight on the other predictors
# and compute 1/(1 - R^2); it matches the 8.417035 reported above.
model.2 <- lm(Weight ~ Age + BSA + Dur + Pulse + Stress)
summary(model.2)
# Residual standard error: 1.725 on 14 degrees of freedom
# Multiple R-squared: 0.8812, Adjusted R-squared: 0.8388
# F-statistic: 20.77 on 5 and 14 DF, p-value: 5.046e-06
1/(1-summary(model.2)$r.squared) # 8.417035
# Reduced model dropping BSA and Pulse: all VIFs fall below 1.5.
model.3 <- lm(BP ~ Age + Weight + Dur + Stress)
summary(model.3)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) -15.869829 3.195296 -4.967 0.000169 ***
# Age 0.683741 0.061195 11.173 1.14e-08 ***
# Weight 1.034128 0.032672 31.652 3.76e-15 ***
# Dur 0.039889 0.064486 0.619 0.545485
# Stress 0.002184 0.003794 0.576 0.573304
# ---
# Residual standard error: 0.5505 on 15 degrees of freedom
# Multiple R-squared: 0.9919, Adjusted R-squared: 0.9897
# F-statistic: 458.3 on 4 and 15 DF, p-value: 1.764e-15
vif(model.3)
# Age Weight Dur Stress
# 1.468245 1.234653 1.200060 1.241117
detach(bloodpress)
Allen Cognitive Level study (reducing data-based multicollinearity)
- Load the sampled allentestn23 data.
- Create a scatterplot matrix of the data.
- Calculate the correlation between Vocab and Abstract.
- Fit a multiple linear regression model of ACL vs SDMT + Vocab + Abstract.
- Use the vif function in the car package to calculate variance inflation factors.
- Repeat for the full allentest data.
# Allen Cognitive Level: data-based multicollinearity. In the n=23
# subsample Vocab and Abstract are nearly collinear (r = 0.99, VIFs ~ 50);
# in the full sample the correlation and VIFs drop sharply.
allentestn23 <- read.table("~/path-to-data/allentestn23.txt", header=TRUE)
attach(allentestn23)
pairs(allentestn23[,2:5])
cor(Vocab, Abstract) # 0.9897771
model.1 <- lm(ACL ~ SDMT + Vocab + Abstract)
summary(model.1)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 3.74711 1.34237 2.791 0.0116 *
# SDMT 0.02326 0.01273 1.827 0.0834 .
# Vocab 0.02825 0.15239 0.185 0.8549
# Abstract -0.01379 0.10055 -0.137 0.8924
# ---
# Residual standard error: 0.7344 on 19 degrees of freedom
# Multiple R-squared: 0.2645, Adjusted R-squared: 0.1484
# F-statistic: 2.278 on 3 and 19 DF, p-value: 0.1124
# vif is in the car package (loaded earlier in this script).
vif(model.1)
# SDMT Vocab Abstract
# 1.726185 49.286239 50.603085
detach(allentestn23)
# Repeat with the full allentest data.
allentest <- read.table("~/path-to-data/allentest.txt", header=TRUE)
attach(allentest)
pairs(allentest[,2:5])
cor(Vocab, Abstract) # 0.6978405
model.1 <- lm(ACL ~ SDMT + Vocab + Abstract)
summary(model.1)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 3.946347 0.338069 11.673 < 2e-16 ***
# SDMT 0.027404 0.007168 3.823 0.000298 ***
# Vocab -0.017397 0.018077 -0.962 0.339428
# Abstract 0.012182 0.011585 1.051 0.296926
# ---
# Residual standard error: 0.6878 on 65 degrees of freedom
# Multiple R-squared: 0.2857, Adjusted R-squared: 0.2528
# F-statistic: 8.668 on 3 and 65 DF, p-value: 6.414e-05
vif(model.1)
# SDMT Vocab Abstract
# 1.609662 2.093297 2.167428
detach(allentest)
Exercise and immunity (reducing structural multicollinearity)
- Load the exerimmun data.
- Create a scatterplot of igg vs oxygen.
- Calculate an oxygen-squared variable named oxygensq.
- Fit a quadratic regression model of igg vs oxygen + oxygensq.
- Add a quadratic regression line to the scatterplot.
- Use the vif function in the car package to calculate variance inflation factors.
- Create a scatterplot of oxygensq vs oxygen and calculate the correlation.
- Calculate a centered oxygen variable named oxcent and an oxcent-squared variable named oxcentsq.
- Fit a quadratic regression model of igg vs oxcent + oxcentsq.
- Use the vif function in the car package to calculate variance inflation factors.
- Create a scatterplot of igg vs oxcent with the quadratic regression line added.
- Fit a simple linear regression model of igg vs oxcent.
- Confirm the equivalence of the original quadratic and centered quadratic models by transforming the regression parameter estimates.
- Create a residual vs fits plot for the centered quadratic model.
- Create a normal probability plot of the residuals for the centered quadratic model.
- Predict igg for oxygen = 70 using the centered quadratic model.
# Exercise and immunity: structural multicollinearity in a quadratic
# model (oxygen and oxygen^2 correlate at 0.995) removed by centering
# oxygen before squaring.
exerimmun <- read.table("~/path-to-data/exerimmun.txt", header=TRUE)
attach(exerimmun)
plot(oxygen, igg)
# Quadratic model in raw oxygen.
oxygensq <- oxygen^2
model.1 <- lm(igg ~ oxygen + oxygensq)
plot(x=oxygen, y=igg,
panel.last = lines(sort(oxygen), fitted(model.1)[order(oxygen)]))
summary(model.1)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) -1464.4042 411.4012 -3.560 0.00140 **
# oxygen 88.3071 16.4735 5.361 1.16e-05 ***
# oxygensq -0.5362 0.1582 -3.390 0.00217 **
# ---
# Residual standard error: 106.4 on 27 degrees of freedom
# Multiple R-squared: 0.9377, Adjusted R-squared: 0.9331
# F-statistic: 203.2 on 2 and 27 DF, p-value: < 2.2e-16
# vif is in the car package (loaded earlier in this script).
vif(model.1)
# oxygen oxygensq
# 99.94261 99.94261
plot(oxygen, oxygensq)
cor(oxygen, oxygensq) # 0.9949846
# Center oxygen, then square: the linear and quadratic terms are now
# nearly uncorrelated.
oxcent <- oxygen-mean(oxygen)
oxcentsq <- oxcent^2
plot(oxcent, oxcentsq)
cor(oxcent, oxcentsq) # 0.2195179
model.2 <- lm(igg ~ oxcent + oxcentsq)
summary(model.2)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 1632.1962 29.3486 55.61 < 2e-16 ***
# oxcent 33.9995 1.6890 20.13 < 2e-16 ***
# oxcentsq -0.5362 0.1582 -3.39 0.00217 **
# ---
# Residual standard error: 106.4 on 27 degrees of freedom
# Multiple R-squared: 0.9377, Adjusted R-squared: 0.9331
# F-statistic: 203.2 on 2 and 27 DF, p-value: < 2.2e-16
vif(model.2)
# oxcent oxcentsq
# 1.050628 1.050628
plot(x=oxcent, y=igg,
panel.last = lines(sort(oxcent), fitted(model.2)[order(oxcent)]))
# Simple linear model in the centered predictor for comparison.
model.3 <- lm(igg ~ oxcent)
summary(model.3)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 1557.633 22.782 68.37 < 2e-16 ***
# oxcent 32.743 1.932 16.95 2.97e-16 ***
# Confirm equivalence of the raw and centered quadratic models by
# back-transforming the centered coefficients; they reproduce coef(model.1).
coef(model.2)[1]-coef(model.2)[2]*mean(oxygen)+coef(model.2)[3]*mean(oxygen)^2 # -1464.404
coef(model.2)[2]-2*coef(model.2)[3]*mean(oxygen) # 88.3071
coef(model.2)[3] # -0.5362473
coef(model.1)
# (Intercept) oxygen oxygensq
# -1464.4042284 88.3070970 -0.5362473
# Residual diagnostics for the centered quadratic model.
plot(x=fitted(model.2), y=residuals(model.2),
panel.last = abline(h=0, lty=2))
qqnorm(residuals(model.2), main="", datax=TRUE)
qqline(residuals(model.2), datax=TRUE)
# Prediction interval at oxygen = 70, expressed in centered terms.
predict(model.2, interval="prediction",
newdata=data.frame(oxcent=70-mean(oxygen), oxcentsq=(70-mean(oxygen))^2))
# fit lwr upr
# 1 2089.481 1849.931 2329.031
detach(exerimmun)