Birthweight and smoking (2-level categorical predictor, additive model)
- Load the birthsmokers data.
- Create a scatterplot matrix of the data
- Fit a multiple linear regression model of Wgt on Gest + Smoke.
- Display scatterplot of Wgt vs Gest with points marked by Smoke and add parallel regression lines representing Smoke=0 and Smoke=1.
- Display regression results and calculate confidence intervals for the regression parameters.
- Display confidence intervals for expected Wgt at Gest=38 (for Smoke=1 and Smoke=0).
- Repeat analysis separately for Smoke=0 and Smoke=1.
- Repeat analysis using (1, -1) coding.
birthsmokers <- read.table("~/path-to-folder/birthsmokers.txt", header=T)
attach(birthsmokers)
pairs(cbind(Wgt, Gest, Smoke))
model <- lm(Wgt ~ Gest + Smoke)
plot(x=Gest, y=Wgt, ylim=c(2300, 3700),
col=ifelse(Smoke=="yes", "red", "blue"),
panel.last = c(lines(sort(Gest[Smoke=="no"]),
fitted(model)[Smoke=="no"][order(Gest[Smoke=="no"])],
col="blue"),
lines(sort(Gest[Smoke=="yes"]),
fitted(model)[Smoke=="yes"][order(Gest[Smoke=="yes"])],
col="red")))
summary(model)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) -2389.573 349.206 -6.843 1.63e-07 ***
# Gest 143.100 9.128 15.677 1.07e-15 ***
# Smokeyes -244.544 41.982 -5.825 2.58e-06 ***
# ---
# Residual standard error: 115.5 on 29 degrees of freedom
# Multiple R-squared: 0.8964, Adjusted R-squared: 0.8892
# F-statistic: 125.4 on 2 and 29 DF, p-value: 5.289e-15
confint(model)
# 2.5 % 97.5 %
# (Intercept) -3103.7795 -1675.3663
# Gest 124.4312 161.7694
# Smokeyes -330.4064 -158.6817
predict(model, interval="confidence",
newdata=data.frame(Gest=c(38, 38), Smoke=c("yes", "no")))
# fit lwr upr
# 1 2803.693 2740.599 2866.788
# 2 3048.237 2989.120 3107.355
model.0 <- lm(Wgt ~ Gest, subset=Smoke=="no")
summary(model.0)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) -2546.14 457.29 -5.568 6.93e-05 ***
# Gest 147.21 11.97 12.294 6.85e-09 ***
predict(model.0, interval="confidence",
newdata=data.frame(Gest=38))
# fit lwr upr
# 1 3047.724 2990.298 3105.15
model.1 <- lm(Wgt ~ Gest, subset=Smoke=="yes")
summary(model.1)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) -2474.56 553.97 -4.467 0.000532 ***
# Gest 139.03 14.11 9.851 1.12e-07 ***
predict(model.1, interval="confidence",
newdata=data.frame(Gest=38))
# fit lwr upr
# 1 2808.528 2731.726 2885.331
Smoke2 <- ifelse(Smoke=="yes", 1, -1)
model.3 <- lm(Wgt ~ Gest + Smoke2)
summary(model.3)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) -2511.845 353.449 -7.107 8.07e-08 ***
# Gest 143.100 9.128 15.677 1.07e-15 ***
# Smoke2 -122.272 20.991 -5.825 2.58e-06 ***
# Alternatively
model.3 <- lm(Wgt ~ Gest + Smoke, contrasts=list(Smoke="contr.sum"))
summary(model.3)
detach(birthsmokers)
Depression treatments (3-level categorical predictor, interaction model)
- Load the depression data.
- Display scatterplot of y (treatment effectiveness) vs age with points marked by treatment.
- Create interaction variables and fit a multiple linear regression model of y on age + x2 + x3 + age.x2 + age.x3.
- Add non-parallel regression lines representing each of the three treatments to the scatterplot.
- Display a residuals vs fits plot and a normal probability plot of the residuals, and conduct an Anderson-Darling normality test using the
nortest
package.
- Conduct an F-test to see if at least one of x2, x3, age.x2, and age.x3 are useful (i.e., the regression functions differ).
- Conduct an F-test to see if at least one of age.x2 and age.x3 are useful (i.e., the regression functions have different slopes).
depression <- read.table("~/path-to-folder/depression.txt", header=T)
attach(depression)
plot(x=age, y=y, col=as.numeric(TRT))
legend("topleft", col=1:3, pch=1,
inset=0.02, x.intersp = 1.5, y.intersp = 1.8,
legend=c("Trt A", "Trt B", "Trt C"))
age.x2 <- age*x2
age.x3 <- age*x3
model.1 <- lm(y ~ age + x2 + x3 + age.x2 + age.x3)
summary(model.1)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 6.21138 3.34964 1.854 0.073545 .
# age 1.03339 0.07233 14.288 6.34e-15 ***
# x2 41.30421 5.08453 8.124 4.56e-09 ***
# x3 22.70682 5.09097 4.460 0.000106 ***
# age.x2 -0.70288 0.10896 -6.451 3.98e-07 ***
# age.x3 -0.50971 0.11039 -4.617 6.85e-05 ***
plot(x=age, y=y, ylim=c(20, 80), col=as.numeric(TRT),
panel.last = c(lines(sort(age[TRT=="A"]),
fitted(model.1)[TRT=="A"][order(age[TRT=="A"])],
col=1),
lines(sort(age[TRT=="B"]),
fitted(model.1)[TRT=="B"][order(age[TRT=="B"])],
col=2),
lines(sort(age[TRT=="C"]),
fitted(model.1)[TRT=="C"][order(age[TRT=="C"])],
col=3)))
legend("topleft", col=1:3, pch=1, lty=1,
inset=0.02, x.intersp = 1.5, y.intersp = 1.8,
legend=c("Trt A", "Trt B", "Trt C"))
plot(x=fitted(model.1), y=rstudent(model.1),
panel.last = abline(h=0, lty=2))
qqnorm(residuals(model.1), main="", datax=TRUE)
qqline(residuals(model.1), datax=TRUE)
library(nortest)
ad.test(residuals(model.1)) # A = 0.4057, p-value = 0.3345
anova(model.1)
# Df Sum Sq Mean Sq F value Pr(>F)
# age 1 3424.4 3424.4 222.2946 2.059e-15 ***
# x2 1 803.8 803.8 52.1784 4.857e-08 ***
# x3 1 1.2 1.2 0.0772 0.7831
# age.x2 1 375.0 375.0 24.3430 2.808e-05 ***
# age.x3 1 328.4 328.4 21.3194 6.850e-05 ***
# Residuals 30 462.1 15.4
model.2 <- lm(y ~ age)
anova(model.2, model.1)
# Model 1: y ~ age
# Model 2: y ~ age + x2 + x3 + age.x2 + age.x3
# Res.Df RSS Df Sum of Sq F Pr(>F)
# 1 34 1970.57
# 2 30 462.15 4 1508.4 24.48 4.458e-09 ***
model.3 <- lm(y ~ age + x2 + x3)
anova(model.3, model.1)
# Model 1: y ~ age + x2 + x3
# Model 2: y ~ age + x2 + x3 + age.x2 + age.x3
# Res.Df RSS Df Sum of Sq F Pr(>F)
# 1 32 1165.57
# 2 30 462.15 2 703.43 22.831 9.41e-07 ***
detach(depression)
Real estate air conditioning (2-level categorical predictor, interaction model, transformations)
- Load the realestate data.
- Create an interaction variable and fit a multiple linear regression model of SalePrice on SqFeet + Air + SqFeet.Air.
- Display scatterplot of SalePrice vs SqFeet with points marked by Air and add non-parallel regression lines representing Air=0 and Air=1.
- Display residual plot with fitted (predicted) values on the horizontal axis.
- Create log(SalePrice), log(SqFeet), and log(SqFeet).Air variables and fit a multiple linear regression model of log(SalePrice) on log(SqFeet) + Air + log(SqFeet).Air.
- Display scatterplot of log(SalePrice) vs log(SqFeet) with points marked by Air and add non-parallel regression lines representing Air=0 and Air=1.
- Display residual plot with fitted (predicted) values on the horizontal axis.
realestate <- read.table("~/path-to-folder/realestate.txt", header=T)
attach(realestate)
SqFeet.Air <- SqFeet*Air
model.1 <- lm(SalePrice ~ SqFeet + Air + SqFeet.Air)
summary(model.1)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) -3.218 30.085 -0.107 0.914871
# SqFeet 104.902 15.748 6.661 6.96e-11 ***
# Air -78.868 32.663 -2.415 0.016100 *
# SqFeet.Air 55.888 16.580 3.371 0.000805 ***
plot(x=SqFeet, y=SalePrice,
col=ifelse(Air, "red", "blue"),
panel.last = c(lines(sort(SqFeet[Air==0]),
fitted(model.1)[Air==0][order(SqFeet[Air==0])],
col="blue"),
lines(sort(SqFeet[Air==1]),
fitted(model.1)[Air==1][order(SqFeet[Air==1])],
col="red")))
legend("topleft", col=c("blue", "red"), pch=1, lty=1, inset=0.02,
legend=c("No air conditioning", "Air conditioning"))
plot(x=fitted(model.1), y=residuals(model.1),
xlab="Fitted values", ylab="Residuals",
panel.last = abline(h=0, lty=2))
lnSalePrice <- log(SalePrice)
lnSqFeet <- log(SqFeet)
lnSqFeet.Air <- lnSqFeet*Air
model.2 <- lm(lnSalePrice ~ lnSqFeet + Air + lnSqFeet.Air)
plot(x=lnSqFeet, y=lnSalePrice,
col=ifelse(Air, "red", "blue"),
panel.last = c(lines(sort(lnSqFeet[Air==0]),
fitted(model.2)[Air==0][order(lnSqFeet[Air==0])],
col="blue"),
lines(sort(lnSqFeet[Air==1]),
fitted(model.2)[Air==1][order(lnSqFeet[Air==1])],
col="red")))
legend("topleft", col=c("blue", "red"), pch=1, lty=1, inset=0.02,
legend=c("No air conditioning", "Air conditioning"))
plot(x=fitted(model.2), y=residuals(model.2),
xlab="Fitted values", ylab="Residuals",
panel.last = abline(h=0, lty=2))
detach(realestate)
Hospital infection risk (4-level categorical predictor, additive model)
- Load the infectionrisk data and select observations with Stay <= 14.
- Create indicator variables for regions.
- Fit a multiple linear regression model of InfctRsk on Stay + Xray + i2 + i3 + i4.
- Conduct an F-test to see if at least one of i2, i3, and i4 are useful (conclusion: the regression functions differ by region).
- Conduct an F-test to see if at least one of i2 and i3 are useful (conclusion: only the west region differs).
infectionrisk <- read.table("~/path-to-folder/infectionrisk.txt", header=T)
infectionrisk <- infectionrisk[infectionrisk$Stay<=14,]
attach(infectionrisk)
i1 <- ifelse(Region==1,1,0) # NE
i2 <- ifelse(Region==2,1,0) # NC
i3 <- ifelse(Region==3,1,0) # S
i4 <- ifelse(Region==4,1,0) # W
model.1 <- lm(InfctRsk ~ Stay + Xray + i2 + i3 + i4)
summary(model.1)
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) -2.134259 0.877347 -2.433 0.01668 *
# Stay 0.505394 0.081455 6.205 1.11e-08 ***
# Xray 0.017587 0.005649 3.113 0.00238 **
# i2 0.171284 0.281475 0.609 0.54416
# i3 0.095461 0.288852 0.330 0.74169
# i4 1.057835 0.378077 2.798 0.00612 **
# ---
# Residual standard error: 1.036 on 105 degrees of freedom
# Multiple R-squared: 0.4198, Adjusted R-squared: 0.3922
# F-statistic: 15.19 on 5 and 105 DF, p-value: 3.243e-11
model.2 <- lm(InfctRsk ~ Stay + Xray)
anova(model.2, model.1)
# Res.Df RSS Df Sum of Sq F Pr(>F)
# 1 108 123.56
# 2 105 112.71 3 10.849 3.3687 0.02135 *
model.3 <- lm(InfctRsk ~ Stay + Xray + i4)
anova(model.3, model.1)
# Res.Df RSS Df Sum of Sq F Pr(>F)
# 1 107 113.11
# 2 105 112.71 2 0.39949 0.1861 0.8305
detach(infectionrisk)