R Help 13: Weighted Least Squares & Logistic Regressions

Help

Galton peas (nonconstant variance and weighted least squares)

Load the galton data.
Fit an ordinary least squares (OLS) simple linear regression model of Progeny vs Parent.
Fit a weighted least squares (WLS) model using weights = \(1/{SD^2}\).
Create a scatterplot of the data with a regression line for each model.

galton <- read.table("~/path-to-data/galton.txt", header=T)
attach(galton)
model.1 <- lm(Progeny ~ Parent)
summary(model.1)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept) 0.127029   0.006993  18.164 9.29e-06 ***
# Parent      0.210000   0.038614   5.438  0.00285 ** 
model.2 <- lm(Progeny ~ Parent, weights=1/SD^2)
summary(model.2)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept) 0.127964   0.006811  18.787 7.87e-06 ***
# Parent      0.204801   0.038155   5.368  0.00302 ** 
plot(x=Parent, y=Progeny, ylim=c(0.158,0.174),
panel.last = c(lines(sort(Parent), fitted(model.1)[order(Parent)], col="blue"),
lines(sort(Parent), fitted(model.2)[order(Parent)], col="red")))
legend("topleft", col=c("blue","red"), lty=1,
inset=0.02, legend=c("OLS", "WLS"))
detach(galton)

Computer-assisted learning (nonconstant variance and weighted least squares)

Load the ca_learning data.
Create a scatterplot of the data.
Fit an OLS model.
Plot the OLS residuals vs num.responses.
Plot the absolute OLS residuals vs num.responses.
Calculate fitted values from a regression of absolute residuals vs num.responses.
Fit a WLS model using weights = \(1/{(\text{fitted values})^2}\).
Create a scatterplot of the data with a regression line for each model.
Plot the WLS standardized residuals vs num.responses.

ca_learning <- read.table("~/path-to-data/ca_learning_new.txt", header=T)
attach(ca_learning)
plot(x=num.responses, y=cost)
model.1 <- lm(cost ~ num.responses)
summary(model.1)
#               Estimate Std. Error t value Pr(>|t|)    
# (Intercept)    19.4727     5.5162   3.530  0.00545 ** 
# num.responses   3.2689     0.3651   8.955 4.33e-06 ***
# ---
# Residual standard error: 4.598 on 10 degrees of freedom
# Multiple R-squared:  0.8891,  Adjusted R-squared:  0.878 
# F-statistic: 80.19 on 1 and 10 DF,  p-value: 4.33e-06
plot(num.responses, residuals(model.1))
plot(num.responses, abs(residuals(model.1)))
wts <- 1/fitted(lm(abs(residuals(model.1)) ~ num.responses))^2
model.2 <- lm(cost ~ num.responses, weights=wts)
summary(model.2)
#               Estimate Std. Error t value Pr(>|t|)    
# (Intercept)    17.3006     4.8277   3.584  0.00498 ** 
# num.responses   3.4211     0.3703   9.238 3.27e-06 ***
# ---
# Residual standard error: 1.159 on 10 degrees of freedom
# Multiple R-squared:  0.8951,  Adjusted R-squared:  0.8846 
# F-statistic: 85.35 on 1 and 10 DF,  p-value: 3.269e-06
plot(x=num.responses, y=cost, ylim=c(50,95),
panel.last = c(lines(sort(num.responses), fitted(model.1)[order(num.responses)], col="blue"),
lines(sort(num.responses), fitted(model.2)[order(num.responses)], col="red")))
legend("topleft", col=c("blue","red"), lty=1,
inset=0.02, legend=c("OLS", "WLS"))
plot(num.responses, rstandard(model.2))
detach(ca_learning)

Market share (nonconstant variance and weighted least squares)

Load the marketshare data.
Fit an OLS model.
Plot the OLS residuals vs fitted values with points marked by Discount.
Use the tapply function to calculate the residual variance for Discount=0 and Discount=1.
Fit a WLS model using weights = 1/variance for Discount=0 and Discount=1.
Plot the WLS standardized residuals vs fitted values.

marketshare <- read.table("~/path-to-data/marketshare.txt", header=T)
attach(marketshare)
model.1 <- lm(MarketShare ~ Price + P1 + P2)
summary(model.1)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)  3.19592    0.35616   8.973 3.00e-10 ***
# Price       -0.33358    0.15226  -2.191   0.0359 *  
# P1           0.30808    0.06412   4.804 3.51e-05 ***
# P2           0.48431    0.05541   8.740 5.49e-10 ***
plot(fitted(model.1), residuals(model.1), col=Discount+1)
vars <- tapply(residuals(model.1), Discount, var)
#          0          1 
# 0.01052324 0.02680546 
wts <- Discount/vars[2] + (1-Discount)/vars[1]
model.2 <- lm(MarketShare ~ Price + P1 + P2, weights=wts)
summary(model.2)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)  3.17437    0.35671   8.899 3.63e-10 ***
# Price       -0.32432    0.15291  -2.121   0.0418 *  
# P1           0.30834    0.06575   4.689 4.89e-05 ***
# P2           0.48419    0.05422   8.930 3.35e-10 ***
plot(fitted(model.2), rstandard(model.2), col=Discount+1)
detach(marketshare)

Home price (nonconstant variance and weighted least squares)

Load the realestate data.
Calculate log transformations of the variables.
Fit an OLS model.
Plot the OLS residuals vs fitted values.
Calculate fitted values from a regression of absolute residuals vs fitted values.
Fit a WLS model using weights = \(1/{(\text{fitted values})^2}\).
Plot the WLS standardized residuals vs fitted values.

realestate <- read.table("~/path-to-data/realestate.txt", header=T)
attach(realestate)
logY <- log(SalePrice)
logX1 <- log(SqFeet)
logX2 <- log(Lot)
model.1 <- lm(logY ~ logX1 + logX2)
summary(model.1)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)  4.25485    0.07353  57.864  < 2e-16 ***
# logX1        1.22141    0.03371  36.234  < 2e-16 ***
# logX2        0.10595    0.02394   4.425 1.18e-05 ***
plot(fitted(model.1), residuals(model.1))
wts <- 1/fitted(lm(abs(residuals(model.1)) ~ fitted(model.1)))^2
model.2 <- lm(logY ~ logX1 + logX2, weights=wts)
summary(model.2)
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)  4.35189    0.06330  68.755  < 2e-16 ***
# logX1        1.20150    0.03332  36.065  < 2e-16 ***
# logX2        0.07924    0.02152   3.682 0.000255 ***
plot(fitted(model.2), rstandard(model.2))
detach(realestate)

Leukemia remission (logistic regression)

Load the leukemia data.
Fit a logistic regression model of REMISS vs CELL + SMEAR + INFIL + LI + BLAST + TEMP.
Calculate 95% confidence intervals for the regression parameters based on asymptotic normality and based on profiling the least-squares estimation surface.
Fit a logistic regression model of REMISS vs LI.
Create a scatterplot of REMISS vs LI and add a fitted line based on the logistic regression model.
Calculate the odds ratio for LI and a 95% confidence interval.
Conduct a likelihood ratio (or deviance) test for LI.
Calculate the sum of squared deviance residuals and the sum of squared Pearson residuals.
Use the hoslem.test function in the ResourceSelection package to conduct the Hosmer-Lemeshow goodness-of-fit test.
Calculate a version of \(R^2\) for logistic regression.
Create residual plots using Pearson and deviance residuals.
Calculate hat values (leverages), studentized residuals, and Cook's distances.

  
leukemia <- read.table("~/path-to-data/leukemia_remission.txt", header=T)
attach(leukemia)
model.1 <- glm(REMISS ~ CELL + SMEAR + INFIL + LI + BLAST + TEMP, family="binomial")
summary(model.1)
#               Estimate Std. Error z value Pr(>|z|)
# (Intercept)   64.25808   74.96480   0.857    0.391
# CELL          30.83006   52.13520   0.591    0.554
# SMEAR         24.68632   61.52601   0.401    0.688
# INFIL        -24.97447   65.28088  -0.383    0.702
# LI             4.36045    2.65798   1.641    0.101
# BLAST         -0.01153    2.26634  -0.005    0.996
# TEMP        -100.17340   77.75289  -1.288    0.198
# 
# (Dispersion parameter for binomial family taken to be 1)
# 
# Null deviance: 34.372  on 26  degrees of freedom
# Residual deviance: 21.594  on 20  degrees of freedom
# AIC: 35.594
confint.default(model.1) # based on asymptotic normality
confint(model.1) # based on profiling the least-squares estimation surface
model.2 <- glm(REMISS ~ LI, family="binomial")
summary(model.2)
#             Estimate Std. Error z value Pr(>|z|)   
# (Intercept)   -3.777      1.379  -2.740  0.00615 **
# LI             2.897      1.187   2.441  0.01464 * 
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# 
# (Dispersion parameter for binomial family taken to be 1)
# 
# Null deviance: 34.372  on 26  degrees of freedom
# Residual deviance: 26.073  on 25  degrees of freedom
# AIC: 30.073
plot(x=LI, y=REMISS,
panel.last = lines(sort(LI), fitted(model.2)[order(LI)]))
exp(coef(model.2)[2]) # odds ratio = 18.12449
exp(confint.default(model.2)[2,]) # 95% CI = (1.770284, 185.561725)
anova(model.2, test="Chisq")
#      Df Deviance Resid. Df Resid. Dev Pr(>Chi)   
# NULL                    26     34.372            
# LI    1   8.2988        25     26.073 0.003967 **
sum(residuals(model.2, type="deviance")^2) # 26.07296
model.2$deviance # 26.07296
sum(residuals(model.2, type="pearson")^2) # 23.93298
library(ResourceSelection)
hoslem.test(model.2$y, fitted(model.2), g=9)
# Hosmer and Lemeshow goodness of fit (GOF) test
# data:  REMISS, fitted(model.2)
# X-squared = 7.3293, df = 7, p-value = 0.3954
1-model.2$deviance/model.2$null.deviance # "R-squared" = 0.2414424
plot(1:27, residuals(model.2, type="pearson"), type="b")
plot(1:27, residuals(model.2, type="deviance"), type="b")
summary(influence.measures(model.2))
#  dfb.1_ dfb.LI dffit   cov.r cook.d hat  
# 8  0.63  -0.83  -0.93_*  0.88  0.58   0.15
hatvalues(model.2)[8] # 0.1498395
residuals(model.2)[8] # -1.944852
rstudent(model.2)[8] # -2.185013
cooks.distance(model.2)[8] # 0.5833219
detach(leukemia)

Disease outbreak (logistic regression)

Load the disease outbreak data.
Create interaction variables.
Fit "full" logistic regression model of Disease vs four predictors and five interactions.
Fit "reduced" logistic regression model of Disease vs four predictors.
Conduct a likelihood ratio (or deviance) test for the five interactions.
Display the analysis of deviance table with sequential deviances.


disease <- read.table("~/path-to-data/DiseaseOutbreak.txt", header=T)
attach(disease)
Age.Middle <- Age*Middle
Age.Lower <- Age*Lower
Age.Sector <- Age*Sector
Middle.Sector <- Middle*Sector
Lower.Sector <- Lower*Sector
model.1 <- glm(Disease ~ Age + Middle + Lower + Sector + Age.Middle + Age.Lower +
Age.Sector + Middle.Sector + Lower.Sector, family="binomial")
model.2 <- glm(Disease ~ Age + Middle + Lower + Sector, family="binomial")
anova(model.2, model.1, test="Chisq")
#   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
# 1        93    101.054                     
# 2        88     93.996  5   7.0583   0.2163
anova(model.1, test="Chisq")
#               Df Deviance Resid. Df Resid. Dev Pr(>Chi)   
# NULL                             97    122.318            
# Age            1   7.4050        96    114.913 0.006504 **
# Middle         1   1.8040        95    113.109 0.179230   
# Lower          1   1.6064        94    111.502 0.205003   
# Sector         1  10.4481        93    101.054 0.001228 **
# Age.Middle     1   4.5697        92     96.484 0.032542 * 
# Age.Lower      1   1.0152        91     95.469 0.313666   
# Age.Sector     1   1.1202        90     94.349 0.289878   
# Middle.Sector  1   0.0001        89     94.349 0.993427   
# Lower.Sector   1   0.3531        88     93.996 0.552339   
detach(disease)

Toxicity and insects (logistic regression using event/trial data format)

Load the toxicity data.
Create a Survivals variable and a matrix with Deaths in one column and Survivals in the other column.
Fit a logistic regression model of Deaths vs Dose.
Calculate 95% confidence intervals for the regression parameters based on asymptotic normality and based on profiling the least-squares estimation surface.
Calculate the odds ratio for Dose and a 95% confidence interval.
Display the observed and fitted probabilities.
Create a scatterplot of observed probabilities vs Dose and add a fitted line based on the logistic regression model.


toxicity <- read.table("~/path-to-data/toxicity.txt", header=T)
attach(toxicity)
Survivals <- SampSize - Deaths
y <- cbind(Deaths, Survivals)
model.1 <- glm(y ~ Dose, family="binomial")
summary(model.1)
#             Estimate Std. Error z value Pr(>|z|)    
# (Intercept) -2.64367    0.15610  -16.93   <2e-16 ***
# Dose         0.67399    0.03911   17.23   <2e-16 ***
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# 
# (Dispersion parameter for binomial family taken to be 1)
# 
# Null deviance: 383.0695  on 5  degrees of freedom
# Residual deviance:   1.4491  on 4  degrees of freedom
# AIC: 39.358
confint.default(model.1) # based on asymptotic normality
confint(model.1) # based on profiling the least-squares estimation surface
exp(coef(model.1)[2]) # odds ratio = 1.962056
exp(confint.default(model.1)[2,]) # 95% CI = (1.817279, 2.118366)
cbind(Dose, SampSize, Deaths, Deaths/SampSize, fitted(model.1))
#   Dose SampSize Deaths                
# 1    1      250     28 0.112 0.1224230
# 2    2      250     53 0.212 0.2148914
# 3    3      250     93 0.372 0.3493957
# 4    4      250    126 0.504 0.5130710
# 5    5      250    172 0.688 0.6739903
# 6    6      250    197 0.788 0.8022286
plot(x=Dose, y=Deaths/SampSize,
panel.last = lines(sort(Dose), fitted(model.1)[order(Dose)]))
detach(toxicity)