# R code for Crab example # Poisson Regression Model for Count Data library(rsq) library(ggplot2) # https://rdrr.io/cran/rsq/man/hcrabs.html data(hcrabs) head(hcrabs) colnames(hcrabs) = c("C","S","W","Sa","Wt") write.table(hcrabs, file='crabr.txt', col.names=FALSE) # plot of satellite count versus width ggplot(hcrabs, aes(x=W, y=Sa)) + geom_point() # width as only predictor model = glm(Sa ~ W, family=poisson(link=log), data=hcrabs) summary(model) model$fitted.values rstandard(model) anova(model) # the linear predictor values model$linear.predictors exp(model$linear.predictors) # to get the predicted count for each observation: print = data.frame(crab,pred=model$fitted) print #### Based on the residual deviance the model does NOT fit well #### e.g., 567.88/171 = 3.3209 1-pchisq(model$deviance, model$df.residual) #### Diagnostics measures (like in logistic regression) #### But these work for ungrouped data too, #### as long as there is a variable with counts influence(model) plot(influence(model)$pear.res) plot(model$linear.predictors, residuals(model, type="pearson")) #### To predict a new value newdt=data.frame(W=26.3) predict.glm(model, type="response", newdata=newdt) #### Let's assume for now that we do not have other covariates #### and we will adjust for overdispersion #### first look at the sample mean and variances ## e.g., tapply(crab$Sa, crab$W,function(x)c(mean=mean(x),variance=var(x))) #### To estimate dispersion parameter model.disp = glm(Sa ~ W, family=quasipoisson(link=log), data=hcrabs) summary.glm(model.disp) summary.glm(model.disp)$dispersion #### Fit a negative binomial model #### the dispersion parameter is THETA #### Here are two different ways, both must use library(MASS) #### In the first one, we fix theta to be 1.0 #### In the second one we let glm.nb() estimate it library(MASS) nb.fit = glm(Sa~W, data=hcrabs, family=negative.binomial(theta=1, link="identity"), start=model$coef) summary(nb.fit) nb.fit1=glm.nb(Sa~W, data=crab, init.theta=1, link=identity, start=model$coef) summary(nb.fit1) #### Adding a categorical predictor #### make sure C is a factor # setting 5th color as baseline hcrabs$C = factor(hcrabs$C, levels=c('5','2','3','4')) model = glm(Sa ~ W+C, family=quasipoisson(link=log), data=hcrabs) summary(model) anova(model, test='Chisq') # color as a numeric predictor data(hcrabs) colnames(hcrabs) = c("C","S","W","Sa","Wt") hcrabs$C = as.numeric(hcrabs$C)+1 model = glm(Sa ~ W+C, family=quasipoisson(link=log), data=hcrabs) summary(model) anova(model, test='Chisq') # converting to 8 predictor groups, based on width intervals # creating the 8 width groups width.long = cut(hcrabs$W, breaks=c(0,seq(23.25, 29.25),Inf)) # summarizing per group Cases = aggregate(hcrabs$W, by=list(W=width.long),length)$x lcases = log(Cases) Width = aggregate(hcrabs$W, by=list(W=width.long),mean)$x AvgSa = aggregate(hcrabs$Sa, by=list(W=width.long),mean)$x SdSa = aggregate(hcrabs$Sa, by=list(W=width.long),sd)$x VarSa = aggregate(hcrabs$Sa, by=list(W=width.long),var)$x SaTotal = aggregate(hcrabs$Sa, by=list(W=width.long),sum)$x plot(Width, AvgSa) ggplot(hcrabs.grp, aes(x=Width, y=AvgSa)) + geom_point() cbind(table(width.long), Width, AvgSa, SdSa, VarSa) hcrabs.grp = data.frame(Width=Width, lcases=lcases, SaTotal=SaTotal) model.grp = glm(SaTotal ~ Width, offset=lcases, family=poisson, data=hcrabs.grp) summary(model.grp) rstandard(model.grp) predict.glm(model.grp, type="response", newdata=hcrabs.grp) # create a plot of the results plot(SaTotal, pch="o", col="blue", main="Plot of Observed and Predicted Sa vs. groups", xlab="Width groups", ylab="Number of Satellites") points(model.grp$fitted, pch="p", col="red") legend(6,30,c("obs","pred"), pch=c("o","p"), col=c("blue","red"))