# R code for Crab example
# Poisson Regression Model for Count Data

library(rsq)
library(ggplot2)

# https://rdrr.io/cran/rsq/man/hcrabs.html
data(hcrabs)

head(hcrabs)
colnames(hcrabs) = c("C","S","W","Sa","Wt")

write.table(hcrabs, file='crabr.txt', col.names=FALSE)

# plot of satellite count versus width
ggplot(hcrabs, aes(x=W, y=Sa)) + geom_point()

# width as only predictor
model = glm(Sa ~ W, family=poisson(link=log), data=hcrabs)
summary(model)
model$fitted.values
rstandard(model)
anova(model)

# the linear predictor values
model$linear.predictors
exp(model$linear.predictors)

# to get the predicted count for each observation: 
print = data.frame(crab,pred=model$fitted)
print

#### Based on the residual deviance the model does NOT fit well
#### e.g., 567.88/171 = 3.3209

1-pchisq(model$deviance, model$df.residual)

#### Diagnostics measures (like in logistic regression)
#### But these work for ungrouped data too, 
#### as long as there is a variable with counts

influence(model)
plot(influence(model)$pear.res)
plot(model$linear.predictors, residuals(model, type="pearson"))

#### To predict a new value
newdt=data.frame(W=26.3)
predict.glm(model, type="response", newdata=newdt)

#### Let's assume for now that we do not have other covariates
#### and we will adjust for overdispersion 
#### first look at the sample mean and variances 
## e.g., tapply(crab$Sa, crab$W,function(x)c(mean=mean(x),variance=var(x)))

#### To estimate dispersion parameter
model.disp = glm(Sa ~ W, family=quasipoisson(link=log), data=hcrabs)
summary.glm(model.disp)
summary.glm(model.disp)$dispersion

#### Fit a negative binomial model
#### the dispersion parameter is THETA
#### Here are two different ways, both must use library(MASS) 
#### In the first one, we fix theta to be 1.0
#### In the second one we let glm.nb() estimate it

library(MASS)
nb.fit = glm(Sa~W, data=hcrabs, family=negative.binomial(theta=1, link="identity"), start=model$coef)
summary(nb.fit)

nb.fit1=glm.nb(Sa~W, data=crab, init.theta=1, link=identity, start=model$coef)
summary(nb.fit1)

#### Adding a categorical predictor
#### make sure C is a factor

# setting 5th color as baseline
hcrabs$C = factor(hcrabs$C, levels=c('5','2','3','4'))
model = glm(Sa ~ W+C, family=quasipoisson(link=log), data=hcrabs)
summary(model)
anova(model, test='Chisq')

# color as a numeric predictor 
data(hcrabs)
colnames(hcrabs) = c("C","S","W","Sa","Wt")
hcrabs$C = as.numeric(hcrabs$C)+1
model = glm(Sa ~ W+C, family=quasipoisson(link=log), data=hcrabs)
summary(model)
anova(model, test='Chisq')

# converting to 8 predictor groups, based on width intervals

# creating the 8 width groups
width.long = cut(hcrabs$W, breaks=c(0,seq(23.25, 29.25),Inf))

# summarizing per group
Cases = aggregate(hcrabs$W, by=list(W=width.long),length)$x
lcases = log(Cases)
Width = aggregate(hcrabs$W, by=list(W=width.long),mean)$x
AvgSa = aggregate(hcrabs$Sa, by=list(W=width.long),mean)$x
SdSa = aggregate(hcrabs$Sa, by=list(W=width.long),sd)$x
VarSa = aggregate(hcrabs$Sa, by=list(W=width.long),var)$x
SaTotal = aggregate(hcrabs$Sa, by=list(W=width.long),sum)$x

plot(Width, AvgSa)
ggplot(hcrabs.grp, aes(x=Width, y=AvgSa)) + geom_point()
cbind(table(width.long), Width, AvgSa, SdSa, VarSa)

hcrabs.grp = data.frame(Width=Width, lcases=lcases, SaTotal=SaTotal)
model.grp = glm(SaTotal ~ Width, offset=lcases, family=poisson, data=hcrabs.grp)
summary(model.grp)
rstandard(model.grp)
predict.glm(model.grp, type="response", newdata=hcrabs.grp)

# create a plot of the results
plot(SaTotal, pch="o", col="blue", main="Plot of Observed and Predicted Sa vs. groups", 
     xlab="Width groups", ylab="Number of Satellites")
points(model.grp$fitted, pch="p", col="red")
legend(6,30,c("obs","pred"), pch=c("o","p"), col=c("blue","red"))