#### R code for Crab example
####Poisson Regression Model for Count Data

crab=read.table("crab.txt")
colnames(crab)=c("Obs","C","S","W","Wt","Sa")

#### to remove the column labeled "Obs"

crab=crab[,-1]

#### the following code corresponds to crab.SAS-crab1.SAS

#### Fitting the intercept only model 
#### This model implies the expected number of satellites 
#### per each crab is the same
#### in this case: E(Sa)=2.919=exp(1.0713)

model=glm(crab$Sa~1, family=poisson(link=log))
summary(model)
model$fitted

#### Poisson Regression of Sa on W

model=glm(crab$Sa~1+crab$W,family=poisson(link=log))
summary(model)
anova(model)

#### to get the predicted count for each observation: 
#### e.g. for the first observation E(y1)=3.810

print=data.frame(crab,pred=model$fitted)
print

#### note the linear predictor values
#### e.g., for the first observation, exp(1.3378)=3.810

model$linear.predictors
exp(model$linear.predictors)

rstandard(model)

#### Interpretation of the slope which is statistically significant here
#### e.g., exp(0.1640)=1.18
#### as the width increases by one unit, the number of satilies will 
#### increase (positive sign of the coef), 
#### it will be multiplied by 1.18
#### e.g., for W=26 and W=25, first for all values
#### then for specific rows

model$fitted[crab$W==26]/model$fitted[crab$W==25]

model$fitted[2]/model$fitted[6]

#### Based on the residual deviance the model does NOT fit well
#### e.g., 567.88/171 = 3.3209

1-pchisq(model$deviance, model$df.residual)

#### creating a scatter plot of Sa vs. W

plot(crab$W,crab$Sa)
identify(crab$W, crab$Sa) 

#### click on the plot to identify individual values
#### identified on the screen and the plot, \#48,101,165

#### Diagnostics measures (like in logistic regression)
#### But these work for ungrouped data too, 
#### as long as there is a variable with counts
#### You can do many more but here are a few indicating a lack of fit

influence(model)
plot(influence(model)$pear.res)
plot(model$linear.predictors, residuals(model, type="pearson"))

#### To predict a new value

newdt=data.frame(W=26.3)
predict.glm(model, type="response", newdata=newdt)


#### Let's assume for now that we do not have other covariates
#### and we will adjust for overdispersion 
#### first look at the sample mean and variances 
## e.g., tapply(crab$Sa, crab$W,function(x)c(mean=mean(x),variance=var(x)))

#### To estimate dispersion parameter
#### Do it on your own by X2/df & use summary.glm() (see log. reg notes)
#### Use DISMOD package (see log.reg notes)
#### Use quasipoisson family 

model.disp=glm(crab$Sa~crab$W, family=quasipoisson(link=log), data=crab)
summary.glm(model.disp)
summary.glm(model.disp)$dispersion


#### Fit a negative binomial model
#### the dispersion parameter is THETA
#### Here are two different ways, both must use library(MASS) 
#### In the first one, we fix theta to be 1.0
#### In the second one we let glm.nb() to estimate it

library(MASS)
nb.fit=glm(Sa~W, data=crab, family=negative.binomial(theta=1, link="identity"), start=model$coef)
summary(nb.fit)

nb.fit1=glm.nb(Sa~W, data=crab, init.theta=1, link=identity, start=model$coef)
summary(nb.fit1)

#### Adding a categorical predictor
#### This corresponds with crab2.SAS
#### make sure C is a factor

is.factor(crab$C)
crab$C=as.factor(crab$C)
model=glm(Sa~W+C,family=poisson(link=log), data=crab)
summary(model)
anova(model)
print=data.frame(crab,pred=model$fitted)
print

#### to get the same order as you do in SAS

contrasts(crab$C)=contr.SAS(levels(crab$C))
model=glm(Sa~W+C,family=poisson(link=log),data=crab)
summary(model)
anova(model)

#### to get back to the default level

contrasts(crab$C)=contr.treatment(levels(crab$C),base=1)

#### Or you can explicilty code the levels to correspond to SAS
#### Notice that change from C1 to C4 in a number of SA is significant 
#### exp(0.447)=1.54, pvalue=0.0324
#### but if you adjust for overdispersion it's not significant

Sa=crab$Sa
W=crab$W
C1=1*(crab$C==1)
C2=1*(crab$C==2)
C3=1*(crab$C==3)
model=glm(Sa~W+C1+C2+C3,family=poisson(link=log))
summary(model)
anova(model)
print=data.frame(crab,pred=model$fitted)
print

plot(crab$W, model$fitted)

model.disp=glm(Sa~W+C1+C2+C3, family=quasipoisson(link=log))
summary.glm(model.disp)
summary.glm(model.disp)$dispersion
anova(model.disp)

#### Treat Color as a numeric predictor 
#### This corresponds to crab3.SAS

Sa=crab$Sa
W=crab$W
C=as.numeric(crab$C)
model=glm(Sa~W+C,family=poisson(link=log))
summary(model)
anova(model)
print=data.frame(crab,pred=model$fitted)
print

newdt=data.frame(W=0,C=1)
predict.glm(model, type="response", newdata=newdt)

#### This corresponds to to crab5.SAS

width=c(22.69,23.84,24.77, 25.84,26.79,27.74,28.67,30.41)
cases=c(14,14,28,39,22,24,18,14)
SaTotal=c(14,20,67,105,63,93,71,72)
lcases=log(cases)
CrabGrp=data.frame(width,cases,SaTotal,lcases)
model=glm(SaTotal~width,offset=lcases,family=poisson(link=log))
residuals(model)
summary(model)
anova(model)
print=data.frame(CrabGrp,pred=model$fitted)
print

#### create a plot of the results

plot(SaTotal, pch="o", col="blue", main="Plot of Observed and Predicted Sa vs. groups", 
     xlab="Width groups", ylab="Number of Satellites")
points(model$fitted, pch="p", col="red")
legend(6,30,c("obs","pred"), pch=c("o","p"), col=c("blue","red"))
 
model=glm(SaTotal~width,offset=lcases,family=poisson(link=identity))
residuals(model)

#### if you wanted to aggregate from the original data
#### create W as a factor variable with 8 levels

W.fac=cut(crab$W, breaks=c(0,seq(23.25, 29.25),Inf))
numcases=table(W.fac)

#### now compute sample means for Width variable by the cuts
#### and total number of Sa cases

width=aggregate(crab$W, by=list(W=W.fac),mean)$x
width
SaMean=aggregate(crab$Sa, by=list(W=W.fac),mean)$x
SaMean
plot(width,SaMean)
SaTotal=aggregate(crab$Sa, by=list(W=W.fac),sum)$x
SaTotal
lcases=log(numcases)
lcases