##### World Cup Soccer Data 'soccer.txt" ######

## read the datafile into R
## header=TRUE will take the first row of the data as the label for the columns
soccer<-read.table("soccer2002.txt", header=TRUE) 
soccer

## compute the sample size 
sampleN=sum(soccer$Freq)
sampleN

## compute the sample mean 
## this is also the MLE of lambda
smean=(1/sampleN)*(sum(soccer$Goals*soccer$Freq))
smean

## compute the Poisson probability for X=1, P(X=1) 
dpois(1, lambda=smean)

## Poisson probabilities for X=0, 1, ..., 8 with LAMBDA=smean 
## these are estimates of cell probabilities 
pihat<-dpois(0:8, lambda=smean)
pihat


## compute expected frequencies for each cell j, E(Xj)=n pi_j
efreq<-sampleN*pihat
efreq

## plot observed and expected frequencies versus the number of goals scored, but save direclty into a file "soccerCounts.pdf"###
## if you do NOT want to save into the file direclty, remove the first and the last line of this section of the code ####
pdf("soccerCounts.pdf")
plot(soccer$Goals, soccer$Freq, type="p",col="red", xlab="goals scored",ylab="frequencies",main="World Cup Soccer data")
points(soccer$Goals, efreq, col="blue", pch=22)
legend(list(x=4,y=30), legend=c("Observed count", "Expeced count"), pch=21:22, col=c("red", "blue"))
dev.off()

########### Goodness of Fit ###############################
### (I) Goodness of fit ignoring LAMBDA estimate ####

## this tests if the observed values fit a model that all cells are equally likley
ex<-chisq.test(soccer$Freq)
ex
ex$statistic ##gives the Chi-Squared test statistic only
ex$p.value ## gives the pvalue of the above statistic only
ex$residuals ## gives the Pearson residuals

out<-round(cbind(0:8, ex$observed, ex$expected, ex$residuals),3)
out<-as.data.frame(out)
names(out)<-c("goals", "O_j", "E_j", "res_j")
out

## to get G2, but the problem is devision by zero
 
G2<-2*sum(ex$observed*log(ex$observed/ex$expected))
G2
1-pchisq(G2,8)

## deviance residuals
devres=sqrt(abs(2*ex$observed*log(ex$observed/ex$expected)))*sign(ex$observed-ex$expected)

##one solution add a small count such as 1/2 to each cell
ex1=chisq.test(soccer$Freq+0.5)
ex1
G2<-2*sum(ex1$observed*log(ex1$observed/ex1$expected))
G2
1-pchisq(G2,8)

##deviance residuals
devres=sqrt(abs(2*ex1$observed*log(ex1$observed/ex1$expected)))*sign(ex1$observed-ex1$expected)

## another (less common) solutions is to do the calculations with all values greater than zero for G2
## but that fixes the zero cell counts to so we are estimating two less paramaters in this example
## thus the degrees of freedom should be adjusted
##option na.rm=TRUE removes all zero values from the calculation

G2<-2*sum(ex$observed*log(ex$observed/ex$expected), na.rm=TRUE)
G2
1-pchisq(G2,6)


### (II) Goodness of fit using LAMBDA estimate ####
## This tests if the observed values fit the Poisson model with estimated/given LAMBDA ###

ex2<-chisq.test(soccer$Freq, p=pihat, rescale.p=TRUE)

##Notice the warningn msg. That is because there are some expected probabilities which are very small, e.g. less than 0.2; see zero cell counts. ##

ex2
ex2$statistic
ex2$p.value 
ex2$residuals


######## What's the problem here??? ###### 
## DFs (9-1) are not correct, since we first estimated lambda and then pihats
## DFs=(9-1)-1=7

1-pchisq(ex2$statistic,7)

out<-round(cbind(0:8, ex2$observed, ex2$expected, ex2$residuals),3)
out<-as.data.frame(out)
names(out)<-c("goals", "O_j", "E_j", "res_j") 
out   ##notice that the model actually fits well except for the last cell with a very large residual


##### NEW G2 to do the calculations with all values greater than zero; option na.rm=TRUE removes all zero values from the calculation

G2<-2*sum(ex2$observed*log(ex2$observed/ex2$expected), na.rm=TRUE)
G2
1-pchisq(G2,7)