##### World Cup Soccer Data 'soccer.txt" ###### ## read the datafile into R ## header=TRUE will take the first row of the data as the label for the columns soccer<-read.table("soccer2002.txt", header=TRUE) soccer ## compute the sample size sampleN=sum(soccer\$Freq) sampleN ## compute the sample mean ## this is also the MLE of lambda smean=(1/sampleN)*(sum(soccer\$Goals*soccer\$Freq)) smean ## compute the Poisson probability for X=1, P(X=1) dpois(1, lambda=smean) ## Poisson probabilities for X=0, 1, ..., 8 with LAMBDA=smean ## these are estimates of cell probabilities pihat<-dpois(0:8, lambda=smean) pihat ## compute expected frequencies for each cell j, E(Xj)=n pi_j efreq<-sampleN*pihat efreq ## plot observed and expected frequencies versus the number of goals scored, but save direclty into a file "soccerCounts.pdf"### ## if you do NOT want to save into the file direclty, remove the first and the last line of this section of the code #### pdf("soccerCounts.pdf") plot(soccer\$Goals, soccer\$Freq, type="p",col="red", xlab="goals scored",ylab="frequencies",main="World Cup Soccer data") points(soccer\$Goals, efreq, col="blue", pch=22) legend(list(x=4,y=30), legend=c("Observed count", "Expeced count"), pch=21:22, col=c("red", "blue")) dev.off() ########### Goodness of Fit ############################### ### (I) Goodness of fit ignoring LAMBDA estimate #### ## this tests if the observed values fit a model that all cells are equally likley ex<-chisq.test(soccer\$Freq) ex ex\$statistic ##gives the Chi-Squared test statistic only ex\$p.value ## gives the pvalue of the above statistic only ex\$residuals ## gives the Pearson residuals out<-round(cbind(0:8, ex\$observed, ex\$expected, ex\$residuals),3) out<-as.data.frame(out) names(out)<-c("goals", "O_j", "E_j", "res_j") out ## to get G2, but the problem is devision by zero G2<-2*sum(ex\$observed*log(ex\$observed/ex\$expected)) G2 1-pchisq(G2,8) ## deviance residuals devres=sqrt(abs(2*ex\$observed*log(ex\$observed/ex\$expected)))*sign(ex\$observed-ex\$expected) ##one solution add a small count such as 1/2 to each cell ex1=chisq.test(soccer\$Freq+0.5) ex1 G2<-2*sum(ex1\$observed*log(ex1\$observed/ex1\$expected)) G2 1-pchisq(G2,8) ##deviance residuals devres=sqrt(abs(2*ex1\$observed*log(ex1\$observed/ex1\$expected)))*sign(ex1\$observed-ex1\$expected) ## another (less common) solutions is to do the calculations with all values greater than zero for G2 ## but that fixes the zero cell counts to so we are estimating two less paramaters in this example ## thus the degrees of freedom should be adjusted ##option na.rm=TRUE removes all zero values from the calculation G2<-2*sum(ex\$observed*log(ex\$observed/ex\$expected), na.rm=TRUE) G2 1-pchisq(G2,6) ### (II) Goodness of fit using LAMBDA estimate #### ## This tests if the observed values fit the Poisson model with estimated/given LAMBDA ### ex2<-chisq.test(soccer\$Freq, p=pihat, rescale.p=TRUE) ##Notice the warningn msg. That is because there are some expected probabilities which are very small, e.g. less than 0.2; see zero cell counts. ## ex2 ex2\$statistic ex2\$p.value ex2\$residuals ######## What's the problem here??? ###### ## DFs (9-1) are not correct, since we first estimated lambda and then pihats ## DFs=(9-1)-1=7 1-pchisq(ex2\$statistic,7) out<-round(cbind(0:8, ex2\$observed, ex2\$expected, ex2\$residuals),3) out<-as.data.frame(out) names(out)<-c("goals", "O_j", "E_j", "res_j") out ##notice that the model actually fits well except for the last cell with a very large residual ##### NEW G2 to do the calculations with all values greater than zero; option na.rm=TRUE removes all zero values from the calculation G2<-2*sum(ex2\$observed*log(ex2\$observed/ex2\$expected), na.rm=TRUE) G2 1-pchisq(G2,7)