> > ##### World Cup Soccer Data 'soccer.txt" ###### > > ## read the datafile into R > ## header=TRUE will take the first row of the data as the label for the columns > soccer<-read.table("soccer2002.txt", header=TRUE) > soccer Goals Freq 1 0 23 2 1 37 3 2 20 4 3 11 5 4 2 6 5 1 7 6 0 8 7 0 9 8 1 > > ## compute the sample size > sampleN=sum(soccer$Freq) > sampleN [1] 95 > > ## compute the sample mean > ## this is also the MLE of lambda > smean=(1/sampleN)*(sum(soccer$Goals*soccer$Freq)) > smean [1] 1.378947 > > ## compute the Poisson probability for X=1, P(X=1) > dpois(1, lambda=smean) [1] 0.3472789 > > ## Poisson probabilities for X=0, 1, ..., 8 with LAMBDA=smean > ## these are estimates of cell probabilities > pihat<-dpois(0:8, lambda=smean) > pihat [1] 2.518435e-01 3.472789e-01 2.394397e-01 1.100582e-01 3.794113e-02 1.046376e-02 2.404830e-03 4.737335e-04 [9] 8.165669e-05 > > > ## compute expected frequencies for each cell j, E(Xj)=n pi_j > efreq<-sampleN*pihat > efreq [1] 23.925133642 32.991500074 22.746771104 10.455533385 3.604407562 0.994057664 0.228458867 0.045004679 [9] 0.007757385 > > ## plot observed and expected frequencies versus the number of goals scored, but save direclty into a file "soccerCounts.pdf"### > ## if you do NOT want to save into the file direclty, remove the first and the last line of this section of the code #### > pdf("soccerCounts.pdf") > plot(soccer$Goals, soccer$Freq, type="p",col="red", xlab="goals scored",ylab="frequencies",main="World Cup Soccer data") > points(soccer$Goals, efreq, col="blue", pch=22) > legend(list(x=4,y=30), legend=c("Observed count", "Expeced count"), pch=21:22, col=c("red", "blue")) > dev.off() null device 1 > > ########### Goodness of Fit ############################### > ### (I) Goodness of fit ignoring LAMBDA estimate #### > > ## this tests if the observed values fit a model that all cells are equally likley > ex<-chisq.test(soccer$Freq) > ex Chi-squared test for given probabilities data: soccer$Freq X-squared = 134.7368, df = 8, p-value < 2.2e-16 > ex$statistic ##gives the Chi-Squared test statistic only X-squared 134.7368 > ex$p.value ## gives the pvalue of the above statistic only [1] 2.944172e-25 > ex$residuals ## gives the Pearson residuals [1] 3.8303192 8.1394283 2.9069387 0.1367971 -2.6333444 -2.9411379 -3.2489314 -3.2489314 -2.9411379 > > out<-round(cbind(0:8, ex$observed, ex$expected, ex$residuals),3) > out<-as.data.frame(out) > names(out)<-c("goals", "O_j", "E_j", "res_j") > out goals O_j E_j res_j 1 0 23 10.556 3.830 2 1 37 10.556 8.139 3 2 20 10.556 2.907 4 3 11 10.556 0.137 5 4 2 10.556 -2.633 6 5 1 10.556 -2.941 7 6 0 10.556 -3.249 8 7 0 10.556 -3.249 9 8 1 10.556 -2.941 > > ## to get G2, but the problem is devision by zero > > G2<-2*sum(ex$observed*log(ex$observed/ex$expected)) > G2 [1] NaN > 1-pchisq(G2,8) [1] NaN > > ## deviance residuals > devres=sqrt(abs(2*ex$observed*log(ex$observed/ex$expected)))*sign(ex$observed-ex$expected) > > ##one solution add a small count such as 1/2 to each cell > ex1=chisq.test(soccer$Freq+0.5) > ex1 Chi-squared test for given probabilities data: soccer$Freq + 0.5 X-squared = 128.6432, df = 8, p-value < 2.2e-16 > G2<-2*sum(ex1$observed*log(ex1$observed/ex1$expected)) > G2 [1] 127.6603 > 1-pchisq(G2,8) [1] 0 > > ##deviance residuals > devres=sqrt(abs(2*ex1$observed*log(ex1$observed/ex1$expected)))*sign(ex1$observed-ex1$expected) > > ## another (less common) solutions is to do the calculations with all values greater than zero for G2 > ## but that fixes the zero cell counts to so we are estimating two less paramaters in this example > ## thus the degrees of freedom should be adjusted > ##option na.rm=TRUE removes all zero values from the calculation > > G2<-2*sum(ex$observed*log(ex$observed/ex$expected), na.rm=TRUE) > G2 [1] 139.0323 > 1-pchisq(G2,6) [1] 0 > > > > ### (II) Goodness of fit using LAMBDA estimate #### > ## This tests if the observed values fit the Poisson model with estimated/given LAMBDA ### > > ex2<-chisq.test(soccer$Freq, p=pihat, rescale.p=TRUE) Warning message: In chisq.test(soccer$Freq, p = pihat, rescale.p = TRUE) : Chi-squared approximation may be incorrect > > ##Notice the warningn msg. That is because there are some expected probabilities which are very small, e.g. less than 0.2; see zero cell counts. ## > > ex2 Chi-squared test for given probabilities data: soccer$Freq X-squared = 128.7858, df = 8, p-value < 2.2e-16 > ex2$statistic X-squared 128.7858 > ex2$p.value [1] 5.049692e-24 > ex2$residuals [1] -0.18920680 0.69779193 -0.57598546 0.16833493 -0.84510124 0.00594559 -0.47797717 -0.21214460 [9] 11.26566922 > > > ######## What's the problem here??? ###### > ## DFs (9-1) are not correct, since we first estimated lambda and then pihats > ## DFs=(9-1)-1=7 > > 1-pchisq(ex2$statistic,7) X-squared 0 > > out<-round(cbind(0:8, ex2$observed, ex2$expected, ex2$residuals),3) > out<-as.data.frame(out) > names(out)<-c("goals", "O_j", "E_j", "res_j") > out ##notice that the model actually fits well except for the last cell with a very large residual goals O_j E_j res_j 1 0 23 23.925 -0.189 2 1 37 32.992 0.698 3 2 20 22.747 -0.576 4 3 11 10.456 0.168 5 4 2 3.604 -0.845 6 5 1 0.994 0.006 7 6 0 0.228 -0.478 8 7 0 0.045 -0.212 9 8 1 0.008 11.266 > > > > ##### World Cup Soccer Data 'soccer.txt" ###### > > ## read the datafile into R > ## header=TRUE will take the first row of the data as the label for the columns > soccer<-read.table("soccer2002.txt", header=TRUE) > soccer Goals Freq 1 0 23 2 1 37 3 2 20 4 3 11 5 4 2 6 5 1 7 6 0 8 7 0 9 8 1 > > ## compute the sample size > sampleN=sum(soccer$Freq) > sampleN [1] 95 > > ## compute the sample mean > ## this is also the MLE of lambda > smean=(1/sampleN)*(sum(soccer$Goals*soccer$Freq)) > smean [1] 1.378947 > > ## compute the Poisson probability for X=1, P(X=1) > dpois(1, lambda=smean) [1] 0.3472789 > > ## Poisson probabilities for X=0, 1, ..., 8 with LAMBDA=smean > ## these are estimates of cell probabilities > pihat<-dpois(0:8, lambda=smean) > pihat [1] 2.518435e-01 3.472789e-01 2.394397e-01 1.100582e-01 3.794113e-02 1.046376e-02 2.404830e-03 4.737335e-04 [9] 8.165669e-05 > > > ## compute expected frequencies for each cell j, E(Xj)=n pi_j > efreq<-sampleN*pihat > efreq [1] 23.925133642 32.991500074 22.746771104 10.455533385 3.604407562 0.994057664 0.228458867 0.045004679 [9] 0.007757385 > > ## plot observed and expected frequencies versus the number of goals scored, but save direclty into a file "soccerCounts.pdf"### > ## if you do NOT want to save into the file direclty, remove the first and the last line of this section of the code #### > pdf("soccerCounts.pdf") > plot(soccer$Goals, soccer$Freq, type="p",col="red", xlab="goals scored",ylab="frequencies",main="World Cup Soccer data") > points(soccer$Goals, efreq, col="blue", pch=22) > legend(list(x=4,y=30), legend=c("Observed count", "Expeced count"), pch=21:22, col=c("red", "blue")) > dev.off() null device 1 > > ########### Goodness of Fit ############################### > ### (I) Goodness of fit ignoring LAMBDA estimate #### > > ## this tests if the observed values fit a model that all cells are equally likley > ex<-chisq.test(soccer$Freq) > ex Chi-squared test for given probabilities data: soccer$Freq X-squared = 134.7368, df = 8, p-value < 2.2e-16 > ex$statistic ##gives the Chi-Squared test statistic only X-squared 134.7368 > ex$p.value ## gives the pvalue of the above statistic only [1] 2.944172e-25 > ex$residuals ## gives the Pearson residuals [1] 3.8303192 8.1394283 2.9069387 0.1367971 -2.6333444 -2.9411379 -3.2489314 -3.2489314 -2.9411379 > > out<-round(cbind(0:8, ex$observed, ex$expected, ex$residuals),3) > out<-as.data.frame(out) > names(out)<-c("goals", "O_j", "E_j", "res_j") > out goals O_j E_j res_j 1 0 23 10.556 3.830 2 1 37 10.556 8.139 3 2 20 10.556 2.907 4 3 11 10.556 0.137 5 4 2 10.556 -2.633 6 5 1 10.556 -2.941 7 6 0 10.556 -3.249 8 7 0 10.556 -3.249 9 8 1 10.556 -2.941 > > ## to get G2, but the problem is devision by zero > > G2<-2*sum(ex$observed*log(ex$observed/ex$expected)) > G2 [1] NaN > 1-pchisq(G2,8) [1] NaN > > ## deviance residuals > devres=sqrt(abs(2*ex$observed*log(ex$observed/ex$expected)))*sign(ex$observed-ex$expected) > devres [1] 5.9855432 9.6340881 5.0560062 0.9525466 -2.5795388 -2.1710147 NaN NaN -2.1710147 > > ##one solution add a small count such as 1/2 to each cell > ex1=chisq.test(soccer$Freq+0.5) > ex1 Chi-squared test for given probabilities data: soccer$Freq + 0.5 X-squared = 128.6432, df = 8, p-value < 2.2e-16 > G2<-2*sum(ex1$observed*log(ex1$observed/ex1$expected)) > G2 [1] 127.6603 > 1-pchisq(G2,8) [1] 0 > > ##deviance residuals > devres=sqrt(abs(2*ex1$observed*log(ex1$observed/ex1$expected)))*sign(ex1$observed-ex1$expected) > devres [1] 5.9532483 9.5710809 5.0316165 0.9521141 -2.7263917 -2.4479387 -1.7595682 -1.7595682 -2.4479387 > > ## another (less common) solutions is to do the calculations with all values greater than zero for G2 > ## but that fixes the zero cell counts to so we are estimating two less paramaters in this example > ## thus the degrees of freedom should be adjusted > ##option na.rm=TRUE removes all zero values from the calculation > > G2<-2*sum(ex$observed*log(ex$observed/ex$expected), na.rm=TRUE) > G2 [1] 139.0323 > 1-pchisq(G2,6) [1] 0 > > > > ### (II) Goodness of fit using LAMBDA estimate #### > ## This tests if the observed values fit the Poisson model with estimated/given LAMBDA ### > > ex2<-chisq.test(soccer$Freq, p=pihat, rescale.p=TRUE) Warning message: In chisq.test(soccer$Freq, p = pihat, rescale.p = TRUE) : Chi-squared approximation may be incorrect > > ##Notice the warningn msg. That is because there are some expected probabilities which are very small, e.g. less than 0.2; see zero cell counts. ## > > ex2 Chi-squared test for given probabilities data: soccer$Freq X-squared = 128.7858, df = 8, p-value < 2.2e-16 > ex2$statistic X-squared 128.7858 > ex2$p.value [1] 5.049692e-24 > ex2$residuals [1] -0.18920680 0.69779193 -0.57598546 0.16833493 -0.84510124 0.00594559 -0.47797717 -0.21214460 [9] 11.26566922 > > > ######## What's the problem here??? ###### > ## DFs (9-1) are not correct, since we first estimated lambda and then pihats > ## DFs=(9-1)-1=7 > > 1-pchisq(ex2$statistic,7) X-squared 0 > > out<-round(cbind(0:8, ex2$observed, ex2$expected, ex2$residuals),3) > out<-as.data.frame(out) > names(out)<-c("goals", "O_j", "E_j", "res_j") > out ##notice that the model actually fits well except for the last cell with a very large residual goals O_j E_j res_j 1 0 23 23.925 -0.189 2 1 37 32.992 0.698 3 2 20 22.747 -0.576 4 3 11 10.456 0.168 5 4 2 3.604 -0.845 6 5 1 0.994 0.006 7 6 0 0.228 -0.478 8 7 0 0.045 -0.212 9 8 1 0.008 11.266