/* Program taken from the UCLA Computing website 
   http://www.ats.ucla.edu/stat/sas/dae/poissonreg.htm
   and slightly modified by me.  */

/*  The data constitute attendance data on 316 high school juniors from two urban high schools
    in the file poissonreg.csv.  The response variable of interest is days absent, daysabs.
    The variables math and langarts give the standardized test scores for math and language
    arts, respectively.  The variable male is a binary indicator of student gender.  */  
 
/* Read the attendance data from the comma-separated file, skipping the header
   row (firstobs=2).  All seven fields are read as numeric variables.
   DSD keeps an empty field (two consecutive commas) as a missing value instead
   of collapsing the delimiters and shifting the later columns; TRUNCOVER stops
   a short record from pulling its remaining values from the next line.  */
data poissonreg;
  infile "c:\data\classdat\poisson\poissonreg.csv" dsd delimiter=',' firstobs=2 truncover;
  input id school male math langarts daysatt daysabs;
run;

/* Descriptive statistics (mean, standard deviation, minimum, maximum and
   variance) for the response and the three candidate predictors.  */
proc means data=poissonreg
           mean std min max var;
   var daysabs math langarts male;
run;

/* Histogram of days absent with one bar per integer value from 0 to 50.
   NOPRINT suppresses the tabular output so that only the histogram is produced.
   The vertical-axis option is VSCALE= (the original had the misspelling "vscle",
   which is not a valid HISTOGRAM option); vscale = count puts observation counts
   on the vertical axis.  */
proc univariate data = poissonreg noprint;
  histogram daysabs / midpoints = 0 to 50 by 1 vscale = count;
run;

/* One-way frequency table of the binary gender indicator male.  */
proc freq data=poissonreg;
   tables male;
run;

/* Here we assume the count data implies that the mean and variance of the counts
   are the same and thus use the poisson distribution  (equidispersion).  Of course
   this should be formally tested.  */

/* Poisson regression of days absent on gender and the two test scores.  */
proc genmod data=poissonreg;
  model daysabs = male math langarts
        / dist=poisson;
run;

/*  Here we apply the negative binomial distribution.  We would use this one if
    we have the overdispersion case where the variance of the counts is greater
    than the mean of the counts.  */  

/* Same predictors as the Poisson fit, but with the negative binomial
   distribution requested for the response.  */
proc genmod data=poissonreg;
  model daysabs = male math langarts
        / dist=negbin;
run;

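/*  In the case that there are more zeros than would be expected by either a
    Poisson model or negative binomial model, we could use the Zero-inflated
    Regression Model which unfortunately is not supported by Proc Genmod (at least
    in the SAS release this program was written for; newer releases appear to offer
    dist=zip together with a zeromodel statement -- check your version).  */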

/* Just to be on the safe side, let's rerun proc genmod and the poisson method with
   the "repeated" statement in order to obtain robust standard errors for the 
   Poisson regression coefficients.  */

/* Poisson regression on all three predictors, refitted with a REPEATED
   statement (id as the subject, type=cs working correlation) so that robust
   standard errors are reported.  id must appear in CLASS to be used as the
   subject effect.  */
proc genmod data=poissonreg;
  class id;
  model daysabs = male math langarts
        / dist=poisson;
  repeated subject=id
        / type=cs;
run;

/* The robust standard errors attempt to adjust for heterogeneity in the model.  Using
   the robust standard errors has resulted in a fairly large change in the standard errors,
   which should be more appropriate.  The z-tests still yield similar significant
   results, but give more realistic p-values.  */

/*  The variable math was border-line significant without the "repeated" statement
    and is clearly not significant with it.  Since math is not significant in the model
    with robust standard errors, we will rerun the model dropping that variable.  */

/* Reduced Poisson model: math is dropped, leaving gender and language arts
   score as predictors.  The REPEATED statement (subject=id, type=cs) is kept
   so robust standard errors are still reported.  */
proc genmod data=poissonreg;
  class id;
  model daysabs = male langarts
        / dist=poisson;
  repeated subject=id
        / type=cs;
run;

/* The model fits the data significantly better than the null model, i.e. the intercept-
   only model.  To show that this is the case, we can run the null model and compare the
   null model with the current model using chi-squared test on the difference of log
   likelihood.  */

 /* Intercept-only (null) Poisson model: the right-hand side of the MODEL
    statement is empty.  Its log likelihood is the baseline for the
    likelihood-ratio comparison with the two-predictor model.  */
 proc genmod data=poissonreg;
   class id;
   model daysabs =
         / type3 dist=poisson;
   repeated subject=id
         / type=cs;
 run;
 quit;

/* The log likelihood for the full model is 1480.3813 and is 1394.6299 for the null
   model.  The chi-squared value is 2*(1480.3813 - 1394.6299) = 171.5028.  Since we
   have two predictor variables in the full model, the degrees of freedom for
   the chi-squared test is 2.  This yields a p-value < 0.0001.  */

/*  Finally, we will use the estimate statement to get the predicted number of days
    absent for the male and female groups when langarts is held at its mean.  */

/* Reduced Poisson model with two ESTIMATE statements.  Each one evaluates the
   linear predictor at langarts = 50.0637938 with the intercept included, once
   with male = 1 and once with male = 0; the exp option also reports the
   exponentiated estimate.  */
proc genmod data=poissonreg;
  class id;
  model daysabs = male langarts
        / dist=poisson;
  repeated subject=id
        / type=cs;
  estimate "male"   langarts 50.0637938 male 1 intercept 1 / exp;
  estimate "female" langarts 50.0637938 male 0 intercept 1 / exp;
run;

/* The Poisson regression model predicting days absent from school from language arts
   score and gender was statistically significant with likelihood ratio chi-square = 171.503,
   df=2, yielding p-value < 0.0001.  The predictors langarts and male were each statistically
   significant.  For these data, the expected change in log count for a one-unit increase in
   language arts was -0.0146.  Male students had an expected log count 0.41 less than
   female students.  */