You are on page 1of 43

/*------------------------------------------------------------------- */ /* /* /* /* /* Advanced Log-Linear Models Using SAS by Daniel Zelterman */ */ */

Copyright(c) 2002 by SAS Institute Inc., Cary, NC, USA SAS Publications order # 57496 ISBN 1-59047-080-X */ */

/*-------------------------------------------------------------------*/ /* */ */

/* This material is provided "as is" by SAS Institute Inc. There

/* are no warranties, expressed or implied, as to merchantability or */ /* fitness for a particular purpose regarding the materials or code */ /* contained herein. The Institute is not responsible for errors /* in this material as it now exists or will exist, nor does the /* Institute provide technical support for it. /* */ */ */ */

/*-------------------------------------------------------------------*/ /* Questions or problem reports concerning this material may be /* addressed to the author: /* /* SAS Institute Inc. /* Books by Users /* Attn: Daniel Zelterman /* SAS Campus Drive /* Cary, NC 27513 /* /* */ */ */ */ */ */ */ */ */ */

/* If you prefer, you can send email to: sasbbu@sas.com /* Use this for subject field: /* /* Comments for Daniel Zelterman */ */ */

*/

/*-------------------------------------------------------------------*/

Program 1.1:

title1 'Beetle mortality and pesticide dose'; data beetle; input y n dose; label y n = 'number killed in group' = 'number in dose group'

dose = 'insecticide dose' ;

datalines; 15 50 1.08 24 49 1.16 26 50 1.21 24 50 1.26 29 50 1.31 29 49 1.35

run;

proc print; run;

title2 'Fit a linear dose effect to the binomial data'; proc genmod; model y/n=dose / dist=binomial link=identity obstats; run;

title2 'Logistic regression in GENMOD'; proc genmod; model y/n=dose / dist=binomial obstats; run;

title2 'Probit regression in GENMOD'; proc genmod; model y/n=dose / dist=binomial link=probit obstats; run;

Program 2.1

options linesize=80 center pagesize=54;

ods trace on;

ods printer file='c:\Table2-1.ps' ps; ods printer select Genmod.ModelFit; ods listing select Genmod.Modelfit;

data; input count expos tumor alpha beta; label tumor = 'mice with tumors' expos = 'exposure status' alpha = 'row effects' beta = 'column effect' ; datalines; 4 1 1 1 1 5 0 1 1 -1 12 1 0 -1 1 79 0 0 -1 -1 run;

proc genmod;

/* Model using the CLASS statement */

class tumor expos; model count = tumor expos /dist = Poisson obstats; run;

proc genmod;

/* Mimic the CLASS statement */

model count = tumor expos

/dist = Poisson obstats; run;

proc genmod;

/* Row and column effects sum to zero */

model count = alpha beta /dist = Poisson obstats; run;

ods printer close;

Program 2.2

title1 'Perinatal mortality'; data; input gest age cigs vita count @@; label gest = 'Gestational age 261+ days' age = 'Mother over 30 years' cigs = 'More than 6 cigarettes/day' vita = 'Live births' ; datalines; 0 0 0 0 50 0 0 1 1 40 0110 4 0 0 0 1 315 0 1 0 0 41 0 1 1 1 11 0010 9

0 1 0 1 147 1 0 0 0 24

1 0 0 1 4012 1 1 0 0 14 1 1 1 1 124 run;

1010

1 0 1 1 459 1110 1

1 1 0 1 1494

title2 'Log-linear model of mutual independence'; proc genmod; /* Produces Output 2.6 */

model count = gest age cigs vita / dist = Poisson; run; title2 'Log-linear model with all pairwise effects'; proc genmod; /* Produces Output 2.7 */

model count = gest | age | cigs | vita @2 / dist = Poisson; run; title2 'Log-linear model with all 3-way interactions'; proc genmod; /* Produces Output 2.8 */

model count = gest | age | cigs | vita @3 / dist = Poisson; run; title2 'A small log-linear model with 5 pairwise interactions'; proc genmod; /* Produces Output 2.9 */

model count = gest age cigs vita gest*age age*cigs gest*vita age*vita cigs*vita / dist = Poisson; run; title2 'A log-linear model with 4 pairwise interactions'; proc genmod; /* Produces Output 2.10 */

model count = gest age cigs vita gest*age age*cigs gest*vita age*vita / dist = Poisson; run;

Program 2.3

proc genmod; model count = gest age cigs vita gest*age age*cigs cigs*vita gest*vita age*vita / dist = Poisson type1 type3; run;

Program 3.1

title1 'Coal miners: breathlessness and wheezing by age'; data; /* Read data as a 2x2 table for each age group. Produce a data set with one frequency per line. input age a b c d; label age br wh bwa = '5 year interval' = 'breathlessness' = 'wheeze' = 'linear age times br-wh interaction'; /* recode age categories: 1 to 9 */ */

age=age/5 - 3; /*

Output four lines of frequencies for each age group.

bwa is a log-odds ratio that is linear in age br=1; wh=1; bwa= age; freq=a; output; br=1; wh=0; bwa=-age; freq=b; output; br=0; wh=1; bwa=-age; freq=c; output; br=0; wh=0; bwa= age; freq=d; output; drop a b c d; datalines; 20 25 30 9 23 54 7 95 1841

*/

/* omit the original 2x2 table */

9 105 1654 19 177 1863 48 257 2357 54 273 1778 88 324 1712

35 121 40 169 45 269

50 404 117 245 1324 55 406 152 225 60 372 106 132 run; title2 'Log-linear model with all pairwise interactions'; proc genmod; /* age as a class variable ignores its ordering */ class age br wh; model freq = br | wh | age @2 / dist = Poisson obstats; run; 967 526

title2 'All pairwise interactions + linear odds ratio'; proc genmod;

/* bwa models the interaction of br and wh as linear in age */ class age br wh; model freq = br | wh | age @2 bwa / dist = Poisson obstats; run;

Program 3.2

title1 'Trauma and outcome. Data from Agresti and Coull, 1998.'; data; input treat outcome freq common @@; label treat = 'four treatments' outcome= 'five ordered outcomes' txo = 'treatment and outcome interaction'

common = 'common class-style odds ratio' ; txo=treat*outcome; datalines; 1 1 59 1 1 2 25 1 1 3 46 1 1 4 48 1 1 5 32 -4 2 1 48 1 2 2 21 1 2 3 44 1 2 4 47 1 2 5 30 -4 3 1 44 1 3 2 14 1 3 3 54 1 3 4 64 1 3 5 31 -4 4 1 43 -3 4 2 4 -3 4 3 49 -3 4 4 58 -3 4 5 41 12 run; title2 'Independence of treatment and outcome'; proc genmod; class treat outcome; model freq = treat outcome / dist = Poisson; /* This step produces Output 3.3 */ /* linear by linear interaction */

run; title2 'Interaction of a pair of class variables'; proc genmod; /* Output 3.4 is produced by this step */

class treat outcome; model freq = treat outcome treat * outcome / dist = Poisson; run; title2 'Fit the common odds ratio interaction'; proc genmod; /* The fitted values from this step appear in Table 3.5 */ class treat outcome; model freq = treat outcome common / dist = Poisson; run; title2 'Fit the treat * outcome linear by linear interaction'; proc genmod; /* The fitted values from this step appear in Table 3.6 */ class treat outcome; model freq = treat outcome txo /dist = Poisson; run; title2 'Common odds and linear by linear interaction effects'; proc genmod; class treat outcome; model freq = treat outcome common txo / dist = Poisson; run;

Program 4.1

title1 'Triangular stroke data'; data; input count row $ col $; label count = 'number of patients' row col ; datalines; 11 E A 23 E B 12 E C 15 E D 8 E E 9 D A 10 D B 4 D C 1 D D 6 C A 4 C B 4 C C 4 B A 5 B B 5 A A = 'admission status' = 'discharge status'

run;

proc print; run;

proc genmod; class row col; model count = row col /* independence of rows and columns */ / run; dist = Poisson obstats;

Program 4.2

data stroke; input count row col @@; h=(col-5.5)*(row-6); /* h asymmetry */ */ */

b= 1/(1+(col-5.5)**2 +(row-6)**2); /* center bump ob= 1/(1+(col-4.5)**2 +(row-6)**2); /* offset bump e=sqrt(col**2+row**2); rowc = row; colc = col; jr=row+normal(0)/10; label count = 'number of patients' colc rowc = 'left to right category' = 'front to back category'

/* distance from center */ /* row category /* column category /* jitter row */ */ */

e h b ob jr datalines;

= 'distance from center' = 'h interaction' = 'bump in the center' = 'offset bump' = 'jittered row' ;

0 1 4 1 1 5 0 1 6 0 1 7 1 2 3 1 2 4 1 2 5 0 2 6 0 2 7 4 2 8 3 3 2 1 3 3 0 3 4 1 3 5 0 3 6 2 3 7 9 3 8 11 3 9 1 4 1 4 4 2 3 4 3 0 4 4 1 4 5 0 4 6 3 4 7 11 4 8 15 4 9 11 4 10 2 5 1 2 5 2 3 5 3 3 5 4 0 5 5 1 5 6 0 5 7 1 5 8 8 5 9 13 5 10 13 6 1 7 6 2 2 6 3 5 6 4 6 6 5 3 6 6 1 6 7 0 6 8 9 6 9 12 6 10 15 7 1 9 7 2 4 7 3 4 7 5 0 7 6 8 7 8 7 7 9 9 7 10 6 8 1 8 8 2 5 8 3 1 8 4 0 8 5 1 8 6 1 8 7 8 8 8 8 8 9 7 8 10 4 9 2 4 9 3 2 9 4 1 9 5 2 9 6 5 9 7 7 9 8 7 9 9 6 10 2 4 10 3 2 10 4 1 10 5 2 10 6 5 10 7 8 10 8 7 10 9 3 11 3 1 11 4 1 11 5 2 11 6 5 11 7 5 11 8 run; proc genmod; class rowc colc; model count =rowc colc / dist=Poisson; run; /* Fit model for indep, h, e, ob and obtain residuals proc genmod; */ /* model of independence for rows and columns */

class rowc colc; model count = h e ob rowc colc / dist=Poisson obstats; ods output obstats=fitted; run; data origin; set stroke; drop rowc colc; run; data both; /* Merge observed and fitted values */ /* Drop class variables */ /* Capture residuals */

merge origin fitted; run; proc gplot; /* Plot chi-squared residuals by jittered row */ / vref=0 haxis=axis1 vaxis=axis2;

plot reschi * jr run;

Program 4.3

%let max=7; %let cons=13;

/* maximum number of items to compare */ /* number of pairwise comparisons performed */

title1 'Baseball games using the Bradley Terry model'; data bball; input wteam $ 1-10 wtmn ltmn freq; label wteam = 'name of winning team' wtmn = 'winning team number'

ltmn = 'losing team number' freq = 'number of wins' ; jitwin=wtmn+normal(0)/10; if ltmn < wtmn then delete; comps=&cons; /* jitter winning team number */ /* omit top half of table */

/* number of times each team plays others */ /* build phi parameters */

array ph(&max) phi1-phi&max; do j=1 to &max; ph(j)=0; end;

/* initialize all to zero */ /* differences of phi's for comparisons: */

ph(wtmn)=+1; ph(ltmn)=-1; datalines; Milwaukee 1 2 7 Milwaukee 1 3 9 Milwaukee 1 4 7 Milwaukee 1 5 7 Milwaukee 1 6 9 Milwaukee 1 7 11 Detroit Detroit Detroit Detroit Detroit Detroit Toronto Toronto 2 1 6 2 3 7 2 4 5 2 5 11 2 6 9 2 7 9 3 1 4 3 2 6

/* +1 for winning team */ /* -1 for losing team */

Toronto Toronto Toronto Toronto

3 4 7 3 5 7 3 6 8 3 7 12

New York 4 1 6 New York 4 2 8 New York 4 3 6 New York 4 5 6 New York 4 6 7 New York 4 7 10 Boston Boston Boston Boston Boston Boston 5 1 6 5 2 2 5 3 6 5 4 7 5 6 7 5 7 12

Cleveland 6 1 4 Cleveland 6 2 4 Cleveland 6 3 5 Cleveland 6 4 6 Cleveland 6 5 6 Cleveland 6 7 6 Baltimore 7 1 2 Baltimore 7 2 4 Baltimore 7 3 1 Baltimore 7 4 3

Baltimore 7 5 1 Baltimore 7 6 7 run;

ods output obstats=fitted; proc genmod; /* Fit Bradley-Terry model */

model freq / comps = phi1-phi7 / dist = binomial noint obstats; run; data two; merge fitted bball; output; /* combine fitted and raw data */ /* output lower half of table */ /* rebuild upper half of table */

j=wtmn; wtmn=ltmn; ltmn=j; reschi=-reschi; pred=13-pred; jitwin=wtmn+normal(0)/10; output; drop phi1-phi7; run; proc gplot; plot Streschi * jitwin; run; quit;

=============================================== =========

Program 5.1

title1 'Deaths from testicular cancer in Japan'; data tcancer; input age pop1 c1 pop2 c2 pop3 c3 pop4 c4 pop5 c5; array p(5) pop1-pop5; array d(5) c1-c5; label agegrp = 'age in 5yr interval' logpop = 'log-population'; /* Produce a separate line for each year/age combination agegrp=age/5; do j=1 to 5; deaths=d(j); logpop=log(p(j)); year=j-1; yearc=year; /* age in five year intervals */ /* for each line read in . . */ /* number of cancer deaths */ /* log of population */ /* recode the years as 0,...,4 */ /* year category */ */ /* populations at each year group */ /* cancer cases for each year group */

cohort=agegrp - year + 4; /* identify the diagonal cohort */ output; end; drop j pop1-pop5 c1-c5; datalines; 0 15501 17 26914 51 21027 65 20246 69 21596 74 5 14236 . 25380 6 26613 7 20885 8 20051 7 10 13270 . 23492 3 25324 3 26540 7 20718 11 /* omit unneeded variables */ /* produce five output lines for each read in */

15 12658 2 21881 6 23211 15 24931 25 26182 39 20 10696 5 20402 27 21263 39 22228 56 24033 83 25 7563 5 17242 40 19994 58 20606 97 21805 125 30 7074 7 12609 18 17128 54 19864 77 20750 129 35 7038 10 11712 13 12476 36 17001 70 19890 101 40 6418 9 11478 26 11450 32 12275 29 16794 67 45 5981 7 10274 16 11157 26 11147 34 11962 37 50 4944 7 9325 16 9828 27 10705 27 10741 29 55 3994 7 7562 17 8718 19 9206 32 10086 39 60 3098 6 5902 13 6796 21 7869 21 8399 31 65 2317 4 4244 12 4911 26 5728 29 6715 34 70 1513 7 2845 17 3197 22 3737 25 4448 33 75 688 5 1587 9 1812 10 2061 25 2482 31 80 264 2 583 6 787 6 904 14 1068 9 85 73 2 179 2 246 3 335 3 419 3 run; title2 'Model (5.1): Cohort, age, year; No population information'; proc genmod; class yearc agegrp cohort; model deaths = yearc agegrp cohort / type1 type3 dist=Poisson; run; title2 'Model (5.2): Cohort, age, year, and offset log(Pop)'; proc genmod; class yearc agegrp cohort; ods listing exclude obstats; /* turn off the obstats listing */ output out=fitted reschi=reschi; /* create an output dataset */

model deaths = yearc agegrp cohort / obstats type1 type3 offset=logpop dist=Poisson; run; proc gplot data=fitted; /* bubble plot of residuals */

bubble reschi * agegrp= year; run;

=============================================== ============

Program 5.2

title1 'Species diversity on the Galapagos Islands'; data species; input name $ 1-13 species area d2neigh d2sc adjsp; loga=log(area); area=area/1000; adjsp=adjsp/100; d2sc=d2sc/100; d2neigh=d2neigh/100; isa=0; if name='Isabela' then isa=1; /* indicator for Isabela */ label species = '# of species on island' area loga = 'in sq-km' = 'log area' /* rescale variables */

d2neigh = 'distance to nearest neighbor' d2sc = 'distance to Santa Cruz'

adjsp = '# of species on adjacent island'; datalines; Baltra Bartolome Caldwell Champion Coamano 58 25.09 0.6 0.6 44 31 3 25 2 1.24 0.6 26.3 237 .21 2.8 58.7 .10 1.9 47.4 5 2

.05 1.9 1.9 444 .34 8.0 8.0 44 .08 6.0 12.0 18

Daphne Major 18 Daphne Minor 24 Darwin Eden Enderby Espanola Fernandina Gardner A Gardner B Genovesa Isabela Marchena Onslow Pinta Pinzon Las Plazas Rabida 10 8 2

2.33 34.1 290.2 21 .03 0.4 0.4 108 .18 2.6 50.2 25

97 58.27 1.1 88.3 58 93 634.49 4.3 95.3 347 58 5 .57 1.1 93.1 97 .78 4.6 62.2 3

40 17.35 47.4 92.2 51 347 4669.32 0.7 28.1 93 51 129.49 29.1 85.9 104 2 .01 3.3 45.9 25

104 59.56 29.1 119.6 51 108 17.95 10.7 10.7 12 70 .23 0.5 0.6 58 4.89 4.4 24.4 237 8

San Cristobal 280 551.62 45.2 66.6 58 San Salvador 237 572.33 0.2 19.8 70 Santa Cruz Santa Fe 444 903.82 0.6 0.0 8

62 24.08 16.5 16.5 444

Santa Maria 285 170.92 2.6 49.2 25 Seymour Tortuga Wolf run; proc print; run; title2 'Fit Poisson model with all pairwise interactions'; proc genmod; ods output obstats=fit; model species = loga | d2neigh | adjsp | d2sc @2 isa / dist=Poisson obstats type1 type3; run; proc plot data=fit; plot reschi * loga; run; title2 'Fit Negative Binomial model with all pairwise interactions'; proc genmod; ods output obstats=fitnb; model species = loga | d2neigh | adjsp | d2sc @2 isa / dist=nb obstats type1 type3; run; /* plot Poisson Pearson residuals */ 44 16 21 1.84 0.6 9.6 58 1.24 6.8 50.9 108 2.85 34.1 254.7 10

proc plot data=fitnb; plot reschi* loga; run;

/* plot negative binomial residuals */

Program 6.1

data; input count birth death medic ; label birth = 'names on birth certificate' death = 'names on death certificate' medic = 'names on medical record' ; datalines; 60 0 0 1 49 0 1 0 4011 247 1 0 0 112 1 0 1 142 1 1 0 12 1 1 1 run; title2 'Log-linear model of mutual independence'; proc genmod; /* m, d, b model */

model count = medic birth death /dist = Poisson; run; title2 'Log-linear models with one interaction'; proc genmod; /* d, b*m model */

model count = medic birth death birth*medic / dist = Poisson; run; proc genmod; /* m, b*d model */

model count = medic birth death birth*death / dist = Poisson; run; proc genmod; /* b, d*m model (6.1) with output in Table 6.4 */ model count = medic death birth death*medic / dist = Poisson; run; title2 'Log-linear models with two interactions'; proc genmod; /* b*m, d*b model */

model count = medic birth death birth*medic birth*death /dist = Poisson; run; proc genmod; /* b*d, m*d model */

model count = medic birth death birth*death death*medic /dist = Poisson; run; proc genmod; /* d*m, m*b model */

model count = medic birth death death*medic birth*medic /dist = Poisson; run;

Program 7.1 (Macros)

%macro tpr; y=_RESP_; lambda=exp(_XBETA_); eml=exp(-lambda);

/* Truncated Poisson regression */ /* Observed count */

/* Truncated Poisson parameter */ /* e to the minus lambda */

/* Truncated Poisson deviance */

dev=0; if y > 1 then do; %inv(y);

/* Deviance for y=1

*/

/* lam is MLE for y >1

*/ */

dev= -lam + y*log(lam) - log(1-exp(-lam)); /* deviance end; dev = dev + lambda - y*log(lambda) + log(1-eml);

deviance d = 2*dev; /* times 2 for chi-squared distribution */

/* Provide GENMOD with link functions and variance */

%inv(_MEAN_); fwdlink ey = log(lam);

/* Find lambda for _MEAN_

*/

/* Link function: log lambda */ */

invlink linv = lambda / (1-eml); /* Truncated Poisson mean

/* Truncated Poisson variance */ variance var = (lambda*(lambda+1)*(1-eml)-lambda**2)/(1-eml)**2; %mend tpr;

%macro inv(ev); /* Interval bisection to find lambda from the expected value (ev) */ if &ev LE 1 then lam= . ; else do; lamlo=&ev-1; lamhi=&ev; /* expected value must be >1 */ /* lambda is not defined for ev LE 1 */ /* otherwise iterate to find lambda */ /* lambda is between exp value less one */ /* . . . and the expected value */

do until (abs(lamhi-lamlo)<1e-7); /* convergence criteria */ lam=(lamhi+lamlo)/2; mal= lam/(1-exp(-lam)); /* examine midpoint /* mean at midpoint */ */

if mal GE &ev then lamhi=lam; /* lower upper endpoint */ if mal LE &ev then lamlo=lam; /* raise lower endpoint */ end; end; %mend inv;

Program 7.2

data skunk;

input count freq year sex $ @@; yr=year; sx=sex; label count ='count' freq = 'frequency' year = '1977 or 1978' sex = 'F M' datalines; 1 1 77 F 1 3 77 M 6 2 77 M 5 2 78 F run; proc genmod; %tpr; frequency freq; class sex year; model count = sex year year*sex / obstats type1 type3; ods listing exclude obstats; ods output obstats=fitted; run; data fit2; set fitted; drop sex year; run; data both; /* Combine two data sets */ /* omit class variables because these are defined */ /* in two different ways */ /* omit obstats output */ /* Create fitted value dataset */ /* invoke the %tpr macro */ 2 2 77 F 2 0 77 M 1 7 78 F 1 4 78 M 3 4 77 F 4 2 77 F 5 1 77 F 3 3 77 M 4 3 77 M 5 2 77 M 2 7 78 F 3 3 78 F 4 1 78 F 2 3 78 M 3 1 78 M ; /* make copies of year and sex variables */

merge skunk fit2; lambda=exp(xbeta); p0 = exp(-lambda);

/* Merge fitted & original values */ /* Estimate of lambda parameter */ /* Est of probability of zero category */ /* 95% CI for lambda */

lamup = exp(xbeta + 1.96*std); lamlow = exp(xbeta - 1.96*std); p0up = exp(-lamlow); p0low = exp(-lamup); se=std*lambda; sep0=std*lambda*p0; run; data; set both; by yr sx; if not ( first.yr | first.sx ) then delete; run; proc print noobs;

/* 95% CI for p0 */

/* SE of lambda from delta-method */ /* SE of p0 from the delta method */

/* Remove duplicated values */

var yr sx lambda se lamlow lamup p0 sep0 p0low p0up; run;

Program 7.3

title1 'Truncated Poisson regression to model lottery winners'; data lottery; input town $ 1-14 winners popul area mill;

dens = popul/area; logpop = log(popul); label popul = 'population in 1000s' logpop = 'log population' area = 'in square miles' mill = 'property tax rate' dens = 'population density'; datalines; Ansonia Beacon Falls Branford Cheshire Clinton Derby East Haven Guilford Hamden Madison Milford N. Branford North Haven Old Saybrook Orange Oxford Seymour 9 3 1 6 9 5 10 2 12 1 6 3 11 6 2 6 9 17.9 5.3 28.0 26.2 12.8 12.0 26.5 20.3 52.0 16.0 49.5 13.1 21.6 9.3 12.5 9.1 14.5 6.2 9.8 27.9 33.0 17.2 5.3 28.9 25.0 22.6 27.1 27.9 29.6 37.1 28.6 34.1 22.3 30.8 26.9 23.4 15.3 23.8 29.0 40.5

12.6 47.7 33.0 36.3 23.5 26.8 21.0 18.3 17.6

33.0 14.7

Shelton Trumbull West Haven Woodbridge run;

7 14

36.0 33.0

31.4 23.5

21.6 24.1 41.4 28.4

12 1

54.0 8.0

10.6 19.3

title2 'Fit log population as a covariate'; proc genmod; %tpr; /* invoke the %tpr macros */

model winners = mill area dens logpop / type1 type3 maxiter=100 intercept=.3; run; title2 'Fit log population using offset'; proc genmod; %tpr; /* invoke the %tpr macros */ /* increase number of iterations */ /* initial starting value for intercept */

model winners = mill area dens / offset=logpop obstats type1 type3 maxiter=100 intercept=.3; /* increase number of iterations */ /* initial starting value for intercept */ /* create fitted value dataset */

ods output obstats=fitted;

ods listing exclude obstats; /* do not print the obstats data */ run; data both; merge lottery fitted; /* merge the fitted and original values */ lambda=exp(xbeta); run; /* estimated lambda parameter */

title3 'Plot of residuals from the fitted model'; proc gplot; bubble reschi * pred = dens; run;

Program 8.1

data; do row=1 to 2; do col=1 to 2; input count @@; output; end; end; datalines; 4 5 12 74 run; proc freq; tables row * col / all; weight count; exact or; run;

Program 8.2

title1 'Extended hypergeometric distribution in Genmod'; title2 'Oral contraceptive use and myocardial infarction, Shapiro 1979'; data mi; input age cwoc ocuse cases total; label cwoc = 'cases with oc use' ocuse = '# in age stratum using oc''s' cases = '# of cases in age stratum' total = 'sample size in this age stratum' age = 'age category' loga = 'log-age category'; loga=log(age); datalines; 1 4 66 6 292

2 9 42 21 444 3 4 30 37 393 4 6 15 71 442 5 6 11 99 405 run; title3 'Model for log-odds ratio is linear in age'; proc genmod data=mi;

%fithyp(cases,ocuse,total); /* Provide table margins */ model cwoc = age / obstats intercept=2 initial = -.30 to -.10 by .05 maxit=250000; run; title3 'Model for log-odds ratio is linear in log-age'; proc genmod data=mi; %fithyp(cases,ocuse,total); /* Provide table margins */ model cwoc = loga / obstats intercept=2 initial = -.80 to -.60 by .05 maxit=250000; ods output obstats=fitted; /* Create fitted value dataset */ ods exclude listing obstats; run; data both; merge mi fitted; /* Merge fitted & original values */

%hypmean(cases,ocuse,total,xbeta); mean=hypmean; sd=sqrt(hypvar); run; proc print data=both; var age loga mean sd total pred xbeta; run;

Program 8.3

title1 'Extended hypergeometric regression in Genmod'; title2 'Four fungicide 2x2 tables';

data avadex; input expt exposed tumors total strain $ sex $; label expt = 'exposed mice w/tumors'

exposed = 'mice exposed to fungicide' tumors = 'mice with tumors' total = 'sample size in this 2x2 table' strain = 'X or Y' sex = 'M or F' ;

datalines; 4 16 9 95 X M 2 14 5 101 X F 4 18 14 108 Y M 1 15 4 97 Y F run;

proc genmod; %fithyp(exposed,tumors,total); /* Provide table margins */ class strain sex; model expt = strain sex / obstats maxiter=250 type1 type3;

run;

Program 8.4 (hypergeometric macros)

/* Macros to fit the extended hypergeometric distribution in Genmod */

%macro hyp(m,n,nn,x); /* Log of 2 binomial coefficients in the numerator of the extended hypergeometric distribution: m=row sum; n=col sum; nn=sample size; x=count */

lgamma(&m+1)-lgamma(&x+1)-lgamma(&m-&x+1) /* m choose x */ /* nn-m choose n-x */ + lgamma(&nn-&m+1)-lgamma(&n-&x+1)-lgamma(&nn-&n-&m+&x+1) /* note omitted semicolon */ %mend hyp;

%macro hypmean(m,n,nn,lor); /* Extended hypergeometric mean and variance: m=row sum;

n=column sum; nn=sample size; lor=log-odds ratio */ den=0; hypmean=0; hypvar=0; do j=max(0,&m+&n-&nn) to min(&n,&m); /* loop over range */ dterm=exp(%hyp(&m,&n,&nn,j)+j*&lor); den=den+dterm; /* accumulate denominator */ /* accumulate mean */

hypmean=hypmean+j*dterm;

hypvar=hypvar+j**2*dterm; /* accumulate variance */ end; hypmean=hypmean/den; /* divide by denominator */ */

hypvar=hypvar/den -hypmean**2; /* corrected variance %mend hypmean;

%macro hypinv(m,n,nn,expv); /* Find the hypergeometric parameter giving a distribution with expected value equal to expv. This value is called lamd upon completion. Interval bisection is used to find lamd. The initial interval for estimation of log-odds-ratio is +/- lorlimit. This log-odds parameter is needed for the deviance. lorlimit= 15; */

lamlo= -lorlimit; lamhi= lorlimit;

/* initial interval

*/

/* if expv is at extreme of its range then the model is degenerate */ if &expv LE max(0,&m+&n-&nn) then lamhi=lamlo; if &expv GE min(&n,&m) then lamlo=lamhi; /* expv low */

/* exvp at hi end */

do until (abs(lamhi-lamlo)<1e-6); lamd=(lamhi+lamlo)/2; %hypmean(&m,&n,&nn,lamd);

/* convergence criteria */ /* examine midpoint */ */

/* mean at midpoint */

/* shrink interval: equate expected value with expv if hypmean GE &expv then lamhi=lamd; if hypmean LE &expv then lamlo=lamd; end; %mend hypinv;

%macro fithyp(n,m,nn); /* This macro provides GENMOD with the link, inverse link, variance, and deviance needed to perform regression on the log odds ratio parameter of the extended hypergeometric distribution. The parameters are: n=row sum; m=column sum; nn=2x2 table total. */

/* Inverse link is the mean of the distribution at _XBETA_ %hypmean(&m,&n,&nn,_XBETA_); mean0=hypmean;

*/

/* Likelihood evaluated at _XBETA_ is denominator of deviance */ devden=exp(%hyp(&m,&n,&nn,_RESP_)+_RESP_*_XBETA_)/den;

/* The deviance requires the unconstrained maximum likelihood estimator. This estimate is the parameter value that equates the expected value with the observed value, _RESP_ */

%hypinv(&m,&n,&nn,_RESP_); var0=hypvar; /* Variance at MLE */

/* Maximized likelihood is the numerator of deviance devnum=exp(%hyp(&m,&n,&nn,_RESP_)+_RESP_*lamd)/den;

*/

/*

Deviance at observed count is twice the log ratio

*/

devi= 2*log(devnum/devden);

/*

Tell GENMOD about these values

*/

invlink ilink = mean0; fwdlink link = log(_MEAN_); deviance dev = devi; variance vari = var0;

/* Hypergeometric mean at _XBETA_ */ /* Forward link /* Deviance */ */

/* Hypergeometric variance */

%mend fithyp;

Program 9.1 (power macro)

/* Approximate power for chi-squared tests. INVOKE AS INPUT: df = degrees of freedom, taken to be 1 if missing ncp = non-centrality parameter for a sample of size n=1 alpha = significance level under null hypothesis, taken to be .05 if missing OUTPUT variables: power = values .25 to .9 by .05 and .9 to .99 by .01 n = approximate sample size at respective power */ %macro chipow(df,ncp,alpha); sig=&alpha; if sig=. then sig=.05; %chipow(df=.,ncp=.,alpha=.);

do j=1 to 23;

/* range for power

*/

if j<15 then power=.2 + j/20;

/* power is .25 by .05 to .9 */

if j>=15 then power=.9+.01*(j-14); /* ... and .9 by .01 to .99 */ /* approximate sample size */

n=cnonct(cinv(1-sig,&df),&df,1-power)/&ncp; output; end; drop j sig; %mend chipow; /* drop local variables */

Program 9.2

/* Power for the null situation where margins are know a priori in a 2x2 table. INVOKE this macro as %nullpow(n=100, p1=.5, p2=.5); INPUT: n= known sample size, assumed to be 100 if missing p1,p2 = marginal probabilities, set to 1/2 if missing OUTPUT variables: power = values .25 to .9 by .05 and .9 to .99 by .01 psi1-psi3 = odds ratios that can be tested with specified power and alpha=.01, .05, and .10, respectively. */ %macro nullpow(n=100,p1=.5,p2=.5); %let nsig=3; /* number of significance levels */

array alpha(&nsig) alpha1-alpha&nsig;

array theta(&nsig) theta1-theta&nsig; array psi(&nsig) psi1-psi&nsig; alpha(1)=.01; alpha(2)=.05; alpha(3)=.10; m= 1 / (&p1*&p2) + 1 / (&p1*(1-&p2)) + 1 / ((1-&p1)*&p2) + 1 / ((1-&p1)*(1-&p2)); do j=1 to 23; /* multiplier in (9.10) */ /* range for power */ /* specify significance levels */

if j<15 then power=.2 + j/20; /* power is .25 by .05 to .9 */ if j>=15 then power=.9+.01*(j-14); /* and .9 by .01 to .99 */ do k=1 to &nsig; /* for every different significance level */ t=cnonct(cinv(1-alpha(k),1),1,1-power); /* 1 df crit val */ theta(k)=sqrt(t/(m*&n)); /* theta in Table 9.9, 10 */

psi(k)=((1+4*theta(k))/(1-4*theta(k)))**2; /* odds ratio */ end; output; end; %mend nullpow; title1 'Approximate odds ratios for chi-squared tests'; title2 'Significance levels are .01, .05, and .10'; data initial; %nullpow(n=150,p1=.5,p2=.5); /* specify N and marginal prob's */ run; proc print noobs; var power psi1 psi2 psi3; run; /* every level of power on a different line */

Program A.1 in Appendix

%macro tpr; y=_RESP_; lambda=exp(_XBETA_); eml=exp(-lambda);

/* Truncated Poisson regression */ /* Observed count */

/* Truncated Poisson parameter */ /* e to the minus lambda */

/* Truncated Poisson deviance */

dev=0; if y > 1 then do; %inv(y);

/* Deviance for y=1

*/

/* lam is MLE for y >1

*/ */

dev= -lam + y*log(lam) - log(1-exp(-lam)); /* deviance end; dev = dev + lambda - y*log(lambda) + log(1-eml);

deviance d = 2*dev; /* times 2 for chi-squared distribution */

/* Provide GENMOD with link functions and variance */

%inv(_MEAN_); fwdlink ey = log(lam);

/* Find lambda for _MEAN_

*/

/* Link function: log lambda */

invlink linv = lambda / (1-eml); /* Truncated Poisson mean /* Truncated Poisson variance */

*/

variance var = (lambda*(lambda+1)*(1-eml)-lambda**2)/(1-eml)**2; %mend tpr;

%macro inv(ev); /* Interval bisection to find lambda from the expected value (ev) */ if &ev LE 1 then lam= . ; else do; lamlo=&ev-1; lamhi=&ev; /* expected value must be >1 */ /* lambda is not defined for ev LE 1 */ /* otherwise iterate to find lambda */ /* lambda is between exp value less one */ /* . . . and the expected value */

do until (abs(lamhi-lamlo)<1e-7); /* convergence criteria */ lam=(lamhi+lamlo)/2; mal= lam/(1-exp(-lam)); /* examine midpoint /* mean at midpoint */ */

if mal GE &ev then lamhi=lam; /* lower upper endpoint */ if mal LE &ev then lamlo=lam; /* raise lower endpoint */ end; end; %mend inv;

=================== end of this file =====================

You might also like