R version 3.2.3 (2015-12-10) -- "Wooden Christmas-Tree"
Copyright (C) 2015 The R Foundation for Statistical Computing
Platform: x86_64-pc-linux-gnu (64-bit)

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

  Natural language support but running in an English locale

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.

> 
> # for reproducibility of results
> # to get different results, change seed
> # or omit this statement
> set.seed(42)
> 
> pdf(file = "mcmc-too.pdf")
> 
> # For table 2.6 in Agresti, fit Bayesian model
> # use as prior conjugate prior that puts "count" 1/2 in each cell
> # this is a conjugate prior
> 
> victim <- factor(rep(c("white", "black"), each = 2))
> defendant <- factor(rep(c("white", "black"), times = 2))
> deathpenalty <- matrix(c(53, 11, 0, 4, 414, 37, 16, 139),
+         ncol = 2, dimnames = list(NULL, c("yes", "no")))
> data.frame(victim, defendant, deathpenalty)
  victim defendant yes  no
1  white     white  53 414
2  white     black  11  37
3  black     white   0  16
4  black     black   4 139
> 
> # Just for comparison, do frequentist (likelihoodist) analysis
> #
> # Note: the way glm with family = binomial works is special.
> # If you have Bernoulli response (a. k. a., zero-or-one-valued,
> # a. k. a., binomial with sample size 1) then you supply a
> # (zero-or-one-valued) response vector.
> # If you have binomial response with sample size > 1, then you
> # supply a response MATRIX whose first column is the number of successes
> # for each case and whose second column is the number of failures for
> # each case. It doesn't matter which you call "success" and which "failure"
> # as long as you keep straight which is which.  This works when the sample
> # sizes for each "case" are different, which we have here where "cases"
> 
> gout0 <- glm(deathpenalty ~ victim + defendant, family = binomial)
> summary(gout0)

Call:
glm(formula = deathpenalty ~ victim + defendant, family = binomial)

Deviance Residuals: 
       1         2         3         4  
 0.02660  -0.06232  -0.60535   0.09379  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -3.5961     0.5069  -7.094 1.30e-12 ***
victimwhite      2.4044     0.6006   4.003 6.25e-05 ***
defendantwhite  -0.8678     0.3671  -2.364   0.0181 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 22.26591  on 3  degrees of freedom
Residual deviance:  0.37984  on 1  degrees of freedom
AIC: 19.3

Number of Fisher Scoring iterations: 4

> gout1 <- glm(deathpenalty ~ victim * defendant, family = binomial)
> summary(gout1)

Call:
glm(formula = deathpenalty ~ victim * defendant, family = binomial)

Deviance Residuals: 
[1]  0  0  0  0

Coefficients:
                             Estimate Std. Error z value Pr(>|z|)    
(Intercept)                   -3.5482     0.5071  -6.996 2.63e-12 ***
victimwhite                    2.3352     0.6125   3.813 0.000137 ***
defendantwhite               -21.9957 53403.2302   0.000 0.999671    
victimwhite:defendantwhite    21.1531 53403.2302   0.000 0.999684    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2.2266e+01  on 3  degrees of freedom
Residual deviance: 2.5803e-10  on 0  degrees of freedom
AIC: 20.92

Number of Fisher Scoring iterations: 22

> anova(gout0, gout1)
Analysis of Deviance Table

Model 1: deathpenalty ~ victim + defendant
Model 2: deathpenalty ~ victim * defendant
  Resid. Df Resid. Dev Df Deviance
1         1    0.37984            
2         0    0.00000  1  0.37984
> 
> # we see the smaller model fits as well as the bigger model,
> # i. e., the "interaction" is not statistically significant
> # so we will use the smaller model henceforth
> # there are Bayesian competitors to frequentist hypothesis tests
> # but I don't think we will have time to cover them.
> # for those that are interested, my STAT 5102 slides, deck 4, slides 94 ff.
> # http://www.stat.umn.edu/geyer/5102/slides/s4.pdf#page=94
> # but I won't go over them in class (too theoretical)
> 
> # we fit the "best" model according to likelihood inference Bayesianly.
> # first we create an R function that calculates the log unnormalized density
> # of the distribution that we are trying to sample by MCMC, which for
> # Bayesian inference is the log unnormalized posterior
> 
> yes <- deathpenalty[ , "yes"]
> no <- deathpenalty[ , "no"]
> 
> # get model matrix
> gout0 <- glm(deathpenalty ~ victim + defendant, family = binomial, x = TRUE)
> modmat <- gout0$x
> modmat
  (Intercept) victimwhite defendantwhite
1           1           1              1
2           1           1              0
3           1           0              1
4           1           0              0
attr(,"assign")
[1] 0 1 2
attr(,"contrasts")
attr(,"contrasts")$victim
[1] "contr.treatment"

attr(,"contrasts")$defendant
[1] "contr.treatment"

> 
> # essentially copied from "demo" vignette in package "mcmc"
> 
> ludfun <- function(beta) {
+     stopifnot(is.numeric(beta))
+     stopifnot(length(beta) == ncol(modmat))
+     stopifnot(! is.na(beta))
+     eta <- as.vector(modmat %*% beta)
+     logp <- ifelse(eta < 0, eta - log1p(exp(eta)), - log1p(exp(- eta)))
+     logq <- ifelse(eta < 0, - log1p(exp(eta)), - eta - log1p(exp(- eta)))
+     sum((yes + 1/2) * logp + (no + 1/2) * logq)
+ }
> 
> # The reason for yes + 1/2 and no + 1/2 is that is how conjugate priors work.
> # They imagine the prior describes the uncertainty we would have from starting
> # with a flat prior and observing data that is a count of 1/2 in each cell.
> # Fractional counts make no sense for actual data, but they do make algebraic
> # sense.  We should think of these as hyperparameters of the prior.  If we
> # replaced the last line by
> #
> #     sum((yes + alpha1) * logp + (no + alpha2) * logq)
> #
> # then adjusting the "knobs" alpha1 and alpha2 to any values between zero
> # and infinity gives a range of priors that describe a range of uncertainties
> # about the parameters, and we happen to have chosen alpha1 = alpha2 = 1/2
> #
> # "ludfun" stands for log unnormalized density function
> #
> # the reasons for the tricky computation of logp and logq are explained
> # in demo.pdf.  The general reason is computational accuracy despite
> # inexactness of computer arithmetic, but the details are fussy.
> 
> library(mcmc)
> 
> mout <- metrop(ludfun, gout0$coefficients, nbatch = 1e4)
> mout$accept
[1] 0.0386
> mout <- metrop(mout, scale = 0.1)
> mout$accept
[1] 0.6286
> mout <- metrop(mout, scale = 0.5)
> mout$accept
[1] 0.1334
> mout <- metrop(mout, scale = 0.3)
> mout$accept
[1] 0.256
> 
> # scale now adjusted so acceptance rate is about 1/4
> 
> acf(mout$batch[ , 2], lag.max = 100,
+     main = expression(paste("Series: ", beta["victim"])))
> acf(mout$batch[ , 3], lag.max = 100,
+     main = expression(paste("Series: ", beta["defendant"])))
> 
> mout <- metrop(mout, blen = 100, nbatch = 100)
> mout$accept
[1] 0.2516
> 
> acf(mout$batch[ , 2], main = expression(paste("Series: ",
+     beta["victim"], " means of batches of length 100")))
> acf(mout$batch[ , 3], main = expression(paste("Series: ",
+     beta["defendant"], " means of batches of length 100")))
> 
> mout <- metrop(mout, blen = 250)
> mout$accept
[1] 0.2592
> 
> acf(mout$batch[ , 2], main = expression(paste("Series: ",
+     beta["victim"], " means of batches of length 250")))
> acf(mout$batch[ , 3], main = expression(paste("Series: ",
+     beta["defendant"], " means of batches of length 250")))
> 
> # now we want density plots of the marginal posteriors for the parameters
> # in order to do this we require batches of length 1 (in essence,
> # unbatched) because we want to plot the PDF of the variables not the
> # PDF of the batch means.  In order for automatic bandwidth selection
> # in the density function to work properly, we must have nearly independent
> # output, hence we need wide spacing of output.
> # then we use the same bandwidth on the highly dependent data with no spacing
> # (gives more accurate density plot (because more data) with right amount
> # of smoothing (because bandwidth selection done on nearly IID data))
> 
> mout <- metrop(mout, blen = 1, nbatch = 1e5)
> bout.spac <- mout$batch[seq(1, mout$nbatch, 250), ]
> 
> dout <- density(bout.spac[ , 2])
> plot(dout, main = expression(paste("Series: ",
+     beta["victim"], " spacing of chain 250")))
> dout <- density(mout$batch[ , 2], bw = dout$bw)
> plot(dout, main = expression(paste("Series: ", beta["victim"])))
> 
> dout <- density(bout.spac[ , 3])
> plot(dout, main = expression(paste("Series: ",
+     beta["defendant"], " spacing of chain 250")))
> dout <- density(mout$batch[ , 3], bw = dout$bw)
> plot(dout, main = expression(paste("Series: ", beta["defendant"])))
> 
> 
> 
> 
> 
> 
> 
> 
> proc.time()
   user  system elapsed 
  5.883   0.026   7.163