# for reproducibility of results
# to get different results, change seed
# or omit this statement
set.seed(42)

pdf(file = "mcmc-too.pdf")

# For Table 2.6 in Agresti, fit a Bayesian model
# using as prior the conjugate prior that puts "count" 1/2 in each cell

victim <- factor(rep(c("white", "black"), each = 2))
defendant <- factor(rep(c("white", "black"), times = 2))
deathpenalty <- matrix(c(53, 11, 0, 4, 414, 37, 16, 139), ncol = 2,
    dimnames = list(NULL, c("yes", "no")))
data.frame(victim, defendant, deathpenalty)

# Just for comparison, do the frequentist (likelihoodist) analysis.
#
# Note: the way glm with family = binomial works is special.
# If you have Bernoulli response (a. k. a. zero-or-one-valued,
# a. k. a. binomial with sample size 1), then you supply a
# (zero-or-one-valued) response vector.
# If you have binomial response with sample size > 1, then you
# supply a response MATRIX whose first column is the number of successes
# for each case and whose second column is the number of failures for
# each case.  It doesn't matter which you call "success" and which "failure"
# as long as you keep straight which is which.  This works even when the
# sample sizes for the "cases" differ, as they do here, where the "cases"
# are the four combinations of victim race and defendant race.

gout0 <- glm(deathpenalty ~ victim + defendant, family = binomial)
summary(gout0)
gout1 <- glm(deathpenalty ~ victim * defendant, family = binomial)
summary(gout1)
anova(gout0, gout1, test = "Chisq")   # likelihood ratio test of the two models

# we see the smaller model fits as well as the bigger model,
# i. e., the "interaction" is not statistically significant,
# so we will use the smaller model henceforth

# there are Bayesian competitors to frequentist hypothesis tests,
# but I don't think we will have time to cover them.
# for those who are interested, see my STAT 5102 slides, deck 4, slides 94 ff.
# http://www.stat.umn.edu/geyer/5102/slides/s4.pdf#page=94
# but I won't go over them in class (too theoretical)

# now we fit the "best" model according to likelihood inference Bayesianly.
# first we create an R function that calculates the log unnormalized density
# of the distribution that we are trying to sample by MCMC, which for
# Bayesian inference is the log unnormalized posterior

yes <- deathpenalty[ , "yes"]
no <- deathpenalty[ , "no"]

# get model matrix
gout0 <- glm(deathpenalty ~ victim + defendant, family = binomial, x = TRUE)
modmat <- gout0$x
modmat

# essentially copied from the "demo" vignette in package "mcmc"
ludfun <- function(beta) {
    stopifnot(is.numeric(beta))
    stopifnot(length(beta) == ncol(modmat))
    stopifnot(! is.na(beta))
    eta <- as.vector(modmat %*% beta)
    logp <- ifelse(eta < 0, eta - log1p(exp(eta)), - log1p(exp(- eta)))
    logq <- ifelse(eta < 0, - log1p(exp(eta)), - eta - log1p(exp(- eta)))
    sum((yes + 1/2) * logp + (no + 1/2) * logq)
}
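# A small illustration (not part of the original script) of why logp and logq
# are computed the roundabout way above rather than as log(p) and log(1 - p)
# with p <- 1 / (1 + exp(- eta)): for very negative eta the naive formula for
# log(p) returns -Inf because exp(- eta) overflows, whereas the formula used
# in ludfun (equivalent to R's built-in plogis(eta, log.p = TRUE)) stays
# finite and accurate (and similarly for log(q) when eta is very positive)
eta <- c(-800, -30, 0, 30, 800)
log(1 / (1 + exp(- eta)))                                    # naive: -Inf at eta = -800
ifelse(eta < 0, eta - log1p(exp(eta)), - log1p(exp(- eta)))  # formula used in ludfun
plogis(eta, log.p = TRUE)                                    # built-in, for comparison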
# The reason for yes + 1/2 and no + 1/2 is that this is how conjugate priors
# work.  They imagine the prior describes the uncertainty we would have from
# starting with a flat prior and observing data that is a count of 1/2 in
# each cell.  Fractional counts make no sense for actual data, but they do
# make algebraic sense.  We should think of these as hyperparameters of the
# prior.  If we replaced the last line of ludfun by
#
#     sum((yes + alpha1) * logp + (no + alpha2) * logq)
#
# then adjusting the "knobs" alpha1 and alpha2 to any values between zero
# and infinity would give a range of priors that describe a range of
# uncertainties about the parameters; we happen to have chosen
# alpha1 = alpha2 = 1/2.
#
# "ludfun" stands for log unnormalized density function
#
# the reasons for the tricky computation of logp and logq are explained
# in demo.pdf.  The general reason is computational accuracy despite
# inexactness of computer arithmetic, but the details are fussy.

library(mcmc)

mout <- metrop(ludfun, gout0$coefficients, nbatch = 1e4)
mout$accept
mout <- metrop(mout, scale = 0.1)
mout$accept
mout <- metrop(mout, scale = 0.5)
mout$accept
mout <- metrop(mout, scale = 0.3)
mout$accept

# scale now adjusted so the acceptance rate is about 1/4

acf(mout$batch[ , 2], lag.max = 100,
    main = expression(paste("Series: ", beta["victim"])))
acf(mout$batch[ , 3], lag.max = 100,
    main = expression(paste("Series: ", beta["defendant"])))

mout <- metrop(mout, blen = 100, nbatch = 100)
mout$accept
acf(mout$batch[ , 2], main = expression(paste("Series: ", beta["victim"],
    " means of batches of length 100")))
acf(mout$batch[ , 3], main = expression(paste("Series: ", beta["defendant"],
    " means of batches of length 100")))

mout <- metrop(mout, blen = 250)
mout$accept
acf(mout$batch[ , 2], main = expression(paste("Series: ", beta["victim"],
    " means of batches of length 250")))
acf(mout$batch[ , 3], main = expression(paste("Series: ", beta["defendant"],
    " means of batches of length 250")))

# now we want density plots of the marginal posteriors for the parameters.
# for this we need batches of length 1 (in essence, unbatched output) because
# we want to plot the PDF of the variables, not the PDF of the batch means.
# for automatic bandwidth selection in the density function to work properly,
# we need nearly independent output, hence wide spacing of the output.
# then we reuse the same bandwidth on the highly dependent, unspaced data,
# which gives a more accurate density plot (because there is more data) with
# the right amount of smoothing (because bandwidth selection was done on
# nearly IID data)

mout <- metrop(mout, blen = 1, nbatch = 1e5)
bout.spac <- mout$batch[seq(1, mout$nbatch, 250), ]

dout <- density(bout.spac[ , 2])
plot(dout, main = expression(paste("Series: ", beta["victim"],
    " spacing of chain 250")))
dout <- density(mout$batch[ , 2], bw = dout$bw)
plot(dout, main = expression(paste("Series: ", beta["victim"])))

dout <- density(bout.spac[ , 3])
plot(dout, main = expression(paste("Series: ", beta["defendant"],
    " spacing of chain 250")))
dout <- density(mout$batch[ , 3], bw = dout$bw)
plot(dout, main = expression(paste("Series: ", beta["defendant"])))
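# A possible follow-up (not part of the original script): numerical summaries
# of the marginal posteriors computed from the long unbatched run, here the
# posterior mean and an equal-tailed 95 percent posterior interval for each
# regression coefficient
colnames(mout$batch) <- names(gout0$coefficients)
apply(mout$batch, 2, mean)
apply(mout$batch, 2, quantile, probs = c(0.025, 0.975))

# the script opened a pdf graphics device at the top, so presumably it should
# also be closed once all the plots have been made
dev.off()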