# for reproducibility of results
# to get different results, change seed
# or omit this statement
set.seed(42)

pdf(file = "mcmc-too.pdf")

# For Table 2.6 in Agresti, fit a Bayesian model
# using as prior the conjugate prior that puts "count" 1/2 in each cell

victim <- factor(rep(c("white", "black"), each = 2))
defendant <- factor(rep(c("white", "black"), times = 2))
deathpenalty <- matrix(c(53, 11, 0, 4, 414, 37, 16, 139), ncol = 2,
    dimnames = list(NULL, c("yes", "no")))
data.frame(victim, defendant, deathpenalty)

# Just for comparison, do the frequentist (likelihoodist) analysis.
#
# Note: the way glm with family = binomial works is special.
# If you have Bernoulli response (a. k. a. zero-or-one-valued,
# a. k. a. binomial with sample size 1), then you supply a
# (zero-or-one-valued) response vector.
# If you have binomial response with sample size > 1, then you
# supply a response MATRIX whose first column is the number of successes
# for each case and whose second column is the number of failures for
# each case.  It doesn't matter which you call "success" and which "failure"
# as long as you keep straight which is which.  This works even when the
# sample sizes for the "cases" differ, as they do here, where the "cases"
# are the four combinations of victim race and defendant race.

gout0 <- glm(deathpenalty ~ victim + defendant, family = binomial)
summary(gout0)
gout1 <- glm(deathpenalty ~ victim * defendant, family = binomial)
summary(gout1)
anova(gout0, gout1, test = "Chisq")   # likelihood ratio test of the two models

# we see the smaller model fits as well as the bigger model,
# i. e., the "interaction" is not statistically significant,
# so we will use the smaller model henceforth

# there are Bayesian competitors to frequentist hypothesis tests,
# but I don't think we will have time to cover them.
# for those who are interested, see my STAT 5102 slides, deck 4, slides 94 ff.
# http://www.stat.umn.edu/geyer/5102/slides/s4.pdf#page=94
# but I won't go over them in class (too theoretical)

# now we fit the "best" model according to likelihood inference Bayesianly.
# first we create an R function that calculates the log unnormalized density
# of the distribution that we are trying to sample by MCMC, which for
# Bayesian inference is the log unnormalized posterior

yes <- deathpenalty[ , "yes"]
no <- deathpenalty[ , "no"]

# get model matrix
gout0 <- glm(deathpenalty ~ victim + defendant, family = binomial, x = TRUE)
modmat <- gout0$x
modmat

# essentially copied from the "demo" vignette in package "mcmc"
ludfun <- function(beta) {
    stopifnot(is.numeric(beta))
    stopifnot(length(beta) == ncol(modmat))
    stopifnot(! is.na(beta))
    eta <- as.vector(modmat %*% beta)
    logp <- ifelse(eta < 0, eta - log1p(exp(eta)), - log1p(exp(- eta)))
    logq <- ifelse(eta < 0, - log1p(exp(eta)), - eta - log1p(exp(- eta)))
    sum((yes + 1/2) * logp + (no + 1/2) * logq)
}
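# A small illustration (not part of the original script) of why logp and logq
# are computed the roundabout way above rather than as log(p) and log(1 - p)
# with p <- 1 / (1 + exp(- eta)): for very negative eta the naive formula for
# log(p) returns -Inf because exp(- eta) overflows, whereas the formula used
# in ludfun (equivalent to R's built-in plogis(eta, log.p = TRUE)) stays
# finite and accurate (and similarly for log(q) when eta is very positive)
eta <- c(-800, -30, 0, 30, 800)
log(1 / (1 + exp(- eta)))                                    # naive: -Inf at eta = -800
ifelse(eta < 0, eta - log1p(exp(eta)), - log1p(exp(- eta)))  # formula used in ludfun
plogis(eta, log.p = TRUE)                                    # built-in, for comparison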
# The reason for yes + 1/2 and no + 1/2 is that this is how conjugate priors
# work.  They imagine the prior describes the uncertainty we would have from
# starting with a flat prior and observing data that is a count of 1/2 in
# each cell.  Fractional counts make no sense for actual data, but they do
# make algebraic sense.  We should think of these as hyperparameters of the
# prior.  If we replaced the last line of ludfun by
#
#     sum((yes + alpha1) * logp + (no + alpha2) * logq)
#
# then adjusting the "knobs" alpha1 and alpha2 to any values between zero
# and infinity would give a range of priors that describe a range of
# uncertainties about the parameters; we happen to have chosen
# alpha1 = alpha2 = 1/2.
#
# "ludfun" stands for log unnormalized density function
#
# the reasons for the tricky computation of logp and logq are explained
# in demo.pdf.  The general reason is computational accuracy despite
# inexactness of computer arithmetic, but the details are fussy.

library(mcmc)

mout <- metrop(ludfun, gout0$coefficients, nbatch = 1e4)
mout$accept
mout <- metrop(mout, scale = 0.1)
mout$accept
mout <- metrop(mout, scale = 0.5)
mout$accept
mout <- metrop(mout, scale = 0.3)
mout$accept

# scale now adjusted so the acceptance rate is about 1/4

acf(mout$batch[ , 2], lag.max = 100,
    main = expression(paste("Series: ", beta["victim"])))
acf(mout$batch[ , 3], lag.max = 100,
    main = expression(paste("Series: ", beta["defendant"])))

mout <- metrop(mout, blen = 100, nbatch = 100)
mout$accept
acf(mout$batch[ , 2], main = expression(paste("Series: ", beta["victim"],
    " means of batches of length 100")))
acf(mout$batch[ , 3], main = expression(paste("Series: ", beta["defendant"],
    " means of batches of length 100")))

mout <- metrop(mout, blen = 250)
mout$accept
acf(mout$batch[ , 2], main = expression(paste("Series: ", beta["victim"],
    " means of batches of length 250")))
acf(mout$batch[ , 3], main = expression(paste("Series: ", beta["defendant"],
    " means of batches of length 250")))

# now we want density plots of the marginal posteriors for the parameters.
# for this we need batches of length 1 (in essence, unbatched output) because
# we want to plot the PDF of the variables, not the PDF of the batch means.
# for automatic bandwidth selection in the density function to work properly,
# we need nearly independent output, hence wide spacing of the output.
# then we reuse the same bandwidth on the highly dependent, unspaced data,
# which gives a more accurate density plot (because there is more data) with
# the right amount of smoothing (because bandwidth selection was done on
# nearly IID data)

mout <- metrop(mout, blen = 1, nbatch = 1e5)
bout.spac <- mout$batch[seq(1, mout$nbatch, 250), ]

dout <- density(bout.spac[ , 2])
plot(dout, main = expression(paste("Series: ", beta["victim"],
    " spacing of chain 250")))
dout <- density(mout$batch[ , 2], bw = dout$bw)
plot(dout, main = expression(paste("Series: ", beta["victim"])))

dout <- density(bout.spac[ , 3])
plot(dout, main = expression(paste("Series: ", beta["defendant"],
    " spacing of chain 250")))
dout <- density(mout$batch[ , 3], bw = dout$bw)
plot(dout, main = expression(paste("Series: ", beta["defendant"])))
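# A possible follow-up (not part of the original script): numerical summaries
# of the marginal posteriors computed from the long unbatched run, here the
# posterior mean and an equal-tailed 95 percent posterior interval for each
# regression coefficient
colnames(mout$batch) <- names(gout0$coefficients)
apply(mout$batch, 2, mean)
apply(mout$batch, 2, quantile, probs = c(0.025, 0.975))

# the script opened a pdf graphics device at the top, so presumably it should
# also be closed once all the plots have been made
dev.off()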