# data x <- 2 n <- 25 # mle phat <- x / n # confidence level conf.level <- 0.95 # Wald interval (used to be standard taught in intro stats) phat + c(-1, 1) * qnorm((1 + conf.level) / 2) * sqrt(phat * (1 - phat) / n) # score interval (some intro stats books now teach this) prop.test(x, n, conf.level = conf.level, correct = FALSE) # the "correct = FALSE" is just bizarre. (Does that mean we don't want the # correct answer?) No. The R statement help(prop.test) explains that it # it means we do not want to use "continuity correction". And we don't. # No authority recommends what prop.test does by default. # Agresti, Section 1.4.2, recommends correct = FALSE. # Brown, Cai and DasGupta (Statistical Science, 2005, 20, pp. 375-379) # criticize Geyer and Meeden (Statistical Science, 2005, 20, pp. 358-366) # for using prop.test with correct = TRUE, providing plots of coverage # probability for with correct = FALSE and correct = TRUE to show this. # AFAIK no authority defends correct = TRUE, even help(prop.test) cites # no such authority. They do cite # Newcombe (Statistics in Medicine, 1998, 17, 857-872) where correct = FALSE # is his method 3 and correct = TRUE is (apparently) his method 4 # Newcombe's simulations do not show the correct = TRUE is better than # correct = FALSE and his conclusion does not recommend correct = TRUE # over correct = FALSE (but he does not pick a particular method to recommend). # # for a full derivation of the score interval and a check that correct = FALSE # actually calculates it see # http://www.stat.umn.edu/geyer/5102/slides/s2.pdf, slides 113-116 and 123 # (hmmmm. That does not actually seem to check. We'll check here.) crit <- qnorm((1 + conf.level) / 2) foo <- (phat + crit^2 / (2 * n) + c(-1, 1) * crit * sqrt(phat * (1 - phat) / n + crit^2 / (4 * n^2))) / (1 + crit^2 / n) foo # just get the interval bar <- prop.test(x, n, conf.level = conf.level, correct = FALSE) names(bar) bar$conf.int all.equal(bar$conf.int, foo, check.attributes = FALSE) # level set of log likelihood interval (too esoteric for intro stats) logl <- function(p, x, n) ifelse(x == 0, n * log(1 - p), ifelse(x == n, n * log(p), x * log(p) + (n - x) * log(1 - p))) crit <- qchisq(conf.level, df = 1) fred <- function(p) 2 * (logl(phat, x, n) - logl(p, x, n)) - crit tol <- sqrt(.Machine$double.eps) if (phat == 0) { low <- 0 } else { low <- uniroot(fred, lower = 0, upper = phat, tol = tol)$root } if (phat == 1) { hig <- 1 } else { hig <- uniroot(fred, lower = phat, upper = 1, tol = tol)$root } c(low, hig) # can also do with confint function in MASS package ??? library(MASS) gout <- glm(rbind(c(x, n - x)) ~ 1, family = binomial) cout <- confint(gout, level = conf.level) cout fred <- 1 / (1 + exp(- cout)) fred all.equal(c(low, hig), fred, check.attributes = FALSE, tolerance = 4e-4) # intervals compared phat + c(-1, 1) * qnorm((1 + conf.level) / 2) * sqrt(phat * (1 - phat) / n) as.vector(bar$conf.int) c(low, hig) # Hypothesis tests # data x <- 1300 n <- 2500 # mle phat <- x / n # upper tailed test # null hypothesis p0 <- 1 / 2 # test statistic and P-value for score test (standard in intro stats) tstat <- (x - n * p0) / sqrt(n * p0 * (1 - p0)) tstat pnorm(tstat, lower.tail = FALSE) # P-value for exact test (should be standard in intro stats ?????) pbinom(x - 1, n, p0, lower.tail = FALSE) # fuzzy P-value for exact test (Geyer and Meeden, Statistical Science, # 2005, vol 20, pp. 358-387) is uniformly distributed on interval pbinom(c(x, x - 1), n, p0, lower.tail = FALSE) # test statistic and P-value for likelihood ratio test # for one-tailed test have to use signed likelihood ratio statistic tstat <- 2 * (logl(phat, x, n) - logl(p0, x, n)) tstat <- sqrt(tstat) tstat <- tstat * sign(phat - p0) tstat pnorm(tstat, lower.tail = FALSE) # test statistic and P-value for Wald test tstat <- (x - n * p0) / sqrt(n * phat * (1 - phat)) tstat pnorm(tstat, lower.tail = FALSE) # two tailed test (same data) tstat <- (x - n * p0) / sqrt(n * p0 * (1 - p0)) tstat pnorm(abs(tstat), lower.tail = FALSE) * 2 # P-value for exact test (no exact two-tailed test) # fuzzy P-value for exact test (Geyer and Meeden, Statistical Science, # 2005, vol 20, pp. 358-387) now not simple library(ump) print(arpv.binom(x, n, p0, plot = FALSE)) # test statistic and P-value for likelihood ratio test tstat <- 2 * (logl(phat, x, n) - logl(p0, x, n)) pchisq(tstat, df = 1, lower.tail = FALSE) # test statistic and P-value for Wald test tstat <- (x - n * p0) / sqrt(n * phat * (1 - phat)) tstat pnorm(abs(tstat), lower.tail = FALSE) * 2