\documentclass[11pt]{article}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{indentfirst}
\usepackage{natbib}
\usepackage{url}
\RequirePackage{amsfonts}
\newcommand{\real}{\mathbb{R}}
\newcommand{\rats}{\mathbb{Q}}
\newcommand{\nats}{\mathbb{N}}
\newcommand{\set}[1]{\{\, #1 \,\}}
\newcommand{\abs}[1]{\lvert #1 \rvert}
\newcommand{\norm}[1]{\lVert #1 \rVert}
\newcommand{\inner}[1]{\langle #1 \rangle}
\newcommand{\biginner}[1]{\left\langle #1 \right\rangle}
\newcommand{\fatdot}{\,\cdot\,}
\DeclareMathOperator{\cl}{cl}
\DeclareMathOperator{\interior}{int}
\DeclareMathOperator{\boundary}{bdry}
\DeclareMathOperator{\var}{var}
\DeclareMathOperator{\cov}{cov}
\DeclareMathOperator{\cor}{cor}
\DeclareMathOperator{\pr}{pr}
\DeclareMathOperator{\tr}{tr}
\newcommand{\uto}{\stackrel{u}{\longrightarrow}}
\newcommand{\weakto}{\stackrel{\mathcal{D}}{\longrightarrow}}
\newcommand{\lawto}{\stackrel{\mathcal{L}}{\longrightarrow}}
\newcommand{\asto}{\stackrel{\text{a.s.}}{\longrightarrow}}
\newcommand{\probto}{\stackrel{P}{\longrightarrow}}
\newcommand{\opand}{\mathbin{\mathrm{and}}}
\newcommand{\opor}{\mathbin{\mathrm{or}}}
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{definition}{Definition}
\RequirePackage{amssymb}
\let\emptyset=\varnothing
%%%%% Barred Capital Letters %%%%%
\newcommand{\Wbar}{W{\mkern -19.0 mu}\overline{\phantom{\text{W}}}}
\newcommand{\Xbar}{X{\mkern -13.5 mu}\overline{\phantom{\text{X}}}}
\newcommand{\Ybar}{Y{\mkern -14.0 mu}\overline{\phantom{\text{Y}}}}
\newcommand{\Zbar}{Z{\mkern -10.5 mu}\overline{\phantom{\text{Z}}}}
\begin{document}
\begin{raggedright}
\Large
Stat 8501 Lecture Notes
\\
\textbf{Markov Chains}
\\
Charles J. Geyer
\\
\today
\par
\end{raggedright}
\tableofcontents
\section{Kernels}
\subsection{Definitions}
A \emph{kernel} on a measurable space $(\Omega, \mathcal{A})$ is a function
$K : \Omega \times \mathcal{A} \to \real$ having the following properties
\citep[Section~1.1]{nummelin}.
\begin{enumerate}
\item[(i)] For each fixed $A \in \mathcal{A}$,
the function $x \mapsto K(x, A)$ is measurable.
\item[(ii)] For each fixed $x \in \Omega$,
the function $A \mapsto K(x, A)$ is either a positive measure or
a signed measure.
\end{enumerate}
We will only be interested in kernels such that $K(x, \fatdot)$ is a
signed measure for each $x$.
A kernel is \emph{nonnegative} if all of its values are nonnegative.
A kernel is \emph{substochastic} if it is nonnegative and
$$
K(x, \Omega) \le 1, \qquad x \in \Omega.
$$
A kernel is \emph{stochastic} or \emph{Markov} if it is nonnegative and
$$
K(x, \Omega) = 1, \qquad x \in \Omega.
$$
If $K$ is a stochastic kernel then $K(x, \fatdot)$ is a probability measure
for each $x$.
\subsection{Operations} \label{sec:operations}
Signed measures and kernels have the following operations
\citep[Section~1.1]{nummelin}.
For any signed measure $\lambda$ and kernel $K$, we can
``left multiply'' $K$ by $\lambda$ giving another signed measure,
denoted $\mu = \lambda K$, defined by
$$
\mu(A) = \int \lambda(d x) K(x, A), \qquad A \in \mathcal{A}.
$$
For any two kernels $K_1$ and $K_2$ we can ``multiply'' them giving another
kernel, denoted $K_3 = K_1 K_2$, defined by
$$
K_3(x, A) = \int K_1(x, d y) K_2(y, A), \qquad A \in \mathcal{A}.
$$
For any kernels $K$ and measurable function $f : \Omega \to \real$,
we can ``right multiply'' $K$ by $f$ giving another measurable function,
denoted $g = K f$, defined by
\begin{equation} \label{eq:right-multiply}
g(x) = \int K(x, d y) f(y), \qquad x \in \Omega,
\end{equation}
provided the integral exists (we can only write $K f$ when we know
the integral exists).
The kernel which acts as an identity element for kernel multiplication is
defined by
$$
I(x, A) = \begin{cases} 1, & x \in A \\ 0, & \text{otherwise} \end{cases}
$$
Note that this combines two familiar notions. The map $x \mapsto I(x, A)$
is the indicator function of the set $A$, and the map $A \mapsto I(x, A)$
is the probability measure concentrated at the point $x$.
It is easily checked that $I$ does act as an identity element, that is
$\lambda = \lambda I$ when $\lambda$ is a signed measure,
$K I = K = I K$ when $K$ is a kernel, and
$I f = f$ when $f$ is a bounded measurable function.
For any kernel $K$ we write $K^n$ for the product of $K$ with itself $n$
times. We also write $K^1 = K$ and $K^0 = I$, so we have $K^m K^n = K^{m + n}$
for any nonnegative integers $m$ and $n$.
\subsection{Finite State Space}
The notation for these operations is meant to recall the notation for
matrix multiplication. When $\Omega$ is a finite set, we can associate
signed measures and functions with vectors and kernels with matrices
and the ``multiplication'' operations defined above become multiplications
of the associated matrices.
We take the vector space to be $\real^\Omega$
so vectors are functions $\Omega \to \real$, that is, we are taking $\Omega$
to be the index set and writing $v(x)$ rather than $v_x$ for the components
of a vector $v$. Then matrices are elements of $\real^{\Omega \times \Omega}$,
so they are functions $\Omega \times \Omega \to \real$, that is, we are
again taking $\Omega$ to be the index set and writing $m(x, y)$ rather than
$m_{x y}$ for the components of a matrix $M$.
We can associate a signed measure $\lambda$ with a vector
$\tilde{\lambda}$ defined by
$$
\tilde{\lambda}(x) = \lambda(\{x\}), \qquad x \in \Omega,
$$
and can associate a kernel $K$ with a matrix $\widetilde{K}$ having elements
defined by
$$
\tilde{k}(x, y) = K(x, \{y\}), \qquad x, y \in \Omega.
$$
A function $f : \Omega \to \real$ is
a vector already. Think of signed measures as row vectors, then
the matrix multiplication $\tilde{\lambda} \widetilde{K}$ is associated
with the kernel $\lambda K$.
Think of functions as column vectors, then
the matrix multiplication $\widetilde{K} f$ is associated
with the function $K f$.
The matrix multiplication $\widetilde{K}_1 \widetilde{K}_2$ is associated
with the kernel $K_1 K_2$.
The matrix associated
with the identity kernel is the identity matrix with elements
$\tilde{i}(x, y) = I(x, \{y\})$.
\subsection{Regular Conditional Probabilities}
A Markov kernel gives a \emph{regular conditional probability};
it describes the conditional distribution of two random variables,
say of $Y$ given $X$. This is often written
\begin{equation} \label{eq:fubar}
K(x, A) = \Pr(Y \in A \mid X = x),
\end{equation}
but the right side is undefined when $\Pr(X = x) = 0$, so the right hand
side is not really mathematics. Kernels are real mathematics.
\section{Markov Chains} \label{sec:markov}
A stochastic process $X_1$, $X_2$, $\ldots$
taking values in an arbitrary measurable space (the $X_i$ need not be
real-valued or vector-valued), which is called the \emph{state space}
of the process, is a \emph{Markov chain} if it has the \emph{Markov property}:
the conditional distribution of the future given the past and present depends
only on the present, that is, the conditional distribution of
$(X_{n + 1}, X_{n + 2}, \ldots)$ given $(X_1, \ldots, X_n)$ depends only
on $X_n$. A Markov chain has \emph{stationary transition probabilities}
if the conditional distribution of $X_{n + 1}$ given $X_n$ does not depend
on $n$. We assume stationary transition probabilities without further mention
throughout this handout.
In this handout we are interested in Markov chains on general state spaces,
where ``general'' does not mean completely general (sorry about that), but
means the measurable space $(\Omega, \mathcal{A})$ is countably generated,
meaning $\mathcal{A} = \sigma(\mathcal{C})$, where $\mathcal{C}$ is
a countable family of subsets of $\Omega$ and $\sigma(\mathcal{C})$ is the
smallest sigma-algebra containing $\mathcal{C}$, which is the intersection
of all sigma-algebras containing $\mathcal{C}$. This is the assumption
made by the authoritative books on general state space Markov chains
\citep{nummelin,meyn-tweedie}. Countably generated is a very weak assumption
(it applies to the Borel sigma-algebra of $\real^d$, for example,
the Borel sigma-algebra being $\sigma(\mathcal{O})$, where $\mathcal{O}$ is
the family of open subsets of $\real^d$).
We always assume it, but will not mention it again
except in Section~\ref{sec:small-petite} where the reason for this
assumption will be explained.
We assume the conditional distribution of $X_{n + 1}$ given $X_n$ is given
by a Markov kernel $P$. The marginal distribution of $X_1$ is called
the \emph{initial distribution}. Together the initial distribution and
the transition probability kernel determine the joint distribution of the
stochastic process that is the Markov chain.
Straightforwardly, they determine all the finite-dimensional distributions,
the joint distribution of $X_1$, $\ldots,$ $X_n$ for any $n$ is determined by
\begin{multline*}
E\{ g(X_1, \ldots, X_n) \}
\\
=
\idotsint
g(x_1, \ldots, x_n)
\lambda(d x_1) P(x_1, d x_2) P(x_2, d x_3) \cdots P(x_{n - 1}, d x_n),
\end{multline*}
for all bounded measurable functions $g(X_1, \ldots, X_n)$.
\citet[Sections~22.1 and~22.3]{fristedt} discuss the construction
of the probability measure governing the infinite sequence, showing
it is determined by the finite-dimensional distributions.
For any nonnegative integer $n$, the kernel $P^n$ gives the $n$-step transition
probabilities of the Markov chain. In sloppy notation,
$$
P^n(x, A) = \Pr(X_{n + 1} \in A \mid X_1 = x).
$$
In a different sloppy notation, we can write the joint probability measure of
$(X_2, \ldots, X_{n + 1})$ given $X_1$ as
$$
P(x_1, d x_2) P(x_2, d x_3) \cdots P(x_n, d x_{n + 1}),
$$
which is shorthand for
\begin{multline*}
E\{ g(X_2, \ldots, X_{n + 1}) \mid X_1 = x_1 \}
\\
=
\idotsint
g(x_2, \ldots, x_{n + 1})
P(x_1, d x_2) P(x_2, d x_3) \cdots P(x_n, d x_{n + 1}),
\end{multline*}
whenever $g(X_2, \ldots, X_{n + 1})$ has expectation.
So
\begin{multline*}
\Pr( X_{n + 1} \in A \mid X_1 = x_1 )
\\
=
\idotsint
I_A(x_{n + 1})
P(x_1, d x_2) P(x_2, d x_3) \cdots P(x_n, d x_{n + 1}),
\end{multline*}
and this does indeed equal $P^n(x_1, A)$.
\section{Irreducibility}
Let $\varphi$ be a strictly positive measure on the state space
$(\Omega, \mathcal{A})$,
meaning $\varphi(A) \ge 0$ for all $A \in \mathcal{A}$
and $\varphi(\Omega) > 0$.
We say a set $A \in \mathcal{A}$ is \emph{$\varphi$-positive} in case
$\varphi(A) > 0$.
A nonnegative kernel $P$ on the state space
is \emph{$\varphi$-irreducible} if for every $x \in \Omega$
and $\varphi$-positive $A \in \mathcal{A}$
there exists a positive integer $n$ (which may depend on $x$ and $A$) such that
$P^n(x, A) > 0$. When $P$ is $\varphi$-irreducible, we also say $\varphi$
is an \emph{irreducibility measure} for $P$.
We say $P$ is \emph{irreducible} if it is $\varphi$-irreducible
for some $\varphi$.
We also apply these terms to Markov chains.
A Markov chain is $\varphi$-irreducible (resp.~irreducible)
if its transition probability kernel has this property.
This definition seems quite arbitrary in that the measure $\varphi$ is quite
arbitrary. Note, however that $\varphi$ is used only to specify a
family of null sets, which are excluded from the test (we only have to
find an $n$ such that $P^n(x, A) > 0$ for $A$ such that $\varphi(A) > 0$).
\subsection{Maximal Irreducibility Measures}
If a kernel is $\varphi$-irreducible for some $\varphi$, then there
always exists \citep[Theorem~2.4]{nummelin}
a \emph{maximal irreducibility measure $\psi$} that specifies the
minimal family of null sets, meaning $\psi(A) = 0$ implies $\varphi(A) = 0$
for any irreducibility measure $\varphi$.
A maximal irreducibility measure is not unique, but the family
of null sets it specifies is unique.
\subsection{Communicating Sets}
A set $B \in \mathcal{A}$
is \emph{$\varphi$-communicating} if for every $x \in B$
and every $\varphi$-positive $A \in \mathcal{A}$ such that $A \subset B$
there exists
a positive integer $n$ (which may depend on $x$ and $A$)
such that $P^n(x, A) > 0$. Clearly, the kernel $P$
is $\varphi$-irreducible if and only if the whole state space
is $\varphi$-communicating.
\subsection{Subsampled Chains} \label{sec:sample}
Suppose $P$ is a Markov kernel and $q$ is the probability vector for
a nonnegative-integer-valued random variable. Define
\begin{equation} \label{eq:subsample}
P_q(x, A) = \sum_{n = 0}^\infty q_n P^n(x, A).
\end{equation}
Then it is easily seen that $P_q$ is also a Markov kernel.
If $X_1$, $X_2$, $\ldots$ is a Markov chain having
transition probability kernel $P$ and $N_1$, $N_2$, $\ldots$ is
an independent and identically distributed (IID) sequence
of random variables having probability vector $q$ that are also independent
of $X_1$, $X_2$, $\ldots,$ then
$X_{1 + N_1}$, $X_{1 + N_1 + N_2}$, $\ldots$ is a Markov chain having
transition probability kernel $P_q$, which is said to be derived from
the original chain by \emph{subsampling}.
If the random variables $N_1$, $N_2$, $\ldots$ are almost surely constant,
that is, if the vector $q$ has only one non-zero element, then we say
the subsampling is \emph{nonrandom}. Otherwise, we say it is \emph{random}.
\begin{lemma} \label{lem:convolution}
If $P_q$ and $P_r$ are subsampling kernels, then
$$
P_q P_r = P_{q * r},
$$
where $q * r$ is the convolution of the probability vectors $q$ and $r$
defined by
\begin{equation} \label{eq:convolution}
(q * r)_n = \sum_{k = 0}^n q_k r_{n - k}.
\end{equation}
\end{lemma}
\begin{proof}
\begin{align*}
(P_q P_r)(x, A)
& =
\sum_{k = 0}^\infty \sum_{m = 0}^\infty q_k r_m P^{k + m}(x, A)
\\
& =
\sum_{n = 0}^\infty \sum_{k = 0}^n q_k r_{n - k} P^n(x, A)
\end{align*}
(in the change of summation indices $n = k + m$ so $m = n - k$).
\end{proof}
\begin{lemma} \label{lem:sampling}
Let $q$ be a probability vector having no elements equal to zero.
The following are equivalent (each implies the others).
\begin{enumerate}
\item[\normalfont (a)] The set $B$ is $\varphi$-communicating for the
kernel $P$.
\item[\normalfont (b)] The set $B$ is $\varphi$-communicating for the
kernel $P_q$.
\item[\normalfont (c)] $P_q(x, A) > 0$ for every $x \in B$ and
every $\varphi$-positive $A \subset B$.
\end{enumerate}
\end{lemma}
\begin{proof}
That (a) implies (c) implies (b) is clear. It remains only to be shown
that (b) implies (a). So assume (b). Then for any $x \in B$ and $A \subset B$
such that $\varphi(A) > 0$ there exists an $n$ such that $P_q^n(x, A) > 0$.
Suppose $r$ is another probability vector having no zero elements.
It is clear from \eqref{eq:convolution} that $q * r$ has no zero elements
either. Let $s$ be the $n$-fold convolution of $q$ with itself, then
(by mathematical induction) $P_q^n = P_s$ and $s$ has no zero elements.
Hence
\begin{equation} \label{eq:sampling-lemma}
P_s(x, A)
=
\sum_{n = 0}^\infty s_n P^n(x, A) > 0,
\end{equation}
hence some term in \eqref{eq:sampling-lemma} must be nonzero, hence
$P^n(x, A) > 0$ for some $n$, and this holding for all $x$
and all $\varphi$-positive $A$ implies (a).
\end{proof}
\subsection{Separable Metric Spaces}
Verifying irreducibility can be quite easy
or exceedingly difficult. Here is a case when it is easy.
An open set is said
to be \emph{connected} when it is not a union of a pair of disjoint open sets.
A \emph{basis} for a topological space is a
family of open sets $\mathcal{B}$ such that every open set is
a union of a subfamily of $\mathcal{B}$.
A topological space is said to be \emph{second countable} if it
has a countable basis.
Every separable metric space
is second countable (balls having radii $1 / n$ for integer $n$ centered on
points in the countable dense set form a basis).
\begin{theorem} \label{th:topology}
Suppose a Markov chain with state space $\Omega$ has the following properties.
\begin{enumerate}
\item[\normalfont (a)] $\Omega$ is a connected second countable topological space.
\item[\normalfont (b)] Every nonempty open subset of $\Omega$
is $\varphi$-positive.
\item[\normalfont (c)] Every point in $\Omega$ has a $\varphi$-communicating
neighborhood.
\end{enumerate}
Then the Markov chain is $\varphi$-irreducible.
\end{theorem}
\begin{proof}
Let $\mathcal{U}$ be a countable basis for $\Omega$,
and let $\mathcal{B}$ be the family of $\varphi$-communicating elements
of $\mathcal{U}$. We claim that $\mathcal{B}$ is also a countable basis
for $\Omega$. To prove this, consider an arbitrary open set $W$ in $\Omega$.
Then for each $x \in W$, there exists $U_x \in \mathcal{U}$
satisfying $x \in U_x \subset W$.
By assumption $x$ also has a $\varphi$-communicating neighborhood $N_x$
whose interior $N_x^\circ$ contains $x$.
Then there also exists $B_x \in \mathcal{U}$
satisfying $x \in B_x \subset U_x \cap N_x^\circ$.
Since subsets of $\varphi$-communicating sets
are themselves $\varphi$-communicating, $B_x \in \mathcal{B}$.
This shows an arbitrary open set $W$ is a union of elements of $\mathcal{B}$,
so $\mathcal{B}$ is a basis.
Consider a sequence $C_1$, $C_2$, $\ldots$ of sets defined inductively
as follows. First, $C_1$ is an arbitrary element of $\mathcal{B}$.
Then, assuming $C_1$, $\ldots,$ $C_n$ have been defined, we define
$$
C_{n + 1} = \bigcup \set{ B \in \mathcal{B} : B \cap C_n \neq \emptyset }.
$$
We show each $C_n$ is $\varphi$-communicating by mathematical induction.
The base of the induction, that $C_1$ is $\varphi$-communicating is true
by definition. To complete the induction we assume $C_n$
is $\varphi$-communicating and must show $C_{n + 1}$
is $\varphi$-communicating.
By (b) of Lemma~\ref{lem:sampling} we may use a kernel $P_q$ to show this.
So suppose $x \in C_{n + 1}$ and
$A \subset C_{n + 1}$ is $\varphi$-positive.
Because $\mathcal{B}$ is countable, there must exist
$B \in \mathcal{B}$ such that $B \subset C_{n + 1}$ and
$\varphi(A \cap B) > 0$. Moreover we must have $B \cap C_n \neq \emptyset$
by definition of $C_{n + 1}$.
Hence $\varphi(B \cap C_n) > 0$ by assumption (b) of the theorem.
Also we must have $x \in B_x$ for some $B_x \in \mathcal{B}$ such that
$B_x \subset C_{n + 1}$ and $B_x \cap C_n \neq \emptyset$.
Hence $\varphi(B_x \cap C_n) > 0$ by assumption (b) of the theorem.
Then
\begin{align*}
P_q^3(x, A \cap B)
& =
\iint P_q(x, d y) P_q(y, d z) P_q(z, A \cap B)
\end{align*}
is strictly positive because
$P_q(z, A \cap B) > 0$ for all $z \in B$
because $B$ is $\varphi$-communicating,
and $P_q(y, B \cap C_n) > 0$ for all $y \in C_n$,
because $C_n$ is $\varphi$-communicating and $B \cap C_n$ is $\varphi$-positive,
and this implies that
$$
\int P_q(y, d z) P_q(z, A \cap B)
$$
is strictly positive for all $y \in C_n$,
and $P_q(x, B_x \cap C_n) > 0$ because $B_x$ is $\varphi$-communicating
and $B_x \cap C_n$ is $\varphi$-positive,
and this implies that
$$
\int P_q(x, d y) \int P_q(y, d z) P_q(z, A \cap B)
$$
is strictly positive.
This finishes the proof that each $C_n$ is $\varphi$-communicating.
Let
$$
C_\infty = \bigcup_{k = 1}^\infty C_k.
$$
Then $C_\infty$ is $\varphi$-communicating,
because for any $x \in C_\infty$ and $\varphi$-positive $A \subset C_\infty$
there is a $k$
such that $x \in C_k$ and $\varphi(A \cap C_k) > 0$. Hence
$P_q(x, A \cap C_k) > 0$
because $C_k$ is $\varphi$-communicating.
Now let
$$
\mathcal{B}_{\text{leftovers}}
=
\set{ B \in \mathcal{B} : B \not\subset C_\infty }.
$$
Any $B \in \mathcal{B}_{\text{leftovers}}$ is actually disjoint
from $C_\infty$ because otherwise it would have to intersect some $C_n$
and hence be contained in $C_{n + 1}$. Thus
the open set $C_{\text{leftovers}} = \bigcup \mathcal{B}_{\text{leftovers}}$
and the open set
$C_\infty$ are disjoint and their union is $\Omega$.
By the assumption that $\Omega$ is topologically connected
$C_{\text{leftovers}}$ must be empty.
Thus $\Omega = C_\infty$ is $\varphi$-communicating.
\end{proof}
The theorem seems very technical, but here is a simple toy problem that
illustrates it. Let $W$ be an arbitrary connected open subset of $\real^d$,
and take $W$ to be the state space of a Markov chain. Fix $\varepsilon > 0$,
define $K(x, \fatdot)$ to
be the uniform distribution on the ball of radius $\varepsilon$
centered at $x$, and define
$$
P(x, A) = [1 - K(x, W) ] I(x, A) + K(x, W \cap A).
$$
This is a special case of the Metropolis algorithm, described
in Section~\ref{sec:metropolis} below.
The Markov chain can be described as follows. We may take $X_1$ to be
any point of $W$. When the current state is $X_n$, we ``propose'' $Y_n$
uniformly distributed on the ball of radius $\varepsilon$ centered at $X_n$.
Then we set
\begin{equation} \label{eq:metropolis-uniform}
X_{n + 1} = \begin{cases} Y_n, & Y_n \in W \\ X_n, & \text{otherwise}
\end{cases}
\end{equation}
Since $W$ is a separable metric space it is second countable.
Thus condition (a) of the theorem is satisfied.
Let $\varphi$ be Lebesgue measure on $W$.
Then condition (b) of the theorem is satisfied.
Let $x \in W$ and let $B$ be the open ball of radius less than or equal
to $\varepsilon / 2$ centered at $x$ and contained in $W$.
Then $\Pr(Y_n \in B \mid X_n \in B) > 0$
and the conditional distribution of $Y_n$ given $X_n \in B$ and $Y_n \in B$
is uniformly distributed on $B$.
Since $Y_n \in B$ implies $Y_n \in W$ and $X_{n + 1} = Y_n$,
the conditional distribution of $X_{n + 1}$ given $X_n \in B$ and $Y_n \in B$
is uniformly distributed on $B$.
This implies $B$ is $\varphi$-communicating, and that establishes
condition (c) of the theorem.
\subsection{Variable at a Time Samplers}
Here is another toy example that illustrates general issues.
As in the example in the preceding section, we let the
state space be a connected open set $W$ in $\real^d$, and we show the Markov
chain is $\varphi$-irreducible where $\varphi$ is Lebesgue measure on $W$.
This time, however, we use a variable at a time sampler.
Fix $\varepsilon > 0$. Let $X_n(i)$ denote the $i$-th coordinate
of the state $X_n$ of the Markov chain (which is a $d$-dimensional vector).
The update of the state proceeds as follows. Let $I_n$ be uniformly
distributed on the finite set $\{ 1, \ldots, d \}$.
Let $Y_n(j) = X_n(j)$ for $j \neq I_n$, and let $Y_n(I_n)$
be uniformly distributed
on the open interval $(X_n(I_n) - \varepsilon, X_n(I_n) + \varepsilon)$.
Then we define $X_{n + 1}$ by \eqref{eq:metropolis-uniform}.
This is a special case of the variable-at-a-time Metropolis algorithm,
which is not described in general in this handout \citep[see][]{geyer-intro}.
In order to apply Theorem~\ref{th:topology} to this example it only remains
to be shown that every point of $W$ has a $\varphi$-communicating neighborhood,
where $\varphi$ is Lebesgue measure on $W$. Since $W$ is open, every point
is contained in a box
$$
B_\delta(x)
=
\set{ y \in \real^d : \abs{x_i - y_i} < \delta, i = 1, \ldots, d }
$$
such that $B_\delta(x) \subset W$ and $\delta < \varepsilon$. Fix $y \in
B_\delta(x)$ and $C \subset B_\delta(x)$ such that $C$ has positive Lebesgue
measure. We claim that $P^d(y, C) > 0$. The probability that $I_k = k$,
$k = 1$, $\ldots,$ $d$ is $(1 / d)^d > 0$. When this occurs, we have
$\Pr(X_{k + 1} \neq X_k) > (\delta / \varepsilon) > 0$, $k = 1$, $\ldots,$ $d$.
And when this occurs, we have the conditional distribution of $X_d$ conditional
on $X_d \in B_\delta(x)$ uniformly distributed on $B_\delta(x)$.
Hence we have
$$
P^d(y, C) \ge \left(\frac{\delta}{\varepsilon d}\right)^d \cdot
\frac{\varphi(C)}{\varphi(B_\delta(x))}
$$
and this is greater than zero.
\subsection{Countable State Spaces}
If the state space of the Markov chain is countable, then irreducibility
questions can be settled by looking at paths. A \emph{path} from $x$ to
$y$ is a finite sequence of states
$$
x = x_1, x_2, \dots, x_n = y
$$
such that
$$
P(x_i, \{ x_{i + 1} \}) > 0, \qquad i = 1, \ldots, n - 1.
$$
If there exists a state $y$ such that there is a path from $x$ to $y$
for every $x \in \Omega$, then the kernel is $\varphi$-irreducible with
$\varphi$ concentrated at $y$.
If there does not exist such a state $y$, then the kernel
is not $\varphi$-irreducible for any $\varphi$.
Suppose the kernel is $\varphi$-irreducible with $\varphi$ concentrated at $y$.
Let $S$ denote the set of states $z$ such that there exists a path from $y$ to
$z$. We claim that counting measure on $S$ is a maximal irreducibility
measure $\psi$. Clearly,
there is a path $x \to y \to z$ for any $x \in \Omega$
and $z \in S$. Thus $\psi$ is an irreducibility measure.
Conversely, if $w \notin S$, then there is no path $y \to w$.
Hence no irreducibility measure can give positive measure to the point $w$.
\section{Stochastic Stability}
\subsection{Transience and Recurrence}
Let $N_A$ denote the number of visits to $A$ made by a Markov chain
(this is a random variable, different for each realization of the chain).
A set $A$ is called \emph{recurrent} if
$$
E_x(N_A) = \infty, \qquad x \in A.
$$
A set $A$ is called \emph{uniformly transient} if there exists a real number
$M$ such that
$$
E_x(N_A) \le M, \qquad x \in A.
$$
For a countable state space Markov chain and $A$ a singleton set,
these definitions reduce to
the usual ones \citep[Theorem~1 of Chapter~1]{hoel-port-stone}.
But for uncountable state spaces, especially for continuous distributions,
looking at single points makes no sense. If the distribution of the Markov
chain is continuous, then every point has probability zero and
$E_x(N_{\{y\}}) = 0$, for every point $y$.
For a $\psi$-irreducible Markov chain the behavior is simple.
Either every $\psi$-positive set is recurrent,
in which case we say the chain is \emph{recurrent},
or there exists a countable cover of the state space by uniformly transient
sets,
in which case we say the chain is \emph{transient}
\citep[Theorem~8.0.1]{meyn-tweedie}.
This is the \emph{transience-recurrence dichotomy:} every irreducible
Markov chain is either transient or recurrent.
For a transient chain we can say more about which sets are uniformly
transient, but this gets us a bit ahead of ourselves. Petite sets
are defined in Section~\ref{sec:small-petite} below, and an easily
used criterion for petiteness is given in Section~\ref{sec:t-chain} below
(every compact set is petite under certain regularity conditions that hold
for many Markov chains, and every subset of a petite set is petite, so
for such chains every bounded set is petite).
Theorem~8.0.1 in \citet{meyn-tweedie} adds that for a transient chain
every petite set is uniformly transient.
The definitions of transience and recurrence work in opposite directions.
For recurrence, we might be concerned that a set is too little to be hit
infinitely often, but the theorem cited above says no $\psi$-positive set,
no matter how little, fails to be recurrent.
For transience, we might be concerned that a set is too big to be hit
only finitely many times, but the theorem cited above says every petite set,
no matter how big, is uniformly transient.
We will learn more about transience and recurrence in the
next section and also in Sections~\ref{sec:drift-transience} and~\ref{sec:drift-harris} below.
\subsection{Subinvariant and Invariant Measures}
A measure is said to be sigma-finite (also written $\sigma$-finite) if
there is a countable partition of the state space such that the measure
of each element of the partition is finite.
A measure is said to be strictly positive if it is positive and not the
zero measure.
For every irreducible Markov kernel $P$ there exists a strictly positive,
sigma-finite measure $\mu$ such that $\mu \ge \mu P$, meaning
$$
\mu(A) \ge \int \mu(d x) P(x, A), \qquad A \in \mathcal{A},
$$
where $(\Omega, \mathcal{A})$ is the state space
\citep[Theorem~8.0.1, Theorem~10.0.1, and Proposition~10.1.3]{meyn-tweedie}.
Such a measure $\mu$ is called \emph{subinvariant}.
If a subinvariant measure actually satisfies $\mu = \mu P$,
then it is called \emph{invariant}.
If a subinvariant measure is not invariant, then it is called
\emph{strictly subinvariant}.
If $P$ is irreducible and recurrent,
then every subinvariant measure is actually invariant
and is unique up to multiplication by a positive scalar
\citep[Theorem~10.4.4]{meyn-tweedie}.
If the invariant measure is finite, in which case it can be renormalized
to be a probability measure, then we say the chain is \emph{positive recurrent}.
Otherwise, we say the chain is \emph{null recurrent}.
If $P$ is irreducible, then it is positive recurrent if and only if
it has an invariant probability measure
\citep[Theorems~10.1.1 and~10.4.4]{meyn-tweedie}.
If $P$ is irreducible and transient, then $P$ has a strictly subinvariant
measure, which need not be unique
\citep[Proposition~10.1.3]{meyn-tweedie}.
It may or may not have invariant measures, and the invariant measures,
if they exist, may or may not be unique. An invariant measure, if it
exists, cannot be finite (otherwise the chain would be positive recurrent).
Thus we can use invariant and subinvariant measures to classify
irreducible Markov chains. The chain is positive recurrent if and only
if an invariant probability measure exists. The chain is transient if
and only if a strictly subinvariant measure exists. The leftover case
is null recurrent.
Another useful fact is that any subinvariant measure is a maximal
irreducibility measure \citep[Proposition~10.1.2]{meyn-tweedie}. This
takes some of the mystery out of maximal irreducibility measures.
\subsubsection{Example: AR(1) Time Series} \label{sec:ar1-one}
An AR(1) time series is defined by
\begin{equation} \label{eq:ar1}
X_n = \rho X_{n - 1} + \sigma Y_n,
\end{equation}
where $Y_1$, $Y_2$, $\ldots$ are IID standard normal and independent
of $X_0$. It is clear
that the conditional distribution of $X_n$ given the past history
only depends on $X_{n - 1}$, so this is a Markov chain.
The AR in the name stands for auto-regressive, the idea being that
\eqref{eq:ar1} looks something like the specification for simple linear
regression except that the series is being regressed on itself (the
$X_i$ play the role of response on the left hand side and the role of
predictor on the right hand side). The 1 in the name indicates
that there is just one ``predictor.'' An AR($k$) time series for $k > 1$
has additional terms $\rho_2 X_{n - 2}$, $\rho_3 X_{n - 3}$, and so forth.
But these are not Markov chains, so we ignore them (they are discussed
in the time series class).
By independence of $X_{n - 1}$ and $Y_n$ we have
$$
\var(X_n) = \rho^2 \var(X_{n - 1}) + \sigma^2.
$$
Suppose an AR(1) time series is a stationary Markov chain. Then
we have $\var(X_n) = \var(X_{n - 1})$. Hence
\begin{equation} \label{eq:tausq}
\var(X_n) = \frac{\sigma^2}{1 - \rho^2}
\end{equation}
provided $\rho^2 < 1$ (otherwise, the right hand side clearly cannot
define a variance). In case \eqref{eq:tausq} does define a variance,
call it $\tau^2$.
We guess that a normal distribution is invariant and we check that.
Clearly, if $E(X_{n - 1}) = 0$, then $E(X_n) = 0$, too.
This, together with our derivation of \eqref{eq:tausq} and the fact
that the sum of independent normal random variables is normal, tells
us that $\text{Normal}(0, \tau^2)$ is an invariant distribution when
$\rho^2 < 1$.
Clearly this Markov chain is irreducible, Lebesgue
measure being an irreducibility measure, because the conditional distribution
of $X_{n + 1}$ given $X_n$ gives positive probability to every set having
positive Lebesgue measure. Thus we now know $\text{Normal}(0, \tau^2)$
is the unique invariant
probability distribution and the Markov chain is positive recurrent
(when $\rho^2 < 1$).
It is also fairly clear that $\lambda P$ is proportional to $\lambda$ when
$\lambda$ is Lebesgue measure by symmetry of the normal distribution.
Let's calculate that. For any Lebesgue measurable set $A$,
\begin{align*}
(\lambda P)(A)
& =
\int_{- \infty}^\infty P(x, A) \, d x
\\
& =
\int_{- \infty}^\infty \, d x
\int_{\rho x + \sigma y \in A} \phi(y) \, d y
\\
& =
\int_{- \infty}^\infty \phi(y) \, d y
\int_{\rho x + \sigma y \in A} \, d x
\\
& =
\lambda(\rho^{- 1} A)
\int_{- \infty}^\infty \phi(y) \, d y
\\
& =
\lambda(\rho^{- 1} A)
\\
& =
\abs{\rho}^{- 1} \lambda(A)
\end{align*}
where $\phi$ is the PDF of the standard normal distribution,
where in the last three lines we are assuming $\rho \neq 0$,
and where $\rho^{- 1} A$ means multiplying every point of $A$ by
$\rho^{- 1}$.
The third equality is interchanging the order of integration,
which is also valid in measure theory (this is called the Fubini theorem
or the Tonelli theorem). The last equality is the scaling property of
Lebesgue measure.
In case $\rho^2 < 1$ so $\abs{\rho}^{- 1} > 1$ we do not find that Lebesgue
measure is subinvariant.
In case $\rho^2 = 1$ so $\abs{\rho}^{- 1} = 1$ we find that Lebesgue
measure is invariant.
In case $\rho^2 > 1$ so $\abs{\rho}^{- 1} < 1$ we find that Lebesgue
measure is strictly subinvariant.
Hence we have found that the Markov chain is transient in case $\rho^2 > 1$.
In case $\rho^2 = 1$ we have found that there is an invariant measure
that is not a finite measure, so we have ruled out positive recurrence,
but we still do not know whether the chain is transient or null recurrent.
We will settle this open issue and find out more about AR(1) Markov chains
in Sections~\ref{sec:ar1-two} and~\ref{sec:ar1-geo} below.
\subsection{Stationary Markov Chains}
A stochastic process is \emph{strictly stationary} if the
distribution of a block of consecutive random variables only depends on
the length of the block, that is, the distribution of $X_{n + 1}$, $\ldots,$
$X_{n + k}$ depends only on $k$ (does not depend on $n$).
A Markov chain is \emph{stationary} if it is a strictly stationary stochastic
process.
A Markov chain is stationary if it has an invariant
probability measure $\pi$ that is its initial distribution.
Then $\pi = \pi P$ says that $\pi$ is the marginal distribution of $X_n$
for all $n$. Since $\pi$ and $P$ determine the finite-dimensional
distributions of the Markov chain (Section~\ref{sec:markov} above), this
implies the joint distribution of
$X_{n + 1}$, $X_{n + 2}$, $\ldots,$ $X_{n + k}$ does not depend on $n$.
Conversely, if the Markov chain is a strictly stationary stochastic process,
then the marginal distribution of $X_n$ does not depend on $n$, hence
this marginal distribution $\pi$ satisfies $\pi = \pi P$.
Stationary implies stationary transition probabilities, but not vice versa.
\subsection{Harris Recurrence} \label{sec:harris}
A Markov chain is \emph{Harris recurrent} if it is irreducible
with maximal irreducibility measure $\psi$ and for every $\psi$-positive
set $A$ the chain started at $x$ hits $A$ infinitely often with probability
one. Writing this out in mathematical formulas is complicated
\citep[p.~199]{meyn-tweedie}, and we shall not do so, since one never verifies
Harris recurrence directly from the definition.
We will learn more about Harris recurrence in Sections~\ref{sec:harris-too}
and~\ref{sec:drift-harris} below.
\subsection{The Law of Large Numbers}
The strong law of large numbers (LLN) for IID sequences of random variables
says the following. Let $X_1$, $X_2$, $\ldots$ be a sequence of IID random
variables having expectation $\mu$, and define
\begin{equation} \label{eq:x-bar}
\Xbar_n = \frac{1}{n} \sum_{i = 1}^n X_i,
\end{equation}
then
\begin{equation} \label{eq:lln}
\Xbar_n \asto \mu,
\end{equation}
this being known as the law of large numbers.
We want to discuss the LLN for Markov chains, but if $X_1$, $X_2$, $\ldots$
is a Markov chain, the $X_i$ need not be real-valued, so expectation need not
even be defined. Hence we introduce the notion of functionals of Markov
chains.
Suppose $X_1$, $X_2$, $\ldots$ is a Markov chain and $f$ is a real-valued
function on the state space of the Markov chain, then we say the stochastic
process $f(X_1)$, $f(X_2)$, $\ldots$ is a \emph{functional} of this Markov
chain.
Suppose $X_1$, $X_2$, $\ldots$ is a positive Harris recurrent Markov chain,
$\pi$ is its unique invariant distribution, and $f$ is a real-valued function
on the state space such that
\begin{equation} \label{eq:moo}
\mu = E_\pi\{ f(X) \} = \int f(x) \, \pi(d x)
\end{equation}
exists. Define
\begin{equation} \label{eq:moo-hat}
\hat{\mu}_n = \frac{1}{n} \sum_{i = 1}^n f(X_i)
\end{equation}
then
\begin{equation} \label{eq:lln-fun}
\hat{\mu}_n \asto \mu
\end{equation}
\citep[Proposition~17.1.7]{meyn-tweedie},
this being known as the Markov chain law of large numbers.
The similarity of \eqref{eq:lln} and \eqref{eq:lln-fun} is made clearer
if we define $Y_i = f(X_i)$ so the left hand side of \eqref{eq:lln-fun}
is the sample mean $\Ybar_n$. There is a difference that in \eqref{eq:lln-fun}
$\mu$ is not the expectation of the $Y_i$ (indeed, if the Markov chain is
not stationary, they may all have different expectations), rather $\mu$
is the analogous expectation with respect to the invariant distribution.
\subsection{Reversibility}
A kernel $K$ is said to be \emph{reversible} with respect to a signed
measure $\eta$ if
\begin{equation} \label{eq:reversible}
\iint f(x, y) \eta(d x) K(x, d y)
=
\iint f(y, x) \eta(d x) K(x, d y)
\end{equation}
for any bounded measurable function $f$.
The name comes from the fact that if $K$ is a Markov kernel and $\eta$
is a probability measure, then the Markov chain with transition probability
kernel $K$ and initial distribution $\eta$ looks the same running forwards
or backwards in time, that is,
$(X_{n + 1}, X_{n + 2}, \ldots, X_{n + k})$
has the same distribution as
$(X_{n + k}, X_{n + k - 1}, \ldots, X_{n + 1})$
for any positive integer $k$.
If a Markov kernel $P$ is reversible with respect to a probability measure
$\pi$, then $\pi$ is invariant for $P$.
To see this substitute $I_B(y)$ for $f(x, y)$ in \eqref{eq:reversible},
which gives
\begin{align*}
\int \pi(d x) P(x, B)
& =
\iint I_B(y) \pi(d x) P(x, d y)
\\
& =
\iint I_B(x) \pi(d x) P(x, d y)
\\
& =
\int I_B(x) \pi(d x)
\\
& =
\pi(B)
\end{align*}
which is $\pi = \pi P$.
%%%%% NEED FORWARD REFERENCES %%%%%
%%%%% to CLT section
%%%%% to variance estimation
%%%%% to Gibbs, Metropolis-Hastings, and MHG sections
\subsection{Total Variation Norm}
The \emph{total variation norm}
of a signed measure $\lambda$ on a measurable space $(\Omega, \mathcal{A})$
is defined by
\begin{equation} \label{eq:total-variation}
\norm{\lambda}
=
\sup_{A \in \mathcal{A}} \lambda(A) - \inf_{A \in \mathcal{A}} \lambda(A)
\end{equation}
Clearly, we have
$$
\abs{\lambda(A)} \le \norm{\lambda}
$$
and hence
$$
\sup_{A \in \mathcal{A}} \abs{\lambda(A)} \le \norm{\lambda}.
$$
Conversely,
\begin{align*}
\sup_{A \in \mathcal{A}} \lambda(A)
& \le
\sup_{A \in \mathcal{A}} \abs{\lambda(A)}
\\
- \inf_{A \in \mathcal{A}} \lambda(A)
& \le
\sup_{A \in \mathcal{A}} \bigl[ - \lambda(A) \bigr]
\\
& \le
\sup_{A \in \mathcal{A}} \abs{\lambda(A)}
\end{align*}
so
$$
\norm{\lambda} \le 2 \sup_{A \in \mathcal{A}} \abs{\lambda(A)}.
$$
In summary,
$$
\sup_{A \in \mathcal{A}} \abs{\lambda(A)}
\le
\norm{\lambda}
\le
2 \sup_{A \in \mathcal{A}} \abs{\lambda(A)}.
$$
For this reason one sometimes sees $\sup_{A \in \mathcal{A}} \abs{\lambda(A)}$
referred to as the total variation norm of $\lambda$, but this does not
agree with the definition used in many other areas of mathematics,
which is \eqref{eq:total-variation}.
\subsection{Geometric Ergodicity} \label{sec:geometric}
The following definition is given by \citet[p.~363]{meyn-tweedie}.
A Markov chain with transition probability
kernel $P$ and invariant distribution $\pi$ is \emph{geometrically ergodic}
if it is Harris recurrent and there exists a real number $r > 1$ such that
\begin{equation} \label{eq:geometrically-ergodic}
\sum_{n = 1}^\infty r^n \norm{P^n(x, \fatdot) - \pi(\fatdot)}
< \infty, \qquad x \in \Omega.
\end{equation}
(Note that $r$ does not depend on $x$.)
One often sees an alternative definition:
a positive Harris recurrent Markov chain with transition probability
kernel $P$ and invariant distribution $\pi$ is \emph{geometrically ergodic}
if there exists a real number $s < 1$ and a nonnegative function $M$ on
the state space $\Omega$ such that
\begin{equation} \label{eq:geometrically-ergodic-alternate}
\norm{P^n(x, \fatdot) - \pi(\fatdot)}
\le M(x) s^n,
\qquad x \in \Omega.
\end{equation}
It is obvious that \eqref{eq:geometrically-ergodic}
implies \eqref{eq:geometrically-ergodic-alternate}, but the reverse implication
is almost as obvious. If we assume \eqref{eq:geometrically-ergodic-alternate},
then
\begin{align*}
\sum_{n = 1}^\infty r^n \norm{P^n(x, \fatdot) - \pi(\fatdot)}
& \le
\sum_{n = 1}^\infty r^n M(x) s^n
\\
& \le
\frac{M(x)}{1 - r s}
\end{align*}
so long as $r s < 1$, and this proves \eqref{eq:geometrically-ergodic}
for any $r$ such that $1 < r < 1 / s$.
%%%%% NEED FORWARD REFERENCES %%%%%
%%%%% to CLT section
%%%%% to drift conditions
\subsection{The Central Limit Theorem}
The central limit theorem (CLT) for IID sequences of random variables
says the following. Let $X_1$, $X_2$, $\ldots$ be a sequence of IID random
variables having expectation $\mu$ and standard deviation $\sigma$, and define
$\Xbar_n$ by \eqref{eq:x-bar}, then
\begin{equation} \label{eq:clt}
\sqrt{n} (\Xbar_n - \mu) \weakto \text{Normal}(0, \sigma^2)
\end{equation}
this being known as the central limit theorem (in case $\sigma = 0$,
the right hand side is interpreted as the degenerate normal distribution
concentrated at zero).
We want to discuss the CLT for Markov chains, so again we have to go to
functionals and again use the notation \eqref{eq:moo} and \eqref{eq:moo-hat}.
Suppose $X_1$, $X_2$, $\ldots$ is a geometrically ergodic Markov
chain, $\pi$ is its invariant distribution, and $f$ is a real-valued function
on the state space such that
\begin{equation} \label{eq:mom}
E_\pi\{ \abs{f(X)}^{2 + \varepsilon} \}
=
\int \abs{f(x)}^{2 + \varepsilon} \, \pi(d x)
\end{equation}
exists for some $\varepsilon > 0$.
Then
\begin{equation} \label{eq:clt-fun}
\sqrt{n} (\hat{\mu}_n - \mu) \weakto \text{Normal}(0, \sigma^2),
\end{equation}
where $\hat{\mu}_n$ and $\mu$ are given by
\eqref{eq:moo-hat} and \eqref{eq:moo}, where
\begin{equation} \label{eq:ass-var}
\sigma^2 = \gamma_0 + 2 \sum_{k = 1}^\infty \gamma_k
\end{equation}
and where
\begin{equation} \label{eq:auto-covariance}
\gamma_n = \cov_\pi\{ f(X_i), f(X_{i + n}) \},
\end{equation}
the notation $\cov_\pi$ indicating covariances with respect to the stationary
Markov chain having $\pi$ as its initial distribution
\citep[Theorem~2]{chan-geyer}.
If we change assumptions for the theorem stated above slightly,
adding reversibility to the assumptions and weakening \eqref{eq:mom}
by only requiring it hold for $\varepsilon = 0$, then the conclusions
still hold (\citealp[Theorem~2.1]{roberts-rosenthal}, combined with
a central limit theorem for rho-mixing stationary stochastic processes,
\citealp[Theorem~2.2, Remark~2.2, and Theorem~2.3]{peligrad}, combined
with the fact that any rho-mixing Markov chain is rho-mixing exponentially
fast \citealp[Theorem~4.2]{bradley-mixing}).
This theorem is, qualitatively, no worse than the CLT for IID. Both say
the CLT holds for all functionals having second moments. Without reversibility
we need a little bit more than second moments.
Geometric ergodicity is not necessary for a CLT. There are a lot
of Markov chain CLT in the literature, but unlike the geometrically ergodic
ones stated above, they do not provide any simple condition for which
functionals (which $f$) have the CLT and which do not. They are thus not very
usable in practice.
\section{Monte Carlo}
``Monte Carlo'' is a cutesy name for the practice of learning about
a probability distribution by simulating it. The term was coined in
the 1950's when gambling was illegal almost everywhere (no legal gambling
anywhere in the United States except in Las Vegas) and the casino at
Monte Carlo was the most famous in the world. And gambling has something
to do with randomness hence the term. It was a weak joke, now it is a
colorless technical term designating a method in applied mathematics.
\subsection{Ordinary Monte Carlo}
Suppose there is a probability distribution $\pi$ and there is an expectation
$\mu$ given by \eqref{eq:moo}
that is analytically intractable: we cannot calculate it either with pencil
and paper methods or with a computer algebra system (like Mathematica
or Maple).
The ordinary Monte Carlo (OMC) method says to simulate IID realizations
$X_1$, $X_2$, $\ldots$ from the distribution $\pi$ and use
$\hat{\mu}_n$ given by \eqref{eq:moo-hat}
as an estimator (or Monte Carlo approximation) of \eqref{eq:moo}.
Then the LLN \eqref{eq:lln-fun} says our Monte Carlo approximation converges
almost surely to the quantity we want to calculate as $n$ goes to infinity,
and the CLT says the difference between our Monte Carlo approximation and
the quantity we want to calculate, which we call the Monte Carlo error,
converges to a normal distribution at a root $n$ rate.
So there is no mystery to OMC. It is just the most elementary of statistics:
using the sample mean to estimate the population mean and using the law of
large numbers and the central limit theorem for justification and error
analysis. One difference is that since $n$ is how many realizations we
have the computer generate, we can always have $n$ very large.
In order to avoid confusion when applying the Monte Carlo method to problems
arising in statistics, we always emphasize that $n$ is the \emph{Monte Carlo
sample size}, the number of simulations done by the computer, rather than
anything else called ``sample size'' in the statistical problem being done.
OMC has only two drawbacks, one major and one minor.
The major one is that it is very difficult to simulate IID
realizations of any complicated multivariate distribution. Univariate
distributions are easy to simulate. \citet{devroye} has hundreds of
recipes that have appeared in the literature. They allow simulation
of just about any univariate distribution that can be described.
But there are almost no recipes for multivariate distributions:
uniform on a box, uniform on a ball, and multivariate normal are the
only multivariate distributions that are easy to simulate.
The minor drawback is that the ``square root law'' (the root $n$ in the CLT)
means that only limited precision is possible. To get 10 times the accuracy,
one needs 100 times the Monte Carlo sample size. To get 100 times the accuracy,
one needs 10,000 times the Monte Carlo sample size. At some point one just
gives up. Arbitrary precision is not practical.
\subsection{Markov Chain Monte Carlo}
The Markov chain Monte Carlo (MCMC) method says to simulate an irreducible
positive recurrent Markov chain $X_1$, $X_2$, $\ldots$ having $\pi$ as its
unique invariant distribution. We still use \eqref{eq:moo-hat} as the
estimator of \eqref{eq:moo}.
We have now left the realm of elementary statistics. For justification
we need a LLN and CLT for Markov chains, which are completely missing
from many statistics programs.
It is important to emphasize that the LLN and CLT for geometrically ergodic
Markov chains do not depend on the initial distribution \citep[if the LLN
holds for any initial distribution, then it holds for every initial
distribution, and similarly for the CLT][Proposition~17.1.6]{meyn-tweedie}
because in practice we never use the stationary distribution
as the initial distribution. If we knew how to generate even one sample from
the stationary distribution, we could do that over and over and do OMC.
So in MCMC the samples $X_1$, $X_2$, $\ldots$ are usually neither
independent nor identically distributed and their distribution is not the
distribution of interest.
If the samples were independent, then they would actually be IID, and MCMC
would in this case actually be OMC.
If the samples are identically distributed, then the Markov chain is
stationary, but this is never possible to arrange unless one can produce
IID samples from the distribution of interest. So in practice MCMC provides
a not independent, not identically distributed, sample from the distribution
of interest.
MCMC has only two drawbacks, one major and one minor. The minor one is
the same one that OMC has. MCMC obeys the square root law too, so only
limited precision is practical. The major drawback is very different.
MCMC easily simulates any multivariate distribution
(Section~\ref{sec:metropolis} below), so it does not have the major drawback
of OMC.
The major drawback of MCMC is that you are never quite sure that it
has worked. This is a bit hard to explain, so let us consider a very
special case. What is the probability that an OMC calculation estimates
probability zero for an event $A$ having true probability $\pi(A)$?
This problem is solvable by intro statistics students: it is just the
multiplication rule and the complement rule, and the answer is $[1 - \pi(A)]^n$.
What is the answer to the same question for MCMC rather than OMC?
Let $T_A$ denote the hitting time for $A$ (first time after time zero
that the chain enters $A$). If the chain is geometrically ergodic, then
there exists $r > 1$ such that
$$
E_\pi\{ r^{T_A} \} < \infty
$$
\citep[Proposition 5.19]{nummelin}.
Thus by Markov's inequality, there
exists a constant $M < \infty$ such that
$$
P_\pi(T_A \ge n) \le M r^{-n}
$$
which says the same thing as our answer for OMC except that we
usually have no sharp bounds for $M$ and $r$. With OMC we
know that $M = 1$ and $r = 1 / [1 - \pi(A)]$ will do. With MCMC
we only know that some $M < \infty$ and $r > 1$ will do.
This is not of merely theoretical concern. In practical situations,
it may take a very large number of iterations to get a sample that
is reasonably representative of the invariant distribution,
and there is usually no simple calculation that tells us how many
iterations are required.
Theorems do exist that give bounds on how many iterations are required
\citep{rosenthal,latuszynski-niemiro,latuszynski-miasojedow-niemiro},
but these bounds are very sloppy except in the simplest
problems. In most practical MCMC applications, such bounds are useless
if they can be computed at all.
In summary, OMC has the major drawback that you can't do it for complicated
multivariate problems, and MCMC has the major drawback that you are never
quite sure it has worked.
\subsection{Unnormalized Probability Densities}
We say $h$ is an \emph{unnormalized density} of a random vector $X$ with
respect to a positive measure $\mu$ if $\int h \, d \mu$ is nonzero and finite.
Then the (proper, normalized, probability) density of $X$ with respect to $\mu$
is $f = h / c$, where $c = \int h \, d \mu$.
The notion of an unnormalized density provides many master's level probability
theory homework problems of the form given $h$ find $f$, but it is also very
very useful in Bayesian inference and spatial statistics. Bayes rule can
be phrased: likelihood times prior equals unnormalized posterior.
Thus one always knows the unnormalized posterior but may not know how to
normalize it. In spatial statistics and other areas of statistics involving
complicated stochastic dependence amongst components of the data it is easy
to specify models by unnormalized densities, because it is easy to make up
functions of the data and parameters that are integrable, but it may be
impossible to give closed-form expressions for those integrals and hence
impossible to specify the normalized densities of the model.
The following two sections give algorithms for MCMC samplers for probability
models specified by unnormalized densities.
\subsection{The Gibbs Sampler} \label{sec:gibbs}
The Gibbs sampler was introduced by \citet{geman-geman} and popularized by
\citet{gelfand-smith}. Why is it named after Gibbs if he didn't invent it?
It was originally used to simulate Gibbs distributions in thermodynamics,
which were invented by Gibbs, and it was only later realized that the algorithm
applied to any distribution.
Given an unnormalized density of a random vector,
it may be possible to normalize the conditional distribution of each component
given the other components when it is analytically intractable to normalize
the joint
distribution. These conditional distributions are called
the \emph{full conditionals}.
In a \emph{random scan} Gibbs sampler each step of the Markov chain proceeds
by choosing one of the full conditionals uniformly at random and then
simulating a new value of that component of the state vector using the
full conditional (the remaining components do not change in this step).
In a \emph{fixed scan} Gibbs sampler each step of the Markov chain proceeds
by simulating new values of each component of the state vector using
the full conditional for each (in each such simulation the remaining
components do not change in that substep). The components are simulated
in the same order in each step of the Markov chain.
More precisely, let $X_n$ denote the state vector and $X_{n i}$ its components,
and let $f_i$ denote the full conditionals. Then one step of the Markov
chain proceeds as follows
\begin{align*}
X_{n + 1, i_1}
& \sim
f_{i_1}(\fatdot \mid X_{n i_2}, \ldots, X_{n i_d})
\\
X_{n + 1, i_2}
& \sim
f_{i_2}(\fatdot \mid X_{n + 1, i_1}, X_{n i_3}, \ldots, X_{n i_d})
\\
X_{n + 1, i_3}
& \sim
f_{i_3}(\fatdot \mid X_{n + 1, i_1}, X_{n + 1, i_2},
X_{n i_4}, \ldots, X_{n i_d})
\\
& \hspace*{0.5em} \vdots
\\
X_{n + 1, i_{d - 1}}
& \sim
f_{i_{d - 1}}(\fatdot \mid X_{n + 1, i_1}, \ldots, X_{n + 1, i_{d - 2}},
X_{n i_d})
\\
X_{n + 1, i_d}
& \sim
f_{i_d}(\fatdot \mid X_{n + 1, i_1}, \ldots, X_{n + 1, i_{d - 1}})
\end{align*}
where $d$ is the dimension of $X_n$ and $(i_1, \ldots, i_d)$ is
a permutation of $(1, \ldots, d)$ that remains fixed for all steps
of the Markov chain.
It is obvious that each substep involving the update of one coordinate
preserves the distribution of interest (the one having the full conditionals
being used) because if the joint distribution of all the components is
the distribution of interest before the substep, then it is the same
distribution afterwards (marginal times conditional equals joint).
Thus a Gibbs sampler, if irreducible, simulates the distribution of interest.
A random scan Gibbs sampler is reversible: if the $i$-th component is
simulated, then $X_{n + 1, i}$ and $X_{n i}$ both have the same distribution
given the rest of the components (which are the same in both $X_{n + 1}$ and
$X_n$), and this implies reversibility.
A fixed scan Gibbs sampler is not reversible (the time-reversed chain
simulates components in the reverse order).
If one wants to do fixed scan and also wants reversible, one can use a
so-called palindromic fixed scan (the same forwards and backwards), such as
1, 2, 3, 2, 1 for $d = 3$.
\subsection{The Metropolis-Hastings Algorithm} \label{sec:metropolis}
Suppose $h$ is an unnormalized density with respect to a positive measure $\mu$
on the state space and for each $x$ in the state space $q(x, \fatdot)$ is
a properly normalized density with respect to $\mu$ chosen to be easy
to simulate
(multivariate normal, for example). The \emph{Metropolis-Hastings algorithm}
\citep*{metropolis-et-al,hastings} repeats the following in each step of
the Markov chain.
\begin{enumerate}
\item[(i)] Simulate $Y_n$ from the distribution $q(X_n, \fatdot)$.
\item[(ii)] Calculate $a(X_n, Y_n)$ where
\begin{equation} \label{eq:hastings}
r(x, y) = \frac{h(y) q(y, x)}{h(x) q(x, y)}
\end{equation}
and
\begin{equation} \label{eq:hastings-acceptance}
a(x, y) = \min\bigl(1, r(x, y) \bigr).
\end{equation}
\item[(iii)] Set $X_{n + 1} = Y_n$ with probability $a(X_n, Y_n)$,
and set $X_{n + 1} = X_n$ with probability $1 - a(X_n, Y_n)$.
\end{enumerate}
In order to avoid divide by zero in \eqref{eq:hastings} it is necessary
and sufficient that $h(X_1) > 0$. Proof: $q(X_n, Y_n) > 0$ with probability
one because of (i), and $h(Y_n) = 0$ implies $a(X_n, Y_n) = 0$ implies
$X_{n + 1} = X_n$ with probability one, hence (conversely)
$X_{n + 1} \neq X_n$ implies $h(X_{n + 1}) > 0$.
Since the Metropolis-Hastings update is undefined when $h(X_n) = 0$,
in theoretical arguments we must consider the state space to be the
set of points $x$ such that $h(x) > 0$. This is permissible, because,
as was just shown, we always have $h(X_n) > 0$ even though there is no
requirement that $h(Y_n) > 0$.
Terminology: $Y_n$ is called the \emph{proposal}, \eqref{eq:hastings}
is called the \emph{Hastings ratio}, \eqref{eq:hastings-acceptance}
is called the \emph{acceptance probability}, substep (iii) is called
\emph{Metropolis rejection}, and the proposal is said to be \emph{accepted}
when we set $X_{n + 1} = Y_n$ in step (iii) and \emph{rejected} when we
set $X_{n + 1} = X_n$ in step (iii).
In the special case where $q(x, y) = q(y, x)$ for all $x$ and $y$
the proposal distribution $q$ is said to be \emph{symmetric} and
this special case of the Metropolis-Hastings algorithm is called the
\emph{Metropolis algorithm}. In this special case \eqref{eq:hastings}
becomes
\begin{equation} \label{eq:metropolis}
r(x, y) = \frac{h(y)}{h(x)}
\end{equation}
and is called the \emph{Metropolis ratio} or the \emph{odds ratio}.
There is little advantage to this special case.
It only saves a bit of time in not having to calculate $q(X_n, Y_n)$
and $q(Y_n, X_n)$ in each step.
It only gets a special name because it was proposed earlier.
The Metropolis algorithm was proposed by \citet{metropolis-et-al},
and the Metropolis-Hastings algorithm was proposed by \citet{hastings}.
\begin{theorem}
The Metropolis-Hastings update is reversible with respect to the distribution
having unnormalized density $h$.
\end{theorem}
Thus a Metropolis-Hastings sampler, if irreducible, simulates the distribution
of interest (it does not matter what the proposal distribution is).
\begin{proof}
The kernel for the Metropolis-Hastings update is
\begin{equation} \label{eq:kernel-metropolis-hastings}
P(x, A) = m(x) I(x, A) + \int_A q(x, y) a(x, y) \, \mu(d y),
\end{equation}
where
$$
m(x) = 1 - \int q(x, y) a(x, y) \, \mu(d y).
$$
Let $\eta$ be the measure having density $h$ with respect to $\mu$.
Then
\begin{align*}
\iint f(x, y) \eta(d x) P(x, d y)
& =
\iint f(x, y) h(x) P(x, d y) \mu(d x)
\\
& =
\iint f(x, y) h(x) m(x) I(x, d y) \mu(d x)
\\
& \qquad
+
\iint f(x, y) h(x) q(x, y) a(x, y) \mu(d x) \mu(d y)
\\
& =
\int f(x, x) h(x) m(x) \mu(d x)
\\
& \qquad
+
\iint f(x, y) h(x) q(x, y) a(x, y) \mu(d x) \mu(d y)
\end{align*}
Clearly, the first term on the right side is unchanged if the arguments
are interchanged in $f(x, x)$. Thus to show reversibility we only need
to show that the value of
\begin{equation} \label{eq:metropolis-hastings-foo}
\iint f(x, y) h(x) q(x, y) a(x, y) \mu(d x) \mu(d y)
\end{equation}
is not changed if $f(x, y)$ is changed to $f(y, x)$, and this is implied by
\begin{equation} \label{eq:metropolis-hastings-claim}
h(x) q(x, y) a(x, y)
=
h(y) q(y, x) a(y, x)
\end{equation}
holding for all $x$ and $y$ except for those in a set making no contribution
to \eqref{eq:metropolis-hastings-foo} because the integrand is zero.
Thus we may assume
\begin{equation} \label{eq:metropolis-hastings-bar}
h(x) > 0 \opand q(x, y) > 0 \opand a(x, y) > 0,
\end{equation}
which implies $r(x, y) > 0$ and also implies
\eqref{eq:metropolis-hastings-bar} with $x$ and $y$ swapped.
For $(x, y)$ satisfying both
\eqref{eq:metropolis-hastings-bar} and
\eqref{eq:metropolis-hastings-bar} with $x$ and $y$ swapped,
neither the numerator nor the
denominator in \eqref{eq:hastings} is equal to zero, and
$$
r(x, y) = \frac{1}{r(y, x)}.
$$
The proof of the claim \eqref{eq:metropolis-hastings-claim} now splits into
two cases. First, if $r(x, y) \ge 1$, so $a(x, y) = 1$, then
$r(y, x) \le 1$, so $a(y, x) = r(y, x)$, and
\begin{align*}
h(y) q(y, x) a(y, x)
& =
h(y) q(y, x) r(y, x)
\\
& =
h(y) q(y, x) \frac{h(x) q(x, y)}{h(y) q(y, x)}
\\
& =
h(x) q(x, y)
\\
& =
h(x) q(x, y) a(x, y)
\end{align*}
The second case is exactly the same as the first except that $x$ and $y$
are exchanged.
\end{proof}
\subsection{One Variable at a Time Metropolis-Hastings}
A variant of the Metropolis-Hastings algorithm has elementary updates that
update one variable at a time. These updates are then combined in a fixed
or random scan like with the Gibbs sampler. We will not write out the details;
see \citet[Section~1.12.5]{geyer-intro}.
Nowadays we use the term ``Metropolis-Hastings algorithm'' to refer to
the procedure described in the preceding section and must use some long-winded
term like that in the title of this section to refer to this algorithm.
However, this is historically inaccurate. The original example in
\citet{metropolis-et-al} was a sampler of the type described in this section
not the type described in the preceding section.
The Gibbs sampler is a special case of the algorithm described in this section
\citep[Section~1.12.6]{geyer-intro}.
The algorithm described in this section is a special case of the algorithm
described in the following section.
\subsection{The Metropolis-Hastings-Green Algorithm}
Many other MCMC algorithms have been put in the literature. They are all
(as far as I know) special cases of the Metropolis-Hastings-Green algorithm
\citep{green}, which is just like the Metropolis-Hastings algorithm except
that it allows proposals from distributions not defined by probability
density functions and hence is inherently measure theoretic. We do not
describe it here; see \citet[Section~1.17]{geyer-intro}.
\subsection{Harris Recurrence} \label{sec:harris-too}
For the most commonly used MCMC algorithms there are three theorems that say
irreducibility implies Harris recurrence. Corollaries~1 and~2 of
\citet{tierney} show this for Gibbs samplers and Metropolis-Hastings samplers
that update all variables simultaneously.
Theorem~1 of \citet{chan-geyer} shows this for Metropolis-Hastings samplers
that update one variable at a time (the latter requires irreducibility not
only of the given Markov chain but also of all Markov chains that fix any
subset of the variables).
Of course, the literature contains many other MCMC algorithms. For those
one must verify Harris recurrence directly.
\subsection{Variance Estimation}
In order to estimate the accuracy of Monte Carlo approximations, we must
estimate the asymptotic variance in the CLT \eqref{eq:ass-var}. There
are many methods of doing this (\citealp{geyer-practical}, Section~3;
\citealp{geyer-intro}, Section~1.10). We will only discuss the simplest,
which use the method of batch means.
A ``batch'' is a consecutive part of a time series such as a functional
of a Markov chain. If $f(X_1)$, $f(X_2)$, $\ldots$ is a functional of
a Markov chain, then $f(X_{i + 1})$, $f(X_{i + 2})$, $\ldots,$ $f(X_{i + b})$
is a \emph{batch of length $b$}, and
$$
\hat{\mu}_{i b} = \frac{1}{b} \sum_{j = 1}^b f(X_{i + j})
$$
is the corresponding \emph{batch mean}. The Markov chain CLT says
$$
\sqrt{b} (\hat{\mu}_{i b} - \mu) \weakto \text{Normal}(0, \sigma^2)
$$
and this holds regardless of $i$, so we expect
\begin{equation} \label{eq:bme-one}
b (\hat{\mu}_{i b} - \hat{\mu}_n)^2
\end{equation}
to be a good estimate of $\sigma^2$ when $0 \ll b \ll n$, where $\ll$ means
``a lot greater than.'' We need $0 \ll b$ in order for the CLT to hold at
size $b$ and we need the $b \ll n$ in order for the randomness in
$\hat{\mu}_n$ to be much less than the randomness in $\hat{\mu}_{i b}$ so
\eqref{eq:bme-one} is a good approximation to
$$
b (\hat{\mu}_{i b} - \mu)^2.
$$
The method of batch means has several subvarieties. The method of
\emph{overlapping batch means} uses all of the batches of length $b$.
$$
\hat{\sigma}^2_{\text{olbm}}
=
\frac{b}{n - b + 1}
\sum_{i = 1}^{n - b + 1} (\hat{\mu}_{i b} - \hat{\mu}_n)^2
$$
The method of \emph{nonoverlapping batch means} uses only nonoverlapping
and abutting batches of length $b$.
$$
\hat{\sigma}^2_{\text{nolbm}}
=
\frac{b}{\lfloor n / b \rfloor}
\sum_{k = 1}^{\lfloor n / b \rfloor}
(\hat{\mu}_{(k - 1) b + 1, b} - \hat{\mu}_n)^2
$$
The overlapping batch means estimator is somewhat more efficient
\citep{meketon-schmeiser}, but not necessarily enough to be worth the
extra computer time and storage
\citep[Section~1.10, last paragraph]{geyer-intro}.
So we consider only the latter.
There is another distinction between the method
of \emph{consistent batch means} (CBM), which requires both $b$ and $n / b$
to go to infinity
at a certain rate as $n$ goes to infinity \citep{jones-et-al}, and the
method of \emph{inconsistent batch means} (IBM),
which fixes the number of batches
so the batch length is $\lfloor n / m \rfloor$ if there are $m$ batches.
It can then be shown \citep[Section~3.2]{geyer-practical} that the batches
are asymptotically IID normal with mean $\mu$ and variance $\sigma^2$ so
an ordinary $t$ confidence interval gives a valid confidence interval for
the quantity $\mu$ being approximated by MCMC. So the only penalty
of IBM versus CBM is that one uses a $t$ critical value rather than a $z$
critical value in constructing the confidence interval. So long as the
number of batches is moderately large (greater than 30), this penalty
is negligible.
CBM has the virtue that it is consistent (of course), so it can be easily
used as a component of more complicated procedures \citep{jones-et-al} but
it requires stronger regularity conditions than the CLT itself.
IBM has the virtue that it is valid whenever the CLT holds.
%%%%% NEED FORWARD REFERENCES %%%%%
%%%%% to drift conditions
%%%%% to invariance principle ?????
\section{Drift Conditions} \label{sec:drift}
\subsection{Small and Petite Sets} \label{sec:small-petite}
A subset $C$ of the state space $(\Omega, \mathcal{A})$
is \emph{small} if there exists a nonzero positive
measure $\nu$ on the state space and a positive integer $n$ such that
\begin{equation} \label{eq:small}
P^n(x, A) \ge \nu(A), \qquad A \in \mathcal{A} \opand x \in C.
\end{equation}
It is not obvious that small sets having positive irreducibility measure exist.
That they do exist for any
irreducible kernel $P$ was proved by \citet{jain-jamison} under the
assumption that the state space is countably generated (this is why
that assumption is always imposed).
Recall the notion of the kernel $P_q$ derived from a kernel $P$ by
subsampling introduced in Section~\ref{sec:sample} above.
A subset $C$ of the state space $(\Omega, \mathcal{A})$
is \emph{petite} if there exists a nonzero positive
measure $\nu$ on the state space and a subsampling distribution $q$
such that
\begin{equation} \label{eq:petite}
P_q(x, A) \ge \nu(A), \qquad A \in \mathcal{A} \opand x \in C.
\end{equation}
Clearly every small set is petite (take $q$ such that $q_n = 1$).
So petite sets exist because small sets exist.
\citet{meyn-tweedie} show that a finite union of petite sets is petite
and there exists a sequence of petite sets whose union is the whole state
space (their Proposition~5.5.5).
\subsection{T-Chains} \label{sec:t-chain}
In this section we again use topology.
A topological space is \emph{locally compact} if every point has a compact
neighborhood. The main example is $\real^d$, where for any $x$ every closed
ball centered at $x$ is a compact neighborhood of $x$.
Following \citet[Chapter~6]{meyn-tweedie}, we assume throughout this section
that the state space is a locally compact Polish space (a Polish space
is a complete separable metric space, and again $\real^d$ is an example).
A function $f$ on a metric space is \emph{lower semicontinuous} (LSC) if
$$
\liminf_{y \to x} f(y) \ge f(x), \qquad \text{for all $x$}.
$$
A \emph{continuous component} $T$ of
a kernel $P$ having state space $(\Omega, \mathcal{A})$
is a substochastic kernel such that the function
$$
x \mapsto T(x, A)
$$
is LSC for any $A \in \mathcal{A}$ and there is a probability vector $q$
such that
$$
P_q(x, A) \ge T(x, A), \qquad x \in \Omega \opand A \in \mathcal{A}.
$$
We also say a Markov chain having $P$ as its transition probability kernel
has a continuous component $T$ if $T$ is a continuous component of $P$.
A Markov chain is a \emph{$T$-chain} if it has a continuous component $T$ such
that
$$
T(x, \Omega) > 0, \qquad \text{for all $x \in \Omega$}.
$$
For a $T$-chain every compact set is petite and, conversely,
if every compact set is petite, then the chain is a $T$-chain
\citep[Theorem~6.0.1]{meyn-tweedie}.
\begin{theorem} \label{th:gibbs:T}
A Gibbs sampler is a $T$-chain if all the full conditionals are
LSC functions of the variables on which they condition.
\end{theorem}
\begin{proof}[Partial Proof]
Since the notation for the Gibbs sampler is so messy, we do only the
three-component case. The general idea should be clear.
For both kinds of Gibbs sampler, take
the continuous component $T$ to be $P$ itself.
For a three-component random scan Gibbs sampler, the kernel is
\begin{align*}
P(x, A)
& =
\frac{1}{3} \int I\bigl((y, x_2, x_3), A\bigr) f_1(y \mid x_2, x_3) \, d y
\\
& \qquad
+
\frac{1}{3} \int I\bigl((x_1, y, x_3), A\bigr) f_2(y \mid x_1, x_3) \, d y
\\
& \qquad
+
\frac{1}{3} \int I\bigl((x_1, x_2, y), A\bigr) f_3(y \mid x_1, x_2) \, d y
\end{align*}
and this is an LSC function of $x$ for each fixed $A$ by Fatou's lemma.
For a three-component fixed scan Gibbs sampler that updates in the order
1, 2, 3, the kernel is
\begin{align*}
P(x, A)
& =
\iiint I(y, A) f_3(y_3 \mid y_1, y_2) f_2(y_2 \mid y_1, x_3)
f_1(y_1 \mid x_2, x_3) \, d y
\end{align*}
and this is an LSC function of $x$ for each fixed $A$ by Fatou's lemma.
(For state spaces of other dimensions,
the general idea is that one writes down the kernel, however messy the
notation may be, and then says ``and this is an LSC function of $x$ for
each fixed $A$ by Fatou's lemma.'')
\end{proof}
\begin{theorem} \label{th:hastings:T}
An irreducible Metropolis-Hastings sampler is a $T$-chain if
the unnormalized density of the invariant distribution is continuous
and the proposal density is separately continuous.
\end{theorem}
\begin{proof}
As noted in Section~\ref{sec:metropolis} we must define the state space
of the Markov chain to be the set $W = \{x : h(x) > 0 \}$. The assumption
that $h$ is continuous means $W$ is an open set.
We take the continuous component to be the part of the kernel corresponding
to accepted updates, that is,
\begin{equation} \label{eq:hastings:T}
T(x, A)
=
\int_A q(x, y) a(x, y) \, d y,
\end{equation}
where we define
$$
a(x, y)
=
\begin{cases}
1, & h(y) q(y, x) \ge h(x) q(x, y)
\\
\frac{h(y) q(y, x)}{h(x) q(x, y)}, & \text{otherwise}
\end{cases}
$$
(note that our definition of $a(x, y)$ avoids the problem of divide by zero
when $q(x, y) = 0$, because then the first case in the definition is chosen).
Fix $y$ and consider a sequence $x_n \to x$ with $x \in W$.
It is clear that if $q(x, y) > 0$, then
$$
a(x_n, y) q(x_n, y) \to a(x, y) q(x, y)
$$
by the continuity assumptions of the theorem. In case $q(x, y) = 0$, we have
$$
0 \le a(x_n, y) q(x_n, y) \le q(x_n, y) \to 0
$$
by the continuity assumptions of the theorem and our definition of $a(x, y)$.
The integrand in \eqref{eq:hastings:T} is an LSC function of $x$ for
each fixed value of the variable of integration, so the integral is also
LSC by Fatou's lemma. It remains only to be shown that $T(x, W) > 0$ for
every $x \in W$, but if this failed for some $x$, then
the chain could never move from $x$ to anywhere, and hence
the chain would not be irreducible, contrary to assumption.
\end{proof}
\subsection{Periodicity}
Suppose $C$ is a small set satisfying \eqref{eq:small} and also satisfies
$\nu(C) > 0$, which is always possible to arrange
\citep[Proposition~5.2.4]{meyn-tweedie}. Define
$$
E_C = \set{ n \ge 1 : (\exists \delta > 0) (\forall A \in \mathcal{A})
(\forall x \in C) (P^n(x, A) \ge \delta \nu(A)) }.
$$
Let $d$ be the greatest common divisor of the elements of $E_C$.
\citet[Theorem~5.4.4]{meyn-tweedie} then show that there exist
disjoint measurable subsets $A_0$, $\ldots,$ $A_{d - 1}$ of the state space
$\Omega$ such that
$$
P(x, A_{i}) = 1, \qquad x \in A_j \opand i = j + 1 \mod d,
$$
where $j + 1 \mod d$ denotes the remainder of $j + 1$ when divided by $d$, and
$$
\psi\bigl((A_0 \cup \cdots \cup A_{d - 1})^c\bigr) = 0,
$$
where $\psi$ is a maximal irreducibility measure.
If $d \ge 2$ we say the Markov chain is \emph{periodic} with \emph{period $d$}.
Otherwise, we say the Markov chain is \emph{aperiodic}.
We use the same terminology for the transition probability kernel
(since whether the Markov chain is periodic or not depends only on the kernel
not on the initial distribution).
For an obvious example of a periodic chain, consider a chain with
state space $\set{0, \ldots, d - 1}$ and deterministic movement: if $X_n = x$,
then $X_{n + 1} = (x + 1) \mod d$.
In MCMC the possibility of periodicity is mostly a nuisance.
No Markov chain used in practical MCMC applications is periodic.
\begin{theorem} \label{th:aperiodic-hastings}
A positive recurrent Markov kernel of the form
$$
P(x, A) = m(x) I(x, A) + K(x, A)
$$
is aperiodic if $\int m \, d \pi > 0$, where $\pi$ is the invariant probability
measure.
\end{theorem}
Note that \eqref{eq:kernel-metropolis-hastings}, the kernel for
a Metropolis-Hastings update has this form, where $m(x)$ is the probability
that, if the current position is $x$, the proposal made will be rejected.
In short, a Metropolis-Hastings sampler that rejects with positive probability
at a set of points $x$ having positive probability under the invariant
distribution cannot be periodic.
\begin{proof}
Suppose to get a contradiction that the sampler is periodic with period $d$
and $A_0$, $\ldots,$ $A_{d - 1}$ as described above. We must have
$\pi(A_k) = 1 / d$ for all $k$ because $\pi(A_k) = \pi(A_{k + 1 \mod d})$.
Hence we have for the stationary chain
$$
\Pr(X_n \in A_k \opand X_{n + 1} \in A_k)
\ge
\int_{A_k} \pi(d x) m(x)
$$
and the latter is greater than zero, contradicting the periodicity assumption
because $A_k$ is $\pi$-positive.
\end{proof}
\begin{theorem} \label{th:gibbs-aperiodic}
An irreducible Gibbs sampler is aperiodic.
\end{theorem}
\begin{proof}
The proof begins with the same two sentences as the preceding proof.
Any Gibbs update simulates $X$ given $h_i(X)$ for some function $h_i$
(for a traditional Gibbs sampler $h_i$ is the projection that drops the $i$-th
coordinate).
That is, $h_i(X_{n + 1}) = h_i(X_n)$ and the conditional distribution
of $X_{n + 1}$ given $h_i(X_{n + 1})$ is the one derived from $\pi$.
First consider a random scan Gibbs sampler. Write $I_n$ for the
random choice of which coordinate to update. Then
conditional on $h_{I_n}(X_n)$ the two random elements $X_n$ and $X_{n + 1}$
are conditionally independent. Hence
\begin{equation} \label{eq:gibbs-aperiodic}
\Pr\bigl(X_{n + 1} \in A_k \mid X_n \in A_k, h_{I_n}(X_n)\bigr)
=
\Pr\bigl(X_{n + 1} \in A_k \mid h_{I_n}(X_n)\bigr)
\end{equation}
In order for the sampler to be periodic, we must have
\begin{multline*}
\Pr(X_{n + 1} \in A_k \mid X_n \in A_k)
\\
=
E \bigl\{ \Pr\bigl(X_{n + 1} \in A_k \mid X_n \in A_k, h_{I_n}(X_n)\bigr)
\mid X_n \in A_k \bigr\}
\end{multline*}
equal to zero, and this implies \eqref{eq:gibbs-aperiodic} is zero almost
surely with respect to $\pi$,
but this would imply $\Pr(X_{n + 1} \in A_k) = 0$, when it
must be $1 / d$. That is the contradiction. Since whether the chain
is periodic or not does not depend on the initial distribution, this finishes
the proof for random scan Gibbs samplers.
For a fixed scan Gibbs sampler, the argument is almost the same.
Now there are no choices $I_n$ and we need to consider the state
between substeps. Suppose without loss of generality
the scan order is $1$, $\ldots,$ $m$.
Consider again the stationary chain, write $Y_0 = X_n$ and let $Y_1$ be the
state after the first substep, $Y_2$, after the second, and so forth.
Then conditional on $h_1(Y_0)$, $h_2(Y_1)$, $\ldots,$ $h_m(Y_{m - 1})$
the two random elements $X_n = Y_0$ and $X_{n + 1} = Y_m$
are conditionally independent. Hence
\begin{multline*}
\Pr\bigl(X_{n + 1} \in A_k \mid X_n \in A_k, h_1(Y_0),
\ldots, h_m(Y_{m - 1})\bigr)
\\
=
\Pr\bigl(X_{n + 1} \in A_k \mid h_1(Y_0), \ldots, h_m(Y_{m - 1})\bigr)
\end{multline*}
holds and contradicts the assumption
of periodicity in the same way as before.
Since whether the chain
is periodic or not does not depend on the initial distribution, this finishes
the proof for fixed scan Gibbs samplers.
\end{proof}
\subsection{The Aperiodic Ergodic Theorem}
The following is Theorem~13.3.3 in \citet{meyn-tweedie}.
\begin{theorem} \label{th:aperiodic-ergodic}
For a positive Harris recurrent chain with transition probability kernel
$P$, initial distribution $\lambda$, and invariant distribution $\pi$
$$
\norm{\lambda P^n - \pi} \to 0, \qquad n \to \infty.
$$
\end{theorem}
This says the marginal distribution of $X_n$, which is $\lambda P^n$,
converges to $\pi$ in total variation, which is a much stronger form
of convergence than convergence in distribution.
\begin{corollary}
For a positive Harris recurrent chain with transition probability kernel
$P$ and invariant distribution $\pi$
$$
\norm{P^n(x, \fatdot) - \pi(\fatdot)} \to 0, \qquad n \to \infty,
$$
for any $x$ in the state space.
\end{corollary}
\noindent
This is just the special case of Theorem~\ref{th:aperiodic-ergodic}
where $\lambda$ is concentrated at the point $x$.
\subsection{Drift Conditions in General} \label{sec:drift-general}
An abstract state space $(\Omega, \mathcal{A})$ where $\Omega$ is just a
set having no other properties gives us little to work with in studying
transience and recurrence. The chain is transient when it moves off to
infinity (sort of), but on a bare set there is no direction toward infinity.
The idea of drift functions is to impose directions on a bare set.
A drift function is just a nonnegative-valued function $V$ on the state space;
it may have extra restrictions, but different ones in different applications.
Uphill on the drift function is toward infinity.
Downhill on the drift function is toward the center.
Drift conditions work by comparing the functions $V$ and $P V$. The latter
is (by definition)
$$
(P V)(x) = \int P(x, d y) V(y) = E\{ V(X_{n + 1}) \mid X_n = x \}.
$$
If $V$ is unbounded, the integral (conditional expectation) need not exist,
in which case we say the value (in the loose sense) is $+ \infty$, which
always makes sense because $V$ is nonnegative.
To simplify notation, we write $P V(x)$ instead of $(P V)(x)$. The former
seems less clear, but it can only mean the latter, since we have no definition
of a kernel multiplied on the right by a number (rather than by a function).
A notion that is useful in describing some drift functions is the following.
We say a nonnegative function $V$ is \emph{unbounded off petite sets}
if the level sets
$$
\set{ x \in \Omega : V(x) \le r }
$$
are petite for each real number $r$ \citep[Section~8.4.2]{meyn-tweedie}.
\subsection{The Drift Condition for Transience} \label{sec:drift-transience}
Part of Theorem~8.0.2 in \citet{meyn-tweedie} says the following.
Suppose $P$ is $\psi$-irreducible. Then a Markov chain with transition
probability kernel $P$ is transient if and only if there exists a bounded
drift function $V$ and a $\psi$-positive set $A$ such that
$$
P V(x) \ge V(x), \qquad x \in A^c
$$
and the set
$$
\left\{\, x \in \Omega : V(x) > \sup_{y \in A} V(y) \,\right\}
$$
is $\psi$-positive.
\citet{meyn-tweedie} do not give any examples of using this drift condition,
and I do not know of any. So we will not illustrate it here.
\subsection{The Drift Condition for Harris Recurrence} \label{sec:drift-harris}
The following is Theorem~9.1.8 in \citet{meyn-tweedie}.
(See Section~\ref{sec:drift-general} for the meaning of $P V$ and the
definition of unbounded off petite sets.)
\begin{theorem} \label{th:drift-recurrence}
Suppose for an irreducible Markov chain having transition probability kernel
$P$ there exists a petite set $C$ and a nonnegative function $V$ that is
unbounded off petite sets such that
\begin{equation} \label{eq:drift-recurrence}
P V(x) \le V(x), \qquad x \notin C,
\end{equation}
holds. Then the chain is Harris recurrent.
\end{theorem}
\subsubsection{Example: AR(1) Time Series, Continued} \label{sec:ar1-two}
\citet[Section~8.5.2]{meyn-tweedie} use this theorem to show that the
AR(1) time series with $\rho = 1$ is Harris recurrent. Since their
proof is rather complicated (it goes on for a page and a half), we
won't try to duplicate it here.
Because of the symmetry of the normal distribution, if $X_0$, $X_1$, $X_2$,
$\ldots$ is an AR(1) Markov chain with $\rho = - 1$ and $\sigma = s$
started at $X_0 = x$, then $X_0$, $X_2$, $X_4$, $\ldots$ is an AR(1)
Markov chain with $\rho = 1$ and $\sigma = s \sqrt{2}$ started at $X_0 = x$.
Hence recurrence of the latter implies recurrence of the former.
This settles the case left open in Section~\ref{sec:ar1-one} above.
In case $\rho^2 = 1$, the AR(1) Markov chain is null recurrent.
\subsection{The Drift Condition for Geometric Ergodicity} \label{sec:drift-geo}
Recall the definition of ``unbounded off petite sets''
from Section~\ref{sec:drift-general}.
The following is part of Theorem~15.0.1 in \citet{meyn-tweedie}.
\begin{theorem} \label{th:drift-geometric}
Suppose for an irreducible, aperiodic Markov chain having
transition probability kernel $P$ and state space $\Omega$
there exists a petite set $C$, a real-valued function $V$ satisfying
$V \ge 1$, and constants $b < \infty$ and $\lambda < 1$ such that
\begin{equation} \label{eq:drift-geometric}
P V(x) \le \lambda V(x) + b I(x, C), \qquad x \in \Omega,
\end{equation}
holds. Then the chain is geometrically ergodic.
\end{theorem}
The function $V$ is referred to as a drift function
and \eqref{eq:drift-geometric} as the drift condition for geometric ergodicity.
Theorem~\ref{th:drift-geometric} has a near converse,
which is another part of Theorem~15.0.1 in \citet{meyn-tweedie}.
\begin{theorem} \label{th:drift-geometric-converse}
For a geometrically ergodic Markov chain having
transition probability kernel $P$, invariant distribution $\pi$,
and state space $\Omega$,
there exists an extended-real-valued function $V$ satisfying
$V \ge 1$ and $\pi\bigl(V(x) < \infty\bigr) = 1$, constants
$b < \infty$ and $\lambda < 1$, and a petite set $C$
such that \eqref{eq:drift-geometric} holds.
Moreover, there exist constants $r > 1$ and $R < \infty$ such that
$$
\sum_{n = 1}^\infty r^n \norm{P^n(x, \fatdot) - \pi(\fatdot)}
\le
R V(x), \qquad x \in \Omega.
$$
\end{theorem}
This shows that the function $M$ in \eqref{eq:geometrically-ergodic-alternate}
can be taken to be a positive multiple of a drift function $V$.
Taking expectations with respect to $\pi$ of both sides
of \eqref{eq:drift-geometric} and using $\pi = \pi P$, we get
$$
(1 - \lambda) E_\pi\{V(X)\} \le b \pi(C),
$$
which shows that a function satisfying the geometric drift condition
is always $\pi$-integrable.
Thus we can always take the function $M$
in \eqref{eq:geometrically-ergodic-alternate} to be $\pi$-integrable.
The fact that any solution $V$ to the geometric drift condition
is $\pi$-integrable
gives us a way to find at least some unbounded $\pi$-integrable functions:
any random variable $g(X)$ satisfying $\abs{g} \le V$ has expectation
with respect to $\pi$.
There is an alternate form of the geometric drift condition that is often
easier to verify \citep[Lemma~15.2.8]{meyn-tweedie}.
\begin{theorem}
The geometric drift condition \eqref{eq:drift-geometric} holds if and only
if $V$ is unbounded off petite sets and there exists a constant $L < \infty$
such that
\begin{equation} \label{eq:drift-geometric-alternate}
P V \le \lambda V + L.
\end{equation}
\end{theorem}
\subsubsection{Example: AR(1) Time Series, Continued} \label{sec:ar1-geo}
Here we show that an AR(1) process with $\rho^2 < 1$ is geometrically ergodic.
First it is a $T$-chain because the conditional probability density function
for $X_{n + 1}$ given $X_n$ is a continuous function of $X_n$.
Thus every compact set is petite and the function $V$ defined by
$V(x) = 1 + x^2$ is unbounded off petite sets. Now
$$
P V(x) = E(1 + X_{n + 1}^2 \mid X_n = x)
=
1 + \rho^2 x^2 + \sigma^2
$$
and hence we have the alternate geometric drift condition
\eqref{eq:drift-geometric-alternate} with $\lambda = \rho^2$
and $L = 1 - \rho^2 + \sigma^2$.
\subsubsection{Example: A Gibbs Sampler}
Suppose $X_1$, $\ldots,$ $X_n$ are IID $\text{Normal}(\mu, \lambda^{-1})$
and we suppose that the prior distribution on $(\mu, \lambda)$ is the improper
prior with density with respect to Lebesgue measure
$$
g(\mu, \lambda) = \lambda^{- 1 / 2}.
$$
We wish to use a Gibbs sampler to simulate this
(actually the joint posterior distribution can be derived in closed form,
but for this example we ignore that).
The unnormalized posterior is
\begin{multline*}
\lambda^{n / 2} \exp\left( - \frac{\lambda}{2} \sum_{i = 1}^n (x_i - \mu)^2
\right)
\cdot \lambda^{- 1 / 2}
\\
=
\lambda^{(n - 1) / 2}
\exp\left( - \frac{n \lambda}{2} \bigl[ v_n + (\bar{x}_n - \mu)^2 \bigr]
\right)
\end{multline*}
where
\begin{align*}
\bar{x}_n & = \frac{1}{n} \sum_{i = 1}^n x_i
\\
v_n & = \frac{1}{n} \sum_{i = 1}^n (x_i - \bar{x}_n)^2
\end{align*}
Hence the posterior conditional distribution of $\lambda$ given
$\mu$ is $\text{Gamma}(a, b)$, where
\begin{align*}
a & = (n + 1) / 2
\\
b & = n \bigl[ v_n + (\bar{x}_n - \mu)^2 \bigr] / 2
\end{align*}
and the posterior conditional distribution of $\mu$ given $\lambda$ is
$\text{Normal}(c, d)$ where
\begin{align*}
c & = \bar{x}_n
\\
d & = n^{- 1} \lambda^{- 1}
\end{align*}
We use a fixed scan Gibbs sampler updating first $\lambda$ then $\mu$ in
each iteration, that is, we simulate the
Markov chain $(\mu_t, \lambda_t)$, $t = 1$, 2, $\ldots$ (we use $t$ rather
than $n$ for the time because we already have another $n$ in this problem)
as follows
\begin{align*}
\lambda_{t + 1} & \sim f_{\lambda \mid \mu}( \fatdot \mid \mu_t )
\\
\mu_{t + 1} & \sim f_{\mu \mid \lambda}( \fatdot \mid \lambda_{t + 1} )
\end{align*}
where $\sim$ means ``is simulated from the distribution.''
Again, we know the conditional distributions are continuous functions
of the conditioning variables so the chain is a $T$-chain and every
compact set is petite.
We try a drift function
$$
V(\mu, \lambda)
=
1 + (\mu - \bar{x}_n)^2 + \varepsilon \lambda^{- 1} + \lambda
$$
where $\varepsilon > 0$ is a constant to be named later.
Clearly, this is unbounded off compact sets of the state space, which
is $\real \times (0, \infty)$. The term $\varepsilon \lambda^{- 1}$
makes $V(\mu, \lambda)$ go to infinity as $\lambda$ goes to zero.
First
\begin{align*}
E \{ V(\mu_{t + 1}, \lambda_{t + 1} ) \mid \lambda_{t + 1},
\mu_t, \lambda_t \}
& =
E \{ V(\mu_{t + 1}, \lambda_{t + 1} ) \mid \lambda_{t + 1} \}
\\
& =
1 + n^{- 1} \lambda_{t + 1}^{- 1} + \varepsilon \lambda_{t + 1}^{- 1}
+ \lambda_{t + 1}
\\
& =
1 + (\varepsilon + n^{- 1}) \lambda_{t + 1}^{- 1} + \lambda_{t + 1}
\end{align*}
so, using the facts that if $X$ is $\text{Gamma}(a, b)$ then
\begin{align*}
E(X^{-1}) & = \frac{b}{a - 1}
\\
E(X) & = \frac{a}{b}
\end{align*}
(the first requires $a - 1 > 0$, otherwise the expectation does not exist),
we obtain
\begin{align*}
E \{ V(\mu_{t + 1}, \lambda_{t + 1} ) \mid \mu_t, \lambda_t \}
& =
E \{ E [ V(\mu_{t + 1}, \lambda_{t + 1} ) \mid \lambda_{t + 1},
\mu_t, \lambda_t ] \mid \mu_t, \lambda_t \}
\\
& =
1 + (\varepsilon + n^{- 1}) E(\lambda_{t + 1}^{- 1} \mid \mu_t)
+ E(\lambda_{t + 1} \mid \mu_t)
\\
& =
1 + \frac{(\varepsilon + n^{- 1}) n
\bigl[ v_n + (\bar{x}_n - \mu_t)^2 \bigr] / 2 }{ (n + 1) / 2 - 1}
\\
& \qquad
+ \frac{(n + 1) / 2}{ n \bigl[ v_n + (\bar{x}_n - \mu_t)^2 \bigr] / 2 }
\\
& \le
1 + \frac{(n \varepsilon + 1)
\bigl[ v_n + (\bar{x}_n - \mu_t)^2 \bigr]}{n - 1}
+ \frac{n + 1}{n v_n}
\\
& =
1 + \frac{(n \varepsilon + 1) v_n}{n - 1} + \frac{n + 1}{n v_n}
+ \frac{(n \varepsilon + 1) (\bar{x}_n - \mu_t)^2}{n - 1}
\\
& \le
\rho V(\mu_t, \lambda_t) + L,
\end{align*}
where
\begin{align*}
\rho & = \frac{n \varepsilon + 1}{n - 1}
\\
L & = 1 + \frac{(n \varepsilon + 1) v_n}{n - 1} + \frac{n + 1}{n v_n}
\end{align*}
Thus we satisfy the geometric drift condition if we can make $\rho < 1$,
which we can if $n > 2$ and $\varepsilon < 1 / n$.
\begin{thebibliography}{}
\bibitem[Bradley(1986)]{bradley-mixing}
Bradley, R.~C. (1986).
\newblock Basic properties of strong mixing conditions.
\newblock In \emph{Dependence in Probability and Statistics: A Survey
of Recent Results} (Oberwolfach, 1985), edited by Eberlein, E.,
and Taqqu, M.~S.
\newblock Boston: Birkh\"{a}user.
%%% NEED PAGES
% \bibitem[Breiman(1968)]{breiman}
% Breiman, L. (1968).
% \newblock \emph{Probability}.
% \newblock Redding, MA: Addison-Wesley. Republished 1992, Philadelphia:
% Society for Industrial and Applied Mathematics.
\bibitem[Chan and Geyer(1994)]{chan-geyer}
Chan, K.~S. and Geyer, C.~J. (1994).
\newblock Discussion of \citet{tierney}.
\newblock \emph{Annals of Statistics}, \textbf{22}, 1747--1758.
\bibitem[Devroye(1986)]{devroye}
Devroye, L. (1986).
\newblock \emph{Non-Uniform Random Variate Generation}.
\newblock New York: Springer-Verlag.
\bibitem[Fristedt and Gray(1996)]{fristedt}
Fristedt, B.~E. and Gray, L.~F. (1996).
\newblock \emph{A Modern Approach to Probability Theory}.
\newblock Boston: Birkh\"{a}user.
\bibitem[Gelfand and Smith(1990)]{gelfand-smith}
Gelfand, A.~E. and Smith, A.~F.~M. (1990).
\newblock Sampling-based approaches to calculating marginal densities.
\newblock \emph{Journal of the American Statistical Association},
\textbf{85}, 398--409.
\bibitem[Geman and Geman(1984)]{geman-geman}
Geman, S. and Geman, D. (1984).
\newblock Stochastic relaxation, Gibbs distributions, and the
Bayesian restoration of images.
\newblock \emph{IEEE Transactions on Pattern Analysis and Machine
Intelligence}, \textbf{6}, 721--741.
\bibitem[Geyer(1992)]{geyer-practical}
Geyer, C.~J. (1992).
\newblock Practical Markov chain Monte Carlo (with discussion).
\newblock \emph{Statistical Science}, \textbf{7}, 473--511.
\bibitem[Geyer(2011)]{geyer-intro}
Geyer, C. J. (2011).
\newblock Introduction to Markov chain Monte Carlo.
\newblock In \emph{Handbook of Markov Chain Monte Carlo}, edited by
Brooks, S., Gelman, A., Jones, G., and Meng, X.-L., pp.~3--48.
\newblock Boca Raton, FL: Chapman \& Hall/CRC.
% \bibitem[Geyer and Johnson(2013)]{mcmc-package}
% Geyer, C.~J. and Johnson, L.~T. (2013).
% \newblock R package \texttt{mcmc} (Markov Chain Monte Carlo), version 0.9-2.
% \newblock \url{http://cran.r-project.org/package=mcmc}.
% \bibitem[{Gordin and Lif\v{s}ic(1978)}]{gordin}
% Gordin, M.~I. and Lif\v{s}ic, B.~A. (1978).
% \newblock Central limit theorem for stationary {M}arkov processes.
% \newblock \emph{Soviet Mathematics, Doklady}, \textbf{19}, 392--394.
% \newblock (English translation of Russian original).
\bibitem[Green(1995)]{green}
Green, P.~J. (1995).
\newblock Reversible jump Markov chain Monte Carlo computation and
Bayesian model determination.
\newblock \emph{Biometrika}, \textbf{82}, 711--732.
\bibitem[Hastings(1970)]{hastings}
Hastings, W.~K. (1970).
\newblock Monte Carlo sampling methods using Markov chains and their
applications.
\newblock \emph{Biometrika}, \textbf{57}, 97--109.
\bibitem[Hoel, et al.(1986)Hoel, Port, and Stone]{hoel-port-stone}
Hoel, P.~G., Port, S.~C., and Stone, C.~J. (1972).
\newblock \emph{Introduction to Stochastic Processes}.
\newblock Boston: Houghton Mifflin.
\newblock Republished, Waveland Press, Prospect Heights, Illinois, 1986.
\bibitem[Ibragimov and Linnik(1971)]{ibragimov-linnik}
Ibragimov, I.~A. and Linnik, Yu.~V. (1971).
\newblock \emph{Independent and Stationary Sequences of Random Variables}
(edited by J.~F.~C. Kingman).
\newblock Groningen: Wolters-Noordhoff.
\bibitem[Jain and Jamison(1967)]{jain-jamison}
Jain, N. and Jamison, B. (1967).
\newblock Contributions to Doeblin's theory of Markov processes.
\newblock \emph{Zeitschrift f\"{u}r Wahrscheinlichkeitstheorie und
Verwandte Gebiete}, \textbf{8}, 19--40.
\bibitem[Jones, et al.(2006)Jones, Haran, Caffo and Neath]{jones-et-al}
Jones, G.~L., Haran, M., Caffo, B.~S., and Neath, R. (2006).
\newblock Fixed-width output analysis for Markov chain Monte Carlo.
\newblock \emph{Journal of the American Statistical Association},
\textbf{101}, 1537--1547.
% \bibitem[Kipnis and Varadhan(1986)]{kipnis-varadhan}
% Kipnis, C. and Varadhan, S.~R.~S. (1986).
% \newblock Central limit theorem for additive functionals of reversible
% Markov processes and applications to simple exclusions.
% \newblock \emph{Communications in Mathematical Physics}, \textbf{104}, 1--19.
\bibitem[{\L}atuszy\'{n}ski, et al.(2013){\L}atuszy\'{n}ski,
Miasojedow and Niemiro]{latuszynski-miasojedow-niemiro}
{\L}atuszy\'{n}ski, K., Miasojedow, B., and Niemiro, W. (2013).
\newblock Nonasymptotic bounds on the estimation error of MCMC algorithms.
\newblock \emph{Bernoulli}, \textbf{19}, 2033--2066.
\bibitem[{\L}atuszy\'{n}ski and Niemiro(2011)]{latuszynski-niemiro}
{\L}atuszy\'{n}ski, K., and Niemiro, W. (2011).
\newblock Rigorous confidence bounds for MCMC under a geometric drift
condition.
\newblock \emph{Journal of Complexity}, \textbf{27}, 23--38.
% \bibitem[Maigret(1978)]{maigret}
% Maigret, N. (1978).
% \newblock Th\'{e}or\`{e}me de limite centrale pour une chaine du Markov
% r\'{e}currente Harris positive.
% \newblock \emph{Annales de l'Institut Henri Poincar\'{e}. Section B. Calcul
% des Probabilit\'{e}s et Statistique. Nouvelle S\'{e}rie},
% \textbf{14}, 425-440.
\bibitem[Meketon and Schmeiser(1984)]{meketon-schmeiser}
Meketon, M.~S. and Schmeiser, B.~W. (1984).
\newblock Overlapping batch means: Something for nothing?
\newblock In \emph{Proceedings of the 1984 Winter Simulation Conference},
edited by Sheppard, S., Pooch, U., and Pegden, D., pp.~227--230.
\newblock Piscataway, NJ: IEEE Press.
\bibitem[Metropolis, et al.(1953)Metropolis, Rosenbluth, Rosenbluth, Teller
and Teller]{metropolis-et-al}
Metropolis, N., Rosenbluth, A.~W., Rosenbluth, M.~N., Teller, A.~H., and
Teller, E. (1953).
\newblock Equation of state calculations by fast computing machines.
\newblock \emph{Journal of Chemical Physics}, \textbf{21}, 1087--1092.
\bibitem[Meyn and Tweedie(2009)]{meyn-tweedie}
Meyn, S.~P. and Tweedie, R.~L. (2009).
\newblock \emph{Markov Chains and Stochastic Stability}, second edition.
\newblock Cambridge: Cambridge University Press.
\bibitem[Nummelin(1984)]{nummelin}
Nummelin, E. (1984).
\newblock \emph{General Irreducible Markov Chains and Non-Negative Operators}.
\newblock Cambridge: Cambridge University Press.
\bibitem[Peligrad(1986)]{peligrad}
Peligrad, M. (1986).
\newblock Recent advances in the central limit theorem and its weak
invariance principle for mixing sequences of random variables (a survey).
\newblock In \emph{Dependence in Probability and Statistics: A Survey
of Recent Results} (Oberwolfach, 1985), edited by Eberlein, E.,
and Taqqu, M.~S.
\newblock Boston: Birkh\"{a}user.
%%% NEED PAGES
\bibitem[Roberts and Rosenthal(1997)]{roberts-rosenthal}
Roberts, G.~O. and Rosenthal, J.~S. (1997).
\newblock Geometric ergodicity and hybrid Markov chains.
\newblock \emph{Electronic Communications in Probability}, \textbf{2}, 13--25.
\bibitem[Rosenthal(1995)]{rosenthal}
Rosenthal, J.~S. (1995).
\newblock Minorization conditions and convergence rates for Markov chain
Monte Carlo.
\newblock \emph{Journal of the American Statistical Association},
\textbf{90}, 558--566.
% \bibitem[Rudin(1986)]{rudin}
% Rudin, W. (1986).
% \newblock \emph{Real and Complex Analysis}, third edition.
% \newblock New York: McGraw-Hill.
\bibitem[Tierney(1994)]{tierney}
Tierney, L. (1994).
\newblock Markov chains for exploring posterior distributions (with discussion).
\newblock \emph{Annals of Statistics}, \textbf{22}, 1701--1762.
\end{thebibliography}
\end{document}
\section{Uniform Ergodicity}
Our final and strongest form of ergodicity is this.
A positive Harris recurrent Markov chain with transition probability
kernel $P$, invariant distribution $\pi$, and state space $\Omega$
is \emph{uniformly ergodic}
if one can use a constant function $M$ in the alternative definition
of geometrically ergodic, that is,
if there exist real numbers $s < 1$ and $M < \infty$ such that
\begin{equation} \label{eq:uniformly-ergodic}
\norm{P^n(x, \fatdot) - \pi(\fatdot)}
\le M s^n,
\qquad x \in \Omega.
\end{equation}
It is obvious that uniform ergodicity \eqref{eq:uniformly-ergodic}
implies geometric ergodicity \eqref{eq:geometrically-ergodic-alternate},
but the reverse implication does not necessarily hold.
Since uniform ergodicity implies geometric ergodicity,
by Theorem~\ref{th:drift-geometric-converse} the geometric drift condition
\eqref{eq:drift-geometric} holds and the drift function $V$ is unbounded
off petite sets and the function $M$ in the alternate definition of
geometric ergodicity \eqref{eq:geometrically-ergodic-alternate} can be
taken proportional to $V$. But this says $V$ must be both bounded and
unbounded off petite sets, which can happen only if the whole state
space is petite. Actually something stronger is true
\citep[Theorem~16.0.2]{meyn-tweedie}.
\begin{theorem}
A Markov chain is uniformly ergodic if and only if the whole state space
is small.
\end{theorem}
This shows that most Markov chains are not uniformly ergodic, but here
are a few cases where one does have uniform ergodicity.
\begin{itemize}
\item The Markov chain is actually an IID sequence, in which case it is
obvious from the definition that the whole state space is a small set
because $P(x, A) = \pi(A)$ for all $x$ by independence.
\item The Markov chain is a $T$-chain and the state space is compact.
\item The state space is finite. In this case the Markov chain is
automatically a $T$-chain because every function on a discrete
topological space is continuous and the state space is automatically
compact because every finite set of any topological space is compact.
\end{itemize}
In all of these cases the Markov chain is uniformly ergodic if it is
positive Harris recurrent.
\begin{center} \LARGE REVISED DOWN TO HERE \end{center}