\documentclass[11pt]{article}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{indentfirst}
\usepackage{natbib}
\usepackage{url}
\RequirePackage{amsfonts}
\newcommand{\real}{\mathbb{R}}
\newcommand{\rats}{\mathbb{Q}}
\newcommand{\nats}{\mathbb{N}}
\newcommand{\set}[1]{\{\, #1 \,\}}
\newcommand{\abs}[1]{\lvert #1 \rvert}
\newcommand{\norm}[1]{\lVert #1 \rVert}
\newcommand{\inner}[1]{\langle #1 \rangle}
\newcommand{\biginner}[1]{\left\langle #1 \right\rangle}
\newcommand{\fatdot}{\,\cdot\,}
\DeclareMathOperator{\cl}{cl}
\DeclareMathOperator{\interior}{int}
\DeclareMathOperator{\boundary}{bdry}
\DeclareMathOperator{\var}{var}
\DeclareMathOperator{\cov}{cov}
\DeclareMathOperator{\cor}{cor}
\DeclareMathOperator{\pr}{pr}
\DeclareMathOperator{\tr}{tr}
\newcommand{\uto}{\stackrel{u}{\longrightarrow}}
\newcommand{\weakto}{\stackrel{\mathcal{D}}{\longrightarrow}}
\newcommand{\lawto}{\stackrel{\mathcal{L}}{\longrightarrow}}
\newcommand{\asto}{\stackrel{\text{a.s.}}{\longrightarrow}}
\newcommand{\probto}{\stackrel{P}{\longrightarrow}}
\newcommand{\opand}{\mathbin{\rm and}}
\newcommand{\opor}{\mathbin{\rm or}}
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{definition}{Definition}
\RequirePackage{amssymb}
\let\emptyset=\varnothing
%%%%% Barred Capital Letters %%%%%
\newcommand{\Wbar}{W{\mkern -19.0 mu}\overline{\phantom{\text{W}}}}
\newcommand{\Xbar}{X{\mkern -13.5 mu}\overline{\phantom{\text{X}}}}
\newcommand{\Ybar}{Y{\mkern -14.0 mu}\overline{\phantom{\text{Y}}}}
\newcommand{\Zbar}{Z{\mkern -10.5 mu}\overline{\phantom{\text{Z}}}}
%%%%% Boldface Letters %%%%%
\newcommand{\boldh}{\mathbf{h}}
\newcommand{\bolds}{\mathbf{s}}
\newcommand{\boldt}{\mathbf{t}}
\newcommand{\boldv}{\mathbf{v}}
\newcommand{\boldx}{\mathbf{x}}
\newcommand{\boldX}{\mathbf{X}}
\newcommand{\boldmu}{\boldsymbol{\mu}}
\begin{document}
\begin{raggedright}
\Large
Stat 8501 Lecture Notes
\\
\textbf{Spatial Gaussian Processes}
\\
Charles J. Geyer
\\
% \today
April 14, 2014
\par
\end{raggedright}
\section{Introduction}
We start more or less following \citet*[hereinafter HPS]{hoel-port-stone},
changing what needs to be
changed to allow for vector parameters. We are studying second order
processes $X(\boldt)$, where now $\boldt$ denotes a vector. But other than
that everything stays the same: for each $\boldt$ in the domain $T$,
we have a random variable $X(\boldt)$. (I don't really like the vectors
are boldface convention, but we will use it in this handout.)
We use the same notation as HPS
\begin{align*}
\mu_X(\boldt) & = E\{ X(\boldt) \}
\\
r_X(\bolds, \boldt) & = \cov\{ X(\bolds), X(\boldt) \}
\end{align*}
We say the process is \emph{weakly stationary} or \emph{translation invariant}
if
\begin{align*}
\mu_X(\boldt + \boldh) & = \mu_X(\boldt)
\\
r_X(\bolds + \boldh, \boldt + \boldh) & = r_X(\bolds, \boldt)
\end{align*}
for all $\boldh$, $\bolds$, and $\boldt$ for which the expressions make sense.
The first implies that $\mu_X$ is actually a constant function, and the
second implies that $r_X(\bolds, \boldt)$ is a function of $\bolds - \boldt$
only, so (again following HPS) we can define the covariance function
as a function of one variable rather than two
$$
r_X(\boldt) = r_X(0, \boldt)
$$
As in the one-dimensional case, a covariance function must be a positive
definite function, that is for any scalars $a_1$, $\ldots,$ $a_k$ and vectors
$\boldt_1$, $\ldots,$ $\boldt_k$
$$
\sum_{i = 1}^k
\sum_{j = 1}^k
a_i a_j
r_X(\boldt_i, \boldt_j)
\ge
0
$$
when $r_X$ is the covariance function of a (not necessarily translation
invariant) second order process, and
$$
\sum_{i = 1}^k
\sum_{j = 1}^k
a_i a_j
r_X(\boldt_i - \boldt_j)
\ge
0
$$
when $r_X$ is the covariance function of a translation
invariant second order process.
Bochner's theorem says that $r_X$ is the covariance function of
a translation invariant process if and only if
$\boldt \mapsto r_X(\boldt) / r_X(0)$ is the characteristic function of
a random vector having a distribution symmetric about zero.
So far, everything is the same as in the one-dimensional case
except for some boldface for vectors instead of lightface for scalars.
Now comes the different part, but to introduce that we first ask why
we would assume translation invariance. The reason is to simplify
the model making it easier to estimate when we are doing statistical
inference (we won't discuss statistical inference for Gaussian processes
in this course, but that is a large part of spatial statistics).
We say a translation invariant process $X$ is \emph{isotropic}
or \emph{rotationally invariant} if $r_X(\boldt)$ is a function of
$\norm{\boldt}$ only,
where $\norm{\fatdot}$ denotes the Euclidean norm. This is something
new that we do not see in the one-dimensional case. There are no rotations
in one dimension.
We follow Section~2.3 of \citet{chiles-delfiner} with some additions from
other sources. Write
\begin{equation} \label{eq:radial}
r_X(\boldt) = c_X(\norm{\boldt})
\end{equation}
so we are studying the properties of the function $c_X$ of one nonnegative
real variable.
We will call the one dimensional covariance functions like $c_X$
\emph{radial covariance functions}.
Bochner's theorem still applies: $c_X$ is a valid radial covariance function
if
$$
\boldt \mapsto \frac{c_X(\norm{\boldt})}{c_X(0)}
$$
is a valid $d$-dimensional covariance function.
As in the one dimensional case
$$
\var\{ X(\boldt) \} = c_X(0), \qquad \text{for all $\boldt$}.
$$
The first issue we deal with is that if $c(r)$ is a valid radial covariance
function for processes on $\real^d$, then it is also a valid radial covariance
function for processes on $\real^m$ for $m \le d$, but is not necessarily
valid for processes on $\real^n$ for $n > d$. For example, the radial
covariance function
$$
c(r) = \begin{cases}
1 - \frac{r}{a}, & 0 \le r \le a
\\
0, & \text{otherwise}
\end{cases}
$$
is valid for processes on $\real^1$ but is not valid for higher dimensions.
I could not find simple general conditions for validity of radial covariance
functions. There are however, some well known simple examples that are
valid in all dimensions. One is the
\emph{squared exponential} or \emph{Gaussian} radial covariance function
\begin{equation} \label{eq:squared-exponential}
c(r) = A \exp\left( - \frac{r^2}{a^2} \right)
\end{equation}
for constants $A > 0$ and $a > 0$. Another is the
\emph{exponential} radial covariance function
\begin{equation} \label{eq:exponential}
c(r) = A \exp\left( - \frac{r}{a} \right)
\end{equation}
Here are some more complicated models that are also valid for all dimensions
\begin{align*}
c(r) & = A \left(1 + \frac{r}{a} \right) \exp\left( - \frac{r}{a} \right)
\\
c(r) & = A \left(1 + \frac{r}{a} + \frac{r^2}{3 a^2} \right) \exp\left( - \frac{r}{a} \right)
\end{align*}
and another with much lighter tails that is also valid for all dimensions,
which is called the \emph{generalized Cauchy} radial covariance function,
$$
c(r) = A \left(1 + \frac{r^2}{a^2} \right)^{- \alpha}
$$
where here, as in all of the above, we have $A > 0$ and $a > 0$ but now
also have another parameter $\alpha > 0$.
There is one more quite complicated covariance function that has received
some attention in the literature. This is also valid in all dimensions
and is called the \emph{$K$-Bessel} radial covariance function
by \citet{chiles-delfiner} and the \emph{Mat\'{e}rn} radial covariance function
by others \citep{wikipedia-matern}
$$
c(r) = \frac{A}{2^{\nu - 1} \Gamma(\nu)} \left(\frac{r}{a}\right)^\nu
K_\nu\left(\frac{r}{a}\right)
$$
where here, as in all of the above, we have $A > 0$ and $a > 0$ but now
also have another parameter $\nu > 0$. Here $\Gamma(\nu)$ denotes the
gamma function and $K_\nu(x)$ denotes the modified Bessel function of
the second kind of order $\nu$ \citep{wikipedia-bessel}. For those not
familiar with Bessel functions, they should just be thought of as another
kind of special function, like sines and cosines, exponentials and logarithms,
and gamma functions. They are calculated by the \texttt{besselK} function in R.
Part of the reason for the interest in this class of radial covariance
functions is that it includes some of the others. The
case $\nu = 1 / 2$ is the exponential radial covariance function
\eqref{eq:exponential}, and as $\nu \to \infty$ it converges to
the squared exponential radial covariance function
\eqref{eq:squared-exponential}.
A Gaussian process with Mat\'{e}rn radial covariance function has sample paths
that are $\lceil \nu \rceil - 1$ times differentiable.
A Gaussian process with exponential radial covariance function has sample paths
that are nowhere differentiable.
A Gaussian process with squared exponential radial covariance function has
sample paths that are infinitely differentiable.
\section{Prediction} \label{sec:predict}
Suppose we observe a Gaussian process at locations
$\boldt_1$, $\ldots,$ $\boldt_k$ and we want to predict the value at
another location $\boldt_0$ where we have not observed the value of the process.
The best prediction if best means squared error loss is the conditional
expectation, and the best prediction if best means absolute error loss is
the conditional median, but the conditional distribution is normal for which
the mean is equal to the median, so the conditional expectation is best
for either case.
If a partitioned random vector
\begin{equation} \label{eq:part}
\begin{pmatrix} \boldX_1 \\ \boldX_2 \end{pmatrix}
\end{equation}
is multivariate normal with mean vector
$$
\begin{pmatrix} \boldmu_1 \\ \boldmu_2 \end{pmatrix}
$$
and variance matrix
$$
\begin{pmatrix} \Sigma_{1 1} & \Sigma_{1 2}
\\ \Sigma_{2 1} & \Sigma_{2 2} \end{pmatrix}
$$
then the conditional distribution of $\boldX_1$ given $\boldX_2$ is
multivariate normal with mean vector
\begin{equation} \label{eq:mean}
\boldmu_1 + \Sigma_{1 2} \Sigma_{2 2}^{- 1} (\boldX_2 - \boldmu_2)
\end{equation}
and variance matrix
\begin{equation} \label{eq:variance}
\Sigma_{1 1} - \Sigma_{1 2} \Sigma_{2 2}^{- 1} \Sigma_{2 1}
\end{equation}
\citep[Theorem~2.5.1]{anderson}. This notation assumes that the distribution
of $\boldX_2$ is nondegenerate (so $\Sigma_{2 2}$ is invertible). If not,
then $\boldX_2$ is a linear function of some set of its components that
do have a nondegenerate distribution \citep[Theorem~2.4.4 and the rest of
Section~2.4]{anderson}, in which case we can rewrite \eqref{eq:part} as
$$
\begin{pmatrix} \boldX_1 \\ \boldX_3 \\ \boldX_4 \end{pmatrix}
$$
where $\boldX_3$ has a nondegenerate distribution and $\boldX_4$ is
a deterministic function (actually a linear function) of $\boldX_3$, then
the conditional distribution of $\boldX_1$ given $\boldX_2$ is the same as
the conditional distribution of $\boldX_1$ given $\boldX_3$,
and is multivariate normal with mean vector \eqref{eq:mean} with subscript
2 replaced by 3 and variance matrix \eqref{eq:variance} with subscript 2
replaced by 3.
So all we need to do prediction is the vector $\mu$ and the matrix $\Sigma$
for the random vector with components $X(\boldt_i)$, $i = 1$, $\ldots,$ $k$.
The vector $\mu$ has $i$-th
component $\mu_X(\boldt_i)$, and the matrix $\Sigma$ has $i,j$-th component
$r_X(\boldt_i, \boldt_j)$. So we are done with prediction.
\section{Integration}
Now suppose we want to predict the average of the process over a bounded
set $A$, which we denote
$$
X(A) = \frac{1}{m(A)} \int_A X(\boldt) \, d \boldt
$$
where $m(A)$ is Lebesgue measure of the set $A$
$$
m(A) = \int_A \, d \boldt
$$
(which is finite because we assumed $A$ is bounded).
Again, we use the observations at $\boldt_i$, $i = 1$, $\ldots,$ $k$.
We already know the means, variances, and covariances for the $X(\boldt_i)$.
All we need to determine is the mean and variance of $X(A)$ and its covariance
with each of the $X(\boldt_i)$.
As in Section~5.2 of HPS, this is done by interchanging the order of
integration.
\begin{align*}
E\{ X(A) \}
& =
E\left\{ \frac{1}{m(A)} \int_A X(\boldt) \, d \boldt \right\}
\\
& =
\frac{1}{m(A)}
\int_A E \{ X(\boldt) \} \, d \boldt
\\
& =
\frac{1}{m(A)}
\int_A \mu_X(\boldt) \, d \boldt
\end{align*}
Denote this by $\mu_X(A)$.
Then
\begin{align*}
\var\{ X(A) \}
& =
E\left\{ \left[ \frac{1}{m(A)} \int_A X(\boldt) \, d \boldt
- \mu_X(A) \right]^2 \right\}
\\
& =
E\left\{ \left[ \frac{1}{m(A)} \int_A [ X(\boldt) - \mu_X(\boldt) ]
\, d \boldt \right]^2 \right\}
\\
& =
E\left\{ \frac{1}{m(A)^2} \int_A \int_A
[ X(\bolds) - \mu_X(\bolds) ]
[ X(\boldt) - \mu_X(\boldt) ]
\, d \bolds \, d \boldt \right\}
\\
& =
\frac{1}{m(A)^2} \int_A \int_A
E \{ [ X(\bolds) - \mu_X(\bolds) ]
[ X(\boldt) - \mu_X(\boldt) ] \}
\, d \bolds \, d \boldt
\\
& =
\frac{1}{m(A)^2} \int_A \int_A
r_X(\bolds, \boldt)
\, d \bolds \, d \boldt
\end{align*}
and
\begin{align*}
\cov\{ X(A), X(\boldt_i) \}
& =
\cov\left\{ X(\boldt_i), \frac{1}{m(A)} \int_A X(\boldt) \, d \boldt
\right\}
\\
& =
E\left\{ [ X(\boldt_i) - \mu_X(\boldt_i) ]
\frac{1}{m(A)} \int_A [ X(\boldt) - \mu_X(\boldt) ]
\, d \boldt \right\}
\\
& =
E\left\{
\frac{1}{m(A)} \int_A
[ X(\boldt_i) - \mu_X(\boldt_i) ]
[ X(\boldt) - \mu_X(\boldt) ]
\, d \boldt \right\}
\\
& =
\frac{1}{m(A)} \int_A
E\{
[ X(\boldt_i) - \mu_X(\boldt_i) ]
[ X(\boldt) - \mu_X(\boldt) ]
\}
\, d \boldt
\\
& =
\frac{1}{m(A)} \int_A
r_X(\boldt_i, \boldt)
\, d \boldt
\end{align*}
These integrals are not usually easy to do and may need to be done by
numerical integration. But they are what has to be done to do this
prediction.
\section{Differentiation}
Now we look at derivatives, when they exist. The first issue is that
when $\boldt$ is multivariate, the first derivative is a vector, the
second derivative a matrix, the third derivative a three-dimensional tensor,
and so forth. So we only look at first derivatives.
We usually think of the first derivative vector as the vector whose components
are the partial derivatives, but this is an oversimplification. The
correct definition of multivariate differentiation
\citep[Definition~8.9]{browder} is that $f$ is differentiable at $\boldx$
if there exists a linear function $l$ such that
\begin{equation} \label{eq:differentiable}
\lim_{\boldh \to 0}
\frac{f(\boldx + \boldh) - f(\boldx) - l(\boldh)}{\norm{\boldh}} = 0
\end{equation}
in which case $l$ is uniquely defined and has the form
$$
l(\boldh) = \boldv^T \boldh
$$
for some vector $\boldv$, which we call the derivative
of $f$ at $\boldx$, usually written $f'(\boldx)$ or $\nabla f(\boldx)$.
If $f$ is differentiable at $\boldx$, that is, if there exists $l$ such
that \eqref{eq:differentiable} holds, then all partial derivatives exist
at $\boldx$, and the $\nabla f(\boldx)$ is the vector of partial derivatives
\citep[Theorem~8.21]{browder}. The converse is not true. More is needed
than existence of partial derivatives to make the function differentiable.
If all partial derivatives exist and are continuous on an open neighborhood
of $\boldx$, then the function is differentiable \citep[Theorem~8.23]{browder}.
We will ignore this distinction between differentiability and existence of
partial derivatives in what follows, mostly because we do not have sharp
conditions for existence of derivatives. A sharp condition for almost sure
existence and continuity of partial derivatives of any order is given by
\citet[Theorem~1.4.2]{adler-taylor} but seems complicated.
Staying at the level of HPS, we can consider a partial derivative to
be the same as the total derivative of the function restricted to a line,
so the results of Section~5.3 \relax in HPS hold. Let us write
$$
X_i(\boldt) = \frac{\partial X(\boldt)}{\partial t_i}
$$
(assuming this exists). Each $X_i(\boldt)$ is a Gaussian process.
From equation~14 of Chapter~5 of HPS we get
$$
\mu_{X_i}(\boldt) = \frac{\partial}{\partial t_i} \mu_X(\boldt)
$$
Then (15) and (16) of Chapter~5 \relax in HPS become
\begin{align*}
r_{Y X_i}(\bolds, \boldt)
& =
\frac{\partial}{\partial t_i} r_{Y X}(\bolds, \boldt)
\\
r_{X_i Y}(\bolds, \boldt)
& =
\frac{\partial}{\partial s_i} r_{X Y}(\bolds, \boldt)
\end{align*}
from which we get (like (20) and (21) of HPS)
\begin{align*}
r_{X X_j}(\bolds, \boldt)
& =
\frac{\partial}{\partial t_j} r_X(\bolds, \boldt)
\\
r_{X_i X_j}(\bolds, \boldt)
& =
\frac{\partial^2 }{\partial s_i \partial t_j} r_X(\bolds, \boldt)
\end{align*}
so that characterizes the first derivative process (or $d$ processes,
one for each dimension).
\begin{thebibliography}{}
\bibitem[Adler and Taylor(2007)]{adler-taylor}
Adler, R.~J., and Taylor, J.~E. (2007).
\newblock \emph{Random Fields and Geometry}.
\newblock New York: Springer.
\bibitem[Anderson(2003)]{anderson}
Anderson, T.~W. (2003).
\newblock \emph{An Introduction to Multivariate Statistical Analysis}, third ed.
\newblock Hoboken: John Wiley.
\bibitem[Browder(1996)]{browder}
Browder, A. (1996).
\newblock \emph{Mathematical Analysis: An Introduction}.
\newblock New York: Springer-Verlag.
\bibitem[Chil\'{e}s and Delfiner(1999)]{chiles-delfiner}
Chil\'{e}s, J.-P., and Delfiner, P. (1999).
\newblock \emph{Geostatistics: Modeling Spatial Uncertainty}.
\newblock New York: John Wiley.
% \bibitem[Cressie(1993)]{cressie}
% Cressie, N.~A.~C. (1993).
% \newblock \emph{Statistics for Spatial Data}, revised ed.
% \newblock New York: John Wiley.
\bibitem[Hoel, et al.(1986)Hoel, Port, and Stone]{hoel-port-stone}
Hoel, P.~G., Port, S.~C., and Stone, C.~J. (1972).
\newblock \emph{Introduction to Stochastic Processes}.
\newblock Boston: Houghton Mifflin.
\newblock Republished, Waveland Press, Prospect Heights, Illinois, 1986.
\bibitem[Wikipedia(2014a)]{wikipedia-bessel}
Wikipedia (2014a).
\newblock Bessel function.
\newblock \url{http://en.wikipedia.org/w/index.php?title=Bessel_function&oldid=603570290}
\bibitem[Wikipedia(2014b)]{wikipedia-matern}
Wikipedia (2014b).
\newblock Mat\'{e}rn covariance function.
\newblock \url{http://en.wikipedia.org/w/index.php?title=Mat%C3%A9rn_covariance_function&oldid=590577006}
\end{thebibliography}
\end{document}