

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
            "http://www.w3.org/TR/html4/strict.dtd">

<html lang="en-US">

<head>

<title>
Statistics 8931 (Geyer, Spring 2008) 
</title>

<link href="/geyer/8931/foo.css" rel="stylesheet" type="text/css">

</head>

<body>

<div id="header">
<h1>Statistics 8931 (Geyer, Spring 2008) </h1>
</div>

<div id="main">


\documentclass[11pt]{article}

\usepackage{amsmath}
\usepackage{amssymb}

\newcommand{\norm}[1]{\lVert #1 \rVert}

\DeclareMathOperator{\var}{var}
\DeclareMathOperator{\cov}{cov}
\DeclareMathOperator{\tr}{tr}

\begin{document}

\begin{raggedright}
\large Stat 8054 Homework 3: Due Monday, Mar 24, 2008.
\end{raggedright}

\section*{Newton and EM for Normal Random Effects}

\subsection*{Problem}

Let
$$
   y = M \beta + b_1 + \cdots + b_p
$$
where $M$ is a known matrix, $\beta$ is an unknown parameter vector
and the $b_i$ are independent $\text{Normal}(0, \theta_i G_i)$ random
vectors where each $\theta_i$ is an an unknown scalar parameter and each
$G_i$ is a known matrix.

The log likelihood is
$$
   l(\theta, \beta)
   =
   - \tfrac{1}{2} (y - M \beta)^T V(\theta)^{-1} (y - M \beta)
   - \tfrac{1}{2} \log \det V(\theta)
$$
where
$$
   V(\theta) = \var_{\theta}(y)
$$

\begin{itemize}
\item[(a)] Calculate the first derivative vector (gradient) and second
    derivative matrix (Hessian) of the log likelihood.
\item[(b)] Show that the gradient has expectation zero and
    the variance-covariance matrix of the gradient is
    minus the expectation of the Hessian.
\item[(c)] Give a closed form expression for the EM update.
    Since the joint distribution of $(y, b_1, \ldots, b_p)$ is degenerate
    there is no complete data log likelihood if we take this to be the
    complete data.
    Hence take the complete data to be $(y, b_1, \ldots, b_{p - 1}$).
    Hint: write out the joint density for $b_1$, $\ldots$, $b_p$ and
    plug in $b_p = y - M \beta - b_1 - \cdots - b_{p - 1}$.  Its much
    simpler that way, no matrix inversion and no determinants to calculate.

\end{itemize}

\subsection*{Solution}

\subsubsection*{(a)}

First note that that
$$
   V(\theta) = \sum_i \theta_i G_i
$$
by independence of the $b_i$.  Thus
$$
   \frac{\partial V(\theta)}{\partial \theta_i} = G_i
$$
does not contain (any components of) $\theta$.

Then
\begin{align*}
   \frac{\partial l}{\partial \beta}
   & =
   M^T V(\theta)^{-1} (y - M \beta)
   \\
   \frac{\partial l}{\partial \theta_i}
   & =
   \tfrac{1}{2} (y - M \beta)^T V(\theta)^{-1}
   G_i V(\theta)^{-1} (y - M \beta)
   - \tfrac{1}{2} \tr\{ G_i V(\theta)^{-1} \}
\end{align*}
(the first equation is a vector equation
with components $\partial l / \partial \beta_k$).
And
\begin{align*}
   \frac{\partial^2 l}{\partial \beta \partial \beta}
   & =
   - M^T V(\theta)^{-1} M
   \\
   \frac{\partial^2 l}{\partial \theta_i \partial \theta_j}
   & =
   - (y - M \beta)^T V(\theta)^{-1}
   G_i V(\theta)^{-1}
   G_j V(\theta)^{-1} (y - M \beta)
   \\
   & \quad
   +
   \tfrac{1}{2} \tr\{ G_i V(\theta)^{-1} G_j V(\theta)^{-1} \}
   \\
   \frac{\partial^2 l}{\partial \theta_i \partial \beta}
   & =
   - M^T V(\theta)^{-1} G_i V(\theta)^{-1} (y - M \beta)
\end{align*}
(the first equation is a matrix equation
with components $\partial^2 l / \partial \beta_k \partial \beta_l$, and
the third equation is a vector equation with components
$\partial^2 l / \partial \theta_i \partial \beta_k$).

From the first displayed equation of the problem statement we have
$$
   E(y) = M \beta
$$
Hence any equation that has only one $(y - M \beta)$ has expectation zero.

From the definition of variance-covariance matrix
$$
   V(\theta) = E\{ (y - M \beta) (y - M \beta)^T \}
$$
and from the definition of trace we can rewrite the second score equation
as
\begin{align*}
   \frac{\partial l}{\partial \theta_i}
   & =
   \tfrac{1}{2}
   \tr\left\{ V(\theta)^{-1} G_i V(\theta)^{-1}
   (y - M \beta) (y - M \beta)^T \right\}
   \\
   & \quad
   - \tfrac{1}{2} \tr\{ G_i V(\theta)^{-1} \}
\end{align*}
which makes it clear that this equation too has expectation zero.

\begin{align*}
   \var \left\{
   \frac{\partial l}{\partial \beta}
   \right\}
   & =
   M^T V(\theta)^{-1} E \{ (y - M \beta) (y - M \beta)^T \} V(\theta)^{-1} M^T
   \\
   & =
   M^T V(\theta)^{-1} V(\theta) V(\theta)^{-1} M
   \\
   & =
   M^T V(\theta)^{-1} M
   \\
   & =
   - \frac{\partial^2 l}{\partial \beta \partial \beta}
\end{align*}
so that checks.

Note that
$$
   E\left\{ \frac{\partial^2 l}{\partial \theta_i \partial \beta} \right\}
   =
   0
$$
so we must have
$$
   \cov\left\{ \frac{\partial l}{\partial \beta},
   \frac{\partial l}{\partial \theta_i} \right\}
   =
   0
$$
too.  This is true because $\partial l / \partial \beta$ is of odd degree
in $y - M \beta$ and $\partial l / \partial \theta_i$ is of even degree, hence
the product is of odd degree, and all odd central moments of the normal
distribution are zero (by symmetry).

Finally we need to check
$$
   \cov\left\{ \frac{\partial l}{\partial \theta_i},
   \frac{\partial l}{\partial \theta_j} \right\}
   =
   - E \left\{
   \frac{\partial^2 l}{\partial \theta_i \partial \theta_j}
   \right\}
$$
First
\begin{align*}
   - E \left\{
   \frac{\partial^2 l}{\partial \theta_i \partial \theta_j}
   \right\}
   & =
 \tr\bigl[ V(\theta)^{-1}
   G_i V(\theta)^{-1}
   G_j V(\theta)^{-1} E\{ (y - M \beta) (y - M \beta)^T \} \bigr]
   \\
   & \quad
   -
   \tfrac{1}{2} \tr\{ G_i V(\theta)^{-1} G_j V(\theta)^{-1} \}
   \\
   & =
   \tfrac{1}{2} \tr\{ G_i V(\theta)^{-1} G_j V(\theta)^{-1} \}
\end{align*}
This uses the easily checked property $\tr[ A B ] = \tr[ B A ]$.

The covariance here requires fourth moments of the normal distribution,
which we now calculate.  Since $y - M \beta$ has mean vector zero,
we only need concern ourselves with this case.  The moment generating
function of a $\text{Normal}(0, V)$ random vector is
$$
   \varphi(s) = \exp(\tfrac{1}{2} s^T V s)
$$
Hence
$$
   \frac{\partial \varphi(s)}{\partial s_i}
   =
   \varphi(s) \textstyle \sum_m v_{i m} s_m
$$
where $v_{i j}$ are the elements of $V$.  And
\begin{align*}
   \frac{\partial^2 \varphi(s)}{\partial s_i \partial s_j}
   & =
   \varphi(s) v_{i j}
   +
   \varphi(s) \textstyle \sum_m v_{i m} s_m \textstyle \sum_n v_{j n} s_n
   \\
   \frac{\partial^3 \varphi(s)}{\partial s_i \partial s_j \partial s_k}
   & =
   \varphi(s) \textstyle \sum_m v_{i m} s_m \textstyle \sum_n v_{j n} s_n
   \textstyle \sum_q v_{k q} s_q
   \\
   & \quad
   +
   \varphi(s) v_{i j} \textstyle \sum_q v_{k q} s_q
   +
   \varphi(s) v_{j k} \textstyle \sum_m v_{i m} s_m
   +
   \varphi(s) v_{k i} \textstyle \sum_n v_{j n} s_n
\end{align*}
And finally,
\begin{align*}
   \frac{\partial^4 \varphi(s)}
   {\partial s_i \partial s_j \partial s_k \partial s_l}
   & =
   \varphi(s) \textstyle \sum_m v_{i m} s_m \textstyle \sum_n v_{j n} s_n
   \textstyle \sum_q v_{k q} s_q \textstyle \sum_r v_{l r} s_r
   \\
   & \quad
   +
   \varphi(s) v_{i l} \textstyle \sum_n v_{j n} s_n
   \textstyle \sum_q v_{k q} s_q
   \\
   & \quad
   +
   \text{5 terms obtained by permuting $i$, $j$, $k$, and $l$ in above}
   \\
   & \quad
   +
   \varphi(s) v_{i j} v_{k l}
   \\
   & \quad
   +
   \text{2 terms obtained by permuting $i$, $j$, $k$, and $l$ in above}
\end{align*}
Setting $s = 0$ we obtain.
\begin{align*}
   E\{ x_i \} & = 0
   \\
   E\{ x_i x_j \} & = v_{i j}
   \\
   E\{ x_i x_j x_k \} & = 0
   \\
   E\{ x_i x_j x_k x_l \} & = v_{i j} v_{k l} + v_{i k} v_{j l}
   + v_{i l} v_{j k}
\end{align*}
This could also just be looked up (e.~g., Anderson \emph{An Introduction
to Multivariate Analysis}, 3rd ed., eqn. (26) of Section~2.6).

To simplify notation, write
$$
   \frac{\partial l}{\partial \theta_i}
   =
   \tr( A_i x x^T ) - \tr( B_i )
$$
where
\begin{align*}
   A_i
   & =
   \tfrac{1}{2}
   V(\theta)^{-1} G_i V(\theta)^{-1}
   \\
   x
   & =
   y - M \beta
   \\
   B_i
   & =
   \tfrac{1}{2}
   G_i V(\theta)^{-1}
\end{align*}
Then
\begin{align*}
   \cov\left\{
   \frac{\partial l}{\partial \theta_r},
   \frac{\partial l}{\partial \theta_s}
   \right\}
   & =
   E\left\{
   \frac{\partial l}{\partial \theta_r}
   \frac{\partial l}{\partial \theta_s}
   \right\}
   \\
   & =
   E\left\{
   \bigl[ \tr( A_r x x^T ) - \tr( B_r ) \bigr]
   \bigl[ \tr( A_s x x^T ) - \tr( B_s ) \bigr]
   \right\}
   \\
   & =
   E\left\{ \tr( A_r x x^T ) \tr( A_s x x^T ) \right\}
   - \tr( B_s ) E\left\{ \tr( A_r x x^T ) \right\}
   \\
   & \quad
   - \tr( B_r ) E\left\{ \tr( A_s x x^T ) \right\}
   + \tr( B_r ) \tr( B_s )
   \\
   & =
   E\left\{ \tr( A_r x x^T ) \tr( A_s x x^T ) \right\}
   - \tr( B_s ) \tr( A_r V )
   \\
   & \quad
   - \tr( B_r ) \tr( A_s V )
   + \tr( B_r ) \tr( B_s )
\end{align*}
where $V = V(\theta)$.  Only the fourth moment term remains to be done
\begin{align*}
   E\left\{ \tr( A_r x x^T ) \tr( A_s x x^T ) \right\}
   & =
   \textstyle \sum_{i j k l} a_{r, i j} a_{s, k l} E\{ x_i x_j x_k x_l \}
   \\
   & =
   \textstyle \sum_{i j k l} a_{r, i j} a_{s, k l}
   \bigl(
   v_{i j} v_{k l} + v_{i k} v_{j l} + v_{i l} v_{j k}
   \bigr)
   \\
   & =
   \tr( A_r V ) \tr( A_s V )
   +
   2 \tr( A_r V A_s V )
\end{align*}
where $a_{r, i j}$ are the elements of $A_r$ and similarly with $r$ replaced
by $s$.
Hence
\begin{align*}
   \cov\left\{
   \frac{\partial l}{\partial \theta_r},
   \frac{\partial l}{\partial \theta_s}
   \right\}
   & =
   \tr( A_r V ) \tr( A_s V )
   +
   2 \tr( A_r V A_s V )
   \\
   & \quad
   - \tr( B_s ) \tr( A_r V )
   - \tr( B_r ) \tr( A_s V )
   \\
   & \quad
   + \tr( B_r ) \tr( B_s )
\end{align*}
Using
$$
   \tr( A_r V )
   =
   \tfrac{1}{2}
   \tr \left(
   V^{-1} G_r V^{-1} V
   \right)
   =
   \tfrac{1}{2}
   \tr \left(
   V^{-1} G_r
   \right)
   =
   \tr(B_r)
$$
we get
\begin{align*}
   \cov\left\{
   \frac{\partial l}{\partial \theta_r},
   \frac{\partial l}{\partial \theta_s}
   \right\}
   & =
   2 \tr( A_r V A_s V )
   \\
   & =
   \tfrac{1}{2}
   \tr( V^{-1} G_r V^{-1} V V^{-1} G_s V^{-1} V )
   \\
   & =
   \tfrac{1}{2}
   \tr( V^{-1} G_r V^{-1} G_s )
\end{align*}
and this agrees with our calculation
of $- E \{ \partial^2 l / \partial \theta_i \partial \theta_j \}$,
so we are done with part (b).

\subsubsection*{(c)}

The joint distribution of $b_1$, $\ldots$, $b_p$ is multivariate normal
with mean zero and block diagonal variance matrix with blocks $\theta_i G_i$,
assuming all $\theta_i > 0$ and all $G_i$ positive definite,
the complete data log likelihood is
\begin{align*}
   l(\theta, \beta)
   & =
   - \frac{1}{2} \sum_{i = 1}^p \theta_i^{-1} b_i^T G_i^{-1} b_i
   - \frac{n}{2} \sum_{i = 1}^p \log(\theta_i)
   \\
   & =
   - \frac{1}{2} \sum_{i = 1}^p \theta_i^{-1} \tr[ G_i^{-1} b_i b_i^T ]
   - \frac{n}{2} \sum_{i = 1}^p \log(\theta_i)
\end{align*}
Following the hint we plug in
$b_p = y - M \beta - b_1 - \cdots - b_{p - 1}$
obtaining
\begin{align*}
   l(\theta, \beta)
   & =
   - \frac{1}{2} \sum_{i = 1}^{p - 1} \theta_i^{-1} \tr[ G_i^{-1} b_i b_i^T ]
   - \frac{n}{2} \sum_{i = 1}^p \log(\theta_i)
   \\
   & \quad
   - \frac{1}{2} \theta_p^{-1} \tr\left[ G_p^{-1}
   \left( y - M \beta - \sum_{i = 1}^{p - 1} b_i \right)
   \left( y - M \beta - \sum_{j = 1}^{p - 1} b_j \right)^T
   \right]
   \\
   & =
   - \frac{1}{2} \sum_{i = 1}^{p - 1} \theta_i^{-1} \tr[ G_i^{-1} b_i b_i^T ]
   - \frac{n}{2} \sum_{i = 1}^p \log(\theta_i)
   \\
   & \quad
   - \frac{1}{2} \theta_p^{-1} \tr\left[ G_p^{-1}
   \left( y - M \beta \right)
   \left( y - M \beta \right)^T
   \right]
   \\
   & \quad
   + \theta_p^{-1} \sum_{i = 1}^{p - 1}
   \tr\left[ G_p^{-1} \left( y - M \beta \right) b_i^T \right]
   \\
   & \quad
   - \frac{1}{2} \theta_p^{-1} \sum_{i = 1}^{p - 1} \sum_{j = 1}^{p - 1}
   \tr\left[ G_p^{-1} b_i b_j^T \right]
\end{align*}

Now we need to calculate $E\{b_i \mid y\}$ and $E\{ b_i b_j^T \mid y \}$.
For the first we only need consider the joint distribution of $(b_i, y)$,
which is multivariate normal with mean vector
$$
   \begin{pmatrix} 0 \\ M \beta \end{pmatrix}
$$
and variance-covariance matrix
$$
   \begin{pmatrix} \theta_i G_i & \theta_i G_i \\
   \theta_i G_i & V(\theta) \end{pmatrix}
$$
From Theorem~2.5.1 in Anderson (\emph{An Introduction
to Multivariate Analysis}, 3rd ed.)\ we have
\begin{align*}
   E\{ b_i \mid y \}
   & = \theta_i G_i V(\theta)^{-1} (y - M \beta)
   \\
   \var\{ b_i \mid y \}
   & = \theta_i G_i - \theta_i^2 G_i V(\theta)^{-1} G_i
\end{align*}
The same theorem applied to
$$
   \begin{pmatrix}
   \theta_i G_i & 0 & \theta_i G_i \\
   0 & \theta_j G_j & \theta_j G_j \\
   \theta_i G_i & \theta_j G_j & V(\theta)
   \end{pmatrix}
$$
gives
\begin{align*}
   \var\biggl\{ \begin{pmatrix} b_i \\ b_j \end{pmatrix} \biggm| y \biggr\}
   & =
   \begin{pmatrix}
   \theta_i G_i & 0 \\
   0 & \theta_j G_j
   \end{pmatrix}
   \\
   & \quad
   -
   \begin{pmatrix}
   \theta_i G_i \\
   \theta_j G_j
   \end{pmatrix}
   V(\theta)^{-1}
   \begin{pmatrix}
   \theta_i G_i &
   \theta_j G_j
   \end{pmatrix}
\end{align*}
hence
$$
   \cov\{ b_i, b_j \mid y \} = - \theta_i \theta_j G_i V(\theta)^{-1} G_j,
   \qquad i \neq j
$$
We can unify our formulas by introducing $\delta_{i j}$ for the elements
of the identity matrix, so
$$
   \cov\{ b_i, b_j \mid y \}
   =
   \delta_{i j} \theta_i G_i - \theta_i \theta_j G_i V(\theta)^{-1} G_j
$$
holds whether $i = j$ or $i \ne j$.
Now
\begin{align*}
   E\{ b_i b_j^T \mid y \}
   & =
   \cov\{ b_i, b_j \mid y \} + E\{ b_i \mid y \} E\{ b_j \mid y \}^T
   \\
   & =
   \delta_{i j} \theta_i G_i - \theta_i \theta_j G_i V(\theta)^{-1} G_j
   \\
   & \quad
   +
   \theta_i \theta_j G_i V(\theta)^{-1} (y - M \beta)
   (y - M \beta)^T V(\theta)^{-1} G_j
\end{align*}

Now we are ready to take the conditional expectation of the complete
data log likelihood.
\begin{align*}
   E_{\theta^*, \beta^*}\{ l(\theta, \beta) \mid y \}
   & =
   - \frac{n}{2} \sum_{i = 1}^p \log(\theta_i)
   - \frac{1}{2} \theta_p^{-1} \tr[ G_p^{-1} (y - M \beta) (y - M \beta)^T ]
   \\
   & \quad
   + \theta_p^{-1} \sum_{i = 1}^{p - 1}
   \tr\left[ G_p^{-1} (y - M \beta)
   E_{\theta^*, \beta^*}\{ b_i \mid y \}^T \right]
   \\
   & \quad
   - \frac{1}{2} \sum_{i = 1}^{p - 1} \theta_i^{-1}
   \tr\left[ G_i^{-1} E_{\theta^*, \beta^*}\{ b_i b_i^T \mid y \}^T \right]
   \\
   & \quad
   - \frac{1}{2} \theta_p^{-1} \sum_{i = 1}^{p - 1} \sum_{j = 1}^{p - 1}
   \tr\left[ G_p^{-1} E_{\theta^*, \beta^*}\{ b_i b_j^T \mid y \}^T \right]
\end{align*}
So the EM equations set to zero
\begin{align*}
   \frac{\partial E_{\theta^*, \beta^*}\{ l(\theta, \beta) \mid y \}}
   {\partial \beta}
   & =
   \theta_p^{-1} M^T G_p^{-1} \left( y - M \beta - \sum_{i = 1}^{p - 1}
   E_{\theta^*, \beta^*}\{ b_i \mid y \} \right)
\end{align*}
which makes $\beta_{m + 1}$ the weighted least squares estimator
that regresses the ``response''
$y - \sum_{i = 1}^{p - 1} E_{\theta^*, \beta^*}\{ b_i \mid y \}$
on the model matrix $M$ using the weight matrix $G_p$.

And they also set to zero
\begin{align*}
   \frac{\partial E_{\theta^*, \beta^*}\{ l(\theta, \beta) \mid y \}}
   {\partial \theta_i}
   & =
   - \frac{n}{2 \theta_i}
   + \frac{1}{2 \theta_i^2}
   \tr\left[ G_i^{-1} E_{\theta^*, \beta^*}\{ b_i b_i^T \mid y \}^T \right]
\end{align*}
for $i < p$, giving
$$
   \theta_i
   =
   \frac{1}{n}
   \tr\left[ G_i^{-1} E_{\theta^*, \beta^*}\{ b_i b_i^T \mid y \}^T \right]
$$
And they also set to zero
\begin{align*}
   \frac{\partial E_{\theta^*, \beta^*}\{ l(\theta, \beta) \mid y \}}
   {\partial \theta_p}
   & =
   - \frac{n}{2 \theta_p}
   + \frac{1}{2 \theta_p^2} \tr[ G_p^{-1} (y - M \beta) (y - M \beta)^T ]
   \\
   & \quad
   - \frac{1}{\theta_p^2} \sum_{i = 1}^{p - 1}
   \tr\left[ G_p^{-1} (y - M \beta)
   E_{\theta^*, \beta^*}\{ b_i \mid y \}^T \right]
   \\
   & \quad
   + \frac{1}{2 \theta_p^2} \sum_{i = 1}^{p - 1} \sum_{j = 1}^{p - 1}
   \tr\left[ G_p^{-1} E_{\theta^*, \beta^*}\{ b_i b_j^T \mid y \}^T \right]
\end{align*}
giving
$$
   \theta_p
   =
   \frac{1}{n}
   \tr\left[ G_p^{-1} E_{\theta^*, \beta^*}\{ b_p b_p^T \mid y \}^T \right]
$$
also, so all of the variance components are estimated by the same formula.





\end{document}



<div class="clearboth"></div>
</div> <!-- end of div main -->

<div id="footer">
<p>
Copyright 2007&mdash;2008 Charles J. Geyer
<p>
Last modified: 26 March 2008 at 1004 hours CDT.
<p>
<a rel="license" href="http://creativecommons.org/licenses/by-sa/3.0/">
<img alt="Creative Commons License" style="border-width:0" src="http://i.creativecommons.org/l/by-sa/3.0/88x31.png" />
</a>
<br />This 
<!-- Supposed to be this but doesn't validate, well known problem
   see http://infomesh.net/2002/rdfinhtml/
   RDF is XML, doesn't go into validatable HTML without contortions
<span xmlns:dc="http://purl.org/dc/elements/1.1/" href="http://purl.org/dc/dcmitype/" rel="dc:type">work</span> is licensed under a 
   For now just kludge -->
work is licensed under a 
<a rel="license" href="http://creativecommons.org/licenses/by-sa/3.0/">Creative Commons Attribution-Share Alike 3.0 License</a>.
</p>
<p>
    <a href="http://validator.w3.org/check?uri=referer"><img
        src="/geyer/8931/valid-html401"
        alt="Valid HTML 4.01 Strict" height="31" width="88"></a>
  </p>
</div>


</body>
</html>

