Statistics 8931 (Geyer, Spring 2008)

\documentclass[11pt]{article} \usepackage{amsmath} \usepackage{amssymb} \newcommand{\norm}[1]{\lVert #1 \rVert} \DeclareMathOperator{\var}{var} \DeclareMathOperator{\cov}{cov} \DeclareMathOperator{\tr}{tr} \begin{document} \begin{raggedright} \large Stat 8054 Homework 3: Due Monday, Mar 24, 2008. \end{raggedright} \section*{Newton and EM for Normal Random Effects} \subsection*{Problem} Let $$ y = M \beta + b_1 + \cdots + b_p $$ where $M$ is a known matrix, $\beta$ is an unknown parameter vector and the $b_i$ are independent $\text{Normal}(0, \theta_i G_i)$ random vectors where each $\theta_i$ is an an unknown scalar parameter and each $G_i$ is a known matrix. The log likelihood is $$ l(\theta, \beta) = - \tfrac{1}{2} (y - M \beta)^T V(\theta)^{-1} (y - M \beta) - \tfrac{1}{2} \log \det V(\theta) $$ where $$ V(\theta) = \var_{\theta}(y) $$ \begin{itemize} \item[(a)] Calculate the first derivative vector (gradient) and second derivative matrix (Hessian) of the log likelihood. \item[(b)] Show that the gradient has expectation zero and the variance-covariance matrix of the gradient is minus the expectation of the Hessian. \item[(c)] Give a closed form expression for the EM update. Since the joint distribution of $(y, b_1, \ldots, b_p)$ is degenerate there is no complete data log likelihood if we take this to be the complete data. Hence take the complete data to be $(y, b_1, \ldots, b_{p - 1}$). Hint: write out the joint density for $b_1$, $\ldots$, $b_p$ and plug in $b_p = y - M \beta - b_1 - \cdots - b_{p - 1}$. Its much simpler that way, no matrix inversion and no determinants to calculate. \end{itemize} \subsection*{Solution} \subsubsection*{(a)} First note that that $$ V(\theta) = \sum_i \theta_i G_i $$ by independence of the $b_i$. Thus $$ \frac{\partial V(\theta)}{\partial \theta_i} = G_i $$ does not contain (any components of) $\theta$. Then \begin{align*} \frac{\partial l}{\partial \beta} & = M^T V(\theta)^{-1} (y - M \beta) \\ \frac{\partial l}{\partial \theta_i} & = \tfrac{1}{2} (y - M \beta)^T V(\theta)^{-1} G_i V(\theta)^{-1} (y - M \beta) - \tfrac{1}{2} \tr\{ G_i V(\theta)^{-1} \} \end{align*} (the first equation is a vector equation with components $\partial l / \partial \beta_k$). And \begin{align*} \frac{\partial^2 l}{\partial \beta \partial \beta} & = - M^T V(\theta)^{-1} M \\ \frac{\partial^2 l}{\partial \theta_i \partial \theta_j} & = - (y - M \beta)^T V(\theta)^{-1} G_i V(\theta)^{-1} G_j V(\theta)^{-1} (y - M \beta) \\ & \quad + \tfrac{1}{2} \tr\{ G_i V(\theta)^{-1} G_j V(\theta)^{-1} \} \\ \frac{\partial^2 l}{\partial \theta_i \partial \beta} & = - M^T V(\theta)^{-1} G_i V(\theta)^{-1} (y - M \beta) \end{align*} (the first equation is a matrix equation with components $\partial^2 l / \partial \beta_k \partial \beta_l$, and the third equation is a vector equation with components $\partial^2 l / \partial \theta_i \partial \beta_k$). From the first displayed equation of the problem statement we have $$ E(y) = M \beta $$ Hence any equation that has only one $(y - M \beta)$ has expectation zero. From the definition of variance-covariance matrix $$ V(\theta) = E\{ (y - M \beta) (y - M \beta)^T \} $$ and from the definition of trace we can rewrite the second score equation as \begin{align*} \frac{\partial l}{\partial \theta_i} & = \tfrac{1}{2} \tr\left\{ V(\theta)^{-1} G_i V(\theta)^{-1} (y - M \beta) (y - M \beta)^T \right\} \\ & \quad - \tfrac{1}{2} \tr\{ G_i V(\theta)^{-1} \} \end{align*} which makes it clear that this equation too has expectation zero. \begin{align*} \var \left\{ \frac{\partial l}{\partial \beta} \right\} & = M^T V(\theta)^{-1} E \{ (y - M \beta) (y - M \beta)^T \} V(\theta)^{-1} M^T \\ & = M^T V(\theta)^{-1} V(\theta) V(\theta)^{-1} M \\ & = M^T V(\theta)^{-1} M \\ & = - \frac{\partial^2 l}{\partial \beta \partial \beta} \end{align*} so that checks. Note that $$ E\left\{ \frac{\partial^2 l}{\partial \theta_i \partial \beta} \right\} = 0 $$ so we must have $$ \cov\left\{ \frac{\partial l}{\partial \beta}, \frac{\partial l}{\partial \theta_i} \right\} = 0 $$ too. This is true because $\partial l / \partial \beta$ is of odd degree in $y - M \beta$ and $\partial l / \partial \theta_i$ is of even degree, hence the product is of odd degree, and all odd central moments of the normal distribution are zero (by symmetry). Finally we need to check $$ \cov\left\{ \frac{\partial l}{\partial \theta_i}, \frac{\partial l}{\partial \theta_j} \right\} = - E \left\{ \frac{\partial^2 l}{\partial \theta_i \partial \theta_j} \right\} $$ First \begin{align*} - E \left\{ \frac{\partial^2 l}{\partial \theta_i \partial \theta_j} \right\} & = \tr\bigl[ V(\theta)^{-1} G_i V(\theta)^{-1} G_j V(\theta)^{-1} E\{ (y - M \beta) (y - M \beta)^T \} \bigr] \\ & \quad - \tfrac{1}{2} \tr\{ G_i V(\theta)^{-1} G_j V(\theta)^{-1} \} \\ & = \tfrac{1}{2} \tr\{ G_i V(\theta)^{-1} G_j V(\theta)^{-1} \} \end{align*} This uses the easily checked property $\tr[ A B ] = \tr[ B A ]$. The covariance here requires fourth moments of the normal distribution, which we now calculate. Since $y - M \beta$ has mean vector zero, we only need concern ourselves with this case. The moment generating function of a $\text{Normal}(0, V)$ random vector is $$ \varphi(s) = \exp(\tfrac{1}{2} s^T V s) $$ Hence $$ \frac{\partial \varphi(s)}{\partial s_i} = \varphi(s) \textstyle \sum_m v_{i m} s_m $$ where $v_{i j}$ are the elements of $V$. And \begin{align*} \frac{\partial^2 \varphi(s)}{\partial s_i \partial s_j} & = \varphi(s) v_{i j} + \varphi(s) \textstyle \sum_m v_{i m} s_m \textstyle \sum_n v_{j n} s_n \\ \frac{\partial^3 \varphi(s)}{\partial s_i \partial s_j \partial s_k} & = \varphi(s) \textstyle \sum_m v_{i m} s_m \textstyle \sum_n v_{j n} s_n \textstyle \sum_q v_{k q} s_q \\ & \quad + \varphi(s) v_{i j} \textstyle \sum_q v_{k q} s_q + \varphi(s) v_{j k} \textstyle \sum_m v_{i m} s_m + \varphi(s) v_{k i} \textstyle \sum_n v_{j n} s_n \end{align*} And finally, \begin{align*} \frac{\partial^4 \varphi(s)} {\partial s_i \partial s_j \partial s_k \partial s_l} & = \varphi(s) \textstyle \sum_m v_{i m} s_m \textstyle \sum_n v_{j n} s_n \textstyle \sum_q v_{k q} s_q \textstyle \sum_r v_{l r} s_r \\ & \quad + \varphi(s) v_{i l} \textstyle \sum_n v_{j n} s_n \textstyle \sum_q v_{k q} s_q \\ & \quad + \text{5 terms obtained by permuting $i$, $j$, $k$, and $l$ in above} \\ & \quad + \varphi(s) v_{i j} v_{k l} \\ & \quad + \text{2 terms obtained by permuting $i$, $j$, $k$, and $l$ in above} \end{align*} Setting $s = 0$ we obtain. \begin{align*} E\{ x_i \} & = 0 \\ E\{ x_i x_j \} & = v_{i j} \\ E\{ x_i x_j x_k \} & = 0 \\ E\{ x_i x_j x_k x_l \} & = v_{i j} v_{k l} + v_{i k} v_{j l} + v_{i l} v_{j k} \end{align*} This could also just be looked up (e.~g., Anderson \emph{An Introduction to Multivariate Analysis}, 3rd ed., eqn. (26) of Section~2.6). To simplify notation, write $$ \frac{\partial l}{\partial \theta_i} = \tr( A_i x x^T ) - \tr( B_i ) $$ where \begin{align*} A_i & = \tfrac{1}{2} V(\theta)^{-1} G_i V(\theta)^{-1} \\ x & = y - M \beta \\ B_i & = \tfrac{1}{2} G_i V(\theta)^{-1} \end{align*} Then \begin{align*} \cov\left\{ \frac{\partial l}{\partial \theta_r}, \frac{\partial l}{\partial \theta_s} \right\} & = E\left\{ \frac{\partial l}{\partial \theta_r} \frac{\partial l}{\partial \theta_s} \right\} \\ & = E\left\{ \bigl[ \tr( A_r x x^T ) - \tr( B_r ) \bigr] \bigl[ \tr( A_s x x^T ) - \tr( B_s ) \bigr] \right\} \\ & = E\left\{ \tr( A_r x x^T ) \tr( A_s x x^T ) \right\} - \tr( B_s ) E\left\{ \tr( A_r x x^T ) \right\} \\ & \quad - \tr( B_r ) E\left\{ \tr( A_s x x^T ) \right\} + \tr( B_r ) \tr( B_s ) \\ & = E\left\{ \tr( A_r x x^T ) \tr( A_s x x^T ) \right\} - \tr( B_s ) \tr( A_r V ) \\ & \quad - \tr( B_r ) \tr( A_s V ) + \tr( B_r ) \tr( B_s ) \end{align*} where $V = V(\theta)$. Only the fourth moment term remains to be done \begin{align*} E\left\{ \tr( A_r x x^T ) \tr( A_s x x^T ) \right\} & = \textstyle \sum_{i j k l} a_{r, i j} a_{s, k l} E\{ x_i x_j x_k x_l \} \\ & = \textstyle \sum_{i j k l} a_{r, i j} a_{s, k l} \bigl( v_{i j} v_{k l} + v_{i k} v_{j l} + v_{i l} v_{j k} \bigr) \\ & = \tr( A_r V ) \tr( A_s V ) + 2 \tr( A_r V A_s V ) \end{align*} where $a_{r, i j}$ are the elements of $A_r$ and similarly with $r$ replaced by $s$. Hence \begin{align*} \cov\left\{ \frac{\partial l}{\partial \theta_r}, \frac{\partial l}{\partial \theta_s} \right\} & = \tr( A_r V ) \tr( A_s V ) + 2 \tr( A_r V A_s V ) \\ & \quad - \tr( B_s ) \tr( A_r V ) - \tr( B_r ) \tr( A_s V ) \\ & \quad + \tr( B_r ) \tr( B_s ) \end{align*} Using $$ \tr( A_r V ) = \tfrac{1}{2} \tr \left( V^{-1} G_r V^{-1} V \right) = \tfrac{1}{2} \tr \left( V^{-1} G_r \right) = \tr(B_r) $$ we get \begin{align*} \cov\left\{ \frac{\partial l}{\partial \theta_r}, \frac{\partial l}{\partial \theta_s} \right\} & = 2 \tr( A_r V A_s V ) \\ & = \tfrac{1}{2} \tr( V^{-1} G_r V^{-1} V V^{-1} G_s V^{-1} V ) \\ & = \tfrac{1}{2} \tr( V^{-1} G_r V^{-1} G_s ) \end{align*} and this agrees with our calculation of $- E \{ \partial^2 l / \partial \theta_i \partial \theta_j \}$, so we are done with part (b). \subsubsection*{(c)} The joint distribution of $b_1$, $\ldots$, $b_p$ is multivariate normal with mean zero and block diagonal variance matrix with blocks $\theta_i G_i$, assuming all $\theta_i > 0$ and all $G_i$ positive definite, the complete data log likelihood is \begin{align*} l(\theta, \beta) & = - \frac{1}{2} \sum_{i = 1}^p \theta_i^{-1} b_i^T G_i^{-1} b_i - \frac{n}{2} \sum_{i = 1}^p \log(\theta_i) \\ & = - \frac{1}{2} \sum_{i = 1}^p \theta_i^{-1} \tr[ G_i^{-1} b_i b_i^T ] - \frac{n}{2} \sum_{i = 1}^p \log(\theta_i) \end{align*} Following the hint we plug in $b_p = y - M \beta - b_1 - \cdots - b_{p - 1}$ obtaining \begin{align*} l(\theta, \beta) & = - \frac{1}{2} \sum_{i = 1}^{p - 1} \theta_i^{-1} \tr[ G_i^{-1} b_i b_i^T ] - \frac{n}{2} \sum_{i = 1}^p \log(\theta_i) \\ & \quad - \frac{1}{2} \theta_p^{-1} \tr\left[ G_p^{-1} \left( y - M \beta - \sum_{i = 1}^{p - 1} b_i \right) \left( y - M \beta - \sum_{j = 1}^{p - 1} b_j \right)^T \right] \\ & = - \frac{1}{2} \sum_{i = 1}^{p - 1} \theta_i^{-1} \tr[ G_i^{-1} b_i b_i^T ] - \frac{n}{2} \sum_{i = 1}^p \log(\theta_i) \\ & \quad - \frac{1}{2} \theta_p^{-1} \tr\left[ G_p^{-1} \left( y - M \beta \right) \left( y - M \beta \right)^T \right] \\ & \quad + \theta_p^{-1} \sum_{i = 1}^{p - 1} \tr\left[ G_p^{-1} \left( y - M \beta \right) b_i^T \right] \\ & \quad - \frac{1}{2} \theta_p^{-1} \sum_{i = 1}^{p - 1} \sum_{j = 1}^{p - 1} \tr\left[ G_p^{-1} b_i b_j^T \right] \end{align*} Now we need to calculate $E\{b_i \mid y\}$ and $E\{ b_i b_j^T \mid y \}$. For the first we only need consider the joint distribution of $(b_i, y)$, which is multivariate normal with mean vector $$ \begin{pmatrix} 0 \\ M \beta \end{pmatrix} $$ and variance-covariance matrix $$ \begin{pmatrix} \theta_i G_i & \theta_i G_i \\ \theta_i G_i & V(\theta) \end{pmatrix} $$ From Theorem~2.5.1 in Anderson (\emph{An Introduction to Multivariate Analysis}, 3rd ed.)\ we have \begin{align*} E\{ b_i \mid y \} & = \theta_i G_i V(\theta)^{-1} (y - M \beta) \\ \var\{ b_i \mid y \} & = \theta_i G_i - \theta_i^2 G_i V(\theta)^{-1} G_i \end{align*} The same theorem applied to $$ \begin{pmatrix} \theta_i G_i & 0 & \theta_i G_i \\ 0 & \theta_j G_j & \theta_j G_j \\ \theta_i G_i & \theta_j G_j & V(\theta) \end{pmatrix} $$ gives \begin{align*} \var\biggl\{ \begin{pmatrix} b_i \\ b_j \end{pmatrix} \biggm| y \biggr\} & = \begin{pmatrix} \theta_i G_i & 0 \\ 0 & \theta_j G_j \end{pmatrix} \\ & \quad - \begin{pmatrix} \theta_i G_i \\ \theta_j G_j \end{pmatrix} V(\theta)^{-1} \begin{pmatrix} \theta_i G_i & \theta_j G_j \end{pmatrix} \end{align*} hence $$ \cov\{ b_i, b_j \mid y \} = - \theta_i \theta_j G_i V(\theta)^{-1} G_j, \qquad i \neq j $$ We can unify our formulas by introducing $\delta_{i j}$ for the elements of the identity matrix, so $$ \cov\{ b_i, b_j \mid y \} = \delta_{i j} \theta_i G_i - \theta_i \theta_j G_i V(\theta)^{-1} G_j $$ holds whether $i = j$ or $i \ne j$. Now \begin{align*} E\{ b_i b_j^T \mid y \} & = \cov\{ b_i, b_j \mid y \} + E\{ b_i \mid y \} E\{ b_j \mid y \}^T \\ & = \delta_{i j} \theta_i G_i - \theta_i \theta_j G_i V(\theta)^{-1} G_j \\ & \quad + \theta_i \theta_j G_i V(\theta)^{-1} (y - M \beta) (y - M \beta)^T V(\theta)^{-1} G_j \end{align*} Now we are ready to take the conditional expectation of the complete data log likelihood. \begin{align*} E_{\theta^*, \beta^*}\{ l(\theta, \beta) \mid y \} & = - \frac{n}{2} \sum_{i = 1}^p \log(\theta_i) - \frac{1}{2} \theta_p^{-1} \tr[ G_p^{-1} (y - M \beta) (y - M \beta)^T ] \\ & \quad + \theta_p^{-1} \sum_{i = 1}^{p - 1} \tr\left[ G_p^{-1} (y - M \beta) E_{\theta^*, \beta^*}\{ b_i \mid y \}^T \right] \\ & \quad - \frac{1}{2} \sum_{i = 1}^{p - 1} \theta_i^{-1} \tr\left[ G_i^{-1} E_{\theta^*, \beta^*}\{ b_i b_i^T \mid y \}^T \right] \\ & \quad - \frac{1}{2} \theta_p^{-1} \sum_{i = 1}^{p - 1} \sum_{j = 1}^{p - 1} \tr\left[ G_p^{-1} E_{\theta^*, \beta^*}\{ b_i b_j^T \mid y \}^T \right] \end{align*} So the EM equations set to zero \begin{align*} \frac{\partial E_{\theta^*, \beta^*}\{ l(\theta, \beta) \mid y \}} {\partial \beta} & = \theta_p^{-1} M^T G_p^{-1} \left( y - M \beta - \sum_{i = 1}^{p - 1} E_{\theta^*, \beta^*}\{ b_i \mid y \} \right) \end{align*} which makes $\beta_{m + 1}$ the weighted least squares estimator that regresses the ``response'' $y - \sum_{i = 1}^{p - 1} E_{\theta^*, \beta^*}\{ b_i \mid y \}$ on the model matrix $M$ using the weight matrix $G_p$. And they also set to zero \begin{align*} \frac{\partial E_{\theta^*, \beta^*}\{ l(\theta, \beta) \mid y \}} {\partial \theta_i} & = - \frac{n}{2 \theta_i} + \frac{1}{2 \theta_i^2} \tr\left[ G_i^{-1} E_{\theta^*, \beta^*}\{ b_i b_i^T \mid y \}^T \right] \end{align*} for $i < p$, giving $$ \theta_i = \frac{1}{n} \tr\left[ G_i^{-1} E_{\theta^*, \beta^*}\{ b_i b_i^T \mid y \}^T \right] $$ And they also set to zero \begin{align*} \frac{\partial E_{\theta^*, \beta^*}\{ l(\theta, \beta) \mid y \}} {\partial \theta_p} & = - \frac{n}{2 \theta_p} + \frac{1}{2 \theta_p^2} \tr[ G_p^{-1} (y - M \beta) (y - M \beta)^T ] \\ & \quad - \frac{1}{\theta_p^2} \sum_{i = 1}^{p - 1} \tr\left[ G_p^{-1} (y - M \beta) E_{\theta^*, \beta^*}\{ b_i \mid y \}^T \right] \\ & \quad + \frac{1}{2 \theta_p^2} \sum_{i = 1}^{p - 1} \sum_{j = 1}^{p - 1} \tr\left[ G_p^{-1} E_{\theta^*, \beta^*}\{ b_i b_j^T \mid y \}^T \right] \end{align*} giving $$ \theta_p = \frac{1}{n} \tr\left[ G_p^{-1} E_{\theta^*, \beta^*}\{ b_p b_p^T \mid y \}^T \right] $$ also, so all of the variance components are estimated by the same formula. \end{document}