\documentclass[11pt]{article}

\usepackage{indentfirst}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{amscd}
\usepackage{natbib}
\usepackage{url}
\usepackage{graphicx}
\usepackage[utf8]{inputenc}

\newcommand{\real}{\mathbb{R}}

\newcommand{\inner}[1]{\langle #1 \rangle}
\newcommand{\set}[1]{\{\, #1 \,\}}
\newcommand{\bigset}[1]{\left\{\, #1 \,\right\}}

\newcommand{\fatdot}{\,\cdot\,}

\newcommand{\opand}{\mathbin{\rm and}}

\newtheorem{theorem}{Theorem}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{lemma}[theorem]{Lemma}

\DeclareMathOperator{\var}{var}
\DeclareMathOperator{\rint}{rint}
\DeclareMathOperator{\con}{con}
\DeclareMathOperator{\pos}{pos}
\DeclareMathOperator{\cl}{cl}
\DeclareMathOperator{\spam}{span}
\DeclareMathOperator{\rc}{rc}
\DeclareMathOperator{\myPr}{Pr}
\DeclareMathOperator{\supp}{supp}
\DeclareMathOperator{\rank}{rank}
\DeclareMathOperator{\logit}{logit}

\setlength{\textheight}{\paperheight}
\addtolength{\textheight}{- 2 in}
\setlength{\topmargin}{0.25 pt}
\setlength{\headheight}{0 pt}
\setlength{\headsep}{0 pt}
\addtolength{\textheight}{- \topmargin}
\addtolength{\textheight}{- \topmargin}
\addtolength{\textheight}{- \footskip}

\setlength{\textwidth}{\paperwidth}
\addtolength{\textwidth}{- 2 in}
\setlength{\oddsidemargin}{0.25 in}
\setlength{\evensidemargin}{0.25 in}
\addtolength{\textwidth}{- \oddsidemargin}
\addtolength{\textwidth}{- \evensidemargin}

\begin{document}

  \vspace*{0.9375in}
  \begin{center}
    {\bfseries Supporting Theory and Data Analysis for ``Likelihood Inference
    in Exponential Families and Directions of Recession''} \\
    By \\
    Charles J. Geyer \\
    Technical Report No.~672 \\
    School of Statistics \\
    University of Minnesota \\
%       April 20, 2005 \\
     \today
  \end{center}
  \thispagestyle{empty}
  \cleardoublepage
  \setcounter{page}{1}
  \thispagestyle{empty}

\begin{abstract}
When in a full exponential family the maximum likelihood estimate (MLE)
does not exist, the MLE may exist in the Barndorff-Nielsen completion of the
family \citep{barndorff,brown,thesis}.
A practical algorithm for finding the MLE in the completion using repeated
linear programming was proposed in the author's unpublished thesis
\citep{thesis} and used in \citet{gt}.
Now we propose a slightly different method, also using
repeated linear programming with the R contributed package \texttt{rcdd}
\citep{rcdd},
which makes straightforward the calculation of the MLE in the Barndorff-Nielsen
completion for any models satisfying a condition of \citet{brown} and for which
some R function can calculate the MLE when it does exist, for example,
generalized linear models (GLM) and aster models \citep{gws,aster}.
In this technical report we give details of two GLM examples.

Likelihood ratio tests of model comparison are almost unchanged from
the usual case.
Only the degrees of freedom need be adjusted when the MLE for the null
hypothesis lies in the completion rather than the original family.
Confidence intervals are changed much more.  When the MLE
for the natural parameter does not exist, it can be thought of as having
gone to infinity in a certain direction, which we call a generic direction
of recession.  Here we propose a new kind of one-sided confidence
interval, not involving asymptotic approximation, for how close to infinity
the true unknown natural parameter value may be.  This maps to a one-sided
confidence interval for the mean value parameter showing how close to the
boundary of its support it may be.
\end{abstract}

<<foo,include=FALSE,echo=FALSE>>=
options(keep.source = TRUE, width = 65)
ps.options(pointsize = 15)
pdf.options(pointsize = 15)
@

\section{R Package Rcdd}

We use the R statistical computing environment \citep{rcore} in our analysis.
It is free software and can be obtained from
\url{http://cran.r-project.org}.  Precompiled binaries
are available for Windows, Macintosh, and popular Linux distributions.
We use the contributed package \verb@rcdd@.
If R has been installed, but this package has
not yet been installed, do
\begin{verbatim}
install.packages("rcdd")
\end{verbatim}
from the R command line
(or do the equivalent using the GUI menus if on Macintosh
or Windows).  This may require root or administrator privileges.

If the \verb@rcdd@ package has been installed, we load it
<<library>>=
library(rcdd)
@
<<baz,include=FALSE,echo=FALSE>>=
baz <- library(help = "rcdd")
baz <- baz$info[[1]]
baz <- baz[grep("Version", baz)]
baz <- sub("^Version: *", "", baz)
bazzer <- paste(R.version$major, R.version$minor, sep = ".")
@
The version of the package used to make this document
is \Sexpr{baz}.
The version of R used to make this document is \Sexpr{bazzer}.

This entire document and all of the calculations shown were made using
the R command \texttt{Sweave} and hence are exactly reproducible by
anyone who has R and the R noweb (RNW) file from which it was created.
Both the RNW file and and the PDF document produced from it are available at
\url{http://www.stat.umn.edu/geyer/gdor}.

\section{Introduction}

When in a full exponential family the maximum likelihood estimate (MLE)
does not exist, the MLE may exist in the Barndorff-Nielsen completion of the
family \citep[and references cited at the beginning of Chapter~2 therein]
{barndorff,brown,thesis}.  In this case the MLE in the completion
can be thought of as a pair $(\hat{\theta}, \delta)$ satisfying
\begin{equation} \label{eq:char-one}
   \lim_{s \to \infty} l(\hat{\theta} + s \delta)
   =
   \sup_{\theta \in \Theta} l(\theta).
\end{equation}
Although $\hat{\theta}$ and $\delta$ satisfying \eqref{eq:char-one} are
not necessarily unique, the distribution which is the limit as $s \to \infty$
of distributions having parameter values $\hat{\theta} + s \delta$ is unique.

A more complicated but also more interesting characterization of $\delta$
is that it is an element of the relative interior of the normal cone
$N_C(y)$ of the convex support $C$ at the point $y$, where $C$ is
the smallest closed convex set that contains the natural statistic $Y$
almost surely for one of the distributions in the family and hence
for all of them and $y$ is the observed value of $Y$.  This makes determination
of $\delta$ an exercise in computational geometry.  When $C$ is a polyhedral
convex set, $\delta$ can be determined using the R functions \texttt{linearity}
and \texttt{lpcdd} in the \texttt{rcdd} contributed package \citep{rcdd}
for the R statistical computing environment \citep{rcore}.

A more complicated but also more interesting characterization of $\hat{\theta}$
is that it is the MLE in the limiting conditional model, which is the family
of distributions that are limits in distribution as $s \to \infty$ of
distributions having parameters $\theta + s \delta$ for all $\theta \in \Theta$.
This limiting conditional model, described in Section~\ref{sec:limits} below,
is an exponential family, the given exponential family conditioned on the event
$\inner{Y - y, \delta} = 0$, where $Y$ is the natural statistic,
$y$ is its observed value, and $\inner{\fatdot, \fatdot}$ is the bilinear
form given by \eqref{eq:inner} below.
Thus computation
of $\hat{\theta}$ is maximum likelihood estimation
in an exponential family and
can often be done using available software.

Some regularity conditions are necessary for the above theory to be correct.
They are given in Section~\ref{sec:assume} below.  These regularity conditions
are not restrictive, being satisfied by all applications known to me
involving full exponential families.
\citet{thesis} has algorithms that work
on non-full but convex families, meaning the set of natural parameter values
is a convex subset of the natural parameter space that is closed relative
to it, and \citet{gt} have an application of such families,
but we do not discuss them here.

So what is new here?  \citet{thesis} provided a method of finding the MLE
in the Barndorff-Nielsen completion that was effective and, like the methods
recommended here, was based on repeated linear programming.  However, that
work was never published,
partly because it depended on high-quality linear programming
software, which was not easy for statisticians to use and certainly not
available in widely used statistical computing environments.  With the
advent of the \texttt{rcdd} package \citep{rcdd}, high quality linear
programming is now available in R, so it was time
to revisit the issue.  Some new ideas have been
added, so we do not closely follow \citet{thesis}.  In particular,
the algorithms presented in Sections~\ref{sec:linearity} and~\ref{sec:gdor}
below, which are the heart of our methodology, differ from those
of \citet{thesis}.
Finally, the hypothesis tests and confidence intervals proposed in
Sections~\ref{sec:tests} and~\ref{sec:confidence} below are new,
although we cannot claim priority for our proposal for testing,
which was suggested by S. Fienberg (personal communication).

Section~\ref{sec:example} below contains explicit examples that show
how all calculations related to our methodology are carried out in R.

\section{Theory}

\subsection{Exponential Families}

An exponential family of distributions \citep{barndorff,brown,thesis} is
a statistical model having log likelihood
\begin{equation} \label{eq:logl}
   l(\theta) = \inner{y, \theta} - c(\theta),
\end{equation}
where $y$ is a vector statistic, $\theta$ is a vector parameter,
and $(y, \theta) \mapsto \inner{y, \theta}$ is a bilinear form.
In all computations, we will assume $y$ and $\theta$ are elements of $\real^p$
and
\begin{equation} \label{eq:inner}
   \inner{y, \theta} = \sum_{i = 1}^p y_i \theta_i,
\end{equation}
but in theory any bilinear form works.
A statistic $y$ and parameter $\theta$ that give a log likelihood of this
form are called \emph{natural} or \emph{canonical}.
We will say \emph{natural}.
The function $c$ is called the \emph{cumulant function} of the family.

The distribution with parameter value $\theta$ has a density with respect
to the distribution with parameter value $\psi$ of the form
\begin{equation} \label{eq:density}
   f_\theta(\omega)
   =
   e^{\inner{Y(\omega), \theta - \psi} - c(\theta) + c(\psi)}.
\end{equation}
The requirement that this integrate to one determines
the function $c$ up to an additive constant
\begin{equation} \label{eq:cumulant}
   c(\theta) = c(\psi) + \log E_\psi \bigl( e^{\inner{Y, \theta - \psi}} \bigr).
\end{equation}
We take \eqref{eq:cumulant} to be valid for all $\theta$ in $\real^p$,
defining $c(\theta) = \infty$ for $\theta$ such that the expectation
in \eqref{eq:cumulant} does not exist.  Since the argument of the expectation
operator in \eqref{eq:cumulant} is strictly positive, so is the expectation;
hence the cumulant function takes values that are either real or $+ \infty$ and
the log likelihood function takes values that are either real or $- \infty$.
Define
\begin{equation} \label{eq:space}
   \Theta = \set{ \theta \in \real^p : c(\theta) < \infty }.
\end{equation}
The exponential family
is \emph{full} if its natural parameter space is \eqref{eq:space}.
We shall be interested only in full exponential families.

% \begin{theorem} \label{th:concave}
% The cumulant function of a full exponential family is
% lower semicontinuous and convex.  The log likelihood function
% is upper semicontinuous and concave.
% \end{theorem}

\subsection{Convex Support, Tangent Cone, and Normal Cone}

The \emph{convex support} of an exponential family is the smallest closed
convex set that contains the natural statistic with probability one under
some distribution in the family \citep[p.~90]{barndorff},
in which case this is true for all
distributions in the family, because the distributions are mutually absolutely
continuous because the densities \eqref{eq:density} are everywhere nonzero.

The \emph{tangent cone} of a convex set $C$ at a point $y \in C$ is
\begin{equation} \label{eq:tangent-cone}
   T_C(y) = \cl \set{ s (w - y) : \text{$w \in C$ and $s \ge 0$} },
\end{equation}
where $\cl$ denotes the closure operation \citep[Theorem~6.9]{raw}.
The \emph{normal cone} of a convex set $C$ in $\real^p$ at a point $y \in C$ is
\begin{equation} \label{eq:normal-cone}
   N_C(y)
   =
   \set{ \delta \in \real^p :
   \text{$\inner{w - y, \delta} \le 0$ for all $w \in C$} }.
\end{equation}
\citep[Theorem~6.9]{raw}.
Tangent and normal cones are polars of each other
\citep[Theorem~6.9 and~Corollary~6.30]{raw}.
Each determines the other.

\subsection{Directions of Recession and Constancy}

Directions of recession and constancy of convex and concave functions
are defined by \citet[p.~69]{rocky}.  We apply these notions to log
likelihoods of full exponential families.

Proofs of all theorems are given in Section~\ref{sec:proofs}.

\begin{theorem} \label{th:constancy}
For some vector $\delta$ and
for a full exponential family with log likelihood \eqref{eq:logl},
natural parameter space $\Theta$, convex support $C$, natural statistic $Y$,
and observed value
of the natural statistic $y$ such that $y \in C$, the following are equivalent.
\begin{enumerate}
\item[\upshape (a)] There exists a $\theta \in \Theta$
such that $s \mapsto l(\theta + s \delta)$ is
not a strictly concave function on the interval where it is finite.
\item[\upshape (b)] For all $\theta \in \Theta$ the function
$s \mapsto l(\theta + s \delta)$ is constant on $\real$.
\item[\upshape (c)] The parameter values $\theta$ and $\theta + s \delta$
correspond to the same probability distribution for
some $\theta \in \Theta$ and some $s \neq 0$.
\item[\upshape (d)] The parameter values $\theta$ and $\theta + s \delta$
correspond to the same probability distribution for
all $\theta \in \Theta$ and all real $s$.
\item[\upshape (e)] $\inner{Y - y, \delta} = 0$ almost surely for some
distribution in the family.
\item[\upshape (f)] $\inner{Y - y, \delta} = 0$ almost surely for all
distributions in the family.
\item[\upshape (g)] $\delta \in N_C(y)$ and $- \delta \in N_C(y)$.
\item[\upshape (h)] $\inner{w, \delta} = 0$, for all $w \in T_C(y)$.
\end{enumerate}
\end{theorem}
Any vector $\delta$ that satisfies any one of the conditions
of the theorem (and hence all of them) is called
a \emph{direction of constancy} of the log likelihood.
The set of all directions of constancy is called the \emph{constancy space}
of the log likelihood.
It is clear from (e) or (h) of the theorem that the constancy space is
a vector subspace.
\begin{corollary} \label{cor:non-uniqueness}
For a full exponential family, suppose $\hat{\theta}_1$ and\/
$\hat{\theta}_2$ are maximum likelihood estimates.  Then
$\hat{\theta}_1 - \hat{\theta}_2$ is a direction of constancy.
\end{corollary}

From the corollary and (d) of the theorem, we see that directions of
constancy do not cause any problem for statistical inference, because
all maximum likelihood estimates correspond to the same probability
distribution.  Thus we have uniqueness where it is important.
Nonuniqueness of the MLE for the natural parameter is, at worst,
merely a computational nuisance.

A family is said to be \emph{minimal} if it has no directions of constancy.
This can always be arranged by reparametrization
(\citealp[pp.~111--116]{barndorff}; \citealp[pp.~13--16]{brown}; see also
\citealp[Section~1.5]{thesis}).
The R function \texttt{glm} always uses a minimal parametrization,
dropping predictors to obtain a full rank model matrix.  We take the view,
however, that minimality is not necessary and that insisting on minimality
can complicate other theoretical issues.  Thus we never insist on minimality
and do allow for directions of constancy.

\begin{theorem} \label{th:recession}
For some vector $\delta$ and
for a full exponential family with log likelihood \eqref{eq:logl},
natural parameter space $\Theta$, convex support $C$, natural statistic $Y$,
and observed value
of the natural statistic $y$ such that $y \in C$, the following are equivalent.
\begin{enumerate}
\item[\upshape (a)] There exists a $\theta \in \Theta$ such that the function
$s \mapsto l(\theta + s \delta)$ is nondecreasing on $\real$.
\item[\upshape (b)] For all $\theta \in \Theta$ the function
$s \mapsto l(\theta + s \delta)$ is nondecreasing on $\real$.
\item[\upshape (c)] $\inner{Y - y, \delta} \le 0$ almost surely for some
distribution in the family.
\item[\upshape (d)] $\inner{Y - y, \delta} \le 0$ almost surely for all
distributions in the family.
\item[\upshape (e)] $\delta \in N_C(y)$.
\item[\upshape (f)] $\inner{w, \delta} \le 0$, for all $w \in T_C(y)$.
\end{enumerate}
\end{theorem}
Any vector $\delta$ that satisfies any one of the conditions
of the theorem (and hence all of them) is called
a \emph{direction of recession} of the log likelihood.
From now on we will simply say direction of recession or constancy
to refer to directions of recession or constancy of the log likelihood
(since we will not be interested in directions of recession or constancy
of any other function).  Note that every direction of constancy is a direction
of recession.

In light of (e) of the theorem, we could also call directions of recession
normal vectors (of the convex support at the observed
value of the natural statistic).  Since the word ``normal'' is already overused
in statistics, we prefer the term that cannot be confused with other
statistical notions.

\begin{theorem} \label{th:exist}
For a full exponential family with convex support $C$ and observed value
of the natural statistic $y$ such that $y \in C$, the following are equivalent.
\begin{enumerate}
\item[\upshape (a)] The MLE exists.
\item[\upshape (b)] Every direction of recession is a direction of constancy.
\item[\upshape (c)] $N_C(y)$ is a vector subspace.
\item[\upshape (d)] $T_C(y)$ is a vector subspace.
\end{enumerate}
\end{theorem}
This theorem provides a complete geometric solution to the problem
of when the MLE exists in a full exponential family.

\begin{corollary} \label{cor:strictly-increasing}
For a full exponential family with log likelihood \eqref{eq:logl},
natural parameter space $\Theta$, convex support $C$, and observed value
of the natural statistic $y$ such that $y \in C$,
if $\delta$ is a direction of recession
that is not a direction of constancy, then for all $\theta \in \Theta$
the function $s \mapsto l(\theta + s \delta)$ is strictly increasing
on the interval where it is finite.
\end{corollary}

\subsection{Limits in Directions of Recession} \label{sec:limits}

\begin{theorem} \label{th:limits}
For a full exponential family having log likelihood \eqref{eq:logl},
densities \eqref{eq:density}, natural statistic $Y$,
observed value of the natural statistic $y$
such that $y$ is in the convex support,
and natural parameter space $\Theta$,
if $\delta$ is a direction of recession that is not a direction of constancy,
\begin{equation} \label{eq:h2}
   H = \set{ w \in \real^p : \inner{w - y, \delta} = 0 },
\end{equation}
and $\Pr(Y \in H) > 0$ for some distribution in the family, and hence for all,
then for all $\theta \in \Theta$
\begin{equation} \label{eq:limits}
   \lim_{s \to \infty} f_{\theta + s \delta}(\omega)
   =
   \begin{cases}
   0, & \inner{Y(\omega) - y, \delta} < 0
   \\
   f_\theta(\omega) / \Pr_\theta(Y \in H), & \inner{Y(\omega) - y, \delta} = 0
   \\
   + \infty, & \inner{Y(\omega) - y, \delta} > 0
   \end{cases}
\end{equation}
Moreover $s \mapsto \Pr_{\theta + s \delta}(Y \in H)$ is continuous and strictly
increasing, and $\Pr_{\theta + s \delta}(Y \in H) \to 1$ as $s \to \infty$.
\end{theorem}
We note three things about the right-hand side of \eqref{eq:limits}.
First, it is a probability density with respect to the distribution having
parameter value $\psi$.  The set where it is $+\infty$ has probability zero
by Theorem~\ref{th:recession}~(d),
so this is not a problem.
Second, it is the density of the conditional distribution given the event
$Y \in H$ of the distribution having parameter value $\theta$.
Third, by Scheff\'{e}'s lemma \citep[p.~351]{lehmann-tsh} pointwise convergence
of densities implies convergence in total variation, which implies convergence
in distribution.

Denote the right-hand side of \eqref{eq:limits} by $f_\theta(\omega \mid Y \in H)$.
It is clear that the family
\begin{equation} \label{eq:conditional-family}
   \set{ f_\theta(\fatdot \mid Y \in H) : \theta \in \Theta }
\end{equation}
is an exponential family with the same natural statistic and natural parameter
as the original family.  Moreover, it is clear
that the log likelihood for
this conditional family
$$
   l_H(\theta) = \inner{y, \theta} - c(\theta) - \log \myPr_\theta(Y \in H)
$$
satisfies
$$
   l(\theta) < l_H(\theta), \qquad \theta \in \Theta.
$$
Thus, if an MLE exists for
the conditional family \eqref{eq:conditional-family},
then it maximizes the likelihood in the family that is the union
of \eqref{eq:conditional-family} and the original family.
When this happens, we say we have found an MLE in the Barndorff-Nielsen
completion of the original exponential family.

If the conditional family \eqref{eq:conditional-family}
has a direction of recession that is not a direction of constancy,
we can take limits again, and continue until we have a limiting family
that has no directions of recession that are not directions of constancy,
which must eventually occur since
the dimension of the convex support of the limiting family must
decrease in each limiting operation and can go no lower than zero,
in which case the convex support contains a single point and the conditional
family contains a single distribution which is trivially the MLE\@.
This iterated limiting process
is the one followed in \citet{thesis}.  It is actually much more general
than the one we follow here.

Up to this point, we have been closely following \citet{thesis},
but now we part company, imposing conditions that assure $\delta$ can be chosen
so that one limit is enough.  So far the only condition we have imposed
is $\Pr(Y \in H) > 0$, which is required for the existence of the limiting conditional
family \eqref{eq:conditional-family}.
We will impose other conditions as we go along.
All of the conditions we assume will be collected in one place in
Section~\ref{sec:assume}.  Most of these conditions are not new, having been
proposed by \citet[pp.~193 and~197]{brown}.
Although these conditions limit the applicability of our procedure,
they do apply in most applications, and they make
construction of confidence intervals much easier.

The limiting conditional family \eqref{eq:conditional-family} need not be full;
the natural parameter space of the full family containing
\eqref{eq:conditional-family} is at least
as large as
\begin{equation} \label{eq:larger}
   \Theta + \Gamma_{\text{lim}}
   =
   \set{ \theta + \gamma :
   \theta \in \Theta \opand \gamma \in \Gamma_{\text{lim}} },
\end{equation}
where $\Theta$ is the natural parameter space of the original family
and $\Gamma_{\text{lim}}$ is the constancy space of
\eqref{eq:conditional-family}.
We will assume that \eqref{eq:larger} is the natural parameter space
of the full family containing \eqref{eq:conditional-family}.
Although pathological examples can be constructed for which this assumption
fails \citep[Example~2.1]{thesis},
we know of no realistic applications for which it fails.

\subsection{Convex Polyhedra}

One part of the condition of \citet[p.~197]{brown} mentioned above is that
the convex support is polyhedral \citep[Section~19]{rocky}.
Since most applications satisfy this condition, it entails
little loss of generality.

A set $C$ is a \emph{convex polyhedron} if it is
the intersection of a finite collection of closed half-spaces
\citep[Section~19]{rocky}, that is,
\begin{equation} \label{eq:polyhedron}
   C
   =
   \set{ w \in \real^d : \text{$\inner{w, \alpha_i} = b_i$, $i \in E$,
    and $\inner{w, \alpha_i} \le b_i$, $i \in I$} },
\end{equation}
where the $\alpha_i$ are nonzero vectors, the $b_i$ are scalars,
and $E$ and $I$ are disjoint finite sets.

There is an alternative characterization of convex polyhedra; they are
convex hulls of finite sets of points and directions, that is,
sets of all linear combinations
\begin{equation} \label{eq:linear-combination}
   \sum_{i \in E \cup I} b_i \alpha_i,
\end{equation}
where the $\alpha_i$ are vectors, the $b_i$ are scalars, $E$ and $I$
are disjoint finite sets, and the $b_i$ satisfy
\begin{subequations}
\begin{equation} \label{eq:non-neg}
   b_i \ge 0, \qquad i \in E \cup I
\end{equation}
and if $I$ is nonempty
\begin{equation} \label{eq:sum-to-one}
   \sum_{i \in I} b_i  = 1.
\end{equation}
\end{subequations}
The vectors $\alpha_i$, $i \in E \cup I$ are also called the \emph{generators}
of $C$ and $C$ the set \emph{generated} by the \emph{points} $\alpha_i$, $i \in I$
and the \emph{directions} $\alpha_i$, $i \in E$.

The equivalence of these two
characterizations is called the Minkowski-Weyl theorem
\citep[Theorem~19.1]{rocky}.  The R function \texttt{scdd} in the
contributed package \texttt{rcdd} \citep{rcdd} converts between these
two representations of a convex polyhedron, which it calls the H-representation
and V-representation, respectively.

Let $P$ denote the set $\set{\alpha_i : i \in I }$ of points
and $D$ denote the set $\set{\alpha_i : i \in E }$ of directions.
When $P \neq \varnothing$, we write
$$
   C = \con(P) + \con(\pos D)
$$
to denote the relationship between the convex polyhedron $C$ and its sets
of generators $P$ and $D$.
When $P = \varnothing$, we write
$$
   C = \con(\pos D)
$$
to denote the relationship between $C$ and $D$.
This notation follows \citet[Sections~2E and~3G]{raw}.

When $C$ is a convex polyhedron, $N_C(y)$ and $T_C(y)$ are also convex
polyhedrons for each $y \in C$ and are given in terms of the H-representation
of $C$ by simple formulas \citep[Theorem~6.46]{raw}.  Moreover,
the closure operation in \eqref{eq:tangent-cone} is unnecessary when
$C$ is a convex polyhedron.
When $T_C(y)$ or any convex cone is polyhedral, its V-representation
can consist of directions only, so can be of the form
$\con(\pos V)$ for some finite set $V$.

\subsection{Generic Directions of Recession}

The \emph{relative interior} of a convex set $C$, denoted $\rint C$,
is its interior relative to its affine hull \citep[Chapter 6]{rocky}.
Every nonempty convex set has a nonempty relative interior
\citep[Theorem~6.2]{rocky}.

We say a vector $\delta$ is a \emph{generic direction of recession}
(GDOR) if $\delta \in \rint N_C(y)$ and $N_C(y)$ is not a vector subspace,
where $C$ is the convex support and $y$ an observed value of the natural
statistic such that $y \in C$.
Since the relative interior is always nonempty, a GDOR exists if and only if
none of the conditions of Theorem~\ref{th:exist} hold.

\begin{theorem} \label{th:generic}
For a full exponential family having polyhedral convex support $C$
and observed value of the natural statistic $y$ such that $y \in C$,
let $T_C(y) = \con(\pos V)$, and define
$$
   L = \set{ v \in V : - v \in T_C(y) }.
$$
Then a generic direction of recession exists if and only if $L \neq V$,
in which case a vector $\delta$ is a generic direction of recession
if and only if
\begin{subequations}
\begin{align}
   \inner{w, \delta} = 0, & \qquad w \in L
   \label{eq:calc-equality}
   \\
   \inner{w, \delta} < 0, & \qquad w \in V \setminus L
   \label{eq:calc-inequality}
\end{align}
\end{subequations}
\end{theorem}

\begin{corollary} \label{cor:generic}
Under the assumptions of the theorem,
a generic direction of recession is not a direction of constancy.
\end{corollary}

If $B$ is a set of vectors, let $\spam B$ denote the smallest
vector subspace containing $B$.  Also for any vector $x$, let
$x + \spam B = \set{ x + v : v \in \spam B }$.
\begin{corollary} \label{cor:generic-limit}
Under the assumptions of the theorem,
suppose $\delta$ is a generic
direction of recession, and $H$ is defined by \eqref{eq:h2}.
Then $T_{C \cap H}(y) = \spam L$, and $C \cap H = C \cap (y + \spam L)$.
\end{corollary}

The theorem and corollaries explain the purpose
of generic directions of recession.
By Corollary~\ref{cor:generic}, a GDOR implies the MLE does not exist in the
conventional sense, so we seek it in the limiting family described
in \eqref{sec:limits}.
Suppose that $C \cap H$ is the convex support of the limiting family.
This is another part of the conditions of \citet[pp.~193]{brown} referred to
above.
Then $T_{C \cap H}(y)$ being a vector subspace implies that the MLE
in the limiting family exists by Theorem~\ref{th:exist} (c).
Thus finding one GDOR allows us to find the MLE in the Barndorff-Nielsen
completion.

At this point our theory is essentially complete; only computational issues
remain.  So we take some time to explain the connection between this theory
and the pre-existing theory of Barndorff-Nielsen completion
\citep{barndorff,brown,thesis}.  The pre-existing theory says
the MLE lies in the limiting conditional family whose convex support,
what we are calling $C \cap H$, is the unique face of $C$ containing $y$
in its relative interior \citep[Chapter~4 generalizes this]{thesis}.
Thus the pre-existing approach makes it clear that the limiting conditional
family containing the MLE is unique and does not depend on the GDOR,
which is in general not
unique.  In our approach, uniqueness comes from the assertion
$C \cap H = C \cap (y + \spam L)$ in Corollary~\ref{cor:generic-limit}.
This makes it clear
that, although the hyperplane $H$ does depend on the GDOR $\delta$ used to
define it, the convex support $C \cap H$ of the limiting distribution does
not depend on $H$, hence does not depend on $\delta$.  Since we do not need
the pre-existing theory, we shall not bother with a proof that $C \cap H$
is the face of $C$ containing $y$ in its relative interior (the proof is
almost immediate from Lemma~A.1 in \citealp{thesis}), and merely assure
the reader that this is indeed the case, and our new theory describes the
same mathematical structure as the pre-existing theory.  We have not rewritten
the theory of Barndorff-Nielsen completion merely for amusement.
As we shall see, the
new theory based on the GDOR concept is much better suited for computation
and statistical inference than the pre-existing theory based on faces
of a convex set.

Before moving to computational issues, we would like to write down somewhere
one interesting issue.  Corollary~\ref{cor:generic-limit} would be false
without the assumption that $C$ is polyhedral.  Consider the following
example.  $C$ is the set in $\real^2$ consisting of the closed unit disk
and all points with one coordinate negative and the other less than or equal
to one.  This is a closed convex set, but not polyhedral.  The points
$(0, 1)$ and $(1, 0)$ are peculiar in that they are faces of $C$ that are not
exposed in the terminology of \citet[pp.~162--163]{rocky}.
Suppose $y = (1, 0)$.
Then $N_C(y) = \set{ (s, 0) : s \ge 0 }$.  Since this normal cone is not
a vector subspace, there is a generic direction of recession;
one is $\delta = (1, 0)$.  Then $H = \set{ (1, s) : s \in \real }$
and $C \cap H = \set{ (1, s) : s \le 0 }$.
Then $T_{C \cap H}(y) = \set{ (0, s) : s \le 0 }$ is not a vector subspace.
In this sort of situation the more general theory of \citet{thesis} may apply,
but our theory based on generic directions of recession cannot.  The condition
that $C$ be polyhedral could clearly be weakened to $C$ being locally
polyhedral (every point has a convex polyhedral neighborhood $B$
such that $B \cap C$ is polyhedral), but we know of no application of such
a condition, hence do not use it.

\subsection{Assumptions} \label{sec:assume}

We summarize the assumptions we have made above.  We deal with a full
exponential family.
If every direction of recession is a direction of constancy, then
we need no further assumptions.  Otherwise, let $\delta$ be a generic direction
of recession, let $C$ be the convex support, let $Y$ be the natural statistic,
let $y$ be an observed value of the natural statistic satisfying $y \in C$,
and let $H$ be defined by \eqref{eq:h2}.
We assume the event $Y \in H$ has positive probability so the
limiting conditional family defined in Section~\ref{sec:limits} exists.
We further assume that $C \cap H$ is the convex support of this limiting
conditional family, so that by Theorem~\ref{cor:generic-limit} the MLE in this
limiting conditional family exists.
We further assume that the natural parameter space of the full family
containing the limiting conditional
family is given by \eqref{eq:larger}, so that the confidence interval
construction in Section~\ref{sec:confidence} below is valid.
Finally, we assume that $C$ is a convex polyhedron.
We need no other assumptions.

\subsection{Natural Affine Submodels}

In most applications of exponential family theory, we start with
a very large exponential family, which we call \emph{saturated} and which
has too many parameters to estimate well.
Then we consider \emph{natural affine submodels}, parametrized by
$$
   \theta = a + M \beta,
$$
where $\theta$ is the natural parameter of the saturated model, $\beta$ is
the natural parameter of the natural affine submodel, $a$ is a known vector,
and $M$ is a known matrix.  In the terminology of the R function \texttt{glm},
$a$ is called the \emph{offset vector}
and $M$ is called the \emph{model matrix}.

Observe that
$$
   \inner{y, a + M \beta} = \inner{y, a} + \inner{M^T y, \beta},
$$
where the two bilinear forms on the right-hand side have different dimensions.
Since the first term on the right-hand side does not contain the parameter
and can be dropped from the log likelihood, the submodel is itself
an exponential family with natural statistic $M^T y$
and natural parameter $\beta$.
Thus everything said above applies to natural affine submodels, we
just work with the convex support of $M^T Y$ rather than of $Y$.

\subsection{Relating Tangent Cones of Models and Affine Submodels}
\label{sec:relate}

Let $C_{\text{sat}}$ denote the convex support of the saturated model
and $C_{\text{sub}}$ that of the natural affine submodel.
By Theorems~6.43 and~6.46 in \citet{raw},
\begin{equation} \label{eq:tc-map}
   T_{C_{\text{sub}}}(M^T y) = \cl \set{ M^T w : w \in T_{C_{\text{sat}}}(y) }
\end{equation}
and the closure operation is not necessary if $C_{\text{sat}}$
is polyhedral.  Moreover, it is clear that
if $T_{C_{\text{sat}}}(y) = \con(\pos V_{\text{sat}})$,
then $T_{C_{\text{sub}}}(y) = \con(\pos V_{\text{sub}})$, where
\begin{equation} \label{eq:vee-map}
   V_{\text{sub}}
   =
   \set{ M^T w : w \in V_{\text{sat}} }.
\end{equation}
The induced mapping for normal cones is not so simple, requiring linear
programming.  This explains our starting with tangent vectors
and V-representations rather than normal vectors and H-representations.

\subsection{Tangent Cones of Saturated Models}

In saturated families the convex support is often easy to calculate.
In logistic regression, each component of the response vector is Bernoulli
and $C_{\text{sat}} = [0, 1]^p$.
In Poisson regression, each component of the response vector is Poisson
and $C_{\text{sat}} = [0, \infty)^p$.
In categorical data analysis with Poisson response,
$C_{\text{sat}} = [0, \infty)^p$, just as in Poisson regression.
In categorical data analysis with multinomial or product-multinomial response,
the model can be derived from the Poisson model by conditioning on an affine
subspace, hence the convex support is the intersection of $[0, \infty)^p$
with this affine subspace.
As we shall see (Section~\ref{sec:multi} below),
this allows us to use the solution in the Poisson response
problem to calculate the solutions in the other problems.

When $C_{\text{sat}}$ is a Cartesian product,
as in the examples mentioned, $T_{C_{\text{sat}}}(y)$ can be calculated
coordinatewise \citep[Proposition~6.41]{raw}.
This is the only situation we will use for our examples.
Let $e_i$ denote the unit vector in the $i$-th coordinate direction
(every coordinate is zero except for the $i$-th, which is one).
Then $e_i$ is a tangent
vector at $y$ if $y_i$ is not at the upper bound of its range,
and $- e_i$ is a tangent vector at $y$ if $y_i$ is not at the lower bound.
In this case, the set $V_{\text{sat}}$ of the preceding section
contains $e_i$ or $- e_i$ or both for each $i$.

We will not do any examples where $C_{\text{sat}}$ is not a Cartesian product, except for
categorical data analysis with multinomial or product multinomial response,
where we will derive the solution from the solution for the Poisson response
case where $C_{\text{sat}}$ is a Cartesian product.
An application where the convex
support is not a Cartesian product is provided by unconditional aster models
\citep{gws}.  We merely note that all of the theory developed in this technical
report applies to case where $C_{\text{sat}}$ is not a Cartesian product.  Moreover,
most of the computational procedures described below also apply to this case.
Only at the beginning and the end of this process
does the non-Cartesian-product case present
additional issues.  At the beginning we need to determine
a set $V_{\text{sat}}$ that generates $T_{C_\text{sat}}(y)$, and this may
be more difficult than in the Cartesian product case.
At the end (Section~\ref{sec:completion} below), we need to determine
the convex support $C_{\text{sat}} \cap H_{\text{sat}}$ of
the limiting conditional model and compute
the MLE in this model, and this may also be more difficult than in the
Cartesian product case.

\subsection{Calculating the Linearity} \label{sec:linearity}

Next we determine the \emph{linearity} of $V_{\text{sub}}$
\begin{equation} \label{eq:linearity}
   L_{\text{sub}}
   =
   \set{ w \in V_{\text{sub}} : - w \in \con(\pos V_{\text{sub}}) }.
\end{equation}
This sounds like a complicated operation, and it is, but the \texttt{rcdd}
package has a function \texttt{linearity} that does it by repeated linear
programming.

Having found the linearity, we have solved the problem of when the MLE
exists in the original family.
It exists if and only if $L_{\text{sub}} = V_{\text{sub}}$.

All functions in the
\texttt{rcdd} package use two forms of arithmetic.  One is the
default computer arithmetic used by all other R functions.
Answers produced using that arithmetic are inexact, so one is uncertain
whether the $L_{\text{sub}}$ produced is actually correct.
The other form of arithmetic is exact, infinite-precision, rational arithmetic.
Answers produced using that arithmetic are exact, so one is certain
that the $L_{\text{sub}}$ produced is actually correct, but only
if the vectors in $V_{\text{sub}}$ are also produced exactly using
either integer arithmetic or rational arithmetic.

For readers curious about how the \texttt{linearity} function works
we give the following description.  Others should skip the rest of
this section.  According to comments in the source code
(starting at line 3062 of the file \texttt{cddlp.c} of the source
code for the \texttt{rcdd} package, version~1.1-1, which comes from
the \texttt{cddlib} library, version~0.94f, written by K. Fukuda)
for each $w \in V_{\text{sub}}$ it solves the linear programming problem
\begin{equation} \label{eq:linprog}
\begin{split}
   & \text{maximize} \\
   & \qquad \inner{w, \delta} \\
   & \text{subject to} \\
   & \qquad
   \begin{aligned}
      % \inner{w, \delta} & \ge - 1 \\
      \inner{v, \delta} & \ge 0,
      & & \qquad v \in V_{\text{sub}} \setminus \{ w \}
  \end{aligned}
\end{split}
\end{equation}
where $\delta$ is the state vector of the linear programming problem.
\begin{theorem} \label{th:linprog}
A vector $w$ is in the linearity \eqref{eq:linearity}
if and only if the optimal value of the linear program \eqref{eq:linprog}
is nonpositive.
\end{theorem}
It may be the case that some $w \in V_{\text{sub}}$ are known a priori
to be in $L_{\text{sub}}$.  This can be specified in the input to the
\texttt{linearity} function, so the work verifying this need not be done.

\subsection{Calculating Generic Directions of Recession} \label{sec:gdor}

If $L_{\text{sub}} \neq V_{\text{sub}}$, then $T_{C_{\text{sub}}}(M^T y)$
is not a vector subspace, hence there exists
a generic direction of recession.  By Theorem~\ref{th:generic}, $\delta$
is a GDOR if and only if
\begin{subequations}
\begin{align}
   \inner{w, \delta} = 0, & \qquad w \in L_{\text{\upshape sub}}
   \label{eq:linear}
   \\
   \inner{w, \delta} < 0, & \qquad w \in V_{\text{\upshape sub}} \setminus L_{\text{\upshape sub}}
   \label{eq:nonlinear}
\end{align}
\end{subequations}
Hence we can find one such $\delta$
by solving the following linear programming problem
\begin{align*}
   & \text{maximize} \\
   & \qquad \epsilon \\
   & \text{subject to} \\
   & \qquad
   \begin{aligned}
      \epsilon  & \le 1 \\
      \inner{v, \delta} & = 0, & & \qquad v \in L_{\text{sub}} \\
      \inner{v, \delta} & \le - \epsilon,
      & & \qquad v \in V_{\text{sub}} \setminus L_{\text{sub}}
  \end{aligned}
\end{align*}
where $\delta$ is a $p$-vector, $\epsilon$ is a scalar, and
$(\delta, \epsilon)$ is the state vector of the linear programming problem
(so the dimension is $p + 1$).  The $\delta$ part of the solution
is a generic direction of recession.  The $\epsilon$ part does not matter.

The idea for using this particular linear program came from the documentation
for the \verb@dd_ExistsRestrictedFace2@ function in the \texttt{cddlib} library,
which is the computational geometry library (written by K. Fukuda)
to which \texttt{rcdd} provides an incomplete interface.
The \texttt{rcdd} package does not provide an interface to this \texttt{cddlib}
function, but it does provide a function \texttt{lpcdd} that does linear
programming and can be used to solve this linear program.

\subsection{Calculating Maximum Likelihood Estimates} \label{sec:mle}

\subsubsection{In the Original Family} \label{sec:original}

There is little to be said about calculating the MLE in the original family.
When we have found that $L_{\text{sub}} = V_{\text{sub}}$, then we know the MLE
exists and can use available software to find it.  We will use the R function
\texttt{glm} for our examples.

There is one issue worth mentioning.  If the model is non-identifiable,
so the MLE is non-unique, the R function \texttt{glm} is smart enough to
drop enough predictors to produce an identifiable model.  However, its
method of doing so is not guaranteed because of inexactness of the default
computer arithmetic.

We can use the \texttt{redundant} function in the \texttt{rcdd}
package applied to the columns of $M$ to reduce to a linearly independent
set.  If $M$ was calculated using integer or rational arithmetic, and
\texttt{redundant} uses rational arithmetic, then this operation will
be exact.

\subsubsection{In the Completion} \label{sec:completion}

When we have found that $L_{\text{sub}} \neq V_{\text{sub}}$ and have found
a generic direction of recession $\delta$, we still need to characterize
the support of the limiting conditional family.  We can characterize this
two ways using either of
\begin{align*}
   H_{\text{sub}} & = \set{ w \in \real^q : \inner{w - M^T y, \delta} = 0 }
   \\
   H_{\text{sat}} & = \set{ w \in \real^p : \inner{w - y, M \delta} = 0 }
\end{align*}
where $p$ and $q$ are the dimensions of the saturated model
and affine submodel, respectively.  Then the limiting conditional model
conditions on the event $M^T Y \in H_{\text{sub}}$ or $Y \in H_{\text{sat}}$,
which is the same event characterized two different ways, the latter usually
simpler.
\begin{theorem} \label{th:support}
In the setup of Sections~\ref{sec:linearity} through~\ref{sec:mle},
define
$$
   L_{\text{\upshape sat}} = \set{ v \in V_{\text{\upshape sat}} : M^T v \in L_{\text{\upshape sub}} }.
$$
Then the support of the limiting conditional family is
\begin{equation} \label{eq:support-sat}
   C_{\text{\upshape sat}} \cap H_{\text{\upshape sat}}
   =
   C_{\text{\upshape sat}} \cap (y + \spam L_{\text{\upshape sat}})
\end{equation}
when referred to the saturated model.
\end{theorem}

In the case where the convex support of the saturated model is a Cartesian
product, the support of the limiting model simply constrains $Y_i = y_i$
for $i$ such that $e_i \notin L_{\text{sat}}$, that is, the $i$-th component of
the response is unconstrained if $i \in L_{\text{sat}}$ and is constrained
to be equal to its observed value if $i \notin L_{\text{sat}}$.

This finishes our analysis of maximum likelihood estimation in the
Barndorff-Nielsen completion.  At least in the Cartesian product case,
the maximum likelihood problem in the completion is of the same form
as the original problem.  The only difference is that we constrain
certain components of the response vector to their observed values.
This can be achieved by removing those components from the response vector
and proceeding as if the resulting subvector were the entire response vector.
If, for example, we are using the R function \texttt{glm} to fit models,
we merely delete certain elements of the response vector and the corresponding
rows of the model matrix (or the data frame containing the data if we are
using a formula to specify the model) and proceed normally.  This conditional
model (with some components deleted) will always be non-identifiable,
because $\delta$ will always be a direction of constancy and there may be
other directions of constancy.  The \texttt{glm} function, however, can
deal with this issue.  Furthermore, even in the rare case when
the \texttt{glm} function may be confused, we can find a full rank
model matrix having
the same column space as the original model matrix
using the function \texttt{redundant}
in the \texttt{rcdd} package, as described in the preceding section.

\subsection{Phase I and Phase II}

\citet{thesis} coined the term ``phase~I maximum likelihood problem'' to
refer to the process of determining
whether the likelihood function has any local or global maxima and if not
what to do about it.  This term was also used by \citet{gt}.
It was coined by analogy with the phase~I linear
programming problem, which is to find a feasible point, if any exists, or
determine that none exist.  If one exists, then this is used
to start the phase~II problem, finding optimal values.

Little, if anything is known about the phase~I maximum likelihood problem
except in one special case: exponential families, where it is completely
understood theoretically.  As we have seen, the phase~I problem for full
exponential families satisfying the condition of \citet{brown} consists
entirely in determining the set $L_{\text{sat}}$ and a GDOR $\delta$,
and this is done by repeated
linear programming using one call to the function \texttt{linearity} in the
\texttt{rcdd} package and one call to the function \texttt{lpcdd} in the
same package.  The phase~I algorithm can be done using exact infinite-precision
rational arithmetic, in which case the result is exact.

The rest of the maximum likelihood problem, can be called the phase~II problem:
finding the MLE\@.  In Section~\ref{sec:original} we saw that,
when the phase~I problem has established $L_{\text{sat}} = V_{\text{sat}}$,
the MLE exists in the usual sense and is found using the usual algorithm.
In Section~\ref{sec:completion} we saw that,
when the phase~I problem
has established $L_{\text{sat}} \neq V_{\text{sat}}$,
the MLE exists in the Barndorff-Nielsen completion and is found using
the usual algorithm applied to modified data, the modification being determined
by $L_{\text{sat}}$.

\subsection{Likelihood Ratio Tests} \label{sec:tests}

Given two nested natural affine submodels, the maximum value of the
log likelihood can be calculated for each submodel
even without solving the phase~I
algorithm.  Available software, such as the R function \texttt{glm}, will
go uphill on the log likelihood until reaching a point where the log likelihood
is nearly flat, in which case the value of the log likelihood is nearly the
maximum.  If the MLE does not exist in the conventional sense,
then the natural parameter estimates will be large
but not infinite, and the \texttt{glm} function may or may not give a warning
about lack of convergence.  If the MLE does not exist in the conventional
sense, then the natural parameter estimates are infinitely wrong, but the value of the
maximized log likelihood is nearly correct.  Thus we can correctly calculate
the likelihood ratio test statistic without solving the phase~I problem.

This does no good, however, because the usual asymptotics of the likelihood
ratio test (Wilks' theorem) do not hold in the case where the MLE for the null
model does not exist in the conventional sense.
In this case, the following simple correction,
suggested by S. Fienberg (personal communication) seems reasonable.
Solve the phase~I problem for the null model,
determining $L_{\text{sat}} \neq V_{\text{sat}}$.
Let $M_0$ and $M_1$ be the model matrices for the null and alternative
natural affine submodels ($M_0$ was used in the phase~I calculation).

If we apply Wilks' theorem to the limiting conditional model for the null
hypothesis, we obtain the result that the deviance (twice the log likelihood
ratio) is approximately chi-squared with degrees of freedom which is the
difference in dimension of $M_1^T (\spam L_{\text{sat}})$
and of $M_0^T (\spam L_{\text{sat}})$.  Assuming that $M_0$, $M_1$,
and $L_{\text{sat}}$ were
determined exactly using rational arithmetic, the degrees of freedom can
be determined exactly by applying the \texttt{redundant} function in the
\texttt{rcdd} package to the sets $\set{ M_i^T  w : w \in L_{\text{sat}} }$,
$i = 0, 1$.

This asymptotic approximation may or may not hold depending on the sample
size and on how close the observed value of the natural statistic is to the
boundary of the convex support of the limiting conditional model.
By construction, it cannot be on the boundary, but if it is close the
asymptotic approximation can be bad.  If one is worried about the validity
of the asymptotic approximation, one can always do a parametric bootstrap
calculation based on the limiting conditional model for the null hypothesis.

\subsection{Confidence Intervals} \label{sec:confidence}

Confidence intervals are more complicated than hypothesis tests.
Confidence intervals for both natural and mean value parameters are
of interest.  The R function \texttt{predict.glm} provides either, depending
on the value of its \texttt{type} argument.  We will also provide either.

Before getting into details, we should first note that confidence intervals
are often inappropriate from a theoretical point of view.
When there is not a single scalar parameter
of interest, a confidence region for the vector parameter of interest should,
theoretically,
be provided.  However, high-dimensional confidence regions are unvisualizable,
hence uninterpretable and of no interest to users.  Thus statisticians usually
provide something users think they can interpret, which is multiple confidence
intervals, often not adjusted for simultaneous coverage.  That is what,
for example, \texttt{predict.glm} provides.  We will follow the usual practice,
providing multiple confidence intervals in our examples and restricting our
treatment of confidence regions to a few comments.

In the case where the MLE does not exist in the conventional sense
but is found in the Barndorff-Nielsen completion, we have two natural parameter
spaces in play, the natural parameter space $\Theta$ of the original natural
affine submodel and the natural parameter space $\Theta_{\text{lim}}$ of
the full family containing the limiting conditional model, which by assumption
is given by \eqref{eq:larger}.
The MLE is in the latter, but confidence intervals or confidence regions must
be in the former.  Thus we need to fully understand the relationship between
the two.  Both are subsets of $\real^p$ and considered as such
$\Theta \subset \Theta_{\text{lim}}$, but we should not consider them as such
because a point $\theta$ in both sets corresponds to different distributions
in the two models.  Let $\delta$ be a GDOR, and let $\Gamma_{\text{lim}}$
denote the constancy space of the limiting conditional model.
Then we know $\delta \in \Gamma_{\text{lim}} \subset \Theta_{\text{lim}}$.
We also know that in the limiting conditional model the parameter is not
identifiable: for every $\gamma \in \Gamma_{\text{lim}}$ the parameter
values $\theta$ and $\theta + \gamma$ correspond to the same distribution.
Thus distributions do not correspond to parameter points $\theta$ but to
equivalence classes of points
$$
   \theta + \Gamma_{\text{lim}}
   = \set{ \theta + \gamma : \gamma \in \Gamma_{\lim} }.
$$
Of course, the same issue applies to the original model.  If its constancy
space is $\Gamma$, then equivalence classes $\theta + \Gamma$ correspond
to distributions in the original model.
% However, we can, and usually do,
% arrange so that there are no directions of constancy for the original model,
% that is, $\Gamma = \{ 0 \}$.
Since $\Gamma_{\text{lim}}$ contains $\delta$, it is
a nontrivial subspace.  We cannot reparametrize to make the limiting conditional
model identifiable without losing the connection between the two models.

Points $\theta$ and $\theta + \gamma$ with $\gamma \in \Gamma_{\text{lim}}$
correspond to the same distribution in the limiting conditional model but
may correspond to different distributions in the original model (do correspond
to different distributions unless $\gamma \in \Gamma$).
The relationship between the two models is that the distributions in
the original model corresponding to $\theta + s \delta$
and $\theta + \gamma + s \delta$ converge as $s \to \infty$ to the
(single) distribution in the limiting conditional model corresponding to
both parameter values $\theta$ and $\theta + \gamma$.

\subsubsection{In the Limiting Conditional Model}

In the limiting conditional model we have the ``usual asymptotics'' of maximum
likelihood.  The MLE $\hat{\theta}$ is asymptotically normal with variance
inverse Fisher information, or would be except for two issues: the Fisher
information matrix is singular with null space $\Gamma_{\text{lim}}$ and
we may not believe any distribution in the limiting conditional model is
correct, so a confidence region in  the limiting conditional model may
be nonsense.  Nevertheless, such confidence regions may be useful as a tool
for constructing confidence regions in the original model that do make sense.
Moreover, the non-identifiability issue in the limiting conditional model
can be dealt with either by using a pseudo-inverse for Fisher information or
by constraining $\hat{\theta}$ to lie in a subspace of $\real^p$ such that
the limiting conditional model is identifiable when this constraint is imposed.
We usually take the latter approach, since the R function \texttt{glm}
does this automatically, dropping parameters to obtain identifiability.

Suppose we have such a confidence region $R_{\text{lim}}$ for $\theta$ in
the limiting conditional model.  Because we used constraints, we do
not automatically get $R_{\text{lim}} \supset \Gamma_{\text{lim}}$.
Hence, if we wish to relate this confidence region to the original model,
which we do, then we need to consider
$R_{\text{lim}} + \Gamma_{\text{lim}} = \set{ \theta + \gamma :
\theta \in R_{\text{lim}}, \ \gamma \in \Gamma_{\text{lim}} }$
the ``actual'' confidence region.

\subsubsection{In the Original Model}

Now for each $\theta \in R_{\text{lim}}$
and each $\gamma \in \Gamma_{\text{lim}}$, the distribution in the limiting
conditional model corresponding to $\theta + \gamma$ is the limit
as $s \to \infty$ of the distributions in the original model corresponding
to $\theta + \gamma + s \delta$.  Thus it remains to be decided how large
$s$ may be, that is, we need a confidence interval for $s$, which will
necessarily be one-sided, of the form $(\hat{s}_{\theta + \gamma}, \infty)$.
As our notation suggests, we make one such confidence interval for each
point $\theta + \gamma$ we need to consider.

We base our interval on the statistic
$\inner{Y, \delta}$, the observed value of which $\inner{y, \delta}$ is its
maximum possible value.  Since $\inner{Y, \delta}$ is discrete where it counts
--- the value $\inner{y, \delta}$ is assumed to have positive probability so
that the limiting conditional model described in Section~\ref{sec:limits}
exists --- it is not possible to get an exact confidence interval unless one
uses randomized or fuzzy intervals, described by \citet{fuzzy}.
It follows immediately from the theory of uniformly most powerful tests
\citep{lehmann-tsh} and the definitions in \citet{fuzzy},
that the exact $1 - \alpha$ fuzzy confidence interval in this case
has membership function
\begin{equation} \label{eq:fuzzy}
   I(s) = \max[ 0, 1 - \alpha /
   \myPr_{\theta + \gamma + s \delta}(Y \in H) ],
\end{equation}
which has a direct interpretation as a confidence statement: $I(s)$ gives
the degree to which $s$ should be considered to be in the confidence interval
\citep{fuzzy}.  Also \eqref{eq:fuzzy} can be used to construct randomized
confidence intervals; if $U$ is a $\text{Uniform}(0, 1)$ random variate
independent of the data, then
$$
   \vphantom{I(s)}^{U}I(s) = \set{ s : I(s) \ge U }
$$
is an exact $1 - \alpha$ randomized confidence interval
\citet[Section~2.1]{fuzzy}.  The main point of this technical report
is to advocate
the use of generic directions of recession to calculate maximum likelihood
estimates in the Barndorff-Nielsen completion and related hypothesis tests
and confidence intervals.  Advocacy of fuzzy confidence intervals is not
our purpose (see \citealp{fuzzy}, for that).  However, if one wants an
exact procedure, one must use the fuzzy confidence interval, so we have
mentioned it.

For those that want a conventional confidence interval, the \emph{support}
of the fuzzy interval
\begin{equation} \label{eq:support}
   (\hat{s}_{\theta + \gamma}, \infty) = \supp I = \set{ s : I(s) > 0 }
\end{equation}
\citet[Section~1.3]{fuzzy} is a conservative $1 - \alpha$ confidence interval
for $s$.  Clearly,
$$
   \hat{s}_{\theta + \gamma}
   =
   \inf \set{ s \in \real : \theta + \gamma + s \delta \in \Theta
   \opand \myPr_{\theta + \gamma + s \delta}(Y \in H) > \alpha }.
$$
Since $s \mapsto \myPr_{\theta + \gamma + s \delta}(Y \in H)$ is continuous
and strictly increasing by Theorem~\ref{th:limits},
usually $\hat{s}_{\theta + \gamma}$ is the unique $s$ such that
$\myPr_{\theta + \gamma + s \delta}(Y \in H) = \alpha$.
Only when the data
have very little information about the parameter would it be the case that
$\myPr_{\theta + \gamma + s \delta}(Y \in H) > \alpha$ for all $s$ in
the allowed range, in which case $\hat{s}_{\theta + \gamma}$ would be
the lower endpoint of this range.

We should remark that there is a simple argument leading directly to
these conventional confidence intervals \eqref{eq:support} without going
through fuzzy confidence intervals.  The conventional
conservative $P$-value for the upper-tailed test
having test statistic $\inner{Y, \delta}$,
null hypothesis $\theta + \gamma + s \delta$,
and observed data $y$ is
\begin{equation} \label{eq:pval}
   \myPr_{\theta + \gamma + s \delta}(\inner{Y - y, \delta} \ge 0).
\end{equation}
A conventional level $\alpha$ test rejects when \eqref{eq:pval} is less than
or equal to $\alpha$.  A conservative $1 - \alpha$ confidence interval consists
of the set of $s$ values that are not rejected at level $\alpha$.  In the
only case of interest to us, where $\inner{y, \delta}$ is the largest possible
value of $\inner{Y, \delta}$ so the event $\inner{Y - y, \delta} \ge 0$ is
the same as $Y \in H$ up to a set of measure zero, this comes to the same interval
as \eqref{eq:support}.

\subsubsection{A Combination of the Two}

Because all of our one-sided confidence intervals for $s$, one for each
parameter point $\theta + \gamma$, $\theta \in R_{\text{lim}}$,
$\gamma \in \Gamma_{\text{lim}}$, use the same
test statistic $\inner{Y, \delta}$, it follows that they have simultaneous
coverage probability at least $1 - \alpha$.  Thus under the assumption that
$R_{\text{lim}}$ was a $1 - \alpha^*$ confidence region in the conditional
limiting model, we see that
$$
   \set{ \theta + \gamma + s \delta : \theta \in R_{\text{lim}},
   \ \gamma \in \Gamma_{\text{lim}}, \ s \ge \hat{s}_{\theta + \gamma} }
$$
is a $1 - \alpha - \alpha^*$ confidence region for the natural parameter
of the original model.

However, we are rarely interested in providing a confidence region.
Thus we will instead consider
$$
   \set{ \hat{\theta} + \gamma + s \delta :
   \ \gamma \in \Gamma_{\text{lim}}, \ s \ge \hat{s}_{\hat{\theta} + \gamma} }
$$
to be a $1 - \alpha$ confidence region that captures the part
of the uncertainty relating to ``how close to infinity'' the natural parameter
may be.  This must be combined with $R_{\text{lim}}$ or with non-simultaneous
confidence intervals based on the limiting conditional model.  Simultaneous
coverage will not be achieved unless Bonferroni or other correction is applied.

\subsubsection{The Cartesian Product Case}

In the case where the convex support of the saturated model is a Cartesian
product, all of this simplifies somewhat.   We know that a confidence region
for the mean value parameters of the limiting conditional model only involves
the response variables that are not constrained to be at their observed values
in the limiting conditional model.  We take such a confidence region or
separate confidence intervals, such as those provided by the R function
\texttt{predict.glm} applied to the limiting conditional model, to be adequate
for describing those components of the response.

Our one-sided intervals come into play in computing one-sided confidence
intervals for the mean value parameters of the other components of the
response.  Since the MLE of their mean value parameters are on the boundary,
one-sided intervals are the only kind that make sense.  We distinguish two
cases.

The first case, is where $\Gamma_{\text{lim}} = \set{ s \delta : s \in \real }$.
In this case, all intervals $(\hat{s}_{\theta + \gamma}, \infty)$ for the
same $\theta$ but different $\gamma \in \Gamma_{\text{lim}}$ correspond
to the same natural parameter values.  Hence we can ignore $\gamma$.  We
can hope that one interval $(\hat{s}_{\hat{\theta}}, \infty)$ adequately
describes the variability in mean value parameters for components of the
response having MLE at the boundary.

The second case, is where $\Gamma_{\text{lim}}$ contains vectors
not proportional to $\delta$.  Then we use our general formula, but can
still hope that the intervals $(\hat{s}_{\hat{\theta} + \gamma}, \infty)$,
$\gamma \in \Gamma_{\text{lim}}$ adequately
describe the variability in mean value parameters for components of the
response having MLE at the boundary.

In both cases we only use the intervals corresponding to $\hat{\theta}$
rather than all $\theta \in R_{\text{lim}}$.
This is clearly not exactly correct, however, we do have the following
argument.  The idea is that we are separating variation into two components,
one along the direction of recession and the other across the direction of
recession.  To a first approximation, our one-sided intervals are about
variation along the direction of recession and conventional intervals for
the limiting conditional model are about variation across the direction of
recession.  In the spirit of sloppiness that allows us to provide separate
confidence intervals rather than confidence regions, we consider this division
not too bad.  From the discussion about the distributions
for $\theta + s \delta$ and $\theta + \gamma + s \delta$ converging to the
same limiting conditional distribution, we see that this division cannot
be exactly correct, but it may do for practical purposes.

In any event, statisticians have made do up to now with no tools whatsoever
for handling this issue.
Approximately correct tools will be a great improvement.

\subsubsection{Calculating the Constancy Space} \label{sec:gamma}

By Theorems~\ref{th:constancy} and~\ref{th:exist} the constancy space
is $N_C(y)$ in the case where every direction of recession is a direction
of constancy.  By Corollary~\ref{cor:generic-limit} the tangent space
$T_{C \cap H}(y)$ in the limiting conditional model is $\spam L$.
Hence the constancy space is
$$
   N_{C \cap H}(y)
   =
   \set{ \delta \in \real^p : \inner{v, \delta} = 0, \ v \in L }.
$$
In the case of a natural affine submodel this becomes
\begin{equation} \label{eq:gamma}
   \Gamma_{\text{lim}}
   =
   N_{C_{\text{sub}} \cap H_{\text{sub}}}(M^T y)
   =
   \set{ \delta \in \real^q :
   \inner{v, \delta} = 0, \ v \in L_{\text{sub}} }.
\end{equation}
where $q$ is the dimension of the submodel.

We have already seen in Section~\ref{sec:linearity} how to calculate
$L_{\text{sub}}$.  Now we merely note that $L_{\text{sub}}$ is
a V-representation of its span, and a call to the function \texttt{scdd}
in the \texttt{rcdd} package will compute an H-representation of its span
which is also a V-representation for \eqref{eq:gamma}, that is, a basis
for the constancy space of the limiting conditional model.  The examples
that follow do not exhibit a need for this operation, but the need would
arise in other examples.

When calculating \eqref{eq:gamma} for the purpose of generating one-sided
confidence intervals it is clear that we do not need to let $\gamma$ range
over the whole constancy space \eqref{eq:gamma} because points $\gamma$
and $\gamma + s \delta$ lead to the same one-sided confidence intervals.
Hence it is enough to use the subspace of $\Gamma_{\text{lim}}$ orthogonal
to $\delta$, which is calculated by feeding $L_{\text{sub}} \cup \{ \delta \}$
to the R function \texttt{scdd} for conversion to an H-representation,
which will also be a basis of the desired subspace.

\section{Examples} \label{sec:example}

\subsection{A Logistic Regression Example} \label{sec:logit}

We start with a logistic regression.
Suppose we observe a vector $y$ whose
components are Bernoulli with means forming a vector $p$.  The natural
parameter is $\theta = \logit(p)$, where logit operates componentwise
$\theta_i = \logit(p_i)$.  Suppose we also have one covariate vector $x$
and we want to fit a quadratic model
$$
   \theta_i = \beta_1 + \beta_2 x_i + \beta_3 x_i^2.
$$
Finally, suppose $x_i$ takes the values 1, $\ldots$, 30 and $y_i = 0$
for $x_i \le 12$ or $x_i \ge 24$ and $y_i = 1$ otherwise.

The following R statements create the data and attempt to fit the model.
<<logit-data>>=
x <- 1:30
y <- c(rep(0, 12), rep(1, 11), rep(0, 7))
out1 <- suppressWarnings(glm(y ~ x + I(x^2), family = binomial, x = TRUE))
@
It seems difficult if not impossible to capture warning messages to an
Sweave file, so we have suppressed them.  The warnings given by R version 2.7.0
are
\begin{verbatim}
  algorithm did not converge
  fitted probabilities numerically 0 or 1 occurred
\end{verbatim}
(two separate warnings).  This is the \texttt{glm} function's way of indicating
that the MLE may not exist.  However, because we gave the argument
\verb@x = TRUE@ we have obtained the model matrix
<<logit-modmat>>=
M <- out1$x
@

Now we are ready to carry out the computation of Section~\ref{sec:linearity},
determining $L_{\text{sub}}$.
<<logit-linear>>=
tanv <- M
tanv[y == 1, ] <- (- tanv[y == 1, ])
vrep <- cbind(0, 0, tanv)
lout <- linearity(vrep, rep = "V")
lout
@
The rows of the matrix \texttt{tanv} are the elements of $V_{\text{sub}}$.
The matrix \texttt{vrep} is the form in which the \texttt{rcdd} package
encodes the V-representation
of $\con(\pos V_{\text{sub}}) = T_{C_{\text{sub}}}(M^Ty)$.
The vector \texttt{lout} gives the indices of the elements of $V_{\text{sub}}$
that are in $L_{\text{sub}}$.  It having length zero indicates that
$L_{\text{sub}} = \varnothing$, and this in turn indicates
by Corollary~\ref{cor:generic-limit} that the
support of the limiting conditional model is $C_{\text{sat}} \cap H_{\text{sat}} = \{ y \}$.
Hence there is only one distribution in the limiting conditional model,
which is the distribution concentrated at $y$, and this trivially must be
the MLE in the limiting conditional model.  This also implies that
every direction is a direction of constancy in the limiting conditional model,
that is $\Gamma_{\text{lim}} = \real^3$.

Now we are ready to carry out the computation of Section~\ref{sec:gdor},
calculating $\delta$.
<<logit-gdor>>=
hrep <- cbind(- vrep, - 1)
hrep <- rbind(hrep, c(0, 1, rep(0, 3), - 1))
objv <- c(rep(0, 3), 1)
pout <- lpcdd(hrep, objv, minimize = FALSE)
names(pout)
pout$solution.type
@
The list \texttt{pout} is the solution of the linear programming problem
described in Section~\ref{sec:gdor}.
We obtain the GDOR and check its validity
<<logit-gdor-too>>=
gdor <- pout$primal.solution[- length(pout$primal.solution)]
all(tanv %*% gdor < 0)
gdor
@
All of the computation to this point has used ordinary inexact computer
arithmetic.  We also illustrate exact infinite-precision rational arithmetic
computations
<<logit-gdor-three>>=
pout.exact <- lpcdd(d2q(hrep), d2q(objv), minimize = FALSE)
gdor.exact <- pout.exact$primal.solution[- length(pout$primal.solution)]
gdor.exact
q2d(gdor.exact)
@

We are now ready to find one-sided confidence intervals.  We use the
R function \texttt{uniroot} to solve equations involved in this process.
We start by defining a function that evaluates $\myPr_{\beta + s \delta}(Y \in H)$
for any $\beta$ and $\delta$.
<<logit-func>>=
invlogit <- function(x) 1 / (1 + exp(- x))
prob.face <- function(beta, s) {
    moo <- M %*% (beta + s * gdor)
    moo <- as.vector(moo)
    prod(ifelse(y == 1, invlogit(moo), invlogit(- moo)))
}
@
Now we try it out on the point $\beta = 0$.
<<logit-conf-one>>=
alpha <- 0.05
beta.hat <- rep(0, length(gdor))
foo <- function(s) prob.face(beta.hat, s) - alpha
fred <- uniroot(foo, lower = -10, upper = 10)
lowbnd <- fred$root
c(lowbnd, Inf)
fred$estim.prec
@

The meaning of this confidence interval is not obvious just from looking
at the numbers.  Hence we make a plot showing the mean values that correspond
to $s \delta$ for $s$ in the interval.
<<label=fig1plot,include=FALSE>>=
eta.gdor <- as.vector(M %*% gdor)
par(mar = c(5, 4, 1, 1) + 0.1)
plot(x, invlogit(lowbnd * eta.gdor), ylim = c(0, 1), ylab = "mean of y")
points(x, y)
segments(x, y, x, invlogit(lowbnd * eta.gdor))
@
\begin{figure}
\begin{center}
<<label=fig1,fig=TRUE,echo=FALSE>>=
<<fig1plot>>
@
\end{center}
\caption{Quick and Dirty 95\% Confidence Intervals for Regression Function.
Compare with Figure~\ref{fig:two}.}
\label{fig:one}
\end{figure}
The confidence intervals shown in Figure~\ref{fig:one} are certainly better
than what was previously available, which was nothing.
Figure~\ref{fig:one} clearly shows that some mean values are much better
estimated than others, and gives a rough idea of the variability.  However,
Figure~\ref{fig:one} is not the Right Thing, which was explained in
Section~\ref{sec:confidence}.  We now proceed to an approximation of that.

We should for each $\gamma \in \real^3$ find the $\hat{s}_\gamma$ such
that $\myPr_{\gamma + \hat{s}_\gamma \delta}(Y \in H) = \alpha$.  Then we should
form the confidence region
$\set{ \gamma + s \delta : \gamma \in \real^3, \ s \ge \hat{s}_\gamma }$.
Finally, we should find the set of mean value parameters corresponding to this
confidence region.  Clearly, we cannot do that, as it would entail an infinite
amount of work.  We can hope that we can make do with just a few $\gamma$
values.  Let us try that.
<<logit-conf-two>>=
set.seed(42)
nboot <- 100
scale <- 0.25
eta.up <- rep(- Inf, length(y))
eta.dn <- rep(Inf, length(y))
for (iboot in 1:nboot) {
    beta.hat <- rnorm(length(gdor)) * scale
    foo <- function(s) prob.face(beta.hat, s) - alpha
    fred <- uniroot(foo, lower = -20, upper = 500)
    lowbnd <- fred$root
    eta.hat <- as.vector(M %*% (beta.hat + lowbnd * gdor))
    eta.up <- pmax(eta.up, eta.hat)
    eta.dn <- pmin(eta.dn, eta.hat)
}
mu.up <- ifelse(y == 1, 1, invlogit(eta.up))
mu.dn <- ifelse(y == 0, 0, invlogit(eta.dn))
@
Then we make a plot showing these confidence intervals.
<<label=fig2plot,include=FALSE>>=
plot(x, mu.up, ylim = c(0, 1), ylab = "mean of y")
points(x, mu.dn)
segments(x, mu.up, x, mu.dn)
@
\begin{figure}
\begin{center}
<<label=fig2,fig=TRUE,echo=FALSE>>=
<<fig2plot>>
@
\end{center}
\caption{Simultaneous 95\% Confidence Intervals for Regression Function.}
\label{fig:two}
\end{figure}
We claim (hope) that these confidence intervals do give good simultaneous
coverage.  Our methods are not ideal in that we probably should not use the
\texttt{uniroot} function, which requires as input an interval bounding the
solution and does not use the fact that the function is strictly increasing.
Some method designed explicitly for strictly increasing functions would
be better.  If such a method needs first or second derivatives, these can
be calculated explicitly.  We do not give the formulas because we do not
propose a particular method.

Our choice of random starting points is also
perhaps not ideal for two reasons.  The constant \Sexpr{scale} in
the algorithm is
obviously arbitrary.  Increasing it requires increasing the width of the
interval given to the \texttt{uniroot} function.  A better function for
solving equations would give more flexibility in choosing this constant.
The other reason our choice is less than ideal is that we could use
the computation discussed in Section~\ref{sec:gamma} to let our points
\texttt{beta.hat} vary over the two-dimensional subspace perpendicular
to $\delta$, but we have not bothered to do this, since it is a mere
optimization that may be more efficient but does not change the result
calculated.

\subsection{A Contingency Table Example} \label{sec:contab}

This example is a $2 \times 2 \times \cdots \times 2$ contingency
table with seven dimensions hence $2^7 = 128$ cells.  The data are
<<contab-data>>=
dat <- read.table(url("http://www.stat.umn.edu/geyer/gdor/catrec.txt"),
    header = TRUE)
# dat <- read.table("catrec.txt", header = TRUE)
dim(dat)
names(dat)
@
which presents the data as eight vectors, seven categorical predictors
$v_1$, $\ldots$, $v_7$ that specify the cells of the contingency table
and one response $y$ that gives the cell counts.

\subsubsection{Poisson Sampling}

We start by fitting two models assuming Poisson sampling,
one with all two-way interactions and no
higher interactions and one with all three-way interactions and no higher
interactions, and attempt to compare them using a test of model comparison.
<<contab-initial>>=
out2 <- glm(y ~ (v1 + v2 + v3 + v4 + v5 + v6 + v7)^2, family = poisson,
    data = dat, x = TRUE)
out3 <- glm(y ~ (v1 + v2 + v3 + v4 + v5 + v6 + v7)^3, family = poisson,
    data = dat, x = TRUE)
anova(out2, out3, test = "Chisq")
@
Unlike the logistic regression example, the \texttt{glm} function gives
no warning here about lack of convergence or nonexistence of the MLE\@.
However, as we shall see, the MLE does not exist in the conventional sense
in the larger model \texttt{out3}.

First we determine the linearity for model \texttt{out2}.
<<contab-lin2>>=
tanv <- out2$x
vrep <- cbind(0, 0, tanv)
vrep[dat$y > 0, 1] <- 1
lout <- linearity(d2q(vrep), rep = "V")
linear <- dat$y > 0
linear[lout] <- TRUE
all(linear)
@
Thus the MLE does exist in the conventional sense for this model, and
the \texttt{glm} function presumably finds it.

Second we determine the linearity for model \texttt{out3}.
<<contab-lin3>>=
tanv <- out3$x
vrep <- cbind(0, 0, tanv)
vrep[dat$y > 0, 1] <- 1
lout <- linearity(d2q(vrep), rep = "V")
linear <- dat$y > 0
linear[lout] <- TRUE
all(linear)
@
Thus the MLE does not exist in the conventional sense for this model, and
the \texttt{glm} function has produced nonsense with no error or warning.

Since the MLE does exist for the null hypothesis, the test done by
the \texttt{anova} function above is correct (Section~\ref{sec:tests}).
Thus we have a problem for which available software provides no solution.
The model \texttt{out2} that we can fit with available software clearly
does not fit the data, but the model \texttt{out3} that appears to fit
the data we cannot fit with available software.  Hence the proposals of
this technical report!

Next we determine a GDOR.
<<contab-gdor>>=
hrep <- cbind(0, 0, - tanv, 0)
hrep[! linear, ncol(hrep)] <- (- 1)
hrep[linear, 1] <- 1
hrep <- rbind(hrep, c(0, 1, rep(0, ncol(out3$x)), - 1))
objv <- c(rep(0, ncol(out3$x)), 1)
pout <- lpcdd(d2q(hrep), d2q(objv), minimize = FALSE)
gdor <- pout$primal.solution[- length(pout$primal.solution)]
@
and do some checks to try to understand what we have found
<<contab-gdor-check>>=
foo <- gdor
names(foo) <- names(out3$coef)
print(cbind(foo[foo != "0"]))

eta.gdor <- as.vector(qmatmult(tanv, cbind(gdor)))

all(qsign(eta.gdor) <= 0)
all(qsign(eta.gdor[linear]) == 0)
all(qsign(eta.gdor[! linear]) < 0)
@
First we print out the nonzero components of the GDOR, not that this tells
us much.  Then we check that the GDOR actually satisfies the conditions
\eqref{eq:linear} and \eqref{eq:nonlinear}.

Then we figure out the convex support of the limiting conditional family.
Which of the cells that have observed count zero are fixed at zero in
the limiting conditional family?
<<contab-support>>=
sum(! linear)
sum(dat$y == 0)
all(dat$y == 0 | linear)
@
we see that of the \Sexpr{sum(dat$y == 0)} cells that have zero count in
the observed data \Sexpr{sum(! linear)} are conditioned to be
zero in the limiting conditional model.

Our next task is to fit the limiting conditional model.
<<contab-fit-limit>>=
dat.cond <- dat[linear, ]
out3.cond <- glm(y ~ (v1 + v2 + v3 + v4 + v5 + v6 + v7)^3, family = poisson,
    data = dat.cond)
summary(out3.cond)
beta.hat <- coefficients(out3.cond)
beta.hat[is.na(beta.hat)] <- 0
@
We see that, as expected, the model is not identifiable.  However,
the \texttt{glm} function decides which predictor to drop or, equivalently,
which parameter to constrain to zero, which it reports as \texttt{NA}.

Since exactly one parameter need be set to zero, we see that the dimension
of the constancy space of the limiting conditional model is one.  Thus
$\delta$ is a basis for the constancy space $\Gamma_{\text{lim}}$, and
we see that a single one-sided confidence interval will do the job.

Since confidence intervals for mean values for components
of the response vector that are not constrained to be zero in the limiting
conditional model are entirely conventional and calculated by
\texttt{predict.glm} applied to the object \texttt{out3.cond}, we will
omit this step, assuming readers will know how to do it or at least figure
out how to do it from reading the help for
the function \texttt{predict.glm}.

On to one-sided intervals.
<<contab-one-sided>>=
eta.hat <- as.numeric(out3$x %*% beta.hat)
eta.gdor <- q2d(eta.gdor)

prob.face <- function(s) {
    moo <- eta.hat + s * eta.gdor
    exp(- sum(exp(moo[! linear])))
}
foo <- function(s) prob.face(s) - alpha

fred <- uniroot(foo, lower = -5, upper = 5)
lowbnd <- fred$root
c(lowbnd, Inf)
fred$estim.prec
@
This is our one-sided confidence interval for $s$,
which we map to one-sided confidence intervals for the mean values
of cells that are constrained to be zero in the limiting conditional model.
<<contab-mvp>>=
moo <- exp(eta.hat + lowbnd * eta.gdor)
foo <- cbind(dat, moo)
@
Table~\ref{tab:one} shows these intervals.
<<contab-xtable,results=tex>>=
rownames(foo) <- NULL
colnames(foo)[colnames(foo) == "y"] <- "lower"
colnames(foo)[colnames(foo) == "moo"] <- "upper"
library(xtable)
print(xtable(foo[! linear, ], digits = c(rep(0, 9), 4), align = "cccccccccc",
    caption = paste("One-sided 95\\% Confidence Intervals for Mean Values.",
    "Columns v1 to v7 give the cell.",
    "Lower and Upper give end points of the interval.",
    "Based on Poisson sampling, compare Table~\\ref{tab:two}."),
     label = "tab:one"), caption.placement = "top", table.placement = "tbp",
     include.rownames = FALSE)
@

We would now like to illustrate a hypothesis test in which the MLE in the
null hypothesis does not exist in the conventional sense.  We will use
the model with all three-way interactions and no higher interactions for
the null hypothesis and the model with all four-way interactions and no
higher interactions for the alternative hypothesis.
<<contab-test>>=
out4.cond <- glm(y ~ (v1 + v2 + v3 + v4 + v5 + v6 + v7)^4, family = poisson,
    data = dat.cond)
anova(out3.cond, out4.cond, test = "Chisq")
@
Simple.  The functions \texttt{glm} and \texttt{anova.glm} do the right
thing if they are provided the data \texttt{dat.cond} for the limiting
conditional model for the null hypothesis.

\subsubsection{Multinomial Sampling} \label{sec:multi}

Consider one contingency table and one vector of observed data,
but two models: Poisson sampling and multinomial sampling.
As is well known, the maximum likelihood estimates for the mean value
parameters are the same for both sampling schemes.
But much more is the same.

Suppose we consider the natural statistic to be the vector of cell counts
for both models, so both have the same natural statistic and natural parameter.
For Poisson sampling,
there are no directions of constancy and the MLE for the natural parameter
is unique.  For multinomial sampling,
the vector $\gamma = (1, 1, \ldots, 1)$ is a direction of constancy, and
the MLE for the natural parameter is nonunique.
However, it is easy to see that the unique MLE for the Poisson model is
also an MLE for the multinomial model (when we use the parameterizations
just defined).

Moreover, when we use the same natural statistic for both models,
the computational geometry is similar.  If subscripts $P$ and
$M$ refer to the Poisson
and multinomial models, respectively, then
$$
   C_{\text{sat}, M} = \set{ v \in C_{\text{sat}, P} : \inner{v, \gamma} = n }
$$
where $n$ is the sample size.  Also note that $\gamma$ is the first column
of $M$, the ``intercept'' column.  From this it follows that
$$
   T_{C_{\text{sub}, M}}(M^T y) = \set{ v \in T_{C_{\text{sub}, P}}(M^T y) :
   \inner{v, e_1} = 0 }
$$
where $e_1 = (1, 0, \ldots, 0)$.  And from this it follows by Theorem~6.42
in \citet{raw} that
$$
   N_{C_{\text{sub}, M}}(M^T y) \supset \set{ v + s e_1 :
   v \in N_{C_{\text{sub}, P}}(M^T y), \ s \in \real }.
$$
Hence every direction of recession in the Poisson model is also one
in the multinomial model.  Similarly, every GDOR in the Poisson model is also
one in the multinomial model.  Hence the GDOR we have already calculated
is correct for the multinomial model.  Hence the support of the limiting
conditional model is also correct.  Hence also the parameter estimates
for the limiting conditional model already obtained are correct.
Hence (asymptotically) so is the hypothesis test comparing the 4-way
interactions model to the 3-way interactions model.

The only thing we need to change for multinomial sampling is our one-sided
confidence intervals, because they are based on exact probabilities that
differ between the two models.
We proceed to redo that
<<multi-one-sided>>=
n <- sum(dat$y)
prob.face <- function(s) {
    moo <- eta.hat + s * eta.gdor
    mmoo <- max(moo)
    bark <- exp(moo - mmoo)
    qqq <- sum(bark[linear]) / sum(bark)
    qqq^n
}
foo <- function(s) prob.face(s) - alpha

fred <- uniroot(foo, lower = -5, upper = 5)
lowbnd.multi <- fred$root
c(lowbnd.multi, Inf)
fred$estim.prec
@
This is our one-sided confidence interval for $s$.
Our interval for multinomial sampling with
lower bound \Sexpr{round(lowbnd.multi, digits = 4)}
is not that different from the interval for Poisson sampling with
lower bound \Sexpr{round(lowbnd, digits = 4)}, but it is different.

Now we produce the analog of Table~\ref{tab:one} for multinomial sampling.
<<multi-mvp>>=
moo <- eta.hat + lowbnd.multi * eta.gdor
mmoo <- max(moo)
bark <- exp(moo - mmoo)
woof <- n * bark / sum(bark)
foo <- cbind(dat, woof)
@
Table~\ref{tab:two} shows these intervals.
<<multi-xtable,results=tex>>=
rownames(foo) <- NULL
colnames(foo)[colnames(foo) == "y"] <- "lower"
colnames(foo)[colnames(foo) == "woof"] <- "upper"
library(xtable)
print(xtable(foo[! linear, ], digits = c(rep(0, 9), 4), align = "cccccccccc",
    caption = paste("One-sided 95\\% Confidence Intervals for Mean Values.",
    "Columns v1 to v7 give the cell.",
    "Lower and Upper give end points of the interval.",
    "Based on multinomial sampling, compare Table~\\ref{tab:one}."),
     label = "tab:two"), caption.placement = "top", table.placement = "tbp",
     include.rownames = FALSE)
@
Again the results are different --- Table~\ref{tab:one} is different
from Table~\ref{tab:two} --- though not much different.

We note that a section for product-multinomial sampling would look
much like this section, so similar that it is left as an exercise for
the reader.

\section{Alternative Calculational Ideas}

Pre-existing theory of Barndorff-Nielsen completion
\citep{barndorff,brown,thesis} is based on the family of faces of the
convex support $C$.  The R package \texttt{rcdd} can be used to calculate
all the faces of $C$, but only in the most toyish of toy problems.  In
order to calculate $C_{\text{sub}}$ we need a V-representation
for $C_{\text{sat}}$.  For our first example (Section~\ref{sec:logit})
$C_{\text{sat}} = [0, 1]^{30}$ has $2^{30} = \Sexpr{2^30}$ generators,
which is far too many to deal with in an actual calculation.  So the project
of calculating all the faces of the convex support is a non-starter in this
example.  For our second example (Section~\ref{sec:contab})
$C_{\text{sat}} = [0, \infty)^{128}$ has 128 generators, so we could attempt
to calculate all the faces.  Since the function \texttt{allfaces} requires
H-representation input, we must first use the function \texttt{scdd} to convert
from the V-representation we have for $C_{\text{sub}}$ given
by \eqref{eq:vee-map} to an H-representation.  Unfortunately, this takes a very
long time --- the process had taken many hours when it was killed for lack
of patience, whereas all of the calculations done in this technical report
take only a minute --- so the project of calculating all the faces is
a non-finisher for
our second example.  We conclude that the idea of calculating all the faces
of the convex support is not practical, since our examples are not particularly
complicated.  One would expect most practical applications to be much more
complicated.

There is more hope for calculating the entire tangent and normal cones.
For our first example (Section~\ref{sec:logit}), we recreate the
V-representation for the tangent cone of the affine submodel
and convert to an H-representation, because an H-representation for the
tangent cone is essentially a V-representation for the normal cone and vice
versa.
<<logit-normal-cone>>=
tanv <- M
tanv[y == 1, ] <- (- tanv[y == 1, ])
vrep <- cbind(0, 0, tanv)
sout <- scdd(d2q(vrep), rep = "V")
unclass(sout$output)[ , - c(1, 2)]
@
The four vectors shown generate the normal cone.  Any vector that is linear
combination of these four vectors with strictly positive coefficients is a
GDOR.  So this calculation, instead of finding one GDOR, finds every GDOR.
But as we have seen, we only need one GDOR to carry out all the statistical
inferential procedures one might want to do.  Moreover, when we try to
apply this scheme to our second example (Section~\ref{sec:contab}), the
\texttt{scdd} operation takes a very long time --- again taking many hours
before the process was killed for lack of patience.  Hence the idea of
calculating the whole normal cone is a non-finisher except in some toy problems.

Hence we see that the idea of seeking a single GDOR, is strongly motivated
by efficiency concerns.  The methods of this technical report, using repeated
linear programming, are not the only such methods.  \citet{thesis} provided
a competing scheme, which, although not aimed at calculating a GDOR (in fact,
the GDOR concept does not appear in that thesis) does calculate the linearity
space $L_{\text{sub}}$, and hence does allow computation of the MLE in the
Barndorff-Nielsen completion.  Neither confidence intervals nor hypothesis
tests are discussed in \citet{thesis}, so the GDOR notion was not needed.

We mention \citet{thesis} merely to show that the scheme introduced here
based on one invocation of the function \texttt{linearity} and one invocation
of the function \texttt{lpcdd}, both in the package \texttt{rcdd}, is not
the only efficient method of calculating the MLE in the Barndorff-Nielsen
completion and associated hypothesis tests and confidence intervals.
We do conjecture that any efficient method, if general, must be based
on repeated invocations of linear programming.  Since the \texttt{linearity}
function is particularly designed for the job it does,
calculating $L_{\text{sub}}$, and since this is an essential step in computing
the support of the limiting conditional model, it would seem that any method
more efficient than what is proposed here must essentially improve on the
scheme used in the \texttt{linearity} function, which was explained in
Section~\ref{sec:linearity}.  Whether or not this algorithm can be improved
upon, it does seem that the \texttt{linearity} function implements a fairly
efficient algorithm.

\section{Proofs} \label{sec:proofs}

% \begin{proof}[Proof of Theorem~\ref{th:concave}]
% The convexity of $c$ is H\"{o}lder's inequality; the lower semicontinuity
% is Fatou's lemma.  The corresponding facts for $l$ follow immediately.
% See Theorem~2.1 in \citet{thesis} for more detail.
% \end{proof}

\begin{proof}[Proof of Theorem~\ref{th:constancy}]
Clearly, $s \mapsto l(\theta + s \delta)$ fails to be strictly concave
if and only if $s \mapsto c(\theta + s \delta)$ fails to be strictly convex,
and by Theorem~2.1 in \citet{thesis} this happens
if and only if $\inner{Y, \delta}$ is concentrated at one point, in which
case this point must be $\inner{y, \delta}$ so (e) holds.
Since all distributions in the family are mutually absolutely continuous
by \eqref{eq:density}, (e) implies (f), which trivially implies (e).
If (f) holds, then by \eqref{eq:cumulant}
\begin{equation} \label{eq:fred}
\begin{split}
   c(\theta + s \delta)
   & =
   c(\psi) + \log E_\psi \bigl( e^{\inner{Y, \theta + s \delta - \psi}} \bigr)
   \\
   & =
   c(\psi) + s \inner{y, \delta}
   + \log E_\psi \bigl( e^{\inner{Y, \theta - \psi}} \bigr)
   \\
   & =
   c(\theta) + s \inner{y, \delta}
\end{split}
\end{equation}
Hence (b) holds, and (b) clearly implies (a).
We have now proved that (a), (b), (e), and (f) are equivalent.

Also \eqref{eq:fred} implies (d) by \eqref{eq:density},
so (f) implies (d).
Trivially, (d) implies (c).  Conversely, if (c) holds, then $f_\theta$
and $f_{\theta + s \delta}$ must be equal almost surely, hence by
\eqref{eq:density}
$$
   \log f_{\theta + s \delta}(\omega) - \log f_\theta(\omega)
   =
   s \inner{Y(\omega), \delta} - c(\theta + s \delta) + c(\theta)
$$
almost surely, hence $\inner{Y, \delta}$ is constant almost surely,
and the constant must be $\inner{y, \delta}$; hence (e) holds.
Because all distributions in the family are mutually absolutely continuous
by \eqref{eq:density}, (e) implies (f).
We have now proved that (a) through (f) are equivalent.

By definition of normal cone and convex support, (e) and (g) are equivalent,
and (g) and (h) are equivalent by the polarity relationship of normal and
tangent cones \citep[Theorem~6.9 and~Corollary~6.30]{raw}.
\end{proof}

\begin{proof}[Proof of Corollary~\ref{cor:non-uniqueness}]
By Theorem~7.1 and p.~140 in \citet{barndorff}, $l$ is concave;
thus we must have
\begin{equation} \label{eq:sally}
   l\bigl(t \hat{\theta}_1 - (1 - t) \hat{\theta}_2\bigr)
   \ge
   t l(\hat{\theta}_1) + (1 - t) l(\hat{\theta}_2),
   \qquad 0 < t < 1,
\end{equation}
and since $\hat{\theta}_1$ and $\hat{\theta}_2$ are MLE, \eqref{eq:sally}
must actually hold with equality.  Thus by (a) of the theorem
$\hat{\theta}_1 - \hat{\theta}_2$ is a direction of constancy.
\end{proof}

For the proof of Theorem~\ref{th:recession}
we use Corollary~2.4.1 in \citet{thesis},
which relies on Theorem~2.3 in \citet{thesis}, but the proof of that theorem
given in \citet{thesis} is murky at best.  So we give a corrected version.
\begin{proof}[Corrected Proof of Theorem~2.3 in \citet{thesis}]
First, equation (2.5) in \citet{thesis} contains an obvious typographical
error.  It should read
\begin{align*}
  (\rc \log c)(\phi)
  & =
  \lim_{s \to \infty} \frac{\log c(\theta + s \phi) - \log c(\theta)}{s}
  \\
  & =
  \lim_{s \to \infty} \log \left(
  \left[ \frac{c(\theta + s \phi)}{c(\theta) e^{s \sigma_K(\phi)}}
  \right]^{1 / s} e^{\sigma_K(\phi)} \right)
\end{align*}
The rest of the proof of the $\lambda(H_\phi) > 0$ case is correct.
In the proof of the of the $\lambda(H_\phi) = 0$ case, the last displayed
formula of the proof is incorrect.  Clearly
$$
   e^{a - \sigma_K(\phi)} F_\theta(A)^{1 / s} \to
   e^{a - \sigma_K(\phi)}, \qquad \text{as $s \to \infty$}.
$$
However, since $a < \sigma_K(\phi)$ was arbitrary, the limit can be made
arbitrarily close to 1, and we see that
$$
   \left[ \frac{c(\theta + s \phi)}{c(\theta) e^{s \sigma_K(\phi)}}
   \right]^{1 / s} \to 1, \qquad \text{as $s \to \infty$},
$$
as is required for the completion of the proof.
\end{proof}

\begin{proof}[Proof of Theorem~\ref{th:recession}]
The equivalence of (a) and (b) is Theorem~8.6 in \citet{rocky}.
The equivalence of (a) and (c) is Corollary~2.4.1 in \citet{thesis}.
The equivalence of (c) and (d) is mutual absolute continuity of the
distributions in an exponential family, which follows from \eqref{eq:density}.
The equivalence of (c) and (e) is immediate from
our definition \eqref{eq:normal-cone} of the normal cone.
The equivalence of (e) and (f) is the polarity relationship of tangent and
normal cones \citep[Theorem~6.9 and~Corollary~6.30]{raw}.
\end{proof}

\begin{proof}[Proof of Theorem~\ref{th:exist}]
That (a) and (b) are equivalent is Theorem~2.5
in \citet{thesis}.  That (b) and (c) are equivalent follows from (g)
of Theorem~\ref{th:constancy} and (e) of Theorem~\ref{th:recession}.
That (c) and (d) are equivalent is the polarity relationship of tangent
and normal cones.
\end{proof}

\begin{proof}[Proof of Corollary~\ref{cor:strictly-increasing}]
By assumption $s \mapsto l(\theta + s \delta)$ is a nondecreasing function.
Suppose to get a contradiction that
\begin{equation} \label{eq:strictly-increasing}
   l(\theta + s_1 \delta) = l(\theta + s_2 \delta)
\end{equation}
for some $s_1$ and $s_2$ such that both sides of \eqref{eq:strictly-increasing}
are finite and $s_1 < s_2$.  In order that $l$ be nondecreasing we must have
$$
   l(\theta + s_1 \delta) = l(\theta + s \delta), \qquad s_1 \le s \le s_2
$$
but then $\delta$ is a direction of constancy
by Theorem~\ref{th:constancy}~(a).
\end{proof}

\begin{proof}[Proof of Theorem~\ref{th:limits}]
Except for the last sentence, this follows immediately
from Theorem~2.2 in \citet{thesis}.
From \eqref{eq:density}
\begin{align*}
   \myPr_{\theta + s \delta}(Y \in H)
   & =
   E_\psi \bigl\{ I_H e^{\inner{Y, \theta + s \delta - \psi}
       - c(\theta + s \delta) + c(\psi)} \bigr\}
   \\
   & =
   e^{s \inner{y, \delta} - c(\theta + s \delta) + c(\theta)}
   E_\psi \bigl\{ I_H e^{\inner{Y, \theta - \psi}
       - c(\theta) + c(\psi)} \bigr\}
   \\
   & =
   e^{s \inner{y, \delta} - c(\theta + s \delta) + c(\theta)}
   \myPr_\theta(Y \in H)
\end{align*}
where $I_H$ denotes the indicator function of the event $Y \in H$.
By Corollary~\ref{cor:strictly-increasing}, the function
$s \mapsto \inner{y, \theta + s \delta} - c(\theta + s \delta)$
is strictly increasing, hence
so is $s \mapsto \Pr_{\theta + s \delta}(Y \in H)$.
That $\Pr_{\theta + s \delta}(Y \in H) \to 1$ as $s \to \infty$ follows from
Scheff\'{e}'s lemma (see the comments following the theorem).
The continuity assertion follows from the fact that the moment generating
function of the random variable $\inner{Y, \delta}$ is
\begin{align*}
   E_{\theta}\bigl\{ e^{s \inner{Y, \delta}} \}
   & =
   E_\psi \bigl\{
   e^{\inner{Y, \theta + s \delta - \psi} - c(\theta) + c(\psi)}
   \bigr\}
   \\
   & =
   e^{c(\theta + s \delta) - c(\theta)}
\end{align*}
Hence $s \mapsto c(\theta + s \delta)$ is actually infinitely differentiable
and so is $s \mapsto \myPr_{\theta + s \delta}(Y \in H)$.
\end{proof}

\begin{proof}[Proof of Theorem~\ref{th:generic}]
Suppose $L = V$.  Then $\con(\pos V)$ is the subspace spanned by $V$,
in which case a GDOR does not exist by Theorem~\ref{th:exist}.

Suppose $L \neq V$.  Then by the polarity relationship of normal and tangent
cones for each $v \in V \setminus L$ there exists $\delta_v \in N_C(y)$ such
that $\inner{v, \delta_v} < 0$.  Hence $- \delta_v \notin N_C(y)$ and
$N_C(y)$ is not a vector subspace.
So a GDOR does exist by Theorem~\ref{th:exist}.

Let $\delta^* = \sum_{v \in V \setminus L} \delta_v$.
Then $\delta^*$ satisfies \eqref{eq:calc-equality}
and \eqref{eq:calc-inequality}.
Observe that $\delta \in N_C(y)$ if and only if
\eqref{eq:calc-equality} holds and \eqref{eq:calc-inequality} holds
with $<$ replaced by $\le$.  Then it is clear that
for every $\delta \in N_C(y)$ there exists $t > 1$
such that $t \delta^* + (1 - t) \delta$ is in $N_C(y)$.
Hence $\delta^* \in \rint N_C(y)$ by Theorem~6.4 in \citet{rocky}.
It now follows from Proposition~2.42 in \citet{raw} that
the set of points satisfying \eqref{eq:calc-equality}
and \eqref{eq:calc-inequality} is $\rint N_C(y)$.
\end{proof}

\begin{proof}[Proof of Corollary~\ref{cor:generic}]
In the proof of the theorem we saw that
if a GDOR exists, then $L \neq V$ and $N_C(y)$ is not a vector subspace.
\end{proof}

\begin{proof}[Proof of Corollary~\ref{cor:generic-limit}]
Since $C$ is polyhedral convex, every tangent vector is of the form
$s (w - y)$ for some $w \in C$ and $s \ge 0$, that is, the closure operation
in \eqref{eq:tangent-cone} is not necessary.  This implies, in particular,
that for each $v \in L$ there exist points $w_{v, +}$ and $w_{v, -}$ in $C$
and positive scalars $s_{v, +}$ and $s_{v, -}$
such that $\pm v = s_{v, \pm} (w_{v, \pm} - y)$.
Observe that these $w_{v, \pm}$
are also in $C \cap H$, but no $w \in V \setminus L$ is in $C \cap H$.
Thus $T_{C \cap H}(y) = \con(\pos L) = \spam L$.
Since $y + \spam L \subset H$, we have $C \cap H \supset C \cap (y + \spam L)$.
If $C \cap H \not\subset C \cap (y + \spam L)$,
then we cannot have $T_{C \cap H}(y) = \spam L$.
\end{proof}

\begin{proof}[Proof of Theorem~\ref{th:linprog}]
The polar of a convex cone $K$ is
$$
   K^* = \set{ \delta : \inner{w, \delta} \le 0, \ w \in K }
$$
\citep[Section~6.E]{raw}.  The double polar theorem
\citep[Corollary~6.2.1]{raw} says
that $K^{{*}{*}} = \cl K$.  When $K$ is closed, in particular
when $K$ is polyhedral, then $K^{{*}{*}} = K$.
Here let $K = \con(\pos(V_{\text{sub}} \setminus \{ w \}))$.  Then the feasible
region for the linear program \eqref{eq:linprog} is $- K^*$.
Now the optimal value to \eqref{eq:linprog} is nonpositive if and only if
$\inner{w, \delta} \le 0$ for all $\delta \in - K^*$, which is equivalent
by the double polar theorem to $w \in (- K^*)^* = - K$ or to $- w \in K$.

Now $w$ is in \eqref{eq:linearity} if and only if $- w$ is a linear combination
of elements of $V_{\text{sub}}$ with nonnegative coefficients, that is, if
$- w = a \cdot w + \sum_{v \in V_{\text{sub}} \setminus \{ w \}} a_v \cdot v$ where $a$ and
all the $a_v$ are nonnegative scalars.  But this happens if and only if
$- w = \sum_{v \in V_{\text{sub}} \setminus \{ w \}} (a_v / (1 + a)) \cdot v$, which
is equivalent to $- w \in K$.
\end{proof}

\begin{proof}[Proof of Theorem~\ref{th:support}]
With probability one
$$
   Y - y = \sum_{v \in V_{\text{sat}}} b_v(Y) \cdot v
$$
where all the coefficients $b_v(Y)$ are nonnegative.
From \eqref{eq:linear} and \eqref{eq:nonlinear} we can derive
\begin{align*}
   \inner{v, M \delta} = 0, & \qquad v \in L_{\text{sat}}
   \\
   \inner{v, M \delta} < 0,
   & \qquad v \in V_{\text{sat}} \setminus L_{\text{sat}}
\end{align*}
Hence
\begin{equation} \label{eq:bvy}
   \inner{Y - y, M \delta}
   =
   \sum_{v \in V_{\text{sat}} \setminus L_{\text{sat}}}
   b_v(Y) \cdot \inner{v, M \delta}
\end{equation}
and since all of the $\inner{v, M \delta}$ in \eqref{eq:bvy}
are strictly negative,
the sum can only be zero if all the $b_v(Y)$,
$v \in V_{\text{sat}} \setminus L_{\text{sat}}$ are zero.
Thus the support of the limiting conditional model consists of points of
the form $y + \sum_{v \in L_{\text{sat}}} b_v \cdot v$, where the coefficients
are arbitrary.
Since all such points are in the preimage of $H_{\text{sub}}$ under the
map $y \mapsto M^T y$, we conclude \eqref{eq:support-sat} holds.
\end{proof}

\section*{Acknowledgments}

Much of the theory of this technical report comes from a thesis
\citep{thesis} done under the supervision of Elizabeth Thompson.

\begin{thebibliography}{}

\bibitem[Barndorff-Nielsen(1978)]{barndorff}
Barndorff-Nielsen, O.~E. (1978).
\newblock \emph{Information and Exponential Families}.
\newblock Chichester: John Wiley.

\bibitem[Brown(1986)]{brown}
Brown, L.~D. (1986).
\newblock \emph{Fundamentals of Statistical Exponential Families: with
    Applications in Statistical Decision Theory}.
\newblock Hayward, CA: Institute of Mathematical Statistics.

\bibitem[Fukuda(2008)]{cddlib}
Fukuda, K. (2008)
\newblock cddlib package, version 094f.
\newblock \url{http://www.ifor.math.ethz.ch/~fukuda/cdd_home/cdd.html}.
\newblock Documentation is the file \texttt{cddlibman.pdf},
    which is included in the \texttt{cddlib} source and also in
    every \texttt{rcdd} installation.

\bibitem[Geyer(1990)]{thesis}
Geyer, C.~J. (1990).
\newblock \emph{Likelihood and Exponential Families}.
\newblock Unpublished Ph.~D. Thesis.  University of Washington.

\bibitem[Geyer(2008)]{aster}
Geyer, C.~J. (2008).
\newblock R package \texttt{aster} (Aster Models), version 0.7-4.
\newblock \url{http://www.stat.umn.edu/geyer/aster/}.

\bibitem[Geyer and Meeden(2005)]{fuzzy}
Geyer, C.~J. and Meeden, G.~D. (2005).
\newblock Fuzzy and randomized confidence intervals and P-values
    (with discussion).
\newblock \emph{Statistical Science}, \textbf{20}, 358--387.

\bibitem[Geyer and Meeden(2008)]{rcdd}
Geyer, C.~J. and Meeden, G.~D. (2008).
\newblock R package \texttt{rcdd} (C Double Description for R), version 1.1.
\newblock \url{http://http://www.stat.umn.edu/geyer/rcdd/}.
\newblock Incorporates code from \citet{cddlib}.

\bibitem[Geyer and Thompson(1992)]{gt}
Geyer, C.~J. and Thompson, E.~A. (1992).
\newblock Constrained Monte Carlo maximum likelihood for dependent data
    (with discussion).
\newblock \emph{Journal of the Royal Statistical Society, Series B},
   \textbf{54} 657--99.

\bibitem[Geyer et al.(2007)Geyer, Wagenius and Shaw]{gws}
Geyer, C.~J., Wagenius, S. and Shaw, R.~G. (2007).
\newblock Aster models for life history analysis.
\newblock \emph{Biometrika}, \textbf{94}, 415--426.

\bibitem[Lehmann(1959)]{lehmann-tsh}
Lehmann, E.~L. (1959).
\newblock \emph{Testing Statistical Hypotheses}.
\newblock New York: Wiley.

\bibitem[R Development Core Team(2008)]{rcore}
R Development Core Team (2008).
\newblock R: A language and environment for statistical computing.
\newblock R Foundation for Statistical Computing, Vienna, Austria.
\newblock \url{http://www.R-project.org}.

\bibitem[Rockafellar(1970)]{rocky}
Rockafellar, R.~T. (1970).
\newblock \emph{Convex Analysis}.
\newblock Princeton: Princeton University Press.

\bibitem[Rockafellar and Wets(2004)]{raw}
Rockafellar, R.~T. and Wets, R. J.-B. (2004).
\newblock \emph{Variational Analysis}, corr.\ 2nd printing.
\newblock Berlin: Springer-Verlag.

\end{thebibliography}

\end{document}

\begin{center} \LARGE REVISED DOWN TO HERE \end{center}