% \section{ORDERING OF THE VARIATIONAL FAMILIES}
\section{PRELIMINARIES}
\label{sec:ordering}

We first set up some theoretical facts about A-VI and F-VI, and articulate the conditions under which the A-VI solution is as accurate as the F-VI solution.
We assume that both variational families (\Cref{eq:Q_F,eq:Q_A}) use the same type of distribution for $q_0(\theta)$ and so we focus on the variational distributions of $z_n$.
%
For each local latent variable $z_n$, F-VI assigns a marginal distribution $q_n(z_n \s \nu_n)$ from a parametric family $\mathcal Q_\ell$ with parameter $\nu_n \in \mathcal U$, where $\mathcal U$ denotes the space of valid parameters for the variational family $\mathcal Q_\ell$.
The joint family $\mathcal Q_\text{F}$ is then defined as the product of marginals $q_0(\theta \s \nu_0) \prod_{n=1}^{N} q_n(z_n \s \nu_n)$.
Minimizing the KL-divergence of \Cref{eq:vi-optim} yields the optimal variational parameters $\nu^* = (\nu^*_0, \nu^*_1, \ldots, \nu^*_N)$.

Let $\mathcal X$ be the space of $x_n$.
A-VI fits a function $f_\phi\colon \mathcal X \to \mathcal U$ over a family of inference functions $\mathcal F$ parameterized by $\phi$, and the KL-divergence of \Cref{eq:vi-optim} is minimized with respect to $\phi$.
We denote the resulting variational family $\mathcal Q_\text{A}(\mathcal F)$.

% For example, suppose $\mathcal Q_\ell$ is a univariate Gaussian.
% Then we can use a neural network as our inference function which takes in $x_n$ and returns the mean and variance of the approximating Gaussian for $z_n$.
% $\mathcal Q_\ell$ may be any parametric distribution, and the inference function any estimable function, including polynomials, networks, and more.

% The first proposition states that the amortization gap is always positive.
% the optimal variational distribution found by A-VI can be no better than the optimal variational distribution found by F-VI. %, i.e. the amortization gap is always positive.
%
\begin{proposition}
  \label{prop:ordering}
  For any class of inference functions $\mathcal F$, $\mathcal Q_\text{A}(\mathcal F)$ is a strict subset of $\mathcal Q_\text{F}$.
  %Hence A-VI cannot
  % achieve a lower KL-divergence than F-VI.
\end{proposition}
%
\begin{proof}
It is straightforward to see from \Cref{eq:Q_F} and \Cref{eq:Q_A} that $\mathcal Q_\text{A}(\mathcal F)$ is a subset of $\mathcal Q_\text{F}$.
% Any distribution in $\mathcal Q_A(\mathcal F)$ can be found in $\mathcal Q_F$ by (i) matching $q_0(\theta)$, and (ii) matching the variational parameters $\nu_n$ to $f_\phi(x_n)$.
% Hence $\mathcal Q_\text{A}(\mathcal F) \subseteq \mathcal Q_\text{F}$.

To make the ordering strict, it suffices to find an element in $\mathcal Q_\text{F}$ which does not belong to $\mathcal Q_\text{A}(\mathcal F)$.
Note that this element need not be a minimizer of $\text{KL}(q \,\|\, p)$.
Consider a case where two data points are equal, $x_n = x_m$.
Then there exists a distribution $\tilde q(\theta, \mbz) \in \mathcal Q_\text{F}$ such that $\nu_n \neq \nu_m$. However, for every $f_\phi \in \mathcal F$ we necessarily have $f_\phi(x_n) = f_\phi(x_m)$, and so $\tilde q(\theta, \mbz) \notin \mathcal Q_\text{A}(\mathcal F)$.
\end{proof}

An immediate consequence of the ordering is that A-VI cannot achieve a lower KL-divergence than F-VI, leading to a potential amortization gap.
To close the gap, the inference function $f_\phi$ must interpolate between $x_n$ and the optimal variational parameter $\nu^*_n$,
%
\begin{equation}\label{eq:interpolation}
    f_\phi(x_n) = \nu^*_n, \quad \forall n.
\end{equation}
%
We call the problem of finding an $f$ that solves \Cref{eq:interpolation} the \emph{amortization interpolation problem}.

\begin{definition}
  Given a data set $\mbx$, suppose $f: \mathcal X \to \mathcal U$ solves \Cref{eq:interpolation}.
  Then we say $f$ is an ideal inference function.
\end{definition}

The strict ordering between $\mathcal Q_\text{A}(\mathcal F)$ and $\mathcal Q_\text{F}$ warns us that the amortization interpolation problem may not be well posed, since we may find ourselves in a setting where $x_n = x_m$ but $\nu^*_n \neq \nu^*_m$, in which case no ideal inference function exists.
In the next section, we derive conditions on the model $p(\theta, \mbz, \mbx)$ that guarantee the existence of an ideal inference function.
%
Once we establish that the amortization interpolation problem is well posed, we can ask how rich $\mathcal F$ needs to be to include an ideal inference function.
We will investigate this question empirically in \Cref{sec:experiment}.


% We now ask: can A-VI achieve the same minimum KL-divergence as F-VI? To do so, A-VI must construct a function from $x_n$ to F-VI's optimal variational factors. This defines the \textit{amortization interpolation problem}.

% \begin{proposition}  \label{prop:interpolation}
%   Consider a model $p(\theta, {\bf z}, {\bf x})$ over $N$ observations and latent variables.
%   Suppose that for any $\mbx = x_{1:N}$ there exists $f_\phi \in \mathcal F$ which solves the amortization interpolation problem, that is
%   %
%   \mbox{
%   $
%       f_\phi(x_n) = \nu^*_n, \ \ \forall n.
%   $
%   }
%   %
%   Then
%   {\small
%   \begin{align}
%     \min_{q \in \cQ_{\rmA}}
%     \kl{q(\theta, \mbz) \, || \, p(\theta, \mbz \g \mbx)}
%     =
%     \min_{q \in \cQ_{\rmF}}
%     \kl{q(\theta, \mbz) \, || \, p(\theta, \mbz \g \mbx)}.
%   \end{align}
%   }
% \end{proposition}
% %
% With this proposition in place, we investigate the following questions: (i) is there a class of inference functions $\mathcal F$ that solves the amortization interpolation problem? (ii) If so, how expressive does $\mathcal F$ need to be to contain a solution? 
% %
% Existing literature, notably on VAEs, has primarily focused on the second question \citep[e.g][]{Cremer:2018, Kim:2018}. 
% However when considering latent variable models more generally, we first need to carefully check that the interpolation problem is well-posed.
% Indeed if $x_n = x_m$ but \mbox{$q(z_n \s \nu_n^*) \neq q(z_m \s \nu_m^*)$}, then the interpolation problem does \underline{not} admit a solution no matter how expressive $\mathcal F$ is.
% This immediately suggests a condition under which the interpolation problem is solvable.
% %
% \begin{proposition} \label{prop:condition}
%     For a model $p(\theta, \mbz, \mbx)$, the amortization interpolation problem can be solved for any $\mbx = x_{1:N}$ if and only if  there exists a (dataset-dependent) function $f_\mbx: \mathcal X \to \mathcal \mathcal N$ such that
%     $
%         f_\mbx(x_n) = \nu^*_n, \ \ \forall n.
%     $
% \end{proposition}
% %
% This can be shown by recalling that a function ascribes a single image to each point in its domain (see Appendix).
% We call $f_\mbx$ an \textit{ideal inference function} and emphasize that its existence is a property of the model $p(\theta, \mbz, \mbx)$.
% Combining Propositions~\ref{prop:interpolation} and \ref{prop:condition}, we have that the existence of $f_\mbx$ is a necessary and sufficient condition on $p(\theta, \mbz, \mbx)$, under which the amortization gap can be closed.
% This condition is \underline{not} equivalent to a posteriori independence of $z_n$ on $\mbx_{-n}$, which is the condition a d-separation argument produces \citep{Girin:2021}.
% In the next section, we will see that the existence of $f_\mbx$ is a verifiable property.

% If a learnable inference function exists, we can ask how expressive does $\mathcal F$ need to be to solve the amortization interpolation problem.
% The expressiveness of $\mathcal F$ may be measured for example by the degree of the learning polynomials or the width of the learning neural networks.
% The existence of $f_{\bf x}$ guarantees that a brute-force approach works, e.g. with polynomials of degree $\mathcal O(N)$. But this may be overkill: a less expressive class of inference functions may still solve the interpolation problem, as we will see empirically.
%
% Then A-VI can achieve F-VI's solution even when $\cQ_\rmA(\mathcal F)$ is a much smaller space than $\cQ_\rmF$; see Figure~\ref{fig:ordering}b.

