\documentclass[accepted]{uai2022}
\usepackage[table]{xcolor}

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros

% -- begin our custom commands and packages not included in uai2022.cls or its template  --
\usepackage{amsfonts,amssymb}
\usepackage{graphicx}
\usepackage{wrapfig}
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\newtheorem{definition}{Definition}
\newcommand{\KL}{{\rm KL}}     % KL
\newcommand{\CE}{\mathcal{CE}} % Cross entropy
\renewcommand{\H}{\mathcal{H}} % Entropy
\newcommand{\MI}{\mathcal{I}}  % Mutual Information
\newcommand{\E}{\mathbb{E}}    % Expectations
\renewcommand{\L}{\mathcal{L}}   % Lagrange
\newcommand{\q}{{\rm q}}       % component
\newcommand{\p}{{\rm p}}       % true p
\newcommand{\Z}{{\rm Z}}       % normalizing constant
\newcommand{\x}{\mathbf{x}}    % the var we're inferring
\newcommand{\randomt}{\mathbf{t}}    % random direction in x-space
\newcommand{\D}{\mathcal{D}}   % condition on data
\newcommand{\m}{{\rm m}}       % mixture
\newcommand{\rd}{{\rm d}}      % derivative
\newcommand{\FIM}{\mathcal{F}} % Fisher Information Matrix
\newcommand{\thetastar}{\theta^*} % Best theta (VI solution)
\DeclareMathOperator*{\argmin}{arg\,min}
\newcommand{\qed}{\hfill\ensuremath{\blacksquare}}

\newcommand{\X}{\mathbf{X}}    % a set of \x's
\newcommand{\bmu}{\boldsymbol{\mu}}
\newcommand{\bsigma}{\boldsymbol{\sigma}}
\newcommand{\rtheta}{{\color{red} \theta}}
\DeclareMathOperator{\argmax}{arg\,max}
\newcommand{\N}{\mathcal{N}}
\newcommand{\Sigmastar}{{\Sigma^*}}

% Enable tighter layout of figures and text on the same page (see https://aty.sdsu.edu/bibliog/latex/floats.html)
\renewcommand{\topfraction}{0.9}    % max fraction of floats at top
\renewcommand{\bottomfraction}{0.8} % max fraction of floats at bottom
%   Parameters for TEXT pages (not float pages):
\setcounter{topnumber}{2}
\setcounter{bottomnumber}{2}
\setcounter{totalnumber}{4}     % 2 may work better
\setcounter{dbltopnumber}{2}    % for 2-column pages
\renewcommand{\dbltopfraction}{0.9} % fit big float above 2-col. text
\renewcommand{\textfraction}{0.07}  % allow minimal text w. figs
%   Parameters for FLOAT pages (not text pages):
\renewcommand{\floatpagefraction}{0.7}  % require fuller float pages
% N.B.: floatpagefraction MUST be less than topfraction !!
\renewcommand{\dblfloatpagefraction}{0.7}   % require fuller float pages

\usepackage{xr}
\makeatletter
\newcommand*{\addFileDependency}[1]{% argument=file name and extension
  \typeout{(#1)}
  \@addtofilelist{#1}
  \IfFileExists{#1}{}{\typeout{No file #1.}}
}
\makeatother
\newcommand*{\myexternaldocument}[1]{%
    \externaldocument{#1}%
    \addFileDependency{#1.tex}%
    \addFileDependency{#1.aux}%
}
\myexternaldocument{lange_658}
% -- end custom definitions --

\title{Interpolating Between Sampling and Variational Inference with Infinite Stochastic Mixtures (Supplementary material)}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
% \author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2022 paper}{Jane~J.~von~O'L\'opez}{}}
% \author[1]{Harry~Q.~Bovik}
% \author[1,2]{Further~Coauthor}
% \author[3]{Further~Coauthor}
% \author[1]{Further~Coauthor}
% \author[3]{Further~Coauthor}
% \author[3,1]{Further~Coauthor}
\author[1]{\href{mailto:lange.richard.d@gmail.com}{Richard D. Lange}{}}
\author[1]{Ari S. Benjamin}
\author[2]{Ralf M. Haefner$^*$}
\author[3]{\href{mailto:xaq@rice.edu}{Xaq Pitkow$^*$}{}}
% % Add affiliations after the authors
\affil[1]{%
    Dept. of Neurobiology\\
    University of Pennsylvania\\
    Philadelphia, Pennsylvania, USA
}
\affil[2]{%
    Dept. of Brain and Cognitive Sciences\\
    University of Rochester\\
    Rochester, New York, USA
}
\affil[3]{%
    Baylor College of Medicine\\
    Rice University\\
    Houston, Texas, USA
  }
\affil[*]{equal contribution}

\begin{document}

\onecolumn
\maketitle

\setcounter{section}{0}
\renewcommand{\thesection}{\Alph{section}}
\setcounter{equation}{0}
\renewcommand{\theequation}{\Alph{section}.\arabic{equation}}
\setcounter{figure}{0}
\renewcommand{\thefigure}{\Alph{section}.\arabic{figure}}
\setcounter{table}{0}
\renewcommand{\thetable}{\Alph{section}.\arabic{table}}    

\section{Proofs and Derivations}

Throughout, we assume that $\theta$ forms a minimal statistical manifold \citep{Amari2016}, so that the degrees of freedom of $\q$ match the dimensionality of $\theta$, and whenever $\q(\x;\theta^{(i)}) = \q(\x;\theta^{(j)})$ for all $\x$, it must be that $\theta^{(i)} = \theta^{(j)}$.

Recall that in the main text, we defined the following objective:
\begin{equation}\tag{(\ref{eqn:weighted_optim}) restated}
    \L(\psi,\lambda) \equiv \MI[\x;\theta] - \lambda \E_{\psi(\theta)}\left[\KL(\q(\x;\theta)||\p^*(\x))\right] \, ,
\end{equation}
where $\lambda\in[1,\infty)$ is a hyper-parameter, and $\psi$ is a probability density on $\theta$. We also introduced an \textbf{approximate objective} in which $\MI[\x;\theta]$ is replaced with
\begin{equation}\tag{(\ref{eqn:mi_stams}) restated}
    \MI_\FIM[\x;\theta] \equiv \H[\theta] - \frac{1}{2}\E_{\psi(\theta)}\left[\log\left|2\pi e \FIM(\theta)^{-1}\right|\right] \, .
\end{equation}
This approximate objective is
\begin{equation}\tag{(\ref{eqn:weighted_optim_f}) restated}
    \L_\FIM(\psi,\lambda) = \H[\theta] + \E_{\psi(\theta)}\left[\frac{1}{2}\log|\FIM(\theta)| - \lambda \KL(\q(\x;\theta)||\p^*(\x))\right] \, ,
\end{equation}
and it is maximized for a given $\lambda$ by
\begin{align*}
    \psi(\theta) &= \frac{1}{Z(\lambda)}\exp\left(\frac{1}{2}\log|\FIM(\theta)| - \lambda \KL(\q(\x;\theta)||\p(\x))\right) \tag{(\ref{eqn:log_psi_stams}) restated}\\
    \text{where} \qquad Z(\lambda) &= \int_\theta \exp\left(\frac{1}{2}\log|\FIM(\theta)| - \lambda \KL(\q(\x;\theta)||\p(\x))\right) \rd\theta \, .
\end{align*}

\subsection{Characterizing the Pareto Front}\label{app:pareto}

Let us begin with a set of results regarding the shape of the Pareto front that connects VI to Sampling in Figure \ref{fig:mi_kl_space}.
\begin{lemma}\label{lem:concave}
    $\L(\psi,\lambda)$ is concave in $\psi$, i.e. $\L(\omega\psi_1 + (1-\omega)\psi_2, \lambda) \geq \omega\L(\psi_1,\lambda)+(1-\omega)\L(\psi_2,\lambda)$ for $0 \leq \omega \leq 1$. Further, $\L_\FIM(\psi,\lambda)$ is \emph{strictly} concave in $\psi$.
\end{lemma}
\paragraph{Proof:} The proof for $\L$ follows from the fact that $\E_{\psi(\theta)}\left[\KL(\q(\x;\theta)||\p^*(\x))\right]$ is \emph{linear} in $\psi$, and $\MI[\x;\theta]$ is known to be concave in the marginal distribution of either variable \citep{Braverman2011}. The proof for $\L_\FIM$ is similar: the $\E_{\psi(\theta)}\left[\frac{1}{2}\log|\FIM(\theta)|\right]$ term is linear in $\psi$, and $\H[\theta]$ is strictly concave in $\psi$. This can be seen, for instance, by taking the second variational derivative of $\H[\theta]$ with respect to $\psi$:
\begin{align*}
    \nabla_\psi^2 \H[\theta] \big\rvert_{\theta_i\theta_j} &= \nabla_\psi\left( \nabla_\psi\H[\theta] \big\rvert_{\theta_i}\right)\big\rvert_{\theta_j} \\
        &= \nabla_\psi\left( -\nabla_\psi\int_\theta \psi(\theta)\log\psi(\theta)\rd\theta \big\rvert_{\theta_i}\right)\big\rvert_{\theta_j} \\
        &= \nabla_\psi\left( -1 - \log\psi(\theta_i) \right)\big\rvert_{\theta_j} \\
        &= \begin{cases}
            -\frac{1}{\psi(\theta_i)} &\text{if $\theta_i=\theta_j$} \\
            0 &\text{otherwise} \, .
        \end{cases}
\end{align*}
Since $\psi(\theta)\geq 0$ everywhere, this implies that the curvature of $\H[\theta]$ is strictly negative at all values of $\theta$. \qed

% \begin{wrapfigure}{R}{3.25in}
%     \centering
%     \includegraphics{figures/figure_exact_vs_approx.pdf}
%     \caption{Elaborating on Figure \ref{fig:mi_kl_space} of the main text. Maximizing (\ref{eqn:weighted_optim}) with respect to $\psi$, as a function of $\lambda$, defines a set of solutions shown here in blue -- the Pareto front. Lemma \ref{lem:pareto_slope} states that $\lambda$ is the slope of this curve. Note that this does not guarantee unique solutions; region (i) contains all unbiased solutions where $\m(\x) = \p(\x)$, and region (ii) contains all minimum-KL solutions.}
%     \label{fig:non_unique_regions}
% \end{wrapfigure}

\begin{lemma}\label{lem:pareto_slope} Let $\MI^*(\lambda)$ and $\E[\KL]^*(\lambda)$ denote the values of Mutual Information and Expected KL achieved by optima of $\L$ for a given $\lambda$. Then, $\lambda$ defines the slope of the Pareto front:
\begin{align*}
    \lambda = \frac{\rd \MI^*/\rd \lambda}{\rd \E[\KL]^* / \rd \lambda} \, .
\end{align*}
Similarly, in the case of $\L_\FIM$, $\lambda$ defines the slope of the corresponding Pareto front,
\begin{align*}
    \lambda = \frac{\rd \MI_\FIM^*/\rd \lambda}{\rd \E[\KL]^* / \rd \lambda} \, ,
\end{align*}
with $\MI_\FIM$ in place of $\MI$.
\end{lemma}
\paragraph{Proof:} This follows from viewing $\L$ as the Lagrangian of a constrained optimization problem, with $\lambda$ as a Lagrange multiplier. The same argument applies to both $\L$ and $\MI$ as to $\L_\FIM$ and $\MI_\FIM$, so we will just give the proof for one. Consider the constrained optimization problem of maximizing $\MI$ (or $\MI_\FIM$) subject to the constraint that $\E[\KL(\q||\p)]=C$. The Lagrangian for this problem is identical to (\ref{eqn:weighted_optim}), but with $C$ added:
\begin{align*}
    \L(\psi,\lambda) \equiv \MI[\x;\theta] - \lambda \left(\E_{\psi(\theta)}\left[\KL(\q(\x;\theta)||\p^*(\x))\right] - C\right)
\end{align*}
Optimizing with respect to $\psi$, this is a concave maximization problem with a linear constraint. A well-known property of such problems is that, at the solution, the Lagrange multiplier ($\lambda$) is equal to the change in the objective ($\MI^*$) per change in the constraint ($C$), or $\lambda = \frac{\rd \MI^*}{\rd C}$. Since $C$ is the constrained value of $\E[\KL(\q||\p)]$, we also immediately have $\frac{\rd \E[\KL]^*}{\rd C} = 1$. This implies that
\begin{align*}
    \lambda = \frac{\rd \MI^*/\rd C}{\rd \E[\KL]^* / \rd C} \, .
\end{align*}
So far, we have treated $\lambda$ as a function of $C$, but for all values of $\lambda$ that correspond to a unique $C$, we can invert this relationship and treat $C$ as a function of $\lambda$. Then, assuming $\frac{\rd C}{\rd \lambda} \neq 0$ for all $1 \leq \lambda < \infty$ that we are interested in, we have
\begin{align*}
    \lambda &= \frac{\rd \MI^*/\rd C \times \rd C / \rd \lambda}{\rd \E[\KL]^* / \rd C \times \rd C / \rd \lambda} = \frac{\rd \MI^*/\rd \lambda}{\rd \E[\KL]^* / \rd \lambda} \, .
\end{align*}
Again using the fact that $C=\E[\KL]^*$ by construction, the assumption that $\frac{\rd C}{\rd\lambda}\neq 0$ is equivalent to $\frac{\rd\E[\KL]^*}{\rd\lambda} \neq 0$. In other words, as long as changing $\lambda$ has some effect on $\E[\KL]^*$, the combined effect on $\MI^*$ and $\E[\KL]^*$ will be such that $\lambda=\frac{\rd \MI^*}{\rd \E[\KL]^*}$. \qed


\subsection{Sampling-like behavior of our method}\label{app:sampling}

Recall our definition of sampling:
\begin{definition}[Sampling]
A stochastic mixture, defined by the component family $\q(\x;\theta)$ and mixing distribution $\psi(\theta)$, is considered to be ``sampling'' if it is \textbf{unbiased} and it consists of \textbf{non-overlapping components}. \\
An \textbf{unbiased} mixture is one where $\m(\x) = \p(\x)$. \\
A mixture consists of $T$ \textbf{non-overlapping components} if $\sum_{t=1}^T\q(\x;\theta^{(t)}) \approx \max_t \q(\x;\theta^{(t)})$ with high probability.
\end{definition}

We will assume throughout this section that $\q$ is a location-scale family, and in particular Gaussian for Lemma \ref{lem:sampling_approximate_solution}, but it may be fruitful for future work to consider other families of mixture components.

% Note that the \textbf{narrow components} property says that $\q$ behaves like a Dirac delta \emph{relative to} some family of $f(\x)$ functions of interest. The more smooth $f$ is, the less narrow $\q$ can be while still (approximately) satisfying this property.

\begin{lemma}\label{lem:sampling_solution}
Sampling is an optimum of the original objective, $\L$, when $\lambda = 1$.
\end{lemma}
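\paragraph{Proof:} When $\lambda = 1$, $\L$ simplifies back to $-\KL(\m||\p)$, because $\E_{\psi(\theta)}\left[\KL(\q||\p)\right] = \MI[\x;\theta] + \KL(\m||\p)$. Any \textbf{unbiased} mixture is a minimum of $\KL(\m||\p)$, and hence a maximum of $\L$. \qed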

Note, however, that this does not imply sampling is the unique optimum. In general there may be other unbiased mixing distributions $\psi(\theta)$ such that $\m(\x)=\p(\x)$. For instance, if $\q$ is Gaussian and $\p(\x)$ is itself a finite mixture of Gaussians, then $\psi(\theta)$ could concentrate on exactly those modes in $\p$. In any case where there are two such unbiased $\psi$s, there are in fact infinitely many unbiased mixing distributions, since any mixture of them, $\alpha\psi_1(\theta) + (1-\alpha)\psi_2(\theta)$, will also be unbiased. Among all unbiased mixtures, sampling may in some sense be the worst choice -- we conjecture that it has the highest variance of all unbiased mixtures.

\begin{lemma}\label{lem:sampling_approximate_solution}
When $\q$ is Gaussian and $\lambda = 1$, the optimal $\psi$ that maximizes the approximate objective $\L_\FIM$ is both \textbf{unbiased} and has \textbf{non-overlapping components}.
\end{lemma}
In other words, Lemma \ref{lem:sampling_approximate_solution} states that the solution to the approximate objective $\L_\FIM$ ``looks like'' sampling when $\lambda=1$, in the sense of Definition \ref{def:sampling}.
\paragraph{Proof:} Without loss of generality, let us assume that $\theta$ is already parameterized in terms of its location and scale, $[\bmu, \bsigma]$, where $\bmu$ determines the mean of $\q$ and $\bsigma$ determines its covariance. Then, the Fisher Information Matrix is a block-diagonal matrix:\footnote{\url{https://en.wikipedia.org/wiki/Fisher\_information\#Multivariate_normal_distribution}}
\begin{align*}
    \FIM(\theta) = \begin{bmatrix}\FIM(\bmu) & 0 \\ 0 & \FIM(\bsigma) \end{bmatrix}
\end{align*}
where
\begin{align*}
    \FIM(\bmu) &= \Lambda \\
    \FIM(\bsigma)_{ij} &= \frac{1}{2}\text{Tr}\left(\Lambda\frac{\partial \Sigma}{\partial \bsigma_i}\Lambda\frac{\partial \Sigma}{\partial \bsigma_j}\right) \, .
\end{align*}
$\Lambda$ and $\Sigma$ are the precision matrix and covariance matrix of $\q$, respectively. Both $\Lambda$ and $\Sigma$ are functions of the parameters $\bsigma$ but not of $\bmu$. To simplify further, consider a coordinate system where the covariance of $\q$ is diagonal, and that $\bsigma_i$ is the log standard deviation of the $i$th dimension of $\x$:
\begin{align*}
    \Sigma(\bsigma)_{ij} = \begin{cases}
        e^{2\bsigma_i} &\text{if $i=j$} \\
        0 &\text{otherwise}
    \end{cases}
\end{align*}
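We emphasize that this simplification is for notational convenience only, and other parameterizations of $\Sigma(\bsigma)$ are permissible (e.g. additionally parameterizing the orientation of $\Sigma$ with a rotation matrix). With this assumption, $\FIM(\bsigma)$ becomes a constant multiple of the identity matrix (namely $2I$, since $\Lambda\frac{\partial \Sigma}{\partial \bsigma_i}$ has a single nonzero entry equal to $2$), and the log determinant of $\FIM(\theta)$ becomes, up to an additive constant that does not depend on $\theta$, simply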
\begin{align*}
    \log|\FIM(\theta)| = \log|\Lambda| \, .
\end{align*}
So, for Gaussian $\q$, the expression for $\psi$ becomes
\begin{align*}
    \log\psi(\theta) = \log\psi(\bmu,\bsigma) = \frac{1}{2}\log|\Lambda(\bsigma)| - \lambda\KL(\q(\x;\bmu,\bsigma)||\p(\x)) \, .
\end{align*}

Next, we will split $\KL(\q||\p)$ into separate entropy and cross-entropy terms:
\begin{align*}
    \KL(\q||\p) &= \E_{\q(\x;\theta)}\left[\log \q(\x;\theta) \right] - \E_{\q(\x;\theta)}\left[\log \p(\x) \right] \\
        &= -\H[\q] + \CE[\q||\p] \, .
\end{align*}
And note that when $\q$ is Gaussian, its entropy is given by
\begin{align*}
    \H[\q] = \frac{1}{2}\log|2\pi e \Sigma| = \frac{1}{2}\log|\Sigma| + \text{constants} \, .
\end{align*}
Taking $\lambda=1$ and using the fact that $\log|\Sigma|=-\log|\Sigma^{-1}|=-\log|\Lambda|$ and combining the above three equations, the $\H[\q]$ and $\log|\FIM(\bmu)|$ terms cancel in $\psi$ and we are left -- up to additive constants -- with
\begin{equation}\label{eqn:log_psi_equals_ce}
    \log\psi(\theta) = -\CE[\q||\p] = \E_{\q(\x;\bmu,\bsigma)}\left[\log \p(\x) \right] \, .
\end{equation}
To summarize, equation (\ref{eqn:log_psi_equals_ce}) says that, using Gaussian components and letting $\lambda\rightarrow 1$, our method, derived from the $\MI_\FIM$ approximation to $\MI$, selects components simply according to the \emph{cross entropy} between $\q(\x;\theta)$ and $\p(\x)$.

Note that (\ref{eqn:log_psi_equals_ce}) is not a proper distribution over $\theta$. To see this, consider any sufficiently narrow component such that $\q$ behaves like a Dirac delta, or $\E_{\q(\x;\bmu,\bsigma)}[\log\p(\x)] \approx \log\p(\bmu)$. Wherever this holds for some $\bsigma$, it will additionally hold for all \emph{narrower} components at the same $\bmu$.\footnote{There is an implicit assumption here that $\log\p(\x)$ is almost everywhere smooth, so that there is some small enough scale at which $\p(\x)$ appears locally linear under $\q$.} Therefore, below a particular scale where $\q$ behaves like a Dirac delta, (\ref{eqn:log_psi_equals_ce}) places uniform mass on the infinitely many $\q$s that are at least as narrow. This effect is visible in the top-right panel of Figure \ref{fig:mi_kl_space}. Also note that $\psi$ is only improper for $\lambda=1$; for all other $\lambda > 1$, a $(\lambda-1)\H[\q]$ term remains, and $\psi$ cannot place arbitrarily much mass on arbitrarily narrow components.

Despite its impropriety, we are free to draw samples of $\theta$ from this improper $\psi$ when $\lambda=1$ \citep{Besag1995,Hobert1996}. We will then find that with probability approaching $1$ we only ever see components that ``look like'' Dirac-deltas. This phenomenon is seen empirically in all of our experiments where we set $\lambda=1$ and run HMC dynamics drawing $\theta \sim \psi(\theta)$ (in practice, we set a lower bound on $\log\sigma$ for numerical stability). Since components will become arbitrarily narrow with high probability, we have that $\q(\x;\theta^{(j)}) \ll \q(\x;\theta^{(i)})$ in the region where $\q(\x;\theta^{(i)})$ has appreciable mass. This means that the mixture will consist of \textbf{non-overlapping components} when $\lambda=1$.

The fact that each component shrinks towards a Dirac delta with high probability then implies that the mixture will be unbiased. To see this, consider decomposing $\psi(\theta)$ into $\psi(\bsigma)\psi(\bmu|\bsigma)$. The previous paragraph establishes that the marginal distribution $\psi(\bsigma)$ will allocate effectively all samples to parts of $\theta$-space where components behave like Dirac deltas. This implies
\begin{align*}
    \log\psi(\bmu|\bsigma = \text{narrow}) &= \E_{\q(\x;\bmu,\bsigma)}\left[\log \p(\x) \right] \\
        &= \log \p(\bmu) \, .
\end{align*}
In other words, when components are narrow, the distribution of means $\bmu$ according to $\psi$ will match the true distribution $\p$. Hence, $\m(\x)$ will be a mixture of Dirac-delta-like components, each of which is chosen in proportion to the true probability of its mean, $\p(\bmu)$. This means that $\m(\x)$ will be \textbf{unbiased} when $\lambda=1$. \qed

\begin{theorem}[Improve on sampling]
If a mixture is sampling as in Definition \ref{def:sampling}, then $\frac{\rd}{\rd\lambda}\text{KL bias}=0$ and $\frac{\rd}{\rd\lambda}\text{KL variance} < 0$. Thus, $\frac{\rd}{\rd\lambda}\text{KL error}<0$.
\end{theorem}
\paragraph{Proof:} Our approach will be to calculate the variational derivatives of KL bias and KL variance with respect to $\psi$, then take the inner product (directional derivative) with the change in $\psi$ per change in $\lambda$.

First, we need the sensitivity of $\psi$ to changes in $\lambda$. Recall that the closed-form solution for $\psi$ we get from solving $\L_\FIM$ is
\begin{align*}
    \log\psi(\theta) = \frac{1}{2}\log|\FIM(\theta)| - \lambda\KL(\q(\x;\theta)||\p(\x)) - \log Z(\lambda) \, .
\end{align*}
The sensitivity of $\log\psi$ to $\lambda$ is
\begin{align*}
\frac{\rd}{\rd \lambda}\log\psi(\theta) &= -\KL(\q||\p) + \frac{1}{Z}\int_{\theta'}e^{\frac{1}{2}\log|\FIM(\theta')| - \lambda \KL(\q(\x;\theta')||\p(\x))} \KL(\q(\x;\theta')||\p(\x)) \rd\theta'  \\
    &= \E_\psi[\KL(\q||\p)] - \KL(\q||\p) \, .
\end{align*}
Converting from $\log\psi$ to $\psi$, we get
\begin{equation}\label{eqn:dpsi_dlambda}
    \frac{\rd}{\rd \lambda}\psi(\theta) = \psi(\theta)\left(\E_\psi[\KL(\q||\p)] - \KL(\q||\p)\right)
\end{equation}

Recall that we defined $\text{KL bias}=\KL(\m||\p)$ and $\text{KL variance}=\E[\KL(\m_T||\m)]$. The variational derivative of KL bias with respect to $\psi$, evaluated at $\rtheta$, is\footnote{This section is best viewed in color. Our notation uses red $\rtheta$ to indicate the value where the variational derivative is evaluated, which is distinct from black $\theta$s, which are integrated out.}
\begin{align}
\nabla_\psi \KL(\m||\p) =& \nabla_\psi \int_\x \left(\E_\psi[\q(\x;\theta)]\right)\log\frac{\left(\E_\psi[\q(\x;\theta)]\right)}{\p(\x)} \rd\x \nonumber \\
    =& \int_\x \left(\m(\x)\frac{\p(\x)}{\m(\x)}\frac{\q(\x;\rtheta)}{\p(\x)} + \q(\x;\rtheta)\log\frac{\m(\x)}{\p(\x)} \right)\rd\x \nonumber \\
    =& 1 + \E_{\q(\x;\rtheta)}\left[\log\frac{\m(\x)}{\p(\x)}\right] \, . \label{eqn:dbias_dpsi}
\end{align}
To get the sensitivity of KL bias to $\lambda$ at the point $\lambda=1$, we will take the inner-product of (\ref{eqn:dpsi_dlambda}) with (\ref{eqn:dbias_dpsi}). This is
\begin{align*}
\frac{\rd}{\rd \lambda} \text{KL bias} &= \left\langle\frac{\rd \text{KL bias}}{\rd \psi}, \frac{\rd \psi}{\rd\lambda}\right\rangle & \\
    &= \int_\rtheta \left(1 + \E_{\q(\x;\rtheta)}\left[\log\frac{\m(\x)}{\p(\x)}\right] \right) \psi(\rtheta)\left(\E_\psi[\KL(\q||\p)] - \KL({\color{red} \q}||\p)\right) \rd\rtheta & \\
    &= \int_\rtheta (1 + 0) \psi(\rtheta)\left(\E_\psi[\KL(\q||\p)] - \KL({\color{red}\q}||\p)\right) \rd\rtheta & \text{(\textbf{unbiased})} \\
    &= \E_\psi[\KL(\q||\p)] - \E_\psi[\KL(\q||\p)] & \\
    &= 0 \, . &
\end{align*}
So, we can conclude that in the sampling limit, small changes in $\lambda$ have no effect on KL bias. Geometrically, this tells us the Pareto front is tangent to the $y=x$ line in that limit, as illustrated in Figure \ref{fig:mi_kl_space}.

Next we will consider the variational derivative of KL variance with respect to $\psi$, where
\begin{align*}
    \text{KL variance} &\equiv \E_{1..T}[\KL(\m_T||\m)] \\
        &= \E_{1..T}\left[\int_\x\left(\frac{1}{T}\sum_{i=1}^T\q(\x;\theta^{(i)})\right)\log\frac{\left(\frac{1}{T}\sum_{j=1}^T\q(\x;\theta^{(j)})\right)}{\m(\x)}\rd\x\right] \, ,
\end{align*}
using the shorthand $\E_{1..T}[\ldots]$ to denote an expectation over independent draws of $\lbrace{\theta^{(t)}}\rbrace \sim \psi(\theta)$, for each of $t=\lbrace{1..T}\rbrace$. Applying the assumption of \textbf{non-overlapping components}, the sum inside the log is dominated by its maximum. We can therefore approximate KL variance as
\begin{align*}
    \text{KL variance} &\approx \E_{1..T}\left[\int_\x \frac{1}{T}\sum_{i=1}^T\q(\x;\theta^{(i)})\log\frac{\frac{1}{T}\q(\x;\theta^{(i)})}{\m(\x)}\rd\x\right] \, ,
\end{align*}
since the maximum of the sum over $j$ inside the log will be the $i$th component from outside the log. This step is most applicable for small to moderate $T$, since when $T$ grows sufficiently large, even narrow components will overlap each other with appreciable probability. Let us continue assuming that $T$ is sufficiently small and that components are sufficiently non-overlapping. By symmetry, we can remove the sum over $i$ and simplify the outer expectation to a single $\theta$:
\begin{align*}
    \ldots = \E_{i}\left[\int_\x \q(\x;\theta^{(i)})\log\frac{\frac{1}{T}\q(\x;\theta^{(i)})}{\m(\x)}\rd\x\right] \, ,
\end{align*}
which simplifies to
\begin{align*}
    \text{KL variance} \approx \MI[\x;\theta] - \log T \, .
\end{align*}

The variational derivative of $\MI[\x;\theta]$ with respect to $\psi$ is
\begin{align*}
    \nabla_\psi\MI[\x;\theta]\big\rvert_{\rtheta} &\approx \nabla_\psi\left. \int_\theta \psi(\theta) \int_\x \q(\x;\theta) \log\frac{\q(\x;\theta)}{\m(\x)}\rd\x\rd\theta \right\rvert_{\rtheta} \\
        &= -\int_\theta \psi(\theta) \int_\x \q(\x;\theta)\frac{\m(\x)}{\q(\x;\theta)}\frac{\q(\x;\theta)}{\m(\x)^2}\q(\x;\rtheta)\rd\x\rd\theta + \int_\x \q(\x;\rtheta) \log\frac{\q(\x;\rtheta)}{\m(\x)}\rd\x \\
        &= -1 + \KL(\q(\x;\rtheta)||\m(\x)) \, .
\end{align*}
Taking the inner product with $\frac{\rd}{\rd\lambda}\psi$, and applying the assumptions from the definition of sampling,
\begin{align*}
\frac{\rd}{\rd \lambda} \text{KL variance} &= \left\langle\frac{\rd \text{KL variance}}{\rd \psi}, \frac{\rd \psi}{\rd\lambda}\right\rangle & \\
    &\approx \left\langle\frac{\rd \MI[\x;\theta]}{\rd \psi}, \frac{\rd \psi}{\rd\lambda}\right\rangle& \text{(\textbf{non-overlapping})}\\
    &= \int_\rtheta \left(-1 + \KL({\color{red}\q}||\m) \right) \psi(\rtheta) \left(\E_\psi[\KL(\q||\p)] - \KL({\color{red}\q}||\p)\right) \rd\rtheta & \\
    &= \int_\rtheta \left(-1 + \KL({\color{red}\q}||\p) \right) \psi(\rtheta) \left(\E_\psi[\KL(\q||\p)] - \KL({\color{red}\q}||\p)\right) \rd\rtheta & \text{(\textbf{unbiased})}\\
    &= -\E_{\psi(\rtheta)}\left[\left(\KL({\color{red}\q}||\p) - \E_\psi[\KL(\q||\p)]\right)\KL({\color{red}\q}||\p)\right] & \\
    &= -\text{var}\left(\KL(\q||\p)\right) \, .
\end{align*}
In other words, this says that the change in the (upper bound on) KL variance is \emph{non-positive}, and its magnitude is given by the variance of the values taken by $\KL(\q||\p)$ across all $\theta$.

To summarize, we have shown that, in the sampling limit, where $\lambda=1$, we have $\frac{\rd}{\rd\lambda}\text{KL bias} = 0$ and $\frac{\rd}{\rd\lambda}\text{KL variance} \leq 0$, which proves the theorem. \qed

\subsection{VI-like behavior of our method}\label{app:vi}

\begin{definition}[VI limit]\label{def:vi_limit} We model the large $\lambda$ limit of our method using a Laplace approximation around the optimal $\thetastar=\argmin_\theta \KL(\q(\x;\theta)||\p(\x))$:
\begin{equation}\label{eqn:define_vi}
\begin{split}
    \psi(\theta) &\approx \N(\theta; \thetastar, \Sigmastar) \\
    \text{where} \qquad \Sigmastar^{-1} &= \lambda \nabla^2_\theta \KL(\q(\x;\theta)||\p(\x))\big\rvert_{\thetastar} \, .
\end{split}
\end{equation}
\end{definition}
In other words, we approximate $\psi$ by a normal distribution whose mean is $\thetastar$ and whose precision is set by the curvature of $\KL(\q(\x;\theta)||\p(\x))$ and scales with $\lambda$. We will assume, for the purposes of proofs related to the VI limit, that there is a single optimal $\thetastar$. As long as $\nabla^2\KL(\q||\p)$ is positive definite, which is guaranteed by the assumption that $\thetastar$ is unique, the accuracy of this Laplace approximation can be made arbitrarily good by considering larger and larger $\lambda$.

\begin{theorem}[Improve on VI]\label{thm:improve_vi}
Assume that $\q(\x;\thetastar)$ is poorly matched to $\p(\x)$, in the sense that $\text{Tr}\left((\nabla^2_\theta\KL(\q||\p))^{-1} \FIM \right) > |\theta|$, and that $\lambda$ is sufficiently large to use a Laplace approximation to $\psi$ around $\thetastar$. Then, there exists some finite $T_0>1$ such that for all $T\geq T_0$, $\frac{\rd}{\rd\lambda}\text{KL error}>0$.
\end{theorem}
\paragraph{Proof:} As $\lambda$ grows, the Laplace approximation in (\ref{eqn:define_vi}) becomes increasingly accurate, and increasingly narrow. Thus, for sufficiently large $\lambda$, we can accurately approximate expectations under $\psi$ using a second order Taylor approximation to the integrand. The general rule for multivariate Gaussians is
\begin{align*}
    \E_{\N(\mathbf{y};\mu,\Sigma)}[f(\mathbf{y})] \approx f(\mu) + \frac{1}{2}\text{Tr}\left(\Sigma \; \nabla^2_\mathbf{y}f\right)\big\rvert_\mu
\end{align*}
Recall that we defined KL error as $\E_{1..T}[\KL(\m_T(\x)||\p(\x))]$. Approximating each $\psi(\theta^{(t)})$ as a multivariate Gaussian, their product is also a multivariate Gaussian whose collective covariance is block-diagonal\footnote{This assumes the $T$ components are statistically independent draws from $\psi(\theta)$. The approach outlined here could be generalized to include correlation between $\theta$s in the off-block-diagonals to model variance of an autocorrelated chain of $\theta$ values.} containing $T$ copies of $\Sigmastar$ from (\ref{eqn:define_vi}), and whose collective mean is $\thetastar$ for each component. At this mean value where all $T$ components' parameters are equal to $\thetastar$, $\m_T(\x)$ becomes $\q(\x;\thetastar)$. Hence, applying the Taylor series approximation to KL error, the $f(\mu)$ term is just $\KL(\q(\x;\thetastar)||\p(\x))$. The second term is
\begin{align*}
    \frac{1}{2}\text{Tr}\left(
        \begin{bmatrix}
            \Sigmastar &  &  & 0  \\
             & \Sigmastar & &  \\
             &  & \ddots & \\
             0 &  &  & \Sigmastar
        \end{bmatrix}
        \nabla^2_{\theta^{(1)},\ldots,\theta^{(T)}}\KL(\m_T||\p)
    \right) \, .
\end{align*}
First, note that the zeros in the off-block-diagonal terms on the left mean that we can ignore interactions between $\theta$s across different mixture components in the Hessian term on the right. Second, there is $T$-fold symmetry between all components. So, this simplifies to
\begin{align*}
    \frac{T}{2}\text{Tr}\left(\Sigmastar \; \nabla^2_{\theta^{(1)}}\KL(\m_T||\p) \right) = \frac{T}{2\lambda}\text{Tr}\left((\nabla^2_\theta\KL(\q||\p))^{-1} \; \nabla^2_{\theta^{(1)}}\KL(\m_T||\p) \right) \, .
\end{align*}

Next, since this Hessian is being evaluated around the point $\thetastar$, all of $\theta^{(2)},\ldots,\theta^{(T)}$ are equal to $\thetastar$, and we can write the mixture as a function only of the component parameters we are varying in the Hessian. Call this mixture, with $T-1$ components set to the variational solution, ``$\m_T^*$,'' defined as
\begin{align*}
    \m_T^*(\x;\theta) = \frac{T-1}{T}\q(\x;\thetastar) + \frac{1}{T}\q(\x;\theta) \, .
\end{align*}

Next we will calculate the Hessian of $\KL(\m_T^*(\x;\theta)||\p(\x))$. Note that the derivatives are with respect to $\theta$, not $\thetastar$. First, the Hessian of $\KL(\q||\p)$ is
\begin{align}
    \frac{\partial^2}{\partial\theta_j\partial\theta_i} \KL(\q(\x;\theta)||\p(\x)) &= \frac{\partial^2}{\partial\theta_j\partial\theta_i} \int_\x\q(\x;\theta)\log\frac{\q(\x;\theta)}{\p(\x)}\rd\x \nonumber \\
        &= \frac{\partial}{\partial\theta_j} \int_\x\left[\left(\frac{\partial}{\partial\theta_i}\q(\x;\theta)\right)\left(1 +  \log\frac{\q(\x;\theta)}{\p(\x)} \right)\right]\rd\x \nonumber \\
        &= \int_\x\left[\left(\frac{\partial}{\partial\theta_i}\q(\x;\theta)\right)\left(\frac{\frac{\partial}{\partial\theta_j}\q(\x;\theta)}{\q(\x;\thetastar)} \right) + \left(\frac{\partial^2}{\partial\theta_i\partial\theta_j}\q(\x;\theta)\right)\left(1 +  \log\frac{\q(\x;\thetastar)}{\p(\x)} \right)\right]\rd\x \nonumber \\
        (*) &= \int_\x \frac{\left(\frac{\partial}{\partial\theta_i}\q(\x;\theta)\right)\left(\frac{\partial}{\partial\theta_j}\q(\x;\theta)\right)}{\q(\x;\thetastar)} \rd\x + \int_\x \left(\frac{\partial^2}{\partial\theta_i\partial\theta_j}\q(\x;\theta)\right)\log\frac{\q(\x;\thetastar)}{\p(\x)} \rd\x \nonumber \\
        &= \FIM(\thetastar) + M(\thetastar) \label{eqn:hess_kl_q_p} \,  .
\end{align}
In line $(*)$ we used the fact that $\int_\x \nabla^2_\theta \q(\x;\theta) \rd\x = \nabla^2_\theta \int_\x \q(\x;\theta) \rd\x = \nabla^2_\theta 1 = 0$. In the last line, we recognized the first term as the Fisher Information Matrix $\FIM(\thetastar)$, and we have defined $M(\theta) = \int_\x\left(\frac{\partial^2}{\partial\theta_i\partial\theta_j}\q(\x;\theta)\right)\log\frac{\q(\x;\theta)}{\p(\x)}\rd\x$ as a placeholder.

Following a similar derivation, the Hessian of $\KL(\m_T^*(\x;\theta)||\p(\x))$ is
\begin{align}
    \frac{\partial^2}{\partial\theta_j\partial\theta_i} & \KL(\m_T^*(\x;\theta)||\p(\x)) = \frac{\partial^2}{\partial\theta_j\partial\theta_i} \int_\x\left(\frac{T-1}{T}\q(\x;\thetastar)+\frac{1}{T}\q(\x;\theta)\right)\log\frac{\left(\frac{T-1}{T}\q(\x;\thetastar)+\frac{1}{T}\q(\x;\theta)\right)}{\p(\x)}\rd\x \nonumber \\
        &= \frac{\partial}{\partial\theta_j} \int_\x\left[\frac{1}{T}\left(\frac{\partial}{\partial\theta_i}\q(\x;\theta)\right) + \frac{1}{T}\left(\frac{\partial}{\partial\theta_i}\q(\x;\theta)\right)\log\frac{\left(\frac{T-1}{T}\q(\x;\thetastar)+\frac{1}{T}\q(\x;\theta)\right)}{\p(\x)} \right]\rd\x \nonumber \\
        &= \frac{1}{T} \frac{\partial}{\partial\theta_j} \int_\x\left[\left(\frac{\partial}{\partial\theta_i}\q(\x;\theta)\right)\left(1 + \log\frac{\left(\frac{T-1}{T}\q(\x;\thetastar)+\frac{1}{T}\q(\x;\theta)\right)}{\p(\x)}\right) \right]\rd\x \nonumber \\
        &= \frac{1}{T} \int_\x\left[\left(\frac{\partial}{\partial\theta_i}\q(\x;\theta)\right)\left(\frac{\frac{1}{T}\frac{\partial}{\partial\theta_j}\q(\x;\theta)}{\m_T^*(\x;\theta)}\right) + \left(\frac{\partial^2}{\partial\theta_i\partial\theta_j}\q(\x;\theta)\right)\left(1 + \log\frac{\m_T^*(\x;\theta)}{\p(\x)}\right) \right]\rd\x \nonumber \\
   (**) &= \frac{1}{T^2}\int_\x\left(\frac{\frac{\partial}{\partial\theta_i}\q(\x;\theta)\frac{\partial}{\partial\theta_j}\q(\x;\theta)}{\q(\x;\thetastar)}\right)\rd\x + \frac{1}{T}\int_\x\left(\frac{\partial^2}{\partial\theta_i\partial\theta_j}\q(\x;\theta)\right)\log\frac{\q(\x;\thetastar)}{\p(\x)}\rd\x \nonumber \\
        &= \frac{1}{T^2}\FIM(\thetastar) + \frac{1}{T} M(\thetastar) \nonumber \\
        &= \frac{1}{T}\left(\FIM(\thetastar) + M(\thetastar)\right) + \left(\frac{1}{T^2} - \frac{1}{T}\right)\FIM(\thetastar) \nonumber \\
        &= \frac{1}{T}\frac{\partial^2}{\partial\theta_j\partial\theta_i} \KL(\q(\x;\theta)||\p(\x)) + \left(\frac{1-T}{T^2}\right)\FIM(\thetastar) \, .
        \label{eqn:hess_kl_mt_p}
        % (**) &= \frac{1}{T^2}\FIM(\thetastar) + \frac{1}{T}\int_\x\q(\x;\thetastar)\left(\FIM(\thetastar) + \left(\frac{\partial}{\partial\theta_i}\log\q(\x;\theta)\right)\left(\frac{\partial}{\partial\theta_j}\log\q(\x;\theta)\right)\right)\log\frac{\q(\x;\thetastar)}{\p(\x)}\rd\x 
\end{align}
Here, in $(**)$, we additionally used the fact that $\m_T^*(\x;\thetastar)=\q(\x;\thetastar)$. We then wrote the final line in terms of the Hessian of $\KL(\q||\p)$ in (\ref{eqn:hess_kl_q_p}).
% In $(**)$, we used the definition of the Fisher Information Matrix for the first term. For the second term, we used the identity
% \begin{align*}
%     \frac{\partial^2}{\partial\theta_i\partial\theta_j}\q(\x;\theta) = \q(\x;\theta)\left(\frac{\partial^2}{\partial\theta_i\partial\theta_j}\log\q(\x;\theta) + \left(\frac{\partial}{\partial\theta_i}\log\q(\x;\theta)\right)\left(\frac{\partial}{\partial\theta_j}\log\q(\x;\theta)\right)\right)
% \end{align*}
% as well as the assumption that $\theta$ are natural parameters to substitute $\FIM(\theta)$ for the $\frac{\partial^2}{\partial\theta_i\partial\theta_j}$ term. Combining terms, this is
% \begin{align}
% \frac{\partial^2}{\partial\theta_j\partial\theta_i} & \KL(\m_T^*(\x;\theta)||\p(\x)) = \frac{\FIM(\thetastar)}{T}\left(\frac{1}{T} + \KL(\q(\x;\thetastar)||\p(\x))\right) + \frac{1}{T}\int_\x\q(\x;\theta)\left(\frac{\partial}{\partial\theta_i}\log\q(\x;\theta)\right)\left(\frac{\partial}{\partial\theta_j}\log\q(\x;\theta)\right)\log\frac{\q(\x;\thetastar)}{\p(\x)}\rd\x 
%         \label{eqn:hess_kl_mt_p} \, .
% \end{align}

To summarize, near the variational limit we have that the KL error is approximately
\begin{align*}
    \KL(\q(\x;\thetastar)||\p(\x)) + \frac{T}{2\lambda}\text{Tr}((\underbrace{\nabla^2_\theta\KL(\q||\p)}_{(\ref{eqn:hess_kl_q_p})})^{-1} \; (\underbrace{\nabla^2_\theta\KL(\m_T^*||\p)}_{(\ref{eqn:hess_kl_mt_p})})) \, ,
\end{align*}
and we found that (\ref{eqn:hess_kl_mt_p}) could be written in terms of (\ref{eqn:hess_kl_q_p}). To reduce clutter temporarily, let $\mathbf{H} = \nabla^2_\theta\KL(\q||\p)$. Combining terms, we have
\begin{align*}
    \text{KL error} &\approx \KL(\q(\x;\thetastar)||\p(\x)) + \frac{T}{2\lambda}\text{Tr}\left( \mathbf{H}^{-1} \left(\frac{1}{T}\mathbf{H} + \frac{1-T}{T^2}\FIM(\thetastar)\right)\right) \\
        &= \KL(\q(\x;\thetastar)||\p(\x)) + \frac{1}{2\lambda}\text{Tr}\left(\mathbf{I} + \frac{1-T}{T}\mathbf{H}^{-1} \FIM\right) \\
        &= \KL(\q(\x;\thetastar)||\p(\x)) + \frac{d}{2\lambda} - \frac{1}{2\lambda}\text{Tr}\left(\frac{T-1}{T}\mathbf{H}^{-1} \FIM\right)
\end{align*}
where $\mathbf{I}$ is the identity matrix and $d=\text{Tr}(\mathbf{I})$ is the dimensionality of $\theta$.
Consider the case where $T=1$: the KL error simplifies to $\KL(\q(\x;\thetastar)||\p(\x)) + \frac{d}{2\lambda}$. Therefore when $T=1$, KL error is only reduced by further increasing $\lambda$. This is an intuitive result: we cannot reduce bias compared to VI when using a single component, and any stochasticity only adds variance.

Now consider the case where $T \geq 2$. We are interested in cases where KL error \emph{increases} with $\lambda$ near the VI limit, as this would imply that using a finite $\lambda$ would improve on VI. This is equivalent to asking when the following inequality holds:
\begin{align*}
    \frac{\text{Tr}\left(\mathbf{H}^{-1} \FIM\right)}{d} > \frac{T}{T-1} \, .
\end{align*}
Recall that $\mathbf{H}$ was defined as the Hessian of $\KL(\q(\x;\theta)||\p(\x))$, so this is
\begin{align*}
    \frac{\text{Tr}\left((\nabla^2_\theta\KL(\q||\p))^{-1} \FIM \right)}{d} > \frac{T}{T-1} \, .
\end{align*}
The Fisher Information Matrix can also be derived from a local quadratic approximation to $\KL(\q||\q)$; this means that in the case where the VI solution is exact, or $\q(\x;\thetastar)=\p(\x)$, the trace term becomes $\text{Tr}(\FIM^{-1}\FIM)$, and the inequality is $1 > \frac{T}{T-1}$. This inequality is not satisfied by any positive integer $T$, and so this expression captures the intuitive condition that VI cannot be improved upon by reducing $\lambda$ -- for any finite $T$ -- if the VI solution is already exact.

Conversely, the ratio \[R \equiv \frac{\text{Tr}\left((\nabla^2_\theta\KL(\q||\p))^{-1} \FIM \right)}{d}\] may be thought of as quantifying the extent to which $\p(\x)$ is over-dispersed relative to the VI solution $\q(\x;\thetastar)$. If the curvature of $\KL(\q||\p)$ is low, then many ``nearby'' $\q$s would fit $\p$ almost as well, and this will be reflected in this ratio being larger than 1. Then, the minimum $T$ for which reducing $\lambda$ improves KL error relative to VI can be found by solving the above inequality for $T$, which gives
\begin{equation}\label{eqn:t0}
    T_0 = \left\lceil{\frac{R}{R - 1}}\right\rceil \, ,
\end{equation}
so that for all $T>T_0$, we have the desired property that $\frac{\rd}{\rd\lambda}\text{KL error} > 0$, which implies that using some finite $\lambda < \infty$ will reduce error relative to VI. \qed

%We \textbf{conjecture} that this ratio is always greater than $1$ whenever $\p(\x)$ is heavier-tailed than $\q(\x;\thetastar)$. Since $\frac{T}{T-1}$ approaches $1$ from above in the limit of large $T$, this implies that there will be some finite $T_0$ where $\frac{T_0}{T_0-1}$ is less than the ratio in (\ref{eqn:ratio_hessian_match}), and that such a $T_0$ will be reached sooner the worse $\q(\x;\thetastar)$ locally approximates $\p$. \qed

\section{Numerical Details}
\label{app:numerical_details}

\begin{figure*}[ht]
    \centering
    \includegraphics{figures/figure_bias_variance_extras}
    \caption{Bias/variance decomposition across many random $f$s (all with $\alpha=-1.5$) for additional test problems, plotted in the same format as Figure 3d-f. Each row corresponds to a different inference problem. We chose $T$ separately for each problem such that the variance of NUTS and bias of ADVI were of similar orders of magnitude.
    \textbf{\texttt{mix laplace}} is the 1D mixture of two Laplace distributions shown in Figure 2 of the main paper.
    \textbf{\texttt{cigar}} is a 2D Gaussian with a correlation of 0.99, oriented along the $y=x$ axis.
    \textbf{\texttt{garch11}}, \textbf{\texttt{eight\_schools}}, and \textbf{\texttt{arK}} refer to problems taken from posteriordb \citep{posteriordb}.
    \textbf{\texttt{lasso regression}} is a regression problem with 30 regressors and a double-exponential (Laplace) prior on each regression weight. We report the variance of ADVI using all default settings, which fails to converge reliably for this problem, and hence has significant variance. Optimizing the ADVI settings for this problem could potentially result in much lower variance.
    }
    \label{fig:posteriordb}
\end{figure*}

All code to generate the figures in this paper is available at \url{https://github.com/wrongu/sampling-variational-demos}. 

We implemented (\ref{eqn:log_psi_stams}) in Stan \citep{Carpenter2017}. For $\q$, we used the family of multivariate Gaussians with diagonal covariance, parameterized as $\theta=[\mu_1, \ldots, \mu_n, \log\sigma_1, \ldots, \log\sigma_n]$ where $n$ is the number of unconstrained parameters (i.e., the dimensionality of $\x$). In this parameterization, $\frac{1}{2}\log|\FIM(\theta)|$ is simply $-\sum_{i=1}^n\log\sigma_i$ up to an additive constant. We sampled $\theta$ from $\psi(\theta)$ using Stan's default implementation of the No-U-Turn Sampler (NUTS) with automatic step-size adaptation \citep{Hoffman2014}, and we set the mass equal to $\lambda$ times the identity matrix. NUTS requires both $\KL(\q||\p)$ and its gradient, which we computed using Monte Carlo samples from $\q$ and the reparameterization trick. The reparameterized samples were frozen for each trajectory of NUTS and resampled between trajectories.

We used two toy distributions in the main paper:
\begin{itemize}
    \item The ``banana'' distribution over $\mathbb{R}^2$, defined as
    \begin{align*}
        \log \p(x,y) = -(y-(x/2)^2)^2 - (x/2)^2 \, .
    \end{align*}
    \item The ``Laplace mixture'' distribution over $\mathbb{R}^1$, defined as
    \begin{align*}
        \p(x) \propto 0.4e^{-\frac{|x+1.5|}{0.75}} + 0.6e^{-\frac{|x-1.5|}{0.75}} \, .
    \end{align*}
\end{itemize}

We also tested our method on three reference problems taken from posteriordb \citep{posteriordb}, a database of reference problems for testing and validating inference methods. These were \texttt{arK}, \texttt{eight schools centered}, and \texttt{garch11}. These are 7-, 10-, and 4-dimensional problems, respectively. Finally, we synthesized ground-truth data from a hierarchical regression problem with 30 regressors (a total of 32 parameters) to test how our algorithm would scale to an even higher dimensional problem. Results for all of these additional experiments are shown in Figure~\ref{fig:posteriordb}.

In our experiments, all functions integrated are sums of sinusoids, \[f(\x)=\sum_{\omega=1}^N a \sin(\omega \randomt^\top\x+\phi_\omega) \, ,\] where $\randomt$ is a random unit vector. This is a convenient class of test functions, as the integral of a sinusoid under a Gaussian is known analytically:
\[
\int_\x \sin(\omega \randomt^\top\x+\phi_\omega) \mathcal{N}(\x;\mu, \Sigma) \rd\x = \sin(\omega \randomt^\top\mu+\phi_\omega)\exp\left(-\frac{\omega^2}{2} \randomt^\top\Sigma \randomt\right) \, .
\]
The capability for exact integration of $\int_\x\m_T(\x)f(\x)\rd\x$ ensures that no additional variance is introduced in plots; all variance is due to the selection of the components $\q$. In general this integral can be computed with MC methods or, in low enough dimensions, Gaussian quadrature.

In our experiments (Figures \ref{fig:bias_variance} and \ref{fig:wiggliness}) we used $N=100$ sinusoidal components in $f(\x)$, and calculated bias using $T=5,000$ components thinned from 4 MCMC chains of length $50,000$.
To calculate variance, we subsampled $T=10$ components from these chains, and computed variance over these random instantiations of $\m_{10}(\x)$.
The NUTS samples over $\x$ treated as ground truth derive from 4 chains of length 1,000,000. Because variance scales like $1/T$, we estimated variance for other values of $T$ simply by scaling the variance from the $T=10$ case. We chose $T=30$ (scaling variance from $T=10$ by a factor of $1/3$) for the plots using the banana distribution in the main paper because this gave roughly equal magnitude to the variance of NUTS and the bias of ADVI, which makes the effects of the trade-off between them most pronounced. In Figure \ref{fig:posteriordb}, we chose $T$ separately for each problem using the same strategy.

\section{Detailed Comparison to Prior Work}

\newcommand{\green}{\cellcolor[RGB]{96,224,96}}
\newcommand{\yellow}{\cellcolor[RGB]{224,224,96}}
\newcommand{\red}{\cellcolor[RGB]{224,96,96}}

{\tiny
\begin{table}[ht]
    \centering
    \begin{tabular}{|p{.2\linewidth}|p{.133\linewidth}|p{.133\linewidth}|p{.133\linewidth}|p{.133\linewidth}|p{.133\linewidth}|}
    \hline
         \bf Author (Year)                                   & \bf Component parameters      & \bf Component family & \bf Auxiliary Optimization     & \bf Time; Space Complexity                    & \bf Recovers Sampling \\ \hline \hline
         \cite{Jaakkola1998} ``Mixture of Mean Field''       & \green optimized              & \yellow mean field   & \yellow minor                  & \red $\mathcal{O}(T^2)$; $\mathcal{O}(T)$     & \yellow $T\rightarrow\infty$ \\ \hline
         \cite{Salimans2015} ``Bridging the Gap''            & \yellow sampled (implicit)    & \green flexible      & \red NN optimization (k steps) & \yellow $\mathcal{O}(k+T)$; $\mathcal{O}(T)$  & \red unknown \\ \hline
         \cite{Gershman2012b} ``Nonparametric VI''           & \green optimized              & \red Gaussian        & \green none                    & \red $\mathcal{O}(T^2)$; $\mathcal{O}(T)$     & \yellow $T\rightarrow\infty$ \\ \hline
         \cite{Zobay2014} ``VI with Gaussian Mixtures''      & \green optimized              & \red Gaussian        & \green none                    & \red $\mathcal{O}(T^2)$; $\mathcal{O}(T)$     & \yellow $T\rightarrow\infty$ \\ \hline
         \cite{Guo2016,Miller2017} ``Boosting VI''           & \green optimized              & \green flexible      & \green none                    & \red $\mathcal{O}(T^2)$; $\mathcal{O}(T)$     & \red no \\ \hline
         \cite{Nalisnick2017} ``Stein Mixtures''             & \green optimized              & \green flexible      & \green none                    & \red $\mathcal{O}(T^2)$; $\mathcal{O}(T)^*$   & \yellow $T\rightarrow\infty$ \\ \hline
         \cite{Yin2018} ``SIVI''                             & \yellow sampled (implicit)    & \yellow Gaussian$^*$    & \red NN optimization (k steps) & \red $\mathcal{O}(kT^2)$; $\mathcal{O}(T)^*$ & \red unknown \\ \hline
         \cite{Acerbi2018,Acerbi2020} ``VBMC''               & \green optimized              & \red Gaussian        & \yellow fit GP (k steps)       & \red $\mathcal{O}(kT^2)$; $\mathcal{O}(T)$   & \red no \\ \hline
         \bf Ours                                            & \yellow sampled (closed-form) & \yellow Gaussian$^*$    & \green none                    & \green $\mathcal{O}(T)$; $\mathcal{O}(T)^*$   & \green $\lambda\rightarrow 1$\\ \hline
         % \cite{Arenz2018}                                  &                               &                      &                                &                                               & \\ \hline
         % \cite{Ranganath2016}                              &                               & any                  & yes                            &                                               & \\ \hline
    \end{tabular}
    \caption{Comparison of our proposed algorithm to a number of existing methods that, in some way or another, use a mixture of ``simple'' component distributions for approximate inference.
        \textbf{Component parameters:} how are the parameters of the individual mixture components chosen? In terms of minimizing bias and variance, it is best to jointly optimize the location of all $T$ components together. The approach of \cite{Salimans2015} and \cite{Yin2018} is similar to ours in that mixture components are stochastically sampled, though ours is the only stochastic method for which we have the mixing distribution $\psi(\theta)$ explicitly in closed form.
        \textbf{Component family:} what is the allowable form of $\q(\x;\theta)$? Methods marked ``Gaussian$^*$'' (including ours) are in principle applicable to a wider class of distributions, but so far only demonstrated empirically using Gaussian mixtures.
        \textbf{Auxiliary optimization:} Are there additional parameters of the inference process itself that need to be fit or optimized at inference-time? The advantage of our method is that, since we derived $\psi(\theta)$ in closed form, we can begin sampling from it immediately without further optimizations.
        \textbf{Time; Space Complexity:} Note that all methods for which component parameters are ``optimized'' incur an $\mathcal{O}(T^2)$ cost in time complexity, since the optimal location of each component depends on the location of other components. Methods that require auxiliary optimization (such as training a neural network or NN) incur additional runtime costs. Methods marked $\mathcal{O}(T)^*$ space-complexity can in principle be ``streamed,'' in which case they use constant $\mathcal{O}(1)$ space.
        \textbf{Recovers sampling:} some methods ``look like'' sampling (unbiased and with narrow components) only in the $T\rightarrow\infty$ limit, which is infeasible given $\mathcal{O}(T^2)$ time complexity. Ours is the only method we know of for which sampling-like behavior can be recovered, independent of $T$, by setting $\lambda = 1$.
    }
    \label{tbl:prior_work}
\end{table}
}

\bibliography{references-mendeley-group,additional-references}

\end{document}
