\documentclass[accepted,x11names]{uai2022} % for initial submission
% \documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent 
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage[numbers]{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{array}
\usepackage{ragged2e}
\newcolumntype{P}[1]{>{\RaggedRight\hspace{0pt}}p{#1}}
\usepackage{xr-hyper}
% For theorems and such  
\usepackage{lipsum}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{wrapfig}
\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors
\usepackage{graphicx}
\usepackage{placeins}
\usepackage{enumitem}
\usepackage{wrapfig}
\usepackage{subcaption}
\usepackage[noend]{algorithm2e}


%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\newcommand{\figref}[1]{Fig. \ref{#1}}
\newcommand{\secref}[1]{\S \ref{#1}}
\newcommand{\E}[0]{\mathbb{E}}
\newcommand{\LL}[0]{\mathcal{L}}
\newcommand{\UU}[0]{\mathcal{U}}
\newcommand{\grad}[0]{\nabla_{\theta}}

\makeatletter
\newcommand{\removelatexerror}{\let\@latex@error\@gobble}
\makeatother

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% THEOREMS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}

\hypersetup{
    colorlinks=true,
    urlcolor=black,
    linkcolor=cyan,
    citecolor=SpringGreen3,
    linkbordercolor=white,
}
\externaldocument{lew_657}

\title{Recursive Monte Carlo and Variational Inference with Auxiliary Variables (Supplemental Material)}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<alexlew@mit.edu>?Subject=Your UAI 2022 paper}{Alexander~K.~Lew}{}}
\author[1]{\href{mailto:<marcoct@mit.edu>?Subject=Your UAI 2022 paper}{Marco~Cusumano-Towner}{}}
\author[1]{\href{mailto:<vkm@mit.edu>?Subject=Your UAI 2022 paper}{Vikash~K.~Mansinghka}{}}
% \author[3]{Further~Coauthor}
% \author[1]{Further~Coauthor}
% \author[3]{Further~Coauthor}
% \author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
    Massachusetts Institute of Technology\\
    Cambridge, Massachusetts, USA
}
% \affil[2]{%
%     Second Affiliation\\
%     Address\\
%     …
% }
% \affil[3]{%
%     Another Affiliation\\
%     Address\\
%     …
%   }
  
\begin{document}
\onecolumn

\maketitle

\appendix

\section*{Supplementary Material for ``Recursive Monte Carlo and Variational Inference with Auxiliary Variables''}

This document and the accompanying code files contain supplementary material for the submission ``Recursive Monte Carlo and Variational Inference with Auxiliary Variables.'' In particular, we provide:

\begin{enumerate}
    \item In Section~\ref{sec:proofs}, \textbf{proofs} of Theorems 1--4.
    
%    \item In Section~\ref{sec:experimental-details}, \textbf{experimental details} on all experiments appearing in the main paper, including model details, inference algorithm details, and computational resources used.
    
    \item In Section~\ref{sec:appendix-examples}, \textbf{RAVI inference strategies for many existing algorithms}.
    
    \item In Section~\ref{sec:even-odd}, a further discussion of the \textbf{absolute continuity requirements} for RAVI and how they can be relaxed.
    
    \item In Section~\ref{sec:other-applications}, \textbf{other applications} of RAVI inference strategies, to parameterize rejection sampling and KL divergence estimation algorithms.
\end{enumerate}

\section{Omitted Proofs}
\label{sec:proofs}

Throughout this section, we use the notation introduced in Section~\ref{sec:theory}: the random variable $\hat{Z}(\tilde{\pi}, \mathcal{S})$ is the weight returned by $\texttt{IMPORTANCE}(\tilde{\pi}, \mathcal{S})$, and $\check{Z}(\tilde{\pi}, \mathcal{S})$ is the reciprocal of the weight returned by $\texttt{HME}(\tilde{\pi}, x, \mathcal{S})$, for $x \sim \pi$.


\subsection{Proof of Theorem 1}

\textbf{Theorem 1.} \textit{
    Let $\tilde{\pi}(x) = Z\pi(x)$ be an unnormalized target density, and $\mathcal{S}$ an inference strategy targeting $\pi(x)$. Then:
    \begin{itemize}
        \item $\texttt{IMPORTANCE}(\mathcal{S}, \tilde{\pi})$ generates $(x, \hat{Z})$ with $x \sim \mathcal{S}.q$ and $\mathbb{E}[\hat{Z} \mid x] = Z\frac{\pi(x)}{\mathcal{S}.q(x)}$. Furthermore, the unconditional expectation $\mathbb{E}[\hat{Z}(\tilde{\pi}, \mathcal{S})] = Z$.
        \item $\mathbb{E}[{\check{Z}}(\tilde{\pi}, \mathcal{S})^{-1}] = \mathbb{E}_{x \sim \pi}[\texttt{HME}(\mathcal{S}, x, \tilde{\pi})] = Z^{-1}.$
    \end{itemize}
}

\textbf{Proof.} The proof is by induction on the level of nesting 
present in the inference strategy. 

First consider the case where $\mathcal{S}.q$ has a tractable 
marginal density. Then:
\begin{itemize}
    \item \texttt{IMPORTANCE} samples $x \sim \mathcal{S}.q$ on line 2, and 
computes $\hat{Z} = \frac{\tilde{\pi}(x)}{\mathcal{S}.q(x)} = Z\frac{\pi(x)}{\mathcal{S}.q(x)}$ exactly (lines 3 and 7). By the standard importance sampling argument, the unconditional expectation $\mathbb{E}[\hat{Z}(\tilde{\pi}, \mathcal{S})] = \mathbb{E}_{x \sim \mathcal{S}.q}[Z\frac{\pi(x)}{\mathcal{S}.q(x)}]=Z\mathbb{E}_{x \sim \pi}[1] = Z$. (This argument relies on the fact that, because $\mathcal{S}$ targets $\pi$, $\pi$ is absolutely continuous with respect to $\mathcal{S}.q$.)
    \item 
$\texttt{HME}(\mathcal{S}, x, \tilde{\pi})$ returns exactly $\frac{\mathcal{S}.q(x)}{\tilde{\pi}(x)}$ (lines 2 and 5), and $$\mathbb{E}_{x \sim \pi}\left[\frac{\mathcal{S}.q(x)}{\tilde{\pi}(x)}\right] = \int \pi(x) \frac{\mathcal{S}.q(x)}{Z\pi(x)}\text{d}x = \frac{1}{Z} \int \mathcal{S}.q(x) \text{d}x = \frac{1}{Z},$$ where the last step follows because $\mathcal{S}.q$ is a normalized probability density, and $\mathcal{S}.q$ is absolutely continuous with respect to $\pi$.
\end{itemize}

Now consider the inductive step. Assume $\mathcal{S}.q(x) = \int \mathcal{S}.q(r, x)\text{d}r$ and that for all $x$, the theorem holds for the inference strategy $\mathcal{S}.\mathcal{M}(x)$ and the unnormalized target distribution $\mathcal{S}.q(\cdot, x)$. In this case:

\begin{itemize}
    \item On line 5, \texttt{IMPORTANCE} generates $x \sim \mathcal{S}.q$ and $r \sim \mathcal{S}.q(r \mid x)$. In the call to $\texttt{HME}$, the unnormalized target distribution is $\mathcal{S}.q(\cdot, x)$, and so the normalizing constant is $\mathcal{S}.q(x)$ and the normalized target is $\mathcal{S}.q(r \mid x)$. By the inductive hypothesis, the call to $\texttt{HME}$ on line 6 returns an unbiased estimate of the normalizing constant's reciprocal, i.e. $\mathbb{E}[w \mid x] = \frac{1}{\mathcal{S}.q(x)}$. Since \texttt{IMPORTANCE} returns $\hat{Z} = w\tilde{\pi}(x)$ on line 7, this implies that $\mathbb{E}[\hat{Z} \mid x] = \frac{\tilde{\pi}(x)}{\mathcal{S}.q(x)} = Z\frac{\pi(x)}{\mathcal{S}.q(x)}$. From this, the same standard importance sampling argument as above shows that the unconditional expectation $\mathbb{E}[\hat{Z}(\tilde{\pi}, \mathcal{S})] = Z$.
    \item On line 4, \texttt{HME} calls \texttt{IMPORTANCE} on the 
    unnormalized target $\mathcal{S}.q(\cdot, x)$, and so by the inductive hypothesis, $\mathbb{E}[w] = \mathcal{S}.q(x)$ (the normalizing constant). On line 5, the returned weight has expectation $\mathbb{E}_{x \sim \pi}\left[\frac{w}{\tilde{\pi}(x)}\right]=\frac{1}{Z} \int \pi(x) \cdot \frac{\mathcal{S}.q(x)}{\pi(x)} \text{d}x = \frac{1}{Z}$, where the last equality again follows because $\mathcal{S}.q$ is a normalized density, and $\mathcal{S}.q$ is absolutely continuous with respect to $\pi$.
\end{itemize}

% As for \texttt{HME}, for $w$ generated on L4, we have $$\mathbb{E}[w] = \mathbb{E}_{r \sim \mathcal{S}.\mathcal{M}(x).q}\left[\mathbb{E}[w \mid r]\right] = \mathbb{E}_{r \sim \mathcal{S}.\mathcal{M}(x).q}\left[\mathcal{S}.q(x) \frac{\mathcal{S}.q(r \mid x)}{\mathcal{S}.\mathcal{M}.q(r)}\right] = \mathcal{S}.q(x),$$ where the second step follows from the inductive hypothesis, and the third by the fact that $\mathcal{S}.q(r \mid x)$ is a normalized probability density. The return value 
% of $\texttt{HME}$ is $\frac{w}{\tilde{\pi}(x)}$, and 
% $$\mathbb{E}_{x \sim \pi(x)}\left[\frac{w}{\tilde{\pi}(x)}\right]$$

% \textbf{Lemma.} Let $\tilde{\pi}(x) = Z\pi(x)$ be an unnormalized target density and $q(x)$ a proposal distribution, with $\pi(x)$ and $q(x)$ mutually absolutely continuous. Then $\mathbb{E}_{x \sim \pi}[\frac{q(x)}{\tilde{\pi}(x)}] = \frac{1}{Z}$ and $\mathbb{E}_{x\sim q}[\frac{\tilde{\pi}(x)}{q(x)}] = Z$.

% \textbf{Theorem 1.} \textit{
%     Let $\tilde{\pi}(x) = Z\pi(x)$ be an unnormalized target density, and $\mathcal{S}$ an inference strategy targeting $\pi(x)$. Then Algorithm 1 implements a properly weighted sampler for $\pi$, and Algorithm 2 transforms samples $x \sim \pi$ into unbiased estimates of $\frac{1}{Z}$.
% }

% \textbf{Theorem 1.} \textit{Given an unnormalized density $\tilde{\pi}(x) = Z\pi(x)$ and an inference strategy $\mathcal{S}$ targeting $\pi(x)$, Algorithm 1 generates $x \sim \mathcal{S}.q$ and a weight $\hat{Z}$ such that $\mathbb{E}[\hat{Z} \mid x] = Z \frac{\pi(x)}{\mathcal{S}.q(x)}$. If additionally given a sample $x \sim \pi$, Algorithm 2 generates a weight $\check{Z}$ such that $\mathbb{E}[\frac{1}{\check{Z}}] = \frac{1}{Z}$.}

% \textbf{Proof.} The proof is by induction on the level of nesting present in the inference strategy $\mathcal{S}$. First consider the case where $\mathcal{S}.q$ has a tractable density and no $\mathcal{S}.\mathcal{M}$ is needed. Then:

% \begin{itemize}
%     \item Algorithm 1 generates $x \sim \mathcal{S}.q$ in its first step, and returns $\frac{\tilde{\pi}(x)}{\mathcal{S}.q(x)} = Z\frac{\pi(x)}{\mathcal{S}.q(x)}$ exactly.
%     \item Algorithm 2 returns $\frac{\mathcal{S}.q(x)}{\tilde{\pi}(x)} = \frac{1}{Z} \frac{\mathcal{S}.q(x)}{\pi(x)}$. In expectation over $x \sim \pi$, we have $\mathbb{E}_{x \sim \pi}[\frac{1}{Z} \frac{\mathcal{S}.q(x)}{\pi(x)}] = \frac{1}{Z} \int \mathcal{S}.q(x) \text{d}x = \frac{1}{Z}.$
% \end{itemize}

% Now assume the theorem holds for the strategy $\mathcal{S}.\mathcal{M}(x)$, for all $x$, and consider $\mathcal{S}$ itself. Algorithm 1 samples $(r, x) \sim \mathcal{S}.q$, so $r$ is distributed as $\mathcal{S}.q(r \mid x)$. By the inductive hypothesis, the call to Algorithm 2 yields a weight $w$ with $\mathbb{E}[w] = \frac{1}{\mathcal{S}.q(x)}$. 


% \textbf{Theorem 1.} \textit{If $p_0$ is a model and $p_1, \dots, p_n$ encodes an inference strategy, then Alg. 1 is properly weighted for $p_0(x_1 \mid x_0)$. Alg. 2 generates $x_0 \sim p_0$ and a weight $\frac{1}{\check{Z}}$ such that $\mathbb{E}[\frac{1}{\check{Z}} \mid x_0] = \frac{1}{p_0(x_0)}$.}


% \textbf{Proof.} The proof is by induction on the length of the inference strategy. 

% First consider $n = 0$. In this case Algorithm 2 terminates on its first step, producing $x_0 \sim p_0$ and returning $\frac{1}{p_0(x_0)}$ exactly. For Algorithm 1, as there is no latent variable $x_1$, we view $x_1$ as deterministically equal to a null value $\emptyset$. In this case, Algorithm 1 returns $(x_1 = \emptyset, w = p_0(x_0))$ on its first step, and $\mathbb{E}[w f(x_1)] = p_0(x_0) \cdot f(\emptyset)$ as required.

% % Algorithm 1 then reduces to ordinary importance sampling, with weight $\frac{p(x, y)}{q_0(x; y)}$. The usual argument shows that $p(y)$ is the expected weight:
% % $$\mathbb{E}\left[\frac{p(x,y)}{q_0(x; y)}\right] = \int q_0(x; y) \frac{p(x,y)}{q_0(x; y)} dx = \int p(x, y) dx = p(y).$$

% Now consider the inductive step. Assume the theorem proven for inference strategies of length $n < k$ and consider an inference strategy of length $k$. 

% We first consider Algorithm 2. The variable $x_0$ is generated from $p_0$ in Step 2, so we have directly that $x_0 \sim p_0$. By induction, the call to Algorithm 1 generates a weight $w'$ whose expectation (given $x_0$ and $x_1$) is $p_1(x_1; x_0)$. Then the conditional expectation of $w = \frac{w'}{p_0(x_1, x_0)}$, given $x_0$, is:
% $$\mathbb{E}_{x_1, \dots, x_k}[w \mid x_0] = \mathbb{E}_{x_1}\left[\frac{\mathbb{E}_{x_2, \dots, x_k}\left[w' \mid x_1, x_0\right]}{p_0(x_1, x_0)} \mathrel{\Big|} x_0 \right] = \mathbb{E}_{x_1}\left[\frac{p_1(x_1; x_0)}{p_0(x_1, x_0)} \mathrel{\Big|} x_0 \right].$$
% This last expression can be viewed as an ordinary importance sampling estimator, with proposal distribution $p_0(x_1 \mid x_0)$ and target $\frac{p_1(x_1; x_0)}{p_0(x_0)}$:
% $$\mathbb{E}_{x_1}\left[\frac{p_1(x_1; x_0)}{p_0(x_1, x_0)} \mathrel{\Big|} x_0 \right] = \int p_0(x_1 \mid x_0) \cdot \frac{p_1(x_1; x_0)}{p_0(x_0)p_0(x_1\mid x_0)} dx_1 = \int \frac{p_1(x_1; x_0)}{p_0(x_0)} dx_1 = \frac{1}{p_0(x_0)}.$$
% The last equations use the absolute continuity assumption: the integral is only over the support of $p_0(x_1 \mid x_0)$, and so for $\int p_1(x_1; x_0) dx_1$ to equal 1, we must have that $p_1(x_1; x_0) \ll p_0(x_1 \mid x_0)$.

% We now turn to Algorithm 1. We have, for any measurable test function $f$:
% \begin{align*}
%     \mathbb{E}\left[w \cdot p_0(x_1, x_0) \cdot f(x_1) \right] &= \mathbb{E}_{x_1 \sim p_1(x_1; x_0)} \left[\mathbb{E}\left[w \mathrel{\Big|} x_1\right] \cdot p_0(x_1, x_0) \cdot f(x_1) \right]\\ 
%     &=
%     \mathbb{E}_{x_1 \sim p_1(x_1; x_0)}\left[\frac{p_0(x_1, x_0)}{p_1(x_1; x_0)} \cdot f(x_1)\right] \\ 
%     &= p_0(x_0) \cdot \mathbb{E}_{x_1 \sim p_0(x_1 \mid x_0)}[f(x_1)],
% \end{align*}
% where the second equation holds due to the proof for Algorithm 2 and the third is the ordinary importance sampling argument.
% \\

\subsection{Proof of Theorem 2}


\begin{lemma}
    For an inference strategy $\mathcal{S}$ targeting $p(x \mid y)$, if $\mathcal{S}.q(x) = \int \mathcal{S}.q(r, x)\text{d}r$ has an intractable marginal density, then:
    \[
    \LL(p, y, \mathcal{S}) = \E_{x \sim \mathcal{S}.q} [\log p(x, y) - \UU(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))]
    \]
    and
    \[
    \UU(p, y, \mathcal{S}) = \E_{x \sim p( \cdot | y) } [ \log p(x, y) - \LL(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x)) ]
    \]
\end{lemma}

\textbf{Proof.} For the first conclusion,

\begin{align}
\LL(p, y, \mathcal{S})
    &= \E[\log \hat{Z}(p(\cdot, y), \mathcal{S})]\\
    &= \E\left[\log \frac{p(x, y)}{\check{Z}(\mathcal{S}.q(\cdot, x), \mathcal{S}.\mathcal{M}(x))}\right]\\
    &= \E_{x \sim \mathcal{S}.q}[\E[\log p(x, y) - \log \check{Z}(\mathcal{S}.q(\cdot, x), \mathcal{S}.\mathcal{M}(x)) \mid x]]\\
    &= \E_{x \sim \mathcal{S}.q}[\log p(x, y) - \mathbb{E}[\log \check{Z}(\mathcal{S}.q(\cdot, x), \mathcal{S}.\mathcal{M}(x)) \mid x]]\\
    &= \E_{x \sim \mathcal{S}.q} [\log p(x, y) - \UU(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x)) ]
\end{align}
The same approach, but with $\E [ \log \check{Z} ]$, can be used to prove the other conclusion.

\noindent\textbf{Theorem 2.} \textit{
Given a model $p_\theta(x, y)$ and an inference strategy $\mathcal{S}_\theta$ targeting $p_\theta(x \mid y)$,
%Let $p_0, \dots, p_n$ be a
%valid model strategy and a valid inference strategy.
Alg. 3 yields unbiased estimates of $\LL(p, y, \mathcal{S})$ and of $\grad \LL(p, y, \mathcal{S})$.
Furthermore, when $(x, y) \sim p_\theta$, Alg. 4 yields
(i) $\hat{U}$ such that
$\E [ \hat{U} \mid y ] = \UU(p, y, \mathcal{S})$,
(ii) $\widehat{\grad}$ such that
$\E [ \widehat{\grad} ] = \grad \mathbb{E}_{y \sim p_\theta}[\UU(p, y, \mathcal{S})]$,
and (iii) a value $\mathbf{g}$ such that for any function $R$ that does not depend on $\theta$,
$\E [ \mathbf{g} \cdot R(y) ] = \grad \E_{y \sim p_\theta} [ R(y) ]$  if
$\grad \E_{y \sim p_\theta} [ R(y) ]$ is defined.}

\textbf{Proof.} The proof is by induction on the level of nesting present in the inference strategy.

First consider inference strategies $\mathcal{S}$ with tractable proposals $\mathcal{S}.q(x)$. In this case $\texttt{ELBO}\nabla$ generates $x \sim \mathcal{S}.q$ and returns $\hat{L} = \log p(x, y) - \log \mathcal{S}.q(x)$ and $\widehat{\grad} = \grad (\log p(x, y) - \log \mathcal{S}.q(x)) + (\grad \log \mathcal{S}.q(x))(\log p(x, y) - \log \mathcal{S}.q(x))$. Clearly, $\E_{x \sim \mathcal{S}.q}[\hat{L}] = \E[\log \hat{Z}(p(\cdot, y), \mathcal{S})] = \LL(p, y, \mathcal{S})$. And by the log-derivative trick, $\E_{x \sim \mathcal{S}.q}[\widehat{\grad}] = \grad \E_{x \sim \mathcal{S}.q}[\log p(x, y) - \log \mathcal{S}.q(x)] = \grad \LL(p, y, \mathcal{S})$. When we apply $\texttt{EUBO}\nabla$ to $\mathcal{S}$ with $(x, y) \sim p$, it returns (1) $\hat{U} = \log p(x, y) - \log \mathcal{S}.q(x)$ (for which $\E[\hat{U} \mid y] = \UU(p, y, \mathcal{S})$), (2) $\widehat{\grad} = \grad (\log p(x, y) - \log \mathcal{S}.q(x)) + (\grad \log p(x, y)) (\log p(x, y) - \log \mathcal{S}.q(x))$ (for which, by the log-derivative trick, $\mathbb{E}[\widehat{\grad}] = \grad \E_{y \sim p}[\UU(p, y, \mathcal{S})]$), and (3) $\mathbf{g} = \grad \log p(x, y)$. This last return value satisfies the spec for $\mathbf{g}$ because if $R$ does not depend on $\theta$, then $\mathbb{E}_{(x, y) \sim p}[R(y) \cdot \grad\log p(x, y)] = \int\int p(x, y) \cdot
\frac{\grad p(x, y)}{p(x, y)} \cdot R(y) \text{d}x \text{d}y = \grad \int \int p(x, y) R(y) \text{d}x \text{d}y = \grad \E [R(y)]$, as required.

Now consider the inductive step. Assume the theorem holds for the inference strategy $\mathcal{S}.\mathcal{M}(x)$ and joint distribution $\mathcal{S}.q(r, x)$. 

We first consider $\texttt{ELBO}\nabla$. It generates $(r, x) \sim \mathcal{S}.q$ before calling $\texttt{EUBO}\nabla$, which by induction returns $(\hat{U}, \widehat{\grad}, \mathbf{g})$ such that:

\begin{enumerate}
    \item $\mathbb{E}[\hat{U} \mid x] = \UU(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))$
    \item $\E[\widehat{\grad}] = \grad\E_{x \sim \mathcal{S}.q}[\UU(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))]$
    \item $\E[\mathbf{g} \cdot R(x)] = \grad \E_{x \sim \mathcal{S}.q}[R(x)]$ for all valid $R$.
\end{enumerate}


$\texttt{ELBO}\nabla$ computes its first return value, $\hat{L}$, as $\log p(x, y) - \hat{U}$, so
\begin{align*}
    \E[\hat{L}] &= \E[\log p(x, y) - \hat{U}]\\ 
    &= \E_{x \sim \mathcal{S}.q}[\E[\log p(x, y) - \hat{U} \mid x]]\\
    &= \E_{x \sim \mathcal{S}.q}[\log p(x, y) - \E[\hat{U} \mid x]]\\
    &= \E_{x \sim \mathcal{S}.q}[\log p(x, y)-\UU(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))]\\ 
    &= \LL(p, y, \mathcal{S}),
\end{align*}
where the fourth equality holds by the inductive hypothesis and the final one by Lemma 1. Its second return value is computed as $\widehat{\grad}' = \grad\log p(x, y) + \mathbf{g}\log p(x, y) - \widehat{\grad}$, and so
\begin{align*}
    \E [ \widehat{\grad}' ]
        &= \E \left[ \grad \log p(x, y) + \mathbf{g} \cdot \log p(x, y) - \widehat{\grad} \right]\\
        &= \E \left[ \grad \log p(x, y) \right]
            + \grad \E_{x \sim \mathcal{S}.q} [ \log \boxed{p}(x, y) ]
            - \grad \E_{x \sim \mathcal{S}.q} [ \mathcal{U}(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x)) ]\\
        &= \grad \E \left[ \log p(x, y) \right]
            - \grad \E_{x \sim \mathcal{S}.q} [ \mathcal{U}(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x)) ]\\
        &= \grad \E_{x \sim \mathcal{S}.q} [ \log p(x, y) - \UU(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x)) ]\\
        &= \grad \LL(p, y, \mathcal{S}),
\end{align*}
where $\boxed{p}(x, y)$ denotes the distribution $p(x, y)$ but without a dependence on $\theta$, for the purposes of differentiation with respect to $\theta$. The second equality holds by the inductive hypothesis about $\mathbf{g}$ (with $R(x) = \log \boxed{p}(x, y)$) and about $\widehat{\grad}$, and the third uses the log-derivative trick. The final equation is due to Lemma 1.

We now turn to $\texttt{EUBO}\nabla$. 
By induction, the call to $\texttt{ELBO}\nabla$ satisfies the theorem, and so: \begin{enumerate}
    \item $\E [ \hat{L} ] = \LL(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))$
    \item $\E [ \widehat{\grad} ] = \grad\LL(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))$
\end{enumerate}

We treat each of the return values, $(\hat{U}, \widehat{\grad}, \mathbf{g})$, in sequence. We view them as random variables, accounting for stochasticity in the algorithm as well as the inputs $(x, y)$, which are assumed in the theorem's statement to be jointly distributed according to $p$.

First, $\hat{U}$ is computed as $\log p(x, y) - \hat{L}$, and so 
\begin{align*}
\E [ \hat{U} | y ] 
    &= \E_{x \sim p(\cdot | y)} [ \E [ \log p(x, y) - \hat{L} | x, y ] ]\\
    &= \E_{x \sim p(\cdot | y) } [ \log p(x, y) - \LL(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x)) ]\\
    &= \UU(p, y, \mathcal{S}).
\end{align*}
Next, $\E[ \widehat{\grad}']$:
\begin{align*}
    \E[\widehat{\grad}']
    &= \E_{x, y \sim p} \left[
            \E \left[
                \grad \log p(x, y)
                + (\grad \log p(x, y)) \cdot \hat{U}
                - \widehat{\grad} | x, y
            \right]
        \right]\\
    &= \E_{x, y \sim p} \left[
            \grad \log p(x, y) +
            (\grad \log p(x, y))
                \cdot \E \left[ \hat{U} | x, y \right]
            - \E \left[\widehat{\grad} | x, y \right]
        \right]\\
    &= \E_{x, y \sim p} \left[
        \grad \log p(x, y) +
        (\grad \log p(x, y))
            \cdot \E \left[ \log p(x, y) - \hat{L} \mid x, y \right]
        - \grad \LL(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))
    \right]\\
    &= \E_{x, y \sim p} \left[
        \grad \log p(x, y) 
        - \grad \LL(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))
        + (\grad \log p(x, y))
            \cdot (\log p(x, y) - \LL(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x)))
    \right]\\
    &= \grad \E_{x, y \sim p} \left[
            \log p(x, y) - \LL(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))
        \right]\\        
    &= \grad \E_{y \sim p} [ \E_{x \sim p(\cdot | y) } [ \log p(x, y) - \LL(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x)) ] ]\\
    &= \grad \E_{y \sim p} [ \UU(p, y, \mathcal{S}) ].    
\end{align*}
Finally, we consider $\E_{y \sim p} [ \E [ \mathbf{g} \cdot R(y) | y ] ]$ (and recall that $R(y)$ is not to be treated as a function of $\theta$):
\begin{align*}
\E_{y \sim p} [ \E [ \mathbf{g} \cdot R(y) | y ] ]
    &= \E_{y \sim p} \left[ \E [ (\grad \log p(x, y)) \cdot R(y) | y ] \right]\\
    &= \E_{x, y \sim p} \left[ (\grad \log p(x, y)) \cdot R(y) \right]\\
    &= \grad \E_{x, y \sim p} \left[ R(y) \right]\\
    &= \grad \E_{y \sim p} [ R(y) ].
\end{align*}


\subsection{Proof of Theorem 3}

\noindent\textbf{Theorem 3.} {\it Consider an unnormalized target distribution $\tilde{\pi}(x) = Z\pi(x)$ and an inference strategy $\mathcal{S}$ targeting $\pi(x)$. Then the relative variances of the estimators $\hat{Z}(\tilde{\pi}, \mathcal{S})$ and $\check{Z}(\tilde{\pi}, \mathcal{S})$ are given by the following recursive equations:}
\begin{align*}
\text{Var}_{\hat{Z}}&(\pi, \mathcal{S}) = \chi^2(\pi || \mathcal{S}.q) \, +\\
 & \mathbb{E}_{x \sim \mathcal{S}.q}\left[\left(\frac{\pi(x)^2}{\mathcal{S}.q(x)^2}\right) \cdot \text{Var}_{\check{Z}}(\mathcal{S}.q(\cdot \mid x), \mathcal{S}.\mathcal{M}(x))\right]\\
\text{Var}_{\check{Z}}&(\pi, \mathcal{S}) = \chi^2(\mathcal{S}.q || \pi) + \\
& \mathbb{E}_{x \sim \pi}\left[\left(\frac{\mathcal{S}.q(x)^2}{\pi(x)^2}\right) \cdot \text{Var}_{\hat{Z}}(\mathcal{S}.q(\cdot \mid x), \mathcal{S}.\mathcal{M}(x))\right]
%
%\text{Var}\left(\frac{\hat{Z}}{Z}\right) = \sum_{i=0}^{n-1} \mathbb{E}%_{x_0, \dots, x_{i-1}}
%\Big[%
%\frac{\hat{Z}_i^2}{Z^2} \cdot \chi^2(&p_{2\lceil \frac{i}{2}\rceil}(x_{i+1} \mid x_{i}, \dots, x_0) \,\, ||\\
%&p_{2\lfloor\frac{i}{2}\rfloor + 1}(x_{i+1} \mid x_i, \dots, x_0))
%w_i^2 \cdot d_i%
%\Big],
\end{align*}
{\it When $\mathcal{S}.q$ is tractable, the second term of each sum is 0.}

\textbf{Proof.} The proof is by induction on the level of nesting present in the inference strategy $\mathcal{S}$. 

First suppose $\mathcal{S}.q$ has a tractable marginal density. Then:

\begin{itemize}
    \item $\hat{Z}(\pi, \mathcal{S})$ is the normalized importance weight 
    $\frac{\pi(x)}{\mathcal{S}.q(x)}$, with $x \sim \mathcal{S}.q$. So the relative variance is:
    $$\text{Var}_{\hat{Z}}({\pi}, \mathcal{S}) = \text{Var}\left(\hat{Z}({\pi}, \mathcal{S})\right) = \mathbb{E}_{x \sim \mathcal{S}.q}\left[\frac{\pi(x)^2}{\mathcal{S}.q(x)^2}\right] - \mathbb{E}_{x \sim \mathcal{S}.q}\left[\frac{\pi(x)}{\mathcal{S}.q(x)}\right]^2 = \mathbb{E}_{x \sim \mathcal{S}.q}\left[\frac{\pi(x)^2}{\mathcal{S}.q(x)^2} - 1\right] = \chi^2(\pi || \mathcal{S}.q),$$
    where the third equality holds because $\pi$ is a normalized density and $\pi$ is absolutely continuous with respect to $\mathcal{S}.q$.

    \item $\check{Z}({\pi}, \mathcal{S})$ is the weight $\frac{\pi(x)}{\mathcal{S}.q(x)}$, with $x \sim \pi$. Then the relative variance $$\text{Var}_{\check{Z}}({\pi}, \mathcal{S}) = \text{Var}\left(\check{Z}(\pi, \mathcal{S})^{-1}\right) = \mathbb{E}_{x \sim \pi}\left[\frac{\mathcal{S}.q(x)^2}{\pi(x)^2}\right] - \mathbb{E}_{x \sim \pi}\left[\frac{\mathcal{S}.q(x)}{\pi(x)}\right]^2 = \mathbb{E}_{x \sim \pi}\left[\frac{\mathcal{S}.q(x)^2}{\pi(x)^2} - 1\right] = \chi^2(\mathcal{S}.q || \pi),$$ where the third equality holds because $\mathcal{S}.q$ is a normalized density and is absolutely continuous with respect to $\pi$.
\end{itemize}
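Now consider the inductive step. Assume that for all $x$, the theorem holds for the strategy $\mathcal{S}.\mathcal{M}(x)$ targeting $\mathcal{S}.q(\cdot \mid x)$; that is, the recursive equations correctly characterize $\text{Var}_{\hat{Z}}(\mathcal{S}.q(\cdot \mid x), \mathcal{S}.\mathcal{M}(x))$ and $\text{Var}_{\check{Z}}(\mathcal{S}.q(\cdot \mid x), \mathcal{S}.\mathcal{M}(x))$, for all $x$. Then: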
\begin{itemize}
    \item The $\texttt{IMPORTANCE}(\pi, \mathcal{S})$ algorithm generates $x \sim \mathcal{S}.q$. It then calls \texttt{HME} (with $r \sim \mathcal{S}.q(\cdot \mid x)$) to obtain $w = \check{Z}(\mathcal{S}.q(\cdot, x), \mathcal{S}.\mathcal{M}(x))^{-1}$, and returns $\hat{Z} = w\pi(x)$. The variance of $\hat{Z}$ is then:
    \begin{align*}
    \text{Var}_{\hat{Z}}(\pi, \mathcal{S}) &= \text{Var}\left(\frac{\pi(x)}{\check{Z}(\mathcal{S}.q(\cdot, x), \mathcal{S}.\mathcal{M}(x))}\right)\\
    &= \mathbb{E}\left[\left(\frac{\pi(x)}{\check{Z}(\mathcal{S}.q(\cdot, x), \mathcal{S}.\mathcal{M}(x))}\right)^2 - 1\right]
    && (\mathbb{E}[\hat{Z}(\pi, \mathcal{S})]^2 = Z^2 = 1)\\
    &= \mathbb{E}\left[\left(\frac{\pi(x)}{\mathcal{S}.q(x)} \cdot \frac{\mathcal{S}.q(x)}{\check{Z}(\mathcal{S}.q(\cdot, x), \mathcal{S}.\mathcal{M}(x))}\right)^2 - 1\right]
    && \text{(divide and multiply by } \mathcal{S}.q(x))\\
    &= \mathbb{E}\left[\left(\frac{\pi(x)}{\mathcal{S}.q(x)} \cdot \frac{1}{\check{Z}(\mathcal{S}.q(\cdot \mid x), \mathcal{S}.\mathcal{M}(x))}\right)^2 - 1\right]
    &&(\mathcal{S}.q(x) \text{ is the normalizing constant of } \mathcal{S}.q(\cdot, x))\\
    &= \mathbb{E}\left[\left(\frac{\pi(x)}{\mathcal{S}.q(x)}\right)^2 \left(\mathbb{E}\left[{\check{Z}(\mathcal{S}.q(\cdot \mid x), \mathcal{S}.\mathcal{M}(x))}^{-2} \bigl\vert x\right]\right) - 1\right]\\
    &= \mathbb{E}\left[\left(\frac{\pi(x)}{\mathcal{S}.q(x)}\right)^2 \left(\text{Var}_{\check{Z}}(\mathcal{S}.q(\cdot \mid x), \mathcal{S}.\mathcal{M}(x)) + 1\right) - 1\right]
    && \text{(definition of Var}_{\check{Z}}(\cdot, \cdot))\\
    &= \mathbb{E}\left[\left(\frac{\pi(x)}{\mathcal{S}.q(x)}\right)^2 \left(\text{Var}_{\check{Z}}(\mathcal{S}.q(\cdot \mid x), \mathcal{S}.\mathcal{M}(x))\right) + \left(\frac{\pi(x)}{\mathcal{S}.q(x)}\right)^2 - 1\right]
    && \text{(distributing product over sum)}\\
    &= \mathbb{E}\left[\left(\frac{\pi(x)}{\mathcal{S}.q(x)}\right)^2 \left(\text{Var}_{\check{Z}}(\mathcal{S}.q(\cdot \mid x), \mathcal{S}.\mathcal{M}(x))\right)\right] + \chi^2(\pi || \mathcal{S}.q).
    \end{align*}


    \item The argument for $\check{Z}$ is largely the same:
    \begin{align*}
        \text{Var}_{\check{Z}}(\pi, \mathcal{S}) &= \text{Var}\left(\frac{\hat{Z}(\mathcal{S}.q(\cdot, x), \mathcal{S}.\mathcal{M}(x))}{\pi(x)}\right)\\
        &= \mathbb{E}\left[\left(\frac{\hat{Z}(\mathcal{S}.q(\cdot, x), \mathcal{S}.\mathcal{M}(x))}{\pi(x)}\right)^2 - 1\right]
        && (\mathbb{E}[\check{Z}(\pi, \mathcal{S})^{-1}]^2 = Z^{-2} = 1)\\
        &= \mathbb{E}\left[\left(\frac{\mathcal{S}.q(x)}{\pi(x)} \cdot \frac{\hat{Z}(\mathcal{S}.q(\cdot, x), \mathcal{S}.\mathcal{M}(x))}{\mathcal{S}.q(x)}\right)^2 - 1\right]
        && \text{(divide and multiply by } \mathcal{S}.q(x))\\
        &= \mathbb{E}\left[\left(\frac{\mathcal{S}.q(x)}{\pi(x)} \cdot {\hat{Z}(\mathcal{S}.q(\cdot \mid x), \mathcal{S}.\mathcal{M}(x))}\right)^2 - 1\right]
        &&(\mathcal{S}.q(x) \text{ is the normalizing constant of } \mathcal{S}.q(\cdot, x))\\
        &= \mathbb{E}\left[\left(\frac{\mathcal{S}.q(x)}{\pi(x)}\right)^2 \left(\mathbb{E}\left[{\hat{Z}(\mathcal{S}.q(\cdot \mid x), \mathcal{S}.\mathcal{M}(x))}^{2} \bigl\vert x\right]\right) - 1\right]\\
        &= \mathbb{E}\left[\left(\frac{\mathcal{S}.q(x)}{\pi(x)}\right)^2 \left(\text{Var}_{\hat{Z}}(\mathcal{S}.q(\cdot \mid x), \mathcal{S}.\mathcal{M}(x)) + 1\right) - 1\right]
        && \text{(definition of Var}_{\hat{Z}}(\cdot, \cdot))\\
        &= \mathbb{E}\left[\left(\frac{\mathcal{S}.q(x)}{\pi(x)}\right)^2 \left(\text{Var}_{\hat{Z}}(\mathcal{S}.q(\cdot \mid x), \mathcal{S}.\mathcal{M}(x))\right) + \left(\frac{\mathcal{S}.q(x)}{\pi(x)}\right)^2 - 1\right]
        && \text{(distributing product over sum)}\\
        &= \mathbb{E}\left[\left(\frac{\mathcal{S}.q(x)}{\pi(x)}\right)^2 \left(\text{Var}_{\hat{Z}}(\mathcal{S}.q(\cdot \mid x), \mathcal{S}.\mathcal{M}(x))\right)\right] + \chi^2(\mathcal{S}.q || \pi).
    \end{align*}
\end{itemize}



% \noindent\textbf{Theorem 3.} {\it Let $p_0, \dots, p_n$ be an inference strategy. As above, let $Z = p_0(x_0)$ for some fixed observation $x_0$, and let $\hat{Z}$ denote the recursive estimator for $Z$ corresponding to the inference strategy $p_0, \dots, p_n$. Further, let $\hat{Z}_i$ be the estimator corresponding to the inference strategy $p_0, \dots, p_i$ (which will typically be intractable to compute, as it requires a marginal density for $p_i(x_i)$). Then $\frac{\hat{Z}}{Z}$ has variance}
% $$\text{Var}\left(\frac{\hat{Z}}{Z}\right) = \sum_{i=0}^{n-1} \mathbb{E}%_{x_0, \dots, x_{i-1}}
% \left[%
% \frac{\hat{Z}_i^2}{Z^2} \cdot \chi^2(p_{2\lceil \frac{i}{2}\rceil}(x_{i+1} \mid x_{i}, \dots, x_0) \,\, ||\,\, p_{2\lfloor\frac{i}{2}\rfloor + 1}(x_{i+1} \mid x_i, \dots, x_0))
% %w_i^2 \cdot d_i%
% \right],$$
% {\it where the expectation is taken over $x_1, \dots, x_i$.}


% \textbf{Proof.} The proof is by induction on the length of the inference strategy.

% When $n = 0$, $\frac{\hat{Z}}{Z} = 1$ deterministically, and so has variance $\sum_{i=0}^{-1} \left(\dots\right) = 0$, as desired. 

% % When $n = 1$, then 

% % \begin{align*}
% % \text{Var}\left(\frac{\hat{Z}}{Z}\right) &= \mathbb{E}\left[\frac{\hat{Z}^2}{Z^2}\right] - \mathbb{E}\left[\frac{\hat{Z}}{Z}\right]^2 \\
% % &= \mathbb{E}\left[\frac{\hat{Z}^2}{Z^2}\right] - 1 \\
% % &= \mathbb{E}_{x_1 \sim p_1(\cdot; x_0)}\left[\frac{p_0(x_1, x_0)^2}{p_1(x_1; x_0)^2 p_0(x_0)^2}\right] - 1 \\
% % &= \mathbb{E}_{x_1 \sim p_1(\cdot; x_0)}\left[\frac{p_0(x_1 \mid x_0)^2}{p_1(x_1; x_0)^2}\right] - 1 \\
% % &= \chi^2(p_0(x_1 \mid x_0) \, || \, p_1(x_1; x_0)).
% % \end{align*}

% Now assume the theorem proven for $n = k-1$ and consider $n = k$.
% Note that $\hat{Z} = \hat{Z}_k = \hat{Z}_{k-1} \cdot \frac{p_{2\lfloor \frac{k}{2}\rfloor}(x_k \mid x_{0:k-1})}{p_{2\lceil \frac{k}{2}\rceil - 1}(x_k \mid x_{0:k-1})}$. Therefore, we have:

% \begin{align*}
%     \text{Var}\left(\frac{\hat{Z}}{Z}\right) &= \mathbb{E}\left[\frac{\hat{Z}^2}{Z^2}\right] - \mathbb{E}\left[\frac{\hat{Z}}{Z}\right]^2 \\
%     &= \mathbb{E}\left[\frac{\hat{Z}^2}{Z^2} - 1\right] \\
%     &= \mathbb{E}\left[\frac{\hat{Z}_{k-1}^2}{Z^2} \left(\frac{p_{2\lfloor \frac{k}{2}\rfloor}(x_k \mid x_{0:k-1})^2}{p_{2\lceil \frac{k}{2}\rceil - 1}(x_k \mid x_{0:k-1})^2}\right) - 1\right] \\
%     &= \mathbb{E}\left[\frac{\hat{Z}_{k-1}^2}{Z^2} \left(\mathbb{E}_{x_k \sim p_{2\lceil \frac{k}{2}\rceil - 1}}\left[\frac{p_{2\lfloor \frac{k}{2}\rfloor}(x_k \mid x_{0:k-1})^2}{p_{2\lceil \frac{k}{2}\rceil - 1}(x_k \mid x_{0:k-1})^2} \, \Big| x_{0:k-1}\right]\right) - 1\right] \\
%     &= \mathbb{E}\left[\frac{\hat{Z}_{k-1}^2}{Z^2} \left(\chi^2({p_{2\lfloor \frac{k}{2}\rfloor}(x_k \mid x_{0:k-1})} \, ||\, {p_{2\lceil \frac{k}{2}\rceil - 1}(x_k \mid x_{0:k-1})}) + 1\right) - 1\right] \\
%     &= \mathbb{E}\left[\frac{\hat{Z}_{k-1}^2}{Z^2} \left(\chi^2({p_{2\lfloor \frac{k}{2}\rfloor}(x_k \mid x_{0:k-1})} \, ||\, {p_{2\lceil \frac{k}{2}\rceil - 1}(x_k \mid x_{0:k-1})})\right) + \frac{\hat{Z}_{k-1}^2}{Z^2} - 1\right] \\
%     &= \mathbb{E}\left[\frac{\hat{Z}_{k-1}^2}{Z^2} \left(\chi^2({p_{2\lfloor \frac{k}{2}\rfloor}(x_k \mid x_{0:k-1})} \, ||\, {p_{2\lceil \frac{k}{2}\rceil - 1}(x_k \mid x_{0:k-1})})\right)\right] + \text{Var}\left(\frac{\hat{Z}_{k-1}}{Z}\right) \\
%     &= \sum_{i=0}^{k-1} \mathbb{E}%_{x_0, \dots, x_{i-1}}
% \left[%
% \frac{\hat{Z}_i^2}{Z^2} \cdot \chi^2(p_{2\lceil \frac{i}{2}\rceil}(x_{i+1} \mid x_{i}, \dots, x_0) \,\, ||\,\, p_{2\lfloor\frac{i}{2}\rfloor + 1}(x_{i+1} \mid x_i, \dots, x_0))
% %w_i^2 \cdot d_i%
% \right],
% \end{align*}

% where the last step uses the inductive hypothesis to rewrite $\text{Var}(\frac{\hat{Z}_{k-1}}{Z})$ as a sum from $i = 0$ to $k-2$.



\subsection{Proof of Theorem 4.}

\noindent\textbf{Theorem 4.}
{\it Consider a joint distribution $p(x, y)$ and an inference strategy $\mathcal{S}$ targeting $p(x \mid y)$. Then the following equations give the bias of ${\hat{\LL}}$ and ${\hat{\UU}}$ as estimators of $\log p(y)$:}
\begin{align*}
\text{Bias}_\mathcal{L}(p, y, \mathcal{S}) =&\, -\text{KL}(\mathcal{S}.q || p(\cdot \mid y)) \\ 
&-\mathbb{E}_{x \sim \mathcal{S}.q}[\text{Bias}_\mathcal{U}(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))]\\
\text{Bias}_\mathcal{U}(p, y, \mathcal{S}) =&\, \text{KL}(p(\cdot \mid y) || \mathcal{S}.q)\\ 
&\,\,\,-\mathbb{E}_{x \sim p(\cdot \mid y)}[\text{Bias}_\mathcal{L}(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))]
\end{align*}
{\it where the second term in each equation is 0 when $\mathcal{S}.q$ has a tractable marginal density.}

\textbf{Proof.}
% Unfolding the definition of $\text{Bias}_\mathcal{L}$ and $\text{Bias}_\mathcal{U}$, and applying Theorem 2, we see that the theorem holds if and only if:

% $$\LL(p, y, \mathcal{S}) = \log p(y) - KL(\mathcal{S}.q || p(\cdot \mid y)) - \mathbb{E}_{x \sim \mathcal{S}.q}[\text{Bias}_\mathcal{U}(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))]$$
% and $$\UU(p, y, \mathcal{S}) = \log p(y) + KL(p(\cdot \mid y) || \mathcal{S}.q) - \mathbb{E}_{x \sim \mathcal{S}.q}[\text{Bias}_\mathcal{L}(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))].$$

In the base case, where $\mathcal{S}.q$ has a tractable marginal density, the theorem states that $\log p(y) - \LL(p, y, \mathcal{S}) = KL(\mathcal{S}.q || p(\cdot \mid y))$, the familiar relationship between the standard ELBO and the KL divergence. The $\UU$ case is similar:
\begin{align*}
    \text{Bias}_\UU(p, y, \mathcal{S}) 
    &= \E_{x \sim p(\cdot \mid y)}[\log p(x, y) - \log \mathcal{S}.q(x)] - \log p(y)\\
    &= \log p(y) + \E_{x \sim p(\cdot \mid y)}[\log p(x \mid y) - \log \mathcal{S}.q(x)] - \log p(y)\\ 
    &= KL(p(\cdot \mid y) || \mathcal{S}.q).
\end{align*}

Now consider the inductive step, in which $\mathcal{S}.q$ does not have a tractable marginal density. We assume the theorem holds for $\mathcal{S}.q$ and $\mathcal{S}.\mathcal{M}(x)$. Then: 
\begin{align*}
    \text{Bias}_\LL(p, y, \mathcal{S}) 
    &= \LL(p, y, \mathcal{S}) - \log p(y)\\
    &= \E_{x \sim \mathcal{S}.q}[\log p(x, y) - \UU(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))] - \log p(y)\\
    &= \log p(y) + \E_{x \sim \mathcal{S}.q}[\log p(x \mid y) - \UU(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))] - \log p(y)\\
    &= \E_{x \sim \mathcal{S}.q}[\log p(x \mid y) - \UU(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))]\\
    &= \E_{x \sim \mathcal{S}.q}[\log p(x \mid y) - \log \mathcal{S}.q(x) + \log \mathcal{S}.q(x) - \UU(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))]\\
    &= -KL(\mathcal{S}.q || p(\cdot \mid y)) + \E_{x \sim \mathcal{S}.q}[\log \mathcal{S}.q(x) - \UU(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))]\\
    &= -KL(\mathcal{S}.q || p(\cdot \mid y)) - \E_{x \sim \mathcal{S}.q}[\text{Bias}_\UU(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))].
\end{align*}

Nearly the same proof applies for $\UU$, flipping the necessary signs:
\begin{align*}
    \text{Bias}_\UU(p, y, \mathcal{S}) 
    &= \UU(p, y, \mathcal{S}) - \log p(y)\\
    &= \E_{x \sim p(\cdot \mid y)}[\log p(x, y) - \LL(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))] - \log p(y)\\
    &= \log p(y) + \E_{x \sim p(\cdot \mid y)}[\log p(x \mid y) - \LL(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))] - \log p(y)\\
    &= \E_{x \sim p(\cdot \mid y)}[\log p(x \mid y) - \LL(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))]\\
    &= \E_{x \sim p(\cdot \mid y)}[\log p(x \mid y) - \log \mathcal{S}.q(x) + \log \mathcal{S}.q(x) - \LL(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))]\\
    &= KL(p(\cdot \mid y) || \mathcal{S}.q) + \E_{x \sim p(\cdot \mid y)}[\log \mathcal{S}.q(x) - \LL(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))]\\
    &= KL(p(\cdot \mid y) || \mathcal{S}.q) - \E_{x \sim p(\cdot \mid y)}[\text{Bias}_\LL(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))].
\end{align*}

\subsection{Stationarity of MCMC algorithm}
\label{sec:ravi-mcmc}
In Section~\ref{sec:inference-algs}, we mention that RAVI can be used to run Metropolis-Hastings kernels 
with proposals that have intractable densities. 
Here, we present and justify the algorithm.

Let $\tilde{\pi}(x) = \int \tilde{\pi}(r, x) \text{d}r = Z \int \pi(r, x) \text{d}r$ be a possibly unnormalized target density, and let $q(x'; x) = \int q(s, x'; x) \text{d}s$ be a proposal kernel mapping previous state $x$ to new state $x'$. We note that (1) both $\tilde{\pi}$ and $q$ have intractable marginal densities, and (2) the target marginal $\tilde{\pi}(x)$ itself may be unnormalized. As is typical in pseudomarginal MCMC, even this unnormalized target density cannot be evaluated pointwise, due to the additional nuisance variables $r$.

Now suppose we have a family of inference strategies $\mathcal{S}(x)$ targeting $\pi(r \mid x)$, and a family of inference strategies $\mathcal{M}(x, x')$ targeting $q(s \mid x'; x)$. Let $x$ be a starting position for our Markov chain. We can run Algorithm~\hyperref[alg:alg1]{1} on $\mathcal{S}(x)$, targeting $\pi(r \mid x)$, to obtain an initial estimate $\hat{Z}_x$ of the unnormalized marginal density $\tilde{\pi}(x)$. Then Algorithm~\hyperref[alg:mh]{5} defines a stationary MCMC kernel for the target distribution $\pi(x)$, starting at input point $x$:

\begin{wrapfigure}{L}{0.5\textwidth}
    \begin{algorithm}[H]
    \SetAlgoLined\DontPrintSemicolon
    \label{alg:mh}
    \footnotesize{
    \textbf{Algorithm 5:} RAVI Metropolis-Hastings\;
    \KwIn{model $\tilde{\pi}(x) = Z \int \pi(r, x) \text{d}r$}
    \KwIn{proposal $q(x'; x) = \int q(s, x'; x) \text{d}s$}
    \KwIn{family $\mathcal{S}(x)$ of inference strategies targeting $\pi(r \mid x)$}
    \KwIn{family $\mathcal{M}(x, x')$ of inference strategies targeting $q(s \mid x'; x)$}
    \KwIn{initial position $x$ and estimate $\hat{Z}_{x}$ of $\tilde{\pi}(x)$}
    \KwOut{next position $x'$ and estimate $\hat{Z}_{x'}$ of $\tilde{\pi}(x')$}
    \nl $(s, x') \sim q(s, x'; x)$\;
    \nl $w_{x'} \gets {\texttt{HME}}(q(\cdot, x'; x), s, \mathcal{M}(x, x'))$\;
    \nl $(\_, w_x) \gets \texttt{IMPORTANCE}(q(\cdot \mid x; x'), \mathcal{M}(x', x))$\;
    \nl $(\_, \hat{Z}_{x'}) \gets \texttt{IMPORTANCE}(\pi(\cdot \mid x'), \mathcal{S}(x'))$\;
    \nl $u \sim \text{Uniform}(0, 1)$\;
    \nl \If{$u < \min(1, \frac{\hat{Z}_{x'}}{\hat{Z}_x}w_{x'}w_x)$}{
        \nl \Return{$(x', \hat{Z}_{x'})$}\;
    }
    \nl \Else{
        \nl \Return{$(x, \hat{Z}_x)$}\;
    }
    }
    \end{algorithm}
\end{wrapfigure}

% \begin{enumerate}
%     \item Run Algorithm 2 on the distribution $q(s, x'; x)$ and the family of strategies $\mathcal{M}(x, \cdot)$, to generate $(x', w_{x'})$.
    
%     \item Run Algorithm 1 on the strategy $\mathcal{M}(x', x)$, targeting $q(s \mid x; x')$, to generate $(\_, w_x)$.
    
%     \item Run Algorithm 1 on the strategy $\mathcal{S}$ targeting $\pi(r \mid x')$, to generate $(\_, \hat{Z}_{x'})$.
    
%     \item With probability $\min(1, \frac{\hat{Z}_{x'}}{\hat{Z}_x} w_{x'} w_x)$, accept $x'$ as the next point in the chain, with estimated target density $\hat{Z}_{x'}$. Else, reject $x'$ and return $x$.
% \end{enumerate}
When $q$'s marginal density is known exactly, the above algorithm recovers variants of Particle-Marginal MH~\citep{andrieu2010particle}, except that instead of using SMC to marginalize $r$, any RAVI algorithm can be applied. When $q$'s marginal density is unavailable, however, the algorithm instead becomes a pseudo-marginal \textit{ratio} algorithm~\citep{andrieu2018utility}, because not just $\tilde{\pi}$ but also $q$ is estimated unbiasedly. In general, it is \textit{not} valid to use arbitrary unbiased estimates of $\tilde{\pi}$ \textit{and} $q$, or even of $\alpha = \frac{\tilde{\pi}(x')q(x; x')}{\tilde{\pi}(x)q(x'; x)}$, within an MH algorithm. However, the added structure of the RAVI strategy ensures that the above procedure is sound.

To see why our MCMC kernel is stationary, we consider an extended target distribution. First, some notation. For an inference strategy $\mathcal{S}$ targeting $\pi(x)$, write $v_\mathcal{S}$ for the complete set of auxiliary variables in the strategy: if $\mathcal{S}.q$ has a tractable marginal density, then $v_\mathcal{S} = \emptyset$, and otherwise, if $\mathcal{S}.q(x) = \int \mathcal{S}.q(r, x) \text{d}r$, then $v_\mathcal{S}$ is defined recursively as $\{r\} \cup v_{\mathcal{S}.\mathcal{M}}$. Calling $\texttt{IMPORTANCE}$ on $\mathcal{S}$ yields a joint distribution over these auxiliary variables and $x$, which we denote as $p^{\mathcal{S}}_{\texttt{IMP}}(v_\mathcal{S}, x)$. Calling $\texttt{HME}$ on $\mathcal{S}$ and a particular sample $x$ yields a distribution over just $v_\mathcal{S}$, which we denote $p^\mathcal{S}_{\texttt{HME}}(v_\mathcal{S}; x)$.
When $x \sim \pi$ and $v_\mathcal{S} \sim p^\mathcal{S}_\texttt{HME}(v_\mathcal{S}; x)$, the ratio $\frac{p^\mathcal{S}_\texttt{IMP}(v_\mathcal{S}, x)}{\tilde{\pi}(x)p^\mathcal{S}_\texttt{HME}(v_\mathcal{S}; x)}$ is the weight $\check{Z}(\tilde{\pi}, \mathcal{S})^{-1}$ returned by \texttt{HME}, and similarly, when $(v_\mathcal{S}, x) \sim p^\mathcal{S}_\texttt{IMP}$, the ratio $\frac{\tilde{\pi}(x)p^\mathcal{S}_\texttt{HME}(v_\mathcal{S}; x)}{p^\mathcal{S}_\texttt{IMP}(v_\mathcal{S}, x)}$ is the weight $\hat{Z}(\tilde{\pi}, \mathcal{S})$ returned by $\texttt{IMPORTANCE}$.

Using this notation, we can extend the target distribution $\tilde{\pi}(x)$ to one over $(r, x, s, x', s', v_{\mathcal{S}(x)}, v_{\mathcal{M}(x, x')}, v_{\mathcal{M}(x', x)})$ that admits $\tilde{\pi}(x)$ as a marginal:
$$\tilde{\pi}(r, x, s, x', s', v_{\mathcal{S}(x)}, v_{\mathcal{M}(x, x')}, v_{\mathcal{M}(x', x)}) = \tilde{\pi}(r, x) \cdot p^{\mathcal{S}(x)}_\texttt{HME}(v_{\mathcal{S}(x)}; r) \cdot q(s, x'; x) \cdot p^{\mathcal{M}(x, x')}_\texttt{HME}(v_{\mathcal{M}(x, x')}; s) \cdot p^{\mathcal{M}(x', x)}_\texttt{IMP}(v_{\mathcal{M}(x', x)}, s')$$


Our algorithm can be understood as sequencing two stationary kernels for this extended target. The first (implemented by lines 1--3) is a blocked Gibbs update on the variables $(s, x', s', v_{\mathcal{M}(x, x')}, v_{\mathcal{M}(x', x)})$, conditioned on everything else. Lines 1--3 sample exactly from the conditional distribution of these variables. The second is a Metropolis-Hastings proposal that simultaneously: (i) swaps $x$ with $x'$ (the `main' proposed update), (ii) swaps $(s, v_{\mathcal{M}(x, x')})$ with $(s', v_{\mathcal{M}(x', x)})$, and (iii) proposes an update to $r$ and to $v_{\mathcal{S}(x)}$ from $p_\texttt{IMP}^{\mathcal{S}(x')}$. The usual Metropolis-Hastings acceptance probability for this kernel, computed on the extended state space, is precisely the formula in line 6.

One consequence of this justification is that the \textit{same} family $\mathcal{S}$ of inference strategies for $\pi$ must be used at each iteration. The family $\mathcal{M}$ can be freely switched out (as can $q$), however, to develop a cycle of kernels that use different proposal distributions.

% \section{Experimental Details}
% \label{sec:experimental-details}

% Code to reproduce all experimental results is included as part of this supplement. This section reports the details of each experiment, and instructions for reproducing our results.

% \subsection{IWAE with $K=2$}

% This section gives details on the experiment used to produce the results for the IWAE experiment.

% Consider a simple generative model $p_0$, and a RAVI inference strategy whose corresponding $\hat{Z}$ is precisely the two-particle importance sampling estimator with proposal $q(x_1) = \frac{1}{2} \mathcal{N}(x_1; 0, 10) + \frac{1}{2}\mathcal{N}(x_1; 0, 0.2)$:
% %In Appendix~\ref{sec:sir-example}, we show how $N$-particle importance sampling can be viewed as a single-particle estimator arising from Alg. 1 applied to a particular RAVI inference strategy. For now, we specialize to a simple generative model $p_0$, and the RAVI inference strategy $p_1, p_2$ corresponding to importance sampling with number of particles $N=2$ and proposal distribution $q(x_1) = \frac{1}{2} \mathcal{N}(x_1; 0, 10) + \frac{1}{2}\mathcal{N}(x_1; 0, 0.2)$:
% \begin{flalign*}
%     p_0(x_1, x_0) &= \mathcal{N}(x_1; 0, 10) \cdot \mathcal{N}(x_0; x_1, 0.1)&\\
%     p_1(x_2, x_1; x_0) &= \prod_{i=1}^2 q(x_2^{(i)}) \cdot \sum_{i=1}^{2} \frac{w_i}{w_1 + w_2} \delta_{x_2^{(i)}}(x_1)\\
% %    p_1(x_2, x_1; x_0) &= \frac{p_0(x_2^{(c)}, x_0) \cdot \delta_{x_2^{(c)}}(x_1) \cdot q(x_2^{(\overline{c})})}{p_0(x_2^{(1)}, x_0)/q(x_2^{(1)}) + p_0(x_2^{(2)}, x_0)/q(x_2^{(2)})} \\
%     p_2(x_3, x_2; x_1, x_0) &= \text{Bern}(x_3; \frac{1}{2})\cdot q(x_2^{(x_3+1)}) \cdot \delta_{x_1}(x_2^{(2 - x_3)})\\
%     p_3(x_3; x_2, x_1, x_0) &= \delta_{\mathbf{1}[x_1 = x_2^{(1)}]}(x_3)
% \end{flalign*}
% \noindent Here, $p_1$ corresponds to the `sampling/importance resampling' process of generating two particles $x_2^{(1)}$ and $x_2^{(2)}$ from $q$, computing their weights $w_i = p_0(x_2^{(i)}, x_0)/q(x_2^{(i)})$, and then based on the weights, selecting one to return as $x_1$.
% One layer down the inference strategy, $p_2$ observes the chosen particle $x_1$, and tries to guess the values of $x_2^{(i)}$. Our $p_2$ above flips a coin $x_3$ to decide which particle ($x_2^{(1)}$ or $x_2^{(2)}$) to set equal to $x_1$, and guesses the value of the other, \textit{unchosen} particle blindly using $q$. This RAVI inference strategy recovers the usual 2-particle IS estimator $\hat{Z} = \frac{1}{2} (w_1 + w_2)$. But $p_2$ could be better: knowing the chosen particle $x_1$ reveals information about the other, unchosen particle, which $p_2$ could exploit. For example, if a particularly \textit{unlikely} value for $x_1$ was chosen by $p_1$, the other particle must also have been unlikely; otherwise, its large relative weight would have caused it to be chosen.

% %
% We can learn to exploit this knowledge if we replace the $q$ in $p_2$ with an $x_1$-aware distribution  
% $h_\theta(x; x_1) = \frac{1}{2} \left(f_\theta(x_1) \cdot \mathcal{N}(x; 0, 10) + (1-f_\theta(x_1)) \cdot \mathcal{N}(x; 0, 0.2)\right)$, where $f_\theta$ is a neural network predicting probabilities in $[0, 1]$.

% To learn $\theta$, we used ADAM, with learning rate 1e-3, to maximize $\LL_0$. We performed 500 gradient updates, and used 10 samples to estimate each gradient.

% \subsection{Multiple Importance Sampling}

% This section gives model and inference details for the Multiple Importance Sampling experiments, performed on a commodity laptop using a single CPU core.

% \textbf{Model.} We used the simple multimodal target from~\citet{elvira2019generalized}, 
% $$\pi(x) = \frac{1}{3} \left(\mathcal{N}(x; -3, \sigma) + \mathcal{N}(x; 0, \sigma) + \mathcal{N}(x; 3, \sigma)\right).$$

% We set $\sigma = 0.8$.

% \textbf{Proposals.} Following~\cite{elvira2019generalized}, the proposal distributions were $g_k(x) = \int_{r=0.8}^{1.3} 2\mathcal{N}(x; \mu_k, r) dr$.

% \textbf{Estimators.} 

% We compared two estimators. For each of 1,000,000 trials, we generated 30 particles $(r_i, x_i)$, ten from each random-weight proposal $g_k(r, x) = \mathcal{U}(r; 0.8, 1.3) \cdot \mathcal{N}(x; \mu_k, r)$. We then computed $\hat{Z}_1^*$ and $\hat{Z}_2^*$ for each, as follows:

% $$\hat{Z}_1^* = \frac{1}{30} \sum_{i=1}^{30} \frac{\pi(x_i)}{g_{k_i}(x_i \mid r_i)}$$
% $$\hat{Z}_2^* = \frac{1}{30} \sum_{i=1}^{30} \frac{\pi(x_i)}{\frac{1}{3}\sum_{j=1}^3 g_j(x_i \mid r_i)}.$$

% The first estimator is a standard multiple importance sampling estimator. We derived the second by improving meta-inference in the RAVI representation of the first. It resembles the ``deterministic mixture'' MIS estimator, except that instead of using tractable marginal densities $g_j(x_i)$ to compute the denominator, only the likelihoods $g_j(x_i \mid r_i)$ are used.


% \subsection{Agglomerative Monte Carlo}

% \textbf{Model.} We use a generative model adapted from~\citep{lew2021pclean}. We first define a base measure $H$ over strings, which is a character-level bigram model:

% $$H(s) = h(s_1) \prod_{i=2}^{|s|} h(s_i \mid s_{i-1}).$$

% The unigram frequencies $h(s_1)$ and bigram frequencies $h(s_i \mid s_{i-1})$ are included in the supplement, as \texttt{letter\_probabilities.csv} and \texttt{transition\_probabilities.csv}. 

% We model observed strings $y_i$, for $i=1, \dots, 1000$, as distributed according to a Dirichlet process mixture:

% \begin{align*}
%     G &\sim DP(H, \alpha = 1.0)\\
%     x_i \mid G &\sim G\\
%     y_i \mid x_i &\sim f(\cdot \mid x_i).
% \end{align*}

% Here, the likelihood $f(y_i \mid x_i)$ models typos. We set $f$ to be

% \[
% f(y_i \mid x_i) \propto 
%  \begin{cases} 
%       \mathbf{1}[x_i = y_i] & (x_i, y_i) \not\in \mathcal{L} \times \mathcal{L} \\
%       \frac{\text{NegBin}(\tau(x_i, y_i); \lceil \frac{|s|}{5} \rceil, 0.9)}{(5.09|s|)^{\tau(x_i, y_i)}} & (x_i, y_i) \in \mathcal{L} \times \mathcal{L}
%   \end{cases},
% \]

% where $\tau(x_i, y_i)$ is the Damerau-Levenshtein edit distance between $x_i$ and $y_i$, and $\mathcal{L}$ is the set of all observed strings $\{y \mid \exists i. \, y = y_i\}$. This somewhat strange likelihood can be understood as follows. We assume that the data we have, $\mathcal{L}$, includes at least one example of every `clean' string. Setting $f(y_i \mid x_i) = \mathbf{1}[x_i = y_i]$ for $x_i \not\in \mathcal{L}$ encodes that `if someone had meant to type $x_i \not\in \mathcal{L}$, they would have typed it correctly; therefore, the explanation for $y_i \in \mathcal{L}$ cannot be a string $x_i \not\in \mathcal{L}$.' When $x_i \in \mathcal{L}$, we model a negative-binomially distributed number of typos, where the number of trials depends on the length of the string.


% We perform inference in a collapsed version of the model, with the $x_i$ marginalized out. This collapsed version can be expressed as a Chinese Restaurant Process model,
% \begin{align*}
%     \Pi &\sim CRP(n = 1000, \alpha = 1.0)\\
%     y_I \mid \Pi &\sim F(y_I),
% \end{align*}

% where $\Pi$ is a partition, $I$ ranges over the components of $\Pi$ (each of which is a subset of indices $I \subseteq \{1, \dots, 1000\}$), and $F(y_I) = \sum_{x \in \mathcal{L}} h(x) \prod_{i \in I} f(y_i \mid x)$ is the marginal likelihood of $y_I$ when considered as a sequence of noisy observations of a latent string.

% Given a partition $\Pi$ of a set of indices $I \subseteq \{1, \dots, 1000\}$, we write $\pi_I(\Pi)$ for the posterior probability of the partition, given $y_I$, and $\tilde{\pi}_I(\Pi) = CRP(\Pi; n=|I|, \alpha=1.0) \cdot \prod_{J \in \Pi} F(y_J)$ for the unnormalized posterior.

% \textbf{Locally optimal SMC baseline.} As a baseline, we consider a sequential Monte Carlo algorithm that targets a sequence of posteriors, where the $t^{\text{th}}$ posterior incorporates the first $t$ datapoints. The SMC kernel $K$ maps a partition of the first $t-1$ datapoints to a partition of the first $t$ datapoints, by randomly assigning the newest datapoint to an existing component $I$ with probability proportional to $\frac{|I|}{t + \alpha - 1} \cdot F(y_{I} \cup \{y_t\})$, or to a new component with probability proportional to $\frac{\alpha}{t + \alpha - 1} \cdot F(\{y_t\})$. We perform multinomial resampling every iteration, and a complete Gibbs sweep every 100 iterations.

% \textbf{The Agglomerative SMC algorithm.} 

% Whereas the baseline algorithm is parameterized by a time step $t$, and considers all data points $1 \leq i \leq t$, the Agglomerative SMC algorithm is instead parameterized by an arbitrary index set $I$ of data points to consider. 

% \textbf{To generate a collection $\hat{\pi} = \{(\hat{\Pi}_1, w_1), \dots, (\hat{\Pi}_M, w_M)\}$ of $M$ particles weighted for the posterior $\pi_I$, where $I \subseteq \{1, \dots, N\}$ is a subset of datapoint indices:}

% \begin{enumerate}
%     \item If $I = \{i\}$ is a singleton set, set $\hat{\Pi}_j$ to the unique partition of $I$ for $j=1, \dots, M$, and set each $w_j = F(\{y_i\})$. Return $\hat{\pi} = \{(\hat{\Pi}_1, w_1), \dots, (\hat{\Pi}_M, w_M)\}$.
    
%     \item Else, split $I$ into two halves, $I_1$ and $I_2$. (If $|I|$ is odd, divide almost-evenly, so that $|I_2| = |I_1| + 1$.) Recursively run the Agglomerative SMC algorithm to generate two weighted collections $\hat{\pi}_{I_1}$ and $\hat{\pi}_{I_2}$.
    
%     \item For each $j \in 1, \dots, M$, draw $\Pi_j^1 \sim \hat{\pi}_{I_1}$ and $\Pi_j^2 \sim \hat{\pi}_{I_2}$ according to the particle weights. 
    
%     \item Set $\hat{\Pi}_j$ by stochastically \textit{merging} the two partitions $\Pi_j^1$ and $\Pi_j^2$ of $I_1$ and $I_2$:
    
%     \begin{itemize}
%         \item Set $N = |I_1|$ and for each component $J \subseteq I_2$ in $\Pi_j^2$:
        
%         \begin{itemize}
%             \item With probability proportional to $\frac{\alpha}{\alpha + N} \cdot \prod_{l=2}^{|J|} \frac{l - 1}{\alpha + N + l - 1}$, add $J$ as its own component to the partition $\hat{\Pi}_j$.
            
%             \item Alternatively, letting $L$ range over the partition components in $\Pi_j^1$, with probability proportional to $\prod_{l=1}^{|J|} \frac{|L| + l - 1}{\alpha + N + l - 1}$, add $J \cup L$ as a new component to the partition $\hat{\Pi}_j$.
%             Remove the selected component $L$ from $\Pi_j^1$ so it is unavailable for matching in future steps.
            
%             \item Increment $N$ by $|J|$.
            
%         \end{itemize}
        
%         \item Add any components that remain in $\Pi_1^j$ to $\hat{\Pi}_j$.
        
%     \end{itemize}
    
%     \item Letting $\hat{Z}_1$ be the mean weight of $\hat{\pi}_1$ and likewise for $\hat{Z}_2$, compute $w_j = \hat{Z}_1 \hat{Z}_2 \frac{\tilde{\pi}_{I}(\hat{x}_j)}{\tilde{\pi}_{I_1}(x_j^1) \tilde{\pi}_{I_2}(x_j^2) K((x_j^1, x_j^2), d\hat{x}_j)}$, where the kernel $K$ represents the merging process described in Step 4. (Its density can be computed by multiplying the probabilities of the merge decisions made for each $J$.)
    
%     \item With probability $\rho$ (a hyperparameter of the algorithm), perform a Gibbs sweep on each $\hat{\Pi}_j$ for $j = 1, \dots, M$.
    
%     \item Return $\hat{\pi} = \{(\hat{\Pi}_1, w_1), \dots, (\hat{\Pi}_M, w_M)\}$.
    
% \end{enumerate}







% If there is only one active index, i.e. $I = \{i\}$, we return the $M$ copies of the trivial clustering, with the uniform weights $w_j = F(\{y_i\})$. Otherwise we divide the index set into halves, $I_1$ and $I_2$. We then run the algorithm recursively to obtain $M$-particle collections $\hat{\pi}_1$ and $\hat{\pi}_2$ representing the respective posteriors. We draw $M$ particles with replacement from each approximation, $x_{j}^1 \sim \hat{\pi}_1$ and $x_{j}^2 \sim \hat{\pi}_2$, for $j \in [1, M]$. Then, for each $j$, we run a proposal kernel $K$ on $(x_j^1, x_j^2)$ to yield $x_j$. The kernel considers each component of the $x_j^2$ partition in turn, deciding whether to merge it with an available component in $x_j^1$, or leave it as its own component. If a merge is decided, the chosen component is no longer available to be merged with other components at this stage of the algorithm. In the resulting particle $x_j$, if $a, b \in I_1$ or $a, b \in I_2$, then $a$ and $b$ have the same relationship in $x_j$ (i.e., either same-cluster or different-cluster) as they did in $x_j^1$ or $x_j^2$. But for $a \in I_1, b \in I_2$, after the merge any relationship is possible.

% \textbf{Dataset shuffling.} The agglomerative SMC algorithm can be additionally modified so that, to some extent, different particles see different groupings of the data, leading to more particle diversity. In partuclar, at step (2) of the algorithm, if $M$ is above some threshold $\tau$, we consider two possible index splits, $(I_{11}, I_{12})$ and $(I_{21}, I_{22})$. For each split, we perform the two recursive calls as described in Step (2), but with only $\frac{M}{2}$ particles each. Then, we follow steps 3-7 individually for each split, to yield two $\frac{M}{2}$-particle collections $\hat{\pi}_1$ and $\hat{\pi}_2$. We take their union to yield a final $\hat{\pi}$ that we return.

% \textbf{Experimental details.} We ran the locally optimal SMC baseline with 32 particles, which took approximately 30 seconds, and ran the Agglomerative SMC algorithm with 128 particles, with $\rho = 0.0$ and $\tau = 2$, which took the same amount of time. In the body of the paper, we ran each experiment independently three times to estimate error bars; we reported the mean and standard deviations of the log weights, and the best data cleaning accuracy achieved by any of the 3 runs (after performing one additional Gibbs sweep on the algorithm results).\footnote{We have since run additional trials (30 for each algorithm), yielding log weights of $-32,000 \pm 709$ for Agglomerative SMC and $-40,239 \pm 1,532$ for the baseline.}





\section{Further Examples}
\label{sec:appendix-examples}

This appendix lists examples of popular Monte Carlo and variational inference algorithms, and explains how they can be viewed as inference strategies. In addition, some of these algorithms can be viewed as \textit{inference strategy combinators}, because they feature user-chosen proposal distributions or variational families that can themselves be instantiated with inference strategies.\footnote{This `combinator' viewpoint evokes earlier work by~\citet{scibior2018denotational} and \citet{stites2021learning}. For example, \citet{stites2021learning} introduce combinators for creating properly weighted samplers compositionally, with parameters that can be optimized using standard or nested variational objectives. Some of their combinators have equivalents in this section, e.g., their \texttt{propose} combinator is similar to the construction we present for Nested Importance Sampling in Section~\ref{sec:nsmc-example}. However: (1) the fundamental compositional operation in RAVI, of combining a posterior approximation with a meta-posterior approximation, cannot be achieved using their combinators; (2) as such, some of the algorithms that RAVI covers cannot be constructed using their combinators; and (3) their combinators produce properly weighted samplers, which contain `less information' than inference strategies: an inference strategy can be used, e.g., as a proposal distribution in Metropolis-Hastings, whereas properly weighted samplers cannot in general be used this way.}


% \subsection*{Notation}

% Throughout this appendix, we use a notation that fully \textit{unrolls} an inference strategy into a sequence $p_0, p_1, \dots, p_n$, where $p_0(x_1, x_0)$ is the model, $p_1(x_2, x_1; x_0)$ is $\mathcal{S}.q$, $p_2(x_3, x_2; x_1, x_0)$ is $\mathcal{S}.\mathcal{M}(x_1).q$, and so on. %This is depicted in Figure~\ref{fig:nested} and described Section~\ref{sec:theory}. 
% We call $n$ the \textit{length} of an inference strategy.

\subsection{$N$-particle Importance Sampling}
\label{sec:sir-example}

\begin{wrapfigure}{L}{0.5\textwidth}
\vspace{-6mm}
\begin{algorithm}[H]
    \label{infstrat:sir}
    \SetAlgoLined\DontPrintSemicolon
    \footnotesize{
    \textbf{RAVI Inference Strategy:} $N$-particle Importance Sampling\;
    \SetKwFunction{sir}{sir($\tilde{\pi}, q, N$).q}\SetKwFunction{sirm}{sir($\tilde{\pi}, q, N$).M($x$).q}
    % \SetKwFunction{mcvimm}{rmcvi($M, K$).M($x$).M($x_{0:M})$.q}
    \SetKwProg{infalg}{Posterior Approx.}{}{}
    \SetKwInOut{Infers}{Target of inference}
    \SetKwInOut{Aux}{Auxiliary variables}
    \infalg{\sir{}}{
    \Infers{latent variable $x$}
    \Aux{particles $x_{1:N}$, chosen particle index $j$}
    \nl \For{$i \in 1, \dots, N$}{
        \nl $x_i \sim q$\;
        \nl $w_i \gets \frac{\tilde{\pi}(x_i)}{q(x_i)}$\;
    }
    \nl $j \sim \text{Discrete}(w_{1:N})$\;
    \nl \Return{$x_j$}\;}{}
    \setcounter{AlgoLine}{0}
    \SetKwProg{metaalg}{Meta-Posterior Approx.}{}{}
    \metaalg{\sirm{}}{
    \Infers{particles $x_{1:N}$, chosen particle index $j$}
    \Aux{None}
    \nl $j \sim \text{Uniform}(1, N)$\;
    \nl $x_j \gets x$\;
    \nl \For{$i \in 1,\dots,j-1,j+1,\dots,N$}{
        \nl $x_i \sim q$\;
    }
    \nl \Return{$(x_{1:N}, j)$}}
    % \setcounter{AlgoLine}{0}
    % \SetKwProg{metaalg}{Meta-Meta-Posterior Approx.}{}{}
    % \metaalg{\mcvimm{}}{
    % \Infers{SMC particles $x_{0:M}^{1:K}$, ancestor indices $a_0, a_{1:M}^{1:K}$}
    % \Aux{None}
    % %\nl $a_\text{prev} \gets a_0$\;
    % \nl \For{$i \in 0, \dots, M$}{
    %     $b_i \sim  \text{Uniform}(1, K)$\;
    %     %$a_\text{prev} \gets a_i^{a_{\text{prev}}}$\;
    % }
    % \nl \For{$k \in 1,\dots, K$}{
    %     \nl $(x_M^k, w_M^k) \gets (x, q_m(x)])$\;
    % }
    % \nl \For{$i \in M-1, \dots, 0$}{
    %     \nl \For{$k \in 1,\dots,K$}{
    %         \nl \If{$k = b_i$}{
    %             \nl $(a_{i+1}^k, x_i^k) \gets (b_{i+1}, x_i)$\;
    %         }
    %         \nl \Else{
    %             \nl $a_{i+1}^k \sim \text{Discrete}(w_{i+1}^{1:K})$\;
    %             \nl $x_i^k \sim R_i(x_{i+1}^{a_{i+1}^k} \rightarrow \cdot)$\;
    %         }
    %         \nl $w_i^k \gets \frac{q_i(x_i^k)T(x_i \rightarrow x_{i+1}^{a_{i+1}^k})}{q_{i+1}(x_{i+1}^{a_{i+1}^k})R_i(x_{i+1}^{a_{i+1}^k} \rightarrow x_i^k)}$\;
    %     }
    % }
    % \nl $a_0 \gets b_0$\;
    % \nl \Return{$(a_0, a_{1:M}^{1:K}, x_{0:M}^{1:K})$}}
    }
    \vspace{-10mm}
\end{algorithm} 
\end{wrapfigure}

Consider the $N$-particle importance sampling estimator $$\hat{Z} = \frac{1}{N} \sum_{i=1}^N \frac{\tilde{\pi}(x_i)}{q(x_i)},\text{ for }x_i \sim q.$$ The same estimator can be recovered as a \textit{one-particle} \texttt{IMPORTANCE} estimate, by applying Alg.~\hyperref[alg:alg1]{1} to the~\hyperref[infstrat:sir]{\texttt{sir}} inference strategy.

The proposal $\mathcal{S}.q$ generates $N$ particles $x_{1:N}$, and selects an index $j$ from a discrete distribution on $1, \dots, N$, with weights proportional to $w_i = \tilde{\pi}(x_i) / q(x_i)$. The meta-proposal is responsible for inferring $j$ and the complete set of particles $x_{1:N}$, given the chosen particle $x$. It uses the conditional SIR algorithm~\citep{andrieu2010particle} to do so, proposing $j$ uniformly in $\{1, \dots, N\}$, and generating values for the un-chosen particles $x_{-j}$ from $q$. 


\begin{wrapfigure}{L}{0.5\textwidth}
    \vspace{-6mm}
    \begin{algorithm}[H]
        \label{infstrat:ravisir}
        \SetAlgoLined\DontPrintSemicolon
        \footnotesize{
        \textbf{RAVI Inference Strategy:} $N$-particle IS with RAVI strategy $\mathcal{S}$\;
        \SetKwFunction{ravisir}{ravi-sir($\tilde{\pi}, \mathcal{S}, N$).q}\SetKwFunction{ravisirm}{ravi-sir($\tilde{\pi}, \mathcal{S}, N$).M($x$).q}
        % \SetKwFunction{mcvimm}{rmcvi($M, K$).M($x$).M($x_{0:M})$.q}
        \SetKwProg{infalg}{Posterior Approx.}{}{}
        \SetKwInOut{Infers}{Target of inference}
        \SetKwInOut{Aux}{Auxiliary variables}
        \infalg{\ravisir{}}{
        \Infers{latent variable $x$}
        \Aux{particles $x_{1:N}$, aux. proposal variables $v_\mathcal{S}^{1:N}$, chosen particle index $j$}
        \nl \For{$i \in 1, \dots, N$}{
            \nl $x_i, w_i \sim \texttt{IMPORTANCE}(\tilde{\pi}, \mathcal{S})$ w. aux. vars $v_\mathcal{S}^i$\;
            %\nl $w_i \gets \frac{\tilde{\pi}(x_i)}{q(x_i)}$\;
        }
        \nl $j \sim \text{Discrete}(w_{1:N})$\;
        \nl \Return{$x_j$}\;}{}
        \setcounter{AlgoLine}{0}
        \SetKwProg{metaalg}{Meta-Posterior Approx.}{}{}
        \metaalg{\ravisirm{}}{
        \Infers{particles $x_{1:N}$, aux. proposal variables $v_\mathcal{S}^{1:N}$, chosen particle index $j$}
        \Aux{None}
        \nl $j \sim \text{Uniform}(1, N)$\;
        \nl $x_j \gets x$\;
        \nl $\_ \sim \texttt{HME}(\tilde{\pi}, x_j, \mathcal{S})$ w. aux. vars $v_\mathcal{S}^j$\;
        \nl \For{$i \in 1,\dots,j-1,j+1,\dots,N$}{
            \nl $x_i, \_ \sim \texttt{IMPORTANCE}(\tilde{\pi}, \mathcal{S})$ w. aux. vars $v_{\mathcal{S}}^i$\;
        }
        \nl \Return{$(v_\mathcal{S}^{1:N}, x_{1:N}, j)$}}
        }
        \vspace{-5mm}
    \end{algorithm} 
\end{wrapfigure}


This is a suboptimal choice of $\mathcal{S}.\mathcal{M}(x).q$; lower-variance estimates $\hat{Z}$ can be obtained by improving meta-inference, either by incorporating problem-specific domain knowledge or via learning. However, in many cases, improved meta-inference may not be worth the computation required; it remains to be seen whether techniques such as amortized learning can be applied to deliver accuracy gains at low computational cost.

\textbf{Instantiating the proposal $q$ as its own inference strategy.} 
The above assumes that $q$ has a tractable marginal density. When it doesn't, the inner importance sampling loop can use a RAVI inference strategy $\mathcal{S}$ instead of a tractable proposal $q$. This modification is presented in the higher-order inference strategy~\hyperref[infstrat:ravisir]{\texttt{ravi-sir}}. 
One way to think about this construction is as a way to improve any existing inference strategy $\mathcal{S}$ by `adding replicates.' The resulting estimator of $Z$ is the mean of $N$ independent $\hat{Z}$ estimates from the original inference strategy.


\subsection{Importance-Weighted Autoencoders}
\label{sec:iwae-example}

The importance-weighted auto-encoder arises by considering the same inference strategy as in Section~\ref{sec:sir-example}, but as a variational inference procedure (Alg. 3) rather than a Monte Carlo procedure. 

Because $\texttt{sir}(\tilde{\pi}, q, N).q$ of this inference strategy corresponds to $N$-particle sampling importance-resampling (SIR), it has been argued that IWAE is in fact `vanilla' variational inference, but with a variational family that uses SIR to more closely approximate the posterior~\cite{bachman2015training}. However, \citet{cremer2017reinterpreting} show that deriving the ELBO for that variational family gives rise to a different objective, and that IWAE gives a looser lower bound on $\log Z$ than this idealized (but generally intractable) objective. 

In the RAVI framework, these two objectives arise from different inference strategies, which share the same $\mathcal{S}.q$ (SIR in both cases), but use different meta-inference $\mathcal{S}.\mathcal{M}$. IWAE uses the simple conditional SIR meta-inference introduced in Section~\ref{sec:sir-example}, whereas \citet{cremer2017reinterpreting}'s idealized objective can be derived by using the optimal choice of $\mathcal{S}.\mathcal{M}(x).q(j, x_{1:N})$\textemdash the exact posterior of the SIR procedure. The looser bound obtained by IWAE can be seen as a result of its $\mathcal{S}.\mathcal{M}$ performing poorer meta-inference: inference about the auxiliary variables of the SIR inference algorithm used in $\mathcal{S}.q$.

\subsection{$N$-particle Sequential Monte Carlo}
\label{sec:smc-example}


\begin{wrapfigure}{L}{0.6\textwidth}
    \vspace{-2mm}
    \begin{algorithm}[H]
        \label{infstrat:smc}
        \SetAlgoLined\DontPrintSemicolon
        \footnotesize{
        \textbf{RAVI Inference Strategy:} $N$-particle SMC w. RAVI strategies\;
        \SetKwFunction{ravismc}{smc($\tilde{\pi}_{1:T}, \mathcal{S}, K_{2:T}, L_{2:T}, N$).q}\SetKwFunction{ravismcm}{smc($\tilde{\pi}_{1:T}, \mathcal{S}, K_{2:T}, L_{2:T}, N$).M($x$).q}
        % \SetKwFunction{mcvimm}{rmcvi($M, K$).M($x$).M($x_{0:M})$.q}
        \SetKwProg{infalg}{Posterior Approx.}{}{}
        \SetKwInOut{Infers}{Target of inference}
        \SetKwInOut{Aux}{Auxiliary variables}
        \infalg{\ravismc{}}{
        \Infers{latent variable $x$ targeting $\tilde{\pi}_T$}
        \Aux{particles $x^{1:T}_{1:N}$, aux. proposal variables $v_\mathcal{S}^{1:N}$, aux. $K$ vars $v_{K_{2:T}}^{1:N}$, aux. $L$ vars $v_{L_{2:T}}^{1:N}$, ancestor variables $a^{1:T-1}_{1:N}$, final chosen particle index $j$}
        \nl \For{$i \in 1, \dots, N$}{
            \nl $x^1_i, w^1_i \sim \texttt{IMPORTANCE}(\tilde{\pi}_1, \mathcal{S})$ w. aux. vars $v_\mathcal{S}^i$\;
        }
        \nl \For{$t \in 2, \dots, T$}{
            \nl \For{$i \in 1, \dots, N$}{
                \nl $a_i^{t-1} \sim \text{Discrete}(w^{t-1}_{1:N})$\;
                \nl $x_i^t, \hat{w} \sim \texttt{IMPORTANCE}(\tilde{\pi}_t, K_t(x^{t-1}_{a^{t-1}_i}))$ w. aux. vars $v_{K_t}^i$\;
                \nl $\check{w} \sim \texttt{HME}(\tilde{\pi}_{t-1}, x^{t-1}_{a^{t-1}_i}, L_t(x_i^t))$ w. aux. vars $v_{L_t}^i$\;
                \nl $w_i^{t} \gets \hat{w} \cdot \check{w}$\;
            }
        }
        \nl $j \sim \text{Discrete}(w^T_{1:N})$\;
        \nl \Return{$x^T_j$}\;}{}
        \setcounter{AlgoLine}{0}
        \SetKwProg{metaalg}{Meta-Posterior Approx.}{}{}
        \metaalg{\ravismcm{}}{
        \Infers{particles $x^{1:T}_{1:N}$, aux. proposal variables $v_\mathcal{S}^{1:N}$, aux. $K$ vars $v_{K_{2:T}}^{1:N}$, aux. $L$ vars $v_{L_{2:T}}^{1:N}$, ancestor variables $a^{1:T-1}_{1:N}$, final chosen particle index $j$}
        \Aux{None}
        \nl $j \sim \text{Uniform}(1, N)$\;
        \nl $x_j^T, b_T \gets x, j$\;
        \nl \For{$t \in T, \dots, 2$}{
            \nl $a_{b_t}^{t-1} \sim \text{Uniform}(1, N)$\;
            \nl $b_{t-1} \gets a_{b_t}^{t-1}$\;
            \nl $x_{b_{t-1}}^{t-1}, \check{w} \sim \texttt{IMPORTANCE}(\tilde{\pi}_{t-1}, L_t(x_{b_t}^t))$ w. aux. vars $v_{L_t}^{b_t}$\;
            \nl $\hat{w} \sim \texttt{HME}(\tilde{\pi}_{t}, x_{b_t}^t, K_t(x_{b_{t-1}}^{t-1}))$ w. aux. vars $v_{K_t}^{b_t}$\;
            \nl $w_{b_t}^t \gets (\hat{w} \cdot \check{w})^{-1}$\;
        }
        \nl $w^1_{b_1} \sim \texttt{HME}(\tilde{\pi}_1, x_{b_1}^1, \mathcal{S})$ w. aux. vars $v_\mathcal{S}^{b_1}$\;
        \nl \For{$i \in 1, \dots, b_1-1, b_1+1, \dots, N$}{
            \nl $x_{i}^1, w_i^1 \sim \texttt{IMPORTANCE}(\tilde{\pi}_1, \mathcal{S})$ w. aux. vars $v_\mathcal{S}^i$\;
        }
        \nl \For{$t \in 2, \dots, T$}{
            \nl \For{$i \in 1, \dots, b_t-1, b_t+1, \dots, N$}{
                \nl $a_i^{t-1} \sim \text{Discrete}(w^{t-1}_{1:N})$\;
                \nl $x_i^t, \hat{w} \sim \texttt{IMPORTANCE}(\tilde{\pi}_t, K_t(x^{t-1}_{a^{t-1}_i}))$ w. aux. vars $v_{K_t}^i$\;
                \nl $\check{w} \sim \texttt{HME}(\tilde{\pi}_{t-1}, x^{t-1}_{a^{t-1}_i}, L_t(x_i^t))$ w. aux. vars $v_{L_t}^i$\;
                \nl $w_i^{t} \gets \hat{w} \cdot \check{w}$\;
            }
        }
        % \nl $\_ \sim \texttt{HME}(\tilde{\pi}, x_j, \mathcal{S})$ w. aux. vars $v_\mathcal{S}^j$\;
        % \nl \For{$i \in 1,\dots,j-1,j+1,\dots,N$}{
        %     \nl $\_, x_i \sim q$ w. aux. vars $v_{\mathcal{S}}^i$\;
        % }
        \nl \Return{$(x_{1:N}^{1:T}, v_\mathcal{S}^{1:N}, v_{K_{2:T}}^{1:N}, v_{L_{2:T}}^{1:N}, a_{1:N}^{1:T-1},j)$}}
        }
        \vspace{-25mm}
    \end{algorithm} 
\end{wrapfigure}

The sequential Monte Carlo family of algorithms~\citep{chopin2020introduction, del2006sequential} evolves a population of \textit{weighted particles} to approximate a sequence of target distributions. SMC can be viewed as standard importance sampling, with an inference strategy in which $\mathcal{S}.q$ is the sampling distribution for SMC, and $\mathcal{S}.\mathcal{M}(x)$ is the conditional SMC algorithm~\citep{andrieu2010particle}.

Standard SMC is parameterized by:

\begin{enumerate}
    \item A sequence $\tilde{\pi}_{1:T}$ of intermediate target distributions, with $\tilde{\pi}_T = \tilde{\pi}$ the ultimate target;
    \item An initial proposal $q(x_1)$;
    \item A sequence $K_t(x_{t-1} \rightarrow x_t)$ of proposal kernels for $t=2, \dots, T$; and
    \item A sequence $L_t(x_{t} \rightarrow x_{t-1})$ of backward kernels for $t=2, \dots, T$.
\end{enumerate}

Here, we show a version of SMC (the inference strategy~\hyperref[infstrat:smc]{\texttt{smc}}) that behaves as a `higher-order inference strategy,' or `inference strategy combinator': it allows for an initial proposal, proposal kernels, and backward kernels that do not have tractable marginal densities. Our version is parameterized by:

\begin{enumerate}
    \item A sequence $\tilde{\pi}_{1:T}$ of intermediate target distributions, with $\tilde{\pi}_T = \tilde{\pi}$ the ultimate target;
    \item An initial proposal $\mathcal{S}$ (a RAVI strategy);
    \item A sequence of inference strategy families $K_t(x_{t-1})$ parameterized by $x_{t-1}$, for $t=2, \dots, T$, targeting $\tilde{\pi}_t$; and
    \item A sequence of inference strategy families $L_t(x_{t})$ of backward kernels, parameterized by $x_t$, for $t=2, \dots, T$.
\end{enumerate}

The posterior approximation $\mathcal{S}.q$ runs a version of SMC that uses $\texttt{HME}$ and $\texttt{IMPORTANCE}$ to compute weights. The meta-posterior approximation $\mathcal{S}.\mathcal{M}(x).q$ runs a similarly modified version of conditional SMC~\citep{andrieu2010particle}. When \texttt{IMPORTANCE} is run on the \texttt{smc} inference strategy, the final weight $\hat{Z}$ is the SMC marginal likelihood estimate, the product of the averages of the weights from each time step.

It is possible to adapt this strategy to use adaptive resampling and rejuvenation. (Rejuvenation moves do not actually require modification: they can be incorporated by including them as explicit $(K, L)$ pairs, where $L$ is the time-reversal of an MCMC kernel $K$.) However, we are not aware of a way to justify the adaptive choice of rejuvenation kernel.

% Consider the goal of generating a properly weighted collection of particles for a target distribution $p(x \mid y)$. At the top layer of the inference strategy, we choose a proposal that first generates a weighted collection of particles for an intermediate target $\pi(z)$, perhaps recursively using sequential Monte Carlo to do so. Having generated the weighted collection, $\{(z_1, w_1), \dots, (z_M, w_M)\}$ a \textit{resampling} step is performed. We consider a multi-particle resampling strategy $\Psi(\mathbf{z}, d\mathbf{j})$ that takes as input $K$ particles and outputs $M$ indices $j_1, \dots, j_M$. For example, we might set $\Psi(\mathbf{z}, d\mathbf{j}) = \prod_{i=1}^{M} \frac{w_{j_i}}{\sum_{l=1}^M w_l}$ to perform \textit{multinomial resampling}, selecting each $j_i$ independently according to the weights of the $z$ particles. Each resampled $z_{j_i}$ is then transformed independently into a proposed $x_i$ using a kernel $K(z, dx)$, and then reweighted to target $p$: $w'_i = w_{j_i} \frac{p(x_i \mid y) L(z_{j_i}; x_i)}{\pi(z_{j_i}) K(x_i; z_{j_i})}$. As described in \citet{del2006sequential}, $L$ is a user-chosen meta-inference kernel that attempts to reverse $K$, recovering $z$ from $x$.

% Meta-inference requires three ingredients. Given an $x$, we can use $L$ to propose a $z$ that was used to generate it. We then need to propose a possible weighted collection of $z$ values that includes that $z$ at some index. This can be done in two steps: first choosing an index where the distinguished $z$ will end up (using an index distribution $\hat{\psi}$), and then using a distribution $\hat{G}$ to perform meta-infernece about the sampling process that generated the collection of $z$ particles. When that generating process is itself an SMC algorithm, $\hat{G}$ will involve running another $L$ kernel on the distinguished $z$ particle, sampling another ancestor index using $\hat{\psi}$, and running another $\hat{G}$ to generate the previous step's weighted collection. Repeating this process yields the conditional SMC algorithm of~\citep{andrieu2010particle}; variants can be designed to accommodate different resampling strategies $\Psi$ or more general SMC-like algorithms, as in~\citet{lindsten2017divide} or our Agglomerative Monte Carlo algorithm. 


% The first approximation $p_1$ generates a proposed $x_1$ by running Sequential Monte Carlo, and choosing a particle $x_1 = x_1^{(c)}$ to return, where $p_1(c = i) \propto w_i$. This procedure introduces many auxiliary variables, which we view as constituting $x_2$: the proposals at each step of SMC, any resampling choices made between SMC steps, and the final choice $c$ of a particle to return. 

% The next approximation $p_2$ is meant to approximate the posterior $p_1(x_2 \mid x_1; x_0)$ over these latent auxiliary variables, given the observed chosen particle $x_1$. We choose the \textit{conditional SMC} algorithm as this distribution, which begins by sampling a history for the chosen particle $x_1$, then simulates SMC forward, but fixing this one particle's trajectory to ensure it appears in the final set of particles. 

% In the ratio $\frac{p_0(x_1, x_0) \cdot p_2(x_2; x_1, x_0)}{p_1(x_2, x_1; x_0)}$, many terms cancel, leaving only $\frac{1}{N} \sum_{i=1}^{N} w_i$. 


\subsection{Variational Sequential Monte Carlo}
\label{sec:vsmc-example}

The Variational Sequential Monte Carlo~\citep{naesseth2018variational} objective corresponds to Alg. 3, with the same RAVI inference strategy as in Appendix~\ref{sec:smc-example}. However, the default gradient estimator from Alg. 3 will have high variance. \citet{naesseth2018variational} recommend using a biased estimator of the gradient, that uses reparameterization where possible and discards the score function terms arising from resampling steps.

\subsection{Annealed Importance Sampling}
\label{sec:ais-example}

\begin{wrapfigure}{L}{0.5\textwidth}
    \vspace{-6mm}
    \begin{algorithm}[H]
        \label{infstrat:ais}
        \SetAlgoLined\DontPrintSemicolon
        \footnotesize{
        \textbf{RAVI Inference Strategy:} Annealed Importance Sampling\;
        \SetKwFunction{ais}{ais($\tilde{\pi}_{1:T}, \mathcal{S}, K_{2:T}$).q}\SetKwFunction{aism}{ais($\tilde{\pi}_{1:T}, \mathcal{S}, K_{2:T}$).M($x$).q}
        % \SetKwFunction{mcvimm}{rmcvi($M, K$).M($x$).M($x_{0:M})$.q}
        \SetKwProg{infalg}{Posterior Approx.}{}{}
        \SetKwInOut{Infers}{Target of inference}
        \SetKwInOut{Aux}{Auxiliary variables}
        \infalg{\ais{}}{
        \Infers{latent variable $x$ targeting $\tilde{\pi}_T$}
        \Aux{$x^{1:T}$, aux. vars $v_\mathcal{S}$ of initial proposal}
        \nl $x_1, \_ \sim \texttt{IMPORTANCE}(\tilde{\pi}_1, \mathcal{S})$ w. aux. vars $v_\mathcal{S}$\;
        \nl \For{$t \in 2, \dots, T$}{
            \nl $x_t \sim K_t(x_{t-1} \rightarrow \cdot)$\;
        }
        \nl \Return{$x_T$}\;}{}
        \setcounter{AlgoLine}{0}
        \SetKwProg{metaalg}{Meta-Posterior Approx.}{}{}
        \metaalg{\aism{}}{
        \Infers{$x^{1:T}$, aux. vars $v_\mathcal{S}$ of initial proposal}
        \Aux{None}
        \nl $x_T \gets x$\;
        \nl \For{$t \in T, \dots, 2$}{
            \nl $x_{t-1} \sim \tilde{K}_t(x_t \rightarrow \cdot)$\tcp*{$\tilde{K}_t$ is time reversal of $K_t$}
        }
        \nl $\_ \sim \texttt{HME}(\tilde{\pi}_1, x_1, \mathcal{S})$ w. aux. vars $v_\mathcal{S}$\;
        % \nl $\_ \sim \texttt{HME}(\tilde{\pi}, x_j, \mathcal{S})$ w. aux. vars $v_\mathcal{S}^j$\;
        % \nl \For{$i \in 1,\dots,j-1,j+1,\dots,N$}{
        %     \nl $\_, x_i \sim q$ w. aux. vars $v_{\mathcal{S}}^i$\;
        % }
        \nl \Return{$(x_{1:T}, v_\mathcal{S})$}}
        }
        \vspace{-7mm}
    \end{algorithm} 
\end{wrapfigure}

In annealed importance sampling, the practitioner chooses a sequence of unnormalized target distributions $\tilde{\pi}_{1:T}$, where $\pi_T$ is the posterior distribution of interest. Typically $\pi_1$ is chosen to be a distribution that is easy to approximate with a proposal $q$, and each $\pi_i$ is slightly closer to the true target $\pi_T$ than the last. The user also chooses a sequence of kernels $K_t(x_{t-1} \rightarrow x_t)$, where $K_t$ is stationary for $\pi_{t-1}$. The algorithm begins by sampling an initial point $x_1 \sim q$, transforming it through the sequence of kernels to obtain $x_2, \dots, x_T$, and returning $x_T$ as the inferred value of $x$. The associated weight is $$\hat{Z} = \frac{\tilde{\pi}_1(x_1) \cdot \dots \cdot \tilde{\pi}_T(x_T)}{q(x_1) \cdot \tilde{\pi}_1(x_2) \cdot \dots \cdot \tilde{\pi}_{T-1}(x_T)}.$$

This procedure corresponds to running Alg.~\hyperref[alg:alg1]{1} on the~\hyperref[infstrat:ais]{\texttt{ais}} inference strategy. The inference process runs the kernels $K_t$ forward, whereas the meta-inference process runs their time reversals backward: $\tilde{K}_t(x_t \rightarrow x_{t-1}) \propto \pi_{t-1}(x_{t-1}) \cdot K_t(x_{t-1} \rightarrow x_t)$.

Note that if $K$ is a stationary kernel for $\pi_i$, so is $K^m$ for any natural number $m$. With sufficient computation (increasing $m$), we can ensure that the AIS top-level proposal $\texttt{ais}(\dots).q$ is arbitrarily close to the target posterior $\pi_T$. However, doing so will not necessarily lead to lower-variance weights: RAVI makes clear that it is also necessary to consider the quality of meta-inference. 

Consider the job of $\tilde{K}_T$, which in the context of the meta-posterior approximation $\texttt{ais}.\mathcal{M}(x)$ is supposed to infer $x_{T-1}$ from $x_{T}$. $\tilde{K}_T$  is the exact meta-posterior of $x_{T-1}$ given $x_T$ \textit{assuming that, in the forward direction, $x_{T-1}$ was distributed according to $\pi_{T-1}$}. However, in the forward direction, if each $K_t$ is run sufficiently many times to ensure mixing at each step, $x_{T-1}$ will in fact be distributed according to $\pi_{T-2}$. This gap\textemdash between the optimal meta-inference kernels and the actual $\tilde{K}$ kernels\textemdash is partly responsible for the variance of the AIS estimator, and can be mitigated by using a finer annealing schedule that brings successive target distributions closer together.  It could also be mitigated by learning a better reverse annealing chain.

\subsection{Nested Sequential Monte Carlo}
\label{sec:nsmc-example}

We first consider Nested Importance Sampling. As in RAVI, Nested Importance Sampling is concerned with importance sampling when the proposal distribution $q$  cannot be tractably evaluated. But RAVI and NIS take different approaches:

\begin{enumerate}
    \item RAVI assumes $q$ can be simulated, but that the (normalized) density cannot be evaluated. RAVI generates proposals exactly distributed according to the user's desired proposal $\mathcal{S}.q$, and generates approximations to the ideal importance weights.
    
    \item NIS does not assume $q$ can be simulated, but does assume that its unnormalized density $\tilde{q}$ is available. As such, proposals are not simulated from $q$, but rather from a Sampling/Importance-Resampling (SIR) approximation to $q$.
\end{enumerate}

The NIS procedure with an intractable proposal $q$ corresponds exactly to a special case of the RAVI algorithm, with the RAVI proposal $\mathcal{S}.q$ set \textit{not} to $q$ but rather to an SIR sampling distribution targeting $q$ using some tractable proposal $h$. Compare:

\begin{itemize}
    \item Ordinary SIR targeting $\tilde{\pi}$ with proposal $h$: recovered by running $\texttt{IMPORTANCE}(\tilde{\pi}, \texttt{sir}(\tilde{\pi}, h, N))$ (see Section~\ref{sec:sir-example} for \texttt{sir} inference strategy).
    \item Nested IS targeting $\tilde{\pi}$ with unnormalized proposal density $\tilde{q}$, approximated using SIR with $h$ as a proposal: recovered by running $\texttt{IMPORTANCE}(\tilde{\pi}, \texttt{sir}(\tilde{q}, h, N))$.
\end{itemize}

That is, under the RAVI perspective, the only difference between ordinary SIR using $h$ and nested IS is that the ideal proposal density $\tilde{q}$ (rather than the target density $\tilde{\pi}$) is used to make the resampling decision about the particles generated by $h$ (the index $j$ in the listing for \texttt{sir}).

% The remainder of the inference strategy is then the same as in Section~\ref{sec:sir-example}. The only difference is that in NIS's $p_1$, the particle index $c$ is chosen according to weights $w_i = q(x_1^{(i)})/h(x_1^{(i)})$ that correct a tractable distribution $h$ toward an intractable but desired proposal $q$, whereas in the version from Section~\ref{sec:sir-example}, the weights $w_i = p_0(x_1^{(i)}, x_0)/h(x_1^{(i)})$ correct the tractable proposal directly toward the target distribution, and there is no intractable proposal $q$.

More generally, \citet{naesseth2015nested} consider procedures other than SIR for approximating ${q}$, arguing that any properly weighted sampler for the intractable proposal $q$ will do. If we let $\mathcal{H}$ be a RAVI inference strategy representing the properly weighted sampler for the intractable proposal $q$ (with unnormalized density $\tilde{q}$), then the Nested IS procedure that uses this properly weighted proposal to perform inference in $\tilde{\pi}$ is $\texttt{IMPORTANCE}(\tilde{\pi}, \texttt{ravi-sir}(\tilde{q}, \mathcal{H}, 1))$ (see~\hyperref[infstrat:ravisir]{\texttt{ravi-sir}} in Section~\ref{sec:sir-example}).

%When Nested Importance Sampling is applied in a nested way (the SIR routine itself using nested importance sampling to generate particles), $p_1$ becomes an SMC distribution: particles are generated from some tractable base distribution, then repeated resampling and reweighting steps are performed (according to a user-designed sequence of intermediate target densities) before finally choosing a single particle to return. In this case $p_2$ corresponds to a CSMC algorithm, which first generates ancestor indices for the chosen particle, then generates the other particles and their trajectories.

Nested SMC is similar, performing Nested IS at each iteration of SMC. To recover this algorithm using RAVI, we use the~\hyperref[infstrat:smc]{\texttt{smc}} inference strategy, but for the proposals $K_t(x_{t-1})$ (which, as described in Section~\ref{sec:smc-example}, can be instantiated with inference strategies), we use~\hyperref[infstrat:ravisir]{\texttt{ravi-sir}} targeting the desired but intractable proposal.

% % A different approach:
% Nested IS is presented in terms of an object-oriented interface: it is assumed that the proposal distribution $Q$ can generate properly weighted samples. We represent $Q$ as its own tower, $q_0, \dots, q_n$, and express the NIS estimator in terms of that tower.

% \textbf{Layer 0.} The model, $p_0(x_1, x_0)$.

% \textbf{Layer 1.} $$p_1(x_2, x_1; x_0) = q_1(x_2^{(y_2)}, x_2^{(y_1)}) \cdot \delta_{x_2^{(y_1)}}(x_1)$$

% % Overall purpose of nesting in NSMC: design an NSMC sampler that targets an 'easier' but still complex distribution, then use that distribution as the proposal for a top-level NSMC algorithm.

% \citep{naesseth2015nested} write that Nested IS ``is different from a random weight IS, since it approximates the proposal distribution (and not just the importance weights).'' 
% %We take the view that there is no such thing as approximating the proposal in a properly weighted sampler; rather, we view whatever procedure is actually used to simulate $X^i$ \textit{as} the proposal. In the case of NIS, this proposal may itself involve sampling/importance-resampling. 
% But Nested IS algorithms \textit{are} random-weight IS algorithms, if we view them not as approximating some proposal, but rather as simply using a different proposal: the actual procedure they use to generate $X^i$. In the typical case, the proposal used by a Nested IS algorithm is $SIR(q, h, M)$, and the weight is simply the 

\subsection{SMC$^2$}
\label{sec:smcsq-example}

\begin{wrapfigure}{L}{0.6\textwidth}
    \vspace{-6mm}
    \begin{algorithm}[H]
        \label{infstrat:smcsq}
        \SetAlgoLined\DontPrintSemicolon
        \footnotesize{
        \textbf{RAVI Inference Strategy:} SMC$^2$\;
        \SetKwFunction{smcsq}{smc$^2$($p, q_1, q, M, N$).q}\SetKwFunction{smcsqm}{smc$^2$($p, q_1, q, M, N$).M($\theta, x_{1:T}$).q}
        % \SetKwFunction{mcvimm}{rmcvi($M, K$).M($x$).M($x_{0:M})$.q}
        \SetKwProg{infalg}{Posterior Approx.}{}{}
        \SetKwInOut{Infers}{Target of inference}
        \SetKwInOut{Aux}{Auxiliary variables}
        \infalg{\smcsq{}}{
        \Infers{parameters $\theta$, sequence $x_{1:T}$}
        \Aux{inner SMC vars $v_\texttt{smc}^T$ of chosen SMC$^2$ particle, other SMC$^2$ vars $v$}
        \nl \tcp{the targets $\tilde{\pi}_t$ depend on $M$, $p$, $q_1$, and $q$}
        \nl $(\theta, x_{1:T}, v_\texttt{smc}^T), \_ \sim \texttt{IMPORTANCE}(\tilde{\pi}_T, \texttt{smc}(\tilde{\pi}_{1:T}, K_{2:T}^2, L_{2:T}^2, N))$ w. aux. vars $v$\;
        \nl \Return{$\theta, x_{1:T}$}\;}{}
        \setcounter{AlgoLine}{0}
        \SetKwProg{metaalg}{Meta-Posterior Approx.}{}{}
        \metaalg{\smcsqm{}}{
        \Infers{inner SMC vars $v_\texttt{smc}^T$ of chosen SMC$^2$ particle, other SMC$^2$ vars $v$}
        \Aux{None}
        \nl $\_ \sim \texttt{HME}(p_T^\theta, x_{1:T}, \texttt{smc}(p^\theta_{1:T}, q_1, K_{2:T}, L_{2:T}, M))$ w. aux. vars $v_\texttt{smc}^T$\;
        \nl $\_ \sim \texttt{HME}(\tilde{\pi}_T, (\theta, x_{1:T}, v_\texttt{smc}^T), \texttt{smc}(\tilde{\pi}_{1:T}, K_{2:T}^2, L_{2:T}^2, N))$ w. aux. vars $v$\;
        % \nl $\_ \sim \texttt{HME}(\tilde{\pi}, x_j, \mathcal{S})$ w. aux. vars $v_\mathcal{S}^j$\;
        % \nl \For{$i \in 1,\dots,j-1,j+1,\dots,N$}{
        %     \nl $\_, x_i \sim q$ w. aux. vars $v_{\mathcal{S}}^i$\;
        % }
        \nl \Return{$(v_\texttt{smc}^T, v)$}}
        }
        \vspace{-7mm}
    \end{algorithm} 
\end{wrapfigure}

Suppose we are working with a state-space model $p(\theta) \prod_{i=1}^T p(x_i \mid x_{1:i-1}, \theta) p(y_i \mid x_i, \theta)$. For a fixed $\theta$, an SMC algorithm could be used to target the successive posteriors $p_t^\theta(x_{1:t}) = p(x_{1:t} \mid y_{1:t}, \theta)$, with proposal kernels $K_t(x^{t-1}_{1:t-1} \rightarrow x^t_{1:t}) = \delta_{x^{t-1}_{1:t-1}}(x^t_{1:t-1})q(x^t_t; x^t_{1:t-1}, y_{1:t}, \theta)$ (for some choice of $q$) and deterministic backward kernels $L_t(x^t_{1:t} \rightarrow x^{t-1}_{1:t-1}) = \delta_{x^t_{1:t-1}}(x^{t-1}_{1:t-1})$. The RAVI strategy implementing that SMC algorithm is $\texttt{smc}(p^\theta_{1:T}, q_1, K_{2:T}, L_{2:T}, N)$, where $q_1(x_1; \theta)$ is a proposal for an initial $x_1$ and $N$ is the number of particles. 

If we also wish to infer $\theta$, we can instead use the SMC$^2$ algorithm~\citep{chopin2013smc2}. We define extended targets
$$\pi_t(\theta, x_{1:t}, v_\texttt{smc}^t) = p(\theta \mid y_{1:t}) p(x_{1:t} \mid y_{1:t}, \theta) p_{\texttt{HME}}^{\texttt{smc}(p_{1:t}^\theta, q_1, K_{2:t}, L_{2:t}, N)}(v_\texttt{smc}^t; x_{1:t}),$$
which are defined over not only $\theta$ and $x_{1:t}$ but also all the auxiliary variables $v_\texttt{smc}^t$ used during steps 1 through $t$ of SMC. The variables $v_\texttt{smc}$ and the $p_\texttt{HME}$ distribution over them are as defined in Appendix~\ref{sec:ravi-mcmc}. We write $\tilde{\pi}_t$ for the unnormalized versions of these targets, with normalizing constant $p(y_{1:t})$.

The SMC$^2$ algorithm targets this sequence of extended posteriors. We write $K_t^2$ for the forward kernels used by this outer SMC algorithm. The kernel $K_t^2$ extends the SMC state variables $v_\texttt{smc}^{t-1}$ to new state variables $v_\texttt{smc}^{t}$ by running the particle filter forward one step, resampling the chosen trajectory index $j$ based on the new weights for time step $t$, and updating $x_{1:t}$ to match the $j^\text{th}$ trajectory. The corresponding backward kernel $L_t^2$ deletes the $t^\text{th}$ step of the particle deterministically, then reproposes $j$ based on the step $t-1$ weights, setting $x_{1:t-1}$ to match the $j^\text{th}$ trajectory.

The SMC$^2$ algorithm corresponds to the RAVI strategy~\hyperref[infstrat:smcsq]{\texttt{smc$^2$}}. Running the outer SMC yields an approximate sample from $\tilde{\pi}_T$, which includes auxiliary variables $v_\texttt{smc}^T$. Meta-inference runs two rounds of conditional SMC: first, to recover the inner layer of SMC's variables $v_\texttt{smc}^T$ for the chosen outer-layer particle, and second, to recover the outer layer of SMC's auxiliary variables $v$.
As discussed by~\citet{chopin2013smc2}, particle MCMC rejuvenation moves can also be included; to justify using RAVI, we would insert these kernels as additional proposals within the sequence $K_{2:T}^2$. 
% Set pi_t to include a posterior sample plus 
% HME on the smc strategy.
% Then the SMC^2 strategy:
%   - calls IMPORTANCE on an SMC strategy with those pi_t's, to generate a theta and weighted trajectory. The K distributions are tractable (they extend the particle collection), and we can include in an algorithm box. the L distributiosn 
%   -

\subsection{Amortized Rejection Sampling}
\label{sec:amrej-example}

Consider a generative model $p(K, x_{1:K+1}, y)$ where the latent variables $x_{1:K+1}$ to be marginalized or inferred represent the trace of a rejection sampling loop, with sampling distribution $h(x)$ and predicate $\mathcal{A}(x)$ determining acceptance:
$$p(K, x_{1:K+1}, y) = \prod_{i=1}^{K} \left[h(x_i)(1-\mathcal{A}(x_i))\right] h(x_{K+1})\mathcal{A}(x_{K+1}) p(y \mid x_{K+1})$$

\begin{wrapfigure}{L}{0.7\textwidth}
    \vspace{-6mm}
    \begin{algorithm}[H]
        \label{infstrat:amrej}
        \SetAlgoLined\DontPrintSemicolon
        \footnotesize{
        \textbf{RAVI Inference Strategy:} Amortized Rejection Sampling\;
        \SetKwFunction{amrej}{amrej($h, q, \mathcal{A}, N, M$).q}\SetKwFunction{amrejm}{amrej($h, q, \mathcal{A}, N, M$).M($K, x_{1:K}$).q}
        \SetKwFunction{amrejmm}{amrej($h, q, \mathcal{A}, N, M$).M($K, x_{1:K}$).M($K', x'_{1:K'}, (K''_i, x''^i_{1:K''_i})_{i=1:M}, j$).q}
        \SetKwFunction{amrejmmm}{amrej($h, q, \mathcal{A}, N, M$).M($K, x_{1:K}$).M($K', x'_{1:K'}, (K''_i, x''^i_{1:K''_i})_{i=1:M}, j$).M($z_{K'+1}$).q}
        \SetKwProg{infalg}{Posterior Approx.}{}{}
        \SetKwInOut{Infers}{Target of inference}
        \SetKwInOut{Aux}{Auxiliary variables}
        \infalg{\amrej{}}{
        \Infers{number $K$ of rejected samples, rejected samples $x_{1:K}$, accepted sample $x_{K+1}$}
        \Aux{rejection loops $(K', x'_{1:K'})$ and $(K''_i, x''^i_{1:K''_i})_{i=1:M}$, index $j$}
        \nl $K' \gets 0$\;
        \nl $x'_{1} \sim q$\;
        \nl \While{$\mathcal{A}(x'_{K'+1}) \neq 1$}{
            \nl $K' \gets K' + 1$\;
            \nl $x'_{K'+1} \sim q$
        }
        \nl $x_{K+1} \gets x'_{K'+1}$\;
        \nl \For{$i \in 1, \dots, M$}{
            \nl $K''_i \gets 0$\;
            \nl $x''^i_{1} \sim h$\;
            \nl \While{$\mathcal{A}(x''^i_{K''_i+1}) \neq 1$}{
                \nl $K''_i \gets K''_i + 1$\;
                \nl $x''^i_{K''_i+1} \sim h$\;
            }
        }
        \nl $j \sim \text{Discrete}(K''_{1:M})$\;
        \nl $K \sim \text{Uniform}(0, K''_j)$\;
        \nl $x_{1:K} \gets x''^j_{1:K}$\;
        \nl \Return{$(K, x_{1:K}, x_{K+1})$}\;}{}
        \setcounter{AlgoLine}{0}
        \SetKwProg{metaalg}{Meta-Posterior Approx.}{}{}
        \metaalg{\amrejm{}}{
        \Infers{rejection loops $(K', x'_{1:K'})$ and $(K''_i, x''^i_{1:K''_i})_{i=1:M}$, index $j$}
        \Aux{superfluous accepted sample $z_{K'+1}$}
        \nl $j \sim \text{Uniform}(1, M)$\;
        \nl \For{$i \in 1, \dots, j-1, j+1, \dots, M$}{
            \nl $K''_i \gets 0$\;
            \nl $x''^i_{1} \sim h$\;
            \nl \While{$\mathcal{A}(x''^i_{K''_i+1}) \neq 1$}{
                \nl $K''_i \gets K''_i + 1$\;
                \nl $x''^i_{K''_i+1} \sim h$\;
            }
        }
        \nl $K''_j \gets K$\;
        \nl $x''^j_{K+1} \sim h$\;
        \nl \While{$\mathcal{A}(x''^j_{K''_j+1}) \neq 1$}{
            \nl $K''_j \gets K''_j + 1$\;
            \nl $x''^j_{K''_j+1} \sim h$\;
        }
        \nl $K' \gets 0$\;
        \nl $z_{1} \sim q$\;
        \nl \While{$\mathcal{A}(z_{K'+1}) \neq 1$}{
            \nl $x'_{K'+1} \gets z_{K'+1}$\;
            \nl $K' \gets K' + 1$\;
            \nl $z_{K'+1} \sim q$
        }
        \nl \Return{$(K', x'_{1:K'}, (K''_i, x''^i_{1:K''_i})_{i=1:M}, j)$}}
        \setcounter{AlgoLine}{0}
        \SetKwProg{metaalg}{Meta-Meta-Posterior Approx.}{}{}
        \metaalg{\amrejmm{}}{
        \Infers{superfluous accepted sample $z_{K'+1}$}
        \Aux{index $l$, unchosen particles $z_{-l}$}
        \nl \For{$i \in 1, \dots, N$}{
            \nl $z_i \sim q$\;
        }
        \nl $l \sim \text{Uniform}(\{i \mid \mathcal{A}(z_i)\})$\;
        \nl \Return{$z_l$}}
        \setcounter{AlgoLine}{0}
        \SetKwProg{metaalg}{Meta-Meta-Meta-Posterior Approx.}{}{}
        \metaalg{\amrejmmm{}}{
        \Infers{index $l$, unchosen particles $z_{-l}$}
        \Aux{None}
        \nl $l \sim \text{Uniform}(1, N)$\;
        \nl \For{$i \in 1, \dots, l-1,l+1,\dots, N$}{
            \nl $z_i \sim q$\;
        }
        \nl \Return{$(z_1, \dots, z_{l-1}, z_{l+1}, \dots, z_N)$}}
        }
        \vspace{-15mm}
    \end{algorithm} 
\end{wrapfigure}

Here, the $x_i$ are drawn independently from a distribution $h$, until some predicate $\mathcal{A}$ holds of the most recent particle, at which point the loop stops. The observation $y$ depends on the final sample $x_{K+1}$, but not the earlier, rejected samples $x_{1:K}$ or the number of rejected samples $K$.
\citet{naderiparizi2019amortized} proposed a technique called \textit{Amortized Rejection Sampling} for performing inference in this model. The technique corresponds to the rather involved RAVI strategy \texttt{amrej}, which has parameters $N$ and $M$ that can be used to trade accuracy for computational cost. 

The idea behind the top-level, intractable posterior approximation $\texttt{amrej}(h, q, \mathcal{A}, N, M).q$ is to:

\begin{itemize}
    \item use the observation $y$ to intelligently guess the \textit{accepted} particle $x_{K+1}$, using a learned proposal $q$. (For example, $q$ may be parameterized by a neural network that accepts $y$ as input.) To satisfy the constraint that $x_{K+1}$ satisfies $\mathcal{A}$, however, it is necessary to run $q$ within a rejection sampling loop, generating auxiliary variables $x'_{1:K'}$, where $K'$ is the number of rejected $q$-samples. (We could try directly using $x'_{1:K'}$ as our proposal for $x_{1:K}$, the rejected samples from the model. But $q$'s goal is to propose $x_{K+1}$ in a \textit{data-driven} way, influenced by the observation $y$, and the rejected samples $x_{1:K}$ from the model have no connection to the data\textemdash so, using samples from $q$ as proposals for the rejected model samples would result in a poor approximation.)

    \item use rejection sampling from the prior $h$ to infer the \textit{rejected} samples $x_{1:K}$. We run $M$ independent rejection sampling loops, randomly choose one with probability proportional to its length, and then randomly choose a \textit{prefix} of the chosen loop as our proposal for $x_{1:K}$. 
\end{itemize}

The meta-posterior approximation must solve two new challenges: recovering the rejected $q$ samples $x'_{1:K'}$ from the posterior approximation, and recovering the many unused rejection loops (and the suffix of the chosen rejection loop) from the second step of the posterior approximation (the $x''$ variables). The latter of these tasks is simple enough: we can generate $M-1$ rejection loops from scratch for the un-chosen loops, and a further rejection loop from scratch to use as the suffix of the chosen loop. The first task is more complex: we run a new rejection loop using $q$ as a proposal, and discard the final accepted sample. Meta-meta-inference must infer this discarded accepted sample, for which it uses SIR with $N$ particles. The final layer, the Meta-Meta-Meta-Posterior Approximation, uses conditional SIR.

The meta-meta-posterior is not absolutely continuous with respect to its approximation (it is possible that the approximation generates $N$ $z$-values that all fail to satisfy the predicate, in which case $z_l$ is not in the support of the meta-meta-posterior). As such, this is an example of a \textit{wide} inference strategy (Appendix~\ref{sec:even-odd}).

\subsection{Hamiltonian Variational Inference}
\label{sec:ham-example}


\begin{wrapfigure}{L}{0.5\textwidth}
    \vspace{-6mm}
    \begin{algorithm}[H]
        \label{infstrat:ham}
        \SetAlgoLined\DontPrintSemicolon
        \footnotesize{
        \textbf{RAVI Inference Strategy:} Hamiltonian Variational Inference\;
        \SetKwFunction{ham}{hamvi($q_0, q_v, r_v, \text{LF}$).q}\SetKwFunction{hamm}{hamvi($q_0, q_v, r_v, \text{LF}$).M($x$).q}
        \SetKwFunction{hammm}{hamvi($q_0, q_v, r_v, \text{LF}$).M($x$).M($x_0, v$).q}
        \SetKwProg{infalg}{Posterior Approx.}{}{}
        \SetKwInOut{Infers}{Target of inference}
        \SetKwInOut{Aux}{Auxiliary variables}
        \infalg{\ham{}}{
        \Infers{latent variable $x$}
        \Aux{initial position $x_0$, momentum $v$}
        \nl $x_0 \sim q_0$\;
        \nl $v \sim q_v$\;
        \nl $(x, v') \gets \text{LF}(x_0, v)$\;
        \nl \Return{$x$}\;}{}
        \setcounter{AlgoLine}{0}
        \SetKwProg{metaalg}{Meta-Posterior Approx.}{}{}
        \metaalg{\hamm{}}{
        \Infers{initial position $x_0$, momentum $v$}
        \Aux{negated final momentum $v'_{-}$}
        \nl $v'_{-} \sim r_v(\cdot; x)$\;
        \nl $(x_0, v_{-}) \gets \text{LF}(x, v'_{-})$\;
        \nl \Return{$(x_0, -v_{-})$}}
        \setcounter{AlgoLine}{0}
        \SetKwProg{metaalg}{Meta-Meta-Posterior Approx.}{}{}
        \metaalg{\hammm{}}{
        \Infers{negated final momentum $v'_{-}$}
        \Aux{None}
        \nl $(\_, v') \gets \text{LF}(x_0, v)$\;
        \nl \Return{$-v'$}}
        }
        \vspace{-12mm}
    \end{algorithm} 
\end{wrapfigure}

Hamiltonian Variational Inference~\citep{salimans2015markov} is a hybrid of Hamiltonian Monte Carlo and variational inference. It is a special case of Markov Chain Variational Inference (see Section~\ref{sec:overview} and Section~\ref{sec:examples} for detailed discussion, and~\hyperref[infstrat:mcvi]{\texttt{mcvi}} for the RAVI implementation). The algorithm specializes the Markov Chain Variational Inference procedure for use with a Hamiltonian Monte Carlo kernel. 

We present the specialized strategy as~\hyperref[infstrat:ham]{\texttt{hamvi}}. It accepts as input:

\begin{enumerate}
    \item a distribution $q_0$ from which to propose an initial point;
    \item a momentum distribution $q_v$ from which momenta $v$ are proposed at each iteration;
    \item a proposal distribution $r_v(\cdot; x)$ over momenta; and
    \item a leapfrog integrator $\texttt{LF}$ that runs Hamiltonian dynamics on an initial position and momentum (we think of both the number of leapfrog steps $L$ and the Hamiltonian $H$ being targeted as part of the \texttt{LF} object provided to \texttt{hamvi}).
\end{enumerate}

Given these inputs, the top-level posterior approximation runs an iteration of HMC from a randomly initialized location $x_0$. The meta-posterior approximation randomly proposes a (negated) \textit{final} momentum from the proposal $r_v$, and runs the leapfrog integrator to find a plausible initial location $x_0$. Finally, the (deterministic) meta-meta-posterior finds the initial momentum that could have taken $x_0$ to $x$.

\subsection{Antithetic Sampling}
\label{sec:antithetic-example}

Consider a target $\tilde{\pi}(x)$ and a proposal $q(x)$ that approximates $\pi$. Suppose $q$ is invariant under some bijective transformation $T$: \[\forall x, q(x) = q(T(x)).\] For example, a univariate Gaussian proposal with mean $\mu$ is invariant under $T(x) = 2\mu - x$. Antithetic sampling generates a sample $x$ from $q$, but instead of using the estimator $\hat{Z} = \tilde{\pi}(x)/q(x)$, it uses \[\hat{Z} = \frac{\tilde{\pi}(x) + \tilde{\pi}(T(x))}{2q(x)}.\]


\begin{wrapfigure}{L}{0.6\textwidth}
    \vspace{-6mm}
    \begin{algorithm}[H]
        \label{infstrat:antithetic}
        \SetAlgoLined\DontPrintSemicolon
        \footnotesize{
        \textbf{RAVI Inference Strategy:} Antithetic Sampling\;
        \SetKwFunction{antithetic}{antithetic($\tilde{\pi}, q, T$).q}\SetKwFunction{antitheticm}{antithetic($\tilde{\pi}, q, T$).M($x$).q}
        \SetKwProg{infalg}{Posterior Approx.}{}{}
        \SetKwInOut{Infers}{Target of inference}
        \SetKwInOut{Aux}{Auxiliary variables}
        \infalg{\antithetic{}}{
        \Infers{latent variable $x$}
        \Aux{sampled $x_0$, choice $b$}
        \nl $x_0 \sim q$\;
        \nl $w_0 \gets \tilde{\pi}(x_0) / q(x_0)$\;
        \nl $w_1 \gets \tilde{\pi}(T(x_0)) / q(x_0)$\;
        \nl $b \sim \text{Bernoulli}(\frac{w_1}{w_0 + w_1})$\;
        \nl \Return{$bT(x_0) + (1-b)x_0$}\;}{}
        \setcounter{AlgoLine}{0}
        \SetKwProg{metaalg}{Meta-Posterior Approx.}{}{}
        \metaalg{\antitheticm{}}{
        \Infers{sampled $x_0$, choice $b$}
        \Aux{None}
        \nl $b \sim \text{Bernoulli}(0.5)$\;
        \nl $x_0 \gets bT(x) + (1-b)x$\;
        \nl \Return{$(x_0, b)$}}
        }
        \vspace{-3mm}
    \end{algorithm} 
\end{wrapfigure}

This can be justified as Algorithm 1 (\texttt{IMPORTANCE}) applied to the strategy~\hyperref[infstrat:antithetic]{\texttt{antithetic}}. The posterior approximation generates an initial sample $x_0 \sim q$, evaluates both $x_0$ and $T(x_0)$ as possible proposals, and selects one. The meta-posterior approximation must recover whether $x$ or its transformed version was the sampled one; it does so by flipping a fair coin, which is optimal when $T = T^{-1}$, i.e., when $T$ is an involution. In the general case a lower-variance estimator could be derived by setting $\mathcal{M}(x).q$ to the exact posterior of the proposal process. Antithetic sampling can also be generalized to the case where a finite family of bijective transformations $T_i$ are available.

Note that although the final expression for $\hat{Z}$ falls out of this inference strategy only when $q(x) = q(T(x))$ for all $x$, nothing in the inference strategy itself exploits this assumption, and the same inference strategy could be applied to $T$ without this property, to derive other estimators that\textemdash intuitively\textemdash simultaneously consider a proposal $x$ and a deterministic function of it $T(x)$ as possible locations.


\section{Absolute continuity}
\label{sec:even-odd}

When we defined inference strategies $\mathcal{S}$ targeting $\pi$, we required that $\mathcal{S}.q$ and $\pi$ be \textit{mutually} absolutely continuous, a stronger requirement than in importance sampling. We now consider relaxing this requirement, by requiring only \textit{one-sided} absolute continuity. We define two \textit{kinds} of inference strategy, depending on which direction of absolute continuity holds:

\begin{enumerate}
    \item An inference strategy $\mathcal{S}$ targeting $\pi$ is \textit{wide} if $\pi$ is absolutely continuous with respect to $\mathcal{S}.q$, and either $\mathcal{S}.q$ has a tractable marginal density or $\mathcal{S}.\mathcal{M}(x)$ is a narrow inference strategy targeting $\mathcal{S}.q(\cdot \mid x)$ for all $x$.
    
    \item An inference strategy $\mathcal{S}$ targeting $\pi$ is \textit{narrow} if $\mathcal{S}.q$ is absolutely continuous with respect to $\pi$, and either $\mathcal{S}.q$ has a tractable marginal density or $\mathcal{S}.\mathcal{M}(x)$ is a wide inference strategy targeting $\mathcal{S}.q(\cdot \mid x)$ for all $x$.
\end{enumerate}

Then an inference strategy as defined in the main paper is one that is both wide and narrow. 

Narrow inference strategies can serve as variational families within variational inference algorithms. Wide inference strategies can be used as importance sampling and SMC proposals, as well as variational families for \textit{amortized} variational inference.  Inference strategies used as MCMC proposals must be both wide and narrow.

\section{Other applications of RAVI inference strategies}
\label{sec:other-applications}

\subsection{Rejection sampling with RAVI}
As in any properly weighted sampler, if the weights produced by Alg. 1 can be bounded above by a constant $M$, a RAVI inference strategy can be used for exact inference via rejection sampling: a sample $(x, \hat{Z})$ is drawn using Alg. 1, and then accepted with probability $\frac{\hat{Z}}{M}$. The weight $\hat{Z}$ for an inference strategy can be viewed as a product of the normalizing constant $Z$ with normalized importance weights $w_\mathcal{S} = \frac{\pi(x)}{\mathcal{S}.q(x)}$, $w_{\mathcal{S}.\mathcal{M}(x)} = \frac{\mathcal{S}.q(r \mid x)}{\mathcal{S}.\mathcal{M}(x).q(r)}$, and so on. As such, if upper bounds $M_Z$ and $M_\mathcal{S}$, $M_{\mathcal{S}.\mathcal{M}(x)}$, etc. can be found for these quantities, the product of these bounds is a bound on $\hat{Z}$. Thus, as in properly weighted sampling and in variational inference with RAVI, it is possible to reason about the RAVI inference strategy compositionally, in terms of bounds at each layer of nesting.

\subsection{Estimating KL divergences between models with RAVI inference strategies equipped}

Suppose $p(y) = \int p(x, y) \text{d}x$ and $q(y) = \int q(z, y) \text{d}z$ are mutually absolutely continuous distributions over some space $\mathcal{Y}$. Suppose also that we have two families of inference strategies, $\mathcal{S}_p(y)$ and $\mathcal{S}_q(y)$, targeting $p(x\mid y)$ and $q(z \mid y)$ respectively. Then the AIDE algorithm~\citep{cusumano2017aide} can be adapted to give a stochastic upper bound on the symmetric KL divergence between $p(y)$ and $q(y)$.

First, we generate $(x, y_p) \sim p$, $(z, y_q) \sim q$, and run $\texttt{HME}$ on each pair to obtain weights $w^p_p$ and $w^q_q$ respectively. Then, we run $\texttt{IMPORTANCE}$ on $p$ with data $y_q$, and on $q$ with data $y_p$, to obtain weights $w^p_q$ and $w^q_p$ respectively. Finally, we sum the logs of the four weights, to give an estimate $\hat{D}$ whose expectation is:

\[
\mathbb{E}[\hat{D}] =
\mathbb{E}_{y \sim p}[\UU(p, y, \mathcal{S}_p(y)) - \LL(q, y, \mathcal{S}_q(y))] + \mathbb{E}_{y \sim q}[\UU(q, y, \mathcal{S}_q(y)) - \LL(p, y, \mathcal{S}_p(y))] \geq KL(p || q) + KL(q || p).
\]

As the marginal likelihood bounds $\UU$ and $\LL$ become tighter, this expectation approaches the true symmetric KL between $p$ and $q$, i.e., $D = KL(p|| q) + KL(q || p)$. Theorem 4 allows us to characterize the tightness of these bounds, and thus of the stochastic upper bound $\hat{D}$ on the symmetric KL, in terms of KL divergences between successive layers of each inference strategy. Improving inference at any layer of the inference strategy tightens the bound $\hat{D}$, yielding less biased estimates of $D$.

\section{Reparameterization Trick Gradient Estimators}
\label{sec:reparam}
In this section, we present versions of Algorithms 3 and 4 that 
utilize reparameterization gradients, rather than score function 
gradients. Using these algorithms requires that an inference 
strategy be \textit{reparameterizable}.

\textbf{Definition:} A \textit{reparameterizable inference strategy $\mathcal{S}$ with arguments $\theta$} specifies:

\begin{itemize}
\item A reparameterizable posterior approximation $\mathcal{S}.q$, which is one of:
\begin{itemize} 
    \item a tractable proposal: a tuple $(\mathcal{S}.q(x; \theta), \mathcal{S}.q.g(\epsilon), \mathcal{S}.q.f(\epsilon, \theta))$, such that $q$ is the pushforward of $g$ by $f$; or
    \item an intractable proposal: a tuple $(\mathcal{S}.q(r, x; \theta), \mathcal{S}.q.g(\epsilon_r, \epsilon_x), \mathcal{S}.q.f_r(\epsilon_r, \theta), \mathcal{S}.q.f_x(\epsilon_x, \theta))$, such that $q$ is the pushforward of $g$ by $\lambda (\epsilon_r, \epsilon_x). (f_r(\epsilon_r, \theta), f_x(\epsilon_x, \theta))$.
\end{itemize}
\item If the latter, a reparameterizable meta-inference strategy $\mathcal{S}.\mathcal{M}$ that, given arguments $(x, \theta)$, targets $\mathcal{S}.q(r \mid x; \theta)$.
\end{itemize}

Now, reparameterized estimators can be derived by applying standard automatic differentiation to the following algorithm, which only samples from distributions that do not depend on parameters:

\begin{minipage}[t]{0.52\textwidth}
    {
    \removelatexerror
    \vspace{-10pt}
    \begin{algorithm}[H]
        \label{alg:alg6}
        %\label{infstrat:alg1}
        \SetAlgoLined\DontPrintSemicolon
        \textbf{Algorithm 6:} RAVI ELBO estimator ($\texttt{ELBO}$)\;
        \KwIn{unnormalized model $\tilde{p}(x)$}
        \KwIn{inference strategy $\mathcal{S}$ with arguments}
        \KwIn{arguments $\theta$}
        \KwOut{unbiased estimates of $\mathcal{L}$ (differentiable w.r.t. $\theta$)}
        \nl \uIf{$\mathcal{S}.q$ has a tractable marginal density}{
            \nl $\epsilon_x \sim \mathcal{S}.q.g$\;
            \nl $x \gets \mathcal{S}.q.f(\epsilon_x, \theta)$\;
            \nl $\hat{U} \gets \log \mathcal{S}.q(x; \theta)$\;
%            \nl $\widehat{\grad} \gets \nabla_\theta \log \mathcal{S}.q(\mathcal{S}.f(\epsilon_x, \theta); \theta)$\;
            % \nl $x \sim \mathcal{S}.q$\;
            % \nl$ (\hat{U}, \widehat{\grad}) \gets (\log \mathcal{S}.q(x), \grad \log \mathcal{S}.q(x) \cdot (1 + \log \mathcal{S}.q(x)))$\; 
            % \nl $\mathbf{g} \gets \grad \log \mathcal{S}.q(x)$\;
        }
        \nl \ElseIf{$\mathcal{S}.q(x; \theta) = \int \mathcal{S}.q(r, x; \theta)\text{d}r$}{
            \nl $(\epsilon_r, \epsilon_x) \sim \mathcal{S}.q.g$\;
            \nl $(x, r) \gets (\mathcal{S}.q.f_x(\epsilon_x, \theta), \mathcal{S}.q.f_r(\epsilon_r, \theta))$\;
            \nl $\hat{U} \gets \texttt{EUBO}(\mathcal{S}.q(\cdot, x; \theta), r, \mathcal{S}.\mathcal{M}, (x, \theta))$\;
    %        \nl $(r, w) \gets \text{Alg1}(\mathcal{S}.q(\cdot, x), \mathcal{S}.\mathcal{M}(x))$\;
        }
        \nl \Return{$\log \tilde{p}(x) - \hat{U}$}\;
    \end{algorithm}
    % \noindent\textbf{Algorithm 3.} Let $\mathcal{S}$ be a strategy targeting $p(x \mid y)$. Then the following procedure yields unbiased estimates of the ELBO
    % $\LL(y)$ and $\grad \LL(y)$:
    % \begin{enumerate}[leftmargin=*]
    %     \item If $\mathcal{S}.q$ has a tractable density, sample $x \sim \mathcal{S}.q$ and set $(\hat{U}, \widehat{\grad}, \mathbf{g}) \gets (\log \mathcal{S}.q(x), \grad \log \mathcal{S}.q(x) \cdot (1 + \log \mathcal{S}.q(x)), \grad \log \mathcal{S}.q(x))$.
    
    %     \item Else, if $\mathcal{S}.q(x) = \int \mathcal{S}.q(r, x) \text{d}r$, generate $(r, x) \sim \mathcal{S}.q$, and run {Algorithm 4} on the inference strategy $\mathcal{S}.\mathcal{M}(x)$ targeting $\mathcal{S}.q(r \mid x)$, and the sample $r$, to obtain $(\hat{U}, \widehat{\grad}, \mathbf{g})$.
        
    %     \item Set $\hat{L} \gets \log p(x, y) - \hat{U}$.
        
    %     \item Set $\widehat{\grad}'\gets \grad \log p(x, y)
    %         + \mathbf{g} 
    %         \log p(x, y)
    %         - \widehat{\grad}.$
        
    %     \item Return $(\hat{L}, \widehat{\grad}')$.
    % \end{enumerate}
    }
    \end{minipage}\hfill%
    \begin{minipage}[t]{0.48\textwidth}
    {
        \removelatexerror
        \vspace{-10pt}
        \begin{algorithm}[H]
            \label{alg:alg7}
            \SetAlgoLined\DontPrintSemicolon
            \textbf{Algorithm 7:} RAVI EUBO estimator ($\texttt{EUBO}$)\;
            \KwIn{unnormalized model $\tilde{p}(x)$}
            \KwIn{exact sample $x \sim p(x)$}
            \KwIn{inference strategy $\mathcal{S}$ with arguments}
            \KwIn{arguments $\theta$}
            \KwOut{unbiased estimate of $\mathcal{U}$ (differentiable w.r.t. $\theta$)}
            \nl \uIf{$\mathcal{S}.q$ has a tractable marginal density}{
                \nl$\hat{L} \gets \log \mathcal{S}.q(x; \theta)$\;
            }
            \nl \ElseIf{$\mathcal{S}.q(x; \theta) = \int \mathcal{S}.q(r, x; \theta)\text{d}r$}{
                \nl $\hat{L} \gets \texttt{ELBO}(\mathcal{S}.q(\cdot, x; \theta), \mathcal{S}.\mathcal{M}, (x, \theta))$\;
        %        \nl $(r, w) \gets \text{Alg1}(\mathcal{S}.q(\cdot, x), \mathcal{S}.\mathcal{M}(x))$\;
            }
            \nl \Return{$\log \tilde{p}(x) - \hat{L}$}\;
        \end{algorithm}
    % \noindent\textbf{Algorithm 4.} Let $\mathcal{S}$ be a strategy targeting $p(x \mid y)$ and $x$ an exact posterior sample. The following procedure yields unbiased estimates of $\UU(y)$ and $\grad \UU(y)$, and the quantity $\mathbf{g}$ (see Thm. 2):
    % %the gradient $\grad \log p_0(x_0, x_1)$.
    
    % \begin{enumerate}[leftmargin=*]
    %     %\item Generate $(x, y) \sim p$.
        
    %     \item If $\mathcal{S}.q$ has a tractable marginal density, then set $(\hat{L}, \widehat{\grad}) \gets (\log \mathcal{S}.q(x), \grad \log \mathcal{S}.q(x))$.
    %     \item Else, if $\mathcal{S}.q(x) = \int \mathcal{S}.q(r, x)\text{d}r$, run Algorithm 3 on the strategy $\mathcal{S}.\mathcal{M}(x)$, targeting $\mathcal{S}.q(r \mid x)$, to obtain $(\hat{L}, \widehat{\grad})$.
    
    %     \item Set $\hat{U} \gets \log p(x, y) - \hat{L}$.
        
    %     \item Set $\mathbf{g} \gets \grad \log p(x, y)$.
    
    %     \item Set $\widehat{\grad}' \gets
    %         \grad \log p(x, y)
    %         + \mathbf{g} \cdot \hat{U}
    %         - \widehat{\grad}$.
            
    %     \item Return $\left(\hat{U}, \widehat{\grad}', \mathbf{g} \right)$.
    % \end{enumerate}
    }
    \end{minipage}

    Note that in fact only every \textit{other} posterior approximation in the unrolled strategy requires a reparameterized version: Algorithm 7 never samples from its $\mathcal{S}.q$, only evaluates the densities.
 
    It would be interesting to develop variants of these algorithms that allow users to combine score-function and reparameterization estimation at different layers of nesting, or exploit other variance reduction tactics compositionally.

%     \section{Mass-capturing and mode-seeking VI}
 
% \begin{figure*}
%     %\centering
% %    \includegraphics[width=0.9\linewidth]{figs/tutorial.pdf}
%     \begin{subfigure}[b]{0.22\linewidth}
%         \includegraphics[width=\linewidth]{figs/target.pdf}
%         % \begin{align*}
%         %     p(z, x) = \mathcal{N}&(z; \mathbf{0}, 5\mathbf{I})\cdot\\ 
%         %     &\mathcal{N}(x; ||z||_2^2, \frac{3}{4})
%         % \end{align*}
%         \caption{Target distribution, the posterior $p_0(x_1 \mid x_0)$ of the latent-variable generative model $p(x_1, x_0) = \mathcal{N}(x_1; \mathbf{0}, 5\mathbf{I}) \cdot \mathcal{N}(x_0; ||x_1||_2^2, \frac{3}{4})$. Variational inference can be applied to approximate the posterior, but requires the choice of a variational family. }
%         \label{fig:target}
%     \end{subfigure}\hfill
%     \begin{subfigure}[b]{0.22\linewidth}
%     \includegraphics[width=\linewidth]{figs/simple_gaussian.pdf}
%     % \begin{align*}
%     %         q(z; x) = \mathcal{N}(z; \mu, \Sigma)\\
%     %         \,
%     % \end{align*}
%     \caption{A Gaussian variational family. The Gaussian's density is tractable, so  standard (top) and amortized (bottom) variational inference can be applied. However, the Gaussian approximation is not flexible enough to accurately characterize the posterior distribution.}
%     \label{fig:inadequate}
%     \end{subfigure}\hfill
%     \begin{subfigure}[b]{0.22\linewidth}
%     \includegraphics[width=\linewidth]{figs/angle.pdf}
%     % \begin{align*}
%     %         q(z; x) = \mathcal{N}(z; \mu, \Sigma)\\
%     %         \,
%     % \end{align*}
%     \caption{A RAVI inference strategy $\mathcal{S}$. $\mathcal{S}.q$ (left) approximates the posterior better by sampling a latent angle (red) and then a point (green), but has an intractable marginal density. RAVI's Algs. 3 (top) and 4 (bottom) are necessary to compute standard and amortized VI gradients.}
%     \label{fig:angle}
%     \end{subfigure}\hfill
%     \begin{subfigure}[b]{0.22\linewidth}
%     \includegraphics[width=\linewidth]{figs/SIR.pdf}
%     % \begin{align*}
%     %         q(z; x) = \mathcal{N}(z; \mu, \Sigma)\\
%     %         \,
%     % \end{align*}
%     \caption{Existing algorithms, such as IWAE~\citep{burda2015importance}, can often be seen as RAVI inference strategies. This $\mathcal{S}.q$ generates a vector of $K$ particles (red) and chooses one (green) to propose, and has an intractable density. Algs. 3 (top) and 4 (bottom) can be used for optimization.}
%     \label{fig:multiple}
%     \end{subfigure}
%     \caption{\textbf{Example of RAVI applied to a simple 2D posterior.}
%     }
%    \label{fig:tutorial}
% \end{figure*}
\bibliography{lew_657}
\end{document}