\documentclass[accepted,x11names]{uai2022} % for initial submission
% \documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for  an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage[numbers]{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{array}
\usepackage{ragged2e}
\usepackage{xr-hyper} 
\externaldocument{lew_657-supp}

% Column type P{<width>}: a p{<width>} paragraph column whose cells are set
% with \RaggedRight (ragged2e) instead of being justified. The \hspace{0pt}
% at the start of each cell allows TeX to hyphenate the first word of the cell.
\newcolumntype{P}[1]{>{\RaggedRight\hspace{0pt}}p{#1}}

% For theorems and such 
\usepackage{lipsum}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{wrapfig}
\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors
\usepackage{graphicx}
\usepackage{placeins}
\usepackage{enumitem}
\usepackage{wrapfig}
\usepackage{subcaption}
\usepackage[noend]{algorithm2e}


%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

% \figref{<label>}: "Fig." followed by the referenced figure number.
% The tie (~) keeps the number on the same line as "Fig." and prevents the
% period from being treated as the end of a sentence (wider space).
\newcommand{\figref}[1]{Fig.~\ref{#1}}
% \secref{<label>}: section sign followed directly by the section number
% (TeX ignores the space after the control word \S, so no gap is produced).
\newcommand{\secref}[1]{\S \ref{#1}}
% Argument-less math shorthands.
\newcommand*{\E}{\mathbb{E}}          % blackboard-bold E (expectation symbol)
\newcommand*{\LL}{\mathcal{L}}        % calligraphic L (symbol the text uses for the ELBO)
\newcommand*{\UU}{\mathcal{U}}        % calligraphic U (symbol the text uses for the upper bound)
\newcommand*{\grad}{\nabla_{\theta}}  % nabla with subscript theta

% \removelatexerror: when invoked, makes the internal \@latex@error an alias
% of \@gobble, so the first argument passed to it (the error message) is
% discarded instead of being reported. \let is a local assignment, so the
% effect is confined to the group in which \removelatexerror is called.
% \makeatletter/\makeatother are required because the names contain `@'.
\makeatletter
\newcommand{\removelatexerror}{\let\@latex@error\@gobble}
\makeatother

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% THEOREMS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% `theorem' is numbered within sections; every other environment below is
% declared with [theorem] and therefore shares that single counter.
% Each \theoremstyle applies to the \newtheorem declarations that follow it.
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}

% Hyperlink appearance: links are shown as colored text (colorlinks=true).
% SpringGreen3 is an x11names color; the x11names option is given as a class
% option on the \documentclass line.
\hypersetup{
    colorlinks=true,
    urlcolor=black,
    linkcolor=cyan,
    citecolor=SpringGreen3,
    linkbordercolor=white,
}


\title{Recursive Monte Carlo and Variational Inference with Auxiliary Variables}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<alexlew@mit.edu>?Subject=Your UAI 2022 paper}{Alexander~K.~Lew}{}}
\author[1]{\href{mailto:<marcoct@mit.edu>?Subject=Your UAI 2022 paper}{Marco~Cusumano-Towner}{}}
\author[1]{\href{mailto:<vkm@mit.edu>?Subject=Your UAI 2022 paper}{Vikash~K.~Mansinghka}{}}
% \author[3]{Further~Coauthor}
% \author[1]{Further~Coauthor}
% \author[3]{Further~Coauthor}
% \author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
    Massachusetts Institute of Technology\\
    Cambridge, Massachusetts, USA
}
% \affil[2]{%
%     Second Affiliation\\
%     Address\\
%     …
% }
% \affil[3]{%
%     Another Affiliation\\
%     Address\\
%     …
%   }
  
\begin{document}
\maketitle

\begin{abstract}
A key design constraint when implementing Monte Carlo 
and variational inference algorithms is that it must 
be possible to cheaply and exactly evaluate the marginal 
densities of proposal distributions and variational families.
This takes many interesting 
proposals off the table, such as those
based on involved simulations or stochastic optimization.
This paper broadens the design space, by presenting 
a framework for applying Monte Carlo and variational 
inference algorithms when proposal densities cannot 
be exactly evaluated. Our framework, \textit{recursive
auxiliary-variable inference} (RAVI), instead approximates 
the necessary densities using \textit{meta-inference}: 
an additional layer of Monte Carlo or variational inference,
that targets the proposal, rather than the model.
%
%A key challenge in applying Monte Carlo and variational inference (VI) is the design of \textit{proposals} and \textit{variational families} that are flexible enough to closely approximate the posterior, but simple enough to admit tractable densities and variational bounds.
%This paper presents \textit{recursive auxiliary-variable inference} (RAVI), a new framework for exploiting flexible proposals, for example based on involved simulations or stochastic optimization, within Monte Carlo and VI algorithms. 
%The key idea is to estimate intractable proposal densities via \textit{meta-inference}: additional Monte Carlo or variational inference targeting the proposal, rather than the model. 
RAVI generalizes and unifies several existing methods for 
inference with expressive approximating families, 
which we show correspond to specific choices 
of meta-inference algorithm, and provides new theory for analyzing their bias and variance.
% we prove that a Monte Carlo algorithm's variance, and a variational bound's tightness, decompose as sums of terms quantifying the quality of the intractable proposal and of each recursive layer of meta-inference.
We illustrate RAVI's design framework and theorems by using them to analyze and improve upon \citet{salimans2015markov}'s Markov Chain Variational Inference, and to design a novel sampler for Dirichlet process mixtures, achieving state-of-the-art results on a standard benchmark dataset from astronomy and on a challenging data-cleaning task with Medicare hospital data.
  
%   to develop three new algorithms: (1) an improvement to~\citet{salimans2015markov}'s Markov Chain Variational Inference (MCVI) that scales to handle variational families incorporating longer MCMC chains; 
%   (2) a novel sampler for Dirichlet process mixtures that uses a
%   randomized agglomerative clustering algorithm as a proposal,
%   and (3) a new method for ensembling multiple samplers 
%   or variational families.
\end{abstract}



% Comment style for algorithm2e listings: comments are typeset in \scriptsize
% typewriter type, colored dark gray. \SetCommentSty takes the macro name
% without its leading backslash.
\newcommand\mycommfont[1]{\scriptsize\ttfamily\textcolor{darkgray}{#1}}
\SetCommentSty{mycommfont}






\begin{table*}[t]
    \begin{tabular}{P{0.28\linewidth}P{0.32\linewidth}P{0.35\linewidth}}
    \textbf{Monte Carlo or variational inference algorithm} & \textbf{Distributions that no longer need fast exact density evaluators} & \textbf{Example applications} \\
    Importance Sampling~\citep{glynn1989importance} (Alg.~\hyperref[alg:alg1]{1}, Appendix~\ref{sec:sir-example})  & proposal $q(x; y)$                                                                & Nested IS~\citep{naesseth2015nested} (Appendix~\ref{sec:nsmc-example}), Agglomerative Monte Carlo (Section~\ref{sec:examples}, RAVI strategy~\hyperref[infstrat:agglom]{2}), Annealed IS~\citep{neal2001annealed} (Appendix~\ref{sec:ais-example})   \\
    Particle Filtering~\citep{djuric2003particle} (Appendix~\ref{sec:smc-example}) & initial proposal $q_0(x_0; y_0)$, step proposals $q_t(x_t \mid x_{t-1}, y_{t})$        &    Nested SMC~\citep{naesseth2015nested} (Appendix~\ref{sec:nsmc-example}), SMC$^2$~\citep{chopin2013smc2} (Appendix~\ref{sec:smcsq-example})   \\
    Del-Moral SMC~\citep{del2006sequential} (Appendix~\ref{sec:smc-example})      & initial proposal $q_0(x_0)$, forward kernels $K_t(x_t \mid x_{t-1})$, reverse kernels $L_t(x_{t-1} \mid x_t)$, targets $\tilde\pi_t(x)$                                                                                &                                           \\
    Black-Box Variational Inference~\citep{ranganath2014black} (Alg.~\hyperref[alg:alg3]{3}) & variational family $q_\theta(x; y)$ & IWAE~\citep{burda2015importance} (Appendix~\ref{sec:iwae-example}), MCVI~\citep{salimans2015markov} (Section~\ref{sec:overview}, Appendix~\ref{sec:ham-example}), Variational SMC~\citep{naesseth2018variational} (Appendix~\ref{sec:vsmc-example})\\
    Amortized Variational Inference~\citep{le2017inference} (Alg.~\hyperref[alg:alg4]{4}) & variational family $q_\theta(x; y)$ & Amortized Rejection Sampling~\citep{naderiparizi2019amortized} (Appendix~\ref{sec:amrej-example})\\
    Metropolis-Hastings  (Alg.~\hyperref[alg:alg5]{5})  &   transition proposal $q(x'; x)$                                                                                 &             pseudo-marginal ratio MH~\citep{andrieu2018utility}                              \\
    Hierarchical Variational Inference~\citep{ranganath2016hierarchical} & variational family $q_\theta(z, x; y)$, reverse proposal $r_\theta(z; x, y)$ & Importance-Weighted HVI~\citep{sobolev2019importance}, RAVI-MCVI (Sections~\ref{sec:overview} and \ref{sec:examples}, RAVI strategy~\hyperref[infstrat:mcvi]{1})
    \end{tabular}
    \caption{RAVI generalizes many algorithms for Monte Carlo and variational inference, by allowing 
    practitioners to choose proposals, variational families, and intermediate targets for which 
    exact density evaluators are not available. In the ``example applications'' column, we list 
    both novel examples of algorithms that exploit this degree of freedom (e.g., the Agglomerative Monte Carlo algorithm we develop in Section~\ref{sec:examples}), and algorithms from the literature that --- as we show in Appendix~\ref{sec:appendix-examples} --- can be viewed as instances of simpler algorithms, but with certain sophisticated proposals whose density RAVI estimates.}
\end{table*}

\section{INTRODUCTION}
\label{sec:intro}
Monte Carlo and variational inference algorithms 
are the workhorses of modern probabilistic inference,
a fundamental problem with applications in many disciplines~\citep{murphy2012machine}.
A key challenge in applying these algorithms is the design of \textit{proposal distributions} (in VI, variational families), which can greatly affect their performance~\citep{chatterjee2018sample}. A good proposal should incorporate any knowledge the practitioner might have about the shape of the posterior; however, this goal is often in tension with the requirement that a proposal's marginal density be analytically tractable, in order to compute importance weights, MCMC acceptance probabilities, or gradient updates for VI. The challenge is that proposal distributions that are simple enough to admit exact density evaluators may not be flexible enough to solve real-world posterior inference problems.
%Inference in latent-variable generative models is a fundamental problem with applications in many disciplines~\citep{murphy2012machine}.
%Given a latent-variable model $p_\theta(x, y)$ and observed data $y$, the goal of inference may be to approximate the posterior $p_\theta(x \mid y)$, estimate the marginal likelihood $p_\theta(y) = \int p_\theta(x, y) \text{d}x$, or fit the model parameters $\theta$ to data.% by finding $\theta^* = \text{argmax}_\theta p_\theta(y)$.
% \begin{enumerate}
%     \item  approximate the posterior distribution $p_\theta(x \mid y)$,
%     \item estimate the data's marginal likelihood under $p_\theta$, $$p_\theta(y) = \int p_\theta(x, y) dx,\text{ or}$$
%     \item fit the model to data by finding $\theta^* = \text{argmax}_{\theta}\, p_\theta(y).$
% \end{enumerate}
%


% \begin{figure*}
%     \centering
%     \includegraphics[width=\linewidth]{figs/nested_long.pdf}
%     \caption{\textbf{Left:} A RAVI inference strategy $\mathcal{S}$ for inference in the latent-variable model $p_0(x_0) = \int p_0(x_1, x_0) \text{d}x_1$. RAVI enables the use of posterior approximations with intractable marginal densities (such as $p_1(x_1; x_0) = \int p_1(x_2, x_1; x_0) \text{d}x_1$) within Monte Carlo and variational inference algorithms. It does this by casting the estimation of $p_1$'s marginal density as a nested posterior inference problem, targeting the \textit{meta-posterior} $p_1(x_2 \mid x_1; x_0)$, to which RAVI can be recursively applied. \textbf{Top right:} an inference strategy can be flattened into an auxiliary-variable model $p^*$ and an auxiliary-variable proposal $q^*$, then used as a proposal in IS, a transition kernel in SMC and MCMC, or a variational family in VI. \textbf{Bottom right:} Theorems characterize approximate inference quality (the variance of the importance weights $\hat{Z}$, or the tightness of the ELBO $\mathcal{L}$) in terms of the quality of each posterior or meta-posterior approximation in the strategy. These theorems can be used to guide the design of new algorithms, or analyze and improve existing algorithms, many of which can be seen as instances of RAVI.}
%     \label{fig:nested}
% \end{figure*}
%

%Monte Carlo and variational inference (VI) algorithms are the workhorses of modern probabilistic inference~\citep{murphy2012machine}. 


In this paper, we present a new framework, called \textit{Recursive Auxiliary-Variable Inference} (RAVI), for incorporating more complex proposals, without exact marginal density evaluators, into standard Monte Carlo and VI algorithms.
The key idea is to approximate the 
proposal densities using \textit{meta-inference}~\citep{cusumano2017aide}: an additional layer of Monte Carlo or variational inference targeting the proposal, rather than the model. RAVI generalizes and unifies several existing methods for inference with expressive proposals~\citep{salimans2015markov,ranganath2016hierarchical,sobolev2019importance}, which we show correspond to specific choices of meta-inference algorithm (see Appendix~\ref{sec:appendix-examples} for 10 examples).

\textbf{Contributions.} Our key contributions are:
\begin{itemize}
\item the RAVI framework, including new recursive algorithms for IS, VI, SMC, and MH using proposals without exact marginal density evaluators (Sections~\ref{sec:overview} \&~\ref{sec:inference-algs});
\item theorems characterizing the impact of RAVI's estimated densities on inference quality (sampler variance, or tightness of variational bounds) (Section~\ref{sec:theory}); and
\item two extended examples of RAVI's application to algorithm design and analysis: (1) a novel variant of~\citet{salimans2015markov}'s Markov Chain Variational Inference (MCVI) algorithm that, unlike vanilla MCVI, scales to handle proposals incorporating long MCMC chains; and (2) a novel sampler for Dirichlet process mixtures that uses a randomized agglomerative clustering algorithm as a proposal, outperforming strong baselines on a standard benchmark from astronomy~\citep{drinkwater2004large} and a challenging Medicare data cleaning problem~\citep{MedicareHosp,lew2021pclean}.
\end{itemize} 

%for designing and analyzing algorithms that use more complex proposals, without tractable densities. 
%We apply RAVI 
% In Section~\ref{sec:overview}, we introduce the framework 
% and apply it to incorporate MCMC chains into 
% variational families, generalizing and improving on
% ~\citet{salimans2015markov}'s Markov Chain Variational
% Inference algorithm. In Section~\ref{sec:examples}, we 
% apply RAVI to design a new sampler for Dirichlet process mixtures that outperforms strong baselines on real-world datasets, and a new method for ensembling multiple Monte Carlo or variational inference algorithms.

% \begin{figure*}
%     %\centering
% %    \includegraphics[width=0.9\linewidth]{figs/tutorial.pdf}
%     \begin{subfigure}[b]{0.22\linewidth}
%         \includegraphics[width=\linewidth]{figs/target.pdf}
%         % \begin{align*}
%         %     p(z, x) = \mathcal{N}&(z; \mathbf{0}, 5\mathbf{I})\cdot\\ 
%         %     &\mathcal{N}(x; ||z||_2^2, \frac{3}{4})
%         % \end{align*}
%         \caption{Target distribution, the posterior $p_0(x_1 \mid x_0)$ of the latent-variable generative model $p(x_1, x_0) = \mathcal{N}(x_1; \mathbf{0}, 5\mathbf{I}) \cdot \mathcal{N}(x_0; ||x_1||_2^2, \frac{3}{4})$. Variational inference can be applied to approximate the posterior, but requires the choice of a variational family. }
%         \label{fig:target}
%     \end{subfigure}\hfill
%     \begin{subfigure}[b]{0.22\linewidth}
%     \includegraphics[width=\linewidth]{figs/simple_gaussian.pdf}
%     % \begin{align*}
%     %         q(z; x) = \mathcal{N}(z; \mu, \Sigma)\\
%     %         \,
%     % \end{align*}
%     \caption{A Gaussian variational family. The Gaussian's density is tractable, so  standard (top) and amortized (bottom) variational inference can be applied. However, the Gaussian approximation is not flexible enough to accurately characterize the posterior distribution.}
%     \label{fig:inadequate}
%     \end{subfigure}\hfill
%     \begin{subfigure}[b]{0.22\linewidth}
%     \includegraphics[width=\linewidth]{figs/angle.pdf}
%     % \begin{align*}
%     %         q(z; x) = \mathcal{N}(z; \mu, \Sigma)\\
%     %         \,
%     % \end{align*}
%     \caption{A RAVI inference strategy $\mathcal{S}$. $\mathcal{S}.q$ (left) approximates the posterior better by sampling a latent angle (red) and then a point (green), but has an intractable marginal density. RAVI's Algs. 3 (top) and 4 (bottom) are necessary to compute standard and amortized VI gradients.}
%     \label{fig:angle}
%     \end{subfigure}\hfill
%     \begin{subfigure}[b]{0.22\linewidth}
%     \includegraphics[width=\linewidth]{figs/SIR.pdf}
%     % \begin{align*}
%     %         q(z; x) = \mathcal{N}(z; \mu, \Sigma)\\
%     %         \,
%     % \end{align*}
%     \caption{Existing algorithms, such as IWAE~\citep{burda2015importance}, can often be seen as RAVI inference strategies. This $\mathcal{S}.q$ generates a vector of $K$ particles (red) and chooses one (green) to propose, and has an intractable density. Algs. 3 (top) and 4 (bottom) can be used for optimization.}
%     \label{fig:multiple}
%     \end{subfigure}
%     \caption{\textbf{Example of RAVI applied to a simple 2D posterior.} 
    %\textit{Left strategy (A + B):} When using a tractable variational family $p_1$, e.g. a Gaussian, RAVI’s Algorithms 3 and 4 reduce to standard variational inference and amortized variational inference respectively. \textit{Center strategy (A + C + E):} A more expressive variational family first generates a vector of $K$ particles (red), and then chooses one to propose (green) based on their importance weights. The marginal density of the chosen particle is intractable, but with RAVI, we can still perform standard and amortized variational inference, by adding meta-inference $p_2$ that approximates $p_1$’s posterior. This particular choice recovers IWAE (when using Alg. 3), but is suboptimal; see Section~\ref{sec:IWAE}. \textit{Right strategy (A + D + F):} Another way to use latent variables to create an expressive approximation, exploiting special problem structure. We first generate a latent angle (red) then a proposed point (green) from a Gaussian around the corresponding point on the circle of radius $\sqrt{x_0}$. This distribution again has an intractable marginal density, but RAVI enables VI if we specify a $p_2$ distribution for inferring the noisy angle $x_2$ from the point $x_1$.
    %}
%    \label{fig:tutorial}
%\end{figure*}

% RAVI's key idea is to approximate intractable proposal densities using \textit{meta-inference}~\citep{cusumano2017aide}: additional Monte Carlo or variational inference targeting the proposal, rather than the model. More precisely, RAVI generalizes the notion of a tractable \textit{proposal distribution} to that of an
% \textit{inference strategy} (Figure~\ref{fig:nested}). An inference strategy $\mathcal{S}$ specifies a posterior approximation $\mathcal{S}.q$ that \textit{need not} have a tractable marginal density, unlocking a flexible class of proposals that perform involved simulations or optimization in order to generate approximate posterior samples. When $\mathcal{S}.q$'s  marginal density is intractable, $\mathcal{S}$ also specifies a \textit{meta-inference strategy} $\mathcal{S}.\mathcal{M}$, to be applied recursively to the problem of estimating $\mathcal{S}.q$'s marginal density.
% This in turn enables the computation of importance weights, MCMC acceptance probabilities, and stochastic gradient estimates for VI. We present a suite of recursive algorithms for using inference strategies as drop-in replacements for proposal distributions in a variety of contexts, including importance sampling~\citep{glynn1989importance}, sequential Monte Carlo~\citep{chopin2020introduction}, Metropolis-Hastings~\citep{tierney1998note}, and variational inference. In all cases, the unbiasedness or consistency properties of the original algorithms are preserved.

% %RAVI takes inspiration from 
% %AIDE~\citep{cusumano2017aide}, which, like RAVI, treats inference processes (e.g. SIR) as latent-variable models, called `generative inference models,' in which inference can be performed. We discuss the relationship further, and RAVI-based extensions to AIDE, in Appendix~\ref{sec:other-applications}.

% \textbf{Contributions.} This paper contributes:
% \begin{enumerate}
%     \item the RAVI framework, including new recursive algorithms for IS, VI, SMC, and MCMC using proposals that lack tractable marginal likelihoods (Section~\ref{sec:inference-algs});
%     \item theorems precisely characterizing approximation variance and accuracy (Section~\ref{sec:theory}); and
% %    \item unifying soundness arguments for diverse algorithms, achieved by reformulating these algorithms in terms of RAVI inference strategies (Section~\ref{sec:inference-algs} and~\ref{sec:unifying});  and
%     \item examples of RAVI's application to algorithm design, including: a novel approach to incorporating MCMC moves into variational families, improving on MCVI~\citep{salimans2015markov};
%     a novel sampler for Dirichlet process mixtures, outperforming a strong SMC baseline~\citep{lew2021pclean}; and a novel approach to 
%     ensembling multiple Monte Carlo algorithms or variational 
%     families, outperforming a naive ensembling strategy (Section~\ref{sec:examples}). The algorithms are tested on a standard astronomy benchmark~\citep{drinkwater2004large} and a challenging 1k-row Medicare dataset~\citep{MedicareHosp}.
% \end{enumerate}



\section{RECURSIVE AUXILIARY-VARIABLE INFERENCE}
\label{sec:overview}


% \begin{tiny}
%     \begin{table*}[]
%         \begin{tabular}{lll}
%         \textbf{Algorithm} & 
%         \textbf{Intractable posterior approximation $\mathcal{S}.q$} & 
%         \textbf{Meta-posterior approximation $\mathcal{S}.\mathcal{M}(x).q$}\\
%         IWAE~\citep{burda2015importance} & 
%         \begin{tabular}[t]{@{}l@{}}\textcolor{gray}{SIR with tractable proposal}\\ $q(x) = \sum_{j=1}^K \int \prod_{i=1}^K q(x_i) \frac{w_j}{\sum_{l=1}^K w_l}\delta_{x_j}(x)\text{d}x_{1:K}$\end{tabular} & \begin{tabular}[t]{@{}l@{}}\textcolor{gray}{Conditional SIR}\\ $ h(x_{1:K}, j) = \frac{1}{K} \delta_{x}(x_j) \prod_{i \neq j} q(x_i)$\end{tabular} \\
%         HVM~\citep{ranganath2016hierarchical}   & \begin{tabular}[t]{@{}l@{}}\textcolor{gray}{Latent-variable proposal}\\ $q(x) = \int q(z, x)\text{d}z$\end{tabular}                                            & 
%         \begin{tabular}[t]{@{}l@{}}\textcolor{gray}{Tractable proposal}\\ $h(z; x)$\end{tabular}                                                \\
%         MCVI~\citep{salimans2015markov} & \begin{tabular}[t]{@{}l@{}}\textcolor{gray}{MCMC chain initialized from tractable proposal}\\  $q(x) = \int q(x_0) \prod_{i=1}^M T(x_i; x_{i-1}) \delta_{x_M}(x) \text{d}x_{0:M}$\end{tabular}                       &                                   
%         \begin{tabular}[t]{@{}l@{}}\textcolor{gray}{Sequence of tractable proposals}\\ $h(x_{0:M}; x) = \delta_{x}(x_M) \prod_{i=0}^{M-1} r_i(x_i; x_{i+1})$\end{tabular}                                                \\
%         IWHVI~\citep{sobolev2019importance} & \begin{tabular}[t]{@{}l@{}}\textcolor{gray}{Latent-variable proposal}\\  $q(x) = \int q(z, x) \text{d}z$\end{tabular}                       &                                   
%         \begin{tabular}[t]{@{}l@{}}\textcolor{gray}{SIR with tractable proposal}\\ 
%         $h(z; x) = \sum_{j=1}^K \int \prod_{i=1}^K h(z_i; x) \frac{w_j}{\sum_{l=1}^K w_l}\delta_{z_j}(z)\text{d}z_{1:K}$\end{tabular}  \\       
%         DIWHVI~\citep{sobolev2019importance} & \begin{tabular}[t]{@{}l@{}}\textcolor{gray}{SIR with latent-variable proposal}\\  $q(x) = \sum_{j=1}^K \int \prod_{i=1}^K q(z_i, x_i)\cdot$\\ \,\,\,\,\,\,\,\,\,\,\,\,\,\,\,\,\,\,\,\,\,\,\,\,\,\,\,\,\,\,\,\,\,\,\,\,\,\,\,\,\,\,\,\,\,\,\,\,\,\,$\frac{w_j}{\sum_{l=1}^K w_l} \delta_{x_j}(x) \text{d}(z, x)_{1:K}$\end{tabular}                       &                                   
%         \begin{tabular}[t]{@{}l@{}}\textcolor{gray}{Conditional SIR for $j, x_{1:K}, z_{-j}$}\\ 
%         $h(z_{-j}, x_{1:K}, j; x) = \frac{1}{K} \delta_{x}(x_j) \prod_{i \neq j} q(z_i, x_i)$\\
%         \textcolor{gray}{SIR with tractable proposal for $z_j$}\\
%         $h(z_j; x) = \sum_{k=1}^K \int \prod_{i=1}^K h(z_j^{(i)}; x) \frac{w_k}{\sum_{m=1}^K w_m}\delta_{z_j^{(k)}}(z_j)\text{d}z_j^{(1:K)}$\end{tabular}  \\     
%     \end{tabular}
%         \caption{Many existing approaches to inference with flexible proposals can be understood as 
%                 instances of RAVI, with particular choices of inference strategy $\mathcal{S}$.
%                 In some cases, including IWHVI and DIWHVI, the meta-posterior approximation is itself intractable,
%                 necessitating meta-meta-inference $\mathcal{S}.\mathcal{M}(x).\mathcal{M}$. See Appendix~\ref{sec:appendix-examples} for eight additional examples.}
%         \label{tab:algorithms}
%     \end{table*}
% \end{tiny}

In this section, we introduce the RAVI framework in the context of a running example: 
we incorporate a chain of MCMC steps into a proposal, so that it can more accurately approximate a
posterior distribution. Our approach generalizes~\citet{salimans2015markov}'s Markov Chain Variational Inference (MCVI) algorithm, and fixes a flaw that prevents it from scaling to longer MCMC chains.

% The incorporation of MCMC steps into 
% variational families has previously been 
% studied by~\citet{salimans2015markov}; we analyze 
% their Markov Chain Variational Inference (MCVI) algorithm 
% as an instance of the RAVI framework, and reveal a flaw 
% that prevents it from scaling to handle longer chains.
% We then use RAVI to fix the problem, 

% RAVI is a framework for incorporating involved simulations
% into variational families and proposals. In this section,
% we introduce the framework by way of an example:
% a variational family that incorporates several steps of 
% MCMC to more accurately approximate the posterior.

\textbf{An expressive proposal based on MCMC.} 
Let $p(x, y)$ be a latent-variable model and  
$y$ an observation. Suppose we wish to approximate  
$p(x \mid y)$ using an expressive proposal $q(x)$, that
generates an initial location $x_0$ from a simple 
parametric distribution $q_0$, then 
iterates $M$ steps of an MCMC kernel $T$:\footnote{Why incorporate $M$ MCMC steps into a proposal $q$, rather than simply running MCMC?
Several reasons: (1) if we use $q$ as an importance sampling proposal, the importance weights are unbiased 
estimates of the marginal likelihood $p(y)$, which we can use to evaluate our model; (2) if we use $q$ as a variational family, we can optimize the ELBO to learn parameters of the initial proposal or the MCMC transition kernel; and (3) if we generate many importance sampling particles using $q$, their importance weights can in theory correct for the bias of finite-sample MCMC.}%\footnote{Our figures in this section were generated with $T$ set to an unadjusted Langevin ascent kernel.}: 
$$q(x) = \int q_0(x_0) \left(\prod_{i=1}^M T(x_{i-1}\rightarrow x_i)\right) \delta_{x_M}(x) \text{d}x_{0:M}.$$
Even when $q_0$ is a poor approximation to $p(x \mid y)$, $q(x)$ may be close to the posterior, if $M$ is sufficiently high. However, because the density $q(x)$ cannot be efficiently evaluated, we cannot use $q(x)$ as a proposal within importance sampling (we have no way to evaluate the importance weight $\frac{p(x, y)}{q(x)}$), nor as a variational family in VI (we cannot estimate the ELBO $\mathcal{L} = \mathbb{E}_{x \sim q} [\log \frac{p(x,y)}{q(x)}]$ or its gradient, making it impossible to learn $p$'s or $q$'s parameters).%
\begin{figure}[t]
    \centering
    \includegraphics[width=0.9\linewidth]{figs/nested.pdf}
    \caption{Structure of a RAVI inference strategy $\mathcal{S}$ targeting the posterior $p(x \mid y)$ of a latent-variable model. 
    The proposal $\mathcal{S}.q(x) = \int \mathcal{S}.q(r, x) \text{d}r$ has an intractable marginal density, so the strategy also specifies a \textit{meta-inference} strategy $\mathcal{S}.\mathcal{M}$ that targets $\mathcal{S}.q(r \mid x)$. Nesting continues until the $q$
    approximation at some layer has a tractable density, at which 
    point no further meta-inference is needed.}
    \label{fig:nested}
\end{figure}

\textbf{Approximating proposal densities with meta-inference.} RAVI's goal is to enable inference even when we cannot compute the marginal densities of our proposals and variational families exactly. To apply RAVI, we must specify not just the proposal itself but also a \textit{meta-inference} algorithm, bundled with the proposal into an \textit{inference strategy}:


\textbf{Definition.} An \textit{inference strategy $\mathcal{S}$ targeting $\pi$} specifies:
\begin{itemize}
    \item a posterior approximation $\mathcal{S}.q(x) \approx \pi(x)$\footnote{To simplify the exposition, we assume that if an inference strategy $\mathcal{S}$ targets $\pi$, then the approximation $\mathcal{S}.q$ is \textit{mutually absolutely continuous} with $\pi$, i.e. the measure-zero events under $\pi$ are exactly the same as those under $\mathcal{S}.q$. This requirement can be relaxed somewhat; see Appendix~\ref{sec:even-odd}.} that either has an efficient density evaluator, or is the marginal distribution of a joint distribution with a tractable density, i.e. $\mathcal{S}.q(x) = \int \mathcal{S}.q(r, x) \text{d}r$, and,
    
    \item if $\mathcal{S}.q$'s marginal density cannot be efficiently evaluated, a \textit{meta-inference strategy} $\mathcal{S}.\mathcal{M}$, assigning to each value of $x$ an inference strategy $\mathcal{S}.\mathcal{M}(x)$ targeting $\mathcal{S}.q(r \mid x)$.
\end{itemize}


Figure~\ref{fig:nested} illustrates the recursive 
structure of an inference strategy. 
The key novelty is the inclusion of \textit{meta-inference}, in the form of 
\textit{meta-posterior approximations}: 
additional proposals that the user specifies for 
inferring auxiliary variables introduced by existing proposal distributions. 
In our running example, we take $\mathcal{S}.q(x)$ to be our MCMC-based posterior approximation: it lacks a tractable density, but is the marginal of a tractable joint density $\mathcal{S}.q(x_{0:M}, x)$ over entire MCMC traces. A \textit{meta-posterior approximation}, then, is a probability distribution $\mathcal{S}.\mathcal{M}(x).q(x_{0:M})$ that approximates the \textit{meta-posterior} $\mathcal{S}.q(x_{0:M} \mid x)$: the distribution over traces of the MCMC chain, given 
the final location $x$. 

The meta-posterior approximations enable RAVI to estimate the intractable marginal density of the top-level posterior approximation, to compute weights and gradients:

\textit{In Monte Carlo:} If $\mathcal{S}.q(x) = \int \mathcal{S}.q(r, x) \text{d}r$ is intended for use as a Monte Carlo proposal, RAVI uses meta-inference to obtain an unbiased estimate of $\frac{1}{\mathcal{S}.q(x)}$ (Algorithm~\hyperref[alg:alg2]{2}), which is then multiplied by $p(x, y)$ to estimate the importance weight $\frac{p(x, y)}{\mathcal{S}.q(x)}$. This process relies on the \textit{harmonic mean identity}~\citep{newton1994approximate}, which states that, for any meta-posterior approximation $h$, $$\mathbb{E}_{\mathcal{S}.q(r \mid x)}\left[\frac{h(r)}{\mathcal{S}.q(r, x)}\right] = \frac{1}{\mathcal{S}.q(x)}\mathbb{E}_{\mathcal{S}.q(r \mid x)}\left[\frac{h(r)}{\mathcal{S}.q(r \mid x)}\right] = \frac{1}{\mathcal{S}.q(x)}.$$
(Harmonic mean estimators are infamous for having potentially infinite variance, but only when $h$ is set to a broad prior; we give a general analysis of the variance of RAVI's importance weights in Section~\ref{sec:theory}.)

\textit{In Variational Inference:} If $\mathcal{S}.q(x) = \int \mathcal{S}.q(r, x) \text{d}r$ is intended as a variational family, then RAVI uses the meta-posterior approximation to formulate an \textit{upper bound} on $\log \mathcal{S}.q(x)$: for any meta-posterior approximation $h(r)$,
$$\log \mathcal{S}.q(x) \leq \mathcal{U}(x) := \mathbb{E}_{\mathcal{S}.q(r \mid x)}[\log \mathcal{S}.q(r, x) - \log h(r)].$$
This follows from Jensen's inequality, and the harmonic 
mean identity from above. %The tightness of the bound is controlled by the quality of meta-inference, as $\mathcal{U}(x) - \log \mathcal{S}.q(x) = KL(\mathcal{S}.q(r \mid x) || h(x))$. 
With this upper bound in hand, we formulate a surrogate ELBO $\mathcal{L}_\mathcal{S} = \mathbb{E}_{\mathcal{S}.q(x)}[\log p(x, y) - \mathcal{U}(x)] \leq \mathcal{L}$, which we can tractably estimate and optimize via stochastic gradient descent (Algorithm~\hyperref[alg:alg3]{3}).

In Section~\ref{sec:inference-algs}, we show how similar estimators can be built up recursively when the meta-posterior approximations themselves have intractable marginal densities.
% How does meta-inference help? Just as 
% variational inference in the model $p(x, y)$ allows us to 
% tractably approximate the evidence $\log p(y)$ (via the ELBO), 
% variational \textit{meta-inference} in the proposal $\mathcal{S}.q$ allows us to approximate \textit{its} log marginal density, $\log \mathcal{S}.q(x)$, and this 
% approximation can stand in for the intractable $\log \mathcal{S}.q(x)$ term in the ELBO, yielding a tractable surrogate 
% objective (Algorithm~\hyperref[alg:alg3]{3}).

\textbf{A meta-inference strategy that recovers the MCVI objective~\citep{salimans2015markov}.} In our running example, where the auxiliary randomness $r$ is a trace $x_{0:M}$ of locations visited by MCMC, one option for meta-inference is to learn neurally parameterized 
reverse Markov kernels $R_i(x_{i+1} \rightarrow x_i)$, and apply them 
in sequence to infer a plausible trace of MCMC steps leading to the 
final location $x$:%
$$\mathcal{S}.\mathcal{M}(x).q(x_{0:M}) = \delta_{x}(x_M) \prod_{i=0}^{M-1} R_i(x_{i+1} \rightarrow x_i).$$%
This approximation to $\mathcal{S}.q(x_{0:M} \mid x)$ has a tractable 
density, and so completely specifies the meta-inference strategy $\mathcal{S}.\mathcal{M}$; there is no need to specify a \textit{meta-}meta-inference 
strategy. Given $\mathcal{S}$, RAVI optimizes the surrogate objective
$\mathcal{L}_\mathcal{S} = \mathbb{E}_{x \sim \mathcal{S}.q}[\log p(x, y) - \mathcal{U}_{\mathcal{S}.\mathcal{M}(x)}]$, where
%where
$$\mathcal{U}_{\mathcal{S}.\mathcal{M}(x)} = \mathbb{E}_{x_{0:M} \sim \mathcal{S}.q(x_{0:M} \mid x)}\left[\log \frac{\mathcal{S}.q(x_{0:M}, x)}{\mathcal{S}.\mathcal{M}(x).q(x_{0:M})}\right].$$
For the above choice of $\mathcal{S}.\mathcal{M}$, the RAVI objective $\mathcal{L}_\mathcal{S}$ exactly coincides with the Markov Chain Variational Inference (MCVI) objective of~\citet{salimans2015markov}. In fact, RAVI unifies and generalizes many existing methods; 10 examples are collected in Appendix~\ref{sec:appendix-examples}.
%$$\mathcal{L}_\mathcal{S} = \mathbb{E}_{x_{0:M} \sim \mathcal{S}.q}\left[\log \frac{p(x_M, y) \prod_{i=0}^{M-1} R_i(x_{i+1} \rightarrow x_i)}{q_0(x_0) \prod_{i=1}^M T(x_i \rightarrow x_{i+1})}\right],$$


\textbf{Analyzing MCVI within the RAVI framework.} Framing MCVI as a RAVI algorithm lets us analyze it using general theory about RAVI objectives. For example, the relative tightness of the bound $\mathcal{L}_\mathcal{S}$ is controlled by the quality of meta-inference:
$$\mathcal{L} - \mathcal{L}_\mathcal{S} = \mathbb{E}_{\mathcal{S}.q(x)}[\mathrm{KL}(\mathcal{S}.q(x_{0:M} \mid x) \,\|\, \mathcal{S}.\mathcal{M}(x).q(x_{0:M}))].$$
We can use this characterization to analyze the MCVI objective's behavior as $M$ grows, i.e., as MCMC steps are added. Informally, as the MCMC chain begins to mix, the marginal distribution $\mathcal{S}.q(x)$ over the final location of the chain should grow closer to the posterior $p(x \mid y)$, tightening the (intractable) ELBO $\mathcal{L}$. Unfortunately, the meta-inference gap $\mathcal{L} - \mathcal{L}_\mathcal{S}$ \textit{grows} with $M$, unless each kernel $R_i$ exactly captures the local 
posterior $\mathcal{S}.q(x_i \mid x_{i+1})$. (This can be seen as an instance of the well-known \textit{degeneracy problem} of sequential importance sampling~\citep[Proposition 1]{doucet2000sequential}.) As MCMC converges, the rate of improvement in $\mathcal{L}$ slows, and the meta-inference penalty 
for increasing the chain's length eventually outweighs the 
benefit of improving the posterior approximation $\mathcal{S}.q$. The red curves in Figure~\ref{fig:mcvi-experiment} show this phenomenon playing out on two toy targets: we see that $\mathcal{L}_\mathcal{S}$ does 
become tighter as more MCMC steps are added, but only to a point, before the bound begins to \textit{loosen}.

\textbf{Resolving the issue with improved meta-inference.} 
RAVI clarifies that the variational bound loosens with increasing $M$ due to poor meta-inference: as the MCMC chain grows longer, error in the learned backward kernels accumulates.
This analysis also points to 
a solution: use a meta-inference 
strategy $\mathcal{S}.\mathcal{M}$ that \textit{can} scale to 
longer MCMC histories. 

\begin{algorithm}[t]
    \label{infstrat:mcvi}
    \SetAlgoLined\DontPrintSemicolon
    \scriptsize{
    \textbf{RAVI Inference Strategy 1:} RAVI-MCVI\;
    \SetKwFunction{mcvi}{rmcvi($M, K$).q}\SetKwFunction{mcvim}{rmcvi($M, K$).M($x$).q}\SetKwFunction{mcvimm}{rmcvi($M, K$).M($x$).M($x_{0:M}$).q}
    \SetKwProg{infalg}{Posterior Approx.}{}{}
    \SetKwInOut{Infers}{Target of inference}
    \SetKwInOut{Aux}{Auxiliary variables}
    \infalg{\mcvi{}}{
    \Infers{latent variable $x$}
    \Aux{MCMC trace $x_{0:M}$}
    \nl $x_0 \sim q_0$\;
    \nl \For{$i \in 1, \dots, M$}{
        \nl $x_i \sim T(x_{i-1} \rightarrow \cdot)$\;
    }
    \nl \Return{$x_M$}\;}{}
    \setcounter{AlgoLine}{0}
    \SetKwProg{metaalg}{Meta-Posterior Approx.}{}{}
    \metaalg{\mcvim{}}{
    \Infers{MCMC trace $x_{0:M}$}
    \Aux{SMC particles $x_{0:M}^{1:K}$, ancestor indices $a_0, a_{1:M}^{1:K}$}
    \nl \For{$k \in 1,\dots, K$}{
        \nl $(x_M^k, w_M^k, t_k) \gets (x, q_M(x), [x])$\;
    }
    \nl \For{$i \in M-1, \dots, 0$}{
        \nl \For{$k \in 1,\dots,K$}{
            \nl $a_{i+1}^k \sim \text{Discrete}(w_{i+1}^{1:K})$\;
            \nl $x_i^k \sim R_i(x_{i+1}^{a_{i+1}^k} \rightarrow \cdot)$\tcp*{MCVI backward kernel}\;
            \nl $w_i^k \gets \frac{q_i(x_i^k)T(x_i^k \rightarrow x_{i+1}^{a_{i+1}^k})}{q_{i+1}(x_{i+1}^{a_{i+1}^k})R_i(x_{i+1}^{a_{i+1}^k} \rightarrow x_i^k)}$\;
            \nl $t_k \gets [x_i^k, t_{a_{i+1}^k}\dots]$\;
        }
    }
    \nl $a_0 \sim \text{Discrete}(w_{0}^{1:K})$\;
    \nl \Return{$t_{a_0}$}}
    \setcounter{AlgoLine}{0}
    \SetKwProg{metaalg}{Meta-Meta-Posterior Approx.}{}{}
    \metaalg{\mcvimm{}}{
    \Infers{SMC particles $x_{0:M}^{1:K}$, ancestor indices $a_0, a_{1:M}^{1:K}$}
    \Aux{None}
    %\nl $a_\text{prev} \gets a_0$\;
    \nl \For{$i \in 0, \dots, M$}{
        \nl $b_i \sim \text{Uniform}(1, K)$\;
        %$a_\text{prev} \gets a_i^{a_{\text{prev}}}$\;
    }
    \nl \For{$k \in 1,\dots, K$}{
        \nl $(x_M^k, w_M^k) \gets (x, q_M(x))$\;
    }
    \nl \For{$i \in M-1, \dots, 0$}{
        \nl \For{$k \in 1,\dots,K$}{
            \nl \If{$k = b_i$}{
                \nl $(a_{i+1}^k, x_i^k) \gets (b_{i+1}, x_i)$\;
            }
            \nl \Else{
                \nl $a_{i+1}^k \sim \text{Discrete}(w_{i+1}^{1:K})$\;
                \nl $x_i^k \sim R_i(x_{i+1}^{a_{i+1}^k} \rightarrow \cdot)$\;
            }
            \nl $w_i^k \gets \frac{q_i(x_i^k)T(x_i^k \rightarrow x_{i+1}^{a_{i+1}^k})}{q_{i+1}(x_{i+1}^{a_{i+1}^k})R_i(x_{i+1}^{a_{i+1}^k} \rightarrow x_i^k)}$\;
        }
    }
    \nl $a_0 \gets b_0$\;
    \nl \Return{$(a_0, a_{1:M}^{1:K}, x_{0:M}^{1:K})$}}
    }
    \vspace{-5mm}
\end{algorithm} 


A standard approach 
to resolving the degeneracy problem when inferring 
sequences of latent variables is \textit{sequential 
Monte Carlo} (SMC)~\citep{del2006sequential}. SMC tracks 
$K$ hypotheses about a latent sequence, periodically 
weighting the hypotheses and \textit{resampling}, to 
clone promising particles and cull poor ones. Using 
RAVI, we can use SMC for 
\textit{meta-inference}: we choose $\mathcal{S}.\mathcal{M}(x).q(x_{0:M})$ to generate a collection of $K$ possible backward 
MCMC trajectories, using SMC, before selecting one to return.
This meta-posterior approximation is shown in RAVI Inference Strategy~\hyperref[infstrat:mcvi]{1}. 
% Letting $x_{0:M}^{1:K}$ be the $K$ particles, $a_{1:M}^{1:K}$ the integer-valued ancestor variables tracking the results of resampling decisions, and $a_0$ the index of the final chosen particle after the entire collection of $K$ trajectories has been generated, we have $\mathcal{S}.\mathcal{M}(x).q(a_0, a_{1:M}^{1:K}, x_{0:M}^{1:K}, x_{0:M}) = $

% \begin{scriptsize}
% $$\prod_{j=1}^K \delta_{x}(x_M^j) \left[\prod_{i=0}^{M-1} \frac{w_{i+1}^{a_{i+1}^j}}{\sum_{k=1}^K w_{i+1}^{k}} R_i(x_{i+1}^{a_{i+1}^{j}} \rightarrow x_{i}^{j})\right] \cdot \\
% \frac{w_0^{a_0}\delta_{\mathbf{x}^{a_0}}(x_{0:M})}{\sum_{k=1}^K w_0^K},$$
% \end{scriptsize}

% where $\mathbf{x}^{a_0}$ is the vector of locations that make up the final chosen trajectory. The importance weights $w_i^j$ that guide resampling decisions are computed as $w_i^j = \frac{q_i(x_i^j) T(x_i \rightarrow x_{i+1}^{a_{i+1}^j})}{q_{i+1}(x_{i+1}^{a_{i+1}^{j}}) R_i(x_{i+1}^{a_{i+1}^j} \rightarrow x_i^j)}$, where the $q_i$
% are learned approximations (in our experiments, Gaussians) to the marginal distribution at each step of the MCMC chain. 

This algorithm does not itself have a tractable marginal density: computing $\mathcal{S}.\mathcal{M}(x).q(x_{0:M})$
%$\sum_{a_0} \sum_{a_{1:M}^{1:K}} \int \mathcal{S}.\mathcal{M}(x).q(a_0, a_{1:M}^{1:K}, x_{0:M}^{1:K}, x_{0:M}) \text{d}x_{0:M}^{1:K}$
would require large sums over the resampling variables and intractable integrals over the particle collection. But 
this is where RAVI's recursive structure comes into play:
a meta-inference strategy may use an intractable meta-posterior
approximation, so long as 
we attach a \textit{meta-}meta-inference strategy $\mathcal{S}.\mathcal{M}(x).\mathcal{M}(x_{0:M}).q(a_0, a_{1:M}^{1:K}, x_{0:M}^{1:K})$. In this case meta-meta-inference must infer the auxiliary variables of SMC (ancestor variables and unchosen trajectories), given the final chosen trajectory $x_{0:M}$. For this we can use the \textit{conditional SMC} (cSMC) algorithm~\citep{andrieu2010particle}, 
which runs SMC, with the same auxiliary variables, but constrained to ensure that one of the $K$ particles 
traces the observed trajectory $x_{0:M}$. Because cSMC 
introduces no new auxiliary variables, it has a 
tractable density, and there is no need to specify 
a fourth layer of meta-inference. The full 
tower of posterior approximations is given in RAVI Inference Strategy~\hyperref[infstrat:mcvi]{1}.

In Section~\ref{sec:examples}, we compare MCVI to \texttt{rmcvi}, for different $K$ and $M$. Figure~\ref{fig:mcvi-experiment} shows that meta-inference error is greatly reduced by using SMC, so that the variational bound $\mathcal{L}_\mathcal{S}$ continues to tighten as the MCMC chain grows longer. 

%Second, in the ``many short chains'' regime, importance weights can be computed for each chain. Unlike in traditional MCMC, where we rely on a single chain having mixed to obtain accurate estimates of posterior expectations, here we rely only on the marginal distribution of the final iterate of the chain being a reasonable proposal distribution. Thus, the method will still fail if the MCMC chain fails to explore all modes, but it matters much less if the wrong mass is assigned to each mode (due to bias from the intiail distribution, e.g.); the importance weights can correct for this bias.

% The results of mode-seeking and mass-capturing VI on the more expressive variational family are shown in Figure~\ref{fig:angle}. %The key idea enabling this inference is illustrated in Figure~\ref{fig:nested}: the inference strategy can be ``flattened'' to yield an auxiliary-variable model and proposal, to which standard algorithms can be applied. In the case of the example from Figure~\ref{fig:tutorial}, the extended model is $p^*(x_2, x_1, x_0) = p_0(x_1, x_0) p_2(x_2; x_1, x_0)$, and the extended proposal is $q^*(x_2, x_1; x_0) = p_1(x_2, x_1; x_0)$. 
%Our theorems characterize the tightness of the variational bounds in terms of a \textit{sum} of KL divergences, between each posterior approximation in the strategy and its target. %In Section~\ref{sec:examples}, we show it is possible to use this characterization to improve the choice of $p_2$ from panel E, yielding a tighter bound.
%

% \textbf{RAVI generalizes many existing methods.} It turns out that many existing algorithms can be seen as standard Monte Carlo or variational algorithms, but using RAVI inference strategies instead of tractable proposal densities. For example, Figure~\ref{fig:multiple} shows how IWAE~\citep{burda2015importance} can be seen as standard variational inference, with a posterior approximation $\mathcal{S}.q$ that samples a latent vector of $K$ particles (shown in red) before choosing a high-weight particle to return (green). The meta-inference strategy that recovers IWAE is $$\mathcal{S}.\mathcal{M}(x).q(x_{1:K}) = \frac{1}{K} \cdot \sum_{j=1}^K \prod_{i \neq j} q(x_i) \cdot \delta_{x}(x_j).$$
% That is, observing the chosen particle $x$, meta-inference generates a random index $j$ uniformly at random, sets the $j$th particle deterministically to $x$, and randomly generates possible values for the unchosen particles using the Gaussian proposal $q$. (This distribution has a tractable marginal density, and so fully specifies the meta-inference strategy.) %Viewing IWAE this way has advantages. For example, in Section~\ref{sec:examples}, we show how we can swap out this choice of  $\mathcal{S}.\mathcal{M}.q$ for a \textit{learned} proposal, to achieve an even tighter variational bound.
% Table~\ref{tab:algorithms} shows how several other 
% algorithms arise as instances of RAVI; full details can be found in Appendix~\ref{sec:appendix-examples}.



\begin{figure*}[t]
%\hline
\vspace{1mm}
\begin{center}
\textbf{Recursive Monte Carlo Estimation}
\end{center}
%\\
%\vspace{2mm}
\begin{minipage}[t]{0.52\textwidth}
\footnotesize{
\removelatexerror
\vspace{-10pt}
\begin{algorithm}[H]
    \label{alg:alg1}
    %\label{infstrat:alg1}
    \SetAlgoLined\DontPrintSemicolon
    \textbf{Algorithm 1:} RAVI Importance Sampling (\texttt{IMPORTANCE})\;
    \KwIn{unnormalized target $\tilde{\pi}(x) = Z\pi(x)$}
    \KwIn{inference strategy $\mathcal{S}$}
    \KwOut{$(x, \hat{Z})$ properly weighted for $\pi(x)$, s.t. $\mathbb{E}[\hat{Z}] = Z$}
    \nl \uIf{$\mathcal{S}.q$ has a tractable marginal density}{
        \nl $x \sim \mathcal{S}.q$\; 
        \nl $w \gets \frac{1}{\mathcal{S}.q(x)}$\;
    }
    \nl \ElseIf{$\mathcal{S}.q(x) = \int \mathcal{S}.q(r, x)\text{d}r$}{
        \nl $(r, x) \sim \mathcal{S}.q$\;
        \nl $w \gets \texttt{HME}(\mathcal{S}.q(\cdot, x), r, \mathcal{S}.\mathcal{M}(x))$\;
    }
    \nl \Return{$(x, w\tilde{\pi}(x))$}\;
\end{algorithm}
% \noindent\textbf{Algorithm 1.} Let $\tilde{\pi} = Z\pi$ be an unnormalized target density, and let $\mathcal{S}$ be a strategy targeting $\pi$. Then the following yields a properly weighted sample $(x, \hat{Z})$ for $\pi(x)$, with $\mathbb{E}[\hat{Z}] = Z$:
% \begin{enumerate}[leftmargin=*]
%     \item If $\mathcal{S}.q$ has a tractable marginal density, generate $x \sim \mathcal{S}.q$ and set $w \gets \frac{1}{\mathcal{S}.q(x)}$.

%     \item Else, if $\mathcal{S}.q(x) = \int \mathcal{S}.q(r, x) \text{d}r$, generate $(r, x) \sim \mathcal{S}.q$, and run Algorithm 2 with unnormalized target $\mathcal{S}.q(\cdot, x)$, inference strategy $\mathcal{S}.\mathcal{M}(x)$, and sample $r$, to obtain a weight $w$. % obtained by setting $p_1(x_2, x_1; x_0)$ as the top-level target, and $p_2, \dots, p_n$ as the layers below.

%     \item Return $(x, w \cdot \tilde{\pi}(x))$.
% \end{enumerate}
}
\end{minipage}\hfill%
\begin{minipage}[t]{0.48\textwidth}
\footnotesize{
\removelatexerror
\vspace{-10pt}
\begin{algorithm}[H]
    \label{alg:alg2}
    %\label{infstrat:alg1}
    \SetAlgoLined\DontPrintSemicolon
    \textbf{Algorithm 2:} RAVI Harmonic Mean Estimation (\texttt{HME})\;
    \KwIn{unnormalized target $\tilde{\pi}(x) = Z\pi(x)$}
    \KwIn{exact sample $x \sim \pi$}
    \KwIn{inference strategy $\mathcal{S}$}
    \KwOut{unbiased estimate $\check{Z}^{-1}$ of $Z^{-1}$}
    \nl \uIf{$\mathcal{S}.q$ has a tractable marginal density}{
        \nl $w \gets \mathcal{S}.q(x)$\;
    }
    \nl \ElseIf{$\mathcal{S}.q(x) = \int \mathcal{S}.q(r, x)\text{d}r$}{
        \nl $(r, w) \gets \texttt{IMPORTANCE}(\mathcal{S}.q(\cdot, x), \mathcal{S}.\mathcal{M}(x))$\;
    }
    \nl \Return{$w/\tilde{\pi}(x)$}\;
\end{algorithm}

% \noindent\textbf{Algorithm 2.} Let $\tilde{\pi} = Z\pi$ be an unnormalized target density, and let $\mathcal{S}$ be a strategy targeting $\pi$. If $x$ is an exact sample from $\pi$, the following procedure generates an unbiased estimate $\frac{1}{\check{Z}}$ of $\frac{1}{Z}$:

% \begin{enumerate}[leftmargin=*]
%     \item If $\mathcal{S}.q$ has a tractable marginal density, set $w \gets  \mathcal{S}.q(x)$.\\

%     \item Else, if $\mathcal{S}.q(x) = \int \mathcal{S}.q(r, x) \text{d}r$, run Algorithm 1 with unnormalized target $\mathcal{S}.q(\cdot, x)$ and inference strategy $\mathcal{S}.\mathcal{M}(x)$, to obtain a pair $(r, w)$.

%     \item  Return $w / \tilde{\pi}(x)$.
% \end{enumerate}
% \noindent\textbf{Algorithm 1.} Let $\mathcal{S}$ be a strategy targeting $p(x \mid y)$. Then the following yields a properly weighted sample $(x, \hat{Z})$ for $p(x \mid y)$, with $\mathbb{E}[\hat{Z}] = p(y)$:
% \begin{enumerate}[leftmargin=*]
%     \item If $\mathcal{S}.q$ has a tractable marginal density, generate $x \sim \mathcal{S}.q$ and set $w \gets \frac{1}{\mathcal{S}.q(x)}$.

%     \item Else, if $\mathcal{S}.q(x) = \int \mathcal{S}.q(r, x) \text{d}r$, run Algorithm 2 on the joint distribution $\mathcal{S}.q(r, x)$ and the meta-inference strategy $\mathcal{S}.\mathcal{M}$ to obtain a pair $(x, w)$. % obtained by setting $p_1(x_2, x_1; x_0)$ as the top-level target, and $p_2, \dots, p_n$ as the layers below.
    
%     \item Return $(x, w \cdot p(x, y))$.
% \end{enumerate}
% }
% \end{minipage}\hfill%
% \begin{minipage}[t]{0.48\textwidth}
% \footnotesize{
% \noindent\textbf{Algorithm 2.} Let $p(x, y)$ be a joint distribution, and let $\mathcal{M}(y)$ be a strategy targeting $p(x \mid y)$ for each $y$. The following procedure generates $y \sim p$ along with an unbiased estimate $\frac{1}{\check{Z}}$ of the inverse marginal density $1 / p(y)$:

% \begin{enumerate}[leftmargin=*]
%     \item Generate $(x, y) \sim p$.
    
%     \item If $\mathcal{M}(y).q$ has a tractable density, set $w \gets  \mathcal{M}(y).q(x)$.
    
%     \item Else, if $\mathcal{M}(y).q(x) = \int \mathcal{M}(y).q(r, x) \text{d}r$, run Algorithm 1 on the strategy $\mathcal{M}(y).\mathcal{M}(x)$, targeting $\mathcal{M}(y).q(r \mid x)$. This yields a pair $(r, w)$.

%     \item  Return $(y, w / p(x, y))$.
% \end{enumerate}
}
\end{minipage}
%\\
%\hline 
%\vspace{2mm}
\vspace{-5mm}
\begin{center}
    \textbf{Recursive Variational Objectives and Gradient Estimation}
\end{center}
\vspace{-2mm}
\begin{minipage}[t]{0.52\textwidth}
\footnotesize{
\removelatexerror
\vspace{-10pt}
\begin{algorithm}[H]
    \label{alg:alg3}
    %\label{infstrat:alg1}
    \SetAlgoLined\DontPrintSemicolon
    \textbf{Algorithm 3:} RAVI ELBO and gradient estimator ($\texttt{ELBO}\nabla$)\;
    \KwIn{model $p(x, y)$}
    \KwIn{data $y$}
    \KwIn{inference strategy $\mathcal{S}$}
    \KwOut{unbiased estimates of $\mathcal{L}(p, y, \mathcal{S})$ and of $\nabla_\theta \mathcal{L}(p, y, \mathcal{S})$}
    \nl \uIf{$\mathcal{S}.q$ has a tractable marginal density}{
        \nl $x \sim \mathcal{S}.q$\;
        \nl$ (\hat{U}, \widehat{\grad}) \gets (\log \mathcal{S}.q(x), \grad \log \mathcal{S}.q(x) \cdot (1 + \log \mathcal{S}.q(x)))$\; 
        \nl $\mathbf{g} \gets \grad \log \mathcal{S}.q(x)$\;
    }
    \nl \ElseIf{$\mathcal{S}.q(x) = \int \mathcal{S}.q(r, x)\text{d}r$}{
        \nl $(r, x) \sim \mathcal{S}.q$\;
        \nl $(\hat{U}, \widehat{\nabla_\theta}, \mathbf{g}) \gets \texttt{EUBO}\nabla(\mathcal{S}.q, x, r, \mathcal{S}.\mathcal{M}(x))$\;
%        \nl $(r, w) \gets \text{Alg1}(\mathcal{S}.q(\cdot, x), \mathcal{S}.\mathcal{M}(x))$\;
    }
    \nl $\hat{L} \gets \log p(x, y) - \hat{U}$\;
    \nl $\widehat{\grad}'\gets \grad \log p(x, y)
    + \mathbf{g} \cdot
    \log p(x, y)
    - \widehat{\grad}$\;
    \nl \Return{$(\hat{L}, \widehat{\grad}')$}\;
\end{algorithm}
% \noindent\textbf{Algorithm 3.} Let $\mathcal{S}$ be a strategy targeting $p(x \mid y)$. Then the following procedure yields unbiased estimates of the ELBO
% $\LL(y)$ and $\grad \LL(y)$:
% \begin{enumerate}[leftmargin=*]
%     \item If $\mathcal{S}.q$ has a tractable density, sample $x \sim \mathcal{S}.q$ and set $(\hat{U}, \widehat{\grad}, \mathbf{g}) \gets (\log \mathcal{S}.q(x), \grad \log \mathcal{S}.q(x) \cdot (1 + \log \mathcal{S}.q(x)), \grad \log \mathcal{S}.q(x))$.

%     \item Else, if $\mathcal{S}.q(x) = \int \mathcal{S}.q(r, x) \text{d}r$, generate $(r, x) \sim \mathcal{S}.q$, and run {Algorithm 4} on the inference strategy $\mathcal{S}.\mathcal{M}(x)$ targeting $\mathcal{S}.q(r \mid x)$, and the sample $r$, to obtain $(\hat{U}, \widehat{\grad}, \mathbf{g})$.
    
%     \item Set $\hat{L} \gets \log p(x, y) - \hat{U}$.
    
%     \item Set $\widehat{\grad}'\gets \grad \log p(x, y)
%         + \mathbf{g} 
%         \log p(x, y)
%         - \widehat{\grad}.$
    
%     \item Return $(\hat{L}, \widehat{\grad}')$.
% \end{enumerate}
}
\end{minipage}\hfill%
\begin{minipage}[t]{0.48\textwidth}
\footnotesize{
    \removelatexerror
    \vspace{-10pt}
    \begin{algorithm}[H]
        \label{alg:alg4}
        \SetAlgoLined\DontPrintSemicolon
        \textbf{Algorithm 4:} RAVI EUBO and gradient estimator ($\texttt{EUBO}\nabla$)\;
        \KwIn{model $p(x, y)$}
        \KwIn{data $y$}
        \KwIn{exact sample $x \sim p(x \mid y)$}
        \KwIn{inference strategy $\mathcal{S}$}
        \KwOut{unbiased estimates of $\mathcal{U}(p, y, \mathcal{S})$ and $\nabla_\theta \mathcal{U}(p, y, \mathcal{S})$}
        \KwOut{quantity $\mathbf{g}$ (see Thm. 2)}
        \nl \uIf{$\mathcal{S}.q$ has a tractable marginal density}{
            \nl$ (\hat{L}, \widehat{\grad}) \gets (\log \mathcal{S}.q(x), \grad \log \mathcal{S}.q(x))$\;
        }
        \nl \ElseIf{$\mathcal{S}.q(x) = \int \mathcal{S}.q(r, x)\text{d}r$}{
            \nl $(\hat{L}, \widehat{\nabla_\theta}) \gets \texttt{ELBO}\nabla(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))$\;
    %        \nl $(r, w) \gets \text{Alg1}(\mathcal{S}.q(\cdot, x), \mathcal{S}.\mathcal{M}(x))$\;
        }
        \nl $\hat{U} \gets \log p(x, y) - \hat{L}$\;
        \nl $\mathbf{g} \gets \nabla_\theta \log p(x, y)$\;
        \nl $\widehat{\grad}'\gets \grad \log p(x, y)
        + \mathbf{g} \cdot \hat{U}
        - \widehat{\grad}$\;
        \nl \Return{$(\hat{U}, \widehat{\grad}', \mathbf{g})$}\;
    \end{algorithm}
% \noindent\textbf{Algorithm 4.} Let $\mathcal{S}$ be a strategy targeting $p(x \mid y)$ and $x$ an exact posterior sample. The following procedure yields unbiased estimates of $\UU(y)$ and $\grad \UU(y)$, and the quantity $\mathbf{g}$ (see Thm. 2):
% %the gradient $\grad \log p_0(x_0, x_1)$.

% \begin{enumerate}[leftmargin=*]
%     %\item Generate $(x, y) \sim p$.
    
%     \item If $\mathcal{S}.q$ has a tractable marginal density, then set $(\hat{L}, \widehat{\grad}) \gets (\log \mathcal{S}.q(x), \grad \log \mathcal{S}.q(x))$.
%     \item Else, if $\mathcal{S}.q(x) = \int \mathcal{S}.q(r, x)\text{d}r$, run Algorithm 3 on the strategy $\mathcal{S}.\mathcal{M}(x)$, targeting $\mathcal{S}.q(r \mid x)$, to obtain $(\hat{L}, \widehat{\grad})$.

%     \item Set $\hat{U} \gets \log p(x, y) - \hat{L}$.
    
%     \item Set $\mathbf{g} \gets \grad \log p(x, y)$.

%     \item Set $\widehat{\grad}' \gets
%         \grad \log p(x, y)
%         + \mathbf{g} \cdot \hat{U}
%         - \widehat{\grad}$.
        
%     \item Return $\left(\hat{U}, \widehat{\grad}', \mathbf{g} \right)$.
% \end{enumerate}
}
\end{minipage}
\vspace{-5mm}
%\hline
% \caption{Caption}
%    \label{fig:my_label}
\end{figure*}



\textbf{Using the inference strategy within a Monte Carlo algorithm, to estimate marginal likelihoods from MCMC results.} Our inference strategy $\mathcal{S}$ can also be used as a proposal within Monte Carlo algorithms, such as importance sampling. In the context of our example, where $\mathcal{S}.q$ incorporates $M$ steps of a Markov chain,
this allows us to assign an \textit{importance weight} to 
each run of the Markov chain.
The weight is an unbiased estimate of the marginal likelihood $p(y)$ of the model;
%By our Theorem 3, the variance of the estimate is the $\chi^2$ divergence between the marginal distribution of the final iterate in the MCMC chain, and the true posterior $p(x \mid y)$, plus a term characterizing the expected error of meta-inference. 
%If meta-inference error can be kept low, the resulting 
thus, we can view the algorithm as a way to derive marginal likelihood estimates from MCMC runs, a task of long-standing interest in the Monte Carlo community~\citep{neal2001annealed}. In Section~\ref{sec:examples}, we show that in some settings MCVI compares favorably to a standard algorithm for the task, annealed importance sampling (AIS)~\citep{neal2001annealed}.
%\vspace{-5mm}    

\section{ALGORITHMS}
\label{sec:inference-algs}


\begin{algorithm}[t]
    \SetAlgoLined\DontPrintSemicolon
    \label{alg:alg5}
    $$\textbf{MCMC}$$
    \footnotesize{
    \hspace{-3mm}\textbf{Algorithm 5:} RAVI Metropolis-Hastings\;
    \KwIn{model $\tilde{\pi}(x) = Z \int \pi(r, x) \text{d}r$}
    \KwIn{proposal $q(x'; x) = \int q(s, x'; x) \text{d}s$}
    \KwIn{family $\mathcal{S}(x)$ of inference strategies targeting $\pi(r \mid x)$}
    \KwIn{family $\mathcal{M}(x, x')$ of inference strategies targeting $q(s \mid x'; x)$}
    \KwIn{initial position $x$ and estimate $\hat{Z}_{x}$ of $\tilde{\pi}(x)$}
    \KwOut{next position $x'$ and estimate $\hat{Z}_{x'}$ of $\tilde{\pi}(x')$}
    \nl $(s, x') \sim q(s, x'; x)$\;
    \nl $w_{x'} \gets {\texttt{HME}}(q(\cdot, x'; x), s, \mathcal{M}(x, x'))$\;
    \nl $(\_, w_x) \gets \texttt{IMPORTANCE}(q(\cdot, x; x'), \mathcal{M}(x', x))$\;
    \nl $(\_, \hat{Z}_{x'}) \gets \texttt{IMPORTANCE}(Z\pi(\cdot, x'), \mathcal{S}(x'))$\;
    \nl $u \sim \text{Uniform}(0, 1)$\;
    \nl \If{$u < \text{min}(1, \frac{\hat{Z}_{x'}}{\hat{Z}_x}w_{x'}w_x)$}{
        \nl \Return{$(x', \hat{Z}_{x'})$}\;
    }
    \nl \Else{
        \nl \Return{$(x, \hat{Z}_x)$}\;
    }
    }
\vspace{-3mm}
\end{algorithm}

%\vspace{-3mm}
In this section, we present algorithms for using RAVI inference strategies within Monte Carlo and variational inference algorithms, as proposals and variational families.

%\textbf{Notation.} Let $\mathcal{S}$ be an inference strategy targeting $\pi(x_1) = p_0(x_1 \mid x_0)$ for a latent-variable model $p_0(x_1, x_0)$. We write $p_1(x_1; x_0)$ for $\mathcal{S}.q(x_1)$, and $p_i(x_i; x_{0:i-1})$ for $\mathcal{S}.\mathcal{M}(x_1).\mathcal{M}(x_2).\cdots.\mathcal{M}(x_{i-1}).q(x_i)$. Then each $p_i(x_i; x_{0:i-1}) = \int p_i(x_{i+1}, x_i; x_{0:i-1})\text{d}x_{i+1}$ is an approximation of the previous layer's posterior $p_{i-1}(x_i \mid x_{i-1}; x_{0:i-2})$, as depicted in Figure~\ref{fig:nested}. 

\textbf{RAVI for Importance Sampling and SMC.} In importance sampling and SMC algorithms, proposals $q$ are used to (1) generate proposed values $x \sim q$, and (2) compute importance weights $\frac{p(x)}{q(x)}$. But in both IS and SMC, it suffices to produce \textit{unbiased estimates} of $\frac{p(x)}{q(x)}$~\citep{chopin2020introduction}. 
%(This weaker requirement is sometimes called ``proper weighting,'' and implies that $\mathbb{E}_{x \sim q}[w f(x)] = \mathbb{E}_{x \sim p}[f(x)]$ when $p$ is a normalized target density.) 
RAVI exploits this degree of freedom to generate proper importance weights even when $q(x)$ is intractable.
Suppose $\tilde{\pi} = Z\pi$ is an unnormalized target density, and $\mathcal{S}$ is a RAVI inference strategy targeting $\pi$. Algorithm~\hyperref[alg:alg1]{1} simulates $x \sim \mathcal{S}.q$ and computes an unbiased estimate $\hat{Z}$ of $\frac{\tilde{\pi}(x)}{\mathcal{S}.q(x)}$:

\textbf{Theorem 1.} \textit{
    Let $\tilde{\pi}(x) = Z\pi(x)$ be an unnormalized target density, and $\mathcal{S}$ an inference strategy targeting $\pi(x)$. Then:
    \begin{itemize}
        \item $\texttt{IMPORTANCE}(\tilde{\pi}, \mathcal{S})$ generates $(x, \hat{Z})$ with $x \sim \mathcal{S}.q$ and $\mathbb{E}[\hat{Z} \mid x] = Z\frac{\pi(x)}{\mathcal{S}.q(x)}$. Furthermore, the unconditional expectation $\mathbb{E}[\hat{Z}(\tilde{\pi}, \mathcal{S})] = Z$.
        \item When $x \sim \pi$, \texttt{HME}($\tilde{\pi}, x, \mathcal{S}$) generates $\check{Z}$ with $\mathbb{E}[{\check{Z}}^{-1}] = Z^{-1}.$
    \end{itemize}
}

When $\mathcal{S}.q$ has a tractable marginal density, Algorithm~\hyperref[alg:alg1]{1} computes an exact importance weight. Otherwise, it calls Algorithm~\hyperref[alg:alg2]{2}, which uses the meta-inference strategy $\mathcal{S}.\mathcal{M}(x)$ to estimate $\frac{1}{\mathcal{S}.q(x)}$. The proof of Theorem 1 is by induction on the level of nesting in the strategy (see Appendix~\ref{sec:proofs}). 


\textbf{RAVI for MCMC.} When models or proposals (or both) in a Metropolis-Hastings sampler do not have tractable closed-form densities, RAVI inference strategies enable computation of MH acceptance probabilities (Algorithm~\hyperref[alg:alg5]{5}). Intuitively, to compute the usual Metropolis-Hastings acceptance probability $\alpha = \min\left(1, \frac{\tilde{\pi}(x')q(x; x')}{\tilde{\pi}(x)q(x'; x)}\right)$, Algorithm~\hyperref[alg:alg5]{5} estimates the necessary proposal densities, using $\texttt{HME}$ for the forward proposal density that appears in the denominator, and $\texttt{IMPORTANCE}$ for the backward proposal density that appears in the numerator. If necessary, it also uses $\texttt{IMPORTANCE}$ to estimate the new model density $\tilde{\pi}(x')$.
%This goes beyond typical pseudo-marginal algorithms~\citep{andrieu2010particle}, in which only the model density is intractable. MCMC algorithms that use RAVI to handle intractable proposal densities can instead be justified as \textit{pseudo-marginal ratio} algorithms~\citep{andrieu2018utility}. 
We show the algorithm implements a stationary kernel for $\pi$ in Appendix~\ref{sec:ravi-mcmc}.

% Let $\tilde{\pi}(x) = \int \tilde{\pi}(r, x) \text{d}r = Z \int \pi(r, x) \text{d}r$ be a possibly unnormalized target density, and let $q(x'; x) = \int q(s, x'; x) \text{d}s$ be a proposal kernel mapping previous state $x$ to new state $x'$. We note that (1) both $\tilde{\pi}$ and $q$ have intractable marginal densities, and (2) the target marginal $\tilde{\pi}(x)$ itself may be unnormalized. As is typical in pseudomarginal MCMC, even this unnormalized target density cannot be evaluated pointwise, due to the additional nuisance variables $r$.

% Now suppose we have an inference strategy $\mathcal{S}$ targeting $\pi(r \mid x)$, and a family of inference strategies $\mathcal{M}(x, x')$ targeting $q(s \mid x'; x)$. Let $x$ be a starting position for our Markov chain. We can run Algorithm~\hyperref[alg:alg1]{1} on $\mathcal{S}$, targeting $\pi(r \mid x)$, to obtain an initial estimate $\hat{Z}_x$ of the unnormalized marginal density $\tilde{\pi}(x)$. Then Algorithm~\hyperref[alg:mh]{5} defines a stationary MCMC kernel for the target distribution $\pi(x)$, starting at input point $x$ (see Appendix~\ref{sec:proofs} for details):
% \begin{algorithm}[h]
%     \label{alg:mh}
%     \footnotesize{
%     \SetAlgoLined\DontPrintSemicolon
%     \textbf{Algorithm 5:} RAVI Metropolis-Hastings\;
%     \KwIn{model $\tilde{\pi}(x) = Z \int \pi(r, x) \text{d}r$}
%     \KwIn{proposal $q(x'; x) = \int q(s, x'; x) \text{d}s$}
%     \KwIn{inference strategy $\mathcal{S}$ targeting $\pi(r \mid x)$}
%     \KwIn{family $\mathcal{M}(x, x')$ of inference strategies targeting $q(s \mid x'; x)$}
%     \KwIn{initial position $x$ and estimate $\hat{Z}_{x}$ of $\tilde{\pi}(x)$}
%     \KwOut{next position $x'$ and estimate $\hat{Z}_{x'}$ of $\tilde{\pi}(x')$}
%     \nl $(s, x') \sim q(s, x'; x)$\;
%     \nl $w_{x'} \gets {\texttt{HME}}(q(\cdot, x'; x), s, \mathcal{M}(x, x'))$\;
%     \nl $(\_, w_x) \gets \texttt{IMPORTANCE}(q(\cdot \mid x; x'), \mathcal{M}(x', x))$\;
%     \nl $(\_, \hat{Z}_{x'}) \gets \texttt{IMPORTANCE}(\pi(\cdot \mid x'), \mathcal{S})$\;
%     \nl $u \sim \text{Uniform}(0, 1)$\;
%     \nl \If{$u < \text{min}(1, \frac{\hat{Z}_{x'}}{\hat{Z}_x}w_{x'}w_x)$}{
%         \nl \Return{$(x', \hat{Z}_{x'})$}\;
%     }
%     \nl \Else{
%         \nl \Return{$(x, \hat{Z}_x)$}\;
%     }
%     }
% \end{algorithm}


% \begin{enumerate}
%     \item Run Algorithm 2 on the distribution $q(s, x'; x)$ and the family of strategies $\mathcal{M}(x, \cdot)$, to generate $(x', w_{x'})$.
    
%     \item Run Algorithm 1 on the strategy $\mathcal{M}(x', x)$, targeting $q(s \mid x; x')$, to generate $(\_, w_x)$.
    
%     \item Run Algorithm 1 on the strategy $\mathcal{S}$ targeting $\pi(r \mid x')$, to generate $(\_, \hat{Z}_{x'})$.
    
%     \item With probability $\min(1, \frac{\hat{Z}_{x'}}{\hat{Z}_x} w_{x'} w_x)$, accept $x'$ as the next point in the chain, with estimated target density $\hat{Z}_{x'}$. Else, reject $x'$ and return $x$.
% \end{enumerate}
% When $q$'s marginal density is known exactly, the above algorithm recovers variants of Particle-Marginal MH~\citep{andrieu2010particle}, except instead of using SMC to marginalize $r$, any RAVI algorithm can be applied. When $q$'s marginal density is unavailable, however, the algorithm instead becomes a pseudo-marginal \textit{ratio} algorithm~\citep{andrieu2018utility}, because not just $p$ but also $q$ is estimated unbiasedly. In general, it is \textit{not} valid to use arbitrary unbiased estimates of $p$ \textit{and} $q$, or even of $\alpha = \frac{p(x')q(x; x')}{p(x)q(x'; x)}$, within an MH algorithm. However, the added structure of the RAVI strategy ensures that the above procedure is sound. See appendix for details.

% To see why, consider the extended target distribution

% $$p_0(x_0) \cdot \prod_{i=1}^n p_{2\lfloor \frac{i}{2} \rfloor}(x_i \mid x_{0:i-1}) \cdot \prod_{i=0}^{m} q_{2\lfloor \frac{i}{2} \rfloor}(y_i \mid y_{0:i-1}; x_0) \cdot \prod_{i=1}^{m} q_{2\lceil \frac{i}{2} \rceil - 1}(y'_i; y'_{1:i-1}, x_0; y_0).$$

% Our algorithm can be understood as sequencing two stationary kernels for this target (which admits our target of interest, $p_0$, as a marginal). The first is a Gibbs update on the $y_i$ and $y'_i$ variables: the $y_i$ variables are generated while running Algorithm 2 on the $q$ strategy, and the $y'_i$ variables are generated while running Algorithm 1 on the $q$ strategy. As a Gibbs update, this move is accepted with probability 1. The second kernel is a Metropolis-Hastings proposal that simultaneously: (i) swaps $x_0$ with $y_0$ (the proposed update of the `main' state variable $x_0$), (ii) swaps each $y_i$ with $y'_i$, for $1 \leq i \leq m$, and (iii) proposes updated $x_{1:n}$ values by running Algorithm 1 on the $p$ strategy. The usual Metropolis-Hastings acceptance probability for this kernel, computed on the extended state space, is precisely the formula from Step (4) of the algorithm given above.

% One consequence of this justification is that the \textit{same} strategy for $q$ must be used in steps 1 and 2 of the algorithm. Furthermore, from iteration to iteration, the strategy used in Step 3 for $p$ must also be kept constant. 

%TODO: intuition for *how*.


\textbf{RAVI for Variational Inference.} Let $p_\theta(x, y)$ be a latent-variable generative model with parameters $\theta$, and let $\mathcal{S}_\theta(y)$ be a family of strategies targeting $p_\theta(x \mid y)$. Given a dataset $y$, variational inference can be applied to maximize (a lower bound on) $\log  p_\theta(y)$, and also to optimize parameters of the posterior approximations in $\mathcal{S}_\theta$, to bring them closer (in KL divergence) to their targets.
Let
\begin{align*}
\LL(p, y, \mathcal{S}) &:= \E [ \log \hat{Z}(p(\cdot, y), \mathcal{S}) ] \le \log p(y)\\ \text{and } \UU(p, y, \mathcal{S}) &:= \E [ \log \check{Z}(p(\cdot, y), \mathcal{S}) ] \ge \log p(y),
\end{align*}
% \begin{align}
% \LL_0(x_0) &:= \E [ \log \hat{Z} ] \le \log p_0(x_0)\\
% \UU_0(x_0) &:= \E [ \log \check{Z} | x_0 ] \ge \log p_0(x_0)
% \end{align}
%\begin{equation}
%\end{equation}
where $\hat{Z}(\tilde{\pi}, \mathcal{S})$ is the estimate returned by \texttt{IMPORTANCE} (Alg.~\hyperref[alg:alg1]{1}) on $\mathcal{S}$ and unnormalized target $\tilde{\pi}$,
% TODO change to \le if it's not a model strategy..
%For a
%valid model
%strategy $p_0, \ldots, p_n$, let:
and $\check{Z}(\tilde{\pi}, \mathcal{S})$ is the inverse of the weight returned from \texttt{HME} (Alg.~\hyperref[alg:alg2]{2}) when run with unnormalized target $\tilde{\pi}$, inference strategy $\mathcal{S}$, and an exact sample $x \sim \pi$. Because $\hat{Z}$ is an unbiased estimate of $p_\theta(y)$, and ${\check{Z}}^{-1}$ is an unbiased estimate of ${p_\theta(y)}^{-1}$, we have by Jensen's inequality that $\LL(p, y, \mathcal{S})$ and $\UU(p, y, \mathcal{S})$ are lower and upper bounds (respectively) on $\log p_\theta(y)$. As such, we can fit the model parameters $\theta$ to data $y$ by minimizing $\UU(p, y, \mathcal{S})$ or maximizing $\LL(p, y, \mathcal{S})$. 
% ($\log p_0(x_0) \ge \LL_0(x_0)$ follows
% from $\E [ \hat{Z} ] = p_0(x_0)$ and Jensen's inequality,  and
% $\log p_0(x_0) \le \UU_0(x_0)$ follows from
% $\E [ 1 / \check{Z} | x_0 ] = 1 / p_0(x_0)$ and Jensen's inequality). % TODO change to \ge (?) if itis not an inference strategy..

{\it Recursive stochastic gradient estimation.}
$\texttt{ELBO}\nabla$ (Alg.~\hyperref[alg:alg3]{3}) is a procedure for estimating $\LL(p, y, \mathcal{S})$ and its gradient $\grad \LL(p, y, \mathcal{S})$ with respect to the parameters $\theta$ of the model and the strategy. When $(x, y) \sim p(x, y)$, $\texttt{EUBO}\nabla$ (Alg.~\hyperref[alg:alg4]{4}) estimates $\UU(p, y, \mathcal{S})$ and the gradient $\grad\E_{y \sim p}[\UU(p, y, \mathcal{S})]$.
%Note that $p_i$ and $p_j$ for $i \ne j$ may share parameters.
These procedures employ score function estimation of gradients, but 
it is straightforward to incorporate baselines within each procedure to reduce variance.
Depending on $\mathcal{S}$, the reparametrization trick may also be applicable (Appendix~\ref{sec:reparam}). %(not shown). 


\noindent\textbf{Theorem 2.} \textit{
Given a model $p_\theta(x, y)$ and an inference strategy $\mathcal{S}_\theta$ targeting $p_\theta(x \mid y)$,
%Let $p_0, \dots, p_n$ be a
%valid model strategy and a valid inference strategy.
Alg.~\hyperref[alg:alg3]{3} yields unbiased estimates of $\LL(p, y, \mathcal{S})$ and of $\grad \LL(p, y, \mathcal{S})$.
Furthermore, when $(x, y) \sim p_\theta$, Alg.~\hyperref[alg:alg4]{4} yields
(i) $\hat{U}$ such that
$\E [ \hat{U} \mid y ] = \UU(p, y, \mathcal{S})$,
(ii) $\widehat{\grad}$ such that
$\E [ \widehat{\grad} ] = \grad \E_{y \sim p}[\UU(p, y, \mathcal{S})]$,
and (iii) a value $\mathbf{g}$ such that for any function $R$ that does not depend on $\theta$,
$\E [ \mathbf{g} \cdot R(y) ] = \grad \E_{y \sim p_\theta} [ R(y) ]$  if
$\grad \E_{y \sim p_\theta} [ R(y) ]$ is defined.}

In Section~\ref{sec:theory}, we show that the tightness of the variational bounds $\LL$ and $\UU$ is given by sums of KL divergences between posterior approximations in $\mathcal{S}_\theta$ and their targets. Thus, optimizing these bounds improves the posterior approximations, encouraging either mass-covering or mode-seeking behavior.
%\vspace{-4mm}

\section{THEORETICAL ANALYSIS}
\label{sec:theory}
%\vspace{-4mm}

We now present theorems characterizing the quality of RAVI inference: Thm.~3 concerns the variance of weights in a Monte Carlo sampler, and Thm.~4 the tightness of variational bounds. In both cases, the error is related to the divergence of each approximation in the RAVI strategy from its target posterior.

%\textbf{Notation.} Let $\mathcal{S}$ be an inference strategy targeting $\pi(x_1) = p_0(x_1 \mid x_0)$ for a latent-variable model $p_0(x_1, x_0)$. We write $p_1(x_1; x_0)$ for $\mathcal{S}.q(x_1)$, and $p_i(x_i; x_{0:i-1})$ for $\mathcal{S}.\mathcal{M}(x_1).\mathcal{M}(x_2).\cdots.\mathcal{M}(x_{i-1}).q(x_i)$. Then each $p_i(x_i; x_{0:i-1}) = \int p_i(x_{i+1}, x_i; x_{0:i-1})\text{d}x_{i+1}$ is an approximation of the previous layer's posterior $p_{i-1}(x_i \mid x_{i-1}; x_{0:i-2})$, as depicted in Figure~\ref{fig:nested}. 

\textbf{Sampler variance in Monte Carlo.}
Let $\tilde{\pi} = Z\pi$ be an unnormalized target density, and $\mathcal{S}$ an inference strategy targeting $\pi$. As in Section~\ref{sec:inference-algs}, we write $\hat{Z}(\tilde{\pi}, \mathcal{S})$ for the weight returned by \texttt{IMPORTANCE}, and $\text{Var}_{\hat{Z}}(\pi, \mathcal{S})$ for the \textit{relative variance} of the estimator, $\text{Var}(\frac{\hat{Z}(\tilde{\pi}, \mathcal{S})}{Z})$, which does not depend on $Z$ (and therefore is a function of $\pi$, not $\tilde{\pi}$). Similarly, we write $\check{Z}(\tilde{\pi}, \mathcal{S})$ for the reciprocal of the weight returned by \texttt{HME}, run with an input $x \sim \pi$. $\text{Var}_{\check{Z}}(\pi, \mathcal{S})$ is its relative variance, $\text{Var}(\frac{Z}{\check{Z}(\tilde{\pi}, \mathcal{S})})$. 

\noindent\textbf{Theorem 3.} {\it Consider an unnormalized target distribution $\tilde{\pi}(x) = Z\pi(x)$ and an inference strategy $\mathcal{S}$ targeting $\pi(x)$. Then the relative variances of the estimators $\hat{Z}(\tilde{\pi}, \mathcal{S})$ and $\check{Z}(\tilde{\pi}, \mathcal{S})$ are given by the following recursive equations:}
\begin{align*}
\text{Var}_{\hat{Z}}&(\pi, \mathcal{S}) = \chi^2(\pi || \mathcal{S}.q) \, +\\
 & \mathbb{E}_{x \sim \mathcal{S}.q}\left[\left(\frac{\pi(x)^2}{\mathcal{S}.q(x)^2}\right) \cdot \text{Var}_{\check{Z}}(\mathcal{S}.q(\cdot \mid x), \mathcal{S}.\mathcal{M}(x))\right]\\
\text{Var}_{\check{Z}}&(\pi, \mathcal{S}) = \chi^2(\mathcal{S}.q || \pi) + \\
& \mathbb{E}_{x \sim \pi}\left[\left(\frac{\mathcal{S}.q(x)^2}{\pi(x)^2}\right) \cdot \text{Var}_{\hat{Z}}(\mathcal{S}.q(\cdot \mid x), \mathcal{S}.\mathcal{M}(x))\right]
%
%\text{Var}\left(\frac{\hat{Z}}{Z}\right) = \sum_{i=0}^{n-1} \mathbb{E}%_{x_0, \dots, x_{i-1}}
%\Big[%
%\frac{\hat{Z}_i^2}{Z^2} \cdot \chi^2(&p_{2\lceil \frac{i}{2}\rceil}(x_{i+1} \mid x_{i}, \dots, x_0) \,\, ||\\
%&p_{2\lfloor\frac{i}{2}\rfloor + 1}(x_{i+1} \mid x_i, \dots, x_0))
%w_i^2 \cdot d_i%
%\Big],
\end{align*}
{\it When $\mathcal{S}.q$ is tractable, the second term of each sum is 0.}

% Note that at each layer of the strategy, the direction of the $\chi^2$ divergence changes.

\textbf{Tightness of variational bounds.}
In VI, the tightness of the variational bounds $\mathcal{L}$ and $\mathcal{U}$ can be characterized as a sum of a KL divergence and a term measuring meta-inference error. 
%Let $\mathcal{L}(p, y, \mathcal{S})$ be the objective optimized by Algorithm 3 given 
%target model $p(x, y)$, observed data $y$, and inference strategy $\mathcal{S}$ targeting $p(x \mid y)$\textemdash and similarly for $\mathcal{U}(p,y,\mathcal{S})$ and Algorithm 4. 
The random variables $\hat{\mathcal{L}}$ and $\hat{\mathcal{U}}$ returned by $\texttt{ELBO}\grad$ and $\texttt{EUBO}\grad$, respectively, are unbiased estimators of $\mathcal{L}(p, y, \mathcal{S})$ and $\mathcal{U}(p, y, \mathcal{S})$, and so can also be viewed as
 \textit{biased} estimators of $\log p(y)$. Writing their bias as $\text{Bias}_\mathcal{L}(p, y, \mathcal{S})$ (and similarly for $\mathcal{U}$), we have:

\noindent\textbf{Theorem 4.}
{\it Consider a joint distribution $p(x, y)$ and an inference strategy $\mathcal{S}$ targeting $p(x \mid y)$. Then the following equations give the bias of ${\hat{\LL}}$ and ${\hat{\UU}}$ as estimators of $\log p(y)$:}
\begin{align*}
\text{Bias}_\mathcal{L}(p, y, \mathcal{S}) =&\, -\text{KL}(\mathcal{S}.q || p(\cdot \mid y)) \\ 
&-\mathbb{E}_{x \sim \mathcal{S}.q}[\text{Bias}_\mathcal{U}(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))]\\
\text{Bias}_\mathcal{U}(p, y, \mathcal{S}) =&\, \text{KL}(p(\cdot \mid y) || \mathcal{S}.q)\\ 
&\,\,\,-\mathbb{E}_{x \sim p(\cdot \mid y)}[\text{Bias}_\mathcal{L}(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))]
\end{align*}
{\it where the second term in each equation is 0 when $\mathcal{S}.q$ has a tractable marginal density.}


Maximizing $\LL$, or minimizing $\UU$, also minimizes these KL divergences. In particular, maximizing $\LL(p, y, \mathcal{S})$ minimizes a `mode-seeking' KL from $\mathcal{S}.q$ to the posterior, whereas minimizing $\mathbb{E}_{y \sim p}[\UU(p, y, \mathcal{S})]$, e.g., by following the gradients of Alg.~\hyperref[alg:alg4]{4}, implements amortized variational inference, and encourages $\mathcal{S}.q$ to cover the mass of the posterior.

\textbf{Inference and Meta-Inference.} In both Theorems 3 and 4, the first term of the sum is a divergence between $\mathcal{S}.q(x)$, the intractable posterior approximation, and the actual target posterior $p(x \mid y)$. The other term measures the \textit{expected} quality of meta-inference. Thus the overall error of a RAVI algorithm can be understood as decomposing cleanly into (1) the mismatch between the posterior and the intractable proposal, and (2) the error introduced by meta-inference. 

% In Appendix~\ref{sec:appendix-examples}, we show that a wide variety of existing Monte Carlo and variational inference algorithms can be understood as applications of our Algorithms 1-4 to particular inference strategies\textemdash sometimes up to five layers deep. Our theorems provide a novel approach to analyzing these algorithms, allowing us to \textit{blame} different approximations in their corresponding inference strategies for high variance or loose variational bounds. We can also use such an analysis to improve the algorithms in question, by swapping out layers of the strategy for better posterior approximations. The next section gives implemented examples that demonstrate this process. 


% \section{RAVI AS A UNIFYING FRAMEWORK}
% \label{sec:unifying}
% \begin{table*}[]
% \centering
% \begin{tabular}{|lc|}
% \hline
% \textbf{Inference Method}                  & \textbf{RAVI Inference Strategy} \\
% \hline
% $N$-particle importance sampling~\citep{glynn1989importance} &    \ref{sec:sir-example}               \\
% Importance-weighted autoencoders~\citep{burda2015importance} &   \ref{sec:iwae-example}                 \\
% $N$-particle SMC~\citep{chopin2020introduction}  &   \ref{sec:smc-example}                 \\
% Variational SMC~\citep{naesseth2018variational} & \ref{sec:vsmc-example}\\
% Annealed importance sampling~\citep{neal2001annealed}  &  \ref{sec:ais-example} \\
% Nested SMC~\citep{naesseth2015nested} & \ref{sec:nsmc-example} \\
% SMC$^2$~\citep{chopin2013smc2} & \ref{sec:smcsq-example} \\
% Amortized Rejection Sampling~\citep{naderiparizi2019amortized} & \ref{sec:amrej-example} \\
% Hamiltonian Variational Inference~\citep{salimans2015markov} & \ref{sec:ham-example} \\
% Antithetic Sampling~\citep{pmlr-v97-ren19b} & \ref{sec:antithetic-example} \\
% \hline
% \end{tabular}
% \caption{Examples of Monte Carlo and variational inference algorithms that can be understood as standard importance sampling or variational optimization, but on an extended state space. RAVI can represent these algorithms in a more structured, unified framework, and help to understand the design tradeoffs they strike. It can also be used to identify possible improvements, extensions, and combinations.}
% \label{tab:examples}
% %\vspace{-5mm}
% \end{table*}

% It turns out that many existing inference algorithms can be viewed as standard Monte Carlo or variational inference, but instantiated with particular RAVI inference strategies rather than with tractable proposal distributions (see Table~\ref{tab:examples}). This unifying perspective yields:

% \begin{enumerate}
%     \item novel arguments for the unbiasedness and consistency of the likelihood estimates produced by these algorithms, or of the fact that the variational objectives they define are lower or upper bounds on $\log p(x)$;
    
%     \item novel characterizations of the variance of these algorithms (or the tightness of the variational bounds) in terms of inference and meta-inference, as described in Section~\ref{sec:theory}; and
    
%     \item novel improvements to Monte Carlo and VI algorithms that increase accuracy / reduce variance.
% \end{enumerate}

% It can also help to clarify the design trade-offs that underlie many auxiliary-variable methods, between improving the quality of a posterior approximation and ensuring that meta-inference is still tractable.

% We spell out many examples in Appendix~\ref{sec:appendix-examples}; here, we elaborate on the IWAE/SIR case mentioned in Section~\ref{sec:overview}, giving examples of insights RAVI provides.

% \textbf{IWAE and SIR.} It has been noted several times in the literature that IWAE~\citep{burda2015importance} corresponds to ``ordinary'' variational inference with SIR as the variational family. However, \citet{cremer2017reinterpreting} have challenged this interpretation, arguing that directly using SIR as a variational family would yield a distinct variational objective. The RAVI perspective is that IWAE corresponds to variational inference with a particular \textit{inference strategy} $\mathcal{S}$: $\mathcal{S}.q$ is the SIR distribution, and meta-inference $\mathcal{S}.\mathcal{M}$ attempts to infer the values of the (auxiliary) particles that were sampled but \textit{not} selected by the SIR procedure. When $\mathcal{S}.\mathcal{M}$ is set optimally, to exactly recover the posterior of $\mathcal{S}.q$, RAVI yields the variational objective discussed by~\citet{cremer2017reinterpreting}. However, when it is set to use a simpler approach (generating the un-chosen particles from the prior), the IWAE objective falls out. In the next section, we show that it is possible to learn meta-inference that better approximates the ideal but intractable meta-posterior.

%\textit{Corollary to Theorem 1.} Let $\pi$ be a target distribution and let $Q_N$ represent the marginal distribution of $N$-particle SIR targeting $\pi$. Then $\chi^2(\pi || Q_N)$ approaches $0$ as $N$ approaches $\infty$. 


%\vspace{-3mm}
\section{EXPERIMENTS}
\label{sec:examples}
%\vspace{-3mm}
% In this section we illustrate the design of RAVI inference strategies with two examples:
% \begin{itemize}
% \item First, we apply RAVI's
% theory to analyze~\citet{salimans2015markov}'s Markov Chain 
% Variational Inference (MCVI) algorithm, and identify a flaw 
% that prevents it from scaling to Markov chains longer than 
% 20-30 steps. We use 
% RAVI to diagnose the problem and design an improved variant 
% that does scale. % TODO: mention other applications?
% RAVI also yields a Monte Carlo variant of the algorithm, which we compare to Annealed Importance 
% Sampling~\citep{neal2001annealed} as a method of obtaining marginal likelihood estimates 
% from MCMC.
% \item First, we use RAVI to design a
% novel bottom-up weighted sampler for 
% Dirichlet Process mixture models, based on randomized 
% agglomerative clustering, and compare it to strong 
% baselines on two real-world inference problems:
% a standard benchmark dataset of galaxy velocities~\citep{drinkwater2004large,cusumano2017aide},
% and identifying typos in 1k
% Medicare hospital records~\citep{lew2021pclean,MedicareHosp}.
% \end{itemize}

\begin{figure}
    \begin{minipage}[t]{0.48\linewidth}
        \scriptsize{$\mathcal{S}_{\texttt{agglom}}.q$: \textbf{Randomized Clustering}}
        \includegraphics[width=\linewidth]{figs/agglomerative_clustering_illustration.pdf}
    \end{minipage}%
    \hfill%
    \begin{minipage}[t]{0.48\linewidth}
        \scriptsize{$\mathcal{S}_{\texttt{rmcvi}}.q$: \textbf{Langevin Monte Carlo}}
        \includegraphics[width=\linewidth]{figs/ula-illustration.pdf}
    \end{minipage}
    \scriptsize{$$\mathcal{S}_\texttt{agglom}.q(\Pi) = \sum_{\mathit{mv}_1} \dots \sum_{\mathit{mv}_{N-|\Pi|}} \prod_{i=1}^{N-|\Pi|}q(\mathit{mv}_i \mid \mathit{mv}_{<i})$$}
    \vspace{-3mm}
    \scriptsize{$$\mathcal{S}_\texttt{rmcvi}.q(x) = \int_{\mathbb{R}^M} q_0(x_0) \left(\prod_{i=1}^{M-1} q(x_i \mid x_{i-1})\right) q(x \mid x_{M-1}) \text{d} x_{0:M-1}$$} 

    \caption{Illustrations of the proposals $\mathcal{S}.q$ used in each experiment.
    In each case, $\mathcal{S}.q$ makes a sequence of auxiliary choices before returning a final proposal (the clustering $\Pi$, or the location $x$). Sequential Monte Carlo meta-inference is used to marginalize the sequence of auxiliary variables introduced 
    by the inference process (the merges $\textit{mv}_i$ in $\texttt{agglom}$, and the locations $x_{i}$ in \texttt{rmcvi}).}
\end{figure}


\begin{table}[t]
    \footnotesize{
    \begin{tabular}{p{0.05\textwidth}p{0.07\textwidth}p{0.1\textwidth}p{0.18\textwidth}}
                              & {\bf Inference}              &{\bf Meta-inference}                             &{\bf Meta-meta-inference}  \\
    {\bf\texttt{agglom}} & \textit{Discrete}: $3.0 \times 10^{1928}$ & \textit{Discrete}: $\prod_{n=|\Pi|}^{1000} \frac{n(n-1)}{2}$ & \textit{Discrete}: $(K - 1) \cdot (\prod_{n=|\Pi|}^{1000} \frac{n(n-1)}{2}) \cdot ({1000-|\Pi|})(K-1)!$ \\
    {\bf\texttt{rmcvi}}  & \textit{Continuous}: 1 & \textit{Continuous}: $M$ &\textit{Continuous}: $(K - 1) \cdot M$, \textit{Discrete}: $M(K-1)!$
    \end{tabular}
    }
    \caption{Dimensionality of the continuous latent space, and cardinality of the discrete latent space, 
    over which each layer's inference problem is defined. $K$ is the number of SMC particles used for meta-inference
    (maximum 50 for $\texttt{rmcvi}$, 5 for $\texttt{agglom}$).
    In $\texttt{rmcvi}$, $M$ is the number of MCMC steps (maximum 100 in our experiments).}
    \label{tab:dimension}
\end{table}

\subsection{Improving MCVI}
%\vspace{-3mm}
In Section~\ref{sec:overview}, we developed a 
variant of~\citet{salimans2015markov}'s MCVI algorithm 
that used SMC for meta-inference.
In Figure~\ref{fig:mcvi-experiment},
we compare vanilla MCVI to the RAVI variant,
with varying $K$ (number of particles used for 
meta-inference) and $M$ (number of MCMC steps 
in the variational family).

\textbf{Experimental details.}\footnote{Code is available at \url{https://github.com/probcomp/ravi-uai-2022}.} For the MCMC kernel $T$, we use Langevin 
ascent with step size $0.015$. For the 
meta-inference proposals $R_i(x_{i+1} \rightarrow x_i)$,
we use $\mathcal{N}(x_{i}; f_\mu(x_{i+1}, i), e^{f_{\log \sigma}(x_{i+1}, i)})$, where $f$ is a 4-layer MLP, the step number $i$ 
is encoded as a one-hot vector, and $f$ outputs the mean $\mu$ and log standard deviation $\log\sigma$ for a conditionally Gaussian proposal.
The same $f$ is used for each experiment, and is trained on 
forward rollouts of MCMC (equivalent to using Alg.~\hyperref[alg:alg3]{3} on $\texttt{rmcvi}$ with $K=1$). The unimodal model is Gaussian with 
$\sigma=0.2$, and the multimodal model is a mixture 
of 3 Gaussians with standard deviations $0.2, 0.3,$ and $2.0$. 
The distributions $q_i$ used for importance weighting during sequential Monte Carlo meta-inference are Gaussians with learned $\mu$ and $\sigma$.

\textbf{Results.} Figure~\ref{fig:mcvi-experiment} plots the gap $\log p(y) - \mathcal{L}$ for each algorithm's variational bound $\mathcal{L}$. By Theorem~4, this gap is the sum of two terms: $\text{KL}(\mathcal{S}.q || p(x \mid y))$ and the expected meta-inference divergence $\mathbb{E}_{x \sim \mathcal{S}.q}[\text{KL}(\mathcal{S}.q(x_{0:M} \mid x) || \mathcal{S}.\mathcal{M}(x).q(x_{0:M}))]$. The first term is constant across the algorithms, since they all use the same MCMC-based posterior approximation, so the plots primarily illustrate differences in the quality of meta-inference. MCVI's meta-inference steadily worsens as the chain's length grows, and after 15--25 steps, the meta-inference cost of adding new steps outweighs the benefits to $\mathcal{S}_\texttt{mcvi}.q$, causing the bound $\mathcal{L}$ to loosen. Our variant, with SMC meta-inference, does not suffer the same penalty, and continues to improve as more steps are added. As discussed in Section~\ref{sec:overview}, the same inference strategy (\hyperref[infstrat:mcvi]{\texttt{rmcvi}}) can be used within an importance sampler to derive unbiased marginal likelihood estimates from MCMC runs. The right-hand plot in Figure~\ref{fig:mcvi-experiment} shows that this technique can yield accurate estimates with less computation than AIS~\citep{neal2001annealed}, at least on simple targets. (To fairly account for the computational cost of meta-inference, in the RAVI algorithm we multiply $M$ by $K$ when plotting the total number of MCMC steps.) 
%Unlike AIS, the RAVI algorithm does not require the design of intermediate or annealed target distributions:
%$\mathcal{S}.q$ runs an ordinary MCMC chain targeting the posterior. We see that the RAVI algorithm can converge with significantly shorter chains than AIS. 
Because the variance of AIS is bounded below by sums of divergences between subsequent pairs of intermediate target distributions, the MCMC chain must be long enough to support a very fine annealing schedule, without large jumps. By contrast, RAVI-MCVI requires 
only that the marginal distribution of the chain be a good approximation to the posterior, and that SMC meta-inference be sufficiently accurate. For some problems, this may be less expensive than the long chain required by AIS.
%\vspace{-7mm}
% In this section we apply RAVI to analyze and improve
% an existing algorithm, as well as to design two new 
% RAVI 

% We begin in Section~\ref{sec:mcmc-vi} 
% by revisiting \citet{salimans2015markov}'s 
% Markov Chain Variational Inference (MCVI) algorithm.
% We apply RAVI's Theorem 4 to predict that the bound 
% optimized by MCVI will \textit{become looser} as 
% additional MCMC 

% This section demonstrates the application of RAVI to: 
% (1) analyze and improve an existing algorithm (Markov Chain 
% Variational Inference), and (2) design a new algorithm 
% from scratch (Bayesian Agglomerative Clustering). 
%How can the RAVI framework and theorems be applied? In Section~\ref{sec:mcmc-experiment}, we study the long-standing problems of employing MCMC  within variational families~\citep{salimans2015markov} and importance samplers~\citep{neal2001annealed}. 

% \subsection{Analyzing and Improving Markov Chain Variational Inference}
% \label{sec:mcmc-experiment}


% We first apply RAVI to analyze and improve Markov Chain Variational 
% Inference (MCVI)~\citep{salimans2015markov}, which incorporates one
% or more MCMC steps into a variational family to improve the approximation's accuracy. \citet{salimans2015markov} argue informally that 
% MCVI lets practitioners trade additional computation 
% for arbitrarily accurate VI, by incorporating arbitrarily long MCMC
% chains into their variational families.
% However, using Theorem 4, we show 
% that the bound optimized by MCVI will eventually become \textit{looser},
% not tighter, as the length of the Markov chain grows, a phenomenon 
% we demonstrate empirically on toy targets.
% We apply RAVI both to diagnose the problem, and to design 
% an improved version of MCVI that does scale to long chains.
% The resulting RAVI inference strategy can also be used with Algorthm 1
% to obtain unbiased marginal likelihood estimates from MCMC runs, and 
% with AIDE~\citep{cusumano2017aide} to assess the 
% convergence of finite-time MCMC chains.

% % Markov Chain Variational Inference (MCVI)~\citep{salimans2015markov} 
% % is a method for performing variational inference with approximating 
% % families that incorporate one or more MCMC steps. 
% % By adding MCMC steps, practitioners can in principle make their 
% % variational families arbitrarily accurate, at the cost of additional computation.
% % %Its aim is 
% % %to help VI practitioners trade computation for accuracy, 
% % %by lengthening the MCMC chain as desired. 
% % However, RAVI's theorems predict\textemdash and we confirm empirically%
% % \textemdash that the bound optimized by MCVI will eventually become \textit{looser}, not tighter, as
% % the length of the user's Markov chain grows.\footnote{Indeed, although 
% % \citet{salimans2015markov} present a general method for any 
% % length of chain, they report empirical results only for 
% % variational families that run $M=1$ step of MCMC.} We apply RAVI both to diagnose the problem, and to design an improved 
% % version of MCVI that can scale to long chains.
% % In RAVI's vocabulary, the problem is that
% % although $\mathcal{S}.q$ becomes more accurate with more MCMC, 
% % meta-inference error accumulates, outpacing and counteracting $\mathcal{S}.q$'s improvements.

% \textbf{RAVI analysis of MCVI.} To apply RAVI to analyze MCVI,
% we first cast MCVI as an instance of RAVI variational inference 
% (Alg. 3), with a particular inference strategy $\mathcal{S}$.
% %First, we note that MCVI is equivalent to RAVI variational 
% %inference (Alg. 3) with a particular inference strategy $\mathcal{S}$. 
% The intractable variational family $$\mathcal{S}.q(x) = \int q_0(x_0) \prod_{i=1}^{M} T(x_{i-1}\rightarrow x_{i}) \delta_{x_M}(x) \text{d}x_{0:M}$$ initializes a Markov chain from a proposal $q_0$ and iterates $M$ steps of a Markov kernel $T$, finally returning $x_M$. 
% For meta-inference, MCVI employs sequential importance sampling 
% with learned backward proposals $r_i$: $$\mathcal{S}.\mathcal{M}(x).q(x_{0:M}) = \delta_{x}(x_M) \prod_{i=0}^{M-1} r_i(x_i; x_{i+1}).$$
% % which must infer the chain's history $x_{0:M-1}$ from its final state $x_M$, is sequential importance sampling (SIS), with learned ``reverse'' proposals $r_i(x_{i-1}; x_i)$. 
% Unfortunately, unless the proposals $r_i$
% exactly match their target distributions, sequential importance sampling suffers a \textit{degeneracy} problem: as $M$ grows, so does the expected variance of SIS~\citep[Proposition 1]{doucet2000sequential}.
% Furthermore, the improvements to $\mathcal{S}.q$ as more MCMC steps are 
% added diminish over time, as the chain converges to its target. 
% Thus, by Theorem 4, after a point\textemdash when the 
% incremental meta-inference error 
% outstrips the incremental benefit to $\mathcal{S}.q$\textemdash 
% the auxiliary-variable ELBO will \textit{decrease} (i.e., become \textit{less tight}) as more steps are added to the chain. In Fig.~\ref{fig:mcvi-experiment}, we demonstrate this empirically on two toy target distributions, one unimodal and one multimodal. For the MCMC transition kernel $T$, we use unadjusted Langevin ascent (ULA), and for the proposals $r_i$, we use mixtures of Gaussians parameterized by neural 
% networks ($13,425 + 10M$ parameters). 
% As the quality of the variational approximation improves, we initially see
% the bound become tighter, but after $M=25$ steps or so, it begins to loosen,
% and continues to loosen as $M$ grows.

% TODO: figure showing K=1 with three different meta-inference algorithms 
% and two different targets. 

% \textbf{Improving MCVI with Recursive Meta-Inference.} 
% To resolve this issue, we need a meta-inference algorithm $\mathcal{S}.\mathcal{M}$ that scales to infer longer Markov chain trajectories. 
% One such algorithm is SMC~\citep{chopin2020introduction},
% which is often applied in state-space models to overcome 
% the degeneracy problem. %requires $O(M)$ particles to infer a trajectory of length $M$.
% We introduce a neural network $r(x_i; x_{i+1}, i)$ to propose previous locations of the chain, given the current location and a one-hot encoding of the current time step.
% We consider the following RAVI inference strategy:
% \begin{itemize}
%     \item $\mathcal{S}.q$: as in MCVI. Initialize $x_0$ from a Gaussian $q_0$, and iterate $M$ steps of an MCMC kernel $T$ to generate $x_{1:M}$. Return $x_M$.
%     \item $\mathcal{S}.\mathcal{M}(x_M).q$: given $x_M$, run a $K$-particle sequential Monte Carlo algorithm targeting the sequence of unnormalized posteriors $\tilde{\pi}_i(x_{i:M-1}) = q_i(x_i) \prod_{j=i}^{M-1} T(x_{j} \rightarrow x_{j+1})$ for $i=M-1, \dots, 0$, where $q_i$ is a learned parametric approximation to the marginal of the chain at  step $i$. An MLP $r(x_i; x_{i+1}, i)$ is used to make proposals backward in time. Given SMC's final particle collection, we resample a single trajectory to return as $x_{0:M-1}$.
%     \item $\mathcal{S}.\mathcal{M}(x_M).\mathcal{M}(x_{0:M-1}).q$: run a $K$-particle \textit{conditional} SMC algorithm~\citep{andrieu2010particle}, with $x_{0:M-1}$ as the retained particle, to infer SMC's ancestor variables and the values of all unresampled particles.
% \end{itemize} 

% The third layer is necessary because the SMC algorithm itself introduces auxiliary variables that cannot be exactly marginalized. Figure~\ref{fig:mcvi-experiment} shows that by setting $K$ sufficiently high, we can ensure that the variational bound $\mathcal{L}$ continues to improve as the chain's length increases.

% \textbf{Marginal likelihood estimates for MCMC.} 
% By running Algorithm 1 on the above inference strategy,
% we obtain a novel method for estimating marginal likelihoods $p(y)$ using MCMC chains. Unlike annealed 
% importance sampling (AIS)~\citep{neal2001annealed}, 
% our algorithm does not require the design of intermediate or annealed target distributions;
% $\mathcal{S}.q$ runs an ordinary MCMC chain targeting the posterior. Figure~\ref{fig:mcvi-experiment} shows 
% that our algorithm can converge with significantly 
% shorter chains than AIS. This is because the error of 
% AIS is bounded below by the sums of divergences between subsequent pairs of intermediate target distributions; no matter how difficult or easy the 
% inference problem, the MCMC chain must be long enough 
% to support a very fine annealing schedule, without 
% large jumps. By contrast, our variant of MCVI requires 
% only that the marginal distribution of the last iterate of the chain be a good approximation to the posterior, and that our learned meta-inference is sufficiently accurate.

\begin{figure*}[t]
    \includegraphics[width=0.33\linewidth]{figs/unimodal-mcvi.pdf}
    \includegraphics[width=0.33\linewidth]{figs/multimodal-mcvi.pdf}
    %\includegraphics[width=0.24\linewidth]{figs/metainf-compare.pdf}
    \includegraphics[width=0.33\linewidth]{figs/Unimodal-AIS-comp.pdf}
    \caption{Improving Markov Chain Variational Inference with RAVI. \textbf{Left and Middle:} On unimodal and multimodal targets, MCVI begins to degrade after 15--25 steps of MCMC. RAVI-MCVI with sufficiently many particles continues to improve as more MCMC steps are added. 
    %\textbf{Middle Right:} RAVI-MCVI's improvements are due to its more accurate meta-inference. In the top plot, gray lines show 300 trajectories of MCMC on a 1D multimodal target, initialized from a broad Gaussian prior. Red lines show simulated \textit{backward} trajectories, generated by MCVI meta-inference: an RNN trained on the forward data. Green lines show RAVI-MCVI meta-inference: sequential Monte Carlo with the RNN as a proposal. SMC helps correct the RNN's mistakes, generating a more accurate posterior distribution over backward traces. 
    \textbf{Right:} When MCMC converges quickly to a reasonable approximation of the posterior, RAVI-MCVI can give more accurate estimates of marginal likelihoods than standard techniques such as AIS. The $x$ axis of this plot counts total MCMC steps simulated, whether as part of inference or meta-inference; for RAVI-MCVI($K$), this is $KM$, where $M$ is the length of the forward Markov chain and $K$ is the number of SMC particles used for meta-inference.}
    \label{fig:mcvi-experiment}
    \vspace{-5mm}
\end{figure*}

\subsection{Agglomerative Clustering for Dirichlet Process Mixtures}
A promising application of RAVI is to transform heuristic randomized algorithms into 
unbiased and consistent Monte Carlo estimators, by using them as proposal distributions. In this section, we design a RAVI inference strategy for clustering in Dirichlet process mixtures, based on a randomized agglomerative clustering algorithm (Inference Strategy~\hyperref[infstrat:agglom]{2}).
\begin{algorithm}[t]
    \label{infstrat:agglom}
    \SetAlgoLined\DontPrintSemicolon
    \scriptsize{
    \textbf{RAVI Inference Strategy 2:} Agglomerative Clustering\;
    \SetKwFunction{agglom}{agglom($X, K$).q}\SetKwFunction{agglomm}{agglom($X, K$).M($\Pi$).q}
    \SetKwProg{infalg}{Posterior Approx.}{}{}
    \SetKwInOut{Infers}{Target of inference}
    \SetKwInOut{Aux}{Auxiliary variables}
    \infalg{\agglom{}}{
    \Infers{partition $\Pi$ of dataset $X$}
    \Aux{merge sequence $\textit{mv}_{1:|X|-|\Pi|}$}
    \nl $\Pi \gets \{\{x\} \mid x \in X\}$ \tcp*{Initial partition}
    \nl \For{$l \in 1, \dots, |X|$}{
        \nl \For{\text{unordered pair }$\{i, j\}$\text{ of clusters in }$\Pi$}{
            \nl $w_{\{i,j\}} \gets \pi((\Pi \setminus \{i, j\}) \cup \{i \cup j\})$\;
        }
        \nl $w_{\text{stop}} = \pi(\Pi)$\;
        \nl $\textit{All} \gets \{\text{stop}\} \cup \{\{i, j\} \mid i, j \in \Pi\}$\;
        \nl $\textit{mv}_l \sim \text{Discrete}(\{m \Rightarrow w_m \mid m \in \textit{All}\})$ \;
        \nl \lIf{$\textit{mv}_l = \text{stop}$}{$\textbf{break}$}
        \nl $\Pi \gets (\Pi \setminus \textit{mv}_l) \cup (\cup \textit{mv}_l)$\tcp*{Perform the merge}
    }
    \nl \Return{$\Pi$}\;}{}
    \setcounter{AlgoLine}{0}
    \SetKwProg{metaalg}{Meta-Posterior Approx.}{}{}
    \metaalg{\agglomm{}}{
    \Infers{merge sequence $\textit{mv}_{1:|X|-|\Pi|}$}
    \Aux{particles $\textit{mv}_{1:|X|-|\Pi|}^{1:K}$, ancestors $a_{1:|X|-|\Pi|}^{1:K}$}
    \nl \lFor{$k \in 1,\dots, K$}{$\Pi_0^k, \textit{tr}_k  \gets \{\{x\} \mid x \in X\}, []$}
    \nl \For{$l \in 1, \dots, |X|-|\Pi|$}{
        \nl \For{$k \in 1,\dots,K$}{
            \nl \For{\text{unordered pair }$\{i, j\}$\text{ in }$\Pi_{l-1}^k$}{
            \nl $w_{\{i,j\}} \gets \pi((\Pi_{l-1}^k \setminus \{i, j\}) \cup \{i \cup j\})$\;
        }
        \nl $w_{\text{stop}} = \pi(\Pi_{l-1}^k)$\;
        \nl $\textit{All} \gets \{\text{stop}\} \cup \{\{i, j\} \mid i, j \in \Pi_{l-1}^k\}$\;
        \nl $\textit{Ok} \gets \{\{i, j\} \in \textit{All} \mid \exists c \in \Pi. i \cup j \subseteq c\}$\;
        \nl $\textit{mv}_l \sim \text{Discrete}(\{m \Rightarrow w_m \mid m \in \textit{Ok}\})$ \;
        \nl $\Pi_{l-1}^k \gets (\Pi_{l-1}^k \setminus \textit{mv}_l) \cup (\cup \textit{mv}_l)$\;
        \nl $\textit{tr}_k \gets [\textit{tr}_k\dots, \textit{mv}_l]$\;
        \nl $W_l^k \gets \frac{\sum_{m \in \textit{Ok}} w_m}{\sum_{m \in \textit{All}} w_m}$\;
        }

        \nl \For{$k \in 1,\dots,K$}{
            \nl $a_l^k \sim \text{Discrete}(W_l^{1:K})$ 
            \tcp*{resampling step}
            \nl $\Pi_l^k, \textit{tr}_k \gets \Pi_{l-1}^{a_l^k}, \textit{tr}_{a_l^k}$\;
        }
    }
    \nl \Return{$[\textit{tr}_1\dots, \text{stop}]$}}
    \SetKwProg{mmetaalg}{Meta-Meta-Posterior }{}{}
    \SetKwFunction{agglommm}{agglom($X, K$).M($\Pi$).M($\textit{mv}_{1:|X|-|\Pi|}$).q}
    \mmetaalg{\agglommm{}}{
        \tcp{Conditional SMC (omitted for space, but similar to that of \texttt{rmcvi})}
    }
}
\vspace{-5mm}
\end{algorithm} 
% \begin{algorithm}[t]
%     \label{infalg:ensemble}
%     \SetAlgoLined\DontPrintSemicolon
%     \textbf{RAVI Inference Strategy 2:} RAVI ensemble\;
%     \SetKwFunction{ensemble}{ensemble($\mathcal{S}_{1:N}, \tilde{\pi}$).q}\SetKwFunction{ensemblem}{ensemble($\mathcal{S}_{1:N}, \tilde{\pi}$).M($x$).q}
%     \SetKwProg{infalg}{Inference}{}{}
%     \infalg{\ensemble{}}{
%     \nl \For{$i \in 1, \dots, N$}{
%         \nl $(x_i, w_i) \sim \text{Alg1}(\mathcal{S}_i, \tilde{\pi})$ (with aux. vars $r_i^i$)\;
%         \nl $w_i \gets \frac{1}{w_i}$\;
%         \nl \For{$j \neq i$}{
%             \nl $w_i^j \sim \text{Alg2}(\mathcal{S}_j, \tilde{\pi}, x_i)$ (with aux. vars $r_i^j$)\;
%             \nl $w_i \gets w_i + w_i^j$\;
%         }
%         \nl $w_i \gets \frac{N}{w_i}$  \tcp*{is an estimate of $\frac{\tilde\pi(x_i)}{\frac{1}{N}\sum_j \mathcal{S}_j.q(x_i)}$}
%     }
%     \nl $j \sim \text{Discrete}(w_{1:N})$\;
%     \nl \Return{$x_j$}\;}{}
%     \setcounter{AlgoLine}{0}
%     \SetKwProg{metaalg}{Meta-Inference}{}{}
%     \metaalg{\ensemblem{}}{
%     \nl \For{$i \in 1, \dots, N$}{
%         \nl $w_i \sim \text{Alg2}(\mathcal{S}_i, \tilde{\pi}, x)$ (with aux. vars $r_*^i$)\;
%     }
%     \nl $j \sim \text{Discrete}(w_{1:N})$\;
%     \nl $x_j \gets x$\;
%     \nl \For{$i \in 1, \dots, N$}{
%         \nl $r_j^i \gets r_*^i$\;
%         \nl \If{$j \neq i$}{
%             \nl $(x_i, \_) \sim \text{Alg1}(\mathcal{S}_i, \tilde{\pi})$ (with aux. vars $r_i^i$)\;
%         }
%         \nl \For{$k \not\in \{i,j\}$}{
%             \nl $\_ \sim \text{Alg2}(\mathcal{S}_k, \tilde{\pi},  x_i)$ (with aux. vars $r_i^k$)\;
%         }
%     }
%     \nl \Return{$(x_{1:N}, r_{1:N}^{1:N}, j)$}}
% \end{algorithm} 
 

\textbf{Datasets and Models.} We test our algorithm on three clustering problems. The first is a synthetic 1D dataset sampled from a Dirichlet process (DP) mixture prior. The second is a standard benchmark dataset of galaxy velocities~\citep{drinkwater2004large,cusumano2017aide}, which we model using a DP mixture with Gaussian likelihoods and $\alpha = 1$.  The last is a data-cleaning task, correcting typos in 1k strings from Medicare records~\citep{MedicareHosp}. We adapt the generative model of~\citet{lew2021pclean}. Using an English character-level bigram model $H(s) = h(s_1) \prod_{i=2}^{|s|} h(s_i \mid s_{i-1})$, we model the data $\{y_i\}$  with a DP prior:
%The unigram frequencies $h(s_1)$ and bigram frequencies $h(s_i \mid s_{i-1})$ are included in the supplement, as \texttt{letter\_probabilities.csv} and \texttt{transition\_probabilities.csv}. 
\[G \sim DP(H, \alpha = 1.0), \quad  x_i \mid G \sim G, \quad y_i \mid x_i \sim f(\cdot \mid x_i).\]
Here, the likelihood $f(y_i \mid x_i)$ models typos. We set $f$ to be
\[
f(y_i \mid x_i) \propto 
 \begin{cases} 
      \mathbf{1}[x_i = y_i] & (x_i, y_i) \not\in \mathcal{L} \times \mathcal{L} \\
      \frac{\text{NegBin}(\tau(x_i, y_i); \lceil \frac{|s|}{5} \rceil, 0.9)}{(5.09|s|)^{\tau(x_i, y_i)}} & (x_i, y_i) \in \mathcal{L} \times \mathcal{L}
  \end{cases},
\]
where $\tau(x_i, y_i)$ is the Damerau--Levenshtein edit distance between $x_i$ and $y_i$, and $\mathcal{L}$ is the set of all observed strings $\{y \mid \exists i. \, y = y_i\}$.%This somewhat strange likelihood can be understood as follows. 
\footnote{We assume that the data $\mathcal{L}$ includes at least one example of every clean string. 
%Setting $f(y_i \mid x_i) = \mathbf{1}[x_i = y_i]$ for $x_i \not\in \mathcal{L}$ encodes that `if someone had meant to type $x_i \not\in \mathcal{L}$, they would have typed it correctly; therefore, the explanation for $y_i \in \mathcal{L}$ cannot be a string $x_i \not\in \mathcal{L}$.' 
When $x_i \in \mathcal{L}$, we model a negative-binomially distributed number of typos, where the number of trials depends on the length of the string.}
We perform inference in a collapsed version of the model, with the $x_i$ marginalized out:
\begin{align*}
    \Pi &\sim CRP(n = N, \alpha = 1.0)\\
    y_I \mid \Pi &\sim F(y_I).
\end{align*}
Here $\Pi$ is a partition, $I$ ranges over the components of $\Pi$ (each of which is a subset of data indices), and $F(y_I) = \sum_{x \in \mathcal{L}} H(x) \prod_{i \in I} f(y_i \mid x)$ is the marginal likelihood of $y_I$ as a sequence of noisy observations of a latent string.


%Given a partition $\Pi$ of a set of indices $I \subseteq \{1, \dots, 1000\}$, we write $\pi_I(\Pi)$ for the posterior probability of the partition, given $y_I$, and $\tilde{\pi}_I(\Pi) = CRP(\Pi; n=|I|, \alpha=1.0) \cdot \prod_{J \in \Pi} F(y_J)$ for the unnormalized posterior.


\textbf{Baseline.} We compare to an SMC baseline, inspired by PClean's inference~\citep{lew2021pclean}, that targets a sequence of posteriors, where the $t^{\text{th}}$ posterior incorporates the first $t$ datapoints. 
The SMC proposal is locally optimal, assigning the newest datapoint to an existing component $I$ with probability proportional to $\frac{|I|}{t + \alpha - 1} \cdot F(y_{I} \cup \{y_t\})$, or to a new component with probability proportional to $\frac{\alpha}{t + \alpha - 1} \cdot F(\{y_t\})$. We do not compare to a Gibbs sampling baseline, as Gibbs sampling does not yield marginal likelihood estimates, but do perform a Gibbs rejuvenation sweep every 20 iterations of SMC. %We perform multinomial resampling when the esitmated ESS falls below $0.25K$.

\textbf{RAVI algorithm.} We apply Algorithm~\hyperref[alg:alg1]{1} to the inference strategy $\texttt{agglom}$ (Inference Strategy~\hyperref[infstrat:agglom]{2}). The strategy is based 
on a randomized agglomerative clustering algorithm: each datapoint begins 
in its own cluster (L1), and we repeatedly 
choose to either merge two clusters (L9) or stop and propose the current partition (L8). The sequence of merge 
decisions $\textit{mv}_1, \dots, \textit{mv}_{|X|-|\Pi|}$ are the auxiliary variables 
of our proposal distribution; the final output is the proposed clustering $\Pi$. Our meta-inference $\texttt{agglom}(X, K).\mathcal{M}(\Pi).q$ infers the sequence of merges from the observed clustering $\Pi$, using $K$-particle SMC with proposals that mimic the forward process but choose only from a restricted set $\textit{Ok}$ of possible merges (L8), to avoid making any choices that disagree with $\Pi$. SMC introduces additional auxiliary variables, so we also include a conditional SMC meta-meta-posterior approximation (not shown, but nearly identical to~\hyperref[infstrat:mcvi]{\texttt{rmcvi}}'s).

\textbf{Results.} Table~\ref{tab:experiments} shows 
average log marginal likelihood estimates; higher is 
better. On synthetic Gaussian data, the algorithms 
perform comparably. On the galaxy data, RAVI
agglomerative clustering finds modes that SMC misses,
leading to a 3-nat improvement in the 
average log marginal likelihood. In the Medicare 
data example, SMC misses the ground-truth clustering  
and hypothesizes many unlikely typos to 
explain the data. The RAVI agglomerative clustering is
less greedy, considering $O(N^2)$ possible merges at each 
step, rather than $O(N)$. As such, it is able to 
find the ground-truth clustering, correctly identifying 
all typos (unlike PClean~\citep{lew2021pclean}, which achieves 
only 90\% accuracy on this dataset) and reporting a log marginal likelihood thousands of 
nats higher than the SMC algorithm. 




\begin{table}[tbp]
\scriptsize{
\begin{tabular}{ll|l}
& & $\hat{\mathcal{L}}$ \\ 
\hline
\multicolumn{2}{l|}{Gaussian likelihood~\citep{cusumano2017aide}, synthetic data}     &   \\ 
        & SMC + adapted proposals  &   $-125.09 \pm 0.38 $                   \\
        & RAVI agglomerative clustering                   &  ${-125.97 \pm 1.62}$        \\ 
     %   & Naive ensembling  &  ${-125.23 \pm 0.31}$        \\ 
     %   & RAVI ensembling & ${-125.11 \pm 0.45}$\\ \hline
\multicolumn{2}{l|}{Gaussian likelihood~\citep{cusumano2017aide}, Galaxy data~\citep{drinkwater2004large}}     &   \\ 
        & SMC + adapted proposals                &   $-426.20 \pm 1.26  $                   \\
        & RAVI agglomerative clustering                   &  $\mathbf{-423.03 \pm 0.94}$                     \\ 
     %   & Naive ensembling & $\mathbf{-423.22 \pm 1.05}$ \\
     %   & RAVI ensembling & $\mathbf{-422.65 \pm 0.97}$\\ \hline
\multicolumn{2}{l|}{PClean typos likelihood~\citep{lew2021pclean}, Hospital data~\citep{MedicareHosp}}&                       \\
%     & Gaussian proposal                             &                        \\
        & SMC + adapted proposals       &      $-40{,}239 \pm 1{,}532$ \\
        & RAVI agglomerative clustering                 &      $\mathbf{-13{,}851.0 \pm 0.01}  $      \\ 
      %  & Naive ensembling & $-13,852.4 \pm 0.01$ \\
     %   & RAVI ensembling & $-13,851.6 \pm 0.01$\\ \hline
% \multicolumn{2}{l|}{Experiment 3 (DPMM)}              &                       \\
%      & SMC + adapted proposals    &  $-40,239 \pm 1,532$    \\
%      & Agglomerative Monte Carlo                     &   $\mathbf{-32,000 \pm 709}$    
\end{tabular}
}
\caption{RAVI agglomerative clustering vs. SMC baseline.}
\label{tab:experiments}
\vspace{-6mm}
\end{table}
    

% \subsection{ENSEMBLING MULTIPLE MONTE CARLO OR VARIATIONAL INFERENCE ALGORITHMS}

% Which algorithm is best for a job may be data-dependent,
% in which case it can be fruitful to ensemble multiple 
% algorithms. Both Monte Carlo and variational inference 
% support a naive approach to ensembling algorithms:

% \begin{itemize}
%     \item \textbf{Monte Carlo.} To ensemble results $(x_i, w_i)$ from $N$ multiple weighted sampling algorithms targeting $\tilde{\pi}$, average the weights and randomly select an index $j$ with probability proportional to the weights $w_j$, to yield a new weighted sample $(x_j, \frac{\sum_i w_i}{N})$.

%     \item \textbf{Variational Inference.} Given $N$ variational bounds of the form $\mathbb{E}_{q_i}[\log \hat{Z}_i]$, 
%     average the $\hat{Z}_i$ values to obtain an ensemble bound $\mathbb{E}_{q_{1:N}}[\log \frac{1}{N}\sum_i \hat{Z}_i]$. 
% \end{itemize}

% We compare this approach to an improved RAVI ensembling method, possible when all $N$ algorithms are given as RAVI inference strategies $\mathcal{S}_i$. The improved method is to run 
% Algorithm 1 (or, for VI, Algorithm 3) on the inference strategy \texttt{ensemble}($\mathcal{S}_{1:N}$, $\tilde{\pi}$) (Inference Strategy~\hyperref[infalg:ensemble]{2}).
% The key idea is to use an approximate version of the intractable \textit{deterministic-mixture} multiple importance sampling algorithm~\citep{elvira2019generalized}, which weights each hypothesis $x_i$ by the formula $\frac{\tilde{\pi}(x_i)}{\frac{1}{N} \sum_j q_j(x_i)}$. In our setting, the terms $\mathcal{S}_j.q(x_i)$ are unavailable, but we can use Algorithms 1 and 2 to approximate them. Intuitively, the algorithm generates proposals from each $\mathcal{S}_i$, then 
% estimates the probability that each proposal could have 
% been generated by each other inference strategy. Proposals 
% that could have been generated by many strategies are down-weighted, whereas those that only one strategy could 
% have generated are up-weighted. Table~\ref{tab:experiments} 
% shows that the RAVI ensembling method, applied to the SMC and agglomerative clustering algorithms from the previous section, can outperform the naive ensembling method.

 %Let $\hat{Z}_\mathcal{S}(r, x)$ be the joint distribution over auxiliary variables $r$ and final output $x$ sampled by Algorithm 1 applied to $\mathcal{S}$, and similarly for $\check{Z}_\mathcal{S}(r; x)$ and Algorithm 2.

%$$\mathcal{S}.q(x_{1:N}, r_{1:N}^{1:N}, x) = \prod_{i=1}^N \hat{Z}(r_{1:N}, )$$

% \textbf{The Agglomerative SMC algorithm.} 

% Whereas the baseline algorithm is parameterized by a time step $t$, and considers all data points $1 \leq i \leq t$, the Agglomerative SMC algorithm is instead parameterized by an arbitrary index set $I$ of data points to consider. 

% \textbf{To generate a collection $\hat{\pi} = \{(\hat{\Pi}_1, w_1), \dots, (\hat{\Pi}_M, w_M)\}$ of $M$ particles weighted for the posterior $\pi_I$, where $I \subseteq \{1, \dots, N\}$ is a subset of datapoint indices:}

% \begin{enumerate}
%     \item If $I = \{i\}$ is a singleton set, set $\hat{\Pi}_j$ to the unique partition of $I$ for $j=1, \dots, M$, and set each $w_j = F(\{y_i\})$. Return $\hat{\pi} = \{(\hat{\Pi}_1, w_1), \dots, (\hat{\Pi}_M, w_M)\}$.
    
%     \item Else, split $I$ into two halves, $I_1$ and $I_2$. (If $|I|$ is odd, divide almost-evenly, so that $|I_2| = |I_1| + 1$.) Recursively run the Agglomerative SMC algorithm to generate two weighted collections $\hat{\pi}_{I_1}$ and $\hat{\pi}_{I_2}$.
    
%     \item For each $j \in 1, \dots, M$, draw $\Pi_j^1 \sim \hat{\pi}_{I_1}$ and $\Pi_j^2 \sim \hat{\pi}_{I_2}$ according to the particle weights. 
    
%     \item Set $\hat{\Pi}_j$ by stochastically \textit{merging} the two partitions $\Pi_j^1$ and $\Pi_j^2$ of $I_1$ and $I_2$:
    
%     \begin{itemize}
%         \item Set $N = |I_1|$ and for each component $J \subseteq I_2$ in $\Pi_j^2$:
        
%         \begin{itemize}
%             \item With probability proportional to $\frac{\alpha}{\alpha + N} \cdot \prod_{l=2}^{|J|} \frac{l - 1}{\alpha + N + l - 1}$, add $J$ as its own component to the partition $\hat{\Pi}_j$.
            
%             \item Alternatively, letting $L$ range over the partition components in $\Pi_j^1$, with probability proportional to $\prod_{l=1}^{|J|} \frac{|L| + l - 1}{\alpha + N + l - 1}$, add $J \cup L$ as a new component to the partition $\hat{\Pi}_j$.
%             Remove the selected component $L$ from $\Pi_j^1$ so it is unavailable for matching in future steps.
            
%             \item Increment $N$ by $|J|$.
            
%         \end{itemize}
        
%         \item Add any components that remain in $\Pi_1^j$ to $\hat{\Pi}_j$.
        
%     \end{itemize}
    
%     \item Letting $\hat{Z}_1$ be the mean weight of $\hat{\pi}_1$ and likewise for $\hat{Z}_2$, compute $w_j = \hat{Z}_1 \hat{Z}_2 \frac{\tilde{\pi}_{I}(\hat{x}_j)}{\tilde{\pi}_{I_1}(x_j^1) \tilde{\pi}_{I_2}(x_j^2) K((x_j^1, x_j^2), d\hat{x}_j)}$, where the kernel $K$ represents the merging process described in Step 4. (Its density can be computed by multiplying the probabilities of the merge decisions made for each $J$.)
    
%     \item With probability $\rho$ (a hyperparameter of the algorithm), perform a Gibbs sweep on each $\hat{\Pi}_j$ for $j = 1, \dots, M$.
    
%     \item Return $\hat{\pi} = \{(\hat{\Pi}_1, w_1), \dots, (\hat{\Pi}_M, w_M)\}$.
    
% \end{enumerate}



% If there is only one active index, i.e. $I = \{i\}$, we return the $M$ copies of the trivial clustering, with the uniform weights $w_j = F(\{y_i\})$. Otherwise we divide the index set into halves, $I_1$ and $I_2$. We then run the algorithm recursively to obtain $M$-particle collections $\hat{\pi}_1$ and $\hat{\pi}_2$ representing the respective posteriors. We draw $M$ particles with replacement from each approximation, $x_{j}^1 \sim \hat{\pi}_1$ and $x_{j}^2 \sim \hat{\pi}_2$, for $j \in [1, M]$. Then, for each $j$, we run a proposal kernel $K$ on $(x_j^1, x_j^2)$ to yield $x_j$. The kernel considers each component of the $x_j^2$ partition in turn, deciding whether to merge it with an available component in $x_j^1$, or leave it as its own component. If a merge is decided, the chosen component is no longer available to be merged with other components at this stage of the algorithm. In the resulting particle $x_j$, if $a, b \in I_1$ or $a, b \in I_2$, then $a$ and $b$ have the same relationship in $x_j$ (i.e., either same-cluster or different-cluster) as they did in $x_j^1$ or $x_j^2$. But for $a \in I_1, b \in I_2$, after the merge any relationship is possible.

% \textbf{Dataset shuffling.} The agglomerative SMC algorithm can be additionally modified so that, to some extent, different particles see different groupings of the data, leading to more particle diversity. In partuclar, at step (2) of the algorithm, if $M$ is above some threshold $\tau$, we consider two possible index splits, $(I_{11}, I_{12})$ and $(I_{21}, I_{22})$. For each split, we perform the two recursive calls as described in Step (2), but with only $\frac{M}{2}$ particles each. Then, we follow steps 3-7 individually for each split, to yield two $\frac{M}{2}$-particle collections $\hat{\pi}_1$ and $\hat{\pi}_2$. We take their union to yield a final $\hat{\pi}$ that we return.

% \textbf{Experimental details.} We ran the locally optimal SMC baseline with 32 particles, which took approximately 30 seconds, and ran the Agglomerative SMC algorithm with 128 particles, with $\rho = 0.0$ and $\tau = 2$, which took the same amount of time. In the body of the paper, we ran each experiment independently three times to estimate error bars; we reported the mean and standard deviations of the log weights, and the best data cleaning accuracy achieved by any of the 3 runs (after performing one additional Gibbs sweep on the algorithm results).\footnote{We have since run additional trials (30 for each algorithm), yielding log weights of $-32,000 \pm 709$ for Agglomerative SMC and $-40,239 \pm 1,532$ for the baseline.}








% Using RAVI, we can allocate additional computation to 
% solve the harder meta-inference problem accurately. To infer 
% the history of an MCMC chain, given its final state, 
% we employ sequential Monte Carlo, with learned proposals.
% SMC is known to require $O(T)$ particles to converge for 
% problems with $T$ time steps. 


% \citet{salimans2015markov} proposed Markov Chain 
% Variational Inference (MCVI),
% a method for performing variational inference with 
% approximations that incorporate one or more MCMC steps.

% In this section, we experiment with novel RAVI inference strategies
% in which the proposal $\mathcal{S}.q$ simulates many steps of an
% MCMC algorithm:

% \begin{itemize}
% \item We use RAVI's theorems to expose a trade-off between the length of an MCMC chain and the difficulty of meta-inference, and demonstrate empirically that \citet{salimans2015markov}'s approach to meta-inference does not scale to longer chains.
% \item We apply RAVI to incorporate sequential Monte Carlo as the meta-inference strategy $\mathcal{S}.\mathcal{M}$, with neural amortized proposals. SMC is used to infer the history of an MCMC chain, given its final state, enabling the approach of~\citet{salimans2015markov} to scale to longer chains.
% \item We explore two additional use cases of accurate meta-inference for MCMC: (1) as an alternative to Annealed Importance Sampling~\citep{neal2001annealed}, the harmonic mean estimator~\citep{}, and other methods for estimating marginal likelihoods based on runs of MCMC; and (2) to estimate KL divergences between MCMC and the true posterior, addressing an open question from~\citet{cusumano2017aide}.

% \end{itemize}

% FIGURE: Using the ELBO from a RAVI variational family 
% to assess convergence of MCMC.
% ULA performs well on a unimodal target, but struggles 
% to accurately characterize a posterior with multiple modes.

% FIGURE: Using RAVI to scale~\citet{salimans2015markov}'s MCMC-based 
% variational families to longer chains. As the number of MCMC steps
% increases, $\mathcal{S}.q$ becomes closer to the posterior, but 
% meta-inference ($\mathcal{S}.\mathcal{M}$) becomes more difficult,
% and the sequential training 




% Because they can explore the sample space iteratively, 
% long MCMC chains can often approximate posterior distributions  
% more accurately than one-off importance proposals.
% However, the marginal distribution of an MCMC chain's output
% is almost never  analytically tractable, so MCMC cannot be used directly 
% within importance sampling or variational inference algorithms, to estimate or optimize the marginal likelihood of the data. For the same reason,
% it is difficult to evaluate the finite-sample accuracy of MCMC algorithms. 

% Because of this, there has been much interest in developing 
% methods that bridge the gap between MCMC and VI/IS, including 
% Annealed Importance Sampling~\citep{neal2001annealed} and 
% Hamiltonian Variational Inference~\citep{salimans2015markov}. 
% In this section we 


% As such, various proposals have been floated for 


% MCMC is a powerful tool for approximating posterior distributions, 
% but suffers several severe shortcomings compared with 



% show how RAVI can be used to design and analyze variational families that incorporate many MCMC steps. We use RAVI's theorems to analyze the shortcomings of existing approaches~\citep{salimans2015markov} and to motivate the need for improved meta-inference.
% Second, we apply RAVI to design novel particle-based and MCMC algorithms for inference  in Dirichlet Process mixture models, based on randomized agglomerative clustering~\citep{heller2005bayesian}. 

% This section gives three examples of using RAVI to improve inference, relative to baselines that use proposals with tractable densities. Table~\ref{tab:experiments} summarizes the results, showing means and standard deviations of log marignal likelihood estimates obtaind using each algorithm; higher means and lower standard deviations are better.


% \textbf{SIR with learned meta-inference.} We first consider a simple Gaussian target, the posterior $p(z \mid x = 0)$ of the model $p(z, x) = \mathcal{N}(z; 0, 10) \cdot \mathcal{N}(x; z, 0.1)$. As discussed in Section~\ref{sec:unifying}, the IWAE objective can be seen as an instance of RAVI, where $\mathcal{S}.q$ implements sampling/importance resasmpling (SIR) and $\mathcal{S}.\mathcal{M}$ uses a naive meta-inference strategy. In this experiment we test whether learning the meta-inference strategy leads to a tighter bound. As our IWAE proposal, we use $q(x_1) = \frac{1}{2} \mathcal{N}(x_1; 0, 10) + \frac{1}{2}\mathcal{N}(x_1; 0, 0.2)$. In the two-particle case, the job of meta-inference is to infer the value of the unchosen particle, observing the chosen particle's value. We learn meta-inference using Algorithm 4 and the parameterized meta-inference strategy $\mathcal{S}.\mathcal{M}(x)(x') = \frac{1}{2} \left(f_\theta(x) \cdot \mathcal{N}(x'; 0, 10) + (1-f_\theta(x)) \cdot \mathcal{N}(x'; 0, 0.2)\right)$, where $x'$ is the unchosen particle, $x$ is the chosen particle, and $f_\theta$ is a neural network predicting probabilities in $[0, 1]$. We see that learned meta-inference produces a tighter bound. 

% \textbf{Multiple importance sampling with random weights.} When combining multiple properly weighted~\citep{liu2008monte} samplers, it is valid to simply average their weights to estimate marginal likelihoods. This is equivalent to using a RAVI inference strategy $\mathcal{S}$ for which $\mathcal{S}.q$ generates proposals from each sampler, then chooses one. However, the $\mathcal{S}.\mathcal{M}$ implicit in this procedure is a poor meta-inference algorithm. We consider an improved and, to our knowledge, novel weighting scheme, derived by improving the meta-inference algorithm, on the mixture model considered by~\citet{elvira2019generalized}. See Appendix~\ref{sec:experimental-details} for details.

% \textbf{Agglomerative Monte Carlo for Dirichlet Process mixtures.} We now consider a variant of sequential Monte Carlo that is inspired by \citep{del2006sequential, lindsten2017divide} but to our knowledge does not fit neatly into existing frameworks. In order to solve a DPMM clustering problem, we design a posterior approximation that splits the dataset in half, and recursively solves each clustering subproblem by the same algorithm. To merge the subproblem solutions, the posterior approximation randomly resamples a clustering for each, and then sequentially decides, for each cluster in the chosen right subproblem, whether to merge it with a cluster in the chosen left subproblem. Meta-inference employs a version of conditional SMC~\citep{andrieu2010particle} that first generates a possible tree of subproblem clusterings backwards, and then runs the agglomerative SMC algorithm forward conditioned on that tree-shaped trajectory. See supplement for full details.

% \subsection{Learned meta-inference for sampling importance-resampling}

% In this section, we demonstrate the use of the RAVI framework to improve an existing estimator: 2-particle vanilla importance sampling. We consider a simple generative model $p_0$, and a RAVI strategy whose corresponding $\hat{Z}$ is precisely the two-particle importance sampling estimator with proposal $q(x_1) = \frac{1}{2} \mathcal{N}(x_1; 0, 10) + \frac{1}{2}\mathcal{N}(x_1; 0, 0.2)$:
% %In Appendix~\ref{sec:sir-example}, we show how $N$-particle importance sampling can be viewed as a single-particle estimator arising from Alg. 1 applied to a particular RAVI strategy. For now, we specialize to a simple generative model $p_0$, and the RAVI strategy $p_1, p_2$ corresponding to importance sampling with number of particles $N=2$ and proposal distribution $q(x_1) = \frac{1}{2} \mathcal{N}(x_1; 0, 10) + \frac{1}{2}\mathcal{N}(x_1; 0, 0.2)$:
% \begin{flalign*}
%     p_0(x_1, x_0) &= \mathcal{N}(x_1; 0, 10) \cdot \mathcal{N}(x_0; x_1, 0.1)&\\
%     p_1(x_2, x_1; x_0) &= \prod_{i=1}^2 q(x_2^{(i)}) \cdot \sum_{i=1}^{2} \frac{w_i}{w_1 + w_2} \delta_{x_2^{(i)}}(x_1)\\
% %    p_1(x_2, x_1; x_0) &= \frac{p_0(x_2^{(c)}, x_0) \cdot \delta_{x_2^{(c)}}(x_1) \cdot q(x_2^{(\overline{c})})}{p_0(x_2^{(1)}, x_0)/q(x_2^{(1)}) + p_0(x_2^{(2)}, x_0)/q(x_2^{(2)})} \\
%     p_2(x_3, x_2; x_1, x_0) &= \text{Bern}(x_3; \frac{1}{2})\cdot q(x_2^{(x_3+1)}) \cdot \delta_{x_1}(x_2^{(2 - x_3)})\\
%     p_3(x_3; x_2, x_1, x_0) &= \delta_{\mathbf{1}[x_1 = x_2^{(1)}]}(x_3)
% \end{flalign*}
% \noindent Here, $p_1$ corresponds to the `sampling/importance resampling' process of generating two particles $x_2^{(1)}$ and $x_2^{(2)}$ from $q$, computing their weights $w_i = p_0(x_2^{(i)}, x_0)/q(x_2^{(i)})$, and then based on the weights, selecting one to return as $x_1$.
% One layer down the strategy, $p_2$ observes the chosen particle $x_1$, and tries to guess the values of $x_2^{(i)}$. Our $p_2$ above flips a coin $x_3$ to decide which particle ($x_2^{(1)}$ or $x_2^{(2)}$) to set equal to $x_1$, and guesses the value of the other, \textit{unchosen} particle blindly using $q$. This RAVI strategy recovers the usual 2-particle IS estimator $\hat{Z} = \frac{1}{2} (w_1 + w_2)$. But $p_2$ could be better: knowing the chosen particle $x_1$ reveals information about the other, unchosen particle, which $p_2$ could exploit. For example, if a particularly \textit{unlikely} value for $x_1$ was chosen by $p_1$, the other particle must also have been unlikely; otherwise, its large relative weight would have caused it to be chosen.
% \begin{wrapfigure}{L}{0.3\textwidth}
% \vspace{-2mm}
% \centering
% \includegraphics[width=0.3\textwidth]{figs/violinplot_expanded.pdf}
% \caption{\label{fig:violin}Estimates of $Z$ with 1 particle, and with default vs. learned $\hat{G}$ for two particles.}
% \vspace{-2mm}
% \end{wrapfigure}
% %
% We can learn to exploit this knowledge if we replace the $q$ in $p_2$ with an $x_1$-aware distribution  
% $h_\theta(x; x_1) = \frac{1}{2} \left(f_\theta(x_1) \cdot \mathcal{N}(x; 0, 10) + (1-f_\theta(x_1)) \cdot \mathcal{N}(x; 0, 0.2)\right)$, where $f_\theta$ is a neural network predicting probabilities in $[0, 1]$.

% To learn $\theta$, we use stochastic gradient descent with gradients from Alg. 3. Using the resulting $p_2$ in the RAVI strategy for Monte Carlo, we see that $\hat{Z}$'s variance decreases (Fig.~\ref{fig:violin}, right).
% Although this is not a particularly pronounced change, our toy example illustrates the basic idea: viewing algorithms as RAVI strategies helps identify places where variance is being `left on the table,' opening up avenues for improvement. (E.g., an intriguing possibility for PPLs is to automatically tune parameters of generic inference methods' strategies for particular user models.)

% \subsection{Random-weight multiple importance sampling}

% If we choose $G$ so that the $z_i$ are not iid, we recover the family of algorithms known as Multiple Importance Sampling~\citep{owen2000safe, elvira2015efficient, elvira2019generalized, cornuet2012adaptive}. Different choices of $\psi, \hat{G},$ and $\hat{\psi}$ correspond to different MIS estimation schemes from the literature%
% %(and several that, to our knowledge, have not been proposed before)
% . In general, the $z_i$ needn't be independent (see, e.g., the RAVI formulation of \textbf{antithetic sampling}~\citep{pmlr-v97-ren19b} in Appendix~\ref{sec:antithetic-example}), but we focus here on the case where $G(z_1, \dots, z_K; y) = \prod_{i=1}^K g_i(z_i; y)$, so that the particles are independent but not identically distributed; we use a different proposal $g_i$ for each. If we proceed as before, with $w_i = \frac{p(z_i, y)}{g_i(z_i; y)}$, $\psi(j) \propto w_j$, $\hat{\psi}(j) = \frac{1}{K}$, and $\hat{G}(z_{-j} \mid j; x, y) = \prod_{i \neq j} g_i(z_i; y)$, then we recover the usual IS estimator, but with proposal-dependent weights: $\hat{Z}_1 = \frac{1}{K}\sum_{i=1}^{K} \frac{p(z_i, y)}{g_i(z_i; y)}$.

% \begin{figure}
% \vspace{-5mm}
% \centering
% \includegraphics[width=\linewidth]{figs/mis_demo_2.pdf}
% \vspace{-7mm}
% \caption{Different estimators for Multiple Importance Sampling, with mixture of Gaussians target and Gaussian proposals as in~\citep{elvira2019generalized}. For each estimator, 2000 trials were run, with $K=30$ particles each. On the left and right, the estimators $\hat{Z}_1$ and $\hat{Z}_2$. Center: Using the RAVI strategies for $\hat{Z}_1$ and $\hat{Z}_2$, we can develop a new estimator, by combining inference ($G$ and $\psi$) from $\hat{Z}_1$ with meta-inference ($\hat{\psi}$ and $\hat{G}$) from $\hat{Z}_2$. The middle estimator has higher variance than $\hat{Z}_2$, but performs many fewer proposal density evaluations.} % The \texttt{importance} primitive generates a single weighted trace, and \texttt{discrete} samples from a list of possible values, based on unnormalized probabilities.}
% \label{fig:mis_demo}
% %\vspace{-10mm}
% \end{figure}
% Depending on the proposals $g_i$, the posterior approximation $q$ may or may not be better than the standard SIR algorithm with a fixed $g$. However, meta-inference suffers: in addition to the suboptimal $\hat{G}$, now $\hat{\psi}$ too is flawed. This is because observing the chosen particle $x$ should inform $\hat{\psi}$; the value $x$ is more likely to have been proposed by certain $g_i$ than others. We could incorporate this knowledge to create a better $\hat{\psi}$, e.g. by setting $\hat{\psi}(j; x, y) \propto g_j(x; y)$; Fig.~\ref{fig:mis_demo} shows that this estimator (center panel) reduces variance somewhat. However, this $\hat{\psi}$ is still imperfect, because it does not account for the fact that when $g_i$ is very low, the corresponding importance weight $w_i$ will be very high. The popular `deterministic mixture' estimator~\citep{owen2000safe,elvira2019generalized} changes $w_i$ not to depend on the particular proposal $g_i$ used to generate $z_i$: it instead sets $\psi(j; \mathbf{z}, y) \propto w_j := \frac{p(z_j, y)}{\frac{1}{K} \sum_k g_k(z_j; y)}$. This makes our $\hat{\psi}$ exact, and the resulting estimator, $\hat{Z}_2 = \frac{1}{K}\sum_{i=1}^{K} \frac{p(z_i, y)}{\frac{1}{K} \sum_k g_k(z_i; y)}$, is known to have lower variance than $\hat{Z}_1$~\citep{elvira2019generalized}. Another benefit of $\hat{Z}_2$, also apparent from the RAVI formulation, is that it can be applied even when the $g_k$ do not all have the same support. The first estimator $\hat{Z}_1$ cannot, because its $\hat{\psi}$ may infer a $j$ for which $x$ is not in the support of $g_j$, violating the absolute continuity requirement for  strategies. (However, $\log \hat{Z}_1$ can still be used as a variational bound; see Appendix~\ref{sec:even-odd}.) \textbf{Stratified sampling} exploits this affordance, by using proposals $g_k$ that cover disjoint regions of the state space.

% \begin{table}
% \centering
% \begin{tabular}{|lr|}
% \hline
% \textbf{Method}                       & \textbf{$\widehat{MSE}$} \\
% Naive MIS estimator                   & $1.4 \times 10^4$                 \\
% ``Case 1'' RAVI Estimator  & \textbf{$5.1 \times 10^{-4}$}      \\
% %Better MIS estimator, with better $h$ &                      \\
% \hline
% \end{tabular}
% \caption{Sample MSE is reported for $10^6$ trials with 100 particles each. As when the proposal density is known, the naive MIS estimator can have very high variance, owing to exceedingly rare but exceedingly high importance weights. The better MIS estimator, adapted to the case where the proposal density is unknown, can eliminate most of the variance.}
% \label{tab:mis_table}
% \end{table}

% \textbf{Case 1:} If $\xi(r)$ is shared for all $i$, sample $r_i \sim \xi(\cdot)$, $z_i \sim q_i^*(\cdot; r_i)$, and compute the weight $w_i = \frac{p(z_i)}{\frac{1}{K} \sum_{j=1}^K q_j^*(z_i; r_i) \frac{\xi_i(r_i)}{h_j(r_i; z_i)}}$ for each particle $z_i$.

% \textbf{Case 2:} If instead each proposal uses a different auxiliary distribution $\xi_i$ (possibly defined on different spaces), one can instead sample $r_{ii} \sim \xi_i(\cdot)$, $z_i \sim q_i^*(\cdot; r_{ii})$, and for each $j \neq i$, $r_{ij} \sim h_j(\cdot; z_i)$. Then the weight becomes $w_i = \frac{p(z_i)}{\frac{1}{K} \sum_{j=1}^K q_j^*(z_i; r_{ij}) \frac{\xi_j(r_{ij})}{h_j(r_{ij}; z_i)}}$.

% We compare this scheme to the naive MIS estimator, on the multimodal mixture-of-Gaussians target from \cite{elvira2019generalized}, modifying their Gaussian proposal distributions to use a random standard deviation $r \sim \mathcal{U}(0.8, 1.3)$. Table~\ref{tab:mis_table} shows that when proposal weights are not available exactly, the RAVI-inspired estimator still reduces variance.

% \subsection{Agglomerative sequential Monte Carlo for Dirichlet process mixtures}

% \begin{figure*}

% \begin{minipage}[!]{0.55\textwidth}
% \centering
% \scriptsize{
% \begin{tabular}{|lll|}
% \hline
% \textbf{Method}              & \textbf{Acc.} & \textbf{LML} \\
% \hline
% RAVI Agglom. SMC (ours)     &  {\bf 100\%}                    & $\mathbf{-30,632 \pm 292}$      \\
% Locally-optimal SMC baseline &   67.8\%                   & $-38,121 \pm 233$      \\
% %Gibbs sampling               &                      & N/A          \\ 
% \hline
% \end{tabular}}
% \\
% \addvspace{2mm}
% \includegraphics[width=0.95\textwidth]{figs/HospitalDandC-3.pdf}
% \end{minipage}\hfill%
% \begin{minipage}[!]{0.45\textwidth}
% \includegraphics[width=0.95\textwidth]{figs/HospitalDandC-2.pdf}
% \end{minipage}

% \caption{RAVI Agglomerative SMC applied to a standard data cleaning benchmark, \textit{Hospital}~\citep{MedicareHosp,rekatsinas2017holoclean}. The model is a DPMM, with a broad base measure over English-language strings, and an edit-distance-based likelihood to model typos~\citep{lew2021pclean}. The goal is to cluster together strings representing the same hospital quality measure, regardless of typos. Bottom Left: a particle from the locally optimal SMC baseline, in which strings describing different metrics were incorrectly clustered together. Gibbs rejuvenation fails to escape this local mode, because considered one at a time, none of these strings would rather form a singleton cluster. Right: a particle from RAVI agglomerative SMC, which correctly separates the clusters. Top: the quality gap is reflected both in the accuracy achieved on the data-cleaning task, and the log marginal likelihood estimates produced by each algorithm.}
% \vspace{-5mm}
% \end{figure*}

% We now consider a variant of sequential Monte Carlo that is inspired by \citep{del2006sequential, lindsten2017divide} but to our knowledge does not fit neatly into existing frameworks. In order to solve a DPMM clustering problem, we design a posterior approximation that splits the dataset in half, and recursively solves each clustering subproblem by the same algorithm. To merge the subproblem solutions, the posterior approximation randomly resamples a clustering for each, and uses a kernel $K$ that sequentially decides, for each cluster in the chosen right subproblem, whether to merge it with a cluster in the chosen left subproblem. Meta-inference employs a version of conditional SMC~\citep{andrieu2010particle} that first generates a possible tree of subproblem clusterings backwards, and then runs the agglomerative SMC algorithm forward conditioned on that tree-shaped trajectory. See supplement for full details.
\vspace{-3mm}
\section{RELATED WORK AND DISCUSSION}

\textbf{Related work.} RAVI builds on and generalizes 
recent work from both the Monte Carlo and variational 
inference literatures. For example, \citet{salimans2015markov} and \citet{ranganath2016hierarchical} showed how 
auxiliary variables could be used to 
construct and optimize variational bounds for 
specific families of expressive variational approximations.
\citet{sobolev2019importance} presented tighter bounds 
in a more general setting. RAVI is a further 
generalization, in two directions: first, we show 
that these bounds arise from particular 
choices of meta-inference strategy, and can be 
tightened by improving meta-inference; and second, we
extend the results to the Monte Carlo setting, enabling 
learned variational families to be used as IS, SMC, or MH proposals. We also provide general theorems about the 
variance of RAVI samplers and the bias of RAVI variational bounds, which can be applied to analyze both new and existing algorithms.

RAVI is also related to other compositional or unifying frameworks for thinking about broad classes of inference algorithms~\citep{mansinghka2014venture,zinkov2016composing,scibior2018denotational,scibior2018functional,mansinghka2018probabilistic,cusumano2019gen, stites2021learning, neklyudov2020involutive, zimmermann2021nested, andrieu2020general, storvik2011flexibility, finke2015extended, finke2019importance}, some of which involve recursive constructions~\citep{naesseth2015nested, del2006sequential, domke2019divide}. However, to our knowledge, RAVI's inference strategies are novel. For example, although RAVI and Nested IS (NIS)~\citep{naesseth2015nested} are both approaches to inference with `intractable proposals,' NIS \textit{approximately samples} a proposal distribution with a \textit{tractable} (unnormalized) density, whereas RAVI \textit{approximates the density} of a proposal that \textit{can} be simulated tractably, but whose marginal density (even unnormalized) is intractable.
As another example, \citet{domke2019divide}'s framework of \textit{estimator-coupling pairs} constructs variational bounds and marginal likelihood estimators recursively, but unlike in RAVI, 
the posterior approximations cannot be used to formulate objectives for \textit{amortized} VI or as components of Metropolis-Hastings proposals.

Finally, researchers have used meta-inference to construct bounds on KL divergences~\citep{cusumano2017aide} and other information-theoretic quantities~\citep{saad2022estimators}. In Appendix~\ref{sec:other-applications}, we show how to apply such bounds in the general RAVI setting.

\textbf{Outlook and Limitations.} RAVI expands the design space for Monte Carlo and variational inference. It gives unifying correctness proofs for over a dozen methods from the literature, and novel theorems that  characterize their behavior. Experiments show that RAVI helps to design algorithms that significantly improve accuracy over previously introduced Monte Carlo and variational inference methods.
%This may seem surprising, because as Table~\ref{tab:dimension} shows, complicated proposals can induce high-dimensional meta-inference problems for RAVI to solve; why should these be any easier than the original inference problem? One explanation is that many proposals feature conditional independence structure not present in the model. For instance, whether or not the original model is an HMM, a proposal that runs an MCMC chain has Markov structure that makes it amenable to sequential Monte Carlo meta-inference. 
However, some difficulties remain. For example, the gradient estimators we present (Algs.~\hyperref[alg:alg3]{3} and~\hyperref[alg:alg4]{4})
have high variance for some strategies $\mathcal{S}$; in Appendix~\ref{sec:reparam}, we give estimators that exploit the reparameterization trick, but they only help when the proposals in $\mathcal{S}$ can be 
reparameterized, which is not the case, e.g., for SMC. In these cases, RAVI can still be used to derive 
objectives for optimization, but practitioners will need other ways of reducing the variance of gradient estimates; many results from 
the literature~\citep{mnih2016variational,tucker2018doubly} should apply. 

Another difficulty is that RAVI algorithms can be complex to implement. We are exploring an automated implementation based on probabilistic programming languages~\citep{cusumano2019gen,van2018introduction}: if the posterior and meta-posterior approximations in a RAVI strategy $\mathcal{S}$ are given as probabilistic programs, we can provide Algs.~1--5 as higher-order functions, which automate the necessary densities, gradients, and MCMC acceptance probabilities. This could be viewed as a generalization of existing 
PPL support for \textit{programmable inference}~\citep{mansinghka2014venture,mansinghka2018probabilistic,cusumano2019gen,lew2019trace}. %As a framework, RAVI 
% unifies many algorithms from the literature, but does not apply to biased estimators of normalizing constants, variational objectives that are not bounds on the log marginal likelihood (e.g. Nested Variational Inference~\citep{zimmermann2021nested}), or techniques that are justified by incorporating additional assumptions not made by RAVI (e.g., discriminance sampling, which is justified in part by assuming that samples generated using a Markov chain are distributed according to its stationary distribution). % Furthermore, although RAVI provides a useful lens on many variance reduction strategies for Monte Carlo, techniques that may result in negative estimates cannot be represented as RAVI strategies.




% \textbf{Related Work.} 
% Many researchers have proposed compositional or unifying frameworks for thinking about broad classes of inference algorithms~\citep{mansinghka2014venture,zinkov2016composing,scibior2018denotational,scibior2018functional,mansinghka2018probabilistic,cusumano2019gen, stites2021learning, neklyudov2020involutive, andrieu2020general, storvik2011flexibility, finke2015extended, finke2019importance}, some of which involve sequences of distributions or recursive constructions~\citep{naesseth2015nested, del2006sequential, domke2019divide}. However, to our knowledge, RAVI's inference strategies are novel. We discuss some of the most related work in more detail:


% \textbf{Estimator-Coupling pairs.} \citet{domke2019divide} introduce the \textit{estimator-coupling pair} framework for representing Monte Carlo algorithms and variational objectives, and provide mechanisms for building new EC pairs recursively from existing ones. An EC pair yields auxiliary-variable ELBOs and marginal likelihood estimates, but not objectives for amortized variational inference, or Metropolis-Hastings proposals.


% \textbf{Outlook.} % 
% We hope researchers find RAVI to be a useful conceptual tool for understanding the relationships between auxiliary-variable algorithms, and for designing and establishing soundness of new techniques. The distributions that make up RAVI inference strategies are often complex, involving stochastic control flow; we see great promise in the use of probabilistic programs to specify, iterate on, and compose RAVI strategies, especially since, once represented as a sequence of probabilistic programs, a strategy's corresponding sampler and gradient estimators can be automated.

% \begin{contributions} % will be removed in pdf for initial submission,
%                       % so you can already fill it to test with the
%                       % ‘accepted’ class option
%     Briefly list author contributions.
%     This is a nice way of making clear who did what and to give proper credit.

%     H.~Q.~Bovik conceived the idea and wrote the paper.
%     Coauthor One created the code.
%     Coauthor Two created the figures.
% \end{contributions}

\begin{acknowledgements} % will be removed in pdf for initial submission,
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
    The authors are grateful to Feras Saad, Tan Zhi-Xuan, Ben Sherman,
    Cameron Freer, George Matheos, Sam Witty, McCoy Becker, Jan-Willem van de Meent, 
    Sam Stites, Eli Sennesh, Cathy Wong, and Nishad Gothoskar for useful conversations and feedback, and to our anonymous referees for helpful feedback on
    earlier drafts of the paper.
    This material is based on work supported by the NSF Graduate Research Fellowship under Grant No.~1745302.
\end{acknowledgements}
 

\bibliography{lew_657}

% \appendix


% \onecolumn  



% % This document was modified from the file originally made available by
% % Pat Langley and Andrea Danyluk for ICML-2K. This version was created
% % by Iain Murray in 2018, and modified by Alexandre Bouchard in
% % 2019 and 2021 and by Csaba Szepesvari, Gang Niu and Sivan Sabato in 2022. 
% % Previous contributors include Dan Roy, Lise Getoor and Tobias
% % Scheffer, which was slightly modified from the 2010 version by
% % Thorsten Joachims & Johannes Fuernkranz, slightly modified from the
% % 2009 version by Kiri Wagstaff and Sam Roweis's 2008 version, which is
% % slightly modified from Prasad Tadepalli's 2007 version which is a
% % lightly changed version of the previous year's version by Andrew
% % Moore, which was in turn edited from those of Kristian Kersting and
% % Codrina Lauth. Alex Smola contributed to the algorithmic style files.





% \section*{Supplementary Material for ``Recursive Monte Carlo and Variational Inference with Auxiliary Variables''}

% This document and the accompanying code files contain supplementary material for the submission ``Recursive Monte Carlo and Variational Inference with Auxiliary Variables.'' In particular, we provide:

% \begin{enumerate}
%     \item In Section~\ref{sec:proofs}, \textbf{proofs} of Theorems 1-4.
    
% %    \item In Section~\ref{sec:experimental-details}, \textbf{experimental details} on all experiments appearing in the main paper, including model details, inference algorithm details, and computational resources used.
    
%     \item In Section~\ref{sec:appendix-examples}, \textbf{RAVI inference strategies for many existing algorithms}.
    
%     \item In Section~\ref{sec:even-odd}, a further discussion of the {\bf absolute continuity requirements} for RAVI and how they can be relaxed.
    
%     \item In Section~\ref{sec:other-applications}, \textbf{other applications} of RAVI inference strategies, to parameterize rejection sampling and KL divergence estimation algorithms.
% \end{enumerate}

% \section{Omitted Proofs.}
% \label{sec:proofs}

% Throughout this section, we use the notation introduced in Section~\ref{sec:theory}: the random variable $\hat{Z}(\tilde{\pi}, \mathcal{S})$ is the weight returned by $\texttt{IMPORTANCE}(\tilde{\pi}, \mathcal{S})$, and $\check{Z}(\tilde{\pi}, \mathcal{S})$ is the reciprocal of the weight returned by $\texttt{HME}(\tilde{\pi}, x, \mathcal{S})$, for $x \sim \pi$.


% \subsection{Proof of Theorem 1.}

% \textbf{Theorem 1.} \textit{
%     Let $\tilde{\pi}(x) = Z\pi(x)$ be an unnormalized target density, and $\mathcal{S}$ an inference strategy targeting $\pi(x)$. Then:
%     \begin{itemize}
%         \item $\texttt{IMPORTANCE}(\mathcal{S}, \tilde{\pi})$ generates $(x, \hat{Z})$ with $x \sim \mathcal{S}.q$ and $\mathbb{E}[\hat{Z} \mid x] = Z\frac{\pi(x)}{\mathcal{S}.q(x)}$. Furthermore, the unconditional expectation $\mathbb{E}[\hat{Z}(\tilde{\pi}, \mathcal{S})] = Z$.
%         \item $\mathbb{E}[{\check{Z}}(\tilde{\pi}, \mathcal{S})^{-1}] = \mathbb{E}_{x \sim \pi}[\texttt{HME}(\mathcal{S}, x, \tilde{\pi})] = Z^{-1}.$
%     \end{itemize}
% }

% \textbf{Proof.} The proof is by induction on the level of nesting 
% present in the inference strategy. 

% First consider the case where $\mathcal{S}.q$ has a tractable 
% marginal density. Then:
% \begin{itemize}
%     \item \texttt{IMPORTANCE} samples $x \sim \mathcal{S}.q$ on line 2, and 
% computes $\hat{Z} = \frac{\tilde{\pi}(x)}{\mathcal{S}.q(x)} = Z\frac{\pi(x)}{\mathcal{S}.q(x)}$ exactly (lines 3 and 7). By the standard importance sampling argument, the unconditional expectation $\mathbb{E}[\hat{Z}(\tilde{\pi}, \mathcal{S})] = \mathbb{E}_{x \sim \mathcal{S}.q}[Z\frac{\pi(x)}{\mathcal{S}.q(x)}]=Z\mathbb{E}_{x \sim \pi}[1] = Z$. (This argument relies on the fact that, because $\mathcal{S}$ targets $\pi$, $\pi$ is absolutely continuous with respect to $\mathcal{S}.q$.)
%     \item 
% $\texttt{HME}(\mathcal{S}, x, \tilde{\pi})$ returns exactly $\frac{\mathcal{S}.q(x)}{\tilde{\pi}(x)}$ (lines 2 and 5), and $$\mathbb{E}_{x \sim \pi}\left[\frac{\mathcal{S}.q(x)}{\tilde{\pi}(x)}\right] = \int \pi(x) \frac{\mathcal{S}.q(x)}{Z\pi(x)}\text{d}x = \frac{1}{Z} \int \mathcal{S}.q(x) \text{d}x = \frac{1}{Z},$$ where the last step follows because $\mathcal{S}.q$ is a normalized probability density, and $\mathcal{S}.q$ is absolutely continuous with respect to $\pi$.
% \end{itemize}

% Now consider the inductive step. Assume $\mathcal{S}.q(x) = \int \mathcal{S}.q(r, x)\text{d}r$ and that for all $x$, the theorem holds for the inference strategy $\mathcal{S}.\mathcal{M}(x)$ and the unnormalized target distribution $\mathcal{S}.q(\cdot, x)$. In this case:

% \begin{itemize}
%     \item On line 5, \texttt{IMPORTANCE} generates $x \sim \mathcal{S}.q$ and $r \sim \mathcal{S}.q(r \mid x)$. In the call to $\texttt{HME}$, the unnormalized target distribution is $\mathcal{S}.q(\cdot, x)$, and so the normalizing constant is $\mathcal{S}.q(x)$ and the normalized target is $\mathcal{S}.q(r \mid x)$. By the inductive hypothesis, the call to $\texttt{HME}$ on line 6 returns an unbiased estimate of the normalizing constant's reciprocal, i.e. $\mathbb{E}[w \mid x] = \frac{1}{\mathcal{S}.q(x)}$. Since \texttt{IMPORTANCE} returns $\hat{Z} = w\tilde{\pi}(x)$ on line 7, this implies that $\mathbb{E}[\hat{Z} \mid x] = \frac{\tilde{\pi}(x)}{\mathcal{S}.q(x)} = Z\frac{\pi(x)}{\mathcal{S}.q(x)}$. From this, the same standard importance sampling argument as above shows that the unconditional expectation $\mathbb{E}[\hat{Z}(\tilde{\pi}, \mathcal{S})] = Z$.
%     \item On line 4, \texttt{HME} calls \texttt{IMPORTANCE} on the 
%     unnormalized target $\mathcal{S}.q(\cdot, x)$, and so by the inductive hypothesis, $\mathbb{E}[w] = \mathcal{S}.q(x)$ (the normalizing constant). On line 5, the returned weight has expectation $\mathbb{E}_{x \sim \pi}\left[\frac{w}{\tilde{\pi}(x)}\right]=\frac{1}{Z} \int \pi(x) \cdot \frac{\mathcal{S}.q(x)}{\pi(x)} \text{d}x = \frac{1}{Z}$, where the last equality again follows because $\mathcal{S}.q$ is a normalized density, and $\mathcal{S}.q$ is absolutely continuous with respect to $\pi$.
% \end{itemize}

% % As for \texttt{HME}, for $w$ generated on L4, we have $$\mathbb{E}[w] = \mathbb{E}_{r \sim \mathcal{S}.\mathcal{M}(x).q}\left[\mathbb{E}[w \mid r]\right] = \mathbb{E}_{r \sim \mathcal{S}.\mathcal{M}(x).q}\left[\mathcal{S}.q(x) \frac{\mathcal{S}.q(r \mid x)}{\mathcal{S}.\mathcal{M}.q(r)}\right] = \mathcal{S}.q(x),$$ where the second step follows from the inductive hypothesis, and the third by the fact that $\mathcal{S}.q(r \mid x)$ is a normalized probability density. The return value 
% % of $\texttt{HME}$ is $\frac{w}{\tilde{\pi}(x)}$, and 
% % $$\mathbb{E}_{x \sim \pi(x)}\left[\frac{w}{\tilde{\pi}(x)}\right]$$

% % \textbf{Lemma.} Let $\tilde{\pi}(x) = Z\pi(x)$ be an unnormalized target density and $q(x)$ a proposal distribution, with $\pi(x)$ and $q(x)$ mutually absolutely continuous. Then $\mathbb{E}_{x \sim \pi}[\frac{q(x)}{\tilde{\pi}(x)}] = \frac{1}{Z}$ and $\mathbb{E}_{x\sim q}[\frac{\tilde{\pi}(x)}{q(x)}] = Z$.

% % \textbf{Theorem 1.} \textit{
% %     Let $\tilde{\pi}(x) = Z\pi(x)$ be an unnormalized target density, and $\mathcal{S}$ an inference strategy targeting $\pi(x)$. Then Algorithm 1 implements a properly weighted sampler for $\pi$, and Algorithm 2 transforms samples $x \sim \pi$ into unbiased estimates of $\frac{1}{Z}$.
% % }

% % \textbf{Theorem 1.} \textit{Given an unnormalized density $\tilde{\pi}(x) = Z\pi(x)$ and an inference strategy $\mathcal{S}$ targeting $\pi(x)$, Algorithm 1 generates $x \sim \mathcal{S}.q$ and a weight $\hat{Z}$ such that $\mathbb{E}[\hat{Z} \mid x] = Z \frac{\pi(x)}{\mathcal{S}.q(x)}$. If additionally given a sample $x \sim \pi$, Algorithm 2 generates a weight $\check{Z}$ such that $\mathbb{E}[\frac{1}{\check{Z}}] = \frac{1}{Z}$.}

% % \textbf{Proof.} The proof is by induction on the level of nesting present in the inference strategy $\mathcal{S}$. First consider the case where $\mathcal{S}.q$ has a tractable density and no $\mathcal{S}.\mathcal{M}$ is needed. Then:

% % \begin{itemize}
% %     \item Algorithm 1 generates $x \sim \mathcal{S}.q$ in its first step, and returns $\frac{\tilde{\pi}(x)}{\mathcal{S}.q(x)} = Z\frac{\pi(x)}{\mathcal{S}.q(x)}$ exactly.
% %     \item Algorithm 2 returns $\frac{\mathcal{S}.q(x)}{\tilde{\pi}(x)} = \frac{1}{Z} \frac{\mathcal{S}.q(x)}{\pi(x)}$. In expectation over $x \sim \pi$, we have $\mathbb{E}_{x \sim \pi}[\frac{1}{Z} \frac{\mathcal{S}.q(x)}{\pi(x)}] = \frac{1}{Z} \int \mathcal{S}.q(x) \text{d}x = \frac{1}{Z}.$
% % \end{itemize}

% % Now assume the theorem holds for the strategy $\mathcal{S}.\mathcal{M}(x)$, for all $x$, and consider $\mathcal{S}$ itself. Algorithm 1 samples $(r, x) \sim \mathcal{S}.q$, so $r$ is distributed as $\mathcal{S}.q(r \mid x)$. By the inductive hypothesis, the call to Algorithm 2 yields a weight $w$ with $\mathbb{E}[w] = \frac{1}{\mathcal{S}.q(x)}$. 


% % \textbf{Theorem 1.} \textit{If $p_0$ is a model and $p_1, \dots, p_n$ encodes an inference strategy, then Alg. 1 is properly weighted for $p_0(x_1 \mid x_0)$. Alg. 2 generates $x_0 \sim p_0$ and a weight $\frac{1}{\check{Z}}$ such that $\mathbb{E}[\frac{1}{\check{Z}} \mid x_0] = \frac{1}{p_0(x_0)}$.}


% % \textbf{Proof.} The proof is by induction on the length of the inference strategy. 

% % First consider $n = 0$. In this case Algorithm 2 terminates on its first step, producing $x_0 \sim p_0$ and returning $\frac{1}{p_0(x_0)}$ exactly. For Algorithm 1, as there is no latent variable $x_1$, we view $x_1$ as deterministically equal to a null value $\emptyset$. In this case, Algorithm 1 returns $(x_1 = \emptyset, w = p_0(x_0))$ on its first step, and $\mathbb{E}[w f(x_1)] = p_0(x_0) \cdot f(\emptyset)$ as required.

% % % Algorithm 1 then reduces to ordinary importance sampling, with weight $\frac{p(x, y)}{q_0(x; y)}$. The usual argument shows that $p(y)$ is the expected weight:
% % % $$\mathbb{E}\left[\frac{p(x,y)}{q_0(x; y)}\right] = \int q_0(x; y) \frac{p(x,y)}{q_0(x; y)} dx = \int p(x, y) dx = p(y).$$

% % Now consider the inductive step. Assume the theorem proven for inference strategies of length $n < k$ and consider an inference strategy of length $k$. 

% % We first consider Algorithm 2. The variable $x_0$ is generated from $p_0$ in Step 2, so we have directly that $x_0 \sim p_0$. By induction, the call to Algorithm 1 generates a weight $w'$ whose expectation (given $x_0$ and $x_1$) is $p_1(x_1; x_0)$. Then the conditional expectation of $w = \frac{w'}{p_0(x_1, x_0)}$, given $x_0$, is:
% % $$\mathbb{E}_{x_1, \dots, x_k}[w \mid x_0] = \mathbb{E}_{x_1}\left[\frac{\mathbb{E}_{x_2, \dots, x_k}\left[w' \mid x_1, x_0\right]}{p_0(x_1, x_0)} \mathrel{\Big|} x_0 \right] = \mathbb{E}_{x_1}\left[\frac{p_1(x_1; x_0)}{p_0(x_1, x_0)} \mathrel{\Big|} x_0 \right].$$
% % This last expression can be viewed as an ordinary importance sampling estimator, with proposal distribution $p_0(x_1 \mid x_0)$ and target $\frac{p_1(x_1; x_0)}{p_0(x_0)}$:
% % $$\mathbb{E}_{x_1}\left[\frac{p_1(x_1; x_0)}{p_0(x_1, x_0)} \mathrel{\Big|} x_0 \right] = \int p_0(x_1 \mid x_0) \cdot \frac{p_1(x_1; x_0)}{p_0(x_0)p_0(x_1\mid x_0)} dx_1 = \int \frac{p_1(x_1; x_0)}{p_0(x_0)} dx_1 = \frac{1}{p_0(x_0)}.$$
% % The last equations use the absolute continuity assumption: the integral is only over the support of $p_0(x_1 \mid x_0)$, and so for $\int p_1(x_1; x_0) dx_1$ to equal 1, we must have that $p_1(x_1; x_0) \ll p_0(x_1 \mid x_0)$.

% % We now turn to Algorithm 1. We have, for any measurable test function $f$:
% % \begin{align*}
% %     \mathbb{E}\left[w \cdot p_0(x_1, x_0) \cdot f(x_1) \right] &= \mathbb{E}_{x_1 \sim p_1(x_1; x_0)} \left[\mathbb{E}\left[w \mathrel{\Big|} x_1\right] \cdot p_0(x_1, x_0) \cdot f(x_1) \right]\\ 
% %     &=
% %     \mathbb{E}_{x_1 \sim p_1(x_1; x_0)}\left[\frac{p_0(x_1, x_0)}{p_1(x_1; x_0)} \cdot f(x_1)\right] \\ 
% %     &= p_0(x_0) \cdot \mathbb{E}_{x_1 \sim p_0(x_1 \mid x_0)}[f(x_1)],
% % \end{align*}
% % where the second equation holds due to the proof for Algorithm 2 and the third is the ordinary importance sampling argument.
% % \\

% \subsection{Proof of Theorem 2}


% \begin{lemma}
%     For an inference strategy $\mathcal{S}$ targeting $p(x \mid y)$, if $\mathcal{S}.q(x) = \int \mathcal{S}.q(r, x)\text{d}r$ has an intractable marginal density, then:
%     \[
%     \LL(p, y, \mathcal{S}) = \E_{x \sim \mathcal{S}.q} [\log p(x, y) - \UU(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))]
%     \]
%     and
%     \[
%     \UU(p, y, \mathcal{S}) = \E_{x \sim p( \cdot | y) } [ \log p(x, y) - \LL(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x)) ]
%     \]
% \end{lemma}

% \textbf{Proof.} For the first conclusion,

% \begin{align}
% \LL(p, y, \mathcal{S})
%     &= \E[\log \hat{Z}(p(\cdot, y), \mathcal{S})]\\
%     &= \E\left[\log \frac{p(x, y)}{\check{Z}(\mathcal{S}.q(\cdot, x), \mathcal{S}.\mathcal{M}(x))}\right]\\
%     &= \E_{x \sim \mathcal{S}.q}[\E[\log p(x, y) - \log \check{Z}(\mathcal{S}.q(\cdot, x), \mathcal{S}.\mathcal{M}(x)) \mid x]]\\
%     &= \E_{x \sim \mathcal{S}.q}[\log p(x, y) - \mathbb{E}[\log \check{Z}(\mathcal{S}.q(\cdot, x), \mathcal{S}.\mathcal{M}(x)) \mid x]]\\
%     &= \E_{x \sim \mathcal{S}.q} [\log p(x, y) - \UU(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x)) ]
% \end{align}
% The same approach, but with $\E [ \log \check{Z} ]$, can be used to prove the other conclusion.

% \noindent\textbf{Theorem 2.} \textit{
% Given a model $p_\theta(x, y)$ and an inference strategy $\mathcal{S}_\theta$ targeting $p_\theta(x \mid y)$,
% %Let $p_0, \dots, p_n$ be a
% %valid model strategy and a valid inference strategy.
% Alg. 3 yields unbiased estimates of $\LL(p, y, \mathcal{S})$ and of $\grad \LL(p, y, \mathcal{S})$.
% Furthermore, when $(x, y) \sim p_\theta$, Alg. 4 yields
% (i) $\hat{U}$ such that
% $\E [ \hat{U} \mid y ] = \UU(p, y, \mathcal{S})$,
% (ii) $\widehat{\grad}$ such that
% $\E [ \widehat{\grad} ] = \grad \mathbb{E}_{y \sim p_\theta}[\UU(p, y, \mathcal{S})]$,
% and (iii) a value $\mathbf{g}$ such that for any function $R$ that does not depend on $\theta$,
% $\E [ \mathbf{g} \cdot R(y) ] = \grad \E_{y \sim p_\theta} [ R(y) ]$  if
% $\grad \E_{y \sim p_\theta} [ R(y) ]$ is defined.}

% \textbf{Proof.} The proof is by induction on the level of nesting present in the inference strategy.

% First consider inference strategies $\mathcal{S}$ with tractable proposals $\mathcal{S}.q(x)$. In this case $\texttt{ELBO}\nabla$ generates $x \sim \mathcal{S}.q$ and returns $\hat{L} = \log p(x, y) - \log \mathcal{S}.q(x)$ and $\widehat{\grad} = \grad (\log p(x, y) - \log \mathcal{S}.q(x)) + (\grad \log \mathcal{S}.q(x))(\log p(x, y) - \log \mathcal{S}.q(x))$. Clearly, $\E_{x \sim \mathcal{S}.q}[\hat{L}] = \E[\log \hat{Z}(p(\cdot, y), \mathcal{S})] = \LL(p, y, \mathcal{S})$. And by the log-derivative trick, $\E_{x \sim \mathcal{S}.q}[\widehat{\grad}] = \grad \E_{x \sim \mathcal{S}.q}[\log p(x, y) - \log \mathcal{S}.q(x)] = \grad \LL(p, y, \mathcal{S})$. When we apply $\texttt{EUBO}\nabla$ to $\mathcal{S}$ with $(x, y) \sim p$, it returns (1) $\hat{U} = \log p(x, y) - \log \mathcal{S}.q(x)$ (for which $\E[\hat{U} \mid y] = \UU(p, y, \mathcal{S})$), (2) $\widehat{\grad} = \grad (\log p(x, y) - \log \mathcal{S}.q(x)) + \grad \log p(x, y) (\log p(x, y) - \log \mathcal{S}.q(x))$ (for which, by the log-derivative trick, $\mathbb{E}[\widehat{\grad}] = \grad \E_{y \sim p}[\UU(p, y, \mathcal{S})]$), and (3) $\mathbf{g} = \grad \log p(x, y)$. This last return value satisfies the spec for $\mathbf{g}$ because if $R$ does not depend on $\theta$, then $\mathbb{E}_{(x, y) \sim p}[R(y) \cdot \grad\log p(x, y)] = \int\int p(x, y) \cdot 
% \frac{\grad p(x, y)}{p(x, y)} \cdot R(y) \text{d}x \text{d}y = \grad \int \int p(x, y) R(y) \text{d}x \text{d}y = \grad \E [R(y)]$, as required. 

% Now consider the inductive step. Assume the theorem holds for the inference strategy $\mathcal{S}.\mathcal{M}(x)$ and joint distribution $\mathcal{S}.q(r, x)$. 

% We first consider $\texttt{ELBO}\nabla$. It generates $(r, x) \sim \mathcal{S}.q$ before calling $\texttt{EUBO}\nabla$, which by induction returns $(\hat{U}, \widehat{\grad}, \mathbf{g})$ such that:

% \begin{enumerate}
%     \item $\mathbb{E}[\hat{U} \mid x] = \UU(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))$
%     \item $\E[\widehat{\grad}] = \grad\E_{x \sim \mathcal{S}.q}[\UU(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))]$
%     \item $\E[\mathbf{g} \cdot R(x)] = \grad \E_{x \sim \mathcal{S}.q}[R(x)]$ for all valid $R$.
% \end{enumerate}


% $\texttt{ELBO}\nabla$ computes its first return value, $\hat{L}$, as $\log p(x, y) - \hat{U}$, so
% \begin{align*}
%     \E[\hat{L}] &= \E[\log p(x, y) - \hat{U}]\\ 
%     &= \E_{x \sim \mathcal{S}.q}[\E[\log p(x, y) - \hat{U} \mid x]]\\
%     &= \E_{x \sim \mathcal{S}.q}[\log p(x, y) - \E[\hat{U} \mid x]]\\
%     &= \E_{x \sim \mathcal{S}.q}[\log p(x, y)-\UU(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))]\\ 
%     &= \LL(p, y, \mathcal{S}),
% \end{align*}
% where the fourth equality holds by the inductive hypothesis and the final one by Lemma 1. Its second return value is computed as $\widehat{\grad}' = \grad\log p(x, y) + \mathbf{g}\log p(x, y) - \widehat{\grad}$, and so
% \begin{align*}
%     \E [ \widehat{\grad}' ]
%         &= \E \left[ \grad \log p(x, y) + \mathbf{g} \cdot \log p(x, y) - \widehat{\grad} \right]\\
%         &= \E \left[ \grad \log p(x, y) \right]
%             + \grad \E_{x \sim \mathcal{S}.q} [ \log \boxed{p}(x, y) ]
%             - \grad \E_{x \sim \mathcal{S}.q} [ \mathcal{U}(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x)) ]\\
%         &= \grad \E \left[ \log p(x, y) \right]
%             - \grad \E_{x \sim \mathcal{S}.q} [ \mathcal{U}(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x)) ]\\
%         &= \grad \E_{x \sim \mathcal{S}.q} [ \log p(x, y) - \UU(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x)) ]\\
%         &= \grad \LL(p, y, \mathcal{S}),
% \end{align*}
% where $\boxed{p}(x, y)$ denotes the distribution $p(x, y)$ but without a dependence on $\theta$, for the purposes of differentiation with respect to $\theta$. The second equality holds by the inductive hypothesis about $\mathbf{g}$ (with $R(x) = \log \boxed{p}(x, y)$) and about $\widehat{\grad}$, and the third uses the log-derivative trick. The final equation is due to Lemma 1.

% We now turn to $\texttt{EUBO}\nabla$. 
% By induction, the call to $\texttt{ELBO}\nabla$ satisfies the theorem, and so: \begin{enumerate}
%     \item $\E [ \hat{L} ] = \LL(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))$
%     \item $\E [ \widehat{\grad} ] = \grad\LL(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))$
% \end{enumerate}

% We treat each of the return values, $(\hat{U}, \widehat{\grad}, \mathbf{g})$, in sequence. We view them as random variables, accounting for stochasticity in the algorithm as well as the inputs $(x, y)$, which are assumed in the theorem's statement to be jointly distributed according to $p$.

% First, $\hat{U}$ is computed as $\log p(x, y) - \hat{L}$, and so 
% \begin{align*}
% \E [ \hat{U} | y ] 
%     &= \E_{x \sim p(\cdot | y)} [ \E [ \log p(x, y) - \hat{L} | x, y ] ]\\
%     &= \E_{x \sim p(\cdot | y) } [ \log p(x, y) - \LL(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x)) ]\\
%     &= \UU(p, y, \mathcal{S}).
% \end{align*}
% Next, $\E[ \widehat{\grad}']$:
% \begin{align*}
%     \E[\widehat{\grad}']
%     &= \E_{x, y \sim p} \left[
%             \E \left[
%                 \grad \log p(x, y)
%                 + (\grad \log p(x, y)) \cdot \hat{U}
%                 - \widehat{\grad} | x, y
%             \right]
%         \right]\\
%     &= \E_{x, y \sim p} \left[
%             \grad \log p(x, y) +
%             (\grad \log p(x, y))
%                 \cdot \E \left[ \hat{U} | x, y \right]
%             - \E \left[\widehat{\grad} | x, y \right]
%         \right]\\
%     &= \E_{x, y \sim p} \left[
%         \grad \log p(x, y) +
%         (\grad \log p(x, y))
%             \cdot \E \left[ \log p(x, y) - \hat{L} \mid x, y \right]
%         - \grad \LL(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))
%     \right]\\
%     &= \E_{x, y \sim p} \left[
%         \grad \log p(x, y) 
%         - \grad \LL(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))
%         + (\grad \log p(x, y))
%             \cdot (\log p(x, y) - \LL(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x)))
%     \right]\\
%     &= \grad \E_{x, y \sim p} \left[
%             \log p(x, y) - \LL(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))
%         \right]\\        
%     &= \grad \E_{y \sim p} [ \E_{x \sim p(\cdot | y) } [ \log p(x, y) - \LL(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x)) ] ]\\
%     &= \grad \E_{y \sim p} [ \UU(p, y, \mathcal{S}) ].    
% \end{align*}
% Finally, we consider $\E_{y \sim p} [ \E [ \mathbf{g} \cdot R(y) | y ] ]$ (and recall that $R(y)$ is not to be treated as a function of $\theta$):
% \begin{align*}
% \E_{y \sim p} [ \E [ \mathbf{g} \cdot R(y) | y ] ]
%     &= \E_{y \sim p} \left[ \E [ (\grad \log p(x, y)) \cdot R(y) | y ] \right]\\
%     &= \E_{x, y \sim p} \left[ (\grad \log p(x, y)) \cdot R(y) \right]\\
%     &= \grad \E_{x, y \sim p} \left[ R(y) \right]\\
%     &= \grad \E_{y \sim p} [ R(y) ].
% \end{align*}


% \subsection{Proof of Theorem 3}

% \noindent\textbf{Theorem 3.} {\it Consider an unnormalized target distribution $\tilde{\pi}(x) = Z\pi(x)$ and an inference strategy $\mathcal{S}$ targeting $\pi(x)$. Then the relative variances of the estimators $\hat{Z}(\tilde{\pi}, \mathcal{S})$ and $\check{Z}(\tilde{\pi}, \mathcal{S})$ are given by the following recursive equations:}
% \begin{align*}
% \text{Var}_{\hat{Z}}&(\pi, \mathcal{S}) = \chi^2(\pi || \mathcal{S}.q) \, +\\
%  & \mathbb{E}_{x \sim \mathcal{S}.q}\left[\left(\frac{\pi(x)^2}{\mathcal{S}.q(x)^2}\right) \cdot \text{Var}_{\check{Z}}(\mathcal{S}.q(\cdot \mid x), \mathcal{S}.\mathcal{M}(x))\right]\\
% \text{Var}_{\check{Z}}&(\pi, \mathcal{S}) = \chi^2(\mathcal{S}.q || \pi) + \\
% & \mathbb{E}_{x \sim \pi}\left[\left(\frac{\mathcal{S}.q(x)^2}{\pi(x)^2}\right) \cdot \text{Var}_{\hat{Z}}(\mathcal{S}.q(\cdot \mid x), \mathcal{S}.\mathcal{M}(x))\right]
% %
% %\text{Var}\left(\frac{\hat{Z}}{Z}\right) = \sum_{i=0}^{n-1} \mathbb{E}%_{x_0, \dots, x_{i-1}}
% %\Big[%
% %\frac{\hat{Z}_i^2}{Z^2} \cdot \chi^2(&p_{2\lceil \frac{i}{2}\rceil}(x_{i+1} \mid x_{i}, \dots, x_0) \,\, ||\\
% %&p_{2\lfloor\frac{i}{2}\rfloor + 1}(x_{i+1} \mid x_i, \dots, x_0))
% %w_i^2 \cdot d_i%
% %\Big],
% \end{align*}
% {\it When $\mathcal{S}.q$ is tractable, the second term of each sum is 0.}

% \textbf{Proof.} The proof is by induction on the level of nesting present in the inference strategy $\mathcal{S}$. 

% First suppose $\mathcal{S}.q$ has a tractable marginal density. Then:

% \begin{itemize}
%     \item $\hat{Z}(\pi, \mathcal{S})$ is the normalized importance weight 
%     $\frac{\pi(x)}{\mathcal{S}.q(x)}$, with $x \sim \mathcal{S}.q$. So the relative variance is:
%     $$\text{Var}_{\hat{Z}}({\pi}, \mathcal{S}) = \text{Var}\left(\hat{Z}({\pi}, \mathcal{S})\right) = \mathbb{E}_{x \sim \mathcal{S}.q}\left[\frac{\pi(x)^2}{\mathcal{S}.q(x)^2}\right] - \mathbb{E}_{x \sim \mathcal{S}.q}\left[\frac{\pi(x)}{\mathcal{S}.q(x)}\right]^2 = \mathbb{E}_{x \sim \mathcal{S}.q}\left[\frac{\pi(x)^2}{\mathcal{S}.q(x)^2} - 1\right] = \chi^2(\pi || \mathcal{S}.q),$$
%     where the third equality holds because $\pi$ is a normalized density and $\pi$ is absolutely continuous with respect to $\mathcal{S}.q$.

%     \item $\check{Z}({\pi}, \mathcal{S})$ is the weight $\frac{\pi(x)}{\mathcal{S}.q(x)}$, with $x \sim \pi$. Then the relative variance $$\text{Var}_{\check{Z}}({\pi}, \mathcal{S}) = \text{Var}\left(\check{Z}(\pi, \mathcal{S})^{-1}\right) = \mathbb{E}_{x \sim \pi}\left[\frac{\mathcal{S}.q(x)^2}{\pi(x)^2}\right] - \mathbb{E}_{x \sim \pi}\left[\frac{\mathcal{S}.q(x)}{\pi(x)}\right]^2 = \mathbb{E}_{x \sim \pi}\left[\frac{\mathcal{S}.q(x)^2}{\pi(x)^2} - 1\right] = \chi^2(\mathcal{S}.q || \pi),$$ where the third equality holds because $\mathcal{S}.q$ is a normalized density and is absolutely continuous with respect to $\pi$.
% \end{itemize}
% Now consider the inductive step. Assume that, for all $x$, the theorem holds for the strategy $\mathcal{S}.\mathcal{M}(x)$ targeting $\mathcal{S}.q(\cdot \mid x)$, so that the recursive equations characterize both $\text{Var}_{\hat{Z}}(\mathcal{S}.q(\cdot \mid x), \mathcal{S}.\mathcal{M}(x))$ and $\text{Var}_{\check{Z}}(\mathcal{S}.q(\cdot \mid x), \mathcal{S}.\mathcal{M}(x))$. Then:
% \begin{itemize}
%     \item The $\texttt{IMPORTANCE}(\pi, \mathcal{S})$ algorithm generates $x \sim \mathcal{S}.q$. It then calls \texttt{HME} (with $r \sim \mathcal{S}.q(\cdot \mid x)$) to obtain $w = \check{Z}(\mathcal{S}.q(\cdot, x), \mathcal{S}.\mathcal{M}(x))^{-1}$, and returns $\hat{Z} = w\pi(x)$. The variance of $\hat{Z}$ is then:
%     \begin{align*}
%     \text{Var}_{\hat{Z}}(\pi, \mathcal{S}) &= \text{Var}\left(\frac{\pi(x)}{\check{Z}(\mathcal{S}.q(\cdot, x), \mathcal{S}.\mathcal{M}(x))}\right)\\
%     &= \mathbb{E}\left[\left(\frac{\pi(x)}{\check{Z}(\mathcal{S}.q(\cdot, x), \mathcal{S}.\mathcal{M}(x))}\right)^2 - 1\right]
%     && (\mathbb{E}[\hat{Z}(\pi, \mathcal{S})]^2 = Z^2 = 1)\\
%     &= \mathbb{E}\left[\left(\frac{\pi(x)}{\mathcal{S}.q(x)} \cdot \frac{\mathcal{S}.q(x)}{\check{Z}(\mathcal{S}.q(\cdot, x), \mathcal{S}.\mathcal{M}(x))}\right)^2 - 1\right]
%     && \text{(divide and multiply by } \mathcal{S}.q(x))\\
%     &= \mathbb{E}\left[\left(\frac{\pi(x)}{\mathcal{S}.q(x)} \cdot \frac{1}{\check{Z}(\mathcal{S}.q(\cdot \mid x), \mathcal{S}.\mathcal{M}(x))}\right)^2 - 1\right]
%     &&(\mathcal{S}.q(x) \text{ is the normalizing constant of } \mathcal{S}.q(\cdot, x))\\
%     &= \mathbb{E}\left[\left(\frac{\pi(x)}{\mathcal{S}.q(x)}\right)^2 \left(\mathbb{E}\left[{\check{Z}(\mathcal{S}.q(\cdot \mid x), \mathcal{S}.\mathcal{M}(x))}^{-2} \bigl\vert x\right]\right) - 1\right]\\
%     &= \mathbb{E}\left[\left(\frac{\pi(x)}{\mathcal{S}.q(x)}\right)^2 \left(\text{Var}_{\check{Z}}(\mathcal{S}.q(\cdot \mid x), \mathcal{S}.\mathcal{M}(x)) + 1\right) - 1\right]
%     && \text{(definition of Var}_{\check{Z}}(\cdot, \cdot))\\
%     &= \mathbb{E}\left[\left(\frac{\pi(x)}{\mathcal{S}.q(x)}\right)^2 \left(\text{Var}_{\check{Z}}(\mathcal{S}.q(\cdot \mid x), \mathcal{S}.\mathcal{M}(x))\right) + \left(\frac{\pi(x)}{\mathcal{S}.q(x)}\right)^2 - 1\right]
%     && \text{(distributing product over sum)}\\
%     &= \mathbb{E}\left[\left(\frac{\pi(x)}{\mathcal{S}.q(x)}\right)^2 \left(\text{Var}_{\check{Z}}(\mathcal{S}.q(\cdot \mid x), \mathcal{S}.\mathcal{M}(x))\right)\right] + \chi^2(\pi || \mathcal{S}.q).\\
%     \end{align*}


%     \item The argument for $\check{Z}$ is largely the same:
%     \begin{align*}
%         \text{Var}_{\check{Z}}(\pi, \mathcal{S}) &= \text{Var}\left(\frac{\hat{Z}(\mathcal{S}.q(\cdot, x), \mathcal{S}.\mathcal{M}(x))}{\pi(x)}\right)\\
%         &= \mathbb{E}\left[\left(\frac{\hat{Z}(\mathcal{S}.q(\cdot, x), \mathcal{S}.\mathcal{M}(x))}{\pi(x)}\right)^2 - 1\right]
%         && (\mathbb{E}[\check{Z}(\pi, \mathcal{S})^{-1}]^2 = Z^{-2} = 1)\\
%         &= \mathbb{E}\left[\left(\frac{\mathcal{S}.q(x)}{\pi(x)} \cdot \frac{\hat{Z}(\mathcal{S}.q(\cdot, x), \mathcal{S}.\mathcal{M}(x))}{\mathcal{S}.q(x)}\right)^2 - 1\right]
%         && \text{(divide and multiply by } \mathcal{S}.q(x))\\
%         &= \mathbb{E}\left[\left(\frac{\mathcal{S}.q(x)}{\pi(x)} \cdot {\hat{Z}(\mathcal{S}.q(\cdot \mid x), \mathcal{S}.\mathcal{M}(x))}\right)^2 - 1\right]
%         &&(\mathcal{S}.q(x) \text{ is the normalizing constant of } \mathcal{S}.q(\cdot, x))\\
%         &= \mathbb{E}\left[\left(\frac{\mathcal{S}.q(x)}{\pi(x)}\right)^2 \left(\mathbb{E}\left[{\hat{Z}(\mathcal{S}.q(\cdot \mid x), \mathcal{S}.\mathcal{M}(x))}^{2} \bigl\vert x\right]\right) - 1\right]\\
%         &= \mathbb{E}\left[\left(\frac{\mathcal{S}.q(x)}{\pi(x)}\right)^2 \left(\text{Var}_{\hat{Z}}(\mathcal{S}.q(\cdot \mid x), \mathcal{S}.\mathcal{M}(x)) + 1\right) - 1\right]
%         && \text{(definition of Var}_{\hat{Z}}(\cdot, \cdot))\\
%         &= \mathbb{E}\left[\left(\frac{\mathcal{S}.q(x)}{\pi(x)}\right)^2 \left(\text{Var}_{\hat{Z}}(\mathcal{S}.q(\cdot \mid x), \mathcal{S}.\mathcal{M}(x))\right) + \left(\frac{\mathcal{S}.q(x)}{\pi(x)}\right)^2 - 1\right]
%         && \text{(distributing product over sum)}\\
%         &= \mathbb{E}\left[\left(\frac{\mathcal{S}.q(x)}{\pi(x)}\right)^2 \left(\text{Var}_{\hat{Z}}(\mathcal{S}.q(\cdot \mid x), \mathcal{S}.\mathcal{M}(x))\right)\right] + \chi^2(\mathcal{S}.q || \pi).\\
%     \end{align*}
% \end{itemize}



% % \noindent\textbf{Theorem 3.} {\it Let $p_0, \dots, p_n$ be an inference strategy. As above, let $Z = p_0(x_0)$ for some fixed observation $x_0$, and let $\hat{Z}$ denote the recursive estimator for $Z$ corresponding to the inference strategy $p_0, \dots, p_n$. Further, let $\hat{Z}_i$ be the estimator corresponding to the inference strategy $p_0, \dots, p_i$ (which will typically be intractable to compute, as it requires a marginal density for $p_i(x_i)$). Then $\frac{\hat{Z}}{Z}$ has variance}
% % $$\text{Var}\left(\frac{\hat{Z}}{Z}\right) = \sum_{i=0}^{n-1} \mathbb{E}%_{x_0, \dots, x_{i-1}}
% % \left[%
% % \frac{\hat{Z}_i^2}{Z^2} \cdot \chi^2(p_{2\lceil \frac{i}{2}\rceil}(x_{i+1} \mid x_{i}, \dots, x_0) \,\, ||\,\, p_{2\lfloor\frac{i}{2}\rfloor + 1}(x_{i+1} \mid x_i, \dots, x_0))
% % %w_i^2 \cdot d_i%
% % \right],$$
% % {\it where the expectation is taken over $x_1, \dots, x_i$.}


% % \textbf{Proof.} The proof is by induction on the length of the inference strategy.

% % When $n = 0$, $\frac{\hat{Z}}{Z} = 1$ deterministically, and so has variance $\sum_{i=0}^{-1} \left(\dots\right) = 0$, as desired. 

% % % When $n = 1$, then 

% % % \begin{align*}
% % % \text{Var}\left(\frac{\hat{Z}}{Z}\right) &= \mathbb{E}\left[\frac{\hat{Z}^2}{Z^2}\right] - \mathbb{E}\left[\frac{\hat{Z}}{Z}\right]^2 \\
% % % &= \mathbb{E}\left[\frac{\hat{Z}^2}{Z^2}\right] - 1 \\
% % % &= \mathbb{E}_{x_1 \sim p_1(\cdot; x_0)}\left[\frac{p_0(x_1, x_0)^2}{p_1(x_1; x_0)^2 p_0(x_0)^2}\right] - 1 \\
% % % &= \mathbb{E}_{x_1 \sim p_1(\cdot; x_0)}\left[\frac{p_0(x_1 \mid x_0)^2}{p_1(x_1; x_0)^2}\right] - 1 \\
% % % &= \chi^2(p_0(x_1 \mid x_0) \, || \, p_1(x_1; x_0)).
% % % \end{align*}

% % Now assume the theorem proven for $n = k-1$ and consider $n = k$.
% % Note that $\hat{Z} = \hat{Z}_k = \hat{Z}_{k-1} \cdot \frac{p_{2\lfloor \frac{k}{2}\rfloor}(x_k \mid x_{0:k-1})}{p_{2\lceil \frac{k}{2}\rceil - 1}(x_k \mid x_{0:k-1})}$. Therefore, we have:

% % \begin{align*}
% %     \text{Var}\left(\frac{\hat{Z}}{Z}\right) &= \mathbb{E}\left[\frac{\hat{Z}^2}{Z^2}\right] - \mathbb{E}\left[\frac{\hat{Z}}{Z}\right]^2 \\
% %     &= \mathbb{E}\left[\frac{\hat{Z}^2}{Z^2} - 1\right] \\
% %     &= \mathbb{E}\left[\frac{\hat{Z}_{k-1}^2}{Z^2} \left(\frac{p_{2\lfloor \frac{k}{2}\rfloor}(x_k \mid x_{0:k-1})^2}{p_{2\lceil \frac{k}{2}\rceil - 1}(x_k \mid x_{0:k-1})^2}\right) - 1\right] \\
% %     &= \mathbb{E}\left[\frac{\hat{Z}_{k-1}^2}{Z^2} \left(\mathbb{E}_{x_k \sim p_{2\lceil \frac{k}{2}\rceil - 1}}\left[\frac{p_{2\lfloor \frac{k}{2}\rfloor}(x_k \mid x_{0:k-1})^2}{p_{2\lceil \frac{k}{2}\rceil - 1}(x_k \mid x_{0:k-1})^2} \, \Big| x_{0:k-1}\right]\right) - 1\right] \\
% %     &= \mathbb{E}\left[\frac{\hat{Z}_{k-1}^2}{Z^2} \left(\chi^2({p_{2\lfloor \frac{k}{2}\rfloor}(x_k \mid x_{0:k-1})} \, ||\, {p_{2\lceil \frac{k}{2}\rceil - 1}(x_k \mid x_{0:k-1})}) + 1\right) - 1\right] \\
% %     &= \mathbb{E}\left[\frac{\hat{Z}_{k-1}^2}{Z^2} \left(\chi^2({p_{2\lfloor \frac{k}{2}\rfloor}(x_k \mid x_{0:k-1})} \, ||\, {p_{2\lceil \frac{k}{2}\rceil - 1}(x_k \mid x_{0:k-1})})\right) + \frac{\hat{Z}_{k-1}^2}{Z^2} - 1\right] \\
% %     &= \mathbb{E}\left[\frac{\hat{Z}_{k-1}^2}{Z^2} \left(\chi^2({p_{2\lfloor \frac{k}{2}\rfloor}(x_k \mid x_{0:k-1})} \, ||\, {p_{2\lceil \frac{k}{2}\rceil - 1}(x_k \mid x_{0:k-1})})\right)\right] + \text{Var}\left(\frac{\hat{Z}_{k-1}}{Z}\right) \\
% %     &= \sum_{i=0}^{k-1} \mathbb{E}%_{x_0, \dots, x_{i-1}}
% % \left[%
% % \frac{\hat{Z}_i^2}{Z^2} \cdot \chi^2(p_{2\lceil \frac{i}{2}\rceil}(x_{i+1} \mid x_{i}, \dots, x_0) \,\, ||\,\, p_{2\lfloor\frac{i}{2}\rfloor + 1}(x_{i+1} \mid x_i, \dots, x_0))
% % %w_i^2 \cdot d_i%
% % \right],
% % \end{align*}

% % where the last step uses the inductive hypothesis to rewrite $\text{Var}(\frac{\hat{Z}_{k-1}}{Z})$ as a sum from $i = 0$ to $k-2$.



% \subsection{Proof of Theorem 4}

% \noindent\textbf{Theorem 4.}
% {\it Consider a joint distribution $p(x, y)$ and an inference strategy $\mathcal{S}$ targeting $p(x \mid y)$. Then the following equations give the bias of ${\hat{\LL}}$ and ${\hat{\UU}}$ as estimators of $\log p(y)$:}
% \begin{align*}
% \text{Bias}_\mathcal{L}(p, y, \mathcal{S}) =&\, -\text{KL}(\mathcal{S}.q || p(\cdot \mid y)) \\ 
% &-\mathbb{E}_{x \sim \mathcal{S}.q}[\text{Bias}_\mathcal{U}(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))]\\
% \text{Bias}_\mathcal{U}(p, y, \mathcal{S}) =&\, \text{KL}(p(\cdot \mid y) || \mathcal{S}.q)\\ 
% &\,\,\,-\mathbb{E}_{x \sim p(\cdot \mid y)}[\text{Bias}_\mathcal{L}(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))]
% \end{align*}
% {\it where the second term in each equation is 0 when $\mathcal{S}.q$ has a tractable marginal density.}

% \textbf{Proof.}
% % Unfolding the definition of $\text{Bias}_\mathcal{L}$ and $\text{Bias}_\mathcal{U}$, and applying Theorem 2, we see that the theorem holds if and only if:

% % $$\LL(p, y, \mathcal{S}) = \log p(y) - KL(\mathcal{S}.q || p(\cdot \mid y)) - \mathbb{E}_{x \sim \mathcal{S}.q}[\text{Bias}_\mathcal{U}(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))]$$
% % and $$\UU(p, y, \mathcal{S}) = \log p(y) + KL(p(\cdot \mid y) || \mathcal{S}.q) - \mathbb{E}_{x \sim p(\cdot \mid y)}[\text{Bias}_\mathcal{L}(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))].$$

% In the base case, where $\mathcal{S}.q$ has a tractable marginal density, the theorem states that $\log p(y) - \LL(p, y, \mathcal{S}) = KL(\mathcal{S}.q || p(\cdot \mid y))$, the familiar relationship between the standard ELBO and the KL divergence. The $\UU$ case is similar:
% \begin{align*}
%     \text{Bias}_\UU(p, y, \mathcal{S}) 
%     &= \E_{x \sim p(\cdot \mid y)}[\log p(x, y) - \log \mathcal{S}.q(x)] - \log p(y)\\
%     &= \log p(y) + \E_{x \sim p(\cdot \mid y)}[\log p(x \mid y) - \log \mathcal{S}.q(x)] - \log p(y)\\ 
%     &= KL(p(\cdot \mid y) || \mathcal{S}.q).
% \end{align*}

% Now consider the inductive step, in which $\mathcal{S}.q$ does not have a tractable marginal density. We assume the theorem holds for $\mathcal{S}.q$ and $\mathcal{S}.\mathcal{M}(x)$. Then: 
% \begin{align*}
%     \text{Bias}_\LL(p, y, \mathcal{S}) 
%     &= \LL(p, y, \mathcal{S}) - \log p(y)\\
%     &= \E_{x \sim \mathcal{S}.q}[\log p(x, y) - \UU(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))] - \log p(y)\\
%     &= \log p(y) + \E_{x \sim \mathcal{S}.q}[\log p(x \mid y) - \UU(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))] - \log p(y)\\
%     &= \E_{x \sim \mathcal{S}.q}[\log p(x \mid y) - \UU(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))]\\
%     &= \E_{x \sim \mathcal{S}.q}[\log p(x \mid y) - \log \mathcal{S}.q(x) + \log \mathcal{S}.q(x) - \UU(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))]\\
%     &= -KL(\mathcal{S}.q || p(\cdot \mid y)) + \E_{x \sim \mathcal{S}.q}[\log \mathcal{S}.q(x) - \UU(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))]\\
%     &= -KL(\mathcal{S}.q || p(\cdot \mid y)) - \E_{x \sim \mathcal{S}.q}[\text{Bias}_\UU(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))].
% \end{align*}

% Nearly the same proof applies for $\UU$, flipping the necessary signs:
% \begin{align*}
%     \text{Bias}_\UU(p, y, \mathcal{S}) 
%     &= \UU(p, y, \mathcal{S}) - \log p(y)\\
%     &= \E_{x \sim p(\cdot \mid y)}[\log p(x, y) - \LL(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))] - \log p(y)\\
%     &= \log p(y) + \E_{x \sim p(\cdot \mid y)}[\log p(x \mid y) - \LL(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))] - \log p(y)\\
%     &= \E_{x \sim p(\cdot \mid y)}[\log p(x \mid y) - \LL(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))]\\
%     &= \E_{x \sim p(\cdot \mid y)}[\log p(x \mid y) - \log \mathcal{S}.q(x) + \log \mathcal{S}.q(x) - \LL(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))]\\
%     &= KL(p(\cdot \mid y) || \mathcal{S}.q) + \E_{x \sim p(\cdot \mid y)}[\log \mathcal{S}.q(x) - \LL(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))]\\
%     &= KL(p(\cdot \mid y) || \mathcal{S}.q) - \E_{x \sim p(\cdot \mid y)}[\text{Bias}_\LL(\mathcal{S}.q, x, \mathcal{S}.\mathcal{M}(x))].
% \end{align*}

% \subsection{Stationarity of MCMC algorithm}
% \label{sec:ravi-mcmc}
% In Section~\ref{sec:inference-algs}, we mention that RAVI can be used to run Metropolis-Hastings kernels 
% with proposals that have intractable densities. 
% Here, we present and justify the algorithm.

% Let $\tilde{\pi}(x) = \int \tilde{\pi}(r, x) \text{d}r = Z \int \pi(r, x) \text{d}r$ be a possibly unnormalized target density, and let $q(x'; x) = \int q(s, x'; x) \text{d}s$ be a proposal kernel mapping previous state $x$ to new state $x'$. We note that (1) both $\tilde{\pi}$ and $q$ have intractable marginal densities, and (2) the target marginal $\tilde{\pi}(x)$ itself may be unnormalized. As is typical in pseudomarginal MCMC, even this unnormalized target density cannot be evaluated pointwise, due to the additional nuisance variables $r$.

% Now suppose we have a family of inference strategies $\mathcal{S}(x)$ targeting $\pi(r \mid x)$, and a family of inference strategies $\mathcal{M}(x, x')$ targeting $q(s \mid x'; x)$. Let $x$ be a starting position for our Markov chain. We can run Algorithm~\hyperref[alg:alg1]{1} on $\mathcal{S}$, targeting $\pi(r \mid x)$, to obtain an initial estimate $\hat{Z}_x$ of the unnormalized marginal density $\tilde{\pi}(x)$. Then Algorithm~\hyperref[alg:mh]{5} defines a stationary MCMC kernel for the target distribution $\pi(x)$, starting at input point $x$:

% \begin{wrapfigure}{L}{0.5\textwidth}
%     \begin{algorithm}[H]
%     \SetAlgoLined\DontPrintSemicolon
%     \label{alg:mh}
%     \footnotesize{
%     \textbf{Algorithm 5:} RAVI Metropolis-Hastings\;
%     \KwIn{model $\tilde{\pi}(x) = Z \int \pi(r, x) \text{d}r$}
%     \KwIn{proposal $q(x'; x) = \int q(s, x'; x) \text{d}s$}
%     \KwIn{family $\mathcal{S}(x)$ of inference strategies targeting $\pi(r \mid x)$}
%     \KwIn{family $\mathcal{M}(x, x')$ of inference strategies targeting $q(s \mid x'; x)$}
%     \KwIn{initial position $x$ and estimate $\hat{Z}_{x}$ of $\tilde{\pi}(x)$}
%     \KwOut{next position $x'$ and estimate $\hat{Z}_{x'}$ of $\tilde{\pi}(x')$}
%     \nl $(s, x') \sim q(s, x'; x)$\;
%     \nl $w_{x'} \gets {\texttt{HME}}(q(\cdot, x'; x), s, \mathcal{M}(x, x'))$\;
%     \nl $(\_, w_x) \gets \texttt{IMPORTANCE}(q(\cdot \mid x; x'), \mathcal{M}(x', x))$\;
%     \nl $(\_, \hat{Z}_{x'}) \gets \texttt{IMPORTANCE}(\pi(\cdot \mid x'), \mathcal{S}(x'))$\;
%     \nl $u \sim \text{Uniform}(0, 1)$\;
%     \nl \If{$u < \text{min}(1, \frac{\hat{Z}_{x'}}{\hat{Z}_x}w_{x'}w_x)$}{
%         \nl \Return{$(x', \hat{Z}_{x'})$}\;
%     }
%     \nl \Else{
%         \nl \Return{$(x, \hat{Z}_x)$}\;
%     }
%     }
%     \end{algorithm}
% \end{wrapfigure}

% % \begin{enumerate}
% %     \item Run Algorithm 2 on the distribution $q(s, x'; x)$ and the family of strategies $\mathcal{M}(x, \cdot)$, to generate $(x', w_{x'})$.
    
% %     \item Run Algorithm 1 on the strategy $\mathcal{M}(x', x)$, targeting $q(s \mid x; x')$, to generate $(\_, w_x)$.
    
% %     \item Run Algorithm 1 on the strategy $\mathcal{S}$ targeting $\pi(r \mid x')$, to generate $(\_, \hat{Z}_{x'})$.
    
% %     \item With probability $\min(1, \frac{\hat{Z}_{x'}}{\hat{Z}_x} w_{x'} w_x)$, accept $x'$ as the next point in the chain, with estimated target density $\hat{Z}_{x'}$. Else, reject $x'$ and return $x$.
% % \end{enumerate}
% When $q$'s marginal density is known exactly, the above algorithm recovers variants of Particle-Marginal MH~\citep{andrieu2010particle}, except instead of using SMC to marginalize $r$, any RAVI algorithm can be applied. When $q$'s marginal density is unavailable, however, the algorithm instead becomes a pseudo-marginal \textit{ratio} algorithm~\citep{andrieu2018utility}, because not just $p$ but also $q$ is estimated unbiasedly. In general, it is \textit{not} valid to use arbitrary unbiased estimates of $p$ \textit{and} $q$, or even of $\alpha = \frac{p(x')q(x; x')}{p(x)q(x'; x)}$, within an MH algorithm. However, the added structure of the RAVI strategy ensures that the above procedure is sound.

% To see why our MCMC kernel is stationary, we consider an extended target distribution. First, some notation. For an inference strategy $\mathcal{S}$ targeting $\pi(x)$, write $v_\mathcal{S}$ for the complete set of auxiliary variables in the strategy: if $\mathcal{S}.q$ has a tractable marginal density, then $v_\mathcal{S} = \emptyset$, and otherwise, if $\mathcal{S}.q(x) = \int \mathcal{S}.q(r, x) \text{d}r$, then $v_\mathcal{S}$ is defined recursively as $\{r\} \cup v_{\mathcal{S}.\mathcal{M}}$. Calling $\texttt{IMPORTANCE}$ on $\mathcal{S}$ yields a joint distribution over these auxiliary variables and $x$, which we denote as $p^{\mathcal{S}}_{\texttt{IMP}}(v_\mathcal{S}, x)$. Calling $\texttt{HME}$ on $\mathcal{S}$ and a particular sample $x$ yields a distribution over just $v_\mathcal{S}$, which we denote $p^\mathcal{S}_{\texttt{HME}}(v_\mathcal{S}; x)$.
% When $x \sim \pi$ and $v_\mathcal{S} \sim p^\mathcal{S}_\texttt{HME}(v_\mathcal{S}; x)$, the ratio $\frac{p^\mathcal{S}_\texttt{IMP}(v_\mathcal{S}, x)}{\tilde{\pi}(x)p^\mathcal{S}_\texttt{HME}(v_\mathcal{S}; x)}$ is the weight $\check{Z}(\tilde{\pi}, \mathcal{S})^{-1}$ returned by \texttt{HME}, and similarly, when $(v_\mathcal{S}, x) \sim p^\mathcal{S}_\texttt{IMP}$, the ratio $\frac{\tilde{\pi}(x)p^\mathcal{S}_\texttt{HME}(v_\mathcal{S}; x)}{p^\mathcal{S}_\texttt{IMP}(v_\mathcal{S}, x)}$ is the weight $\hat{Z}(\tilde{\pi}, \mathcal{S})$ returned by $\texttt{IMPORTANCE}$.

% Using this notation, we can extend the target distribution $\tilde{\pi}(x)$ to one over $(r, x, s, x', s', v_{\mathcal{S}(x)}, v_{\mathcal{M}(x, x')}, v_{\mathcal{M}(x', x)})$ that admits $\tilde{\pi}(x)$ as a marginal:
% $$\tilde{\pi}(r, x, s, x', s', v_{\mathcal{S}(x)}, v_{\mathcal{M}(x, x')}, v_{\mathcal{M}(x', x)}) = \tilde{\pi}(r, x) \cdot p^{\mathcal{S}(x)}_\texttt{HME}(v_{\mathcal{S}(x)}; r) \cdot q(s, x'; x) \cdot p^{\mathcal{M}(x, x')}_\texttt{HME}(v_{\mathcal{M}(x, x')}; s) \cdot p^{\mathcal{M}(x', x)}_\texttt{IMP}(v_{\mathcal{M}(x', x)}, s')$$


% Our algorithm can be understood as sequencing two stationary kernels for this extended target. The first (implemented by lines 1-3) is a blocked Gibbs update on the variables $(s, x', s', v_{\mathcal{M}(x, x')}, v_{\mathcal{M}(x', x)})$, conditioned on everything else. Lines 1-3 sample exactly from the conditional distribution of these variables. The second is a Metropolis-Hastings proposal that simultaneously: (i) swaps $x$ with $x'$ (the `main' proposed update), (ii) swaps $(s, v_{\mathcal{M}(x, x')})$ with $(s', v_{\mathcal{M}(x', x)})$, and (iii) proposes an update to $r$ and to $v_{\mathcal{S}(x)}$ from $p_\texttt{IMP}^{\mathcal{S}(x')}$. The usual Metropolis-Hastings acceptance probability for this kernel, computed on the extended state space, is precisely the formula in Line 6.

% One consequence of this justification is that the \textit{same} family $\mathcal{S}$ of inference strategies for $\pi$ must be used at each iteration. The family $\mathcal{M}$ can be freely switched out (as can $q$), however, to develop a cycle of kernels that use different proposal distributions.

% % \section{Experimental Details}
% % \label{sec:experimental-details}

% % Code to reproduce all experimental results is included as part of this supplement. This section reports the details of each experiment, and instructions for reproducing our results.

% % \subsection{IWAE with $K=2$}

% % This section gives details on the experiment used to produce the results for the IWAE experiment.

% % Consider a simple generative model $p_0$, and a RAVI inference strategy whose corresponding $\hat{Z}$ is precisely the two-particle importance sampling estimator with proposal $q(x_1) = \frac{1}{2} \mathcal{N}(x_1; 0, 10) + \frac{1}{2}\mathcal{N}(x_1; 0, 0.2)$:
% % %In Appendix~\ref{sec:sir-example}, we show how $N$-particle importance sampling can be viewed as a single-particle estimator arising from Alg. 1 applied to a particular RAVI inference strategy. For now, we specialize to a simple generative model $p_0$, and the RAVI inference strategy $p_1, p_2$ corresponding to importance sampling with number of particles $N=2$ and proposal distribution $q(x_1) = \frac{1}{2} \mathcal{N}(x_1; 0, 10) + \frac{1}{2}\mathcal{N}(x_1; 0, 0.2)$:
% % \begin{flalign*}
% %     p_0(x_1, x_0) &= \mathcal{N}(x_1; 0, 10) \cdot \mathcal{N}(x_0; x_1, 0.1)&\\
% %     p_1(x_2, x_1; x_0) &= \prod_{i=1}^2 q(x_2^{(i)}) \cdot \sum_{i=1}^{2} \frac{w_i}{w_1 + w_2} \delta_{x_2^{(i)}}(x_1)\\
% % %    p_1(x_2, x_1; x_0) &= \frac{p_0(x_2^{(c)}, x_0) \cdot \delta_{x_2^{(c)}}(x_1) \cdot q(x_2^{(\overline{c})})}{p_0(x_2^{(1)}, x_0)/q(x_2^{(1)}) + p_0(x_2^{(2)}, x_0)/q(x_2^{(2)})} \\
% %     p_2(x_3, x_2; x_1, x_0) &= \text{Bern}(x_3; \frac{1}{2})\cdot q(x_2^{(x_3+1)}) \cdot \delta_{x_1}(x_2^{(2 - x_3)})\\
% %     p_3(x_3; x_2, x_1, x_0) &= \delta_{\mathbf{1}[x_1 = x_2^{(1)}]}(x_3)
% % \end{flalign*}
% % \noindent Here, $p_1$ corresponds to the `sampling/importance resampling' process of generating two particles $x_2^{(1)}$ and $x_2^{(2)}$ from $q$, computing their weights $w_i = p_0(x_2^{(i)}, x_0)/q(x_2^{(i)})$, and then based on the weights, selecting one to return as $x_1$.
% % One layer down the inference strategy, $p_2$ observes the chosen particle $x_1$, and tries to guess the values of $x_2^{(i)}$. Our $p_2$ above flips a coin $x_3$ to decide which particle ($x_2^{(1)}$ or $x_2^{(2)}$) to set equal to $x_1$, and guesses the value of the other, \textit{unchosen} particle blindly using $q$. This RAVI inference strategy recovers the usual 2-particle IS estimator $\hat{Z} = \frac{1}{2} (w_1 + w_2)$. But $p_2$ could be better: knowing the chosen particle $x_1$ reveals information about the other, unchosen particle, which $p_2$ could exploit. For example, if a particularly \textit{unlikely} value for $x_1$ was chosen by $p_1$, the other particle must also have been unlikely; otherwise, its large relative weight would have caused it to be chosen.

% % %
% % We can learn to exploit this knowledge if we replace the $q$ in $p_2$ with an $x_1$-aware distribution  
% % $h_\theta(x; x_1) = f_\theta(x_1) \cdot \mathcal{N}(x; 0, 10) + (1-f_\theta(x_1)) \cdot \mathcal{N}(x; 0, 0.2)$, where $f_\theta$ is a neural network predicting probabilities in $[0, 1]$.

% % To learn $\theta$, we used ADAM, with learning rate 1e-3, to maximize $\LL_0$. We performed 500 gradient updates, and used 10 samples to estimate each gradient.

% % \subsection{Multiple Importance Sampling}

% % This section gives model and inference details for the Multiple Importance Sampling experiments, performed on a commodity laptop using a single CPU core.

% % \textbf{Model.} We used the simple multimodal target from~\citet{elvira2019generalized}, 
% % $$\pi(x) = \frac{1}{3} \left(\mathcal{N}(x; -3, \sigma) + \mathcal{N}(x; 0, \sigma) + \mathcal{N}(x; 3, \sigma)\right).$$

% % We set $\sigma = 0.8$.

% % \textbf{Proposals.} Following~\citet{elvira2019generalized}, the proposal distributions were $g_k(x) = \int_{r=0.8}^{1.3} 2\mathcal{N}(x; \mu_k, r) dr$.

% % \textbf{Estimators.} 

% % We compared two estimators. For each of 1,000,000 trials, we generated 30 particles $(r_i, x_i)$, ten from each random-weight proposal $g_k(r, x) = \mathcal{U}(r; 0.8, 1.3) \cdot \mathcal{N}(x; \mu_k, r)$. We then computed $\hat{Z}_1^*$ and $\hat{Z}_2^*$ for each, as follows:

% % $$\hat{Z}_1^* = \frac{1}{30} \sum_{i=1}^{30} \frac{\pi(x_i)}{g_{k_i}(x_i \mid r_i)}$$
% % $$\hat{Z}_2^* = \frac{1}{30} \sum_{i=1}^{30} \frac{\pi(x_i)}{\frac{1}{3}\sum_{j=1}^3 g_j(x_i \mid r_i)}.$$

% % The first estimator is a standard multiple importance sampling estimator. We derived the second by improving meta-inference in the RAVI representation of the first. It resembles the ``deterministic mixture'' MIS estimator, except that instead of using tractable marginal densities $g_j(x_i)$ to compute the denominator, only the likelihoods $g_j(x_i \mid r_i)$ are used.


% % \subsection{Agglomerative Monte Carlo}

% % \textbf{Model.} We use a generative model adapted from~\citet{lew2021pclean}. We first define a base measure $H$ over strings, which is a character-level bigram model:

% % $$H(s) = h(s_1) \prod_{i=2}^{|s|} h(s_i \mid s_{i-1}).$$

% % The unigram frequencies $h(s_1)$ and bigram frequencies $h(s_i \mid s_{i-1})$ are included in the supplement, as \texttt{letter\_probabilities.csv} and \texttt{transition\_probabilities.csv}. 

% % We model observed strings $y_i$, for $i=1, \dots, 1000$, as distributed according to a Dirichlet process mixture:

% % \begin{align*}
% %     G &\sim DP(H, \alpha = 1.0)\\
% %     x_i \mid G &\sim G\\
% %     y_i \mid x_i &\sim f(\cdot \mid x_i).
% % \end{align*}

% % Here, the likelihood $f(y_i \mid x_i)$ models typos. We set $f$ to be

% % \[
% % f(y_i \mid x_i) \propto 
% %  \begin{cases} 
% %       \mathbf{1}[x_i = y_i] & (x_i, y_i) \not\in \mathcal{L} \times \mathcal{L} \\
% %       \frac{\text{NegBin}(\tau(x_i, y_i); \lceil \frac{|s|}{5} \rceil, 0.9)}{(5.09|s|)^{\tau(x_i, y_i)}} & (x_i, y_i) \in \mathcal{L} \times \mathcal{L}
% %   \end{cases},
% % \]

% % where $\tau(x_i, y_i)$ is the Damerau-Levenshtein edit distance between $x_i$ and $y_i$, and $\mathcal{L}$ is the set of all observed strings $\{y \mid \exists i. \, y = y_i\}$. This somewhat strange likelihood can be understood as follows. We assume that the data we have, $\mathcal{L}$, includes at least one example of every `clean' string. Setting $f(y_i \mid x_i) = \mathbf{1}[x_i = y_i]$ for $x_i \not\in \mathcal{L}$ encodes that `if someone had meant to type $x_i \not\in \mathcal{L}$, they would have typed it correctly; therefore, the explanation for $y_i \in \mathcal{L}$ cannot be a string $x_i \not\in \mathcal{L}$.' When $x_i \in \mathcal{L}$, we model a negative-binomially distributed number of typos, where the number of trials depends on the length of the string.


% % We perform inference in a collapsed version of the model, with the $x_i$ marginalized out. This collapsed version can be expressed as a Chinese Restaurant Process model,
% % \begin{align*}
% %     \Pi &\sim CRP(n = 1000, \alpha = 1.0)\\
% %     y_I \mid \Pi &\sim F(y_I),
% % \end{align*}

% % where $\Pi$ is a partition, $I$ ranges over the components of $\Pi$ (each of which is a subset of indices $I \subseteq \{1, \dots, 1000\}$), and $F(y_I) = \sum_{x \in \mathcal{L}} h(x) \prod_{i \in I} f(y_i \mid x)$ is the marginal likelihood of $y_I$ when considered as a sequence of noisy observations of a latent string.

% % Given a partition $\Pi$ of a set of indices $I \subseteq \{1, \dots, 1000\}$, we write $\pi_I(\Pi)$ for the posterior probability of the partition, given $y_I$, and $\tilde{\pi}_I(\Pi) = CRP(\Pi; n=|I|, \alpha=1.0) \cdot \prod_{J \in \Pi} F(y_J)$ for the unnormalized posterior.

% % \textbf{Locally optimal SMC baseline.} As a baseline, we consider a sequential Monte Carlo algorithm that targets a sequence of posteriors, where the $t^{\text{th}}$ posterior incorporates the first $t$ datapoints. The SMC kernel $K$ maps a partition of the first $t-1$ datapoints to a partition of the first $t$ datapoints, by randomly assigning the newest datapoint to an existing component $I$ with probability proportional to $\frac{|I|}{t + \alpha - 1} \cdot F(y_{I} \cup \{y_t\})$, or to a new component with probability proportional to $\frac{\alpha}{t + \alpha - 1} \cdot F(\{y_t\})$. We perform multinomial resampling every iteration, and a complete Gibbs sweep every 100 iterations.

% % \textbf{The Agglomerative SMC algorithm.} 

% % Whereas the baseline algorithm is parameterized by a time step $t$, and considers all data points $1 \leq i \leq t$, the Agglomerative SMC algorithm is instead parameterized by an arbitrary index set $I$ of data points to consider. 

% % \textbf{To generate a collection $\hat{\pi} = \{(\hat{\Pi}_1, w_1), \dots, (\hat{\Pi}_M, w_M)\}$ of $M$ particles weighted for the posterior $\pi_I$, where $I \subseteq \{1, \dots, N\}$ is a subset of datapoint indices:}

% % \begin{enumerate}
% %     \item If $I = \{i\}$ is a singleton set, set $\hat{\Pi}_j$ to the unique partition of $I$ for $j=1, \dots, M$, and set each $w_j = F(\{y_i\})$. Return $\hat{\pi} = \{(\hat{\Pi}_1, w_1), \dots, (\hat{\Pi}_M, w_M)\}$.
    
% %     \item Else, split $I$ into two halves, $I_1$ and $I_2$. (If $|I|$ is odd, divide almost-evenly, so that $|I_2| = |I_1| + 1$.) Recursively run the Agglomerative SMC algorithm to generate two weighted collections $\hat{\pi}_{I_1}$ and $\hat{\pi}_{I_2}$.
    
% %     \item For each $j \in 1, \dots, M$, draw $\Pi_j^1 \sim \hat{\pi}_{I_1}$ and $\Pi_j^2 \sim \hat{\pi}_{I_2}$ according to the particle weights. 
    
% %     \item Set $\hat{\Pi}_j$ by stochastically \textit{merging} the two partitions $\Pi_j^1$ and $\Pi_j^2$ of $I_1$ and $I_2$:
    
% %     \begin{itemize}
% %         \item Set $N = |I_1|$ and for each component $J \subseteq I_2$ in $\Pi_j^2$:
        
% %         \begin{itemize}
% %             \item With probability proportional to $\frac{\alpha}{\alpha + N} \cdot \prod_{l=2}^{|J|} \frac{l - 1}{\alpha + N + l - 1}$, add $J$ as its own component to the partition $\hat{\Pi}_j$.
            
% %             \item Alternatively, letting $L$ range over the partition components in $\Pi_j^1$, with probability proportional to $\prod_{l=1}^{|J|} \frac{|L| + l - 1}{\alpha + N + l - 1}$, add $J \cup L$ as a new component to the partition $\hat{\Pi}_j$.
% %             Remove the selected component $L$ from $\Pi_j^1$ so it is unavailable for matching in future steps.
            
% %             \item Increment $N$ by $|J|$.
            
% %         \end{itemize}
        
% %         \item Add any components that remain in $\Pi_j^1$ to $\hat{\Pi}_j$.
        
% %     \end{itemize}
    
% %     \item Letting $\hat{Z}_1$ be the mean weight of $\hat{\pi}_1$ and likewise for $\hat{Z}_2$, compute $w_j = \hat{Z}_1 \hat{Z}_2 \frac{\tilde{\pi}_{I}(\hat{x}_j)}{\tilde{\pi}_{I_1}(x_j^1) \tilde{\pi}_{I_2}(x_j^2) K((x_j^1, x_j^2), d\hat{x}_j)}$, where the kernel $K$ represents the merging process described in Step 4. (Its density can be computed by multiplying the probabilities of the merge decisions made for each $J$.)
    
% %     \item With probability $\rho$ (a hyperparameter of the algorithm), perform a Gibbs sweep on each $\hat{\Pi}_j$ for $j = 1, \dots, M$.
    
% %     \item Return $\hat{\pi} = \{(\hat{\Pi}_1, w_1), \dots, (\hat{\Pi}_M, w_M)\}$.
    
% % \end{enumerate}







% % If there is only one active index, i.e. $I = \{i\}$, we return the $M$ copies of the trivial clustering, with the uniform weights $w_j = F(\{y_i\})$. Otherwise we divide the index set into halves, $I_1$ and $I_2$. We then run the algorithm recursively to obtain $M$-particle collections $\hat{\pi}_1$ and $\hat{\pi}_2$ representing the respective posteriors. We draw $M$ particles with replacement from each approximation, $x_{j}^1 \sim \hat{\pi}_1$ and $x_{j}^2 \sim \hat{\pi}_2$, for $j \in [1, M]$. Then, for each $j$, we run a proposal kernel $K$ on $(x_j^1, x_j^2)$ to yield $x_j$. The kernel considers each component of the $x_j^2$ partition in turn, deciding whether to merge it with an available component in $x_j^1$, or leave it as its own component. If a merge is decided, the chosen component is no longer available to be merged with other components at this stage of the algorithm. In the resulting particle $x_j$, if $a, b \in I_1$ or $a, b \in I_2$, then $a$ and $b$ have the same relationship in $x_j$ (i.e., either same-cluster or different-cluster) as they did in $x_j^1$ or $x_j^2$. But for $a \in I_1, b \in I_2$, after the merge any relationship is possible.

% % \textbf{Dataset shuffling.} The agglomerative SMC algorithm can be additionally modified so that, to some extent, different particles see different groupings of the data, leading to more particle diversity. In particular, at step (2) of the algorithm, if $M$ is above some threshold $\tau$, we consider two possible index splits, $(I_{11}, I_{12})$ and $(I_{21}, I_{22})$. For each split, we perform the two recursive calls as described in Step (2), but with only $\frac{M}{2}$ particles each. Then, we follow steps 3--7 individually for each split, to yield two $\frac{M}{2}$-particle collections $\hat{\pi}_1$ and $\hat{\pi}_2$. We take their union to yield a final $\hat{\pi}$ that we return.

% % \textbf{Experimental details.} We ran the locally optimal SMC baseline with 32 particles, which took approximately 30 seconds, and ran the Agglomerative SMC algorithm with 128 particles, with $\rho = 0.0$ and $\tau = 2$, which took the same amount of time. In the body of the paper, we ran each experiment independently three times to estimate error bars; we reported the mean and standard deviations of the log weights, and the best data cleaning accuracy achieved by any of the 3 runs (after performing one additional Gibbs sweep on the algorithm results).\footnote{We have since run additional trials (30 for each algorithm), yielding log weights of $-32{,}000 \pm 709$ for Agglomerative SMC and $-40{,}239 \pm 1{,}532$ for the baseline.}





% \section{Further Examples}
% \label{sec:appendix-examples}

% This appendix lists examples of popular Monte Carlo and variational inference algorithms, and explains how they can be viewed as inference strategies. In addition, some of these algorithms can be viewed as \textit{inference strategy combinators}, because they feature user-chosen proposal distributions or variational families that can themselves be instantiated with inference strategies.\footnote{This `combinator' viewpoint evokes earlier work by~\citet{scibior2018denotational} and \citet{stites2021learning}. For example, \citet{stites2021learning} introduce combinators for creating properly weighted samplers compositionally, with parameters that can be optimized using standard or nested variational objectives. Some of their combinators have equivalents in this section, e.g. their \texttt{propose} combinator is similar to the construction we present for Nested Importance Sampling in Section~\ref{sec:nsmc-example}. However: (1) the fundamental compositional operation in RAVI, of combining a posterior approximation with a meta-posterior approximation, cannot be achieved using their combinators; (2) as such, some of the algorithms that RAVI covers cannot be constructed using their combinators; and (3) their combinators produce properly weighted samplers, which contain `less information' than inference strategies: an inference strategy can be used, e.g., as a proposal distribution in Metropolis-Hastings, whereas properly weighted samplers cannot in general be used this way.}


% % \subsection*{Notation}

% % Throughout this appendix, we use a notation that fully \textit{unrolls} an inference strategy into a sequence $p_0, p_1, \dots, p_n$, where $p_0(x_1, x_0)$ is the model, $p_1(x_2, x_1; x_0)$ is $\mathcal{S}.q$, $p_2(x_3, x_2; x_1, x_0)$ is $\mathcal{S}.\mathcal{M}(x_1).q$, and so on. %This is depicted in Figure~\ref{fig:nested} and described Section~\ref{sec:theory}. 
% % We call $n$ the \textit{length} of an inference strategy.

% \subsection{$N$-particle Importance Sampling}
% \label{sec:sir-example}

% \begin{wrapfigure}{L}{0.5\textwidth}
% \vspace{-6mm}
% \begin{algorithm}[H]
%     \label{infstrat:sir}
%     \SetAlgoLined\DontPrintSemicolon
%     \footnotesize{
%     \textbf{RAVI Inference Strategy:} $N$-particle Importance Sampling\;
%     \SetKwFunction{sir}{sir($\tilde{\pi}, q, N$).q}\SetKwFunction{sirm}{sir($\tilde{\pi}, q, N$).M($x$).q}
%     % \SetKwFunction{mcvimm}{rmcvi($M, K$).M($x$).M($x_{0:M})$.q}
%     \SetKwProg{infalg}{Posterior Approx.}{}{}
%     \SetKwInOut{Infers}{Target of inference}
%     \SetKwInOut{Aux}{Auxiliary variables}
%     \infalg{\sir{}}{
%     \Infers{latent variable $x$}
%     \Aux{particles $x_{1:N}$, chosen particle index $j$}
%     \nl \For{$i \in 1, \dots, N$}{
%         \nl $x_i \sim q$\;
%         \nl $w_i \gets \frac{\tilde{\pi}(x_i)}{q(x_i)}$\;
%     }
%     \nl $j \sim \text{Discrete}(w_{1:N})$\;
%     \nl \Return{$x_j$}\;}{}
%     \setcounter{AlgoLine}{0}
%     \SetKwProg{metaalg}{Meta-Posterior Approx.}{}{}
%     \metaalg{\sirm{}}{
%     \Infers{particles $x_{1:N}$, chosen particle index $j$}
%     \Aux{None}
%     \nl $j \sim \text{Uniform}(1, N)$\;
%     \nl $x_j \gets x$\;
%     \nl \For{$i \in 1,\dots,j-1,j+1,\dots,N$}{
%         \nl $x_i \sim q$\;
%     }
%     \nl \Return{$(x_{1:N}, j)$}}
%     % \setcounter{AlgoLine}{0}
%     % \SetKwProg{metaalg}{Meta-Meta-Posterior Approx.}{}{}
%     % \metaalg{\mcvimm{}}{
%     % \Infers{SMC particles $x_{0:M}^{1:K}$, ancestor indices $a_0, a_{1:M}^{1:K}$}
%     % \Aux{None}
%     % %\nl $a_\text{prev} \gets a_0$\;
%     % \nl \For{$i \in 0, \dots, M$}{
%     %     $b_i \sim  \text{Uniform}(1, K)$\;
%     %     %$a_\text{prev} \gets a_i^{a_{\text{prev}}}$\;
%     % }
%     % \nl \For{$k \in 1,\dots, K$}{
%     %     \nl $(x_M^k, w_M^k) \gets (x, q_m(x))$\;
%     % }
%     % \nl \For{$i \in M-1, \dots, 0$}{
%     %     \nl \For{$k \in 1,\dots,K$}{
%     %         \nl \If{$k = b_i$}{
%     %             \nl $(a_{i+1}^k, x_i^k) \gets (b_{i+1}, x_i)$\;
%     %         }
%     %         \nl \Else{
%     %             \nl $a_{i+1}^k \sim \text{Discrete}(w_{i+1}^{1:K})$\;
%     %             \nl $x_i^k \sim R_i(x_{i+1}^{a_{i+1}^k} \rightarrow \cdot)$\;
%     %         }
%     %         \nl $w_i^k \gets \frac{q_i(x_i^k)T(x_i \rightarrow x_{i+1}^{a_{i+1}^k})}{q_{i+1}(x_{i+1}^{a_{i+1}^k})R_i(x_{i+1}^{a_{i+1}^k} \rightarrow x_i^k)}$\;
%     %     }
%     % }
%     % \nl $a_0 \gets b_0$\;
%     % \nl \Return{$(a_0, a_{1:M}^{1:K}, x_{0:M}^{1:K})$}}
%     }
%     \vspace{-10mm}
% \end{algorithm} 
% \end{wrapfigure}

% Consider the $N$-particle importance sampling estimator $$\hat{Z} = \frac{1}{N} \sum_{i=1}^N \frac{\tilde{\pi}(x_i)}{q(x_i)},\text{ for }x_i \sim q.$$ The same estimator can be recovered as a \textit{one-particle} \texttt{IMPORTANCE} estimate, by applying Alg.~\hyperref[alg:alg1]{1} to the~\hyperref[infstrat:sir]{\texttt{sir}} inference strategy.

% The proposal $\mathcal{S}.q$ generates $N$ particles $x_{1:N}$, and selects an index $j$ from a discrete distribution on $1, \dots, N$, with weights proportional to $w_i = \tilde{\pi}(x_i) / q(x_i)$. The meta-proposal is responsible for inferring $j$ and the complete set of particles $x_{1:N}$, given the chosen particle $x$. It uses the conditional SIR algorithm~\citep{andrieu2010particle} to do so, proposing $j$ uniformly in $\{1, \dots, N\}$, and generating values for the un-chosen particles $x_{-j}$ from $q$. 


% \begin{wrapfigure}{L}{0.5\textwidth}
%     \vspace{-6mm}
%     \begin{algorithm}[H]
%         \label{infstrat:ravisir}
%         \SetAlgoLined\DontPrintSemicolon
%         \footnotesize{
%         \textbf{RAVI Inference Strategy:} $N$-particle IS with RAVI strategy $\mathcal{S}$\;
%         \SetKwFunction{ravisir}{ravi-sir($\tilde{\pi}, \mathcal{S}, N$).q}\SetKwFunction{ravisirm}{ravi-sir($\tilde{\pi}, \mathcal{S}, N$).M($x$).q}
%         % \SetKwFunction{mcvimm}{rmcvi($M, K$).M($x$).M($x_{0:M})$.q}
%         \SetKwProg{infalg}{Posterior Approx.}{}{}
%         \SetKwInOut{Infers}{Target of inference}
%         \SetKwInOut{Aux}{Auxiliary variables}
%         \infalg{\ravisir{}}{
%         \Infers{latent variable $x$}
%         \Aux{particles $x_{1:N}$, aux. proposal variables $v_\mathcal{S}^{1:N}$, chosen particle index $j$}
%         \nl \For{$i \in 1, \dots, N$}{
%             \nl $x_i, w_i \sim \texttt{IMPORTANCE}(\tilde{\pi}, \mathcal{S})$ w. aux. vars $v_\mathcal{S}^i$\;
%             %\nl $w_i \gets \frac{\tilde{\pi}(x_i)}{q(x_i)}$\;
%         }
%         \nl $j \sim \text{Discrete}(w_{1:N})$\;
%         \nl \Return{$x_j$}\;}{}
%         \setcounter{AlgoLine}{0}
%         \SetKwProg{metaalg}{Meta-Posterior Approx.}{}{}
%         \metaalg{\ravisirm{}}{
%         \Infers{particles $x_{1:N}$, aux. proposal variables $v_\mathcal{S}^{1:N}$, chosen particle index $j$}
%         \Aux{None}
%         \nl $j \sim \text{Uniform}(1, N)$\;
%         \nl $x_j \gets x$\;
%         \nl $\_ \sim \texttt{HME}(\tilde{\pi}, x_j, \mathcal{S})$ w. aux. vars $v_\mathcal{S}^j$\;
%         \nl \For{$i \in 1,\dots,j-1,j+1,\dots,N$}{
%             \nl $\_, x_i \sim q$ w. aux. vars $v_{\mathcal{S}}^i$\;
%         }
%         \nl \Return{$(v_\mathcal{S}^{1:N}, x_{1:N}, j)$}}
%         }
%         \vspace{-5mm}
%     \end{algorithm} 
% \end{wrapfigure}


% This is a suboptimal choice of $\mathcal{S}.\mathcal{M}(x).q$; lower-variance estimates $\hat{Z}$ can be obtained by improving meta-inference, either by incorporating problem-specific domain knowledge or via learning. However, in many cases, improved meta-inference may not be worth the computation required; it remains to be seen whether techniques such as amortized learning can be applied to deliver accuracy gains at low computational cost.

% \textbf{Instantiating the proposal $q$ as its own inference strategy.} 
% The above assumes that $q$ has a tractable marginal density. When it doesn't, the inner importance sampling loop can use a RAVI inference strategy $\mathcal{S}$ instead of a tractable proposal $q$. This modification is presented in the higher-order inference strategy~\hyperref[infstrat:ravisir]{\texttt{ravi-sir}}. 
% One way to think about this construction is as a way to improve any existing inference strategy $\mathcal{S}$ by `adding replicates.' The resulting estimator of $Z$ is the mean of $N$ independent $\hat{Z}$ estimates from the original inference strategy.


% \subsection{Importance-Weighted Autoencoders}
% \label{sec:iwae-example}

% The importance-weighted auto-encoder arises by considering the same inference strategy as in Section~\ref{sec:sir-example}, but as a variational inference procedure (Alg. 3) rather than a Monte Carlo procedure. 

% Because $\texttt{sir}(\tilde{\pi}, q, N).q$ of this inference strategy corresponds to $N$-particle sampling importance-resampling (SIR), it has been argued that IWAE is in fact `vanilla' variational inference, but with a variational family that uses SIR to more closely approximate the posterior~\cite{bachman2015training}. However, \citet{cremer2017reinterpreting} show that deriving the ELBO for that variational family gives rise to a different objective, and that IWAE gives a looser lower bound on $\log Z$ than this idealized (but generally intractable) objective. 

% In the RAVI framework, these two objectives arise from different inference strategies, which share the same $\mathcal{S}.q$ (SIR in both cases), but use different meta-inference $\mathcal{S}.\mathcal{M}$. IWAE uses the simple conditional SIR meta-inference introduced in Section~\ref{sec:sir-example}, whereas \citet{cremer2017reinterpreting}'s idealized objective can be derived by using the optimal choice of $\mathcal{S}.\mathcal{M}(x).q(j, x_{1:N})$\textemdash the exact posterior of the SIR procedure. The looser bound obtained by IWAE can be seen as a result of its $\mathcal{S}.\mathcal{M}$ performing poorer meta-inference: inference about the auxiliary variables of the SIR inference algorithm used in $\mathcal{S}.q$.

% \subsection{$N$-particle Sequential Monte Carlo}
% \label{sec:smc-example}


% \begin{wrapfigure}{L}{0.6\textwidth}
%     \vspace{-2mm}
%     \begin{algorithm}[H]
%         \label{infstrat:smc}
%         \SetAlgoLined\DontPrintSemicolon
%         \footnotesize{
%         \textbf{RAVI Inference Strategy:} $N$-particle SMC w. RAVI strategies\;
%         \SetKwFunction{ravismc}{smc($\tilde{\pi}_{1:T}, \mathcal{S}, K_{2:T}, L_{2:T}, N$).q}\SetKwFunction{ravismcm}{smc($\tilde{\pi}_{1:T}, \mathcal{S}, K_{2:T}, L_{2:T}, N$).M($x$).q}
%         % \SetKwFunction{mcvimm}{rmcvi($M, K$).M($x$).M($x_{0:M})$.q}
%         \SetKwProg{infalg}{Posterior Approx.}{}{}
%         \SetKwInOut{Infers}{Target of inference}
%         \SetKwInOut{Aux}{Auxiliary variables}
%         \infalg{\ravismc{}}{
%         \Infers{latent variable $x$ targeting $\tilde{\pi}_T$}
%         \Aux{particles $x^{1:T}_{1:N}$, aux. proposal variables $v_\mathcal{S}^{1:N}$, aux. $K$ vars $v_{K_{2:T}}^{1:N}$, aux. $L$ vars $v_{L_{2:T}}^{1:N}$, ancestor variables $a^{1:T-1}_{1:N}$, final chosen particle index $j$}
%         \nl \For{$i \in 1, \dots, N$}{
%             \nl $x^1_i, w^1_i \sim \texttt{IMPORTANCE}(\tilde{\pi}_1, \mathcal{S})$ w. aux. vars $v_\mathcal{S}^i$\;
%         }
%         \nl \For{$t \in 2, \dots, T$}{
%             \nl \For{$i \in 1, \dots, N$}{
%                 \nl $a_i^{t-1} \sim \text{Discrete}(w^{t-1}_{1:N})$\;
%                 \nl $x_i^t, \hat{w} \sim \texttt{IMPORTANCE}(\tilde{\pi}_t, K_t(x^{t-1}_{a^{t-1}_i}))$ w. aux. vars $v_{K_t}^i$\;
%                 \nl $\check{w} \sim \texttt{HME}(\tilde{\pi}_{t-1}, x^{t-1}_{a^{t-1}_i}, L_t(x_i^t))$ w. aux. vars $v_{L_t}^i$\;
%                 \nl $w_i^{t} \gets \hat{w} \cdot \check{w}$\;
%             }
%         }
%         \nl $j \sim \text{Discrete}(w^T_{1:N})$\;
%         \nl \Return{$x^T_j$}\;}{}
%         \setcounter{AlgoLine}{0}
%         \SetKwProg{metaalg}{Meta-Posterior Approx.}{}{}
%         \metaalg{\ravismcm{}}{
%         \Infers{particles $x^{1:T}_{1:N}$, aux. proposal variables $v_\mathcal{S}^{1:N}$, aux. $K$ vars $v_{K_{2:T}}^{1:N}$, aux. $L$ vars $v_{L_{2:T}}^{1:N}$, ancestor variables $a^{1:T-1}_{1:N}$, final chosen particle index $j$}
%         \Aux{None}
%         \nl $j \sim \text{Uniform}(1, N)$\;
%         \nl $x_j^T, b_T \gets x, j$\;
%         \nl \For{$t \in T, \dots, 2$}{
%             \nl $a_{b_t}^{t-1} \sim \text{Uniform}(1, N)$\;
%             \nl $b_{t-1} \gets a_{b_t}^{t-1}$\;
%             \nl $x_{b_{t-1}}^{t-1}, \check{w} \sim \texttt{IMPORTANCE}(\tilde{\pi}_{t-1}, L_t(x_{b_t}^t))$ w. aux. vars $v_{L_t}^{b_t}$\;
%             \nl $\hat{w} \sim \texttt{HME}(\tilde{\pi}_{t}, x_{b_t}^t, K_t(x_{b_{t-1}}^{t-1}))$ w. aux. vars $v_{K_t}^{b_t}$\;
%             \nl $w_{b_t}^t \gets (\hat{w} \cdot \check{w})^{-1}$\;
%         }
%         \nl $w^1_{b_1} \sim \texttt{HME}(\tilde{\pi}_1, x_{b_1}^1, \mathcal{S})$ w. aux. vars $v_\mathcal{S}^{b_1}$\;
%         \nl \For{$i \in 1, \dots, b_1-1, b_1+1, \dots, N$}{
%             \nl $x_{i}^1, w_i^1 \sim \texttt{IMPORTANCE}(\tilde{\pi}_1, \mathcal{S})$ w. aux. vars $v_\mathcal{S}^i$\;
%         }
%         \nl \For{$t \in 2, \dots, T$}{
%             \nl \For{$i \in 1, \dots, b_t-1, b_t+1, \dots, N$}{
%                 \nl $a_i^{t-1} \sim \text{Discrete}(w^{t-1}_{1:N})$\;
%                 \nl $x_i^t, \hat{w} \sim \texttt{IMPORTANCE}(\tilde{\pi}_t, K_t(x^{t-1}_{a^{t-1}_i}))$ w. aux. vars $v_{K_t}^i$\;
%                 \nl $\check{w} \sim \texttt{HME}(\tilde{\pi}_{t-1}, x^{t-1}_{a^{t-1}_i}, L_t(x_i^t))$ w. aux. vars $v_{L_t}^i$\;
%                 \nl $w_i^{t} \gets \hat{w} \cdot \check{w}$\;
%             }
%         }
%         % \nl $\_ \sim \texttt{HME}(\tilde{\pi}, x_j, \mathcal{S})$ w. aux. vars $v_\mathcal{S}^j$\;
%         % \nl \For{$i \in 1,\dots,j-1,j+1,\dots,N$}{
%         %     \nl $\_, x_i \sim q$ w. aux. vars $v_{\mathcal{S}}^i$\;
%         % }
%         \nl \Return{$(x_{1:N}^{1:T}, v_\mathcal{S}^{1:N}, v_{K_{2:T}}^{1:N}, v_{L_{2:T}}^{1:N}, a_{1:N}^{1:T-1},j)$}}
%         }
%         \vspace{-25mm}
%     \end{algorithm} 
% \end{wrapfigure}

% The sequential Monte Carlo family of algorithms~\citep{chopin2020introduction, del2006sequential} evolves a population of \textit{weighted particles} to approximate a sequence of target distributions. SMC can be viewed as standard importance sampling, with an inference strategy in which $\mathcal{S}.q$ is the sampling distribution for SMC, and $\mathcal{S}.\mathcal{M}(x)$ is the conditional SMC algorithm~\citep{andrieu2010particle}.

% Standard SMC is parameterized by:

% \begin{enumerate}
%     \item A sequence $\tilde{\pi}_{1:T}$ of intermediate target distributions, with $\tilde{\pi}_T = \tilde{\pi}$ the ultimate target;
%     \item An initial proposal $q(x_1)$;
%     \item A sequence $K_t(x_{t-1} \rightarrow x_t)$ of proposal kernels for $t=2, \dots, T$; and
%     \item A sequence $L_t(x_{t} \rightarrow x_{t-1})$ of backward kernels for $t=2, \dots, T$.
% \end{enumerate}

% Here, we show a version of SMC (the inference strategy~\hyperref[infstrat:smc]{\texttt{smc}}) that behaves as a `higher-order inference strategy,' or `inference strategy combinator': it allows for an initial proposal, proposal kernels, and backward kernels that do not have tractable marginal densities. Our version is parameterized by:

% \begin{enumerate}
%     \item A sequence $\tilde{\pi}_{1:T}$ of intermediate target distributions, with $\tilde{\pi}_T = \tilde{\pi}$ the ultimate target;
%     \item An initial proposal $\mathcal{S}$ (a RAVI strategy);
%     \item A sequence of inference strategy families $K_t(x_{t-1})$ parameterized by $x_{t-1}$, for $t=2, \dots, T$, targeting $\tilde{\pi}_t$; and
%     \item A sequence of inference strategy families $L_t(x_{t})$ of backward kernels, parameterized by $x_t$, for $t=2, \dots, T$.
% \end{enumerate}

% The posterior approximation $\mathcal{S}.q$ runs a version of SMC that uses $\texttt{HME}$ and $\texttt{IMPORTANCE}$ to compute weights. The meta-posterior approximation $\mathcal{S}.\mathcal{M}(x).q$ runs a similarly modified version of conditional SMC~\citep{andrieu2010particle}. When \texttt{IMPORTANCE} is run on the \texttt{smc} inference strategy, the final weight $\hat{Z}$ is the SMC marginal likelihood estimate, the product of the averages of the weights from each time step.

% It is possible to adapt this strategy to use adaptive resampling and rejuvenation. (Rejuvenation moves do not actually require modification: they can be incorporated by including them as explicit $(K, L)$ pairs, where $L$ is the time-reversal of an MCMC kernel $K$.) However, we are not aware of a way to justify the adaptive choice of rejuvenation kernel.

% % Consider the goal of generating a properly weighted collection of particles for a target distribution $p(x \mid y)$. At the top layer of the inference strategy, we choose a proposal that first generates a weighted collection of particles for an intermediate target $\pi(z)$, perhaps recursively using sequential Monte Carlo to do so. Having generated the weighted collection, $\{(z_1, w_1), \dots, (z_M, w_M)\}$ a \textit{resampling} step is performed. We consider a multi-particle resampling strategy $\Psi(\mathbf{z}, d\mathbf{j})$ that takes as input $K$ particles and outputs $M$ indices $j_1, \dots, j_M$. For example, we might set $\Psi(\mathbf{z}, d\mathbf{j}) = \prod_{i=1}^{M} \frac{w_{j_i}}{\sum_{l=1}^M w_l}$ to perform \textit{multinomial resampling}, selecting each $j_i$ independently according to the weights of the $z$ particles. Each resampled $z_{j_i}$ is then transformed independently into a proposed $x_i$ using a kernel $K(z, dx)$, and then reweighted to target $p$: $w'_i = w_{j_i} \frac{p(x_i \mid y) L(z_{j_i}; x_i)}{\pi(z_{j_i}) K(x_i; z_{j_i})}$. As described in \citet{del2006sequential}, $L$ is a user-chosen meta-inference kernel that attempts to reverse $K$, recovering $z$ from $x$.

% % Meta-inference requires three ingredients. Given an $x$, we can use $L$ to propose a $z$ that was used to generate it. We then need to propose a possible weighted collection of $z$ values that includes that $z$ at some index. This can be done in two steps: first choosing an index where the distinguished $z$ will end up (using an index distribution $\hat{\psi}$), and then using a distribution $\hat{G}$ to perform meta-inference about the sampling process that generated the collection of $z$ particles. When that generating process is itself an SMC algorithm, $\hat{G}$ will involve running another $L$ kernel on the distinguished $z$ particle, sampling another ancestor index using $\hat{\psi}$, and running another $\hat{G}$ to generate the previous step's weighted collection. Repeating this process yields the conditional SMC algorithm of~\citet{andrieu2010particle}; variants can be designed to accommodate different resampling strategies $\Psi$ or more general SMC-like algorithms, as in~\citet{lindsten2017divide} or our Agglomerative Monte Carlo algorithm. 


% % The first approximation $p_1$ generates a proposed $x_1$ by running Sequential Monte Carlo, and choosing a particle $x_1 = x_1^{(c)}$ to return, where $p_1(c = i) \propto w_i$. This procedure introduces many auxiliary variables, which we view as constituting $x_2$: the proposals at each step of SMC, any resampling choices made between SMC steps, and the final choice $c$ of a particle to return. 

% % The next approximation $p_2$ is meant to approximate the posterior $p_1(x_2 \mid x_1; x_0)$ over these latent auxiliary variables, given the observed chosen particle $x_1$. We choose the \textit{conditional SMC} algorithm as this distribution, which begins by sampling a history for the chosen particle $x_1$, then simulates SMC forward, but fixing this one particle's trajectory to ensure it appears in the final set of particles. 

% % In the ratio $\frac{p_0(x_1, x_0) \cdot p_2(x_2; x_1, x_0)}{p_1(x_2, x_1; x_0)}$, many terms cancel, leaving only $\frac{1}{N} \sum_{i=1}^{N} w_i$. 


% \subsection{Variational Sequential Monte Carlo}
% \label{sec:vsmc-example}

% The Variational Sequential Monte Carlo~\citep{naesseth2018variational} objective corresponds to Alg. 3, with the same RAVI inference strategy as in Appendix~\ref{sec:smc-example}. However, the default gradient estimator from Alg. 3 will have high variance. \citet{naesseth2018variational} recommend using a biased estimator of the gradient, that uses reparameterization where possible and discards the score function terms arising from resampling steps.

% \subsection{Annealed Importance Sampling}
% \label{sec:ais-example}

% \begin{wrapfigure}{L}{0.5\textwidth}
%     \vspace{-6mm}
%     \begin{algorithm}[H]
%         \label{infstrat:ais}
%         \SetAlgoLined\DontPrintSemicolon
%         \footnotesize{
%         \textbf{RAVI Inference Strategy:} Annealed Importance Sampling\;
%         \SetKwFunction{ais}{ais($\tilde{\pi}_{1:T}, \mathcal{S}, K_{2:T}$).q}\SetKwFunction{aism}{ais($\tilde{\pi}_{1:T}, \mathcal{S}, K_{2:T}$).M($x$).q}
%         % \SetKwFunction{mcvimm}{rmcvi($M, K$).M($x$).M($x_{0:M})$.q}
%         \SetKwProg{infalg}{Posterior Approx.}{}{}
%         \SetKwInOut{Infers}{Target of inference}
%         \SetKwInOut{Aux}{Auxiliary variables}
%         \infalg{\ais{}}{
%         \Infers{latent variable $x$ targeting $\tilde{\pi}_T$}
%         \Aux{$x^{1:T}$, aux. vars $v_\mathcal{S}$ of initial proposal}
%         \nl $x_1, \_ \sim \texttt{IMPORTANCE}(\tilde{\pi}_1, \mathcal{S})$ w. aux. vars $v_\mathcal{S}$\;
%         \nl \For{$t \in 2, \dots, T$}{
%             \nl $x_t \sim K_t(x_{t-1} \rightarrow \cdot)$\;
%         }
%         \nl \Return{$x_T$}\;}{}
%         \setcounter{AlgoLine}{0}
%         \SetKwProg{metaalg}{Meta-Posterior Approx.}{}{}
%         \metaalg{\aism{}}{
%         \Infers{$x^{1:T}$, aux. vars $v_\mathcal{S}$ of initial proposal}
%         \Aux{None}
%         \nl $x_T \gets x$\;
%         \nl \For{$t \in T, \dots, 2$}{
%             \nl $x_{t-1} \sim \tilde{K}_t(x_t \rightarrow \cdot)$\tcp*{$\tilde{K}_t$ is time reversal of $K_t$}
%         }
%         \nl $\_ \sim \texttt{HME}(\tilde{\pi}_1, x_1, \mathcal{S})$ w. aux. vars $v_\mathcal{S}$\;
%         % \nl $\_ \sim \texttt{HME}(\tilde{\pi}, x_j, \mathcal{S})$ w. aux. vars $v_\mathcal{S}^j$\;
%         % \nl \For{$i \in 1,\dots,j-1,j+1,\dots,N$}{
%         %     \nl $\_, x_i \sim q$ w. aux. vars $v_{\mathcal{S}}^i$\;
%         % }
%         \nl \Return{$(x_{1:T}, v_\mathcal{S})$}}
%         }
%         \vspace{-7mm}
%     \end{algorithm} 
% \end{wrapfigure}

% In annealed importance sampling, the practitioner chooses a sequence of unnormalized target distributions $\tilde{\pi}_{1:T}$, where $\pi_T$ is the posterior distribution of interest. Typically $\pi_1$ is chosen to be a distribution that is easy to approximate with a proposal $q$, and each $\pi_i$ is slightly closer to the true target $\pi_T$ than the last. The user also chooses a sequence of kernels $K_t(x_{t-1} \rightarrow x_t)$, where $K_t$ is stationary for $\pi_{t-1}$. The algorithm begins by sampling an initial point $x_1 \sim q$, transforming it through the sequence of kernels to obtain $x_2, \dots, x_T$, and returning $x_T$ as the inferred value of $x$. The associated weight is $$\hat{Z} = \frac{\tilde{\pi}_1(x_1) \cdot \dots \cdot \tilde{\pi}_T(x_T)}{q(x_1) \cdot \tilde{\pi}_1(x_2) \cdot \dots \cdot \tilde{\pi}_{T-1}(x_T)}.$$

% This procedure corresponds to running Alg.~\hyperref[alg:alg1]{1} on the~\hyperref[infstrat:ais]{\texttt{ais}} inference strategy. The inference process runs the kernels $K_t$ forward, whereas the meta-inference process runs their time reversals backward: $\tilde{K}_t(x_t \rightarrow x_{t-1}) \propto \pi_{t-1}(x_{t-1}) \cdot K_t(x_{t-1} \rightarrow x_t)$.

% Note that if $K$ is a stationary kernel for $\pi_i$, so is $K^m$ for any natural number $m$. With sufficient computation (increasing $m$), we can ensure that the AIS top-level proposal $\texttt{ais}(\dots).q$ is arbitrarily close to the target posterior $\pi_T$. However, doing so will not necessarily lead to lower-variance weights: RAVI makes clear that it is also necessary to consider the quality of meta-inference. 

% Consider the job of $\tilde{K}_T$, which in the context of the meta-posterior approximation $\texttt{ais}.\mathcal{M}(x)$ is supposed to infer $x_{T-1}$ from $x_{T}$. $\tilde{K}_T$  is the exact meta-posterior of $x_{T-1}$ given $x_T$ \textit{assuming that, in the forward direction, $x_{T-1}$ was distributed according to $\pi_{T-1}$}. However, in the forward direction, if each $K_t$ is run sufficiently many times to ensure mixing at each step, $x_{T-1}$ will in fact be distributed according to $\pi_{T-2}$. This gap\textemdash between the optimal meta-inference kernels and the actual $\tilde{K}$ kernels\textemdash is partly responsible for the variance of the AIS estimator, and can be mitigated by using a finer annealing schedule that brings successive target distributions closer together.  It could also be mitigated by learning a better reverse annealing chain.

% \subsection{Nested Sequential Monte Carlo}
% \label{sec:nsmc-example}

% We first consider Nested Importance Sampling. As in RAVI, Nested Importance Sampling is concerned with importance sampling when the proposal distribution $q$  cannot be tractably evaluated. But RAVI and NIS take different approaches:

% \begin{enumerate}
%     \item RAVI assumes $q$ can be simulated, but that the (normalized) density cannot be evaluated. RAVI generates proposals exactly distributed according to the user's desired proposal $\mathcal{S}.q$, and generates approximations to the ideal importance weights.
    
%     \item NIS does not assume $q$ can be simulated, but does assume that its unnormalized density $\tilde{q}$ is available. As such, proposals are not simulated from $q$, but rather from a Sampling/Importance-Resampling (SIR) approximation to $q$.
% \end{enumerate}

% The NIS procedure with an intractable proposal $q$ corresponds exactly to a special case of the RAVI algorithm, with the RAVI proposal $\mathcal{S}.q$ set \textit{not} to $q$ but rather to an SIR sampling distribution targeting $q$ using some tractable proposal $h$. Compare:

% \begin{itemize}
%     \item Ordinary SIR targeting $\tilde{\pi}$ with proposal $h$: recovered by running $\texttt{IMPORTANCE}(\tilde{\pi}, \texttt{sir}(\tilde{\pi}, h, N))$ (see Section~\ref{sec:sir-example} for \texttt{sir} inference strategy).
%     \item Nested IS targeting $\tilde{\pi}$ with unnormalized proposal density $\tilde{q}$, approximated using SIR with $h$ as a proposal: recovered by running $\texttt{IMPORTANCE}(\tilde{\pi}, \texttt{sir}(\tilde{q}, h, N))$.
% \end{itemize}

% That is, under the RAVI perspective, the only difference between ordinary SIR using $h$ and nested IS is that the ideal proposal density $\tilde{q}$ (rather than the target density $\tilde{\pi}$) is used to make the resampling decision about the particles generated by $h$ (the index $j$ in the listing for \texttt{sir}).

% % The remainder of the inference strategy is then the same as in Section~\ref{sec:sir-example}. The only difference is that in NIS's $p_1$, the particle index $c$ is chosen according to weights $w_i = q(x_1^{(i)})/h(x_1^{(i)})$ that correct a tractable distribution $h$ toward an intractable but desired proposal $q$, whereas in the version from Section~\ref{sec:sir-example}, the weights $w_i = p_0(x_1^{(i)}, x_0)/h(x_1^{(i)})$ correct the tractable proposal directly toward the target distribution, and there is no intractable proposal $q$.

% More generally, \citet{naesseth2015nested} consider procedures other than SIR for approximating ${q}$, arguing that any properly weighted sampler for the intractable proposal $q$ will do. If we let $\mathcal{H}$ be a RAVI inference strategy representing the properly weighted sampler for the intractable proposal $q$ (with unnormalized density $\tilde{q}$), then the Nested IS procedure that uses this properly weighted proposal to perform inference in $\tilde{\pi}$ is $\texttt{IMPORTANCE}(\tilde{\pi}, \texttt{ravi-sir}(\tilde{q}, \mathcal{H}, 1))$ (see~\hyperref[infstrat:ravisir]{\texttt{ravi-sir}} in Section~\ref{sec:sir-example}).

% %When Nested Importance Sampling is applied in a nested way (the SIR routine itself using nested importance sampling to generate particles), $p_1$ becomes an SMC distribution: particles are generated from some tractable base distribution, then repeated resampling and reweighting steps are performed (according to a user-designed sequence of intermediate target densities) before finally choosing a single particle to return. In this case $p_2$ corresponds to a CSMC algorithm, which first generates ancestor indices for the chosen particle, then generates the other particles and their trajectories.

% Nested SMC is similar, performing Nested IS at each iteration of SMC. To recover this algorithm using RAVI, we use the~\hyperref[infstrat:smc]{\texttt{smc}} inference strategy, but for the proposals $K_t(x_{t-1})$ (which, as described in Section~\ref{sec:smc-example}, can be instantiated with inference strategies), we use~\hyperref[infstrat:ravisir]{\texttt{ravi-sir}} targeting the desired but intractable proposal.

% % % A different approach:
% % Nested IS is presented in terms of an object-oriented interface: it is assumed that the proposal distribution $Q$ can generate properly weighted samples. We represent $Q$ as its own tower, $q_0, \dots, q_n$, and express the NIS estimator in terms of that tower.

% % \textbf{Layer 0.} The model, $p_0(x_1, x_0)$.

% % \textbf{Layer 1.} $$p_1(x_2, x_1; x_0) = q_1(x_2^{(y_2)}, x_2^{(y_1)}) \cdot \delta_{x_2^{(y_1)}}(x_1)$$

% % % Overall purpose of nesting in NSMC: design an NSMC sampler that targets an 'easier' but still complex distribution, then use that distribution as the proposal for a top-level NSMC algorithm.

% % \citet{naesseth2015nested} write that Nested IS ``is different from a random weight IS, since it approximates the proposal distribution (and not just the importance weights).'' 
% % %We take the view that there is no such thing as approximating the proposal in a properly weighted sampler; rather, we view whatever procedure is actually used to simulate $X^i$ \textit{as} the proposal. In the case of NIS, this proposal may itself involve sampling/importance-resampling. 
% % But Nested IS algorithms \textit{are} random-weight IS algorithms, if we view them not as approximating some proposal, but rather as simply using a different proposal: the actual procedure they use to generate $X^i$. In the typical case, the proposal used by a Nested IS algorithm is $SIR(q, h, M)$, and the weight is simply the 

% \subsection{SMC$^2$}
% \label{sec:smcsq-example}

% \begin{wrapfigure}{L}{0.6\textwidth}
%     \vspace{-6mm}
%     \begin{algorithm}[H]
%         \label{infstrat:smcsq}
%         \SetAlgoLined\DontPrintSemicolon
%         \footnotesize{
%         \textbf{RAVI Inference Strategy:} SMC$^2$\;
%         \SetKwFunction{smcsq}{smc$^2$($p, q_1, q, M, N$).q}\SetKwFunction{smcsqm}{smc$^2$($p, q_1, q, M, N$).M($\theta, x_{1:T}$).q}
%         % \SetKwFunction{mcvimm}{rmcvi($M, K$).M($x$).M($x_{0:M})$.q}
%         \SetKwProg{infalg}{Posterior Approx.}{}{}
%         \SetKwInOut{Infers}{Target of inference}
%         \SetKwInOut{Aux}{Auxiliary variables}
%         \infalg{\smcsq{}}{
%         \Infers{parameters $\theta$, sequence $x_{1:T}$}
%         \Aux{inner SMC vars $v_\texttt{smc}^T$ of chosen SMC$^2$ particle, other SMC$^2$ vars $v$}
%         \nl \tcp{the targets $\tilde{\pi}_t$ depend on $M$, $p$, $q_1$, and $q$}
%         \nl $(\theta, x_{1:T}, v_\texttt{smc}^T), \_ \sim \texttt{IMPORTANCE}(\tilde{\pi}_T, \texttt{smc}(\tilde{\pi}_{1:T}, K_{2:T}^2, L_{2:T}^2, N))$ w. aux. vars $v$\;
%         \nl \Return{$\theta, x_{1:T}$}\;}{}
%         \setcounter{AlgoLine}{0}
%         \SetKwProg{metaalg}{Meta-Posterior Approx.}{}{}
%         \metaalg{\smcsqm{}}{
%         \Infers{inner SMC vars $v_\texttt{smc}^T$ of chosen SMC$^2$ particle, other SMC$^2$ vars $v$}
%         \Aux{None}
%         \nl $\_ \sim \texttt{HME}(p_T^\theta, x_{1:T}, \texttt{smc}(p^\theta_{1:T}, q_1, K_{2:T}, L_{2:T}, M))$ w. aux. vars $v_\texttt{smc}^T$\;
%         \nl $\_ \sim \texttt{HME}(\tilde{\pi}_T, (\theta, x_{1:T}, v_\texttt{smc}^T), \texttt{smc}(\tilde{\pi}_{1:T}, K_{2:T}^2, L_{2:T}^2, N))$ w. aux. vars $v$\;
%         % \nl $\_ \sim \texttt{HME}(\tilde{\pi}, x_j, \mathcal{S})$ w. aux. vars $v_\mathcal{S}^j$\;
%         % \nl \For{$i \in 1,\dots,j-1,j+1,\dots,N$}{
%         %     \nl $\_, x_i \sim q$ w. aux. vars $v_{\mathcal{S}}^i$\;
%         % }
%         \nl \Return{$(v_\texttt{smc}^T, v)$}}
%         }
%         \vspace{-7mm}
%     \end{algorithm} 
% \end{wrapfigure}

% Suppose we are working with a state-space model $p(\theta) \prod_{i=1}^T p(x_i \mid x_{1:i-1}, \theta) p(y_i \mid x_i, \theta)$. For a fixed $\theta$, an SMC algorithm could be used to target the successive posteriors $p_t^\theta(x_{1:t}) = p(x_{1:t} \mid y_{1:t}, \theta)$, with proposal kernels $K_t(x^{t-1}_{1:t-1} \rightarrow x^t_{1:t}) = \delta_{x^{t-1}_{1:t-1}}(x^t_{1:t-1})q(x^t_t; x^t_{1:t-1}, y_{1:t}, \theta)$ (for some choice of $q$) and deterministic backward kernels $L_t(x^t_{1:t} \rightarrow x^{t-1}_{1:t-1}) = \delta_{x^t_{1:t-1}}(x^{t-1}_{1:t-1})$. The RAVI strategy implementing that SMC algorithm is $\texttt{smc}(p^\theta_{1:T}, q_1, K_{2:T}, L_{2:T}, N)$, where $q_1(x_1; \theta)$ is a proposal for an initial $x_1$ and $N$ is the number of particles. 

% If we also wish to infer $\theta$, we can instead use the SMC$^2$ algorithm~\citep{chopin2013smc2}. We define extended targets
% $$\pi_t(\theta, x_{1:t}, v_\texttt{smc}^t) = p(\theta \mid y_{1:t}) p(x_{1:t} \mid y_{1:t}, \theta) p_{\texttt{HME}}^{\texttt{smc}(p_{1:t}^\theta, q_1, K_{2:t}, L_{2:t}, N)}(v_\texttt{smc}^t; x_{1:t}),$$
% which are defined over not only $\theta$ and $x_{1:t}$ but also all the auxiliary variables $v_\texttt{smc}^t$ used during steps 1 through $t$ of SMC. The variables $v_\texttt{smc}$ and the $p_\texttt{HME}$ distribution over them are as defined in Appendix~\ref{sec:ravi-mcmc}. We write $\tilde{\pi}_t$ for the unnormalized versions of these targets, with normalizing constant $p(y_{1:t})$.

% The SMC$^2$ algorithm targets this sequence of extended posteriors. We write $K_t^2$ for the forward kernels used by this outer SMC algorithm. The kernel $K_t^2$ extends the SMC state variables $v_\texttt{smc}^{t-1}$ to new state variables $v_\texttt{smc}^{t}$ by running the particle filter forward one step, resampling the chosen trajectory index $j$ based on the new weights for time step $t$, and updating $x_{1:t}$ to match the $j^\text{th}$ trajectory. The corresponding backward kernel $L_t^2$ deletes the $t^\text{th}$ step of the particle deterministically, then reproposes $j$ based on the step $t-1$ weights, setting $x_{1:t-1}$ to match the $j^\text{th}$ trajectory.

% The SMC$^2$ algorithm corresponds to the RAVI strategy~\hyperref[infstrat:smcsq]{\texttt{smc$^2$}}. Running the outer SMC yields an approximate sample from $\tilde{\pi}_T$, which includes auxiliary variables $v_\texttt{smc}^T$. Meta-inference runs two rounds of conditional SMC: first, to recover the inner layer of SMC's variables $v_\texttt{smc}^T$ for the chosen outer-layer particle, and second, to recover the outer layer of SMC's auxiliary variables $v$.
% As discussed by~\citet{chopin2013smc2}, particle MCMC rejuvenation moves can also be included; to justify using RAVI, we would insert these kernels as additional proposals within the sequence $K_{2:T}^2$. 
% % Set pi_t to include a posterior sample plus 
% % HME on the smc strategy.
% % Then the SMC^2 strategy:
% %   - calls IMPORTANCE on an SMC strategy with those pi_t's, to generate a theta and weighted trajectory. The K distributions are tractable (they extend the particle collection), and we can include in an algorithm box. the L distributions 
% %   -

% \subsection{Amortized Rejection Sampling}
% \label{sec:amrej-example}

% Consider a generative model $p(K, x_{1:K+1}, y)$ where the latent variables $x_{1:K+1}$ to be marginalized or inferred represent the trace of a rejection sampling loop, with sampling distribution $h(x)$ and predicate $\mathcal{A}(x)$ determining acceptance:
% $$p(K, x_{1:K+1}, y) = \prod_{i=1}^{K} \left[h(x_i)(1-\mathcal{A}(x_i))\right] h(x_{K+1})\mathcal{A}(x_{K+1}) p(y \mid x_{K+1})$$

% \begin{wrapfigure}{L}{0.7\textwidth}
%     \vspace{-6mm}
%     \begin{algorithm}[H]
%         \label{infstrat:amrej}
%         \SetAlgoLined\DontPrintSemicolon
%         \footnotesize{
%         \textbf{RAVI Inference Strategy:} Amortized Rejection Sampling\;
%         \SetKwFunction{amrej}{amrej($h, q, \mathcal{A}, N, M$).q}\SetKwFunction{amrejm}{amrej($h, q, \mathcal{A}, N, M$).M($K, x_{1:K}$).q}
%         \SetKwFunction{amrejmm}{amrej($h, q, \mathcal{A}, N, M$).M($K, x_{1:K}$).M($K', x'_{1:K'}, (K''_i, x''^i_{1:K''_i})_{i=1:M}, j$).q}
%         \SetKwFunction{amrejmmm}{amrej($h, q, \mathcal{A}, N, M$).M($K, x_{1:K}$).M($K', x'_{1:K'}, (K''_i, x''^i_{1:K''_i})_{i=1:M}, j$).M($z_{K'+1}$).q}
%         \SetKwProg{infalg}{Posterior Approx.}{}{}
%         \SetKwInOut{Infers}{Target of inference}
%         \SetKwInOut{Aux}{Auxiliary variables}
%         \infalg{\amrej{}}{
%         \Infers{number $K$ of rejected samples, rejected samples $x_{1:K}$, accepted sample $x_{K+1}$}
%         \Aux{rejection loops $(K', x'_{1:K'})$ and $(K''_i, x''^i_{1:K''_i})_{i=1:M}$, index $j$}
%         \nl $K' \gets 0$\;
%         \nl $x'_{1} \sim q$\;
%         \nl \While{$\mathcal{A}(x'_{K'+1}) \neq 1$}{
%             \nl $K' \gets K' + 1$\;
%             \nl $x'_{K'+1} \sim q$
%         }
%         \nl $x_{K+1} \gets x'_{K'+1}$\;
%         \nl \For{$i \in 1, \dots, M$}{
%             \nl $K''_i \gets 0$\;
%             \nl $x''^i_{1} \sim h$\;
%             \nl \While{$\mathcal{A}(x''^i_{K''_i+1}) \neq 1$}{
%                 \nl $K''_i \gets K''_i + 1$\;
%                 \nl $x''^i_{K''_i+1} \sim h$\;
%             }
%         }
%         \nl $j \sim \text{Discrete}(K''_{1:M})$\;
%         \nl $K \sim \text{Uniform}(0, K''_j)$\;
%         \nl $x_{1:K} \gets x''^j_{1:K}$\;
%         \nl \Return{$(K, x_{1:K}, x_{K+1})$}\;}{}
%         \setcounter{AlgoLine}{0}
%         \SetKwProg{metaalg}{Meta-Posterior Approx.}{}{}
%         \metaalg{\amrejm{}}{
%         \Infers{rejection loops $(K', x'_{1:K'})$ and $(K''_i, x''^i_{1:K''_i})_{i=1:M}$, index $j$}
%         \Aux{superfluous accepted sample $z_{K'+1}$}
%         \nl $j \sim \text{Uniform}(1, M)$\;
%         \nl \For{$i \in 1, \dots, j-1, j+1, \dots, M$}{
%             \nl $K''_i \gets 0$\;
%             \nl $x''^i_{1} \sim h$\;
%             \nl \While{$\mathcal{A}(x''^i_{K''_i+1}) \neq 1$}{
%                 \nl $K''_i \gets K''_i + 1$\;
%                 \nl $x''^i_{K''_i+1} \sim h$\;
%             }
%         }
%         \nl $K''_j \gets K$\;
%         \nl $x''^j_{K+1} \sim h$\;
%         \nl \While{$\mathcal{A}(x''^j_{K''_j+1}) \neq 1$}{
%             \nl $K''_j \gets K''_j + 1$\;
%             \nl $x''^j_{K''_j+1} \sim h$\;
%         }
%         \nl $K' \gets 0$\;
%         \nl $z_{1} \sim q$\;
%         \nl \While{$\mathcal{A}(z_{K'+1}) \neq 1$}{
%             \nl $x'_{K'+1} \gets z_{K'+1}$\;
%             \nl $K' \gets K' + 1$\;
%             \nl $z_{K'+1} \sim q$
%         }
%         \nl \Return{$(K', x'_{1:K'}, (K''_i, x''^i_{1:K''_i})_{i=1:M}, j)$}}
%         \setcounter{AlgoLine}{0}
%         \SetKwProg{metaalg}{Meta-Meta-Posterior Approx.}{}{}
%         \metaalg{\amrejmm{}}{
%         \Infers{superfluous accepted sample $z_{K'+1}$}
%         \Aux{index $l$, unchosen particles $z_{-l}$}
%         \nl \For{$i \in 1, \dots, N$}{
%             \nl $z_i \sim q$\;
%         }
%         \nl $l \sim \text{Uniform}(\{i \mid \mathcal{A}(z_i)\})$\;
%         \nl \Return{$z_l$}}
%         \setcounter{AlgoLine}{0}
%         \SetKwProg{metaalg}{Meta-Meta-Meta-Posterior Approx.}{}{}
%         \metaalg{\amrejmmm{}}{
%         \Infers{index $l$, unchosen particles $z_{-l}$}
%         \Aux{None}
%         \nl $l \sim \text{Uniform}(1, N)$\;
%         \nl \For{$i \in 1, \dots, l-1,l+1,\dots, N$}{
%             \nl $z_i \sim q$\;
%         }
%         \nl \Return{$(z_1, \dots, z_{l-1}, z_{l+1}, \dots, z_N)$}}
%         }
%         \vspace{-15mm}
%     \end{algorithm} 
% \end{wrapfigure}

% Here, the $x_i$ are drawn independently from a distribution $h$, until some predicate $\mathcal{A}$ holds of the most recent particle, at which point the loop stops. The observation $y$ depends on the final sample $x_{K+1}$, but not the earlier, rejected samples $x_{1:K}$ or the number of rejected samples $K$.
% \citet{naderiparizi2019amortized} proposed a technique called \textit{Amortized Rejection Sampling} for performing inference in this model. The technique corresponds to the rather involved RAVI strategy \texttt{amrej}, which has parameters $N$ and $M$ that can be used to trade accuracy for computational cost. 

% The idea behind the top-level, intractable posterior approximation $\texttt{amrej}(h, q, \mathcal{A}, N, M).q$ is to:

% \begin{itemize}
%     \item use the observation $y$ to intelligently guess the \textit{accepted} particle $x_{K+1}$, using a learned proposal $q$. (For example, $q$ may be parameterized by a neural network that accepts $y$ as input.) To satisfy the constraint that $x_{K+1}$ satisfies $\mathcal{A}$, however, it is necessary to run $q$ within a rejection sampling loop, generating auxiliary variables $x'_{1:K'}$, where $K'$ is the number of rejected $q$-samples. (We could try directly using $x'_{1:K'}$ as our proposal for $x_{1:K}$, the rejected samples from the model. But $q$'s goal is to propose $x_{K+1}$ in a \textit{data-driven} way, influenced by the observation $y$, and the rejected samples $x_{1:K}$ from the model have no connection to the data\textemdash so, using samples from $q$ as proposals for the rejected model samples would result in a poor approximation.)

%     \item use rejection sampling from the prior $h$ to infer the \textit{rejected} samples $x_{1:K}$. We run $M$ independent rejection sampling loops, randomly choose one with probability proportional to its length, and then randomly choose a \textit{prefix} of the chosen loop as our proposal for $x_{1:K}$. 
% \end{itemize}

% The meta-posterior approximation must solve two new challenges: recovering the rejected $q$ samples $x'_{1:K'}$ from the posterior approximation, and recovering the many unused rejection loops (and the suffix of the chosen rejection loop) from the second step of the posterior approximation (the $x''$ variables). The latter of these tasks is simple enough: we can generate $M-1$ rejection loops from scratch for the un-chosen loops, and a further rejection loop from scratch to use as the suffix of the chosen loop. The first task is more complex: we run a new rejection loop using $q$ as a proposal, and discard the final accepted sample. Meta-meta-inference must infer this discarded accepted sample, for which it uses SIR with $N$ particles. The final layer, the Meta-Meta-Meta-Posterior Approximation, uses conditional SIR.

% The meta-meta-posterior is not absolutely continuous with respect to its approximation (it is possible that the approximation generates $N$ $z$-values that all fail to satisfy the predicate, in which case $z_l$ is not in the support of the meta-meta-posterior). As such, this is an example of a \textit{wide} inference strategy (Appendix~\ref{sec:even-odd}).

% \subsection{Hamiltonian Variational Inference}
% \label{sec:ham-example}


% \begin{wrapfigure}{L}{0.5\textwidth}
%     \vspace{-6mm}
%     \begin{algorithm}[H]
%         \label{infstrat:ham}
%         \SetAlgoLined\DontPrintSemicolon
%         \footnotesize{
%         \textbf{RAVI Inference Strategy:} Hamiltonian Variational Inference\;
%         \SetKwFunction{ham}{hamvi($q_0, q_v, r_v, \text{LF}$).q}\SetKwFunction{hamm}{hamvi($q_0, q_v, r_v, \text{LF}$).M($x$).q}
%         \SetKwFunction{hammm}{hamvi($q_0, q_v, r_v, \text{LF}$).M($x$).M($x_0, v$).q}
%         \SetKwProg{infalg}{Posterior Approx.}{}{}
%         \SetKwInOut{Infers}{Target of inference}
%         \SetKwInOut{Aux}{Auxiliary variables}
%         \infalg{\ham{}}{
%         \Infers{latent variable $x$}
%         \Aux{initial position $x_0$, momentum $v$}
%         \nl $x_0 \sim q_0$\;
%         \nl $v \sim q_v$\;
%         \nl $(x, v') \gets \text{LF}(x_0, v)$\;
%         \nl \Return{$x$}\;}{}
%         \setcounter{AlgoLine}{0}
%         \SetKwProg{metaalg}{Meta-Posterior Approx.}{}{}
%         \metaalg{\hamm{}}{
%         \Infers{initial position $x_0$, momentum $v$}
%         \Aux{negated final momentum $v'_{-}$}
%         \nl $v'_{-} \sim r_v(\cdot; x)$\;
%         \nl $(x_0, v_{-}) \gets \text{LF}(x, v'_{-})$\;
%         \nl \Return{$(x_0, -v_{-})$}}
%         \setcounter{AlgoLine}{0}
%         \SetKwProg{metaalg}{Meta-Meta-Posterior Approx.}{}{}
%         \metaalg{\hammm{}}{
%         \Infers{negated final momentum $v'_{-}$}
%         \Aux{None}
%         \nl $(\_, v') \gets \text{LF}(x_0, v)$\;
%         \nl \Return{$-v'$}}
%         }
%         \vspace{-12mm}
%     \end{algorithm} 
% \end{wrapfigure}

% Hamiltonian Variational Inference~\citep{salimans2015markov} is a hybrid of Hamiltonian Monte Carlo and variational inference. It is a special case of Markov Chain Variational Inference (see Section~\ref{sec:overview} and Section~\ref{sec:examples} for detailed discussion, and~\hyperref[infstrat:mcvi]{\texttt{mcvi}} for the RAVI implementation). The algorithm specializes the Markov Chain Variational Inference procedure for use with a Hamiltonian Monte Carlo kernel. 

% We present the specialized strategy as~\hyperref[infstrat:ham]{\texttt{hamvi}}. It accepts as input:

% \begin{enumerate}
%     \item a distribution $q_0$ from which to propose an initial point;
%     \item a momentum distribution $q_v$ from which momenta $v$ are proposed at each iteration;
%     \item a proposal distribution $r_v(\cdot; x)$ over momenta; and
%     \item a leapfrog integrator $\texttt{LF}$ that runs Hamiltonian dynamics on an initial position and momentum (we think of both the number of leapfrog steps $L$ and the Hamiltonian $H$ being targeted as part of the \texttt{LF} object provided to \texttt{hamvi}).
% \end{enumerate}

% Given these inputs, the top-level posterior approximation runs an iteration of HMC from a randomly initialized location $x_0$. The meta-posterior approximation randomly proposes a (negated) \textit{final} momentum from the proposal $r_v$, and runs the leapfrog integrator to find a plausible initial location $x_0$. Finally, the (deterministic) meta-meta-posterior finds the initial momentum that could have taken $x_0$ to $x$.

% \subsection{Antithetic Sampling}
% \label{sec:antithetic-example}

% Consider a target $\tilde{\pi}(x)$ and a proposal $q(x)$ that approximates $\pi$. Suppose $q$ is invariant under some bijective transformation $T$: $$\forall x, q(x) = q(T(x)).$$ For example, a univariate Gaussian proposal with mean $\mu$ is invariant under $T(x) = 2\mu - x$. Antithetic sampling generates a sample $x$ from $q$, but instead of using the estimator $\hat{Z} = \tilde{\pi}(x)/q(x)$, it uses $$\hat{Z} = \frac{\tilde{\pi}(x) + \tilde{\pi}(T(x))}{2q(x)}.$$ 


% \begin{wrapfigure}{L}{0.6\textwidth}
%     \vspace{-6mm}
%     \begin{algorithm}[H]
%         \label{infstrat:antithetic}
%         \SetAlgoLined\DontPrintSemicolon
%         \footnotesize{
%         \textbf{RAVI Inference Strategy:} Antithetic Sampling\;
%         \SetKwFunction{antithetic}{antithetic($\tilde{\pi}, q, T$).q}\SetKwFunction{antitheticm}{antithetic($\tilde{\pi}, q, T$).M($x$).q}
%         \SetKwProg{infalg}{Posterior Approx.}{}{}
%         \SetKwInOut{Infers}{Target of inference}
%         \SetKwInOut{Aux}{Auxiliary variables}
%         \infalg{\antithetic{}}{
%         \Infers{latent variable $x$}
%         \Aux{sampled $x_0$, choice $b$}
%         \nl $x_0 \sim q$\;
%         \nl $w_0 \gets \tilde{\pi}(x_0) / q(x_0)$\;
%         \nl $w_1 \gets \tilde{\pi}(T(x_0)) / q(x_0)$\;
%         \nl $b \sim \text{Bernoulli}(\frac{w_1}{w_0 + w_1})$\;
%         \nl \Return{$bT(x_0) + (1-b)x_0$}\;}{}
%         \setcounter{AlgoLine}{0}
%         \SetKwProg{metaalg}{Meta-Posterior Approx.}{}{}
%         \metaalg{\antitheticm{}}{
%         \Infers{sampled $x_0$, choice $b$}
%         \Aux{None}
%         \nl $b \sim \text{Bernoulli}(0.5)$\;
%         \nl $x_0 \gets bT(x) + (1-b)x$\;
%         \nl \Return{$(x_0, b)$}}
%         }
%         \vspace{-3mm}
%     \end{algorithm} 
% \end{wrapfigure}

% This can be justified as Algorithm 1 (\texttt{IMPORTANCE}) applied to the strategy~\hyperref[infstrat:antithetic]{\texttt{antithetic}}. The posterior approximation generates an initial sample $x_0 \sim q$, evaluates both $x_0$ and $T(x_0)$ as possible proposals, and selects one. The meta-posterior approximation must recover whether $x$ or its transformed version was the sampled one; it does so by flipping a fair coin, which is optimal when $T = T^{-1}$, i.e., when $T$ is an involution. In the general case a lower-variance estimator could be derived by setting $\mathcal{M}(x).q$ to the exact posterior of the proposal process. Antithetic sampling can also be generalized to the case where a finite family of bijective transformations $T_i$ is available.

% Note that although the final expression for $\hat{Z}$ falls out of this inference strategy only when $q(x) = q(T(x))$ for all $x$, nothing in the inference strategy itself exploits this assumption, and the same inference strategy could be applied to $T$ without this property, to derive other estimators that\textemdash intuitively\textemdash simultaneously consider a proposal $x$ and a deterministic function of it $T(x)$ as possible locations.


% \section{Absolute continuity}
% \label{sec:even-odd}

% When we defined inference strategies $\mathcal{S}$ targeting $\pi$, we required that $\mathcal{S}.q$ and $\pi$ be \textit{mutually} absolutely continuous, a stronger requirement than in importance sampling. We now consider relaxing this requirement, by requiring only \textit{one-sided} absolute continuity. We define two \textit{kinds} of inference strategy, depending on which direction of absolute continuity holds:

% \begin{enumerate}
%     \item An inference strategy $\mathcal{S}$ targeting $\pi$ is \textit{wide} if $\pi$ is absolutely continuous with respect to $\mathcal{S}.q$, and either $\mathcal{S}.q$ has a tractable marginal density or $\mathcal{S}.\mathcal{M}(x)$ is a narrow inference strategy targeting $\mathcal{S}.q(\cdot \mid x)$ for all $x$.
    
%     \item An inference strategy $\mathcal{S}$ targeting $\pi$ is \textit{narrow} if $\mathcal{S}.q$ is absolutely continuous with respect to $\pi$, and either $\mathcal{S}.q$ has a tractable marginal density or $\mathcal{S}.\mathcal{M}(x)$ is a wide inference strategy targeting $\mathcal{S}.q(\cdot \mid x)$ for all $x$.
% \end{enumerate}

% Then an inference strategy as defined in the main paper is one that is both wide and narrow. 

% Narrow inference strategies can serve as variational families within variational inference algorithms. Wide inference strategies can be used as importance sampling and SMC proposals, as well as variational families for \textit{amortized} variational inference.  Inference strategies used as MCMC proposals must be both wide and narrow.

% \section{Other applications of RAVI inference strategies}
% \label{sec:other-applications}

% \subsection{Rejection sampling with RAVI}
% As in any properly weighted sampler, if the weights produced by Alg. 1 can be bounded above by a constant $M$, a RAVI inference strategy can be used for exact inference via rejection sampling: a sample $(x, \hat{Z})$ is drawn using Alg. 1, and then accepted with probability $\frac{\hat{Z}}{M}$. The weight $\hat{Z}$ for an inference strategy can be viewed as a product of the normalizing constant $Z$ with normalized importance weights $w_\mathcal{S} = \frac{\pi(x)}{\mathcal{S}.q(x)}$, $w_{\mathcal{S}.\mathcal{M}(x)} = \frac{\mathcal{S}.q(r \mid x)}{\mathcal{S}.\mathcal{M}(x).q(r)}$, and so on. As such, if upper bounds $M_Z$ and $M_\mathcal{S}$, $M_{\mathcal{S}.\mathcal{M}(x)}$, etc. can be found for these quantities, the product of these bounds is a bound on $\hat{Z}$. Thus, as in properly weighted sampling and in variational inference with RAVI, it is possible to reason about the RAVI inference strategy compositionally, in terms of bounds at each layer of nesting.

% \subsection{Estimating KL divergences between models with RAVI inference strategies equipped}

% Suppose $p(y) = \int p(x, y) \text{d}x$ and $q(y) = \int q(z, y) \text{d}z$ are mutually absolutely continuous distributions over some space $\mathcal{Y}$. Suppose also that we have two families of inference strategies, $\mathcal{S}_p(y)$ and $\mathcal{S}_q(y)$, targeting $p(x\mid y)$ and $q(z \mid y)$ respectively. Then the AIDE algorithm~\citep{cusumano2017aide} can be adapted to give a stochastic upper bound on the symmetric KL divergence between $p(y)$ and $q(y)$.

% First, we generate $(x, y_p) \sim p$, $(z, y_q) \sim q$, and run $\texttt{HME}$ on each pair to obtain weights $w^p_p$ and $w^q_q$ respectively. Then, we run $\texttt{IMPORTANCE}$ on $p$ with data $y_q$, and on $q$ with data $y_p$, to obtain weights $w^p_q$ and $w^q_p$ respectively. Finally, we sum the logs of the four weights, to give an estimate $\hat{D}$ whose expectation is:

% $$
% \mathbb{E}[\hat{D}] =
% \mathbb{E}_{y \sim p}[\UU(p, y, \mathcal{S}_p(y)) - \LL(q, y, \mathcal{S}_q(y))] + \mathbb{E}_{y \sim q}[\UU(q, y, \mathcal{S}_q(y)) - \LL(p, y, \mathcal{S}_p(y))] \geq KL(p || q) + KL(q || p).
% $$

% As the marginal likelihood bounds $\UU$ and $\LL$ become tighter, this expectation approaches the true symmetric KL between $p$ and $q$, i.e., $D = KL(p|| q) + KL(q || p)$. Theorem 4 allows us to characterize the tightness of these bounds, and thus of the stochastic upper bound $\hat{D}$ on the symmetric KL, in terms of KL divergences between successive layers of each inference strategy. Improving inference at any layer of the inference strategy tightens the bound $\hat{D}$, yielding less biased estimates of $D$.

% \section{Reparameterization Trick Gradient Estimators}
% \label{sec:reparam}
% In this section, we present versions of Algorithms 3 and 4 that 
% utilize reparameterization gradients, rather than score function 
% gradients. Using these algorithms requires that an inference 
% strategy be \textit{reparameterizable}.

% \textbf{Definition:} A \textit{reparameterizable inference strategy $\mathcal{S}$ with arguments $\theta$} specifies:

% \begin{itemize}
% \item A reparameterizable posterior approximation $\mathcal{S}.q$, which is one of:
% \begin{itemize} 
%     \item a tractable proposal: a tuple $(\mathcal{S}.q(x; \theta), \mathcal{S}.q.g(\epsilon), \mathcal{S}.q.f(\epsilon, \theta))$, such that $q$ is the pushforward of $g$ by $f$; or
%     \item an intractable proposal: a tuple $(\mathcal{S}.q(r, x; \theta), \mathcal{S}.q.g(\epsilon_r, \epsilon_x), \mathcal{S}.q.f_r(\epsilon_r, \theta), \mathcal{S}.q.f_x(\epsilon_x, \theta))$, such that $q$ is the pushforward of $g$ by $\lambda (\epsilon_r, \epsilon_x). (f_r(\epsilon_r, \theta), f_x(\epsilon_x, \theta))$.
% \end{itemize}
% \item If the latter, a reparameterizable meta-inference strategy $\mathcal{S}.\mathcal{M}$, with arguments $(x, \theta)$, that given argument $(x, \theta)$, targets $\mathcal{S}.q(r \mid x; \theta)$.
% \end{itemize}

% Now, reparameterized estimators can be derived by applying standard automatic differentiation to the following algorithm, which only samples from distributions that do not depend on parameters:

% \begin{minipage}[t]{0.52\textwidth}
%     {
%     \removelatexerror
%     \vspace{-10pt}
%     \begin{algorithm}[H]
%         \label{alg:alg6}
%         %\label{infstrat:alg1}
%         \SetAlgoLined\DontPrintSemicolon
%         \textbf{Algorithm 6:} RAVI ELBO estimator ($\texttt{ELBO}$)\;
%         \KwIn{unnormalized model $\tilde{p}(x)$}
%         \KwIn{inference strategy $\mathcal{S}$ with arguments}
%         \KwIn{arguments $\theta$}
%         \KwOut{unbiased estimates of $\mathcal{L}$ (differentiable w.r.t. $\theta$)}
%         \nl \uIf{$\mathcal{S}.q$ has a tractable marginal density}{
%             \nl $\epsilon_x \sim \mathcal{S}.q.g$\;
%             \nl $x \gets \mathcal{S}.q.f(\epsilon_x, \theta)$\;
%             \nl $\hat{U} \gets \log \mathcal{S}.q(x; \theta)$\;
% %            \nl $\widehat{\grad} \gets \nabla_\theta \log \mathcal{S}.q(\mathcal{S}.f(\epsilon_x, \theta); \theta)$\;
%             % \nl $x \sim \mathcal{S}.q$\;
%             % \nl$ (\hat{U}, \widehat{\grad}) \gets (\log \mathcal{S}.q(x), \grad \log \mathcal{S}.q(x) \cdot (1 + \log \mathcal{S}.q(x)))$\; 
%             % \nl $\mathbf{g} \gets \grad \log \mathcal{S}.q(x)$\;
%         }
%         \nl \ElseIf{$\mathcal{S}.q(x; \theta) = \int \mathcal{S}.q(r, x; \theta)\text{d}r$}{
%             \nl $(\epsilon_r, \epsilon_x) \sim \mathcal{S}.q.g$\;
%             \nl $(x, r) \gets (\mathcal{S}.q.f_x(\epsilon_x, \theta), \mathcal{S}.q.f_r(\epsilon_r, \theta))$\;
%             \nl $\hat{U} \gets \texttt{EUBO}(\mathcal{S}.q(\cdot, x; \theta), r, \mathcal{S}.\mathcal{M}, (x, \theta))$\;
%     %        \nl $(r, w) \gets \text{Alg1}(\mathcal{S}.q(\cdot, x), \mathcal{S}.\mathcal{M}(x))$\;
%         }
%         \nl \Return{$\log \tilde{p}(x) - \hat{U}$}\;
%     \end{algorithm}
%     % \noindent\textbf{Algorithm 3.} Let $\mathcal{S}$ be a strategy targeting $p(x \mid y)$. Then the following procedure yields unbiased estimates of the ELBO
%     % $\LL(y)$ and $\grad \LL(y)$:
%     % \begin{enumerate}[leftmargin=*]
%     %     \item If $\mathcal{S}.q$ has a tractable density, sample $x \sim \mathcal{S}.q$ and set $(\hat{U}, \widehat{\grad}, \mathbf{g}) \gets (\log \mathcal{S}.q(x), \grad \log \mathcal{S}.q(x) \cdot (1 + \log \mathcal{S}.q(x)), \grad \log \mathcal{S}.q(x))$.
    
%     %     \item Else, if $\mathcal{S}.q(x) = \int \mathcal{S}.q(r, x) \text{d}r$, generate $(r, x) \sim \mathcal{S}.q$, and run {Algorithm 4} on the inference strategy $\mathcal{S}.\mathcal{M}(x)$ targeting $\mathcal{S}.q(r \mid x)$, and the sample $r$, to obtain $(\hat{U}, \widehat{\grad}, \mathbf{g})$.
        
%     %     \item Set $\hat{L} \gets \log p(x, y) - \hat{U}$.
        
%     %     \item Set $\widehat{\grad}'\gets \grad \log p(x, y)
%     %         + \mathbf{g} 
%     %         \log p(x, y)
%     %         - \widehat{\grad}.$
        
%     %     \item Return $(\hat{L}, \widehat{\grad}')$.
%     % \end{enumerate}
%     }
%     \end{minipage}\hfill%
%     \begin{minipage}[t]{0.48\textwidth}
%     {
%         \removelatexerror
%         \vspace{-10pt}
%         \begin{algorithm}[H]
%             \label{alg:alg7}
%             \SetAlgoLined\DontPrintSemicolon
%             \textbf{Algorithm 7:} RAVI EUBO estimator ($\texttt{EUBO}$)\;
%             \KwIn{unnormalized model $\tilde{p}(x)$}
%             \KwIn{exact sample $x \sim p(x)$}
%             \KwIn{inference strategy $\mathcal{S}$ with arguments}
%             \KwIn{arguments $\theta$}
%             \KwOut{unbiased estimate of $\mathcal{U}$ (differentiable w.r.t. $\theta$)}
%             \nl \uIf{$\mathcal{S}.q$ has a tractable marginal density}{
%                 \nl$\hat{L} \gets \log \mathcal{S}.q(x; \theta)$\;
%             }
%             \nl \ElseIf{$\mathcal{S}.q(x; \theta) = \int \mathcal{S}.q(r, x; \theta)\text{d}r$}{
%                 \nl $\hat{L} \gets \texttt{ELBO}(\mathcal{S}.q(\cdot, x; \theta), \mathcal{S}.\mathcal{M}, (x, \theta))$\;
%         %        \nl $(r, w) \gets \text{Alg1}(\mathcal{S}.q(\cdot, x), \mathcal{S}.\mathcal{M}(x))$\;
%             }
%             \nl \Return{$\log \tilde{p}(x) - \hat{L}$}\;
%         \end{algorithm}
%     % \noindent\textbf{Algorithm 4.} Let $\mathcal{S}$ be a strategy targeting $p(x \mid y)$ and $x$ an exact posterior sample. The following procedure yields unbiased estimates of $\UU(y)$ and $\grad \UU(y)$, and the quantity $\mathbf{g}$ (see Thm. 2):
%     % %the gradient $\grad \log p_0(x_0, x_1)$.
    
%     % \begin{enumerate}[leftmargin=*]
%     %     %\item Generate $(x, y) \sim p$.
        
%     %     \item If $\mathcal{S}.q$ has a tractable marginal density, then set $(\hat{L}, \widehat{\grad}) \gets (\log \mathcal{S}.q(x), \grad \log \mathcal{S}.q(x))$.
%     %     \item Else, if $\mathcal{S}.q(x) = \int \mathcal{S}.q(r, x)\text{d}r$, run Algorithm 3 on the strategy $\mathcal{S}.\mathcal{M}(x)$, targeting $\mathcal{S}.q(r \mid x)$, to obtain $(\hat{L}, \widehat{\grad})$.
    
%     %     \item Set $\hat{U} \gets \log p(x, y) - \hat{L}$.
        
%     %     \item Set $\mathbf{g} \gets \grad \log p(x, y)$.
    
%     %     \item Set $\widehat{\grad}' \gets
%     %         \grad \log p(x, y)
%     %         + \mathbf{g} \cdot \hat{U}
%     %         - \widehat{\grad}$.
            
%     %     \item Return $\left(\hat{U}, \widehat{\grad}', \mathbf{g} \right)$.
%     % \end{enumerate}
%     }
%     \end{minipage}

%     Note that in fact only every \textit{other} posterior approximation in the unrolled strategy requires a reparameterized version: Algorithm 7 never samples from its $\mathcal{S}.q$, only evaluates the densities.
 
%     It would be interesting to develop variants of these algorithms that allow users to combine score-function and reparameterization estimation at different layers of nesting, or exploit other variance reduction tactics compositionally.

% %     \section{Mass-capturing and mode-seeking VI}
 
% % \begin{figure*}
% %     %\centering
% % %    \includegraphics[width=0.9\linewidth]{figs/tutorial.pdf}
% %     \begin{subfigure}[b]{0.22\linewidth}
% %         \includegraphics[width=\linewidth]{figs/target.pdf}
% %         % \begin{align*}
% %         %     p(z, x) = \mathcal{N}&(z; \mathbf{0}, 5\mathbf{I})\cdot\\ 
% %         %     &\mathcal{N}(x; ||z||_2^2, \frac{3}{4})
% %         % \end{align*}
% %         \caption{Target distribution, the posterior $p_0(x_1 \mid x_0)$ of the latent-variable generative model $p(x_1, x_0) = \mathcal{N}(x_1; \mathbf{0}, 5\mathbf{I}) \cdot \mathcal{N}(x_0; ||x_1||_2^2, \frac{3}{4})$. Variational inference can be applied to approximate the posterior, but requires the choice of a variational family. }
% %         \label{fig:target}
% %     \end{subfigure}\hfill
% %     \begin{subfigure}[b]{0.22\linewidth}
% %     \includegraphics[width=\linewidth]{figs/simple_gaussian.pdf}
% %     % \begin{align*}
% %     %         q(z; x) = \mathcal{N}(z; \mu, \Sigma)\\
% %     %         \,
% %     % \end{align*}
% %     \caption{A Gaussian variational family. The Gaussian's density is tractable, so  standard (top) and amortized (bottom) variational inference can be applied. However, the Gaussian approximation is not flexible enough to accurately characterize the posterior distribution.}
% %     \label{fig:inadequate}
% %     \end{subfigure}\hfill
% %     \begin{subfigure}[b]{0.22\linewidth}
% %     \includegraphics[width=\linewidth]{figs/angle.pdf}
% %     % \begin{align*}
% %     %         q(z; x) = \mathcal{N}(z; \mu, \Sigma)\\
% %     %         \,
% %     % \end{align*}
% %     \caption{A RAVI inference strategy $\mathcal{S}$. $\mathcal{S}.q$ (left) approximates the posterior better by sampling a latent angle (red) and then a point (green), but has an intractable marginal density. RAVI's Algs. 3 (top) and 4 (bottom) are necessary to compute standard and amortized VI gradients.}
% %     \label{fig:angle}
% %     \end{subfigure}\hfill
% %     \begin{subfigure}[b]{0.22\linewidth}
% %     \includegraphics[width=\linewidth]{figs/SIR.pdf}
% %     % \begin{align*}
% %     %         q(z; x) = \mathcal{N}(z; \mu, \Sigma)\\
% %     %         \,
% %     % \end{align*}
% %     \caption{Existing algorithms, such as IWAE~\citep{burda2015importance}, can often be seen as RAVI inference strategies. This $\mathcal{S}.q$ generates a vector of $K$ particles (red) and chooses one (green) to propose, and has an intractable density. Algs. 3 (top) and 4 (bottom) can be used for optimization.}
% %     \label{fig:multiple}
% %     \end{subfigure}
% %     \caption{\textbf{Example of RAVI applied to a simple 2D posterior.}
% %     }
% %    \label{fig:tutorial}
% % \end{figure*}
\end{document}
