\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
%\usepackage[american]{babel}
\usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
\usepackage{algorithm}
\usepackage{algpseudocode}
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{amsmath,amsfonts,amssymb}
\usepackage{subcaption}
\usepackage{tikz}
\usetikzlibrary{bayesnet}
\usetikzlibrary{arrows} % nice language for creating drawings and diagrams
\usepackage{xr}
\externaldocument{Heap_216}

% --- Auto-sized delimiter helpers -------------------------------------------
% \bracket{<left>}{<right>}{<body>}: typesets <body> between \left<left> and
% \right<right>, so the delimiters grow with the body.
\newcommand{\bracket}[3]{\left#1 #3 \right#2}
% \mbracket{<left>}{<mid>}{<right>}{<lhs>}{<rhs>}: as \bracket, but with two
% bodies separated by a \middle<mid> delimiter (used for "a | b" style pairs).
\newcommand{\mbracket}[5]{\left#1 #4 \middle#2 #5 \right#3}
% The shorthands below supply only the delimiter arguments; each one therefore
% still consumes the braced group(s) that follow it at the point of use,
% e.g. \b{x} -> \left( x \right) and \bc{a}{b} -> \left( a \middle\vert b \right).
% \b is redefined (\renewcommand): in standard LaTeX \b is the bar-under text
% accent, which is no longer reachable under that name after this line.
\renewcommand{\b}{\bracket{(}{)}}
\newcommand{\bc}{\mbracket{(}{\vert}{)}}
% Angle brackets, curly braces, absolute-value bars and square brackets.
\newcommand{\ab}{\bracket{\langle}{\rangle}}
\newcommand{\cb}{\bracket{\{}{\}}}
\newcommand{\abs}{\bracket{\lvert}{\rvert}}
\newcommand{\sqb}{\bracket{[}{]}}
% --- Expectation, variance and probability notation -------------------------
% \E[<sub>]{<body>} and \Var[<sub>]{<body>}: upright E / Var with an optional
% subscript (empty by default), followed by <body> in auto-sized square
% brackets. The body is picked up by the trailing \sqb, not by \E itself.
\newcommand{\E}[1][]{\mathrm{E}_{#1}\sqb}
\newcommand{\Var}[1][]{\mathrm{Var}_{#1}\sqb}
% \bareP: the operator symbol P on its own, with no argument.
\newcommand{\bareP}{\operatorname{P}}
% \P[<sub>]{<body>}: P_<sub>(<body>). Redefines \P, which in standard LaTeX is
% the pilcrow (paragraph) sign.
\renewcommand{\P}[1][]{\bareP_{#1}\b}
% \Pc[<sub>]{<lhs>}{<rhs>}: conditional form, P_<sub>(<lhs> | <rhs>).
\newcommand{\Pc}[1][]{\bareP_{#1}\bc}
% \Pef / \Qef: \P and \Q with the fixed subscript "ef"; they still expect the
% body group. \Qef refers to \Q, which is defined a few lines further down;
% this works because the replacement text is only expanded when \Qef is used.
\newcommand{\Pef}{\P[\text{ef}]}
\newcommand{\Qef}{\Q[\text{ef}]}
% \bareQ: the operator symbol Q (wrapped in an extra brace group);
% \bareQt: the same operator under a tilde.
\newcommand{\bareQ}{{\operatorname{Q}}}
\newcommand{\bareQt}{\tilde{\operatorname{Q}}}
% \Q[<sub>]{<body>} and \Qc[<sub>]{<lhs>}{<rhs>}: Q analogues of \P and \Pc.
\newcommand{\Q}[1][]{\bareQ_{#1}\b}
\newcommand{\Qc}[1][]{\bareQ_{#1}\bc}
% Q with a fixed label as subscript, in plain (...b) and conditional (...c)
% forms. The labels \glob, \mp and \tmc are defined later in the preamble.
\newcommand{\Qglob}{\bareQ_{\glob}\b}
\newcommand{\Qglobc}{\bareQ_{\glob}\bc}
\newcommand{\Qmp}{\bareQ_{\mp}\b}
\newcommand{\Qmpc}{\bareQ_{\mp}\bc}
\newcommand{\Qtmc}{\bareQ_{\tmc}\b}
\newcommand{\Qtmcc}{\bareQ_{\tmc}\bc}
% --- Bold symbol shorthands --------------------------------------------------
% Upright bold letters via \mathbf and bold Greek via \boldsymbol.
\newcommand{\Z}{\mathbf{Z}}
\newcommand{\0}{\mathbf{0}}
\newcommand{\T}{\mathbf{T}}
\newcommand{\I}{\mathbf{I}}
\newcommand{\J}{\mathbf{J}}
% \u is redefined: in standard LaTeX it is the breve text accent.
% NOTE(review): any text that relies on the accent (e.g. author names in the
% bibliography) would be affected -- confirm none does.
\renewcommand{\u}{\mathbf{u}}
% \m: bold mu; \n: bold eta; \np / \nq: bold eta with subscript P / Q.
\newcommand{\m}{\boldsymbol{\mu}}
\newcommand{\n}{\boldsymbol{\eta}}
\newcommand{\np}{\boldsymbol{\eta}_{\bareP}}
\newcommand{\nq}{\boldsymbol{\eta}_{\bareQ}}
% \k is redefined: in standard LaTeX it is the ogonek text accent (same caveat
% as for \u above). In this document \k is the bold k used as a multi-index.
\renewcommand{\k}{\mathbf{k}}
\newcommand{\K}{\mathbf{K}}
% --- Miscellaneous notation ---------------------------------------------------
% \ceq: an equals sign inside a brace group, so TeX sets it as an ordinary
% symbol without the usual relation spacing ("compact equals").
\newcommand{\ceq}{{=}}
% \cbi{<x>}, \cbk{<x>}, \cbik{<x>}: <x> in fixed-size curly braces, decorated
% with the index range i=1..n, k=1..K, or the subscript "ik" respectively.
\newcommand{\cbi}[1]{\{ #1\}_{i=1}^n}
\newcommand{\cbk}[1]{\{ #1\}_{k=1}^K}
\newcommand{\cbik}[1]{\{ #1\}_{ik}}
% \pa{<i>}, \pl{<i>}, \qa{<i>}: the upright label "pa" / "pl" / "qa" followed by
% <i> in auto-sized parentheses. The replacement text is wrapped in an outer
% brace group, which is what allows use directly after _ or ^ (e.g. z_\qa{i}).
\newcommand{\pa}[1]{{\textrm{pa}\b{#1}}}
\newcommand{\pl}[1]{{\textrm{pl}\b{#1}}}
% \apl: the bare label "pl" with no argument.
\newcommand{\apl}{\textrm{pl}}
% \pax: shorthand for \pa{x}.
\newcommand{\pax}{\pa{x}}
\newcommand{\qa}[1]{{\textrm{qa}\b{#1}}}
% \fa: the label "qa" followed by \b, so it consumes the next braced group as
% the parenthesised argument (no outer brace group, unlike \qa).
% NOTE(review): despite its name this prints "qa", the same label as \qa --
% confirm that is intended.
\newcommand{\fa}{\textrm{qa}\b}
% \zi / \zn: z with superscript "ind" / "non-ind".
\newcommand{\zi}{z^{\text{ind}}}
\newcommand{\zn}{z^{\text{non-ind}}}
% \dd[<f>]{<x>}: partial derivative of <f> with respect to <x>; the numerator
% argument is optional and empty by default (giving the bare operator).
\newcommand{\dd}[2][]{\frac{\partial #1}{\partial #2}}
% \at{<body>}: <body> followed by an auto-sized right vertical bar, with an
% invisible left delimiter ("evaluated at" bar).
\newcommand{\at}{\bracket{.}{\rvert}}
% \Dkl{<p>}{<q>}: D_KL(<p> || <q>) with auto-sized delimiters.
\newcommand{\Dkl}{\operatorname{D}_\text{KL}\mbracket{(}{\Vert}{)}}
% \argmax: operator defined with \operatorname*, i.e. subscripts are set as
% limits underneath in display style.
\newcommand{\argmax}{\operatorname*{argmax}}

% --- Upright text labels (used mainly as subscripts) -------------------------
\newcommand{\tmc}{\textrm{TMC}}
\newcommand{\nis}{\textrm{NIS}}
\newcommand{\snis}{\textrm{SNIS}}
% \mp is redefined: in standard LaTeX it is the minus-plus symbol, which is no
% longer available under that name. Here it is the label "MP".
\renewcommand{\mp}{\textrm{MP}}
\newcommand{\glob}{\textrm{global}}
\newcommand{\post}{\textrm{post}}
\newcommand{\old}{\textrm{old}}
% --- Calligraphic estimator / objective symbols --------------------------------
% \Pe: calligraphic P. \L: calligraphic L; redefines \L, which in standard
% LaTeX is the letter L with stroke.
\newcommand{\Pe}{\mathcal{P}}
\renewcommand{\L}{\mathcal{L}}
% Calligraphic P with the MP label as subscript, plus variants carrying the
% superscript "exp", "marg" or "samp".
\newcommand{\Pmp}{\Pe_\mp}
\newcommand{\Pmpexp}{\Pmp^\text{exp}}
\newcommand{\Pmpmarg}{\Pmp^\text{marg}}
\newcommand{\Pmpsamp}{\Pmp^\text{samp}}
% Calligraphic P / L with the "global", "old" or "MP" label as subscript.
\newcommand{\Pglob}{\Pe_\glob}
\newcommand{\Pold}{\Pe_\old}
\newcommand{\Lmp}{\L_\mp}
\newcommand{\Lglob}{\L_\glob}
% \const: upright operator "const".
\newcommand{\const}{\operatorname{const}}

% \tsum / \tprod: sum and product signs forced to \textstyle (the compact
% inline size), each wrapped in its own group so the style switch stays local.
\newcommand{\tsum}{{\textstyle \sum}}
\newcommand{\tprod}{{\textstyle \prod}}


% --- Parameter-update symbols --------------------------------------------------
% Each of these typesets "\Delta \theta" or "\Delta \phi" with a label subscript
% (global / MP / post). Despite the names, they print the update (with the
% leading \Delta), not the bare parameter.
\newcommand{\thetaglob}{\Delta \theta_\glob}
\newcommand{\phiglob}{\Delta \phi_\glob}
\newcommand{\thetamp}{\Delta \theta_\mp}
\newcommand{\phimp}{\Delta \phi_\mp}
\newcommand{\thetapost}{\Delta \theta_\post}
\newcommand{\phipost}{\Delta \phi_\post}

% Generic update symbol: \Delta with the post / global / MP label as subscript.
\newcommand{\Dpost}{\Delta_\post}
\newcommand{\Dglob}{\Delta_\glob}
\newcommand{\Dmp}{\Delta_\mp}

% Plain-text names of the methods (no font switch), for use in running text and
% headings; \textGlob is the capitalised form used at the start of a heading.
\newcommand{\texttmc}{TMC}
\newcommand{\textglob}{global}
\newcommand{\textGlob}{Global}

% \cdo{<body>}: upright operator "do" followed by <body> in auto-sized
% parentheses (the body is consumed by the trailing \b).
\newcommand{\cdo}{\operatorname{do}\b}

% \Normal: calligraphic N.
\newcommand{\Normal}{\mathcal{N}}

% --- Citation bundles -----------------------------------------------------------
% Each macro expands to one natbib \citep call over a fixed list of keys. Where
% an optional argument is accepted it is forwarded as the first optional
% argument of \citep (the pre-note, e.g. "e.g."); the second optional argument
% (the post-note) is always left empty.
% \citedeg[<pre>]: fixed list of keys, cited in the text where particle
% degeneracy is discussed.
\newcommand{\citedeg}[1][]{\citep[#1][]{carpenter1999improved,li2012deterministic,li2014fight,zhou2016new,wang2017survey}}

% \citepf[<pre>]: fixed list of keys; it contains every key of \citevpf below
% plus three further ones.
\newcommand{\citepf}[1][]{\citep[#1][]{gordon1993novel,doucet2009tutorial,andrieu2010particle,maddison2017filtering,le2017auto,lindsten2017divide,naesseth2018variational,lai2022variational}}

% \citevpf: a subset of the \citepf keys; takes no argument.
\newcommand{\citevpf}{\citep{maddison2017filtering,le2017auto,lindsten2017divide,naesseth2018variational,lai2022variational}}

% \citevi[<pre>], \citeiwae[<pre>], \citerws[<pre>]: fixed key lists.
% NOTE(review): presumably the variational-inference, IWAE and reweighted
% wake-sleep references respectively, judging by the macro and key names.
\newcommand{\citevi}[1][]{\citep[#1][]{jordan1999introduction,wainwright2008graphical,kingma2013auto,rezende2014stochastic,blei2017variational,nguyen2017variational,zhang2018advances,kingma2019introduction,gayoso2021joint}}
\newcommand{\citeiwae}[1][]{\citep[#1][]{burda2015importance,cremer2017reinterpreting}}
\newcommand{\citerws}[1][]{\citep[#1][]{bornschein2014reweighted,le2020revisiting}}


\onecolumn
\title{Massively Parallel Reweighted Wake-Sleep (Supplementary Material)}

\author[1]{Thomas Heap}
\author[1]{Gavin Leech}
\author[1]{\href{mailto:<laurence.aitchison@bristol.ac.uk>?Subject=Your UAI 2023 paper}{Laurence Aitchison}{}}



% Add affiliations after the authors
\affil[1]{%
    Department of Computer Science\\
    University of Bristol\\
    Bristol
}

\begin{document}
\onecolumn
\maketitle

%% Turn this off if single column is desired for the supplement


\section{Proof of equivalence of the different forms of the global RWS updates}
\label{app:rws:updates}
We start with the RWS $\bareP$ update (Eq.~\ref{eq:rws_global:obj:P}), then use $\nabla_\theta \log \mathcal{P}_\text{global}(z) = \b{\nabla_\theta \mathcal{P}_\text{global}(z)} / \mathcal{P}_\text{global}(z)$,
\begin{align}
  % &= \E[{\Q[\phi]{z| x}}]{\nabla_\theta \log \mathcal{P}_\text{global}(z)}.\\
  %\intertext{
  \E{\Delta \theta_\text{RWS}} &= \E[{\Q[\phi]{z| x}}]{\frac{\nabla_\theta \mathcal{P}_\text{global}(z)}{\Pglob(z)}}.\\
  \intertext{Using the definition of $\mathcal{P}_\text{global}(z)$ (Eq.~\ref{eq:Pglobal}),}
  \E{\Delta \theta_\text{RWS}} &= \E[{\Q[\phi]{z| x}}]{\frac{\nabla_\theta \tfrac{1}{K} \sum_k r_k(z)}{\Pglob(z)}}\\
  \intertext{Substituting for $r_k(z)$ (Eq.~\ref{eq:rglobal}) in the numerator,}
  \E{\Delta \theta_\text{RWS}} &= \E[{\Q[\phi]{z| x}}]{\frac{\tfrac{1}{K} \sum_k \frac{\nabla_\theta \P[\theta]{z^k, x}}{\Q[\phi]{z^k| x}} }{\Pglob(z)}}\\
  \intertext{substituting $\nabla_\theta \P[\theta]{z^k, x} = \P[\theta]{z^k, x} \nabla_\theta \log \P[\theta]{z^k, x}$,}
  \E{\Delta \theta_\text{RWS}} &= \E[{\Q[\phi]{z| x}}]{\frac{1}{K} \sum_k \frac{\frac{\P[\theta]{z^k, x}}{\Q[\phi]{z^k| x}}}{\Pglob(z)}\nabla_\theta \log \P[\theta]{z^k, x}}
  %\E{\Delta \theta_\text{RWS}} &= \E[{\Q[\phi]{z| x}}]{\sum_k \frac{r(z^k)}{\sum_{k'} r(z^{k'})}\nabla_\theta \log \P[\theta]{z^k, x}},
\end{align}
Noticing that the ratio of $\P[\theta]{z^k, x}$ and $\Q[\phi]{z^k| x}$ in the numerator is equal to $r_k(z)$ (Eq.~\ref{eq:rglobal}), we get back to Eq.~\eqref{eq:rws_global:iw:P}, as required.

The RWS $\bareQ$ update is very similar.  Again, we start with Eq.~\eqref{eq:rws_global:obj:Q}, then use $\nabla_\phi \log \mathcal{P}_\text{global}(z) = \b{\nabla_\phi \mathcal{P}_\text{global}(z)} / \mathcal{P}_\text{global}(z)$,
\begin{align}
  \E{\Delta \phi_\text{RWS}}&= - \E[{\Q[\phi]{z| x}}]{\frac{\nabla_\phi \mathcal{P}_\text{global}(z)}{\mathcal{P}_\text{global}(z)}}\\
  \intertext{Using the definition of $\mathcal{P}_\text{global}(z)$ (Eq.~\ref{eq:Pglobal}),}
  \E{\Delta \phi_\text{RWS}}&= - \E[{\Q[\phi]{z| x}}]{\frac{\nabla_\phi \tfrac{1}{K} \sum_k r_k(z)}{\Pglob(z)}}\\
  \intertext{Substituting for $r_k(z)$ (Eq.~\ref{eq:rglobal}) in the numerator,}
  \E{\Delta \phi_\text{RWS}}&= - \E[{\Q[\phi]{z| x}}]{\frac{\tfrac{1}{K} \sum_k \nabla_\phi \frac{\P[\theta]{z^k, x}}{\Q[\phi]{z^k| x}} }{\Pglob(z)}}\\
  \intertext{Computing the derivative,}
  \E{\Delta \phi_\text{RWS}}&= \E[{\Q[\phi]{z| x}}]{\frac{\tfrac{1}{K} \sum_k \frac{\P[\theta]{z^k, x}}{\b{\Q[\phi]{z^k| x}}^2} \nabla_\phi \Q[\phi]{z^k| x}}{\Pglob(z)}}.\\
  \intertext{Noticing that $\b{\nabla_\phi \Q[\phi]{z^k| x}} / \Q[\phi]{z^k| x} = \nabla_\phi \log \Q[\phi]{z^k| x}$,}
  \E{\Delta \phi_\text{RWS}}&= \E[{\Q[\phi]{z| x}}]{\frac{\tfrac{1}{K} \sum_k \frac{\P[\theta]{z^k, x}}{\Q[\phi]{z^k| x}} \nabla_\phi \log \Q[\phi]{z^k| x}}{\Pglob(z)}}.
\end{align}
Finally, noticing that the ratio of $\P[\theta]{z^k, x}$ and $\Q[\phi]{z^k| x}$ in the numerator is equal to $r_k(z)$ (Eq.~\ref{eq:rglobal}), we get back to Eq.~\eqref{eq:rws_global:iw:Q}, as required.

Both of these derivations may be straightforwardly repeated for the massively parallel setting, simply by replacing $k \in \mathcal{K}$ with $\k\in\mathcal{K}^n$, and by replacing $1/K$ with $1/K^n$.

\section{TMC vs massively parallel approximate posteriors}
\label{app:ap}
TMC approximate posteriors draw the $K$ samples of the $i$th latent variable IID,
\begin{align}
  \Qtmcc{z_i}{z_j \text{ for all } j \in \qa{i}} &=
  \prod_{k_i\in\mathcal{K}} \Qtmcc{z_i^{k_i}}{z_j \text{ for all } j \in \qa{i}}\\
  \intertext{Specifically, TMC draws each sample from an equally weighted mixture over all parent particles,}
  \Qtmcc{z_i^{k_i}}{z_j \text{ for all } j \in \qa{i}} &=
    \tfrac{1}{K^{|\qa{i}|}} \sum_{\k_{\qa{i}}} \Qglobc{z_i^{k_i}}{z_j^{k_j} \text{ for all } j \in \qa{i}}.
\end{align}
In contrast, massively parallel methods do not force us to sample particles IID.
The key issue with IID sampling is that it introduces the risk of particle degeneracy \citedeg{}.
In particle degeneracy, some of the parent samples (e.g.\ $z_j^1$ where $j \in \qa{i}$) might have multiple children, in the sense that multiple $z_i^k$ are sampled from the mixture component arising from $z_j^1$.
At the same time, some of the parents (e.g.\ $z_j^2$) might have no children, in the sense that no $z_i^k$ are sampled from a mixture component arising from $z_j^2$.
This is problematic because it reduces diversity in the population of samples, $z_i=(z_i^1,\dotsc,z_i^K)$, and this reduction in diversity can be especially problematic in models with long chains of latent variables, such as timeseries models.
To reduce the risk of particle degeneracy, the massively parallel methods considered here couple the distribution over each of the $K$ particles,
\begin{align}
  \Qmpc{z_i}{z_j \text{ for all } j \in \qa{i}} &\neq \prod_{k_i\in\mathcal{K}} \Qtmcc{z_i^{k_i}}{z_j \text{ for all } j \in \qa{i}}.
\end{align}
However, we do ensure that the marginal for a single particle is the same as for TMC,
\begin{align}
  \Qmpc{z_i^{k_i}}{z_j \text{ for all } j \in \qa{i}} &=
    \tfrac{1}{K^{|\qa{i}|}} \sum_{\k_{\qa{i}}} \Qglobc{z_i^{k_i}}{z_j^{k_j} \text{ for all } j \in \qa{i}}.
\end{align}
To achieve this, we sample a permutation, $\pi$, for each latent variable, and the permutation tells us which parent particle to consider.
To give an example for one parent,
\begin{align}
  \Qmpc{z_i}{\pi, z_j} &= \prod_{k_i} \Qmpc{z_i^{k_i}}{\pi, z_j} \\
  \Qmpc{z_i^{k_i}}{\pi, z_j} &= \Qc[\phi]{z_i^{k_i}}{z_j^{\pi_{k_i}}}.
\end{align}
Critically, if we marginalise over the permutation, the distribution over a single $z_i^{k_i}$ has the same density as that from a uniform mixture,
\begin{align}
  \Qmpc{z_i^{k_i}}{z_j} &= \tfrac{1}{K!} \sum_{\pi} \Qmpc{z_i^{k_i}}{\pi, z_j}\\
  \Qmpc{z_i^{k_i}}{z_j}  &= \tfrac{1}{K} \sum_{k_j} \Qc[\phi]{z_i^{k_i}}{z_j^{k_j}}.
\end{align}
Finally, if we have multiple parent latent variables, we independently sample a permutation for each latent variable.

\section{Massively parallel IWAE and RWS}
\label{app:MP}
Before getting started, it will prove useful to define some briefer notation than that used in the main text.
Specifically, we use,
\begin{align}
  z_\qa{i} &= \cb{z_j \text{ for all } j \in \qa{i}},\\
  z^{k_i}_\qa{i} &= \cb{z_j^{k_i} \text{ for all } j \in \qa{i}},\\
  z^{\k_\pa{i}}_\pa{i} &= \cb{z_j^{k_j} \text{ for all } j \in \pa{i}},
\end{align}
so,
\begin{align}
  \label{eq:Qgeneral}
  \Qc{z_i^{k_i}}{x, z_\qa{i}} &= \Qc{z_i^{k_i}}{x, z_j \text{ for all } j \in \qa{i}},\\
  % \label{eq:Qspecial}
  % \Qc{z_i^{k_i}}{x, z_\qa{i}^{k_i}} &= \Qc{z_i^{k_i}}{x, z_j^{k_i} \text{ for all } j \in \qa{i}},\\
  \Pc[\theta]{z^{k_i}_i}{z^{\k_\pa{i}}_\pa{i}} &= \Pc[\theta]{z^{k_i}_i}{z_j^{k_j} \text{ for all } j \in \pa{i}}
\end{align}
Note that in Eq.~\eqref{eq:Qgeneral}, we allow for the possibility of a slightly more general form for the approximate posterior, where the distribution over $z_i^{k_i}$ may depend on any of the parent samples.
% The usual TMC approximate posterior (Eq.~\ref{eq:Qspecial}) is a special case of this more general form.
This generalisation ensures that the subsequent derivations generalise to other possible forms for the approximate posterior, such as those for TMC (Eq.~\ref{eq:Qmp}).

In addition, it is useful to introduce notation to describe the ``non-indexed'' latent variables (i.e.\ everything in $z$ that is not $z^\k$).
The $i$th non-indexed latents are $z_i^{/k_i}$,
\begin{align}
  z_i^{/k_i} &= \b{z_i^{1},\dotsc,z_i^{k_i-1},z_i^{k_i+1},\dotsc,z_i^K} \in \mathcal{Z}_i^{K-1}.\\
  \intertext{and $z^{/\k}$ are all non-indexed latents,}
  z^{/\k} &= \b{z_1^{/k_1}, z_2^{/k_2}, \dotsc, z_n^{/k_n}} \in \mathcal{Z}^{K-1}.
\end{align}
%Using the notion of the non-indexed latent variables, we can write down two distributions that will become useful later.
%These distributions arise by causal interventions on the approximate posterior sampling process, \citep{pearl1995causal,pearl2000models,pearl2009causal,hernan2010causal,glymour2016causal},
%\begin{subequations}
%\label{eq:Qdo}
%\begin{align}
%  \label{eq:Qdo_z/k}
%  \Qc{z^\k}{\cdo{z^{/\k}}, x} &= \prod_i \Q{z_i^{k_i}| x, z_{\qa{i}}},\\
%  \label{eq:Qdo_zk}
%  \Qc{z^{/\k}}{\cdo{z^{\k}}, x} &= \prod_i \prod_{\kappa \in \mathcal{K}/k_i} \Q{z_i^{\kappa}| x, z_{\qa{i}}}.
%\end{align}
%\end{subequations}
%Here, $\mathcal{K}/k_i = \{1,\dotsc,k_i-1,k_i+1,\dotsc,K\}$ is the set of all indicies from $1$ to $K$, with $k_i$ left out.
%The resulting probabilities are subsets of terms in the full approximate posterior probability (Eq.~\ref{eq:Qmp}).
%Of course, distributions resulting from these causal interventions are very different from those resulting from traditional conditioning.
%For our purposes, the critical property of these two distributions resulting from different causal interventions is that the product of their probabilities is the overall joint distribution,
%\begin{align}
%  \label{eq:Qz_QkQ/k}
%  \Q{z| x} &= \Qc{z^\k}{\cdo{z^{/\k}}, x} \Qc{z^{/\k}}{\cdo{z^{\k}}, x},
%\end{align}
%which arises by inspecting the definitions of these distributions in Eq.~\eqref{eq:Qdo}.

\subsection{IWAE}
\subsubsection{Single-Sample VI}
We begin by building intuition by looking at the derivation for the ELBO in the standard single-sample VAE.
We start by writing the marginal likelihood as an integral,
\begin{align}
  \P[\theta]{x} &= \int dz' \P[\theta]{x, z'}.
  \intertext{Here, we use $z'\in \mathcal{Z}$ to denote a single sample from the full joint state space; we use $z'$ instead of $z$ because $z$ is reserved for $K$ samples (Eq.~\ref{eq:z}). Next, we divide and multiply by the approximate posterior probability, $\Q[\phi]{z'|x}$,}
  \P[\theta]{x} &= \int dz' \Q[\phi]{z'|x} \frac{\P[\theta]{x, z'}}{\Q[\phi]{z'|x}}.
  \intertext{Now, we can rewrite the integral as an expectation under the approximate posterior,}
  \label{eq:vae:E}
  \P[\theta]{x} &= \E[{\Q[\phi]{z'|x}}]{\frac{\P[\theta]{x,z'}}{\Q[\phi]{z'|x}}}.\\
  \intertext{Now we take the logarithm on both sides and apply Jensen's inequality,}
  \label{eq:vae:jensen}
  \log \P[\theta]{x} &= \log \E[{\Q[\phi]{z'|x}}]{\frac{\P[\theta]{x,z'}}{\Q[\phi]{z'|x}}} \geq \E[{\Q[\phi]{z'|x}}]{\log \frac{\P[\theta]{x,z'}}{\Q[\phi]{z'|x}}} = \L_\text{VAE}
\end{align}
Of course, this derivation is specific to the single-sample VAE.
But we can pull out an underlying strategy that generalises to the multi-sample setting.
In particular, we first come up with an unbiased estimator of the marginal likelihood.
In our VAE, this is,
\begin{align}
  \mathcal{P}_\text{VAE}(z') &= \frac{\P[\theta]{x, z'}}{\Q[\phi]{z'|x}}\\
  \intertext{Following Eq.~\eqref{eq:vae:E} we can see that this quantity is an unbiased estimator of the marginal likelihood if $z'$ is sampled from $\Q[\phi]{z'|x}$,}
  \P[\theta]{x} &= \E[{\Q[\phi]{z'|x}}]{\mathcal{P}_\text{VAE}(z')}
  \intertext{Then we apply Jensen's inequality (Eq.~\ref{eq:vae:jensen}),}
  \log \P[\theta]{x} &\geq \L_\text{VAE} = \E[{\Q[\phi]{z'|x}}]{\log \mathcal{P}_\text{VAE}(z')}.
\end{align}
However, this approach highlights key issues with the usual single-sample bound.
In particular, the single-sample estimator, $\mathcal{P}_\text{VAE}(z')$, can be very high-variance, and variance in the unbiased estimator causes the Jensen bound to become looser.

\subsubsection{Global IWAE}
\label{app:iwae_glob}
To reduce variance in the unbiased estimator, a natural approach is to average $K$ independent samples, and this is exactly what global IWAE does,
\begin{align}
  \Pglob(z) &= \frac{1}{K} \sum_{k=1}^K r_k(z) = \frac{1}{K} \sum_{k=1}^K  \mathcal{P}_\text{VAE}(z^k)
\end{align}
This is of course an unbiased estimator, as it is the average of $K$ unbiased estimators,
\begin{align}
  \P[\theta]{x} &= \E[{\Q[\phi]{z| x}}]{\Pglob(z)}.
\end{align}
Therefore, applying Jensen's inequality gives a new lower-bound on the log-marginal likelihood,
\begin{align}
  \log \P[\theta]{x} &= \log \E[{\Q[\phi]{z| x}}]{\Pglob(z)} \geq \E[{\Q[\phi]{z| x}}]{\log \Pglob(z)} = \mathcal{L}_\text{IWAE}
\end{align}
which is tighter than the usual single-sample ELBO \citep{burda2015importance}, and which matches Eq.~\eqref{eq:L:iwae} in the main text.

\subsubsection{Massively Parallel IWAE}
\label{app:iwae_tmc}
Our proposed $\Pmp(z)$ (Eq.~\ref{eq:Pmp}) is the average of $K^n$ terms, rather than $K$ terms in global IWAE.
To prove that our massively parallel strategy is valid, we show that every term in this average is an unbiased estimator of $\P[\theta]{x}$, in which case the average is also an unbiased estimator, and we can again apply Jensen.

Each term in the average $\Pmp(z)$ (Eq.~\ref{eq:Pmp}) is of the form $r_\k(z)$ (Eq.~\ref{eq:rtmc}).
The expectation of each term is,
\begin{align}
  \E[{\Q[\phi]{z|x}}]{r_\k(z)} &= \E[{\Q[\phi]{z|x}}]{\frac{\P[\theta]{x, z^{\k}}}{\prod_i \Q[\phi]{z_i^{k_i}|x, z_{\qa{i}}}}}.
  \intertext{We can rewrite the expectation as an integral,}
  \E[{\Q[\phi]{z|x}}]{r_\k(z)} &= \int dz \P[\theta]{x, z^{\k}} \prod_i  \frac{\Q[\phi]{z_i| x, z_{\qa{i}}}}{\Q[\phi]{z_i^{k_i}|x, z_{\qa{i}}}}.
  \intertext{Bayes theorem tells us,}
  \frac{\Q[\phi]{z_i| x, z_{\qa{i}}}}{\Q[\phi]{z_i^{k_i}|x, z_{\qa{i}}}} &=
  \frac{\Q[\phi]{z_i^{k_i}, z_i^{/k_i}| x, z_{\qa{i}}}}{\Q[\phi]{z_i^{k_i}|x, z_{\qa{i}}}} = \Q{z_i^{/k_i}|x, z_i^{k_i}, z_{\qa{i}}},
  \intertext{Applying Bayes theorem,}
  \E[{\Q[\phi]{z|x}}]{r_\k(z)} &= \int dz \P[\theta]{x, z^{\k}} \prod_i \Q{z_i^{/k_i}|x, z_i^{k_i}, z_{\qa{i}}}.
\end{align}
Importantly, the integrand is a valid joint distribution over $x$ and $z$, or equivalently over $x$, $z^\k$ and $z^{/\k}$.
Thus, integrating over $z^{/\k}$ then $z^\k$, we find,
\begin{align}
  \E[{\Q[\phi]{z|x}}]{r_\k(z)} &= \P[\theta]{x}.
\end{align}
As such, each of the $r_\k(z)$ terms is an unbiased estimator of the marginal likelihood.
As $\Pmp(z)$ (Eq.~\ref{eq:Pmp}) is just an average of $K^n$ $r_\k(z)$ terms, it is also an unbiased estimator.
Applying Jensen's inequality to this unbiased estimator,
\begin{align}
  \log \P[\theta]{x} &\geq \E[{\Q[\phi]{z| x}}]{\log \Pmp(z)} = \Lmp,
\end{align}
which mirrors Eq.~\eqref{eq:L:tmc} in the main text.


\subsection{RWS}
\subsubsection{\textGlob{} RWS}
\label{app:rws_glob}
To build intuition, we first give a derivation of the standard RWS updates.
Ideally the updates would use samples drawn from the true posterior, $\P[\theta]{z| x}$,
\begin{subequations}
\begin{align}
  \thetapost &= \E[{\P[\theta]{z^k| x}}]{\nabla_\theta \log \P[\theta]{z, x}}\\
  \phipost &= \E[{\P[\theta]{z^k| x}}]{\nabla_\phi   \log \Q[\phi]{z| x}}
\end{align}
\end{subequations}
The $\bareP$ update is exactly the M-step in EM, and the $\bareQ$ step trains $\Q[\phi]{z|x}$ using maximum likelihood based on samples from the true posterior.
To simplify the derivations, we note that both of these updates can be understood as computing a moment under the true posterior,
\begin{align}
  \Dpost &= \E[{\P[\theta]{z^k| x}}]{\Delta(z^k)}.
\end{align}
For the $\bareP$ update, we have $\Dpost = \thetapost$ and $\Delta(z^k) = \nabla_\theta \log \P[\theta]{z, x}$.
For the $\bareQ$ update, we have $\Dpost = \phipost$ and $\Delta(z^k) = \nabla_\phi \log \Q[\phi]{z| x}$.
Of course, in practice, the true posterior is intractable, so instead we must use some form of importance weighting.
We begin by writing the generic form for the updates as an integral,
\begin{align}
  \Dpost &= \int dz^k \P{z^k| x}\Delta(z^k).
  \intertext{We then multiply and divide by an approximate posterior, $\Q{z^k| x}$,}
  \Dpost &= \int dz^k \Q{z^k| x} \frac{\P{z^k| x}}{\Q{z^k| x}} \Delta(z^k).
  \intertext{We can rewrite the integral as expectation over the approximate posterior, $\Q{z^k| x}$,}
  \Dpost &= \E[\Q{z^k| x}]{\frac{\P{z^k| x}}{\Q{z^k| x}} \Delta(z^k)}.
%  \intertext{And we can take the average over $k\in\mathcal{K}$,}
%  \Dpost &= \E[\Q{z^k| x}]{\frac{1}{K} \sum_{k\in\mathcal{K}} \frac{\P{z^k| x}}{\Q{z^k| x}} \Delta(z^k)}.
\end{align}
This quantity is difficult to use directly, because computing the posterior, $\P{z^k| x}$, involves the marginal likelihood, $\P[\theta]{x}$, which is intractable,
\begin{align}
  \P[\theta]{z^k| x} &= \tfrac{\P[\theta]{z^k, x}}{\P[\theta]{x}}  & \P[\theta]{x} = \int dz^k \P[\theta]{z^k, x}.
\end{align}
%Substituting this form for the true posterior,
%\begin{align}
%  \Dpost &= \E[\Q{z^k| x}]{\frac{\frac{\P{z^k| x}}{\Q{z^k| x}}}{\P[\theta]{x}} \Delta(z^k)}.
%\end{align}
As the true marginal likelihood is intractable, we instead use $\Pglob(z)$ (Eq.~\ref{eq:Pglobal}), which is an unbiased estimator of $\P{x}$, and is correct in the limit as $K\rightarrow \infty$ \citep{burda2015importance}.
This gives updates of the form,
\begin{align}
  \Dglob &= \E[{\Q[\phi]{z| x}}]{\frac{\frac{\P{z^k, x}}{\Q{z^k| x}}}{\Pglob(z)} \Delta(z^k)}.
\end{align}
Remembering the definition of $r_k(z)$ (Eq.~\ref{eq:rglobal}), this can be written,
\begin{align}
  \Dglob &= \E[{\Q[\phi]{z|x}}]{\frac{r_k(z)}{\Pglob(z)}\Delta(z^k)}.
  %\Dglob &= \E[{\Q[\phi]{z|x}}]{\frac{1}{K} \sum_k \frac{r_k(z)}{\Pglob(z)}\Delta(z^k)}.
\end{align}
Finally, as the expectation is the same for all $k$, we can average over $k$, which gives the expression in the main text (Eq.~\ref{eq:rws_global:iw}).
%Finally, note that the expectation is the same for every value of $k$.
%We therefore get the same thing if we average over all $k$,
%\begin{align}
%  \Dglob &= \E[{\Q[\phi]{z|x}}]{\tfrac{1}{K} \sum_k \frac{r_k(z)}{\tfrac{1}{K} \sum_{k'} r_{k'}(z)}\Delta(z^k)}.
%\end{align}

\subsubsection{Massively Parallel RWS}
\label{app:rws_tmc}


Now, we can move on to massively parallel RWS.
In the previous derivation for global RWS, we showed that each sample, $z^k$, individually constituted an unbiased estimator.
In the massively parallel setting, the key difference is that instead of having $K$ samples $z^k$, we have $K^n$ samples, $z^\k$.
In particular,
\begin{align}
  \Dpost &= \E[\P{z^\k| x}]{\Delta(z^\k)} = \int dz^\k \P{z^\k| x}\Delta(z^\k).\\
  \intertext{Now, we multiply and divide by $\prod_i \Qc{z_i^{k_i}}{x, z_\qa{i}}$,}
  \label{eq:dpost_int}
  \Dpost &= \int dz^\k \b{\prod_i \Qc{z_i^{k_i}}{x, z_\qa{i}}} \frac{\P{z^\k| x}}{\prod_i \Qc{z_i^{k_i}}{x, z_\qa{i}}}  \Delta(z^\k).
\end{align}
Now, we introduce and integrate out a distribution over the non-indexed latent variables, $\prod_i \Qc{z_i^{/k_i}}{x, z_i^{k_i}, z_\qa{i}}$ \begin{align}
  \label{eq:intQz/k}
  1 &= \int dz^{/\k} \prod_i \Qc{z_i^{/k_i}}{x, z_i^{k_i}, z_\qa{i}},
\end{align}
Multiplying Eq.~\eqref{eq:dpost_int} by $1$ (Eq.~\ref{eq:intQz/k}),
\begin{align}
  \Dpost &= \int dz^\k \b{\prod_i \Qc{z_i^{k_i}}{x, z_\qa{i}}} \frac{\P{z^\k| x}}{\prod_i \Qc{z_i^{k_i}}{x, z_\qa{i}}}  \Delta(z^\k) \int dz^{/\k} \prod_i \Qc{z_i^{/k_i}}{x, z_i^{k_i}, z_\qa{i}}.
\end{align}
Combining the integrals over $z^{\k}$ and $z^{/\k}$ into a single integral over $z$, %and noting that

% \begin{align}
% \prod_i \Qc{z_i^{k_i}}{x, z_\qa{i}} \Qc{z_i^{/k_i}}{x, z_i^{k_i}, z_\qa{i}} = \Qc{z}{x},
% \end{align} %and using Eq.~\eqref{eq:Qz_QkQ/k} to combine $\Qc{z^\k}{\cdo{z^{/\k}}, x}$ and $\Qc{z^{/\k}}{\cdo{z^{\k}}, x}$ into $\Qc{z}{x}$,
\begin{align}
  \Dpost &= \int dz \Qc{z}{x} \frac{\P{z^\k| x}}{\prod_i \Qc{z_i^{k_i}}{x, z_\qa{i}}} \Delta(z^\k).
\end{align}
Writing the integral as an expectation,
\begin{align}
  \Dpost &= \E[{\Q[\phi]{z|x}}]{\frac{\P{z^\k| x}}{\prod_i \Q{z_i^{k_i}| x, z_{\qa{i}}}} \Delta(z^\k)}.
\end{align}
Again, the posterior can be written,
\begin{align}
  \P[\theta]{z^\k| x} &= \tfrac{\P[\theta]{z^\k, x}}{\P[\theta]{x}}  & \P[\theta]{x} = \int dz^\k \P[\theta]{z^\k, x}.
\end{align}
Again, the marginal likelihood, $\P[\theta]{x}$, is intractable.
Instead, we use the massively parallel estimate of the marginal likelihood, which was shown to be unbiased in Sec.~\ref{app:iwae_tmc},
\begin{align}
  \Dmp &= \E[{\Q[\phi]{z|x}}]{\frac{\frac{\P{z^\k, x}}{\prod_i \Q{z_i^{k_i}| x, z_{\qa{i}}}}}{\Pmp(z)} \Delta(z^\k)}.
\end{align}
Remembering the definition of $r_\k(z)$ (Eq.~\ref{eq:rtmc}), this can be written,
\begin{align}
  \Dmp &= \E[{\Q[\phi]{z|x}}]{\frac{r_\k(z)}{\Pmp(z)}\Delta(z^\k)}.
\end{align}
Finally, note that the expectation is the same for every value of $\k$.
Averaging over all $K^n$ values of $\k$, we get the form in the main text (Eq.~\ref{eq:rws_tmc:iw}).
%We therefore get the same thing if we average over all $\k$,
%\begin{align}
%  \Dmp &= \E[{\Q[\phi]{z|x}}]{\frac{1}{K^n} \sum_\k \frac{r_\k(z)}{\Pmp(z)}\Delta(z^\k)}.
%\end{align}
%which matches the updates in the main text (Eq.~\ref{eq:rws_tmc:iw}).

%\section{Computing posterior samples by reweighting}
%Consider computing a posterior expectation,
%\begin{align}
%  m_\post &= \E[{\P[\theta]{z'| x}}]{m(z')}.\\
%  \intertext{As that is intractable, we often use importance reweighting (which was shown to be correct in Appendix~\ref{app:rws_tmc})}
%  m_\tmc &= \E[{\Q[\phi]{z| x}}]{\sum_{\k\in\mathcal{K}^n} \frac{r_\k(z)}{\sum_{\k'\in\mathcal{K}^n} r_{\k'}(z)} m(z^\k)}.
%\end{align}
%We can approximate this expectation by sampling indicies, $\k$, with a probability given by,
%\begin{align}
%  \P{\k} &= \frac{r_\k(z)}{\sum_{\k'\in\mathcal{K}^n} r_{\k'}(z)} \propto r_\k(z)
%\end{align}
%
%\section{Plates}
%In principle, we could implement plates with a separate latent variable for each element.
%However, this turns out to be extremely slow.
%We therefore exploit some of the structure implied by plates to speed up the computations.
%In particular, we consider variables of the form, $z_{i,\pl{i}}^{k_i}$.
%Now, $i$ indexes a ``type'' of variable, and $\pl{i}$ represents the plate indices relevant for this variable, and $k_i$ represents the copy.
%Now, we can write,
%\begin{align}
%  \Pmp(z) &= \sum_{k^0} \sum_{k^1_1,\dotsc,k^1_{P_1}} \sum_{k^2_1,\dotsc,k^2_{P_2}}
%  f^0_{k_0}(z) \prod_{p_1} f^{1}_{p_1, k^1_{p_1}}(z) \prod_{p_2} f^{2}_{k_2, k_1, k_0 p_1, p_2}(z).
%  \intertext{putting the sums inside the products,}
%  \label{eq:sums_inside_prods}
%  \Pmp(z) &=
%  \sum_{k_0} f^{0}_{k_0}(z)
%  \prod_{p_1} \sum_{k^1_{p_1}} f^{1}_{k^1_{p_1}, k_0, p_1}(z)
%  \prod_{p_2} \sum_{k^2_{p_1}} f^{2}_{k^2_{p_2}, k^1_{p_1}, p_1, p_2}(z).
%  \intertext{As there is only one $k^1_{p_1}$ inside the sums, we can relabel the summation index as just $k_1$ (and analogously for $k_2$),}
%  \Pmp(z) &=
%  \sum_{k_0} f^{0}_{k_0}(z)
%  \prod_{p_1} \sum_{k_1} f^{1}_{k_1, k_0, p_1}(z)
%  \prod_{p_2} \sum_{k_2} f^{2}_{k_2, k_1, p_1, p_2}(z).
%  %f^{i;\pl{i}}_{k_i, \k_{\pa{i}}}(z) &= \frac{\P[\theta]{z^{k_i}_i| z_{\pa{i}}^{\k_\pa{i}}}}{\Q{z_i^{k_i}| x, z_\qa{i}}}.
%  %\Pmp(z) &= \frac{1}{K^n} \sum_{\k\in\mathcal{K}^n} \prod_{\apl} f^{i;\pl{i}}_{k_i, \k_{\pa{i}}}(z)
%\end{align}
%%The basic algorithm for a plate is to do an einsum over all $k$'s associated with latent variables at that plate.
%%This eliminates all $k$'s for those dimensions.
%%Then we do a product over the plate dimension.
%%
%%What about plates that only appear at the output?
%%\begin{align}
%%  \Pmp(z) &= \sum_{k^0} \sum_{k^1_1,\dotsc,k^1_{P_1}} \sum_{k^2_1,\dotsc,k^2_{P_2}}
%%  f^0_{k_0}(z) \prod_{p_1} f^{1}_{p_1, k^1_{p_1}}(z) \prod_{p_2} f^{2}_{k^2_{p_2}, k^1_{p_1}, k_0 p_1, p_2}(z)
%%  \intertext{The standard thing is,}
%%  \Pmp(z) &= \sum_{k^0}
%%  f^0_{k_0}(z) \prod_{p_1} \sum_{k^1_{p_1}} f^{1}_{p_1, k^1_{p_1}}(z) \prod_{p_2} \sum_{k_2} f^{2}_{k^2_{p_2}, k^1_{p_1}, k_0 p_1, p_2}(z)
%%  \intertext{Then relabel,}
%%  \Pmp(z) &= \sum_{k^0}
%%  f^0_{k_0}(z) \prod_{p_1} \sum_{k_1} f^{1}_{p_1, k_1}(z) \prod_{p_2} \sum_{k_2} f^{2}_{k_2, k_1, k_0 p_1, p_2}(z)
%%\end{align}
%%The question is can we somehow push the sum over $k_0$ and the factor $f^0_{k_0}(z)$ further in?
%%The answer is no!
%%The reason is that we can only push the sum inside the products if it is a sum over a different $k$ for each plate. e.g. in Eq.~\eqref{eq:sums_inside_prods}.
%%To simplify,
%%\begin{align}
%%  \Pmp(z) &= \sum_{k^0} \sum_{k^1_1,\dotsc,k^1_{P_1}}
%%  f^0_{k_0}(z) \prod_{p_1} f^{1}_{p_1, k_0, k^1_{p_1}}(z)
%%  \intertext{We swap the sum and product,}
%%  \Pmp(z) &= \sum_{k^0}
%%  f^0_{k_0}(z) \prod_{p_1}\sum_{k^1_{p_1}} f^{1}_{p_1, k_0, k^1_{p_1}}(z)
%%  \intertext{Then relabel,}
%%  \Pmp(z) &= \sum_{k^0}
%%  f^0_{k_0}(z) \prod_{p_1}\sum_{k_1} f^{1}_{p_1, k_0, k_1}(z)
%%\end{align}
%
%%\begin{align}
%%  \underbrace{\sum_{k_0} f^{0}_{k_0}(z)}_\text{top layer; no plate}
%%  \underbrace{\prod_{p_1} \sum_{k_1} f^{1}_{k_1, p_1}(z)}_\text{plate 1; has indices $k_0$}
%%  \underbrace{\prod_{p_2} \sum_{k_2} f^{2}_{k_2, k_1, k_0 p_1, p_2}(z)}_\text{plate 2; has indices $k_1, p_1$}
%%  \Pmp(z) &=
%%  \underbrace{\sum_{k_0} f^{0}_{k_0}(z)}_\text{top layer; no plate}
%%  \underbrace{\prod_{p_1} \sum_{k_1} f^{1}_{k_1, p_1}(z)}_\text{plate 1; has indices $k_0$}
%%  \underbrace{\prod_{p_2} \sum_{k_2} f^{2}_{k_2, k_1, k_0 p_1, p_2}(z)}_\text{plate 2; has indices $k_1, p_1$}
%%\end{align}
%
%
%\section{Importance weighting of an arbitrary quantity,}
%Ultimately, our goal is to compute a posterior expectation,
%\begin{align}
%  m_\post &= \E[{\P[\theta]{z'| x}}]{m(z')}.\\
%  \intertext{As that is intractable, we often use importance sampling (which was shown to be correct in Appendix~\ref{app:rws_tmc})}
%  m_\tmc &= \E[{\Q[\phi]{z| x}}]{\sum_{\k\in\mathcal{K}^n} \frac{r_\k(z)}{\sum_{\k'\in\mathcal{K}^n} r_{\k'}(z)} m(z^\k)}\\
%  \intertext{Rearranging,}
%  &= \E[{\Q[\phi]{z| x}}]{\frac{\tfrac{1}{K^n} \sum_{\k\in\mathcal{K}^n} r_\k(z) m(z^\k)}{\tfrac{1}{K^n} \sum_{\k'\in\mathcal{K}^n} r_{\k'}(z)}}\\
%  \intertext{We identify the denominator as $\Pmp(z)$,}
%  \label{eq:moments_plain}
%  &= \E[{\Q[\phi]{z| x}}]{\frac{\frac{1}{K^n}\sum_{\k\in\mathcal{K}^n} r_\k(z) m(z^\k)}{\Pmp(z)}}
%\end{align}
%And we identify the numerator as a $\Pmp(z)$ like term that can also be computed.
%
%If $m$ really is a scalar, this is straightforwardly okay, as the structure of the computation is exactly the same as that for $\Pmp(z)$.
%It's also fine if $m(z)$ is e.g.\ a vector with multiple moments of a single latent variable (e.g.\ $\E{\b{z_i, z_i^2} | x}$): we just have an extra dimension that doesn't get summed out.
%
%The problem with this is that it scales linearly in the number of moments we want to collect, and this becomes prohibitive when we want to compute e.g.\ the means of all variables in a plate.
%To do this, we introduce a new plate dimension, $p_1'$, so that the mean for each element of the plate is treated separately, and isn't summed over,
%\begin{align}
%  m_{p_1'} &=
%  \sum_{k_0} f^{0}_{k_0}(z)
%  \prod_{p_1} \sum_{k_1} m_{p_1', k_1}(z)f^{1}_{k_1, k_0, p_1}(z)
%  \prod_{p_2} \sum_{k_2} f^{2}_{k_2, k_1, p_1, p_2}(z)
%\end{align}
%However, there are two problems with this setup.
%First, the required computation is number of moments $\times$ compute $\Pmp(z)$.
%Thus, if we want a lot of moments (e.g.\ all the means of a lower-layer plate), that is potentially prohibitive (formally, computations become quadratic in the plate dimensions).
%Second, the computations in $\Pmp(z)$ involve probability tensors.  These are usually very large/small, so we work in the log-domain to ensure numerical stability (along with almost all serious probabilistic computations).  However, as $m(z^\k)$ can be positive or negative, we cannot easily work in the log-domain when computing the numerator for $m_\tmc$.



%\section{TMC vs massively parallel approximate posteriors}
%\label{app:ap}
%Our massively parallel approximate posteriors draw $K$ independent samples independently from the full joint approximate posterior Eq.~\eqref{eq:Qglobal}.
%In these approximate posteriors, the distribution over $z_i^k$ depends only on the $k$th particle of the parents, $z_{\pa{i}}^k$,
%\begin{align}
%  \Q{z| x} &= \prod_{i=1}^n \prod_{\kappa\in\mathcal{K}} \Q{z_i^{\kappa}|x, z_{\qa{i}}^\kappa}.
%\end{align}
%(Appendix~\ref{app:ap} uses a slightly more general form for the approximate posterior, as the more general form can be used without changing the derivations, but in practice we use this more specific form).
%In contrast, the TMC approximate posterior has a much more complex form,
%\begin{align}
%  \label{eq:Qold}
%  \Qtmc{z_i^{\kappa}|x, z_{\qa{i}}} &= \frac{1}{K^{\abs{\qa{i}}}} \sum_{\k_{\qa{i}} \in \mathcal{K}^{\abs{\qa{i}}}} \Q{z_i^{\kappa}|x, z_{\qa{i}}^{\k_\qa{i}}}
%\end{align}
%resembling an unweighted resampling step in a particle filter.
%However, our approach of using $K$ copies of the full joint space (Eq.~\ref{eq:Qglobal}), has four considerable advantages.
%
%\subsection{Simplicity}
%
%First, mixture proposals add considerable complexity both to sampling and computing the log-probability.
%When sampling, drawing $K$ copies from the full joint space (Eq.~\ref{eq:Qglobal}), is much simpler, easier to implement and more efficient than using these mixture proposals (Eq.~\ref{eq:Qold}).
%Further, when computing the log-probability, summing over all combinations of the parent particles adds considerable additional complexity.
%
%\subsection{Computational complexity}
%
%Second, this form for the approximate posterior can give worse computational complexity.
%The number of parents under the approximate posterior is denoted $\abs{\qa{i}}$.
%For TMC approximate posteriors, we need to compute the approximate posterior probability marginalising over all $K^{\abs{\qa{i}}}$ combinations of parent samples.
%Further, we need to compute the approximate posterior probability of all $K$ samples of $z_i$.
%Overall, this gives a computational cost of $\mathcal{O}(K^{1+\abs{\qa{i}}})$, compared to $\mathcal{O}(K)$ for massively parallel methods.
%For simple approximate posteriors, we expect the computational cost of computing $\Pmp(z)$, which is $\mathcal{O}(K^{\text{treewidth}})$ where treewidth is the treewidth of the generative model to dominate.
%However, there are settings where the cost of computing the TMC approximate posterior probabilities will dominate.
%A nice example is given by a generative model for the latents that factorises as:
%\begin{align}
%  \P{z_1',z_2',z_3'} &= \P{z_1'} \P{z_2'|z_1'} \P{z_3'|z_1'}\\
%\end{align}
%The corresponding graphical model is:
%\begin{align}
%  z_2' \leftarrow z_1' \rightarrow z_3'
%\end{align}
%The computational cost of computing $\Pmp(z)$ is $\mathcal{O}(K^2)$, as the treewidth of this model is $2$.
%One natural approximate posterior which goes ``backwards'' factorises as,
%\begin{align}
%  \Q{z_1',z_2',z_3'} &= \Q{z_3'} \Q{z_2'} \Q{z_1'| z_2', z_3'}.
%\end{align}
%The corresponding graphical model is the same as that for the generative model, except the arrows are reversed,
%\begin{align}
%  z_2' \rightarrow z_1' \leftarrow z_3'
%\end{align}
%Critically, the computational cost of computing the TMC approximate posterior terms associated with $\Q{z_1'| z_2', z_3'}$ is $\mathcal{O}(K^3)$, as we need to compute $\Q{z_1^{k_1}| z_2^{k_2}, z_3^{k_3}}$ for all $K^3$ possible settings of $(k_1, k_2, k_3)$.
%
%\subsection{Particle degeneracy}
%
%Third, the uniform mixture is analogous to resampling with a uniform mixture in a particle filter, and can therefore exhibit similar problems.
%In a particle filter, we propagate the current particles through a transition model, then ``resampling'', i.e.\ sampling each future particle from a distribution over the previous particles that emphasises particles with a high likelihood.
%A sample from the latents conditioned on the data is obtained by taking one of the current particles, and tracing its parents.
%The problem is that the resampling process implies that all the current particles may have the same parent.
%This is known as particle degeneracy and dramatically reduces diversity in samples from a particle filter \citedeg{}.
%Our problem is that the uniform mixture in Eq.~\ref{eq:Qold} mirrors the resampling step in particle filters (albeit without weighting based on a likelihood), and can therefore also exhibit particle degeneracy.
%In contrast, the massively parallel proposal (Eq.~\ref{eq:Qglobal}) cannot display particle degeneracy, as it simply draws K samples IID from the approximate posterior.
%
%\subsection{Breaking approximate posterior dependencies}
%\label{app:tmc_ap_marg}
%
%Fourth, the marginals for the mixture proposal and the corresponding single-sample proposal do not match.
%In this section, we give an example where the marginal distribution over a particular latent variable depends on the number of samples, $K$, with big differences between $K=1$ (corresponding to the single sample setting), and practically usable values for $K$, such as $K=10$.
%These changes occur because uniform mixture of proposals tends to break dependencies between latent variables, artificially reducing dependencies between random variables under the approximate posterior (see Appendix~\ref{app:tmc_ap_marg}).
%This behaviour is very undesirable.
%In particular, we would like the user to be able to specify an approximate posterior, $\Q{z'|x}$ for a single copy of the joint space, without having to think about the $K$ copies.
%However, this may not be possible if the approximate posterior depends on $K$.
%
%Specifically, consider a model with three latent variables, $z_1'$, $z_2'$, $z_3'$.
%All these variables are binary (i.e.\ $z_i' \in \{0, 1\}$), and the overall model has dependencies,
%\begin{align}
%  \Q{z_1', z_2', z_3'} &= \Q{z_1'} \Q{z_2'| z_1'} \Q{z_3'| z_1', z_2'}
%\end{align}
%We first sample $z_1'$ from a Bernoulli with probability $\tfrac{1}{2}$ (e.g.\ by tossing a coin,)
%\begin{align}
%  \Q{z_1'} &= \text{Bernoulli}\b{z_1'; \tfrac{1}{2}} & \Q{z_1'=1} &= \tfrac{1}{2}
%  \intertext{Then, $z_2' = z_1'$ (or written as a distribution),}
%  \label{eq:counter2|1}
%  \Q{z_2'| z_1'} &= \text{Bernoulli}\b{z_2'; z_1'} &
%  \Q{z_2'=1| z_1'} &= z_1'
%  \intertext{Then, $z_3'=\delta_{z_1', z_2'}$, meaning that $z_3'=1$ if $z_1' = z_2'$, and $z_3'=0$ otherwise.
%  We can write the resulting probability as,}
%  \Q{z_3'| z_1', z_2'} &= \text{Bernoulli}\b{z_3'; \delta_{z_1', z_2'}} & \Q{z_3'=1| z_1',  z_2'} &= \delta_{z_1', z_2'}.
%\end{align}
%That fully defines the approximate posterior.
%The distinction between the single-sample approximate posterior and mixture proposal approximate posterior, $\bareQ_\text{TMC}$ emerges if we consider the marginal of $z_3'$.
%In particular, as we always have $z_1' = z_2'$ (Eq.~\ref{eq:counter2|1}), we always have $z_3'=\delta_{z_1',z_2'} = 1$, so
%\begin{align}
%  \label{eq:counter:z3'}
%  \Q{z_3'=1} = 1.
%\end{align}
%This marginal emerges because of the tight coupling between $z_1'$ and $z_2'$ in the approximate posterior.
%However, the TMC approximate posterior in effect breaks dependencies between $z_1'$ and $z_2'$, so we no longer always have $z_1'=z_2'$, and hence $z_3'$ is no longer always $1$.
%
%More fully, the TMC approximate posterior specifies the same distribution over a single particle of $z_1$,
%\begin{align}
%  \Qtmc{z_1^{k_1}} &= \text{Bernoulli}\b{z_1^{k_1}; \tfrac{1}{2}} &
%  \Qtmc{z_1^{k_1}=1} &= \tfrac{1}{2}
%\end{align}
%We will be able to summarise $z_1$ and $z_2$ in terms of the number that are $1$,
%\begin{align}
%  n_i &= \sum_{k=1}^K z_i^{k}.\\
%  \intertext{The number of $z_1^k$, that are on, $n_1$, has a Binomial distribution,}
%  \label{eq:counter:n1}
%  \Qtmc{n_1} &= \text{Binomial}\b{n_1; K, \tfrac{1}{2}}.
%\end{align}
%Then the distribution over $z_2^{k_2}$ is given by a mixture over all particles of $z_1$,
%\begin{align}
%  \Qtmc{z_2^{k_2}=1| z_1} &= \tfrac{1}{K} \sum_{k_1=1}^K \Qtmc{z_2^{k_2}=1| z_1^{k_1}}
%  = \tfrac{1}{K} \sum_{k_1=1}^K z_1^{k_1}
%  = \tfrac{n_1}{K}.
%\end{align}
%Thus, the distribution over $z_2^{k_2}$ is Bernoulli with probability $\tfrac{n_1}{K}$.  Importantly, this probability depends only on $n_1$.  We can therefore write the distribution as,
%\begin{align}
%  \Qtmc{z_2^{k_2}=1| n_1} &= \tfrac{n_1}{K} &
%  \Qtmc{z_2^{k_2}| n_1} &= \text{Bernoulli}\b{z_2^{k_2}; \tfrac{n_1}{K}}
%\end{align}
%And the distribution over the number of $z_2$'s that are on is again Binomial,
%\begin{align}
%  \label{eq:counter:n2}
%  \Qtmc{n_2| n_1} &= \text{Binomial}\b{n_2; K, \tfrac{n_1}{K}}.
%\end{align}
%Finally, we have,
%\begin{align}
%  \Qtmc{z_3^{k_3}=1| z_1,  z_2} &= \tfrac{1}{K^2} \sum_{k_1\in\mathcal{K}} \sum_{k_2\in\mathcal{K}} \delta_{z_1^{k_1}, z_2^{k_2}}.
%  \intertext{This distribution can again be rewritten in terms of $n_1$ and $n_2$,}
%  \label{eq:counter:z3}
%  \Qtmc{z_3^{k_3}=1| n_1,  n_2} &= \tfrac{n_1}{K} \tfrac{n_2}{K} + \b{1-\tfrac{n_1}{K}} \b{1 -\tfrac{n_2}{K}}.
%\end{align}
%We are interested in the marginal distribution of $z_3^{k_3}$.  As this random variable is binary, its probability is equal to its expectation,
%\begin{align}
%  \Qtmc{z_3^{k_3}=1} &= \E{z_3^{k_3}}.\\
%  \intertext{Now, we use the law of total expectation, where the inner expectation is under $\Q{n_2| n_1}$, and the outer expectation is under $\Q{n_1}$,}
%  \Qtmc{z_3^{k_3}=1} &= \E{\E{z_3^{k_3}|n_1}}.\\
%  \intertext{Substituting $\E{z_3^{k_3}| n_1}$ from Eq.~\eqref{eq:counter:z3},}
%  \Qtmc{z_3^{k_3}=1} &= \E{\tfrac{n_1}{K} \tfrac{\E{n_2|n_1}}{K} + \b{1-\tfrac{n_1}{K}}\b{1 - \tfrac{\E{n_2|n_1}}{K}}}\\
%  \intertext{From Eq.~\eqref{eq:counter:n2}, $\E{n_2| n_1} = n_1$,}
%  \Qtmc{z_3^{k_3}=1} &= \E{\b{\tfrac{n_1}{K}}^2 + \b{1-\tfrac{n_1}{K}}\b{1 - \tfrac{n_1}{K}}}\\
%  \intertext{Separating out the moments of $n_1$,}
%  \Qtmc{z_3^{k_3}=1} &= \E{1 - 2 \tfrac{n_1}{K} + 2 \b{\tfrac{n_1}{K}}^2}\\
%  \intertext{And applying the expectation separately to each moment,}
%  \Qtmc{z_3^{k_3}=1} &= 1 - 2 \frac{\E{n_1}}{K} + 2 \frac{\E{n_1}^2 + \Var{n_1}}{K^2}
%  \intertext{The usual binomial expectation and variance for $n_1$ (based on Eq.~\ref{eq:counter:n1}) are,}
%  \E{n_1} &= \tfrac{K}{2} & \Var{n_1} &= \tfrac{K}{4}.
%  \intertext{Substituting these, we get,}
%  \Q{z_3^{k_3}=1} &= 1 - 1 + 2 \frac{\b{\tfrac{K}{2}}^2 + \tfrac{K}{4}}{K^2} = \tfrac{1}{2} \b{1 + \tfrac{1}{K}}
%\end{align}
%For large $K$, this tends to $\tfrac{1}{2}$, which is very different from the single-sample marginal (Eq.~\ref{eq:counter:z3'}),
%\begin{align}
%  \lim_{K\rightarrow \infty} \Q{z_3^{k_3}=1} &= \tfrac{1}{2}.
%\end{align}
%While for $K=1$, we have,
%\begin{align}
%  \Q{z_3^{k_3}=1} &= 1,
%\end{align}
%which is expected, because $K=1$ corresponds to the single-sample setting (Eq.~\ref{eq:counter:z3'}).


\subsection{MovieLens Graphical Model}
\label{movielens}
\begin{figure*}[!htb]
\begin{center}
  \tikz{
    % nodes
     \node[obs] (rating) {$\mathrm{Rating}_{mn}$};%
     \node[latent,above=of rating] (peruser) {$\mathrm{Per \ user \ mean}_m$}; %
     \node[latent, above=of peruser, xshift=-1cm] (psi) {$\psi$};
    \node[latent, above=of peruser, xshift=1cm] (mu) {$\mu$};
    % plate
     \plate [inner sep=.3cm] {plate2} {(rating)} {$\mathrm{N}$ Films}; %
     \plate [inner sep=.3cm] {plate1} {(peruser)(plate2)} {$\mathrm{M}$ Users}; %
    % edges
     \edge {psi,mu} {peruser}
     \edge {peruser} {rating}}
\caption{Graphical model for the MovieLens dataset}
\label{fig:movielens_gm}
\end{center}
\end{figure*}


% \subsection{Radon Model Specification}
% \label{radon}
% \begin{equation}
% \label{eq:radon}
% \begin{split}
% \mathrm{StateVariance} &\sim \mathcal{U}(0,100) \\
% \mathrm{StateMean} &\sim \Normal(0,10^{-4}) \\
% \mathrm{CountyMean_m} &\sim \Normal(\mathrm{StateMean_m},\mathrm{StateVariance_m}), \ \mathrm{m}=1,...,\mathrm{M} \\
% \mathrm{CountyVariance_m} &\sim \mathcal{U}(0,100), \mathrm{m}=1,...,\mathrm{M} \\
% \mathrm{w_j} &\sim \mathcal{U}(0,100), \ \mathrm{j}=1,...,\mathrm{J} \\
% \mathrm{ZipMean_{mj}} &\sim \Normal(\mathrm{CountyMean_m} + \mathrm{w_j} * \mathrm{Uranium}_\mathrm{mj},\mathrm{CountyVariance_m}), \ \mathrm{j}=1,...,\mathrm{J} \\
% \mathrm{ZipVariance_j} &\sim \mathcal{U}(0,100), \ \mathrm{j}=1,...,\mathrm{J} \\
% \mathrm{ReadingMean_{mji}} &\sim \Normal(\mathrm{ZipMean_{mj}},\mathrm{ZipVariance_ji}), \ \mathrm{i}=1,...,\mathrm{I} \\
% \mathrm{ReadingVariance_i} &\sim \mathcal{U}(0,100), \ \mathrm{i}=1,...,\mathrm{I} \\
% \mathrm{b_n} &\sim \Normal(0,1), \ \mathrm{n}=1,...,\mathrm{N} \\
% \mathrm{Reading_{mjin}} &\sim \Normal(\mathrm{ReadingMean_{mji}} + b_n * \mathrm{Basement_{mji}},\mathrm{ReadingVariance_i}), \ \mathrm{n}=1,...,\mathrm{N} \\
% \end{split}
% \end{equation}

% Where $\mathrm{Uranium}_\mathrm{mj}$ is the average Uranium concentration in state $\mathrm{m}$ and county $\mathrm{j}$ and $\mathrm{Basement}_\mathrm{mjin}$ is an indicator variable if reading $\mathrm{n}$ in zipcode $\mathrm{j}$ in county $\mathrm{i}$ in state $\mathrm{m}$ is taken in a basement. Here $\beta \in \mathbb{R}^M$, $\alpha \in \mathbb{R}^{(M,J)}$, $\omega \in \mathbb{R}^{(M,J, I)}$ and $\mathbf{y} \in \mathbb{R}^{(M,J,I,N)}$. A corresponding graphical model can be seen in \ref{fig:radon_gm}.

% \clearpage

% \subsection{Radon Graphical Model}

% \begin{figure*}[!htb]
% \begin{center}
%   \tikz{
%     % nodes
%     %N
%     \node[obs] (reading) {$\mathrm{Reading_{mjin}}$};%

%     \node[latent, left=of reading, xshift=-3.1cm, yshift=-2cm] (b) {$\mathrm{b_n}$};
%     %I
%     \node[latent,above=of reading] (readingmean) {$\mathrm{ReadingMean_{mji}}$}; %
%     \node[latent, right=of readingmean,xshift=3.5cm] (readingvariance) {$\mathrm{ReadingVariance_i}$};
%     %J
%     \node[latent,above=of readingmean] (zipmean) {$\mathrm{ZipMean_{mj}}$}; %

%     \node[latent, left=of zipmean, xshift=-0.5cm] (w) {$\mathrm{w_j}$};
%     \node[latent, below=of w,xshift=-.5cm] (zipvariance) {$\mathrm{ZipVariance_j}$};
%     %M
%     \node[latent,above=of zipmean] (countymean) {$\mathrm{CountyMean_{m}}$}; %
%     \node[latent, right=of countymean, xshift=0cm] (countyvariance) {$\mathrm{CountyVariance_m}$};
%     \node[latent, above=of countymean, xshift=2cm] (statevariance) {$\mathrm{StateVariance}$};
%     \node[latent, above=of countymean, xshift=-2cm] (statemean) {$\mathrm{StateMean}$};
%     % plate
%      \plate [inner sep=.6cm] {platereading} {(reading)(b)} {$\mathrm{N}$ Readings}; %
%      \plate [inner sep=.6cm] {platezips} {(readingmean)(readingvariance)(reading)} {$\mathrm{I}$ Zipcodes};
%      \plate [inner sep=.6cm] {platecounty} {(zipmean)(zipvariance)(w)(readingmean)(reading)} {$\mathrm{J}$ Counties};
%      \plate [inner sep=.6cm] {platestates} {(countymean)(countyvariance)(zipmean)(readingmean)(reading)} {$\mathrm{M}$ States};
%     %edges
%      \edge {statemean,statevariance} {countymean}
%      \edge {w,countymean,countyvariance} {zipmean}
%      \edge {zipvariance,zipmean} {readingmean}
%      \edge {b,readingvariance,readingmean} {reading}
%      }
% \caption{Graphical model for the Radon dataset}
% \label{fig:radon_gm}
% \end{center}
% \end{figure*}

\subsection{Bus Delay Model Specification}
\label{bus}

\begin{equation}
\label{eq:bus model}
\begin{split}
\mathrm{YearVariance} &\sim \mathrm{Cat}([0.1,0.5,0.4,0.05,0.05]) \\
\mathrm{YearMean} &\sim \Normal(0,10^{-4})  \\
\mathrm{BoroughMean}_m &\sim \Normal(\mathrm{YearMean},\exp(\mathrm{YearVariance})), \ m=1,...,\mathrm{M} \\
\mathrm{BoroughVariance}_j &\sim \mathrm{Cat}([0.1,0.4,0.05,0.5,0.05]), j=1,...,\mathrm{J} \\
\mathrm{IdMean}_{mj} &\sim \Normal(\mathrm{BoroughMean}_m,\mathrm{BoroughVariance}_j), \ j=1,...,\mathrm{J}, \ m=1,...,\mathrm{M} \\
\mathrm{WeightVariance}_i &\sim \mathrm{Cat}([0.1,0.4,0.5,0.05,0.05]), \ i=1,...,\mathrm{I} \\
\mathbf{C}_i &\sim \mathcal{N}(\mathbf{0}_{\#\mathrm{BusCo.s}}, \mathrm{WeightVariance}_i), \ i=1,...,\mathrm{I} \\
\mathbf{J}_i &\sim \mathcal{N}(\mathbf{0}_{\#\mathrm{JourneyTypes}}, \mathrm{WeightVariance}_i), \ i=1,...,\mathrm{I} \\
\mathrm{logits}_{mji} &= \mathrm{IdMean}_{mj} + \mathbf{C}_i * \mathrm{Bus \ company \ name}_{mji} + \mathbf{J}_i * \mathrm{Journey \ type}_{mji} \\
\mathrm{Delay}_{mji} &\sim \mathrm{NegativeBinomial}(\mathrm{total \ count}=130, \mathrm{logits}_{mji}), \ i=1,...,\mathrm{I}, \ j=1,...,\mathrm{J}, \ m=1,...,\mathrm{M}  \\
\end{split}
\end{equation}

Where $\mathrm{Bus \ company \ name}_{mji}$ is a one-hot encoded indicator variable indicating which bus company was running that route, and $\mathrm{Journey \ type}_{mji}$ similarly indicates which kind of bus journey was being undertaken. A $\mathrm{total \ count}$ of 130 is chosen as this is the largest recorded delay in the dataset.


\subsection{Bus Breakdown Graphical Model}
\label{bus_gm}
\begin{figure}[!htb]
\begin{center}
\resizebox{0.65\textwidth}{!}{%
  \begin{tikzpicture}
    % nodes

    %I
    \node[obs] (delay) {$\mathrm{Delay}_{mji}$};%

    \node[latent, left=of delay, xshift=-4cm, yshift=2cm]  (weightvariance) {$\mathrm{WeightVariance}_i$};
    \node[latent, below=of weightvariance,yshift=-1cm, xshift=-0.5cm] (companyweight) {$\mathbf{C}_i$};
    \node[latent, below=of weightvariance, xshift=0.5cm]  (journeyweight) {$\mathbf{J}_i$};

    %J
    \node[latent,above=of delay] (idmean) {$\mathrm{IdMean}_{mj}$}; %
    \node[latent, right=of idmean, xshift=0.2cm] (boroughvariance) {$\mathrm{BoroughVariance}_j$};
    %M
    \node[latent,above=of idmean, yshift=0.5cm] (boroughmean) {$\mathrm{BoroughMean}_m$}; %
    \node[latent, left=of boroughmean] (yearmean) {$\mathrm{YearMean}$};
    \node[latent, above=of yearmean, yshift=-0.5cm] (yearvariance) {$\mathrm{YearVariance}$};
    % plate
     \plate [inner sep=.6cm] {platedelay} {(delay)(companyweight)(journeyweight)(weightvariance)} {$\mathrm{I}$ Ids}; %
     \plate [inner sep=.6cm] {plateboroughs} {(idmean)(boroughvariance)(delay)} {$\mathrm{J}$ Boroughs};
     \plate [inner sep=.6cm] {plateyears} {(boroughmean)(idmean)(delay)} {$\mathrm{M}$ Years};
    %edges
     \edge {yearmean,yearvariance} {boroughmean}
     \edge {boroughmean,boroughvariance} {idmean}
     \edge {idmean} {delay}
     \edge {weightvariance} {journeyweight,companyweight}
     \edge {journeyweight,companyweight} {delay}
     \end{tikzpicture} }
\caption{Graphical model for the bus breakdown dataset}
\label{fig:bus_gm}
\end{center}
\end{figure}

\end{document}
