%\documentclass{uai2023} % for initial submission
%\externaldocument[]{main}


\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
                                    
\usepackage{tikz,graphics,color,fullpage,float,epsf,caption,subcaption}
\usepackage{placeins}

\newcommand{\KL}{\text{KL}}

\usepackage[round]{natbib}
\renewcommand{\bibname}{References}
\renewcommand{\bibsection}{\subsubsection*{\bibname}}
\bibliographystyle{apalike}

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    %\bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
\externaldocument{uai2023-template}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{A Decoder Suffices for Query-Adaptive Variational Inference \\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1*]{\href{mailto:sakshia1@uci.edu}{Sakshi Agarwal}{}}
\author[1*]{Gabriel Hope}
\author[1]{Ali Younis}
\author[1]{Erik B. Sudderth}
% Add affiliations after the authors
\affil[1]{%
    Department of Computer Science\\
    University of California, Irvine
}
\affil[*]{%
    Equal contribution
}
  
  \begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle

The supplementary section is organized as follows. Section 1 focuses on deriving the optimization objective used in QAVI, and elaborates on the tricks used to estimate gradients for the parameters in Mix. QAVI. Section 2 includes some implementation details. %on pre-trained VAEs, different baselines and QAVI optimization. 
Lastly, we discuss some more experiments in section 3.

\section{QAVI: Derivations}

\subsection{Feat. QAVI}
As specified in the main paper, we define a variational distribution over unobserved features $x_M$ as $q_\lambda(x_M)$ and re-use the encoder to model $q(z|x_O, x_M)$. Hence, the ELBO can be written as : 

\begin{align}
    \mathcal{L}_M(\lambda;x_O) &= E_{q_{\lambda}(x_M,z|x_O)}[\log p_\theta(x_O,x_M,z) - \log q_{\lambda}(z,x_M|x_O)]  \nonumber\\
    &=  E_{q_{\lambda}(x_M,z|x_O)}[\log p_\theta(x_O, x_M|z) + \log p(z) - \log q_{\lambda}(x_M) - \log q_{\phi}(z|x_M,x_O)]     \nonumber \\
    &=  E_{q_{\lambda}(x_M)}[E_{q_{\phi}(z|x_O,x_M)}[\log p_\theta(x_O, x_M|z) - \log q_{\phi}(z|x_M,x_O)  + 
 \log p(z)]] - E_{q_{\lambda}(x_M)}[\log q_\lambda(x_M)]   \nonumber \\
     &=  E_{q_{\lambda}(x_M)}[E_{q_{\phi}(z|x_O,x_M)}[\log p_\theta(x_O, x_M|z)] - \KL(q_{\phi}(z|x_M,x_O)||p(z))] - E_{q_{\lambda}(x_M)}[ \log q_\lambda(x_M)]   \nonumber \\
 &=  E_{q_{\lambda}(x_M)}[E_{q_{\phi}(z|x_O,x_M)}[\log p_\theta(x_O, x_M|z)] - \KL(q_{\phi}(z|x_M,x_O)||p(z))] + H(q_\lambda (x_M))
 \label{varFeat}
\end{align}
where $H(q_\lambda (x_M))$ is the entropy of $q_\lambda (x_M)$ and $\KL$ is the Kullback-Leibler divergence. Monte-Carlo approximation of Eq.~\ref{varFeat} with $S$ samples from $q_{\lambda}(x_M,z|x_O)$ is equivalent to sampling $x_M^{(s)} \sim q_\lambda(x_M)$ and $ z^{(s)} \sim q_\phi(z|x_O,x_M^{(s)})$. We get: 
\begin{equation}
    \mathcal{L}_M(\lambda;x_O)  \approx  \frac{1}{S} \sum_{s=1}^S [ \log p_\theta(x_O,x_M^{(s)}|z^{(s)}) - \KL(q_\phi (z|x_O,x_M^{(s)})|| p(z))] + H(q_\lambda (x_M) )
\end{equation}

\subsection{Non-Amortized Inference}
Here, we define a variational distribution on the latent space variable $z$ as $q_\lambda(z)$. The ELBO can thus be derived as follows:
\begin{align}
    \mathcal{L}_{N}(\lambda; x_O) &= E_{q_{\lambda}(z, x_M|x_O)}[\log p_\theta(x_O,x_M,z) - \log q_{\lambda}(z,x_M|x_O)]  \nonumber \\
    &= E_{q_{\lambda}(z, x_M|x_O)}[\log p_\theta(x_O, x_M|z) + \log p(z) - \log q_{\lambda}(z) - \log p_{\theta}(x_M|z)] \nonumber \\
   &= E_{q_{\lambda}(z)}[\log p_\theta(x_O|z) + \log p(z) - \log q_{\lambda}(z)] \nonumber \\
   &= E_{q_{\lambda}(z)}[\log p_\theta(x_O|z)] - \KL(q_{\lambda}(z)||p(z))
\label{varlatent}
\end{align}
Approximating the above with $S$ samples, $z^{(s)} \sim q_\lambda(z)$, we get:
\begin{equation}
    \mathcal{L}_{N}(\lambda; x_O) \approx \frac{1}{S} \sum_{s=1}^S [ \log p_\theta(x_O|z^{(s)})] - \KL(q_\lambda (z)|| p(z))
\label{eq:optim-z}
\end{equation}

\subsubsection{Gradients for mixture posterior}
\textbf{Implicit Reparameterized Gradients.} \cite{graves2016stochastic_implicit_tech_report} and \cite{figurnov2018implicit} propose a method of computing pathwise gradients that relies on reparameterization where a standardization function $S_{\lambda}(z)$ is used on samples $z \sim q_\lambda(z)$ to remove the dependence of the parameters $\lambda$ from those samples: $S_{\lambda}(z) = \epsilon \sim q(\epsilon)$.  For univariate Gaussian mixture models, one such standardization function is the CDF, $F(z|\lambda)$, which transforms samples from the mixture to samples from a uniform distribution with support $[0,1]$.  In the multivariate case, the multivariate distributional transform \citep{Ruschendorf2013} can be used: 
\[
    S_{\lambda}(z) = \Big( F(z_1|\lambda), F(z_2|z_1, \lambda), ..., F(z_D|z_1, ..., z_{D-1}, \lambda)   \Big)
\]
Implicit differentiation is then performed to derive the following pathwise gradient estimator as shown in \citet{figurnov2018implicit}:
\begin{equation}
	\nabla_{\lambda}z = -\Big(\nabla_z S_{\lambda}(z)\Big)^{-1} \nabla_{\lambda}S_{\lambda}(z) 
\end{equation}

In \citet{figurnov2018implicit}, it is mentioned that this method can be used with mixture models; however, no empirical results were given.  In practice we find that the variance of the pathwise gradient estimates w.r.t. the mixture weights is prohibitively large, preventing learning of the mixture weights. To address this, we use a different estimator for computing gradients w.r.t. the mixture weights. 

\textbf{Importance Sampling Gradient Estimator.} \citet{scibior2021differentiable_importance} propose that samples drawn from $q_\lambda(z)$ may be augmented with a gradient operator that does not change the sample in the forward pass but allows for gradient computation in the backward pass. A stop-gradient operator, $\bot$, is used in this augmentation. When $\bot$ is applied to a function $f_\theta(x)=y$, the value of that function does not change in the forward pass: $\bot f_\theta(x)=f_\theta(x)=y$.  In the backward pass, $\bot f_\theta(x)$ is treated as a constant value and not a function, yielding a gradient value of $0$: $\nabla_{\theta}\bot f_{\theta}(x) = 0$. For a given sample $z \sim q_\lambda(z)$, \citet{scibior2021differentiable_importance} augment that sample using importance sampling:
\begin{equation}
	z' = \frac{q_\lambda(z)}{\bot q_\lambda(z)} \cdot z
\end{equation}
In the forward pass $z'=z$ since the numerator and denominator cancel out. In the backward pass, we compute gradients for the numerator only while treating the denominator as a constant, yielding the following estimator:
\begin{equation}
	\nabla_{\lambda}z' =  \frac{z}{\bot q_\lambda(z)} \cdot \nabla_{\lambda} q_\lambda(z)
\end{equation}
Unlike the Implicit Reparameterized Gradients estimator, we find that this method computes low variance estimates of the pathwise gradient w.r.t. the mixture weights $w_t$ while having high variance w.r.t. the mixture component means and standard deviations. 

\textbf{Hybrid Approach}: We adopt a novel, hybrid approach to pathwise gradient estimation.  We use implicit reparameterized gradients to compute gradients w.r.t. the mixture component means and standard deviations, $\mu_t$ and $\Lambda_t$.  We use the importance sampling gradient estimator to compute the gradients with respect to the mixture weights, $w_t$.


\section{Implementation Details}
\subsection{Pre-trained VAEs}
\paragraph{MNIST} We train a VAE using a WideResNet architecture \citep{Zagoruyko2016WideRN} on fully-observed real-valued MNIST. The encoder is a WideResNet with 3 downsampling levels and 2 resnet blocks within each level, with ReLU nonlinearities. The output layer has no non-linearity for the mean and a softplus nonlinearity for the standard deviation. We use a latent dimension of 50.  The decoder has a similar upsampling WideResNet. The decoder has no non-linearities at the output for the continuous-Bernoulli distribution \citep{10.5555/3454287.3455477}. We train this VAE using Adam \citep{kingma2014method} with a learning rate of $10^{-3}$ for 2000 epochs. 

\paragraph{SVHN} We have a similar architecture for SVHN with Leaky ReLU non-linearities and a discretized truncated normal output distribution \citep{salimans2017pixelcnn}, considering the range $[-1, 1]$. %For the UCI datasets, we use a 2 hidden layer neural network architecture for both the inference and generative model with ReLU non-linearities and a latent dimension of 10. 
When training the VAE for SVHN, we used a learning rate of $10^{-4}$ and chose the model with the best validation loss to encourage a more generalized generative model. We trained the model for $500$ epochs. %For FFHQ-256, .... %we use the pre-trained "very deep" VAE from \cite{childVeryDeep2021} as our generative model. 

\paragraph{Tabular datasets} We trained a VAE for each UCI dataset with: a latent dimension of 10; both encoder and decoder are multi-layer perceptrons with 3 hidden layers (128 hidden dimension each) and ReLU activations; we use an independent diagonal Normal distribution for the variational family and the observation model. We constrain output standard deviations of the decoder to be larger than $0.001$. We trained the model using the Adam optimizer with a learning rate of $2 \times 10^{-4}$ for $1000$ epochs, saving the parameters of the models with lowest loss on the validation set. Our architectures are chosen to match \cite{pmlr-v97-mattei19a}. All UCI datasets are normalized to have mean 0 and variance 1. We use $65\%$ of the data for training, $15\%$ for validation and $20\%$ as test set. We corrupt the test set by removing half of the features in each row uniformly at random. 

\paragraph{FFHQ HVAE} As discussed, for our deep-VAE experiments we adopt the architecture proposed by \cite{child2021very} for the FFHQ-256 dataset. For our quantitative experiments and baselines, we reduce the model size by reducing the internal channel width from 512 to 64 and reducing the number of layers at each resolution by half. This reduces the total number of model parameters from 115M to 5.9M. We retain the original latent dimension channel width of 16. Despite this substantial decrease in parameters, we find that performance is still good, as seen in our reported results.

\subsection{Baselines}
Below we discuss the implementation details for the different inference baselines considered in the paper. We refer to $x_O$ as $x$ that has had unobserved features set to zero and $x_O'$ as $x_O$ concatenated with a bitmask that indicates which features are observed. 

\begin{enumerate}
    \item \textbf{Fill 0s:} Passing $x_O$ through the pre-trained encoder yields a simple baseline posterior over the latent space, $q(z)$.
    \item \textbf{pseudo-Gibbs sampling:} Initially, we set $x=x_O$, 
    (i) pass $x$ through the VAE, (ii) sample unobserved features in $x$ from the decoder and (iii) repeatedly perform (i) and (ii) for $300$ iterations. In the end, the resulting posterior over the latent space $q(z)$ is the inferred posterior. 
    \item \textbf{Metropolis-within-Gibbs sampling:}  This is similar to pseudo-Gibbs sampling, instead here we accept/reject a sample according to Eq. 15 in \cite{NEURIPS2018_0609154f} and perform pseudo-Gibbs for the first $20$ iterations to avoid early rejections. After a total of $300$ iterations, we pick the lower-dimensional latent space posterior for evaluation.
    %\item \textbf{Retuned-Encoder:} For each case of missingness pattern considered in the paper (patches, half image or $50\%$ missing), the encoder (only) was re-tuned on the partially-observed test set ($x_O$) with the same ELBO used for training the base VAE (Eq. 2, with $x$ replaced by $x_O$) for a $100$ epochs. We employ the same set of hyper-parameters used for base VAE training per dataset. After training, we estimate the amortized $q(z|x_O)$ for each data point.     

    \item \textbf{Retuned-Encoder:}  We train an inference network (same architecture as base VAE encoder with inputs as $x_O'$) for each case of partially-observed test data for $100$ epochs using the same ELBO in Eq. 2 of the main paper (with $x$ replaced by $x_O$). We do not consider masking information in the latent space as done in \cite{collier2020vaes} and instead, exploit the previously trained generative model. Post training, we infer the latent space posterior $q(z)$ using this new encoder model.

    \item \textbf{Posterior Matching:} In order to train an inference network for this case, we consider the fully-observed train set $x$ and a masking distribution from \cite{zhao2021comodgan_comodgan} to generate partially observed train data points $x_O'$. We then consider an external inference network (again similar in architecture to the base (H-)VAE with inputs as $x_O'$) to approximate the partially-observed posterior in the latent space as $q_\psi(z|x_O')$. Training this network further maximizes the objective : $E_{z\sim q_\phi(z|x)} \log q_\psi(z | x_O')$ using the same hyper-parameters as training the base VAE for $1000$ epochs, and does not require the generative model. After this general training is complete, we use this inference network to estimate the latent space posteriors ($q(z)$) for each case of partially-observed test data. For both single-layer and hierarchical VAEs, we initialize the partial encoder with the weights of the original encoder, save for the first layer, which is modified to take an additional mask channel. For the HVAE variant of posterior matching, trained on the FFHQ dataset, we train for 300k steps with a batch size of 16.
    \item \textbf{VAEAC:} We compare a hierarchical VAE trained with the VAEAC objective in our experiments on FFHQ. We do not compare with VAEAC for our standard VAE experiments as the skip connections in the VAEAC architecture make architecture matching imprecise and posterior matching has been shown to outperform the corresponding VAEAC approach in this setting \citep{strauss2022posterior}. The ``ladder'' structure of our chosen HVAE architecture provides a similar ``skip'' structure to the conditional model. As with posterior matching, we initialize the partial encoder for VAEAC with the weights of the original HVAE encoder. We also initialize the full-encoder and decoder with the corresponding weights from the pre-trained model. For VAEAC, all three networks are trained end-to-end for 300k steps with a batch size of 8 images.  
\end{enumerate}

We evaluate each latent-space posterior, $q(z)$, resulting from the above inference methods to compute missing feature log-likelihoods.
 

\subsection{Feat. QAVI}
As this approach requires repeated passes of both the encoder and decoder, we evaluate only on the MNIST dataset (for both patches and top-half missingness patterns). Since MNIST is normalized to [0,1], an ideal choice of $q_\lambda(x_M)$ is a continuous Bernoulli distribution~\citep{10.5555/3454287.3455477} with parameter $\lambda \in (0,1)$. We allow for unconstrained optimization by re-parameterizing as: $\lambda = \sigma(\hat{\gamma})$, where $\sigma(\cdot)$ is the sigmoid function and $\hat{\gamma}$ is an unconstrained parameter. We initialize the parameters randomly, and then optimize the ELBO in Eq.~\ref{varFeat} with $S=100$ samples for $300$ iterations using Adam with a learning rate of $1.0$. We decrease the learning rate by a factor of 10 every 100 steps. To sample from the latent space distribution $q_\lambda(z)$ here means we first draw $k$ samples $\hat x \sim q_\lambda(x_M)$, followed by one sample $z \sim q(z|\hat x)$. %Substituting this helps evaluation using Eq. \ref{eq:IWAE}. 


\subsection{Gaus./Flow/Mix. QAVI} The initialization of the means and standard deviations of the Gaussian distributions in each variational posterior considered in the paper is crucial. The means of the Gaussian posterior, of the base Gaussian distribution for IAF, and of the individual Gaussian components in the mixture are initialized using the outputs of the amortized inference model $q_\phi(z|x_O)$. The standard deviation for each posterior is initialized to $1$ per latent dimension.  For the mixture case, we add uniform random noise on $[-1,1]$ to the means to break symmetry among the individual components, and keep the initial component weights uniform.

The variational parameters in the Gaussian Posterior are optimized using Adam with a learning rate of $1.0$, decreasing it by a factor of 2 every $30$ iterations.

We follow \cite{kingmainverseflows} and use a 2-layer MADE network \citep{pmlr-v37-germain15} with 320 dimensions (or $50$ dimension in case of UCI datasets) per layer to implement each IAF transformation. We stack 2 IAF transformations on top of a simple Gaussian distribution in the latent space. For computational reasons, we share the parameters of the two transformations across a batch (size 100) of test images. We optimize the base Gaussian distribution using Adam with a learning rate of $0.1$ and the parameters of the inverse autoregressive neural network using Adam with a learning rate of $0.01$. 

When the variational posterior is a Gaussian Mixture, we use 10 Gaussian components. Training of all parameters (means, standard deviations, weights) is done via Adam optimizer, involving 3 phases. The first phase consists of the first $50$ iterations, where we optimize all parameters with a learning rate of $0.1$ and at an interval of every $10$ iterations, attempt to re-initialize those components whose weights fall below a threshold $t=0.7$. We reset the weights for all components to be equal when this re-initialization occurs. This method of re-initializing allows us to throw away any ``bad'' initializations. The second phase consists of $150$ iterations, where only the means and standard deviation in the individual components are optimized with a learning rate of $1.0$, decreased every $30$ iterations by a factor of $2$. During the last $100$ iterations, we optimize all parameters, where the unconstrained weights of individual components are optimized with a learning rate of $0.1$, decreasing every $30$ iterations by a factor of $10$. The idea behind doing this is to let the individual components converge first, followed by adjusting weights to these individual components. 

\iffalse
  \begin{figure}[h]
%\vspace{.3in}
\begin{subfigure}[t]{\linewidth}
   \centerline{\includegraphics[width=\linewidth]{supplement_plots/pmvsQAVI.pdf}}
%\vspace{.3in}
%\caption{}
 \end{subfigure}

  \caption[]
{\small This plot shows the estimated log-likelihood values for missing pixels (IWAE) on the y-axis vs the wall-clock training time (in hrs) on the x-axis for a fair comparison with the amortized approach of Posterior-Match. The log-likelihood is estimated while optimizing posterior parameters, $\lambda$ for a batch of 100 MNIST images with missing pixels. We see that the performance of Posterior-Match ($\approx 1.6$) is far lower than QAVI \emph{and} requires more time ($>3.5$ hrs) for a small batch of data. Gaus. and Flow. QAVI seem to converge in $\approx 1$ min, whereas if given more time Mix. QAVI converges to higher log-likelihoods. 
} 
\label{fig:timecomplexity}
\end{figure}
\fi

\begin{figure}[h]
%\vspace{.3in}
\begin{subfigure}[t]{\linewidth}
   \centerline{\includegraphics[width=0.6\linewidth]{plots/beta_iwae.pdf}}
%\vspace{.3in}
%\caption{}
 \end{subfigure}

  \caption[]
{\small The plot shows the estimated log-likelihood for missing pixels for our non-amortized Gaussian posterior method when $1 \leq \beta \leq 50$. This is performed for the random missing patches in MNIST test images. We observe that up-weighting the $\KL$ term by the hyper-parameter $\beta$ in Eq. 8 from the main paper boosts performance until the value ($\approx 15$) and slowly decreases the log-likelihood with higher values. This translates clearly to the fact that some variance in the Gaussian posterior is helpful to capture uncertainty, but after a threshold this variance might result in inaccurate imputations.
} 
\label{fig:beta_iwae}
\end{figure}

%\paragraph{}

\subsection{Hierarchical QAVI}

 Figures~\ref{fig:vdvae} and~\ref{fig:vdvae_non} show the amortized and QAVI posterior architectures, respectively. Figures~\ref{fig:vdvae_big_com} and~\ref{fig:vdvae_example} show further results using the HVAE FFHQ QAVI model.

\paragraph{Modifications to P-IDS and U-IDS metrics.} In order to reduce sensitivity to the test set size we modify the original P-IDS and U-IDS metrics. In the original formulation used by \cite{zhao2021comodgan_comodgan}, P-IDS and U-IDS accuracies are computed on the same set of images used to train the discriminative SVM model. In our modified approach, we use 10-fold cross validation: training the SVM and evaluating the scores on distinct subsets.


\section{Other Experimental Details}

%\paragraph{Performance-Time Tradeoff} From Figure \ref{fig:timecomplexity}, we can observe the trade-off between time and performance for QAVI and the best performing amortized approach, Posterior Match for a batch of 100 MNIST images with patches missing. For a small batch of data, QAVI performs better than Posterior Match and with less time. We also observe that Mix. QAVI outperforms Gaus. QAVI when given more training time.  
%For MNIST, our QAVI Gaussian posterior took 3.18 minutes to perform inference for a batch of 100 images. Hence, taking $\sim 30$ minutes to infer the randomly sampled test set of 1000 images. On the other hand, Posterior Match takes 3.6 hours to train the encoder, clearly more time consuming for small test sets.  
%For SVHN, QAVI Gaussian takes 11.2 minutes to perform inference for a 100 images, hence 1.87 hrs for the test set of 1000 images. Posterior Match takes 5.14 hrs to train the encoder. 

\paragraph{Number of samples} Increasing the number of samples to estimate the loss in Eq. 7 \& 8 has a sub-linear effect on the runtime since they can be computed in parallel on modern GPUs. Gaus. QAVI suffers virtually no performance penalty from reducing the number of samples to 10 (as measured by IWAE), while our flow- and mixture-based posteriors perform similarly to Gaus. QAVI with fewer samples: they require more samples to capture all the modes of the approximate posterior. 

\paragraph{$\beta$ hyper-parameter}
Figure \ref{fig:beta_iwae} shows the estimated log-likelihood for missing pixels for our non-amortized Gaussian posterior method when $1 \leq \beta \leq 50$. The plot suggests that using $\beta=20$ is optimal.



 \begin{figure*}[h]
%\vspace{.3in}
\begin{subfigure}[t]{\linewidth}
   \centerline{\includegraphics[width=\linewidth]{supplement_plots/vdvae_diagrams_amor.pdf}}
%\vspace{.3in}
%\caption{}
 \end{subfigure}

  \caption[]
{\small Architecture of the original (amortized inference) ``very-deep'' VAE. Adapted from figure 3 of \cite{child2021very}.
} 
\label{fig:vdvae}
\end{figure*}

\begin{figure*}[h]
%\vspace{.3in}
\begin{subfigure}[t]{\linewidth}
   \centerline{\includegraphics[width=\linewidth]{supplement_plots/vdvae_diagrams_non_amor.pdf}}
%\vspace{.3in}
%\caption{}
 \end{subfigure}

  \caption[]
{\small Architecture of the local variational distribution ($q_\lambda(z)$) for QAVI with the ``very-deep'' VAE. We show how intermediate outputs of the bottom-up encoder are replaced with trainable network parameters to define a local variational distribution. 
} 
\label{fig:vdvae_non}
\end{figure*}



\begin{figure*}[t!]
%\vspace{.3in}
\begin{subfigure}[t]{\linewidth}
   \centerline{\includegraphics[width=0.85\linewidth]{plots/supplement.pdf}}
%\vspace{.3in}
%\caption{}
 \end{subfigure}

  \caption[]
{\small Digit completion results on real-valued MNIST (left) and SVHN (right) images for the rotating-half and random-patches missing criteria. %drawn from $q_\lambda(z)p_\theta(x|z)$. 
} 
\label{fig:imputations}
\end{figure*}




  \begin{figure*}[h]
%\vspace{.3in}
\begin{subfigure}[t]{\linewidth}
   \centerline{\includegraphics[width=0.9\linewidth]{supplement_plots/Big_model_comp.pdf}}
%\vspace{.3in}
%\caption{}
 \end{subfigure}

  \caption[]
{\small Comparison of QAVI inpainting using the reduced-size (5.9M parameter) very deep HVAE model used for our main experiments with QAVI inpainting using the original FFHQ-256 model of \cite{child2021very} (115M parameters).
} 
\label{fig:vdvae_big_com}
\end{figure*}





\begin{figure*}[h]
\centering
%\vspace{.3in}
\begin{subfigure}[t]{\linewidth}
   \centerline{\includegraphics[width=0.7\linewidth]{plots/vd_imgs/img_0.pdf}}
%\vspace{.3in}
%\caption{}
 \end{subfigure}
 \begin{subfigure}[t]{\linewidth}
   \centerline{\includegraphics[width=0.7\linewidth]{plots/vd_imgs/img_1.pdf}}
%\vspace{.3in}
%\caption{}
 \end{subfigure}
 \begin{subfigure}[t]{\linewidth}
   \centerline{\includegraphics[width=0.7\linewidth]{plots/vd_imgs/img_2.pdf}}
%\vspace{.3in}
%\caption{}
 \end{subfigure}
 \begin{subfigure}[t]{\linewidth}
   \centerline{\includegraphics[width=0.7\linewidth]{plots/vd_imgs/img_3.pdf}}
%\vspace{.3in}
%\caption{}
 \end{subfigure}
 \begin{subfigure}[t]{\linewidth}
   \centerline{\includegraphics[width=0.7\linewidth]{plots/vd_imgs/img_4.pdf}}
%\vspace{.3in}
%\caption{}
 \end{subfigure}
 \begin{subfigure}[t]{\linewidth}
   \centerline{\includegraphics[width=0.7\linewidth]{plots/vd_imgs/img_5.pdf}}
%\vspace{.3in}
%\caption{}
 \end{subfigure}
 \begin{subfigure}[t]{\linewidth}
   \centerline{\includegraphics[width=0.7\linewidth]{plots/vd_imgs/img_6.pdf}}
%\vspace{.3in}
%\caption{}
 \end{subfigure}

  \caption[]
{\small Comparison of inpainting results on the FFHQ-256 dataset. We compare our non-amortized deep VAE inpainting with IPA and Posterior Matching. For each image we show the true image, the masked image and 6 inpainted samples from each method. We also extract a 2048-dimensional feature vector for each image using the Inception-V3 network (as for FID score). We plot the distribution of distances for 100 sampled images from the true image using Euclidean distance in this extracted feature space. We see that in most cases, the IPA model produces more varied imputations, while non-amortized inference produces more consistent results.
} 
\label{fig:vdvae_example}
\end{figure*}

\begin{figure*}[h]
\centering
%\vspace{.3in}
\begin{subfigure}[t]{\linewidth}
   \centerline{\includegraphics[width=0.7\linewidth]{plots/vd_imgs/img_14.pdf}}
%\vspace{.3in}
%\caption{}
 \end{subfigure}
 \begin{subfigure}[t]{\linewidth}
   \centerline{\includegraphics[width=0.7\linewidth]{plots/vd_imgs/img_15.pdf}}
%\vspace{.3in}
%\caption{}
 \end{subfigure}
 \begin{subfigure}[t]{\linewidth}
   \centerline{\includegraphics[width=0.7\linewidth]{plots/vd_imgs/img_9.pdf}}
%\vspace{.3in}
%\caption{}
 \end{subfigure}
 \begin{subfigure}[t]{\linewidth}
   \centerline{\includegraphics[width=0.7\linewidth]{plots/vd_imgs/img_16.pdf}}
%\vspace{.3in}
%\caption{}
 \end{subfigure}
 \begin{subfigure}[t]{\linewidth}
   \centerline{\includegraphics[width=0.7\linewidth]{plots/vd_imgs/img_11.pdf}}
%\vspace{.3in}
%\caption{}
 \end{subfigure}
 \begin{subfigure}[t]{\linewidth}
   \centerline{\includegraphics[width=0.7\linewidth]{plots/vd_imgs/img_12.pdf}}
%\vspace{.3in}
%\caption{}
 \end{subfigure}
 \begin{subfigure}[t]{\linewidth}
   \centerline{\includegraphics[width=0.7\linewidth]{plots/vd_imgs/img_13.pdf}}
%\vspace{.3in}
%\caption{}
 \end{subfigure}
\end{figure*}

\vfill

%\pagebreak

\clearpage
\bibliography{agarwal_747}


\end{document}
