\documentclass[accepted]{uai2023} %


\usepackage[american]{babel}

\usepackage{natbib} %
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} %
\usepackage{booktabs} %
\usepackage{tikz} %



\usepackage{xr}
\makeatletter
% \addFileDependency{<file>}
%   Writes "(<file>)" to the terminal/log, adds <file> to the list kept for
%   \listfiles via the kernel-internal \@addtofilelist, and emits a
%   "No file <file>." message when the file cannot be found.
%   Presumably the "(<file>)" line is there so that build tools scanning the
%   log (e.g. latexmk) treat <file> as a dependency -- confirm against the
%   build setup.
%   Every line inside the body ends with % so that no stray space tokens are
%   produced when the macro is expanded.
\newcommand*{\addFileDependency}[1]{%
  \typeout{(#1)}%
  \@addtofilelist{#1}%
  \IfFileExists{#1}{}{\typeout{No file #1.}}%
}
\makeatother

% \myexternaldocument{<basename>}
%   Calls \externaldocument{<basename>} (xr package, loaded above) so that
%   labels defined in <basename> can be referenced from this document, and
%   then registers both <basename>.tex and <basename>.aux through
%   \addFileDependency.
\newcommand*{\myexternaldocument}[1]{%
\externaldocument{#1}%
\addFileDependency{#1.tex}%
\addFileDependency{#1.aux}%
}

\myexternaldocument{tailor_40}

\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{amssymb}
\usepackage{bm}
\usepackage{subcaption}
\usepackage[normalem]{ulem}
\usepackage[makeroom]{cancel}
\usepackage{amsfonts}
\usepackage{xcolor}
\usepackage{graphicx}
\usepackage{tikz}
\usepackage{booktabs}
\usepackage{array}
\usepackage{multirow}
\usepackage{enumitem}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{setspace}
\usepackage{xfrac}


\renewcommand{\algorithmicrequire}{\textbf{Input:}}
\newcommand\algored[1]{\textcolor{red}{#1}}

\usepackage[capitalize,nameinlink]{cleveref}
\crefname{section}{Sec.}{Secs.}
\crefname{appendix}{App.}{Apps.}
\crefname{algorithm}{Alg.}{Algs.}
\creflabelformat{equation}{#2\textup{#1}#3}

% Colour and annotation helpers:
%   darkgreen  -- custom RGB colour (0.0, 0.5, 0.0)
%   \newtext   -- typesets its argument in darkgreen
%   \note      -- typesets its argument in bold red
\definecolor{darkgreen}{rgb}{0.0, 0.5, 0.0}
\newcommand{\newtext}[1]{\textcolor{darkgreen}{#1}}
\newcommand{\note}[1]{\textbf{\textcolor{red}{#1}}}


\input{macros.tex} %

% \swap[<sep>]{<a>}{<b>} expands to <b><sep><a>: the two mandatory arguments
% in reversed order with the optional separator between them (default "-").
\newcommand{\swap}[3][-]{#3#1#2} %

\title{Exploiting Inferential Structure in Neural Processes\\(Supplementary Material)}

\author[1]{\href{mailto:<d.v.tailor@uva.nl>?Subject=Exploiting Inferential Structure in Neural Processes}{Dharmesh Tailor}{}}
\author[2]{Mohammad Emtiyaz Khan}
\author[1]{Eric Nalisnick}
\affil[1]{%
    University of Amsterdam\\
    Amsterdam\\
    Netherlands
}
\affil[2]{%
    RIKEN Center for AI Project\\
    Tokyo\\
    Japan
}
  
\begin{document}
  
\onecolumn %
\maketitle



\appendix

\section{Derivations}
\subsection{Variational lower bound with structured inference network} \label{sup:sin_elbo}
For the conjugate case in \cref{sec:sin}, we show that the ELBO in \cref{eq:elbo} (for a single task and dropping task indices for clarity) simplifies after substitution of the structured inference network (\cref{eq:sin}).
We denote the factor by $q(\vz \mid \vf_{\phi_{\textrm{NN}}}(\data_c^{(i)}) ) := \exp \rnd{ \big\langle\MT(\vz), \vf_{\phi_{\textrm{NN}}}(\vx_{c,i}, \vy_{c,i}) \big\rangle}$ (and an analogous expression for each target point):
\begin{align}
    &\log p(\data_t \mid \data_c) \\
    &\phantom{{}=1}= \log \int_{\vz} p(\data_t \mid \vz) \; p(\vz \mid \data_c) \\
    &\phantom{{}=1}= \log \int_{\vz} q_{\phi}(\vz \mid \data_c \cup \data_t) \frac{p(\data_t \mid \vz) \; p(\vz \mid \data_c)}{q_{\phi}(\vz \mid \data_c \cup \data_t)} \\
    &\phantom{{}=1}\geq \E_{q_{\phi}(\vz \mid \data_c \cup \data_t)} \sqr{ \log \frac{p(\data_t \mid \vz) \; p(\vz \mid \data_c)}{q_{\phi}(\vz \mid \data_c \cup \data_t)} } \\
    &\phantom{{}=1}\approx \E_{q} \sqr{ \log \frac{p(\data_t \mid \vz) \;  q_{\phi}(\vz \mid \data_c)}{q_{\phi}(\vz \mid \data_c \cup \data_t)} } \\
    &\phantom{{}=1}= \E_q \sqr{ \log p(\data_t \mid \vz) }
    + \E_q \bigg[ \log
    \frac{%
    \cancel{\prod_{i=1}^{N_c} q(\vz \mid \vf_{\phi_{\textrm{NN}}}(\data_c^{(i)}) )} \; 
    }%
    {%
    \cancel{\prod_{i=1}^{N_c} q(\vz \mid \vf_{\phi_{\textrm{NN}}}(\data_c^{(i)}) )} 
    \prod_{i=1}^{N_t} q(\vz \mid \vf_{\phi_{\textrm{NN}}}(\data_t^{(i)}) )
    }%
    \frac{\cancel{q(\vz ; \phi_{\textrm{PGM}})} \; Z_{c,t}(\phi)}{\cancel{q(\vz ; \phi_{\textrm{PGM}})} \; Z_c(\phi)} \bigg] \\
    &\phantom{{}=1}= \E_q \sqr{ \log p(\data_t \mid \vz)}
    - \sum_{i=1}^{N_t} \textcolor{blue}{\E_q \sqr{ \log q(\vz \mid \vf_{\phi_{\textrm{NN}}}(\data_t^{(i)}) ) } }
    + \log Z_{c,t}(\phi) - \log Z_{c}(\phi)
\end{align}
where the 2\textsuperscript{nd} term resembles the entropy on the individual factors (shown in blue).

\subsection{EQUIVALENCE TO BAYESIAN AGGREGATION MEAN UPDATE EQUATION}\label{sup:bca_equivalence}
We show that the posterior mean in \cref{eq:gauss_mean} can be expressed in the incremental form stated in \citet{volpp2020bayesian} (\cref{eq:bca_mean}):
\begingroup
\allowdisplaybreaks
\begin{align}
    \widetilde{\vmu} &= \widetilde{\MSigma} \rnd{\sum_{i=1}^{N_c} \MV_{c,i}^{-1} \vm_{c,i} + \MSigma_0^{-1} \vmu_0} \\
    &= \widetilde{\MSigma} \rnd{\sum_{i=1}^{N_c} \MV_{c,i}^{-1} \vm_{c,i} + \widetilde{\MSigma}^{-1} \vmu_0 - \sum_{i=1}^{N_c} \MV_{c,i}^{-1} \vmu_0 } \\
    &= \vmu_0 + \widetilde{\MSigma} \sum_{i=1}^{N_c} \MV_{c,i}^{-1} \rnd{\vm_{c,i} - \vmu_0}
\end{align}
\endgroup


\subsection{MIXTURE OF GAUSSIAN PRIOR NORMALIZATION CONSTANT}\label{sup:mog}
\begin{equation}
    C_k = (2\pi)^{-\frac{D N_c}{2}} \prod_{i=1}^{N_c} \det(\MV_{c,i})^{-\frac{1}{2}} \rnd{\frac{\det(\MSigma_k)}{\det(\widetilde{\MSigma}_k)}}^{-\frac{1}{2}}
    \exp\Bigg\{-\frac{1}{2} \Bigg( \sum_{i=1}^{N_c} \vm_{c,i}^\T \MV_{c,i}^{-1} \vm_{c,i} + \vmu_k^\T \MSigma_k^{-1} \vmu_k
    - \widetilde{\vmu}_k^\T \widetilde{\MSigma}_k^{-1} \widetilde{\vmu}_k \Bigg) \Bigg\}
\end{equation}






\section{EXPERIMENTAL DETAILS}\label{sup:exp}
The implementation for robust and mixture Bayesian aggregation is adapted from the implementation of BA\footnote{\url{https://github.com/boschresearch/bayesian-context-aggregation}} and the other baselines are taken from the codebase of Bootstrapped Neural Process\footnote{\url{https://github.com/juho-lee/bnp}} (with modifications elaborated below) \citep{lee2020bootstrapping}.

For our proposed mixture Bayesian Aggregation, we use the \texttt{MixtureOfDiagNormals} implementation from the \texttt{Pyro} package.
However, due to numerical issues with using 32-bit floating-point precision with this implementation, we disable gradient flow through the categorical distribution.

\subsection{1-D regression}
We consider the following kernels:
\begin{enumerate}[leftmargin=8em,itemsep=0.5em]
    \item[RBF:]
        baseline used in \citet{lee2020bootstrapping}
        \[
            k(x, x') = s^2 \exp \rnd{ - \frac{(x-x')^2}{2 \ell^2} }
        \]
        with $s \sim \U[0.1, 1.0)$ and $\ell \sim \U[0.1,0.6)$;
    \item[Mat\'ern--$\frac52$:]
        baseline used in \citet{gordon2019convolutional}
        \[
            k(x, x') = \left(1 + \sqrt{5} d  + \frac53 d^2\right) \exp\left(-\sqrt{5} d \vphantom{\frac52}\right)
        \]
        with $d = 4|x - x'|$.
\end{enumerate}

Following \citet{lee2020bootstrapping}, the inputs of the context and target sets are sampled according to $x \sim \U(-2,2)$.
The sizes of the context and target sets are sampled according to $N_c \sim \U(3, 47)$ and $N_t \sim \U(3, 50-N_c)$.
In the evaluation phase, 5000 tasks are drawn identically to the data generating process for training. 



\subsection{Image completion}
Image completion is formulated as a regression problem where pixel coordinates are transformed to $[-1,1]$ and pixel intensities are rescaled to $[-0.5,0.5]$ following \citet{lee2020bootstrapping}.
A single image constitutes a task.
During training, images are restricted to the first 10 classes, with the sizes of the context and target sets sampled according to
$N_c \sim \U(3,197)$ and $N_t \sim \U(3,200-N_c)$.
For the in-distribution setting, we evaluate on a different set of images but restricted also to the first 10 classes. %
For \cref{fig:mog_emnist_context} we sample the target sets according to $N_t \sim \U(3,500-N_c)$ (evaluation only).
For evaluation on the out-of-distribution setting, images are taken from the unseen classes 10--46.


\subsection{Model architectures}
\paragraph{Decoder architecture} Across all models, we keep the decoder architecture the same, with separate networks outputting the mean and standard deviation of the predictive distribution (following \citet{volpp2020bayesian}).
The networks have 128 hidden units and ReLU activation functions.
For the 1-D regression experiment, we use a 3-layer MLP and for the image completion experiment, a 4-layer MLP.
Following \citet{le2018empirical}, the standard deviation of the predictive distribution is processed using a lower-bounded softplus with a lower bound of $0.1$.

\paragraph{Encoder architecture}
For models with a latent path, the latent dimensionality is set to 128. ReLU activation functions are used for the hidden layers throughout. Unless otherwise stated, the hidden size is 128.
Here we state the architectures of the baselines with mean-pooling aggregation in the 1-D regression experiment:
\begin{enumerate}[leftmargin=8em,itemsep=0.5em]
    \item[NP:] This is adapted from \cite{garnelo2018neural} where the deterministic path is removed (as done in \citet{volpp2020bayesian}). 
    \item[NP+SA:] This incorporates (multi-head) self-attention into the encoder architecture of NP.
\end{enumerate}
Following \citet{le2018empirical}, we process the standard deviation of the latent variable using a lower-bounded sigmoid with a lower bound of $10^{-4}$.

The architecture of the baselines with Bayesian Aggregation (BA) follows \citet{volpp2020bayesian} with separate MLPs predicting each neural sufficient statistic (or latent observation and observation variance as elaborated in \citet{volpp2020bayesian}).
The 2\textsuperscript{nd} neural sufficient statistic (i.e.\ observation noise) is processed using a lower-bounded sigmoid, identical to how the latent variance is processed in the baselines with mean-pooling.
Following \citet{volpp2020bayesian}, a Gaussian prior with fixed parameters is used: $\vmu_0=\vzero, \MSigma_0 = \MI$.
We use MLPs with 64 hidden units and 4 layers.

For our proposed robust Bayesian Aggregation, we extend the aforementioned BA implementation. 
The gamma prior parameters are set as follows: $a_0 = b_0 = 10^{-6} \cdot D$ and $c_0 = 10^{-2} \cdot D$. This is similar to \citet{tipping2005variational} but appropriately scaled by the latent dimensionality. 
For the image completion experiment, the depth of all MLPs is increased by 1.

The models evaluated in \cref{sec:exp_mba} are trained using 10 latent samples and those evaluated in \cref{sec:exp_rba} are trained using 5 latent samples.
We use more latent samples for the mixture experiments as suggested in \citet{wang2022moe}.
Following \citet{lee2020bootstrapping}, all models are optimized using Adam with an initial learning rate of $5 \cdot 10^{-4}$ and a cosine annealing scheme for the learning rate schedule.
For the 1-D regression experiment, models are trained for 100,000 steps where each step consists of 16 tasks.
For the image completion experiment, models are trained for 200 epochs with batches of 100 images.

To evaluate the models, we compute the posterior predictive log-likelihood and RMSE using a Monte-Carlo approximation. Following \citet{kim2019attentive}, we also evaluate these criteria using the context points as targets. This gives an indication of how well the model is fitting the context points (reconstruction error).

\section{Extended Table 1 with robust BA comparison}

\begin{table*}[h]
\centering
\resizebox{.99\linewidth}{!}{\input{tables/mog_emnist_ext.tex}}
\end{table*}







\bibliography{tailor_40}

\end{document}
