% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{microtype}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{amssymb}
\usepackage{color,soul}
\usepackage{dsfont}
\usepackage{nccmath}
\usepackage{caption}
\usepackage{stfloats}
\input{mohseni_273-commands.tex}

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
\externaldocument{uai2023-template}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example
\title{Adaptive Conditional Quantile Neural Processes (Supplementary material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<peiman.mohseni@tamu.edu>?Subject=Adaptive Conditional Quantile Neural Processes}{Peiman Mohseni}{}}
\author[2]{Nick Duffield}
\author[3]{Bani Mallick}
\author[4]{Arman Hasanzadeh}


% Add affiliations after the authors
\affil[1]{%
    Computer Science and Engineering Department\\
    Texas A\&M University
}
\affil[2]{%
    Electrical and Computer Engineering Department\\
    Texas A\&M University
}
\affil[3]{%
    Statistics Department\\
    Texas A\&M University
}

\affil[4]{%
    Google Cloud
}
  
\begin{document}
% \appendix

\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle

\section{Additional Results}\label{sec: supp-additional-result}
\begin{figure*}[!h]
	\centering
        \scalebox{0.9}{
            \begingroup
            \setlength{\tabcolsep}{-0.5pt}
            \begin{tabular}{c@{\hskip -0.2pt}ccccc}
                \raisebox{4.8\normalbaselineskip}[0pt][0pt]{\rotatebox[origin=c]{90}{Sawtooth}} & \includegraphics[width=0.22\textwidth]{figures/sawtooth/sawtooth-data} & 
	          \includegraphics[width=0.22\textwidth]{figures/sawtooth/ACQNP/sawtooth-AL-total-mean-ACQNP} &
                \includegraphics[width=0.22\textwidth]{figures/sawtooth/CQNP/sawtooth-AL-total-mean-CQNP} &
                \includegraphics[width=0.22\textwidth]{figures/sawtooth/BNP/sawtooth-normal-mean-BNP} &
                \includegraphics[width=0.22\textwidth]{figures/sawtooth/CANP/sawtooth-normal-mean-CANP} \\
                \raisebox{4.8\normalbaselineskip}[0pt][0pt]{\rotatebox[origin=c]{90}{RBF}} & \includegraphics[width=0.22\textwidth]{figures/RBF/RBF-data} & 
	          \includegraphics[width=0.22\textwidth]{figures/RBF/ACQNP/RBF-AL-total-mean-ACQNP} &
                \includegraphics[width=0.22\textwidth]{figures/RBF/CQNP/RBF-AL-total-mean-CQNP} &
                \includegraphics[width=0.22\textwidth]{figures/RBF/BNP/RBF-normal-mean-BNP} &
                \includegraphics[width=0.22\textwidth]{figures/RBF/CANP/RBF-normal-mean-CANP} \\
                \raisebox{4.8\normalbaselineskip}[0pt][0pt]{\rotatebox[origin=c]{90}{Mat\'ern 5/2}} & \includegraphics[width=0.22\textwidth]{figures/Matern52/Matern-data} & 
	          \includegraphics[width=0.22\textwidth]{figures/Matern52/ACQNP/Matern-AL-total-mean-ACQNP} &
                \includegraphics[width=0.22\textwidth]{figures/Matern52/CQNP/Matern-AL-total-mean-CQNP} &
                \includegraphics[width=0.22\textwidth]{figures/Matern52/BNP/Matern-normal-mean-BNP} &
                \includegraphics[width=0.22\textwidth]{figures/Matern52/CANP/Matern-normal-mean-CANP} \\
                & a) Data & b) ACQNP & c) CQNP & d) BNP & e) CANP
            \end{tabular}
            \endgroup
        }
	\caption{Examples of predictions made by different methods. For A/CQNP, the mean of the compound predictive distribution, approximated with $N_{\tau}=10$ samples, is plotted. For BNP and CANP, we plot the mean of the Gaussian predictive distribution as the predictions. For BNP, we plot the predictions obtained from 20 different sets of bootstrap contexts.}
        \label{fig: 1d-synthetic-benchmark-supp}
\end{figure*}

We compare A/CQNPs and baselines on data generated from three additional processes described in Table~\ref{tab: 1d-benchmark-generative-processes-supp} with the following choice of parameters:
\begin{itemize}
    \item Sawtooth~\citep{Gordon2020Convolutional}: $s \sim \mathcal{U}[-2,\,2)$, $\alpha \sim \mathcal{U}[1, 2)$, $\omega \sim \mathcal{U}[1, 3)$, $\delta \sim \mathcal{U}[-2, 2)$, $K \sim \mathcal{U}[10, 20)$
    \item RBF: $s \sim \mathcal{U}[-2,\,2)$, $\ell = 0.25$, $\sigma=0.75$, $\delta=0.02$
    \item Mat\'ern 5/2: $s \sim \mathcal{U}[-2,\,2)$, $\ell = 0.25$, $\sigma=0.75$, $\delta=0.02$
\end{itemize}
The results are provided in Table~\ref{tab: 1d-benchmark-synthetic-unimodal-supp}. Figure~\ref{fig: 1d-synthetic-benchmark-supp} illustrates examples of predictions made by each method. Note that for A/CQNP, the predictions correspond to the mean of the conditional distribution $p(y\,|\,x)$, not the quantiles. The mean of the uncountable mixture of $\mathcal{A}L$ distributions can be computed as follows:
\begin{equation}
\begin{split}
    \mathbb{E}_{\ry}[p(\ry \,|\, x)] & = \mathbb{E}_{\ry}\left[\mathbb{E}_{\tau \,\sim\, \mathcal{U}(0,1)}\left[\alpha_{\tau}(x) \, \mathcal{A}L\left(\ry \,|\, \mu_{\tau}(x), \sigma_{\tau}(x), \tau\right)\right]\right] \\
    & = \mathbb{E}_{\tau \,\sim\, \mathcal{U}(0,1)}\left[\mathbb{E}_{\ry}\left[\alpha_{\tau}(x) \, \mathcal{A}L\left(\ry \,|\, \mu_{\tau}(x), \sigma_{\tau}(x), \tau\right)\right]\right] \\
    & = \mathbb{E}_{\tau \,\sim\, \mathcal{U}(0,1)}\left[\alpha_{\tau}(x) \, \mathbb{E}_{\ry}\left[\mathcal{A}L\left(\ry \,|\, \mu_{\tau}(x), \sigma_{\tau}(x), \tau\right)\right]\right] \\
    & = \mathbb{E}_{\tau \,\sim\, \mathcal{U}(0,1)}\left[\alpha_{\tau}(x) \, \left(\mu_{\tau}(x) + \frac{1-2\tau}{\tau(1-\tau)}\sigma_{\tau}(x)\right)
    \right].
\end{split}
\end{equation}
Similar to section 3.2, we use Monte Carlo to approximate this expectation. For sawtooth, RBF, and Mat\'ern 5/2, we used 100, 50, and 50 maximum context points, respectively.
\begin{table*}[!h]
	% \vspace*{5pt}
	\centering
        \caption{Synthetic processes used in unimodal 1D regression experiments.}
        \label{tab: 1d-benchmark-generative-processes-supp}
        \scalebox{1}{
	{\setlength{\tabcolsep}{1.2pt}
	\begin{tabular}{lc} \toprule
		Process          & $g(s)=(g_x(s),\, g_y(s))$ \\ \midrule
		Sawtooth    & $\quad g_x(s)=s\, , \, g_{y}(s) = \frac{\alpha}{2} - \frac{\alpha}{\pi}\sum_{k=1}^{K}(-1)^k\frac{\sin(2 \pi k \omega (s+\delta))}{k}$ \vspace*{5pt} \\ 
		GP (RBF)        & $\quad g_x(s)=s\, , \, g_{y}\sim\mathcal{GP}(0, C)\, , \, C(x, x')=\sigma^2\exp\left(-\frac{\lVert x-x'\rVert^2}{2\ell^2}\right)+\delta$ \vspace*{5pt} \\
		 GP (Mat\'ern 5/2)   & $\quad g_x(s)=s\, , \, g_{y}\sim\mathcal{GP}(0, C)\, , \, C(x, x')=\sigma^2\left(1+\frac{\sqrt{5}d}{\ell}+\frac{5d^2}{3\ell^2}\right)\exp\left(-\frac{\sqrt{5}d}{\ell}\right)+\delta\, , \, d = \lVert x-x'\rVert $ \vspace*{1pt}\\ \bottomrule
	\end{tabular}}
 }
\end{table*}

\begin{table*}[h!]\centering
    \caption{Context and target log-likelihoods on synthetic 1D regression tasks ($6$ Seeds).}
    \label{tab: 1d-benchmark-synthetic-unimodal-supp}
    \scalebox{0.9}{
        \begin{tabular}{@{}l cc cc cc@{}}
        \toprule
            & \multicolumn{2}{c}{Sawtooth}      & \multicolumn{2}{c}{RBF}      & \multicolumn{2}{c}{Mat\'ern 5/2} \\
            \cmidrule[0.2pt]{2-7}  
            & context & target      & context & target      & context & target \\
            \midrule
            CNP         & ${0.937}_{\pm0.023}$ & ${0.586}_{\pm0.038}$       & ${0.837}_{\pm0.058}$ & ${0.100}_{\pm0.023}$       & ${0.626}_{\pm0.056}$ & ${-0.183}_{\pm0.013}$ \vspace*{2pt}\\
            CANP        & ${1.191}_{\pm0.190}$ & ${0.341}_{\pm0.085}$      & ${\bm{1.269}}_{\pm0.083}$ & ${0.225}_{\pm0.052}$       & ${\bm{1.058}}_{\pm0.382}$ & ${\bm{-0.015}}_{\pm0.198}$ \vspace*{2pt}\\
            BNP         & ${0.884}_{\pm0.038}$ & ${0.769}_{\pm0.039}$       & ${1.121}_{\pm0.008}$ & ${\bm{0.339}}_{\pm0.009}$       & ${0.879}_{\pm0.018}$ & ${\bm{-0.048}}_{\pm0.018}$ \vspace*{2pt}\\
            CQNP(ours)  & ${\bm{1.229}}_{\pm0.031}$ & ${\bm{0.833}}_{\pm0.035}$       & ${0.947}_{\pm0.042}$ & ${0.083}_{\pm0.037}$       & ${0.515}_{\pm0.039}$ & ${-0.373}_{\pm0.041}$ \vspace*{2pt}\\
            ACQNP(ours) & ${\bm{1.386}}_{\pm0.042}$ & ${\bm{1.026}}_{\pm0.039}$      & ${\bm{1.215}}_{\pm0.027}$ & ${\bm{0.254}}_{\pm0.020}$       & ${\bm{0.912}}_{\pm0.042}$ & ${-0.117}_{\pm0.025}$ \\
        \bottomrule 
    \end{tabular}}
\end{table*}

\section{Implementation details}\label{sec: supp-imp-detailes}
This section provides a detailed description of how the different methods mentioned throughout the paper are implemented. All the implementations are based on PyTorch~\citep{paszke2019pytorch}. For CNP and CANP, we closely followed the official implementation\footnote{\url{https://github.com/deepmind/neural-processes}}. Note that the implementation of CANP is identical to ANP, but without the latent path shown in Figure~2 of \citet{kim2018attentive}. For BNP, however, we borrowed the implementation provided by the authors\footnote{\url{https://github.com/juho-lee/bnp}}, and thus use their terminology in describing the network architecture.
Nonetheless, neural networks used in all models are instances of multi-layer perceptrons (MLPs) with ReLU activations and the only differences are regarding their depth and width. To specify the architecture of MLPs, we use the following notation:
\[ [d_{h_0}] \times [d_{h_1}, \dots, d_{h_{n-1}}] \times [d_{h_n}], \]
where $d_{h_0}$ and $d_{h_n}$ denote the dimensions of the network's input and output, respectively. Furthermore, $[d_{h_1}, \dots, d_{h_{n-1}}]$ shows that the network has $n-1$ hidden layers, with $d_{h_i}$ being the width of the $i$-th hidden layer. In all the experiments, Adam~\citep{adam-optimizer} is used for optimizing the objective function. Other than the learning rate and the $\ell_2$ regularizer, the rest of the hyperparameters used with Adam are set to their default values in PyTorch.

\subsection{Synthetic Data}\label{sec: supp-imp-detailes-synth-data}
For synthetic data, each model is trained for $10^5$ iterations with 128 sampled functions per batch. During training, the tasks are generated on the fly, i.e., the training data is not fixed across different models and seeds. However, for evaluation, we generate and save $5\times10^3$ batches, each containing 16 curves. This data is later used to evaluate all the models. Note that in our implementation, $N_{\mathrm{total}}$ is the same across all observations in each batch. More precisely, for a batch $\mathcal{E}=\{\mathcal{E}_k\}_{k=1}^{n_b}$ with $n_b$ as the batch size, all $\mathcal{E}_k$s contain the same number of data points. However, $N_{\mathrm{total}}$ is not necessarily the same between two different batches as explained in section 4.1. The same setup holds for $N_{\mathrm{context}}$.
%----------------------------------------------------------------
\subsubsection{CNP}\label{sec: supp-CNP-imp-synth-data}
Table \ref{tab: supp-CNP-imp-detail-synth-data} shows the encoder and decoder architectures used in the implementation of CNPs for experiments on synthetic data. Table \ref{tab: supp-CNP-hyperparams-detail-synth-data} summarizes the choice of optimization hyperparameters along with the GPU devices used for training and testing.

\begin{table*}[!h]
    \centering
    \caption{Architectural details of CNPs for experiments on synthetic data.} \label{tab: supp-CNP-imp-detail-synth-data}
    \scalebox{0.9}{
    \begin{tabular}{c|c|c}
      \toprule 
      Benchmark   & Encoder   & Decoder\\
      \midrule
        \begin{tabular}{c} Sawtooth\\ RBF\\ Mat\'ern 5/2 \\ Double Sine\\ Circle \\ Lissajous \end{tabular}  & $[2]\times[128, 128, 128]\times[128]$   & $[129]\times[128, 128, 128]\times[2]$ \\
      \bottomrule
    \end{tabular}}
\end{table*}

\begin{table*}[!h]
    \centering
    \caption{Hyper-parameters and GPU devices used for training and testing CNPs on synthetic data.} \label{tab: supp-CNP-hyperparams-detail-synth-data}
    \scalebox{0.9}{
    \begin{tabular}{c|c|c|c|c}
      \toprule 
      Benchmark   & Learning rate   & L2 regularizer  & GPU (Training)   & GPU (Testing)\\
      \midrule 
      Sawtooth      & $5\times10^{-4}$  & $0$   & Quadro RTX 6000  & NVIDIA A100\\
      RBF           & $5\times10^{-4}$  & $0$   & NVIDIA A100  & Quadro RTX 6000\\
      Mat\'ern 5/2  & $5\times10^{-4}$  & $0$   & NVIDIA A100  & Quadro RTX 6000\\
      Double Sine   & $5\times10^{-4}$  & $0$   & NVIDIA A100  & NVIDIA A100\\ 
      Circle        & $5\times10^{-4}$  & $10^{-5}$   & Quadro RTX 6000  & NVIDIA A100\\
      Lissajous     & $5\times10^{-4}$  & $0$   & Quadro RTX 6000  & NVIDIA A100\\
      \bottomrule
    \end{tabular}}
\end{table*}

%----------------------------------------------------------------
\subsubsection{CANP}\label{sec: supp-CANP-imp-synth-data}
Table \ref{tab: supp-CANP-imp-detail-synth-data} contains details on the MLP architectures used for modeling the encoder and decoder modules in CANPs. Note that instead of passing raw context and target inputs as keys and queries to the attention modules, we first pass them through separate MLPs, namely key encoder and query encoder, and then apply the attention to the obtained embeddings. Here we work with the same 8-headed attention \citep{vaswani2017attention} mechanism used in the official implementation. Table \ref{tab: supp-CANP-hyperparams-detail-synth-data} summarizes the choice of optimization hyperparameters along with the GPU devices used for training and testing.

\begin{table*}[!h]
    \centering
    \caption{Architectural details of CANPs for experiments on synthetic data.} \label{tab: supp-CANP-imp-detail-synth-data}
    \scalebox{0.9}{
    \begin{tabular}{c|c|c|c|c}
      \toprule 
      Benchmark   & Context Encoder   & Key Encoder   & Query Encoder   & Decoder\\
      \midrule
        \begin{tabular}{c} Sawtooth\\ RBF\\ Mat\'ern 5/2 \\ Double Sine\\ Circle \\ Lissajous \end{tabular}  & $[2]\times[128, 128, 128]\times[128]$   & $[1]\times[128]\times[128]$   & $[1]\times[128]\times[128]$   & $[129]\times[128, 128, 128]\times[2]$ \\
      \bottomrule
    \end{tabular}}
\end{table*}

\begin{table*}[!h]
    \centering
    \caption{Hyper-parameters and GPU devices used for training and testing CANPs on synthetic data.} \label{tab: supp-CANP-hyperparams-detail-synth-data}
    \scalebox{0.9}{
    \begin{tabular}{c|c|c|c|c}
      \toprule 
      Benchmark   & Learning rate   & L2 regularizer  & GPU (Training)   & GPU (Testing)\\
      \midrule 
      Sawtooth      & $10^{-4}$  & $0$   & Quadro RTX 6000  & NVIDIA A100\\ 
      RBF           & $10^{-4}$  & $0$   & Quadro RTX 6000  & Quadro RTX 6000\\
      Mat\'ern 5/2  & $10^{-4}$  & $0$   & Tesla T4  & NVIDIA A100\\
      Double Sine   & $10^{-4}$  & $10^{-5}$   & NVIDIA A100  & NVIDIA A100\\ 
      Circle        & $10^{-4}$  & $10^{-5}$   & Quadro RTX 6000  & NVIDIA A100\\
      Lissajous     & $10^{-4}$  & $10^{-5}$   & Quadro RTX 6000  & NVIDIA A100
      \\
      \bottomrule
    \end{tabular}}
\end{table*}

% %----------------------------------------------------------------
\subsubsection{BNP}\label{sec: supp-BNP-imp-synth-data}
The architecture details for different components of BNPs including the encoder, adaptation layer, and decoder are provided in table \ref{tab: supp-BNP-imp-detail-synth-data}. For all the benchmarks, we use $k=4$ and $k=50$ bootstrap contexts for training and testing, respectively. The choice of optimization hyperparameters along with the GPU devices used for training and testing are included in table \ref{tab: supp-BNP-hyperparams-detail-synth-data}.

\begin{table*}[!h]
    \centering
    \caption{Architectural details of BNPs for experiments on synthetic data.} \label{tab: supp-BNP-imp-detail-synth-data}
    \scalebox{0.9}{
    \begin{tabular}{c|c|c|c|c|c|c}
      \toprule 
      Benchmark   & $d_x$   & $d_y$   & $d_h$   & $l_{pre}$   & $l_{post}$   & $l_{dec}$\\
      \midrule
        \begin{tabular}{c} Sawtooth\\ RBF\\ Mat\'ern 5/2 \\ Double Sine\\ Circle \\ Lissajous \end{tabular}  & $1$   & $1$   & $128$   & $5$   & $3$   & $5$ \\
      \bottomrule
    \end{tabular}}
\end{table*}

\begin{table*}[!h]
    \centering
    \caption{Hyper-parameters and GPU devices used for training and testing BNPs on synthetic data.} \label{tab: supp-BNP-hyperparams-detail-synth-data}
    \scalebox{0.9}{
    \begin{tabular}{c|c|c|c|c|c}
      \toprule 
      Benchmark   & Learning rate   & L2 regularizer  & Scheduler  & GPU (Training)   & GPU (Testing)\\
      \midrule
      Sawtooth      & $5\times10^{-4}$  & $0$   & cosine annealing   & Quadro RTX 6000  & Quadro RTX 6000\\ 
      RBF           & $5\times10^{-4}$  & $0$   & cosine annealing   & Quadro RTX 6000  & Quadro RTX 6000\\ 
      Mat\'ern 5/2  & $5\times10^{-4}$  & $0$   & cosine annealing   & Tesla T4  & NVIDIA A100\\
      Double Sine   & $5\times10^{-4}$  & $0$   & cosine annealing   & Quadro RTX 6000  & Quadro RTX 6000\\ 
      Circle        & $5\times10^{-4}$  & $0$   & cosine annealing  & Quadro RTX 6000  & Quadro RTX 6000\\
      Lissajous     & $5\times10^{-4}$  & $0$   & cosine annealing  & Quadro RTX 6000  & Quadro RTX 6000\\
      \bottomrule
    \end{tabular}}
\end{table*}

% %----------------------------------------------------------------
\subsubsection{CQNP}\label{sec: supp-CQNP-imp-synth-data}
The encoder and decoder architectures used for implementing CQNPs in different benchmarks are shown in table \ref{tab: supp-CQNP-imp-detail-synth-data}. Table \ref{tab: supp-CQNP-hyperparams-detail-synth-data} summarizes the choice of hyperparameters along with the GPU devices used for training and testing.

\begin{table*}[!h]
    \centering
    \caption{Architectural details of CQNPs for experiments on synthetic data.} \label{tab: supp-CQNP-imp-detail-synth-data}
    \scalebox{0.9}{
    \begin{tabular}{c|c|c}
      \toprule 
      Benchmark   & Context Encoder   & Decoder\\
      \midrule
        \begin{tabular}{c} Sawtooth\\ RBF\\ Mat\'ern 5/2 \\ Double Sine\\ Circle \\ Lissajous \end{tabular}  & $[2]\times[128, 128, 128]\times[128]$   & $[130]\times[128, 128, 128]\times[3]$ \\
      \bottomrule
    \end{tabular}}
\end{table*}

\begin{table*}[!h]
    \centering
    \caption{Hyper-parameters and GPU devices used for training and testing CQNPs on synthetic data.} \label{tab: supp-CQNP-hyperparams-detail-synth-data}
    \scalebox{0.9}{
    \begin{tabular}{c|c|c|c|c|c|c}
      \toprule 
      Benchmark   & Learning rate   & L2 regularizer   & $N_{\tau}$(Training)   & $N_{\tau}$(Testing)  & GPU (Training)   & GPU (Testing)\\
      \midrule
      Sawtooth      & $5\times10^{-4}$  & $10^{-5}$  & 50  & 100   & Quadro RTX 6000  & NVIDIA A100\\ 
      RBF           & $10^{-3}$  & $10^{-5}$  & 50  & 100   & Quadro RTX 6000  & Quadro RTX 6000\\ 
      Mat\'ern 5/2  & $5\times10^{-3}$  & $10^{-5}$  & 50  & 100   & Tesla T4  & Tesla T4\\
      Double Sine   & $10^{-3}$  & $10^{-5}$  & 50  & 100   & NVIDIA A100  & Quadro RTX 6000\\ 
      Circle        & $10^{-3}$  & $0$  & 50  & 100   & NVIDIA A100  & NVIDIA A100\\
      Lissajous     & $10^{-3}$  & $10^{-5}$  & 50  & 100   & NVIDIA A100  & NVIDIA A100
      \\
      \bottomrule
    \end{tabular}}
\end{table*}

% %----------------------------------------------------------------
\subsubsection{ACQNP}\label{sec: supp-ACQNP-imp-synth-data}
Compared to CQNP, ACQNP has an additional component named the adaptation layer which takes in the raw sample $u$ together with context representation and target input and maps them to a new set of quantile levels $\tau$ that we eventually approximate. Note that this is different from the adaptation layer used in BNPs. Also, we apply a sigmoid function to the final outputs of the adaptation layer to make sure that they correspond to valid quantile levels. The depth and width of the MLPs used for parameterizing the encoder, adaptation layer, and decoder in ACQNPs are presented in table \ref{tab: supp-ACQNP-imp-detail-synth-data}. We summarize the choice of hyperparameters along with the GPU models used for training and testing in table \ref{tab: supp-ACQNP-hyperparams-detail-synth-data}.

\begin{table*}[!h]
    \centering
    \caption{Architectural details of ACQNPs for experiments on synthetic data.} \label{tab: supp-ACQNP-imp-detail-synth-data}
    \scalebox{0.9}{
    \begin{tabular}{c|c|c|c}
      \toprule 
      Benchmark   & Context Encoder   & Adaptor    & Decoder\\
      \midrule
      Sawtooth    & $[2]\times[128, 128, 128]\times[128]$   & $[129]\times[128, 128, 128]\times[1]$   & $[130]\times[128, 128, 128]\times[3]$\\
      \midrule
      RBF         & $[2]\times[128, 128, 128]\times[128]$   & $[129]\times[128, 128, 128, 128, 128]\times[1]$   & $[130]\times[128, 128, 128]\times[3]$\\
      \midrule
      Mat\'ern 5/2 & $[2]\times[128, 128, 128]\times[128]$   & $[129]\times[128, 128, 128, 128]\times[1]$   & $[130]\times[128, 128, 128]\times[3]$\\
      \midrule
        \begin{tabular}{c} Double Sine\\ Circle \\ Lissajous \end{tabular}  & $[2]\times[128, 128, 128]\times[128]$   & $[129]\times[128, 128, 128, 128, 128]\times[1]$   & $[130]\times[128, 128, 128]\times[3]$ \\
      \bottomrule
    \end{tabular}}
\end{table*}

\begin{table*}[!h]
    \centering
    \caption{Hyper-parameters and GPU devices used for training and testing ACQNPs on synthetic data.} \label{tab: supp-ACQNP-hyperparams-detail-synth-data}
    \scalebox{0.9}{
    \begin{tabular}{c|c|c|c|c|c|c}
      \toprule 
      Benchmark   & Learning rate   & L2 regularizer   & $N_{\tau}$(Training)   & $N_{\tau}$(Testing)  & GPU (Training)   & GPU (Testing)\\
      \midrule 
      Sawtooth      & $5\times10^{-4}$  & $0$  & 50  & 100   & Quadro RTX 6000  & NVIDIA A100\\ 
      RBF           & $10^{-3}$  & $0$  & 50  & 100   & NVIDIA A100  & Quadro RTX 6000\\ 
      Mat\'ern 5/2  & $10^{-3}$  & $10^{-5}$  & 50  & 100   & Tesla T4  & NVIDIA A100\\
      Double Sine   & $10^{-3}$  & $10^{-5}$  & 50  & 100   & NVIDIA A100  & NVIDIA A100\\ 
      Circle        & $10^{-3}$  & $10^{-5}$  & 50  & 100   & Quadro RTX 6000  & NVIDIA A100\\
      Lissajous     & $10^{-3}$  & $10^{-5}$  & 50  & 100   & NVIDIA A100  & NVIDIA A100\\
      \bottomrule
    \end{tabular}}
\end{table*}

% %----------------------------------------------------------------
% %----------------------------------------------------------------
\subsection{Speed-Flow}\label{sec: supp-imp-detailes-speed-flow}
For the speed-flow data, $75\%$ of each lane's observations ($\approx 988$) are randomly selected for training and the rest are held out for testing. The batch size for both training and testing is 2 since we have data from 2 lanes. For the final evaluation of each method, we take the context and target sets to be the training and testing data, respectively. For training, we generate and save $10^4$ copies of the training data with random partitioning to context and target sets. This means that we fix the training curves as well as the evaluation data across all models as the dataset is quite small and does not require a lot of memory for storage.
% %----------------------------------------------------------------
\subsubsection{CNP}\label{sec: supp-CNP-imp-speed-flow}
Table~\ref{tab: supp-CNP-imp-detail-speed-flow} shows the encoder and decoder architectures used for implementing CNPs in the experiments on speed-flow data. Table~\ref{tab: supp-CNP-hyperparams-detail-speed-flow} summarizes the choice of optimization hyperparameters along with the GPU devices used for training and testing.

\begin{table*}[!h]
    \centering
    \caption{Architectural details of CNPs for experiments on speed-flow data.} \label{tab: supp-CNP-imp-detail-speed-flow}
    \scalebox{0.9}{
    \begin{tabular}{c|c|c}
      \toprule 
      Benchmark   & Encoder   & Decoder\\
      \midrule
        Speed-Flow   & $[2]\times[64, 64]\times[64]$   & $[65]\times[64, 64]\times[2]$ \\
      \bottomrule
    \end{tabular}}
\end{table*}

\begin{table*}[!h]
    \centering
    \caption{Hyper-parameters and GPU devices used for training and testing CNPs on speed-flow data.} \label{tab: supp-CNP-hyperparams-detail-speed-flow}
    \scalebox{0.9}{
    \begin{tabular}{c|c|c|c|c}
      \toprule 
      Benchmark   & Learning rate   & L2 regularizer  & GPU (Training)   & GPU (Testing)\\
      \midrule
      Speed-Flow    & $10^{-4}$  & $10^{-5}$   & Tesla T4  & Tesla T4\\
      \bottomrule
    \end{tabular}}
\end{table*}

% %----------------------------------------------------------------
\subsubsection{CANP}\label{sec: supp-CANP-imp-speed-flow}
Table \ref{tab: supp-CANP-imp-detail-speed-flow} contains details on the MLP architectures used for modeling the encoder and decoder modules in CANPs. Note that instead of passing raw context and target inputs as keys and queries to the attention modules, we first pass them through separate MLPs, namely the key encoder and query encoder, and then apply the attention mechanism to the obtained embedding. Here we work with the same 8-headed attention mechanism used in the official implementation. Table \ref{tab: supp-CANP-hyperparams-detail-speed-flow} summarizes the choice of optimization hyperparameters along with the GPU devices used for training and testing.

\begin{table*}[!h]
    \centering
    \caption{Architectural details of CANPs for experiments on speed-flow data.} \label{tab: supp-CANP-imp-detail-speed-flow}
    \scalebox{0.9}{
    \begin{tabular}{c|c|c|c|c}
      \toprule 
      Benchmark   & Context Encoder   & Key Encoder   & Query Encoder   & Decoder\\
      \midrule
        Speed-Flow   & $[2]\times[64, 64]\times[64]$   & $[1]\times[64]\times[64]$   & $[1]\times[64]\times[64]$   & $[65]\times[64, 64]\times[2]$ \\
      \bottomrule
    \end{tabular}}
\end{table*}

\begin{table*}[!h]
    \centering
    \caption{Hyper-parameters and GPU devices used for training and testing CANPs on speed-flow data.} \label{tab: supp-CANP-hyperparams-detail-speed-flow}
    \scalebox{0.9}{
    \begin{tabular}{c|c|c|c|c}
      \toprule 
      Benchmark   & Learning rate   & L2 regularizer  & GPU (Training)   & GPU (Testing)\\
      \midrule
      Speed-Flow    & $10^{-4}$  & $10^{-5}$   & Tesla T4  & Tesla T4
      \\
      \bottomrule
    \end{tabular}}
\end{table*}

% %----------------------------------------------------------------
\subsubsection{BNP}\label{sec: supp-BNP-imp-speed-flow}
The architecture details for different components of the BNPs including the encoder, adaptation layer, and decoder are provided in table \ref{tab: supp-BNP-imp-detail-speed-flow}. For all the benchmarks, we use $k=4$ and $k=50$ bootstrap contexts for training and testing, respectively. The choice of optimization hyperparameters along with the GPU devices used for training and testing are included in table \ref{tab: supp-BNP-hyperparams-detail-speed-flow}.

\begin{table*}[!h]
    \centering
    \caption{Architectural details of BNPs for experiments on speed-flow data.} \label{tab: supp-BNP-imp-detail-speed-flow}
    \scalebox{0.9}{
    \begin{tabular}{c|c|c|c|c|c|c}
      \toprule 
      Benchmark   & $d_x$   & $d_y$   & $d_h$   & $l_{pre}$   & $l_{post}$   & $l_{dec}$\\
      \midrule
        Speed-Flow   & $1$   & $1$   & $64$   & $4$   & $3$   & $4$ \\
      \bottomrule
    \end{tabular}}
\end{table*}

\begin{table*}[!h]
    \centering
    \caption{Hyper-parameters and GPU devices used for training and testing BNPs on speed-flow data.} \label{tab: supp-BNP-hyperparams-detail-speed-flow}
    \scalebox{0.9}{
    \begin{tabular}{c|c|c|c|c|c}
      \toprule 
      Benchmark   & Learning rate   & L2 regularizer  & Scheduler  & GPU (Training)   & GPU (Testing)\\
      \midrule 
      Speed-Flow    & $5\times10^{-4}$  & $10^{-5}$   & None  & Tesla T4  & Tesla T4
      \\
      \bottomrule
    \end{tabular}}
\end{table*}

% %----------------------------------------------------------------
\subsubsection{CQNP}\label{sec: supp-CQNP-imp-speed-flow}
The encoder and decoder architectures used for implementing CQNPs in the experiments on speed-flow data are shown in Table~\ref{tab: supp-CQNP-imp-detail-speed-flow}. Table~\ref{tab: supp-CQNP-hyperparams-detail-speed-flow} summarizes the choice of hyperparameters along with the GPU devices used for training and testing.

\begin{table*}[!h]
    \centering
    \caption{Architectural details of CQNPs for experiments on speed-flow data.} \label{tab: supp-CQNP-imp-detail-speed-flow}
    \scalebox{0.9}{
    \begin{tabular}{c|c|c}
      \toprule 
      Benchmark   & Context Encoder   & Decoder\\
      \midrule
        Speed-Flow   & $[2]\times[64, 64]\times[64]$   & $[66]\times[64, 64]\times[3]$ \\
      \bottomrule
    \end{tabular}}
\end{table*}

\begin{table*}[!h]
    \centering
    \caption{Hyper-parameters and GPU devices used for training and testing CQNPs on speed-flow data.} \label{tab: supp-CQNP-hyperparams-detail-speed-flow}
    \scalebox{0.9}{
    \begin{tabular}{c|c|c|c|c|c|c}
      \toprule 
      Benchmark   & Learning rate   & L2 regularizer   & $N_{\tau}$(Training)   & $N_{\tau}$(Testing)  & GPU (Training)   & GPU (Testing)\\
      \midrule
      Speed-Flow    & $5\times10^{-3}$  & $10^{-5}$  & 100  & 50   & Tesla T4  & Quadro RTX 6000
      \\
      \bottomrule
    \end{tabular}}
\end{table*}

% %----------------------------------------------------------------
\subsubsection{ACQNP}\label{sec: supp-ACQNP-imp-speed-flow}
Compared to CQNP, ACQNP has an additional component named the adaptation layer, which takes in the raw sample $u$ together with the context representation and target inputs and maps them to a new set of quantile levels $\tau$ that we eventually approximate. Note that this is different from the adaptation layer used in BNPs. Also, we apply a sigmoid function to the outputs of the adaptation layer to make sure that they correspond to valid quantile levels. The depth and width of the MLPs used for parameterizing the encoder, adaptation layer, and decoder in ACQNPs are presented in Table~\ref{tab: supp-ACQNP-imp-detail-speed-flow}. We summarize the choice of hyperparameters along with the GPU models used in training and testing in Table~\ref{tab: supp-ACQNP-hyperparams-detail-speed-flow}.

\begin{table*}[!h]
    \centering
    \caption{Architectural details of ACQNPs for experiments on speed-flow data.} \label{tab: supp-ACQNP-imp-detail-speed-flow}
    \scalebox{0.9}{
    \begin{tabular}{c|c|c|c}
      \toprule 
      Benchmark   & Context Encoder   & Adaptor    & Decoder\\
      \midrule
        Speed-Flow   & $[2]\times[64, 64]\times[64]$   & $[65]\times[64, 64]\times[1]$    & $[66]\times[64, 64]\times[3]$ \\
      \bottomrule
    \end{tabular}}
\end{table*}

\begin{table*}[!h]
    \centering
    \caption{Hyper-parameters and GPU devices used for training and testing ACQNPs on speed-flow data.} \label{tab: supp-ACQNP-hyperparams-detail-speed-flow}
    \scalebox{0.9}{
    \begin{tabular}{c|c|c|c|c|c|c}
      \toprule 
      Benchmark   & Learning rate   & L2 regularizer   & $N_{\tau}$ (Training)   & $N_{\tau}$ (Testing)  & GPU (Training)   & GPU (Testing)\\
      \midrule
      Speed-Flow    & $5\times10^{-3}$  & $10^{-5}$  & 100  & 50   & Tesla T4  & Quadro RTX 6000\\
      \bottomrule
    \end{tabular}}
\end{table*}

% %----------------------------------------------------------------
% %----------------------------------------------------------------
\subsection{Image completion}\label{sec: supp-imp-detailes-image-comp}
In image completion experiments on MNIST, Fashion-MNIST, SVHN, and Omniglot, we use the default train/test split of the data. For FreyFace, however, we randomly select $75\%$ of the images for training and keep the rest for testing. Similar to the experiments on synthetic data, the partitioning of image pixels into context and target sets is done randomly during training, i.e., the context and target sets are not fixed across different models and seeds. For evaluation, however, we save the batches generated from the test images. In the case of FreyFace, we repeat this process four more times so that each test image has five copies with different context/target splits in the stored evaluation batches.
Note that the number of context points is the same across the tasks in a batch but may change from one batch to another. The union of the context and target sets, which comprises all pixels of an image, has the same size in all cases because the image size is fixed within each dataset. All the models were trained for 100 epochs with 16 images per batch. The same batch size is used for testing.
% %----------------------------------------------------------------
\subsubsection{CNP}\label{sec: supp-CNP-imp-image-comp}
Table \ref{tab: supp-CNP-imp-detail-image-comp} shows the encoder and decoder architectures used for implementing CNPs in different benchmarks. Table \ref{tab: supp-CNP-hyperparams-detail-image-comp} summarizes the choice of optimization hyperparameters along with the GPU devices used for training and testing.

\begin{table*}[!h]
    \centering
    \caption{Architectural details of CNPs for image completion tasks.} \label{tab: supp-CNP-imp-detail-image-comp}
    \scalebox{0.9}{
    \begin{tabular}{c|c|c}
      \toprule 
      Benchmark   & Encoder   & Decoder\\
      \midrule
        \begin{tabular}{c} MNIST\\ Fashion MNIST \\ Omniglot \\ FreyFace  \end{tabular}   & $[3]\times[128, 128, 128]\times[128]$   & $[130]\times[128, 128, 128]\times[2]$ \\
      \midrule
        SVHN   & $[5]\times[128, 128, 128]\times[128]$   & $[130]\times[128, 128, 128]\times[6]$ \\
      \bottomrule
    \end{tabular}}
\end{table*}

\begin{table*}[!h]
    \centering
    \caption{Hyper-parameters and GPU devices used for training and testing CNPs on image completion tasks.} \label{tab: supp-CNP-hyperparams-detail-image-comp}
    \scalebox{0.9}{
    \begin{tabular}{c|c|c|c|c}
      \toprule 
      Benchmark   & Learning rate   & L2 regularizer  & GPU (Training)   & GPU (Testing)\\
      \midrule
      MNIST         & $5\times10^{-4}$  & $0$   & Quadro RTX 6000  & Quadro RTX 6000\\
      Fashion MNIST & $5\times10^{-4}$  & $0$   & Tesla T4  & Quadro RTX 6000\\
      Omniglot      & $5\times10^{-4}$  & $0$   & Quadro RTX 6000  & Quadro RTX 6000\\
      FreyFace      & $5\times10^{-4}$  & $0$   & Quadro RTX 6000  & NVIDIA A100\\
      SVHN          & $5\times10^{-4}$  & $0$   & Tesla T4  & Quadro RTX 6000
      \\
      \bottomrule
    \end{tabular}}
\end{table*}


% %----------------------------------------------------------------
\subsubsection{CANP}\label{sec: supp-CANP-imp-image-comp}
Table~\ref{tab: supp-CANP-imp-detail-image-comp} contains details on the MLP architectures used for modeling the encoder and decoder modules in CANPs. Note that instead of passing raw context and target inputs as keys and queries to the attention modules, we first pass them through separate MLPs, namely the key encoder and query encoder, and then apply attention to the obtained embeddings. Here we work with the same 8-headed attention mechanism used in the official implementation. Table~\ref{tab: supp-CANP-hyperparams-detail-image-comp} summarizes the choice of optimization hyperparameters along with the GPU devices used for training and testing.

\begin{table*}[!h]
    \centering
    \caption{Architectural details of CANPs for image completion tasks.} \label{tab: supp-CANP-imp-detail-image-comp}
    \scalebox{0.9}{
    \begin{tabular}{c|c|c|c|c}
      \toprule 
      Benchmark   & Context Encoder   & Key Encoder   & Query Encoder   & Decoder\\
      \midrule
        \begin{tabular}{c} MNIST\\ Fashion MNIST \\ Omniglot \\ FreyFace  \end{tabular}   & $[3]\times[128, 128, 128]\times[128]$   & $[1]\times[128]\times[128]$   & $[1]\times[128]\times[128]$   & $[130]\times[128, 128, 128]\times[2]$ \\
      \midrule
        SVHN   & $[5]\times[128, 128, 128]\times[128]$   & $[2]\times[128]\times[128]$   & $[2]\times[128]\times[128]$   & $[130]\times[128, 128, 128]\times[6]$ \\
      \bottomrule
    \end{tabular}}
\end{table*}

\begin{table*}[!h]
    \centering
    \caption{Hyper-parameters and GPU devices used for training and testing CANPs on image completion tasks.} \label{tab: supp-CANP-hyperparams-detail-image-comp}
    \scalebox{0.9}{
    \begin{tabular}{c|c|c|c|c}
      \toprule 
      Benchmark   & Learning rate   & L2 regularizer  & GPU (Training)   & GPU (Testing)\\
      \midrule
      MNIST         & $5\times10^{-4}$  & $0$   & Quadro RTX 6000  & Quadro RTX 6000\\
      Fashion MNIST & $5\times10^{-4}$  & $0$   & Quadro RTX 6000  & Quadro RTX 6000\\
      Omniglot      & $5\times10^{-4}$  & $0$   & Quadro RTX 6000  & Quadro RTX 6000\\
      FreyFace      & $5\times10^{-4}$  & $10^{-5}$   & Quadro RTX 6000  & NVIDIA A100\\
      SVHN          & $5\times10^{-4}$  & $0$   & Quadro RTX 6000  & NVIDIA A100
      \\
      \bottomrule
    \end{tabular}}
\end{table*}

% %----------------------------------------------------------------
\subsubsection{BNP}\label{sec: supp-BNP-imp-image-comp}
The architectural details for different components of the BNPs, including the encoder, adaptation layer, and decoder, are provided in Table~\ref{tab: supp-BNP-imp-detail-image-comp}. For all the benchmarks, we use $k=4$ and $k=50$ bootstrap contexts for training and testing, respectively. The choice of optimization hyperparameters along with the GPU devices used for training and testing is included in Table~\ref{tab: supp-BNP-hyperparams-detail-image-comp}.

\begin{table*}[!h]
    \centering
    \caption{Architectural details of BNPs for image completion tasks.} \label{tab: supp-BNP-imp-detail-image-comp}
    \scalebox{0.9}{
    \begin{tabular}{c|c|c|c|c|c|c}
      \toprule 
      Benchmark   & $d_x$   & $d_y$   & $d_h$   & $l_{pre}$   & $l_{post}$   & $l_{dec}$\\
      \midrule
        \begin{tabular}{c} MNIST\\ Fashion MNIST \\ Omniglot \\ FreyFace  \end{tabular}   & $2$   & $1$   & $128$   & $5$   & $3$   & $5$ \\
      \midrule
        SVHN   & $2$   & $3$   & $128$   & $5$   & $3$   & $5$ \\
      \bottomrule
    \end{tabular}}
\end{table*}

\begin{table*}[!h]
    \centering
    \caption{Hyper-parameters and GPU devices used for training and testing BNPs on image completion tasks.} \label{tab: supp-BNP-hyperparams-detail-image-comp}
    \scalebox{0.9}{
    \begin{tabular}{c|c|c|c|c|c}
      \toprule 
      Benchmark   & Learning rate   & L2 regularizer  & Scheduler  & GPU (Training)   & GPU (Testing)\\
      \midrule
      MNIST         & $5\times10^{-4}$  & $0$   & cosine annealing  & NVIDIA A100  & Quadro RTX 6000 \\
      Fashion MNIST & $5\times10^{-4}$  & $0$   & cosine annealing  & NVIDIA A100  & Quadro RTX 6000\\
      Omniglot      & $5\times10^{-4}$  & $0$   & cosine annealing  & Tesla T4  & Quadro RTX 6000\\
      FreyFace      & $5\times10^{-4}$  & $0$   & cosine annealing  & Quadro RTX 6000  & Quadro RTX 6000\\
      SVHN          & $5\times10^{-4}$  & $0$   & cosine annealing  & NVIDIA A100  & Quadro RTX 6000
      \\
      \bottomrule
    \end{tabular}}
\end{table*}

% %----------------------------------------------------------------
\subsubsection{CQNP}\label{sec: supp-CQNP-imp-image-comp}
The encoder and decoder architectures used for implementing CQNPs in different benchmarks are shown in Table~\ref{tab: supp-CQNP-imp-detail-image-comp}. Table~\ref{tab: supp-CQNP-hyperparams-detail-image-comp} summarizes the choice of hyperparameters along with the GPU devices used for training and testing.

\begin{table*}[!h]
    \centering
    \caption{Architectural details of CQNPs for image completion tasks.} \label{tab: supp-CQNP-imp-detail-image-comp}
    \scalebox{0.9}{
    \begin{tabular}{c|c|c}
      \toprule 
      Benchmark   & Context Encoder   & Decoder\\
      \midrule
        \begin{tabular}{c} MNIST\\ Fashion MNIST \\ Omniglot \\ FreyFace  \end{tabular}   & $[3]\times[128, 128, 128]\times[128]$   & $[131]\times[128, 128, 128]\times[3]$ \\
      \midrule
        SVHN   & $[5]\times[128, 128, 128]\times[128]$   & $[131]\times[128, 128, 128]\times[9]$ \\
      \bottomrule
    \end{tabular}}
\end{table*}

\begin{table*}[!h]
    \centering
    \caption{Hyper-parameters and GPU devices used for training and testing CQNPs on image completion tasks.} \label{tab: supp-CQNP-hyperparams-detail-image-comp}
    \scalebox{0.9}{
    \begin{tabular}{c|c|c|c|c|c|c}
      \toprule 
      Benchmark   & Learning rate   & L2 regularizer   & $N_{\tau}$ (Training)   & $N_{\tau}$ (Testing)  & GPU (Training)   & GPU (Testing)\\
      \midrule
      MNIST         & $5\times10^{-4}$  & $0$  & 25  & 50   & NVIDIA A100  & NVIDIA A100\\
      Fashion MNIST & $10^{-3}$  & $10^{-5}$  & 25  & 50   & NVIDIA A100  & NVIDIA A100\\
      Omniglot      & $10^{-3}$  & $10^{-5}$  & 25  & 50   & Quadro RTX 6000  & NVIDIA A100\\
      FreyFace      & $10^{-3}$  & $0$  & 25  & 50   & Quadro RTX 6000  & NVIDIA A100\\
      SVHN          & $10^{-3}$  & $10^{-5}$  & 25  & 50   & Quadro RTX 6000  & NVIDIA A100
      \\
      \bottomrule
    \end{tabular}}
\end{table*}

% %----------------------------------------------------------------
\subsubsection{ACQNP}\label{sec: supp-ACQNP-imp-image-comp}
Compared to CQNP, ACQNP has an additional component, the adaptation layer, which takes in the raw sample of $u$ together with the context representation and target inputs and maps them to a new set of quantile levels $\tau$ that we eventually approximate. Note that this is different from the adaptation layer used in BNPs. Also, we apply a sigmoid function to the outputs of the adaptation layer to ensure that they correspond to valid quantile levels. The depth and width of the MLPs used for parameterizing the encoder, adaptation layer, and decoder in ACQNPs are presented in Table~\ref{tab: supp-ACQNP-imp-detail-image-comp}.
We summarize the choice of hyperparameters along with the GPU models used in training and testing in Table~\ref{tab: supp-ACQNP-hyperparams-detail-image-comp}.

\begin{table*}[!h]
    \centering
    \caption{Architectural details of ACQNPs for image completion tasks.} \label{tab: supp-ACQNP-imp-detail-image-comp}
    \scalebox{0.9}{
    \begin{tabular}{c|c|c|c}
      \toprule 
      Benchmark   & Context Encoder   & Adaptor    & Decoder\\
      \midrule
        \begin{tabular}{c} MNIST\\ Fashion MNIST \\ Omniglot \\ FreyFace  \end{tabular}   & $[3]\times[128, 128, 128]\times[128]$   & $[129]\times[128, 128, 128, 128, 128]\times[1]$   & $[130]\times[128, 128, 128]\times[3]$ \\
      \midrule
        SVHN   & $[5]\times[128, 128, 128]\times[128]$   & $[129]\times[128, 128, 128, 128, 128]\times[1]$  & $[130]\times[128, 128, 128]\times[9]$ \\
      \bottomrule
    \end{tabular}}
\end{table*}

\begin{table*}[!h]
    \centering
    \caption{Hyper-parameters and GPU devices used for training and testing ACQNPs on image completion tasks.} \label{tab: supp-ACQNP-hyperparams-detail-image-comp}
    \scalebox{0.9}{
    \begin{tabular}{c|c|c|c|c|c|c}
      \toprule 
      Benchmark   & Learning rate   & L2 regularizer   & $N_{\tau}$ (Training)   & $N_{\tau}$ (Testing)  & GPU (Training)   & GPU (Testing)\\
      \midrule 
      MNIST         & $10^{-3}$  & $10^{-5}$  & 25  & 50   & NVIDIA A100  & NVIDIA A100\\
      Fashion MNIST & $10^{-3}$  & $10^{-5}$  & 25  & 50   & NVIDIA A100  & NVIDIA A100\\
      Omniglot      & $10^{-3}$  & $10^{-5}$  & 25  & 50   & NVIDIA A100  & NVIDIA A100\\
      FreyFace      & $10^{-3}$  & $10^{-5}$  & 25  & 50   & NVIDIA A100  & Tesla T4\\
      SVHN          & $10^{-3}$  & $10^{-5}$  & 25  & 50   & NVIDIA A100  & NVIDIA A100
      \\
      \bottomrule
    \end{tabular}}
\end{table*}

%----------------------------------------------------------------
\bibliography{mohseni_273-bibfile}

\end{document}
