% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
% For theorems and such
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage{color}
\usepackage{soul}
\usepackage{dsfont}
\usepackage{nccmath}
\usepackage{caption}
\usepackage{stfloats}
\usepackage{anyfontsize}
\input{mohseni_273-commands.tex}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Adaptive Conditional Quantile Neural Processes}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<peiman.mohseni@tamu.edu>?Subject=Adaptive Conditional Quantile Neural Processes}{Peiman Mohseni}{}}
\author[2]{Nick Duffield}
\author[3]{Bani Mallick}
\author[4]{Arman Hasanzadeh}


% Add affiliations after the authors
\affil[1]{%
    Computer Science and Engineering Department\\
    Texas A\&M University
}
\affil[2]{%
    Electrical and Computer Engineering Department\\
    Texas A\&M University
}
\affil[3]{%
    Statistics Department\\
    Texas A\&M University
}

\affil[4]{%
    Google Cloud
}
  
\begin{document}
\maketitle

\begin{abstract}\label{abstract}
Neural processes are a family of probabilistic models that inherit the flexibility of neural networks to parameterize stochastic processes. Despite providing well-calibrated predictions, especially in regression problems, and quick adaptation to new tasks, the Gaussian assumption that is commonly used to represent the predictive likelihood fails to capture more complicated distributions such as multimodal ones. To overcome this limitation, we propose Conditional Quantile Neural Processes (CQNPs), a new member of the neural processes family, which exploits the attractive properties of quantile regression in modeling the distributions irrespective of their form. By introducing an extension of quantile regression where the model learns to focus on estimating \emph{informative} quantiles, we show that the sampling efficiency and prediction accuracy can be further enhanced. Our experiments with real and synthetic datasets demonstrate substantial improvements in predictive performance compared to the baselines, and better modeling of heterogeneous distributions' characteristics such as multimodality.
\end{abstract}


\section{Introduction}\label{sec: intro}
Conventionally, regression problems are approached by modeling the relation between inputs and outputs with a deterministic function where the parameters of this function are optimized with respect to a loss function. Non-parametric statistical methods, however, choose a different perspective by viewing the regression function itself as a random object. This allows for fitting a family of functions that are coherent with the data instead of a single one. (Conditional) Neural Processes (C/NPs) \citep{CNP, NP} are a class of such models that inherit the computational efficiency of neural networks and integrate it with desirable properties of Gaussian Processes (GPs), specifically uncertainty quantification and rapid adaptation to new observations \citep{CNP, NP}. In fact, \citet{rudner2018connection} showed that under certain conditions, NPs recover GPs with deep kernels. 

NPs can be viewed as the composition of an encoder and a decoder where the encoder embeds a finite collection of observations $\mathcal{E}_{\mathrm{context}}=\{(x_i, y_i)\}_{i=1}^{N}$, also known as context set, into a latent space. Subsequently, the decoder takes in a new target location $x^*$ together with the latent representation of context data to parameterize the conditional distribution $p(\ry^*\,|\,x^*, \mathcal{E}_{\mathrm{context}})$ of the corresponding target output $y^*$. Several variants of NPs have been introduced. \citet{kim2018attentive, nguyen2022transformer, kim2022neural, guo2023versatile, feng2023latent} incorporate attention mechanisms to make NPs less prone to under-fitting. \citet{Gordon2020Convolutional, foong2020meta} build translation equivariance into NPs by introducing convolutional deep sets. \citet{holderrieth2021equivariant} further extend the translation equivariance to more complicated transformations such as rotations and reflections. \citet{volpp2021bayesian} improve context aggregation by casting it as a Bayesian inference problem. \citet{lee2020bootstrapping} use the bootstrap technique for inducing the functional uncertainty. \citet{GaussianNP, markou2022practical} propose Gaussian Neural Processes to model predictive correlations between different target locations. \citet{wang2020doubly} propose Doubly Stochastic Variational Neural Processes to capture target-specific local variation by adding local latent variables in a hierarchical encoder.

Despite their higher expressive power compared to vanilla C/NP, they fail to model multimodality that may be exhibited by the predictive distribution in real-world problems such as those in transportation science, economics, astronomy, and meteorology \citep{chen2016nonparametric}. 
This is due to the fact that existing models have a common Gaussian likelihood assumption.
To circumvent this issue, we propose to employ an infinite mixture of asymmetric Laplace distributions as the likelihood for C/NPs. This likelihood, which can be seen as an extension of the well-known quantile regression \citep{Koenker1978QR, koenker_2005_QRbook}, has been shown to be an effective approach for modeling heterogeneous distributions \citep{dabney2018implicit, Brando19UMAL}.
To further improve the expressive power of the model, we propose an adaptive extension of our approach where instead of fixing the quantile levels, the model learns to predict the quantiles that contribute more to the predictive likelihood which we will refer to as \emph{informative} quantiles.
We integrate our model with CNPs resulting in (Adaptive) Conditional Quantile Neural Processes (A/CQNPs) and conduct several experiments on both synthetic and real-world datasets to illustrate the performance enhancements achieved by our method. 
While, in this work, our focus is on CNPs, we emphasize that the simple, yet generic nature of the proposed approach allows for quick adaptation to other members of the NPs family.

\section{Preliminaries}\label{sec: prelim}
\subsection{Quantile regression}\label{subsec: quantile-regression}
In regression analysis, given a set of observations $\train = \{(x_i, y_i)\}_{i=1}^{N}$ from a pair of random variables $(\rvx, \ry)$, the objective is to learn a function $f$ that maps the inputs $x_i$ to the outputs $y_i$. 
We further assume that $\rvx$ and $\ry$ are vector and scalar random variables, respectively. In the case where $\rvy$ is a vector, we consider its scalar elements independently.
In decision theory, the optimality of an estimator is measured through its risk function $\mathbb{E}_{\train}[\mathcal{L}(y_i,\, f(x_i))]$ where $\mathcal{L}(\cdot\,,\,\cdot)$ is a loss function and an estimator with lower risk is preferred.
A common choice for the loss function is the mean squared error. It is well-known that for this loss, the estimator with minimum risk is $f(x) = \mathbb{E}[\ry \,|\, \rvx=x]$ which relates the inputs and outputs through the conditional mean \citep{casella2021statistical}.
However, in more complicated instances where $p(\ry \,|\, \rvx=x)$ is asymmetric, or multimodal, the conditional mean is not a sufficient statistic to summarize the distribution characteristics. This can be alleviated by using more robust statistics such as \emph{quantiles}. For $\tau \in (0, 1)$, the $\tau$-th conditional quantile $\mu_{\tau}(x) = \inf\, \left\{ \mu \,\,|\,\, p(\ry\leq \mu \,|\, \rvx=x)\geq \tau \right\}$ is obtained by minimizing the asymmetric absolute loss $\mathcal{L}_{\tau}(y_i, x_i) = \rho_{\tau}(y_i - \mu_{\tau}(x_i))$ where $\rho_{\tau}(y) = \max\, \left\{y\tau,\, y(\tau - 1)\right\}$. 
It is straightforward to show that minimizing $\gL_{\tau}$ is equivalent to maximizing the log-likelihood of an Asymmetric Laplace ($\gA L$) distribution \citep{YU-2001-BayesianQR} with a constant scale parameter. The density function of $\gA L$ distribution is defined as the following:
\begin{multline}\label{eq: al-distribution}
    \gA L\left(y\,|\, q_{\tau},\sigma_{\tau} ,\tau \right) = \\ \frac{\tau (1-\tau)}{\sigma_{\tau}}\times\exp{\left(-\frac{1}{\sigma_{\tau}}\rho_{\tau}(y - q_{\tau})\right)},
\end{multline}
where $q_{\tau} \in \mathbb{R}$, $\sigma_{\tau} \in \mathbb{R}_{>0}$, and $\tau \in (0, 1)$ are the location, scale, and skew parameters \citep{Yu-2005-3param}. Note that in vanilla quantile regression $\tau$ and $\sigma_{\tau}$ are fixed.

Like the mean, a single quantile may not be sufficient to model heterogeneous distributions. To address this, several works have proposed to predict a set of quantiles instead of a single one \citep{liu2009stepwise, liu2011simultaneous, Maxime2916Joint, dabney2018implicit, Brando19UMAL, brando2022deep}. Among them, \citet{Brando19UMAL} use an uncountable mixture of $\mathcal{A}L$ distributions to approximate the predictive distribution. More specifically, the predictive distribution is parameterized as follows:
\begin{multline}\label{eq: al-mixture}
    p(\ry \,|\, x) = \\ \mathbb{E}_{\tau \,\sim\, \mathcal{U}(0,1)}\left[\alpha_{\tau}(x) \, \mathcal{A}L\left(\ry \,|\, \mu_{\tau}(x), \sigma_{\tau}(x), \tau\right)\right],
\end{multline}
where $\mathcal{U}(0, 1)$ is the uniform distribution and $\alpha_{\tau}(x) \geq 0$ is the mixture coefficient such that $\mathbb{E}_{\tau \,\sim\, \mathcal{U}(0, 1)}[\alpha_{\tau}(x)] = 1$.
The parameters of this mixture distribution are estimated using deep neural networks.

\subsection{Conditional neural processes}\label{subsec: CNP}
Let $\mathcal{D} = \{(x_i, y_i) \in \sX \times \sY \}_{i=1}^N$ be a set of training observations corresponding to a realization of the following stochastic process; let $p(\rf)$ be a probability distribution over functions $f: \sX \rightarrow \sY$, then for $f \sim p(\rf)$, set $y_i = f(x_i)$ \citep{CNP}.
CNP is a conditional stochastic process $q_\upsilon$ (where $\upsilon$ is the set of parameters of $q$) that approximates the distribution over functions $p(\rf)$. In vanilla CNP, first, each pair $(x_i, y_i)$ of training observations (or a subset of them, i.e. \emph{context} set) is embedded into a latent space. Next, these embeddings are aggregated to form a representation, which is used with target inputs to predict the target outputs. We note that the aggregated representation is invariant to the ordering of context points. More specifically, given an encoder $\varphi_{\mathrm{enc}} \colon \sX \times \sY \rightarrow \sR^{d_e}$ and a decoder $\varphi_{\mathrm{dec}} \colon \sX \times \sR^{d_e} \rightarrow \Theta$, where $\sR^{d_e}$ is the embedding space and $\Theta$ is the set of parameters of the predictive distribution, CNP formulates the predictive distribution of $f(x^*)$ for a given target $x^*$ as:
\begin{equation}\label{eq: CNP-general-likelihood}
    p_\theta(f(x^*)\,|\, x^*, \mathcal{D}) = p_\theta\left(f(x^*) \,|\, \varphi_{\mathrm{dec}}\big(x^*, \varphi_{\mathrm{enc}} \left(x^*, \mathcal{D}\right)\big)\right).
\end{equation}
As mentioned earlier, different members of the CNPs family have different encoder and decoder architectures.
In the vast majority of CNP variants, the predictive distribution is chosen to be a simple Gaussian distribution resulting in:
\begin{equation}\label{eq: CNP-Gaussian-likelihood}
    \begin{split}
        p_\theta(f(x^*) \,|\, x^*, \mathcal{D})
        &= \mathcal{N}(f(x^*)\,|\,\varphi(x^*, \mathcal{D})),
    \end{split}
\end{equation}
where $\varphi(x^*, \mathcal{D}) = \{\varphi_{\mathrm{dec}}^{\mu}(x^*, \varphi_{\mathrm{enc}} (x^*, \mathcal{D})),\, \varphi_{\mathrm{dec}}^{\sigma}(x^*,$ $ \varphi_{\mathrm{enc}} (x^*, \mathcal{D}))\}$ is the set of functions mapping the embeddings and $x^*$ to the mean and standard deviation
of $y^*$. To the best of our knowledge, none of the existing CNPs are capable of modeling heterogeneous distributions such as multimodal ones.

\section{Method}\label{sec: method}

\subsection{Adaptive quantile regression} \label{subsec: adaptive-quantile-regression}
Although the uncountable mixture of $\gA L$s in \eqref{eq: al-mixture} gives us a comprehensive picture of the conditional distribution by fitting the full quantile function, it might lead to some practical inefficiencies that will be discussed below. For the rest of this discussion, we use $\mathcal{B}(c, \epsilon)$ to denote an open interval of length $2\epsilon>0$ and centered at $c$, i.e. $\mathcal{B}(c, \epsilon) = (c-\epsilon,\, c+\epsilon)$. 

Let's consider a case where there are two equally probable outcomes $y_1$ and $y_2$ ($y_1 \neq y_2$) for an input variable $x$. It is easy to verify that fitting the mixture distribution in \eqref{eq: al-mixture} will force $\gA L$ components with $\tau \in \mathcal{B}(0.5, \epsilon)$ ($0<\epsilon < |y_1 - y_2|/4 $) to settle around the median which is $\mu_{\tau}(x) = (y_1 + y_2)/2$. However, in this scenario, the median is not of high interest as we like our model to concentrate the probability density around $y_1$ and $y_2$. This example indicates that depending on the problem at hand, not all quantiles are equally important. 
To compensate for this, the mixture weights of $\gA L$ components corresponding to non-informative quantiles are expected to shrink to zero.
Theoretically, where we have infinite samples of $\tau$, this is not an issue, but in practice, the expectation in \eqref{eq: al-mixture} is approximated by a finite number of Monte Carlo samples. In other words, we can approximate the integral up to a certain precision which depends on the number of samples of $\tau$. Hence, it would be more efficient to avoid drawing samples of $\tau$ that correspond to non-informative quantiles. In our example, we like to sample $\tau$ such that $\mu_{\tau} \in \mathcal{B}(y_1, \epsilon) \cup \mathcal{B}(y_2, \epsilon)$. This will yield a more accurate approximation of $p(\ry\,|\,x)$ around $y_1$ and $y_2$ and prevent wasting computing resources.

% The second 
Another possible issue is regarding making point estimations. In most applications, the final stage involves reporting a set of values predicted by the model for a given input. In the case of using a symmetric unimodal distribution like Gaussian for representing $p(\ry\,|\,x)$, we usually report the distribution mean as it coincides with the mode. 
However, finding the modes of the uncountable mixture in \eqref{eq: al-mixture} is not always straightforward. A naive way to address this is by considering a set of uniformly sampled $\tau$ values, calculating their corresponding quantities $\{ \alpha_{\tau},\, \mu_{\tau},\, \sigma_{\tau}\}$, and finally selecting the most probable quantiles by comparing their mixture weights (or likelihoods). Similar to the previous case, it is quite likely that a small subset of quantiles is of interest. In case of having knowledge of these quantiles, we will be able to select better candidates as our final predictions with less effort which comes as a result of reducing the search space. 

Motivated by the above discussion, we propose using an adaptive set of quantiles $\mathcal{T}_x$ for each $x$ where the model learns to approximate the quantiles that are more significant in modeling $p(\ry\,|\,x)$. A simple approach for finding such a set incorporates replacing the non-informative uniform distribution $\mathcal{U}(0, 1)$ in \eqref{eq: al-mixture} with $q(\tau \,|\,x)$ (such that $\tau \in (0, 1)$) whose density is mostly concentrated around values corresponding to informative quantiles of $p(\ry\,|\,x)$. Note that the dependence of $q(\tau \,|\,x)$ on $x$ can be arbitrarily complex. Therefore, $\mathcal{T}_x$ can be viewed as samples from $q(\tau \,|\,x)$ and the problem of finding $\mathcal{T}_x$ changes to estimation of $q(\tau \,|\,x)$. Notice that conditioning on $x$ is crucial as the conditional distribution $p(\ry\,|\,x)$, and, hence, its quantiles (presumably) change at different inputs. Rewriting \eqref{eq: al-mixture} yields:
\begin{multline}\label{eq: al-mixture-with-posterior-of-tau}
    p(\ry \,|\, x) = \\ \mathbb{E}_{\tau \,\sim\, q(\tau \,|\,x)}\left[\alpha_{\tau}(x) \, \mathcal{A}L\left(\ry \,|\, \mu_{\tau}(x), \sigma_{\tau}(x), \tau\right)\right].
\end{multline}
The Monte Carlo approximation of this expectation only requires samples from $q(\tau \,|\,x)$. Hence, having an analytic probability density function for $q(\tau \,|\,x)$ is not necessary as long as samples can be drawn. Various density estimation techniques can be deployed to find $q(\tau \,|\,x)$. Considering the complicated nature of $q(\tau \,|\,x)$, we propose to approximate it with a reparameterizable implicit distribution \citep{Diggle84implicit, mohamed2016implicit}. This means that to sample from $q(\tau \,|\,x)$, we can first draw an auxiliary variable $u \sim \mathcal{U}(0, 1)$ and then set $\tau$ as a deterministic function $\psi \colon \sX \times (0,1) \rightarrow (0,1)$ of $x$ and $u$:
\begin{equation}\label{eq: implicit-distribution-def}
    u \sim \mathcal{U}(0, 1),\, \tau = \psi(x, u) \quad \equiv \quad \tau \sim q(\tau \,|\,x)
\end{equation}
When, for a fixed $x$, $\psi(x, u)$ is invertible w.r.t.\ $u$, $q(\tau \,|\,x)$ can be calculated by a simple application of the change of variable formula:
\begin{equation*}\label{eq: change-of-variable-formula}
    q(\tau \,|\,x) = \mathds{1}_{(0, 1)}(g_x^{-1}(\tau))\, \left\lvert \frac{d}{d\tau}\, g_x^{-1}(\tau) \right\rvert,
\end{equation*}
where $\tau=g_x(u)=\psi(x, u)$. However, this is generally not the case, and hence $q(\tau \,|\,x)$ is implicit. Using \eqref{eq: implicit-distribution-def}, we can approximate the conditional distribution in \eqref{eq: al-mixture-with-posterior-of-tau} as follows:
\begin{multline}\label{eq: al-mixture-with-adaptive-quantiles}
    p(\ry \,|\, x) \approx \mathbb{E}_{u \,\sim\, \mathcal{U}(0, 1)}\big[\alpha_{\psi(x, u)}(x)\\ \qquad\quad\mathcal{A}L(\ry\,|\,\mu_{\psi(x, u)}(x), \sigma_{\psi(x, u)}(x), \psi(x, u))\big],
\end{multline}
where $\psi$ is a fully-connected neural network. The high expressive power of neural networks allows $q(\tau \,|\,x)$ to be highly flexible and capture the dependencies between the elements of $x$ and $u$.

\subsection{Conditional Quantile Neural Processes} \label{subsec: CQNP}
Despite the attractive properties of likelihood-based models, their expressive power is highly impacted by the form of conditional distribution. Inherently, CNPs with Gaussian likelihood struggle to model more complicated distributions. We remedy this by adapting the predictive distribution in \eqref{eq: CNP-general-likelihood} to the compound distribution in \eqref{eq: al-mixture-with-adaptive-quantiles}. This requires augmenting the domain of $\varphi_{\mathrm{dec}}$ and $\psi$ as demonstrated below:
\begin{equation*}
    \begin{split}
        \psi \colon \sX \times (0,1) \rightarrow (0,1) \quad &\text{to} \quad \psi \colon \sX \times \sR^{d_e} \times (0,1) \rightarrow \mathcal{T}_{\sX} \\
        \varphi_{\text{dec}} \colon \sX \times \sR^{d_e} \rightarrow \Theta \quad &\text{to} \quad \varphi_{\text{dec}} \colon \sX \times \sR^{d_e} \times \mathcal{T}_{\sX} \rightarrow \Theta
    \end{split}
\end{equation*}
Putting all pieces together, the predictive distribution of $f(x^*)$ for a given target input location $x^*$ would be:
\begin{multline}\label{eq: ACQNP-liklihood}
    p(f(x^*)\,|\,x^*, \mathcal{D}) =
    \mathbb{E}_{u \sim \mathcal{U}(0, 1)}[\alpha_{\tau}(x^*, \mathcal{D})\\ \times \mathcal{A}L (f(x^*)\,|\,\mu_{\tau}(x^*, \mathcal{D}), \sigma_{\tau}(x^*, \mathcal{D}), \tau)]
\end{multline}
where
\begin{align*}
    \tau=\psi(x^*, \varphi_{\mathrm{enc}}(x^*, \mathcal{D}), u) \,\,\,\, & \\
    \{\alpha_{\tau} (x^*, \mathcal{D}) ,\, \mu_{\tau} (x^*, \mathcal{D}) ,\, \sigma_{\tau} &(x^*, \mathcal{D})\} = \\ &\varphi_{\mathrm{dec}}(x^*, \varphi_{\mathrm{enc}}(x^*, \mathcal{D}), \tau).
\end{align*}
We refer to this model as Adaptive Conditional Quantile Neural Process (ACQNP). In the case where $\tau = u$, the resulting model is Conditional Quantile Neural Process (CQNP). The expectation in \eqref{eq: ACQNP-liklihood} is approximated by drawing $N_{\tau}$ Monte Carlo samples from $\mathcal{U}(0, 1)$. Unlike \cite{Brando19UMAL}, we avoid posing a uniform distribution over the mixing weights $\alpha_{\tau}$. Instead, we normalize them using a SoftMax function to have a valid convex combination of $\gA L$ distributions. The final likelihood can be expressed as follows:
\begin{multline}\label{eq: ACQNP-likelihood-approximation}
    p(f(x^*)\,|\,x^*, \mathcal{D}) \approx 
    \sum_{k=1}^{N_{\tau}} \big[\frac{e^{\alpha_{\tau_k}(x^*, \mathcal{D})}}{\sum_{j=1}^{N_{\tau}} e^{\alpha_{\tau_j}(x^*, \mathcal{D})}}\, \\ \times \mathcal{A}L (f(x^*)\,|\,\mu_{\tau_k}(x^*, \mathcal{D}), \sigma_{\tau_k}(x^*, \mathcal{D}), \tau_k)\big].
\end{multline}
Since CNPs and A/CQNPs use the same architectural design for computing the context representation, the computational complexity imposed by the encoder, i.e.\ $\mathcal{O}(\varphi_{\mathrm{enc}})$, remains unchanged. However, the computational complexity of the decoder which comes from estimating $N_\tau$ quantiles at each input location is raised from $\mathcal{O}(\varphi_{\mathrm{dec}})$ to $\mathcal{O}(N_{\tau}\varphi_{\mathrm{dec}})$ resulting in the overall complexity of $\mathcal{O}(\varphi_{\mathrm{enc}} + N_{\tau}\varphi_{\mathrm{dec}})$. This extra computation can be done in parallel as different quantiles are estimated independently and mixed in the final stage.

\begin{figure*}[!t]
	\centering
        \scalebox{0.9}{
            \begingroup
            \setlength{\tabcolsep}{-0.5pt}
            \begin{tabular}{c@{\hskip -0.2pt}ccccc}
                \raisebox{4.8\normalbaselineskip}[0pt][0pt]{\rotatebox[origin=c]{90}{Double Sine}} &
                \includegraphics[width=0.22\textwidth]{figures/double-sine/double-sine-data} & 
	          \includegraphics[width=0.22\textwidth]{figures/double-sine/ACQNP/double-sine-AL-loc-ACQNP} &
                \includegraphics[width=0.22\textwidth]{figures/double-sine/CQNP/double-sine-AL-loc-CQNP} &
                \includegraphics[width=0.22\textwidth]{figures/double-sine/BNP/double-sine-normal-mean-BNP} &
                \includegraphics[width=0.22\textwidth]{figures/double-sine/CANP/double-sine-normal-mean-CANP} 
                \\
                \raisebox{4.6\normalbaselineskip}[0pt][0pt]{\rotatebox[origin=c]{90}{Circle}} &
                \includegraphics[width=0.22\textwidth]{figures/circle/circle-data} & 
	          \includegraphics[width=0.22\textwidth]{figures/circle/ACQNP/circle-AL-loc-ACQNP} &
                \includegraphics[width=0.22\textwidth]{figures/circle/CQNP/circle-AL-loc-CQNP} &
                \includegraphics[width=0.22\textwidth]{figures/circle/BNP/circle-normal-mean-BNP} &
                \includegraphics[width=0.22\textwidth]{figures/circle/CANP/circle-normal-mean-CANP}
                \\
                \raisebox{4.8\normalbaselineskip}[0pt][0pt]{\rotatebox[origin=c]{90}{Lissajous}} &
                \includegraphics[width=0.22\textwidth]{figures/lissajous/lissajous-data} & 
	          \includegraphics[width=0.22\textwidth]{figures/lissajous/ACQNP/lissajous-AL-loc-ACQNP} &
                \includegraphics[width=0.22\textwidth]{figures/lissajous/CQNP/lissajous-AL-loc-CQNP} &
                \includegraphics[width=0.22\textwidth]{figures/lissajous/BNP/lissajous-normal-mean-BNP} &
                \includegraphics[width=0.22\textwidth]{figures/lissajous/CANP/lissajous-normal-mean-CANP} \\
                &a) Data & b) ACQNP & c) CQNP & d) BNP & e) CANP
            \end{tabular}
            \endgroup
        }
	\caption{Examples of predictions made by different methods on synthetic datasets. For A/CQNP, 10 randomly chosen conditional quantiles of $p(\ry \,|\,x)$ at each input location $x$ are plotted. For BNP, we plot the conditional means of the Gaussian predictive distributions obtained from 20 different sets of bootstrap contexts. Similarly, we plot the mean of CANP's conditional distribution as its predictions.}
        \label{fig: 1d-synthetic-benchmark}
\end{figure*}

\begin{table*}[!bp]
	% \vspace*{5pt}
	\centering
        \caption{Synthetic processes used in multimodal 1D regression experiments.}
        \label{table: 1d-benchmark-generative-processes}
        \scalebox{0.9}{
	{\setlength{\tabcolsep}{1.2pt}
	\begin{tabular}{lc} \toprule
		Process          & $g(s)=(g_x(s),\, g_y(s))$ \\ \midrule
		Double Sine   & $\quad g_x(s)=s\, , \, g_{y}(s) = \alpha_{1} \sin{(\omega_{1} s)} \, \mathds{1}_{(0, 0.5)}(p(s))+ \alpha_{2} \cos{(\omega_{2} s)}\,\mathds{1}_{[0.5, 1)}(p(s))$ \vspace*{5pt} \\ 
		Circle        & $\quad g_x(s)=\alpha \cos(s)+\delta \, , \, g_{y}(s) = \alpha\sin(s)+\delta$ \vspace*{5pt} \\
		Lissajous     & $\quad g_x(s)=\alpha_{1} \sin(\omega s + \delta) \, , \, g_{y}(s) = \alpha_{2} \sin(s)$ \vspace*{1pt}\\ \bottomrule
	\end{tabular}}
 }
\end{table*}

\section{Experiments}\label{sec: experiments}
We evaluate our proposed framework on one- and two-dimensional regression tasks. Note that unlike other members of the NPs family that focus on building more expressive encoder-decoder blocks, our work is primarily concerned with the form of conditional likelihood and its effect on the model's performance. Nonetheless, we compare our A/CQNP with Conditional Attentive Neural Process (CANP) \citep{kim2018attentive} and Bootstrapping Neural Process (BNP) \citep{lee2020bootstrapping} as our baselines to provide a better overview of performance gains obtained by modifying different components of vanilla CNP.
Note that, although NPs share fundamental properties with GPs, they usually are not compared directly because of the different training regimes \citep{kim2018attentive}. While NPs are trained on different functions sampled from the underlying generative process, GPs are fit to observations corresponding to one realization of the process.
In order to compare the goodness of fit across different methods, we report the  log-likelihood on context and target data separately. Methods with higher context log-likelihood offer better reconstructions of context points and hence, are less prone to under-fitting, while higher target log-likelihood indicates more accurate predictions \citep{kim2018attentive, lee2020bootstrapping}. 
Detailed information on model architectures, training, and testing procedures is included in the supplementary materials.\footnote{Code at {\url{https://github.com/peiman-m/ACQNP}}}

\subsection{Synthetic Data}\label{subsec: synthetic-data-results}
We start our study by examining each model on several synthetically generated datasets. Each collection consists of a handful of functions sampled from a known stochastic process. In each iteration, a batch of $n_b$ functions $\mathcal{G} = \{g_k\}_{k=1}^{n_b}$ is sampled from a stochastic process such that $g_k \colon \mathbb{R} \rightarrow \sX \times \sY$ and $g_k(s) = (g_{k, x}(s),\, g_{k, y}(s))$. For each $g_k$, a set $\mathcal{S}_k$ of $N_{\mathrm{total}}$ random input locations is chosen where $\mathcal{S}_k = \{ s_{k,l} \}_{l=1}^{N_{\mathrm{total}}}$ and $s_{k, l} \sim \mathcal{U}[I_\mathrm{min},\, I_\mathrm{max}]$ ($I_\mathrm{min}$ and $I_\mathrm{max}$ are fixed constants). Applying $g_k$ to the corresponding $\mathcal{S}_k$ will yield a collection of pairs $\mathcal{E}_k = \{(x_{k, l},\, y_{k, l})\}_{l=1}^{N_{\mathrm{total}}}$ where $(x_{k, l},\, y_{k, l})=(g_{k, x}(s_{k, l}),\, g_{k, y}(s_{k, l}))$. By repeating this process for each $k$, we end up with a hierarchical dataset\footnote{A hierarchical dataset is a collection of observations from many functions sharing some underlying characteristics.} $\mathcal{E} = \{\mathcal{E}_k\}_{k=1}^{n_b}$ \citep{CNP, NP}. The variable $s$ is discarded after sampling $\mathcal{E}_k$. In the course of training, the total number of data points $N_{\mathrm{total}}$ is randomly chosen for each batch such that $N_{\mathrm{total}} \sim \mathcal{U}[6, 100]$. Lastly, each $\mathcal{E}_k$ is split into context and target sets by choosing a random index $N_{\mathrm{context}} \sim \mathcal{U}[3, N_{\mathrm{total}}-3]$ and setting $\mathcal{E}_{k, {\mathrm{context}}} = \{(x_{k, l},\, y_{k, l})\}_{l=1}^{N_{\mathrm{context}}}$ and $\mathcal{E}_{k, {\mathrm{target}}} = \{(x_{k, l},\, y_{k, l})\}_{l=N_{\mathrm{context}}+1}^{N_{\mathrm{total}}}$.
During testing, however, we fix $N_{\mathrm{total}}=500$ and select $N_{\mathrm{context}} \sim \mathcal{U}[3, 100]$. Each method is trained and tested over $10^5$ and $10^3$ batches with $n_b=128$ and $n_b=16$, respectively. Note that, unlike training data which is generated in each iteration of training, we fix the testing data across different models by generating them in advance. We consider data arising from the three stochastic processes described in table \ref{table: 1d-benchmark-generative-processes} with the following choice of parameters:
\begin{itemize}
  \item Double-Sine: $s \sim \mathcal{U}[-2,\,2)$, $\alpha_{1}, \alpha_{2} \sim \mathcal{U}[0.5, 1.5)$, $\omega_{1}, \omega_{2} \sim \mathcal{U}[1, 3)$ and $p(s) \sim \mathcal{U}[0, 1)$
  \item Circle: $s \sim \mathcal{U}[-\pi,\,\pi)$, $\alpha \sim \mathcal{U}[0.5, 1.5)$, $\delta \sim \mathcal{U}[-0.5, 0.5)$
  \item Lissajous: $s \sim \mathcal{U}[-\pi,\,\pi)$, $\alpha_{1}, \alpha_{2} \sim \mathcal{U}[1, 2)$, $\omega \sim \mathcal{U}[0.5, 2)$, $\delta \sim \mathcal{U}[0, 2)$
\end{itemize}
Table \ref{tabel: 1d-benchmark-synthetic-results} summarizes the predictive log-likelihood of different methods over testing data. We see that A/CQNP consistently outperforms baselines despite using the vanilla deterministic encoder in CNP which: 1) does not incorporate any latent variable for capturing the functional uncertainty and correlations as in BNP, and 2) does not enjoy the expressive context representations provided by the attention mechanism used in CANP. As depicted in figure \ref{fig: 1d-synthetic-benchmark}, CANP fails to capture multimodality and instead tends to make predictions that resemble the average of modes. This happens less severely with BNP. Nonetheless, the sampled curves bounce between the modes which results in unnecessarily wide prediction bands. CQNP, on the other hand, provides decent fits which are further polished by incorporating the quantile adaptation structure in ACQNP.
Figure \ref{fig: 1d-benchmark-tau-distribution} illustrates the quantile levels $\tau$ that were adapted by ACQNP for predicting the quantiles shown in figure \ref{fig: 1d-synthetic-benchmark}b. We see that ACQNP behaves in line with our motivations behind adaptive quantile regression discussed in section \ref{subsec: adaptive-quantile-regression}. The distribution of $\tau$ imitates the distribution of the data, in the sense that $p(\ry \,|\, x, \mathcal{D})$ and $p(\tau \,|\,x, \mathcal{D})$ have the same number of modes almost everywhere. Additional experimental results on unimodal regression tasks are provided in the supplements.
\begin{figure*}[!tp]
	\centering
        \scalebox{0.85}{
            \begingroup
            \setlength{\tabcolsep}{-0.5pt}
            \begin{tabular}{ccc} 
	          \includegraphics[width=0.33\textwidth]{figures/double-sine/ACQNP/double-sine-AL-tau-ACQNP} & 
	          \includegraphics[width=0.33\textwidth]{figures/circle/ACQNP/circle-AL-tau-ACQNP} & 
	          \includegraphics[width=0.33\textwidth]{figures/lissajous/ACQNP/lissajous-AL-tau-ACQNP} \\
                a) Double Sine  & b) Circle  & c) Lissajous
            \end{tabular}
            \endgroup
        }
	\caption{Distribution of the adapted $\tau$ levels corresponding to the estimated quantiles in figure \ref{fig: 1d-synthetic-benchmark}b after applying the transformation $\psi$ in equation \ref{eq: ACQNP-liklihood}.}
        \label{fig: 1d-benchmark-tau-distribution}
\end{figure*}

\begin{table*}[!tp]\centering
    \caption{Comparison of predictive log-likelihood obtained by different methods over synthetically generated tasks ($6$ Seeds).}
    \label{tabel: 1d-benchmark-synthetic-results}
    \scalebox{0.9}{
        \begin{tabular}{@{}l cc cc cc@{}}
        \toprule
            & \multicolumn{2}{c}{Double Sine}      & \multicolumn{2}{c}{Circle}      & \multicolumn{2}{c}{Lissajous} \\
            \cmidrule[0.2pt]{2-7}  
            & context & target      & context & target      & context & target \\
            \midrule
            CNP         & ${-0.195_{\pm0.009}}$ & ${-0.520_{\pm0.019}}$       & ${-2.086_{\pm0.204}}$ & ${-2.387_{\pm0.216}}$       & ${-2.212_{\pm0.146}}$ & ${-2.962_{\pm0.165}}$ \vspace*{2pt}\\
            CANP        & ${0.436_{\pm0.236}}$ & ${-1.742_{\pm0.222}}$       & ${-0.272_{\pm0.043}}$ & ${-1.685_{\pm0.082}}$       & ${-1.112_{\pm0.499}}$ & ${-2.151_{\pm0.248}}$ \vspace*{2pt}\\
            BNP         & ${0.330_{\pm0.010}}$ & ${0.134_{\pm0.017}}$       & ${0.150_{\pm0.010}}$ & ${0.065_{\pm0.009}}$       & ${-0.314_{\pm0.011}}$ & ${-0.434_{\pm0.010}}$ \vspace*{2pt}\\
            CQNP(ours)  & ${\bm{1.448}_{\pm0.042}}$ & ${\bm{1.244}_{\pm0.049}}$       & ${\bm{2.047}_{\pm0.076}}$ & ${\bm{1.932}_{\pm0.080}}$       & ${\bm{0.798}_{\pm0.020}}$ & ${\bm{0.508}_{\pm0.021}}$ \vspace*{2pt}\\
            ACQNP(ours) & ${\bm{1.582}_{\pm0.108}}$ & ${\bm{1.349}_{\pm0.098}}$       & ${\bm{2.118}_{\pm0.059}}$ & ${\bm{2.028}_{\pm0.057}}$       & ${\bm{0.929}_{\pm0.038}}$ & ${\bm{0.634}_{\pm0.034}}$ \\
        \bottomrule 
    \end{tabular}}
\end{table*}

\subsection{Speed-Flow Data}\label{subsec: speed-flow-experiment}
\begin{figure*}[!h]
	\centering
        \scalebox{0.9}{
            \begingroup
            \setlength{\tabcolsep}{-1.5pt}
            \begin{tabular}{c@{\hskip 2pt}ccccc}
                \raisebox{4.4\normalbaselineskip}[0pt][0pt]{\rotatebox[origin=c]{90}{Lane 2}} &
                \includegraphics[width=0.22\textwidth]{figures/speed-flow/speedFlow-lane2-data} & 
	          \includegraphics[width=0.22\textwidth]{figures/speed-flow/ACQNP/speedFlow-lane2-AL-loc-ACQNP} &
                \includegraphics[width=0.22\textwidth]{figures/speed-flow/CQNP/speedFlow-lane2-AL-loc-CQNP} &
                \includegraphics[width=0.22\textwidth]{figures/speed-flow/BNP/speedFlow-lane2-normal-quantile-BNP} &
                \includegraphics[width=0.22\textwidth]{figures/speed-flow/CANP/speedFlow-lane2-normal-quantiles-CANP} 
                \\
                \raisebox{4.4\normalbaselineskip}[0pt][0pt]{\rotatebox[origin=c]{90}{Lane 3}} &
                \includegraphics[width=0.22\textwidth]{figures/speed-flow/speedFlow-lane3-data} & 
	          \includegraphics[width=0.22\textwidth]{figures/speed-flow/ACQNP/speedFlow-lane3-AL-loc-ACQNP} &
                \includegraphics[width=0.22\textwidth]{figures/speed-flow/CQNP/speedFlow-lane3-AL-loc-CQNP} &
                \includegraphics[width=0.22\textwidth]{figures/speed-flow/BNP/speedFlow-lane3-normal-quantile-BNP} &
                \includegraphics[width=0.22\textwidth]{figures/speed-flow/CANP/speedFlow-lane3-normal-quantiles-CANP} 
                \\
                &a) Data & b) ACQNP & c) CQNP & d) BNP & e) CANP
            \end{tabular}
            \endgroup
        }
	\caption{First column from left includes the ground truth data. Second and third columns show the quantiles of the predictive distribution of A/CQNP at 10 different levels. Fourth column shows the ensembled quantiles at the same levels for 5 bootstrap context sets. Last column shows the conditional quantiles of CANP at the same levels.}
        \label{fig: speed-flow}
\end{figure*}
The problem of traffic speed prediction has been widely investigated in transportation science with applications in approximating the expected arrival time \citep{Einbeck2006ModalRegression}. A key factor in such models is the traffic flow which is usually represented by speed-flow diagrams. As a case study, we consider the speed-flow data collected by \citet{PETTY199671}. The dataset contains the speed-flow diagrams of lanes 2 and 3 of a four-lane Californian freeway with 1318 measurements for each lane (Figure \ref{fig: speed-flow}a) and is included in the R-package \texttt{hdrcde} \citep{hdrcde}. This collection can be viewed as an example of a hierarchical dataset where observations from different lanes correspond to different realizations of a random process. The hierarchical structure of this data makes it an ideal fit for NPs as they allow for sharing information among different lanes.
We randomly select $75\%$ of each lane's observations ($\approx 988$) for training and use the rest for testing. The speed and flow values are both scaled to $[0, 1]$.
In each training iteration, we split the training data into context and target sets such that $N_{\mathrm{context}} \sim \mathcal{U}[500, 985]$. For testing, however, we take the context and target sets to be the training and testing data, respectively. Table \ref{tabel: speed-flow-benchmark-results} provides a quantitative comparison of different approaches, with A/CQNP being the best-performing model on this real-world task. As shown in figure \ref{fig: speed-flow}, the Gaussian predictive distributions in BNP and CANP, which resemble the conditional mean regression, lead to wide prediction bands compared to A/CQNP. This is due to the sensitivity of mean estimators in dealing with the less dense cloud of data points at the bottom which can be interpreted as outliers \citep{Feng2020ModalReg}. 
In contrast to the mean which acts on a global level, the local nature of quantiles makes them more robust to the tail behavior.

\begin{table}[!t]\centering
    \caption{Context and target log-likelihoods from experiments on speed-flow data ($6$ Seeds).}
    \label{tabel: speed-flow-benchmark-results}
    \scalebox{0.9}{
        \begin{tabular}{@{}l cc@{}}
        \toprule
            & \multicolumn{2}{c}{Speed-Flow} \\
            \cmidrule[0.2pt]{2-3}  
            & context & target \\
            \midrule
            CNP         & ${0.845_{\pm0.010}}$ & ${0.719_{\pm0.002}}$ \vspace*{2pt}\\
            CANP        & ${0.887_{\pm0.010}}$ & ${0.741_{\pm0.014}}$  \vspace*{2pt}\\
            BNP         & ${0.879_{\pm0.005}}$ & ${0.720_{\pm0.015}}$   \vspace*{2pt}\\
            CQNP(ours)  & ${\bm{1.518}_{\pm0.013}}$ & ${\bm{1.495}_{\pm0.007}}$  \vspace*{2pt}\\
            ACQNP(ours) & ${\bm{1.544}_{\pm0.001}}$ & ${\bm{1.507}_{\pm0.006}}$  \\
        \bottomrule 
    \end{tabular}}
\end{table}

\begin{table*}[!bp]\centering
    \caption{Context and target log-likelihoods on 2D regression tasks ($6$ Seeds).}
    \label{tabel: 2d-benchmark-image-completion-results}
    \scalebox{0.9}{
    \begin{tabular}{@{}l cc cc cc cc cc@{}}
        \toprule
        & \multicolumn{2}{c}{MNIST}      & \multicolumn{2}{c}{FashionMNIST}      & \multicolumn{2}{c}{SVHN}      & \multicolumn{2}{c}{Omniglot}      & \multicolumn{2}{c}{FreyFace} \\
        \cmidrule[0.2pt]{2-11}
        & context & target      & context & target       & context & target      & context & target      & context & target \\
        \midrule
        CNP         & $\underset{(\pm0.006)}{1.061}$ & $\underset{(\pm0.001)}{0.938}$       & $\underset{(\pm0.005)}{0.963}$ & $\underset{(\pm0.004)}{0.872}$       & $\underset{(\pm0.014)}{3.554}$ & $\underset{(\pm0.013)}{3.388}$       & $\underset{(\pm0.004)}{0.978}$ & $\underset{(\pm0.009)}{0.874}$       & $\underset{(\pm0.083)}{0.970}$ & $\underset{(\pm0.088)}{0.941}$ \vspace*{2pt}\\
        CANP        & $\underset{(\pm0.003)}{1.350}$ & $\underset{(\pm0.006)}{0.913}$       & $\underset{(\pm0.007)}{1.226}$ & $\underset{(\pm0.024)}{0.857}$       & $\underset{(\pm0.002)}{4.112}$ & $\underset{(\pm0.016)}{3.715}$       & $\underset{(\pm0.005)}{1.366}$ & $\underset{(\pm0.004)}{0.974}$       & $\underset{(\pm0.053)}{1.062}$ & $\underset{(\pm0.023)}{1.015}$ \vspace*{2pt}\\
        BNP         & $\underset{(\pm0.013)}{1.128}$ & $\underset{(\pm0.009)}{1.061}$       & $\underset{(\pm0.002)}{1.039}$ & $\underset{(\pm0.001)}{0.971}$       & $\underset{(\pm0.008)}{3.679}$ & $\underset{(\pm0.007)}{3.580}$       & $\underset{(\pm0.006)}{0.983}$ & $\underset{(\pm0.004)}{0.950}$       & $\underset{(\pm0.015)}{1.073}$ & $\underset{(\pm0.014)}{1.052}$ \vspace*{2pt}\\
        CQNP(ours)  & $\underset{(\pm0.006)}{\bm{2.683}}$ & $\underset{(\pm0.006)}{\bm{2.609}}$       & $\underset{(\pm0.004)}{\bm{2.012}}$ & $\underset{(\pm0.002)}{\bm{1.932}}$       & $\underset{(\pm0.033)}{\bm{4.725}}$ & $\underset{(\pm0.029)}{\bm{4.447}}$       & $\underset{(\pm0.034)}{\bm{2.529}}$ & $\underset{(\pm0.038)}{\bm{2.471}}$       & $\underset{(\pm0.062)}{\bm{1.433}}$ & $\underset{(\pm0.061)}{\bm{1.392}}$ \vspace*{2pt}\\
        ACQNP(ours) & $\underset{(\pm0.011)}{\bm{2.681}}$ & $\underset{(\pm0.013)}{\bm{2.616}}$       & $\underset{(\pm0.016)}{\bm{2.040}}$ & $\underset{(\pm0.015)}{\bm{1.954}}$       & $\underset{(\pm0.042)}{\bm{4.959}}$ & $\underset{(\pm0.034)}{\bm{4.651}}$       & $\underset{(\pm0.009)}{\bm{2.516}}$ & $\underset{(\pm0.007)}{\bm{2.461}}$       & $\underset{(\pm0.089)}{\bm{1.522}}$ & $\underset{(\pm0.086)}{\bm{1.475}}$ \\
        \bottomrule 
    \end{tabular}}
\end{table*}

\subsection{Image Completion}\label{subsec: image-completion-experiment}
A collection of images can also be thought of as a hierarchical dataset where each image is a realization of some random process mapping 2D pixel coordinates to pixel intensities. Motivated by this observation, image inpainting can be framed as a regression problem where conditioned on a set of observed pixels, we are interested in filling the missing regions of the image.
MNIST \citep{lecun1998gradient-MNIST}, Fashion-MNIST \citep{Fashion-MNIST}, SVHN \citep{netzer-2011-SVHN}, Omniglot \citep{omniglot} (resized to $32 \times 32$) and FreyFace \citep{roweis2001global-Frey} are the datasets that we consider here. Except for FreyFace, we use the default train/test split used by the publishers. For FreyFace, we randomly select $75\%$ of the images for training and keep the rest for testing. For all the benchmarks, the pixel values and pixel coordinates are rescaled to $[0, 1]$ and $[-1,\, 1]$, respectively.
In each case, we take $\mathcal{E}_{k}$ as the set of all image pixels. Similar to section \ref{subsec: synthetic-data-results}, the context and target sets in both training and testing are chosen such that $N_{\mathrm{context}} \sim \mathcal{U}[3, N_{\mathrm{total}}/2)$ where $N_{\mathrm{total}}$ is the number of image pixels. Table \ref{tabel: 2d-benchmark-image-completion-results} shows that A/CQNP substantially outperforms the baselines across all datasets in terms of predictive log-likelihood. This holds for both context and target sets revealing that A/CQNP yields better reconstruction of context data. Moreover, results from Omniglot experiments suggest that A/CQNP has better generalization capabilities as the default test split has distinct classes from training.

\subsection{Ablation Study}\label{subsec: ablation-study}
\begin{figure*}[h!]
	\centering
        \scalebox{0.75}{
            \begingroup
            \setlength{\tabcolsep}{0pt}
            \begin{tabular}{ccc}
                \includegraphics[width=0.5\textwidth]{figures/lissajous/Ablation-N_tau} &
                \qquad &
                \includegraphics[width=0.5\textwidth]{figures/lissajous/Ablation-depth}
            \end{tabular}
            \endgroup
        }
	\caption{\textbf{Left}: Comparison of the predictive log-likelihood over context and target points for A/CQNP with different values of $N_\tau$ during testing (6 seeds); \textbf{Right}: Predictive performance of ACQNP versus the depth of $\psi$ (6 seeds).}
        \label{fig: ablation-adaptive-quantile}
\end{figure*}
\paragraph{Number of quantiles $N_{\tau}$.}
As mentioned earlier in section \ref{subsec: CQNP}, we approximate the conditional likelihood by $N_{\tau}$ Monte Carlo samples. While using a larger sample size offers more precise approximation, it demands further computational resources which highlights the importance of a rather fine-grained scheme for sampling $\tau$ instead of random draws from a uniform distribution, especially when we are restricted to work with a small $N_{\tau}$. To check if the adaptive mechanism can alleviate this issue, we compare the predictive performance of ACQNP against CQNP on Lissajous curves studied in section \ref{subsec: synthetic-data-results}. 
Both models have the same architectural design for their encoder/decoder modules and are trained with $N_{\tau}=50$. During testing, however, we evaluate each method with $N_{\tau} \in \{3, 5, 7, 9, 11\}$ as depicted in figure \ref{fig: ablation-adaptive-quantile} (left). Note that in addition to testing data, we also fix the input quantile levels $u$ to be $\{u_0, u_0+d, \dotsc, u_0+(N_{\tau}-2)d, u_1\}$ where $u_0=0.001$, $u_1=0.999$, and $d=\frac{u_1-u_0}{N_{\tau}-1}$.
It can be seen that the log-likelihood of CQNP decreases significantly with smaller sample sizes, whereas ACQNP suffers less as it can manage to use its few shots efficiently and locate informative quantiles.

\paragraph{Flexibility of $\psi$.}
The adaptive process that we followed in this paper works by transforming $u$ through some nonlinear mapping $\psi$ introduced in equation \ref{eq: al-mixture-with-adaptive-quantiles}; thus, the choice of this mapping is expected to affect the performance. Throughout this work, we modeled $\psi$ with a fully-connected neural network. We investigate the effect of the depth of the neural network as a measure of its expressive power. As illustrated in figure \ref{fig: ablation-adaptive-quantile} (right), a deeper neural network improves the overall performance of the model. However, the performance gain comes at the cost of additional memory usage and computational complexity which needs to be considered as a tradeoff.
 
\section{Related Works}

\paragraph{Vector quantile regression.}
Quantile regression (QR), introduced by \citet{Koenker1978QR}, is a compelling statistical technique that can be used for studying the dependence between random variables by modeling the conditional quantiles of a target variable as a function of some explanatory variables. Unfortunately, the QR framework only considers scalar target variables as the notion of quantile is not well-defined in higher dimensions. We can apply QR to scalar components of a vector-valued response variable by assuming independence among them. This assumption, however, is usually violated. Recently, \citet{carlier2016vector, Chernozhukov-17-VQR, CARLIER201796} introduced vector quantiles as extensions to the univariate quantiles, which allow for vector quantile regression (VQR). \citet{rosenberg2023fast} propose nonlinear vector quantile regression (NL-VQR) which drops the restrictive specification of a linear conditional quantile function. They further introduce a solver allowing for applying their method to large datasets.

\paragraph{Function-space inference.}
There is a growing line of research on using neural networks for direct parameterization of distributions over functions. NPs and their variants are well-known byproducts of this viewpoint. \citet{ma19VIP} introduced implicit processes (IPs) as priors over functions by placing an implicit joint distribution over any finite collection of random variables. However, they use GPs for approximating the intractable posterior of IPs which is: 1) computationally expensive, and 2) limited by the Gaussian likelihood assumption. Sparse implicit processes (SIPs; \citealp{santana22sparse}) try to address these issues by relying on inducing points \citep{snelson2005sparse} and using a mixture of Gaussians as the predictive distribution. \citet{yang-20-EBP} use energy-based models (EBMs) to construct a family of expressive stochastic processes for exchangeable data. The additional flexibility, however, comes at the cost of complicated training and inference schemes requiring several approximations. Inspired by contrastive methods \citep{durkan2020contrastive, gutmann2010noise, gondal21FCLR}, \citet{mathieu2021contrastive} drop the explicit likelihood requirement used in NPs which necessitates exact reconstruction of observations. Despite higher tolerance in facing noisy high-dimensional data, their method is incapable of uncertainty quantification.

\section{Conclusions}
In this paper, we proposed Conditional Quantile Neural Processes (CQNPs), a new member of the CNP family that leverages advances in quantile regression to 
increase the expressive power of CNPs in modeling heterogeneous distributions.
Furthermore, we introduced an extended framework for quantile regression, named adaptive quantile regression, where instead of fixing the quantile levels, the model gets to choose which quantiles to estimate. 
Our experiments with several synthetic and real-world datasets showed that A/CQNPs improve the predictive performance of CNPs across regression tasks in terms of log-likelihood, and faithfully model multimodality in predictive distributions.

\begin{acknowledgements}
This work was supported in part by the National Science Foundation under award 1848596. We also thank Texas A\&M High Performance Research Computing for providing computational resources to perform experiments in this work.
\end{acknowledgements}

% References
{\bibliography{mohseni_273-bibfile}}
\end{document}
