% !TEX root = ../main.tex

\subsection{Datasets}
\label{assec:datasets}

In this section, we describe the simulated and real-world datasets used in our experiments.
We use the UCI Machine Learning Repository for real-world datasets \citep{UCIMachi51:online}, as described in Section~\ref{asssec:datasets_real}.
The generative processes of simulated data are explained below in Section~\ref{asssec:datasets_sim}.
For all datasets, the features $\Xb$ are normalized (zero mean and unit variance) before training.
\subsubsection{Real-world Datasets}
\label{asssec:datasets_real}

\paragraph{Physicochemical properties of protein tertiary structure dataset (protein).} 
A physicochemical data collection containing the properties of protein tertiary structure, specifically sourced from CASP 5--9. The dataset includes 45730 data points and 9 features \citep{rana2013physicochemical}.

\paragraph{Bike sharing dataset (bike).} A bike sharing dataset comprised of 17 features and 17379 data points \citep{bike}.

\paragraph{Parkinsons telemonitoring dataset (parkinsons).} A biomedical voice measurements dataset obtained from 42 individuals in the early stages of Parkinson's disease. These individuals were enrolled in a six-month trial for remote symptom progression monitoring, using a telemonitoring device \citep{little2007exploiting}. There are 20 features and 5875 data points.


\paragraph{SkillCraft1 master table dataset (skillcraft).} A video gaming telemetry data collection consisting of 12 features and 3338 data points \citep{thompson2013video}.

\paragraph{Wine quality dataset (wine).} A collection of red wine samples with 11 features that are used to predict the wine's quality. In total, there are 1600 data points available for analysis \citep{cortez2009modeling}.

\paragraph{Year Prediction MSD (song).} A collection of audio features. The goal is to predict the year a song is released \citep{year_prediction_msd_203}.

\paragraph{Relative location of CT slices on axial axis (slice).} A collection of 53500 CT images from 74 different patients. The goal is to predict the relative location of the CT slice \citep{graf2011relative}.

\subsubsection{Simulated Datasets}
\label{asssec:datasets_sim}

We generate $1000$ examples for each of the following synthetic datasets.
\paragraph{Synthetic 1.} A 1-dimensional dataset generated by the following process:
\begin{align}
    f &= \frac{2}{5}  \left(\sin{3x} \cos{2x} + \sin{\frac{x}{2}} + \cos{2x} + \exp\left\{-x^2\right\} + |x| \right), & x \sim U(-4,4) \;, \\
    y &= f + \epsilon \sin(2\pi f) \;, & \epsilon \sim \N{\epsilon \mid 0, 3 \times 10^{-1}} \; .
\end{align}

\paragraph{Synthetic 2.} A 1-dimensional dataset generated by the following process:
\begin{align}
    f &= \sin x^2 + \cos x^2 + \sin 3x + \cos 5x + \frac{\sqrt{|x|}}{2} \;, & x \sim U(-4,4) \;,\\
    y &= f + \epsilon \sin(2\pi f) \;, & \epsilon \sim \N{\epsilon \mid 0, 3 \times 10^{-1}} \;.
\end{align}

\paragraph{Synthetic 3.} A 1-dimensional dataset generated by the following process:
\begin{align}
    f  &= \cos 2\pi x \;, & x \sim U(0,2) \;, \\
    y &= f + \epsilon x^3 \;, & \epsilon \sim \N{\epsilon \mid 0, 1} \;.
\end{align}

\paragraph{Synthetic 4.} A 2-dimensional dataset generated by the following process:
\begin{align}
    \xb &\sim \texttt{MakeBlobs}(\texttt{centers}=3, \texttt{std}=0.4) \;,\\
    f_1  &= 4 \sin x_1 + 2 \sin 2 x_1 \;, \\
    f_2  &= 3 \cos 3 x_2 + 4 \sin 5 x_2 \;, \\
    f_{12} &=  \expp{-(x_1 + x_2)^2} \;, \\
    y &= f_1 + f_2 + f_{12} + \epsilon \;, & \epsilon \sim \N{\epsilon \mid 0, 2 \times 10^{-1}} \;,
\end{align}
where the function \texttt{MakeBlobs} is implemented as in \citet{scikit-learn}.

\paragraph{Synthetic 5.} A 2-dimensional dataset generated by the following process:
\begin{align}
    \xb &\sim \texttt{MakeMoons}(\texttt{noise}=0.05) \;, \\
    f_1  &= \frac{x_1}{2} + \sin 2 x_1 \;, \\
    f_2  &= \frac{x_2}{2} + \cos 5 x_2 \;, \\
    f_{12} &= \frac{\expp{-(x_1 + x_2)^2}}{2} \;, \\
    y &= f_1 + f_2 + f_{12} + \epsilon \;, & \epsilon \sim \N{\epsilon \mid 0, 2 \times 10^{-1}} \;,
\end{align}
where the function 
\texttt{MakeMoons} is implemented by \citet{scikit-learn}.

\clearpage
\subsection{Baselines}
\label{assec:baselines}

We use the GPyTorch \citep{gardner2018gpytorch} implementations of SparseGP, SVGP, and PPGPR.
For ExactGP, we simply use the derivation of \citet{rasmussen2006gaussian}, implemented using the \texttt{MultivariateNormal} distribution class of PyTorch \citep{paszke2019pytorch}.

\paragraph{SparseGP.} Introduced by \citet{titsias2009variational}, SparseGP offers a variational solution to inducing point methods. In particular, SparseGP minimizes the KL divergence between an approximate and the true posterior distribution. The loss function is derived by finding and plugging in the optimal variational posterior distribution, which can be expressed in terms of the $\gp$ kernel parameters.

\paragraph{SVGP.} SVGP minimizes the KL divergence between an approximate and the true posterior distribution, where the approximate posterior distribution is explicitly defined \citep{hensman2013gaussian}. The parameters of the posterior and model are learned jointly.
SVGP is a stochastic approximation to SparseGP, which allows computationally efficient learning.

\paragraph{PPGPR.} PPGPR is a variational predictive method for $\gp$s that, instead of lower-bounding the prior-predictive distribution as the methods above do,
optimizes a lower bound on the posterior predictive \citep{jankowiak2020parametric}.
This method provides predictive uncertainty estimates that model the variance of the observed data more accurately.
The PPGPR results in~\citet{jankowiak2020parametric} were based on 400 epochs, which we found insufficient for convergence to optimal RMSE in our experiments. Although longer training improves predictive RMSE, it also causes severe overfitting to noise---a behavior not observed in other $\gp$ algorithms.

\paragraph{ExactGP.} The exact learning of Gaussian processes as described by \citet{rasmussen2006gaussian}. We compute the marginal log-likelihood (prior predictive) by integrating the likelihood over the latent function space (with respect to the prior distribution) to learn model hyperparameters, at a computational complexity of $\bigO{N^3}$.

\subsection{Experiment details for reproducibility}
\label{assec:reproducibility}
We employ 5-fold cross-validation
to compute and report each variational technique's (lower-bound) objective $\mathcal{L}$ in inference,
as well as its predictive root-mean-squared error (RMSE)
and posterior predictive log-likelihood (PPLL)
over held-out test splits.

We do not scale the KL-divergence terms in each model's objective, so that the objectives remain valid lower bounds.
We use a fixed random seed over all datasets to ensure that the folds (with 70\%--30\% train and validation splits) for different models are the same.

We use the Adam optimizer \citep{kingma2014adam} with a learning rate of $10^{-3}$ and single-precision floating point for all methods.
For the techniques amenable to stochastic optimization (SVGP, PPGPR, and CVGP), we use a batch size of 512.
Each model is run on a single NVIDIA$^{\text{\textregistered}}$ GeForce$^{\text{\textregistered}}$ RTX 20 series graphics card.

To leverage full model capacity and achieve full optimization performance,
we train for a maximum of $10^5$ epochs, and stop training only if there are no RMSE improvements over the validation set for $3\times 10^3$ consecutive epochs.
We early stop with respect to the best \emph{held-out} validation set RMSE metric attained.

\newpage
\section{Additional Experiments}

We present additional performance results here.
For the box plots below, unless otherwise stated, the best-performing stochastic model---in comparison to other stochastic sub-sampling methods---is shown in bold.

\label{assec:app_experiments}
\input{appendix/app_experiments}
