\subsection{Evolution of Performance Metrics by Training Epochs}\label{asec:epoch_results}

\begin{figure*}[!ht]
    \centering
\includegraphics[width=\textwidth]{uai2025-template/figures/epoch_results.pdf}
        \caption{
Evolution of RMSE and PPLL across training epochs. For CVGP and SVGP, validation RMSE and PPLL consistently decrease ---showing no critical indication of overfitting. In contrast, PPGPR’s RMSE improves, but PPLL worsens, indicating overfitting of noise and cross-correlation, leading to suboptimal PPLL. Training stops only when RMSE no longer improves. Large negative PPLL values prevent reporting PPGPR’s PPLL in Figure~\ref{fig:exp_predictive_real_all}.
%However, it is reasonable to assume that PPGPR could outperform ExactGP and other baselines in PPLL if regularized (\ie early stopped) with respect to this metric. However, its predictive RMSE performance would suffer and likely be worse.
}
\label{fig:epoch_results}
\end{figure*}

\newpage
\subsection{Model Learning and Inference Gaps}\label{asec:gap_results}
\begin{figure*}[!ht]
        \includegraphics[width=\textwidth]{uai2025-template/figures/gap_figures_all.pdf}
    \caption{
Divergence from the true posterior $\cp{f^\star}{\xb^\star, \Xb, \yb}$ and lower-bound tightness relative to the exact solution (\ie the difference between the log-marginal likelihood and the ELBO, always positive). PPGPR does not lower-bound ExactGP and diverges, as shown in the figure. By directly fitting noisy observations, PPGPR overfits early, while SVGP and CVGP filter noise.
    }
    \label{fig:gap_results_all}
    \label{fig:sim}
\end{figure*}


%\subsection{Experiments on Synthetic Datasets}\label{asssec:exp_quant_simulated}

%Below we investigate the performance in simulated datasets in 3 folds: (1) We compare all benchmarks in a single plot. Then for better visibility we showcase the performance of (2) stochastic models only and how (3) the robustness of CVGP to random initialization (\ie RandomCVGP). RandomCVGP is initialized using white noise for $\{\XbC, \ybC, \betabC\}$.

%\subsubsection{Comparison of All Models}

%\begin{figure*}[!ht]
%        \includegraphics[width=\textwidth]%{uai2025-template/figures/box_whisker_synthetic_all.pdf}
% \label{fig:sim_stochastic}
%    \caption{
%Results on simulated data. We observe that methods that do not allow for sub-sampling may exhibit numerical instability and demonstrate larger variance between folds while methods that do allow for sub-sampling are more robust in general. 
%    }
%    \label{fig:exp_sim_all}
%\end{figure*}

%\subsubsection{Comparison of Stochastic Models}


% Best
%Figure~\ref{fig:sim} shows that CVGP obtains, in simulated datasets, 
%better inference (lower-bound), predictive (RMSE), and posterior predictive log-likelihood (PPLL) performance. Note that CVGP outperforms stochastic $\gp$ competitors with as little as $C=50$ coresets. We also emphasize that CVGP shows robust performance over different initializations (RandomCVGP is initialized with standard Gaussian for kernel hyperparamters, inducing points, and likelihood noise), and outperforms SVGP and PPGPR
%consistently across all synthetic datasets.

%\begin{figure*}[!ht]
%        \includegraphics[width=\textwidth]{uai2025-template/figures/box_whisker_synthetic.pdf}
% \label{fig:sim_stochastic}
 %   \caption{
%        Box-and-whisker diagrams of $\mathcal{L}$ and RMSE results on the simulated datasets,
 %       for (a) all $\gp$ baselines and (b) \emph{stochastic} $\gp$ baselines.
 %       Arrows indicate the better direction: higher $\mathcal{L}$ and lower RMSE.
  %      Titles denote in parenthesis the dataset size.
  %      CVGP outperforms SVGP and PPGPR, is on-par with SparseGP, and sometimes even ExactGP.
 %       CVGP outperforms stochastic $\gp$ competitors with as little as $50$ inducing points/coresets.
%    }
 %   \label{fig:sim}
%\end{figure*}

\clearpage

\subsection{Robustness to Initialization}
\label{asssec:app_exp_robustness}

Below, we demonstrate CVGP's robustness to random initialization. We observe that RandomCVGP (CVGP initialized with white Gaussian noise) performs on par with CVGP in almost all cases and metrics. For very big datasets with many input-features (\eg Song), a random initialization over high-dimensional input-output spaces is a clear disadvantage.
Hence, we recommend, in general, to initialize CVGP with k-means.

%\begin{figure*}[!ht]
%        \includegraphics[width=\textwidth]%{uai2025-template/figures/box_whisker_synthetic_randomcvgp_cvgp.p%df}
% \label{fig:sim_stochastic}
%    \caption{Comparison between random initialization of inducing %points/variables and k-means initialization reveals similar performance across most cases. Qualitative analyses indicate that when RandomCVGP cannot learn inducing points/variables that effectively capture the $y|x$ relationship, it compensates by driving the corresponding coreset weights to $0$.
%    }
%    \label{fig:exp_sim_cvgp_vs_randomcvgp}
%\end{figure*}


\begin{figure*}[!ht]
    \includegraphics[width=\textwidth]{uai2025-template/figures/box_whisker_real_randomcvgp_cvgp.pdf}
    \caption{{
    Predictive performance comparison for CVGP when initialized with random points (RandomCVGP) or k-means over the observed (real) datasets.
    CVGP is robust to random initializations.  The best performing initialization mean statistic (\textcolor{darkgreen}{\scalebox{1}{$\blacktriangle$}}) is $\textbf{\underline{emphasized}}^\star$.
    }}
    \label{fig:sim_stochastic}
    \label{fig:exp_predictive_cvgp_randomcvgp_real}
\end{figure*}

\newpage

\subsection{Posterior-Prior Interpolation: Noisy Real-World Data}
\label{assec:app_exp_noisy}

We discuss the ability of each sparse $\gp$ method's posterior to interpolate between the model prior and the information provided by observations.

For CVGP, as the observation noise increases ($\sigma^2 \rightarrow \infty$),
its posterior mean $\mb_{\fbC | \ybC}$ converges to $\mathbf{0}$ (the $\gp$ prior mean),
and its posterior covariance $\Kb_{\fbC | \ybC}$ converges to the prior covariance $\KbZZ$;
\ie the observations are noninformative and CVGP's posterior reverts to the $\gp$ prior.
Conversely, for noiseless data ($\sigma^2 \rightarrow 0$),
CVGP's posterior mean approaches $\ybC$,
and its posterior covariance diminishes to $\mathbf{0}$ (see Equation~\ref{eq:cvtgp_q}).
On the contrary, %SVGP's posterior is adjusted based purely on variational parameter optimization.
SVGP's posterior statistics ($\mathbf{m}, \mathbf{S}$)
have no explicit model dependencies, and therefore,
are adjusted based purely on variational parameter optimization.

We run an empirical experiment below, where
we take a real-world dataset and progressively add noise to the true regression values,
before training SVGP, PPGPR, and CVGP on these extra-noisy versions of the datasets.
As in any Bayesian model,
we expect that for low noise regimes,
the posterior should diverge from the prior to capture the information provided by observations;
while for high noise regimes (uninformative data), the posterior should remain similar to the prior.
Below, we notice that
CVGP effectively resorts to the prior under uninformative data, a behavior exhibited by ExactGP,
while PPGPR does not recover the prior ---fitting the noisy data.

\begin{figure}[!h]
    \centering
    % First subfigure
    \begin{subfigure}{0.45\textwidth}
        \centering
        \includegraphics[width=\linewidth]{uai2025-template/figures/noise_figures_bike.pdf}
    \end{subfigure}
    % Second subfigure
    \begin{subfigure}{0.45\textwidth}
        \centering
        \includegraphics[width=\linewidth]{uai2025-template/figures/noise_figures_bike_only_svgp_cvgp.pdf}
    \end{subfigure}
    \caption{Study of the difference between sparse $\gp$ approximate posteriors and model prior for the Bike dataset, as measured by the KL-divergence between the approximate variational posterior and the prior ($\kl{q(\fbC)}{p(\fbC)}$) across different observation noise regimes. Left: CVGP, PPGPR, and SVGP; right: CVGP and SVGP. We see that PPGPR diverges vastly from the prior, while SVGP and CVGP retain the Gaussian prior-likelihood conjugacy (\ie as noise increases, they do not diverge vastly from the prior).} \label{fig:exp_inference_prior_to_posterior2}
\end{figure}

\newpage

\section{Qualitative Study}\label{asec:qual_study}

\subsection{Qualitative Evaluation of Posterior Predictive}
\label{asssec:app_exp_posterior_predictive}
Below, we showcase the predictive distributions of each trained $\gp$ model, for the synthetic 1D datasets,
\ie synthetic 1 in Figure~\ref{fig:posterior_predictive_synthetic1},
synthetic 2 in Figure~\ref{fig:posterior_predictive_synthetic2},
and synthetic 3 in Figure~\ref{fig:posterior_predictive_synthetic3}.

Note that RandomCVGP is initialized with Gaussian white noise and faces a significantly more challenging task in fitting the data compared to SVGP, PPGPR, and CVGP. In fact, some of its learned inducing points/coresets fall off-the-grid and appear unrelated to the data. In these cases, the corresponding coreset weights are low (indicated in purple). 

Conversely, points that effectively capture the $y|x$ relationship are shown in yellow-green, while those that do not are depicted in purple ---and are consequently disregarded during posterior inference.

On the contrary, SVGP and PPGPR do not have this behavior. Hence, we use a single color for their coresets. This coreset-based posterior design endows CVGP with significant flexibility: if an inducing point proves unhelpful for predictions, its influence can be driven to 0 (with the help of its corresponding $\beta_m$). In contrast, both PPGPR and SVGP must select inducing points that consistently capture the data structure.

\begin{figure}[h!]
    \centering
    \begin{subfigure}[c]{0.85\textwidth}
        \includegraphics[width=\textwidth]{figures/posterior_predictive_synthetic1_fold_0.pdf}
    \end{subfigure}
    \begin{subfigure}[c]{0.85\textwidth}
        \includegraphics[width=\textwidth]{figures/posterior_predictive_synthetic1_fold_1.pdf}
    \end{subfigure}
    \begin{subfigure}[c]{0.85\textwidth}
        \includegraphics[width=\textwidth]{figures/posterior_predictive_synthetic1_fold_2.pdf}
    \end{subfigure}
    \begin{subfigure}[c]{0.85\textwidth}
        \includegraphics[width=\textwidth]{figures/posterior_predictive_synthetic1_fold_3.pdf}
    \end{subfigure}
    \begin{subfigure}[c]{0.85\textwidth}
        \includegraphics[width=\textwidth]{figures/posterior_predictive_synthetic1_fold_4.pdf}
    \end{subfigure}
    \caption{Posterior predictive distribution for the synthetic 1 dataset across 5 different folds, with 100 inducing points.}
    \label{fig:posterior_predictive_synthetic1}
\end{figure}

% synthetic 2
\begin{figure}[h!]
    \centering
    \begin{subfigure}[c]{0.85\textwidth}
        \includegraphics[width=\textwidth]{figures/posterior_predictive_synthetic2_fold_0.pdf}
    \end{subfigure}
    \begin{subfigure}[c]{0.85\textwidth}
        \includegraphics[width=\textwidth]{figures/posterior_predictive_synthetic2_fold_1.pdf}
    \end{subfigure}
    \begin{subfigure}[c]{0.85\textwidth}
        \includegraphics[width=\textwidth]{figures/posterior_predictive_synthetic2_fold_2.pdf}
    \end{subfigure}
    \begin{subfigure}[c]{0.85\textwidth}
        \includegraphics[width=\textwidth]{figures/posterior_predictive_synthetic2_fold_3.pdf}
    \end{subfigure}
    \begin{subfigure}[c]{0.85\textwidth}
        \includegraphics[width=\textwidth]{figures/posterior_predictive_synthetic2_fold_4.pdf}
    \end{subfigure}
    \caption{Posterior predictive distribution for the synthetic 2 dataset across 5 different folds, with 100 inducing points.}
    \label{fig:posterior_predictive_synthetic2}
\end{figure}

% synthetic 3
\begin{figure}[h!]
    \centering
    \begin{subfigure}[c]{0.85\textwidth}
        \includegraphics[width=\textwidth]{figures/posterior_predictive_synthetic3_fold_0.pdf}
    \end{subfigure}
    \begin{subfigure}[c]{0.85\textwidth}
        \includegraphics[width=\textwidth]{figures/posterior_predictive_synthetic3_fold_1.pdf}
    \end{subfigure}
    \begin{subfigure}[c]{0.85\textwidth}
        \includegraphics[width=\textwidth]{figures/posterior_predictive_synthetic3_fold_2.pdf}
    \end{subfigure}
    \begin{subfigure}[c]{0.85\textwidth}
        \includegraphics[width=\textwidth]{figures/posterior_predictive_synthetic3_fold_3.pdf}
    \end{subfigure}
    \begin{subfigure}[c]{0.85\textwidth}
        \includegraphics[width=\textwidth]{figures/posterior_predictive_synthetic3_fold_4.pdf}
    \end{subfigure}
    \caption{Posterior predictive distribution for the synthetic 3 dataset across 5 different folds, with 100 inducing points.
    We observe that PPGPR captures heteroscedastic uncertainty. \emph{Although this seems like a plausible property, the noise in $y$ can sometimes be pure noise, and could lead PPGPR to overfit as we discussed and demonstrated in Figures~\ref{fig:exp_predictive_training_parkinsons} and~\ref{fig:epoch_results}}.}
    \label{fig:posterior_predictive_synthetic3}
\end{figure}

\clearpage
\newpage

\subsection{Study of Inducing Points ($\XbZ$)}
\label{asssec:app_exp_coresets}

We showcase the density of $\XbC$s learned by CVGP (weighted by $\betabC$),
and the $\XbZ$ points learned by other sparse $\gp$ methods
on the 2-dimensional synthetic \texttt{Blobs} (Figure~\ref{fig:coreset_kde_synthetic4_folds}) and \texttt{TwoMoons} (Figure~\ref{fig:coreset_kde_synthetic5_folds}) datasets,
across different folds of the training data.
Notice how CVGP consistently learns meaningful data representations over all folds.


\begin{figure*}[!ht]
    \centering
        % synthetic 4
    \begin{subfigure}[c]{0.95\textwidth}
        \includegraphics[width=\textwidth]{figures/coreset_kde_synthetic4_fold_1.pdf}

                \caption{Synthetic 4 dataset where $y = f(\xb) + \epsilon$, and $\xb \sim \texttt{MakeBlobs}(.)$}
        \label{fig:coreset_syn4}
    \end{subfigure}

    \begin{subfigure}[c]{0.95\textwidth}
        \includegraphics[width=\textwidth]{figures/coreset_kde_synthetic5_fold_4.pdf}

        \caption{Synthetic 5 dataset where $y = f(\xb) + \epsilon$, and $\xb \sim \texttt{MakeMoons}(.)$}
        \label{fig:coreset_syn5}
    \end{subfigure}

        \caption{
        Kernel density estimation (KDE) plots for $\XbC$ learned by CVGP and $\XbZ$ for sparse baselines,
        on (a) synthetic 4 and (b) synthetic 5 datasets.
        For CVGP we use $\betabC$-weighted KDE plots, not possible for alternatives.
        All methods capture the clustered \texttt{Blobs} empirical distribution in Figure~\ref{fig:coreset_syn4},
        yet CVGP models the bi-modal nature of data more clearly.
        CVGP, RandomCVGP, and SparseGP adeptly capture the distinctive \texttt{TwoMoons} shape exhibited by the empirical data distribution in Figure~\ref{fig:coreset_syn5},
        in contrast to other stochastic sparse inference alternatives.
    }
    \label{fig:coreset}
\end{figure*}


% Coresets, x-density
\begin{figure}[h]
    \centering
        % synthetic 4
    \begin{subfigure}[c]{0.85\textwidth}
        \includegraphics[width=\textwidth]{figures/coreset_kde_synthetic4_fold_0.pdf}
    \end{subfigure}
    \begin{subfigure}[c]{0.85\textwidth}
        \includegraphics[width=\textwidth]{figures/coreset_kde_synthetic4_fold_1.pdf}
    \end{subfigure}
    \begin{subfigure}[c]{0.85\textwidth}
        \includegraphics[width=\textwidth]{figures/coreset_kde_synthetic4_fold_2.pdf}
    \end{subfigure}
    \begin{subfigure}[c]{0.85\textwidth}
        \includegraphics[width=\textwidth]{figures/coreset_kde_synthetic4_fold_3.pdf}
    \end{subfigure}
    \begin{subfigure}[c]{0.85\textwidth}
        \includegraphics[width=\textwidth]{figures/coreset_kde_synthetic4_fold_4.pdf}
    \end{subfigure}
    \caption{Learned representations for synthetic 4 dataset over 5 different folds with 100 inducing points. CVGP learns meaningful representations over different folds. RandomCVGP is more noisy than CVGP as it is initialized with Gaussian white noise. }
    \label{fig:coreset_kde_synthetic4_folds}
\end{figure}

\begin{figure}[h]
    \centering
        % synthetic 5
    \begin{subfigure}[c]{0.85\textwidth}
        \includegraphics[width=\textwidth]{figures/coreset_kde_synthetic5_fold_0.pdf}
    \end{subfigure}
    \begin{subfigure}[c]{0.85\textwidth}
        \includegraphics[width=\textwidth]{figures/coreset_kde_synthetic5_fold_1.pdf}
    \end{subfigure}
    \begin{subfigure}[c]{0.85\textwidth}
        \includegraphics[width=\textwidth]{figures/coreset_kde_synthetic5_fold_2.pdf}
    \end{subfigure}
    \begin{subfigure}[c]{0.85\textwidth}
        \includegraphics[width=\textwidth]{figures/coreset_kde_synthetic5_fold_3.pdf}
    \end{subfigure}
    \begin{subfigure}[c]{0.85\textwidth}
        \includegraphics[width=\textwidth]{figures/coreset_kde_synthetic5_fold_4.pdf}
    \end{subfigure}
    \caption{Learned representations for synthetic 5 dataset over 5 different folds.
    CVGP learns meaningful representations over the different folds while other models, except for SparseGP,
    struggle to capture the empirical distribution.  RandomCVGP is more noisy than CVGP as it is initialized with Gaussian white noise.}
    \label{fig:coreset_kde_synthetic5_folds}
\end{figure}

\clearpage
\newpage

\subsection{Learned Coreset Weight Distribution for K-means and Random Initializations}
\label{asssec:app_exp_coreset_weights}

We show below the histogram of CVGP's learned coreset weights across all synthetic datasets.
Note that all learned coresets have nonzero weights $\beta_m >0, \; \forall m$,
with very different histograms depending on the dataset:
for some datasets, some pseudo input-output points $\{\XbC, \ybC\}$ are considerably up-weighted. We observe that RandomCVGP consistently drives the weight of implausible inducing points to $0$.

\begin{figure}[h!]
\centering
\includegraphics[width=\linewidth]{figures/hist_simulated.pdf}
\caption{Histogram of learned CVGP coreset weights $\betabC$. Top: CVGP; bottom: RandomCVGP (\ie initialization with white noise). We see that some of the weights of RandomCVGP go to $0$, while almost all weight values of CVGP are non-zero (\ie no coreset tuple $\{\XbC, \ybC\}$ is discarded ($\beta_m >0, \; \forall m$)).
}
\label{fig:beta_hist}
\end{figure}
