% !TEX root = ../main.tex
% Predictive metric figure: moved here for better location
\begin{figure*}[!th]
    \centering
    \includegraphics[width=\textwidth]{figures/box_whisker_real.pdf}
    \caption{{
        Box-and-whisker diagrams of predictive metrics (RMSE and PPLL) on real datasets.
        The titles denote the dataset and, in parentheses, its size and feature dimensionality.
        Arrows indicate the desirable metric direction: higher PPLL (to the right) and lower RMSE (to the left). CVGP outperforms SVGP and PPGPR, and is on par with SparseGP,
        with as few as $50$ coresets. The best performing \emph{stochastic gradient} model mean statistic (\textcolor{darkgreen}{\scalebox{1}{$\blacktriangle$}}) is $\textbf{\underline{emphasized}}^\star$.
        SparseGP and ExactGP results are omitted for the largest datasets due to computational complexities.
        }}
    \label{fig:exp_predictive_real_all}
\end{figure*}

We demonstrate CVGP's superior predictive performance on real datasets in Section~\ref{ssec:exp_predictive},
before delving into its inference advantages in Section~\ref{ssec:exp_inference}.
We showcase the quality and explainability of the learned CVGP posteriors in Section~\ref{ssec:exp_coresets}.

% Set-up
\subsection{Experimental Setup}

\label{ssec:exp_setup}
We compare CVGP against benchmark $\gp$ alternatives described in Section~\ref{sec:background}:
ExactGP~\citep{rasmussen2006gaussian},
SparseGP~\citep{titsias2009variational},
and SVGP~\citep{hensman2013gaussian}.
We also incorporate Parametric Predictive Gaussian Process Regression (PPGPR) by~\citet{jankowiak2020parametric}
as a strong predictive baseline.
We implement CVGP using the PyTorch and GPyTorch libraries,
and use benchmark GPyTorch implementations~\citep{gardner2018gpytorch} for the baselines.

We use a zero-mean $\gp$ prior with a radial basis function (RBF) kernel in all experiments,
as the goal is to compare ---for the same GP model---
which approximate $\gp$ technique provides better inference and predictive performance.
We evaluate different coreset (CVGP) and inducing point sizes $M$ for sparse $\gp$ baselines (SparseGP, SVGP and PPGPR),
all initialized with k-means~\citep{hartigan1979algorithm}.
%
We employ 5-fold cross-validation to compute and report each technique's
predictive root-mean-squared error (RMSE)
and posterior predictive log-likelihood (PPLL),
over held out test splits.
We enforce best validation RMSE performance as the early-stopping criterion.
All details for the reproducibility of the experiments are provided in Appendix Section~\ref{assec:reproducibility}.

CVGP predictive and inference experiments of Sections~\ref{ssec:exp_predictive} and~\ref{ssec:exp_inference}
are based on real-world regression datasets from the UCI machine learning repository~\citep{asuncion2007uci}.
We use simulated datasets to showcase learned predictive posteriors in Section~\ref{ssec:exp_coresets},
with all dataset details described in Appendix Section~\ref{assec:datasets}.

\subsection{Predictive performance}
\label{ssec:exp_predictive}
% Figure of predictive performance
We assess the predictive performance of all sparse $\gp$ methods for a variety of real datasets,
and illustrate the performance of ExactGP
---when computationally possible--- as the optimal benchmark,
in Figure~\ref{fig:exp_predictive_real_all}.

Figure~\ref{fig:exp_predictive_real_all} demonstrates how
CVGP outperforms (higher PPLL, lower RMSE) \emph{stochastic} sparse $\gp$ alternatives (SVGP and PPGPR) consistently,
with performance on par with SparseGP across all predictive metrics
---we inspect the learning and inference gaps between methods in Section~\ref{ssec:exp_inference} and Appendix~\ref{asec:gap_results}.

Although CVGP, SVGP and SparseGP share the same theoretical optimum,
empirical predictive performance in Figure~\ref{fig:exp_predictive_real_all} showcases that
SVGP rarely reaches the desirable performance of SparseGP,
while CVGP's is consistently similar to that of SparseGP
---recall that SparseGP does not allow for stochastic optimization, while CVGP does.

CVGP's performance improves with increasing coreset size and ---with as few as 50 coresets---
consistently outperforms alternative stochastic methods,
even when these baselines use \emph{4-times} more inducing points, \ie SVGP (200) and PPGPR (200).
CVGP's predictive performance is also better than PPGPR,
an approximate $\gp$ algorithm specifically designed for predictive performance.

% % PPGPR worse than CVGP figure
 \begin{figure}[!ht]
     \includegraphics[width=0.5\textwidth]{uai2025-template/figures/epoch_figures_parkinsons.pdf}
    \caption{{
         Evolution of RMSE and PPLL across training epochs. CVGP's and SVGP's RMSE and PPLL consistently improve with training epochs. Even though PPGPR's RMSE improves over epochs, its PPLL deteriorates ---indicating some form of overfitting.
         }
     }
     \label{fig:exp_predictive_training_parkinsons}
\end{figure}

% CVGP Vs PPGPR
We showcase in
Figure~\ref{fig:exp_predictive_training_parkinsons} and Appendix~\ref{asec:epoch_results}
the evolution of RMSE and PPLL across training,
where training of models does not stop until there are no RMSE improvements.
Notice that, while CVGP metrics improve consistently over training,
the RMSE for PPGPR improves, while its PPLL deteriorates over training epochs.%
\footnote{%
Due to the large negative PPLL values of PPGPR,
we have not reported them in Figure~\ref{fig:exp_predictive_real_all};
see Appendix~\ref{asec:epoch_results}.
%Note that if PPLL was used as early stopping metric, PPLL would have performed even worse for RMSE while potentially outpeforming other methods on this metric due to incorporating heteroscedasticit noise into $\fb$.
}

% CVGP Initialization figure
We demonstrate CVGP's predictive performance robustness to initialization in Figure~\ref{fig:exp_predictive_cvgp_randomcvgp_real} in Appendix Section~\ref{asssec:app_exp_robustness}.
We notice k-means and randomly initialized CVGPs' performance
to be similar across metrics and datasets,
which is likely due to the coreset-based posteriors' flexibility
to up- and down-weight pseudo-input/output pairs via $\betabC$, \emph{a property other methods do not possess}.

\subsection{Inference performance}
\label{ssec:exp_inference}
% Inference gaps: moved here for placement
\begin{figure*}[!h]
    \centering
\includegraphics[width=\textwidth]{uai2025-template/figures/gap_figures_only_js_and_marginal_gap_svgp_cvgp.pdf}
        \caption{{
        Learning and inference gaps for sparse $\gp$ methods over training,
        as measured by
        (top-row) the difference between the log-marginal of ExactGP and the variational bound for SVGP and CVGP; and
        (bottom-row) the Jensen-Shannon divergence between the exact posterior predictive and each method's approximate posterior predictive.
        CVGP provides a better approximation to the exact $\gp$ posterior,
        consistently optimizing a tighter lower-bound.
        A more detailed figure including PPGPR is available in Appendix~\ref{asec:gap_results}.}}
        \label{fig:exp_inference_gap}
\end{figure*}

We investigate why CVGP approximates the $\gp$ posterior predictive distributions more accurately,
by studying the relationship between the variational lower-bounds ($\mathcal{L}$)
of sparse $\gp$ alternatives and the true $\gp$ marginal log-likelihood in Equation~\eqref{eq:data_marginal_analytical}.

To that end, we depict in Figure~\ref{fig:exp_inference_gap} 
the difference between the log-marginal of ExactGP and the variational loss optimized by SVGP and CVGP. 
We also show the inference gap of these methods during training, over held-out datasets,
using the Jensen-Shannon divergence between the exact posterior predictive distribution 
$\cp{\fb^\star}{\xb^\star, \yb}=\int \cp{\fb^\star}{\xb^\star, \fb}\cp{\fb}{\yb} \diff{\fb}$ in Equation~\eqref{eq:gp_posterior},
and each method's approximate posterior predictive $\cq{\fb^\star}{\xb^\star}=\int \cp{\fb^\star}{\xb^\star, \fb_M}\q{\fb_M} \diff{\fb_M}$.
We employ fixed, equal $\gp$ prior hyperparameters for all models.

Results in Figure~\ref{fig:exp_inference_gap} demonstrate how
CVGP better closes the learning gap with ExactGP.
In contrast, SVGP offers a looser bound even if, in theory, both loss functions have the same optimum.
Moreover, smaller divergence from CVGP's posterior to that of ExactGP
suggests that CVGP better approximates the $\gp$ posterior of interest,
at only $\bigO{M}$ parameter, $\bigO{M^3}$ time and $\bigO{M^2}$ space complexities.

This notable inference improvement is attained with as few as 50 coresets,
a performance not reached by SVGP even with 200 inducing points.
We argue that this performance gap
is the result of the distinct optimization landscapes
of CVGP compared to SVGP,
induced by the lower-dimensionality of CVGP's optimization problem
and the explicit inductive biases present in CVGP's posterior:
($i$) its ability to interpolate easily between prior and posterior
(see Appendix~\ref{assec:app_exp_noisy} for more experiments), and
($ii$) its ability to learn informative pseudo-points
---further investigated in what follows and in Appendix~\ref{asec:qual_study}.

\subsection{CVGP as Bayesian Coreset Learning}\label{ssec:exp_coresets}

\begin{figure*}[!h]
    \includegraphics[width=\textwidth]{uai2025-template/figures/posterior_predictive_synthetic1_fold_4_with exact.pdf}
    \caption{{
    True data generating function ({\red{\textbf{---}}}), posterior predictive mean ({\textbf{\blue{---}}}), and
    2-unit credible intervals (shaded)
    for stochastic $\gp$ methods.
    We indicate  the inducing variables 
    $\{\XbZ, \mb_M=\Ex{q}{\fbZ}\}$
    learned by SVGP and PPGPR;
    and for CVGP, the learned coreset pseudo-points $\{\XbC, \ybC\}$, with each pseudo-point's color intensity weighted by the learned $\beta_m$ on the right-hand side bars.
    Notice CVGP's high-quality posterior, most similar to that of ExactGP, which serves as gold standard. All methods revert to the prior for ranges where \emph{no data} has been observed.
    }}
    \label{fig:exp_coresets_predictive}
\end{figure*}

% Predictive distributions text
We illustrate posterior predictive distributions learned by stochastic sparse $\gp$ methods in Figure~\ref{fig:exp_coresets_predictive},
for a 1-dimensional synthetic dataset.

We observe CVGP's approximate posterior to be closest to the exact predictive posterior,
both in predicted mean and uncertainty quantification.
On the contrary, SVGP and PPGPR encounter difficulties in accurately modeling
the function of interest and its uncertainty in the $x \in (0,2)$ range:
SVGP computes a \emph{low-uncertainty}, \emph{smooth} posterior predictive mean, while PPGPR captures the mean but overestimates uncertainty for $x \in (1,2)$. CVGP, regardless of initialization, better handles this noisy region, matching ExactGP’s mean and uncertainty by learning coreset triplets $\{\XbC, \ybC, \betabC\}$ with up-weighted $\ybC$ that mitigate posterior $\gp$ uncertainty overestimation.

The input locations  $\XbC$ learned by all sparse $\gp$ methods spread across the range of observed data $\Xb$
in Figure~\ref{fig:exp_coresets_predictive}.
While inducing-points methods SVGP and PPGPR learn
$\{\XbZ, \mb_M=\Ex{q}{\fbZ}\}$ pairs,
CVGP learns pseudo-points $\{\XbC, \ybC\}$ with pseudo-observations $\ybC$ in the observation space $\mathcal{Y}$.
Hence, CVGP can learn pseudo-observations $\ybC$ that are correlated
with observed data $\yb$.
Notice how, in Figure~\ref{fig:exp_coresets_histograms},
CVGP's posterior is based on pseudo-outputs $\ybC$ that are
far from the $\gp$ latent values $\fb$ in the $x \in (1, 2)$ range,
which are up-weighted (\ie green-colored dots),
where the observations are subject to heteroskedasticity.

Figure~\ref{fig:exp_coresets_histograms}
also shows CVGP’s learned histograms of $\betabC$,
where we compare CVGP with k-means and random initializations (RandomCVGP).

\begin{figure}[!h]
    \includegraphics[width=0.45\textwidth]{uai2025-template/figures/hist_and_predictive_coreset.pdf}
    \caption{Learned coresets (top) and histograms of their weights (bottom) for CVGP with random (RandomCVGP) and k-means initialization, with legend as in Figure~\ref{fig:exp_coresets_predictive}. CVGP down-weights uninformative data, yielding many $\betabC\approx0$ for RandomCVGP (removing unhelpful points from its posterior).
    \emph{Unlike other inducing-point methods
    ---which must learn good locations---
    CVGP can eliminate (down-weight) points that do not converge to plausible values.}
    }
    \label{fig:exp_coresets_histograms}
\end{figure}

Figure~\ref{fig:exp_coresets_histograms}
illustrates CVGP's ability to up- and down-weight pseudo-input/output pairs, for both initializations.
RandomCVGP drives many $\beta_m$ to 0 for uninformative data regions,
effectively ignoring those pseudo-points, while up-weighting more informative ones, improving posterior efficiency
---recall that, in coreset-based posteriors, $\beta_m \geq 0$ corresponds to drawing $\beta_m$ samples for each pseudo-point $\{\Xb_m, \yb_m\}$.

We argue that it is CVGP's coreset-based distribution
that enables efficient and accurate approximation of $\gp$ posteriors at a lower parameter complexity:
\ie a better predictive posterior, based on fewer pseudo-points $M$.
% 2D densities
Additional benefits of CVGP coreset-based posteriors,
namely posterior explainability
and compact, informative representations of datasets,
are illustrated in Appendix Section~\ref{asssec:app_exp_coresets}.