% !TEX root = ../main.tex

CVGP's key novelty is a coreset-based distribution 
designed to incorporate
the $\gp$'s prior model and likelihood characterizations into the CVGP posterior.

We formulate a coreset-based distribution $\q{\fbC}$
over $\gp$ variables $\fbC=f(\XbC)$ at pseudo-inputs $\XbC =\{\xb_m\}_{m=1}^M$
and associated pseudo-observations $\ybC=\{y_m\}_{m=1}^M$ as
\begin{align}
	&\cq{\fbC}{\XbC, \ybC, \betabC} = \frac {\cq{\ybC}{ \fbC, \betabC} \cp{\fbC}{\XbC}}{\cp{\ybC}{\XbC, \betabC} } \nonumber \\
	&\qquad = \frac {\left(\prod_{m=1}^M p(y_{m} | f_{m})^{\beta_{m}} \right) p(\fbC|\XbC)}{p(\ybC | \XbC, \betabC) } \;, 
	\label{eq:coreset_gp_posterior_C_f}  
\end{align}
where the data likelihood for each pseudo-observation,
$p(y_{m} | f_{m})$ with $m \in \{1, \ldots, M\}$,
is raised to the power of a learnable parameter, collected in $\betabC=(\beta_{1}, \ldots, \beta_M)$.
The CVGP posterior is a tempered distribution,
which can be understood as if
a small set of $M \leq N$ pseudo-input/output pairs $\{\xb_m, y_m\}$
were each drawn $\beta_m \geq 0$ times.

For a Gaussian observation likelihood,\footnote{
    Derivation of closed-form, coreset-based posteriors for non-Gaussian likelihoods is part of future investigations.
}
we derive in Appendix Section~\ref{assec:cvtgp_qf_analytical}
the closed-form multivariate Gaussian distribution 
of CVGP's posterior over $\gp$ function variables $\fbC$,
given the coreset triplet $\{\XbC, \ybC, \betabC\}$:
\begin{align}
&\cq{\fbC}{\XbC, \ybC, \betabC} = \N{\fbC | \mb_{\fbC|\ybC}, \Kb_{\fbC|\ybC}},
\label{eq:coreset_gp_posterior_fC_analytical}
\\
& \quad \mb_{\fbC|\ybC} = \KbCC(\KbCC + \Sigmab_{\betabC})^{-1} \ybC , \nonumber \\
& \quad \Kb_{\fbC|\ybC} = \KbCC - \KbCC(\KbCC + \Sigmab_{\betabC})^{-1}\KbCC , \nonumber
\end{align}
where $\Sigmab_{\betabC}= \sigma^2 \cdot \diag{\betabC^{-1}}$.

With this coreset-based distribution over coreset $\gp$ values $q(\fbC)$,\footnote{
The interested reader can find the complementary weight-space derivations in Appendix Section~\ref{assec:cvtgp_weightspace}.
}
we now accommodate the $\gp$ prior's conditional dependency,
$\q{\fb, \fbC} = \cp{\fb}{\fbC} \q{\fbC}$, 
where 
\begin{align}
    \cp{\fb}{\fbC} &= \N{\fb | \mb_{\fb|\fbC}, \Kb_{\fb|\fbC}} \;, \text{ with}\label{eq:gp_cond_coreset}\\
    \mb_{\fb|\fbC} &= \KbXC\KbCC^{-1} \fbC , \nonumber \\
    \Kb_{\fb|\fbC} &= \KbXX  - \KbXC\KbCC^{-1} \KbCX , \nonumber
\end{align}
and compute the variational posterior of interest,
$\cq{\fb}{\XbC, \ybC, \betabC}$ over $\gp$ function values $\fb=f(\Xb)$,
by marginalizing the coreset variables $\fbC$,
distributed according to the tempered posterior of Equation~\eqref{eq:coreset_gp_posterior_fC_analytical},
out of the joint distribution $\q{\fb, \fbC}$. The resulting CVGP coreset-based variational posterior is
\begin{align}
    &\cq{\fb}{\XbC, \ybC, \betabC} = \N{\fb | \mb_{\fb|\ybC}, \Kb_{\fb|\ybC}}\;, %\text{ with}
    \label{eq:coreset_gp_posterior_f_analytical} \\
    &  \mb_{\fb|\ybC} = \KbXC\left( \KbCC + \Sigmab_{\betabC} \right)^{-1} \ybC \;, \nonumber\\
    &\Kb_{\fb|\ybC} = \KbXX  - \KbXC \left(\KbCC + \Sigmab_{\betabC}\right)^{-1} \KbCX . \nonumber 
\end{align}

If we were to follow standard Bayesian coreset procedures,
we would directly aim to learn the coreset
for which $\cq{\fb}{\XbC, \ybC, \betabC}$ best approximates the true posterior%
---which requires computing the $\gp$ posterior in Equation~\eqref{eq:gp_posterior}, of $\bigO{N^3}$ complexity \citep{manousakas2022black}.
%
Instead, we learn the coreset triplet $\{\XbC, \ybC, \betabC\}$
using a variational objective that aims to minimize the divergence between these two distributions at reduced computational cost, and in a form amenable to stochastic minimization.
