% !TEX root = ../main.tex

For completeness and a complementary perspective, we derive CVGP inference from the weight-space view of $\gp$s,
again under the assumption of standard Gaussian, uncorrelated observation noise,
\ie $\yb = \fb + \epsilon\;, \epsilon \sim \N{\epsilon \mid \zerob, \sigma^2 \Ib_N}$.

Recall the weight-space definition of $\gp$s~\citep{rasmussen2006gaussian}:
\begin{align}
       y_i \mid \wb, \xb_i \sim \N{y_i \mid \map{\xb_i}^\top\wb, \sigma^{2}} \; \text{ with } \wb \sim \N{\wb \mid \zerob, \Ib_D} \;,
\end{align}
where $\map{\cdot}: \xspace \rightarrow \hilbert$ is a feature map with associated kernel $k(\cdot,\cdot): \xspace \times \xspace \rightarrow \RR$ and Hilbert space $\hilbert$.
Namely, a $\gp$ can be viewed as a Bayesian linear regression where the covariates $\xspace$ are embedded into a potentially infinite dimensional Hilbert space $\hilbert$.
An advantage of the weight-space view is that it allows for conditional independence between different data points $\xb$ given $\wb$, which is the key property we leverage in the following derivations.

% Weight-space posterior coreset
\subsubsection{CVGP's coreset-based tempered-posterior}
\label{assec:cvtgp_qw_analytical}

We aim for a small subset of pseudo-points $\{\XbC, \ybC\}$ that, 
if drawn $\beta_m \geq 0$ times, approximate the true weight posterior:
\begin{align}
    \underbrace{\frac{\p{\wb} \prod_{m=1}^M  \cp{y_c}{\wb, \xb_m}^{\beta_m}}{Z_q} }_{\cq{\wb}{\ybC, \XbC,\betabC}} \approx \underbrace{\frac{\p{\wb} \prod_{i=1}^N  \cp{y_i}{\wb, \xb_i}}{Z_p}}_{\cp{\wb}{ \yb, \Xb}} \;.
\end{align}

Typically, the objective of the coreset problem is to learn the weight vector $\beta^\star = \argmin_{\beta} \Dist{q(\wb), p(\wb)}$,
where $\Dist{.}$ is a discrepancy measure between distributions, such as the KL divergence~\citep{campbell2019sparse}.

We derive the coreset-based tempered posterior over $\gp$ weights $\cq{\wb}{\XbC, \ybC, \betabC}$, by noting it is proportional to 
 \begin{align}
    & \expp{-\frac{1}{2} \wb^\top \wb} \prod_{m=1}^M \expp{ -\frac{1}{2} \sigma^{-2} \beta_m  \left(y_c - \map{\xb_m}^\top \wb \right)^2}\\
    &\propto \expp{ -\frac{1}{2} \left( - 2 \sigma^{-2}  \wb^\top \sum_{m=1}^M y_c \beta_m \map{\xb_m}  +   \wb^\top \underbrace{\left(\sigma^{-2} \sum_{m=1}^M \map{\xb_m}\beta_m  \map{\xb_m}^\top + \Ib_D\right)}_{\Sb_{\wb|\ybC}^{-1}}\wb\right)}, 
    \label{eq:coreset_posterior_w_propto}
\end{align}

which is a Gaussian distribution with covariance matrix: 
\begin{align}
    \Sb_{\wb|\ybC} &= \left(\sigma^{-2} \sum_{m=1}^M \map{\xb_m}\beta_m  \map{\xb_m}^\top + \Ib_D\right)^{-1} = \left(\map{\XbC}^\top \Sigma_{\betabC}^{-1}  \map{\XbC} + \Ib_D\right)^{-1} \;,
\end{align}
with $\Sigma_{\betabC} = \sigma^2 \cdot \diag{\betabC^{-1}}$.
%and we can ensure positivity of $\betabC$ using the softplus function.
Let us define $\Sigma_{\betabC}^{-1} = C^{1/2}C^{1/2}$, 
with $\map{\XbC}^\top C^{1/2} = {\map{\XbC}^\prime}^\top$ and $C^{1/2} \map{\XbC} = {\map{\XbC}^\prime}$.
Then we can write the covariance matrix as
\begin{align}
    \Sb_{\wb|\ybC} &= \left( {\map{\XbC}^\prime}^\top{\map{\XbC}^\prime}+ \Ib_D\right)^{-1}\\
    &= \Ib_D - {\map{\XbC}^\prime}^\top\left( \Ib_M + {\map{\XbC}^\prime}{\map{\XbC}^\prime}^\top\right)^{-1}{\map{\XbC}^\prime}\\
    &=\Ib_D - \map{\XbC}^\top\left( \Sigma_{\betabC} + \KbCC\right)^{-1} \map{\XbC} \;.
\end{align}

We revisit Equation~\eqref{eq:coreset_posterior_w_propto} to identify the mean of the Gaussian distribution as follows,
where we use ${\ybC}^\prime=C^{1/2} \ybC$:
\begin{align}
    \mb_{\wb|\ybC} &= \Sb_{\wb|\ybC} \left( \sigma^{-2} \sum_{m=1}^M y_c \beta_m \map{\xb_m} \right)\\
    &= \Sb_{\wb|\ybC} \left( \map{\XbC}^\top \Sigma_{\betabC}^{-1} \ybC\right)\\
    &= \Sb_{\wb|\ybC} \left( \map{\XbC}^\top C^{1/2}C^{1/2} \ybC\right)\\
    &= \Sb_{\wb|\ybC} \left( {\map{\XbC}^\prime}^\top {\ybC}^\prime\right) \\
    &= \left(\Ib_D - {\map{\XbC}^\prime}^\top\left( \Ib_M + {\map{\XbC}^\prime}{\map{\XbC}^\prime}^\top\right)^{-1}{\map{\XbC}^\prime}\right)\left( {\map{\XbC}^\prime}^\top {\ybC}^\prime\right)\\
    &= \left({\map{\XbC}^\prime}^\top - {\map{\XbC}^\prime}^\top\left( \Ib_M + {\map{\XbC}^\prime}{\map{\XbC}^\prime}^\top\right)^{-1}{\map{\XbC}^\prime} {\map{\XbC}^\prime}^\top \right) {\ybC}^\prime\\
    &= \left({\map{\XbC}^\prime}^\top - {\map{\XbC}^\prime}^\top\left( \left({\map{\XbC}^\prime} {\map{\XbC}^\prime}^\top \right)^{-1} + \Ib_M\right)^{-1}\right) {\ybC}^\prime\\
    &= {\map{\XbC}^\prime}^\top\left(\Ib_M - \left( \left({\map{\XbC}^\prime} {\map{\XbC}^\prime}^\top \right)^{-1} + \Ib_M\right)^{-1}\right) {\ybC}^\prime\\
    &= {\map{\XbC}^\prime}^\top\left(\Ib_M - {\map{\XbC}^\prime} {\map{\XbC}^\prime}^\top \left(\underbrace{\Ib_M + {\map{\XbC}^\prime}{\map{\XbC}^\prime}^\top}_{D}\right)^{-1}\right) {\ybC}^\prime\\
    &={\map{\XbC}^\prime}^\top\left(DD^{-1} - {\map{\XbC}^\prime} {\map{\XbC}^\prime}^\top D^{-1}\right) {\ybC}^\prime\\
    &= {\map{\XbC}^\prime}^\top\left(\bcancel{D} - \bcancel{{\map{\XbC}^\prime} {\map{\XbC}^\prime}^\top} \right) D^{-1} {\ybC}^\prime\\
    &=  {\map{\XbC}^\prime}^\top D^{-1} {\ybC}^\prime\\
    &= \map{\XbC}^\top C^{1/2}\left( \Ib_M + C^{1/2}  \map{\XbC} \map{\XbC}^\top C^{1/2}\right)^{-1} C^{1/2} \ybC\\
    &= \map{\XbC}^\top\left( \Sigma_{\betabC} +  \KbCC \right)^{-1}\ybC \;.
\end{align}

All in all, we have

\begin{align}
q(\wb|\XbC, \ybC, \betabC) &= \N{\wb | \mb_{\wb|\ybC}, \Sb_{\wb|\ybC}} \;, \label{eq:coreset_posterior_wc} 
        \text{ with} \begin{cases}
		\mb_{\wb|\ybC} = \map{\XbC}^\top\left( \Sigma_{\betabC} +  \KbCC \right)^{-1}\ybC \\
		\Sb_{\wb|\ybC} = \Ib_D - \map{\XbC}^\top\left( \Sigma_{\betabC} + \KbCC\right)^{-1} \map{\XbC} .
	\end{cases}
\end{align}

\subsubsection{CVGP's Weight-space Variational Lower-bound}
\label{assec:cvtgp_qw_lowerbound}

We write the variational lower-bound of the log-marginal likelihood as:
\begin{align}
     \log\cp{\yb}{\Xb} &= \int \q{\wb} \log \left\{ \frac{\cp{\yb, \wb}{\Xb}\q{\wb}}{\cp{\wb}{\Xb, \yb}\q{\wb}}  \right\}\diff{\wb}\\
    &= \Ex{\q{\wb}}{\log \cp{\yb}{\wb, \Xb}} - \kl{\q{\wb}}{\p{\wb}}  + \kl{\q{\wb}}{\cp{\wb}{\Xb, \yb}}\\
    &\geq \underbrace{\Ex{\q{\wb}}{\log \cp{\yb}{\wb, \Xb}} - \kl{\q{\wb}}{\p{\wb}}}_{\mathcal{L}_{CVGP}} \; ,
\end{align}

which is the lower-bound of the weight-space view of CVGP,
where we set $\q{\wb} = \cp{\wb}{\XbC, \ybC, \betabC}$ and derive
\begin{align}
    \Loss_{CVGP} &= \Ex{\cp{\wb}{\XbC, \ybC, \betabC}}{\log \cp{\yb}{\wb, \Xb}} - \kl{\cp{\wb}{\XbC, \ybC, \betabC}}{\p{\wb}}\\
    &= \Ex{\cp{\wb}{\XbC, \ybC, \betabC}}{\sum_{i=1}^N\log \cp{y_i}{\wb, \xb_i}} - \kl{\cp{\wb}{\XbC, \ybC, \betabC}}{\p{\wb}}\\
    &= \sum_{i=1}^N\underbrace{\Ex{\cp{\wb}{\XbC, \ybC, \betabC}}{\log \cp{y_i}{\wb, \xb_i}}}_{\ell_i} - \kl{\cp{\wb}{\XbC, \ybC, \betabC}}{\p{\wb}} .
\end{align}

We compute below the analytical expressions for $\ell_i$ and $\kl{\cp{\wb}{\XbC, \ybC, \betabC}}{\p{\wb}}$.

We start with $\ell_i$:
\begin{align}
    &\ell_i = \int  \cp{\wb}{\XbC, \ybC, \betabC}  \log\left[\frac{1}{\sqrt{2 \pi\sigma^2}} \expp{ - \frac{1}{2\sigma^2}\left(y_i - \map{\xb_i}^\top\wb\right)^2 }\right]  \diff{\wb}\\
    &= -\frac{1}{2} \left( \log 2 \pi\sigma^2  + \sigma^{-2} \int  \cp{\wb}{\XbC, \ybC, \betabC}  \left( y_i^2 - 2 y_i \map{\xb_i}^\top \wb  + \map{\xb_i}^\top \wb\wb^\top \map{\xb_i}\right) \diff{\wb} \right) \;.
\end{align}

We need to compute $\int\wb\cp{\wb}{\XbC, \ybC, \betabC}\diff{\wb}$ and $\int\wb\wb^\top\cp{\wb}{\XbC, \ybC, \betabC}\diff{\wb}$.
Note that the former is $\mb_{\wb|\ybC}$
and the latter is $\Sb_{\wb|\ybC} + \mb_{\wb|\ybC}\mb_{\wb|\ybC}^\top$.
Hence,
\begin{align}
    \ell_i =-\frac{1}{2} \left( \log 2 \pi\sigma^2  + \sigma^{-2}\left( y_i^2 - 2 y_i \map{\xb_i}^\top \mb_{\wb|\ybC}  + \map{\xb_i}^\top (\Sb_{\wb|\ybC} + \mb_{\wb|\ybC} \mb_{\wb|\ybC}^\top) \map{\xb_i}\right)\right) \;.
\end{align}

Let us now define
\begin{align}
    m_{f_i|\ybC} &= \map{\xb_i}^\top \mb_{\wb|\ybC}\\
    &= \map{\xb_i}^\top \map{\XbC}^\top\left( \Sigma_{\betabC} +  \KbCC \right)^{-1}\ybC\\
    &= \kb_{iM}\left( \Sigma_{\betabC} +  \KbCC \right)^{-1}\ybC \;,
\end{align}

and

\begin{align}
    k_{f_i|\ybC} &= \map{\xb_i}^\top \Sb_{\wb|\ybC} \map{\xb_i}\\
    &= k_{ii} - \kb_{iM}\left( \Sigma_{\betabC} + \KbCC\right)^{-1} \kb_{\XbC, \xb_i} \;,
\end{align}
where the above relate to the $\gp$ function values via transformation of the weights by the feature vectors, \ie $f_i=f(\xb_i)=\map{\xb_i}^\top \wb$.
Notice how the above expressions match those in Equations~\eqref{eq:coreset_posterior_f_mean} and~\eqref{eq:coreset_posterior_f_cov}.
We can therefore write
\begin{align}
    \ell_i &=-\frac{1}{2} \left( \log 2 \pi\sigma^2  + \sigma^{-2}\left( y_i^2 - 2 y_i m_{f_i|\ybC}  + k_{f_i|\ybC} + {m_{f_i|\ybC}}^2 \right)\right)\\
    &= \log \left[ \N{y_i \mid m_{f_i|\ybC}, \sigma^2}\exp\left\{- \frac{1}{2}\sigma^{-2} k_{f_i|\ybC} \right\} \right] \;.
\end{align}

We continue with the KL divergence term,
recalling $\p{\wb}=\N{\wb \mid \zerob, \Ib_D}$,
and write
\begin{align}
    \kl{\cp{\wb}{\XbC, \ybC, \betabC}}{\p{\wb}} &= \frac{1}{2} \left( \mb_{\wb|\ybC}^\top \Ib_D^{-1} \mb_{\wb|\ybC} + \tr {\Ib_D^{-1} \Sb_{\wb|\ybC} } +  \log \left| \Ib_D \right| - \log \left| \Sb_{\wb|\ybC} \right|  - \tr{ \Ib_D } \right)\\
    &= \frac{1}{2} \left( \mb_{\wb|\ybC}^\top \mb_{\wb|\ybC} + \tr { \Sb_{\wb|\ybC} } - \log \left| \Sb_{\wb|\ybC} \right|  - \tr { \Ib_D } \right) \;.
\end{align}

We first compute

\begin{align}
    \mb_{\wb|\ybC}^\top \mb_{\wb|\ybC} &=  \left(\map{\XbC}^\top\left( \Sigma_{\betabC} +  \KbCC \right)^{-1}\ybC\right)^\top \left(\map{\XbC}^\top\left( \Sigma_{\betabC} +  \KbCC \right)^{-1}\ybC\right)\\
    &= \ybC^\top \left( \Sigma_{\betabC} +  \KbCC \right)^{-1} \map{\XbC} \map{\XbC}^\top\left( \Sigma_{\betabC} +  \KbCC \right)^{-1}\ybC\\
    &= \ybC^\top \left( \Sigma_{\betabC} +  \KbCC \right)^{-1} \KbCC\left( \Sigma_{\betabC} +  \KbCC \right)^{-1}\ybC \;,
\end{align}

then,

\begin{align}
     &\tr { \Sb_{\wb|\ybC} } = \tr {\Ib_D - \map{\XbC}^\top\left( \Sigma_{\betabC} + \KbCC\right)^{-1} \map{\XbC}}\\
    &= \tr {\Ib_D } -\tr {\map{\XbC}^\top\left( \Sigma_{\betabC} + \KbCC\right)^{-1} \map{\XbC}}\\
    &=  \tr {\Ib_D } -\tr {\left( \Sigma_{\betabC} + \KbCC\right)^{-1} \map{\XbC}\map{\XbC}^\top}\\
    &= \tr {\Ib_D } -\tr {\left( \Sigma_{\betabC} + \KbCC\right)^{-1} \KbCC} \;,
\end{align}

and finally,

\begin{align}
    \log \mid \Sb_{\wb|\ybC} \mid &= \log \left|  \left( {\map{\XbC}^\prime}^\top{\map{\XbC}^\prime}+ \Ib_D\right)^{-1} \right| \\
    &= -\log \left|  \left( {\map{\XbC}^\prime}^\top{\map{\XbC}^\prime}+ \Ib_D\right)\right|\\
    &= -\log \left|  \left( {\map{\XbC}}^\top C{\map{\XbC}}+ \Ib_D\right)\right|\\
    &= -\log \left( \left|\Sigma_{\betabC} + \map{\XbC}{\map{\XbC}}^\top \right| \left| \Sigma_{\betabC}^{-1} \right| \bcancel{\left| \Ib_D \right|} \right)\\
    & \quad\text{ using the \href{https://en.wikipedia.org/wiki/Matrix_determinant_lemma}{matrix determinant lemma} } \\
    &= - \log \left|\Sigma_{\betabC} + \KbCC \right| - \log \left| \Sigma_{\betabC}^{-1} \right| \;.
\end{align}

We put it all together for the analytical, weight-space variational lower-bound of CVGP,

\begin{align}
    \Loss_{CVGP} &=\sum_{i=1}^N \log \left[ \N{y_i \mid m_{f_i|\ybC}, \sigma^2}\exp\left\{- \frac{1}{2}\sigma^{-2} k_{f_i|\ybC} \right\}\right] \nonumber \\
    & \qquad -\frac{1}{2} \left( \ybC^\top \left( \Sigma_{\betabC} +  \KbCC \right)^{-1} \KbCC\left( \Sigma_{\betabC} +  \KbCC \right)^{-1}\ybC \right. \nonumber  \\
    & \qquad \qquad + \bcancel{\tr {\Ib_D }} -\tr {\left( \Sigma_{\betabC} + \KbCC\right)^{-1} \KbCC} \nonumber \\
    & \qquad \qquad \left. + \log \left|\Sigma_{\betabC} + \KbCC \right| + \log \left| \Sigma_{\betabC}^{-1} \right| - \bcancel{\tr{\Ib_D}} \right) \\
    % Newline
    &=\sum_{i=1}^N \left(\log \N{y_i \mid m_{f_i|\ybC}, \sigma^2} - \frac{1}{2}\sigma^{-2} k_{f_i|\ybC} \right) \nonumber \\
    & \qquad -\frac{1}{2} \left( -\tr {\left( \KbCC + \Sigma_{\betabC} \right)^{-1} \KbCC} \right. \nonumber  \\
    & \qquad \qquad +\ybC^\top \left( \Sigma_{\betabC} +  \KbCC \right)^{-1} \KbCC\left( \Sigma_{\betabC} +  \KbCC \right)^{-1}\ybC \nonumber \\
    & \qquad \qquad \left. + \log \left|\KbCC + \Sigma_{\betabC} \right| - \log \left| \Sigma_{\betabC} \right| \right)\\
    &=  \log \N{\yb \mid \mb_{\fb|\ybC}, \sigma^2 \Ib_N} - \frac{1}{2 \sigma^2}\tr{\Kb_{\fb|\ybC} } \nonumber\\
    & - \frac {1}{2} \left[ - \tr{\Ab \KbCC}  + \ybC^\top
        \Ab \KbCC \Ab \ybC  - \log \left|\Ab\right| - \log \left|\Sigma_{\betabC} \right|  \right]\;,
\end{align}
where $\Ab = \left(\Sigma_{\betabC} + \KbCC \right)^{-1}$ and
we have combined
($a$) the sum over $N$ scalar likelihoods
into a single, multivariate Gaussian with mean $\mb_{\fb|\ybC}$
(composed of $m_{f_i|\ybC}, \forall i$) and isotropic covariance $\sigma^2 \Ib_N$; and
($b$) all $k_{f_i|\ybC}$ terms into a diagonal matrix
${\Kb_{\fb|\ybC}}_{ii} = k_{f_i|\ybC}$.
