% !TEX root = ../main.tex

We derive below,
under the assumption of zero-mean, uncorrelated Gaussian observation noise,
\ie $\yb = \fb + \epsilon\;, \epsilon \sim \N{\epsilon\mid \zerob, \sigma^2 \Ib_N}$,
the coreset-based tempered posterior over $\gp$ coreset function values $q(\fbC)$.
The derivations are equivalent for non-zero mean and/or correlated noise functions.

% Function-space posterior coreset
\subsubsection{CVGP's Coreset-based Posterior}
\label{assec:cvtgp_qf_analytical}

To be able to accurately approximate the full $\gp$ posterior with a coreset $\{\XbC, \ybC \}$,
we propose to weight with $\beta_m \geq 0$\footnote{%
In practice, we ensure positive $\beta_m$ using the $\mathrm{softplus}(\cdot)$ function.
}
the likelihood of each pseudo-point when computing the corresponding coreset $\gp$ values $\fbC$, \ie

\begin{align}
	\cq{\fbC}{\XbC, \ybC, \betabC} &= \frac {\cq{\ybC}{\fbC, \betabC} \cp{\fbC}{\XbC}}{\cq{\ybC}{\XbC, \betabC} } 
		= \frac {\left(\prod_{m=1}^M \cp{y_{m}}{f_{m}}^{\beta_{m}} \right) \cp{\fbC}{\XbC}}{\cq{\ybC }{\XbC, \betabC} } \; .
\end{align}

We start by deriving a closed form expression for the $\betabC$-weighted likelihood function $q(\ybC | \fbC, \betabC)$,
by considering each coreset pair $\{\xb_{m},y_{m}\}$, for $m = 1, \dots, M$, independently:

\begin{align}
	\cq{y_m}{f_m, \beta_m} &= \cp{y_{m}}{f_{m}}^{\beta_{m}} = \N{y_{m} \mid f_{m}, \sigma^2}^{\beta_{m}} \\
	&= \left(\frac{1}{\sqrt{2 \pi \sigma^2}} e^{- \frac{1}{2} (y_{m} - f_{m}) \sigma^{-2} (y_{m} - f_{m})}\right)^{\beta_{m}} \\
	&= \left(\frac{1}{\sqrt{2 \pi \sigma^2}}\right)^{\beta_{m}} e^{- \frac{1}{2} (y_{m} - f_{m}) (\beta_{m}^{-1}\sigma^2)^{-1} (y_{m} - f_{m})} \\
	&= \left(\frac{1}{\sqrt{2 \pi \sigma^2}}\right)^{\beta_{m}}
	\left(\frac{\sqrt{2 \pi \beta_{m}^{-1}\sigma^2}}{\sqrt{2 \pi \beta_{m}^{-1}\sigma^2}}\right) \exp\left\{- \frac{1}{2} (y_{m} - f_{m}) (\beta_{m}^{-1}\sigma^2)^{-1} (y_{m} - f_{m})\right\} \\
        &= \frac{\sqrt{2 \pi \beta_{m}^{-1}\sigma^2}}{\left(\sqrt{2 \pi \sigma^2}\right)^{\beta_{m}}}
	\left(\frac{1}{\sqrt{2 \pi \beta_{m}^{-1}\sigma^2}}\right) \exp\left\{- \frac{1}{2} (y_{m} - f_{m}) (\beta_{m}^{-1}\sigma^2)^{-1} (y_{m} - f_{m})\right\} \\
        &= \frac{\sqrt{2 \pi \beta_{m}^{-1}\sigma^2}}{\left(\sqrt{2 \pi \sigma^2}\right)^{\beta_{m}}} \cdot \N{y_m|f_m, \beta_{m}^{-1}\sigma^2} \\
        &= Q_m \cdot \N{y_m|f_m, \beta_{m}^{-1}\sigma^2} \;, 
            \text{ with } Q_m = \frac{\sqrt{2 \pi \beta_{m}^{-1}\sigma^2}}{\left(\sqrt{2 \pi \sigma^2}\right)^{\beta_{m}}} \;.
\end{align}

We write the joint over the full coreset pseudo-observations as a product over each likelihood term:
\begin{align}
	&q(\ybC | \fbC, \betabC) = \prod_{m=1}^M p(y_{m} | f_{m})^{\beta_{m}} \\
	& \qquad = \prod_{m=1}^M\left(\frac{1}{\sqrt{2 \pi \sigma^2}}\right)^{\beta_{m}}
	\left(\frac{\sqrt{2 \pi \beta_{m}^{-1}\sigma^2}}{\sqrt{2 \pi \beta_{m}^{-1}\sigma^2}}\right) \exp\left\{- \frac{1}{2} (y_{m} - f_{m}) (\beta_{m}^{-1}\sigma^2)^{-1} (y_{m} - f_{m})\right\} \\
        & \qquad = \prod_{m=1}^M Q_m \cdot \N{y_m|f_m, \beta_{m}^{-1}\sigma^2} \\
        & \qquad = Q_M \cdot \N{\ybC \mid \fbC, \Sigma_{\betabC}} \;, \text{ with }
        \begin{cases}
            Q_M = \prod_{m=1}^M\frac{\sqrt{2 \pi \beta_{m}^{-1}\sigma^2}}{\left(\sqrt{2 \pi \sigma^2}\right)^{\beta_{m}}}  \\
            \Sigma_{\betabC} = \sigma^2 \cdot \diag{\betabC^{-1}} .
        \end{cases} 
        \label{eq:coreset_likelihood}
\end{align}

We derive the marginalized pseudo-observation coreset distribution
\begin{align}
	q(\ybC|\XbC, \betabC) &= \int_{\fbC} q(\ybC, \fbC | \XbC, \betabC) \diff{\fbC} = \int_{\fbC} q(\ybC | \fbC, \betabC) p(\fbC|\XbC) \diff{\fbC} \\
        & = \int_{\fbC} Q_M \cdot \N{\ybC \mid \fbC, \Sigma_{\betabC}} \cdot \N{\fbC \mid \zerob, \KbCC}  \diff{\fbC} \\
        & = Q_M \int_{\fbC} \N{\ybC \mid \fbC, \Sigma_{\betabC}} \cdot \N{\fbC \mid \zerob, \KbCC}  \diff{\fbC} \\
        & = Q_M \cdot \N{\ybC \mid \zerob, \KbCC + \Sigma_{\betabC}} \;.
	\label{eq:coreset_marginal_likelihood}
\end{align}

We leverage the above distributions to derive the coreset-based, tempered $\gp$ posterior
\begin{align}
	& q(\fbC|\XbC, \ybC, \betabC) = \frac {q(\ybC | \fbC, \betabC) p(\fbC|\XbC)}{q(\ybC | \XbC, \betabC) } 	\\
	& \qquad = \frac{
		Q_M \cdot \N{\ybC \mid \fbC, \Sigma_{\betabC}} 
		 \N{\fbC \mid \zerob, \KbCC}
	}{
		Q_M \cdot \N{\ybC \mid \zerob, \KbCC + \Sigma_{\betabC}} 
	} \\
        & \qquad = \frac{
		\N{\ybC \mid \fbC, \Sigma_{\betabC}} 
		 \N{\fbC \mid \zerob, \KbCC}
	}{
		\N{\ybC \mid \zerob, \KbCC + \Sigma_{\betabC}} 
	} \\
        &\qquad = \N{\fbC \mid \mb_{\fbC|\ybC}, \Kb_{\fbC|\ybC}} \;, \text{ with} \begin{cases}
		\mb_{\fbC|\ybC} = \Kb_{\fbC|\ybC} \left(
		\Sigma_{\betabC}^{-1} \ybC 
		\right) \\
		\Kb_{\fbC|\ybC} = \left( 
		\KbCC^{-1} + \Sigma_{\betabC}^{-1}
		\right)^{-1}.
	\end{cases}
	\label{eq:coreset_posterior_fc}
\end{align}

The sufficient statistics of the coreset-based, tempered distribution above can be rewritten as
\begin{align}
    q(\fbC) = q(\fbC|\XbC, \ybC, \betabC) &= \N{\fbC \mid \mb_{\fbC|\ybC}, \Kb_{\fbC|\ybC}} \;, \\
    &\text{ with}
     \begin{cases}
		\mb_{\fbC|\ybC} = \KbCC \left( \KbCC + \Sigma_{\betabC} \right)^{-1} \ybC \\
		\Kb_{\fbC|\ybC} = \KbCC - \KbCC \left( \KbCC + \Sigma_{\betabC} \right)^{-1} \KbCC  \\
                \qquad \qquad \text{by \href{https://en.wikipedia.org/wiki/Woodbury_matrix_identity}{Woodbury matrix identity}}  \; .
	\end{cases} 
\end{align}

\paragraph{The coreset-based posterior over $\gp$ function values.}
We now compute the posterior over $\gp$ values for any given data point $\Xb$,
by marginalizing the $\gp$'s prior-conditional over the coreset-based distribution, \ie

\begin{align}
    q(\fb|\XbC, \ybC, \betabC) &= \int_{\fbC} p(\fb|\fbC) q(\fbC | \XbC, \ybC, \betabC) \diff{\fbC} \;.
\end{align}

The above is analytically solvable due to all the distributions being Gaussian:
\begin{align}
	q(\fbC|\XbC, \ybC, \betabC) &= \N{\fbC \mid \mb_{\fbC|\ybC}, \Kb_{\fbC|\ybC}} \;, \\
        & \text{ with} \begin{cases}
		\mb_{\fbC|\ybC} = \Kb_{\fbC|\ybC} \left( \Sigma_{\betabC}^{-1} \ybC \right) \\
		\Kb_{\fbC|\ybC} = \KbCC - \KbCC \left(\KbCC + \Sigma_{\betabC}\right)^{-1} \KbCC,
	\end{cases}
	\\
	p(\fb|\fbC) &= \N{\fb \mid \mb_{\fb|\fbC}, \Kb_{\fb|\fbC}} \;, \\
        & \text{ with} \begin{cases}
		\mb_{\fb|\fbC} = \KbXC\KbCC^{-1} \fbC \\
		\Kb_{\fb|\fbC} = \KbXX - \KbXC\KbCC^{-1} \KbCX,
	\end{cases}
	\\
	q(\fb) = q(\fb|\XbC, \ybC, \betabC) &= \N{\fb \mid \mb_{\fb|\ybC}, \Kb_{\fb|\ybC}} \;, \\
        & \text{ with} \begin{cases}
		\mb_{\fb|\ybC} = \KbXC\KbCC^{-1} \mb_{\fbC|\ybC} \\
		\Kb_{\fb|\ybC} = \Kb_{\fb|\fbC} + \KbXC\KbCC^{-1} \Kb_{\fbC|\ybC} \KbCC^{-1} \KbCX.
	\end{cases}
    \label{eq:coreset_posterior_f}
\end{align}

We elaborate on the sufficient statistics of $q(\fb|\XbC, \ybC, \betabC)$.

First, we rewrite the expected value as
\begin{align}
	\mb_{\fb|\ybC} &= \KbXC\KbCC^{-1} \mb_{\fbC|\ybC} \\
	&= \KbXC\KbCC^{-1} \Kb_{\fbC|\ybC} \Sigma_{\betabC}^{-1} \ybC \\
	&= \KbXC\KbCC^{-1} \left( \KbCC^{-1} + \Sigma_{\betabC}^{-1} \right)^{-1} \Sigma_{\betabC}^{-1} \ybC \\
		& \qquad \text{ by using equivalence in Equation~\eqref{eq:equiv_1}} \nonumber \\
	&= \KbXC\left( \KbCC + \Sigma_{\betabC} \right)^{-1} \ybC  \; ,
    \label{eq:coreset_posterior_f_mean}
\end{align} 

where we have made use of the following equivalences,
\begin{align}
    % Equiv1
    & \KbCC^{-1} \left( \KbCC^{-1} + \Sigma_{\betabC}^{-1} \right)^{-1} \Sigma_{\betabC}^{-1} = \KbCC^{-1} \left( \Sigma_{\betabC} \KbCC^{-1} + \Ib_M \right)^{-1} 
            = \left( \Sigma_{\betabC} + \KbCC \right)^{-1}
        \label{eq:equiv_1} \;, \\
    % Equiv2
    & \Sigma_{\betabC}^{-1} \left( \KbCC^{-1} + \Sigma_{\betabC}^{-1} \right)^{-1} \KbCC^{-1} = \Sigma_{\betabC}^{-1} \left( \Ib_M + \KbCC\Sigma_{\betabC}^{-1}  \right)^{-1} 
	= \left( \Sigma_{\betabC} + \KbCC \right)^{-1}
		\label{eq:equiv_2} \;.
\end{align}
Second, for the covariance matrix, we write
\begin{align}
	\Kb_{\fb|\ybC} &= \Kb_{\fb|\fbC} + \KbXC\KbCC^{-1} \Kb_{\fbC|\ybC} \KbCC^{-1} \KbCX\\
	&= \KbXX - \KbXC\KbCC^{-1} \KbCX + \KbXC\KbCC^{-1} \left( \KbCC^{-1} + \Sigma_{\betabC}^{-1} \right)^{-1} \KbCC^{-1} \KbCX\\
	& \qquad \qquad \text{ by using the \href{https://en.wikipedia.org/wiki/Woodbury_matrix_identity}{Woodbury matrix identity} for } \left( \KbCC^{-1} + \Sigma_{\betabC}^{-1} \right)^{-1} \nonumber \\
	&= \KbXX - \KbXC\KbCC^{-1} \KbCX + \KbXC\KbCC^{-1} \left( \KbCC - \KbCC \left(\KbCC + \Sigma_{\betabC}\right)^{-1} \KbCC  \right) \KbCC^{-1} \KbCX\\
	&= \KbXX - \KbXC\KbCC^{-1} \KbCX + \KbXC\left( \Ib_M - \left(\KbCC + \Sigma_{\betabC}\right)^{-1} \KbCC  \right) \KbCC^{-1} \KbCX\\	
	&= \KbXX - \KbXC\KbCC^{-1} \KbCX + \KbXC\KbCC^{-1} \KbCX - \KbXC \left(\KbCC + \Sigma_{\betabC}\right)^{-1} \KbCC \KbCC^{-1} \KbCX\\
	&= \KbXX - \KbXC \left(\KbCC + \Sigma_{\betabC}\right)^{-1} \KbCX  \; .
    \label{eq:coreset_posterior_f_cov}
\end{align}

\clearpage
\subsubsection{CVGP's Variational Lower-bound}
\label{assec:cvtgp_qf_lowerbound}
We derive the variational lower-bound by writing everything in terms of sufficient statistics of $\q{\fbC}$:
\begin{align}
	\Loss_{CVGP} & = \eValue{q(\fb)}{\log p(\yb| \fb) }  - \kl{q(\fbC)}{p(\fbC)}  \\
	&= \log \N{\yb | \mb_{\fb|\ybC}, \sigma^2 \Ib_N} - \frac{1}{2 \sigma^{2}} \tr{\Kb_{\fb|\ybC} } \nonumber \\
	& \qquad - \frac{1}{2} \left(
		\tr{\KbCC^{-1}\Kb_{\fbC|\ybC}} - M + \mb_{\fbC|\ybC}^\top \KbCC^{-1} \mb_{\fbC|\ybC} + \log \frac{\left|\KbCC\right|}{\left|\Kb_{\fbC|\ybC}\right|} 
	 	\right) \\
 	% Replace most generic expression
	&= \log \N{\yb | \KbXC \KbCC^{-1} \mb_{\fbC|\ybC} , \sigma^2 \Ib_N} \nonumber \\
	& \qquad - \frac{1}{2 \sigma^{2}} \tr{\KbXX - \KbXC \KbCC^{-1} \KbCX + \KbXC \KbCC^{-1}\Kb_{\fbC|\ybC} \KbCC^{-1} \KbCX} \nonumber \\
	& \qquad - \frac{1}{2} \left(
	\tr{\KbCC^{-1}\Kb_{\fbC|\ybC}} - M + \mb_{\fbC|\ybC}^\top \KbCC^{-1} \mb_{\fbC|\ybC} + \log \frac{\left|\KbCC\right|}{\left|\Kb_{\fbC|\ybC}\right|} 
	\right) \\
	% Detailed Gaussian
	&= \left( -\frac{N}{2} \log (2\pi) - \frac{1}{2} \log\left|\sigma^2 \Ib_N \right| -\frac{1}{2} (\yb-\KbXC \KbCC^{-1} \mb_{\fbC|\ybC})^\top \sigma^{-2} \Ib_N (\yb-\KbXC \KbCC^{-1} \mb_{\fbC|\ybC}) \right) \nonumber \\
	& \qquad - \frac{1}{2 \sigma^{2}} \tr{\KbXX - \KbXC \KbCC^{-1} \KbCX}
		- \frac{1}{2 \sigma^{2}} \tr{\KbXC \KbCC^{-1}\Kb_{\fbC|\ybC} \KbCC^{-1} \KbCX} \nonumber \\
	& \qquad - \frac{1}{2} \left(
	\tr{\KbCC^{-1}\Kb_{\fbC|\ybC}} - M + \mb_{\fbC|\ybC}^\top \KbCC^{-1} \mb_{\fbC|\ybC} + \log \frac{\left|\KbCC\right|}{\left|\Kb_{\fbC|\ybC}\right|} 
	\right) \\
	% Expanded and rearranged 
	&= -\frac{N}{2} \log (2\pi) + \frac{M}{2} - \frac{1}{2} \log\left|\sigma^2 \Ib_N \right| - \frac{1}{2} \log \left|\KbCC\right| \nonumber \\
	& -\frac{1}{2} \yb^\top \sigma^{-2} \yb 
		+ \sigma^{-2} \mb_{\fbC|\ybC}^\top \KbCC^{-1}  \KbCX \yb \nonumber \\
		&-\frac{1}{2} \sigma^{-2} \mb_{\fbC|\ybC}^\top \KbCC^{-1}  \KbCX \KbXC \KbCC^{-1} \mb_{\fbC|\ybC}
		- \frac{1}{2} \mb_{\fbC|\ybC}^\top \KbCC^{-1} \mb_{\fbC|\ybC} \nonumber \\
	& - \frac{1}{2 \sigma^{2}} \tr{\KbXX - \KbXC \KbCC^{-1} \KbCX} \nonumber \\
	& - \frac{1}{2 \sigma^{2}} \tr{ \KbCC^{-1} \KbCX \KbXC \KbCC^{-1}\Kb_{\fbC|\ybC} } 
		- \frac{1}{2} \tr{\KbCC^{-1}\Kb_{\fbC|\ybC}} + \frac{1}{2} \log \left|\Kb_{\fbC|\ybC} \right|\\
	% Expanded and rearranged 
	&= -\frac{N}{2} \log (2\pi) + \frac{M}{2} - \frac{1}{2} \log\left|\sigma^2 \Ib_N\right| - \frac{1}{2} \log \left|\KbCC\right| -\frac{1}{2} \yb^\top \sigma^{-2} \yb  \nonumber \\
	& + \sigma^{-2} \mb_{\fbC|\ybC}^\top \KbCC^{-1}  \KbCX \yb 
	-\frac{1}{2} \mb_{\fbC|\ybC}^\top \left(  \KbCC^{-1} + \sigma^{-2}  \KbCC^{-1}  \KbCX \KbXC \KbCC^{-1} \right)\mb_{\fbC|\ybC} \nonumber \\
	& - \frac{1}{2 \sigma^{2}} \tr{\KbXX - \KbXC \KbCC^{-1} \KbCX} \nonumber \\
	& - \frac{1}{2 } \tr{ \left(\KbCC^{-1} + \sigma^{-2}\KbCC^{-1} \KbCX \KbXC \KbCC^{-1} \right)\Kb_{\fbC|\ybC} } 
		+ \frac{1}{2} \log \left|\Kb_{\fbC|\ybC} \right| \\
	&= -\frac{N}{2} \log (2\pi) + \frac{M}{2} - \frac{1}{2} \log\left|\sigma^2 \Ib_N \right| - \frac{1}{2} \log \left|\KbCC\right| -\frac{1}{2} \yb^\top \sigma^{-2} \yb  \nonumber \\
	& + \sigma^{-2} \mb_{\fbC|\ybC}^\top \KbCC^{-1}  \KbCX \yb 
	-\frac{1}{2} \mb_{\fbC|\ybC}^\top \KbCC^{-1} \left(\KbCC^{-1} - \left(\KbCC + \Sigma_{\betabC}\right)^{-1} \right)^{-1} \KbCC^{-1}  \mb_{\fbC|\ybC} \nonumber \\
	& - \frac{1}{2 \sigma^{2}} \tr{\KbXX - \KbXC \KbCC^{-1} \KbCX} \nonumber \\
	& - \frac{1}{2 } \tr{ \KbCC^{-1} \left(\KbCC^{-1} - \left(\KbCC + \Sigma_{\betabC}\right)^{-1} \right)^{-1} \KbCC^{-1}\Kb_{\fbC|\ybC} } 
	+ \frac{1}{2} \log \left|\Kb_{\fbC|\ybC} \right|\\
    &= \log \N{\yb | \mb_{\fb|\ybC}, \sigma^2 \Ib_N} - \frac{1}{2 \sigma^2}\tr{\Kb_{\fb|\ybC} } \nonumber\\
    & - \frac {1}{2} \left[ - \tr{\Ab \KbCC}  + \ybC^\top
        \Ab \KbCC \Ab \ybC  - \log \left|\Ab\right| - \log \left|\Sigma_{\betabC} \right|  \right] \;,
	\label{eq:cvtgp_loss_data_marginal_suffstats}
\end{align}

where $\Ab = \left(\KbCC + \Sigma_{\betabC}\right)^{-1}$.

\newpage
\clearpage

