% !TEX root = ../main.tex

Before expanding CVGP's lower-bound in Equation~\eqref{eq:cvtgp_loss_data_marginal_suffstats},
we define some auxiliary quantities
\begin{align}
    \Ab &= \left( \KbCC + \SigmabetaC \right)^{-1} = \KbCC^{-1} - \SigmabCC^{-1}\\
    \text{ where } & \SigmabCC^{-1} = \KbCC^{-1} -  \left( \KbCC + \SigmabetaC \right)^{-1}
\end{align}
to write it explicitly in terms of its parameters:
\begin{align}
	\Loss_{CVGP} 
	&= -\frac{N}{2} \log (2\pi) + \frac{M}{2} - \frac{1}{2} \log\left|\sigma^2 \Ib_N \right| - \frac{1}{2} \log \left| \KbCC \right|-\frac{1}{2} \yb^\top \sigma^{-2} \yb  \nonumber \\
	& + \sigma^{-2} \mbtilde{\fbC}^\top \KbCC^{-1}  \KbCX \yb 
	-\frac{1}{2} \mbtilde{\fbC}^\top \KbCC^{-1} \SigmabCC \KbCC^{-1}  \mbtilde{\fbC} \nonumber \\
	& - \frac{1}{2 \sigma^{2}} \tr{\KbXX - \KbXC \KbCC^{-1} \KbCX} \nonumber \\
	& - \frac{1}{2 } \tr{ \KbCC^{-1} \SigmabCC \KbCC^{-1} \Kbtilde{\fbC}{\fbC} } 
	+ \frac{1}{2} \log \left| \Kbtilde{\fbC}{\fbC} \right| \\
	% Replace parameterization
	&= -\frac{N}{2} \log (2\pi) + \frac{M}{2} - \frac{1}{2} \log\left|\sigma^2 \Ib_N \right| - \frac{1}{2} \log \left| \KbCC \right|-\frac{1}{2} \yb^\top \sigma^{-2} \yb  \nonumber \\
	& + \sigma^{-2} \ybC^\top \left(\KbCC  + \SigmabetaC \right)^{-1} \KbCC 
	\KbCC^{-1}  \KbCX \yb \\
	& -\frac{1}{2} \ybC^\top \left(\KbCC  + \SigmabetaC \right)^{-1} \KbCC 
	\KbCC^{-1} \SigmabCC \KbCC^{-1} 
	 \KbCC \left( \KbCC + \SigmabetaC \right)^{-1} \ybC  \nonumber \\
	& - \frac{1}{2 \sigma^{2}} \tr{\KbXX - \KbXC \KbCC^{-1} \KbCX} \nonumber \\
	& - \frac{1}{2 } \tr{ \KbCC^{-1} \SigmabCC \KbCC^{-1} \left( \KbCC - \KbCC \left( \KbCC + \SigmabetaC \right)^{-1} \KbCC \right) } \\
	& + \frac{1}{2} \log \left| \KbCC - \KbCC \left( \KbCC + \SigmabetaC \right)^{-1} \KbCC \right| \\
	% Clean-up
	% Replace parameterization
	&= -\frac{N}{2} \log (2\pi) + \frac{M}{2} - \frac{1}{2} \log\left|\sigma^2 \Ib_N \right| - \frac{1}{2} \log \left| \KbCC \right|-\frac{1}{2} \yb^\top \sigma^{-2} \yb  \nonumber \\
	& + \sigma^{-2} \ybC^\top \left(\KbCC  + \SigmabetaC \right)^{-1} \KbCX \yb \\
	& -\frac{1}{2} \ybC^\top \left(\KbCC  + \SigmabetaC \right)^{-1} 
	\SigmabCC \left( \KbCC + \SigmabetaC \right)^{-1} \ybC  \nonumber \\
	& - \frac{1}{2 \sigma^{2}} \tr{\KbXX - \KbXC \KbCC^{-1} \KbCX} \nonumber \\
	& - \frac{1}{2 } \tr{ \KbCC^{-1} \SigmabCC } \\
	& + \frac{1}{2 } \tr{ \KbCC^{-1} \SigmabCC \left( \KbCC + \SigmabetaC \right)^{-1} \KbCC } \\
	& + \frac{1}{2} \log \left| \KbCC - \KbCC \left( \KbCC + \SigmabetaC \right)^{-1} \KbCC \right| \;.
\label{eq:cvtgp_loss_data_marginal_params}
\end{align}

We now compute the derivatives with respect to its free parameters
\begin{align}
	\frac{\partial \Loss_{CVGP}}{\partial \ybC} &= 
		\sigma^{-2} \left(\KbCC  + \SigmabetaC \right)^{-1} \KbCX \yb 
		-\left(\KbCC  + \SigmabetaC \right)^{-1} \SigmabCC \left( \KbCC + \SigmabetaC \right)^{-1} \ybC  \; , \\
	\frac{\partial \Loss_{CVGP}}{\partial \Ab } &= 
	\sigma^{-2} \ybC^\top \KbCX \yb - \ybC^\top \SigmabCC \Ab \ybC  \nonumber \\
	& \qquad + \frac{1}{2 } \tr{ \KbCC^{-1} \SigmabCC \KbCC } \nonumber \\
	& \qquad - \frac{1}{2} \tr{ \left(\KbCC - \KbCC \left( \KbCC + \SigmabetaC \right)^{-1} \KbCC \right)^{-1} \KbCC \KbCC} \; .
\end{align}

Setting the derivative with respect to $\ybC$ to zero, we can readily solve for
\begin{align}
	\ybC^* &= \sigma^{-2} \left( \KbCC + \SigmabetaC \right) \SigmabCC^{-1} \KbCX \yb  \\
		&= \sigma^{-2} \Ab^{-1} \SigmabCC^{-1} \KbCX \yb
\end{align}
and replace it in the covariance expression
\begin{align}
	0 & = \sigma^{-2} \sigma^{-2} \yb^\top \KbXC \SigmabCC^{-1} \Ab^{-1} \KbCX \yb \nonumber \\
	& \qquad -  \sigma^{-2} \yb^\top \KbXC \SigmabCC^{-1} \Ab^{-1} \SigmabCC \Ab \sigma^{-2} \Ab^{-1} \SigmabCC^{-1} \KbCX \yb  \nonumber \\
	& \qquad + \frac{1}{2 } \tr{ \KbCC^{-1} \SigmabCC \KbCC } \nonumber \\
	& \qquad - \frac{1}{2} \tr{ \left(\KbCC - \KbCC \left( \KbCC + \SigmabetaC \right)^{-1} \KbCC \right)^{-1} \KbCC \KbCC} \\
	% Elaborate
	0 & = \sigma^{-2} \sigma^{-2} \yb^\top \KbXC \SigmabCC^{-1} \Ab^{-1} \KbCX \yb \nonumber \\
	& \qquad -  \sigma^{-2} \sigma^{-2} \yb^\top \KbXC \SigmabCC^{-1} \Ab^{-1} \KbCX \yb  \nonumber \\
	& \qquad + \frac{1}{2 } \tr{ \KbCC^{-1} \SigmabCC \KbCC } \nonumber \\
	& \qquad - \frac{1}{2} \tr{ \left(\KbCC - \KbCC \left( \KbCC + \SigmabetaC \right)^{-1} \KbCC \right)^{-1} \KbCC \KbCC} \\
	% clean
	0 & = \frac{1}{2 } \tr{ \KbCC^{-1} \SigmabCC \KbCC } \nonumber \\
	& \qquad - \frac{1}{2} \tr{ \left(\KbCC - \KbCC \left( \KbCC + \SigmabetaC \right)^{-1} \KbCC \right)^{-1} \KbCC \KbCC}
\end{align}
Equating the matrices inside the traces, we have
\begin{align}
	% equate
	\KbCC^{-1} \SigmabCC \KbCC  & = \left(\KbCC - \KbCC \left( \KbCC + \SigmabetaC \right)^{-1} \KbCC \right)^{-1} \KbCC \KbCC \\
	% Simplify
	\KbCC^{-1} \SigmabCC & = \left(\KbCC - \KbCC \left( \KbCC + \SigmabetaC \right)^{-1} \KbCC \right)^{-1} \KbCC \\
	\KbCC^{-1} \SigmabCC \KbCC^{-1} & = \left(\KbCC - \KbCC \left( \KbCC + \SigmabetaC \right)^{-1} \KbCC \right)^{-1} \\
	% Inverse
	\KbCC \SigmabCC^{-1} \KbCC & = \left(\KbCC - \KbCC \left( \KbCC + \SigmabetaC \right)^{-1} \KbCC \right) \\
	\KbCC \SigmabCC^{-1} \KbCC & = \KbCC \left( \KbCC^{-1}- \left( \KbCC + \SigmabetaC \right)^{-1} \right) \KbCC\\
	\SigmabCC^{-1} & = \left( \KbCC^{-1}- \left( \KbCC + \SigmabetaC \right)^{-1} \right) \\
	\left( \KbCC + \SigmabetaC \right)^{-1} & = \left( \KbCC^{-1} - \SigmabCC^{-1} \right) \\
        % Elaborate on left hand side
        \KbCC^{-1} - \KbCC^{-1} \left(\SigmabetaC^{-1} + \KbCC^{-1}\right)^{-1}\KbCC^{-1} & = \left( \KbCC^{-1} - \SigmabCC^{-1} \right) \\
	% Simplify
        \KbCC^{-1} \left(\SigmabetaC^{-1} + \KbCC^{-1}\right)^{-1}\KbCC^{-1} & = \SigmabCC^{-1} \\
        % Inverses
        \KbCC \left(\SigmabetaC^{-1} + \KbCC^{-1}\right)\KbCC & = \SigmabCC \\
        % Break parentheses and expand right hand side
        \KbCC \SigmabetaC^{-1} \KbCC + \KbCC \KbCC^{-1} \KbCC & = \SigmabCC = \KbCC + \frac{1}{\sigma^2} \KbCX \KbXC \\
        % Collect
        \KbCC \SigmabetaC^{-1} \KbCC & = \frac{1}{\sigma^2} \KbCX \KbXC \\
        % move \KbCCs to other side
        \SigmabetaC^{-1} & = \frac{1}{\sigma^2} \KbCC^{-1} \KbCX \KbXC \KbCC^{-1}\\
        % Undo inverse
        \SigmabetaC^* & = \sigma^2 \KbCC \left(\KbCX \KbXC\right)^{-1} \KbCC
\end{align}

We now elaborate on the optimal values for CVGP's pseudo-coresets,
rewriting CVGP's optimal pseudo-observations as
\begin{align}
	\ybC^* &= \sigma^{-2} \left( \KbCC + \SigmabetaC \right)\SigmabCC^{-1} \KbCX \yb \\
	&= \sigma^{-2} \left( \KbCC + \SigmabetaC \right) \left[ \KbCC^{-1} -  \left( \KbCC + \SigmabetaC \right)^{-1} \right] \KbCX \yb \\
	&= \sigma^{-2} \left[ \left( \KbCC + \SigmabetaC \right) \KbCC^{-1} -  \Ib_M \right] \KbCX \yb \\
	&= \sigma^{-2} \left[ \KbCC \left[ \KbCC^{-1} + \sigma^{2} \left(\KbCX \KbXC\right)^{-1} \right] \KbCC \KbCC^{-1} -  \Ib_M \right] \KbCX \yb \\
	&= \sigma^{-2} \left[ \Ib_M  + \sigma^{2} \KbCC \left(\KbCX \KbXC\right)^{-1}  -  \Ib_M \right] \KbCX \yb \\
	&= \sigma^{-2} (\sigma^{2} \KbCC \left(\KbCX \KbXC\right)^{-1} \KbCX) \yb \\
        &= \sigma^{-2} \SigmabetaC^* \KbCC^{-1} \KbCX \yb \\
        &= \KbCC \left(\KbCX \KbXC\right)^{-1} \KbCX \yb
\end{align}

\newpage

With these optimal values, we can now rewrite the lower-bound at its maximum
\begin{align}
	\Loss_{CVGP}(\ybC^*, \SigmabetaC^*)
	&= -\frac{N}{2} \log (2\pi) + \frac{M}{2} - \frac{1}{2} \log\left|\sigma^2 \Ib_N \right| - \frac{1}{2} \log \left| \KbCC \right|-\frac{1}{2} \yb^\top \sigma^{-2} \yb  \nonumber \\
	& + \sigma^{-2} \ybC^{*^{\top}} \left(\KbCC  + \SigmabetaC^* \right)^{-1} \KbCX \yb \\
	& -\frac{1}{2} \ybC^{*^{\top}} \left(\KbCC  + \SigmabetaC^* \right)^{-1} 
			\SigmabCC \left( \KbCC + \SigmabetaC^* \right)^{-1} \ybC^*  \nonumber \\
	& - \frac{1}{2 \sigma^{2}} \tr{\KbXX - \KbXC \KbCC^{-1} \KbCX} \nonumber \\
	& - \frac{1}{2 } \tr{ \KbCC^{-1} \SigmabCC } + \frac{1}{2 } \tr{ \KbCC^{-1} \SigmabCC \left( \KbCC + \SigmabetaC^* \right)^{-1} \KbCC } \\
	& + \frac{1}{2} \log \left| \KbCC - \KbCC \left( \KbCC + \SigmabetaC^* \right)^{-1} \KbCC \right| \\
	% Replace parameterization
	&= -\frac{N}{2} \log (2\pi) + \frac{M}{2} - \frac{1}{2} \log\left|\sigma^2 \Ib_N \right| - \frac{1}{2} \log \left| \KbCC \right|-\frac{1}{2} \yb^\top \sigma^{-2} \yb  \nonumber \\
	& + \sigma^{-2} \sigma^{-2} \yb^\top \KbXC \SigmabCC^{-1} \left( \KbCC + \SigmabetaC^* \right)  \left(\KbCC  + \SigmabetaC^* \right)^{-1} \KbCX \yb \\
	& -\frac{1}{2} \sigma^{-2} \yb^\top \KbXC \SigmabCC^{-1} \left( \KbCC + \SigmabetaC^* \right) \left(\KbCC  + \SigmabetaC^* \right)^{-1} 
	\SigmabCC \left( \KbCC \right.\\
    & \left.\quad \quad \quad \quad  \quad \quad + \SigmabetaC^* \right)^{-1} \sigma^{-2} \left( \KbCC + \SigmabetaC^* \right)\SigmabCC^{-1} \KbCX \yb \nonumber \\
	& - \frac{1}{2 \sigma^{2}} \tr{\KbXX - \KbXC \KbCC^{-1} \KbCX} \nonumber \\
	& - \frac{1}{2 } \tr{ \KbCC^{-1} \SigmabCC } + \frac{1}{2 } \tr{ \KbCC^{-1} \SigmabCC \left(\KbCC^{-1} - \SigmabCC^{-1} \right) \KbCC } \\
	& + \frac{1}{2} \log \left| \KbCC - \KbCC \left( \KbCC^{-1} - \SigmabCC^{-1} \right) \KbCC \right| \\
	% Simplify
	&= -\frac{N}{2} \log (2\pi) + \frac{M}{2} - \frac{1}{2} \log\left|\sigma^2 \Ib_N \right| - \frac{1}{2} \log \left| \KbCC \right|-\frac{1}{2} \yb^\top \sigma^{-2} \yb  \nonumber \\
	& + \sigma^{-2} \sigma^{-2} \yb^\top \KbXC \SigmabCC^{-1} \KbCX \yb \\
	& -\frac{1}{2} \sigma^{-2} \yb^\top \KbXC \sigma^{-2} \SigmabCC^{-1} \KbCX \yb \nonumber \\
	& - \frac{1}{2 \sigma^{2}} \tr{\KbXX - \KbXC \KbCC^{-1} \KbCX} \nonumber \\
	& - \frac{1}{2 } \tr{ \KbCC^{-1} \SigmabCC } + \frac{1}{2 } \tr{ \KbCC^{-1} \SigmabCC} - \frac{1}{2 } \tr{ \KbCC^{-1} \KbCC } \\
	& + \frac{1}{2} \log \left| \KbCC \left( \KbCC^{-1} - \left( \KbCC^{-1} - \SigmabCC^{-1} \right)\right) \KbCC \right| \\
	% Compress
	&= -\frac{N}{2} \log (2\pi) + \frac{M}{2} - \frac{1}{2} \log\left|\sigma^2 \Ib_N \right| - \frac{1}{2} \log \left| \KbCC \right|-\frac{1}{2} \yb^\top \sigma^{-2} \yb  \nonumber \\
	& + \frac{1}{2} \sigma^{-2} \sigma^{-2} \yb^\top \KbXC \SigmabCC^{-1} \KbCX \yb \nonumber \\
	& - \frac{1}{2 \sigma^{2}} \tr{\KbXX - \KbXC \KbCC^{-1} \KbCX} - \frac{1}{2 } M \nonumber \\
	& + \frac{1}{2} \log \left| \KbCC \SigmabCC^{-1} \KbCC \right| \\
	% Reorganize
	&= -\frac{N}{2} \log (2\pi) \nonumber \\
	& -\frac{1}{2} \yb^\top \sigma^{-2} \left( \Ib_N - \sigma^{-2} \KbXC \SigmabCC^{-1} \KbCX \right) \yb \nonumber \\
	& - \frac{1}{2 \sigma^{2}} \tr{\KbXX - \KbXC \KbCC^{-1} \KbCX} \nonumber \\
	& - \frac{1}{2} \log\left|\sigma^2 \Ib_N \right| + \frac{1}{2} \log \left| \SigmabCC^{-1} \KbCC \right| \\
	&= \log \N{ \yb \mid \zerob, \sigma^2 \Ib_N + \Qb_{\fbC,\fbC} } - \frac{1}{2 \sigma^2}\tr{ \KbXX - \Qb_{\fbC,\fbC} } \label{eq:cvtgp_loss_data_marginal_optima}
\end{align}
which, for $M=Z$ and $\XbC = \Xb_Z$\footnote{Matching notations.}, corresponds to the lower-bound derived by~\citet{titsias2009variational} for SparseGP. In general, $\mathcal{L}_{CVGP} \leq \mathcal{L}_{SparseGP}$, and the bound is tight (i.e., holds with equality) when $\KbCC \left(\KbCX \KbXC\right)^{-1} \KbCC$ is diagonal.
%, which is possible if the columns of $\KbCC$ are $(\KbCX \KbCX)^{-1}$–orthogonal