% !TEX root = ../main.tex
We denote by $\q{\cdot}$ a generic variational family of distributions over a $\gp$.
Whenever $\q{\fb} \neq \cp{\fb}{\yb}$, we can lower-bound the log-marginal likelihood,
\begin{align}
    \log\p{\yb } \geq \Loss & = \Ex{\q{\fb}}{\log \cp{\yb}{\fb}} \nonumber - \kl{\q{\fb}}{\p{\fb}} \;,
\end{align}
incurring a gap determined by the Kullback--Leibler (KL) divergence between the variational distribution
and the $\gp$ posterior.
Hence, maximizing the lower-bound $\Loss$ is equivalent to minimizing
the KL divergence between the variational distribution $\q{\fb}$ and the true posterior $\cp{\fb}{\yb}$,
\ie minimizing the gap $\Delta(\fb) = \kl{\q{\fb}}{\cp{\fb}{\yb}}$.

In CVGP, we use the coreset-based posteriors of Equations~\eqref{eq:coreset_gp_posterior_fC_analytical} and~\eqref{eq:coreset_gp_posterior_f_analytical}
to maximize the lower-bound, \ie
\begin{align}
\Loss_{CVGP} &= \Ex{q(\fb|\ybC, \XbC, \betabC)}{\log p(\yb|\fb) } \nonumber\\
& \quad - \kl{q(\fbC|\ybC, \XbC, \betabC)}{p(\fbC)} \;,
\label{eq:loss_coreset_posterior_gp}
\end{align}
which has the following analytical solution:
\begin{align}
&\Loss_{CVGP}=\log \N{\yb | \mb_{\fb|\ybC},\sigma^2 \Ib}-\frac{1}{2 \sigma^2}\tr{\Kb_{\fb|\ybC}} \nonumber \\
&+ \frac {1}{2} \left[
    \tr{\Ab \KbCC} 
    - \ybC^\top \Ab \KbCC \Ab \ybC 
    + \log \left|\Ab \Sigmab_{\betabC}\right|
    \right] \:,
\label{eq:loss_coreset_posterior_gp_analytical}
\end{align}
where $\Ab = \left(\KbCC + \Sigmab_{\betabC}\right)^{-1}$.
Full details of the derivation are provided in Appendix Section~\ref{asec:cvtgp_derivation}.

\paragraph{CVGP's lower-bound $\Loss_{CVGP}$:} \textbf{\textit{optimality at reduced complexity and increased numerical stability}}.
Maximization of the variational lower-bound in Equation~\eqref{eq:loss_coreset_posterior_gp_analytical}
to learn CVGP's coreset-based posterior in Equation~\eqref{eq:coreset_gp_posterior_f_analytical}
gives rise to the following desirable properties:

\begin{enumerate}[leftmargin=2ex]

    \item The maximum of Equation~\eqref{eq:loss_coreset_posterior_gp_analytical} is identical to the loss in Equation~\eqref{eq:dataloglikelihood_lowerbound_titsias_analytical} derived by~\citet{titsias2009variational},
    \ie SparseGP and CVGP \emph{have the same optimum}%
    ---see proofs in Appendix Section~\ref{assec:cvgp_lower_bound_optimum}.
    
    \item The lower-bound in Equation~\eqref{eq:loss_coreset_posterior_gp_analytical}
    is amenable to data-subsampling.
    Due to the uncorrelated Gaussian likelihood term and properties of the trace,
    we can apply \emph{stochastic optimization} for its maximization, computing unbiased loss estimates from a reduced number of data samples (as few as a single one).
    
    \item The algorithmic complexity of CVGP, for coreset size $M$, 
    is $\bigO{M^3}$ in computational time and $\bigO{M^2}$ in space.
    Importantly, CVGP's parameter complexity is of \emph{reduced $\bigO{M}$ order}, as it only requires learning coreset triplets $(\XbC, \ybC, \betabC)$, each of size $M$%
    ---see Table~\ref{table:bigo} for a full comparison.
    
    \item CVGP's posterior and lower-bound inherently provide a numerically stable stochastic algorithm, as all matrix inverse operations in Equation~\eqref{eq:loss_coreset_posterior_gp_analytical}
    involve $\Ab = \left(\KbCC + \Sigmab_{\betabC}\right)^{-1}$,
    \ie the inverse of the sum of a diagonal matrix ($\Sigmab_{\betabC}$)
    defined by positive coreset weights $\betabC \geq 0 $
    and a positive definite matrix $\KbCC$.\footnote{%
    In theory, $\KbCC$ is positive definite and invertible.
    However, numerical issues can cause instability when it is inverted in practice.%
    }
\end{enumerate}

% Complexity table
\begin{table}[!ht]
    \rowcolors{5}{}{gray!10}
    \begin{tabular}{*4c}
        \toprule
        & \multicolumn{3}{c}{Complexities} \\
        \cmidrule(lr){2-4}
        Algorithm & Time  & Space & Parameter\\    
        \midrule
        SparseGP &  $\;$$\;$$\;$$ {\bigO{NM^2}}$  &  $\;$$\;$$\;$${\bigO{NM^2}}$    & $\boldsymbol{{\bigO{M}}}$\\
        SVGP  &     $\boldsymbol{{\bigO{M^3}}}$   & $\boldsymbol{{\bigO{M^2}}}$ &      $\;$$\;$${\bigO{M^2}}$ \\
        \textbf{CVGP} &  $\boldsymbol{{\bigO{M^3}}}$      & $\boldsymbol{{\bigO{M^2}}}$ &  $\boldsymbol{{\bigO{M}}}$ \\
        \bottomrule
    \end{tabular}
    \centering
   \caption{{Computational analysis of CVGP and sparse variational $\gp$ alternatives:
    time and space complexities for obtaining an unbiased estimate of
    their objectives.
    Desirable complexities are highlighted in \textbf{bold}.
    CVGP enjoys the same time and space complexities as SVGP,
    at a reduced variational parameter dimensionality.
    \label{table:bigo}
    }}
\end{table}