% !TEX root = ../main.tex
We introduce the notation and foundations of $\gp$ regression in Section~\ref{ssec:background_gp},
and describe sparse approximations for scalable $\gp$ inference in Section~\ref{ssec:background_sparsegp}.
We review the variational inducing point-based foundational 
work of~\citet{titsias2009variational} and~\citet{hensman2013gaussian}, in Sections~\ref{ssec:background_titsias} and~\ref{ssec:background_svgp}, respectively.
These are state-of-the-art $\gp$ algorithms that will serve as competitive baselines for the experiments in Section~\ref{sec:experiments}.
\subsection{$\gp$ Regression}
\label{ssec:background_gp}
A (univariate) $\gp$ is a non-parametric prior over functions from input domain $\xb \in \xspace$ into scalar space $y \in \yspace$,
denoted as $f(\xb ;\Theta) \sim \GP{m(\xb ; \theta_m), k(\xb, \xb' ; \theta_k)}$.
A $\gp$ is specified by its
mean, $m(\xb ; \theta_m): \xspace \rightarrow \RR$,
and covariance (kernel), $k(\xb, \xb' ; \theta_k) : \xspace \times \xspace \rightarrow \RR$, functions
with parameters $\theta_m$ and $\theta_k$ that are jointly referred to as $\gp$ hyperparameters $\Theta = \{\theta_m, \theta_k \}$.

The function $f$ is a mapping from $\xspace$ to the real numbers and we may equivalently write $f \in \RR^{\xspace}$,
viewing functions as (infinite-dimensional) vectors with elements indexed by members of $\xspace$.
Using vector notation, we define  $\fb=f(\Xb)$ as the vector containing the $\gp$ prior values at a collection of points $\Xb =\{ \xb_i \}_{i=1}^N$.
The $\gp$ prior evaluated at any finite set of points $\Xb$ follows a multivariate Gaussian distribution
$\p{\fb ; \Theta } = \N{\fb \mid \zerob, \KbXX}$,
with $\KbXX = (k(\xb_i, \xb_j))_{1\leq i,j \leq N}$.
We assume zero mean $\gp$ priors without loss of generality,
and suppress explicit dependence on input points $\Xb$ to avoid notation clutter.

In $\gp$ regression with observations subject to Gaussian noise,
\ie $y = f(\xb;\Theta) + \epsilon$, $\epsilon \sim \N{\epsilon | 0, \sigma^2}$,
the data marginal likelihood is given by
\begin{align}
    \p{\yb} =  \int_{\fb} \p{\yb|\fb; \sigma} \p{\fb ; \Theta } \diff{\fb} =
    \N{\yb | \zerob, \sigma^2 \Ib+\KbXX} . 
    \label{eq:data_marginal_analytical} 
\end{align}
Given observed data $(\Xb, \yb)$, the posterior over the $\gp$ function at any input $\xbstar$, $f^\star = f(\xbstar)$, is a Gaussian distribution computable in closed form, \ie
\begin{align}
& \cp{f^\star}{\xb^\star, \yb} = \N{f^\star \mid m_{f^\star|\xb^\star}, k_{f^\star|\xb^\star} } , \; \text{ with} \label{eq:gp_posterior} \\
& m_{f^\star|\xbstar} = \KbSX \left( \sigma^2 \Ib+\KbXX\right)^{-1} \yb \;, \nonumber \\
& k_{f^\star|\xb^\star} = k(\xbstar, \xbstar) - \KbSX \left( \sigma^2 \Ib+\KbXX\right)^{-1} \KbXS \;, \nonumber
\end{align}
where $\KbSX$ is the $N$-dimensional row vector of kernel function values between a new input $\xbstar$ and observed data $\Xb$.

\subsection{Sparse Gaussian Process Regression}\label{ssec:background_sparsegp}
Even though posterior statistics in Equation~\eqref{eq:gp_posterior} are analytically tractable,
they raise computational challenges for big data, 
as they require computation of the inverse of $N \times N$ matrices
with, in general, $\bigO{N^3}$ time and $\bigO{N^2}$ space complexity.
An overview of sparse approximations to reduce such computational burden for $\gp$ regression can be found in~\citep[Chapter 8]{rasmussen2006gaussian},
with a unifying view presented in~\citep{quinonero2005unifying},
summarized below.
The innovation in sparse $\gp$s is to design \textit{approximate} posteriors
over $\gp$ function values $\fbZ=f(\XbZ)$ at a subset of $M$ inducing inputs $\XbZ$.
%\footnote{
%    The classical sparse $\gp$ literature~\citep{quinonero2005unifying, titsias2009variational} refers to inducing points with $\Zb$,
%    and use $\ub=f(\Zb)$ for their corresponding $\gp$ values:
%   we use a different notation, for the sake of a clear and unified %exposition of methods, easing the comparison across them.
%}.

\citet{quinonero2005unifying} presented the Fully Independent Training Conditional (FITC) technique,
as a unifying framework for many of the sparse $\gp$ formulations that had previously been presented, \eg~\citep{csato2002sparse, smola2000,snelson2005sparse}.
FITC, which was later connected to methods that approximate the $\gp$ posterior via Expectation Propagation~\citep{snelson2008flexible,yuan2012,bui2017},
uses ---unlike previous methods~\citep{csato2002sparse, seeger2003fast}--- 
the marginal likelihood to jointly learn the hyperparameters and the inducing points~\citep{snelson2005sparse}.
This relaxes the constraint of having the inducing points limited to a subset of the dataset,
and turns a discrete inducing point selection problem into a continuous optimization one.
%
Careful inspection of these sparse methodologies and, in particular, FITC~\citep{quinonero2005unifying, bauer2016}
pointed out several limitations related to their tendency to overestimate the marginal likelihood,
which motivated \citet{titsias2009variational} to propose a variational formulation for sparse $\gp$ regression.

\subsection{Variational Sparse $\gp$}
\label{ssec:background_titsias}
\citet{titsias2009variational} revisited sparse $\gp$ inference and posed it as a variational optimization problem of jointly learning $M$ inducing inputs $\XbZ$
(and $\gp$ hyperparameters $\Theta$),
by maximizing a lower-bound of the log-marginal likelihood:
\begin{align}
   \log \p{\yb} \geq \Loss_{SparseGP} = \Ex{\q{\fb,\fbZ}}{\log\frac{\p{\yb,\fb,\fbZ}}{\q{\fb,\fbZ}}} ,
    \label{eq:dataloglikelihood_lowerbound_titsias}
\end{align}
which is equivalent to minimizing the Kullback--Leibler (KL) divergence
between the variational family $q \in \mathcal{Q}$ and the $\gp$ posterior, \ie $\kl{\q{\fb,\fbZ}}{\p{\fb, \fbZ | \yb}}$.

\citet{titsias2009variational} showed that, for a factorization of the variational family of the $\q{\fb,\fbZ}=\cp{\fb}{\fbZ}\q{\fbZ}$ form,
one can marginalize over the $\gp$ inducing variables $\fbZ=f(\XbZ)$,
to derive the following analytical lower-bound
\begin{align}
    &\Loss_{SparseGP} = \log \mathcal{N}
    \left(
        \yb \mid
        \zerob, \sigma^2 \Ib+ \KbXZ \KbZZ^{-1} \KbZX
    \right) \nonumber\\
    &\qquad \qquad -\frac{1}{2 \sigma^2}\tr{\KbXX -   \KbXZ \KbZZ^{-1}\KbZX} ,
\label{eq:dataloglikelihood_lowerbound_titsias_analytical}
\end{align}
which can be computed in $\bigO{NM^2}$ time and $\bigO{NM}$ space complexity.
Equation~\eqref{eq:dataloglikelihood_lowerbound_titsias_analytical} is the result of integrating out the \emph{optimal} Gaussian variational posterior $q^*(\fbZ)$, available in closed form for a set of inducing points $\XbZ$, 
and expressed in terms of the prior modeling choices of
kernel and likelihood noise, only used implicitly in inference.

\subsection{Stochastic Variational $\gp$}
\label{ssec:background_svgp}

\citet{hensman2013gaussian} revisited \citet{titsias2009variational}'s evidence lower-bound (ELBO), and showed that it can be made amenable to stochastic variational inference for $\gp$s (SVGP),
by re-organizing it and avoiding direct marginalization over the inducing variables $\fbZ$, \ie
\begin{align}
       \log p(\yb) \geq \Loss_{SVGP} &= \Ex{q(\fbZ)}{\Ex{q(\fb|\fbZ)}{\log p(\yb|\fb)}} \nonumber \\
       & \qquad - \kl{q(\fbZ)}{p(\fbZ)} \;.
\end{align}

SVGP proceeds by defining a free-form variational family $q(\fbZ) = \N{\fbZ \mid \mb, \Sb}$
and analytically computing the revised ELBO:
\begin{align}
    \Loss_{SVGP} &= 
    \log \N{\yb \mid \KbXZ\KbZZ^{-1}\mb, \sigma^2 \Ib} \label{eq:dataloglikelihood_lowerbound_SVGP} \\
    &-\frac{1}{2 \sigma^2} \tr{\KbXX - \KbXZ\KbZZ^{-1}\KbZX } \nonumber \\
    &  -\frac{1}{2 \sigma^2} \tr{\KbXZ \KbZZ^{-1}\Sb \KbZZ^{-1} \KbZX} \nonumber\\
    &- \kl{\q{\fbZ}}{\p{\fbZ}}\nonumber \;. 
\end{align}

Equation~\eqref{eq:dataloglikelihood_lowerbound_SVGP} allows for data subsampling,
hence enabling stochastic optimization
to learn the free variational parameters $\{\mb, \Sb\}$ in $q(\fbZ) = \N{\fbZ \mid \mb, \Sb}$, of order $\bigO{M^2}$,
where an unbiased estimate of the SVGP loss can be computed with $\bigO{M^3}$ time- and $\bigO{M^2}$ space-complexity.

The optimum of Equation~\eqref{eq:dataloglikelihood_lowerbound_SVGP} matches that of Equation~\eqref{eq:dataloglikelihood_lowerbound_titsias_analytical}, 
yet the latter directly leverages the optimal variational distribution $q^*(\fbZ)$,
while the former resorts to stochastic optimization of its $\bigO{M^2}$ free-form
variational parameters to find it.
Namely, SparseGP operates by maximizing a tight
---based on the optimal $q^*(\fbZ)$--- lower-bound,
with the disadvantage of not being able to use stochastic optimization. 

Our goal here is to leverage the best of each world
and to design a variational posterior that incorporates the dependencies set by the prior $\gp$ model (\ie the kernel and the likelihood noise)
for approximate $\gp$ inference that is amenable to stochastic optimization.







