\par To mitigate the restriction to symmetric kernels imposed on existing GP-based transformers, we will instead explore an alternative perspective of correlated Gaussian process (CGP) modeling~\citep{aueb2013variational}. This will allow us to model the kernel attention unit in terms of the cross-covariance between two correlated Gaussian processes, which naturally permits kernel asymmetries while preserving the GP's built-in capability to calibrate uncertainty.

This consequently inspires a principled approach to calibrate a transformer model without compromising its attention mechanism. To elaborate, Section~\ref{sec:canonical rep} and Section~\ref{sec: canonical GP} will provide background on the canonical representation of GP  and CGP modeling. Section~\ref{sec: CGP attention} will then derive a new CGP-based attention structure that can accommodate both attention asymmetries and uncertainty calibration. A sparse approximation of this CGP-based structure is further derived in Section~\ref{sec: SCGPT} for better scalability.



%We will elaborate more on this in what follows. First, we will provide background on the canonical representation of GP  and CGP modeling in (Section~\ref{sec:canonical rep} and Section~\ref{sec: canonical GP}). Then, we derive a new asymmetric kernel attention mechanism based on this representation in Section~\ref{sec: CGP attention}.


\subsection{Canonical Representation of GP} \label{sec:canonical rep}
Our correlated GP (CGP) modeling is inspired by a representation of GP that parameterizes its kernel in terms of an affine input scaling applied to another parameter-free, canonical GP~\citep{aueb2013variational}. We review this representation below and show how it can be extended towards correlated GP modeling.

\begin{definition}[Canonical Gaussian process (GP)] \label{def:canonical gp}
A canonical GP $z_o(\mathbf{x}) \sim \mathcal{GP}(m_o(\mathbf{x}), \kappa_o(\mathbf{x}, \mathbf{x}'))$ is a Gaussian process  specified with a zero mean function $m_o(\mathbf{x}) = 0$ and a parameter-free kernel function $\kappa_o(\mathbf{x}, \mathbf{x}')$.
\end{definition}
For instance, a canonical GP as defined in Definition~\ref{def:canonical gp} can adopt a squared exponential kernel whose length-scales and global scale are all equal to $1$, i.e.,
$\kappa_o(\mathbf{x}, \mathbf{x}') = \exp(-0.5 \|\mathbf{x} - \mathbf{x}'\|^2)$.
Another GP $z_\lambda(\mathbf{x})$ can then be represented in terms of an affine scaling of $z_o(\mathbf{x})$,
\begin{eqnarray}
z_\lambda(\mathbf{x}) &\triangleq& \sigma_\lambda \cdot z_o\Big(\mathbf{x}\mathbf{W}_\lambda^\top\Big), \label{eq:canonocal}
\end{eqnarray}
with mean $m(\mathbf{x}) = 0$ and covariance function 
\begin{eqnarray}
\kappa_\lambda(\mathbf{x}, \mathbf{x}') 
&=& \mathbb{E}\left[\sigma_\lambda^2z_o\Big(\mathbf{x}\mathbf{W}_\lambda^\top\Big)\cdot z_o\Big(\mathbf{x}'\mathbf{W}_\lambda^\top\Big)\right] \nonumber\\
&=&\sigma_\lambda^2 \kappa_o\Big(\mathbf{x}\mathbf{W}_\lambda^\top, \mathbf{x}'\mathbf{W}_\lambda^\top\Big), \label{eq:cov}
%&=& \mathbb{E}\left[\Big(\sigma_z \cdot z_o(\mathbf{x}) - \sigma_z \cdot\mathbb{E}\left[ z_o(\mathbf{x})\right]\Big) \cdot \Big(\sigma_z \cdot z_o(\mathbf{x}') - \sigma_z \cdot\mathbb{E}\left[ z_o(\mathbf{x}')\right]\Big)\right] \nonumber\\
%&=&\mathbb{E}\left[\sigma_z^2 \cdot z_o(\mathbf{x}) \cdot z_o(\mathbf{x}')\right] \ \ =\ \  \mathbb{E}\left[\sigma_z^2 \cdot z_o(\mathbf{x}) \cdot z_o(\mathbf{x}')\right] \nonumber\\
%&=& \sigma_z^2 \mathbb{E}[z_o(\mathbf{x}), z_o(\mathbf{x}')] \ \ =\ \ \sigma_z^2 \cdot \kappa_o(\mathbf{x}, \mathbf{x}')
\end{eqnarray}
where the first equality follows from the definition of covariance and of $z_\lambda(\mathbf{x})$ in Eq.~\eqref{eq:canonocal}, as well as the fact that $z_o(\mathbf{x})$ has zero mean. The second equality holds because of the canonical kernel definition. This kernel function is thus parameterized by the tuple $(\mathbf{W}_\lambda, \sigma_\lambda)$.


\subsection{CGP Modeling}  \label{sec: canonical GP}
\iffalse
Our CGP modeling was inspired from a representation of Gaussian process that parameterizes its kernel in terms of an affine input scaling applied to another parameter-free, canonical GP \citep{aueb2013variational}. We concisely review this canonical representation below and show how this can be extended towards correlated GP modeling.

{\bf Canonical Representation.} To elaborate, the canonical GP $z_o(\mathbf{x})$ is specified with a zero mean function $m_o(\mathbf{x}) = 0$ and a parameter-free kernel function $\kappa_o(\mathbf{x}, \mathbf{x}')$, e.g. $\kappa_o(\mathbf{x}, \mathbf{x}') = \mathrm{exp}(-0.5 \|\mathbf{x} - \mathbf{x}'\|^2)$. We can then parameterize and represent another GP $z_\lambda(\mathbf{x})$ as the result of an affine scaling of $z_o(\mathbf{x})$,
\begin{eqnarray}
z_\lambda(\mathbf{x}) &\triangleq& \sigma_\lambda \cdot z_o\Big(\mathbf{x}\mathbf{W}^\top\Big) \label{eq:canonocal}
\end{eqnarray}
with mean $m(\mathbf{x}) = 0$ and covariance function 
\begin{eqnarray}
\kappa_\lambda(\mathbf{x}, \mathbf{x}') 
&=& \mathbb{E}\left[\sigma_\lambda^2z_o\Big(\mathbf{x}\mathbf{W}_\lambda^\top\Big)\cdot z_o\Big(\mathbf{x}'\mathbf{W}_\lambda^\top\Big)\right] \nonumber\\
&=&\sigma_\lambda^2 \kappa_o\Big(\mathbf{x}\mathbf{W}_\lambda^\top, \mathbf{x}'\mathbf{W}_\lambda^\top\Big) \label{eq:cov}
%&=& \mathbb{E}\left[\Big(\sigma_z \cdot z_o(\mathbf{x}) - \sigma_z \cdot\mathbb{E}\left[ z_o(\mathbf{x})\right]\Big) \cdot \Big(\sigma_z \cdot z_o(\mathbf{x}') - \sigma_z \cdot\mathbb{E}\left[ z_o(\mathbf{x}')\right]\Big)\right] \nonumber\\
%&=&\mathbb{E}\left[\sigma_z^2 \cdot z_o(\mathbf{x}) \cdot z_o(\mathbf{x}')\right] \ \ =\ \  \mathbb{E}\left[\sigma_z^2 \cdot z_o(\mathbf{x}) \cdot z_o(\mathbf{x}')\right] \nonumber\\
%&=& \sigma_z^2 \mathbb{E}[z_o(\mathbf{x}), z_o(\mathbf{x}')] \ \ =\ \ \sigma_z^2 \cdot \kappa_o(\mathbf{x}, \mathbf{x}')
\end{eqnarray}
where the first equality follows from the definition of covariance and $z_\lambda(\mathbf{x})$ in Eq.~\eqref{eq:canonocal}, as well as the fact that $z_o(\mathbf{x})$ has zero means. The $2^{\mathrm{nd}}$ quality holds because of the canonical kernel definition. The parameter of this kernel function is thus defined as a tuple $(\mathbf{W}, \sigma_\lambda)$.
\fi
Inspired by the canonical representation in Definition~\ref{def:canonical gp}, we can now characterize two GPs $z_k(\mathbf{x})$ and $z_q(\mathbf{x})$, both of which are obtained via scaling the input of $z_o(\mathbf{x})$ using the above mechanism with separate parameters $(\mathbf{W}_k, \sigma_k)$ and $(\mathbf{W}_q, \sigma_q)$. Following Eq.~\eqref{eq:cov} above, 
\begin{eqnarray} \label{eq: kernel q k}
\kappa_k(\mathbf{x}, \mathbf{x}') 
&=& \sigma_k^2 \kappa_o\Big(\mathbf{x}\mathbf{W}_k^\top, \mathbf{x}'\mathbf{W}_k^\top\Big) \nonumber\\
\kappa_q(\mathbf{x}, \mathbf{x}') 
&=& \sigma_q^2 \kappa_o\Big(\mathbf{x}\mathbf{W}_q^\top, \mathbf{x}'\mathbf{W}_q^\top\Big), \label{eq:K_k}
\end{eqnarray}
where $\kappa_o$ is a parameter-free kernel function of the canonical GP $z_o(\mathbf{x})$.
Furthermore, it can be shown that this representation also allows analytic derivation of the cross-covariance between $z_k(\mathbf{x})$ and $z_q(\mathbf{x})$ as follows,
\begin{eqnarray} \label{eq: cross-kernel func}
\kappa_{kq}(\mathbf{x}, \mathbf{x}') &=& \sigma_k\sigma_q \ \kappa_o\Big(\mathbf{x}\mathbf{W}_k^\top, \mathbf{x}'\mathbf{W}_q^\top\Big) \nonumber\\
\kappa_{qk}(\mathbf{x}, \mathbf{x}') &=& \sigma_q\sigma_k \ \kappa_o\Big(\mathbf{x}\mathbf{W}_q^\top, \mathbf{x}'\mathbf{W}_k^\top\Big).
\end{eqnarray}
Note that, unlike the in-domain covariance functions $\kappa_k$ and $\kappa_q$, the cross-domain covariance $\kappa_{kq}$ and $\kappa_{qk}$ are not symmetric, unless we force $\mathbf{W}_k = \mathbf{W}_q = \mathbf{W}$. This relaxes the restrictive Gaussian imposition on the marginal of $(z_k(\mathbf{x}), z_q(\mathbf{x}))$ on a finite set of inputs  $\mathbf{X} = \{\mathbf{x}_1, \mathbf{x}_2, \ldots, \mathbf{x}_n\}$ while still enabling a closed-form computation of the cross-function prediction of $\mathbf{z}_q = [z_q(\mathbf{x}_1), \ldots, z_q(\mathbf{x}_n)]^\top$ given the perturbed observations $\mathbf{z}_k = [z_{k1}, z_{k2}, \ldots, z_{kn}]^\top$ with $z_{ki} \sim \mathbb{N}(z_k(\mathbf{x}_i), \sigma^2)$. This is possible because $(z_q(\mathbf{x}), z_o(\mathbf{x}))$ and $(z_k(\mathbf{x}), z_o(\mathbf{x}))$ are both Gaussian even though $(z_k(\mathbf{x}), z_q(\mathbf{x}))$ is not. This also enables a mathematical quantification of the prediction uncertainty in terms of the conditional covariance of $\mathbf{z}_q \mid \mathbf{z}_k$.

Such modeling properties are desirable because (1) the closed-form representation of the cross-function prediction will help reproduce the kernel attention form in Eq.~\eqref{eq:kernel attention} with $\mathbf{z}_q$ being the attention output and $\mathbf{z}_k$ being the input to the attention unit; and (2) the mathematically induced form of its predictive covariance can be leveraged to calibrate the uncertainty output of the attention unit. To achieve this, we will establish the closed-form prediction of $\mathbf{z}_q \mid \mathbf{z}_k$ in the rest of this section, and then establish its correspondence to (asymmetric) kernel attention in Section~\ref{sec: CGP attention}. 

\iffalse
Nonetheless, regardless of whether these kernel functions are symmetric or not, the marginal of $(z_k(\mathbf{x}), z_q(\mathbf{x}))$ on any finite set of inputs  $\mathbf{X} = \{\mathbf{x}_1, \mathbf{x}_2, \ldots, \mathbf{x}_n\}$ is always a multivariate Gaussian. 
This means we can predict the value of $\mathbf{z}_q = [z_q(\mathbf{x}_1), \ldots, z_q(\mathbf{x}_n)]^\top$ given the perturbed observations $\mathbf{z}_k = [z_{k1}, z_{k2}, \ldots, z_{kn}]^\top$ where $z_{ki} \sim \mathbb{N}(z_k(\mathbf{x}_i), \sigma^2)$ and the above cross-covariance functions\footnote{Note that there is a slight abuse of notation here where $\mathbf{z}_q$ is used to denote true oracle values of $z_q(\mathbf{x})$ on $\{\mathbf{x}_i\}_{i=1}^n$ while $\mathbf{z}_k$ is used to instead denote the perturbed observations of $z_q(\mathbf{x})$ on $\{\mathbf{x}_i\}_{i=1}^n$.},
\begin{eqnarray}
\mathbf{z}_q &\sim& \mathbb{N}\Bigg(\mathcal{K}_{qk} \Big(\mathcal{K}_k + \sigma^2\mathbf{I}\Big)^{-1} \mathbf{z}_k,\ \mathcal{K}_q - \mathcal{K}_{qk}\Big(\mathcal{K}_k + \sigma^2\mathbf{I}\Big)^{-1}\mathcal{K}_{kq}\Bigg) \label{eq:cross-function-prediction}
\end{eqnarray}
where $\mathcal{K}_k$ and $\mathcal{K}_q$ are the induced Gram matrices of $\kappa_k$ and $\kappa_q$ on $\mathbf{X} = (\mathbf{x}_1, ,\mathbf{x}_2, \ldots, \mathbf{x}_n)$. Likewise, $\mathcal{K}_{kq}$ and $\mathcal{K}_{qk}$ are induced Gram matrices of $\kappa_{kq}$ and $\kappa_{qk}$ on $\mathbf{X} = (\mathbf{x}_1, ,\mathbf{x}_2, \ldots, \mathbf{x}_n)$.
\fi

To derive the closed form for $\mathbb{E}[\mathbf{z}_q \mid \mathbf{z}_k]$, note that for any set of outputs $\mathbf{z}_o = [z_o(\mathbf{x}_{o1}), z_o(\mathbf{x}_{o2}), \ldots, z_o(\mathbf{x}_{on})]$ of the canonical GP-distributed function $z_o(\mathbf{x})$ at a set of latent inputs $\mathbf{X}_o = [\mathbf{x}_{o1}, \mathbf{x}_{o2}, \ldots, \mathbf{x}_{on}]$,
\begin{eqnarray}
\hspace{-14mm}p\big(\mathbf{z}_q|\mathbf{z}_k\big) &=& \int_{\mathbf{z}_o}p(\mathbf{z}_q\mid\mathbf{z}_o) \cdot p(\mathbf{z}_o \mid \mathbf{z}_k)\ \mathrm{d}\mathbf{z}_o \ .
\end{eqnarray}
The mean of the above distribution is therefore:
\begin{eqnarray}
\hspace{-0.75mm}\mathbb{E}\Big[\mathbf{z}_q \mid \mathbf{z}_k\Big] 
\hspace{-3mm}&=&\hspace{-3mm} \int_{\mathbf{z}_o}\left(\int_{\mathbf{z}_q}\mathbf{z}_q p(\mathbf{z}_q \mid \mathbf{z}_o)\mathrm{d}\mathbf{z}_q\right) p(\mathbf{z}_o\mid\mathbf{z}_k)\ \mathrm{d}\mathbf{z}_o\ . \nonumber
\end{eqnarray}
% \begin{eqnarray}
% \mathbb{E}\Big[\mathbf{z}_q \mid \mathbf{z}_k\Big] 
% \hspace{-3mm}&=&\hspace{-3mm} \int_{\mathbf{z}_q} \mathbf{z}_q\left(\int_{\mathbf{z}_o}p(\mathbf{z}_q\mid\mathbf{z}_o) \cdot p(\mathbf{z}_o \mid \mathbf{z}_k)\mathrm{d}\mathbf{z}_o\right)\mathrm{d}\mathbf{z}_q \nonumber\\
% \hspace{-3mm}&=&\hspace{-3mm} \int_{\mathbf{z}_o}\left(\int_{\mathbf{z}_q}\mathbf{z}_q p(\mathbf{z}_q \mid \mathbf{z}_o)\mathrm{d}\mathbf{z}_q\right) p(\mathbf{z}_o\mid\mathbf{z}_k)\ \mathrm{d}\mathbf{z}_o \nonumber
% \end{eqnarray}
The above can be rewritten concisely as
\begin{eqnarray}
\hspace{-13mm}\mathbb{E}\Big[\mathbf{z}_q \mid \mathbf{z}_k\Big] &=& \mathbb{E}_{\mathbf{z}_o \sim p(\mathbf{z}_o \mid \mathbf{z}_k)}\Bigg[\mathbb{E}\Big[\mathbf{z}_q \mid \mathbf{z}_o\Big] \mid \mathbf{z}_k\Bigg],\label{eq:preda}
\end{eqnarray}
where the inner expectation is over $\mathbf{z}_q \sim p(\mathbf{z}_q \mid \mathbf{z}_o)$. Now, let $\mathcal{K}_o$ denote the induced Gram matrix of $\kappa_o(\mathbf{x},\mathbf{x}')$ on the set of $n$ latent inputs $\mathbf{X}_o$. As the marginals $(z_o(\mathbf{x}), z_q(\mathbf{x}))$ and $(z_o(\mathbf{x}), z_k(\mathbf{x}))$ are both Gaussian, it follows that 
\begin{eqnarray}
\hspace{-27mm}\mathbb{E}[\mathbf{z}_q \mid \mathbf{z}_o] &=& \mathcal{K}_{qo}(\mathcal{K}_o + \sigma^2\mathbf{I})^{-1}\mathbf{z}_o \ ,\\
\hspace{-27mm}\mathbb{E}[\mathbf{z}_o \mid \mathbf{z}_k] &=& \mathcal{K}_{ok}(\mathcal{K}_k + \sigma^2\mathbf{I})^{-1}\mathbf{z}_k \ ,\label{eq:cond}
\end{eqnarray}
where $\mathcal{K}_{qo}$ and $\mathcal{K}_{ok}$ denote the cross-covariance matrices between $z_q(\mathbf{x})$ and $z_o(\mathbf{x})$, and between $z_o(\mathbf{x})$ and $z_k(\mathbf{x})$, on $\mathbf{X}_o = [\mathbf{x}_{o1}, \ldots, \mathbf{x}_{on}]$, respectively. This means the entry at row $a$ and column $b$ of $\mathcal{K}_{qo}$ is $\sigma_q\kappa_o(\mathbf{x}_{a}\mathbf{W}_q^\top, \mathbf{x}_{ob})$ and likewise, the entry at row $a$ and column $b$ of $\mathcal{K}_{ok}$ is $\sigma_k\kappa_o(\mathbf{x}_{oa}, \mathbf{x}_{b}\mathbf{W}_k^\top)$. Eq.~\eqref{eq:cond} is the direct result of the Gaussian conditional identity. Thus, plugging Eq.~\eqref{eq:cond} into Eq.~\eqref{eq:preda} gives
\begin{eqnarray}
\hspace{-7mm}\mathbb{E}[\mathbf{z}_q \mid \mathbf{z}_k] \hspace{-2mm}&=&\hspace{-2mm} \mathbb{E}_{\mathbf{z}_o \sim p(\mathbf{z}_o\mid \mathbf{z}_k)}\Big[\mathbb{E}[\mathbf{z}_q \mid \mathbf{z}_o] \mid \mathbf{z}_k\Big] \\
% \hspace{-2mm}&=&\hspace{-2mm} \mathbb{E}_{\mathbf{z}_o \sim p(\mathbf{z}_o\mid \mathbf{z}_k)}\Big[\mathcal{K}_{qo}(\mathcal{K}_o + \sigma^2\mathbf{I})^{-1}\mathbf{z}_o\Big] \\
% \hspace{-2mm}&=&\hspace{-2mm} \mathcal{K}_{qo}(\mathcal{K}_o + \sigma^2\mathbf{I})^{-1}\mathbb{E}\Big[\mathbf{z}_o \mid \mathbf{z}_k\Big] \\
\hspace{-2mm}&=&\hspace{-2mm} \mathcal{K}_{qo}(\mathcal{K}_o + \sigma^2\mathbf{I})^{-1}\mathcal{K}_{ok}(\mathcal{K}_k + \sigma^2\mathbf{I})^{-1}\mathbf{z}_k, \label{eq: exp closed form}
\end{eqnarray}
which establishes the closed form for the cross-function prediction of $\mathbf{z}_q$ conditioned on $\mathbf{z}_k$.

% \textcolor{blue}{Long: Add definition for canonical and correlated}

\iffalse
\par The \textit{canonical} or standardized representation of GP was proposed by \citep{aueb2013variational} to overcome the intractability of GP when the number of hyperparameters to be estimated by maximum likelihood is large. Assume we have data $\{\mathbf{x}_i, z_i\}_{i=1}^n \in \mathbb{R}^d \times \mathbb{R}$, of $n$ data points generated from a latent function $z(\mathbf{x})$, such that each $z_i$ is obtained by adding a Gaussian noise to the value of $z(\mathbf{x})$ at $\mathbf{x}_i$,
    $$
        z_i = z(\mathbf{x}_i) + \epsilon_i, \quad \epsilon_i \sim \mathbb{N}(0, \sigma^2).
    $$
    
\par Consider the scalar function $s(\mathbf{h}), \ \mathbf{h}\in \mathbb{R}^s$, which is a random sample from a GP in the space $\mathbb{R}^s$ and assumed to have zero mean function and covariance  given by the kernel function $\kappa_s(\mathbf{h}_a, \mathbf{h}_b)$, whose hyperparameters are set to 1. This GP is referred as a \textit{canonical} or standardized GP and the function $s(\mathbf{h})$ is called the standardized function \citep{aueb2013variational}.
    % For example, the covariance function could be a squared exponential function $$k_s(\textbf{z}, \textbf{z}')=\text{exp}\{{-0.5||\textbf{z}-\textbf{z}'||^2}\},$$ with length-scale and global scale set to 1 \citep{aueb2013variational}.
    After obtaining the function $s(\textbf{z})$ in the space $\mathbb{R}^s$ , we can compute the function $z(\mathbf{x})$ in the input space $\mathbb{R}^{d}$ as
    \begin{align}
        z(\mathbf{x}) = \sigma_zs(\mathbf{x}\mathbf{W}^T),
    \end{align}
    where $\sigma_z$ is a scalar hyperparameter associate with the latent function $z$ and $\mathbf{W} \in \mathbb{R}^{s\times d}$ is the affine transformation from the space $\mathbb{R}^d$ to $\mathbb{R}^s$. 
    Given the values of $\sigma_z$ and $\textbf{W}$, a Gaussian prior can be induced on the function $z(\textbf{x})$, which has zero mean and covariance function $\kappa_z(\mathbf{x}_a, \mathbf{x}_b)$,
    \begin{equation}
        \begin{aligned}
            z(\textbf{x}) &\sim \mathbb{N}(0, \kappa_z(\mathbf{x}_a, \mathbf{x}_b)) \\
            \kappa_z(\mathbf{x}_a, \mathbf{x}_b)&= \mathbb{E}[\sigma_zs(\mathbf{x}_a\mathbf{W}^\top)\sigma_zs(\mathbf{x}_b\mathbf{W}^\top)] = \sigma_z^2 \kappa_s(\mathbf{x}_a\mathbf{W}^\top, \mathbf{x}_b\mathbf{W}^\top)
        \end{aligned}
    \end{equation}
    The GP model can be represented with the following structure:
     \begin{equation} \label{eq: canonical GP}
        \begin{aligned}
            s(\textbf{h}) &\sim \mathbb{N}(0, \kappa_s(\mathbf{h}_a, \mathbf{h}_b)), \\
            z(\textbf{x}) &=  \sigma_zs(\textbf{xW}^\top) \\
            z_i &\sim \mathbb{N}(z(
            \textbf{x}_i), \sigma^2), \quad i=1,\ldots,N.
        \end{aligned}
    \end{equation}
    \textcolor{red}{TODO: Say more about this}
    We refer to GPs with structure given by (\ref{eq: canonical GP}) canonical GPs.
\fi
    
\subsection{Kernel Attention via CGP} \label{sec: CGP attention}
With the above cross-function GP prediction recipe, we are now ready to draw correspondence with (asymmetric) kernel attention. As usual, we have
\begin{eqnarray}
\mathbf{V}^+ &=& \mathcal{K}\mathbf{V} \ \ =\ \ \mathcal{K}\mathbf{X}\mathbf{W}_v^\top \ ,
\end{eqnarray}
where $\mathcal{K}$ corresponds to a particular choice of kernel. To draw the correspondence between this and the CGP prediction, we choose $\mathcal{K} = \mathcal{K}_{qo}(\mathcal{K}_o+\sigma^2 \mathbf{I})^{-1}\mathcal{K}_{ok}$. With this,
\begin{eqnarray}
\hspace{-14mm}\mathbf{V}^+ \hspace{-2mm}&=&\hspace{-2mm} \mathcal{K}\mathbf{V} \ =\ \mathcal{K}\mathbf{X}\mathbf{W}_v^\top \nonumber \\ 
\hspace{-7mm}\hspace{-2mm}&=&\hspace{-2mm} \mathcal{K}_{qo}(\mathcal{K}_o+\sigma^2 \mathbf{I})^{-1}\mathcal{K}_{ok}(\mathcal{K}_k+\sigma^2 \mathbf{I})^{-1}\mathbf{Z} \ ,\label{eq:cgpt_output}
\end{eqnarray}
% \begin{eqnarray}
% \hspace{-8.5mm}\mathbf{V}^+ \hspace{-2mm}&=&\hspace{-2mm} \mathcal{K}\mathbf{V} \ =\ \mathcal{K}\mathbf{X}\mathbf{W}_v^\top \ = \  \mathcal{K}_{qo}(\mathcal{K}_o+\sigma^2 \mathbf{I})^{-1}\mathcal{K}_{ok}\nonumber\\
% \hspace{-7mm}\hspace{-2mm}&\times&\hspace{-2mm} (\mathcal{K}_k+\sigma^2 \mathbf{I})^{-1} \ (\mathcal{K}_k+\sigma^2 \mathbf{I}) \ \mathbf{X}\mathbf{W}_v^\top \nonumber \\ \hspace{-7mm}\hspace{-2mm}&=&\hspace{-2mm} \mathcal{K}_{qo}(\mathcal{K}_o+\sigma^2 \mathbf{I})^{-1}\mathcal{K}_{ok}(\mathcal{K}_k+\sigma^2 \mathbf{I})^{-1}\mathbf{Z} \ ,\label{eq:cgpt_output}
% \end{eqnarray}
where $\mathbf{Z} \triangleq (\mathcal{K}_k + \sigma^2\mathbf{I})\ \mathbf{X}\mathbf{W}_v^\top$. Again, let $\boldsymbol{\nu}_a$ denote the $a$-th column of $\mathbf{V}^+$ and $\mathbf{z}_a$ denote the $a$-th column of $\mathbf{Z}$,
\begin{eqnarray} 
\label{eq:cgp_attention}
\hspace{-7.5mm}\boldsymbol{\nu}_a &=& \mathcal{K}_{qo}\Big(\mathcal{K}_o+\sigma^2 \mathbf{I}\Big)^{-1}\mathcal{K}_{ok}\Big(\mathcal{K}_k+\sigma^2 \mathbf{I}\Big)^{-1}\mathbf{z}_a \ .
\end{eqnarray}
This implies that the $a$-th column $\boldsymbol{\nu}_a$ of the kernel attention output in fact corresponds to the mean prediction of $z_q(\mathbf{x}_1), \ldots, z_q(\mathbf{x}_n)$ under the conditional distribution $p(\mathbf{z}_q \mid \mathbf{z}_k)$, which was fitted on the dataset $\{\mathbf{x}_r, [\mathbf{z}_a]_r\}_{r=1}^n$. Here, the target fitting outputs $\{[\mathbf{z}_a]_r\}_{r=1}^n$ are treated as perturbed observations of $\{z_k(\mathbf{x}_r)\}_{r=1}^n$ such that $[\mathbf{z}_a]_r \sim \mathbb{N}(z_k(\mathbf{x}_r), \sigma^2)$.

\begin{remark}[CGP-based Attention can be Asymmetric.]
Since the induced Gram matrices $\mathcal{K}_{qk}$ and $\mathcal{K}_{kq}$ do not need to be symmetric to guarantee a consistent CGP model, we are no longer constrained to set $\mathbf{W}_q = \mathbf{W}_k$ to enforce a symmetric kernel. As a result, our CGP-based attention can accommodate attention asymmetries.
%With the above correspondence, we are no longer constrained to set $\mathbf{W}_q = \mathbf{W}_k$ to enforce symmetric kernel  
\end{remark}

\begin{remark}[One CGP per Attention Dimension]
Suppose the attention output is $s$-dimensional. Then, the kernel attention unit is equivalent to a collection of $s$ CGPs modeling $s$ datasets $\{\mathbf{x}_r, [\mathbf{z}_a]_r\}_{r=1}^n$ with $a \in [s]$.
\end{remark}



\iffalse
The vectors $\mathbf{z}_Q$ and $\mathbf{z}_K$ covary with each other based on the cross-covariance function:
    \begin{equation} \label{eq: cross kernel}
        \begin{aligned}
        \kappa_{z_Q, z_K}(\textbf{x}_a, \textbf{x}_b) = \mathbb{E}[\sigma_{Q}s(\textbf{x}_a\textbf{W}_Q^T)\sigma_{K}s(\textbf{x}_b\textbf{W}_K^T)]
        &= \sigma_{Q}\sigma_{K}\kappa_s(\textbf{q}_a, \textbf{k}_b).
        \end{aligned}
    \end{equation}
    Based on these kernel functions, we can compute the kernel matrices $\mathbf{K}_{\mathbf{z}_Q, \mathbf{z}_Q}, \mathbf{K}_{\mathbf{z}_K, \mathbf{z}_K}, \mathbf{K}_{\mathbf{z}_Q, \mathbf{z}_K} \in \mathbb{R}^{n\times n}$ with entries $(a,b)$ given by eqn (\ref{eq: kernel qq}), (\ref{eq: kernel kk}) and (\ref{eq: cross kernel}) respectively. Using these kernel matrices, we can express the conditional Gaussian distribution
     \begin{align} \label{eq: cond GP prior}
        P(\mathbf{z}_Q| \mathbf{z}_K) = \mathbb{N}(\mathbf{K}_{\mathbf{z}_Q,\mathbf{z}_K} \mathbf{K}_{\mathbf{z}_K,\mathbf{z}_K}^{-1}\mathbf{z}_K, \mathbf{K}_{\mathbf{z}_Q,\mathbf{z}_Q} - \mathbf{K}_{\mathbf{z}_Q,\mathbf{z}_K} \mathbf{K}_{\mathbf{z}_K,\mathbf{z}_K}^{-1}\mathbf{K}_{\mathbf{z}_Q,\mathbf{z}_K}^\top).
    \end{align}
    \par Denote $[\mathbf{v}]_{:,i}$ as the $i$-th column of the value matrix $\mathbf{V}$, if we parameterize each column $[\mathbf{v}]_{:,i}$ as
    \begin{align}
        [\mathbf{v}]_{:,i} = \mathbf{K}_{\mathbf{z}_K,\mathbf{z}_K}^{-1}\mathbf{z}_K,
    \end{align}
    we can re-formalize the mean of the conditional Gaussian in Eq. (\ref{eq: cond GP prior}),
    \begin{align}
        \boldsymbol{\mu} \triangleq \mathbf{K}_{\mathbf{z}_Q,\mathbf{z}_K} [\mathbf{v}]_{:,i}.
    \end{align}
    The value of the mean $\boldsymbol{\mu}$ is equivalent to the $i$-th column of the kernel attention output \eqref{eq:kernel attention}. We remark that this equivalence does not require the valid symmetric kernel condition as in the case of vanilla GP Inference in Section \eqref{sec: attention as GP}. This is due to the fact that the
    kernel matrix $\mathbf{K}_{\mathbf{z}_Q,\mathbf{z}_K}$ needs not to be symmetric, as its entries $\kappa_{\mathbf{z}_Q,\mathbf{z}_K}(\mathbf{x}_a, \mathbf{x}_b)$ are defined as the cross-covariance between two correlated GPs $z_Q(\mathbf{x})$ and $z_K(\mathbf{x})$. Based on the conditional Gaussian \eqref{eq: cond GP prior}, we can address the uncertainty of the kernel attention output using the variance,
    \begin{align}
        \boldsymbol{\Sigma} \triangleq \mathbf{K}_{\mathbf{z}_Q,\mathbf{z}_Q} - \mathbf{K}_{\mathbf{z}_Q,\mathbf{z}_K} \mathbf{K}_{\mathbf{z}_K,\mathbf{z}_K}^{-1}\mathbf{K}_{\mathbf{z}_Q,\mathbf{z}_K}^\top. 
    \end{align}
    Therefore, we give the definition for Canonical Gaussian Process Attention (CGPA) in Def. \eqref{def: CGPA}.

    \begin{definition} \label{def: CGPA}
        
    \end{definition}
\fi

\subsection{Correlated GP Transformer} 
\label{sec:learning kernel}
Our proposed Correlated Gaussian Process Transformer (CGPT) framework is derived via replacing the conventional attention unit with the above CGP prediction mechanism (Section~\ref{sec:cgp-attention}). Unlike a conventional transformer which optimizes for performance while neglecting uncertainty calibration for the attention output, our CGPT will optimize for both to avoid making predictions with high uncertainty while preserving overall performance. This is achieved via augmenting the original transformer loss with a regularization loss per attention block, which is expressed in terms of its prediction's uncertainty (Section~\ref{sec:CGP-regularize}). This highlights the importance of CGP's uncertainty quantification mechanism. Our overall CGPT workflow is depicted in Fig.~\ref{fig:diagram}.

\begin{figure*}[t]
    \centering
    \includegraphics[scale=0.5]{gptransformers/images/architecture.pdf}
    \caption{Diagram of the training workflow of CGPT. Each attention block forwards the CGP's prediction to the next block and caches the prediction uncertainty into a CGP regularizing term (see Algorithm~\ref{alg:cap}). Once the attention output is propagated to the last classification block, the original transformer loss is computed and augmented with the CGP regularizing term. Gradient propagation from this augmented loss will help optimize the CGP parameters to reduce prediction uncertainty while maximizing predictive performance.}
    \label{fig:diagram}
\end{figure*}

\subsubsection{CGP-based Attention}
\label{sec:cgp-attention}
Given the above correspondence between the CGP model and kernel attention mechanism, we can replace the original kernel attention with the following CGP-based attention: 

{\bf CGP-based Attention Workflow.} At each training iteration, upon receiving $\mathbf{X} = (\mathbf{x}_1, \mathbf{x}_2, \ldots, \mathbf{x}_n)$ from the preceding neural block, we will run the following routine in Alg.~\ref{alg:cap}.
\begin{algorithm}[h!]
\caption{CGP-based Attention}\label{alg:cap}
\textbf{input:} sequence of tokens $\mathbf{X} = (\mathbf{x}_1, \mathbf{x}_2, \ldots, \mathbf{x}_n)$\\
\textbf{output:} attention output $\mathbf{V}^+$ and uncertainty $\mathcal{U}$\\\vspace{-4mm}
\begin{algorithmic}[1]
\STATE compute $\mathbf{Z} = (\mathcal{K}_k + \sigma^2\mathbf{I})\ \mathbf{X}\mathbf{W}_v^\top$ and initialize $\mathcal{U} \leftarrow 0$
%\STATE Initialize $\mathrm{reg} \leftarrow 0$
\FOR{\(a \gets 1 : s\)}
\STATE build $\{\mathbf{x}_r, [\mathbf{z}_a]_r\}_{r=1}^n$ where $\mathbf{z}_a \leftarrow [\mathbf{Z}]_a$

\STATE compute $\boldsymbol{\nu}_a$ using Eq.~\eqref{eq:cgp_attention}. %Assemble $\{\boldsymbol{\nu}_a\}_a$ into $\mathbf{V}_+$

\STATE set $\mathbf{z}_k \leftarrow ([\mathbf{z}_a]_1, \ldots, [\mathbf{z}_a]_n)$

\STATE compute $\log p(\mathbf{z}_q = \boldsymbol{\nu}_a, \mathbf{z}_k = \mathbf{z}_k)$ using Eq.~\eqref{eq:CGP-loss-compute}

\STATE update $\mathcal{U} \leftarrow \mathcal{U} - \log p(\mathbf{z}_q = \boldsymbol{\nu}_a, \mathbf{z}_k = \mathbf{z}_k)$
\ENDFOR

\STATE return $\mathbf{V}^+ \leftarrow [\boldsymbol{\nu}_1, \ldots, \boldsymbol{\nu}_s]$ and $\mathcal{U}$
\end{algorithmic}
\end{algorithm}
%\end{tcolorbox}
% The whole model is trained using our objective function \eqref{eq:final_loss}, which is discussed in the following text.


In the above workflow, the output of each CGP-based attention block will be forwarded to the classification block that computes the logit output of the transformer, which induces its training loss. As the CGP-based attention's output is a function of the CGP's modeling parameters, the transformer loss is also a function of these parameters. Hence, the CGP can be fitted via minimizing this training loss. However, the CGP parameters learned in this manner might induce brittle predictions with high variance, especially in the out-of-distribution data regime. The uncertainty accumulation steps of the above workflow (the computation of the log-density and the update of $\mathcal{U}$ in Alg.~\ref{alg:cap}) are therefore necessary to encode a preference for parameters that induce output with low uncertainty. This is achieved via accumulating the CGP's output uncertainty -- Eq.~\eqref{eq:CGP-loss} and Eq.~\eqref{eq:CGP-loss-compute} -- per attention block into a regularizer, which is added to the main loss of the transformer prior to gradient propagation, as demonstrated in Eq.~\eqref{eq:loss}.

\iffalse
\begin{definition}[Correlated Gaussian Process Transformer] A \textbf{Correlated Gaussian Process Transformer} (CGPT) has the $a$-th column of the attention output $\mathbf{V}^+$ given by Eq. \eqref{eq:cgp_attention} and is trained to minimize the CGP objective function.
\end{definition}
\fi

\subsubsection{CGP Regularization Loss}
\label{sec:CGP-regularize}
For each attention output dimension $a$, the observations $\mathbf{z}_k = [z_k(\mathbf{x}_1), z_k(\mathbf{x}_2), \ldots, z_k(\mathbf{x}_n)]$ are set to be $\mathbf{z}_a$ which is the $a$-th column of $\mathbf{Z} = (\mathcal{K}_k + \sigma^2\mathbf{I})\mathbf{X}\mathbf{W}^\top_v$. 

Following Eq.~\eqref{eq:cgp_attention}, the attention output for this dimension, $\boldsymbol{\nu}_a = \mathbb{E}[\mathbf{z}_q \mid \mathbf{z}_k]$, is the expected CGP prediction of $\mathbf{z}_q = [z_q(\mathbf{x}_1), z_q(\mathbf{x}_2),\ldots, z_q(\mathbf{x}_n)]$ given the observation $\mathbf{z}_k$. We would therefore want to maximize:
\begin{eqnarray}
\hspace{-7mm}\log p\Big(\mathbf{z}_q = \boldsymbol{\nu}_a, \mathbf{z}_k\Big) \hspace{-2mm}&=&\hspace{-2mm} \log \mathbb{E}_{\mathbf{z}_o}\Big[ p\Big(\mathbf{z}_q = \boldsymbol{\nu}_a, \mathbf{z}_k \Big | \mathbf{z}_o\Big)\Big] \label{eq:CGP-loss} 
\end{eqnarray}
because this would minimize the output uncertainty of our CGP-based attention mechanism, i.e. maximizing the fit between the input and output of the kernel attention unit. In other words, the output uncertainty of the attention output is the negation of the above log probability term. To compute it, we note that $p(\mathbf{z}_q, \mathbf{z}_k \mid \mathbf{z}_o) = p(\mathbf{z}_q \mid \mathbf{z}_o) \cdot p(\mathbf{z}_k \mid \mathbf{z}_o)$
% \begin{eqnarray}
% \hspace{-21mm}p(\mathbf{z}_q, \mathbf{z}_k \mid \mathbf{z}_o) &=& p(\mathbf{z}_q \mid \mathbf{z}_o) \cdot p(\mathbf{z}_k \mid \mathbf{z}_o)
% \end{eqnarray}
since $\mathbf{z}_k \perp \mathbf{z}_q \mid \mathbf{z}_o$ which follows from the CGP definition. Consequently, we have
\begin{equation} \label{eq:CGP-loss-compute}
    \begin{aligned}
        &\hspace{-0.8mm}\log \mathbb{E}_{\mathbf{z}_o}\Big[p(\mathbf{z}_q, \mathbf{z}_k \mid \mathbf{z}_o)\Big] \hspace{-2.5mm}\\
        &=\log \mathbb{E}_{\mathbf{z}_o}\Big[p(\mathbf{z}_q \mid \mathbf{z}_o) \cdot p(\mathbf{z}_k \mid \mathbf{z}_o)\Big]\\
        &=\log \mathbb{E}_{\mathbf{z}_o}\Big[p(\mathbf{z}_q \mid \mathbf{z}_o)\Big] + \log \mathbb{E}_{\mathbf{z}_o}\Big[p(\mathbf{z}_k \mid \mathbf{z}_o)\Big]. 
    \end{aligned}
\end{equation}
% \begin{eqnarray}
% \log \mathbb{E}_{\mathbf{z}_o}\Big[p(\mathbf{z}_q, \mathbf{z}_k \mid \mathbf{z}_o)\Big] \hspace{-2.5mm}&=&\hspace{-2.5mm} \log \mathbb{E}_{\mathbf{z}_o}\Big[p(\mathbf{z}_q \mid \mathbf{z}_o) \cdot p(\mathbf{z}_k \mid \mathbf{z}_o)\Big]  \nonumber\\
% \hspace{-2.5mm}&=&\hspace{-2.5mm} \log \mathbb{E}_{\mathbf{z}_o}\Big[p(\mathbf{z}_q \mid \mathbf{z}_o)\Big] \nonumber\\
% \hspace{-2.5mm}&+&\hspace{-2.5mm} \log \mathbb{E}_{\mathbf{z}_o}\Big[p(\mathbf{z}_k \mid \mathbf{z}_o)\Big] \ .\label{eq:CGP-loss-compute}
% \end{eqnarray}

%However, this maximization task still does not provide the attention output $\boldsymbol{\nu}_a = \mathbb{E}[\mathbf{z}_q \mid \mathbf{z}_k]$ with a feedback on how it helps improve the performance on ultimate learning target of the transformer. 

Now, let $\mathrm{loss}(\boldsymbol{\nu}_a)$ denote the original loss of the transformer which is a function of the attention output\footnote{For simplicity, we narrate this part assuming there is a single attention block with one output dimension. Otherwise, it is straight-forward to extend the above to multiple attention blocks with multiple output dimensions by including one uncertainty term per attention block and output dimension into the final loss.} $\boldsymbol{\nu}_a$. To opt for both uncertainty minimization and performance maximization, we propose to minimize the following augmented loss $\theta_\ast = \argmin_\theta \mathfrak{L}(\theta)$ where
\begin{eqnarray}
\hspace{-10mm}\mathfrak{L}(\theta) \hspace{-2mm}&\triangleq&\hspace{-2mm} \mathrm{loss}(\boldsymbol{\nu}_a) - \alpha\cdot \log p(\mathbf{z}_q = \boldsymbol{\nu}_a, \mathbf{z}_k)  \nonumber\\
\hspace{-2mm}&=&\hspace{-2mm} \mathrm{loss}(\boldsymbol{\nu}_a) - \alpha\cdot \log \mathbb{E}_{\mathbf{z}_o}\Big[p(\mathbf{z}_q = \boldsymbol{\nu}_a, \mathbf{z}_k \mid \mathbf{z}_o)\Big],\label{eq:loss}
\end{eqnarray}
where $\alpha > 0$ is a regularization coefficient while $\theta$ represents the collection of all CGP parameters from which the attention output $\boldsymbol{\nu}_a = \mathbb{E}[\mathbf{z}_q \mid \mathbf{z}_k]$ and the CGP density $p(\mathbf{z}_q = \boldsymbol{\nu}_a, \mathbf{z}_k \mid \mathbf{z}_o)$ are computed. %This principle was also previously adopted in \citep{chen2023calibrating}. 

Finally, plugging Eq.~\eqref{eq:CGP-loss-compute} in Eq.~\eqref{eq:loss},
\begin{eqnarray} 
% \label{eq:final_loss}
\hspace{-6mm}\mathfrak{L}(\theta)
&=& \mathrm{loss}(\boldsymbol{\nu}_a) - \alpha\cdot \Big[\log \mathbb{E}_{\mathbf{z}_o}\Big[p(\mathbf{z}_q = \boldsymbol{\nu}_a \mid \mathbf{z}_o)\Big] \nonumber\\
\hspace{-15mm}&+& \log \mathbb{E}_{\mathbf{z}_o}\Big[p(\mathbf{z}_k \mid \mathbf{z}_o)\Big] \Big] \ ,\label{eq:expanded_loss}
\end{eqnarray}
where $p(\mathbf{z}_q \mid \mathbf{z}_o)$ and $p(\mathbf{z}_k \mid \mathbf{z}_o)$ are both Gaussian, with specific forms detailed in Appendix~\ref{app:A}. We refer to the objective function in Eq.~\eqref{eq:expanded_loss} as the CGP objective; its full derivation, as well as the uncertainty calibration of the induced attention output, is also detailed in Appendix~\ref{app:A}.

\begin{remark}
The regularization coefficient $\alpha > 0$ is a hyper-parameter that balances performance maximization against uncertainty minimization. In practice, it can be empirically selected using a validation set.
\end{remark}
% {\bf Practical Implementation.} Furthermore, note that the cost of computing $\boldsymbol{\nu}_a = \mathbb{E}[\mathbf{z}_q \mid \mathbf{z}_k]$ following Eq.~\eqref{eq: exp closed form} is cubic in the number of inputs to the attention unit, which can be expensive. To avoid this, we adopt a practical relaxation of Eq.~\eqref{eq:expanded_loss} above. Instead of computing $\boldsymbol{\nu}_a = \mathbb{E}[\mathbf{z}_q \mid \mathbf{z}_k]$ exactly, we treat it as the output of a parameterized function $\boldsymbol{\nu}_a = g([\mathbf{x}_i]_{i=1}^n; \phi)$ with learnable parameter $\phi$, which is optimized together with all other CGP parameters $\theta$ via maximizing 
% \begin{eqnarray}
% \hspace{-1.5mm}\mathfrak{L}(\theta, \phi)
% \hspace{-2mm}&=&\hspace{-2mm} \log \mathbb{E}_{\mathbf{z}_o}\Big[p(\mathbf{z}_q = g([\mathbf{x}_i]_{i=1}^n; \phi) \mid \mathbf{z}_o)\Big] \\
% \hspace{-2mm}&+&\hspace{-2mm} \log \mathbb{E}_{\mathbf{z}_o}\Big[p(\mathbf{z}_k \mid \mathbf{z}_o)\Big] - \alpha \cdot \mathrm{loss}\Big(g([\mathbf{x}_i]_{i=1}^n; \phi)\Big) \label{eq:final_loss} \nonumber
% \end{eqnarray}    
% Intuitively, the above maximization task is structure to incentivize a balance between $(\theta, \phi)$ such that (1) the induced attention output $\boldsymbol{\nu}_a$ via $\phi$ will fit well with the kernel attention input $\mathbf{z}_k$ under our CGP formalism while (2) being regularized to effectively reduce the overall performance loss on the learning target of the transformer. Our empirical studies (Section~\ref{sec:experiments}) show that this practical trick achieves better compute cost and performance than those of SGPA, which is a symmetric kernel attention form based on sparse Gaussian processes \citep{chen2023calibrating}.


\section{Sparse Approximation} \label{sec: SCGPT}
As CGPT is developed based on the correlation structure between two full-rank, correlated Gaussian processes, its complexity also scales cubically in the number of input tokens. This is evident in Eq.~\eqref{eq:cgp_attention} which computes the CGP's predictive mean via inverting Gram matrices of size $n$ by $n$ where $n$ is the length of the input sequence. This incurs a prohibitively expensive computation cost of $\mathcal{O}(n^3)$. To mitigate this cubic dependence on the input length, we further develop a sparse approximation to CGP whose processing cost is only linear in the length of the input sequences. The resulting sparse approximation can thus replace the aforementioned CGP-based attention, which gives rise to a new framework of sparse correlated Gaussian process transformer (SCGPT).

To derive this sparse approximation, we begin with the predictive mean $\mathbb{E}[\mathbf{z}_q\mid \mathbf{z}_k]$, whose explicit form is essential to draw correspondence to the output of kernel attention,
\begin{eqnarray} \label{eq: predictive step 1}
\hspace{-11mm}\mathbb{E}\Big[\mathbf{z}_q \mid \mathbf{z}_k\Big] &=& \mathbb{E}_{\mathbf{z}_o \sim p(\mathbf{z}_o \mid \mathbf{z}_k)}\Bigg[\mathbb{E}\Big[\mathbf{z}_q \mid \mathbf{z}_o\Big] \mid \mathbf{z}_k\Bigg]\ .\label{eq:s1}
\end{eqnarray}
Following the previous derivation in Section~\ref{sec: canonical GP}, we recognize that the main computational bottleneck stems from the fact that we are computing the nested expectation above with respect to the predictive distributions of two full-rank Gaussian processes, $p(\mathbf{z}_q \mid \mathbf{z}_o)$ and $p(\mathbf{z}_o \mid \mathbf{z}_k)$. Thus, to mitigate this bottleneck, we can instead adopt existing sparse approximations of Gaussian processes~\citep{Smola01,Tresp00,Tresp03,seeger2003fast,Candela05,Snelson06,Titsias09,Miguel10,Hensman13,NghiaICML15,NghiaICML16,NghiaAAAI17,NghiaAAAI19}. In this work, we use the Deterministic Training Conditional (DTC) approximation of~\citep{seeger2003fast}. 

Specifically, we first replace the exact $p(\mathbf{z}_q \mid \mathbf{z}_o)$ with its DTC approximation, which results in the following sparse approximation of $\mathbb{E}[\mathbf{z}_q \mid \mathbf{z}_o]$,
\begin{eqnarray}
\hspace{-7mm}\mathbb{E}[\mathbf{z}_q\mid\mathbf{z}_o] \hspace{-3mm}&=&\hspace{-3mm} \frac{1}{\sigma^2} \mathcal{K}_{qm}\Big (\mathcal{K}_{mm} + \frac{1}{\sigma^2} \mathcal{K}_{mo}\mathcal{K}_{om}\Big )^{-1}\hspace{-2mm}\mathcal{K}_{mo}\mathbf{z}_o, \label{eq:s2}
\end{eqnarray}
where $\mathcal{K}_{mm}$ is the Gram matrix of a set of inducing points $\{\mathbf{s}_1, \mathbf{s}_2, \ldots, \mathbf{s}_m\}$ that live on the input space of $z_o(\mathbf{x})$, while $\mathcal{K}_{qm}$ and $\mathcal{K}_{om}$ denote the cross-covariance matrices of $\{z_q(\mathbf{x}_i)\}_{i=1}^n$ and $\{z_o(\mathbf{x}_i)\}_{i=1}^n$ with $\{\mathbf{s}_i\}_{i=1}^m$, respectively; and $\mathcal{K}_{mo}$ is the transpose of $\mathcal{K}_{om}$.

Likewise, we can do the same for $p(\mathbf{z}_o \mid \mathbf{z}_k)$, which results in the following sparse approximation for $\mathbb{E}[\mathbf{z}_o \mid \mathbf{z}_k]$,
\begin{eqnarray}
\hspace{-8.5mm}\mathbb{E}[\mathbf{z}_o\mid\mathbf{z}_k] \hspace{-2mm}&=&\hspace{-2mm} \frac{1}{\sigma^2} \mathcal{K}_{o\ell}\Big (\mathcal{K}_{\ell\ell} + \frac{1}{\sigma^2} \mathcal{K}_{\ell k}\mathcal{K}_{k\ell}\Big )^{-1}\mathcal{K}_{\ell k}\mathbf{z}_k, \label{eq:s3}
\end{eqnarray}
which is based on another set of inducing points $\{\mathbf{s}'_i\}_{i=1}^{\ell}$ that live on the input space of $z_o(\mathbf{x})$, while $\mathcal{K}_{o\ell}$ and $\mathcal{K}_{k\ell}$ denote the cross-covariance matrices of $\{z_o(\mathbf{x}_i)\}_{i=1}^n$ and $\{z_k(\mathbf{x}_i)\}_{i=1}^n$ with $\{\mathbf{s}'_i\}_{i=1}^{\ell}$, respectively; $\mathcal{K}_{\ell k}$ is the transpose of $\mathcal{K}_{k\ell}$ and $\mathcal{K}_{\ell\ell}$ is the Gram matrix of $\{\mathbf{s}'_i\}_{i=1}^{\ell}$.
Plugging Eq.~\eqref{eq:s2} and Eq.~\eqref{eq:s3} into Eq.~\eqref{eq:s1} leads to a closed form for the SCGP's predictive mean,
\begin{eqnarray}
\hspace{-2mm}\mathbb{E}[\mathbf{z}_q\mid\mathbf{z}_k] &=& \frac{1}{\sigma^4} \mathcal{K}_{qm}\Big (\mathcal{K}_{mm} + \frac{1}{\sigma^2} \mathcal{K}_{mo}\mathcal{K}_{om}\Big )^{-1}\mathcal{K}_{mo}\nonumber\\ &\times&\mathcal{K}_{o\ell}\Big (\mathcal{K}_{\ell\ell} + \frac{1}{\sigma^2} \mathcal{K}_{\ell k}\mathcal{K}_{k\ell}\Big )^{-1}\mathcal{K}_{\ell k}\mathbf{z}_k ,   \label{eq:s4}
\end{eqnarray}
which is a direct consequence of taking expectation of Gaussian random variables. The readers are referred to Appendix~\ref{sec: predictive mean} for a detailed step-by-step derivation. Using Eq.~\eqref{eq:s4}, we can now draw a correspondence between the predictive mean of SCGP and the output of kernel attention, 
\begin{eqnarray}
\mathbf{V}^+ &=& \mathcal{K}\mathbf{V} \ \ =\ \ \mathcal{K}\mathbf{X}\mathbf{W}_v^\top
\end{eqnarray}
via setting $\mathbf{Z} = \mathbf{X}\mathbf{W}_v^\top$ and the kernel attention matrix $\mathcal{K}$ as
\begin{eqnarray}
\mathcal{K} &\triangleq& \frac{1}{\sigma^4} \mathcal{K}_{qm}\Big (\mathcal{K}_{mm} + \frac{1}{\sigma^2} \mathcal{K}_{mo}\mathcal{K}_{om}\Big )^{-1}\nonumber\\
&\times&\mathcal{K}_{mo}
\mathcal{K}_{o\ell}\Big (\mathcal{K}_{\ell\ell} + \frac{1}{\sigma^2} \mathcal{K}_{\ell k}\mathcal{K}_{k\ell}\Big )^{-1}\mathcal{K}_{\ell k} \ .\label{eq:s5}
\end{eqnarray}
\par As such, the kernel computation scales linearly in the number $n$ of input tokens. The cubic cost is now confined to the numbers $m$ and $\ell$ of inducing inputs in the two sets introduced above. Thus, to improve scalability, we can opt for small values of $m$ and $\ell$ such that $\kappa = \max(m, \ell) \ll n$. Hence, the overall complexity of computing the predictive mean of SCGP is now $\mathcal{O}(\kappa^3+n \cdot \kappa^2)$, which is much more efficient than CGPT's $\mathcal{O}(n^3)$.

\begin{remark}
Both sets of inducing inputs $\{\mathbf{s}_1, \mathbf{s}_2, \ldots, \mathbf{s}_m\}$ and $\{\mathbf{s}'_1, \mathbf{s}'_2, \ldots, \mathbf{s}'_{\ell}\}$ can be treated as part of the kernel parameters in our SCGP scheme and can be learned together with other kernel parameters while we optimize the corresponding augmented loss of SCGPT, whose details are deferred to Appendix~\ref{sec: SCGP objective} due to limited space.
\end{remark}


\iffalse
We provide here the high level derivation of the sparse predictive mean while the full details of our derivations can be found in Appendix \ref{sec: predictive mean}.  Consider the distribution $p(\mathbf{z}_q \mid \mathbf{z}_o)$, using DTC, we can formulate the distribution as an integral over the latent inducing variable $\mathbf{z}_m$. We can design the inducing variable $\mathbf{z}_m$ as an $m$-dimensional tensor where $m << n$
\begin{eqnarray}
    p(\mathbf{z}_q \mid \mathbf{z}_o) &=& \int_{\mathbf{z}_m} p(\mathbf{z}_q \mid \mathbf{z}_m) p(\mathbf{z}_m \mid \mathbf{z}_o) \mathrm{d} \mathbf{z}_m
\end{eqnarray}
where $p(\mathbf{z}_q \mid \mathbf{z}_m)$ and $ p(\mathbf{z}_m \mid \mathbf{z}_o)$ are analytically known to be Gaussians, following the DTC scheme. From here we can take the expectation of both side w.r.t $\mathbf{z}_q$ and obtain the closed form of the expectation $ \mathbb{E}(\mathbf{z}_q\mid\mathbf{z}_o)$

\begin{align} \label{eq: predictive step 2}
    \mathbb{E}(\mathbf{z}_q\mid\mathbf{z}_o) = \frac{1}{\sigma^2} \mathcal{K}_{qm}\Big (\mathcal{K}_{mm} + \frac{1}{\sigma^2} \mathcal{K}_{mo}\mathcal{K}_{om}\Big )^{-1}\mathcal{K}_{mo}\mathbf{z}_o
\end{align}

In a similar fashion, we can find the expectation of $\mathbf{z}_o\mid\mathbf{z}_k$ 
\begin{align} \label{eq: predictive step 3}
    \mathbb{E}(\mathbf{z}_o\mid\mathbf{z}_k) =  \frac{1}{\sigma^2} \mathcal{K}_{ol}\Big (\mathcal{K}_{ll} + \frac{1}{\sigma^2} \mathcal{K}_{lk}\mathcal{K}_{kl}\Big )^{-1}\mathcal{K}_{lk}\mathbf{z}_k
\end{align}
where $\mathbf{z}_l$ is another latent inducing variable of size $m$. Combining \eqref{eq: predictive step 1}, \eqref{eq: predictive step 2} and \eqref{eq: predictive step 3}, we have the closed form for the SCGPT predictive mean
\begin{equation}
    \begin{aligned}
       \mathbb{E}(\mathbf{z}_q\mid\mathbf{z}_k) &= \frac{1}{\sigma^4} \mathcal{K}_{qm}\Big (\mathcal{K}_{mm} + \frac{1}{\sigma^2} \mathcal{K}_{mo}\mathcal{K}_{om}\Big )^{-1}\mathcal{K}_{mo} \times \\ 
       &\mathcal{K}_{ol}\Big (\mathcal{K}_{ll} + \frac{1}{\sigma^2} \mathcal{K}_{lk}\mathcal{K}_{kl}\Big )^{-1}\mathcal{K}_{lk}\mathbf{z}_k
    \end{aligned}
\end{equation}
With the closed form of the SCGPT predictive mean, we can draw a correspondence with kernel attention as follow. We have the output of kernel attention given as
\begin{eqnarray}
\mathbf{V}^+ &=& \mathcal{K}\mathbf{V} \ \ =\ \ \mathcal{K}\mathbf{X}\mathbf{W}_v^\top.
\end{eqnarray}
Let $\mathcal{K} =  \frac{1}{\sigma^4} \mathcal{K}_{qm}\Big (\mathcal{K}_{mm} + \frac{1}{\sigma^2} \mathcal{K}_{mo}\mathcal{K}_{om}\Big )^{-1}\mathcal{K}_{mo}
\mathcal{K}_{ol}\Big (\mathcal{K}_{ll} + \frac{1}{\sigma^2} \mathcal{K}_{lk}\mathcal{K}_{kl}\Big )^{-1}\mathcal{K}_{lk}$, we have
\begin{equation}
    \begin{aligned}
       \mathbf{V}^+&=\frac{1}{\sigma^4} \mathcal{K}_{qm}\Big (\mathcal{K}_{mm} + \frac{1}{\sigma^2} \mathcal{K}_{mo}\mathcal{K}_{om}\Big )^{-1}\mathcal{K}_{mo} \times \\
&\mathcal{K}_{ol}\Big (\mathcal{K}_{ll} + \frac{1}{\sigma^2} \mathcal{K}_{lk}\mathcal{K}_{kl}\Big )^{-1}\mathcal{K}_{lk} \mathbf{Z},
    \end{aligned}
\end{equation}
% \begin{equation}
%     \begin{aligned}
%        \mathbf{V}^+& = \mathcal{K}\mathbf{V} = \mathcal{K}\mathbf{X}\mathbf{W}_v^\top \\
%        &=\frac{1}{\sigma^4} \mathcal{K}_{qm}\Big (\mathcal{K}_{mm} + \frac{1}{\sigma^2} \mathcal{K}_{mo}\mathcal{K}_{om}\Big )^{-1}\mathcal{K}_{mo} \times \\
% &\mathcal{K}_{ol}\Big (\mathcal{K}_{ll} + \frac{1}{\sigma^2} \mathcal{K}_{lk}\mathcal{K}_{kl}\Big )^{-1}\mathcal{K}_{lk} \mathbf{Z},
%     \end{aligned}
% \end{equation}
where $\mathbf{Z} \triangleq  \mathbf{X}\mathbf{W}_v^\top$.
\fi


