% \subsection{Variational inference...}
%     \textbf{Variational inference}. To do variational inference, let us define $m \leq n$ auxiliary variables $\textbf{u} \in \mathbb{R}^m$ where $u_i = s(k_i)$. We can view the keys $(\textbf{k}_1,\ldots, \textbf{k}_m)$ as the inducing points in variational inference of the standardized GP model (\ref{eq: standardized GP model}). The distribution of $\textbf{u}$ is Gaussian:
%     \begin{align*}
%         p(\textbf{u}) = \mathcal{N}(\textbf{0}, \textbf{K}_{\textbf{uu}}),
%     \end{align*}
%     where $[\textbf{K}_{\textbf{uu}}]_{ij}= k_s(\textbf{k}_i, \textbf{k}_j)$.
%     The cross-covariance between input $\textbf{x}_i$ and inducing key $\textbf{k}_j$ is calculated as:
%     \begin{align}
%         k_{f_Q,u} (\textbf{x}_i, \textbf{k}_j) &= E[\sigma_{f_Q}s(\textbf{x}_i\textbf{W}_Q^T)s(\textbf{k}_j)] \\
%         &= E[\sigma_{f_Q}s(\textbf{x}_i\textbf{W}_Q^T)s(\textbf{x}_j\textbf{W}_K^T)]\\
%         &=\sigma_{f_Q}exp \Big( \frac{-1}{2} ||\textbf{x}_i\textbf{W}_Q^T-\textbf{x}_j\textbf{W}_K^T ||^2 \Big)=\sigma_{f_Q} k_s(\textbf{q}_i, \textbf{k}_j),
%     \end{align}
%     where $i \in \{1,\ldots,n\}$ and $j \in \{1,\ldots,m \}$. Using this cross-covariance function, we can compute the covariance matrix $\textbf{K}_{\textbf{f}_Q, \textbf{u}} \in \mathbb{R}^{n\times m}$.
%     \par \textcolor{blue}{Long: Can we use variational inference to learn the inducing keys?} 
%     \par The conditional Gaussian distribution is given by:
%     \begin{align}
%         p(\textbf{f}_Q|\textbf{u}) = \mathcal{N}(\textbf{K}_{\textbf{f}_Q, \textbf{u}} \textbf{K}_{\textbf{u}, \textbf{u}}^{-1}\textbf{u}, \textbf{K}_{\textbf{f}_Q, \textbf{f}_Q} + \textbf{K}_{\textbf{f}_Q, \textbf{u}} \textbf{K}_{\textbf{u}, \textbf{u}}^{-1}\textbf{K}_{\textbf{f}_Q, \textbf{u}}^T)
%     \end{align}
%     Consider the mean of $p(\textbf{f}_Q|\textbf{u})$, $\textbf{m} = \textbf{K}_{\textbf{f}_Q, \textbf{u}} \textbf{K}_{\textbf{u}, \textbf{u}}^{-1}\textbf{u} $, if we set the values vector $\textbf{v}:=\textbf{K}_{\textbf{f}_Q, \textbf{u}} \textbf{K}_{\textbf{u}, \textbf{u}}^{-1}\textbf{u}$, we obtain the kernel attention with assymetric kernel $\textbf{K}_{\textbf{f}_Q, \textbf{u}}$.

% \textbf{Outline.} Firstly, in Section \ref{app:A}, we derive the analytical form of the CGP objective function in equation \eqref{eq:final_loss}. In Section \ref{appx:B}, we derive in details the predictive variance of CGPT for uncertainty calibration. Additional experiments conducted and the experimental settings are shown in Section \ref{app:C} and Section \ref{app:D}, respectively.

% \section{CGPT's workflow diagram}
% \begin{figure}[h!]
%     \centering
%     \includegraphics[width=\linewidth]{gptransformers/images/architecture.pdf}
%     \caption{Diagram of the training workflow of CGPT. Each attention block forwards the CGP's prediction to the next block and caches the prediction uncertainty into a CGP regularizing term (see Algorithm~\ref{alg:cap}). Once the attention output is propagated to the last classification block, the original transformer loss is computed and augmented with the CGP regularizing term. Gradient propagation from this augmented loss will help optimize the CGP parameters to reduce prediction uncertainty while maximizing predictive performance.}
%     \label{fig:diagram}
% \end{figure}

\section{Derivation of CGP Objective Function in Eq.~\ref{eq:expanded_loss}}
\label{app:A}
    \iffalse
    \textcolor{red}{
        This section further derives a more specific expression for our main objective function in Eq.~\eqref{eq:main_obj}, which is quoted below
        \begin{eqnarray} \label{eq:maximize}
        \min_\theta \Big\{ \mathfrak{L}(\theta) &\triangleq& \mathrm{loss}(\boldsymbol{\nu}_a) - \log p(\mathbf{z}_a) \ +\ \log p\left(\boldsymbol{\nu}_a \mid \mathbf{z}_a\right) \Big\} \ .
        \end{eqnarray}
        where $\boldsymbol{\nu}_a$ and $\mathbf{z}_a$ is defined previously in Eq.~\eqref{eq:cgp_attention}. 
        % Here, note that $\boldsymbol{\nu}_a$ is set to be the mean of the predictive Gaussian whose parameters are previously specified in Eq.~\eqref{eq:log Gaussian},
        \begin{eqnarray} 
        p(\boldsymbol{\nu} \mid \mathbf{z}_a) &=& \mathbb{N}\Bigg(\boldsymbol{\nu} \mid \mathcal{K}_{qk}\Big(\mathcal{K}_k + \sigma^2\mathbf{I}\Big)^{-1}\mathbf{z}_a, \mathcal{K}_q - \mathcal{K}_{qk}\Big(\mathcal{K}_k + \sigma^2\mathbf{I}\Big)^{-1}\mathcal{K}_{kq}\Bigg) \nonumber\\
        &=& \mathbb{N}\Bigg(\boldsymbol{\nu} \mid\boldsymbol{\nu}_a, \mathcal{K}_q - \mathcal{K}_{qk}\Big(\mathcal{K}_k + \sigma^2\mathbf{I}\Big)^{-1}\mathcal{K}_{kq}\Bigg)
        \end{eqnarray}
        Thus, let $\boldsymbol{\Sigma} \triangleq \mathcal{K}_q - \mathcal{K}_{qk}\Big(\mathcal{K}_k + \sigma^2\mathbf{I}\Big)^{-1}\mathcal{K}_{kq}$ which characterizes the attention output uncertainty, 
        \begin{eqnarray}
        \log p(\boldsymbol{\nu}_a \mid \mathbf{z}_a) &=& -\ \frac{1}{2}n\log(2\pi) \ -\  \frac{1}{2}\log \left|\mathbf{det}(\boldsymbol{\Sigma})\right|
        \end{eqnarray}
        Thus, plugging the above into the expression of $\mathfrak{L}(\theta)$ along with the fact that $\log\mathbb{N}(\mathbf{z}_a \mid \mathbf{0}, \mathcal{K}_k) = (-n/2)\log(2\pi) - (1/2)\log|\mathbf{det}(\mathcal{K}_k)| - (1/2)\mathbf{z}^\top_a\mathcal{K}_k^{-1}\mathbf{z}_a$, we have
        \begin{eqnarray}
        \hspace{-8mm}\mathfrak{L}(\theta) 
        \hspace{-2mm}&=&\hspace{-2mm} -\  \frac{1}{2} \log |\mathbf{det}(\mathcal{K}_k)| \ - \ \frac{1}{2} \mathbf{z}_a^\top \mathcal{K}_k^{-1} \mathbf{z}_a \ -\ \frac{1}{2} \log |\mathbf{det}(\boldsymbol{\Sigma})| -\ n\log(2\pi) -\ \alpha\cdot\mathrm{loss}(\boldsymbol{\nu}_a)
        \end{eqnarray}
        Maximizing $\mathfrak{L}(\theta)$ is thus equivalent to solving for 
        \begin{eqnarray}
        \hspace{-10.5mm}\theta_\ast &\triangleq&  \min_\theta \Bigg\{\mathfrak{L}'(\theta) \ \triangleq\  \log |\mathbf{det}(\mathcal{K}_k)| \ +\ \mathbf{z}_a^\top \mathcal{K}_k^{-1} \mathbf{z}_a \ +\  \log |\mathbf{det} (\boldsymbol{\Sigma})| \ +\ \alpha \cdot \mathrm{loss}(\boldsymbol{\nu}_a)\Bigg\}
        \end{eqnarray}
        which specifies a parameterization $\theta_\ast$ for the CGP-based attention that balances between (1) minimizing the output uncertainty, which is characterized by the first $3$ terms; and (2) minimizing the transformer's prediction loss, which is characterized by the last term.\\
         Practical Implementation. In practice, we find that the above can be further relaxed to increase the flexibility in balancing between minimizing attention uncertainty and prediction loss via augmenting the above minimization task with an additional parameterized surrogate $\boldsymbol{\nu}(\mathbf{x}; \gamma)$ of the CGP-based attention output, which leads to improved overall performance. This process is detailed below
        \begin{eqnarray}
        \theta_\ast, \gamma_\ast \ \ =\ \  \min_{\theta,\gamma}\ \widehat{\mathfrak{L}}(\theta, \gamma) 
        \end{eqnarray}
        where the augmented minimization loss is defined as
        \begin{eqnarray} 
        \widehat{\mathfrak{L}}(\theta, \gamma) &\triangleq& \Big(\boldsymbol{\nu}(\mathbf{x}; \gamma) - \boldsymbol{\nu}_a\Big)^\top\mathbf{\Sigma}^{-1}\Big(\boldsymbol{\nu}(\mathbf{x}; \gamma) - \boldsymbol{\nu}_a\Big) \ +\ \alpha \cdot \mathrm{loss}(\boldsymbol{\nu}(\mathbf{x}; \gamma))\nonumber\vspace{4mm}\\
        &+& \log \Big|\mathbf{det}(\mathcal{K}_k)\Big| \ +\  \log \Big|\mathbf{det} (\boldsymbol{\Sigma})\Big| \ +\  \mathbf{z}_a^\top \mathcal{K}_k^{-1} \mathbf{z}_a 
        \end{eqnarray}
        This is in fact equivalent to 
        \begin{eqnarray} \label{eq:maximize_augmented}
        \max_{\theta,\gamma} \Big\{ \log p(\mathbf{z}_a) \ +\ \log p\left(\boldsymbol{\nu}(\mathbf{x};\gamma) \mid \mathbf{z}_a\right) \ - \  \alpha \cdot \mathrm{loss}(\boldsymbol{\nu}(\mathbf{x};\gamma))\Big\} \ .
        \end{eqnarray}
        which reduces to the original maximization task in Eq.~\eqref{eq:maximize} when $\gamma$ is selected such that $\boldsymbol{\nu}_a = \boldsymbol{\nu}(\mathbf{x};\gamma)$. Otherwise, if we optimize for $\gamma$ along with $\theta$, it will provide a better handle to balance between minimizing attention uncertainty and prediction loss (as verified empirically).
    }
    \fi 
    
This section derives a more specific expression for our objective function in Eq.~\eqref{eq:expanded_loss}, which is quoted below
\begin{eqnarray} \label{eq:maximize}
\min_\theta \Big\{ \mathfrak{L}(\theta) &\triangleq& \mathrm{loss}(\boldsymbol{\nu}_a) - \alpha \cdot \Big(\log \mathbb{E}_{\mathbf{z}_o}\big[p(\mathbf{z}_q = \boldsymbol{\nu}_a \mid \mathbf{z}_o)\big] \ +\ \log \mathbb{E}_{\mathbf{z}_o}\big[p(\mathbf{z}_k \mid \mathbf{z}_o)\big]\Big) \Big\}  \ .
\end{eqnarray}
where $\boldsymbol{\nu}_a$ and $\mathbf{z}_a$ are defined previously in Eq.~\eqref{eq:cgp_attention}. \\\\
% Here, note that $\boldsymbol{\nu}_a$ is set to be the mean of the predictive Gaussian whose parameters are previously specified in Eq.~\eqref{eq:log Gaussian}.\\\\
To sidestep the intractability of $\log \mathbb{E}[p(\mathbf{z}_q = \boldsymbol{\nu}_a \mid \mathbf{z}_o)]$ and $\log \mathbb{E}[p(\mathbf{z}_k \mid \mathbf{z}_o)]$, we will instead optimize their lower bounds as follows. First, recall that
\begin{eqnarray}
p\Big(\mathbf{z}_q = \boldsymbol{\nu}_a \mid \mathbf{z}_o\Big)  
&=&  \mathcal{N}\Big(\boldsymbol{\nu}_a\ ;\  \mathcal{K}_{qo}\left(\mathcal{K}_o + \sigma^2 \mathbf{I}\right)^{-1}\mathbf{z}_o, \ \mathcal{K}_q-\mathcal{K}_{qo}\left(\mathcal{K}_o+\sigma^2 \mathbf{I}\right)^{-1}\mathcal{K}_{oq}\Big),
\end{eqnarray}
which follows from the CGP definition. Next, let $\boldsymbol{\Sigma}_{q} \triangleq \mathcal{K}_q-\mathcal{K}_{qo}\left(\mathcal{K}_o+\sigma^2 \mathbf{I}\right)^{-1}\mathcal{K}_{oq}$ and $\mathbf{m}_q \triangleq \mathcal{K}_{qo}\left(\mathcal{K}_o + \sigma^2 \mathbf{I}\right)^{-1}\mathbf{z}_o$. Using Jensen's inequality, 
\begin{eqnarray}
\hspace{-9mm}\log \mathbb{E}\big[p(\mathbf{z}_q = \boldsymbol{\nu}_a\mid\mathbf{z}_o)\big] &\geq& \mathbb{E}\big[\log p(\mathbf{z}_q = \boldsymbol{\nu}_a\mid\mathbf{z}_o)\big] \\
\hspace{-2mm}&=&\hspace{-2mm} 0.5 \cdot \mathbb{E}_{\mathbf{z}_o}\Big[-(\boldsymbol{\nu}_a-\mathbf{m}_q)^\top \boldsymbol{\Sigma}_{q}^{-1}(\boldsymbol{\nu}_a-\mathbf{m}_{q}) - \log \mathrm{det}\big(\boldsymbol{\Sigma}_{q}\big)-n\log 2\pi\Big]\\
\hspace{-2mm}&=&\hspace{-2mm} -0.5\cdot\int_{\mathbf{z}_o} p(\mathbf{z}_o)\Big[(\boldsymbol{\nu}_a - \mathbf{m}_{q})^\top \boldsymbol{\Sigma}_{q}^{-1}(\boldsymbol{\nu}_a-\mathbf{m}_{q}) + \log \mathrm{det}(\boldsymbol{\Sigma}_{q})+n\log 2\pi\Big]\mathrm{d}\mathbf{z}_o\\
\hspace{-2mm}&=&\hspace{-2mm} -0.5\cdot\int_{\mathbf{z}_o} p(\mathbf{z}_o)\Big[(\boldsymbol{\nu}_a-\mathbf{m}_{q})^\top \boldsymbol{\Sigma}_{q}^{-1}(\boldsymbol{\nu}_a-\mathbf{m}_{q})\Big]\mathrm{d}\mathbf{z}_o -0.5 \cdot \Big(\log \mathrm{det}(\boldsymbol{\Sigma}_{q}) + n\log 2\pi\Big) \ .
\end{eqnarray}  
Finally, the integral in the above lower-bound can be approximated arbitrarily closely via an empirical average based on a sufficiently large number of samples $\mathbf{z}_o^i \sim p(\mathbf{z}_o) =\mathcal{N}(\mathbf{0}, \mathcal{K}_o)$. Thus, approximately, we have the following lower-bound
\begin{eqnarray}
\hspace{-14mm}\log \mathbb{E}\Big[p(\mathbf{z}_q = \boldsymbol{\nu}_a\mid\mathbf{z}_o)\Big] \hspace{-2mm}&\geq&\hspace{-2mm} -0.5\cdot \frac{1}{n}\sum_{i=1}^n \Big[\left(\boldsymbol{\nu}_a-\mathbf{m}_{q}^i\right)^\top \boldsymbol{\Sigma}_{q}^{-1}\left(\boldsymbol{\nu}_a-\mathbf{m}_{q}^i\right)\Big]  - 0.5\cdot \log \mathrm{det}(\boldsymbol{\Sigma}_{q}) - 0.5\cdot n\log 2\pi,
\end{eqnarray}
where $\mathbf{m}_q^i \triangleq \mathcal{K}_{qo}\left(\mathcal{K}_o + \sigma^2 \mathbf{I}\right)^{-1}\mathbf{z}_o^i$.
Likewise, we can also lower bound $\log \mathbb{E}[p(\mathbf{z}_k\mid\mathbf{z}_o)]$:
\begin{eqnarray}
\hspace{-22mm}\log \mathbb{E}\Big[p(\mathbf{z}_k\mid\mathbf{z}_o)\Big] \hspace{-2mm}&\geq&\hspace{-2mm} -0.5\cdot \frac{1}{n}\sum_{i=1}^n \Big[\left(\mathbf{z}_k-\mathbf{m}_k^i\right)^\top \boldsymbol{\Sigma}_{k}^{-1}\left(\mathbf{z}_k-\mathbf{m}_k^i\right)\Big]  - 0.5 \cdot \log \mathrm{det}(\boldsymbol{\Sigma}_{k}) - 0.5 \cdot n\log 2\pi,
\end{eqnarray}
where $\mathbf{m}_k^i \triangleq \mathcal{K}_{ko}\left(\mathcal{K}_o + \sigma^2 \mathbf{I}\right)^{-1}\mathbf{z}_o^i$ and $\boldsymbol{\Sigma}_{k} \triangleq \mathcal{K}_k-\mathcal{K}_{ko}\left(\mathcal{K}_o+\sigma^2 \mathbf{I}\right)^{-1}\mathcal{K}_{ok}$. Therefore, our CGP objective becomes
\begin{equation} \label{eq:maximize full}
\begin{aligned}
\max_\theta \Bigg\{ \hat{\mathfrak{L}}(\theta) &\triangleq  
\alpha \Big(-\frac{1}{n}\sum_{i=1}^n \Big[\left(\boldsymbol{\nu}_a-\mathbf{m}_q^i\right)^\top \boldsymbol{\Sigma}_{q}^{-1}\left(\boldsymbol{\nu}_a-\mathbf{m}_{q}^i\right)\Big]  -\log \mathrm{det}(\boldsymbol{\Sigma}_{q})\\
- \frac{1}{n}&\sum_{i=1}^n \Big[\left(\mathbf{z}_k-\mathbf{m}_k^i\right)^\top \boldsymbol{\Sigma}_{k}^{-1}\left(\mathbf{z}_k-\mathbf{m}_k^i\right)\Big]  - \log \mathrm{det}(\boldsymbol{\Sigma}_{k}) \Big) - \mathrm{loss}(\boldsymbol{\nu}_a) \Bigg\} \ .
\end{aligned}
\end{equation}
    
    \iffalse
    Thus, our original maximization task in \eqref{eq:maximize} can be cast into the following minimization problem
    \begin{equation}
        \begin{aligned}
            \min_\theta \Big\{ \mathfrak{L}'(\theta) \ \ &\triangleq \ \ \log |\mathbf{det}(\mathcal{K}_k)| \ + \ \log |\mathbf{det}(\boldsymbol{\Sigma})| \ +\ \mathbf{z}_a^\top \mathcal{K}_k^{-1} \mathbf{z}_a \\ \ \ &+ \ \ (\boldsymbol{\nu} \ - \ \boldsymbol{\nu}_a)^\top \boldsymbol{\Sigma}^{-1} (\boldsymbol{\nu} \ - \ \boldsymbol{\nu}_a) \ + \  2\alpha \cdot \mathrm{loss}(\boldsymbol{\nu}_a) \Big\}
        \end{aligned}\label{eq:minimize}
    \end{equation}
    \fi
    
    %if the family of surrogate parameters $\gamma$ is sufficiently expressive such that the optimal parameter $\gamma$ that minimizes the prediction loss $\mathrm{loss}(\boldsymbol{\nu}(\mathbf{x};\gamma))$ is also approximately close to the optimal parameter that minimizes the output uncertainty $\boldsymbol{\nu}(\mathbf{x};\gamma) \simeq \boldsymbol{\nu}_a$.

    
    %to increase the flexibility in balancing between fitting a CGP-based attention unit with low uncertainty\footnote{This is achieved via minimizing exclusively the first $2$ terms in Eq.~\eqref{eq:main_obj} or equivalently, the first $4$ terms in Eq.~\eqref{eq:minimize}} and fitting the attention output to minimize the prediction loss, we further parameterize $\boldsymbol{\nu}$ as a learnable function of the queries.
    
        

\section{Analytic Form of CGPT's Predictive Variance from Section \ref{sec:learning kernel}} \label{appx:B}
In Eq.~\eqref{eq: exp closed form}, we have derived the expectation $\mathbb{E}[\mathbf{z}_q \mid \mathbf{z}_k]$ of the CGP model, which then can be modeled as the predictive mean of CGPT in equation \eqref{eq:cgp_attention}. To perform uncertainty calibration, we need to further derive the predictive variance of $\mathbf{z}_q \mid \mathbf{z}_k$. We have the following identity:
\begin{eqnarray} \label{eq: variance formula}
\mathbb{V}[\mathbf{z}_q \mid \mathbf{z}_k] &=& \mathbb{E}\left[\mathbf{z}_q\mathbf{z}_q^\top\mid \mathbf{z}_k\right] -  \mathbb{E}\left[\mathbf{z}_q\mid\mathbf{z}_k\right]\cdot \mathbb{E}\left[\mathbf{z}_q\mid\mathbf{z}_k\right]^\top,
\end{eqnarray}
where $\mathbb{E}[\mathbf{z}_q|\mathbf{z}_k]$ is the predictive mean given in \eqref{eq: exp closed form} and $\mathbb{E}\left[\mathbf{z}_q\mathbf{z}_q^\top\mid \mathbf{z}_k\right]$ is given by the following integral, 
\begin{eqnarray} 
\label{eq:temp1}
\mathbb{E}\Big[\mathbf{z}_q\mathbf{z}_q^\top\mid \mathbf{z}_k\Big] &=& \int_{\mathbf{z}_q} \mathbf{z}_q\mathbf{z}_q^\top
\Bigg(\int_{\mathbf{z}_o}p\left(\mathbf{z}_q\mid \mathbf{z}_o\right)p(\mathbf{z}_o\mid\mathbf{z}_k)\mathrm{d}\mathbf{z}_o\Bigg)\mathrm{d}\mathbf{z}_q \\
&=& \int_{\mathbf{z}_o}\int_{\mathbf{z}_q} \mathbf{z}_q\mathbf{z}_q^\top p\left(\mathbf{z}_q\mid\mathbf{z}_o\right)p(\mathbf{z}_o\mid\mathbf{z}_k)\mathrm{d}\mathbf{z}_q\mathrm{d}\mathbf{z}_o\\
&=&\int_{\mathbf{z}_o}\mathbb{E}\Big[\mathbf{z}_q\mathbf{z}_q^\top\mid\mathbf{z}_o\Big]p(\mathbf{z}_o\mid\mathbf{z}_k)\mathrm{d}\mathbf{z}_o = \mathbb{E}\Big[\mathbb{E}\Big[\mathbf{z}_q\mathbf{z}_q^\top\mid\mathbf{z}_o\Big]\mid\mathbf{z}_k\Big].
\end{eqnarray}
By the canonical representation of GP, we have 
\begin{eqnarray}
\mathbf{z}_q\mid\mathbf{z}_o &\sim& \mathcal{N}\Big(\mathcal{K}_{qo}(\mathcal{K}_o+\sigma^2\mathbf{I})^{-1}\mathbf{z}_o, \mathcal{K}_q-\mathcal{K}_{qo}(\mathcal{K}_o+\sigma^2\mathbf{I})^{-1}\mathcal{K}_{oq}\Big).
\end{eqnarray}    
Thus, using the identity $\mathbb{E}(\mathbf{x}\mathbf{x}^\top) \ = \  \boldsymbol{\Sigma} \ + \ \mathbf{m}\mathbf{m}^\top \text{ for } \mathbf{x} \sim \mathcal{N}(\mathbf{m}, \boldsymbol{\Sigma})$ we have,
\begin{eqnarray}
\mathbb{E}\Big[\mathbf{z}_q\mathbf{z}_q^\top\mid \mathbf{z}_o\Big]
&=& \mathcal{K}_q-\mathcal{K}_{qo}(\mathcal{K}_o+\sigma^2 \mathbf{I})^{-1}\mathcal{K}_{oq} + \mathcal{K}_{qo}(\mathcal{K}_o+\sigma^2\mathbf{I})^{-1}\mathbf{z}_o\mathbf{z}_o^\top (\mathcal{K}_o+\sigma^2\mathbf{I})^{-1} \mathcal{K}_{oq} \ .
\end{eqnarray}
Next, taking the expectation of $\mathbb{E}[\mathbf{z}_q\mathbf{z}_q^\top|\mathbf{z}_o]$ with respect to $\mathbf{z}_o\mid \mathbf{z}_k$, gives
\begin{eqnarray} \label{eq:temp2}
\hspace{-9mm}\mathbb{E}_{\mathbf{z}_o\mid\mathbf{z}_k}\Big[\mathbb{E}\left[\mathbf{z}_q\mathbf{z}_q^\top\mid \mathbf{z}_o\right]\Big] &=& \mathcal{K}_q-\mathcal{K}_{qo}(\mathcal{K}_o+\sigma^2 \mathbf{I})^{-1}\mathcal{K}_{oq}  
\ +\ \mathcal{K}_{qo}(\mathcal{K}_o+\sigma^2 \mathbf{I})^{-1}\mathbb{E}\left[\mathbf{z}_o\mathbf{z}_o^\top\mid \mathbf{z}_k\right] (\mathcal{K}_o+\sigma^2\mathbf{I})^{-1} \mathcal{K}_{oq}.
\end{eqnarray}
Note that $\mathbf{z}_o\mid\mathbf{z}_k \sim \mathcal{N}(\mathcal{K}_{ok}(\mathcal{K}_k+\sigma^2\mathbf{I})^{-1}\mathbf{z}_k, \mathcal{K}_o - \mathcal{K}_{ok}(\mathcal{K}_k+\sigma^2\mathbf{I})^{-1}\mathcal{K}_{ko})$ due to the canonical GP representation. Thus, we have
\begin{eqnarray} 
\label{eq:temp3}
\mathbb{E}\Big[\mathbf{z}_o\mathbf{z}_o^\top\mid \mathbf{z}_k\Big] &=& \mathcal{K}_o - \mathcal{K}_{ok}(\mathcal{K}_k+\sigma^2 \mathbf{I})^{-1}\mathcal{K}_{ko} + \mathcal{K}_{ok}(\mathcal{K}_k+\sigma^2\mathbf{I})^{-1}\mathbf{z}_k \mathbf{z}_k^\top (\mathcal{K}_k+\sigma^2\mathbf{I})^{-1} \mathcal{K}_{ko} \ .
\end{eqnarray}
Hence, we can obtain the closed form of the predictive variance $\mathbb{V}[\mathbf{z}_q \mid \mathbf{z}_k]$ by putting together Eq.~\eqref{eq: variance formula}, Eq.~\eqref{eq: exp closed form}, Eq.~\eqref{eq:temp1}, Eq.~\eqref{eq:temp2} and Eq.~\eqref{eq:temp3}. This consequently allows us to perform uncertainty calibration for the CGP-based attention unit's output analytically.




\section{Sparse CGPT Predictive Mean} \label{sec: predictive mean}
We need to find the predictive mean,
\begin{eqnarray}
\hspace{-13mm}\mathbb{E}\Big[\mathbf{z}_q \mid \mathbf{z}_k\Big] &=& \mathbb{E}_{\mathbf{z}_o \sim p(\mathbf{z}_o \mid \mathbf{z}_k)}\Bigg[\mathbb{E}\Big[\mathbf{z}_q \mid \mathbf{z}_o\Big] \mid \mathbf{z}_k\Bigg].\label{eq:pred}
\end{eqnarray}
The distribution $\mathbf{z}_q \mid \mathbf{z}_o$ can be approximated using sparse GP techniques, such as DTC,
\begin{eqnarray}
    p(\mathbf{z}_q \mid \mathbf{z}_o) &=& \int_{\mathbf{z}_m} p(\mathbf{z}_q \mid \mathbf{z}_m) p(\mathbf{z}_m \mid \mathbf{z}_o) \mathrm{d} \mathbf{z}_m,
\end{eqnarray}
where $p(\mathbf{z}_m \mid \mathbf{z}_o)$ is the inducing posterior and has the form
\begin{equation}
    \begin{aligned}
    p(\mathbf{z}_m \mid \mathbf{z}_o) &= \mathbb{N} \Big (\mathbf{z}_m \mid 
    \frac{1}{\sigma^2} \mathcal{K}_{mm}\Big (\mathcal{K}_{mm} + \frac{1}{\sigma^2} \mathcal{K}_{mo}\mathcal{K}_{om}\Big )^{-1}\mathcal{K}_{mo}\mathbf{z}_o, \\
    &\qquad \mathcal{K}_{mm} \Big (\mathcal{K}_{mm} + \frac{1}{\sigma^2} \mathcal{K}_{mo}\mathcal{K}_{om}\Big )^{-1}\mathcal{K}_{mm} \Big ).
    \end{aligned}
\end{equation}
The distribution $p(\mathbf{z}_q \mid \mathbf{z}_m)$ has the form $\mathbb{N}(\mathbf{z}_q \mid \mathcal{K}_{qm}\mathcal{K}_{mm}^{-1}\mathbf{z}_m, \sigma^2 \mathbf{I})$. Therefore, with some algebra, we can calculate the expectation of $\mathbf{z}_q\mid\mathbf{z}_o$,
\begin{equation}
    \begin{aligned}
        \mathbb{E}(\mathbf{z}_q\mid\mathbf{z}_o) &=  \int_{\mathbf{z}_q}\mathbf{z}_q\int_{\mathbf{z}_m} p(\mathbf{z}_q \mid \mathbf{z}_m) p(\mathbf{z}_m \mid \mathbf{z}_o) \mathrm{d} \mathbf{z}_m \mathrm{d} \mathbf{z}_q \\
        &=\int_{\mathbf{z}_m}\Big(\int_{\mathbf{z}_q} \mathbf{z}_q p(\mathbf{z}_q \mid \mathbf{z}_m) \mathrm{d} \mathbf{z}_q\Big) p(\mathbf{z}_m \mid \mathbf{z}_o) \mathrm{d} \mathbf{z}_m \\
        &= \int_{\mathbf{z}_m} \mathcal{K}_{qm}\mathcal{K}_{mm}^{-1}\mathbf{z}_m p(\mathbf{z}_m \mid \mathbf{z}_o) \mathrm{d} \mathbf{z}_m  \\
        &=\frac{1}{\sigma^2} \mathcal{K}_{qm}\mathcal{K}_{mm}^{-1} \mathcal{K}_{mm}\Big (\mathcal{K}_{mm} + \frac{1}{\sigma^2} \mathcal{K}_{mo}\mathcal{K}_{om}\Big )^{-1}\mathcal{K}_{mo}\mathbf{z}_o \\
        &= \frac{1}{\sigma^2} \mathcal{K}_{qm}\Big (\mathcal{K}_{mm} + \frac{1}{\sigma^2} \mathcal{K}_{mo}\mathcal{K}_{om}\Big )^{-1}\mathcal{K}_{mo}\mathbf{z}_o.
    \end{aligned}
\end{equation}
    
In a similar fashion, we can find the expectation of $\mathbf{z}_o\mid\mathbf{z}_k$ 
\begin{eqnarray}
    \mathbb{E}(\mathbf{z}_o\mid\mathbf{z}_k) &=&  \frac{1}{\sigma^2} \mathcal{K}_{ol}\Big (\mathcal{K}_{ll} + \frac{1}{\sigma^2} \mathcal{K}_{lk}\mathcal{K}_{kl}\Big )^{-1}\mathcal{K}_{lk}\mathbf{z}_k.
\end{eqnarray}
Since $\mathbf{z}_q\mid\mathbf{z}_o$ and $\mathbf{z}_o\mid\mathbf{z}_k$ are Gaussians, we can analytically calculate
\begin{equation}
    \begin{aligned}
        \mathbb{E}(\mathbf{z}_q\mid\mathbf{z}_k) &= \frac{1}{\sigma^2} \mathcal{K}_{qm}\Big (\mathcal{K}_{mm} + \frac{1}{\sigma^2} \mathcal{K}_{mo}\mathcal{K}_{om}\Big )^{-1}\mathcal{K}_{mo} \cdot \frac{1}{\sigma^2} \mathcal{K}_{ol}\Big (\mathcal{K}_{ll} + \frac{1}{\sigma^2} \mathcal{K}_{lk}\mathcal{K}_{kl}\Big )^{-1}\mathcal{K}_{lk}\mathbf{z}_k \\
        &= \frac{1}{\sigma^4} \mathcal{K}_{qm}\Big (\mathcal{K}_{mm} + \frac{1}{\sigma^2} \mathcal{K}_{mo}\mathcal{K}_{om}\Big )^{-1}\mathcal{K}_{mo} \mathcal{K}_{ol}\Big (\mathcal{K}_{ll} + \frac{1}{\sigma^2} \mathcal{K}_{lk}\mathcal{K}_{kl}\Big )^{-1}\mathcal{K}_{lk}\mathbf{z}_k.
        % &= \frac{1}{\sigma^4} \mathcal{K}_{qm}\Big (\mathcal{K}_{mm} + \frac{1}{\sigma^2} \mathcal{K}_{mo}\mathcal{K}_{om}\Big )^{-1}\mathcal{K}_{mo} \mathcal{K}_{ol}\Big (\mathcal{K}_{ll} + \frac{1}{\sigma^2} \mathcal{K}_{lk}\mathcal{K}_{kl}\Big )^{-1}\mathcal{K}_{lk}(\mathcal{K}_k+\sigma^2\mathbf{I})(\mathcal{K}_k+\sigma^2\mathbf{I})^{-1}\mathbf{z}_k \\
        % &= \frac{1}{\sigma^4} \mathcal{K}_{qm}\Big (\mathcal{K}_{mm} + \frac{1}{\sigma^2} \mathcal{K}_{mo}\mathcal{K}_{om}\Big )^{-1}\mathcal{K}_{mo}\cdot \mathcal{K}_{ol}  \Big(\mathcal{K}_{ll} + \frac{1}{\sigma^2} \mathcal{K}_{lk}\mathcal{K}_{kl}\Big )^{-1} \mathcal{K}_{lk}(\mathcal{K}_k+\sigma^2\mathbf{I})\mathbf{v}
    \end{aligned}
\end{equation}

\section{Sparse CGPT Predictive Variance}
The variance of $\mathbf{z}_q\mid \mathbf{z}_k$ is given by
\begin{align} \label{eq: predictive variance}
    \mathbb{V}[\mathbf{z}_q\mid \mathbf{z}_k] = \mathbb{E}[\mathbf{z}_q\mathbf{z}_q^\top\mid \mathbf{z}_k] - \mathbb{E}[\mathbf{z}_q\mid \mathbf{z}_k]\mathbb{E}[\mathbf{z}_q\mid \mathbf{z}_k]^\top,
\end{align}
where $\mathbb{E}[\mathbf{z}_q\mid \mathbf{z}_k]$ is the predictive mean in Section \ref{sec: predictive mean}. The expectation of $\mathbf{z}_q\mathbf{z}_q^\top\mid\mathbf{z}_k$ is given by
\begin{align} \label{eq: quad zq|zk}
    \mathbb{E}[\mathbf{z}_q\mathbf{z}_q^\top|\mathbf{z}_k] = \mathbb{E}_{\mathbf{z}_o \sim p(\mathbf{z}_o \mid \mathbf{z}_k)}\Bigg[\mathbb{E}\Big[\mathbf{z}_q\mathbf{z}_q^\top \mid \mathbf{z}_o\Big] \mid \mathbf{z}_k\Bigg].
\end{align}
Consider,
\begin{align}   \mathbb{E}\Big[\mathbf{z}_q\mathbf{z}_q^\top \mid \mathbf{z}_o\Big] &=\int_{\mathbf{z}_q} \mathbf{z}_q\mathbf{z}_q^\top p(\mathbf{z}_q|\mathbf{z}_o)\mathrm{d}\mathbf{z}_q = \int_{\mathbf{z}_q} \mathbf{z}_q\mathbf{z}_q^\top \int_{\mathbf{z}_m}p(\mathbf{z}_q|\mathbf{z}_m)p(\mathbf{z}_m|\mathbf{z}_o) \mathrm{d}\mathbf{z}_m\mathrm{d}\mathbf{z}_q\\
&= \int_{\mathbf{z}_m}\Big(\int_{\mathbf{z}_q} \mathbf{z}_q\mathbf{z}_q^\top p(\mathbf{z}_q|\mathbf{z}_m)\mathrm{d}\mathbf{z}_q \Big)p(\mathbf{z}_m|\mathbf{z}_o)\mathrm{d}\mathbf{z}_m = \int_{\mathbf{z}_m}\mathbb{E}[\mathbf{z}_q\mathbf{z}_q^\top\mid\mathbf{z}_m]p(\mathbf{z}_m|\mathbf{z}_o)\mathrm{d}\mathbf{z}_m.
\end{align}

Since $p(\mathbf{z}_q\mid\mathbf{z}_m)=\mathbb{N}(\mathbf{z}_q \mid \mathcal{K}_{qm}\mathcal{K}_{mm}^{-1}\mathbf{z}_m, \sigma^2 \mathbf{I})$, we have,
\begin{align*}
    \mathbb{E}[\mathbf{z}_q\mathbf{z}_q^\top\mid\mathbf{z}_m] &= \sigma^2\mathbf{I} + \mathcal{K}_{qm}\mathcal{K}_{mm}^{-1}\mathbf{z}_m \mathbf{z}_m^\top\mathcal{K}_{mm}^{-1}\mathcal{K}_{mq}.
\end{align*}
Therefore,
\begin{equation}
    \begin{aligned} \label{eq:quad zq|zo}
    \mathbb{E}\Big[\mathbf{z}_q\mathbf{z}_q^\top \mid \mathbf{z}_o\Big] &= \int_{\mathbf{z}_m}(\sigma^2\mathbf{I} + \mathcal{K}_{qm}\mathcal{K}_{mm}^{-1}\mathbf{z}_m \mathbf{z}_m^\top\mathcal{K}_{mm}^{-1}\mathcal{K}_{mq})p(\mathbf{z}_m|\mathbf{z}_o)\mathrm{d}\mathbf{z}_m\\
    &= \sigma^2\mathbf{I} + \mathcal{K}_{qm}\mathcal{K}_{mm}^{-1} \int_{\mathbf{z}_m} \mathbf{z}_m \mathbf{z}_m^\top p(\mathbf{z}_m|\mathbf{z}_o)\mathrm{d}\mathbf{z}_m \mathcal{K}_{mm}^{-1}\mathcal{K}_{mq}\\
    &= \sigma^2\mathbf{I} + \mathcal{K}_{qm}\mathcal{K}_{mm}^{-1} \mathbb{E}[\mathbf{z}_m\mathbf{z}_m^\top\mid \mathbf{z}_o]\mathcal{K}_{mm}^{-1}\mathcal{K}_{mq}.
\end{aligned}
\end{equation}

Since $p(\mathbf{z}_m\mid \mathbf{z}_o)=\mathbb{N} \Big (\mathbf{z}_m \mid 
\frac{1}{\sigma^2} \mathcal{K}_{mm}\Big (\mathcal{K}_{mm} + \frac{1}{\sigma^2} \mathcal{K}_{mo}\mathcal{K}_{om}\Big )^{-1}\mathcal{K}_{mo}\mathbf{z}_o,
\mathcal{K}_{mm} \Big (\mathcal{K}_{mm} + \frac{1}{\sigma^2} \mathcal{K}_{mo}\mathcal{K}_{om}\Big )^{-1}\mathcal{K}_{mm} \Big )$, we have the following
\begin{equation} \label{eq: quad zm|zo}
    \begin{aligned}
    \mathbb{E}[\mathbf{z}_m\mathbf{z}_m^\top\mid \mathbf{z}_o] &= \mathcal{K}_{mm} \Big (\mathcal{K}_{mm} + \frac{1}{\sigma^2} \mathcal{K}_{mo}\mathcal{K}_{om}\Big )^{-1}\mathcal{K}_{mm} + \frac{1}{\sigma^4} \mathcal{K}_{mm}\Big (\mathcal{K}_{mm} + \frac{1}{\sigma^2} \mathcal{K}_{mo}\mathcal{K}_{om}\Big )^{-1}\mathcal{K}_{mo}\mathbf{z}_o \times \\
    &\mathbf{z}_o^\top \mathcal{K}_{om}\Big (\mathcal{K}_{mm} + \frac{1}{\sigma^2} \mathcal{K}_{mo}\mathcal{K}_{om}\Big )^{-1}\mathcal{K}_{mm}.
\end{aligned}
\end{equation}
Combining Eqs.~\eqref{eq:quad zq|zo} and \eqref{eq: quad zm|zo}, we have
\begin{equation} \label{eq: quad zq|zo full}
    \begin{aligned}
    \mathbb{E}\Big[\mathbf{z}_q\mathbf{z}_q^\top \mid \mathbf{z}_o\Big] &= \sigma^2\mathbf{I} + \mathcal{K}_{qm}\Big[\Big (\mathcal{K}_{mm} + \frac{1}{\sigma^2} \mathcal{K}_{mo}\mathcal{K}_{om}\Big )^{-1} + \frac{1}{\sigma^4} \Big (\mathcal{K}_{mm} + \frac{1}{\sigma^2} \mathcal{K}_{mo}\mathcal{K}_{om}\Big )^{-1} \mathcal{K}_{mo} \mathbf{z}_o \times \\
    &\mathbf{z}_o^\top\mathcal{K}_{om}\Big (\mathcal{K}_{mm} + \frac{1}{\sigma^2} \mathcal{K}_{mo}\mathcal{K}_{om}\Big )^{-1} \Big] \mathcal{K}_{mq}. 
\end{aligned}
\end{equation}

Plugging Eq.~\eqref{eq: quad zq|zo full} into Eq.~\eqref{eq: quad zq|zk}, we have
\begin{equation} \label{eq: quad zq|zk 2}
    \begin{aligned}
    \mathbb{E}[\mathbf{z}_q\mathbf{z}_q^\top|\mathbf{z}_k] 
    &= \sigma^2\mathbf{I} + \mathcal{K}_{qm}\Big[\Big (\mathcal{K}_{mm} + \frac{1}{\sigma^2} \mathcal{K}_{mo}\mathcal{K}_{om}\Big )^{-1} + \frac{1}{\sigma^4} \Big (\mathcal{K}_{mm} + \frac{1}{\sigma^2} \mathcal{K}_{mo}\mathcal{K}_{om}\Big )^{-1} \mathcal{K}_{mo} \times \\
    &\Big(\int_{\mathbf{z}_o}\mathbf{z}_o\mathbf{z}_o^\top p(\mathbf{z}_o\mid
    \mathbf{z}_k)\mathrm{d}\mathbf{z}_o\Big)\mathcal{K}_{om}\Big (\mathcal{K}_{mm} + \frac{1}{\sigma^2} \mathcal{K}_{mo}\mathcal{K}_{om}\Big )^{-1} \Big] \mathcal{K}_{mq}. 
\end{aligned}
\end{equation}

In a similar manner, we can calculate the integral
\begin{equation} \label{eq: quad zo|zk full}
    \begin{aligned}
    &\int_{\mathbf{z}_o}\mathbf{z}_o\mathbf{z}_o^\top p(\mathbf{z}_o\mid
    \mathbf{z}_k)\mathrm{d}\mathbf{z}_o =  \mathbb{E}[\mathbf{z}_o\mathbf{z}_o^\top|\mathbf{z}_k] \\
    &=\sigma^2\mathbf{I} + \mathcal{K}_{ol}\Big[\Big (\mathcal{K}_{ll} + \frac{1}{\sigma^2} \mathcal{K}_{lk}\mathcal{K}_{kl}\Big )^{-1} + \frac{1}{\sigma^4} \Big (\mathcal{K}_{ll} + \frac{1}{\sigma^2} \mathcal{K}_{lk}\mathcal{K}_{kl}\Big )^{-1} \mathcal{K}_{lk} \mathbf{z}_k\mathbf{z}_k^\top\mathcal{K}_{kl}\Big (\mathcal{K}_{ll} + \frac{1}{\sigma^2} \mathcal{K}_{lk}\mathcal{K}_{kl}\Big )^{-1} \Big] \mathcal{K}_{lo}. 
\end{aligned}
\end{equation}

From Eqs.~\eqref{eq: quad zo|zk full}, \eqref{eq: quad zq|zk 2} and \eqref{eq: predictive variance}, we obtain the full predictive variance of sparse CGPT.







\section{Derivation of Sparse CGP Loss Function} \label{sec: SCGP objective}
The objective function of Sparse CGP is given by
\begin{equation} \label{eq:maximize sparse}
    \begin{aligned}
        \min_\theta \Big\{ \mathfrak{L}(\theta) &\triangleq \mathrm{loss}(\boldsymbol{\nu}_a) - \alpha \cdot \Big(\log \mathbb{E}_{\mathbf{z}_o}\big[p(\mathbf{z}_q = \boldsymbol{\nu}_a \mid \mathbf{z}_o)\big] \ +\ \log \mathbb{E}_{\mathbf{z}_o}\big[p(\mathbf{z}_k \mid \mathbf{z}_o)\big]\Big) \Big\}.
        % &\alpha\Big(\log p(\mathbf{z}_o\mid \mathbf{X}_m) + \log p(\mathbf{z}_o\mid \mathbf{X}_l)\Big)\Big\}  \ .
    \end{aligned}
\end{equation} 



We will optimize the lower bounds of $\log \mathbb{E}_{\mathbf{z}_o}\big[p(\mathbf{z}_q = \boldsymbol{\nu}_a\mid \mathbf{z}_o)\big]$ and $\log \mathbb{E}_{\mathbf{z}_o}\big[p(\mathbf{z}_k \mid \mathbf{z}_o)\big]$. Considering $p(\mathbf{z}_q = \boldsymbol{\nu}_a \mid \mathbf{z}_o)$ first, we have
\begin{equation} \label{eq: log q}
    \begin{aligned}
        \log \mathbb{E}_{\mathbf{z}_o} \Big[ p(\mathbf{z}_q=\boldsymbol{\nu}_a \mid \mathbf{z}_o)\Big] 
        &\geq \mathbb{E}_{\mathbf{z}_o}\log \Big[ p(\mathbf{z}_q =\boldsymbol{\nu}_a\mid \mathbf{z}_o)\Big] = \mathbb{E}_{\mathbf{z}_o} \Big[\log \mathbb{E}_{\mathbf{z}_m\mid\mathbf{z}_o}[p(\mathbf{z}_q=\boldsymbol{\nu}_a\mid \mathbf{z}_m)] \Big] \\&\geq \mathbb{E}_{\mathbf{z}_o} \Big[ \mathbb{E}_{\mathbf{z}_m\mid\mathbf{z}_o}\Big[\log [p(\mathbf{z}_q=\boldsymbol{\nu}_a\mid \mathbf{z}_m)] \Big] \Big ],
    \end{aligned}
\end{equation}
where we use Jensen's inequality to lower bound the log expectation. Since $\mathbf{z}_q\mid \mathbf{z}_m \sim \mathbb{N}(\mathcal{K}_{qm}\mathcal{K}_{mm}^{-1}\mathbf{z}_m, \sigma^2\mathbf{I})$, we have the following
\begin{eqnarray} \label{eq: bound}
    \mathbb{E}_{\mathbf{z}_m\mid\mathbf{z}_o}\Big[\log p(\mathbf{z}_q=\boldsymbol{\nu}_a\mid\mathbf{z}_m) \Big] &=& -\frac{1}{2\sigma^2}\mathbb{E}_{\mathbf{z}_m\mid\mathbf{z}_o}  \Big[ ||\boldsymbol{\nu}_a-\mathcal{K}_{qm}\mathcal{K}_{mm}^{-1}\mathbf{z}_m||^2 \Big] -\frac{n}{2}\log 2\pi - \frac{n}{2}\log \sigma^2.
\end{eqnarray}
We have the following identity: If $X\sim\mathbb{N}(\mu, \Sigma)$ and $A$ is a symmetric matrix, then $\mathbb{E}[X^\top A X] = \mu^\top A \mu + \text{trace}(A\Sigma)$. Since $\mathbf{z}_m\mid\mathbf{z}_o \sim \mathbb{N}\Big( \frac{1}{\sigma^2}\mathcal{K}_{mm}\Big(\mathcal{K}_{mm}+\frac{1}{\sigma^2}\mathbf{I}\Big)^{-1}\mathcal{K}_{mo}\mathbf{z}_o, \mathcal{K}_{mm}\Big(\mathcal{K}_{mm}+\frac{1}{\sigma^2}\mathbf{I}\Big)^{-1}\mathcal{K}_{mm}\Big)$, following the above identity, we have
\begin{align} \label{eq: analytic}
    \mathbb{E}_{\mathbf{z}_m\mid\mathbf{z}_o}  \Big[ ||\boldsymbol{\nu}_a-\mathcal{K}_{qm}\mathcal{K}_{mm}^{-1}\mathbf{z}_m||^2 \Big] = \Big|\Big|\boldsymbol{\nu}_a - \frac{1}{\sigma^2}\mathcal{K}_{qm}\Big(\mathcal{K}_{mm}+\frac{1}{\sigma^2}\mathbf{I}\Big)^{-1}\mathcal{K}_{mo}\mathbf{z}_o\Big|\Big|^2 + \text{trace}\Big[\mathbf{K}_{qm}\Big(\mathcal{K}_{mm}+\frac{1}{\sigma^2}\mathbf{I}\Big)^{-1}\mathbf{K}_{mq}\Big].
\end{align}

Taking the expectation over $\mathbf{z}_o \sim \mathbb{N}(\boldsymbol{\mu}_o, \mathcal{K}_{oo})$, we have
\begin{equation} \label{eq: final lower bound}
    \begin{aligned}
        \mathbb{E}_{\mathbf{z}_o}\mathbb{E}_{\mathbf{z}_m\mid\mathbf{z}_o}  \Big[ ||\boldsymbol{\nu}_a-\mathcal{K}_{qm}\mathcal{K}_{mm}^{-1}\mathbf{z}_m||^2 \Big] &= \mathbb{E}_{\mathbf{z}_o} \Big[ 
 \Big(\boldsymbol{\nu}_a^\top - \frac{1}{\sigma^2} \mathbf{z}_o^\top \mathcal{K}_{om} \Big(\mathcal{K}_{mm}+\frac{1}{\sigma^2}\mathbf{I}\Big)^{-1}\mathcal{K}_{mq} \Big ) \Big(\boldsymbol{\nu}_a - \frac{1}{\sigma^2}\mathcal{K}_{qm}\Big(\mathcal{K}_{mm}+\frac{1}{\sigma^2}\mathbf{I}\Big)^{-1}\mathcal{K}_{mo}\mathbf{z}_o \Big)\Big ]+ \\ &\quad\text{trace}\Big[\mathcal{K}_{qm}\Big(\mathcal{K}_{mm}+\frac{1}{\sigma^2}\mathbf{I}\Big)^{-1}\mathcal{K}_{mq}\Big] \\
 &= ||\boldsymbol{\nu}_a||^2 + \frac{1}{\sigma^4} \text{trace}\Big[\mathcal{K}_{qm}\Big(\mathcal{K}_{mm}+\frac{1}{\sigma^2}\mathbf{I}\Big)^{-1}\mathcal{K}_{mo} \mathcal{K}_{oo} \mathcal{K}_{om} \Big(\mathcal{K}_{mm}+\frac{1}{\sigma^2}\mathbf{I}\Big)^{-1}\mathcal{K}_{mq}\Big] +\\ &\quad\text{trace}\Big[\mathcal{K}_{qm}\Big(\mathcal{K}_{mm}+\frac{1}{\sigma^2}\mathbf{I}\Big)^{-1}\mathcal{K}_{mq}\Big] \\
 &\geq \frac{1}{\sigma^4} \text{trace}\Big[\mathcal{K}_{qm}\Big(\mathcal{K}_{mm}+\frac{1}{\sigma^2}\mathbf{I}\Big)^{-1}\mathcal{K}_{mo} \mathcal{K}_{oo} \mathcal{K}_{om} \Big(\mathcal{K}_{mm}+\frac{1}{\sigma^2}\mathbf{I}\Big)^{-1}\mathcal{K}_{mq}\Big] + \\
 &\quad\text{trace}\Big[\mathcal{K}_{qm}\Big(\mathcal{K}_{mm}+\frac{1}{\sigma^2}\mathbf{I}\Big)^{-1}\mathcal{K}_{mq}\Big].
    \end{aligned}
\end{equation}

% In order to find the inducing inputs $\mathbf{X}_m$, we maximize,
% \begin{align}
%     \log p(\mathbf{z}_o\mid \mathbf{X}_m) = -\frac{1}{2} \log |\mathcal{K}_{om}\mathcal{K}_{mm}^{-1}\mathcal{K}_{mo} + \sigma^2 \mathbf{I}| -\frac{1}{2} \mathbf{z}_o^\top (\mathcal{K}_{om}\mathcal{K}_{mm}^{-1}\mathcal{K}_{mo} + \sigma^2 \mathbf{I})^{-1} \mathbf{z}_o - \frac{n}{2}\log 2\pi
% \end{align}
% Similarly, to find $\mathbf{X}_l$, we maximize
% \begin{align}
%     \log p(\mathbf{z}_k\mid \mathbf{X}_l) = -\frac{1}{2} \log |\mathcal{K}_{kl}\mathcal{K}_{ll}^{-1}\mathcal{K}_{lk} + \sigma^2 \mathbf{I}| -\frac{1}{2} \mathbf{z}_k^\top (\mathcal{K}_{kl}\mathcal{K}_{ll}^{-1}\mathcal{K}_{lk} + \sigma^2 \mathbf{I})^{-1} \mathbf{z}_k - \frac{n}{2}\log 2\pi
% \end{align}

Combining \eqref{eq: log q}, \eqref{eq: bound}, \eqref{eq: analytic} and \eqref{eq: final lower bound}, we have the closed-form lower bound for $\log p(\mathbf{z}_q = \boldsymbol{\nu}_a \mid \mathbf{z}_o)$. Using a similar argument, we also obtain the lower bound for $\log p(\mathbf{z}_k \mid \mathbf{z}_o)$.


\section{Additional Experiment Results} \label{app:C}
    \subsection{Out-of-Distribution Calibration}
       \textbf{CIFAR100-C.} 
      This section expands on our previous empirical comparison between SGPA and CGPT. Previously, we have shown that CGPT has comparable performances with SGPA in terms of accuracy (MCC) and has much better uncertainty calibration on the CIFAR10 dataset (see Table~\ref{tab:in-distribution} and Table~\ref{tab:OOD CIFAR}). In addition, to compare the robust performance of CGPT and SGPA on larger scale OOD learning scenarios, we also use the corrupted CIFAR100-C dataset. Similar to the CIFAR10-C dataset, the CIFAR100-C dataset also contains corrupted images from CIFAR100, which can be divided into $19$ types of distortion belonging to $4$ distortion categories: Noise, Blur, Weather and Digital. For each method, we calculate the mean performance metrics over the distortion types in each distortion category. The results in Table~\ref{tab:OOD CIFAR100 appx} show that while CGPT has comparable accuracy with the SGPA baseline, the calibration capacity of CGPT is much better than SGPA with lower NLL, MCE and ECE across all types of distortion.

      \par \textbf{CIFAR10-C.} We provide additional results for CGPT on CIFAR10-C. Besides the results for CGPT where the value of $\alpha$ gradually increases from $0.5$ to $1.0$ during training as in Table~\ref{tab:OOD CIFAR}, we also train another CGPT with a fixed value of $\alpha=0.7$. We found that CGPT in this setting can help achieve better accuracy and calibration results, which are shown in Table~\ref{tab:OOD CIFAR10 appendix}.
        \begin{table*}[t]
        \centering
        \caption{Test Accuracy and other calibration metrics achieved by our CGPT model with 2 different settings of $\alpha$ on CIFAR10-C dataset. For each of the $4$ distortion categories, we report the mean metrics over all distortion types in the category. And for each reported result, we run with 3 random seeds and report mean and standard deviation.}
        \vspace{0.5em}
        \resizebox{17cm}{!}{
        \begin{tabular}{|l l l l l l l|} 
            \toprule
            \textbf{Metric} & \textbf{Model} & \textbf{Noise} & \textbf{Blur} & \textbf{Weather} & \textbf{Digital} & \textbf{Avg.}\\
            \midrule
            
            \multirow{3}{*}{Acc $\uparrow$} & SGPA & 50.803 $\pm$ 0.447 & \textbf{59.264 $\pm$ 0.915} & \textbf{64.148 $\pm$ 0.472} & \textbf{63.028 $\pm$ 0.334} & \textbf{59.722 $\pm$ 0.323}\\ 
            & CGPT ($\alpha=0.5 \rightarrow 1.0$)& \textbf{55.177 $\pm$ 0.953} & 56.412 $\pm$ 1.506 & 61.515 $\pm$ 0.703 & 60.373 $\pm$ 0.123 & 58.591 $\pm$ 0.664\\
            & CGPT ($\alpha=0.7$)& 54.110 $\pm$ 0.298 & 58.056 $\pm$ 0.233 & 61.655 $\pm$ 0.348 & 61.029 $\pm$ 0.258 & 58.971 $\pm$ 0.111 \\
            \midrule

             \multirow{3}{*}{NLL $\downarrow$} & SGPA & 3.464 $\pm$ 0.423 & 2.551 $\pm$ 0.091 & 2.137 $\pm$ 0.162 & 2.298 $\pm$ 0.045 & 2.626 $\pm$ 0.202\\ 
            & CGPT ($\alpha=0.5 \rightarrow 1.0$) & 1.688 $\pm$ 0.033 & 1.565 $\pm$ 0.068 & 1.352 $\pm$ 0.049 & 1.461 $\pm$ 0.027 & 1.516 $\pm$ 0.029\\
            & CGPT ($\alpha=0.7$)& \textbf{1.670 $\pm$ 0.180} & \textbf{1.403 $\pm$ 0.131} & \textbf{1.281 $\pm$ 0.132} & \textbf{1.341 $\pm$ 0.099} & \textbf{1.414 $\pm$ 0.131}\\
            \midrule

             \multirow{3}{*}{MCE $\downarrow$} & SGPA & 0.668 $\pm$ 0.009 & 0.592 $\pm$ 0.014 & 0.576 $\pm$ 0.014 & 0.575 $\pm$ 0.001 & 0.593 $\pm$ 0.002\\ 
            & CGPT ($\alpha = 0.5 \rightarrow 1.0$) & \textbf{0.360 $\pm$ 0.011} & 0.334 $\pm$ 0.013 & \textbf{0.284 $\pm$ 0.002} & \textbf{0.314 $\pm$ 0.003} & \textbf{0.324 $\pm$ 0.002}\\
            & CGPT ($\alpha=0.7$)& 0.379 $\pm$ 0.025 & \textbf{0.330 $\pm$ 0.009} & 0.299 $\pm$ 0.017 & 0.318 $\pm$ 0.000 & 0.330 $\pm$ 0.011\\
            \midrule

             \multirow{3}{*}{ECE $\downarrow$} & SGPA & 0.532 $\pm$ 0.021 & 0.488 $\pm$ 0.012 & 0.469 $\pm$ 0.003 & 0.472 $\pm$ 0.010 & 0.487 $\pm$ 0.012\\ 
            & CGPT ($\alpha=0.5 \rightarrow 1.0$) & \textbf{0.226 $\pm$ 0.012} & 0.202 $\pm$ 0.007 & \textbf{0.159 $\pm$ 0.004} & 0.183 $\pm$ 0.003 & \textbf{0.192 $\pm$ 0.001}\\
            & CGPT ($\alpha=0.7$)& 0.241 $\pm$ 0.021 & \textbf{0.199 $\pm$ 0.001} & 0.169 $\pm$ 0.013 & \textbf{0.180 $\pm$ 0.003} & 0.195 $\pm$ 0.007\\
            \bottomrule
        \end{tabular}}
        \label{tab:OOD CIFAR10 appendix}
    \end{table*}

    \begin{table*}[t]
        \centering
        \caption{Test Accuracy and other calibration metrics achieved by our CGPT model on CIFAR100-C dataset under the OOD setting. For each of the $4$ distortion categories, we report the mean metrics over all distortion types in the category. And for each reported result, we run with 3 random seeds and report mean and standard deviation. We again observe that CGPT attains better calibration metrics than SGPA across all cases.}
        \vspace{0.5em}
         \resizebox{16cm}{!}{
        \begin{tabular}{|l l l l l l l|} 
            \toprule
            \textbf{Metric} & \textbf{Model} & \textbf{Noise} & \textbf{Blur} & \textbf{Weather} & \textbf{Digital} & \textbf{Avg.}\\
            \midrule
            
            \multirow{2}{*}{Acc $\uparrow$} & SGPA & \textbf{23.383 $\pm$ 0.308} & \textbf{36.405 $\pm$ 0.263} & \textbf{35.940 $\pm$ 0.120} & \textbf{35.533 $\pm$ 0.084} & \textbf{33.117 $\pm$ 0.126}\\ 
            & CGPT ($\alpha=0.7$) & 22.664 $\pm$ 0.007 & 34.488 $\pm$ 0.949 & 35.341 $\pm$ 0.375 & 34.259 $\pm$ 0.059 & 31.973 $\pm$ 0.313 \\
            \midrule

             \multirow{2}{*}{NLL $\downarrow$} & SGPA & 10.163 $\pm$ 0.583 & 6.987 $\pm$ 0.033 & 6.856 $\pm$ 0.050 & 7.284 $\pm$ 0.039 & 7.763 $\pm$ 0.161\\ 
            & CGPT ($\alpha=0.7$) & \textbf{5.600 $\pm$ 0.527} & \textbf{3.270 $\pm$ 0.360} & \textbf{3.197 $\pm$ 0.303} & \textbf{3.348 $\pm$ 0.272} & \textbf{3.797 $\pm$ 0.355}\\
            \midrule

             \multirow{2}{*}{MCE $\downarrow$} & SGPA & 0.723$\pm$ 0.008 & 0.628 $\pm$ 0.003 & 0.626 $\pm$ 0.004 & 0.637 $\pm$ 0.001 & 0.652 $\pm$ 0.004\\ 
            & CGPT ($\alpha=0.7$) & \textbf{0.633 $\pm$ 0.030} & \textbf{0.456 $\pm$ 0.068} & \textbf{0.459 $\pm$ 0.049} & \textbf{0.459 $\pm$ 0.057} & \textbf{0.497 $\pm$ 0.052}\\
            \midrule

             \multirow{2}{*}{ECE $\downarrow$} & SGPA & 0.597 $\pm$ 0.015 & 0.491 $\pm$ 0.004 & 0.492 $\pm$ 0.002 & 0.495 $\pm$ 0.001 & 0.521 $\pm$ 0.001\\ 
            & CGPT ($\alpha=0.7$) & \textbf{0.454 $\pm$ 0.038} & \textbf{0.294 $\pm$ 0.060} & \textbf{0.295 $\pm$ 0.055} & \textbf{0.289 $\pm$ 0.050} & \textbf{0.328 $\pm$ 0.050}\\
            \bottomrule
        \end{tabular}}
        \label{tab:OOD CIFAR100 appx}
    \end{table*}

    % \subsection{Out-of-Distribution Detection}

    \subsection{CGPT Helps Reduce Oversmoothing in Transformers} \label{sec appendix: OVSMT}
        \par In this section, we conduct additional oversmoothing analysis similar to that in Section~\ref{sec:experiments} on the larger dataset CIFAR100. We compare the oversmoothing effect of SGPA and CGPT and use the settings for CIFAR100 detailed in Section~\ref{sec:details image}. For CGPT, we fix $\alpha=0.7$ in the CGP objective function in the training phase. After training both CGPT and SGPA, we measure the cosine similarity between the outputs of the attention block in each layer to depict the oversmoothing effect.
        \par This is visually demonstrated in Fig.~\ref{fig:over_cifar100}, which shows that as the number of attention blocks increases, the cosine similarities between the representations learned with SGPA become gradually higher. This implies that these representations will become more similar to each other as the models get deeper. On the contrary, the learned representations of CGPT have much lower cosine similarity as the model depth increases, which implies that CGPT will suffer less from oversmoothing than SGPA.

        \begin{figure}[t!]
                \centering
                \captionsetup{font=small} 
                \includegraphics[scale=0.5]{gptransformers/images/OVSMT2.pdf}
                \vspace{-1em}
                \caption{The cosine similarity between the token representations
                vs. the layer index of CGPT and SGPA on CIFAR10. CGPT is much less vulnerable to oversmoothing compared to SGPA.}
                \label{fig:over_cifar10}
                \vspace{-0.2in}
            \end{figure}

        \begin{figure}[t!]
                \centering
                \includegraphics[scale=0.55]{gptransformers/images/OVSMT_C100.png}
                \vspace{-1em}
                \caption{The cosine similarity between the token representations after the attention calculation
                vs. the layer index of CGPT and SGPA on CIFAR100. CGPT is much less vulnerable to oversmoothing compared to SGPA.}
                \label{fig:over_cifar100}
                \vspace{-0.2in}
            \end{figure}
    
            
% \section{Experiment Settings} \label{app:D}
%     Following the framework of ~\citep{chen2023calibrating}, we will conduct experiments on the image classification tasks and the linguistic acceptability prediction task with the following configurations:
%     \par \textbf{Tasks.} In Section \ref{sec:experiments}, we study the performance of CGPT and SCGPT on image classification task using CIFAR10 \citep{krizhevsky2009cifar} dataset and linguistic acceptability prediction task using the CoLA dataset \citep{warstadt2019neural}. For the out-of-distribution (OOD) evaluations, we use the corrupted CIFAR10-C dataset \citep{hendrycks2019benchmarking} for the image classification task, while the CoLA dataset already contains the out-of-distribution data. In Section \ref{sec: OOD detection}, we consider the OOD detection tasks for the models trained on vision tasks including our methods and other baselines to further evaluate the uncertainty calibration ability of the models.
    
%     \par \textbf{General settings for all tasks.} For SGPA and kernel attention, we use the ARD-RBF kernel \citep{Rasmussen06} for the image classification tasks
%     $ \kappa(\mathbf{x}, \mathbf{x}') = \sigma_s^2 \exp({-0.5\sum_{i=1}^d (x_i-x'_i)^2/\sigma_i^2})$ , and an exponential of scaled dot product variant for the linguistic acceptability task $\kappa(\mathbf{x},\mathbf{x}')=\sigma_s^2 \exp(\sum_{i=1}^d x_ix'_i/\sigma_i^2)$ . Here, $\mathbf{x}$ 
%     and $\mathbf{x}'$ are $d$-dimensional inputs, $\sigma_s^2$ denotes the output variance and $\{\sigma_i^2\}_{i=1}^d$ are the length scales. 
%     For CGPT and SCGPT, we use the parameter-free squared exponential kernel function for all tasks $\kappa_o(\mathbf{x}, \mathbf{x}') = \mathrm{exp}(-0.5 \|\mathbf{x} - \mathbf{x}'\|^2)$ as the canonical representation and model the latent inputs $\mathbf{X}_o$ by linear projection of a finite set of inputs $\mathbf{X}$ for simplicity. 
%     % We estimate predictive uncertainty by using 10 Monte Carlo samples. 
%     The regularisation coefficient $\alpha$ in our objective function is chosen by evaluating performance on the validation set. Our experiments are conducted on A100 40GB SMX NVIDIA GPUs.  

%     \par \textbf{Baselines.} We compare our methods against SGPA \citep{chen2023calibrating}, which leverages sparse GP to design attention and calibrate Transformer. Other baselines we consider are symmetric and asymmetric kernel attention \citep{tsai2019transformer}. 

% \par \textbf{Architectures.} We use Vision Transformer \citep{dosovitskiy2020image} for image classification and standard transformer architecture \citep{vaswani2017attention} for linguistic acceptability prediction. We use the parameter-free squared exponential kernel for CGPT and SCGPT for both of the tasks while in SGPA, we use the ARD kernel \citep{Rasmussen06} for image classification and the exponential kernel for linguistic acceptability prediction. 
% \par \textbf{Evaluation.} We study the calibration capacity of the models by evaluating the robustness of them under out-of-distribution setting in section \ref{sec:experiments}. We also compare the out-of-distribution detection capacity of our methods against other baselines in section \ref{sec: OOD detection}. We report the accuracy (Acc) for the image classification tasks and Matthew correlation coefficient (MCC) for CoLA, as well as other test calibration metrics, including negative log likelihood (NLL), expected calibration error (ECE) and maximum calibration error (MCE).

% \par \textbf{CGPT and SCGPT proprietary hyperparameters.} The $\alpha$ value in our CGP objective function is linearly annealed from $0.0$ to $1.0$ during the training phase. For SCGPT, we set the inducing variable dimension $m$ to be $m=16$ in image classification tasks, which is smaller than the sequence length $n$ in order to be more memory and computationally efficient, as discussed in Section \ref{sec: SCGPT}. The value of the noise $\sigma$ in SCGPT is tuned from $0$ to $1$ and chosen to be $\sigma=0.1$ as we find that value gives the best performance for SCGPT.

%     \subsection{Image Classification} \label{sec:details image}
%     For the OOD tasks on CIFAR10-C and CIFAR100-C,
% we use the models trained on the clean datasets and use the corrupted datasets to evaluate the OOD performances. The CIFAR10-C dataset contains 19 types of distortions covering 4 distortion categories: Noise, Blur, Weather and Digital. We train 3 independent runs for each experiment and for each run, we calculate the mean results for each category of corruption. Then we report the mean and standard deviations of these results across 3 independent runs.

%     \textbf{Datasets.} The original training set of the CIFAR10 dataset is randomly split into 45,000 instances for training and 5,000 instances for validation. The same splitting method is applied for the original training set of the CIFAR100 dataset. 

%     \textbf{Implementation details.}  The architecture of ViT for the CIFAR10 dataset contains 5 MHSA layers with each layer has 4 attention heads and the hidden dimension is set to 128. For the CIFAR100 dataset, ViT has 6 MHSA layers with each layer contains 4 attention heads and we set the hidden dimension of 256. We tokenize input images with patch size $4 \times 4$. 

%     Both CGPT and SCGPT is trained with batch-size 100 for 600 epochs and we use ADAM for optimization with an initial learning rate of 0.0005 which decays to 0.00001 linearly. We adapt the similar training scheme as in ~\citep{chen2023calibrating} for training CGPT, i.e. we train ViT with asymmetric kernel attention for the first 200 epochs to initialize parameters for CGPT and continue training for 400 epochs with the CGPT/SCGPT predictive means and objective function for the CIFAR10 dataset. For the the CIFAR100 dataset, we pretrain for 100 epochs to initialize parameters for CGPT and continue training for 500 epochs. For SGPA, we use the same hyper-parameters for training as following  ~\citep{chen2023calibrating}.

%     \textbf{Evaluation.} We choose the best model by computing the validation accuracy computed after each $10$ epochs. For each reported result, we run with 3 random seeds and report mean and standard deviation.

%     \subsection{Linguistic Acceptability} 
%     For the OOD task on COLA,
% we use the model trained on the clean dataset and use the originally provided OOD set to evaluate robustness performance. 

% \textbf{Datasets.} The COLA dataset contains 516 OOD samples and the original training set, which we randomly split into $7,262$ in-distribution training samples and $1,816$ in-distribution testing samples. 

% \textbf{Implementation details.} The architecture of Transformer for the COLA dataset has 2 MHSA layers with each layer contains 4 attention heads. The hidden dimension and embedding dimension are 256 and 128 respectively. We also use ELMO-style representation ~\citep{DBLP:conf/naacl/PetersNIGCLZ18} for the input embeddings  as in ~\citep{chen2023calibrating}.

% We train CGPT and SCGPT with batch-size 32 for 50 epochs. We also use ADAM optimizer with an initial learning rate of 0.0005 which decays to 0.00001 linearly. For SGPA, we use the same hyper-parameters for training as following  ~\citep{chen2023calibrating}. We choose the noise term to be $\sigma=0.5$ for SCGPT.

% \textbf{Evaluation.} We choose the model at the 50th epoch for evaluating performance. For each reported result, we run with 3 random seeds and report mean and standard deviation.


% \section{Additional Background}
% \subsection{Multi-Head Self-Attention (MHSA)}
% MHSA helps capture more diverse patterns in the input and increase the representation capacity of transformers. A MHSA comprises $h$ units of self-attention $\mathbf{V}^+_1, \mathbf{V}^+_2, \ldots, \mathbf{V}^+_h$ where $\mathbf{V}^+_i$ denote the output of the $i$-th self-attention unit defined above. The output of the MHSA is then computed as an affine transformation of these self-attention units,
% \begin{eqnarray}
% \hspace{-12mm}\mathbf{H} &\triangleq& \mathrm{MultiHead}\Big(\mathbf{V}^+_1, \mathbf{V}^+_2, \ldots, \mathbf{V}^+_h\Big) \nonumber\\
% \hspace{-12mm}&=& \mathrm{Concatenate}\Big(\mathbf{V}^+_1, \mathbf{V}^+_2, \ldots, \mathbf{V}^+_h\Big) \ \mathbf{W}^{\top}_o,
% \end{eqnarray}
% where $\mathbf{W}_o \in \mathbb{R}^{d \times (h \cdot d)}$ is the weight matrix.

% \section{Additional Related Work}
% \par \textbf{Bayesian Deep Learning.} Convolutional and recurrent neural networks, specifically, have benefited from the application of Bayesian approaches ~\citep{mukhoti2018evaluating, kendall2017uncertainties, gustafsson2020evaluating, chien2015bayesian, ritter2021sparse, tran2019bayesian}, and early efforts to employ similar methods for transformers have attained initial successes ~\citep{xue2021bayesian}. Another line of work by ~\citep{muller2021transformers} make the connection between transformers and Bayesian inference, showing that transformers can efficiently do Bayesian inference. Our proposed CGPT is complementary to those methods.


% % \par \textbf{Self-attention mechanism interpretation.} A recent line of research is focusing on understanding the attention mechanism of transformers under various perspective. In the probabilistic point of view, \citep{} model the attention as a Gaussian mixture model 

% % \par \textbf{Bayesian Approaches for Transformers.} Recent works have aimed to calibrate transformers using Bayesian approaches. In particular, ~\citep{fan2020bayesian} and ~\citep{cinquin2021pathologies} consider applying variational inference to the attention matrices. A Bayesian inference approach using GP by ~\citep{liu2020simple}, ~\citep{bradshaw2017adversarial} suggests fitting a GP on the output of the last attention layer. Another work utilizing GP was proposed by ~\citep{chen2023calibrating} that fits a sparse variational GP to each attention layer and propagates uncertainty across the layers. CGPT extends this research direction by fitting correlated GPs to the attention outputs.%\vspace{-2mm}

% % \newpage
% % \section*{Checklist}


% % % %%% BEGIN INSTRUCTIONS %%%

% % % %%% END INSTRUCTIONS %%%


% %  \begin{enumerate}

% %  \item For all models and algorithms presented, check if you include:
% %  \begin{enumerate}
% %    \item A clear description of the mathematical setting, assumptions, algorithm, and/or model. [Yes]
% %    \item An analysis of the properties and complexity (time, space, sample size) of any algorithm. [Yes]
% %    \item (Optional) Anonymized source code, with specification of all dependencies, including external libraries. [Yes]
% %  \end{enumerate}


% %  \item For any theoretical claim, check if you include:
% %  \begin{enumerate}
% %    \item Statements of the full set of assumptions of all theoretical results. [Yes]
% %    \item Complete proofs of all theoretical results. [Yes]
% %    \item Clear explanations of any assumptions. [Yes]     
% %  \end{enumerate}


% %  \item For all figures and tables that present empirical results, check if you include:
% %  \begin{enumerate}
% %    \item The code, data, and instructions needed to reproduce the main experimental results (either in the supplemental material or as a URL). [Yes]
% %    \item All the training details (e.g., data splits, hyperparameters, how they were chosen). [Yes]
% %          \item A clear definition of the specific measure or statistics and error bars (e.g., with respect to the random seed after running experiments multiple times). [Yes]
% %          \item A description of the computing infrastructure used. (e.g., type of GPUs, internal cluster, or cloud provider). [Yes]
% %  \end{enumerate}

% %  \item If you are using existing assets (e.g., code, data, models) or curating/releasing new assets, check if you include:
% %  \begin{enumerate}
% %    \item Citations of the creator If your work uses existing assets. [Yes]
% %    \item The license information of the assets, if applicable. [Yes]
% %    \item New assets either in the supplemental material or as a URL, if applicable. [Yes]
% %    \item Information about consent from data providers/curators. [Yes]
% %    \item Discussion of sensible content if applicable, e.g., personally identifiable information or offensive content. [Not Applicable]
% %  \end{enumerate}

% %  \item If you used crowdsourcing or conducted research with human subjects, check if you include:
% %  \begin{enumerate}
% %    \item The full text of instructions given to participants and screenshots. [Not Applicable]
% %    \item Descriptions of potential participant risks, with links to Institutional Review Board (IRB) approvals if applicable. [Not Applicable]
% %    \item The estimated hourly wage paid to participants and the total amount spent on participant compensation. [Not Applicable]
% %  \end{enumerate}

% %  \end{enumerate}




% % % \subsection{More tasks}
% % %         \begin{enumerate}
% % %         \item OOD detection (Done)
% % %         \item Compare with kernel attention (Done)
% % %         \item ViT on CIFAR100 (Done) and CIFAR100-C (done)
% % %         \item Oversmoothing for attention output (Done - weird behavior in fig \ref{fig:over cifar100})
% % %         \item head redundancy analysis for (CGPT, SGPA, Kernel, softmax) (cosine similarity too small)
% % %         \item Sym CGPT vs asym CGPT (nan encountered)
% % %         % \item small scale data (SGPA)
            
            
% % %         \end{enumerate}

