\subsection{Self-Attention}
\par Given an input sequence $\mathbf{X} = [\mathbf{x}_1, \ldots,\mathbf{x}_n]^\top \in \mathbb{R}^{n\times d}$ of $n$ $d$-dimensional vectors, the self-attention mechanism transforms it into the output sequence $\mathbf{V}^+=[\mathbf{v}^+_1,\ldots,\mathbf{v}^+_n]^\top\in \mathbb{R}^{n\times s}$ via two steps:

\textbf{Step 1.} The input $\mathbf{X}$ is linearly transformed into the query $\mathbf{Q}$, the key $\mathbf{K}$, and the value $\mathbf{V}$ matrices,
\begin{eqnarray}
\mathbf{Q} &\triangleq& \left[\mathbf{q}_1,  \mathbf{q}_2,\ldots ,\mathbf{q}_n\right]^\top \ =\ \mathbf{X}\mathbf{W}_q^\top, \\
\mathbf{K} &\triangleq& \left[\mathbf{k}_1,  \mathbf{k}_2,\ldots, \mathbf{k}_n\right]^\top \ =\ \mathbf{X}\mathbf{W}_k^\top,  \\
\mathbf{V} &\triangleq& \left[\mathbf{v}_1,  \mathbf{v}_2,\ldots, \mathbf{v}_n\right]^\top \ =\ \mathbf{X}\mathbf{W}_v^\top,
\end{eqnarray}
where $\mathbf{W}_q, \mathbf{W}_k, \mathbf{W}_v \in \mathbb{R}^{s\times d}$ are weight matrices. Each tuple of vectors $(\mathbf{q}_i, \mathbf{k}_i, \mathbf{v}_i)$, for $i = 1, \ldots, n$, comprises respectively the query, key and value vectors.\footnote{For simplicity, we assume that the query, key and value vectors have the same dimension $s$.}
    
\textbf{Step 2.} Given $\mathbf{Q}$, $\mathbf{K}$ and $\mathbf{V}$, the final output of the attention mechanism is as follows:
\begin{eqnarray}
\label{eq:attention-mat}
\mathbf{V}^+ &=& \mathrm{softmax}\left(\frac{\mathbf{Q}\mathbf{K}^\top}{\sqrt{d}}\right) \cdot \mathbf{V} \ =\ \mathbf{A}\mathbf{V},
\end{eqnarray}
where the softmax operator is applied to each row of the matrix $\mathbf{Q}\mathbf{K}^\top/\sqrt{d}$, and $\mathbf{A} = \mathrm{softmax}(\mathbf{Q}\mathbf{K}^\top/\sqrt{d})$ is the attention matrix. Eq.~\eqref{eq:attention-mat} details the softmax attention mechanism.
\subsection{Multi-head Self-attention (MHSA)}
MHSA helps capture more diverse patterns in the input and increases the representation capacity of transformers. An MHSA comprises $h$ units of self-attention $\mathbf{V}^+_1, \mathbf{V}^+_2, \ldots, \mathbf{V}^+_h$ where $\mathbf{V}^+_i$ denotes the output of the $i$-th self-attention unit defined above. The output of the MHSA is then computed as an affine transformation of these self-attention units,
\begin{eqnarray}
\hspace{-12mm}\mathbf{H} &\triangleq& \mathrm{MultiHead}\Big(\mathbf{V}^+_1, \mathbf{V}^+_2, \ldots, \mathbf{V}^+_h\Big) \nonumber\\
\hspace{-12mm}&=& \mathrm{Concatenate}\Big(\mathbf{V}^+_1, \mathbf{V}^+_2, \ldots, \mathbf{V}^+_h\Big) \ \mathbf{W}^{\top}_o,
\end{eqnarray}
where $\mathbf{W}_o \in \mathbb{R}^{d \times (h \cdot s)}$ is the weight matrix.

\subsection{Kernel Attention}

As the softmax attention essentially requires computing the similarity between pairs of keys and queries, \citet{tsai2019transformer} propose kernel attention, which replaces the $\mathrm{softmax}$ operator with a kernel function $\kappa(\mathbf{x}_a, \mathbf{x}_b)$. Kernel attention thus replaces Eq.~\eqref{eq:attention-mat} with
\begin{eqnarray} \label{eq:kernel attention}
\mathbf{V}^+ &=& \mathcal{K}\mathbf{V},
\end{eqnarray}
where the $(a,b)$-cell of the Gram matrix $\mathcal{K}$ takes value $\kappa(\mathbf{x}_a, \mathbf{x}_b) = \kappa_o\Big(\mathbf{x}_a\mathbf{W}_q^\top, \mathbf{x}_b\mathbf{W}_k^\top\Big)$, and $\kappa_o$ is a valid symmetric kernel.

Note that even though $\kappa_o(\mathbf{z}, \mathbf{z}')$ can be selected to be symmetric, $\kappa(\mathbf{x}_a, \mathbf{x}_b)$ might not be so since
\begin{eqnarray}
\label{eq: kernel_attention-mat}
\hspace{-8mm}\kappa(\mathbf{x}_a, \mathbf{x}_b) &=&  \kappa_o\Big(\mathbf{x}_a\mathbf{W}_q^\top, \mathbf{x}_b\mathbf{W}_k^\top\Big) \nonumber\\
&\ne& \kappa_o\Big(\mathbf{x}_b\mathbf{W}_q^\top, \mathbf{x}_a\mathbf{W}_k^\top\Big) \ =\ \kappa(\mathbf{x}_b, \mathbf{x}_a).
\end{eqnarray}

Thus, to construct a valid symmetric kernel in kernel attention, the key and query matrices, $\mathbf{W}_k$ and $\mathbf{W}_q$, need to be identical, $\mathbf{W}_k = \mathbf{W}_q = \mathbf{W}$. Tying the parameters defining these matrices saves computational costs but will result in a limitation in the representation capacity of the model, as empirically shown in Section 3.2 of \citet{tsai2019transformer}, where attention with asymmetric kernels tends to outperform attention with symmetric kernels.
    

% \subsection{Bayesian Gaussian Process Regression and standardized Gaussian Process}
%     \par Gaussian Processes \citep{} are a powerful tools when it comes to robust function-space uncertainty calibration. Assume we have a dataset $\{y_i, \textbf{x}_i\}_{i=1}^n$, $\textbf{x}_i \in \mathbb{R}^{D_x}$ and $y_i \in \mathbb{R}$. We suppose that each observation $y_i$ is generated from $y_i = f(\textbf{x}_i)+\epsilon_i, \epsilon_i \sim \mathcal{N}(0, \sigma^2)$, where $f(\cdot)$ is sampled from a Gaussian Process prior $\mathcal{GP}(0, k_f(\textbf{x}, \textbf{x'}))$. The covariance function is the squared exponential kernel function:
%     \begin{align}
%         k_f(\textbf{x}, \textbf{x}') = \sigma_f^2 exp \Big( \frac{-1}{2} ||\textbf{xW}^T-\textbf{x}'\textbf{W}^T ||^2 \Big) 
%     \end{align}
%     where $\textbf{W}\in \mathbb{R}^{D\times D_x}$ and $\sigma_f^2$ are hyperparameters.


%     \par Consider a scalar function $s(\textbf{z})$ where $z\in \mathbb{R^{D}}$, this function is sampled from a standardized GP with zero mean and covariance function:
%     \begin{align} \label{eq: standardized square exponential}
%         k_s(\textbf{z}, \textbf{z}') = exp \Big( \frac{-1}{2} ||\textbf{z}-\textbf{z}' ||^2 \Big) .
%     \end{align}
%     This GP is called a standardized GP and the sampled function $s(\textbf{z})$ is called a standardized function. We can express the function $f(\textbf{x})$ with high dimensional input  $x\in \mathbb{R}^{D_x}$ as:
%     \begin{align}
%         f(\textbf{x}) = \sigma_f s(\textbf{xW}^T).
%     \end{align}
%     The GP model can now be expressed with the following structure:
%     \begin{equation}
%         \begin{aligned}
%             s(\textbf{z}) &\sim \mathcal{GP}(0, k_s(\textbf{z}, \textbf{z}')), \ \ \boldsymbol{\theta} = p(\boldsymbol{\theta}) \\
%             f(\textbf{x}) &=  \sigma_fs(\textbf{xW}^T) \\
%             y_i &\sim \mathcal{N}(f(\textbf{x}_i), \sigma^2), \ \ \ i=1,\ldots,n.
%         \end{aligned}
%     \end{equation}
    

\iffalse
\subsection{Contribution}
    In this work, we propose Canonical Transformer, a novel framework to calibrate uncertainty of Transformer based on the canonical representation of GPs \citep{aueb2013variational}. In Canonical Transformer, the self-attention units are modeled as the cross-covariance between two correlated GPs, which are induced from a canonical representation of GP. As the cross-covariance do not correspond to any single GP, we relax the symmetric condition imposed on the kernel used in other GP-based transformers and therefore our model can have better representation capacity. Our contributions are three-fold:

    \begin{itemize}
        \item We establish a connection between kernel attention and the mean of the conditional prior distribution of a canonical GP.
        \item \textcolor{blue}{We propose a novel regularization term in the loss function to learn the weight matrices $\textbf{W}_Q$ and $\textbf{W}_K$ though maximizing a log joint distribution of two correlated GPs.}
        \item We empirically verify that our Canonical Transformer achieves better results in both in-distribution and out-of-distribution calibration while also improves the accuracy compared to other GP-based Transformers.
    \end{itemize}
\fi

    % \par \textbf{Variational inference}. Defined auxiliary variables $\textbf{u}\in \mathbb{R}^m$ such that $u_i = s(\textbf{z}_i)$. The set $\textbf{Z} = (\textbf{z}_1,\ldots,\textbf{z}_N)$ are called inducing inputs. The inducing  variables $\textbf{u}$ have Gaussian distribution 
    % \begin{align}
    %     p(\textbf{u}) = \mathcal{N}(\textbf{0}, \textbf{K}_{\textbf{uu}}).
    % \end{align}
    % The cross-covariance function of $\textbf{f}$ and $\textbf{u}$ is given by
    % \begin{align}
    %     k_{f,u} (\textbf{x}, \textbf{z}) = E[\sigma_fs(\textbf{xW}^T)s(\textbf{z})] = \sigma_f^2 exp \Big( \frac{-1}{2} ||\textbf{xW}^T-\textbf{z} ||^2 \Big) = \sigma_f^2 k_s(\textbf{xW}^T, \textbf{z}) .
    % \end{align}
    % The conditional Gaussian density can be computed as follow:
    % \begin{align}
    %     p(\textbf{f}|\textbf{u}, \textbf{W}) = \mathcal{N}(\textbf{f}| \textbf{K}_{\textbf{fu}}\textbf{K}_{\textbf{uu}}^{-1}\textbf{u}, \textbf{K}_{\textbf{ff}} - \textbf{K}_{\textbf{fu}}\textbf{K}_{\textbf{uu}}^{-1}\textbf{K}_{\textbf{fu}}^T)
    % \end{align}

\subsection{Gaussian Processes}
A Gaussian process~\citep{Rasmussen06} defines a probabilistic prior over a random function $z(\mathbf{x})$ defined by mean function $m(\mathbf{x}) = 0$ and kernel function $\kappa(\mathbf{x}, \mathbf{x}')$.\footnote{For simplicity, we assume a zero mean function since we can always re-center the training outputs around $0$.} These functions induce a marginal Gaussian prior over the evaluations $\mathbf{z} = [z(\mathbf{x}_1) \ldots z(\mathbf{x}_n)]^\top$ on an arbitrary finite subset of inputs  $\{\mathbf{x}_1, \ldots, \mathbf{x}_n\}$. 

Let $\mathbf{x}_\ast$ be an unseen input whose corresponding output $z_\ast = z(\mathbf{x}_\ast)$ we wish to predict. The Gaussian prior over $[z(\mathbf{x}_1) \ldots z(\mathbf{x}_n)\ z(\mathbf{x}_\ast)]^\top$ implies the following conditional distribution:
\begin{eqnarray}
z_\ast &\triangleq& z(\mathbf{x}_\ast) \mid \mathbf{z} \nonumber\\
&\sim& \mathbb{N}\Big(\mathbf{k}_\ast^\top\mathcal{K}^{-1}\mathbf{z},\  \kappa(\mathbf{x}_\ast,\mathbf{x}_\ast) - \mathbf{k}_\ast^\top\mathcal{K}^{-1}\mathbf{k}_\ast\Big) \ ,\label{eq:1}
\end{eqnarray}
where $\mathbf{k}_\ast = [\kappa(\mathbf{x}_\ast, \mathbf{x}_1) \ldots \kappa(\mathbf{x}_\ast,\mathbf{x}_n)]^\top$ and $\mathcal{K}$ denotes the Gram matrix induced by $\kappa(\mathbf{x}, \mathbf{x}')$ on $\{\mathbf{x}_1, \ldots, \mathbf{x}_n\}$ whose value at cell $(a,b)$ is $\kappa(\mathbf{x}_a, \mathbf{x}_b)$. For noisy observation $z_i$ perturbed by Gaussian noise such that $z_i \sim \mathbb{N}(z(\mathbf{x}_i), \sigma^2)$,  Eq.~\eqref{eq:1} above can be integrated with $\mathbb{N}(\mathbf{z}, \sigma^2\mathbf{I})$ to yield:
\begin{eqnarray}
z_\ast &\triangleq& z(\mathbf{x}_\ast) \mid \mathbf{z} \nonumber\\
&\sim& \mathbb{N}\Big(\mathbf{k}_\ast^\top\mathcal{K}_{\sigma}^{-1}\mathbf{z},\  \kappa(\mathbf{x}_\ast,\mathbf{x}_\ast) - \mathbf{k}_\ast^\top \mathcal{K}_{\sigma}^{-1}\mathbf{k}_\ast\Big) \ ,\label{eq:2}
\end{eqnarray}
where $\mathcal{K}_\sigma = \mathcal{K} + \sigma^2\mathbf{I}$. Eq.~\eqref{eq:2} forms the predictive distribution of the Gaussian process (GP).

\subsection{Kernel Attention as GP Inference} \label{sec: attention as GP}
Suppose $\mathbf{X} = [\mathbf{x}_1, \ldots, \mathbf{x}_n]^\top$ is the input fed into a kernel-attention unit. Assuming that the key and query matrices are set to be identical, its output $\mathbf{V}^+ \in \mathbb{R}^{n \times s}$ is given as $\mathbf{V}^+ = \mathcal{K}\mathbf{V}=\mathcal{K}\mathbf{X}\mathbf{W}_v^\top$ where
\begin{eqnarray}
\hspace{-7.5mm}\mathcal{K}\mathbf{X}\mathbf{W}_v^\top &\triangleq& 
\mathcal{K}\Bigg(\mathcal{K} + \sigma^2\mathbf{I}\Bigg)^{-1}\mathbf{Z}. \label{eq:Z}
\end{eqnarray}
% \begin{eqnarray}
% \hspace{-7.5mm}\mathcal{K}\mathbf{X}\mathbf{W}_v^\top &\triangleq& \mathcal{K}\Bigg(\mathcal{K} + \sigma^2\mathbf{I}\Bigg)^{-1}\Bigg(\mathcal{K} + \sigma^2\mathbf{I}\Bigg)\mathbf{X}\mathbf{W}_v^\top \nonumber\\
% \hspace{-10mm}&=&\mathcal{K}\Bigg(\mathcal{K} + \sigma^2\mathbf{I}\Bigg)^{-1}\mathbf{Z}
% \end{eqnarray}
Here, we set $\mathbf{Z} = (\mathcal{K} + \sigma^2\mathbf{I})\mathbf{X}\mathbf{W}_v^\top \in \mathbb{R}^{n\times s}$ with $\mathcal{K}$ being the induced Gram matrix of $\kappa(\mathbf{x}_a, \mathbf{x}_b)$ as defined in Eq.~\eqref{eq: kernel_attention-mat} above. Thus, letting $\boldsymbol{\nu}_a$ denote the $a$-th column of $\mathbf{V}^+$ and $\mathbf{z}_a$ denote the $a$-th column of $\mathbf{Z}$, we have
\begin{eqnarray}
\boldsymbol{\nu}_a &=& \mathcal{K}\Bigg(\mathcal{K} + \sigma^2\mathbf{I}\Bigg)^{-1} \mathbf{z}_a,  \label{eq:equiv}
\end{eqnarray}
or equivalently, 
$[\boldsymbol{\nu}_{a}]_r =  \mathbf{k}_r^\top(\mathcal{K} + \sigma^2\mathbf{I})^{-1} \mathbf{z}_a$ where $[\boldsymbol{\nu}_{a}]_r$ is the $r$-th component of the column vector $\boldsymbol{\nu}_a$ and $\mathbf{k}_r = [\kappa(\mathbf{x}_r, \mathbf{x}_1) \ldots \kappa(\mathbf{x}_r, \mathbf{x}_n)]^\top$. 

Comparing this to Eq.~\eqref{eq:2} earlier, it appears that the $a$-th column $\boldsymbol{\nu}_a$ of the attention output is the mean prediction on $\mathbf{x}_1, \mathbf{x}_2, \ldots, \mathbf{x}_n$ of a GP modeling the dataset $\{\mathbf{x}_r, [\mathbf{z}_a]_r\}_{r=1}^n$. As such, one can assess the attention uncertainty or variance of $\boldsymbol{\nu}_a$ (i.e., the $a$-th column of $\mathbf{V}^+$),
\begin{eqnarray}
\mathbb{V}\left[\boldsymbol{\nu}_a\right] &=& \mathcal{K} \ -\  \mathcal{K}\left(\mathcal{K} + \sigma^2\mathbf{I}\right)^{-1}\mathcal{K}.
\end{eqnarray}
Overall, if the output dimension of the kernel attention unit is $s$, we can equivalently represent it using $s$ independent GPs. Furthermore, we can extend the above formalism towards multi-head self-attention with GPs by concatenating the equivalent GP inferences corresponding to each head and multiplying all with the weight matrix $\mathbf{W}_o$. 

Note that this equivalence is only possible if the kernel matrix above is symmetric, which requires $\mathbf{W}_q = \mathbf{W}_k$ as explained earlier. A more recent work by~\citet{chen2023calibrating} has also extended the above to instead align with a sparse GP inference, which similarly casts the kernel attention output in terms of the sparse GP inference. Nonetheless, like the GP attention approach, the proposed sparse GP attention will still require the use of a symmetric kernel to ensure the modeling consistency of its underlying GP.
 


 
     
\iffalse
     Suppose we have training data $\{\textbf{K}, \textbf{y}\}$ where $\textbf{K}=[\textbf{k}_1,..,\textbf{k}_N]^T\in \mathbb{R}^{N\times D}$ are the keys in attention and $\textbf{y}\in \mathbb{R}^{N\times D_v}$ are the training targets. We assume that the data is generated via a latent function $f: D \mapsto D_v$,
    \begin{align*}
        \textbf{y}_i = f(\textbf{k}_i) + \boldsymbol{\epsilon}_i, \ i=0,..,N, \ \boldsymbol{\epsilon}_i \sim \mathcal{N}(\textbf{0}, \sigma^2\textbf{I}_{D_v}).
    \end{align*}
    We specify a GP prior over the function $f$ with zero mean and covariance $k(\cdot, \cdot)$. The marginal distribution of function values $\textbf{f} \in \mathbb{R}^{N \times D_v}$ over the training data $\textbf{K}$ is given by the Gaussian,
    \begin{align}
        p(\textbf{f}|\textbf{K}) = \mathcal{N}(\textbf{0}, \mathcal{K}_{\mathbf{K,K}}),
    \end{align}
    where $[\mathcal{K}_{\mathbf{K,K}}]_{i,j} = k(\textbf{k}_i, \textbf{k}_j), \ i,j=0,\ldots,N$.
    \par Using standard Bayesian regression, we can derive the posterior process at test points $(\textbf{K}^*, \textbf{f}^*)$ given the training data as follows:
    \begin{align}
        p(\textbf{f}^*|\textbf{K}^*, \textbf{K}, \textbf{y}) = \mathcal{N}(\mathcal{K}_{\textbf{K}^*\textbf{K}}\mathcal{K}_{\textbf{K}\textbf{K}}^{-1}\textbf{y}, \mathcal{K}_{\textbf{K}^*\textbf{K}^*} - \mathcal{K}_{\textbf{K}^*\textbf{K}}\mathcal{K}_{\textbf{K}\textbf{K}}^{-1}\mathcal{K}_{\textbf{K}\textbf{K}^*}).
    \end{align}
     Setting $\textbf{K}^* := \textbf{Q}\in \mathbb{R}^{N\times D}$, $\textbf{V}:=\mathcal{K}_{\textbf{K}\textbf{K}}^{-1}\textbf{y} \in \mathbb{R}^{N\times D_v}$, the posterior process becomes
    \begin{align} \label{eq: full GP post}
        p(\textbf{f}^*|\textbf{Q}, \textbf{K}, \textbf{y}) = \mathcal{N}(\mathcal{K}_{\textbf{Q}\textbf{K}}\textbf{V}, \mathcal{K}_{\textbf{K}^*\textbf{Q}} - \mathcal{K}_{\textbf{Q}\textbf{K}}\mathcal{K}_{\textbf{K}\textbf{K}}^{-1}\mathcal{K}_{\textbf{K}\textbf{Q}}) := \mathcal{N}(\textbf{m}, \boldsymbol{\Sigma})
    \end{align}
    We observe that the posterior mean of the full GP (\ref{eq: full GP post}) is equivalent to the kernel attention in (\ref{eq: kernel_attention-mat}). In the case of a MHA, we can view kernel attention as the mean of a GP posterior which is fitted to each attention head.
    After obtaining the posterior mean and covariance $\textbf{m}^h$ and $\boldsymbol{\Sigma}^h$ for each head $\textbf{h} \in \{1,\ldots,H\}$, we generate the attention output for each head using the reparameterization trick \citep{}:
    \begin{align}
        \hat{\textbf{V}}^h = \textbf{m}^h + \boldsymbol{L}^h\boldsymbol{\epsilon}, \ \ \boldsymbol{\epsilon}^h \sim \mathcal{N}(\textbf{0}, \textbf{I}).
    \end{align}
    where $\textbf{L}^h(\textbf{L}^h)^T=\boldsymbol{\Sigma}^h$ is the Cholesky factorization of $\boldsymbol{\Sigma}$.
    The final attention output is given by a linear transformation: 
    \begin{align*}
        \textbf{F} = \text{Concat}(\hat{\textbf{V}}^1,\ldots,\hat{\textbf{V}}^H)\textbf{W}_O,
    \end{align*}
    where $\textbf{W}_O \in \mathbb{R}^{D_v \times HD_v}$ is the output weight matrix.
    % \textcolor{blue}{SGPA considers variational inference => still needs to use symmetric kernel => We use standardized GP to lift this symmetry condition}
    \fi

%In practice, we note that a recent work leveraging Gaussian Process to calibrate transformer, Sparse Gaussian Process Attention (SGPA), \citep{chen2023calibrating} models the kernel attention as the posterior mean of a sparse variational Gaussian Process and use variational inference \citep{} to reduce the computational complexity of the full Gaussian Process regression (\ref{eq: full GP post}). However, SGPA still requires the use of a valid symmetric kernel for attention, leading to limited representation capacity.