\begin{table*}[t!]
    % \captionsetup{font=small} 
        \centering
        \caption{Test MCC and other uncertainty calibration metrics achieved by SGPA and our CGPT/SCGPT on the CoLA dataset under both in-distribution and out-of-distribution settings. The first three rows report results on CoLA under the in-distribution setting. The last three rows report results on CoLA under the OOD setting. For each metric, we report the mean value and its standard deviation obtained over multiple runs.}
        \vspace{-1em}
        \scriptsize
        \begin{tabularx}{0.665\linewidth}{|l l l l l l|} 
            \toprule
            \textbf{Dataset} & \textbf{Model} & \textbf{MCC $\uparrow$} & \textbf{NLL $\downarrow$} & \textbf{MCE $\downarrow$} & \textbf{ECE $\downarrow$}\\
            \midrule
            
            \multirow{3}{*}{CoLA} & SGPA & \textbf{28.826 $\pm$ 0.982} & 0.842 $\pm$ 0.045 & \textbf{0.713 $\pm$ 0.031} & 0.257 $\pm$ 0.011\\ 
            & CGPT (ours) & 26.471 $\pm$ 0.387 & \textbf{0.774 $\pm$ 0.010} & 0.725 $\pm$ 0.013 & \textbf{0.236 $\pm$ 0.004}\\
            & SCGPT (ours) & 27.686 $\pm$ 1.425 & 2.254 $\pm$ 0.204 & 0.724 $\pm$ 0.004 & {0.293 $\pm$ 0.003}\\
            \midrule

            \multirow{3}{*}{CoLA (OOD)} & SGPA & 22.500 $\pm$ 0.877 & 0.876 $\pm$ 0.053 & 0.740 $\pm$ 0.040 & 0.271 $\pm$ 0.0192\\ 
            & CGPT (ours) & \textbf{26.957 $\pm$ 0.748} & \textbf{0.749 $\pm$ 0.038} & {0.711 $\pm$ 0.002} & \textbf{0.230 $\pm$ 0.004}\\
            & SCGPT (ours) & {25.369 $\pm$ 0.452} & {2.243 $\pm$ 0.110} & \textbf{0.700 $\pm$ 0.003} & {0.306 $\pm$ 0.010}\\
            
            \bottomrule
        \end{tabularx}
        \label{tab:in-distribution}
        \vspace{-1.2em}
    \end{table*}
{\small
    \begin{table*}[t]
    % \captionsetup{font=footnotesize} 
        \centering
        \caption{Test accuracy and other calibration metrics achieved by our CGPT/SCGPT models on the CIFAR10-C dataset under the OOD setting. For each distortion category, we report the mean metrics over all distortion types. We observe that CGPT/SCGPT attains better accuracy and calibration metrics than SGPA across $14/16$ cases, indicating that CGPT/SCGPT is more robust than SGPA under distribution shift.}
        \vspace{-1em}
        \scriptsize
         % \small
        \begin{tabularx}{0.759\linewidth}{|l l l l l l l|} 
            \toprule
            \textbf{Metric} & \textbf{Model} & \textbf{Noise} & \textbf{Blur} & \textbf{Weather} & \textbf{Digital} & \textbf{Avg.}\\
            \midrule
            
            \multirow{5}{*}{Acc $\uparrow$} & SGPA & 50.803 $\pm$ 0.447 & {59.264 $\pm$ 0.915} & \textbf{64.148 $\pm$ 0.472} & \textbf{63.028 $\pm$ 0.334} & {59.722 $\pm$ 0.323}\\
            & Kernel (asym) & 53.014 $\pm$ 0.040 & \textbf{61.327 $\pm$ 1.511} & {63.426 $\pm$ 0.930} & {62.507 $\pm$ 0.847} & {60.340 $\pm$ 0.816}\\
            & Kernel (sym) & 52.675 $\pm$ 0.190 & {60.093 $\pm$ 1.846} & {62.643 $\pm$ 0.472} & {62.710 $\pm$ 0.520} & {59.884 $\pm$ 0.838}\\
            & CGPT (ours) & {55.177 $\pm$ 0.953} & 56.412 $\pm$ 1.506 & 61.515 $\pm$ 0.703 & 60.373 $\pm$ 0.123 & 58.591 $\pm$ 0.664\\
            & SCGPT (ours) & \textbf{57.701 $\pm$ 0.870} & {59.647 $\pm$ 0.925} & {63.287 $\pm$ 0.849} & {62.516 $\pm$ 0.252} & \textbf{61.746 $\pm$ 0.438}\\
            \midrule

             \multirow{5}{*}{NLL $\downarrow$} & SGPA & 3.464 $\pm$ 0.423 & 2.551 $\pm$ 0.091 & 2.137 $\pm$ 0.162 & 2.298 $\pm$ 0.045 & 2.626 $\pm$ 0.202\\ 
             & Kernel (asym) & {3.779 $\pm$ 0.604} & 2.690 $\pm$ 0.293 & {2.462 $\pm$ 0.305} & {2.673 $\pm$ 0.176} & {2.875 $\pm$ 0.384}\\
             & Kernel (sym) & 3.379 $\pm$ 0.448 & {2.435 $\pm$ 0.177} & 2.262 $\pm$ 0.283 & {2.389 $\pm$ 0.303} & 2.591 $\pm$ 0.331\\
            & CGPT (ours) & \textbf{1.688 $\pm$ 0.033} & \textbf{1.565 $\pm$ 0.068} & \textbf{1.352 $\pm$ 0.049} & \textbf{1.461 $\pm$ 0.027} & \textbf{1.516 $\pm$ 0.029}\\
            & SCGPT (ours) & 2.060 $\pm$ 0.064 & {1.835 $\pm$ 0.081} & {1.663 $\pm$ 0.046} & {1.796 $\pm$ 0.051} & {1.787 $\pm$ 0.017}\\
            \midrule

             \multirow{5}{*}{MCE $\downarrow$} & SGPA & 0.668 $\pm$ 0.009 & 0.592 $\pm$ 0.014 & 0.576 $\pm$ 0.014 & 0.575 $\pm$ 0.001 & 0.593 $\pm$ 0.002\\ 
             & Kernel (asym) & 0.512 $\pm$ 0.021 & 0.460 $\pm$ 0.016 & 0.456 $\pm$ 0.010 & 0.457 $\pm$ 0.020 & 0.470 $\pm$ 0.018\\
             & Kernel (sym) & 0.498 $\pm$ 0.014 & 0.449 $\pm$ 0.011 & 0.443 $\pm$ 0.007 & 0.437 $\pm$ 0.020 & 0.456 $\pm$ 0.024\\
            & CGPT (ours) & \textbf{0.360 $\pm$ 0.011} & \textbf{0.334 $\pm$ 0.013} & \textbf{0.284 $\pm$ 0.002} & \textbf{0.314 $\pm$ 0.003} & \textbf{0.324 $\pm$ 0.002}\\
            & SCGPT (ours) & {0.443 $\pm$ 0.018} & {0.417 $\pm$ 0.016} & {0.400 $\pm$ 0.003} & {0.419 $\pm$ 0.003} & {0.421 $\pm$ 0.004}\\
            \midrule

             \multirow{5}{*}{ECE $\downarrow$} & SGPA & 0.532 $\pm$ 0.021 & 0.488 $\pm$ 0.012 & 0.469 $\pm$ 0.003 & 0.472 $\pm$ 0.010 & 0.487 $\pm$ 0.012\\ 
             & Kernel (asym) & 0.377 $\pm$ 0.015 & 0.294 $\pm$ 0.004 & 0.275 $\pm$ 0.008 & 0.280 $\pm$ 0.011 & 0.304 $\pm$ 0.009\\
             & Kernel (sym) & 0.363 $\pm$ 0.023 & 0.285 $\pm$ 0.001 & 0.266 $\pm$ 0.012 & 0.267 $\pm$ 0.012 & 0.292 $\pm$ 0.010\\
            & CGPT (ours) & \textbf{0.226 $\pm$ 0.012} & \textbf{0.202 $\pm$ 0.007} & \textbf{0.159 $\pm$ 0.004} & \textbf{0.183 $\pm$ 0.003} & \textbf{0.192 $\pm$ 0.001}\\
            & SCGPT (ours) & {0.292 $\pm$ 0.010} & {0.259 $\pm$ 0.004} & {0.234 $\pm$ 0.005} & {0.243 $\pm$ 0.004} & {0.249 $\pm$ 0.002}\\
            \bottomrule
        \end{tabularx}
        \label{tab:OOD CIFAR}
        \vspace{-0.15in}
    \end{table*}
}
{\small
\begin{table*}[t!]
            \centering
% \captionsetup{font=small} 
            \caption{Averaged OOD detection performance achieved by SCGPT, SGPA and kernel attention over $4$ datasets (Textures, LSUNCrop, LSUNResize and TinyImageNetCrop). For each method, the average OOD performance is reported for each detector. SCGPT outperforms the baselines in most OOD detection metrics and has the best performance on average, suggesting its advantage on the OOD detection task.}
            \vspace{-1em}
            \scriptsize	
            \begin{tabularx}{\linewidth}{|X X X X X X|} 
                \toprule
                \textbf{Model} & \textbf{Detector} & \textbf{AUROC $\uparrow$} & \textbf{AUPR-IN $\uparrow$} & \textbf{AUPR-OUT $\uparrow$} & \textbf{FPR@95 $\downarrow$}\\
                \midrule

                \multirow{5}{*}{Kernel (sym)} & KLMatching & 63.80 & 60.61 & 63.70 & 87.08\\
                
                & MaxSoftmax & 69.39 & 61.00 & \textbf{75.21} & 71.68\\
                
                & Entropy & 69.82 & 62.08 & 75.35 & 71.68\\
                
                & Energy-Based & 72.83 & 62.79 & 76.85 & 65.08\\
                
                & Average & 68.21 & 61.97 & 72.03 & 73.38\\

                \midrule

                \multirow{5}{*}{Kernel (asym)} & KLMatching & 64.72 & 60.62 & 62.83 & 92.47\\
                
                & MaxSoftmax & \textbf{69.51} & \textbf{61.40} & 75.11 & \textbf{71.30}\\
                
                & Entropy & \textbf{70.01} & 62.54 & 75.39 & \textbf{70.78}\\
                
                & Energy-Based & 76.15 & 66.61 & 80.96 & 58.95\\
                
                & Average & 70.10 & 62.78 & 73.32 & 73.37\\

                \midrule

                \multirow{5}{*}{SGPA} & KLMatching & 64.82 & 60.32 & 63.75 & 90.72\\
                
                & MaxSoftmax & 68.63 & 60.76 & 74.50 & 72.62\\
                
                & Entropy & 69.16 & 61.97 & 74.74 & 72.31\\
                
                & Energy-Based & \textbf{77.78} & 62.66 & \textbf{82.46} & \textbf{58.21}\\
                
                & Average & 70.09 & 61.42 & 73.86 & 73.47\\

                \midrule

                \multirow{5}{*}{SCGPT (ours)} & KLMatching & \textbf{67.31} & \textbf{62.11} & \textbf{66.78} & \textbf{86.50}\\
                
                & MaxSoftmax & 69.18 & 61.10 & 75.13 & 71.39\\
                
                & Entropy & 69.91 & \textbf{62.82} & \textbf{75.47} & 70.89\\
                
                & Energy-Based & 77.70 & \textbf{68.56} & 82.06 & 60.09\\
                
                & Average & \textbf{70.27} & \textbf{63.40} & \textbf{74.12} & \textbf{72.47} \\
                
                % \midrule
                % \multirow{5}{*}{CGPT} & KLMatching & \textbf{66.67} & \textbf{62.18} & \textbf{64.62} & 90.26\\
                
                % & MaxSoftmax & 67.31 & 60.02 & 73.05 & \textbf{75.17}\\
                
                % & Entropy & \textbf{69.91} & \textbf{62.87} & \textbf{74.80} & \textbf{73.87}\\
                
                % & Energy-Based & 77.14 & \textbf{63.24} & 78.68 & \textbf{66.92}\\
                
                % & Average & \textbf{70.25} & \textbf{62.08} & 72.79 & \textbf{76.56} \\
                 
                \bottomrule
            \end{tabularx}
            \vspace{-0.2in}
            \label{tab:ood detection2}
        \end{table*}
}
\par In this section, we empirically study the advantages of CGPT and SCGPT in calibrating transformers on a variety of tasks, including the CoLA linguistic acceptability prediction task \citep{warstadt2019neural} and CIFAR10 classification and out-of-distribution (OOD) evaluation \citep{krizhevsky2009cifar,hendrycks2019benchmarking}. We aim to show that both CGPT and SCGPT attain calibration ability comparable to or better than that of the SGPA \citep{chen2023calibrating} and kernel attention \citep{tsai2019transformer} baselines due to their increased representation capacity, which is enabled by the use of an asymmetric kernel function. Moreover, we compare the computational efficiency of SCGPT against that of the SGPA baseline \citep{chen2023calibrating}. We adopt experimental settings similar to those of \citet{chen2023calibrating}. Additional experimental results are provided in Appendix~\ref{app:C}.
    % \begin{table*}[t]
    %     \centering
    %     \caption{ In-distribution test accuracy and other uncertainty calibration metrics of kernel attention and our CGPT model evaluated on the CIFAR10 dataset.
    %     All reported results are averaged over 3 independent runs. The results show that CGPT achieves better performance than kernel attention with symmetric kernel, supporting our claim earlier that enforcing attention kernel to be symmetric will hamper the overall performance.}
    %     \vspace{0.5em}
    %     \begin{tabularx}{\textwidth}{ |X X X X X X|} 
    %         \toprule
    %         \textbf{Dataset} & \textbf{Model} & \bf{Accuracy $\uparrow$} & \bf{NLL $\downarrow$} & \bf{MCE $\downarrow$} & \bf{ECE $\downarrow$} \\
    %         \midrule
            
    %         \multirow{2}{*}{CIFAR10} & Kernel Attn & 76.12 $\pm$ 0.10 & 1.10 $\pm$ 0.02 & 0.61 $\pm$ 0.1 & 0.51 $\pm$ 0.06 \\ 
    %         & CGPT (ours) & \textbf{76.21 $\pm$ 0.30} & \textbf{0.87 $\pm$ 0.03} & \textbf{0.27 $\pm$ 0.02} & \textbf{0.13 $\pm$ 0.05} \\

    %         \bottomrule
    %     \end{tabularx}
    %     \label{tab:cifar compare kernel}
    % \end{table*}

% \subsection{In-Distribution Calibration}
%     \par We conduct experiments for the CIFAR10 and COLA tasks and report the accuracies as well as the calibration metrics evaluated on the in-distribution test datasets.  Our results are reported in Table \ref{tab:in-distribution}, which show that our CGPT outperforms SGPA in all metrics including test accuracy on the CIFAR10 image classification task. For the CoLA dataset, CGPT outperforms SGPA in all calibration metrics, showing better calibration ability. Please refer to the first two rows of Table~\ref{tab:in-distribution}.
    
%     % We conduct experiments on image classification (CIFAR10) and linguistic acceptability (CoLA) tasks and report the in-distribution calibration test results. For predictive accuracy, we use test accuracy for CIFAR10 and Matthew correlation coefficient (MCC) for CoLA. We also use the negative log likelihood (NLL) as our calibration metric for both of the tasks.

\subsection{Experiment Settings} \label{app:D}
    Following the prior work of~\citet{chen2023calibrating}, we conduct experiments on image classification and linguistic acceptability prediction with the following setup:
    \par \textbf{Tasks.} We study the performance of CGPT and SCGPT on image classification using the CIFAR10 \citep{krizhevsky2009cifar} dataset and linguistic acceptability prediction using the CoLA dataset \citep{warstadt2019neural}. For the out-of-distribution (OOD) evaluations, we use the corrupted CIFAR10-C dataset \citep{hendrycks2019benchmarking} for image classification and the out-of-distribution data within the CoLA dataset for linguistic acceptability prediction. We also evaluate and compare the uncertainty calibration of our proposed models and other baselines in OOD detection tasks for image classification (see Section~\ref{sec: OOD detection}).
    %In Section \ref{sec: OOD detection}, we consider the OOD detection tasks for the models trained on vision tasks including our methods and other baselines to further evaluate the uncertainty calibration ability of the models.
    
    \par \textbf{General settings for all tasks.} For SGPA and kernel attention, we use the ARD-RBF kernel \citep{Rasmussen06} for the image classification tasks
    $\kappa(\mathbf{x}, \mathbf{x}') = \sigma_s^2 \exp(-0.5\sum_{i=1}^d (x_i-x'_i)^2/\sigma_i^2)$, and an exponential of scaled dot-product variant for the linguistic acceptability task, $\kappa(\mathbf{x},\mathbf{x}')=\sigma_s^2 \exp(\sum_{i=1}^d x_ix'_i/\sigma_i^2)$. Here, $\mathbf{x}$ 
    and $\mathbf{x}'$ are $d$-dimensional inputs, $\sigma_s^2$ denotes the output variance and $\{\sigma_i^2\}_{i=1}^d$ are the length scales. 
    For CGPT and SCGPT, we use the parameter-free squared exponential kernel function $\kappa_o(\mathbf{x}, \mathbf{x}') = \exp(-0.5 \|\mathbf{x} - \mathbf{x}'\|^2)$ for all tasks as the canonical representation and model the latent inputs $\mathbf{X}_o$ by a linear projection of a finite set of inputs $\mathbf{X}$ for simplicity. 
    % We estimate predictive uncertainty by using 10 Monte Carlo samples. 
    The regularization coefficient $\alpha$ in our objective function is chosen using its induced performance on a validation dataset. Our experiments are conducted on A100 40GB SMX NVIDIA GPUs.  

    \par \textbf{Baselines.} We compare the performance of our proposed models against that of SGPA~\citep{chen2023calibrating}, which leverages sparse GP to design (symmetric) attention and provide uncertainty calibration for the Transformer. Our proposed models are also compared against standard non-GP baselines with symmetric and asymmetric kernel attention \citep{tsai2019transformer}. 

\par \textbf{Architectures.} We use Vision Transformer \citep{dosovitskiy2020image} for image classification and standard transformer architecture \citep{vaswani2017attention} for linguistic acceptability prediction. We use the parameter-free squared exponential kernel for CGPT and SCGPT for both of the tasks while in SGPA, we use the ARD kernel \citep{Rasmussen06} for image classification and the exponential kernel for linguistic acceptability prediction. 
\par \textbf{Evaluation.} We study the calibration capacity of the models by evaluating their robustness under the out-of-distribution setting in Section~\ref{sec:experiments}. We also compare the out-of-distribution detection capacity of our methods against other baselines in Section~\ref{sec: OOD detection}. We report the accuracy (Acc) for the image classification tasks and the Matthews correlation coefficient (MCC) for CoLA, as well as other test calibration metrics, including negative log likelihood (NLL), expected calibration error (ECE) and maximum calibration error (MCE).

\par \textbf{CGPT- and SCGPT-specific hyperparameters.} The $\alpha$ value in our CGP objective function is linearly annealed from $0.0$ to $1.0$ during the training phase. For SCGPT, we set the inducing variable dimension $m$ to be $m=16$ in the image classification tasks, which is smaller than the sequence length $n$, in order to be more memory- and computation-efficient, as discussed in Section~\ref{sec: SCGPT}. The value of the noise $\sigma$ in SCGPT is tuned over the range from $0$ to $1$ and set to $\sigma=0.1$, as we find that this value gives the best performance for SCGPT.

    \subsubsection{Image Classification} \label{sec:details image}
    For the OOD tasks on CIFAR10-C,
    we use the corrupted datasets and the models trained on the clean datasets to evaluate the OOD performances. The CIFAR10-C dataset contains 19 types of distortions covering 4 distortion categories: Noise, Blur, Weather and Digital. For each experiment on each type of corruption, we report the mean OOD results over multiple independent runs. The corresponding standard deviations are also reported.

    \textbf{Datasets.} The original training partition of the CIFAR10 dataset is randomly split into 45,000 instances for training and 5,000 instances for validation.

    \textbf{Implementation details.} The architecture of ViT for the CIFAR10 dataset contains 5 MHSA layers. Each layer has 4 attention heads whose hidden dimension is set to 128.

    Both CGPT and SCGPT are trained with a batch size of 100 for 600 epochs. Their loss functions are minimized using ADAM~\citep{kingma2014adam} with an initial learning rate of 0.0005, which decays linearly to 0.00001. We adopt the same training scheme as~\citet{chen2023calibrating} for CGPT/SCGPT: a ViT with asymmetric kernel attention is trained for the first 200 epochs, and its parameters are used to initialize the CGPT/SCGPT model, which is then trained for the next 400 epochs using the CGPT/SCGPT loss function. For SGPA, we use the same hyper-parameter configuration as reported by~\citet{chen2023calibrating} for training.

    \textbf{Evaluation.} We choose the best model using the validation accuracy evaluated every $10$ epochs. The reported results are averaged over multiple independent runs, and the corresponding standard deviations are also reported.
    
    \subsubsection{Linguistic Acceptability} 
    For the OOD task on the CoLA dataset,
    we use the provided OOD set and the model trained on the corresponding clean dataset to evaluate the robustness of the model's performance. 
    
    \textbf{Datasets.} The CoLA dataset contains 516 OOD samples and the original (clean) training set, which is randomly split into 7,262 in-distribution training samples and 1,816 in-distribution testing samples. 
    
    \textbf{Implementation details.} The Transformer architecture for the CoLA dataset has 2 MHSA layers, each containing 4 attention heads. The hidden dimension and embedding dimension are 256 and 128, respectively. We also use ELMo-style representations~\citep{DBLP:conf/naacl/PetersNIGCLZ18} for the input embeddings, as in~\citet{chen2023calibrating}.
    
    CGPT and SCGPT are trained with a batch size of 32 for 50 epochs. Their loss functions are minimized using the ADAM optimizer with an initial learning rate of 0.0005, which decays linearly to 0.00001. For SGPA, we use the same hyper-parameter configuration as~\citet{chen2023calibrating}. We choose the noise term to be $\sigma=0.5$ for SCGPT.
    
    \textbf{Evaluation.} The performance of the model is evaluated after $50$ training epochs. The reported performance is averaged over multiple independent runs with different random seeds. The corresponding standard deviations are also reported.

\subsection{Out-of-Distribution Calibration}
\label{sec:OOD robustness}
We perform out-of-distribution (OOD) prediction under distribution perturbation on image classification (CIFAR10) and linguistic acceptability (CoLA) tasks. We use the OOD data for CoLA provided by \citet{warstadt2019neural}. For the image classification task, we use the corrupted CIFAR datasets (CIFAR10-C) \citep{hendrycks2019benchmarking} as OOD data, featuring images under different forms of distortion. 
    
    \par On the OOD CoLA dataset, Table~\ref{tab:in-distribution} shows that CGPT outperforms SGPA across all metrics evaluated and achieves the best MCC, NLL and ECE. SCGPT also outperforms SGPA on $2$ out of $4$ metrics (MCC and MCE) in the OOD setting.
    For the vision task, we observe that SCGPT achieves the best average out-of-distribution accuracy over all forms of distortion. SCGPT also has the second-best performance across all calibration metrics, falling slightly behind CGPT, while surpassing all other baseline models. Interestingly, SCGPT significantly outperforms SGPA while using a set of $16$ inducing inputs, which is half the number of inducing inputs used by SGPA. As a result, SCGPT uses less GPU memory than SGPA (see Figure~\ref{fig:memory_comp compare}) while achieving better results. CGPT also outperforms all baselines across all the calibration metrics and distortion types introduced in CIFAR10-C while preserving comparable accuracy, as shown in Table~\ref{tab:OOD CIFAR}. These results indicate that our proposed methods are more robust than SGPA and kernel attention under distribution shift in terms of both accuracy and calibration ability. 

%     \begin{table*}[t!]
%         \centering
%         \caption{Test Accuracy (MCC for CoLA) and other uncertainty calibration metrics achieved by SGPA and our CGPT on the CIFAR10 and CoLA datasets. The first two rows report results on CIFAR10 and CoLA under the in-distribution setting. The last row reports results on CoLA under the OOD setting. For each metric, we report the mean value and its standard deviation obtained over 3 independent runs. We observe that CGPT achieves better results than SGPA over the calibration metrics for the in-distribution setting. CGPT also outperforms SGPA on all metrics for the CoLA OOD setting.}
%         \vspace{0.5em}
%         \begin{tabular}{ |l l c c c c| } 
%             \toprule
%             \textbf{Dataset} & \textbf{Model} & \bf{Accuracy/MCC $\uparrow$} & \bf{NLL $\downarrow$} & \bf{MCE $\downarrow$} & \bf{ECE $\downarrow$}\\
%             \midrule
            
%             \multirow{2}{*}{CIFAR10} & SGPA & 74.85 $\pm$ 0.95 & 1.06 $\pm$ 0.08 & 0.36 $\pm$ 0.03 & 0.17 $\pm$ 0.10\\ 
%             & CGPT (ours) & \textbf{76.21 $\pm$ 0.30} & \textbf{0.87 $\pm$ 0.03} & \textbf{0.27 $\pm$ 0.02} & \textbf{0.13 $\pm$ 0.05}\\
%             \midrule
            
%             \multirow{2}{*}{CoLA} & SGPA & \textbf{26.27 $\pm$ 2.04} & 2.31 $\pm$ 0.14 & 0.57 $\pm$ 0.03 & 0.29 $\pm$ 0.01\\ 
%             & CGPT (ours) & 20.63 $\pm$ 3.3 & \textbf{1.36 $\pm$ 0.13} & \textbf{0.50 $\pm$ 0.02} & \textbf{0.25 $\pm$ 0.01}\\
%             \midrule

%             \multirow{2}{*}{CoLA (OOD)} & SGPA & 13.63 $\pm$ 4.41 & 2.67 $\pm$ 0.21 & 0.55 $\pm$ 0.02 & 0.31 $\pm$ 0.01\\ 
%             & CGPT (ours) & \textbf{17.41 $\pm$ 2.70} & \textbf{1.41 $\pm$ 0.12} & \textbf{0.49 $\pm$ 0.01} & \textbf{0.28 $\pm$ 0.01}\\
            
%             \bottomrule
%         \end{tabular}
%         \label{tab:in-distribution}
% \end{table*}



\subsection{Out-of-Distribution Detection} 
\label{sec: OOD detection}
        % \par This section evaluates and compares the performance of SCGPT against other baselines on OOD detection task for image classification. In this task, our goal is to determine if a test data point originates from the same distribution as the training data or from a different distribution \citep{hendrycks2016baseline}, \citep{yang2022openood}. 
        % For both methods, 
        In this experiment, we use the CIFAR10 dataset as the in-distribution dataset for OOD detection. 
        We choose $4$ common image datasets as our OOD datasets: Textures, LSUNCrop, LSUNResize and TinyImageNetCrop. To detect outliers, we use $4$ state-of-the-art detectors in our experiments: KLMatching \citep{hendrycks2019scaling}, Maximum Softmax Probability (MaxSoftmax) \citep{hendrycks2016baseline}, Entropy Maximization (Entropy) \citep{chan2021entropy} and Energy-Based OOD Detection (Energy-Based) \citep{liu2020energy}. We use the following standard OOD detection metrics for evaluation: (1) the area under the Receiver Operating Characteristic curve (AUROC), (2) the in-distribution and out-of-distribution areas under the Precision-Recall curve (AUPR-IN and AUPR-OUT) and (3) the false positive rate when the true positive rate is equal to 95\% (FPR@95). For each model (SCGPT, SGPA and the two kernel attention variants), we measure the OOD detection performance using the above metrics on the $4$ OOD datasets and report the mean of each metric for each of the $4$ detectors. All results are reported in Table~\ref{tab:ood detection2}, which shows that SCGPT has the best performance in $7/16$ cases ($4$ metrics $\times$ $4$ detectors), while each of the baselines (SGPA and the two variants of kernel attention) has the best performance in no more than $5/16$ cases. Furthermore, on average, SCGPT also outperforms all other baselines across all metrics, showing the best (averaged) quality of uncertainty estimates.
        
        %again outperforms SGPA and both kernel attention variants in terms of OOD performance with respect to most of the detectors and metrics. On average, SCGPT outperforms all baselines across all metrics, showing the best quality of uncertainty estimates. 

% \label{sec:oversmooth}
\subsection{Reducing Oversmoothing} Oversmoothing \citep{shi2022revisiting} occurs when the outputs of transformers converge to a low-rank sub-space as the number of attention blocks increases, limiting the expressivity of the models. 
            % As such, low values of the cosine similarity generally means high diversity among representations, which in turn suggests less risk of oversmoothing and vice versa. Thus, transformer methods with low cosine similarities of their attention outputs are more preferable.
Fig.~\ref{fig:over_cifar10} in Appendix~\ref{sec appendix: OVSMT} demonstrates that our CGPT and SCGPT help alleviate oversmoothing in transformers while the SGPA and kernel attention baselines do not. This is demonstrated by comparing the cosine similarities between the outputs of the attention blocks, i.e., the greater the cosine similarities, the more severe the oversmoothing.
            % Specifically, we measure the similarities of the learned representations after the attention calculation after each attention block. 

            % In this section, we will further show that CGPT interestingly has less risk regarding oversmoothing than the SGPA baseline. This is demonstrated via measuring and comparing the corresponding representational similarities between the output of their attention blocks, as described above.
            % Specifically, we measure the similarities of the learned representations after the attention calculation after each attention block. 
            
            % This is visually demonstrated in Fig.~\ref{fig:over_cifar10}, which shows that as the number of attention blocks increases, the cosine similarities between the representations learned with SGPA become gradually higher. This implies that these representations will become more similar with each other as the models get deeper. On the contrary, the learned representations of CGPT have much lower cosine similarity as the model depth increases, which implies that CGPT will suffer less from oversmoothing than the kernel attention methods.
        
            % \begin{figure}[t!]
            %     \centering
            %     \captionsetup{font=small} 
            %     \includegraphics[scale=0.3]{gptransformers/images/OVSMT2.pdf}
            %     \vspace{-1em}
            %     \caption{The cosine similarity between the token representations
            %     vs. the layer index of CGPT and SGPA on CIFAR10. CGPT is much less vulnerable to oversmoothing compared to SGPA.}
            %     \label{fig:over_cifar10}
            %     \vspace{-0.2in}
            % \end{figure}
\subsection{Efficiency Analysis} 
\label{sec: memory}
{\small
\begin{figure}[!t]
    \centering
    \captionsetup{font=small} 
    \includegraphics[width=0.9\linewidth]{gptransformers/images/Efficiency.pdf}
    \vspace{-0.5em}
    \caption{Runtime per training epoch (right) and GPU memory allocated during training (left) of SCGPT and SGPA on CIFAR10. SCGPT is more efficient than SGPA in terms of GPU memory usage while having a comparable runtime per epoch to SGPA. }
    \label{fig:memory_comp compare}
    \vspace{-0.2in}
\end{figure}
}
This section compares the processing and memory costs incurred by SCGPT and SGPA during training. Since both methods utilize sparse GPs, we report their average processing time per training epoch and GPU memory usage with respect to the number of inducing inputs used in their sparse approximation. For CIFAR10, the sequence length is $64$. So, we report the processing time and memory consumption of both methods with respect to using $8$, $16$ and $32$ inducing inputs. Figure~\ref{fig:memory_comp compare} indicates that SCGPT incurs less memory cost
than SGPA while still preserving a processing time comparable to that of SGPA. In particular, the GPU memory cost incurred by SCGPT is lower than that of SGPA in all the above settings, and the processing times per epoch of the two methods are comparable to each other. This suggests that SCGPT scales better to larger tasks than SGPA. 
% and shows the promising scalability ability of the CGPT framework.
        
        