% Report runtime (done), 
% Loss formula (appendix)
% Add organization and notation of paper to intro
% Compare CIFAR with kernel attention (done) 
% Compare COLA (appendix)
% Draw architecture
% related work
% Limitation in conclusion

% \textcolor{red}{
% \subsection{Runtime and Memory}
%     \par This section compares the runtime and memory of training our CGPT with those of training SGPA. In particular, we report the average runtime per training epoch and the GPU memory allocated in the training phase for both tasks. The results reported in Table~\ref{tab:runtime mem} indicate that our CGPT is more efficient than SGPA in runtime and memory. Concretely, on the CoLA task, the runtime per epoch of CGPT is slightly lower than SGPA's while the GPU memory allocated by both methods is comparable. However,
%     we observe that for the larger-scale CIFAR10 task, the average runtime per training epoch of CGPT is significantly lower than that of SGPA while CGPT allocates much less GPU memory than SGPA does. This result implies that CGPT scales better to larger tasks than the baseline SGPA.
%    \begin{table*}[!t]
%         \centering
%         \caption{Average runtime(s) per training epoch and GPU memory allocated during training of CGPT and SGPA on CIFAR10 and CoLA. Our results show that CGPT is more efficient than SGPA in terms of runtime and GPU memory requirement.}
%         \vspace{0.5em}
%         \begin{tabularx}{\textwidth}{ |X X X X|} 
%             \toprule
%             \textbf{Dataset} & \textbf{Model} & \textbf{Runtime(s)/epoch} & \textbf{GPU memory (MB)} \\
%             \midrule    
%             \multirow{2}{*}{CIFAR10} & SGPA & 107.48 & 10013 \\ 
%             & CGPT (ours) & \textbf{58.72} & \textbf{7197} \\
%             \midrule         
%              \multirow{2}{*}{CoLA} & SGPA & 16.52 & \textbf{3711} \\ 
%             & CGPT (ours) & \textbf{14.75} & 3715 \\
%             \bottomrule
%         \end{tabularx}
%         \label{tab:runtime mem}
%     \end{table*}
% }

% \subsection{Effect of the choice of kernel function}

\textcolor{blue}{
    Long: I think we can move Appendix B.4 here instead
}

\subsection{Comparison with non-GP methods}
    \par In this section, we compare the performance of CGPT with non-GP methods on the CIFAR10 image classification task. The comparison results on the CoLA dataset are provided in the Appendix. Specifically, for the CIFAR10 dataset, we compare the in-distribution performance of CGPT with that of kernel attention \citep{tsai2019transformer}, which uses a valid symmetric ARD kernel and is trained under the same experimental settings provided in the Appendix. We report the in-distribution test accuracy, NLL, MCE and ECE for both models in Table~\ref{tab:cifar compare kernel}, which shows that CGPT outperforms kernel attention in test accuracy as well as in all calibration metrics. This result suggests that CGPT with an asymmetric kernel indeed has better representation capacity than symmetric kernel attention, while being more suitable for uncertainty calibration, which is a desirable property inherited from the GP representation.\vspace{-3mm}

    
