% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{zref-xr}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{hyperref}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage[textsize=tiny]{todonotes}
\usepackage{multirow}
\usepackage{microtype}
\usepackage{subfigure}
\usepackage{booktabs}
\usepackage{dsfont}
% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
\externaldocument{uai2023-template}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Mitigating Transformer Overconfidence via Lipschitz Regularization\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1,4]{\href{mailto:<wenqian@virginia.edu>?Subject=Your UAI 2023 paper}{Wenqian Ye}{}}
\author[2,4]{Yunsheng Ma}
\author[3,4]{\href{mailto:<xucao2@illinois.edu>?Subject=Your UAI 2023 paper}{Xu Cao}}
\author[5]{Kun Tang}
% Add affiliations after the authors
\affil[1]{%
    Department of Computer Science, University of Virginia, Charlottesville, VA, USA
}
\affil[2]{%
   College of Engineering, Purdue University, West Lafayette, IN, USA
}
\affil[3]{%
    Department of Computer Science, University of Illinois Urbana-Champaign, Urbana, IL, USA
  }
\affil[4]{%
    AI Lab, Shenzhen Children’s Hospital, Shenzhen, China
  }  
\affil[5]{%
    T Lab, Tencent, Beijing, China
  }  
  
  \begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle



\appendix

\section{Proof for the Lipschitz Constant of LayerNorm}

The LayerNorm operation~\citep{layernorm} used in LRFormer can be expressed as:
\begin{align*}
    \text{LN}(\mathbf{x}) &= \frac{\mathbf{x}-\mu(\mathbf{x})}{\sqrt{\sigma^2(\mathbf{x}) + \epsilon}} * \boldsymbol\gamma + \boldsymbol\beta 
\end{align*}
where $\mathbf{x}, \boldsymbol\beta, \boldsymbol\gamma \in \mathbb{R}^N$, $\mu(\mathbf{x}) = \frac{1}{N} \sum_{i=1}^N x_i$, $\sigma^2(\mathbf{x}) = \frac{1}{N}\sum_{i=1}^N (x_i - \mu(\mathbf{x}))^2$. 

WLOG, assume $N > 2$ and not all $x_i$ are equal.

The derivatives of $\mu$ and $\sigma^2$ w.r.t.\ $\mathbf{x}$ are:
\begin{gather*}
    \frac{\partial \mu}{\partial \mathbf{x}} = \frac{1}{N} \mathds{1}^\top, \\
    \frac{\partial \sigma^2}{\partial \mathbf{x}} = \frac{2}{N}(\mathbf{x} - \mu)^\top.
\end{gather*}

The derivative of $\text{LN}(\mathbf{x})_i$, the $i$th element of $\text{LN}(\mathbf{x})$, with respect to $\mathbf{x}$ is:

\begin{align}
\begin{split}
    \frac{\partial \text{LN}(\mathbf{x})_i}{\partial \mathbf{x}}
    &= \gamma_i (\sigma^2 + \epsilon)^{-\frac{1}{2}} \bigg[(\mathbf{e}_i - \frac{1}{N}\mathds{1})^\top - \frac{1}{N} (\sigma^2 + \epsilon)^{-1} (x_i - \mu)(\mathbf{x} - \mu)^\top \bigg].
\end{split}
\end{align}

where $\mathbf{e}_i \in \mathbb{R}^N$ is a one-hot vector with $1$ at the $i$th element.
Therefore,
\begin{align*}
    \frac{\partial \text{LN}(\mathbf{x})}{\partial \mathbf{x}} &= (\sigma^2 + \epsilon)^{-\frac{1}{2}} \bigg[ \text{diag}(\boldsymbol\gamma) - \frac{1}{N}\boldsymbol\gamma \mathds{1}^\top - \frac{1}{N} (\sigma^2 + \epsilon)^{-1}\text{diag}(\boldsymbol\gamma)(\mathbf{x} - \mu)(\mathbf{x} - \mu)^\top \bigg].
\end{align*}


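Row $i$ of $\text{diag}(\boldsymbol\gamma) - \frac{1}{N}\boldsymbol\gamma \mathds{1}^\top$ has the diagonal entry $\gamma_i \big(1 - \frac{1}{N}\big)$ and $N-1$ off-diagonal entries equal to $-\frac{\gamma_i}{N}$, so its absolute row sum is $\frac{2(N-1)}{N}|\gamma_i|$. Hence
\begin{equation} \label{eq:first_terms_inf_norm}
    \left\Vert \text{diag}(\boldsymbol\gamma) - \frac{1}{N}\boldsymbol\gamma \mathds{1}^\top \right\Vert_{\infty} = \frac{2(N-1)}{N}\max_i |\gamma_i|.
\end{equation}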

Taking the infinity-norm of both sides and using Eq.~\eqref{eq:first_terms_inf_norm}, we have:
\begin{align*}
    \left\Vert \frac{\partial \text{LN}(\mathbf{x})}{\partial \mathbf{x}} \right\Vert_{\infty} &= (\sigma^2 + \epsilon)^{-\frac{1}{2}} \left\Vert   \text{diag}(\boldsymbol\gamma) - \frac{1}{N}\boldsymbol\gamma \mathds{1}^\top - \frac{1}{N} (\sigma^2 + \epsilon)^{-1}\text{diag}(\boldsymbol\gamma)(\mathbf{x} - \mu)(\mathbf{x} - \mu)^\top \right\Vert_{\infty} \\
    &\leq \epsilon^{-\frac{1}{2}} \bigg( \frac{2(N-1)}{N}\max_i |\gamma_i| + \frac{1}{N} \max_i |\gamma_i| N(N-2) \bigg) \\
    &\leq \epsilon^{-\frac{1}{2}} \max_i |\gamma_i| N.
\end{align*}

\section{Proof for the Lipschitz Constant of LRSA}
The pair-wise LRSA function is expressed as:
\begin{equation}
\label{LRSA}
    \begin{aligned}
    S_{ij} = -\frac{\alpha \left\Vert x_i^\top W_Q - x_j^\top W_K \right\Vert_2^2}{\left\Vert Q \right\Vert _F \left\Vert X^\top \right\Vert _{(\infty, 2)}}
\end{aligned}
\end{equation}

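The attention weights are obtained by applying the softmax to each row of $S$:
\begin{align*}
P_i = \operatorname{softmax}\big(S_i(X)\big),
\end{align*}
that is,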


\begin{align*}
P_{ij} = \frac{e^{S_{ij}}}{\sum_{t=1}^n  e^{S_{it}} } \leq 1
\end{align*}

To take the derivative of $P_{ij}$ with respect to $S_{it}$, there are two cases.

When $t = j$:
\begin{equation}
    \begin{aligned}
\frac{\partial P_{ij}}{\partial S_{it}} &= \frac{\partial P_{ij}}{\partial S_{ij}} = 
\frac{\partial}{\partial S_{ij}}\bigg(\frac{e^{S_{ij}}}{\sum_{s=1}^n  e^{S_{is}}}\bigg) 
=
\frac{e^{S_{ij}}(\sum_{s=1}^n  e^{S_{is}}) - (e^{S_{ij}})^2 }{(\sum_{s=1}^n  e^{S_{is}})^2} \\ &= 
\frac{e^{S_{ij}}}{\sum_{s=1}^n  e^{S_{is}}}\bigg(1-\frac{e^{S_{ij}}}{\sum_{s=1}^n  e^{S_{is}}}\bigg) = P_{ij}(1- P_{ij})
\end{aligned}
\end{equation}

When $t \neq j$:
\begin{align*}
    \frac{\partial P_{ij}}{\partial S_{it}} = 
\frac{\partial}{\partial S_{it}}\bigg(\frac{e^{S_{ij}}}{\sum_{s=1}^n  e^{S_{is}}}\bigg) =
-\frac{e^{S_{ij}}}{\sum_{s=1}^n  e^{S_{is}}}\frac{e^{S_{it}}}{\sum_{s=1}^n  e^{S_{is}}} = -P_{ij}P_{it}
\end{align*}

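Combining the two cases with the chain rule, the derivative of $P_{ij}$ with respect to an input token $x_k$ is:
\begin{equation}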
    \begin{aligned}
    \frac{\partial P_{ij}}{\partial x_k}
    = \sum_{t=1}^n \frac{\partial P_{ij}}{\partial S_{it}} \frac{\partial S_{it}}{\partial x_k}
    = P_{ij}(1-P_{ij})\frac{\partial S_{ij}}{\partial x_k} - \sum_{t = 1, t \neq j}^nP_{ij}P_{it} \frac{\partial S_{it}}{\partial x_k}
    = P_{ij}\frac{\partial S_{ij}}{\partial x_k} - P_{ij}\sum_{t = 1} ^ n P_{it} \frac{\partial S_{it}}{\partial x_k}
\end{aligned}
\end{equation}

Taking the infinity-norm of $\partial S_{ij} / \partial x_k$ (the same bound holds with any index $t$ in place of $j$), we get:
\begin{align*}
\left\Vert \frac{\partial S_{ij}}{\partial x_k} \right\Vert _\infty & = \left\Vert \frac{\partial} {\partial x_k} \bigg( -\frac{\alpha \left\Vert x_i^\top W_Q - x_j^\top W_K \right\Vert_2^2}{\left\Vert Q \right\Vert _F \left\Vert X^\top \right\Vert _{(\infty, 2)}} \bigg) \right\Vert _\infty \\
& = \left\Vert -\frac{2 \alpha \left\Vert x_i^\top W_Q - x_j^\top W_K \right\Vert_2}{\left\Vert Q \right\Vert _F \left\Vert X^\top \right\Vert _{(\infty, 2)}}
\frac{\partial \left\Vert x_i^\top W_Q - x_j^\top W_K \right\Vert_2}{\partial x_k} + \frac{\alpha \left\Vert x_i^\top W_Q - x_j^\top W_K \right\Vert_2^2}{\left\Vert Q \right\Vert _F \left\Vert X^\top \right\Vert _{(\infty, 2)}^2}\frac{\partial \left\Vert X^\top \right\Vert _{(\infty, 2)}}{\partial x_k} \right\Vert _\infty \\
&\leq\left\Vert  \frac{2 \alpha \left\Vert x_i^\top W_Q - x_j^\top W_K \right\Vert_2}{\left\Vert Q \right\Vert _F \left\Vert X^\top \right\Vert _{(\infty, 2)}}
\frac{\partial \left\Vert x_i^\top W_Q - x_j^\top W_K \right\Vert_2}{\partial x_k}  \right\Vert _\infty
+ \left\Vert \frac{\alpha \left\Vert x_i^\top W_Q - x_j^\top W_K \right\Vert_2^2}{\left\Vert Q \right\Vert _F \left\Vert X^\top \right\Vert _{(\infty, 2)}^2}\frac{\partial \left\Vert X^\top \right\Vert _{(\infty, 2)}}{\partial x_k} \right\Vert _\infty \\
&\leq \frac{2\alpha}{\left\Vert Q \right\Vert _F} \frac{\left\Vert x_i^\top W_Q \right\Vert _2 + \left\Vert x_j^\top W_K \right\Vert_2}{\left\Vert X^\top \right\Vert _{(\infty, 2)}} 
\bigg( \frac{\partial \left\Vert x_i^\top W_Q \right\Vert_2}{\partial x_k} + \frac{\partial \left\Vert x_j^\top W_K \right\Vert_2}{\partial x_k} \bigg) + \frac{\alpha}{\left\Vert Q \right\Vert _F} 
\bigg( \frac{\left\Vert x_i^\top W_Q \right\Vert _2 + \left\Vert x_j^\top W_K \right\Vert_2}{\left\Vert X^\top \right\Vert _{(\infty, 2)}} \bigg) ^2 \\
&\leq \frac {2 \alpha (\left\Vert W_Q\right\Vert_2 + \left\Vert W_K \right\Vert_2)^2 }{\left\Vert Q \right\Vert _F} + \frac {\alpha (\left\Vert W_Q\right\Vert_2 + \left\Vert W_K \right\Vert_2)^2 }{\left\Vert Q \right\Vert _F} \\
& = \frac {3 \alpha (\left\Vert W_Q\right\Vert_2 + \left\Vert W_K \right\Vert_2)^2 }{\left\Vert Q \right\Vert _F}
\end{align*}

Thus,
\begin{align*}
\left\Vert \frac{\partial P_{ij}}{\partial x_k} \right\Vert _\infty &= 
\left\Vert P_{ij}\frac{\partial S_{ij}}{\partial x_k} - P_{ij} \sum_{t = 1} ^ n P_{it} \frac{\partial S_{it}}{\partial x_k} \right\Vert _\infty \leq P_{ij} \frac {3 \alpha (\left\Vert W_Q\right\Vert_2 + \left\Vert W_K \right\Vert_2)^2 }{\left\Vert Q \right\Vert _F} + P_{ij}\sum_{t=1}^n P_{it} \frac {3 \alpha (\left\Vert W_Q\right\Vert_2 + \left\Vert W_K \right\Vert_2)^2 }{\left\Vert Q \right\Vert _F} \\
&\leq \frac {6 \alpha (\left\Vert W_Q\right\Vert_2 + \left\Vert W_K \right\Vert_2)^2 }{\left\Vert Q \right\Vert _F} \leq \frac{6 \alpha}{\left\Vert X \right\Vert _F} \cdot \frac { (\left\Vert W_Q\right\Vert_2 + \left\Vert W_K \right\Vert_2)^2 }{\left\Vert W_Q \right\Vert _F } 
\end{align*}

% \begin{equation}
% S_{ij} = -\frac{\alpha}{\left\Vert Q \right\Vert ^F \left\Vert X^\top \right\Vert _{(\infty, 2)}}
% (x_i^\top W_Q - x_j^\top W_K)(x_i^\top W_Q - x_j^\top W_K)^\top
% \end{equation}
\section{Gaussian Process Layer}

As an optional module in LRFormer, a Gaussian Process (GP) layer with an RBF kernel, following SNGP~\citep{Liu2020SimpleAP}, is capable of preserving the distance awareness between an input test sample and previously seen training data. This approach ensures that the model returns a uniform distribution over output labels when the input sample is OOD.

To make it end-to-end trainable, the Gaussian Process layer can be implemented as a two-layer network:
\begin{equation}
    \operatorname{logits}(x)=\Phi(x) \beta, \quad \Phi(x)=\sqrt{\frac{2}{M}} * \cos (W x+b)
\end{equation}

Here, $x$ is the input, and $W$ and $b$ are frozen weights initialized randomly from Gaussian and uniform distributions, respectively. $\Phi(x)$ denotes the Random Fourier Features (RFF) \citep{williams2006gaussian}. $\beta$ is the learnable kernel weight, similar to that of a Dense layer. The layer outputs the class prediction $\operatorname{logits}(x) \in \mathbb{R}^{\operatorname{NumClasses}}$.

\section{Experimental Details}

In Table~\ref{tab:supp-exp-details}, we provide the training details used for reproducing the main results in the tables of the main paper. The $Depth=12$ (pretraining) column is the experimental setup for pretraining on the ImageNet-1K dataset. The other hyperparameters follow the same settings as DeiT III~\citep{Touvron2022ThreeTE}.

\begin{table}[!h]
    \centering
    \caption{Hyperparameters for LRFormer Training.} \label{tab:supp-exp-details}
    \begin{tabular}{lrrr}
      \toprule % from booktabs package
      \bfseries Hyperparameters & \bfseries $Depth=6$ & \bfseries $Depth=12$ & {\bfseries $Depth=12$} (pretraining) \\
      \midrule % from booktabs package
      Layer depth & 6 & 12 & 12 \\
      Input size & $224\times 224$ & $224\times 224$ & $224\times 224$ \\
      Batch size & 128 & 32 & 32 \\
      Warm-up steps & 5 & 5 & 5 \\
      Optimizer & SGD & AdamW & AdamW \\
      Learning rate & 0.01 & 0.006 & 0.004 \\
      Weight decay & 0.05 & 0.05 & 0.05 \\
      Learning rate scheduler & cosine & cosine & cosine \\
      Training epochs & 100 & 100 & 100 \\
      \bottomrule % from booktabs package
    \end{tabular}
\end{table}


\bibliography{ye_722}
\end{document}
