\documentclass[accepted, table]{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
\externaldocument{uai2023-template}

\usepackage{times}
\usepackage{epsfig}
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{chngpage}
\usepackage{mathtools}
\usepackage{caption}
\captionsetup[table]{skip=10pt}
% \usepackage[table,xcdraw]{xcolor}
%\usepackage[section]{placeins}
\usepackage{multirow}
\usepackage{listings}
\usepackage{subcaption}
\usepackage{sidecap}

\usepackage[ruled,vlined]{algorithm2e}
% Colors used for syntax highlighting in the code listings.
\definecolor{textblue}{rgb}{0.2,0.2,0.7}   % keywords
\definecolor{textred}{rgb}{0.54,0,0}       % comments
\definecolor{textgreen}{rgb}{0,0.43,0}     % string literals
\usepackage{listings}
% Global defaults for every lstlisting environment in this document.
\lstset{%
  % Language and fonts
  language=Python,
  basicstyle=\ttfamily,
  keywordstyle=\color{textblue},
  commentstyle=\color{textred},
  stringstyle=\color{textgreen},
  % Line numbers: tiny, on the left, on every line
  numbers=left,
  numberstyle=\tiny,
  stepnumber=1,
  numbersep=5pt,
  % Layout and whitespace handling
  tabsize=4,
  frame=none,
  columns=fullflexible,
  keepspaces=true,
  xleftmargin=\parindent,
  showstringspaces=false%
}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[sep]{a}{b}: typesets the two mandatory arguments in reverse order
% (second, then first) with the optional separator between them; the
% separator defaults to "-", so \swap{a}{b} yields "b-a".
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{ViBid: Linear Vision Transformer with Bidirectional Normalization\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1,*]{Jeonggeun Song}
\author[1,*]{\href{mailto:<andrew.com@kakaoenterprise.com>?Subject=ViBid}{Heung-Chang Lee}{}}
% Add affiliations after the authors
\affil[1]{%
    AI Lab \& Service\\
    Kakao Enterprise\\
    Seongnam-si, South Korea
}
\affil[*]{%
    Equal Contributions
}
  
  \begin{document}
  
\onecolumn
\maketitle

% Training hyperparameters per ViBid variant. A value in parentheses is the
% setting used during fine-tuning; \multicolumn cells hold values shared by
% several variants.
% NOTE(review): \rowcolor is provided by colortbl, presumably pulled in through
% the `table` option on the \documentclass line (the explicit xcolor load in
% the preamble is commented out) -- confirm if the class options change.
\begin{table}[ht]
\small
\caption{\textbf{Hyperparameter settings for our various models on the ImageNet1k dataset.} Values in parentheses ``( )'' are the values used in fine-tuning.}
\centering
\begin{tabular}{l|c|c|c|c|c}
\hline
\rowcolor[HTML]{EFEFEF}
Hyperparameter & ViBid-U & ViBid-T & ViBid-S & ViBid-M & ViBid-B \\
\hline
Learning rate & \multicolumn{3}{c|}{5e-5} & \multicolumn{2}{c}{4e-5 (0.01)} \\
\hline
Warm-up LR & \multicolumn{5}{c}{1e-6 (None)} \\
\hline
Batch size & \multicolumn{5}{c}{4096 (4096)} \\
\hline
Optimizer & \multicolumn{5}{c}{AdamW (SGD)} \\
\hline
LR scheduler & \multicolumn{5}{c}{Cosine (Cosine)} \\
\hline
Gradient clip & \multicolumn{5}{c}{0.5 (0.5)} \\
\hline
Stochastic depth & 0.0 & 0.05 & 0.1 & 0.15 (0.15) & 0.25 (0.25) \\
\hline
Warm-up epochs & \multicolumn{5}{c}{5 (0)} \\
\hline
RandAugment & \multicolumn{3}{c|}{2, 7} & 2, 9 (2, 9) & 2, 12 (2, 12) \\
\hline
Label smoothing & \multicolumn{5}{c}{0.1 (0.1)} \\
\hline
Train epochs & \multicolumn{5}{c}{400 (10)} \\
\hline
Weight decay & \multicolumn{5}{c}{0.05 (0.0)} \\
\hline
\end{tabular}
\label{tb:hyp}
\end{table}

\begin{figure}[ht]
\centering
  \includegraphics[width=0.6\linewidth]{figures/acc/binorm_epoch_training.pdf}
  \caption{\textbf{Effect of the presence or absence of BiNorm on early-epoch training, using ViBid-M.} See Section~3 of the main paper for details.}
  \label{fig:binorm_training}
\end{figure}


% Python-style pseudo-code of BiNorm-based attention, typeset with listings
% inside an algorithm2e float. Everything between \begin{lstlisting} and
% \end{lstlisting} is printed verbatim, so explanatory notes live here:
%   * after the reshape/permute, qkv has shape (3, b, heads, n, h / heads),
%     so q, k and v are its three leading slices;
%   * k^T v is (b, heads, h/heads, h/heads) and q @ (k^T v) is
%     (b, heads, n, h/heads); the head axis must be swapped back next to the
%     channel axis with transpose(1, 2) before flattening to (b, n, h), as in
%     the commented-out original self-attention line.
\begin{algorithm*}[ht]
\begin{small}
\DontPrintSemicolon
\begin{lstlisting}
    def attend(self, x):
        b, n, d = x.shape
        qkv = self.qkv_proj(x)
        h = qkv.shape[-1] // 3
        qkv = qkv.reshape(b, n, 3, self.num_heads, h // self.num_heads)
        qkv = qkv.permute(2, 0, 3, 1, 4)
        
        q = qkv[0]
        k = qkv[1]
        v = qkv[2]
        
        # we commented the lines of the original SA
        # output = (q @ k.transpose(-2, -1)) * self.scale
        # output = output.softmax(dim=-1)
        output = k.transpose(-2, -1) @ v
        output = normalize(output, dim=-2)
        q = normalize(q, dim=-1)  # BiNorm
        
        # output = (output @ v).transpose(1, 2).reshape(b, n, h)
        output = (q @ output).transpose(1, 2).reshape(b, n, h)
        output = self.proj(output)
        return output
\end{lstlisting}
\caption{Python style pseudo-code of BiNorm-based attention.}
\label{alg:binorm}
\end{small}
\end{algorithm*}


\begin{figure}[ht]
  \centering
  \includegraphics[width=0.7\linewidth]{figures/visualized_attention.pdf}
  \caption{\textbf{Visualized attentions.} Visualization of attention matrices using a pseudo-inverse scheme. These matrices are extracted from the class-attention module of a pretrained ViBid-S.}
  \label{fig:vis_attn}
\end{figure}

% \begin{figure}[t!]
% \centering
%     \begin{subfigure}[b]{0.32026144\textwidth}
%       \centerline{\includegraphics[width=\linewidth]{figures/overall_workflow/self_attention.pdf}}
%       \caption{Self-attention.}
%       \label{fig:comp_att_sa}
%     \end{subfigure}
%     \begin{subfigure}[b]{0.30065359\textwidth}
%       \centerline{\includegraphics[width=\linewidth]{figures/overall_workflow/window_attention.pdf}}
%       \caption{Window attention.}
%       \label{fig:comp_att_wa}
%     \end{subfigure}
%     \begin{subfigure}[b]{0.29738562\textwidth}
%       \centerline{\includegraphics[width=\linewidth]{figures/overall_workflow/pattern_attention.pdf}}
%       \caption{Pattern-based attention.}
%       \label{fig:comp_att_pa}
%     \end{subfigure}\\
%     \medskip
%     \begin{subfigure}[b]{0.42892157\textwidth}
%       \centerline{\includegraphics[width=\linewidth]{figures/overall_workflow/kernel_attention.pdf}}
%       \caption{Kernel-based attention.}
%       \label{fig:comp_att_ka}
%     \end{subfigure}\hspace{0.03\textwidth}
%     \begin{subfigure}[b]{0.46650327\textwidth}
%       \centerline{\includegraphics[width=\linewidth]{figures/overall_workflow/binorm_attention.pdf}}
%       \caption{BiNorm-based attention. (Ours)}
%       \label{fig:comp_att_binorm}
%     \end{subfigure}
% \smallskip
% \caption{\textbf{Comparison with the existing self-attention algorithms.} (a) \textbf{Self-attention.} The query and key generates $N\times N$ attention maps. It is $O(N^2)$ operation and causes computational inefficiency. (b) \textbf{Window attention.} It splits the input images to several number of windows before it computes self-attention. Since the window size is fixed, it avoids $O(N^2)$ problem of self-attention. However, its computational graph is complicated and it is not optimized to most of frameworks. (c)(d) \textbf{Efficient self-attention.} They choose fixed number of tokens to generate attention maps. Those procedures are too heuristic and complicated. In addition, they often requires the custom operations which is not supported in most of frameworks. (e) \textbf{Ours.} BiNorm-based self-attention is not much complicated as well as it has $O(N)$ complexity. No other process is needed to operate, so it is simple and fast.}
% \label{fig:comparison_attention}
% \end{figure}

\end{document}

