% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Authors added:
\usepackage{graphicx}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{xcolor}
\usepackage[notextcomp]{stix}
\usepackage{amsmath} 
\usepackage{tabularx,colortbl}
\usepackage{multirow}
\usepackage{multicol}

\def\UrlBreaks{\do\/\do-}

\usetikzlibrary{arrows}

% For references in other .tex files (main.tex)
\usepackage{xr-hyper}

% \addFileDependency{<file name with extension>}
%   Writes "(<file>)" to the terminal/log, hands <file> to the kernel's
%   \@addtofilelist, and, if LaTeX cannot find the file, additionally writes
%   "No file <file>.".
%   NOTE(review): presumably these messages exist so that log-scanning build
%   tools (e.g. latexmk) pick the file up as a dependency -- confirm.
%   Every line end inside the body is commented out so that the macro does
%   not expand to stray space tokens, wherever it is called.
\makeatletter
\newcommand*{\addFileDependency}[1]{%
  \typeout{(#1)}%
  \@addtofilelist{#1}%
  \IfFileExists{#1}{}{\typeout{No file #1.}}%
}
\makeatother

% \myexternaldocument{<basename>}
%   Calls \externaldocument{<basename>} (xr-hyper) so that labels of the
%   other document can be referenced here, then registers <basename>.tex
%   and <basename>.aux via \addFileDependency.
\newcommand*{\myexternaldocument}[1]{%
  \externaldocument{#1}%
  \addFileDependency{#1.tex}\addFileDependency{#1.aux}%
}

\myexternaldocument{samarin_622}
%%%%%%%%%%%%%%%

\usepackage{hyperref}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>} expands to <b><sep><a>; <sep> defaults to "-".
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Feature Learning and Random Features in Standard Finite-Width Convolutional Neural Networks: An Empirical Study\\(Supplementary Material)}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<maxim.samarin@unibas.ch>?Subject=Your UAI 2022 paper}{Maxim~Samarin}}
\author[1]{\href{mailto:<volker.roth@unibas.ch>?Subject=Your UAI 2022 paper}{Volker~Roth}}
\author[1]{\href{mailto:<david.belius@unibas.ch>?Subject=Your UAI 2022 paper}{David~Belius}}
% Add affiliations after the authors

\affil[1]{%
      Department of Mathematics and Computer Science\\
      University of Basel, Switzerland
}
 

\begin{document}
\maketitle

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\appendix
\section{Supplementary Material}

\renewcommand\thefigure{A.\arabic{figure}}
\renewcommand\thetable{A.\arabic{table}}

\subsection{Snakes Dataset}
\label{sec:appendix_snakes_dataset}
%
For a challenging classification task, we chose a subset of ImageNet 2012 \citep{ImageNet_2015} comprising ten snake categories, illustrated in Fig.~\ref{fig:appendix_snake_overview}. The extracted dataset contains 1300 training and 50 test images per class, resulting in 13000 training and 500 test images in total. As a performance benchmark, we evaluate a standard pre-trained AlexNet on this dataset, achieving $47.6\%$ test accuracy. Training our implementation of AlexNet with cross-entropy loss on the snakes dataset provides $98.5\%$ train and $51.4\%$ test accuracy (single run). In our experiments, we used a mean squared error loss, which in comparison led to $99.1\%$ train and $53.8\%$ test accuracy (single run). These results indicate that both loss functions lead to comparable performance and outperform a standard pre-trained AlexNet (trained on full ImageNet) with respect to generalization.
%
% Overview image of the ten snake classes used as the snakes dataset.
\begin{figure}[ht]
    \centering
    \includegraphics[width=\linewidth]{Images/snake_dataset_1example.pdf}
    \caption{Ten snake categories from ImageNet.}
    \label{fig:appendix_snake_overview}
\end{figure}

\subsection{Early Training Trajectory}
%
Following \citet{lee_wide_2019}, we study training trajectories for data samples $x$ of the MNIST test set during training. For illustration, we plot the iteration $t=0,1,\ldots$ against the standard LeNet output $f^{l}(w_t, x)$ and the linearization $f^{l}_{\text{lin}}(u_t, x)$ for different widths. Note that $w_t$ and $u_t$ are the weights after $t$ gradient updates for LeNet and LinLeNet trained on MNIST. As we use one-hot encoding, output $l$ denotes the predicted output for the correct class of the data point $x$. The same hyperparameters as for the other MNIST experiments are used (see Sec.~\ref{sec:experiments_multiclass}). A fixed random seed ensures that both LeNet and LinLeNet at a particular width factor are initialized exactly the same and receive the same mini-batches during training. Exemplary results are shown in Fig.~\ref{fig:appendix_trajectories} for a digit 8 of the test set; similar results are obtained for other samples. At small widths, training trajectories immediately diverge. With increasing width, the curves behave more similarly; they are, however, not close in a path-wise sense, but the statistics of training trajectories become more alike.
%
% Output trajectories of LeNet vs. LinLeNet for one MNIST test digit (an 8),
% shown for several width factors.
\begin{figure}[ht]
    \centering
    \includegraphics[width=\linewidth]{Images/LeNet_lr-0.1_1875-iter_trajectory_plot_overview_8.png}
    \caption{Training trajectories of LeNet (dark) and LinLeNet (light) do not stay close for small width factors. Shown are the output values during training for the same MNIST input example from the test set at different widths.}
    \label{fig:appendix_trajectories}
\end{figure}


\subsection{Effective Rank}
\label{sec:appendix_effective_rank}
%
The effective rank was introduced by \citet{roy_effective_2007} and can be viewed as the exponential entropy of normalized singular values. We restate the main definition in the following.
\paragraph{Definition.} Let $A$ be a complex-valued non-all-zero matrix of size $M\times N$ with (real non-negative) singular values $\sigma_1 \geq \sigma_2 \geq \cdots \geq \sigma_Q \geq 0$, where $Q=\min\{M,N\}$. Let $\sigma=(\sigma_1,\sigma_2,\ldots,\sigma_Q)^\top$ and the singular value distribution be
%
\begin{equation}
    p_k = \frac{\sigma_k}{\sum_{j=1}^{Q} \sigma_j} \quad \text{with } k=1,2,\ldots,Q.
\end{equation}
%
The effective rank of matrix $A$ is then defined as
\begin{equation}
    \operatorname{erank}(A):=\exp\left(H(p_1,p_2,\ldots,p_Q)\right),
\end{equation}
where $H(p_1,p_2,\ldots,p_Q)$ is the Shannon entropy
\begin{equation}
    H(p_1,p_2,\ldots,p_Q)=-\sum_{k=1}^{Q} p_k \log p_k,
\end{equation}
with the convention $0 \log 0 = 0$.
%
In comparison to the usual notion of rank, an important property of the effective rank is that $\operatorname{erank}(A)\leq \operatorname{rank}(A)$ \citep{roy_effective_2007}.


\subsection{Singular Values LinAlexNet}
%
% Singular value distribution of LinAlexNet on the snakes dataset.
% Uses \centering like the other figures in this file, instead of a
% centering environment closed by an explicit \par.
\begin{figure}[ht]
    \centering
    \includegraphics[width=0.7\linewidth]{Images/singular_values_alexnet_snakes.pdf}
    \caption{Singular value distribution of LinAlexNet for 600 samples of the snakes dataset.}
    \label{fig:appendix_sing_vals_alexnet}
\end{figure}

\subsection{Train and Test Accuracy Values}
%
Tables~\ref{tab:appendix_accuracy_values_alexnet} and~\ref{tab:appendix_accuracy_values_lenet_all} provide the values for train and test accuracy in the AlexNet and LeNet experiments, respectively. The sample mean accuracy and the sample standard deviation over 5 independent reruns of the models are shown.

% Accuracy table for the AlexNet experiment on the snakes dataset.
% Columns: Test/Train group label, model, then one column per width factor.
% The non-linearized rows are labelled "AlexNet": caption, label and the
% figure reference all refer to the AlexNet experiment.
% NOTE(review): the meaning of the "0.1"/"1.0" suffix of the "Lin." rows is
% not stated in this file -- presumably a hyperparameter setting; confirm.
% Line ends inside \resizebox are commented out so that no stray spaces are
% scaled together with the tabular.
\begin{table}[htbp]
    \centering
    \caption{Mean accuracy and standard deviation for 5 independent reruns of the AlexNet experiment (see Fig.~\ref{fig:alexnet-snakes-width}).}
    \resizebox{\linewidth}{!}{%
        \setlength{\tabcolsep}{4pt}%
        \begin{tabular}{llllll}
            \toprule
            & & $\mathbf{\times1}$ & $\mathbf{\times2}$ & $\mathbf{\times3}$ & $\mathbf{\times4}$ \\ \midrule
            \multirow{3}{*}{\rotatebox[origin=c]{90}{\textbf{Test}}} & Lin.~0.1 & 30.48$\pm$1.68 & 33.52$\pm$0.63 & 33.48$\pm$1.14 & 33.88$\pm$1.68 \\
            & Lin.~1.0 & 33.64$\pm$1.26 & 35.2$\pm$1.53 & 36.76$\pm$1.52 & 36.6$\pm$2.05 \\
            & AlexNet & 54.04$\pm$1.13 & 54.72$\pm$1.27 & 54.72$\pm$0.98 & 55.16$\pm$1.54 \\ \midrule
            \multirow{3}{*}{\rotatebox[origin=c]{90}{\textbf{Train}}} & Lin.~0.1 & 36.75$\pm$0.5 & 42.62$\pm$0.8 & 46.64$\pm$0.54 & 49.84$\pm$0.95 \\
            & Lin.~1.0 & 51.05$\pm$2.17 & 64.45$\pm$3.12 & 72.63$\pm$3.02 & 79.45$\pm$3.12 \\
            & AlexNet & 99.15$\pm$0.05 & 99.19$\pm$0.03 & 99.16$\pm$0.05 & 99.16$\pm$0.04 \\ \bottomrule
        \end{tabular}%
    }
    \label{tab:appendix_accuracy_values_alexnet}
\end{table}


% Accuracy table for the LeNet experiments.
% A single tabular holds two stacked halves, separated by an empty row
% between \bottomrule and \toprule: the upper half lists width factors
% x1, x2, x5, the lower half x10, x25, x60. Each half has one group of
% Test/Train rows per dataset.
\begin{table}[htbp]
    \centering
    \caption{Mean accuracy and standard deviation for 5 independent reruns of the LeNet experiments.}
    \setlength{\tabcolsep}{4pt}
    \begin{tabular}{lllll}
        \toprule
        & & $\mathbf{\times1}$ & $\mathbf{\times2}$ & $\mathbf{\times5}$ \\ \midrule
        \multicolumn{5}{l}{MNIST (see Fig.~\ref{fig:lenet-mnist-width})} \\ \midrule
        \multirow{2}{*}{\rotatebox[origin=c]{90}{\textbf{Test}}} & Lin. & 94.48$\pm$1.04 & 96.42$\pm$0.17 & 97.86$\pm$0.02 \\
        & LeNet & 99.15$\pm$0.1 & 99.29$\pm$0.05 & 99.38$\pm$0.06 \\ \midrule
        \multirow{2}{*}{\rotatebox[origin=c]{90}{\textbf{Train}}} & Lin. & 94.3$\pm$1.07 & 96.56$\pm$0.33 & 98.14$\pm$0.07 \\
        & LeNet & 99.86$\pm$0.02 & 99.94$\pm$0.01 & 99.95$\pm$0.01 \\ \midrule
        \multicolumn{5}{l}{MNIST with translation (see Fig.~\ref{fig:lenet-mnist-7pixel-translation-width})} \\ \midrule
        \multirow{2}{*}{\rotatebox[origin=c]{90}{\textbf{Test}}} & Lin. & 69.95$\pm$5.86 & 80.91$\pm$1.77 & 89.57$\pm$0.49 \\
        & LeNet & 97.6$\pm$0.16 & 98.35$\pm$0.12 & 98.61$\pm$0.03 \\ \midrule
        \multirow{2}{*}{\rotatebox[origin=c]{90}{\textbf{Train}}} & Lin. & 68.93$\pm$5.44 & 80.54$\pm$1.64 & 88.89$\pm$0.29 \\
        & LeNet & 97.55$\pm$0.05 & 98.28$\pm$0.06 & 98.59$\pm$0.03 \\ \midrule
        \multicolumn{5}{l}{CIFAR-10 (see Fig.~\ref{fig:lenet-cifar-width})} \\ \midrule
        \multirow{2}{*}{\rotatebox[origin=c]{90}{\textbf{Test}}} & Lin. & 42.98$\pm$0.53 & 48.07$\pm$1.12 & 54.42$\pm$0.4 \\
        & LeNet & 63.2$\pm$0.58 & 69.58$\pm$0.48 & 75.76$\pm$0.22 \\ \midrule
        \multirow{2}{*}{\rotatebox[origin=c]{90}{\textbf{Train}}} & Lin. & 43.99$\pm$0.97 & 50.3$\pm$1.52 & 60.45$\pm$1.14 \\
        & LeNet & 92.07$\pm$0.24 & 98.53$\pm$0.07 & 99.76$\pm$0.04 \\ \bottomrule
        % Empty spacer row between the two stacked halves.
        & & & & \\
        \toprule
        & & $\mathbf{\times10}$ & $\mathbf{\times25}$ & $\mathbf{\times60}$ \\ \midrule
        \multicolumn{5}{l}{MNIST (see Fig.~\ref{fig:lenet-mnist-width})} \\ \midrule
        \multirow{2}{*}{\rotatebox[origin=c]{90}{\textbf{Test}}} & Lin. & 98.36$\pm$0.04 & 98.72$\pm$0.05 & 98.91$\pm$0.1 \\
        & LeNet & 99.4$\pm$0.03 & 99.42$\pm$0.04 & 99.39$\pm$0.06 \\ \midrule
        \multirow{2}{*}{\rotatebox[origin=c]{90}{\textbf{Train}}} & Lin. & 98.82$\pm$0.05 & 99.45$\pm$0.02 & 99.83$\pm$0.03 \\
        & LeNet & 99.97$\pm$0.00 & 99.97$\pm$0.01 & 99.97$\pm$0.01 \\ \midrule
        \multicolumn{5}{l}{MNIST with translation (see Fig.~\ref{fig:lenet-mnist-7pixel-translation-width})} \\ \midrule
        \multirow{2}{*}{\rotatebox[origin=c]{90}{\textbf{Test}}} & Lin. & 92.13$\pm$0.11 & 94.25$\pm$0.48 & 94.66$\pm$0.58 \\
        & LeNet & 98.63$\pm$0.13 & 98.63$\pm$0.33 & 98.57$\pm$0.12 \\ \midrule
        \multirow{2}{*}{\rotatebox[origin=c]{90}{\textbf{Train}}} & Lin. & 91.76$\pm$0.13 & 94.0$\pm$0.1 & 94.42$\pm$0.29 \\
        & LeNet & 98.71$\pm$0.05 & 98.76$\pm$0.05 & 98.77$\pm$0.03 \\ \midrule
        \multicolumn{5}{l}{CIFAR-10 (see Fig.~\ref{fig:lenet-cifar-width})} \\ \midrule
        \multirow{2}{*}{\rotatebox[origin=c]{90}{\textbf{Test}}} & Lin. & 58.23$\pm$0.43 & 62.47$\pm$0.26 & 65.8$\pm$0.23 \\
        & LeNet & 77.56$\pm$0.21 & 78.83$\pm$0.1 & 78.97$\pm$0.13 \\ \midrule
        \multirow{2}{*}{\rotatebox[origin=c]{90}{\textbf{Train}}} & Lin. & 68.32$\pm$0.92 & 81.24$\pm$0.74 & 93.84$\pm$0.33 \\
        & LeNet & 99.92$\pm$0.01 & 99.96$\pm$0.00 & 99.98$\pm$0.00 \\ \bottomrule
    \end{tabular}
    \label{tab:appendix_accuracy_values_lenet_all}
\end{table}

\end{document}
