% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Authors added:
\usepackage{graphicx}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{xcolor}
\usepackage[notextcomp]{stix}
\usepackage{amsmath} 
\usepackage{tabularx,colortbl}
\usepackage{multirow}
\usepackage{multicol}

\def\UrlBreaks{\do\/\do-}

\usetikzlibrary{arrows}

% For references in other .tex files (main.tex)
\usepackage{xr-hyper}

\makeatletter
% \addFileDependency{<file>}
%   <file>: file name including its extension.
% Announces <file> on the terminal/log in parentheses, hands it to the
% kernel-internal \@addtofilelist, and prints a "No file" notice if the file
% cannot be found.
% NOTE(review): presumably this lets build tools that scan the log or the
% file list pick up <file> as a dependency -- confirm for the tool in use.
% Every line ends with % so that the macro body contains no space tokens and
% cannot typeset spurious spaces when expanded in horizontal mode.
\newcommand*{\addFileDependency}[1]{%
	\typeout{(#1)}%
	\@addtofilelist{#1}%
	\IfFileExists{#1}{}{\typeout{No file #1.}}%
}
\makeatother

% \myexternaldocument{<basename>}
%   <basename>: name of the external document without extension.
% Calls \externaldocument{<basename>} so that labels defined in that document
% can be referenced from this one, then passes both <basename>.tex and
% <basename>.aux to \addFileDependency.
\newcommand*{\myexternaldocument}[1]{%
	\externaldocument{#1}%
	\addFileDependency{#1.tex}%
	\addFileDependency{#1.aux}%
}

\myexternaldocument{samarin_622-supp}
%%%%%%%%%%%%%%%

\usepackage{hyperref}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>}: typesets <b><sep><a>, i.e. the two mandatory
% arguments in reversed order around the optional separator (default: -).
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Feature Learning and Random Features in Standard Finite-Width Convolutional Neural Networks: An Empirical Study}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:maxim.samarin@unibas.ch?Subject=Your UAI 2022 paper}{Maxim~Samarin}}
\author[1]{\href{mailto:volker.roth@unibas.ch?Subject=Your UAI 2022 paper}{Volker~Roth}}
\author[1]{\href{mailto:david.belius@unibas.ch?Subject=Your UAI 2022 paper}{David~Belius}}
% Add affiliations after the authors

\affil[1]{%
      Department of Mathematics and Computer Science\\
      University of Basel, Switzerland
}


\begin{document}
\maketitle

\begin{abstract}
The Neural Tangent Kernel is an important milestone in the ongoing effort to build a theory for deep learning. Its prediction that sufficiently wide neural networks behave as kernel methods, or equivalently as random feature models arising from linearized networks, has been confirmed empirically for certain wide architectures. In this paper, we compare the performance of two common finite-width convolutional neural networks, LeNet and AlexNet, to their linearizations on common benchmark datasets like MNIST and modified versions of it, CIFAR-10 and an ImageNet subset. We demonstrate empirically that finite-width neural networks, generally, greatly outperform the finite-width linearization of these architectures. When increasing the problem difficulty of the classification task, we observe a larger gap which is in line with common intuition that finite-width neural networks perform feature learning which finite-width linearizations cannot. At the same time, finite-width linearizations improve dramatically with width, approaching the behavior of the wider standard networks which in turn perform slightly better than their standard width counterparts. Therefore, it appears that feature learning for non-wide standard networks is important but becomes less significant with increasing width. We furthermore identify cases where both standard and linearized networks match in performance, in agreement with NTK theory, and a case where a wide linearization outperforms its standard width counterpart.
\end{abstract}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
\section{Introduction}
%
The Neural Tangent Kernel (NTK) \citep{jacot_neural_2018} is a seminal contribution to the study of deep neural networks which extended important insights about the connection of Gaussian processes and neural networks \citep{neal1996bayesian,williams1996computing,matthews2018gaussian,lee2018deep,garriga-alonso2018cnngp}. Subsequent investigations have since refined our view on the NTK with results suggesting both its validity and its insufficiency as an explanation for the performance of practical finite-width neural networks, and the focus of investigation has moved to the network parameterization and architecture differences and their relationship to the NTK \citep{chizat_lazy_2019,lee_wide_2019,chen2020labelawareNTK,Hanin2020Finite, xiao2020disentangling,seleznova2022analyzing}. The NTK framework has inspired work in many directions, such as infinite ensembles of trees \citep{kanoh2022a} and federated learning \citep{huang2021flntk}, and thus continues to stimulate various advances in deep learning theory.

\cite{jacot_neural_2018} proved that when modeling neural network training under gradient flow, i.e. full batch gradient descent of infinitesimal step size, the training trajectory $f(w,x)$ satisfies an ordinary differential equation (ODE) involving the finite-width Neural Tangent Kernel
%
\begin{align}
    \Theta(w,x,x') &= \left\langle \nabla_{w} f(w,x), \nabla_{w} f(w,x') \right\rangle \nonumber \\ &= \sum_{i=1}^{p} \frac{\partial}{\partial w_i} f(w,x) \frac{\partial}{\partial w_i} f(w,x')
    \label{eq:finite-width-ntk}
\end{align}
%
for weights $w \in \mathbb{R}^p$ and inputs $x,x'\in \mathbb{R}^d$. The form of this kernel depends on the network architecture and the current time-dependent weights $w$ as well as a particular initialization. In this \emph{NTK parameterization}, they showed that when scaling the learning rate per layer in an appropriate way and letting the width tend to infinity, the kernel converges to the infinite-width NTK $\hat{\Theta}$ which is \emph{independent} of the weights and \emph{stays constant} during training, greatly simplifying the ODE in this limit. They showed for the $L^2$ loss that the predictor at convergence is precisely what a kernel regression using the infinite-width NTK would produce. Importantly, the infinite-width NTK depends only on the architecture of the network; it is not learned and thus data-independent. The formalism was extended from fully-connected to other architectures including convolutional networks \citep{arora_exact_2019,yang_scaling_2020}, recurrent neural networks \citep{alemohammad2020rntk}, residual networks \citep{huang2020deep} and more general architectures \citep{yang2020tensor_ntk}. 

This result can be understood as the convergence of wide networks to random feature models \citep{chizat_lazy_2019}. Let $f: \mathbb{R}^d \to \mathbb{R}^L$ be the function given by a network parameterized by weights $w \in \mathbb{R}^p$ with input $x\in \mathbb{R}^d$ and let $f^{l}$ be the output in component $l=1,\ldots,L$, with $L$ being typically the number of classes in a classification task. For $w$ sufficiently close to the random initial weights $w_0$ and $u = w - w_0$, the first order Taylor expansion in the weights
%
\begin{equation}\label{eq:taylor}
  f^{l}(w,x) \approx f^{l}(w_0, x) + \nabla_w f^{l}(w_0, x)\cdot u =: f_\text{lin}^{l}(u,x)
\end{equation}
%
is an accurate approximation. The right-hand side $ f_\text{lin}(u,x)$ is a random feature model with weights $u\in\mathbb{R}^p$ and the feature mapping $\phi(x) \in \mathbb{R}^{L\times p}$ is given by the gradients $\phi^{l}(x)=\nabla_w f^{l}(w_0, x)$ with respect to the weights at initialization $w_0$. If approximation \eqref{eq:taylor} holds, then also the gradients $\nabla_w f(w,x)$ and $\nabla_u f_\text{lin}(u,x)$ of the two models will be close. When training these models with some form of gradient descent and sufficiently small step size for a sufficiently small number of steps, then the training trajectories will stay close, as long as the weight vectors remain in the region around $u=0$ or $w=w_0$, respectively. Using an $L^2$ loss in over-parameterized models, one can expect both models to converge to zero loss \citep{du2018overparamerization}. If convergence occurs before leaving this region, then the models -- whether trained with early stopping or until convergence -- will predict a similar function. For the infinite-width case, \cite{lee_wide_2019} proved that $f(w,x)$ and $f_\text{lin}(u,x)$ converge in distribution to the same Gaussian distribution. Furthermore, the NTK result can be proved by showing that for very wide neural networks the models $f(w,x)$ and $f_\text{lin}(u,x)$ reach zero loss and thus stop evolving before leaving the region where the approximation in Eq. \eqref{eq:taylor} is accurate \citep{chizat_lazy_2019,lee_wide_2019}, known as \emph{lazy training}.
%
The linearized model $f_\text{lin}$ does not learn a representation but uses the random representation $\nabla_w f^{l}(w_0,x)$ which is fixed by the initial weights $w_0$ and remains unchanged throughout training. More in line with Gaussian processes and random feature models \citep{rahimi2007random} but at odds with general intuition on deep learning, NTK theory predicts that, at large widths, a network and its linearization behave similarly and no significant feature learning takes place. This seems to imply that -- even for standard neural networks -- \emph{learning} a good representation might become less relevant with increasing over-parameterization.

Motivated by this conjecture, we study standard convolutional neural networks (CNNs) and their respective linearizations (at initialization) given by Eq. \eqref{eq:taylor}. We complement previous work in that direction (see Sec. \ref{sec:related_work}) and extend these results for more standard architectures in more common classification tasks. In particular, we perform a thorough study of two standard CNNs, LeNet \citep{lenet_98} and AlexNet \citep{alexnet_2012}, for increasingly difficult classification tasks at different widths. 
We observe test accuracy gaps between these networks, in line with the idea that standard neural networks perform feature learning while their linearizations do not. For the wider networks the generalization gap closes, in line with NTK theory, supporting the picture summarized in Fig. \ref{fig:paper_overview}.

However, we also observe low training accuracy for the linearized networks. We investigate numerical issues which can explain reduced training performance in the linearizations and consider a simplified binary classification setting in which we can solve the linear system in Eq. \eqref{eq:taylor} with a standard solver achieving 100\% training accuracy, but observe that this generally causes even worse test accuracy for the linearized models.

In this work, we make the following contributions: 
%
\begin{itemize}
    \item We show that for (nearly) all considered widths, there is a prominent performance gap between the standard and linearized LeNet and AlexNet and this gap increases when the classification task increases in difficulty. This is shown for MNIST, CIFAR-10, and a subset of ImageNet. We believe this gap exhibits the importance of feature learning for non-wide standard networks.
    \item We present further instances where wide linearized networks perform as well as the standard network and cases where linearized wide networks outperform their standard width counterpart.
    \item As the generalization gap closes for wider networks, in line with NTK theory, we raise the question of whether the non-wide and wide standard networks generalize due to very different mechanisms: feature learning for non-wide networks and effectively employing unlearned random features at larger widths.
    \item We extend the discussion in previous work of numerical aspects of training the non-wide linearized models by considering the effective rank of the kernel.
\end{itemize}

\begin{figure}
    \centering
    \begin{tikzpicture}[
        scale=4,
        axis/.style={thick, ->, >=stealth'},
        dashed line/.style={dashed, thin}
        ]
        
        % Axes
        \draw[axis] (0,0)  -- (1.6,0) node(xaxis)[right] {Width};
        \draw[axis] (0,0) -- (0,1); 
        
        % Separation lines
        \draw[dashed line] (-0.1,0.5)  -- (1.77,0.5);
        \draw[dashed line] (0.8,0) -- (0.8,1);
        
        % Text
        
        \node at (0.15,1.06) {Feature Learning};

        \node[rotate=90] at (-0.1,0.75) {standard net.};
        \node at (0.4,0.75) [align=left] {\textbf{I}: learned features,\\strong performance};
        \node at (1.25,0.75) [align=left] {\textbf{II}: approaching lazy\\training, performance\\improvement};
        \node[rotate=90] at (-0.1,0.25) {linearization};
        \node at (0.4,0.25) [align=left] {\textbf{III}: random features,\\weak performance};
        \node at (1.24,0.25) [align=left] {\textbf{IV}: random features,\\lazy training,\\strong performance\\improvement};
    \end{tikzpicture}
    \caption{Our results on neural networks exhibit different behavior in different regimes: For wide architectures, standard networks and their linearization become increasingly alike. While the performance of linearized networks benefits substantially from width, standard networks only show small improvements. At usual widths, standard networks and their linearization behave differently due to the relevance of feature learning.}
    \label{fig:paper_overview}
\end{figure}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Related Work}
\label{sec:related_work}
%
The original motivation and prevailing appeal of (finite) deep neural networks is that they are powerful methods to extract statistics and learn features leading to strong performance for down-stream tasks (regime I in Fig. \ref{fig:paper_overview}) \citep{lee2009convdeepbeliefnet,alekseev2019gabornet}. The behavior of neural networks in the highly over-parameterized regime has been extensively studied, too, suggesting minor weight changes from initialization during training (regime II) \citep{du2018overparamerization, allen2019convergence,zou2020gradient}. In the NTK literature, typically, the infinite-width limit for finite-depth neural networks is considered (connection between regimes II and IV). In contrast, Deep Equilibrium Models consider the infinite-depth limit at finite width \citep{bai2019deep}. \cite{Hanin2020Finite} study the NTK for both infinitely wide and deep ReLU networks, showing particular data-dependent features of the resulting NTK in these limits. Focusing on the finite-depth case, there are several studies which compare the finite-width NTK $\Theta$ or infinite-width NTK $\hat{\Theta}$ to their standard network (regimes I and IV) and provide, to some extent, diverging results.

The original work by \cite{jacot_neural_2018} gives experimental results for small synthetic datasets, as well as fully-connected networks trained on MNIST of widths $n=10^2,10^3,$ and $10^4$, showing good agreement with the infinite-width NTK for the widest network. \cite{lee_wide_2019} extend the original work and show good agreement for small synthetic datasets (of size $\le256$) and for a two hidden layer fully-connected network trained with SGD on MNIST. Most interestingly, a wide ResNet %\citep{zagoruyko_wide_2016} 
trained on CIFAR-10 shows similar behavior, though the non-linearized model appears to have been trained only to below $80\%$ training accuracy, and in the test accuracy a gap seems to develop towards the end of training (see Fig. 7 in their paper). In contrast to that, in \cite{chizat_lazy_2019} VGG-11 
%\citep{simonyan_very_2015} 
and ResNet-18 
%\citep{he_deep_2016} 
-- trained on CIFAR-10 and widened with a scaling factor $\alpha$ for tuning the models into the non-linearized and linearized regimes -- exhibit large gaps in test accuracy. They highlight that the decreased training performance of the linearization is due to bad conditioning and effectively low rank of the associated kernel matrix. 
%
In their extension to CNNs, \cite{arora_exact_2019} compare CNNs with two to 20 convolutional layers combined with fully-connected or global average pooling output layers to the derived infinite-width convolutional NTK (CNTK), observing large gaps in test accuracy on CIFAR-10.

Our work is most closely related to \cite{lee2020finite} and \cite{Geiger_2020}. In \cite{lee2020finite} an extensive empirical study of neural networks, their linearizations and the infinite-width NTK as well as the Neural Network Gaussian process (NNGP) kernel \citep{lee2018deep,matthews2018gaussian} is conducted. 
For fully-connected and simple convolutional architectures, they show cases where the NTK can either outperform or underperform the corresponding networks on CIFAR-10. Importantly, they study the relevance of regularization of the kernels and identify bad conditioning of the kernel as a reason for decreased performance. In line with results by \cite{wei2020regularization}, they show that $L^2$ regularization (like weight decay) of the kernel is required for good performance in practice, although this breaks the infinite-width correspondence to kernel methods. In contrast to their work, we focus on two more standard but also more extensive CNNs where we increase the widths of the standard and linearized networks explicitly and study their properties with a focus on feature learning. In that regard, our work differs from \cite{Geiger_2020}, who also study lazy training and feature learning but for fully-connected networks of depth three to five and CNNs with four convolutional layers and in the framework of \cite{chizat_lazy_2019} with a scaling factor $\alpha$ controlling the lazy training regime. Another related line of research was conducted by \cite{ortiz2021linNNgeneralization}, who study linearizations with respect to task complexity defined on the basis of the NTK eigenfunctions as targets. 
In their evaluation on CIFAR-10, they show that linearization performance can rank learning complexity and show that neural networks do not always outperform their kernel approximations.

Other relevant work includes \cite{seleznova2022analyzing} which investigates the ordered and chaotic phase phenomena of vanishing and exploding gradients in the context of NTK theory, providing guarantees when the NTK is ill-conditioned (ordered phase) or well-conditioned (chaotic phase and at the border between the two phases). 
Furthermore, \cite{yang2021tensorIV} note that standard and NTK parameterizations do not lead to representations that learn features in the infinite-width limit and propose an alternative parameterization enabling feature learning in this limit.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Method}
%
We examine two standard ReLU CNNs, LeNet and AlexNet, trained for classification tasks of increasing difficulty. One task is digit recognition in MNIST \citep{lenet_98} and modified versions which include random translations of the otherwise centered digits. In addition, we train on CIFAR-10 \citep{CIFAR_2009}, and a subset of ImageNet \citep{ImageNet_2015} which contains ten different snake classes (see Supp. Sec. \ref{sec:appendix_snakes_dataset}), whereby we deliberately chose similar classes to form a challenging classification task. 

In this setup, we study the performance of the standard network and its linearization $f_\text{lin}$ (see Eq.~\eqref{eq:taylor}) and the effect of increasing the width of the networks, thereby investigating the relationships between regimes I and III as well as III and IV in Fig.~\ref{fig:paper_overview}. The width is increased by multiplying the number of channels in each convolutional layer and all widths of fully-connected layers by a common factor. Due to GPU memory limitations, we are able to train LeNet and \emph{LinLeNet} up to width factors of 60 and AlexNet and \emph{LinAlexNet} up to width factors of 4. As the number of parameters increases quadratically in the width, and standard width LeNet and AlexNet have about $60$k and $60$m parameters, we were hence able to train networks of up to $216$m and $960$m parameters, respectively.

Our implementation makes use of PyTorch's \citep{pytorch_2019} standard modules for defining and training neural networks with our own custom-made modifications for linearization of the architectures. 
For LeNet, we adapt the original LeNet-5 architecture \citep{lenet_98} to use max-pooling and ReLU activations. For AlexNet, we use the PyTorch implementation \citep{alexnetV2_2014} with $10$ outputs rather than $1000$ (see below). Despite training for classification, we use the $L^2$ loss with one-hot encoded target vectors. Firstly, with standard cross-entropy loss the networks never converge to exactly zero loss, so the networks must at some point leave the region where the approximation in Eq. \eqref{eq:taylor} is valid, causing some ambiguity in the heuristic. Secondly, the $L^2$ loss allows for an easier and more efficient implementation of the training of the linearized models. We furthermore do not make use of dropout, since it is not clear to us how to model it in the NTK framework (see however \cite{novak_neural_2020}). We find that after optimizing hyperparameters, we can train LeNet and AlexNet to similar train and test performance as with cross-entropy loss without dropout (see Supp. Sec. \ref{sec:appendix_snakes_dataset}). We predict the class whose one-hot vector is closest to the output vector, which is equivalent to predicting the argmax of the output layer. We train $f_\text{lin}(u,x)$ with SGD in the standard way by optimizing $u$ with gradient updates obtained by
%
\begin{align}
    \nabla_u \left\vert f_\text{lin}(u, x)-y \right\vert ^2 = 2 \sum_{l=1}^L &\nabla_w f^{l}(w_0, x) \nonumber \\ 
    &\times \left(f_\text{lin}^{l}(u,x)-y^l\right).
\end{align}
%
Computing the gradients of the linear model with $L$ outputs requires computing $L$ gradients of the original network per data point, and thus $L$ backward passes, which is computationally intensive if $L$ is large. We therefore train (Lin)AlexNet on the snakes subset of ImageNet consisting of $L=10$ classes, while we can use full MNIST and CIFAR-10 for (Lin)LeNet.

As our goal is to stay as close as possible to standard neural network training practices, we use SGD with weight decay and momentum. In addition, we use the standard PyTorch weight initialization, which is a variant of Kaiming initialization \citep{he_delving_2015}, rather than the NTK parameterization used in the NTK proofs. 


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Experimental Results}
%
In the experiments, five independent reruns of the specified networks for 100 epochs and batch size 32 were performed unless stated otherwise.  Hyperparameter search was conducted for each network architecture and its linearization at all widths separately, for learning rates including $\{1,0.1, 0.01, 0.001\}$ and weight decay values including $5\times\{10^{-4},10^{-5},10^{-6}, 10^{-7},10^{-8}\}$. The momentum parameter was set to the default value of $0.9$. For each rerun, a different fixed random seed was used to ensure that both the standard and linearized models at a particular width are initialized exactly the same and receive the same mini-batches during training.
For experiments involving CIFAR-10 and the snakes dataset, the learning rate was decreased by a factor of $10$ every 30 epochs. Otherwise, we follow the standard preprocessing for standardizing the input images and standard resizing (256 pixels) and center cropping (224 pixels) for ImageNet images. Computations were conducted on Nvidia GeForce Titan X Pascal and Tesla V100 GPUs. For experiments involving LinAlexNet$\times3$ and LinAlexNet$\times4$, we used an Nvidia Quadro RTX 8000 with 48 GB memory due to the increased memory requirement. All displayed results are obtained with single precision. We also carried out all experiments in Sec.~\ref{sec:experiments_multiclass} with double precision, but did not observe any striking differences.

\subsection{Classification with increasing feature learning requirement}
\label{sec:experiments_multiclass}
%
\paragraph{LeNet trained on MNIST and CIFAR-10:}
For LeNet with about $60$k parameters, we used width factors ranging from $1$ to $60$. In all experiments involving MNIST and CIFAR-10, a learning rate of $0.1$ and weight decay of $5\times10^{-5}$ led to overall best test accuracies.
%
\begin{figure}[ht]
    \begin{centering}
        \includegraphics[width=0.53\linewidth]{Images/LeNet-MNIST-accuracy_vs_width.pdf}
        \includegraphics[width=0.46\linewidth]{Images/LeNet-MNIST-val_accuracy_vs_width.pdf}
    \par\end{centering}
\caption{Accuracy of LeNet ($\bullet$) and LinLeNet ($\star$) trained on MNIST at different widths (values in Supp. Table \ref{tab:appendix_accuracy_values_lenet_all}).}
\label{fig:lenet-mnist-width}
\end{figure}

The results for MNIST are presented in Fig. \ref{fig:lenet-mnist-width}. For the standard width, a substantial difference of $4.67$ percentage points in (mean) test error between LeNet and LinLeNet is observed. While LeNet does not gain appreciably from increasing the width, LinLeNet does, and the gap shrinks to $0.48$ percentage points for width factor $60$. 
Similarly, though not close in a path-wise sense, the statistics of trajectories of output values become more alike with increasing width (shown in Supp. Fig. \ref{fig:appendix_trajectories}), indicating a more similar behavior of training dynamics of the linearized and standard models at large width factors.

For factor $1$, the linearized model outperforms only by a small margin a logistic regression on normalized MNIST pixels, which achieves about $93\%$ train and $92\%$ test accuracy. The low training accuracy of the linearized models is investigated in more detail in Sec.~\ref{sec:numerical_aspects} and~\ref{sec:binary_classification}.

When increasing the problem difficulty by randomly translating the digits horizontally and vertically, larger gaps in test (and train) accuracy are observed which also decrease with width. For factors $1$ and $60$, we observe $27.65$ and $3.91$ percentage points difference in test error. The full results are illustrated in Fig. \ref{fig:lenet-mnist-7pixel-translation-width} for translations up to 7 pixels (i.e. up to a quarter of the image size). 

\begin{figure}[ht]
\begin{centering}
\includegraphics[width=0.53\linewidth]{Images/LeNet-MNIST-trans-7-accuracy_vs_width.pdf}
\includegraphics[width=0.46\linewidth]{Images/LeNet-MNIST-trans-7-val_accuracy_vs_width.pdf}
\par\end{centering}
\caption{Accuracy of LeNet ($\bullet$) and LinLeNet ($\star$) trained on MNIST at different widths (values in Supp. Table \ref{tab:appendix_accuracy_values_lenet_all}). Digits were shifted randomly by up to 7 pixels.}
\label{fig:lenet-mnist-7pixel-translation-width}
\end{figure}

When training on the more challenging CIFAR-10 dataset, even larger gaps are observed, as shown in Fig.~\ref{fig:lenet-cifar-width}. For the standard width, a difference of $20.22$ percentage points in test error between LeNet and LinLeNet is observed. This shrinks to a smaller but still appreciable gap of $13.17$ percentage points at width factor $60$. Interestingly, LinLeNet$\times 60$ outperforms standard width LeNet$\times 1$ in both training and test error (gray dashed line).

\begin{figure}[ht]
\begin{centering}
\includegraphics[width=0.53\linewidth]{Images/LeNet-CIFAR10-accuracy_vs_width.pdf}
\includegraphics[width=0.46\linewidth]{Images/LeNet-CIFAR10-val_accuracy_vs_width.pdf}
\par\end{centering}
\caption{Accuracy of LeNet ($\bullet$) and LinLeNet ($\star$) trained on CIFAR-10 at different widths (values in Supp. Table \ref{tab:appendix_accuracy_values_lenet_all}).}
\label{fig:lenet-cifar-width}
\end{figure}


\paragraph{AlexNet trained on snakes dataset:}
For AlexNet with about $60$m parameters, width factors 1, 2, 3, and 4 were used and the networks were trained on the ten-class snakes subset of ImageNet (see Supp.\ Sec.~\ref{sec:appendix_snakes_dataset}). For the linearized networks, a learning rate of $1$ and weight decay of $5\times10^{-7}$ provided the best test errors. For the standard networks, however, a learning rate of $0.1$ and weight decay of $5\times10^{-6}$ led to the best test performance. In addition, we trained the linearized networks with these hyperparameter settings, too, for comparison. 

Figure \ref{fig:alexnet-snakes-width} summarizes the findings, which fall in line with the observed trend for LeNet but give even larger gaps in test error. Trained with the same hyperparameters as their non-linearized counterparts, the gaps in test error between standard AlexNet and LinAlexNet are more than $20$ percentage points at all considered widths. For the optimal hyperparameters in the linearized setting, the generalization gap shrinks only slightly to $20.4$ and $18.56$ percentage points for widths $1$ and $4$. While increasing the width has little impact on train and test error of AlexNet, for LinAlexNet the test error shows a slight decrease and the training error a strong decrease with width. 
%
\begin{figure}[ht]
    \begin{centering}
        \includegraphics[width=0.53\linewidth]{Images/AlexNet-Snakes_both-LRs_accuracy_vs_width.pdf}
        \includegraphics[width=0.46\linewidth]{Images/AlexNet-Snakes_both-LRs_val_accuracy_vs_width.pdf}
    \par\end{centering}
    \caption{Accuracy of AlexNet ($\bullet$), LinAlexNet with learning rate $0.1$ (\textcolor{orange}{$\star$}) and learning rate $1$ (\textcolor{green!60!black}{$\star$}) trained on the snakes dataset at different widths (values in Supp. Table \ref{tab:appendix_accuracy_values_alexnet}).}\label{fig:alexnet-snakes-width}
\end{figure}
%

These results show that, at standard width or small width expansion factors, the random feature models given by the linearized networks perform poorly compared to their standard network counterpart or the random feature models of wider linearized networks. 
%
With increasing problem difficulty, the increasing gap between linearized and standard LeNet and AlexNet suggests that at standard widths significant feature learning is taking place in the standard (non-linearized) model.
%
But with increased over-parameterization, these gaps indeed shrink as predicted by NTK theory. The way the gap shrinks is through a dramatic improvement in performance of the linearized networks with width, while standard networks are less affected in their performance by width. 
%
However, as theory proves that the wide standard trained networks behave as random feature models, we hypothesize that the small improvements in accuracy of standard networks with width might be hiding a significant transition in the underlying \emph{reason} for their good performance, namely from feature learning for the non-wide networks to utilizing non-learned random features that apparently provide a good inductive bias for the tasks at hand for the wider networks (both linearized and non-linearized).


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Numerical Aspects}
\label{sec:numerical_aspects}
%
The low training accuracy of the non-wide linearized models in the previous experiments raises the question of whether they are well-trained at all. Fitting the linearized model with $m$ data points $x\in \mathbb{R}^d$ is effectively solving the linear system
%
\begin{equation}
	y - f(w_0,x) = \nabla_w f(w_0,x) \cdot u
	\label{eq:lin_system}
\end{equation}
%
for weights $u\in\mathbb{R}^p$ and target $y \in \mathbb{R}^{L}$, where $p$ is the number of parameters of the original model and the rows of the matrix $\nabla_w f(w_0,x)$ are the gradients of each output of the network at data point $x$. With $X\in \mathbb{R}^{m\times d}$, the matrix $\nabla_w f(w_0,X)\in \mathbb{R}^{n\times p}$ has $n=10 \times m$ rows since one must fit each of the $L=10$ outputs for each data point. LeNet at width factors $1$ and $2$ has roughly $p_1=60$k and $p_2=240$k parameters, respectively. Thus, the matrix $\nabla_w f(w_0,X)$ cannot have full rank when fitting a dataset of size $m=50$k (CIFAR-10) or $m=60$k (MNIST), i.e. $p_1,p_2<10\times m$, making it impossible to fit arbitrary targets. Moreover, even for wider networks it appears that matrix $\nabla_w f(w_0,X)$ remains effectively of low rank. 
%
\begin{figure}[ht]
    \begin{centering}
        \includegraphics[width=0.515\linewidth]{Images/effective_rank_lenet_mnist_translation.pdf}
        \includegraphics[width=0.475\linewidth]{Images/effective_rank_alexnet_snakes.pdf}
    \par\end{centering}
    \caption{Effective rank of matrix $\nabla_w f(w_0,X)$ in LinLeNet and LinAlexNet for $600$ data samples of the MNIST (standard in \textcolor{orange}{$\star$} and shifted in \textcolor{green!60!black}{$\star$}) and snakes datasets, respectively, with full rank $6000$.
    }
    \label{fig:eff_rank_lenet_mnist}
\end{figure}

We quantify this by computing the \textit{effective rank} \citep{roy_effective_2007} which takes the distribution of singular values into consideration and can be viewed as the exponential entropy of normalized singular values (see Supp. Sec. \ref{sec:appendix_effective_rank}). For computational reasons, we consider $m=600$ data samples and the corresponding $6000 \times 6000$ kernel matrix $\nabla_w f(w_0,X)\nabla_w f(w_0,X)^\top$ for each width factor. For LinLeNet and considering MNIST samples, these effective ranks are much lower than the number of rows, i.e. $6000$, and increase with width factor as illustrated in Fig. \ref{fig:eff_rank_lenet_mnist} (left). A similar but less pronounced improvement in effective rank with width is obtained for MNIST samples with additional random translation of up to 7 pixels. Although AlexNet$\times 1$ with about $60$m parameters is well in the over-parameterized regime for $m=13$k datapoints and thus $n=130$k rows in matrix $\nabla_w f(w_0,X)$, we still observe large gaps in training accuracy. As for LinLeNet, we show for $600$ examples in Fig. \ref{fig:eff_rank_lenet_mnist} (right) that the effective ranks at all widths are significantly lower than the number of rows of $\nabla_w f(w_0,X)$ and increase with width (marginally).

In Fig. \ref{fig:sing_vals_lenet_mnist}, the distribution of singular values $\sigma$ of the kernel matrix is shown (see Supp. Fig. \ref{fig:appendix_sing_vals_alexnet} for LinAlexNet). We observe that increasing the width effectively increases the smallest (non-vanishing) singular values of matrix $\nabla_w f(w_0,X)$ and generally leads to a lower condition number (i.e. ratio $\sigma_{\text{max}} / \sigma_{\text{min}}$) for this matrix, thereby improving numerical properties. 

We suspect that, in order to perfectly fit the training data, one needs to fit $u$ also in a subspace with very small singular values, making it difficult to achieve close to $100\%$ train accuracy with SGD with non-infinitesimal step sizes.
%
\begin{figure}[ht]
    \begin{centering}
        \includegraphics[width=0.515\linewidth]{Images/singular_values_lenet_mnist.pdf}
        \includegraphics[width=0.475\linewidth]{Images/singular_values_lenet_mnist_translations.pdf}
    \par\end{centering}
    \caption{Singular value distribution of LinLeNet for 600 samples of MNIST and MNIST digits randomly shifted by up to 7 pixels. See Supp. Fig. \ref{fig:appendix_sing_vals_alexnet} for LinAlexNet.}
    \label{fig:sing_vals_lenet_mnist}
\end{figure}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Binary classification on MNIST}
\label{sec:binary_classification}
%
In order to study these numerical aspects in more detail, we take a closer look at the solution of the linear system in Eq. \eqref{eq:lin_system}. In particular, we examine if the multiclass setting might be the cause for numerical stability issues due to having multiple outputs (the different classes) for a single input, potentially leading to e.g. collinearity of rows in the matrix. Therefore, we consider binary classification with one output and train to classify a digit as 0 or not 0. Qualitatively similar results were obtained for other target classes. In the following, we solve the one-vs-rest classification task with the same least-squares objective in three ways: by training LeNet with SGD, by training LinLeNet with SGD, and by using a standard solver for linear systems. The presented results are obtained from single runs of the respective model with a fixed random seed. 

\paragraph{Solving the linear system with SGD:} The training is performed in the same manner as before, but with a learning rate of 0.01 and for 200 epochs. Tables \ref{tab:accs_linlenet_binary_translation-0} and \ref{tab:accs_linlenet_binary_translation-7} summarize the binary classification results with target class 0 for standard MNIST and MNIST with up to 7 pixels translation. The qualitative behavior with SGD training follows the same trend as in Figs.~\ref{fig:lenet-mnist-width} and \ref{fig:lenet-mnist-7pixel-translation-width} for the multiclass results (for this reason an illustration is omitted). As before, by including translations of up to 7 pixels of the digits, we observe a drop in accuracies which is particularly pronounced for the linearized setting. 
%
\begin{table}[ht]
    \centering
	\caption{Accuracy of LeNet, LinLeNet and the solver on binary MNIST (0 vs. not 0) at different widths.}
	\resizebox{\linewidth}{!}{%
	\setlength{\tabcolsep}{4pt} 
	\begin{tabular}{llllllll}
		\toprule 
		& & $\mathbf{\times1}$ & $\mathbf{\times2}$ & $\mathbf{\times5}$ & $\mathbf{\times10}$ & $\mathbf{\times25}$ & $\mathbf{\times60}$ \\ \midrule 
		\multirow{3}{*}{\rotatebox[origin=c]{90}{\textbf{Test}}} & Solver & $97.8$ & $99.86$ & 99.89 & $-$ & $-$ & $-$ \\
		& Lin. &  $99.61$ & $99.75$ & $99.81$ & $99.85$ & $99.85$ & 99.89 \\
		& LeNet &  99.89 & 99.89 & 99.88 & 99.89 & 99.89 & 99.88 \\ \midrule
		\multirow{3}{*}{\rotatebox[origin=c]{90}{\textbf{Train}}} & Solver & $100$ & $100$ & $100$ & $-$ & $-$ & $-$ \\
		& Lin. & $99.42$ & $99.74$ & $99.88$ & $99.97$ & $99.9983$ & 100 \\
		& LeNet &  $100$ & $100$ & $100$ & $100$ & $100$ & 100 \\ \bottomrule
	\end{tabular}
	}
	\label{tab:accs_linlenet_binary_translation-0}
\end{table}	

In comparison to the harder multiclass task, the gap in training accuracy between LeNet and LinLeNet is greatly reduced but persists for the less wide networks, especially for LinLeNet$\times 1$ in the translated MNIST task. While training the standard network consistently leads to perfect training accuracy in the standard MNIST setting, it is not possible to achieve 100\% training accuracy when solving the linear system in Eq. \eqref{eq:lin_system} with SGD, in most cases. However, from a width factor of 5 on, we observe for LinLeNet in the standard MNIST task that the linearized networks start agreeing (up to the second decimal place) with the results of the corresponding LeNet. In particular, LinLeNet$\times 60$ matches the train and test results of LeNet at all considered widths, which is in agreement with NTK theory.
%
\begin{table}[ht]
    \centering
    \caption{Accuracy of LeNet, LinLeNet and the solver on binary MNIST (0 vs. not 0) at different widths. Input digits were randomly shifted by up to 7 pixels. 
    }
    \resizebox{\linewidth}{!}{%
	\setlength{\tabcolsep}{4pt} 
	\begin{tabular}{llllllll}
		\toprule 
		& & $\mathbf{\times1}$ & $\mathbf{\times2}$ & $\mathbf{\times5}$ & $\mathbf{\times10}$ & $\mathbf{\times25}$ & $\mathbf{\times60}$ \\ \midrule 
		\multirow{3}{*}{\rotatebox[origin=c]{90}{\textbf{Test}}}& Solver & $86.05$ & $98.7$ & $99.17$ & $-$ & $-$ & $-$ \\
		& Lin. & $95.48$ & $98.51$ & $98.97$ & $99.31$ & $99.51$ & $99.55$ \\
		& LeNet & $99.72$ & $99.80$ & $99.82$ & $99.77$ & $99.82$ & $99.86$ \\ \midrule 
		\multirow{3}{*}{\rotatebox[origin=c]{90}{\textbf{Train}}}& Solver & $100$ & $100$ & $100$ & $-$ & $-$ & $-$ \\
		& Lin. & $95.42$ & $98.29$ & $98.90$ & $99.25$ & $99.49$ & $99.61$ \\
		& LeNet & $99.77$ & $99.83$ & $99.86$ & $99.90$ & $99.92$ & $99.90$ \\ \bottomrule
	\end{tabular}
	}
	\label{tab:accs_linlenet_binary_translation-7}
\end{table}

\paragraph{Solving the linear system with a standard solver:} Since SGD is not able to attain high train accuracy for linearized models for all widths, the question arises whether a different algorithm can, and if so, what its generalization properties are for the tasks at hand. An advantage of the binary classification setting is that we can directly solve the linear system in Eq. \eqref{eq:lin_system} for $u$ for width multipliers 1, 2, and 5, as the amount of memory required to store the entire matrix $\nabla_w f(w_0,x)$ is reduced and becomes manageable. Larger widths were not feasible for us as more than 1 TB of memory is required even for binary classification, without taking additional memory requirements for the computation into account. We make use of the SciPy least-squares solver which utilizes the highly optimized LAPACK library \citep{anderson1999lapack}. The results are included in Tables \ref{tab:accs_linlenet_binary_translation-0} and \ref{tab:accs_linlenet_binary_translation-7}.

Interestingly, the solver attains perfect training accuracy in all considered cases, but at the cost of a diminished test accuracy for LinLeNet$\times1$ in standard MNIST (see Table \ref{tab:accs_linlenet_binary_translation-0}) and, particularly, in translated MNIST (see Table \ref{tab:accs_linlenet_binary_translation-7}), indicating overfitting of the solver solution. Apparently, the implicit regularization of the SGD solution significantly improves generalization for these widths, while precluding a perfect train accuracy. For LinLeNets of larger widths, an improved generalization is attained which we consider to match the SGD results to a reasonable degree (considering fluctuations in the second decimal place as in the multiclass results, see Supp. Table \ref{tab:appendix_accuracy_values_lenet_all}). In the standard MNIST task, the attained solver solutions for LinLeNet$\times 2$ and LinLeNet$\times 5$ match the test accuracies of their corresponding standard LeNets at otherwise 100\% train accuracy. It should be noted that the solver results were obtained without regularization. Additional regularization should lead to similar results as for LinLeNet$\times 1$ trained with SGD, that is higher generalization and lower training accuracy.

Therefore, it appears that the observed generalization gaps and poor performance of non-wide linearized models in Sec. \ref{sec:experiments_multiclass} are not due to poor training optimization. We suspect that moderately wide linearized networks in the multiclass experiments operate in a similar regime as LinLeNet$\times 1$ in the binary classification setting.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Conclusion}
%
Motivated by conflicting results in NTK literature, we studied two classical convolutional neural networks, LeNet and AlexNet, and their corresponding linearizations at different widths and increasing difficulty of classification tasks. We investigated four regimes of different behavior in neural networks (see Fig. \ref{fig:paper_overview}) which complement previous results on lazy training \citep{chizat_lazy_2019} and random feature models \citep{lee_wide_2019,lee2020finite} summarized in the following.

Firstly, in agreement with previous results such as those of \cite{arora_exact_2019}, we observed significant train and test performance gaps between standard width LeNet and AlexNet and their corresponding linearization. By considering different classification tasks of increasing difficulty, we showed that the performance gaps increase accordingly suggesting that richer features need to be learned, which the effectively random feature models LinLeNet and LinAlexNet cannot provide. 

Secondly, in agreement with work such as \cite{lee_wide_2019,jacot_neural_2018}, we showed, however, that width improves the performance of linearized networks significantly. We hypothesize that the comparatively minor improvements in performance of standard networks might hide a transition from feature learning to utilizing random features at moderate widths. 
This might be related to previous results suggesting that the intermediate representations of networks of increasing width become increasingly alike to each other and to the representation in the large width limit \citep{kornblith2019similarity}.

Thirdly, we showed that numerical aspects like the effective rank (see Fig. \ref{fig:eff_rank_lenet_mnist}) and distribution of singular values (see Fig. \ref{fig:sing_vals_lenet_mnist} and Supp. Fig. \ref{fig:appendix_sing_vals_alexnet}) of the feature mapping $\nabla_w f(w_0,X)$ have a role in explaining low training accuracy of SGD-trained non-wide linearized models. Increasing width appears to remedy these numerical issues of the associated kernel. 

In summary, our investigation is based on the finite-width NTK at initialization and explores deviations, as described above, but also convergence to NTK theory. In particular, we show agreement in performance of standard networks and their linearization as well as an instance where a wide LinLeNet($\times 60$) outperforms its standard width LeNet($\times 1$) on CIFAR-10.

Our study highlights the need to study theoretical descriptions of neural network generalization beyond the finite-width NTK at initialization, for instance by considering time-dependent NTK \citep{huang_dynamics_2019,jacot_neural_2018} for finite-width networks (see e.g. \cite{fort2020deep}) or by further developing the various proposed \emph{mean-field} theories \citep{chizat_global_2018,hu_mean-field_2019,javanmard_analysis_2019,mei_mean-field_2019,nguyen_mean_2019,rotskoff_global_2019}. Additionally, it highlights the need to study the nature of the potential transition to effectively random features at moderate widths in standard neural network training.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{contributions} 
    D.~Belius and M.~Samarin jointly conceived the idea and wrote the paper. M.~Samarin wrote the code, performed the experiments, created the figures and revised the paper. V.~Roth participated in the discussions and the refinement of the results.
\end{contributions}

\begin{acknowledgements} 
    We are grateful to Levent Sagun, Peter Zaspel, and Ivan Dokmani\'{c} for enlightening discussions regarding the results presented in this article. M.S. would like to thank the Swiss National Science Foundation for supporting the research with grant 167333 as part of the Swiss National Research Programme NRP 75 ``Big Data''. Calculations were performed at sciCORE (\url{http://scicore.unibas.ch/}) scientific computing core facility at University of Basel and Amazon Web Services (AWS).
\end{acknowledgements}

\bibliography{samarin_622}

\end{document}
