
\section{Experimental details}
We present the experimental details of \cref{sec:experiments-main,sec:understanding,sec:practical,sec:theory}.
Most of the experiments are conducted with a single RTX3090 GPU with $24$GB VRAM while some experiments requiring larger memory are conducted with multiple RTX3090 GPUs.
The code to reproduce the results of this work is implemented with JAX \citep{jax2018github} and Flax \citep{flax2020github} and is available at \url{https://github.com/LOG-postech/SAM-overparam}.


\subsection{Experiments for Section \ref{sec:experiments-main}}

\label{app:exp_details}

\begin{table*}[!th]
    \renewcommand{\arraystretch}{1.4}
    \centering
    \resizebox{\linewidth}{!}{
    % \footnotesize
    \begin{tabular}{l c c c c c c c c}
      \toprule
      Workload & Epochs/steps & Learning rate / decay & Weight decay & Batch size & $\rho$ search & Baseline optimizer &  \\
      \midrule
      Synthetic         & $100$ epochs   & $0.1$ / step & $0.0$ & $128$ & $\Big\{\substack{0.001, 0.01, 0.05, 0.07, 0.1, \\0.2, 0.3, 0.5, 0.7, 1.0, 2.0}\Big\}$ & SGD \\ 
      MNIST/MLP         & $100$ epochs   & $0.1$ / step & $0.0001$ & $128$ & $\{0.01, 0.02, 0.05, 0.1, 0.2\}$ & SGD with momentum $0.9$ \\ 
      CIFAR-10/ResNet-18  & $200$ epochs & $0.1$ / step & $0.0005$ & $128$ & $\Big\{\substack{0.001, 0.005, 0.01, 0.02, \\0.05, 0.1, 0.2, 0.5, 1.0}\Big\}$ & SGD with momentum $0.9$\\ 
      ImageNet/ResNet-50  & $90$ epochs & $0.1$ / cosine & $0.0001$ & $512$ & $\{0.01, 0.02, 0.05, 0.1, 0.2\}$ & SGD with momentum $0.9$\\ 
      PoS tagging  & $75000$ steps & $0.05$ / inverse sqrt & $0.1$ & $64$ & $\{0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.5\}$ & AdamW ($\beta_1 = 0.9, \beta_2 = 0.98$) \\ 
      Sentiment classification  & $30$ epochs & $0.1$ / constant & $3e\text{-}6$ & $64$ & $\{0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.5\}$ & SGD with momentum $0.8$ \\ 
      Graph property prediction  & $10^5$ steps & $0.001$ / constant & $0.0$ & $256$ & $\{0.01, 0.02, 0.05, 0.1, 0.2\}$ & Adam ($\beta_1 = 0.9, \beta_2 = 0.999$) \\ 
      Atari game  & $10^7$ steps & $2.5e\text{-}4$ / linear & $0.0$ & $256$ & $\{0.01, 0.02, 0.05, 0.1, 0.2\}$ & Adam ($\beta_1 = 0.9, \beta_2 = 0.999$) \\ 
      CIFAR-10/ViT  & $200$ epochs & $0.1$ / cosine & $0.0001$ & $128$ & $\{0.01, 0.02, 0.05, 0.1, 0.2\}$ & SGD with momentum $0.9$\\ 
      \bottomrule
  \end{tabular}
  }
  \caption{
    Hyperparameters for each workload.
  }
  \label{tab:hyperparams}
\end{table*}

\begin{table*}[!th]
    \renewcommand{\arraystretch}{1.1}
    \centering
    \resizebox{0.8\linewidth}{!}{
    % \footnotesize
    \begin{tabular}{l c c}
      \toprule
      Workload & Scaling factor & Values \\
      \midrule
      Synthetic         & \# of neurons   & $\{k*100 \vert 1 \leq k \leq 10\}$ \\ 
      MNIST/MLP         & \# of neurons   & $\{[300*p, 100*p] \vert p \in \{0.25, 0.5, 1, 4, 10\}\}$ \\ 
      CIFAR-10/ResNet-18  & \# of convolutional filters & $\{2^k \vert 2 \leq k \leq 8\}$ \\ 
      ImageNet/ResNet-50  & \# of convolutional filters & $\{16*k \vert 1 \leq k \leq 5\}$ \\ 
      PoS tagging  & dimension of hidden states & $\{128*k \vert 1 \leq k \leq 5\}$  \\ 
      Sentiment classification  & dimension of hidden states & $\{2^k \vert 5 \leq k \leq 9\}$  \\ 
      Graph property prediction  & \# of neurons & $\{2^k \vert 7 \leq k \leq 9\}$ \\ 
      Atari game  & \# of convolutional filters & $\{16*k \vert 1 \leq k \leq 4\}$ \\ 
      CIFAR-10/ViT  & dimension of hidden states  & $\{2^k \vert 5 \leq k \leq 10\}$ \\ 
      \bottomrule
  \end{tabular}
  }
  \caption{
    Model scaling factors and values for each workload.
  }
  \label{tab:scaling}
\end{table*}

For all the experiments in \cref{sec:experiments-main}, we run the experiments with the same configurations over three different random seeds. 
We visualize the average and standard error (\ie, std$/\sqrt{n_{\text{seed}}}$) as a line plot and a shaded region surrounding it.
Many of our experiments and hyperparameter values are based on the examples provided in the official Flax repository \citep{flax2020github}.\footnote{\url{https://github.com/google/flax/tree/main/examples}}
The hyperparameter values and how the models are scaled for each workload are summarized in \cref{tab:hyperparams,tab:scaling}, respectively.
We present the additional details for individual workloads below.

\paragraph{Synthetic Regression / 2-layer MLP}

We follow the student-teacher setting from \citet{advani2020highdi} where the teacher is a randomly initialized $2$-layer ReLU network with $200$ neurons and the student is a $2$-layer ReLU network with a different number of neurons.
Each element of the input $x \in \R^{100}$ is sampled from a standard normal distribution, while the target $y \in \R$ is calculated as the output of the teacher network plus Gaussian noise sampled from a standard normal distribution.
The models are trained on $20400$ training samples, which is roughly the same as the number of parameters in the teacher model, and tested on $5100$ samples, which is a quarter of the number of training samples.

\paragraph{MNIST / 3-layer MLP}

We train LeNet-300-100 \citep{lecun1998gradient} for the MNIST \citep{lecun2010mnist}.
The learning rate decays by $0.1$ after $50\%$ and $75\%$ of the total epochs.
We scale the models while preserving the relative proportions of the number of neurons in each layer as $3:1$.

\paragraph{CIFAR-10 / ResNet-18}

We train ResNet-18 \citep{he2016deep} for the CIFAR-10 \citep{krizhevsky2009learning}.
We choose the hyperparameters as similar to \citet{andriushchenko2022towards}.
The learning rate decays by $0.1$ after $50\%$ and $75\%$ of the total epochs.

\paragraph{ImageNet / ResNet-50}

We train ResNet-50 \citep{he2016deep} for the ImageNet \citep{deng2009imagenet}.
We choose the hyperparameters as similar to \citet{du2021efficient} and use a linear warmup of $5000$ steps.
% We experiment with different model widths, \ie, the number of convolutional filters in the first residual block, of $[16, 32, 48, 64, 80]$.
We additionally experiment with $\rho = 0.005$ for the two smallest models.

\paragraph{PoS tagging / Transformer}

We train Encoder-only Transformer \citep{vaswani2017attention} for the Universal Dependencies \citep{nivre2016universal} -- Ancient Greek.
We use a linear warmup of $8000$ steps.
We evaluate the validation accuracy once every $1000$ steps and report the best value except for the experiment in \cref{fig:result_overparam_diff_pos_wo_es}.
The dimension of MLP and the number of attention heads are scaled as $4\times$ and $1/64 \times$ of the dimension of the hidden states following the Flax example.

\paragraph{SST / LSTM}

We train LSTM \citep{hochreiter1997long} for SST2  \citep{socher2013recursive} where the task is a binary classification (positive/negative) of the movie reviews.
We evaluate the validation accuracy for every epoch and report the best value.
The embedding size is scaled as $300/256 \times$ of the dimension of hidden states following the Flax example.

\paragraph{Graph property prediction / GCN}

We train $2$-layer Graph Convolutional Networks \citep{kipf2016semi} for the ogbg-molpcba \citep{hu2020open}.
Here, the input is a graph of a molecule where nodes and edges each represent atoms and chemical bonds.
The task is a binary classification of whether a molecule inhibits HIV replication or not.

\paragraph{Atari game / CNN}

We train $5$-layer CNNs for the Atari Breakout-v5 game \citep{mnih2013playing}.
We train the Actor-Critic networks \citep{konda1999actor} with proximal policy optimization \citep{schulman2017proximal}.
We evaluate the validation score once every $100$ steps and report the best value.
We also use gradient clipping of $0.5$ for all models.

\paragraph{CIFAR-10 / ViT}

For the experiment in \cref{fig:result_overparam_diff_cifar_vit}, we train $6$-layer Vision Transformers \citep{dosovitskiy2020image} for the CIFAR-10 \citep{krizhevsky2009learning} using the patch size of $4 \times 4$.
We scale the dimension of MLP and the number of attention heads as $2 \times$ and $1/32 \times$ of the dimension of hidden states.

\subsection{Experiments for Section \ref{sec:understanding}}

\label{app:exp_details_understanding}

For the experiments in \cref{fig:onehidden-function,fig:onehidden-trajectories}, we follow the setting in \citet{andriushchenko2022towards}.\footnote{\url{https://github.com/tml-epfl/understanding-sam/tree/main/one_layer_relu_nets}}
Specifically, we train one-hidden-layer ReLU networks where each data has input $x \in \R$ and target $y \in \R$.
% Here, the networks are trained on the quadratic loss with full-batch GD or SAM with $\rho = 0.3$.
Here, the networks are trained on the quadratic loss with mini-batch SGD or SAM with $\rho = 0.2$ where we randomly choose $6$ data points every iteration.
Additionally, the optimization trajectories in \cref{fig:onehidden-trajectories} are plotted following \citet{li2018visualizing}.\footnote{\url{https://github.com/tomgoldstein/loss-landscape}}
Specifically, the trajectories are plotted along the PCA directions calculated from converged minima of two different paths from SGD and one path from SAM.


\subsection{Experiments for Section \ref{sec:theory}}

\label{app:exp_details_theory}

\paragraph{Linear stability}
For the experiments of \cref{fig:stability_uniformity,fig:landscape}, we follow the setting in \citet{wu2018sgd}.
Specifically, we set up $3$-layer MLP having $[3000, 1000]$ hidden neurons with squared loss, so that the local quadratic approximation becomes precise, and train the networks on MNIST.
We use $1000$ random samples to calculate the non-uniformity, and all models are trained to reach near zero loss.
The networks are trained with a constant learning rate of $0.1$ without weight decay or momentum.

\paragraph{Convergence -- Matrix Factorization}
For the matrix factorization experiment in \cref{fig:convergence}, we solve the following non-convex regression problem: $\min_{W_1, W_2} \E_{x\sim \mathcal{N}(0, I)}{\|W_2W_1x-Ax\|^2}$ where the objective function is smooth and satisfies the PL-condition \citep{loizou2021stochastic}. 
We choose $A\in\R^{10\times6}$ and generate 1000 training samples, which are used for training a rank $k$ linear network with two matrix factors $W_1\in\R^{k\times6}$ and $W_2\in\R^{10\times k}$. Here, interpolation is satisfied when rank $k=10$.
We train two linear networks with $k\in\{4, 10\}$ for $100$ epochs with a constant learning rate of $0.0005$ and compare the convergence speed.



\section{Absolute validation metric for Section \ref{sec:experiments-main}} \label{app:add-overparam_results}

We present the full results of \cref{fig:result_overparam_acc}, including the absolute validation metrics of SAM and SGD in \cref{fig:result_overparam_acc_app}.
There is a consistent trend that SAM improves with overparameterization in all tested cases.

\begin{figure*}[!th]
  \begin{subfigure}{\linewidth}
      \centering
      \includegraphics[width=0.24\linewidth]{figures/synth/sam-sgd/synth-loss_diff.pdf}
      \includegraphics[width=0.24\linewidth]{figures/mnist/overparam_diff/overparamwd[0.0001].pdf}
      \includegraphics[width=0.24\linewidth]{figures/cifar/ResNet18/overparam_diff/overparamwd[0.0005].pdf}
      \includegraphics[width=0.24\linewidth]{figures/imagenet/overparam_diff/overparam.pdf}
      \includegraphics[width=0.24\linewidth]{figures/pos/overparam_diff/overparam_bestvalacc.pdf}
      \includegraphics[width=0.24\linewidth]
      {figures/sst/overparam_diff/overparam_bestvalacc.pdf}
      \includegraphics[width=0.24\linewidth]{figures/gnn/overparam_diff/overparam.pdf}
      \includegraphics[width=0.24\linewidth]{figures/ppo/overparam_diff/overparam_bestvalacc.pdf}
      \caption{Improvement in validation metrics by SAM}
      \label{fig:result_overparam_acc_rel_app}
      \vspace{1em}
  \end{subfigure}
  \begin{subfigure}{\linewidth}
      \centering
      \includegraphics[width=0.24\linewidth]{figures/synth/sam-sgd/synth-loss.pdf}
      \includegraphics[width=0.24\linewidth]{figures/mnist/overparam/overparamwd[0.0001].pdf}
      \includegraphics[width=0.24\linewidth]{figures/cifar/ResNet18/overparam/overparamwd[0.0005].pdf}
      \includegraphics[width=0.24\linewidth,trim={0.4cm 0.4cm 0.4cm 0.4cm},clip]{figures/imagenet/overparam/overparam.pdf}
      \includegraphics[width=0.24\linewidth]{figures/pos/overparam/overparam_bestvalacc.pdf}
      \includegraphics[width=0.24\linewidth]{figures/sst/overparam/overparam_bestvalacc.pdf}
      \includegraphics[width=0.24\linewidth]{figures/gnn/overparam/overparam.pdf}
      \includegraphics[width=0.24\linewidth]{figures/ppo/overparam/overparam_bestvalacc.pdf}
      \caption{Absolute validation metrics of SAM and baseline optimizers}
      \label{fig:result_overparam_acc_abs}
  \end{subfigure}
  \caption{
    % The absolute metrics for SAM and baseline optimizers.
    Effects of overparameterization on SAM. 
    (a) Improvement in validation metrics by SAM and (b) the absolute metrics for SAM and baseline optimizers.
    The generalization benefit of SAM tends to increase as the model becomes more overparameterized.
  }
  \label{fig:result_overparam_acc_app}
\end{figure*}





\section{Full results on optimal perturbation bound} \label{app:add-rho}

\begin{figure}[!h]
    \centering
    \includegraphics[width=0.16\linewidth]{figures/mnist/rho/filter_75,_25__val_acc.pdf}
    \includegraphics[width=0.16\linewidth]{figures/mnist/rho/filter_150,_50__val_acc.pdf}
    \includegraphics[width=0.16\linewidth]{figures/mnist/rho/filter_300,_100__val_acc.pdf}
    \includegraphics[width=0.16\linewidth]{figures/mnist/rho/filter_1200,_400__val_acc.pdf}
    \includegraphics[width=0.16\linewidth]{figures/mnist/rho/filter_3000,_1000__val_acc.pdf}
    \includegraphics[width=0.16\linewidth]{figures/mnist/rho/best_rho.pdf}\\
    \vspace{-0.6em}
  \caption{
    Validation accuracy versus $\rho$ for MNIST and $3$-layer MLP.
  }
  \vspace{-0.6em}
  \label{fig:result_overparam_each_rho_mnist}
\end{figure}

\begin{figure}[!h]
    \centering
    \includegraphics[width=0.16\linewidth]{figures/imagenet/rho/filter16_val_acc.pdf}
    \includegraphics[width=0.16\linewidth]{figures/imagenet/rho/filter32_val_acc.pdf}
    \includegraphics[width=0.16\linewidth]{figures/imagenet/rho/filter48_val_acc.pdf}
    \includegraphics[width=0.16\linewidth]{figures/imagenet/rho/filter64_val_acc.pdf}
    \includegraphics[width=0.16\linewidth]{figures/imagenet/rho/filter80_val_acc.pdf}
    \includegraphics[width=0.16\linewidth]{figures/imagenet/rho/best_rho.pdf}\\
    \vspace{-0.6em}
  \caption{
    Validation accuracy versus $\rho$ for ResNet-50 and ImageNet.
  }
  \vspace{-0.6em}
  \label{fig:result_overparam_each_rho_imagenet}
\end{figure}

\begin{figure}[!h]
    \centering
    \includegraphics[width=0.16\linewidth]{figures/cifar/ResNet18/sam_rho/filter4_val_acc.pdf}
    \includegraphics[width=0.16\linewidth]{figures/cifar/ResNet18/sam_rho/filter8_val_acc.pdf}
    \includegraphics[width=0.16\linewidth]{figures/cifar/ResNet18/sam_rho/filter16_val_acc.pdf}
    \includegraphics[width=0.16\linewidth]{figures/cifar/ResNet18/sam_rho/filter32_val_acc.pdf} \\
    \includegraphics[width=0.16\linewidth]{figures/cifar/ResNet18/sam_rho/filter64_val_acc.pdf}
    \includegraphics[width=0.16\linewidth]{figures/cifar/ResNet18/sam_rho/filter128_val_acc.pdf}
    \includegraphics[width=0.16\linewidth]{figures/cifar/ResNet18/sam_rho/filter256_val_acc.pdf}
    \includegraphics[width=0.16\linewidth]{figures/cifar/ResNet18/sam_rho/best_rho.pdf}
  \caption{
    Validation accuracy versus $\rho$ for ResNet-18 and CIFAR-10.
  }
  \label{fig:result_overparam_each_rho_cifar}
\end{figure}

\begin{figure}[!h]
    \centering
    \includegraphics[width=0.16\linewidth]{figures/sst/rho/hidden32_val_acc_bestvalacc.pdf}
    \includegraphics[width=0.16\linewidth]{figures/sst/rho/hidden64_val_acc_bestvalacc.pdf}
    \includegraphics[width=0.16\linewidth]{figures/sst/rho/hidden128_val_acc_bestvalacc.pdf} 
    \includegraphics[width=0.16\linewidth]{figures/sst/rho/hidden256_val_acc_bestvalacc.pdf} \\
    \includegraphics[width=0.16\linewidth]{figures/sst/rho/hidden384_val_acc_bestvalacc.pdf}
    \includegraphics[width=0.16\linewidth]{figures/sst/rho/hidden512_val_acc_bestvalacc.pdf}
    \includegraphics[width=0.16\linewidth]{figures/sst/rho/best_rho_bestvalacc.pdf}
  \caption{
    Validation accuracy versus $\rho$ for LSTM and SST2.
  }
  \label{fig:result_overparam_each_rho_sst}
\end{figure}

Extending from \cref{sec:understanding}, we plot the validation accuracy of SAM versus different values of $\rho$, along with their optimal value of $\rho$ for 3-layer-MLP/MNIST, ResNet-50/ImageNet, ResNet-18/CIFAR-10, and LSTM/SST2 in \cref{fig:result_overparam_each_rho_mnist,fig:result_overparam_each_rho_imagenet,fig:result_overparam_each_rho_cifar,fig:result_overparam_each_rho_sst}, respectively.
It is observed that $\rho^\star$ tends to increase as the model becomes more overparameterized;
on CIFAR-10 with ResNet18, the smallest model has $\rho^\star = 0.01$ while the largest three have $\rho^\star = 0.2$.




\section{Additional results for Section \ref{sec:practical}} \label{app:add_practical}

\subsection{Label noise}

\begin{figure}[!th]
    \centering
  \begin{subfigure}{0.24\linewidth}
      \centering
      \includegraphics[width=\linewidth]{figures/labelnoise/Noise rate.pdf}
      \vspace{-1.5em}
      \caption{Effect of label noise}
      \label{fig:resut_labelnoise_noiserate_app}
  \end{subfigure}
  % \hspace{1em}
  \begin{subfigure}{0.24\linewidth}
      \centering
      \includegraphics[width=\linewidth]{figures/labelnoise/overparamwd[0.0005]dataratio_1.0labelnoise_0.75.pdf}
      \vspace{-1.5em}
      \caption{Noise rate = $0.75$}
      \label{fig:resut_labelnoise_fixednoise_0.75}
  \end{subfigure}
  % \vspace{1em}
  \begin{subfigure}{0.24\linewidth}
      \centering
      \includegraphics[width=\linewidth]{figures/labelnoise/overparamwd[0.0005]dataratio_1.0labelnoise_0.5.pdf}
      \vspace{-1.5em}
      \caption{Noise rate = $0.5$}
      \label{fig:resut_labelnoise_fixednoise_0.5}
  \end{subfigure}
  \begin{subfigure}{0.24\linewidth}
      \centering
      \includegraphics[width=\linewidth]{figures/labelnoise/overparamwd[0.0005]dataratio_1.0labelnoise_0.25.pdf}
      \vspace{-1.5em}
      \caption{Noise rate = $0.25$}
      \label{fig:resut_labelnoise_fixednoise_0.25}
  \end{subfigure}
  \caption{
    Effect of overparameterization on SAM under label noise for CIFAR-10 and ResNet-18.
    (a) SAM benefits a lot more from overparameterization than SGD; it is more pronounced with high noise level.
    (b--d) Under label noise, SGD tends to overfit with more parameters, unlike SAM.
  }
  \label{fig:result_labelnoise}
\end{figure}

More results on the effect of overparameterization on SAM under label noise are presented in \cref{fig:result_labelnoise}.
Overall, we find SAM benefits from overparameterization significantly more than SGD in the presence of label noise.
Precisely, the accuracy improvement made by SAM keeps on increasing as the model becomes more overparameterized, and this trend is more pronounced with higher noise levels;
e.g., it rises from $5\%$ to nearly $50\%$ at the highest noise rate.



\subsection{Sparse overparameterization}

\begin{figure}[!th]
    \begin{subfigure}{0.49\linewidth}
      \centering
      \includegraphics[width=0.45\linewidth]{figures/cifar/ResNet18/largesparse_diff/largesparse_random_dense16.pdf}
      \includegraphics[width=0.45\linewidth]{figures/cifar/ResNet18/largesparse/largesparse_random_dense16.pdf}
      \caption{Random pruning}
      \label{fig:largesparse_cifar_random}
  \end{subfigure}
  \hspace*{\fill}
  \begin{subfigure}{0.49\linewidth}
      \centering
      \includegraphics[width=0.45\linewidth]{figures/cifar/ResNet18/largesparse_diff/largesparse_snip_dense16.pdf}
      \includegraphics[width=0.45\linewidth]{figures/cifar/ResNet18/largesparse/largesparse_snip_dense16.pdf}
      \caption{SNIP}
      \label{fig:result_largesparse_cifar_snip}
  \end{subfigure}
    \caption{
    Effect of sparsification on SAM for ResNet-18 and CIFAR-10.
    Here, all the models have approximately $701$k parameters.
    The improvement tends to increase in large sparse models compared to their small dense counterparts.
  }
  \label{fig:result_largesparse_cifar}
\end{figure}

\begin{figure}[!th]
  \begin{subfigure}{0.49\linewidth}
      \centering
      \includegraphics[width=0.45\linewidth]{figures/mnist/largesparse_diff/largesparse_random_dense[75, 25].pdf}
      \includegraphics[width=0.45\linewidth]{figures/mnist/largesparse/largesparse_random_dense[75, 25].pdf}
      \caption{Random pruning}
      \label{fig:largesparse_mnist_random}
  \end{subfigure}
  \hspace*{\fill}
  \begin{subfigure}{0.49\linewidth}
      \centering
      \includegraphics[width=0.45\linewidth]{figures/mnist/largesparse_diff/largesparse_snip_dense[75, 25].pdf}
      \includegraphics[width=0.45\linewidth]{figures/mnist/largesparse/largesparse_snip_dense[75, 25].pdf}
      \caption{SNIP}
      \label{fig:largesparse_mnist_snip}
  \end{subfigure}
  \caption{
    Effect of sparsification on SAM for MNIST and 3-layer MLP.
    Here, all the models have approximately $61$k parameters.
    The improvement tends to increase in large sparse models compared to their small dense counterparts.
  }
  \label{fig:result_largesparse_mnist}
\end{figure}

\begin{figure}[!th]
      \centering
      \begin{subfigure}{0.49\linewidth}
          \includegraphics[width=0.45\linewidth]{figures/mnist/rho/sparsityrandom_dense_75,_25_.pdf}
          \includegraphics[width=0.45\linewidth]{figures/mnist/rho/sparsitysnip_dense_75,_25_.pdf}
          \caption{MNIST/MLP}
      \end{subfigure}    
      \begin{subfigure}{0.49\linewidth}
          \includegraphics[width=0.45\linewidth]{figures/cifar/ResNet18/sam_rho_sparsity/sparsityrandom_dense16.pdf}
          \includegraphics[width=0.45\linewidth]{figures/cifar/ResNet18/sam_rho_sparsity/sparsitysnip_dense16.pdf}
          \caption{CIFAR-10/ResNet-18}
      \end{subfigure}      
  \caption{
    Effect of sparsification on $\rho^\star$.
    $\rho^\star$ can sometimes differ across sparsity patterns despite a similar number of parameters.
  }
  \label{fig:result_largesparse_rho_mnist}
\end{figure}

Additional results on the effect of sparsification on the generalization benefit of SAM are plotted in \cref{fig:result_largesparse_cifar,fig:result_largesparse_mnist}.
Here, we try two sparsification methods that do not require pretraining: random pruning and SNIP \citep{leesnip}.
For both methods, we note that the generalization improvement by SAM tends to increase as the model becomes more sparsely overparameterized.

We also plot the effect of sparsification on $\rho^\star$ in \cref{fig:result_largesparse_rho_mnist}.
We find that $\rho^\star$ is sometimes different between small dense and large sparse models despite having a similar number of parameters;
for the MLP of $61$k parameters on MNIST, $\rho^\star$ changes over different sparsity levels and sparsification methods, but this does not generalize to the CIFAR-10 and ResNet-18.
This indicates that it is not just the parameter count that affects the behavior of SAM, but some other factors such as the pattern of parameterization also have an influence on how SAM shapes training.



\subsection{Regularization}


More results on the effect of regularization on SAM are presented in \cref{fig:result_overparam_overfit}.
We find that, in these cases, overparameterization does not always increase the generalization benefit of SAM.
We suspect this is because the models are prone to overfitting in these cases and overparameterizing models may decrease the overall performance both for SGD and SAM;
for example in \cref{fig:result_overparam_cifar_vit}, the validation accuracy drops after $11.2$m parameters.


\begin{figure}[!th]
\centering
  \resizebox{0.7\linewidth}{!}{
  \begin{subfigure}{0.27\linewidth}
      \centering
      \includegraphics[width=\linewidth]{figures/cifar/ResNet18/overparam_diff/overparamwd[0.0].pdf}
  \end{subfigure}
  \hspace{0.3em}
  \begin{subfigure}{0.245\linewidth}
      \centering
      \includegraphics[width=\linewidth]{figures/pos/overparam_diff/overparam.pdf}
  \end{subfigure}
  \hspace{0.3em}
  \begin{subfigure}{0.245\linewidth}
      \centering
      \includegraphics[width=\linewidth]{figures/cifar/ViT/overparam_diff/overparam.pdf}
  \end{subfigure}
  }
  \resizebox{0.7\linewidth}{!}{
  \begin{subfigure}{0.27\linewidth}
      \centering
      \includegraphics[width=\linewidth]{figures/cifar/ResNet18/overparam/overparamwd[0.0].pdf}
      \caption{w/o weight decay}
      \label{fig:result_overparam_resnet_wo_wd}
  \end{subfigure}
  \begin{subfigure}{0.27\linewidth}
      \centering
      \includegraphics[width=\linewidth]{figures/pos/overparam/overparam.pdf}
      \caption{w/o early stop.}
      \label{fig:result_overparam_pos_wo_es}
  \end{subfigure}
  \begin{subfigure}{0.27\linewidth}
      \centering
      \includegraphics[width=\linewidth]{figures/cifar/ViT/overparam/overparam.pdf}
      \caption{ViT}
      \label{fig:result_overparam_cifar_vit}
  \end{subfigure}
  }
  \caption{
    Effect of overparameterization on SAM without regularization:
    (a) CIFAR-10/ResNet-18 without weight decay, (b) Transformer/PoS tagging without early stopping, and (c) ViT/CIFAR-10.
    SAM does not always benefit from overparameterization in these cases.
  }
  \label{fig:result_overparam_overfit}
\end{figure}


\section{Ablation} \label{app:ablation}

\subsection{Effect of depth} \label{app:depth}

\begin{wrapfigure}{r}{0.5\linewidth}
  \vspace{-3em}
  \centering
  \includegraphics[width=0.49\linewidth]{figures/depth/mlp_depth.pdf} % 
  \includegraphics[width=0.49\linewidth]{figures/depth/resnet_depth.pdf} 
  % \hspace{2em}
  \caption{
  Improvement in validation metrics by SAM over different model depths.
  Deeper models tend to yield higher validation accuracy improvements.
  Here, we change the number of width-1000 hidden layers in the MLP and the number of resblocks in each stage of ResNet-18 for MNIST and CIFAR-10, respectively. The benefit of SAM also improves with overparameterization in terms of depth, although the increase is not significant for ResNet.
  }
  \label{fig:depth}
\end{wrapfigure}


We experiment with changing the number of layers for MNIST/MLP and CIFAR-10/ResNet-18.
Precisely, we change the number of width-1000 hidden layers in the MLP and the number of resblocks in each stage of ResNet-18 for MNIST and CIFAR-10, respectively. 
The results are provided in \cref{fig:depth}. 
We find that the benefit of SAM also improves with overparameterization in depth for MLPs, while the increase is not significant for ResNets.
We suspect that this may result from the complex interplay of various intricate factors and decisions involved in increasing depth in modern architectures such as ResNets (\eg, deciding whether to increase the number of resblocks, layers within the resblock, width stages, or some combination of them), each affecting the training dynamics in distinct ways.
Further study into these factors would be an interesting direction to understand these influences more comprehensively.

\subsection{SAM vs. weight decay} \label{app:wd}

\begin{figure}[!th]
    \centering
    \includegraphics[width=0.19\linewidth]{figures/wd/wd_4.pdf}
    \includegraphics[width=0.19\linewidth]{figures/wd/wd_16.pdf}
    \includegraphics[width=0.19\linewidth]{figures/wd/wd_32.pdf}
    \includegraphics[width=0.19\linewidth]{figures/wd/wd_64.pdf}
    \hspace{0.15em}
    \includegraphics[width=0.19\linewidth]{figures/wd/diff.pdf}
  \caption{
    Effect of weight decay on validation accuracy of ResNet-18 trained on CIFAR-10 with SAM and SGD over various model scales and the improvement in validation metrics by SAM when considering weight decay. 
    Even after being given much larger values of weight decay, SGD is not able to outperform SAM on any model size.
  }
  \label{fig:sam_vs_wd}
\end{figure}

We conduct experiments on CIFAR-10/ResNet-18 for four different model sizes and five values of weight decay. 
The results are provided in \cref{fig:sam_vs_wd}. 
We find that SGD with stronger weight decay cannot replace SAM for overparameterized models; in fact, using larger weight
decay rather degrades the performance of SGD for these models. This potentially indicates that, unlike SAM, a generic regularization strategy may not suffice for overparameterized models.


\subsection{Results on SAM under Linearized Regime}

\label{app:exp-linear}

\begin{wrapfigure}{r}{0.45\linewidth}
  \vspace{-2.7em}
    \centering
    \begin{subfigure}{\linewidth}
        \centering
        \includegraphics[width=0.49\linewidth]{figures/linearized/alpha_final_val_acc.pdf}
        \includegraphics[width=0.49\linewidth]{figures/linearized/alpha_stability.pdf}
        \vspace{-2em}
    \end{subfigure}
  \caption{
    Effect of linearization on SAM.
    Here, $\alpha$ controls the degree of linearization.
    High linearization does not yield improvement, and in fact, SAM ($\rho=0.001$) underperforms SGD in the linearized regime (left), although both achieve effective linearization at $\alpha = 1000$ with stability close to $1$ (right).
  }
  \label{fig:result_overparam_linearization}
  \vspace{-1em}
\end{wrapfigure}

Recent studies suggest that highly overparameterized models can behave like linearized networks \citep{jacot2018neural}, while such an implicit linearization phenomenon can occur independently of overparameterization \citep{chizat2019lazy}.
One might wonder if the increased effectiveness of SAM directly comes from the overparameterization itself or is rather due to linearization.
To verify, we reproduce experiments in \citet{chizat2019lazy} and see how SAM performs in the linearized regimes while fixing the number of parameters.
Specifically, we train VGG-11 \citep{simonyan2014very} on CIFAR-10 with the $\alpha$-scaled squared loss $L(x, y) = \| f(x) - y/\alpha \|^2$ and use the centered model whose initial output is set to $0$.
Here, a large value of $\alpha$ leads to a higher degree of linearization of the models.

The results are reported in \cref{fig:result_overparam_linearization}.
We observe that SAM underperforms SGD in the linearized regimes;
while SAM ($\rho=0.001$) and SGD both achieve effective linearization at $\alpha=1000$, SAM underperforms SGD by more than $10\%$.
This indicates that linearization is not the main factor, and overparameterization itself is what leads to the improvement of SAM in previous experiments.

\subsection{SGD with twice the epochs}
\label{app:sgd-twice}

\begin{wrapfigure}{r}{0.45\linewidth}
  \vspace{-1.5em}
    \centering
    \begin{subfigure}{\linewidth}
        \centering
        \includegraphics[width=0.49\linewidth]{figures/cifar/ResNet18/sgd_twice/largesgd_mlf(optimizer)-flt(pai_sparsity_0.0-num_filters!_24)-max.pdf}
        \includegraphics[width=0.49\linewidth]{figures/cifar/ResNet18/sgd_twice/largesgd_diff_mlf(optimizer)-flt(pai_sparsity_0.0-num_filters!_24)-max.pdf}
        \vspace{-1.5em}
    \end{subfigure}
  \caption{
    SGD with twice the epochs. SAM still increasingly outperforms SGD with further overparameterization.
  }
  \label{fig:sgd_twice}
  \vspace{-1em}
\end{wrapfigure}

We compare SAM against SGD trained for twice as many epochs to account for the additional gradient computation of SAM. 
Precisely, we report the validation accuracy of SGD trained for 400 epochs and compare it to SAM trained for the original 200 epochs on ResNet-18/CIFAR-10 in \cref{fig:sgd_twice}.
Similar to our observation in \cref{sec:experiments-main}, we find that the generalization benefit of SAM improves with overparameterization.
This outcome is expected: granting SGD additional training iterations can lead to overfitting and degrade generalization, potentially placing it at a disadvantage.
Thus, while providing a larger training budget to SGD may appear fair, it is not necessarily a more equitable comparison, and may in fact be unfair.



\section{Empirical Measurement of Lipschitz Smoothness and PL Constants} \label{app:emp-measure}

\begin{wrapfigure}{r}{0.45\linewidth}
  \centering
  \vspace{-1.3em}
  \begin{subfigure}{0.46\linewidth}
      \centering
      \includegraphics[width=\linewidth]{figures/emp_measure/lipschitz.pdf}
      \caption{Lipschitz smoothness}
      \label{fig:result_empirical_lipschitz}
  \end{subfigure}
  \begin{subfigure}{0.46\linewidth}
      \centering
      \includegraphics[width=\linewidth]{figures/emp_measure/PL.pdf}
      \caption{PL-ness}
      \label{fig:resut_empirical_pl}
  \end{subfigure}
  \vspace{-0.5em}
  \caption{
    The empirical measurement of Lipschitz smoothness (a) and PL-ness (b) for CIFAR-10 and ResNet-18.
    The Lipschitz smoothness becomes smaller and PL constant becomes larger as the model size increases.
  }
  \label{fig:result_overparam_empirical_measure}
  \vspace{-4em}
\end{wrapfigure}

We measure the empirical Lipschitz smoothness constant and PL constant based on \citet{zhang2019which}.
Specifically, the empirical smoothness $\hat\beta(x_k)$ and empirical PL constant $\hat\alpha(x_k)$ at iteration $k$ are computed as follows:
\begin{align}
    \hat{\beta}(x_k) &= \max_{\gamma \in \{\delta, 2\delta, \dots, 1\}} \frac{\|\nabla f(x_k+\gamma \mathbf{d})-\nabla f(x_k)\|_2}{\|\gamma \mathbf{d}\|_2},\\
    \hat{\alpha}(x_k) &= \min_{\gamma \in \{\delta, 2\delta, \dots, 1\}} \frac{\|\nabla f(x_k+\gamma \mathbf{d})\|_2^2}{f(x_k+\gamma \mathbf{d})-f(x^\star)}.
\end{align}
Here, $\mathbf{d}=x_{k+1}-x_{k}$ and $\delta \in (0, 1)$, for which we choose $\delta = 0.1$.
We measure these quantities at the end of every epoch throughout training.
The results are shown in \cref{fig:result_overparam_empirical_measure}.


\input{text/SAM_stability_unbatched}
\input{text/Linear_convergence_Stoch_SAM}
\input{text/Generalization_bound}

