\section{Proof of Theorem \ref{thm:main-PL-stochSAM}} \label{app:prooflinconv}
In this section, we show that a stochastic SAM converges linearly under an overparameterized regime.
To put this into perspective, this is the rate of convergence of gradient descent for a family of functions satisfying the PL-condition and smoothness assumptions \citep{karimi2016linear}.
We first make several remarks on this result below.
\begin{itemize}[leftmargin=*]
    \item Crucially, this result shows that with overparameterization, a stochastic SAM can converge as fast as the deterministic gradient method at a linear convergence rate.
    It is much faster than the well-known sublinear rate of $\mathcal{O}(1/t)$ for SAM \citep{andriushchenko2022towards}.
    \item When $\rho=0$, we recover the well-known convergence rate for SGD in the interpolated regime \citep{bassily2018exponential}.
    \item This result does not require the bounded variance assumption \citep{andriushchenko2022towards} since the interpolation provides necessary guarantees.
    This suggests that overparameterization can ease the convergence of SAM.
\end{itemize}

We prove the convergence for an unnormalized mini-batch SAM given as
$$
x_{t+1} = x_t - \eta g_{t}^{_B}(x_t+\rho g_{t}^{_B}(x_t)),
$$
where $g_{t}^{_B}(x)=\frac{1}{B}\sum_{i \in I_t^B} \nabla f_i(x)$ and $I_t^B \subseteq \{1,\dots,n\}$ is a set of indices for data points in the mini-batch of size $B$ sampled at step $t$.
This is a more general stochastic variant of SAM: the stochastic SAM in Section~\ref{sec:convergence} is the particular case of mini-batch SAM with mini-batch size $B=1$.

We first make the following assumptions:
\begin{description}
    \item[\textbf{(A1)}] \hypertarget{assump:betasmoo}{($\beta$-smoothness of $f_i$)}. \textit{There exists  $\beta > 0$ s.t. $ 
    \| \nabla f_i (x) - \nabla f_i (y) \| \leq \beta \| x-y \|$ for all $x,y  \in \mathbb R ^d$,}
    \item[\textbf{(A2)}] \hypertarget{assump:lambdasmoo}{($\lambda$-smoothness of $f$)}. \textit{There exists  $\lambda\!>\!0$ such that $ 
    \| \nabla f (x) -\nabla f (y) \| \leq \lambda \| x-y \|$ for all $x,y  \in \mathbb R ^d$,} 
    \item [\textbf{(A3)}] \hypertarget{assump:PL}{($\alpha$-PLness of $f$)}. \textit{There exists $\alpha > 0$ s.t. $\| \nabla f (x) \|^2 \geq \alpha (f(x)-f(x^\star))$ for all $x \in \mathbb R ^d$,}
    \item[\textbf{(A4)}] \hypertarget{assump:interpol}{(Interpolation)}. \textit{If $f(x^\star)=0$ and $~\nabla f(x^\star)=0$, then $f_i(x^\star)=0$ and $\nabla f_i(x^\star)=0$ for $i=1,\hdots,n$, where $n$ is the number of training data points.}
\end{description}

Before we prove the main theorem, we first introduce two lemmas that are important to the proof.

\begin{lemma}\label{lemma:mini-batch-grad-align}
Suppose that Assumption \hyperlink{assump:betasmoo}{\textbf{(A1)}} holds.
Then
\begin{align}
   \langle \nabla f_i( x_{t+1/2}), \nabla f (x_t) \rangle & \geq \langle  \nabla f_i(x_t) , \nabla f (x_t)\rangle - \frac{{ \beta} \rho}{2} \| \nabla f_i (x_t) \|^2  - \frac{\beta\rho}{2} \| \nabla f (x_t)\|^2,
\end{align}
where $x_{t+1/2}=x_t + \rho \nabla f_i(x_t)$.
\end{lemma}
This lemma shows how well a stochastic SAM gradient $\nabla f_i( x_{t+1/2})$ aligns with the true gradient $\nabla f (x_t)$. 
The two gradients become less aligned as $\beta$ and $\rho$ grow bigger, \ie for sharper landscape and larger perturbation size. 

\begin{proof}
We first add and subtract $\nabla f_i^{}(x_t)$ on the left side of the inner product

\begin{align} \label{eq:step1bounding1}
   \langle \nabla f_i( x_{t+1/2}), \nabla f (x_t) \rangle =   \underbrace{\langle \nabla f_i( x_{t+1/2})- \nabla f_i(x_t) , \nabla f (x_t) \rangle}_{\tau_1} + \langle  \nabla f_i(x_t) , \nabla f (x_t)\rangle.
\end{align}

We here bound the term $\tau_1$ so that it becomes an equality when $\rho=0$. 
To achieve this, we start with the following binomial square, which is trivially lower bounded by 0.
\begin{align*}
   0 &\leq \frac{1}{2}\| \nabla f_i( x_{t+1/2})- \nabla f_i(x_t) +  \beta\rho \nabla f (x_t)\|^2
\end{align*}
We then expand the above binomial square so that the term containing $\tau_1$ appears.

\begin{align*}
   0 &\leq \frac{1}{2}\| \nabla f_i( x_{t+1/2}) - \nabla f_i(x_t) \|^2 + \underbrace{\langle \nabla f_i( x_{t+1/2})- \nabla f_i(x_t) ~,~  \beta\rho \nabla f (x_t) \rangle}_{\beta\rho  \tau_1} \, + \,\frac{1}{2}\| \beta\rho \nabla f (x_t)\|^2
\end{align*}

We subtract the term $\beta\rho  \tau_1$ on both sides of the inequality which gives
\begin{align*}
   -  \langle \nabla f_i( x_{t+1/2})- \nabla f_i(x_t) ~,~  \beta\rho \nabla f (x_t) \rangle
    &\leq  \frac{1}{2}\| \nabla f_i( x_{t+1/2})- \nabla f_i(x_t) \|^2  +\frac{\beta^2\rho^2 }{2}\| \nabla f (x_t)\|^2. 
\end{align*}

Then we upper bound the right-hand side using the Assumption \hyperlink{assump:betasmoo}{\textbf{(A1)}}: 

\begin{align*}
   -   \langle \nabla f_i( x_{t+1/2})- \nabla f_i(x_t) ~,~  \beta\rho \nabla f (x_t) \rangle
    &\leq  \frac{{ \beta}^2}{2} \|  x_{t+1/2}- x_t \|^2 +\frac{\beta^2\rho^2 }{2}\| \nabla f (x_t)\|^2 \\
    &=  \frac{{ \beta}^2 \rho^2}{2} \| \nabla f_i (x_t) \|^2  +\frac{\beta^2\rho^2 }{2} \| \nabla f (x_t)\|^2. 
\end{align*}

We divide both sides by $\beta\rho$ (for $\rho>0$; when $\rho=0$ we have $x_{t+1/2}=x_t$ and the bound below holds trivially), obtaining: 

   \begin{align*}
   -   \langle \nabla f_i( x_{t+1/2})- \nabla f_i(x_t) , \nabla f (x_t) \rangle 
    &\leq  \frac{{ \beta} \rho}{2} \| \nabla f_i (x_t) \|^2  + \frac{\beta\rho}{2} \| \nabla f (x_t)\|^2.
   \end{align*}
   
Applying this to (\ref{eq:step1bounding1}) gives the bound in the lemma statement.
\end{proof}

\begin{lemma}\label{lemma:mini-batch-grad-norm-bound}
    Suppose that Assumption \hyperlink{assump:betasmoo}{\textbf{(A1)}} holds.
    Then
    \begin{equation}
    \left\|  \nabla f_i(x_{t+{1/2}})\right\|^2 \leq (\beta\rho+1)^2\| \nabla f_i ( x_t) \|^2,
    \end{equation}
    where $x_{t+1/2}=x_t + \rho \nabla f_i(x_t)$.
\end{lemma}

This second lemma shows that the norm of a stochastic SAM gradient is bounded by the norm of the stochastic gradient.
Similar to Lemma~\ref{lemma:mini-batch-grad-align}, as $\beta$ and $\rho$ grow bigger, the norm of a stochastic SAM gradient can become larger than the norm of the stochastic gradient.

\begin{proof}
    We use the following binomial squares:
    \begin{equation*}\label{eq:binomsq}
        \begin{split}
            & \| \nabla f_i ( x_{t+1/2}) \|^2  \\
            &= \| \nabla f_i ( x_{t+1/2}) - \nabla f_i(x_t) \|^2  +2 \langle \nabla f_i(  x_{t+1/2}  )-\nabla f_i ( x_t) , \nabla f_i (x_t) \rangle + \| \nabla f_i ( x_t) \|^2.
        \end{split}
    \end{equation*}
    
    We bound the right-hand side using Cauchy-Schwarz inequality and Assumption \hyperlink{assump:betasmoo}{\textbf{(A1)}}, which gives
    \begin{align*}\label{eq:mb-norm_bound}
    & \left\|  \nabla f_i(x_{t+{1/2}})\right\|^2 \\
    &= \| \nabla f_i ( x_{t+1/2}) - \nabla f_i(x_t) \|^2 + 2 \langle \nabla f_i(  x_{t+1/2}  )-\nabla f_i ( x_t), \nabla f_i (x_t) \rangle + \| \nabla f_i ( x_t) \|^2\\
    &\underset{\text{C.S.}}{\leq}
    \| \nabla f_i ( x_{t+1/2}) - \nabla f_i(x_t) \|^2 + 2\| \nabla f_i(  x_{t+1/2}  )-\nabla f_i ( x_t) \|  \|\nabla f_i (x_t)\| + \| \nabla f_i ( x_t) \|^2\\
    &\underset{\hyperlink{assump:betasmoo}{\textbf{(A1)}}}{\leq}
    ~ \beta^2\|  x_{t+1/2}-x_t \|^2 +  2 \beta\|  x_{t+1/2}-x_t \|  \|\nabla f_i (x_t)\| + \| \nabla f_i ( x_t) \|^2 ~\\
    &~=~ 
    ~ \beta^2\rho^2\| \nabla f_i(x_t) \|^2 +  2 \beta\rho\| \nabla f_i(x_t) \|^2 + \| \nabla f_i ( x_t) \|^2 ~\\
    &~=~
    (\beta\rho+1)^2\| \nabla f_i ( x_t) \|^2
    \end{align*}
\end{proof}

These two lemmas essentially show how similar a stochastic SAM gradient is to the stochastic gradient, where the two become more similar as $\beta$ and $\rho$ decrease, which aligns well with our intuition.
Using Lemmas \ref{lemma:mini-batch-grad-align} and \ref{lemma:mini-batch-grad-norm-bound}, we provide the convergence result in the following theorem.

\begin{theorem}\label{thm:PL-mini-batch-SAM}
    Suppose that Assumptions \hyperlink{assump:betasmoo}{\textbf{(A1-4)}} hold.
    For any mini-batch size $B \in \mathbb{N}$ and  $\rho\leq \frac{1}{(\beta/\alpha + 1/2)\beta}$, unnormalized mini-batch SAM with constant step size $\eta^\star_B \defeq \frac{1-(\kappa_B +1/2)\beta\rho}{2\lambda\kappa_B(\beta\rho+1)^2}$ gives the following guarantee at step $t$:
    \begin{equation*}\label{eq:theorem-PL-mini-batch-SAM}
    \ex{x_t}{f(x_t)}\leq
    \left(1 -  \frac{\alpha\,\eta^\star_B}{2}  \Big(1-\Big(\kappa_B +\frac{1}{2}\Big)\beta\rho\Big)\right)^t\,f(x_0),
    \end{equation*}
    where $\kappa_B = \frac{1}{B}\left(\frac{B-1}{2}+\frac{\beta}{\alpha}\right)$. 
\end{theorem}

This theorem states that mini-batch SAM converges at a linear rate under overparameterization.

\begin{proof}
The proof can be outlined in three steps.

\begin{framed}
\vspace{-4mm}
    \begin{description}
        \item[step 1.] Handle terms containing mini-batch SAM gradient $g_{t}^{_B}(x_t + \rho g_{t}^{_B}(x_t))$ using bounds from \hyperlink{assump:betasmoo}{\textbf{(A1)}}.
        \item[step 2.] Take conditional expectation $\expec{}{\,\cdot \, | x_t}$ and substitute expectation of function of mini-batch gradient $g_t^{_B}$ with terms containing $\| \nabla f(x_t)\|$ and $\expec{}{\| \nabla f_i(x_t)\|^2 ~\Big\lvert~ x_t}$.
        \item[step 3.] Bound the two terms from \textbf{step 2}, one using Assumptions \hyperlink{assump:betasmoo}{\textbf{(A1)}} and \hyperlink{assump:interpol}{\textbf{(A4)}} and the other using Assumptions \hyperlink{assump:PL}{\textbf{(A3)}} and \hyperlink{assump:interpol}{\textbf{(A4)}}, so that all terms contain $f(x_t)$. 
        Then finally we take total expectations to derive the final runtime bound.
    \end{description}
\vspace{-3.5mm}
\end{framed}

\enspace

We start from the quadratic upper bound derived from Assumption \hyperlink{assump:lambdasmoo}{\textbf{(A2)}}:
$$f(x_{t+1})\leq f(x_{t})+\langle \nabla f(x_t),~ x_{t+1}-x_t\rangle+\frac{\lambda}{2}\|x_{t+1}-x_{t}\|^2.$$

Applying mini-batch SAM update, we then have

$$f(x_{t})-f(x_{t+1})\geq \eta \left\langle~ \nabla f(x_t)~,~ g_{t}^{_B}(x_{t+{1/2}})
~\right\rangle -\frac{\eta^2\lambda}{2}\left\|  g_{t}^{_B}(x_{t+{1/2}})\right\|^2,$$ 
where $x_{t+1/2}=x_t+\rho g_{t}^{_B}(x_t)$.

\fbox{\textbf{step 1.}} \ \ We can see that there are two terms that contain a mini-batch SAM gradient $g_{t}^{_B}(x_{t+1/2})$.
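Each can be bounded using Lemmas \ref{lemma:mini-batch-grad-align} and \ref{lemma:mini-batch-grad-norm-bound}, which hold verbatim with $\nabla f_i$ replaced by the mini-batch gradient $g_{t}^{_B}$ because an average of $\beta$-smooth functions is again $\beta$-smooth; this gives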


\begin{align*}
    &f(x_{t})-f(x_{t+1}) \\
    &\geq
    \eta \left(\langle  g_{t}^{_B}(x_t) , \nabla f (x_t)\rangle - \frac{{ \beta} \rho}{2} \| g_{t}^{_B} (x_t) \|^2  - \frac{\beta\rho}{2} \| \nabla f (x_t)\|^2\right) \\
    &~~~~~~~~~
    -\frac{\eta^2\lambda}{2}~(\beta\rho+1)^2~\| g_{t}^{_B} ( x_t) \|^2\\
    &~~~=~~
    \eta\langle  g_{t}^{_B}(x_t) , \nabla f (x_t)\rangle - \frac{\eta \beta\rho}{2} \| \nabla f (x_t)\|^2 
    - \frac{\eta}{2} \left(\eta\lambda~(\beta\rho+1)^2 + \beta\rho \right)~\| g_{t}^{_B} ( x_t) \|^2.
\end{align*}

\fbox{\textbf{step 2.}} \ \ Now we apply $\expec{}{\, \cdot \, | x_t}$ to all the terms.

\begin{align*}
    &\expec{}{f(x_{t})-f(x_{t+1}) ~\big\lvert~ x_t}\\
    &=
    f(x_{t})-\expec{}{f(x_{t+1}) ~\big\lvert~ x_t}\\
    &\geq 
    \eta\expec{}{\langle  g_{t}^{_B}(x_t) , \nabla f (x_t)\rangle ~\Big\lvert~ x_t} - \frac{\eta \beta\rho}{2} \expec{}{\| \nabla f (x_t)\|^2  ~\big\lvert~ x_t} \\
    &\ \ \ \ \ \ 
    - \frac{\eta}{2} \left(\eta\lambda~(\beta\rho+1)^2 + \beta\rho \right)~ \expec{}{\| g_{t}^{_B} ( x_t) \|^2 ~\Big\lvert~ x_t}\\
    &=
    \eta\left( 1 - \frac{\beta\rho}{2}\right) \| \nabla f (x_t)\|^2 
    - \frac{\eta}{2} \left(\eta\lambda~(\beta\rho+1)^2 + \beta\rho \right)~ \expec{}{\| g_{t}^{_B} ( x_t) \|^2 ~\Big\lvert~ x_t}.
\end{align*}


Here we expand the term $\expec{}{\| g_{t}^{_B} ( x_t) \|^2 ~\Big\lvert~ x_t}$ by expanding the mini-batched function into individual function estimators as follows.

\begin{equation}\label{eq:mb-norm}
    \begin{split}
        &\expec{g^{_B}_t}{\norm{g_{t}^{_B}(x_t)}^2 ~\Big\lvert~ x_t}\\
        &= 
        \expec{I_t^B}{\left\langle \frac{1}{B}
        \sum_{i\in I_t^B} \nabla f_i(x_t)~,~ \frac{1}{B}
        \sum_{j \in I_t^B} \nabla f_j(x_t) \right\rangle ~\Bigg\lvert~ x_t}\\
        &=
        \frac{1}{B^2} \left\lbrace
        \sum_{i\in I_t^B} \expec{f_i}{\norm{\nabla f_i (x_t)}^2 ~\Big\lvert~ x_t}
        + \sum_{i\in I_t^B} \sum_{\substack{j\in I_t^B\\\text{(}j\neq i\text{)}}}
        \expec{f_i, f_j}{\iprod{\nabla f_i (x_t)} {\nabla f_j (x_t)} ~\Big\lvert~ x_t}
        \right\rbrace\\
        &=
        \frac{1}{B} \expec{}{\norm{\nabla f_i(x_t)}^2 ~\Big\lvert~ x_t} + \frac{B-1}{B} \norm{\nabla f(x_t)}^2.
    \end{split}
\end{equation}

Using (\ref{eq:mb-norm}), we get
\begin{align}
    &f(x_{t})-\expec{}{f(x_{t+1}) ~\big\lvert~ x_t} \nonumber\\
    &\geq
    \eta\left( 1 - \frac{\beta\rho}{2}\right) \| \nabla f (x_t)\|^2 \nonumber\\
    &~~~
    - \frac{\eta}{2} \left(\eta\lambda~(\beta\rho+1)^2 + \beta\rho \right)  \left( \frac{1}{B}\expec{}{\norm{\nabla f_i(x_t)}^2 ~\Big\lvert~ x_t} + \frac{B-1}{B} \norm{\nabla f(x_t)}^2\right) \nonumber\\
    &=
    \eta \left( \left(1-\frac{\beta\rho}{2}\right) - \frac{B-1}{2B}\left(\eta\lambda(\beta\rho+1)^2+\beta\rho\right) \right) \| \nabla f (x_t) \|^2 \nonumber \\
    &~~~
    -\frac{\eta}{2B} \left(\eta\lambda(\beta\rho+1)^2+\beta\rho\right) \expec{}{\norm{\nabla f_i(x_t)}^2 ~\Big\lvert~ x_t} \label{eq:step2-result}.
\end{align}

\fbox{\textbf{step 3.}} \ \ In this step, we bound the two terms and take the total expectation to derive the final runtime bound.

We first derive a bound for $\expec{}{\norm{\nabla f_i(x_t)}^2 ~\Big\lvert~ x_t}$.
We start from the following bound derived from Assumption \hyperlink{assump:betasmoo}{\textbf{(A1)}}:
\begin{equation*}
\|\nabla f_i(x_t)-\nabla f_i(x^\star)\|^2 \leq 2\beta (f_i(x_t) - f_i(x^\star)).
\end{equation*}

By Assumption \hyperlink{assump:interpol}{\textbf{(A4)}}, this reduces to
\begin{equation*}
\|\nabla f_i(x_t)\|^2 \leq 2\beta f_i(x_t).
\end{equation*}

Applying this to (\ref{eq:step2-result}) gives
\begin{align}
f(x_{t})-\expec{}{f(x_{t+1}) ~\big\lvert~ x_t}
&\geq
\eta \left( \left(1-\frac{\beta\rho}{2}\right) - \frac{B-1}{2B}\left(\eta\lambda(\beta\rho+1)^2+\beta\rho\right) \right) \| \nabla f (x_t) \|^2 \nonumber\\
&~~~~~
-\frac{\eta\beta}{B} \left(\eta\lambda(\beta\rho+1)^2+\beta\rho\right) \expec{}{f_i(x_t) \lvert x_t} \nonumber\\
&=
\eta \underbrace{\left( \left(1-\frac{\beta\rho}{2}\right) - \frac{B-1}{2B}\left(\eta\lambda(\beta\rho+1)^2+\beta\rho\right) \right)}_{\tau_2} \| \nabla f (x_t) \|^2 \nonumber\\
&~~~~~
-\frac{\eta\beta}{B} \left(\eta\lambda(\beta\rho+1)^2+\beta\rho\right)  f(x_t).
\label{eq:step3-1}
\end{align}

Next, to bound $\| \nabla f(x_t) \|^2$, we use the following bound derived from applying $f(x^\star)=0$ from \hyperlink{assump:interpol}{\textbf{(A4)}} to \hyperlink{assump:PL}{\textbf{(A3)}}:
\begin{equation} \label{eq:final_pl}
    \| \nabla f (x) \|^2 \geq \alpha f(x).
\end{equation}
Assuming $\tau_2\geq0$, for which we provide a sufficient condition at the end of the proof, we apply (\ref{eq:final_pl}) to (\ref{eq:step3-1}), which gives
\begin{align*}
& f(x_{t})-\expec{}{f(x_{t+1}) ~\big\lvert~ x_t}\\
&\geq
\eta\alpha \left( \left(1-\frac{\beta\rho}{2}\right) - \frac{B-1}{2B}\left(\eta\lambda(\beta\rho+1)^2+\beta\rho\right) \right)  f (x_t) -\frac{\eta\beta}{B} \left(\eta\lambda(\beta\rho+1)^2+\beta\rho\right)  f(x_t) \\
&=
\eta
\left( 
    \alpha-\alpha\bigg(\underbrace{\frac{1}{B}\Big(\frac{B-1}{2}+\frac{\beta}{\alpha}\Big)}_{\kappa_B} +\frac{1}{2} \bigg)\beta\rho
    - \eta (\beta\rho+1)^2 \underbrace{\frac{\lambda}{B}\Big(\alpha\frac{B-1}{2} + \beta\Big)}_{\lambda\alpha\kappa_B}
\right)  f(x_t) \\
&=
\eta\alpha\left(1-\left(\kappa_B +\frac{1}{2}\right)\beta\rho- \eta\lambda (\beta\rho+1)^2 \kappa_B\right)  f(x_t).
\end{align*}


Hence, we get
\begin{align*}
\expec{}{f(x_{t+1}) ~\big\lvert~ x_t}&\leq \left(1-\eta\alpha\Big(1-\Big(\kappa_B +\frac{1}{2}\Big)\beta\rho\Big) + \eta^2\alpha\lambda (\beta\rho+1)^2 \kappa_B \right)f(x_t).
\end{align*}

Applying total expectation on both sides gives

\begin{align}
    \expec{}{f(x_{t+1})} \leq \left(1-\eta\alpha\Big(1-\Big(\kappa_B +\frac{1}{2}\Big)\beta\rho\Big) + \eta^2\alpha\lambda (\beta\rho+1)^2 \kappa_B \right)\expec{}{f(x_t)}.\label{eq:final_bound_PL-SGD}
\end{align}


Optimizing the multiplicative term in (\ref{eq:final_bound_PL-SGD}) with respect to $\eta$ gives $\eta = \frac{1-(\kappa_B +1/2)\beta\rho}{2\lambda\kappa_B(\beta\rho+1)^2}$, which is $\eta^\star_B$ in the theorem statement. 
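Under the assumption $\rho\leq \frac{1}{(\beta/\alpha + 1/2)\beta}$ we have $\eta^\star_B \geq 0$ (since $f$ is itself $\beta$-smooth, \hyperlink{assump:PL}{\textbf{(A3)}} forces $\alpha\leq 2\beta$ and hence $\kappa_B\leq\beta/\alpha$), and applying this step size to (\ref{eq:final_bound_PL-SGD}) gives 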
$$\ex{}{f(x_{t+1})}\leq
\left(1 -  \frac{\alpha\,\eta^\star_B}{2}  \Big(1-\Big(\kappa_B +\frac{1}{2}\Big)\beta\rho\Big)\right)\,\ex{}{f(x_t)},$$
which provides the desired convergence rate.

Last but not least, we calculate the upper bound for $\rho$ to satisfy the assumption $\tau_2\geq 0$ by substituting $\eta$ for $\eta^\star_B$ in $\tau_2$, yielding $\rho\leq\frac{2B\kappa_B+2\beta/\alpha}{(2B-1)\kappa_B+\beta/\alpha} \frac{1}{\beta}$. 
Minimizing this upper bound with respect to $B$ gives $\rho\leq\frac{1}{\beta}$, which is a looser bound than $\rho\leq \frac{1}{(\beta/\alpha + 1/2)\beta}$. 
\end{proof}