\section{Proof of Theorem \ref{thm:sam-stability}}

\label{app:sam-stability}

Here, we provide the detailed proof of \cref{thm:sam-stability}.

We first define the linearized stochastic SAM, which is obtained by applying a first-order Taylor approximation to the stochastic SAM update:
\begin{definition}[Linearized stochastic SAM]
    We define a linearized stochastic SAM as
    \begin{equation} \label{eq:quad-linearized-SAM}
        x_{t+1} = x_t - \eta H_{\xi_t} (x_{t+1/2} - x^\star),
    \end{equation}
    where $x_{t+1/2} = x_t + \rho H_{\xi_t} (x_t - x^\star)$ is the linearized ascent step and $H_{\xi_t}$ is the stochastic Hessian estimate at step $t$.
\end{definition}
This actually corresponds to using SAM for the quadratic approximation of $f$ near $x^\star$, and we use this fact in the experiment setup.
We assume without loss of generality that the fixed point $x^\star$ satisfies $x^\star = 0$.

Then, we are ready to present the proof of \cref{thm:sam-stability}. Our goal is to derive a bound of the form $\E\| x_{t}\|^2\leq C \|x_0\|^2$.
We first apply \eqref{eq:quad-linearized-SAM} to $\expec{}{\|x_{t+1}\|^2 \,\lvert\, x_t}$ and continue expanding the terms as follows:

\begin{align*}
    \expec{}{\|x_{t+1}\|^2 \,\lvert\, x_t}
    &= \expec{}{\|x_t - \eta H_{\xi_t} (x_t + \rho H_{\xi_t}x_t)\|^2 \,\Big\lvert\, x_t}\\
    &= x_t^\top \expec{}{\left(I - \eta H_{\xi_t} - \eta\rho H_{\xi_t}^2\right)^2 \,\Big\lvert\, x_t}x_t\\
    &= x_t^\top \expec{}{ I - 2\eta(H_{\xi_t} + \rho H_{\xi_t}^2) + \eta^2 \left( H_{\xi_t} + \rho H_{\xi_t}^2\right)^2 \,\Big\lvert\, x_t}x_t\\
    &= x_t^\top \expec{}{ I - 2\eta(H_{\xi_t} + \rho H_{\xi_t}^2) + \eta^2 \left( H_{\xi_t}^2 + 2\rho H_{\xi_t}^3 + \rho^2 H_{\xi_t}^4\right)  \,\Big\lvert\, x_t}x_t\\
    &= x_t^\top \expec{}{I - 2\eta H_{\xi_t} + \eta(\eta-2\rho)H_{\xi_t}^2 + 2\eta^2\rho H_{\xi_t}^3 + \eta^2\rho^2 H_{\xi_t}^4 \,\Big\lvert\, x_t}x_t\\
    &= x_t^\top \Big( I - 2\eta H + \eta(\eta-2\rho) \E H_{\xi_t}^2 + 2\eta^2\rho \E H_{\xi_t}^3 + \eta^2\rho^2 \E H_{\xi_t}^4 \Big) x_t \\
    &= x_t^\top \Big( I - 2\eta H  + \eta(\eta-2\rho) H^2 + 2\eta^2\rho H^3 + \eta^2\rho^2 H^4  \\
    & \hspace{3em} + \eta(\eta-2\rho) (\E H_{\xi_t}^2-H^2) + 2\eta^2\rho (\E H_{\xi_t}^3-H^3) + \eta^2\rho^2 (\E H_{\xi_t}^4 -H^4) \Big) x_t \\
    &= x_t^\top \Big( \left(I - \eta H - \eta\rho H^2\right)^2  \\
    & \hspace{4em} + \eta(\eta-2\rho) (\E H_{\xi_t}^2-H^2) + 2\eta^2\rho (\E H_{\xi_t}^3-H^3) + \eta^2\rho^2 (\E H_{\xi_t}^4 -H^4) \Big) x_t
\end{align*}

Since $x^\top A x \leq \lambda_{\text{max}}(A)\|x\|^2$ holds for any vector $x$ and any symmetric matrix $A$ with maximum eigenvalue $\lambda_{\text{max}}(A)$, applying this inequality and taking the total expectation gives the following:

\begin{align*}
    \expec{}{\|x_{t+1}\|^2 }
    &\leq \lambda_{\text{max}}\bigg( \left(I - \eta H - \eta\rho H^2\right)^2 + \eta(\eta-2\rho) (\E H_{\xi}^2-H^2)  \\
    & \hspace{5em} + 2\eta^2\rho (\E H_{\xi}^3-H^3) + \eta^2\rho^2 (\E H_{\xi}^4 -H^4) \bigg) \expec{}{\|x_t\|^2}.
\end{align*}

Recursively applying this bound gives

\begin{align*}
    \E\| x_{t}\|^2
    &\leq \lambda_{\text{max}}\bigg( \left(I - \eta H - \eta\rho H^2\right)^2 + \eta(\eta-2\rho) (\E H_{\xi}^2-H^2) \\
    & \hspace{5em} + 2\eta^2\rho (\E H_{\xi}^3-H^3) + \eta^2\rho^2 (\E H_{\xi}^4 -H^4) \bigg)^t \|x_0\|^2.
\end{align*}

Here, we can see that $x^\star$ is linearly stable if

\begin{align*}
    & \lambda_{\text{max}}\bigg((I - \eta H - \eta \rho H^2)^2  \\
    & \hspace{4em} + \eta(\eta-2\rho) (\E H_{\xi}^2-H^2) + 2\eta^2\rho (\E H_{\xi}^3-H^3) + \eta^2\rho^2 (\E H_{\xi}^4 -H^4) \bigg) \leq 1. \\
    && \qedsymbol{}
\end{align*}
