\documentclass[accepted]{uai2022} 
\usepackage[american]{babel}
% \usepackage[british]{babel}
\usepackage{url}            % simple URL typesetting
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{booktabs}
\usepackage{comment}
% \setlength{\marginparwidth}{2cm}
\usepackage[colorinlistoftodos]{todonotes}
\usepackage{amsthm}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{bbm}
\usepackage{nicefrac}  
\usepackage{array}
\usepackage{color}
\usepackage{xr}
%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\newtheorem{thm}{Theorem}
\newtheorem{ifthm}{Informal Theorem}
\newtheorem{lemma}{Lemma}
\newtheorem{cor}{Corollary}
\newtheorem{prop}{Proposition}
% \theoremstyle{definition}
\newtheorem{definition}{Definition}
\newtheorem{remark}{Remark}
\newtheorem{example}{Example}
\newtheorem{assume}{Assumption}
\newtheorem{obs}{Observation}
\newtheorem{claim}{Claim}
\usepackage{amsmath}
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}


% \addFileDependency{<file>}
%   Writes "(<file>)" to the terminal/log, appends <file> to the file list via
%   the internal \@addtofilelist, and writes "No file <file>." when the file
%   cannot be found.
%   NOTE(review): this looks like the usual idiom that lets build tools such as
%   latexmk notice files that are only read through xr -- confirm against the
%   build setup.
% \makeatletter / \makeatother are needed because \@addtofilelist contains `@'.
\makeatletter
\newcommand*{\addFileDependency}[1]{% argument=file name and extension
  \typeout{(#1)}
  \@addtofilelist{#1}
  \IfFileExists{#1}{}{\typeout{No file #1.}}
}
\makeatother

% \myexternaldocument{<base>}
%   Calls xr's \externaldocument{<base>} and then passes both <base>.tex and
%   <base>.aux to \addFileDependency.
%   Argument: the base file name WITHOUT extension.
\newcommand*{\myexternaldocument}[1]{%
    \externaldocument{#1}%
    \addFileDependency{#1.tex}%
    \addFileDependency{#1.aux}%
}

\myexternaldocument{zhang_654}

\title{Stability of SGD: Tightness Analysis and  Improved Bounds}
% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
% \author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2022 paper}{Jane~J.~von~O'L\'opez}{}}
\author[1]{Yikai~Zhang $^*$}
\author[2]{Wenjia~Zhang $^*$}
\author[3]{Sammy~Bald $^*$}
\author[4]{Vamsi~Pingali}
\author[5]{Chao~Chen}
\author[3]{Mayank Goswami}
% Add affiliations after the authors
\affil[1]{%
    Machine Learning Research.\\
    Morgan Stanley
}
\affil[2]{%
    Computer Science Dept.\\
    Rutgers University
}
\affil[3]{%
    Computer Science Dept.\\
    Queens College of CUNY
 }
\affil[4]{
    Mathematics Dept.\\
    Indian Institute of Science
}
\affil[5]{
    Biomedical Informatics Dept.\\
    Stony Brook University
}
  
\begin{document}
\onecolumn
\maketitle
\def\thefootnote{*}\footnotetext{These authors contributed equally to this work}\def\thefootnote{\arabic{footnote}}



\setcounter{thm}{0}
\setcounter{lemma}{0}
\setcounter{obs}{0}
\setcounter{prop}{0}

\section{Missing Proofs}

% ========== Lemma 1 (Dynamics of divergence) ===========


\begin{lemma}[Dynamics of divergence]
Let $f(w;x) = \frac{1}{2}w^\top Aw - yx^\top w$. Assume $y_i = y_i^{'}=1$ for all $i$. Suppose $[x_i-x_i']/\|x_i-x_i'\|$ is an eigenvector of $A$ where $A[x_i-x_i'] = \lambda_{xx'}[x_i-x_i']$. Let $\Delta_{t}=w_t-w_t'$, $\alpha_t \leq \lambda_{xx'}$ be the step size of SGD and $\Delta_0 =0$. Suppose one  runs SGD on $f(w;S)$ and $f(w;S')$ where $S,S'$ are twin datasets and  ${x'}_i^\top x_j = 0, x_i^\top x_j = 0, \;\forall j \neq i$, the dynamics of $\Delta_t$ are given by:

\begin{equation}
\mathbb{E}_{\mathcal{A}} \|\Delta_{t+1}\| = (1-\alpha_t \lambda_{xx'})\mathbb{E}_{\mathcal{A}}\|\Delta_t\|+\frac{\alpha_t}{n}\|x_i-x_i'\|
\end{equation}
\end{lemma}

\begin{proof}


In case the different entry $z_i,z_i'$ is not picked,
the gradient difference of $f(w;z)$ and $f(w;z')$ is 
$$\nabla f(w;z) - \nabla f(w';z') = A[w-w']$$
and in case  different entry $z_i,z_i'$ is picked,
$$\nabla f(w;z) - \nabla f(w';z') = A[w-w'] + [x_i-x_i']$$

By definition $[x_i - x'_i]/\|x_i - x'_i\|$ is an eigenvector of $A$ and $x_i\perp x_j$ if $i\neq j$. Therefore $w-w'$ will stay in the linear span of $x_i - x'_i$. Since $\Delta_0 = 0$, one can inductively show $\Delta_{t} = \theta_t [x_i-x_i']$ where $\theta_t >0$.
Since SGD selects $z_t = z_t'$ with probability $1-\frac{1}{n}$ and a different entry with probability $\frac{1}{n}$ we have the following dynamic:


\begin{equation}\label{divergence_eq}
        \Delta_{t+1}= \left\{
        \begin{array}{ll}
          (I-\alpha_t A)[w_t-w'_t]   & \quad  \text{with prob. } 1-\frac{1}{n} \\
        (I-\alpha_t A)[w_t-w'_t] + \alpha_t[x_i-x_i'] & \quad \text{with prob } 1/n.
        \end{array}
        \right.
\end{equation}



\begin{equation*}
    \begin{aligned}
    \mathbb{E}_{\mathcal{A}}\|\Delta_{t+1}\| =& \mathbb{E}_{\mathcal{A}} \left[\|\Delta_{t+1}\| | \text{Index $i$ is not selected} \right]\mathbb{P}[\text{Index $i$ is not selected}]\\
    &+ \mathbb{E}_{\mathcal{A}} \left[\|\Delta_{t+1}\| |  \text{Index $i$ is selected} \right]\mathbb{P}[ \text{Index $i$ is selected}]\\
    =& (1-\frac{1}{n}) \|(I-\alpha_t A)[w_t-w'_t]\| + \frac{1}{n}\|(I-\alpha_t A)[w_t-w'_t] + \alpha_t[x_i-x_i']\|\\
    =&(1-\frac{1}{n})(1-\alpha_t \lambda_{xx'})\theta_t\|x_i-x_i'\|+\frac{1}{n}[(1-\alpha_t\lambda_{xx'})\theta_t+\alpha_t]\|x_i-x_i'\|\\
    =&(1-\alpha_t \lambda_{xx'})\mathbb{E}_{\mathcal{A}}\|\Delta_t\|+\frac{\alpha_t}{n}\|x_i-x_i'\|
    \end{aligned}
\end{equation*}
    % The lemma follows by combining the above two facts and $A[x_i-x_i'] = \lambda_{xx'}[x_i-x_i']$.
\end{proof}

% ========== Lemma 2 (Lower bound on divergence) ===========
\begin{lemma} [Lower bound on divergence]
% Let $\Delta_{t}$ be $w_t-w_t'$, $\alpha_t$ be the step size of SGD and $\Delta_0 = 0$.
% Suppose $[x_i-x_i']/\|x_i-x_i'\|$ is an eigenvector of $A$ where $A[x_i-x_i'] = \lambda_{xx'}[x_i-x_i']$. Running SGD on $f(w,S)$, we have:\\
Let $f(w;x) = \frac{1}{2}w^\top Aw - yx^\top w$. Assume $y_i = y_i^{'}=1$ for all $i$. Suppose $[x_i-x_i']/\|x_i-x_i'\|$ is an eigenvector of $A$ where $A[x_i-x_i'] = \lambda_{xx'}[x_i-x_i']$. Let $\Delta_{t}$ be $w_t-w_t'$, $\alpha_t \leq \lambda_{xx'}$ be the step size of SGD and $\Delta_0 =0$. Suppose one  runs SGD on $f(w;S)$ and $f(w;S')$ where $S,S'$ are twin datasets and  ${x'}_i^\top x_j = 0, x_i^\top x_j = 0, \;\forall j \neq i$, we have 
\[\mathbb{E}_{\mathcal{A}}\|\Delta_{T}\| \geq \frac{\|x_i-x_i'\|}{n} \sum_{t=1}^{T-1} \alpha_t \prod_{\tau=t+1}^{T-1} (1-\alpha_\tau\lambda_{xx'}).\]

%  &= \mathbb{E}\| (I-\alpha_{T-1} A)\Delta_{T-1}+\frac{\alpha_{T-1}}{n}[x_i-x_i']\|
\end{lemma}

\begin{proof}
By iteratively applying Lemma~\ref{lem1} we have 
\begin{equation*}
\begin{aligned}
\mathbb{E}_{\mathcal{A}}\|\Delta_{T}\|&=(1-\alpha_{T-1} \lambda_{xx'})\mathbb{E}_{\mathcal{A}}\|\Delta_{T-1}\|+\frac{\alpha_{T-1}}{n}\|x_i-x_i'\|\\
% & = \|\frac{1}{n} \sum_{t=1}^{T-1} \alpha_t \prod_{\tau=t+1}^{T-1} (I-\alpha_\tau A) [x_i-x_i'] \|\\
&= \|[x_i-x_i']\|\frac{1}{n} \sum_{t=1}^{T-1} \alpha_t \prod_{\tau=t+1}^{T-1} (1-\alpha_\tau\lambda_{xx'}) 
\end{aligned}
\end{equation*}
% The lemma follows from the fact that $A(x_i-x_i') = \lambda_{xx'}(x_i-x_i')$.
\end{proof}


% ========== Theorem 1 ===========
\begin{thm}
Let $w_t,w_t'$ be the outputs of SGD on twin datasets $S,S'$ respectively, $\Delta_{t}$ be $w_t-w_t'$ and $\alpha_t$ be the step size of SGD initialized with $w_0=w_0'=0$. There exists a function $f$ which is convex and $\beta$-smooth, $L$-Lipschitz on domain of $w_t,w_t'$ and  twin datasets $S,S'$ such that the divergence of the two SGD outputs satisfies:
\begin{equation}
    \mathbb{E}_{\mathcal{A}}\|\Delta_T\| \geq \frac{1}{n} \sum_{t=1}^{T} \alpha_t; \;\;\; \varepsilon_{stab} \geq  \frac{L}{2n} \sum_{t=1}^{T} \alpha_t
\end{equation}
\end{thm}



\begin{proof}
The sketch of the proof is as follows: we construct a Huber function~\cite{huber1992robust} so that 
\begin{enumerate}
    \item The function is quadratic within a certain region to ensure the divergence of SGD.
    \item By carefully choosing the function, SGD will never step out of the quadratic region.
    \item The function is linear outside the region to ensure global Lipschitzness.
\end{enumerate}
We start with constructing the quadratic part. Let $z=(x,y)$, $f(w;z) = \frac{1}{2}w^\top Aw - yx^\top w$. We choose $A=U\Sigma U^\top\in\mathbb{R}^{d\times d}$ to be a symmetric PSD matrix with rank $K$ where $K<d$. Let $U=[u_1,\ldots,u_K]$ be an orthogonal matrix representing eigenvectors of $A$ and $\Sigma = \operatorname{diag}[\lambda_1,\ldots, \lambda_K]$ where $\lambda_1\geq\lambda_2\geq\ldots\geq\lambda_K$ are the non-zero eigenvalues of $A$. For twin datasets $S = \{z_1,\ldots,z_n\}$ and $S'=\{z_1,\ldots,z'_i,\ldots,z_n\}$, define $z_i = (v, 0.5)$ and $z'_i = (-v, 0.5)$ where $v^\top Av = 0$. For the rest of the data, $z_j = (x_j, 1)$ for any $j\neq i$ where $x_j$ are unit vectors that lie in the column space of $A$. Let $\lambda_1=2$ and $\lambda_K=1$. The SGD update follows $w_{t+1} = w_t - \alpha_t(Aw_t - yx)$ with initialization $w_0 = 0$.

We could rewrite $w = a + b$ and $w' = a - b$ having $a\perp b$. We have $w^{\top}Aw - w'^{\top}Aw' = 0$. Hence $f(w_T; z) - f(w'_T; z) = x^{\top}(w_T - w'_T)$. Thus the lower bound on $\varepsilon_{stab}$ follows from the lower bound on the divergence $w_T - w'_T$.

\textbf{Claim 1}: $\|U^\top w_t\|\leq \frac{1}{\lambda_K}$ $\forall\ t$


\textbf{Proof}:
We will prove this claim by induction. For $t = 0$, $w_0 = 0$, so the claim holds.

Suppose for $w_{t}$ the claim holds. We have
\begin{equation*}
    \begin{aligned}
    \|U^\top w_{t+1}\| & = \|U^\top [(I-\alpha_tA)w_t + \alpha_tyx]\|\\
     & = \|(I-\alpha_t\Sigma)U^\top w_t + \alpha_tyU^\top x\|   \\
     & \leq (1-\alpha_t\lambda_K)\|U^\top w_t\| + \alpha_t\|U^\top x\|    \\
     & \leq \frac{1}{\lambda_K}
    \end{aligned}
\end{equation*}

Next we will prove that in this bounded region, the weight divergence is lower bounded by the sum of the step sizes.


\textbf{Claim 2}: Suppose $w_0=w'_0 = 0$, $\mathbb{E}_{\mathcal{A}}\|w_T-w'_T\|=\frac{1}{n}\sum_{t=1}^T\alpha_t$.


\textbf{Proof}:
Follows from the proof of Lemma~\ref{lem1} and Lemma~\ref{lem2}.

By Claim 1 and Claim 2 we know that with zero initialization, SGD is bounded in the region $\|\Sigma^{\frac{1}{2}}U^\top w\|\leq\frac{1}{\sqrt{\lambda_K}}$ for all $t$. Also, the weight divergence is lower bounded in this area by $\frac{1}{n}\sum\alpha_t$.

Lastly, we will define $f(w;z)$ outside the region $\|\Sigma^{\frac{1}{2}}U^\top w\|\leq\frac{1}{\sqrt{\lambda_K}}$ and ensure global Lipschitzness. Define 
\[
f(w; z) = \mathbbm{1}_{\|\Sigma^\frac{1}{2}U^\top w\|> \frac{1}{\sqrt{\lambda_K}}} \left\{\frac{1}{\sqrt{\lambda_K}}\left(\|\Sigma^\frac{1}{2}U^\top w\|-\frac{1}{2\sqrt{\lambda_K}}\right) - yx^\top w\right\}
.\]

Then the global Lipschitzness is ensured by choosing $L\leq \frac{1}{\sqrt{\lambda_K}}$.

So the final function $f(w; S)$ is 
\begin{equation*}
    \begin{aligned}
     f(w;S) & =\frac{1}{n}\sum_{j=1}^{n} f(w;x_j,y_j) \\
      & = \mathbbm{1}_{\|\Sigma^\frac{1}{2}U^\top w\|\leq \frac{1}{\sqrt{\lambda_K}}}\left\{\frac{1}{2} w^\top A w \right\}
     + \mathbbm{1}_{\|\Sigma^\frac{1}{2}U^\top w\|> \frac{1}{\sqrt{\lambda_K}}} \left\{ \frac{1}{\sqrt{\lambda_K}}\left(\|\Sigma^\frac{1}{2}U^\top w\|-\frac{1}{2\sqrt{\lambda_K}} \right) \right\} - \frac{1}{n}\sum_{j=1}^{n} y_jx_j^\top w
    \end{aligned}
\end{equation*}
By Lemma~\ref{lem2} one can bound the divergence as $ \mathbb{E}_{\mathcal{A}}  \|w_T-w'_T\|  \geq \frac{L}{2n} \sum_{t=1}^{T} \alpha_t.$
By a proof similar to Lemma~\ref{lem1}, we have $\Delta_t = \theta_t v$. Since $f(w;z)$ is a quadratic function we have   $f(w_T;z)-f(w_T';z) = x^\top [w_T-w_T']$ by the construction of $f(w_T;S)$. Since $\lambda_K=1$ implies $L\leq 1$, we have:
 \[
\varepsilon_{stab} = \sup\limits_{z} \mathbb{E}_{\mathcal{A}} [f(w_T;z)-f(w'_T;z)] = \mathbb{E}_{\mathcal{A}}  [w_T-w'_T]^\top x \geq \frac{L}{2n} \sum_{t=1}^{T} \alpha_t
 \]
 
\end{proof}

% ============ Theorem 2 =========== 
\begin{thm}[Lower bound for strongly-convex losses] 
Let $w_t,w_t'$ be the outputs of SGD on twin datasets $S,S'$ respectively, $\Delta_{t}$ be $w_t-w_t'$ and $\alpha = \frac{1}{2 \beta}$ be the step size of SGD. There exists a function $f$ which is $\gamma$-strongly convex, $\beta$-smooth and $L$-Lipschitz on the domain of $w_t,w_t'$, and twin datasets $S,S'$, such that the divergence and stability of the two SGD outputs satisfy:
\begin{equation}
    \mathbb{E}_{\mathcal{A}}\|\Delta_T\| \geq  \frac{1}{16\gamma n}; \;\;\; \varepsilon_{stab} \geq  \frac{1}{16\gamma n}
\end{equation}
\end{thm}


\begin{proof}
We will construct $S$, $S'$ and $f(w; z)$ as follows:
\begin{enumerate}
    \item Let $A$ be a positive definite matrix with minimum eigenvalue $\gamma$ and maximum eigenvalue bounded by $\beta$. Let $v$ with $\|v\| = 1$ be the eigenvector corresponding to the minimum eigenvalue. Let $\gamma  = \frac{\beta}{2}$. We have the function $f(w;z) = \frac{1}{2}w^\top A w - yx^\top w$. 
    \item Define the twin datasets $S$ and $S'$ to be $z_j = (x_j, 0.5)$ where $x_j^\top v = 0$ and $\|x_j\|=1$ for all $j\neq i$. And let $z_i = (v, 0.5)$ and $z'_i = (-v, 0.5)$.
\end{enumerate}
By Lemma~\ref{lem2}, one can easily bound $\mathbb{E}_{\mathcal{A}}\|\Delta_T\| \geq  \frac{1}{16\gamma n}$. To bound $\varepsilon_{stab} $, we need a more involved analysis.
In this setting, we have a similar observation to Equation~\eqref{divergence_eq} in Lemma~\ref{lem1}. We have
\begin{equation*}
        \Delta_{t+1}= \left\{
        \begin{array}{ll}
          (I-\alpha_t A)[w_t-w'_t]   & \quad  \text{with prob. } 1-\frac{1}{n} \\
        (I-\alpha_t A)[w_t-w'_t] + \frac{\alpha_t}{2}[x_i-x_i'] & \quad \text{with prob } 1/n.
        \end{array}
        \right.
\end{equation*}

Then by induction, we could obtain that with $w_0=w'_0=0$, $\Delta_t = v\theta_t$, where $\theta_t>0$ for $t>0$. Let $\tau$ be the first time that $x_i,x_i'$ are picked; we have $\Delta_{\tau+1}= \frac{\alpha}{2}[x_i-x_i'] = \alpha v$. The recursion relating $\Delta_{t+1}$ and $\Delta_{t}$ implies that $\Delta_{t+1}= v \theta_{t+1}$ where $\theta_{t+1} = (1-\alpha\gamma) \theta_t$ with probability $1-\frac{1}{n}$ and $\theta_{t+1} = (1-\alpha \gamma) \theta_t+ \alpha$ with probability $\frac{1}{n}$.

Given $t>t_0$ the above construction then yields:
\begin{equation}\label{exp_div}
\begin{aligned}
    \mathbb{E}_{1:{t+1}}\left[ \|\Delta_{t+1}\| \right] & = \mathbb{E}_{1:{t}} \left[\left( 1-\frac{1}{n}\right)\|(I-\alpha A)\Delta_{t}\| + \frac{1}{n} \| (I- \alpha A)\Delta_{t} + \alpha v\|  \right]\\ 
    & =\|v\| \mathbb{E}_{1:{t}}   \left[ \left( 1-\frac{1}{n}\right)(1-\alpha \gamma )\theta_t+ \frac{1}{n} ((1-\alpha \gamma)\theta_t+ \alpha )  \right] \\
    & = \|v\| \mathbb{E}_{1:{t}} \left[\left[ (1-\alpha \gamma )\theta_t \right] + \frac{ \alpha}{n}   \,\middle|\, \Delta_{t_0} \neq 0  \right]
\end{aligned}
\end{equation}
By iteratively applying Equation~\eqref{exp_div}, we have $\mathbb{E}_{\mathcal{A}} [\|\Delta_T\|] = \theta_T \geq \frac{\left(1-(\frac{3}{4})^{T}\right)}{\gamma n} \geq \frac{1}{16\gamma n}$. 

Next we show that $f(w_T;z)-f(w_T';z) = x^\top [w_T-w_T']$.

In case the $i$-th sample is picked: $w_{t+1}^\top v = (1-\alpha\gamma)w_t^\top v + \frac{\alpha}{2}$
 and ${w'_{t+1}}^\top v = (1-\alpha\gamma){w'_t}^\top v-\frac{\alpha}{2}$. In case the $i$-th sample is not picked, $w_{t+1}^\top v =(1-\alpha\gamma)w_t^\top v$ and ${w'_{t+1}}^\top v =  (1-\alpha\gamma){w'_t}^\top v$. Therefore, by induction one can show $w_{t+1}^\top v = -{w'_{t+1}}^\top v$. 
 
 By the fact that $\Delta_t = \theta_t v$, we know $w_{t+1}^\top v^{\perp} ={w'_{t+1}}^\top v^{\perp}  $. Combining the facts that $w_{t+1}^\top v = -{w'_{t+1}}^\top v$ and $w_{t+1}^\top v^{\perp} ={w'_{t+1}}^\top v^{\perp}$, we have $w_t^\top Aw_t = {w'_t}^\top A w'_t$, which implies  $f(w_T;z)-f(w_T';z) = x^\top [w_T-w_T']$ by the construction of $f(w_T;S)$. Hence we have
 \[
 \varepsilon_{stab} = \sup\limits_{z} \mathbb{E}_{\mathcal{A}} [f(w_T;z)-f(w'_T;z)] = \mathbb{E}_{\mathcal{A}}  [w_T-w'_T]^\top x = \theta_T\geq \frac{1}{16\gamma n}
 \]
\end{proof}

% ===========Proposition 1=================

\begin{prop}[Example of distribution with Inversely Bounded Second Moment]  \label{prop1}

Let $\mathbb{E}_{x\sim \mathcal{D}} [xx^\top ] = \Sigma$ and $\xi$ be the minimum non-zero eigenvalue of $\Sigma$. Suppose $S=\{(x_1,y_1),\ldots,(x_n,y_n)\}$ is sampled from $\mathcal{D}$ with $x \in \mathbb{R}^d$ and $\|x\| \leq 1$. Then, there exist universal constants $C,c$ so that if  $n\geq \max\{\frac{4C^2d}{\xi^2}, \frac{512}{c\xi^2} \log(\frac{1}{\xi})\}$, $\mathcal{D}$ has a $(\frac{\xi}{3},n,\mu)$-inversely bounded second moment if $\mu \geq \frac{1}{n^4}$.
\end{prop}
% \begin{proof}
\noindent\textbf{Proof}:

Let $\|\cdot \|_{\psi_2}$ be the sub-Gaussian norm: $\|Z\|_{\psi_2} = \sup\limits_{z\in \mathbb{S}^{d-1}}\sup\limits_{p \geq 1}p^{-1/2} (\mathbb{E}|Z^\top z|^p)^{1/p} $, where $\mathbb{S}^{d-1}$ is the unit sphere in $\mathbb{R}^d$. We have $\|x\|_{\psi_2} \leq 1$ since $\|x\| \leq 1$ always holds. Let $\lambda_{min}$ be the minimum eigenvalue of the empirical second moment $\frac{1}{n} \sum_{i=1}^{n} x_ix_i^\top$. Theorem 5.39 in~\cite{vershynin2010introduction} implies that there exist universal constants $C,c$ (since $\|x\|_{\psi_2} \leq 1$) s.t.
% \textcolor{blue}{n}
%  our $d$ is the $n$ and our $n$ is the $N$ in Vershynin's book.
$$\mathbb{P}\left[ \lambda_{min}\leq \xi-C\sqrt{\frac{d}{n}}-\frac{t}{\sqrt{n}}\right]\leq 2e^{-\frac{c t^2}{2}}.$$
Let $n\geq \max\{\frac{4C^2d}{\xi^2}, \frac{512}{c\xi^2} \log(\frac{1}{\xi})\}$; then we have 
\begin{equation} \label{ineq:gauss_rand_mat}
   \mathbb{P}\left[ \lambda_{min}\leq \frac{\xi}{2}-\frac{\eta}{\sqrt{n}}\right]\leq  \mathbb{P}\left[ \lambda_{min}\leq \xi-C\sqrt{\frac{d}{n}}-\frac{\eta}{\sqrt{n}}\right] \leq  2e^{-\frac{c\eta^2}{2}}.
\end{equation}
% By plugging in the $\frac{1}{n}$ term we have : 
% $$\mathbb{P}\left[ \frac{1}{\lambda_{min}+\frac{1}{n}}\geq \frac{1}{\frac{1}{n}+\frac{\xi}{2}-\frac{\eta}{\sqrt{n}}}\right]\leq  2e^{-\frac{c\eta^2}{2}}.$$

Here we use the fact that for a non-negative random variable $X$, $\mathbb{E}[X] = \int_0^{\infty} \mathbb{P}[X>s] ds$. 
% By choosing $\frac{1}{n} = \Omega(\frac{1}{n^4})$, we have
\begin{equation*}
    \begin{aligned}
    &\mathbb{E}\left [\frac{1}{\lambda_{min}+\mu}\right]=\int_{0}^{\infty} \mathbb{P}\left[ \frac{1}{\lambda_{min}+\mu}\geq  s \right] ds     \\
    &= \int_0^{\frac{2}{\xi}}  \mathbb{P}\left[ \frac{1}{\lambda_{min}+\mu}\geq  s \right]ds  + \int_{\frac{2}{\xi}}^{\frac{1}{\frac{\xi}{2}- \sqrt{\frac{4\log(n)}{cn}}}}\mathbb{P}\left[ \frac{1}{\lambda_{min}+\mu}\geq  s \right]ds\\
    &+\int_{\frac{1}{\frac{\xi}{2}-\sqrt{ \frac{4\log(n)}{cn}}}}^{\frac{1}{\mu}} \mathbb{P}\left[ \frac{1}{\lambda_{min}+\mu}\geq  s \right]ds
    +\int_{\frac{1}{\mu}}^{\infty} \mathbb{P}\left[ \frac{1}{\lambda_{min}+\mu}\geq  s \right]ds\\
    &\leq \int_0^{\frac{2}{\xi}}  \mathbb{P}\left[ \frac{1}{\lambda_{min}}\geq  s \right]ds
    + \int_{\frac{2}{\xi}}^{\frac{1}{\frac{\xi}{2}- \sqrt{\frac{4\log(n)}{cn}}}}\mathbb{P}\left[\frac{1}{\lambda_{min}}\geq  s \right]ds\\
    &+\int_{\frac{1}{\frac{\xi}{2}- \sqrt{\frac{4\log(n)}{cn}}}}^{\frac{1}{\mu}} \mathbb{P}\left[ \frac{1}{\lambda_{min}}\geq  s \right]ds
    +\int_{\frac{1}{\mu}}^{\infty} \mathbb{P}\left[ \frac{1}{\lambda_{min}+\mu}\geq  s \right]ds\\
    & \leq \int_0^{\frac{2}{\xi}}   1 ds + \underbrace{\int_{\frac{2}{\xi}}^{\frac{1}{\frac{\xi}{2}- \sqrt{\frac{4\log(n)}{cn}}}} 2e^{-\frac{cn \left(\frac{\xi}{2}-\frac{1}{s} \right)^2}{2}}ds}_{=\int_{\frac{\xi}{2}- \sqrt{\frac{4\log(n)}{cn}}}^{\frac{\xi}{2}} \frac{2}{{s' }^2} e^{-\frac{cn \left(\frac{\xi}{2}-s' \right)^2}{2}}d s'  \leq \frac{16\sqrt{\pi}}{\xi^2\sqrt{2cn}}\int_{-\infty}^{\infty} \sqrt{\frac{cn}{2\pi}} e^{-\frac{cn (\frac{\xi}{2}-s')^2}{2}}ds'=\frac{16\sqrt{\pi}}{\xi^2\sqrt{2cn}}}\\
    &+\int_{\frac{1}{\frac{\xi}{2}- \sqrt{\frac{4\log(n)}{cn}}}}^{\frac{1}{\mu}} \underbrace{\mathbb{P}\left[ \frac{1}{\lambda_{min}}\geq  \frac{1}{\frac{\xi}{2}- \sqrt{\frac{4\log(n)}{cn}}} \right]ds}_{ \leq \frac{1}{n^2} }
    +\int_{\frac{1}{\mu}}^{\infty} 0\,ds\\
    &\leq \frac{2}{\xi} +\frac{16\sqrt{\pi}}{\xi^2\sqrt{2cn}}+\frac{1}{n^2}\leq \frac{1}{\frac{\xi}{3}+\mu} .
    \end{aligned}
\end{equation*}


\qed


% ========== Theorem 3 ===========

\begin{thm}[Data-dependent stability of SGD with inversely bounded Second Moment]
 Suppose a loss function $f(w,z)$ is of the form  \[f(w,S) =\frac{1}{n} \sum_{j=1}^{n} f_{y_j}(w^\top x_j)+\frac{\mu}{2} w^\top w\;\;; w\in \mathcal{W}\] where $f_y(w^\top x)$ satisfies (1) $|f_y'(\cdot)| \leq L $, (2) $0<\gamma \leq f_y''(\cdot) \leq \beta$, (3) $S , S'$ are sampled from $\mathcal{D}$ with $\xi$ being the minimum nonzero eigenvalue of $\mathbb{E}_{x\sim \mathcal{D}} [x x^\top]$ and a uniformly bounded support $\mathcal{X}:\|x\| \leq 1, \mathcal{X}\subset \mathbb{R}^d$, and (4) $\mu \geq \frac{\gamma}{n^4}$. Let $\mathcal{W}$ be a  convex and compact set, $w_t$ and $w_t'$ be the outputs of SGD on $S$ and $S'$ after $t$ steps, respectively. Let the divergence $\Delta_{t}:=w_t-w_t'$ and $\alpha\leq \frac{\mu}{2\beta^2}$ be the step size of SGD. There exist universal constants $C,c$ so that if $n\geq \max\{\frac{4C^2d}{\xi^2}, \frac{512}{c\xi^2} \log(\frac{1}{\xi})\}$, then
$$\mathbb{E}_{S}\mathbb{E}_{\mathcal{A}}\|\Delta_T\| \leq  \frac{12L}{\xi\gamma n}, \ \ \text{and} \ \  \varepsilon_{stab} (\mathcal{D}) \leq \frac{16L^2}{\xi\gamma n} .$$
\end{thm}
\begin{proof}
 For simplicity we omit the dependence of $f$ on $y_j$ so that $f_{y_j} (w^\top x_j) = f(w,z_j)$.
Note that the gradient of the loss function is $\nabla f_{y_j}(w_t ^\top x_j) = f_{y_j}'(w_t ^\top x_j) x_j$ and the Hessian is $\nabla^2 f_{y_j}(w_t ^\top x_j) = f_{y_j}''(w_t ^\top x_j) x_jx_j^\top$. 
The projected stochastic gradient step of $f_{y_j}(w_t^\top x_j )$ is
$w_{t+1} = \Pi_{\mathcal{W}}\bigg( w_t - \alpha_t f_{y_j}'(w_t^\top x_{j} ) x_{j}\bigg).$
Since $\mathcal{W}$ is convex, we have $\|w_{t+1}-w'_{t+1}\| \leq \| w_t-w'_t - \alpha_t f_{y_j}'(w_t^\top x_{j} ) x_{j}+\alpha_t f_{y'_j}'({w'_t}^\top {x'_{j}} ) x'_{j}\|$ (see Lemma 4.6 in~\cite{hardt2016train}). The dynamics of the divergence can be described as:
\begin{equation}\label{eq4}
\begin{aligned}
\mathbb{E}_{S,1:t+1} \|\Delta_{t+1}\| & =\mathbb{E}_{S}\mathbb{E}_{1:t} [ \frac{1}{n}\sum_{j \neq i} \|\Delta_t- \alpha_t [f'_{y_j}(w_t^\top x_j)-f'_{y_j}({w_t'}^\top x_j)] x_j \| \\
& +\frac{1}{n} \|\Delta_t-\alpha_t [f'_{y_i}(w_t^\top x_i)x_i-f'_{y_i'}(w_t'^\top x'_i)x_i']\|  ] \\
\end{aligned}
\end{equation} 
Note that $[f'_{y_j}(w_t^\top x_j)-f'_{y_j}(w_t'^\top x_j)]x_j$ can be rewritten as $f''_{y_j}({w_t^{\theta_j}} ^\top x_j)x_jx_j^\top\Delta_t    $ where $w_t^{\theta_j} = (1-\theta_j) w_t +\theta_j w_t', 0<\theta_j<1$.  
% By assumption that $f''(\cdot) \geq \gamma$, 
Similarly we can also rewrite 
$f'_{y_i}(w_t^\top x_i)x_i-f'_{y_i'}(w_t'^\top x'_i)x_i'$ as
\begin{equation}\label{re_write}
\begin{aligned}
f'_{y_i}(w_t^\top x_i)x_i-f'_{y_i'}(w_t'^\top x'_i)x_i' & =\frac{1}{2}\{f'_{y_i}(w_t^\top x_i)x_i-f'_{y_i}(w_t'^\top x_i)x_i\} +\frac{1}{2}\{f'_{y_i'}(w_t^\top x_i')x_i'-f'_{y_i'}(w_t'^\top x_i')x_i'\}\\
 & \;\;\;\;+ \frac{1}{2}\{f'_{y_i} (w_t'^\top x_i)+f'_{y_i}(w_t^\top x_i)\}x_i-\frac{1}{2}\{f'_{y_i'}(w_t^\top x_i')+f'_{y_i'}(w_t'^\top x_i')\}x_i' \\
& =\frac{1}{2}f''_{y_i}({w_t^{\theta_i}} ^\top x_i) x_i x_i^\top\Delta_t +\frac{1}{2}f''_{y'_i}({w_t^{{\theta'}_i}} ^\top x'_i) x'_i {x'_i}^\top \Delta_t \\
 & \;\;\;\;+ \frac{1}{2}\{f'_{y_i} (w_t'^\top x_i)+f'_{y_i}(w_t^\top x_i)\}x_i -\frac{1}{2}\{f'_{y_i'}(w_t^\top x_i')+f'_{y_i'}(w_t'^\top x_i')\}x_i'\\
\end{aligned}
\end{equation} 
Let $\mathcal{H}_j = x_j x_j^\top $, $\mathcal{H}_i = \frac{1}{2}\{x_i x_i^\top + x'_i{x'_i}^\top\}$ and $\mathcal{H} = \frac{1}{n}\sum_{j} \mathcal{H}_j$. Let $\xi_S$ be the minimum non-zero eigenvalue of $\mathcal{H}$.

Next we show the gradient of term $\frac{\mu}{2}w^\top w$ is bounded. This is because $w_{t+1} = (1-\alpha_t \mu)w_t -\alpha_t f'(w_t^\top x_j) x_j$ which implies that $\|w_{t}\| \leq \frac{ L}{\mu}$ which implies that $\mu\|w\|\leq L$.

By Equation~\ref{re_write}, Equation~\ref{eq4} can be written as 
\begin{equation*}
    \begin{aligned}
    &\mathbb{E}_{S}\mathbb{E}_{1:t} \left[ \frac{1}{n}\sum_{j \neq i} \|(1-\alpha_t \mu)\Delta_t- \alpha_t [f'_{y_j}(w_t^\top x_j)-f'_{y_j}({w_t'}^\top x_j)] x_j \| 
    +\frac{1}{n} \|(1-\alpha_t \mu)\Delta_t-\alpha_t [f'_{y_i}(w_t^\top x_i)x_i-f'_{y_i'}(w_t'^\top x'_i)x_i']\|  \right] \\
    & \leq \mathbb{E}_{S}\mathbb{E}_{1:t} \big[ \frac{1}{n}\sum_{j \neq i} \|((1-\alpha_t \mu)I- \alpha_t f''_{y_j}({w_t^{\theta_j}} ^\top x_j) x_jx_j ^\top) \Delta_t \|\\
    & \;\;\;\; \;\;\;\;+ \frac{1}{n} \|((1-\alpha_t \mu)I-\frac{\alpha_t}{2} [f''_{y_i}({w_t^{\theta_i}} ^\top x_i) x_i x_i^\top +f''_{y'_i}({w_t^{{\theta'}_i}} ^\top x'_i) {x'_i x'_i} ^ \top ])\Delta_t\|   \big] +\frac{2 \alpha_t L}{n}\\
    &\leq\mathbb{E}_{S}\mathbb{E}_{1:t} \big[ \frac{1}{n}\sum_{j } \|((1-\alpha_t \mu)I- \alpha_t \gamma \mathcal{H}_j) \Delta_t \| \big] +\frac{2 \alpha_t L}{n}\\
    &\leq\mathbb{E}_{S}\mathbb{E}_{1:t} \big[  \sqrt{\frac{1}{n}\sum_{j } \|((1-\alpha_t \mu)I- \alpha_t \gamma \mathcal{H}_j) \Delta_t \|^2 }\big] +\frac{2 \alpha_t L}{n}\\
    & = \mathbb{E}_{S}\mathbb{E}_{1:t}  \sqrt{\frac{1}{n}\sum_{j } \left[  (1-\alpha_t \mu)^2\|\Delta_t\|^2 - 2(1-\alpha_t \mu)\alpha_t \gamma \Delta_t ^\top \mathcal{H}_j \Delta_t + \alpha_t^2\gamma^2\|\mathcal{H}_j \Delta_t\|^2 \right] } +\frac{2 \alpha_t L}{n}\\
    &\leq \mathbb{E}_{S}\mathbb{E}_{1:t} \sqrt{  (1-\alpha_t \mu)^2\|\Delta_t\|^2 - 2\alpha_t(1-\alpha_t \mu)\gamma \Delta_t ^\top \mathcal{H} \Delta_t + \alpha_t^2\beta^2   \|\Delta_t\|^2 } +\frac{2 \alpha_t L}{n}\\
    &\leq \mathbb{E}_{S}\mathbb{E}_{1:t} \sqrt{  (1-\alpha_t \mu)\|\Delta_t\|^2 - 2\alpha_t(1-\alpha_t \mu)\gamma \Delta_t ^\top \mathcal{H} \Delta_t -\alpha_t (\mu-\alpha_t\mu^2-\alpha_t\beta^2) \|\Delta_t\|^2 } +\frac{2 \alpha_t L}{n}\\
    &\leq \mathbb{E}_{S}\mathbb{E}_{1:t} \sqrt{  (1-\alpha_t \mu)\|\Delta_t\|^2 - 2\alpha_t(1-\alpha_t \mu)\gamma \Delta_t ^\top \mathcal{H} \Delta_t } +\frac{2 \alpha_t L}{n}\\
    &\underset{*}{\leq} \mathbb{E}_{S}\mathbb{E}_{1:t}\sqrt{(1-\alpha_t \mu-2\alpha_t\gamma \xi_{S} ) \|\Delta_t\|^2} +\frac{2 \alpha_t L}{n}\\
    &\leq \mathbb{E}_{S}\mathbb{E}_{1:t} \left[ (1-\frac{\alpha_t(\gamma\xi_S+\mu)}{2} )\|\Delta_t\| \right] +\frac{2 \alpha_t L}{n}\\
    &\leq  \mathbb{E}_{S} \left[(1 - \frac{\alpha_t (\gamma \xi_S+\mu)  }{2}) \mathbb{E}_{1:t}\|\Delta_t\|+\frac{2\alpha_t L}{n} \right]
    \end{aligned}
\end{equation*}

where in inequality $(*)$ we apply the fact that $\xi_S$ is the minimum non-zero eigenvalue of $\mathcal{H}$, which implies $\Delta_t^\top \mathcal{H} \Delta_t \geq \xi_S \|\Delta_t \|^2$ since $\Delta_t \in \operatorname{Span}\{x_1,\ldots,x_i,x'_i,\ldots,x_n\}$. Leveraging Proposition~\ref{prop1} and the fact that $\|x\|\leq 1, \mu = \Omega(\frac{1}{n^4}), n\geq \max\{\frac{4C^2d}{\xi^2}, \frac{512}{c\xi^2} \log(\frac{1}{\xi})\}$, we have that $\mathcal{D}$ has $(\frac{\xi}{3},n,\mu)$-inversely bounded second moment. Fixing $\alpha_t = \alpha$, we have 
\begin{equation*}
    \mathbb{E}_{S} \left[\|\Delta_t\|\right] \leq  \mathbb{E}_{S} \left[\frac{4L}{n(\gamma \xi_S+\mu)} \right] \leq \frac{4L}{n( \frac{\gamma\xi}{3}+\mu)} \leq \frac{12L}{n\gamma \xi}
\end{equation*}

and the theorem follows.
\end{proof}

%  Note that the constraint $w\in \mathcal{W}$ does not affect the solution of the optimization problem if the empirical risk minimizer $w^*$ satisfies $w^* \in \mathcal{W}$. 
\textbf{Example: Linear regression.} Linear regression minimizes the quadratic loss on $w$: $f(w,S) = \frac{1}{2n}\sum_{x_j\in S} (x_j^\top w-y_j)^2, w\in \mathcal{ W}$, where $\mathcal W$ is a convex compact set that contains the origin and has bounded radius $R$.   The Hessian of an individual linear regression loss term is $x_j x_j^\top$, so the individual loss is \textit{not strongly convex}. However, one can rewrite the loss function as $f_y(w^\top x)$ where $f''_y(\cdot) = 1 $. Next we show certain conditions that are sufficient to make $|f'(\cdot)|\leq L$. We assume $\|x_i\|= 1, y_i\in[-1,1], \forall i\in[n]$. Let $\Pi_{\mathcal{W}}(v) = \argmin_{w\in \mathcal{W}} \|w-v\|$. Note that SGD updates as
\[w_{t+1} = \Pi_{\mathcal{W}} \bigg( w_t -\alpha_t (x_j^\top w_t - y_j)x_j \bigg).\]
One can show that $\sup_{w \in \mathcal{W} }\sup_{x,y \in S} |f_y'(w^\top x)| \leq R+1$. 
% Note that the 
% Let $U=[u_1,\ldots,u_K]$ be an orthogonal matrix representing eigenvectors of $A = \frac{1}{n} \sum_{j=1}^{n}x_j x_j^\top $ and $\Sigma = diag[\lambda_1,\ldots, \lambda_K]$ where $\lambda_1\geq\lambda_2\geq\ldots\geq\lambda_K$ are non-zero eigenvalues of $A$. We analyze the norm of $\|Aw_{t}\|$ and show that it has a uniform upper bound.  Note that SGD updates as $$w_{t+1} = w_t -\alpha_t (x_j^\top w_t - y_j)x_j,$$ where $(x_j,y_j)$ are samples picked by SGD at $t$-th iteration  with $w_0=0$ be the initialization of SGD. We denote $x^k_j = {x_j}^\top u_k$ and $w^k_
% t= {w_t}^\top u_k$. W.O.L.G, we let $x^k_j \geq 0$. Leveraging the fact that $w_t \in Span\{u_1,...,u_K\}$ we have 
% \begin{equation}
% \begin{aligned}
%     Aw_{t+1} &=  Aw_t -\alpha_t(x_j^\top w_t - y_j) Ax_j\\
%     & = \bigg\{  \sum_{k=1}^{K}\lambda_k u_k u_k^\top \bigg\} \bigg\{  \sum_{k=1}^{K}w_t^k u_k  \bigg\} - \alpha_t\bigg(\bigg\{  \sum_{k=1}^{K} {x_j^k}u_k^\top \bigg\} \bigg\{  \sum_{k=1}^{K}w_t^k u_k  \bigg\} - y_j\bigg)\bigg\{  \sum_{k=1}^{K}\lambda_k u_k u_k^\top \bigg\} \bigg\{  \sum_{k=1}^{K}x_j^k u_k  \bigg\}\\
%      & = \bigg\{  \sum_{k=1}^{K}\lambda_k w_t^k u_k  \bigg\} - \alpha_t\bigg(\bigg\{  \sum_{k=1}^{K} {x_j^k} w_t^k   \bigg\} - y_j\bigg)\bigg\{  \sum_{k=1}^{K}\lambda_k  x_j^k u_k  \bigg\}\\
%      & = \bigg\{  \sum_{k=1}^{K} \bigg( \lambda_k w_t^k  - \alpha_t \lambda_k  x_j^k\bigg\{  \sum_{k'=1}^{K} {x_j^{k'}} w_t^{k'}  \bigg\} + \alpha_ty_j\lambda_k  x_j^k \bigg)u_k  \bigg\}\\
%      & = \bigg\{  \sum_{k=1}^{K} \lambda_k\bigg(  (1-\alpha_t{x_j^k}^2)w_t^k  - \alpha_t x_j^k\bigg\{  \sum_{k'\neq k} {x_j^{k'}} w_t^{k'}  \bigg\} + \alpha_ty_j  x_j^k \bigg)u_k  \bigg\}\\
%     %   & = \bigg\{  \sum_{k=1}^{K} \bigg( \lambda_k w_t^k   + \alpha_ty_j\lambda_k  x_j^k \bigg)u_k  \bigg\} - \bigg\{  \sum_{k=1}^{K} \alpha_t \lambda_k  x_j^k\bigg\{  \sum_{k'=1}^{K} {x_j^{k'}} w_t^{k'}   \bigg\} u_k\bigg\}\\
%     %  & =  \bigg\{  \sum_{k=1}^{K} \bigg( \lambda_k w_t^k  - \alpha_t w_t^{k}  x_j^k\bigg\{  \sum_{k'=1}^{K} \lambda_{k'} {x_j^{k'}}   \bigg\} + \alpha_ty_j\lambda_k  x_j^k \bigg)u_k  \bigg\}\\
%     %  & =   \sum_{k=1}^{K}  \lambda_k\bigg\{  w_t^k  \bigg( 1- \alpha_t  \frac{x_j^k}{ \lambda_k}\bigg\{  \sum_{k'=1}^{K} \lambda_{k'} {x_j^{k'}}   \bigg\}\bigg) + \alpha_ty_j  x_j^k \bigg)\bigg\}u_k  \\
% \end{aligned}
% \end{equation}
% Note $\|Ax_j\|  = \sum_{k=1}^{K} \lambda_{k} {x_j^{k}} \geq \lambda_K $ and $w \in Span\{u_1,...,u_K\}$, with above equation one can derive that $|w_t^k| \leq \frac{\lambda_k}{\lambda_K}$. One can show that $\|w_t\| \leq \frac{\sqrt{K} \lambda_1}{\lambda_K}$, which implies that $|f'(\cdot)|= |w^\top x -y|\leq \frac{\sqrt{K} \lambda_1}{\lambda_K}+1$.



% ========== Claim 1 ===========
\begin{claim}\label{old_lemma_3}
	Suppose $x_{t_0}=0$, $x_{t+1} = (1+\frac{a}{0.99t})x_t+ \frac{y}{t}$, we have $x_T \geq y (\frac{T}{t_0})^{a}$ if $a>0$ is a sufficiently small constant.
\end{claim}
\begin{proof}
In the proof we use following inequality:
$$e^{ax} \leq 1 + \frac{ax}{0.99} \leq e^{\frac{ax}{0.99}}$$
where $a>0$ is a sufficiently small constant. 
\begin{equation}
\begin{aligned}
x_T&= \sum_{t=t_{0}+1}^{T} \frac{y}{t}\prod_{s=t+1}^{T} (1+\frac{a}{0.99s})\\
&\geq \sum_{t=t_{0}+1}^{T} \frac{y}{t} \exp \left({a\sum_{s=t+1}^{T} \frac{1}{s}}\right)\\
&\geq \sum_{t=t_{0}+1}^{T} \frac{y}{t} \exp \left( a\log(T/t) \right)\\
& \geq yT^{a}\sum_{t=t_{0}+1}^{T} \frac{1}{t^{1+a}}\\
& \geq y\left(\frac{T}{t_0}\right)^a 
\end{aligned}
\end{equation}
\end{proof}

% ========== Lemma 3 ===========
\begin{lemma}[Divergence of non-convex loss function]
There exists a function $f$  which is non-convex and $\beta$-smooth,  twin datasets $S,S'$ and constant $a>0$ such that the following holds: if SGD is run using step size $\alpha_t = \frac{a}{0.99 \beta t}$ for $1 \leq t < T$, and $w_t, w_t'$ are the outputs of SGD on $S$ and $S'$, respectively, and $\Delta_{t}:=w_t-w_t'$, then:
\begin{equation} \forall 1 \leq t_{0} < T,\ \ \ \ 
       \mathbb{E}_{\mathcal{A}}\left[ \|\Delta_T\| | \Delta_{t_0} \neq 0  \right] \geq \frac{1}{2n} \left(\frac{T}{t_0}\right)^a 
\end{equation}
\end{lemma}

\begin{proof}
Consider the function $f(w,z)= \frac{1}{2} w^\top A w -yw^\top x$, and choose $A$ to have both positive and negative eigenvalues. We set the minimum eigenvalue of $A$ equal to $-\beta$ and require all other eigenvalues to have absolute value at most $\beta$. We select twin datasets for such an $A$ as follows.
We set all elements in $S \setminus \{x_i\} =S' \setminus\{x_i'\}$ to lie in the column space of $A$. Also, for all $j \neq i$, choose $x_{j}$ such that $x_j^\top Ax_j >0$, and set $y_{j} = 0.5$.

Let $v$ be such that $v^\top Av= -\beta $ and $\|v\| =1$. Finally, let $x_i =v, y_i =0.5$, $x_i' = -v, y_i'=0.5$.

In this setting, one observes that the divergence $\Delta_{t}$ follows the following dynamic:

%$\forall t < t_{0}$,  $\Delta_{t+1}=\Delta_{t}$, and

$$
\Delta_{t+1}= \left\{
        \begin{array}{ll}
          (I-\alpha_t A)\Delta_{t}   & \quad  \text{with prob. } 1-\frac{1}{n} \\
            (I-\alpha_tA)\Delta_{t} + \frac{\alpha_t}{2}[x_i-x_i'] & \quad \text{with prob. } \frac{1}{n}
        \end{array}
    \right\}.
$$
We first observe that $\Delta_{t}:= w_t-w_t'$ is of the form $v \theta_t$, where $\theta_{t}>0$. This can be shown by induction. Let $\tau$ be the first time that $x_i,x_i'$ are picked; then $\Delta_{\tau+1}= \frac{\alpha_{\tau}}{2}[x_i-x_i'] = v \alpha_{\tau}$. The recursion relating $\Delta_{t+1}$ to $\Delta_{t}$ implies that $\Delta_{t+1}= v \theta_{t+1}$, where $\theta_{t+1} = (1+\alpha_t \beta) \theta_t$ with probability $1-\frac{1}{n}$ and $\theta_{t+1} = (1+\alpha_t \beta) \theta_t+ \alpha_t$ with probability $\frac{1}{n}$.


Given $t>t_0$, the above construction then yields:
\begin{equation}
\begin{aligned}
    \mathbb{E}_{1:{t+1}}\left[ \|\Delta_{t+1}\| | \Delta_{t_0} \neq 0  \right] = &\mathbb{E}_{1:{t}} \left[\left( 1-\frac{1}{n}\right)\|(I-\alpha_t A)\Delta_{t}\| + \frac{1}{n} \| (I- \alpha_t A)\Delta_{t} + \alpha_t v\|  | \Delta_{t_0} \neq 0  \right]\\ 
    & =\|v\| \mathbb{E}_{1:{t}}   \left[ \left( 1-\frac{1}{n}\right)(1+\alpha_t \beta )\theta_t+ \frac{1}{n} ((1+\alpha_t \beta)\theta_t+ \alpha_t )  | \Delta_{t_0} \neq 0  \right] \\
    & = \|v\| \mathbb{E}_{1:{t}} \left[\left[ (1+\alpha_t \beta )\theta_t \right] + \frac{ \alpha_t}{n}   | \Delta_{t_0} \neq 0  \right]\\
    & = (1+\frac{a}{0.99 t})\mathbb{E}_{1:t}[\|\Delta_t\| | \Delta_{t_0} \neq 0]+\frac{\alpha_t}{n}\|v \|\\
    % & = (1+\frac{1}{t})\mathbb{E}\|\Delta_t\|+\frac{\alpha_t}{n}\|v \|\\
\end{aligned}
\end{equation}
% Given that $\|\Delta_t\|=\frac{poly(T)}{n}\|x_i-x_i'\|$,
Now apply Claim \ref{old_lemma_3}, with $x_{t}= \mathbb{E} [\vert \vert \Delta_{t}\vert \vert | \Delta_{t_0} \neq 0 ]$ and $y= \frac{a \vert \vert v \vert \vert}{0.99 \beta n}$. This gives us that $x_{T} \geq \frac{a \vert \vert v \vert \vert}{0.99 \beta n} \left(T/t_{0} \right)^{a} = \frac{a }{0.99 \beta n} \left(T/t_{0} \right)^{a}$, since $\vert\vert v \vert \vert =1$.

Finally, the claimed bound follows by setting $\beta = \frac{a}{0.99}$, i.e., by choosing the minimum eigenvalue of $A$ to be $-\frac{a}{0.99}$.



\end{proof}



% ========== Theorem 4 ===========
\begin{thm}[Lower bound for non-convex loss functions]
Let $w_t, w_t'$ be the outputs of SGD on twin datasets $S,S'$, and $\Delta_{t}:=w_t-w_t'$. There exists a function $f$ which is non-convex and $\beta$-smooth, twin datasets $S,S'$ and a constant $a<0.1$ such that the divergence of SGD after $T$ rounds ($n<T$) using step size $\alpha_t = \frac{a}{0.99\beta t}$ satisfies:
$$\varepsilon_{stab} >\frac{T^{a}}{6n^{1+a}}$$
\end{thm}

\begin{proof}
Follow the same construction of $f(w; z)$ and $S$, $S'$ in Lemma~\ref{thm2}.

We begin the proof with Lemma~\ref{thm2} plus the idea of a ``burn-in'' period. We have:
\begin{equation}
    \begin{aligned}
    \mathbb{E}_{\mathcal{A}}\|\Delta_T\| & = \mathbb{E}_{\mathcal{A}}[\|w_T-w_T'\|| \Delta_n = 0]\mathbb{P}[\Delta_n =0]+ \mathbb{E}_{\mathcal{A}}[\|w_T-w_T'\|| \Delta_n \neq 0]\mathbb{P}[\Delta_n \neq 0]\\
     & \geq  \mathbb{E}_{\mathcal{A}}[\|w_T-w_T'\|| \Delta_n \neq  0]\mathbb{P}[\Delta_n \neq  0]\\
  & \geq \left(1-\big(1-\frac{1}{n}\big)^n\right)\frac{T^a}{2n^{1+a}}\|x_i-x_i'\|\\
  & > \frac{T^a}{6n^{1+a}}\|x_i-x_i'\|
    \end{aligned}
\end{equation}
By a similar proof as in Theorem~\ref{strongly_cvx_lowerbound} we can show  $w_t^\top v = -{w'_t}^\top v$ thus  $f(w_T;z)-f(w_T';z) = z^\top [w_T-w_T']$ and by restricting $z\sim \mathbb{Z}$ where $ \mathbb{Z}$ is the linear span of eigenvectors of $A$, we have $$\sup\limits_{z} \mathbb{E}_{\mathcal{A}} [f(w_T;z)-f(w'_T;z)] = \mathbb{E}_{\mathcal{A}}  [w_T-w'_T]^\top v = \theta_t> \frac{T^a}{6n^{1+a}}$$
\end{proof}

% ========== Lemma 4 ===========
\begin{lemma}[\cite{hardt2016train}]
	Assume $f$ is $\beta$-smooth and $L$-Lipschitz. Let $w_t, w_t'$ be the outputs of SGD on twin datasets $S,S'$ respectively after $t$ iterations, and let $\Delta_{t}:=[w_t-w_t']$ and $\delta_t = \mathbb{E}_{\mathcal{A}}\|\Delta_t\|$. Running SGD on $f(w;S)$ with step size $\alpha_t = \frac{a}{\beta t }$ satisfies the following conditions:
	\begin{itemize}
		\item 
		The SGD update rule is a $(1+\alpha_t \beta)$-expander and $2\alpha_t L$-bounded. 
		\item 
		$\mathbb{E}_{\mathcal{A}}[ \|\Delta_{t}\| | \Delta_{t-1} ]  \leq \left(1+\alpha_t\beta\right)\|\Delta_{t-1}\| +\frac{2\alpha_tL}{n}$.
		\item 
		$\mathbb{E}_{\mathcal{A}}[ \|\Delta_{T}\| | \Delta_{t_{k}}=0 ] \leq \big(\frac{T}{t_{k}}\big)^{ a}\frac{2L}{ n}$.
	\end{itemize}
\end{lemma}

% ========== Theorem 5 ===========
\begin{thm}[Permutation]\label{permutation}
	Assume $f$ is $\beta$-smooth and $L$-Lipschitz. Running $T$ ($T>n$) iterations of SGD on $f(w;S)$ with step size $\alpha_t = \frac{a}{\beta t }$, $a\leq0.1$, the stability of SGD satisfies:
	\begin{equation}
	\mathbb{E}_{\mathcal{A}}\|\Delta_T\|   \leq\frac{ 3L T^a}{n^{1+a}}, \varepsilon_{stab} \leq  \frac{3L^2T^a}{n^{1+a}}
	\end{equation}
\end{thm}

\begin{proof}

Let $H=t$ denote the event that the first time SGD picks the differing entry is time $t$:
% +\frac{L T^a}{n^{1+a}}
\begin{equation}
    \begin{aligned}
        \mathbb{E}_{\mathcal{A}}\|\Delta_T\|&  =  \mathbb{E}_{\mathcal{A}}[\|\Delta_T\|| H \leq n] \mathbb{P} [H \leq n] + \underbrace{\mathbb{E}_{\mathcal{A}}[\|\Delta_T\|| H > n] \mathbb{P} [H > n]}_{0 \text{(permutation)}}\\
        % &\leq (\mathbb{E}\|\Delta_n\|)\left(\frac{T}{n}\right)^a +\frac{L T^a}{n^{1+a}}\\
        &\leq \frac{1}{n}\sum_{t=1}^{n} \mathbb{E}_{\mathcal{A}}[\|\Delta_T\| | H = t]\\
        % &\leq \frac{1}{n}\sum_{t=1}^{n} \mathbb{E}[\|\Delta_T\| | H = t]  \\
        &\underset{*}{\leq} \frac{1}{n}\sum_{t=1}^{n} \left(\frac{T}{t}\right)^a\frac{2L}{n} \\
        &\leq \frac{2LT^a}{n^2}\int_{t=1}^{n}\frac{1}{t^a} dt +\frac{2LT^a}{n^2} \\
        &\leq \frac{3L T^a}{n^{1+a}}
    \end{aligned}
\end{equation}
The inequality $(*)$ is derived by applying Lemma~\ref{lem_SGD}.
\end{proof}

% ========== Lemma 5 ===========
% \begin{lemma}\label{lem_prob_rule}
% 	Let $w_t, w_t'$ be outputs of $SGD$ on twin datasets $S,S'$ respectively after $t$ iterations and let $\Delta_{t}:=w_t-w_t'$. Suppose that $t_k = ct_{k-1}$. Then the following conditions hold:
	
% 	\begin{itemize}
% 		\item 
% 		$\mathbb{P}[ \Delta_{t_k-1}=0| \Delta_{t_{k}}\neq 0 ]\leq \frac{n}{n+t_{k-1}}$.
% 		\item 
% 		$\mathbb{P}[ \Delta_{t_k-1} \neq0| \Delta_{t_{k}}\neq 0 ]\leq \frac{1}{c}\left(1+\frac{t_k}{n}\right)$.
% 		\item 
% 		$	\mathbb{E}_{\mathcal{A}}[ \|\Delta_{T}\|| \Delta_{t_{k}}\neq 0 ]
% 		\leq \mathbb{E}_{\mathcal{A}}[ \|\Delta_{T}\| | \Delta_{t_{k-1}}\neq 0 ]  \frac{1}{c}\left(1+\frac{t_k}{n}\right)+\big(\frac{T}{t_{k-1}}\big)^{ a}\frac{2L}{ n}$.
% 		%		\begin{equation}
% 		%		\begin{aligned}
% 		%		&\mathbb{E}[ \Delta_{T} | \Delta_{t_{k}}\neq 0 ]\\
% 		%		&\leq \mathbb{E}[ \Delta_{T} | \Delta_{t_{k-1}}\neq 0 ]  \frac{t_{k-1}}{t_k}(1+\frac{t_k}{n})\\
% 		%		&+(\frac{T}{t_{k-1}})^{   c}\frac{2L}{n+t_{k-1}}
% 		%		\end{aligned}
% 		%		\end{equation}  
% 	\end{itemize}
% \end{lemma}

% \begin{proof} 
% In the proof we will use the following inequality with $r\geq1$:
% $$\frac{n-r}{n} \leq (1-\frac{1}{n})^{r }\leq \frac{n}{n+r}$$

% \noindent i): 

% \begin{equation}
% \begin{aligned}
% & \mathbb{P}[ \Delta_{t_{k-1}}=0| \Delta_{t_{k}}\neq 0 ]=\frac{\mathbb{P}[ \Delta_{t_k-1}=0, \Delta_{t_{k}}\neq 0 ]}{\mathbb{P}[\Delta_{t_{k}}\neq 0]}\\
% &=(1-1/n)^{t_{k-1}}\frac{1-(1-1/n)^{t_k-t_{k-1}}}{1-(1-1/n)^{t_k}}\leq (1-1/n)^{t_{k-1}} \leq \frac{n}{n+t_{k-1}}
% % &=1-\frac{1-(1-1/n)^{t_{k-1}}}{1-(1-1/n)^{t_k}}\\
% % &
% \end{aligned}
% \end{equation}  

% \noindent ii):	
% \begin{equation}
% \begin{aligned}
% &\mathbb{P}[ \Delta_{t_k-1} \neq0| \Delta_{t_{k}}\neq 0 ]=\frac{\mathbb{P}[ \Delta_{t_k} \neq0, \Delta_{t_{k-1}}\neq 0 ]}{\mathbb{P}[ \Delta_{t_k} \neq0]}\\
% &=\frac{\mathbb{P}[\Delta_{t_{k-1}}\neq 0 ]}{\mathbb{P}[ \Delta_{t_k} \neq0]}=\frac{1-(1-1/n)^{t_{k-1}}}{1-(1-1/n)^{t_k}}\\
% &\leq \frac{1-\frac{n}{n+t_{k-1}}}{1-\frac{n-t_k}{n}}\leq \frac{t_{k-1}}{t_k}(1+\frac{t_k}{n})\\
% &=\frac{1}{c}(1+\frac{t_k}{n})
% \end{aligned}
% \end{equation}  

% \noindent iii): By applying i) and ii) in the decomposition of $\mathbb{E}[ \Delta_{T} | \Delta_{t_{k}}\neq 0 ]$ we have
% \begin{equation}
% \begin{aligned}
% \mathbb{E}_{\mathcal{A}}[ \|\Delta_{T}\| | \Delta_{t_{k}}\neq 0 ]&\leq \mathbb{E}_{\mathcal{A}}[ \|\Delta_{T}\| | \Delta_{t_{k-1}}\neq 0 ]  \mathbb{P}[ \Delta_{t_k-1} \neq0| \Delta_{t_{k}}\neq 0 ]+ \mathbb{E}_{\mathcal{A}}[ \|\Delta_{T}\| | \Delta_{t_{k-1}}= 0 ] \mathbb{P}[ \Delta_{t_k-1}=0| \Delta_{t_{k}}\neq 0 ]\\
% &\leq \mathbb{E}_{\mathcal{A}}[ \|\Delta_{T}\| | \Delta_{t_{k-1}}\neq 0 ]  \frac{t_{k-1}}{t_k}(1+\frac{t_k}{n})+(\frac{T}{t_{k-1}})^{a}\frac{2L}{n+t_{k-1}}\\
% &=\frac{1}{c}(1+\frac{t_k}{n})\mathbb{E}_{\mathcal{A}}[ \|\Delta_{T}\| | \Delta_{t_{k-1}}\neq 0 ]  +(\frac{T}{t_{k-1}})^{a}\frac{2L}{n+t_{k-1}}
% \end{aligned}
% \end{equation}  
% where the last inequality uses the fact that $\mathbb{E}_{\mathcal{A}}[ \|\Delta_{T}\| | \Delta_{t_{k}}=0 ] \leq\big(\frac{T}{t_{k-1}}\big)^{ a}\frac{2L}{ n}$.
% \end{proof}
% =========== Theorem 6 ==================
% =========== Theorem 6 ==================
\begin{thm}[Uniformly Sampling SGD]
	Assume $f$ is $\beta$-smooth and $L$-Lipschitz. Running $T$ ($T>n$) iterations of SGD on $f(w;S)$ with step size $\alpha_t = \frac{a}{\beta t }$, the stability of SGD satisfies:
	\begin{equation}
	\mathbb{E}_{\mathcal{A}} \|\Delta_T\|   \leq \frac{4 L T^a}{n^{1+a}};\;\;\;\varepsilon_{stab} \leq  \frac{4L^2T^a}{n^{1+a}}
	\end{equation}
\end{thm}

\begin{proof}

We first decompose $\Delta_T$ as follows by selecting $t_k=n$:\\
\begin{equation}
\begin{aligned}
\mathbb{E}_{\mathcal{A}}\|\Delta_T\|=&\underbrace{\mathbb{E}_{\mathcal{A}}[\|\Delta_T\| | \Delta_{t_k}=0] \mathbb{P}[\Delta_{t_k}=0]}_\text{Term 1 $ \leq \frac{2LT^{a  }}{n^{1+a  }} $ (Lemma~\ref{lem_SGD})}+\underbrace{\mathbb{E}_{\mathcal{A}}[\|\Delta_T\| | \Delta_{t_k}\neq 0] \mathbb{P}[\Delta_{t_k}\neq 0]}_\text{Term 2 $\leq \frac{2LT^{a  }}{n^{1+a  }}$} 
\end{aligned}
\end{equation}
Term 1 is easily bounded by applying Lemma~\ref{lem_SGD} with $\alpha_t=\frac{a}{t\beta}$. To bound Term 2, we apply a scheme similar to the proof in the permutation case. Let $H = t$ denote the event that the first time SGD picks the differing entry is time $t$. Note that $H = t$ implies that $\Delta_s = 0$ for every $s$ before $t$, since all the gradient updates of $w$ and $w'$ up to that point are the same.

\begin{equation}
    \begin{aligned}
    &\mathbb{E}_{\mathcal{A}}[\|\Delta_T\| | \Delta_{n}\neq 0] \mathbb{P}[\Delta_{n}\neq 0]\\
     &= \mathbb{E}_{\mathcal{A}}[\|\Delta_T\| | H\leq n] \mathbb{P}[ H\leq n]\\
    &=  \mathbb{E}_{\mathcal{A}}[\|\Delta_T\|,  \bigcup_{t=1}^{n} H=t]\\
     &\leq \frac{1}{n}\sum_{t=1}^{n} \mathbb{E}_{\mathcal{A}}[\|\Delta_T\| | H = t]\\
        % &\leq \frac{1}{n}\sum_{t=1}^{n} \mathbb{E}[\|\Delta_T\| | H = t]  \\
        &{\leq} \frac{1}{n}\sum_{t=1}^{n} \left(\frac{T}{t}\right)^a\frac{2L}{n} \\
        &\leq \frac{2LT^a}{n^2}\int_{t=1}^{n}\frac{1}{t^a} dt\\
        &\leq \frac{2L T^a}{n^{1+a}} 
    \end{aligned}
\end{equation}
\end{proof}

% =========== Lemma 6 ==========
\begin{lemma}
    Assume $f$ is $\beta$-smooth, $L$-Lipschitz, and has a $\rho$-Lipschitz Hessian. Let $w_0$ be the initialization weight and $w_t$, $w_t'$ be the outputs of SGD on twin datasets $S$ and $S'$ respectively after $t$ iterations. Let $\Delta_t :=[w_t - w_t']$. Running SGD on $f(w;S)$ with step size $\alpha_t = \frac{b}{t}$, where $b \leq \min\{\frac{2}{\beta}, \frac{1}{8\beta^2\ln T^2}\}$, has the following properties:
    \begin{enumerate}
        \item The SGD update rule is a $(1+\alpha_t\psi_t)$-expander and is $\alpha_t L$-bounded. Here $\psi_t = \min\{\beta, \kappa_t\}$, where %$\kappa_t$ follows the definition in Lemma 6 in Lampert et al 2018.
        \[
        \kappa_t = \|\nabla^2 f(w_0, z_t)\|_2 + \frac{\rho}{2}\|\sum_{k=1}^{t-1}\alpha_k\nabla f(w_{S, k}, z_k)\| + \frac{\rho}{2}\|\sum_{k=1}^{t-1}\alpha_k\nabla f(w_{S^{'}, k}, z_k)\|
        \]
        \item $\mathbb{E}_{\mathcal{A}}[\|\Delta_{t+1}\||\Delta_{t_0}=0]\leq [1+(1-1/n)\alpha_t\psi_t]\mathbb{E}_{\mathcal{A}}[\|\Delta_{t}\||\Delta_{t_0}=0] + \frac{2\alpha_tL}{n}$.
        \item $\mathbb{E}_{S}\{\mathbb{E}_{\mathcal{A}}[\|\Delta_{T}\||\Delta_{t_0}=0]\} \leq \frac{L}{n}\left(\frac{T}{t_0}\right)^{\zeta b}$, where
        \[
        \zeta := \tilde{O}(\min\{\beta, E_z[\|\nabla^2f(w_0, z)\|_2] + \Delta^*_{1, \sigma^2}\})
        \]
        \[
        \Delta^*_{1, \sigma^2} = \rho\left(b\sigma + \sqrt{b\left(E_z[f(w_0, z)] - \inf_{w}E_z[f(w, z)]\right)}\right)
        \]
    \end{enumerate}
\end{lemma}
\begin{proof}
    \begin{enumerate}
        \item This result is Equation~(16) in \cite{kuzborskij2018data}.
        \item This conclusion follows from Equation~(19) in \cite{kuzborskij2018data}.
        \item This inequality is obtained in the proof of the theorem (part 3) in \cite{kuzborskij2018data}.
    \end{enumerate}
\end{proof}

% =========== Theorem 7 ==========
\begin{thm}[Data-dependent version of Theorem~\ref{permutation}]\label{permutation_new}
    Assume $f$ is $\beta$-smooth, $L$-Lipschitz, and has a $\rho$-Lipschitz Hessian. Let $w_0$ be the initialization weight and $w_t$, $w_t'$ be the outputs of SGD on twin datasets $S$ and $S'$ respectively after $t$ iterations. Let $\Delta_t :=[w_t - w_t']$ and $\delta_t = \mathbb{E}_{\mathcal{A}}\|\Delta_t\|$. Running SGD on $f(w;S)$ with step size $\alpha_t = \frac{b}{t}$, where $b \leq \min\{\frac{2}{\beta}, \frac{1}{8\beta^2\ln T^2}\}$, has the following properties:
	\begin{equation}
	\mathbb{E}_S[\delta_T]   \leq\frac{ 2L T^{\zeta b}}{\zeta n^{1+\zeta b}}, \widehat{\varepsilon_{stab}}(\mathcal{D}, w_0) \leq  \frac{2L^2T^{\zeta b}}{\zeta n^{1+\zeta b}}
	\end{equation}
\end{thm}
\begin{proof}
\begin{equation}
\begin{aligned}
        \mathbb{E}_S[\delta_T]&  =  \mathbb{E}_S[\delta_T| H \leq n] \mathbb{P} [H \leq n] + \underbrace{\mathbb{E}_S[\delta_T| H > n] \mathbb{P} [H > n]}_{0 \text{(permutation)}} \\
        &\leq \frac{1}{n}\sum_{t=1}^{n} \mathbb{E}_S[\delta_T | H = t]\\
        &{\leq} \frac{1}{n}\sum_{t=1}^{n} \left(\frac{T}{t}\right)^{\zeta b}\frac{L}{\zeta n} \\
        &\leq \frac{LT^{\zeta b}}{\zeta n^2}\int_{t=1}^{n}\frac{1}{t^{\zeta b}} dt + \frac{LT^{\zeta b}}{\zeta n^2}\\
        &\leq \frac{2L T^{\zeta b}}{\zeta n^{1+\zeta b}}
    \end{aligned}
\end{equation}
\end{proof}

% =========== Lemma 7 ==============
% \begin{lemma}[Data-dependent version of Lemma~\ref{lem_prob_rule}]\label{lem_prob_rule_new}
% 	Let $w_t, w_t'$ be outputs of $SGD$ on twin datasets $S,S'$ respectively after $t$ iterations and let $\Delta_{t}:=w_t-w_t'$. And $b$, $\zeta$ follow the same definition in Lemma \ref{lem_SGD_data_dep}. Suppose that $t_k = ct_{k-1}$. Then the following condition holds:
% 		$$\mathbb{E}_S\mathbb{E}_{\mathcal{A}}[ \|\Delta_{T}\|| \Delta_{t_{k}}\neq 0 ]
% 		\leq \mathbb{E}_S\mathbb{E}_{\mathcal{A}}[ \|\Delta_{T}\| | \Delta_{t_{k-1}}\neq 0 ]  \frac{1}{c}\left(1+\frac{t_k}{n}\right)+\big(\frac{T}{t_{k-1}}\big)^{\zeta b}\frac{L}{\zeta n}$$
% \end{lemma}
% \begin{proof}
% \begin{equation}
%     \begin{aligned}
%         &\mathbb{E}_S\mathbb{E}_{\mathcal{A}}[ \|\Delta_{T}\|| \Delta_{t_{k}}\neq 0 ]  \leq \mathbb{E}_S\mathbb{E}_{\mathcal{A}}[ \|\Delta_{T}\|| \Delta_{t_{k-1}}\neq 0 ]\mathbb{P}[ \Delta_{t_k-1} \neq0| \Delta_{t_{k}}\neq 0 ] \\
%         &+ \mathbb{E}_S\mathbb{E}_{\mathcal{A}}[ \|\Delta_{T}\| | \Delta_{t_{k-1}}= 0 ] \mathbb{P}[ \Delta_{t_k-1}=0| \Delta_{t_{k}}\neq 0 ] \\
%         &\leq \mathbb{E}_S\mathbb{E}_{\mathcal{A}}[ \|\Delta_{T}\| | \Delta_{t_{k-1}}\neq 0 ]  \frac{t_{k-1}}{t_k}(1+\frac{t_k}{n})+(\frac{T}{t_{k-1}})^{\zeta b}\frac{L}{\zeta(n+t_{k-1})}\\
%         &=\frac{1}{c}(1+\frac{t_k}{n})\mathbb{E}_S\mathbb{E}_{\mathcal{A}}[ \|\Delta_{T}\| | \Delta_{t_{k-1}}\neq 0 ] +(\frac{T}{t_{k-1}})^{\zeta b}\frac{L}{\zeta(n+t_{k-1})}
%     \end{aligned}
% \end{equation}
% The second inequality follows Lemma~\ref{lem_SGD_data_dep}.
% \end{proof}




% =========== Theorem 8 ==============
\begin{thm}[Data-dependent version of Theorem~\ref{unif_indep}]%\label{unif_dep}
	Assume $f$ is $\beta$-smooth, $L$-Lipschitz, and has a $\rho$-Lipschitz Hessian. Let $w_t$ and $w_t'$ be the outputs of SGD on twin datasets $S$ and $S'$ respectively after $t$ iterations, and let $\Delta_t :=[w_t - w_t']$ and $\delta_t = \mathbb{E}_{\mathcal{A}}\|\Delta_t\|$. Let $\zeta$ be defined as in Theorem~\ref{permutation_new}. Running SGD on $f(w;S)$ with step size $\alpha_t = \frac{b}{t}$, where $b < 1$, has the following properties:
	%Let $f$ satisfies the assumptions in Theorem \ref{permutation_new}, we have the followings properties:
    \begin{equation}
	\mathbb{E}_{S}\mathbb{E}_{\mathcal{A}} \|\Delta_T\|   \leq \frac{ 16LT^{\zeta b}}{\zeta n^{1+\zeta b}};\;\;\;\widehat{\varepsilon_{stab}}(\mathcal{D}, w_1) \leq  \frac{16L^2 T^{\zeta b}}{\zeta n^{1+\zeta b}}
	\end{equation}
\end{thm}
	
\begin{proof}
% We follow the assumption and proof  in Theorem~\ref{unif_indep}. To bound the Term 1 in Theorem~\ref{unif_indep}, we directly apply Lemma \ref{lem_SGD_data_dep}. To bound Term 2, we recursively apply Lemma \ref{lem_prob_rule_new} and set $t_{i+1} = ct_i$. We have
We follow the assumptions and proof of Theorem~\ref{unif_indep}. To bound Term 1 in Theorem~\ref{unif_indep}, we directly apply Lemma~\ref{lem_SGD_data_dep}.
 To bound Term 2, we apply a scheme similar to the proof in the permutation case.

\begin{equation}
    \begin{aligned}
    &\mathbb{E}_{\mathcal{A}}[\|\Delta_T\| | \Delta_{n}\neq 0] \mathbb{P}[\Delta_{n}\neq 0]\\
     &= \mathbb{E}_{\mathcal{A}}[\|\Delta_T\| | H\leq n] \mathbb{P}[ H\leq n]\\
    &=  \mathbb{E}_{\mathcal{A}}[\|\Delta_T\|,  \bigcup_{t=1}^{n} H=t]\\
     &\leq \frac{1}{n}\sum_{t=1}^{n} \mathbb{E}_{\mathcal{A}}[\|\Delta_T\| | H = t]\\
        % &\leq \frac{1}{n}\sum_{t=1}^{n} \mathbb{E}[\|\Delta_T\| | H = t]  \\
        &{\leq} \frac{1}{n}\sum_{t=1}^{n} \left(\frac{T}{t}\right)^{\zeta b}\frac{2L}{\zeta n} \\
        &\leq \frac{2LT^{\zeta b}}{\zeta n^2}\int_{t=1}^{n}\frac{1}{t^{\zeta b}} dt\\
        &\leq \frac{2L T^{\zeta b}}{ \zeta n^{1+\zeta b}} 
    \end{aligned}
\end{equation}



% We have
% \begin{equation}
% \begin{aligned}
% &\mathbb{E}_{S}\mathbb{E}_\mathcal{A}[\|\Delta_T\|| \Delta_{t_k} \neq 0]\mathbb{P}[\Delta_{t_k} \neq 0]\\
% &\leq\frac{L}{\zeta n} \frac{t_k}{n}\sum_{i=1}^{k-1} (\frac{T}{t_i})^{\zeta b}\frac{n}{n+t_i} \prod_{\tau =i+1}^{k-1} (1+\frac{t_{\tau+1}}{n})\frac{t_\tau}{t_{\tau+1}}\\
% %&\leq \frac{L}{\zeta n}\sum_{i=1}^{k-1} (\frac{T}{t_i})^{b}\frac{t_{i+1}}{n+t_i}
% %exp(\sum_{\tau =i+1}^{k-1}\frac{t_{\tau+1}}{n})	\\
% %& \leq\frac{cL}{\zeta n}exp\left( \frac{c}{c-1}\right)\sum_{i=1}^{k-1} (\frac{T}{t_i})^{  b}\frac{t_{i}}{n+t_i}	\\
% %& \leq\frac{cL T^b }{\zeta n}exp\left( \frac{c}{c-1}\right)\sum_{i=1}^{k-1}\frac{t_{i}^{1-b}}{n}\\
% & \leq\frac{L \log(n)T^{\zeta b}}{\zeta n^{1+\zeta b}} \frac{c^{\zeta b}}{\log c}exp\left( \frac{c}{c-1}\right)\\
% &\leq \frac{11\log (n)L T^{\zeta b}}{\zeta n^{1+b}}
% \end{aligned}
% \end{equation}
% By applying $c = 4$ and following same procedure in proving Theorem~\ref{unif_indep} we obtain the last inequality.
% Therefore, we could bound $\mathbb{E}_{S}\mathbb{E}_\mathcal{A}[\|\Delta_T\|]$ by adding two terms together and get
% \begin{equation}
%     \mathbb{E}_{S}\mathbb{E}_{\mathcal{A}} \|\Delta_T\|   \leq 16 \log(n)L\frac{T^{\zeta b}}{\zeta n^{1+\zeta b}}
% \end{equation}
\end{proof}



% ========== Theorem 9 ===========
\begin{thm}%\label{noncvx_lowerbound}
Let $w_t, w_t'$ be the outputs of SGD on twin datasets $S,S'$, and let $\Delta_{t}:=w_t-w_t'$. There exists a function $f$ which is non-convex and $\beta$-smooth, twin sets $S,S'$ and constants $a,\zeta$ such that the divergence of SGD after $T$ rounds ($T>n$) using constant step size $\alpha = \frac{a}{0.99 \zeta }$ satisfies:
\begin{equation}
    % \mathbb{E}\|\Delta_T\| \geq \frac{1}{n^2}e^{aT/2}
    \varepsilon_{stab} \geq \frac{1}{n^2}e^{aT/2}
\end{equation}
\end{thm}
% \sammy{what is meant by hitting time, please clarify}
\begin{proof}
The proof is similar to that of Theorem~\ref{thm3}. Since $\Delta_{t} \in \mathrm{Span}\{x_i-x_i'\}$, we have:
$$\mathbb{E}_{\mathcal{A}}\|\Delta_{t+1}\| \geq (1-\frac{1}{n})(1+\alpha_t\beta)\mathbb{E}_{\mathcal{A}}\|\Delta_t\| +\frac{\alpha_t}{n} \|x_i-x_i'\|   $$
Suppose $t_0$ is the hitting time, i.e., $\|\Delta_{t_0}\| >0$ and $\|\Delta_{t_0-1}\| =0$. Then $\|\Delta_{T}\| \geq \frac{\|x_i-x_i'\|}{3n}e^{a(T-t_0)/2} $.
\begin{equation}
    \begin{aligned}
    \mathbb{E}_{\mathcal{A}}\|\Delta_T\| &= \mathbb{E}_{\mathcal{A}}[\|w_T-w_T'\|| \Delta_1 = 0]\mathbb{P}[\Delta_1 =0]+ \mathbb{E}_{\mathcal{A}}[\|w_T-w_T'\|| \Delta_1 \neq 0]\mathbb{P}[\Delta_1 \neq 0]\\
    &\geq \mathbb{E}_{\mathcal{A}}[\|w_T-w_T'\|| \Delta_1 \neq  0]\mathbb{P}[\Delta_1 \neq  0]\\
    &= \frac{1}{n}( \frac{\|x_i-x_i'\|}{n}e^{aT/2})\\
    &= \frac{\|x_i-x_i'\|}{n^2}e^{aT/2}.
    \end{aligned}
\end{equation}
By a proof similar to that of Theorem~\ref{thm3}, one can obtain the stability lower bound.
\end{proof}
\bibliography{zhang_654-supp.bib}
\end{document}
