\documentclass[accepted]{uai2022} % for initial submission
% \documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}
\usepackage{url}            % simple URL typesetting
\usepackage{amsfonts}       % blackboard math symbols
% \usepackage{amsmath}
\usepackage{booktabs}
\usepackage{comment}
% \setlength{\marginparwidth}{2cm}
\usepackage[colorinlistoftodos]{todonotes}
\usepackage{amsthm}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{bbm}
\usepackage{nicefrac}  
\usepackage{array}
\usepackage{color}
%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{amsmath}
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\newtheorem{thm}{Theorem}
\newtheorem{ifthm}{Informal Theorem}
\newtheorem{lemma}{Lemma}
\newtheorem{cor}{Corollary}
\newtheorem{prop}{Proposition}
% \theoremstyle{definition}
\newtheorem{definition}{Definition}
\newtheorem{remark}{Remark}
\newtheorem{example}{Example}
\newtheorem{assume}{Assumption}
\newtheorem{obs}{Observation}
\newtheorem{claim}{Claim}



\title{Stability of SGD: Tightness Analysis and  Improved Bounds}
% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
% \author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2022 paper}{Jane~J.~von~O'L\'opez}{}}
\author[1]{Yikai~Zhang $^*$}
\author[2]{Wenjia~Zhang $^*$}
\author[3]{Sammy~Bald $^*$}
\author[4]{Vamsi~Pingali}
\author[5]{Chao~Chen}
\author[3]{Mayank Goswami}
% Add affiliations after the authors
\affil[1]{%
    Machine Learning Research.\\
    Morgan Stanley
}
\affil[2]{%
    Computer Science Dept.\\
    Rutgers University
}
\affil[3]{%
    Computer Science Dept.\\
    Queens College of CUNY
 }
\affil[4]{
    Mathematics Dept.\\
    Indian Institute of Science
}
\affil[5]{
    Biomedical Informatics Dept.\\
    Stony Brook University
}
  
\begin{document}
\maketitle
\def\thefootnote{*}\footnotetext{These authors contributed equally to this work}\def\thefootnote{\arabic{footnote}}

\begin{abstract}
Stochastic Gradient Descent (SGD) based methods have been widely used for training large-scale machine learning models that also generalize well in practice. Several explanations have been offered for this generalization performance, a prominent one being algorithmic stability \citep{hardt2016train}. However, there are no known examples of smooth loss functions for which the analysis can be shown to be tight. Furthermore, apart from properties of the loss function, data distribution has also been shown to be an important factor in generalization performance.
This raises the question: is the stability analysis of \citep{hardt2016train} tight for smooth functions, and if not, for what kind of loss functions and data distributions can the stability analysis be improved? 

In this paper we first settle open questions regarding tightness of bounds in the data-independent setting: we show that for general datasets, the existing analysis for convex and strongly-convex loss functions is tight, but it can be improved for non-convex loss functions. Next, we give novel and improved data-dependent bounds: we show stability upper bounds for a large class of convex regularized loss functions, with \emph{negligible  regularization} parameters, and improve existing data-dependent bounds in the non-convex setting. We hope that our results will initiate further efforts to better understand the data-dependent setting under non-convex loss functions, leading to an improved understanding of the generalization abilities of deep networks.

\end{abstract}

\section{Introduction}
\emph{Stochastic gradient descent} (SGD) has gained great popularity in solving machine learning optimization problems~\citep{kingma2014adam,johnson2013accelerating}. SGD leverages the finite-sum structure of the objective function, avoids the expensive computation of exact gradients, and thus provides a feasible and efficient optimization solution in large-scale settings~\citep{bottou2012stochastic}.
The convergence and the optimality of SGD have been thoroughly studied ~\citep{ge2015escaping,rakhlin2012making,reddi2018convergence,zhou2019lower,carmon2019lowera,carmon2019lowerb,shamir2013stochastic}. 

In recent years, new research questions have been raised regarding SGD's impact on a model's generalization power. 
The seminal work \citep{hardt2016train} tackled the problem using the \emph{algorithmic stability} of SGD, i.e., the progressive sensitivity of the trained model w.r.t.~the replacement of a single (test) datum in the training set. They showed that the generalization error of an SGD-trained model is upper bounded by a uniform stability parameter $\varepsilon_{\text{stab}}$, and related $\varepsilon_{\text{stab}}$ to the divergence of the two parameter vectors obtained by training on twin datasets.

This stability-based analysis of the generalization gap allows one to bypass classical model capacity theorems~\citep{vapnic1998statistical, koltchinskii2000rademacher} or weight-based complexity theorems~\citep{neyshabur2017exploring, bartlett2017spectrally, arora2018stronger}.  
This framework also provides theoretical insights into many phenomena observed in practice, e.g.,  
the ``train faster, generalize better'' phenomenon, the power of regularization techniques such as weight decay~\citep{krogh1992simple}, dropout~\citep{srivastava2014dropout}, and gradient clipping.
Other works have developed the stability notion with advanced analysis ~\citep{bassily2020stability,feldman2019high,kuzborskij2018data,lei2020sharper,lei2021generalization,lei2020fine} and adapted it into more sophisticated settings such as Stochastic Gradient Langevin Dynamics and momentum SGD~\citep{mou18a,chaudhari2019entropy,chen2018stability,Li2020On,lei2021stability}.     

% \mayank{Only D of Dropout is capitalized. Why?}
Despite the promises of this stability-based analysis, it remains open whether the analysis in ~\citep{hardt2016train} can be further improved to reveal the full potential of the stability method, either in general or for specific data-distributions. 

\begin{table*}[h]
\begin{center}
\caption{Current landscape of stability bounds. [H] indicates results in~\citep{hardt2016train}, [K] indicates results in~\citep{kuzborskij2018data} and * indicates results in this paper. $\beta$ is the smoothness parameter. $\zeta$ is a data-dependent constant defined in Lemma~\ref{lem_SGD_data_dep}. $\widehat{\varepsilon}_{\text{stab}}$ is the on-average stability defined in Definition~\ref{avg_stab}. $a$, $b$ are small constants free of $T$ and $n$. We only keep the $T$ and $n$ terms in the bounds.}
\label{tab:results-summary}
%Bounds without [H], [K] or * are trivial.
% \vskip 0.15in
\renewcommand{\arraystretch}{0.9}
\begin{tabular}{||l||p{.14\textwidth}|p{.14\textwidth}|p{.18\textwidth}|p{.21\textwidth}||}
\hline
\textit{SGD Step Size} &  \multicolumn{2}{c|}{Constant $\alpha_{t} = a/\beta$} & $\alpha_{t}= a/(\beta t)$ & $\alpha_t = b/t$\\
\hline
\textit{Loss function} & Strongly Convex  & Convex & Non-Convex & Non-Convex with $\widehat{\varepsilon}_{\text{stab}}$\\
\hline
\hline
%\textit{Upper Bound} & $O(\frac{1}{n})$  [H]  & $O(aT/n)$   [H] & $O \left(T^{\frac{a}{1+a}}/n\right)$ [H] $ O\left(T^a/{n^{1+a}}\right)$* & $O\left((\mathbb{E}_S[R(\mathcal{A}_S)]T)^{\frac{\zeta b}{1+\zeta b}}/n\right)$[K] $O(T^{\zeta b}/n^{1 + \zeta b})^*$ \\
\textit{Upper Bound} & $O(\frac{1}{n})$  [H]  & $O(T/n)$  [H] & $O \left(T^{\frac{a}{1+a}}/n\right)$ [H] $ O\left(T^a/{n^{1+a}}\right)$*  & $O\left(T^{\frac{\zeta b}{1+\zeta b}}/n\right)$[K] $O(T^{\zeta b}/n^{1 + \zeta b})^*$ \\
\hline
%\textit{Lower Bound} & $\Omega(\frac{1}{n})^*$  & $\Omega(aT/n)$* & $\Omega(\frac{T^a}{n^{1+a}})$*& Open\\
\textit{Lower Bound} & $\Omega(\frac{1}{n})^*$    & $\Omega(T/n)$* & $\Omega({T^a}/{n^{1+a}})$*  & Open\\

%  Open, Evidence$^*$
% \hline
% Landscape & \includegraphics[width=.17\textwidth]{Stronglyconvex.png} &  Figure & Figure & \includegraphics[width=.17\textwidth]{fig/Nonconvex.png} \\
\hline
\end{tabular}
\end{center}
\end{table*}

\noindent\textbf{Our results:} We provide three kinds of results (see Table~\ref{tab:results-summary}) that complement each other: a) tight lower bounds that show settings where stability analysis cannot be improved further for general datasets, b) weaker lower bounds that hint at a possible improvement, along with complementary improved upper bounds, also for general datasets and c) in settings where existing data-independent analysis cannot be improved, we derive improved data-dependent bounds. Below we summarize some of the existing open questions in this line of research, grouped according to properties of the loss function, along with our results addressing these problems.
% \chao{Could we use bold fonts for all the questions and results?}\mayank{Done}
% \chao{Before diving into all the details, could we add a brief summary of what will come?}\mayank{Summary above}


\subsection{Convex and Strongly Convex Loss}
The following are the main results presented in \citep{hardt2016train} for convex and strongly-convex loss functions (with certain Lipschitz and smoothness conditions), when optimized using SGD. Here $n$ denotes the size of the sample, $T$ the number of steps in SGD, and $\alpha_t$ the size of the SGD step in the $t$-th iteration.

\noindent{1.} For convex loss functions, the stability is upper bounded by $\sum_{t=1}^{T} \alpha_{t}/n$. The smaller the number of iterations $T$ is, the lower this upper bound. Hence ``train faster, generalize better''.

\noindent{2.}  In practice, one often uses a constant step size: $\alpha_t=\alpha$. For convex loss functions the upper bound would then scale linearly in the number of iterations $T$, which seems to be too pessimistic. \citep{hardt2016train} show that by adding a $\frac{\mu}{2} \vert\vert w\vert\vert_{2}^{2}$ regularization term to the convex loss function, where $w$ is the vector of weights and $\mu \in \Theta(1)$ is a small constant, one gets a much better stability upper bound for constant step size, which does not depend on $T$ and is $O(1/n)$.

This gives rise to the following questions:

\noindent\textbf{Question 1:} Are the upper bounds of \citep{hardt2016train} for convex and strongly-convex functions tight? That is, can one construct loss functions that satisfy the hypotheses and exhibit the claimed worst-case stability performance?

We remark that, to the best of our knowledge, the only construction available in the literature is ~\citep{bassily2020stability}. The authors analyze the stability of a loss function in order to derive lower bounds, but unfortunately, the loss function is not smooth and therefore does not satisfy the hypothesis in \citep{hardt2016train}.

\noindent\textbf{Question 2:} How important is the regularization term in order to make the transition from convex to strongly-convex, and therefore the improvement from an $O(T/n)$ upper bound to an $O(1/n)$ upper bound for constant step-size SGD?

We provide the following answers to the above questions:

\noindent\textbf{Result 1:} The answer to question 1 is yes, i.e., there exist smooth, convex and strongly-convex loss functions that achieve the worst-case stability upper bound. In Theorem~\ref{CvxLowerBound}, we construct a Huber function which is quadratic in a certain area and linear outside. Under certain restricted assumptions, we prove the tightness of the upper bounds in~\citep{hardt2016train} for convex loss, which strengthens the lower bound of~\citep{bassily2020stability} for the non-smooth case.  In Theorem~\ref{strongly_cvx_lowerbound}, our construction shows the tightness of the upper bounds in~\citep{hardt2016train} for strongly convex loss.


\noindent\textbf{Result 2: (Data-dependent bounds)} We answer question 2 by introducing Theorem~\ref{cvxupperbound}. In Theorem \ref{cvxupperbound}, we derive an upper bound on the stability for linear model loss function that is independent of $T$ (the number of iterations), even when the weight $\mu$ of the regularization term is very small (of the order of $1/n^4$), as long as the data satisfies a natural condition related to the Second Moment. Sharing a similar spirit with \citep{kuzborskij2018data}, our result suggests that the property of distribution plays an important role in generalization of SGD, and nice properties of the data can almost replace the need for regularization.

\subsection{Non-Convex Loss} \citep{hardt2016train} also prove an upper bound for non-convex loss functions, and one wonders again whether the bound is tight. After only being able to prove a slightly weaker lower bound, we realized that this was because one can actually improve the analysis in~\citep{hardt2016train}! 

\noindent\textbf{Result 3:} We provide matching lower (Theorem~\ref{thm3}) and upper (Theorem~\ref{unif_indep}) bounds on the stability of SGD for non-convex functions that are tighter than the upper bound in \citep{hardt2016train} for a wide and interesting range of values of $T$ (e.g., when $n<T<n^{10}$).

In the non-convex setting, the bounds in both \citep{hardt2016train} and our Result 3 assume a decreasing step-size $\alpha_t \propto 1/t$ in SGD. However, in practice the constant step-size case is very important. Although it is not derived formally, the techniques in \citep{hardt2016train} can be employed to show an \emph{exponential} upper bound for non-convex loss functions minimized using SGD with constant-size step, raising the question of the existence of better analysis. 

\noindent{\textbf{Result 4:}} Also by Theorem~\ref{noncvx_lowerbound}, we show that without any additional assumptions on either the loss function or the data distribution, improving on this analysis is hopeless by providing a lower bound that is exponential in $T$. 

\noindent{Data-dependent bounds:} This naturally raises the question of deriving data-dependent bounds on stability in the non-convex setting. The work in \citep{kuzborskij2018data} took the first step in this direction by analyzing SGD using the concept of ``average stability'' from~\citep{bousquet2002stability,shalev2010learnability}, and deriving upper bounds on it. Finally, we show:

\noindent\textbf{Result 5:} The improved analysis for uniform stability of SGD on non-convex and smooth loss functions can also be applied to improve on the result in~\citep{kuzborskij2018data} and obtain a tighter bound for the average stability of SGD. We present Theorem~\ref{unif_dep} as the data-dependent version of Theorem~\ref{unif_indep}.

In summary, we essentially close the open questions of tightness in data-independent settings for all three classes of functions, and improve upper bounds in the data-dependent setting. We hope that our results will initiate further efforts to better understand the data-dependent setting under non-convex loss functions and analyze the conditions under which one can expect better upper bounds on stability and generalization of SGD. 

% ======= Original Table format ========


\section{Related Works} \label{related}

% \textbf{Stability and generalization.}
 The stability framework suggests that a stable machine learning algorithm results in models with good generalization performance \citep{kearns1999algorithmic,bousquet2002stability,elisseeff2005stability,shalev2010learnability,devroye1979distributiona,devroye1979distributionb,rogers1978finite}. It serves as a mechanism for provable learnability when uniform convergence fails \citep{shalev2010learnability,nagarajan2019uniform}. 
The concept of uniform stability was introduced in order to derive high probability bounds on the generalization error~\citep{bousquet2002stability}. Uniform stability describes the worst-case change in the loss of a model trained on an algorithm when a single data point in the dataset is replaced.  In~\citep{hardt2016train}, a uniform stability analysis for \emph{iterative algorithms} is proposed to analyze SGD, generalizing the one-shot version in~\citep{bousquet2002stability}. Algorithmic uniform stability is widely used in analyzing the generalization performance of SGD~\citep{mou18a,feldman2019high,chen2018stability}.
The worst-case leave-one-out type bounds also closely connect uniform stability with \emph{differentially private learning}~\citep{feldman2018privacy,feldman2020private,dwork2006calibrating,wu2017bolt}, where the uniform stability can lead to provable privacy guarantees. Besides uniform stability,~\citep{liu2017algorithmic} proposed \textit{argument stability} to capture stability of selected hypothesis function space.
%  and makes generalization rate faster than $O\left(1/\sqrt{n}\right)$ possible for convex loss. 

While the upper bounds of algorithmic stability of SGD have been extensively studied, the tightness of those bounds remains open. In addition to uniform stability, an \textit{average stability} of the SGD is studied in \citep{kuzborskij2018data} where the authors provide \textit{data-dependent} upper bounds on stability\footnote{While it is an interesting open problem to get data-dependent lower bounds by lower bounding the average stability, we construct lower bounds on the worst-case stability. Thus our lower bounds are general and not data-dependent.}.  Our analysis framework for deriving improved bounds in ~\citep{hardt2016train} can also be applied to improve the data-dependent stability results in ~\citep{kuzborskij2018data}.
% In particular, in \citep{hardt2016train} it was observed that the uniform stability of SGD in non-convex setting may not be able to explain the generalziation of deep learning model. This casts doubt on the optimality of existing analysis.

In \citep{bassily2020stability}, a lower bound on the stability of SGD for nonsmooth convex losses is proposed. The lower bound is designed to illustrate the tightness of the stability analysis \emph{without} smoothness assumptions. In this work, we report for the first time lower bounds on the uniform stability of SGD for smooth loss functions. 
% Our analysis show that existing bounds are optimal within  a reasonable range.  
Our tightness analysis suggests the necessity of additional assumptions for analyzing the generalization of SGD for deep learning.  


%\citep{nagarajan2019uniform}. In \citep{bassily2020stability}

% Recently, \citep{kawaguchi2016deep} proved that for linear and nonlinear deep models, local minima are also global minima.
% Already in early work on deep learning~\citep{hochreiter1995simplifying}, training straightforward linear networks converged to large, flat minima with low generalization error.
% ~\citep{keskar2016large} showed that using a large batch size during training leads networks to converge to sharp minima with high generalization error. Extending the investigation of deep loss landscapes,~\citep{wu2017towards} proved that for 2-layer networks, there exist good minima which lie in flat regions where the attractor basin has large volume. In this case, the general training process tends to converge to good minima with high probability. To visualize the loss landscape,~\citep{li2018visualizing} proposed filter-wise normalization visualization. Using this method, they visualized the loss landscape for various network architectures and empirically showed that flatter minima  correspond with better generalization error. wu2017towards

% Using the Hessian matrix to approximate the Fisher information,~\citep{jia2019information} proposed a metric based on Fisher information to bound the generalization error of flat minima regions.
% \cc{This is confusing. What do you mean about \citep{jia2019information}?} \yikai{removed}
% \textbf{Geometry of local minima.} 
% The geometry of local minima  plays an important role in the generalization performance of deep neural networks ~\citep{hochreiter1995simplifying,wu2017towards}. 
% The flat minima, i.e., minima whose Hessians have a large portion of zero-valued eigenvalues, are believed to attain better generalization~\citep{keskar2016large,li2018visualizing}.
% In~\citep{chaudhari2019entropy}, the authors construct a local entropy-based objective function which converges to a solution with good generalization in a flat region, where ``flatness" means that the Hessian matrix has a large portion of nearly-zero eigenvalues. 
% However, these observations have not been supported theoretically. 
% % However \citep{hardt2016train} results suggest since the stability of SGD on non-strongly convex region can be worse compared to the strongly convex region thus large positive eigenvalue  in the Hessian should help. While the existing results from different perspective can not meet with each other, we propose the Hessian contractive conditionthat may fill up such gap. 
% In this paper, we propose the Hessian contractive condition that is slight stronger than flat minima. 
% Such condition suggests that the minima is sharp only in the gradient direction while remains flat in other directions, which  unifies the geometrical interpretation of 
% flat minima and uniform stability analysis.
% \textbf{Empirical and Population Risks}
% In summary, we provide tightness results on stability bounds of SGD for different types of functions. Our results imply that the  exponential stability bound for non-convex functions is indeed tight. Both non-convex and standard convex may not be ideal conditions to study the stability. Alternatively, we propose a stronger yet sufficiently flexible condition for the loss near local minima. We show a constant stability bound for such condition and empirically verify such condition in deep neural network training.
% We consider our paper as one step forward in understanding the generalization power of deep neural networks. The results can be used to analyze models trained with more sophisticated optimization algorithms~\citep{mou18a,chaudhari2019entropy,chen2018stability}.
% We view this work as a step toward bridging the gap between the empirical performance and theoretical guarantee of the generalization power of deep neural nets. In summary, our contributions are as follows:
% minima is sharp only in the gradient direction

% \cc{This list of contributions might not be necessary.}
% \begin{itemize}
%     \item
%     In Theorem \ref{CvxLowerBound} we show that for convex, smooth loss functions, the existing stability bounds on SGD are tight. 
%     \item
%     In Theorem \ref{NoncvxLowerbound} we show that for non-convex, smooth loss functions, the existing stability bounds on SGD are not tight. We provide tight upper bounds in Theorem \ref{tight_bound}. However, this still doesn't explain the generalization of deep learning models.   
%     \item
%     In section \ref{geom-local-min} we hypothesize that a Hessian contractive region in the loss landscape--as measured by local convexity--stablizes the SGD iterate thus
%     regulates the generalization performance of deep models. We verify this hypothetical condition empirically by analyzing  statistics of approximate Hessian products  for  several popular deep architectures. 
% \end{itemize}


% \begin{table}[hbtp]
% \begin{center}
% \caption{Current landscape of stability bounds. [H] indicates results in~\citep{hardt2016train}, and * indicates results in this paper. Bounds without [H] or * are trivial. $\beta$ is the smoothness parameter.}
% \renewcommand{\arraystretch}{0.8}
% \begin{tabular}{|l|p{.15\textwidth}|p{.15\textwidth}|p{.18\textwidth}|p{.18\textwidth}|}
% \hline
% \textit{SGD Step Size} &  \multicolumn{2}{c|}{Constant $\alpha_{t} = a/\beta$} & $\alpha_{t}= a/(\beta t)$ &Constant $\alpha_{t} = a/\beta$\\
% \hline
% \textit{Loss function} & Strongly Convex  & Convex & Non-Convex & Hessian Contractive\\
% \hline
% Upper Bound & $O(1)$  [H]  & $O(aT/n)$   [H] & $O \left(T^{\frac{a}{1+a}}/n\right)$ [H] $ O\left(T^a/{n^{1+a}}\right)$* & Theorem ~\ref{nonconvex}* \\
% \hline
% Lower Bound & $\Omega(1)$  & $\Omega(aT/n)$* & Open, evidence* & $\Omega(1)$\\
% % \hline
% % Landscape & \includegraphics[width=.17\textwidth]{Stronglyconvex.png} &  Figure & Figure & \includegraphics[width=.17\textwidth]{fig/Nonconvex.png} \\
% \hline
% \end{tabular}
% \end{center}
% \label{tab:results-summary}
% \end{table}

% \textbf{Older figure. Need pictures for middle two columns}
% \begin{figure}[hbtp]
%     \centering
%     \includegraphics[width=.7\textwidth]{fig/spectrum.png}
%     \caption{Different functions and their tight stability bounds. }
%     \label{fig:motivation}
% \end{figure}\mayank{I know. I told Chao that baguettes would be better.}

% \begin{equation}
%     f(w,S) = \frac{1}{n} \sum_{x_j \in S} f(w,z_i)
% \end{equation}
% The SGD 
% In addition, it In \citep{hardt2016train}, the algorithmic stability upper bound of SGD is proposed on various of loss functions. While those analysis provide 
% knowledge of the stability of SGD can be applied to bound the privacy cost. 





\section{Preliminaries} \label{preliminary}
In this section we introduce the notion of uniform stability and  establish notation. We first introduce the quantities \emph{empirical risk}, \emph{population risk}, and \emph{generalization gap}.
Given an unknown distribution $\mathcal{D}$ on labeled sample space $Z=X \times \mathbb{R}$, let $S=\{z_1,...,z_n\}$ denote a set of $n$ samples $z_i=(x_i,y_i)$ drawn i.i.d. from $\mathcal{D}$. Let $w \in \mathbb{R}^{d}$ be the parameter(s) of a model that predicts $y$ given $x$, and let $f$ be a loss function where $f(w;z)$ denotes the loss of the model with parameter(s) $w$ on sample $z$. Let $f(w;S)$ denote the \textit{empirical risk} 
% \setlength{\belowdisplayskip}{0pt} \setlength{\belowdisplayshortskip}{0pt}
% \setlength{\abovedisplayskip}{0pt} \setlength{\abovedisplayshortskip}{0pt}
$f(w;S)=E_{z \sim S }[f(w;z)]=\frac{1}{n} \sum_{i=1}^{n} f(w;z_i)$
with corresponding \textit{population risk}
$E_{z \sim \mathcal{D} }[f(w;z)]$.
The \textit{generalization error} of the model with parameter(s) $w$ is defined as the difference between the empirical and population risks:    
\begin{equation*}
\vert E_{z\sim \mathcal{D}} [f(w;z)] -E_{z\sim S} [f(w;z)]\vert. 
\end{equation*}
Next we introduce \emph{stochastic gradient descent} (SGD). We follow the setting of \citep{hardt2016train}: starting with initialization $w_{0} \in \mathbb{R}^{d}$, an SGD update step takes the form
\begin{equation*}
w_{t+1} = w_t-\alpha_t\nabla_w f(w_t;z_{i_t})
\end{equation*}
% \Wenjia{In Hardt, they define the Lipschitzness and smoothness on $f(\cdot)$ first, then expand these to $f(\cdot, z)$.}
where $i_t$ is drawn from $[n]=\{1,2,\dots,n\}$ uniformly and independently in each round. Let $\mathcal{W}$ be a convex and compact set to be optimized over. For projected SGD we let $$w_{t+1} = \Pi_{\mathcal{W}} \bigg(w_t-\alpha_t\nabla_w f(w_t;z_{i_t}) \bigg)$$
where  $\Pi_{\mathcal{W}}(v) = \argmin_{w\in\mathcal{W}} \|w-v\|$.

The analysis of SGD requires the following crucial properties of the loss function $f(\cdot;z)$ at any fixed point $z$, viewed solely as a function of the parameter(s) $w$:



% \textbf{Assumptions on the loss function} The results of \citep{hardt2016train} leverage the following assumptions on the fixed-sample loss function $f(\cdot,z)$:\mayank{I think we should use either $f$ or $F$ throughout.}
% \yikai{Fixed. We can use $f(w,z)$ to represent single loss function and $F(w,S)$ to represent the loss function on set $S$.}
\begin{definition}[$L$-Lipschitz]
A function $f(w)$ is $L$-Lipschitz if $\forall u,v \in \mathbb{R}^{d}$:
$|f(u)-f(v)| \leq L\|u-v\|$.
% In addition, we say $f(w)$ is $L$-lipschitz in domain $\Omega$ if the $L$-lipschitz condition holds $\forall u,v \in \Omega$.
\end{definition}

\begin{definition}[$\beta$-smooth] A function $f(w)$ is $\beta$-smooth if $\forall u,v \in \mathbb{R}^{d}$:
$\|\nabla f(u)- \nabla f(v)\| \leq \beta\|u-v\|$.
\end{definition}

\begin{definition} [$\gamma$-strongly-convex]
A function $f(w)$ is $\gamma$-strongly-convex if $\forall u,v \in \mathbb{R}^{d}$:
\begin{equation*}
f(u)\geq f(v)+ \nabla f(v)^\top [u-v]+\frac{\gamma}{2}\|u-v\|^2.
\end{equation*}
% We say $f(w)$ is convex if $\gamma$
\end{definition}

\begin{definition}[$\rho$-Lipschitz Hessian]
A loss function $f$ has a $\rho$-Lipschitz Hessian if $\forall u, v\in \mathbb{R}^d$, $\|\nabla^2f(u) - \nabla^2f(v)\|\leq\rho\|u-v\|.$
\end{definition}

% \textbf{Algortihmic Stability}
% \mayank{Since algorithm is randomized, so is $A(S)$, and we should remember to put in expectation sign before F}
% \mayank{after fixing the random coins of the algorithm}
\paragraph{Algorithmic Stability:}
Next we define the key concept of \textit{algorithmic stability}, which was introduced by~\citep{bousquet2002stability} and adopted by~\citep{hardt2016train}. Informally, an algorithm is \textit{stable} if its output only varies slightly when we change a single sample in the input dataset. When this stability is \textit{uniform} over all datasets differing at a single point, this leads to an upper bound on the generalization gap. We now flesh this out more formally.
\begin{definition}
Two sets of samples $S, S'$ are twin datasets if they differ at a single entry, i.e., $S = \{z_1,\dots,z_i,\dots,z_n\}$ and $S'=\{z_1,\dots,z_i',\dots,z_n\}$.
\end{definition}

Now, let $\mathcal{A}$ be a (possibly randomized) algorithm which is parameterized by a sample $S$ of $n$ datapoints as $\mathcal{A}(S)$. 
\begin{definition}[Stability]
Define the algorithmic stability parameter $\varepsilon_{\text{stab}}(\mathcal{A},n)$ as
\begin{equation*}
    %\begin{aligned}
   \inf \{\varepsilon:\sup_{{z, S, S'}} \mathbb{E}_{\mathcal{A}}\vert f(\mathcal{A}(S);z)-f(\mathcal{A}(S');z)\vert \leq \varepsilon \}.
   % \end{aligned}
\end{equation*}
\end{definition}
The expectation $\mathbb{E}_{\mathcal{A}}$ factors in the possible randomness of $\mathcal{A}$. For such an algorithm, one can define its expected generalization error as 
\begin{equation*}
GE(\mathcal{A},n)\coloneqq \mathbb{E}_{S,\mathcal{A}}[\mathop{E}_{z\sim \mathcal{D}} [f(\mathcal{A}(S);z)] -\mathop{E}_{z\sim S} [f(\mathcal{A}(S);z)]].
\end{equation*}
We also define a data-dependent stability which is an average stability that was introduced by~\citep{rakhlin2005stability,shalev2010learnability} and was applied for analyzing algorithmic stability of SGD by \citep{kuzborskij2018data}.
\begin{definition}[On-average stability]\label{avg_stab}
Let $\mathcal{D}$ be the data distribution and $w_0$ be the initialized weight. A randomized algorithm $\mathcal{A}$ is ${\widehat{\varepsilon}}_{\text{stab}}(\mathcal{D}, w_0)$-on-average stable if
\begin{equation*}
    \mathbb{E}_{S,S'}\mathbb{E}_{\mathcal{A}} [f(\mathcal{A}(S);z) - f(\mathcal{A}(S');z)] \leq {\widehat{\varepsilon}}_{\text{stab}}(\mathcal{D}, w_0),
\end{equation*}
where $S\stackrel{\text{iid}}{\sim}\mathcal{D}^n$ and $S'$ is its copy with the $i$-th example replaced by an independent sample $z\sim\mathcal{D}$.
\end{definition}

Throughout this paper, we will write ${\varepsilon}_{\text{stab}}$ and ${\widehat{\varepsilon}}_{\text{stab}}$ omitting dependencies that are clear in context.

\noindent\textbf{Stability and generalization:} 
It was proved by~\citet{hardt2016train} that $GE(\mathcal{A},n) \leq \varepsilon_{\text{stab}}(\mathcal{A},n)$. Furthermore, the authors observed that an $L$-Lipschitz condition on the loss function $f$ enforces a uniform upper bound: $\sup_{z\in Z}|f(w;z)-f(w';z)| \leq L\|w-w'\|$. This implies that for a Lipschitz loss, the algorithmic stability $\varepsilon_{\text{stab}}(\mathcal{A},n)$ (and hence the generalization error $GE(\mathcal{A},n)$) can be bounded by obtaining bounds on $\|w-w'\|$. Analogous results for the notion of on-average stability were established by~\citet{kuzborskij2018data}.

Let $w_{t}$ and $w_{t}^{'}$ be the parameters obtained by running SGD on twin datasets $S,S'$ respectively for $t$ iterations. 
% Throughout this paper we focus on 
The \emph{divergence quantity} is defined as $\delta_{t}\coloneqq \mathbb{E}_{\mathcal{A}}\| w_{t}-w_{t}^{'}\|$. While \citet{hardt2016train} report upper bounds on $\delta_{t}$ for different loss functions, e.g., convex and non-convex loss functions, we investigate the tightness of those bounds.

% The algorithmic stability states that the divergence between two models $m,m'$--represented by their parameters $w,w'$ respectively--trained on the twin datasets upper bounds the generalization gap:
% \begin{equation}
%   |E_{z\sim \mathcal{D}} [f(w;z)] -E_{f\sim S} [f(w;z)]|\le \sup_{z}\mathbb{E}_{A}[f(A(S);z)-f(A(S');z)]
% \end{equation}



% \newpage
\section{Main Results} \label{main}
 In this section, we report our main results. We first consider the convex case with constant step size, where we prove 1) that the existing bounds in~\citep{hardt2016train} are tight, and 2) for linear models, we report a data-dependent analysis to show that $\varepsilon_{\text{stab}}$ does not increase with $t$. Then we move on to the non-convex case, where a) for decreasing step size we report a lower bound which suggests that, within a wide range of $T$, the existing bound in~\citep{hardt2016train} is not tight, and we prove a tighter upper bound which matches our lower bound, and b) for constant step size we give loss functions whose divergence $\delta_{t}$ increases exponentially with $t$.
%We construct a family of loss functions which prohibit SGD from being stable. 

\subsection{Convex Case}
In this section we analyze the stability of SGD when the loss function is convex and smooth. We begin with a construction which shows that Theorem 3.8 in ~\citep{hardt2016train} is tight. 
Our lower bound analysis will require the quadratic function
\begin{equation}\label{quad_fn}
    f(w;z)=\frac{1}{2} w^\top Aw - yx^\top w,
\end{equation}
where $A$ is a $d\times d$ matrix. In the construction of lower bounds, we  carefully choose $A$ and $S$ so that the single data point replaced in the twin data set will cause the instability of SGD.
In particular, we will choose $A$ to be a PSD matrix in the convex case in the construction of the lower bound and choose $A$ to be an indefinite matrix with some strictly negative eigenvalues in the non-convex case. We first begin with the following lemma which describes how $\|w_t-w'_t\|$ behaves for functions defined in Equation~\ref{quad_fn}.

\begin{lemma}[Dynamics of divergence] \label{lem1}
Let $f(w;z) = \frac{1}{2}w^\top Aw - yx^\top w$ as in Equation~\ref{quad_fn}. Suppose $[x_i-x_i']/\|x_i-x_i'\|$ is an eigenvector of $A$, i.e., $A[x_i-x_i'] = \lambda_{xx'}[x_i-x_i']$. Let $\Delta_{t}$ be $w_t-w_t'$, $\alpha_t \leq \lambda_{xx'}$ be the step size of SGD and $\Delta_0 =0$. If one  runs SGD on $f(w,S)$ and $f(w,S')$ where $S,S'$ are twin datasets and  ${x'}_i^\top x_j = 0, x_i^\top x_j = 0, \;\forall j \neq i$, then the dynamics of $\Delta_t$ are given by

\begin{equation}
\mathbb{E}_{\mathcal{A}} \|\Delta_{t+1}\| = (1-\alpha_t \lambda_{xx'})\mathbb{E}_{\mathcal{A}}\|\Delta_t\|+\frac{\alpha_t}{n}\|x_i-x_i'\|.
% \|\Delta_t -\alpha_t (\nabla_t f(w_t,z_t)-\nabla_t f(w',z_t') \|    
\end{equation}
\end{lemma}
\begin{remark}
In this work, we assume that the different entry data $x_i,x'_i$ are orthogonal to all other samples. Such a restrictive setting serves as a corner case to prove the tightness, suggesting the necessity of additional assumptions to improve the upper bound. Indeed in Theorem~\ref{unif_indep}, we introduce more realistic assumptions to avoid such corner cases, and the upper bound can be improved accordingly.
\end{remark}
The next lemma recursively applies Lemma~\ref{lem1}. We will carefully choose $\lambda_{xx'}$ in the following lemma for lower bound constructions in the convex and non-convex cases. 
\begin{lemma} [Lower bound on divergence] \label{lem2} 
% Let $\Delta_{t}$ be $w_t-w_t'$, $\alpha_t$ be the step size of SGD and $\Delta_0 = 0$.
% Suppose $[x_i-x_i']/\|x_i-x_i'\|$ is an eigenvector of $A$ where $A[x_i-x_i'] = \lambda_{xx'}[x_i-x_i']$. Running SGD on $f(w,S)$, we have:\\
Let $f(w;z) = \frac{1}{2}w^\top Aw - yx^\top w$ as in Equation~\ref{quad_fn}. Suppose $[x_i-x_i']/\|x_i-x_i'\|$ is an eigenvector of $A$ where $A[x_i-x_i'] = \lambda_{xx'}[x_i-x_i']$. Let $\Delta_{t}$ be $w_t-w_t'$, $\alpha_t \leq \lambda_{xx'}$ be the step size of SGD and $\Delta_0 =0$. If one  runs SGD on $f(w,S)$ and $f(w,S')$ where $S,S'$ are twin datasets and  ${x'}_i^\top x_j = 0, x_i^\top x_j = 0, \;\forall j \neq i$, then we have
\begin{equation*}
    \mathbb{E}_{\mathcal{A}}\|\Delta_{T}\| \geq \frac{\|x_i-x_i'\|}{n} \sum_{t=1}^{T-1} \alpha_t \prod_{\tau=t+1}^{T-1} (1-\alpha_\tau\lambda_{xx'}).
\end{equation*}
%  &= \mathbb{E}\| (I-\alpha_{T-1} A)\Delta_{T-1}+\frac{\alpha_{T-1}}{n}[x_i-x_i']\|
\end{lemma}

Now we can present our tightness results. We begin with the convex case. The main idea of the construction is to leverage Equation~\ref{quad_fn} with specially designed $A$ and $S,S'$ to ensure that $\mathbb{E}_{\mathcal{A}}\|w_T-w'_T\|$ will diverge. However, a quadratic function in general does not satisfy the $L$-Lipschitz condition, which does not match the assumption used to derive the upper bound in~\citep{hardt2016train}. To obtain the $L$-Lipschitz condition, we trim $f(w;S)$ to mimic the Huber loss function~\citep{huber1992robust} so that the smoothness is maintained for the piecewise function.

\begin{thm} [Lower bound for convex losses]\label{CvxLowerBound}
Let $w_t,w_t'$ be the outputs of SGD on twin datasets $S,S'$ respectively. Let $\Delta_{t} = w_t-w_t'$ and $\alpha_t$ be the step size of SGD. There exists a function $f$ which is convex, $\beta$-smooth, and $L$-Lipschitz, and twin datasets $S,S'$ such that
\begin{equation}
 \varepsilon_{\text{stab}} \geq  \frac{L}{2n} \sum_{t=1}^{T} \alpha_t.
\end{equation}
\end{thm}

    % \mathbb{E}\|\Delta_T\| \geq \frac{1}{n} \sum_{t=1}^{T} \alpha_t; \;\;\;

% i.e., for a family of convex but not-strongly-convex loss functions, 

The convex upper bound in Theorem 3.8 of~\citep{hardt2016train} states that $\mathbb{E}_{\mathcal{A}}\|\Delta_T\| \leq \sum_{t=1}^{T} \frac{ \alpha_t L}{n}$, which implies that the divergence increases throughout training. The lower bound in Theorem~\ref{CvxLowerBound} suggests the tightness of the upper bound. However, in practice, this is not commonly observed; the generalization performance does not deteriorate as the number of training iterations increases. Under the $\gamma$-strongly-convex loss function condition, \citet{hardt2016train} provide an $O(\frac{1}{n})$ uniform stability bound, which fits better with empirical observations on classical convex losses. In the next theorem, we show the tightness of the $O(\frac{1}{n})$ bound for strongly-convex losses.
\begin{thm} [Lower bound for strongly-convex losses] \label{strongly_cvx_lowerbound}
Let $w_t,w_t'$ be the outputs of SGD on twin datasets $S,S'$ respectively, $\Delta_{t}$ be $w_t-w_t'$ and $\alpha =\frac{1}{2 \beta}$ be the step size of SGD. There exists a function $f$ which is $\gamma$-strongly-convex and $\beta$-smooth, and twin datasets $S,S'$ such that the divergence and stability of the two SGD outputs satisfies
\begin{equation}
 \varepsilon_{\text{stab}} \geq  \frac{1}{16\gamma n}.
\end{equation}

    % \mathbb{E}_{\mathcal{A}}\|\Delta_T\| \geq  \frac{1}{16\gamma n}; \;\;\;
\end{thm}

% simple regression loss seeking for an abstraction of this family of loss. 

% We wrap up the  section with an example which connects our Linear model loss function notation with \emph{Linear Regression loss function} 



%   even if $f(\cdot)$ is not strongly convex.




% \textbf{Example: Linear  Regression} Classical linear regression minimizes the quadratic loss on $w$: $f(w,S) = \frac{1}{2n}\sum_{x_j\in S} (x_j^\top w-y_j)^2$. One can rewrite the loss function in  terms of $f_y(w^\top x)$ where $f''_y(\cdot) = 1 $.  Note that the Hessian of an individual linear regression loss term is $x_j x_j^\top$ which is not strongly convex 
% Thus, $f''_y(\cdot) \geq \gamma$ is a weaker condition than the $\gamma$-strongly convex condition. Theorem \ref{cvxupperbound} implies that a strongly convex Hessian in expectation also implies good generalization. A similar result can also be derived for the logistic regression loss.  \\ 
% In \citep{hardt2016train}, an $O\left( \frac{L^2}{\gamma n}\right)$ stability bound is derived on a loss function $f(w,S)$ which is strongly convex, i.e., $\nabla^2 f(w) \succ \gamma I$.
% such condition is not universally hold. For example, in the case of Linear Regression with degenerated design matrix, 
Theorem~\ref{strongly_cvx_lowerbound} provides evidence for the tightness of the $O(\frac{1}{n})$ stability bound on SGD. To obtain such stability, the loss function must satisfy $\nabla^2_w f(w;z) \succeq \gamma I_d$ with $\gamma=\Omega(1)$. In general this does not hold, e.g., the Hessian of an individual linear regression loss term is $x_j x_j^\top$, which has rank one, so the loss is not strongly convex. 
One can incorporate a strongly-convex regularizer to impose strong convexity, often resulting in improved generalization performance in practice~\citep{shalev2010learnability,bousquet2002stability}. However, an $O(1)$ regularization term  will bias the loss function away from achieving sufficiently low empirical risk. This motivates us to investigate a weaker condition than strong convexity which still can enforce an $O\left( \frac{1}{n} \right)$ stability, without substantially biasing the loss function.
%  strong convexity. 

% divergence will not increase unboundedly during training.???Yikai will say something here.

In the remainder of this section, we restrict ourselves to a family of linear model loss functions and show that the $O(\frac{1}{n})$ stability results can be obtained under the framework of average stability. The results of Theorem~\ref{cvxupperbound} have a dependence on a property of the distribution, and are thus distribution-dependent.  We begin with the definition of a $\xi$-bounded Second Moment. Essentially, a bounded Second Moment dataset requires an average linear dependence of $Span\{x_1,...,x_n\}$. Recall that the $i$-th sample is of the form $z_{i}=(x_{i},y_{i})$.
% Such condition states that $x$ in $S$ shares a common subspace, which implies that $S$ is 'simple'.




% \mayank{bounded twice}
\begin{definition} \label{self_corr}
A set $S=\{(x_1,y_1),\dots,(x_n,y_n)\}$ is defined to have a $\xi_S$-bounded Second Moment if $\forall v \in \mathrm{Span}\{x_1,\dots,x_n\}$
\begin{equation*}
    v^\top (\frac{1}{n}\sum^{n}_{i=1}x_ix_i^\top)v\geq \xi_S v^\top v.
\end{equation*}

A distribution $\mathcal{D}$ has a $(\xi,n,\mu)$-inversely bounded Second Moment if there exists a constant $\xi>0$ such that
\begin{equation*}
    \mathbb{E}_{S\sim \mathcal{D}^n} \left[ \frac{1}{\xi_S+\mu} \right] \leq \frac{1}{\xi+\mu}.
\end{equation*}
\end{definition}
% $\forall j_1,j_2 \in [n],  x_{j_1}^\top (\frac{1}{n}\sum^{n}_{i=1}x_j  x_j^\top) x_{j_2} \geq \xi x_{j_1}^\top x_{j_2}$ where $\xi>0$
% Assuming that $\forall j \in [n]$, $\|x_j\| \geq r $ for some $r>0$, definition \ref{self_corr} implies that $S$ is at least $\frac{r^2}{n}$-self correlated. Thus the above condition holds for all datasets $S$ not containing the zero-feature vector. 
\begin{remark}
The value of $\xi_S$ is always lower bounded by the minimum nonzero eigenvalue of $\frac{1}{n} \sum_{j} x_j x_j^\top $ which is the empirical second moment of data with size $n$. 

\end{remark}

\begin{prop}[Example of distribution with inversely bounded Second Moment] \label{invers_eigen}

Let $\mathbb{E}_{x\sim \mathcal{D}} [xx^\top ] = \Sigma$ and $\xi$ be the minimum non-zero eigenvalue of $\Sigma$. Suppose $S=\{(x_1,y_1),\dots,(x_n,y_n)\}$ is sampled from $\mathcal{D}$ with $x \in \mathbb{R}^d$ and $\|x\| \leq 1$. Then, there exist universal constants $C,c$ such that if $n\geq \max\{\frac{4C^2d}{\xi^2}, \frac{512}{c\xi^2} \log(\frac{1}{\xi})\}$ and $\mu \geq  \frac{1}{n^4}$, then $\mathcal{D}$ has a $(\frac{\xi}{3},n,\mu)$-inversely bounded Second Moment.
\end{prop}

In our next theorem, we leverage the inversely bounded Second Moment property to prove a non-accumulated on-average stability bound for SGD on \emph{linear models} with a regularized loss function. We characterize a linear model by rewriting the loss function $f(w;z)$ in terms of $f_y(w^\top x)$ where $f_y(\cdot)$ is a scalar function depending only on the inner product of the model parameter $w$ and the input feature $x$. 

% \begin{thm}[Distribution-dependent stability of SGD with inversely bounded Second Moment]\label{cvxupperbound}
%  Suppose a loss function $f(w,z)$ is of the form 
%  \begin{equation*}
%      f(w,S) =\frac{1}{n} \sum_{j=1}^{n} f_{y_j}(w^\top x_j)+\frac{\mu}{2} w^\top w  \;\;; w\in \mathcal{W}
%  \end{equation*}
%  where $f_y(w^\top x)$ satisfies $(1)\; |f_y'(\cdot)| \leq L $ , $(2)\; 0<\gamma \leq f_y''(\cdot) \leq \beta$, (3) $S , S'$ are sampled from $\mathcal{D}$ with $\xi$ be the minimum nonzero eigenvalue of $\mathbb{E}_{x\sim \mathcal{D}} [x x^\top]$ and a uniformly bounded support $\|x\| \leq 1$ 4) $\mu =\Omega(\frac{\gamma}{n^4})$. Let $\mathcal{W}$ be a  convex and compact set, $w_t$ and $w_t'$ be the outputs of SGD on $S$ and $S'$ after $t$ steps, respectively. Let the divergence $\Delta_{t}:=w_t-w_t'$ and $\alpha\leq \frac{\mu
%  }{2\beta^2}$ be the step size of SGD.
% Then,
% \begin{equation*}
%     \mathbb{E}_{S}\mathbb{E}_{\mathcal{A}}\|\Delta_T\| \leq  \frac{12L}{\xi\gamma n}, \ \ \text{and} \ \  \varepsilon_{stab} (\mathcal{D}) \leq \frac{16L^2}{\xi\gamma n}.
% \end{equation*}
% \end{thm}


\begin{thm}[Data-dependent stability of SGD with inversely bounded Second Moment]\label{cvxupperbound}
 Suppose the loss function $f(w,S)$ is of the form \[f(w,S) =\frac{1}{n} \sum_{j=1}^{n} f_{y_j}(w^\top x_j)+\frac{\mu}{2} w^\top w, \quad w\in \mathcal{W},\] where $f_y(w^\top x)$ satisfies (1) $|f_y'(\cdot)| \leq L$, (2) $0<\gamma \leq f_y''(\cdot) \leq \beta$, (3) $S, S'$ are sampled from $\mathcal{D}$, where $\xi$ is the minimum nonzero eigenvalue of $\mathbb{E}_{x\sim \mathcal{D}} [x x^\top]$ and $\mathcal{D}$ has a uniformly bounded support $\mathcal{X}\subset \mathbb{R}^d$ with $\|x\| \leq 1$ for all $x\in\mathcal{X}$, and (4) $\mu \geq \frac{\gamma}{n^4}$. Let $\mathcal{W}$ be a convex and compact set, and let $w_t$ and $w_t'$ be the outputs of SGD on $S$ and $S'$ after $t$ steps, respectively. Let the divergence be $\Delta_{t}\coloneqq w_t-w_t'$ and let $\alpha\leq \frac{\mu}{2\beta^2}$ be the step size of SGD.
 There exist universal constants $C,c$ such that if $n\geq \max\{\frac{4C^2d}{\xi^2}, \frac{512}{c\xi^2} \log(\frac{1}{\xi})\}$, then
\[\mathbb{E}_{S}\mathbb{E}_{\mathcal{A}}\|\Delta_T\| \leq  \frac{12L}{\xi\gamma n}, \quad \text{and} \quad \widehat{\varepsilon}_{\text{stab}} (\mathcal{D}) \leq \frac{16L^2}{\xi\gamma n}.\]
\end{thm}

\begin{remark}

The inversely bounded Second Moment condition allows SGD to maintain an average stability guarantee for a family of widely used models with a negligible regularizer and large sample size. The theorem suggests that if the dataset $S$ is sampled from a `good' distribution, one can obtain an advanced generalization property which mainly depends on the distribution. The theorem also justifies the common choice of small values for the weight in the $\ell_2$-regularizer (also known as \textit{weight decay}) when training ridge regression type models. Note that the term $\frac{\mu}{2} w^\top w$ makes the loss function strongly convex, and an $O \left( \frac{1}{n} \right)$ stability bound is established with $\mu = O (1)$ in~\citep{hardt2016train}. The major difference of Theorem~\ref{cvxupperbound} is that the \textit{weight of the $\ell_2$ penalty} $\mu$ is $O\left( \frac{1}{n^4}\right)$ for uniformly bounded $x$. A small value of $\mu$ will not bias the original loss function, thus allowing SGD to sufficiently minimize the empirical risk. Instead of leveraging the $\ell_2$ penalty, the stability of SGD is obtained from the `nice' property of the distribution.
%  The theorem also provides conditions that makes generalization rate faster than $O\left( \frac{1}{\sqrt{n}}\right)$. 

% $x_i$ is 
% every $x_i$ lies in a low dimensional subspace, the algorithmic stability of SGD is comparable with a strongly convex loss function. This analysis suggests an alternative condition other can strong convexity can empower SGD an $O(1)$ stability which is Distribution-dependent. This motivates us to go beyond linear model and seek for a generalized condition in Theorem \ref{cvxupperbound}. In section \ref{geom-local-min}, we propose the Hessian Contractive condition  for more general loss function driven by the observation on the linear model.
%  We wrap up the  section with an example which connects our Linear model loss function notation with \emph{Linear Regression loss function} .
% \emph{for a certain family of loss functions of linear models, the algorithmic stability of SGD has an uniform upper bound}. 
% an potential explanation w.r.t aforementioned observations: 
% \frac{1}{n}
\end{remark}
 
 
%  which is \textit{not strongly convex} since it has rank $1$. 
 
% \textbf{Example: Linear regression} minimizes the quadratic loss on $w$: $f(w;S) = \frac{1}{2n}\sum_{x_j\in S} (x_j^\top w-y_j)^2$. Note that the Hessian of an  linear regression loss  is $\frac{1}{n} \sum_{j=1}^{n} x_j x_j^\top$.

% \textbf{Example: Linear regression.} Linear regression minimizes the quadratic loss on $w$: $f(w,S) = \frac{1}{2n}\sum_{x_j\in S} (x_j^\top w-y_j)^2$. Note that the Hessian of an individual linear regression loss term is $x_j x_j^\top$ which is \textit{not strongly-convex}. However, one can rewrite the loss function as $f_y(w^\top x)$ where $f''_y(\cdot) = 1 $. Hence Theorem~\ref{cvxupperbound} can be applied to give a distribution-dependent bound on the stability of SGD in above example.
\textbf{Example: Linear regression.} Linear regression minimizes the quadratic loss on $w$: $f(w,S) = \frac{1}{2n}\sum_{x_j\in S} (x_j^\top w-y_j)^2, w\in \mathcal{ W}$, where $\mathcal W$ is a convex compact set that contains the origin and has bounded radius $R$.   The Hessian of an individual linear regression loss term is $x_j x_j^\top$, which has rank one, so the individual loss is \textit{not strongly convex}. However, one can rewrite the loss function as $f_y(w^\top x)$ where $f''_y(\cdot) = 1 $. Next we 
present certain conditions that are sufficient to make $|f_y'(\cdot)|\leq L$. We assume $\|x_i\|= 1, y_i\in[-1,1], \forall i\in[n]$. Let $\Pi_{\mathcal{W}}(v) = 
\argmin_{w\in \mathcal{W}} \|w-v\|$. Note that SGD updates as $w_{t+1} = \Pi_{\mathcal{W}} \bigl( w_t -\alpha_t (x_j^\top w_t - y_j)x_j \bigr).$ One can show that $\sup_{w \in \mathcal{W} }\sup_{(x,y) \in S} |f_y'(w^\top x)| \leq R+1$.


% A common practice for 
% One cannot apply the strongly convex bound, and the bound for convex suggests stability will increase linearly. However, one can rewrite the loss function as $f_y(w^\top x)$ where $f''_y(\cdot) = 1 $. Hence Thm.~\ref{cvxupperbound} can be applied to give a non-accumulative bound on SGD's stability. A similar result can be derived for the \textit{logistic regression} loss. 
% \textbf{ADD EXAMPLES logistic reg, Linear Reg loss}
% Theorem \ref{cvxupperbound} implies that a strongly convex Hessian in expectation also implies good generalization.
% \textbf{Show why this is different (more general) from Hardt's result on strongly convex functions}

\subsection{Non-Convex Case}
In this section, we construct a non-convex loss function to analyze the tightness of the divergence bound in \citep{hardt2016train}.  We first focus on the case where SGD applies a step size that \textit{decreases with $t$}. Define a \emph{hitting time} to be the time $t$ that satisfies $w_{t-1}-w_{t-1}^{'} = 0$ and $w_{t}-w_{t}^{'} \neq 0$. We first fix a hitting time $t_{0}$ and prove Lemma~\ref{thm2}.
\begin{lemma} [Divergence of non-convex loss function]\label{thm2}
There exists a function $f$  which is non-convex and $\beta$-smooth,  twin datasets $S,S'$ and constant $a>0$ such that the following holds: if SGD is run using step size $\alpha_t = \frac{a}{0.99 \beta t}$ for $1 \leq t < T$, and $w_t, w_t'$ are the outputs of SGD on $S$ and $S'$, respectively, and $\Delta_{t}=w_t-w_t'$, then
$
    \forall 1 \leq t_0 \leq T,\ \ \ \ 
    \mathbb{E}_{\mathcal{A}}\left[ \|\Delta_T\| | \Delta_{t_0} \neq 0  \right] \geq \frac{1}{2n} \left(\frac{T}{t_0}\right)^a.
$
\end{lemma}
The following theorem follows from Lemma~\ref{thm2} by optimizing over $t_{0}$. The choice of hitting time $t_0$ plays an important role in the analysis, which is also illustrated in the ``burn-in Lemma'' 3.11 in \citep{hardt2016train}.




% \begin{thm}
\begin{thm}[Lower bound for non-convex loss functions]\label{thm3}
Let $w_t, w_t'$ be the outputs of SGD on twin datasets $S,S'$, and $\Delta_{t}=w_t-w_t'$. There exists a function $f$ which is non-convex and $\beta$-smooth, twin datasets $S,S'$ and a constant $a<0.1$ such that the divergence of SGD after $T>n$ rounds using the decreasing step size $\alpha_t = \frac{a}{0.99\beta t}$ satisfies
% \begin{equation}
%     \mathbb{E}_{\mathcal{A}}\|\Delta_T\| \geq \frac{T^a}{3n^{1+a}}
% \end{equation}

\begin{equation}\varepsilon_{\text{stab}} \geq \frac{T^{a}}{6n^{1+a}}.\end{equation}

\end{thm}

% \textbf{Proof}:
% The proof is based on Theorem \ref{thm2} plus the idea of a ``burn-in" period. We have:\\
% \begin{equation}
%     \begin{aligned}
%     \|\Delta_T\| &= \mathbb{E}[\|w_t-w_t'\|| \Delta_n = 0]\mathbb{P}[\Delta_n =0]+ \mathbb{E}[\|w_t-w_t'\|| \Delta_n \neq 0]\mathbb{P}[\Delta_n \neq 0]\\
%     \geq & \mathbb{E}[\|w_t-w_t'\|| \Delta_n \neq  0]\mathbb{P}[\Delta_n \neq  0]\\
%     = & (1-(1-\frac{1}{n})^n)\frac{T^a}{n^{1+a}}\|x_i-x_i'\|\\
%     \geq& \frac{T^a}{3n^{1+a}}\|x_i-x_i'\|
%     \end{aligned}
% \end{equation}

% \qed

\begin{remark}
In the above theorem, we require $\alpha_t = \frac{a}{0.99\beta t}$ with an extra constant factor $\frac{1}{0.99}$ to apply the inequality $1+\frac{a x}{0.99} > e^{ax}$ with sufficiently small $a$. To remove the constant $1/0.99$ in the learning rate, one needs to avoid using the inequality $1+x< e^x$ in the first place when deriving the upper bound. This can be done by a refined analysis of the upper bound that sets the learning rate to $\alpha_t = \frac{e^{\frac{a}{t}} -1}{\beta}$.
\end{remark}
\begin{remark}
Note that in~\citep{hardt2016train}, an assumption is made on the non-convex loss function, namely that $f(u, z) \in (0, 1)$. In our lower bound construction, we do not make such an assumption; thus our lower bound cannot be directly compared with the upper bound in~\citep{hardt2016train}. The bound in~\citep{hardt2016train} is of the form $O\left( \frac{T^{\frac{a}{1+a}}}{n}\right)$, so for $T^{\frac{a}{1+a}} \geq n$ our lower bound will exceed the upper bound in~\citep{hardt2016train}. Nevertheless, such a gap suggests that even with the additional assumption in~\citep{hardt2016train}, the upper bound may still not be tight. The lower bound is derived by choosing the hitting time $t_0<n$, i.e., the first time SGD picks the different entries $z,z'$ in the twin dataset is before round $n$, suggesting additional room for improvement in the analysis. We investigate this gap and derive a tighter bound in the next theorem which improves on Theorem 3.12 in~\citep{hardt2016train}.
\end{remark}
% \begin{remark}
% The lower bound is derived by  choosing $t_0=n$ in Lemma~\ref{thm2}. The bound in \citep{hardt2016train} is of the form $O\left( \frac{T^{\frac{a}{1+a}}}{n}\right)$ which does not match the above lower bound.  According to the lower bound provided in Theorem \ref{thm3},
% the bound in \citep{hardt2016train} may not be tight in the region $T^{\frac{a}{1+a}} \leq n$. We investigate this gap and derive a tighter bound in the next theorem which improves on Theorem 3.12 in~\citep{hardt2016train}.
% % Note in ~\citep{hardt2016train}, an assumption is made on the non-convex loss function, namely that $f(u, z) \in (0, 1)$. We avoid such assumptions on the function used in proving the lower bound in Theorem~\ref{thm3}.  Neither do the upper bounds that we will report in next section. Therefore, for very large $T$, the scale of strongly non-convex function will exceed $1$ thus our lower bound may exceed the upper bound in ~\citep{hardt2016train}, and in general is incomparable due to the lack of this assumption. 
% \end{remark}


To prove a better upper bound for non-convex losses, we need the following lemma, which gives us the expectation of divergence for a given hitting time $t_k + 1$, which is the timestamp of  SGD first selecting the $k$-th different sample.

\begin{lemma} \citep{hardt2016train}\label{lem_SGD}
	Assume $f$ is $\beta$-smooth and $L$-Lipschitz. Let $w_t, w_t'$ be outputs of SGD on twin datasets $S,S'$ respectively after $t$ iterations and let $\Delta_{t}=[w_t-w_t']$ and $\delta_t = \mathbb{E}\|\Delta_t\|$. Running SGD on $f(w;S)$ with step size $\alpha_t = \frac{a}{\beta t }$ satisfies the following conditions:
	\begin{itemize}
		\item 
		The SGD update rule is a $(1+\alpha_t \beta)$-expander and $2\alpha_t L$-bounded. 
		\item 
		$\mathbb{E}_{\mathcal{A}}[ \|\Delta_{t}\| | \Delta_{t-1} ]  \leq \left(1+\alpha_t\beta \right)\|\Delta_{t-1}\| +\frac{2\alpha_tL}{n}$.
		\item 
		$\mathbb{E}_{\mathcal{A}}[ \|\Delta_{T}\| | \Delta_{t_{k}}=0 ] \leq \big(\frac{T}{t_{k}}\big)^{ a}\frac{2L}{ n}$.
	\end{itemize}
\end{lemma}

% ========== Lemma 5 ===========


% (Uniformly Independent)
% \begin{lemma} \label{lem_prob_rule}
% 	Let $w_t, w_t'$ be outputs of $SGD$ on twin datasets $S,S'$  after $t$ iterations and let $\Delta_{t}=w_t-w_t'$. Suppose that $t_k = ct_{k-1}$. Then the following conditions hold:
	
% 	\begin{itemize}
% 		\item 
% 		$\mathbb{P}[ \Delta_{t_k-1}=0| \Delta_{t_{k}}\neq 0 ]\leq \frac{n}{n+t_{k-1}}$.
% 		\item 
% 		$\mathbb{P}[ \Delta_{t_k-1} \neq0| \Delta_{t_{k}}\neq 0 ]\leq \frac{1}{c}\left(1+\frac{t_k}{n}\right)$.
% 		\item
% 	    	$\mathbb{E}_{\mathcal{A}}[ \|\Delta_{T}\|| \Delta_{t_{k}}\neq 0 ]$\\
% 		$\leq \frac{1}{c}\left(1+\frac{t_k}{n}\right)\mathbb{E}_{\mathcal{A}}[ \|\Delta_{T}\| | \Delta_{t_{k-1}}\neq 0 ] 
% 		    	+\big(\frac{T}{t_{k-1}}\big)^{ a}\frac{2L}{ n}$.
% % 		\begin{equation*}
% % 		    \begin{aligned}
% % 		    	&
% % 		    \end{aligned}
% % 		\end{equation*}
		
% % 		$.
% 		%		\begin{equation}
% 		%		\begin{aligned}
% 		%		&\mathbb{E}[ \Delta_{T} | \Delta_{t_{k}}\neq 0 ]\\
% 		%		&\leq \mathbb{E}[ \Delta_{T} | \Delta_{t_{k-1}}\neq 0 ]  \frac{t_{k-1}}{t_k}(1+\frac{t_k}{n})\\
% 		%		&+(\frac{T}{t_{k-1}})^{   c}\frac{2L}{n+t_{k-1}}
% 		%		\end{aligned}
% 		%		\end{equation}  
% 	\end{itemize}
% \end{lemma}


% Lemma~\ref{lem_SGD} bounds the case when the hitting time is equals to $t_k$ and Lemma~\ref{lem_prob_rule} gives an upper bound for the complementary case. Therefore, we could obtain the upper bound for the stability of uniform sampling SGD as follows:
Lemma~\ref{lem_SGD} bounds the case when the hitting time is equal to $t_k$. In the proof of the non-convex stability upper bound by~\citet{hardt2016train}, $t_k$ is chosen to be $T^{\frac{a}{1+a}}$. However, we observe that choosing $t_k$ with additional care in the analysis leads to an improved upper bound. We thereby obtain the following upper bound on the stability of uniform sampling SGD:
\begin{thm}[Uniform sampling SGD]\label{unif_indep}
    Assume $f$ is $\beta$-smooth and $L$-Lipschitz. Running $T>n$ iterations of SGD on $f(w;S)$ with step size $\alpha_t = \frac{a}{\beta t }$, the stability of SGD satisfies
% 	\begin{equation}
% 	\mathbb{E}_{\mathcal{A}} \|\Delta_T\|   \leq 16 \log(n)L\frac{T^a}{n^{1+a}},
% 	\end{equation}
    \begin{equation*}
        \varepsilon_{\text{stab}} \leq  \frac{16L^2T^a}{n^{1+a}}.
    \end{equation*}
\end{thm}
 
	

% can be found in the Appendix.
We remark that the above analysis is for uniform sampling SGD, where the algorithm keeps sampling with replacement. We also derive a version of Theorem~\ref{unif_indep} which samples without replacement in the appendix, which also matches the lower bound. Dividing our bound by the bound in Theorem 3.12 of~\citep{hardt2016train}, we obtain the ratio $ \tilde \Omega \left( \frac{T^{\frac{a^2}{1+a}}}{n^a}\right)$. This factor is less than 1 (and so we improve the upper bound) exactly when $T^{\frac{a}{1+a}} \leq n$. Note that this is potentially a large range as $a$ is a small and positive constant.  

% \subsubsection{Tight Upper Bound}

% However, our analysis of SGD on nonconvex function relies on a decreasing step size.
% \begin{thm}[Tight Upper Bound]\label{tight_bound}
% 	Assume $f$ is $\beta$-smooth and $L$-lipschitz. Running $T$ ($T>n$) iterations of SGD  on $f(w;S)$ with decreasing step size $\alpha_t = \frac{a}{\beta t }$, the stability of SGD satisfies:

% 	\begin{equation}
% 	\mathbb{E}\|\Delta_T\|   \leq 16 \log(n)L\frac{T^a}{n^{1+a}}, \ \ \text{and} \ \  \varepsilon_{stab} \leq  16\log(n)L^2\frac{T^a}{n^{1+a}}.
% 	\end{equation}
% \end{thm}
% \begin{thm} \label{tight_bound}
% 	Assume $f$ is $\beta$-smooth and $L$-lipschitz. Running $T$ ($T>n$) iterations of  SGD  on $f(w;S)$ with step size $\alpha_t = \frac{a}{\beta t }$, the stability of SGD satisfies:\\ 
% 	\begin{equation}
% 	\mathbb{E}\|\Delta_T\|   \leq\frac{ 2L T^a}{n^{1+a}},\text{ } \varepsilon_{stab} \leq  \frac{2L^2T^a}{n^{1+a}}.
% 	\end{equation}
% \end{thm}

 In~\citep{kuzborskij2018data}, the data-dependent stability of SGD is analyzed, incorporating the dependence on the variance of SGD  curvature and the loss of the initial parameter $w_0$ in analyzing the divergence of SGD. This framework has applications in transfer learning, as well as implications including optimistic generalization error. We observe that our analysis in Theorem~\ref{unif_indep} can be combined with the data-dependent framework, and we now report our data-dependent version of Theorem~\ref{unif_indep}.

 %In~\citep{kuzborskij2018data}, the data-dependent stability of SGD is analyzed, incorporating the dependence on the variance of SGD  curvature and the loss of the initial parameter $w_0$ in analyzing the divergence of SGD. This framework has applications in transfer learning, as well as implications including optimistic generalization error. We observe that our analysis in Theorems ~\ref{permutation} and~\ref{unif_indep} can be combined with the data-dependent framework, and we now report our data-dependent versions of Theorems~\ref{permutation} and~\ref{unif_indep}.
 The analysis requires the additional bounded variance assumption for SGD such that
 \[
\mathbb{E}_{S, z}\left[\|\nabla f(w_{t}; z) - \nabla \mathbb{E}_{z}(f(w_{t}; z))\|^2\right] \leq \sigma^2,\;\;\;\; \forall t.
\]
 % Definition of \sigma
 In the rest of this section we assume the variance of SGD satisfies this property.
% Under on-average stability definition, \citep{kuzborskij2018data} proved the following lemma under $\rho$-Lipschitz Hessian assumption and following bounded variance assumption for SGD.

We borrow the following lemma from~\citet{kuzborskij2018data}, which is a data-dependent version of Lemma~\ref{lem_SGD}.


\begin{lemma} \citep{kuzborskij2018data}\label{lem_SGD_data_dep}
    Assume $f$ is $\beta$-smooth, $L$-Lipschitz, and has a $\rho$-Lipschitz Hessian. With $w_0$ the initial weight and $w_t$, $w_{t'}$ the outputs of SGD on twin datasets $S,S'$ respectively after $t$ iterations, let $\Delta_t =[w_t - w_{t'}]$. Running SGD on $f(w;S)$ with step size $\alpha_t = \frac{b}{t}$ where $b \leq \min\{\frac{2}{\beta}, \frac{1}{8\beta^2\ln T^2}\}$ has the following properties:
    \begin{enumerate}
        \item The SGD update rule is a $(1+\alpha_t\psi_t)$-expander and $\alpha_t L$-bounded. Here $\psi_t = \min\{\beta, \kappa_t\}$ where %$\kappa_t$ follows the definition in Lemma 6 in Lampert et al 2018.
        
        $\kappa_t = \|\nabla^2 f(w_0;z_t)\|_2 + \frac{\rho}{2}\|\sum_{k=1}^{t-1}\alpha_k\nabla f(w_{S, k}; z_k)\|$ 
        
        $\;\;\;\;\;\;\;\;+ \frac{\rho}{2}\|\sum_{k=1}^{t-1}\alpha_k\nabla f(w_{S^{'}, k}; z_k)\|.
        $
        
        \item $\mathbb{E}_{\mathcal{A}}[\|\Delta_{t+1}\||\Delta_{t_0}=0] \leq $
        
        $\;\;\;\;\{\mathbb{E}_{\mathcal{A}}[\|\Delta_{t}\||\Delta_{t_0}=0][1+(1-\frac{1}{n})\alpha_t\psi_t]\} + \frac{2\alpha_tL}{n}.$
        
        \item
        $\mathbb{E}_{S,S^{'}}\{\mathbb{E}_{\mathcal{A}}[\|\Delta_{T}\||\Delta_{t_0}=0]\} \leq \frac{L}{n}\left(\frac{T}{t_0}\right)^{\zeta b}$, where
        
            $\zeta  = \tilde{O}(\min\{\beta, \mathbb{E}_z[\|\nabla^2f(w_0; z)\|_2] + \Delta^*_{1, \sigma^2}\})$,
            
            $\Delta^*_{1, \sigma^2}   = \rho(b\sigma + \sqrt{b\mathbb{E}_z[f(w_0; z)] - k^*})$,
            
            and $k^*  = \inf_{w}\mathbb{E}_z[f(w; z)].$
           
    \end{enumerate}
\end{lemma}

% Using same proof technique as theorem \ref{permutation} we could prove an upper bound for on-average stability with permutation SGD.


%Dividing our stability bound by the result in Theorem 4 of~\citep{kuzborskij2018data}, we could obtain the ratio $\Omega(T^{\frac{(\zeta b)^2}{1+\zeta b}}/(\mathbb{E}_{S,\mathcal{A}}[f(w_T;S)]^\frac{1}{1+ \zeta b}n)^{\zeta b})$. This factor is less than 1 when $T^{\frac{\zeta b}{1+\zeta b}}<\mathbb{E}_{S,\mathcal{A}}[f(w_T;S)]^\frac{1}{1+ \zeta b}n$. Since $b\leq \min\{2/\beta, 1/(8\beta^2\ln T^2)\}$ and $\zeta$ is bounded above by $\beta$, and $\mathbb{E}_{S,\mathcal{A}}[f(w_T;S)]$ is usually $\Theta(1)$, within a large range of $T$ we have a polynomial improvement over Theorem 4 of~\citep{kuzborskij2018data}.

%  and also improves Corollary 2 from $O\left( \frac{1}{m} \max \left \{ \right\}\right)$
%The following lemma is a direct application of Lemma \ref{permutation}. It is also an on-average extension of Lemma~\ref{lem_prob_rule} part 3.
% \begin{lemma}[Data-dependent version of Lemma~\ref{lem_prob_rule}]\label{lem_prob_rule_new}
% 	Let $w_t, w_t'$ be outputs of $SGD$ on twin datasets $S,S'$ respectively after $t$ iterations and let $\Delta_{t}=w_t-w_t'$. And let $b$, $\zeta$ be as in Lemma~\ref{lem_SGD_data_dep}. Suppose that $t_k = ct_{k-1}$. Then the following condition holds:
% 	\begin{equation}
% 	    \begin{aligned}
% 	     & \mathbb{E}_{S,S^{'}}\mathbb{E}_{\mathcal{A}}[ \|\Delta_{T}\|| \Delta_{t_{k}}\neq 0 ]
% 		\leq \big(\frac{T}{t_{k-1}}\big)^{\zeta b}\frac{L}{\zeta n} \\
% 		& \;\;\;\; 
% 		 + \mathbb{E}_{S, S{'}}\mathbb{E}_{\mathcal{A}}[ \|\Delta_{T}\| | \Delta_{t_{k-1}}\neq 0 ] \frac{1}{c}\left(1+\frac{t_k}{n}\right). 
% 	    \end{aligned} 
% 	\end{equation}
		
% \end{lemma}

 Based on the above lemma, we can prove an upper bound on the on-average stability of SGD with uniform sampling, using the same technique as for Theorem~\ref{unif_indep}.
\begin{thm}[Data-dependent version of Theorem~\ref{unif_indep}] \label{unif_dep}
    Assume $f$ is $\beta$-smooth, $L$-Lipschitz, and has a $\rho$-Lipschitz Hessian. Let $w_t,w_{t'}$ be the outputs of SGD on twin datasets $S,S'$ respectively after $t$ iterations, and let $\Delta_t =[w_t - w_{t'}]$ and $\delta_t = \mathbb{E}_{\mathcal{A}}\|\Delta_t\|$. Let $\zeta$ be defined as in Lemma~\ref{lem_SGD_data_dep}. Then running SGD on $f(w;S)$ with step size $\alpha_t = \frac{b}{t}$, where $b < 1$, satisfies
	%Let $f$ satisfies the assumptions in Theorem \ref{permutation_new}, we have the followings properties:
    \begin{equation}
	{\widehat{\varepsilon}}_{\text{stab}} \leq  \frac{16 L^2T^{\zeta b}}{\zeta n^{1+\zeta b}}.
%\mathbb{E}_{S,S^{'}} [\delta_T]   \leq \frac{16 \log(n)LT^{\zeta b}}{\zeta n^{1+\zeta b}}, \;\;\;\;
	\end{equation}
\end{thm} 
We conclude this section with the following lower bound on the uniform stability of SGD with constant step size for non-convex loss functions. We show that for non-convex functions satisfying the classical $\beta$-smoothness condition, we cannot avoid a pessimistic bound. Thus, in order to analyze the generalization power of SGD for deep learning loss functions from an optimization perspective, different conditions are necessary.
\begin{thm}  \label{noncvx_lowerbound}
Let $w_t, w_t'$ be the outputs of SGD on twin datasets $S,S'$, and let $\Delta_{t}=w_t-w_t'$. There exist a non-convex, $\beta$-smooth function $f$, twin sets $S,S'$, and constants $a,\gamma$ such that the divergence of SGD after $T>n$ rounds using constant step size $\alpha = \frac{a}{0.99 \gamma }$ satisfies $\varepsilon_{\text{stab}} \geq e^{aT/2}/n^2$.
% \begin{equation}
%     \mathbb{E}_{\mathcal{A}}\|\Delta_T\| \geq \frac{1}{n^2}e^{aT/2}
% \end{equation}
% \begin{equation}
%     % \mathbb{E}\|\Delta_T\| \geq \frac{1}{n^2}e^{aT/2}
%     \varepsilon_{stab} \geq \frac{1}{n^2}e^{aT/2}
% \end{equation}
\end{thm}
\section{Conclusion and Future Work}\label{conclusion}

% This paper studied stability bounds for different types of loss functions. For the convex case \textcolor{red}{This may not be very precise. We are not proving better bounds for general convex function, we focus on the linear model}, we proved better upper bounds and proved the tightness of various bounds. For the non-convex case, we presented a tighter upper bound for certain ranges of SGD update steps. We also investigated the data-dependent versions of stability bounds and showed that we could obtain a tighter upper bound for on-average stability.
We first provided matching upper and lower data-independent bounds on the stability of SGD for three kinds of loss functions: convex, strongly convex, and non-convex, essentially closing the gap in all cases. We then provided stronger data-dependent generalization bounds for both convex and non-convex loss functions by analyzing on-average stability, showing that nice properties of data can both improve generalization and also reduce the need for regularization. At least two interesting open questions arise from our work: a) Can one obtain data-dependent lower bounds on on-average stability that show the tightness of existing analysis? b) Can one devise properties of data distributions or loss functions (perhaps motivated by deep learning) that imply better data-dependent stability bounds?

%\bibliographystyle{plainnat}
\begin{acknowledgements} % will be removed in pdf for initial submission,
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
    We thank anonymous reviewers for their constructive feedback.
    Mayank Goswami would like to acknowledge support from NSF awards CRII-1755791 and CCF-1910873. Chao Chen was partially supported by grants
NSF IIS-1909038 and CCF-1855760.

    
    

\end{acknowledgements}

\bibliography{zhang_654.bib}

\end{document}
