% \vspace{-3mm}
\section{Preliminary}
% \vspace{-2mm}
% Unless otherwise noted,  a random variable will be denoted by a capitalized letter, and  its realization is denoted by the corresponding lower-case letter. 
% We use $\mathbb{E}^Y[X] = \mathbb{E}[X|Y]$ as the the conditional expectation.
% For random variables $X$ and $Y$, we denote $\mathbb{E}^Y[X] = \mathbb{E}[X|Y]$ as the the conditional expectation.

\paragraph{Notation}
% Unless otherwise noted, a random variable will be denoted by a capitalized letter, and  its realization by the corresponding lower-case letter. 
The distribution of a random variable $X$ is denoted by $P_X$ (or $Q_X$), and the conditional distribution of $X$ given $Y$ is denoted by $P_{X|Y}$. When conditioning on a specific realization $y$, we use the shorthand $P_{X|Y=y}$ or simply $P_{X|y}$.
Denote by $\mathbb{E}_{X}$ expectation over $X \sim P_X$, and by $\mathbb{E}_{X|Y=y}$ (or $\mathbb{E}^y_{X}$) expectation over $X \sim P_{X|Y=y}$. We may omit the subscript of the expectation when there is no ambiguity.
% The entropy of a random variable $X$ is denoted by $H(X)$, and 
The KL divergence of probability distribution $Q$ with respect to $P$ is denoted by $\mathrm{D_{KL}}(Q||P)$.
The mutual information (MI) between random variables $X$ and $Y$ is denoted by $I(X;Y)$, and the conditional mutual information between $X$ and $Y$ given $Z$ is denoted by $I(X;Y|Z)$. In addition, for a matrix $A\in\mathbb{R}^{d\times d}$, we let $\tr{A}$ denote the trace of $A$ and we use $\tr{\log{A}}$ to indicate $\sum_{k=1}^d\log{A_{k,k}}$.

% $ $\tr{\mathrm{diag}\{\log{A_{1,1}},\log{A_{2,2}}, \dots, \log{A_{d,d}}\}}=\sum_{k=1}^d\log{A_{k,k}}$.
% \textcolor{red}{trace of log of a matrix} 
% We also define the disintegrated mutual information as $I^z(X;Y) \triangleq \mathrm{D_{KL}}(P_{X,Y|Z=z}||P_{X|Z=z}P_{Y|Z=z})$, following the notation in \citep{negrea2019information}. Note that $I(X;Y|Z) = \mathbb{E}_{Z}[I^Z(X;Y)]$.

% \subsection{Expected Generalization Error}
\paragraph{Expected Generalization Error}
Let $\mathcal{Z}$ be the instance space and let $\mu$ be an unknown distribution on $\mathcal{Z}$, specifying the random variable $Z$. We
let ${\mathcal W}\subseteq \mathbb{R}^d$ be the space of hypotheses. In the information-theoretic analysis framework, there is a training sample $S=\{Z_i\}_{i=1}^n$ drawn i.i.d. from $\mu$ and a stochastic learning algorithm $\mathcal{A}$ takes the training sample $S$ as its input and outputs a hypothesis $W\in \mathcal{W}$ according to some  conditional distribution 
$Q_{W|S}$. 
% mapping ${\mathcal Z}^n$ to ${\cal W}$. 
Let $\ell: \mathcal{W}\times\mathcal{Z}\rightarrow \mathbb{R}^{+}$ be a loss function, where $\ell(w, z)$ measures the ``unfitness'' or ``error'' of any $z\in {\mathcal Z}$ with respect to a hypothesis $w\in {\mathcal W}$. The goal of learning is to find a hypothesis $w$ that minimizes the population risk, which, for any $w\in {\mathcal W}$, is defined as
$
L_\mu(w) \triangleq \mathbb{E}_{Z\sim \mu}[\ell(w,Z)]
$.
 In practice, since $\mu$ is only partially accessible via the sample $S$, we instead use the empirical risk, 
 %as a proxy of the population risk, which is
 defined as
$
L_S(w) \triangleq \frac{1}{n}\sum_{i=1}^n \ell(w,Z_i)
$.
Then, the expected generalization error of %the learning algorithm 
$\mathcal{A}$ is defined as 
$
\mathcal{E}_\mu(\mathcal{A})\triangleq\mathbb{E}_{W,S}[L_\mu(W)-L_S(W)]
$,
where the expectation is taken over $(S,W)\sim\mu^n\otimes Q_{W|S}$.

%(adopting the usual notion ``surrogate loss'' \cite{shalev2014understanding})
Throughout this paper, 
% we take $\ell$ as a continuous function (adopting the usual notion ``surrogate loss'').
% Additionally, 
we assume that $\ell$ is differentiable almost everywhere with respect to $w$.
% In some cases we will assume that $\ell(w, Z)$ is $R$-subgaussian
% \footnote{A random variable $X$ is $R$-subgaussian if for any $\rho\in \mathbb{R}$, $\log {\mathbb E} \exp\left( \rho \left(X- {\mathbb E}X\right) \right) \le \rho^2R^2/2$.}
% for any $w\in\mathcal{W}$. Note that a bounded loss is guaranteed to be subgaussian. 
% for example, if $\ell(w, Z) \in [0,M]$ then $R=M/2$. 
%To simplify notations, whenever $w$ can be clearly inferred from the context, 
In addition, we will denote $\ell(w,Z_i)$ by $\ell_i$ when there is no ambiguity.
% \vspace{-2mm}

% \subsection{SGD and SDE}
\paragraph{SGD and SDE}
At each time step $t$, given the current state $W_{t-1}=w_{t-1}$, let $B_t$ be a random subset that is drawn uniformly from  $\{1,2,\dots,n\}$ and $|B_t|=b$ is the batch size. Let $\widetilde{G}_{t}\triangleq\frac{1}{b}\sum_{i\in B_t}\nabla \ell(w_{t-1},Z_i)$ be the mini-batch gradient. The SGD updating rule with learning rate $\eta$ is then
\begin{eqnarray}
 W_{t} = w_{t-1} - \eta \widetilde{G}_{t}.
 \label{eq:sgd-update}
\end{eqnarray}
The full batch gradient is $G_t\triangleq \frac{1}{n}\sum_{i=1}^n\nabla \ell_i$. 
%Then Eq.  (\ref{eq:sgd-update}) becomes 
It follows that
\begin{eqnarray}
 W_{t} = w_{t-1} - \eta G_t + \eta V_t,
\label{eq:sgd-update-2} 
\end{eqnarray}
where $V_t \triangleq G_t-\widetilde{G}_{t}$ is the mini-batch \textit{gradient noise}. Since $\ex{B_t}{V_t}=0$, $\widetilde{G}_{t}$ is an unbiased estimator of the full batch gradient $G_t$. 
Moreover, the single-draw (i.e., $b=1$) SGD gradient noise covariance (GNC) and the mini-batch GNC are $\Sigma_t=\frac{1}{n}\sum_{i=1}^n\nabla \ell_i\nabla \ell_i^\mathrm{\bf T}-G_tG_t^\mathrm{\bf T}$ and $C_t = \frac{n-b}{b(n-1)}\Sigma_t$, respectively.
% \[
% \Sigma_t=\frac{1}{n}\sum_{i=1}^n\nabla \ell_i\nabla \ell_i^T-G_tG_t^T \qquad\text{and}\quad C_t = \frac{n-b}{b(n-1)}\Sigma_t.
% \] 
If $n\gg b$, then $C_t \approx \frac{1}{b}\Sigma_t$.
Notice that $\Sigma_t$ (or $C_t$) is state-dependent, i.e. it depends on $w_{t-1}$. If $t$ is not specified, we use $\Sigma_w$ (or $C_w$) to represent its dependence on $w$. In addition, the population GNC at time $t$ is 
\begin{align}
    \Sigma_t^\mu\triangleq&\ex{Z}{\nabla \ell(w_{t-1},Z)\nabla \ell(w_{t-1},Z)^\mathrm{\bf T}}\notag\\
    &\quad-\ex{Z}{\nabla \ell(w_{t-1},Z)}\ex{Z}{\nabla \ell(w_{t-1},Z)^\mathrm{\bf T}}.
\label{eq:population-gradient-noise}
\end{align}
% We will approximate $V_t$ up to its second moment. %then by the central limit theorem, 
%we can make the following assumption.
% , which lets the type of the gradient noise distribution be Gaussian.
We assume that the initial parameter $W_0$ is independent of all other random variables, and SGD stops after $T$ updates, outputting $W_T$ as the learned parameter.


We now approximate $V_t$ up to its second moment, i.e., $V_t\sim \mathcal{N}(0,C_t)$. We then have the following continuous-time evolution, i.e., the It\^o SDE:
\begin{align}
    d \omega = - \nabla L_S(\omega) dt + [\eta C_{\omega}]^{\frac{1}{2}} d\theta_t,
    \label{eq:ito-sde}
\end{align}
where $C_{\omega}$ is the GNC at $\omega$ and $\theta_t$ is a Wiener process. 
% This SDE is also called .
Furthermore, the {\em Euler-Maruyama} discretization, as the simplest approximation scheme to It\^o SDE in Eq.~(\ref{eq:ito-sde}), is
\begin{align}
  W_{t} = w_{t-1} - \eta G_t + \eta C_t^{1/2}N_t,
\label{eq:sgd-update-gaussian}
\end{align}
where $N_t\sim\mathcal{N}(0,\mathrm{I}_d)$ is the standard Gaussian random variable. 
% \vspace{-2mm}

\paragraph{Validation of SDE} It is important to understand how accurate the SDE in Eq.~(\ref{eq:ito-sde}) is for approximating the SGD process in Eq.~(\ref{eq:sgd-update}). 
% It is important to understand how accurate of SDE in Eq.~(\ref{eq:ito-sde}) for approximating the SGD process in Eq.~(\ref{eq:sgd-update}).
Previous research, such as \citet{li2017stochastic,li2019stochastic}, has provided theoretical evidence supporting the idea that the SDE can approximate SGD in a ``weak sense''. That is, the SDE processes closely mimic the original SGD processes, not on an individual sample path basis, but rather in terms of their distributions (see Lemma~\ref{lem:sde-weak} for a formal result).

Additionally, concerning the validation of the discretization of SDE in Eq.~(\ref{eq:sgd-update-gaussian}), \citet[Theorem~2]{wu2020noisy} has proved that Eq.~(\ref{eq:sgd-update-gaussian}) is {\em an order $1$ strong approximation} to SDE in Eq.~(\ref{eq:ito-sde}). 
% We defer their theoretical result to Appendix. 
Moreover, 
we direct interested readers to the comprehensive investigations carried out by 
\citet{wu2020noisy,li2021validity}, where the authors empirically verify that SGD and Eq.~(\ref{eq:sgd-update-gaussian}) can achieve similar testing performance in deep learning scenarios, suggesting that non-Gaussian noise may not be essential to SGD performance. In other words, studying Eq.~(\ref{eq:sgd-update-gaussian}) is arguably sufficient to understand the generalization properties of SGD. In Figure~\ref{fig:Acc-Dynamics}, we also empirically verify the approximation of Eq.~(\ref{eq:sgd-update-gaussian}), and show that it can effectively capture the behavior of SGD.
% \vspace{-2mm}

% so $V_t\sim \mathcal{N}(0,C_t)$. 

% Note that
% Eq.~(\ref{eq:sgd-update-gaussian}) can be view as discretization of the following continuous SDE:
% $
% d W = - \nabla L_S(W) dt + [\eta C(W
% )]^{\frac{1}{2}} d\theta_t,
% $
% where $C(W)$ is the GNC at $W$ and $\theta_t$ is a Wiener process. This SDE is also called It\^o SDE.


% \begin{assum}
% \label{assum-sde}
% Assume the gradient noise $V_t$ follows an Gaussian distribution, i.e. $V_t\sim \mathcal{N}(0,C_t)$, 
% then in SGD,
% %the following equation can well approximate the SGD dynamic (Eq (\ref{eq:sgd-update-2})),
% \begin{eqnarray}
%   W_{t} = w_{t-1} - \eta G_t + \eta C_t^{1/2}N_t,
% \label{eq:sgd-update-gaussian}
% \end{eqnarray}
% where $N_t\sim\mathcal{N}(0,\mathrm{I}_d)$ is the standard Gaussian %random variable
% \footnote{Eq.~(\ref{eq:sgd-update-gaussian}) can be view as discretization of the following continuous SDE:
% $
% d W = - \nabla L_S(W) dt + [\eta C(W
% )]^{\frac{1}{2}} d\theta_t,
% $
% where $C(W)$ is the gradient noise covariance at $W$ and $\theta_t$ is a Wiener process. This SDE is also called It\^o SDE.}.
% \end{assum}
% \begin{rem}
% Regarding the validation of this assumption, we refer readers to a recent work of \citet{li2021validity}, where the authors empirically verify that SGD and Eq.~(\ref{eq:sgd-update-gaussian}) can achieve the similar testing performance, suggesting that non-Gaussian noise is not essential to SGD performance, that is, studying Eq.~(\ref{eq:sgd-update-gaussian}) is arguably sufficient to understand generalization properties of SGD.
% %  \textcolor{red}{On the mathematical validation of this assumption...? convergence to Gaussian in what sense?}
% \end{rem}
 


% \subsection{Two Information-Theoretic Bounds}
\paragraph{Two Information-Theoretic Bounds} 
% Before we delve into the formal justification, we present below two typical information-theoretic bounds in the literature.
% Below are two typical information-theoretic bounds.
The original 
% version of mutual information (MI) based 
information-theoretic bound in \cite{xu2017information} is a sample-based MI bound, whose main component is the mutual information between the output $W$ and the entire input sample $S$. This result is given as follows:
\begin{lem}[{\citet[Theorem~1]{xu2017information}}]
Assume the loss $\ell(w,Z)$ is $R$-subgaussian\footnote{A random variable $X$ is $R$-subgaussian if for any $\rho\in \mathbb{R}$, $\log {\mathbb E} \exp\left( \rho \left(X- {\mathbb E}X\right) \right) \le \rho^2R^2/2$. Note that a bounded loss is guaranteed to be subgaussian. }
for any $w\in\mathcal{W}$, then
% \vspace{-5pt}
\[
|\mathcal{E}_\mu(\mathcal{A})|\leq \sqrt{\frac{2R^2}{n}I(W;S)}.
\]
% \vspace{-10pt}
% where $I(W;S)=\mathrm{D_{KL}}(Q_{W,S}||Q_W\otimes Q_S)$ is the mutual information and $\mathrm{D_{KL}}$ denotes the KL divergence.
% \cite{cover2012elements} between $W$ and $S$.
\label{lem:xu's-bound}
\end{lem}

This bound is further 
% improved by 
refined to a data-dependent prior based bound. Following the setup in \citet{negrea2019information}, let $J$ be a random subset uniformly drawn from $\{1,\dots,n\}$ and $|J|=m>b$. Let $S_J = \{Z_i\}_{i\in J}$.
% and $L_{S_J}(W) = \frac{1}{m}\sum_{i\in J}\ell(W,Z_i)$. 
Typically, we choose $m=n-1$. The following result is then known.%\looseness=-1  
% and \citet[Theorem~1.]{wang2021optimizing}.
\begin{lem}[{\citet[Theorem~2.5]{negrea2019information}}]
\label{lem:data-dependent-prior}
Let $Q_{W|S}$ be the posterior distribution of $W$ given the training sample $S$, and let $P_{W|S_J}$ be a data-dependent prior distribution of $W$ that depends on the training sample only through the subset $S_J$. Assume the loss $\ell(w,Z)$ is bounded in $[0,M]$. Then, for any such $P_{W|S_J}$,
\[
\mathcal{E}_\mu(\mathcal{A})\leq\frac{M}{\sqrt{2}}\mathbb{E}_{S,J}{\sqrt{\mathrm{D_{KL}}(Q_{W|S}||P_{W|S_J})}}.
\]
\end{lem}
% \begin{rem}
% %In Lemma~\ref{lem:data-dependent-prior}, 
% We may use a subset $S_J$ drawn from the training sample $S$ to conduct a parallel training process based on the same algorithm $\mathcal{A}$ (e.g. SGD) to obtain a data-dependent prior ($P_{W|S_J}$). Then Lemma~\ref{lem:data-dependent-prior} indicates that the generalization error can be bounded by the KL divergence between the distribution of $W$ that obtained by the real training algorithm and this data-dependent prior.
% \end{rem}
 Note that $J$ is drawn before the training starts and is independent of $\{W_t\}_{t=0}^T$. 
 We use the subset $S_J$ to conduct a parallel training process based 
 on the same algorithm $\mathcal{A}$ (e.g., SGD) 
 to obtain a data-dependent prior $P_{W|S_J}$. When $m=n-1$, we call this prior process the leave-one-out (LOO) prior.

% We also present the variational representation of mutual information below.
% \begin{lem}[{\citet[Corollary~3.1.]{polyanskiy2019lecture}}]
% \label{lem:mi-center-gravity}
% For two random variables $X$ and $Y$, we have
% \[
% I(X;Y) = \inf_{P} \ex{X}{\mathrm{D_{KL}}(Q_{Y|X}||P)},
% \]
% where the infimum is achieved at $P=Q_Y$.
% \end{lem}

