% \vspace{-2mm}
\section{Generalization Bounds Via Full Trajectories}
\label{sec:itb-sde}
% \vspace{-2mm}
% \subsection{Sample MI Bound}

% The original version of mutual information based bound is a sample-based MI bound whose main component is the mutual information between the output $W$ and the entire input sample $S$. This result is given by the following lemma:

% \begin{lem}[{\citet[Theorem~1.]{xu2017information}}]
% Assume the loss $\ell(w,Z)$ is $R$-subGaussian, the generalization error of  ${\cal A}$ is bounded by
% % \vspace{-5pt}
% \[
% |\mathcal{E}_{\mu}(\mathcal{A})|\leq \sqrt{\frac{2R^2}{n}I(W;S)},
% \]
% % \vspace{-10pt}
% where $I(W;S)=\mathrm{D_{KL}}(P_{W,S}||P_W\otimes P_S)$ is the mutual information and $\mathrm{D_{KL}}$ denotes the KL divergence between two distributions.
% % \cite{cover2012elements} between $W$ and $S$.
% \label{lem:xu's-bound}
% \end{lem}

% Similar to some previous works that applying Lemma~\ref{lem:xu's-bound} to noisy iterative algorithms (e.g., SGLD \cite{pensia2018generalization}), 

% We first unroll the terminal parameters' mutual information $I(W_T;S)$ to the full trajectories' mutual information via the lemma below.
% \begin{lem}
% \label{lem:mi-unroll}
% Let Assumption \ref{assum-sde} hold, then
% $I(W_T;S)\leq\sum_{t=1}^T I(- G_t + C_t^{1/2}N_t;S|W_{t-1} ).
% $
% \end{lem}

% This lemma can be proved by recurrently applying the data processing inequality (DPI) and chain rule of the mutual information \citep{polyanskiy2019lecture}. 
%We defer all the proof details into Appendix.

%\citet{pensia2018generalization} applied Lemma \ref{lem:xu's-bound} to SGLD by relying on a Lipschitz continuity condition. We do not require this condition here. Instead, 
We now discuss the generalization of SGD under the approximation of Eq.~(\ref{eq:sgd-update-gaussian}). 
We first unroll the terminal parameters' mutual information $I(W_T;S)$ to the full trajectories' mutual information via the lemma below.
\begin{lem}
\label{lem:mi-unroll}
The MI term in Lemma~\ref{lem:xu's-bound} satisfies
$I(W_T;S)\leq\sum_{t=1}^T I(- G_t + C_t^{1/2}N_t;S|W_{t-1} )$.
\end{lem}

This lemma can be proved by recursively applying the data processing inequality (DPI) and the chain rule of mutual information \citep{polyanskiy2019lecture}. 


Define $\widehat{G}_t = -G_t+C_t^{1/2}N_t$. Let $Q_{\widehat{G}_t|s,w_{t-1}}$ and $Q_{\widehat{G}_t|w_{t-1}}$ be the conditional and marginal distributions fully characterized by the algorithm, respectively. In addition, let $P_{\widehat{G}_t|w_{t-1}}$ be any prior distribution of $\widehat{G}_t$ satisfying $\mathrm{D_{KL}}(Q_{\widehat{G}_t|s,w_{t-1}}||P_{\widehat{G}_t|w_{t-1}})<\infty$.
We first have the following lemma.
% By Lemma \ref{lem:mi-center-gravity}, we have
\begin{lem}
\label{lem:cmi-golden formula}
% Let $\widehat{G}_t = -G_t+C_t^{1/2}N_t$. 
% then
% Let $P_{\widehat{G}_t|w_{t-1}}$ be any prior, 
% satisfying $\mathrm{D_{KL}}(P_{\widehat{G}_t|S,W_{t-1}}||P_{\widehat{N}_t|W_{t-1}})<\infty$.
% At every time step $t$, let $P_{\widehat{G}_t|W_{t-1}}$ be any distribution satisfying $\mathrm{D_{KL}}(P_{\widehat{G}_t|W_{t-1}}||P_{\widehat{G}_t|W_{t-1}})<\infty$, 
For any time step $t$,
we have
% $
% I(\widehat{G}_t;S|W_{t-1})\!\!=\!\! \ex{}{\inf_{P_{\widehat{G}_t|W_{t-1}}} \cex{}{}{\mathrm{D_{KL}}(Q_{\widehat{G}_t|S,W_{t-1}}||P_{\widehat{G}_t|W_{t-1}})}}$,
$
I(\widehat{G}_t;S|W_{t-1}) = \ex{W_{t-1}}{\inf_{P_{\widehat{G}_t|W_{t-1}}} \cex{S}{W_{t-1}}{\mathrm{D_{KL}}(Q_{\widehat{G}_t|S,W_{t-1}}||P_{\widehat{G}_t|W_{t-1}})}}$,
where the infimum is achieved when the prior distribution $P_{\widehat{G}_t|w_{t-1}} = Q_{\widehat{G}_t|w_{t-1}}$.
\end{lem}
% \begin{rem}
% Lemma \ref{lem:cmi-golden formula} suggests that for every step $t$, the conditional MI between $\widehat{G}_t$ and $S$ could be upper bounded by the expected KL divergence between $Q_{\widehat{G}_t|S,W_{t-1}}$ and some prior $P_{\widehat{G}_t|W_{t-1}}$. 

% In Lemma~\ref{lem:cmi-golden formula}, $\mathrm{D_{KL}}(Q_{\widehat{G}_t|S,W_{t-1}}||P_{\widehat{G}_t|W_{t-1}})$ may be viewed as an estimate of the sensitivity of the full batch gradient to a specific training sample $S=s$. Later on we will show that if full batch gradient is close to the population gradient (i.e. $\ex{Z}{\nabla\ell(w,Z)}$), 
 %indicating less sensitivity of the full batch gradient w.r.t $S$, 
 % this sensitivity is small, leading to better generalization performance.
 
% \end{rem}
% While $\widehat{G}_t$ is the gradient signal at step $t$, 
 
% Moreover, 
Lemma~\ref{lem:cmi-golden formula} suggests that every choice of the prior $P_{\widehat{G}_t|W_{t-1}}$ gives rise to an upper bound of the MI of interest via 
% $I(\widehat{G}_t;S|W_{t-1}) \le \ex{W_{t-1}}{ \cex{S}{W_{t-1}}{\mathrm{D_{KL}}(Q_{\widehat{G}_t|S,W_{t-1}}||P_{\widehat{G}_t|W_{t-1}})}}$
$I(\widehat{G}_t;S|W_{t-1}) \le \ex{}{{\mathrm{D_{KL}}(Q_{\widehat{G}_t|S,W_{t-1}}||P_{\widehat{G}_t|W_{t-1}})}}$. The closer $P_{\widehat{G}_t|W_{t-1}}$ is to $Q_{\widehat{G}_t|W_{t-1}}$, the tighter the bound. As the simplest choice, we first choose an isotropic Gaussian prior, $P_{\widehat{G}_t|w_{t-1}} = \mathcal{N}(\tilde{g}_t, \sigma^2_t\mathrm{I}_d)$ (where both $\tilde{g}_t$ and $\sigma_t$ are only allowed to depend on $W_{t-1}$), and optimize the KL divergence in Lemma~\ref{lem:cmi-golden formula} over $\sigma_t$ for a fixed $\tilde{g}_t$. 
% Then combined with Lemma \ref{lem:mi-unroll}, 
The following result is obtained.
\begin{thm}
\label{thm:isotropic-prior-bound}
Assume that the conditions of Lemma~\ref{lem:xu's-bound} hold and that $C_t$ is a positive-definite matrix. For any $t\in [T]$, let $\tilde{g}_t$ be any constant vector for a given $w_{t-1}$. Then
\begin{align}
    \mathcal{E}_{\mu}(\mathcal{A})\leq\sqrt{\frac{R^2}{n}\sum_{t=1}^T\ex{W_{t-1}}{d\log{\frac{h_1(W_{t-1})}{d}}-h_2(W_{t-1})}},
\label{ineq:iso-gen-bound}
\end{align}
% \[
% \mathcal{E}_{\mu}(\mathcal{A})\leq\sqrt{\frac{R^2}{n}\sum_{t=1}^T\ex{W_{t-1}}{d\log{\frac{\cex{S}{W_{t-1}}{\left|\left|G_t-\tilde{g}_t\right|\right|^2 +tr\left\{C_t\right\}}}{d}}-\cex{S}{W_{t-1}}{tr\left\{\log{C_t}\right\}}}},
% \]
where 
$h_1(w) = \cex{S}{w}{\left|\left|G_t-\tilde{g}_t\right|\right|^2 +tr\left\{C_t\right\}}$ and
  $h_2(w) = \cex{S}{w}{tr\left\{\log{C_t}\right\}}$. 
  % and
% \begin{eqnarray*}
%   A_1(t) &=& \ex{}{(\left|\left|G_t-\tilde{g}\right|\right|^2 +tr\left\{C_t\right\}},\\
%   A_2(t) &=& \ex{}{tr\left\{\log{C_t}\right\}},
% \end{eqnarray*}
  % $tr\{\cdot\}$ denotes the trace of a matrix. 
  % and $\mathbb{E}_X^Y$ is the conditional expectation. 
%  Further, the optimal $\sigma_t^* = \sqrt{A_1(t)/d}$ for each step $t$.

Furthermore, if $\tilde{g}_t = \ex{Z}{\nabla\ell(w_{t-1},Z)}$, then
$h_1(w) = \frac{1}{b}\tr{\Sigma_t^\mu}$.
% \begin{align}
%     \mathcal{E}_{\mu}(\mathcal{A})\leq\sqrt{\frac{R^2}{n}\sum_{t=1}^T\ex{W_{t-1}}{d\log{\frac{\tr{\Sigma_t^\mu}}{bd}}-h_2(W_{t-1})}}.
%     \label{ineq:iso-pop-bound}
% \end{align}
\end{thm}

Since $\tilde{g}_t$ is any reference ``gradient'' independent of $S$, the first term in $h_1(W_{t-1})$, $\eucd{G_t-\tilde{g}_t}^2$, characterizes the sensitivity of the full-batch gradient to some variation of the training set $S$, while the second term in $h_1(W_{t-1})$, i.e., $\tr{C_t}$, reflects the gradient noise magnitude induced by the mini-batch based training. For example, if $\tilde{g}_t = \ex{Z}{\nabla\ell(w_{t-1},Z)}$, then $\cex{S}{w_{t-1}}{\eucd{G_t-\tilde{g}_t}^2}$ is the variance of the gradient sample mean, and this choice of $\tilde{g}_t$ converts $h_1(W_{t-1})$ to the scaled trace of the population GNC, namely $h_1(W_{t-1}) = \frac{1}{b}\tr{\Sigma_t^\mu}$.
% (i.e. Eq.~(\ref{eq:population-gradient-noise})) as shown in Eq.~(\ref{ineq:iso-pop-bound}). 

% In addition, 
% seen from Eq.~(\ref{ineq:iso-gen-bound}) in Theorem~\ref{thm:isotropic-prior-bound}, the GNC impacts the generalization error non-monotonically. On the one hand, we hope the diagonal elements in $C_t$ have small values so that $h_1(W_{t-1})$ is small. This corresponds to the case where most gradient signals align with each other or the gradient norm is small, a phenomenon typically occurring when training approaches local minima. On the other hand,  large diagonal elements in $C_t$ can increase the value of $h_2(W_{t-1})$, giving a smaller bound value. This coincides with the intuition from the optimization perspective, where larger gradient noise magnitude helps to escape saddle points. Reportedly this non-monotonicity is controlled by the ratio of the learning rate to the batch size in practice \citep{hoffer2017train,jastrzkebski2017three}. We note that such trade-off of gradient noise has not been observed in the previous information-theoretic analyses for SGLD.

% \begin{rem}
Moreover, if we simply let $\tilde{g}_t = 0$, then Theorem~\ref{thm:isotropic-prior-bound} indicates that one can control the generalization performance by controlling the gradient norm along the entire training trajectory; e.g., if we further let $b=1$, then $h_1(W_{t-1})=\cex{S}{W_{t-1}}{\frac{1}{n}\sum_{i=1}^n||\nabla\ell_i||^2}$. 
% Note that controlling gradient norm can also control the magnitude of the trace of gradient noise covariance. 
This is consistent with the existing practice, for example, applying gradient clipping \citep{wang2022generalization,geiping2021stochastic} and gradient penalty \citep{jastrzebski2021catastrophic,barrett2020implicit,smith2020origin,geiping2021stochastic} as regularization 
techniques to improve generalization. 
% \end{rem}

% If we let $\tilde{g}_t = \ex{Z}{\nabla\ell(w_{t-1},Z)}$, then the first term in $A_1(t)$,  $\left|\left|G_t-\tilde{g}_t\right|\right|^2$, characterize the sensitivity of the full-batch gradient to the variation of the training set, while the second term, $tr\left\{C_t\right\}$ reflects the gradient noise magnitude induced by the mini-batch based training. In addition, 
% seen from Theorem~\ref{thm:isotropic-prior-bound}, the GNC impacts the generalization error non-monotonically. On the one hand, we hope the diagonal elements in $C_t$ have small values so that $A_1(t)$ is small. This corresponds to the case where most gradient signals align with each other or the gradient norm is small, a phenomenon typically occurring when training approaches local minima. On the other hand,  large diagonal elements in $C_t$ can make $-A_2(t)$ become negative, giving a smaller bound value. This coincides with the intuition from the optimization perspective, where larger gradient noise magnitude helps to escape saddle points. Reportedly this non-monotonicity is controlled by the ratio of the learning rate to the batch size in practice \citep{hoffer2017train,jastrzkebski2017three}.

% Concomitantly, letting $\tilde{g}_t = \ex{Z}{\nabla\ell(w_{t-1},Z)}$ will convert $A_1(t)$ to the population GNC (i.e. Eq.~(\ref{eq:population-gradient-noise})), as shown in the following corollary.
% \begin{cor}
% \label{cor:population-gradient-bound}
% Under the conditions in Theorem~\ref{thm:isotropic-prior-bound}, let $\tilde{g}_t = \ex{Z}{\nabla\ell(w_{t-1},Z)}$ and
% $A_2(t)$ defined as in Theorem~\ref{thm:isotropic-prior-bound}, then
% \[
% \mathcal{E}_{\mu}(\mathcal{A})\leq\sqrt{\frac{R^2}{n}\sum_{t=1}^T\ex{W_{t-1}}{d\log{({tr\left\{\Sigma_t^\mu\right\}}/{bd})}-A_2(t)}},
% \]
% %where $A_2(t)$ is defined as the same in Theorem~\ref{thm:isotropic-prior-bound}.
% % and the population gradient noise covariance is defined in Eq.~(\ref{eq:population-gradient-noise}).
% \end{cor}

% In view of Theorem~\ref{thm:isotropic-prior-bound}, we also find that the gradient noise covariance plays a key role in SGD's generalization performance via $tr\left\{C_t\right\}$ and $tr\left\{\log{C_t}\right\}$. Indeed, if full batch gradient is close to population gradient and the values of diagonal elements in $C_t$ are nearly equal (i.e. the divergence degree of gradients are nearly the same in each model dimension), then the algorithm can achieve small enough generalization error. We will elaborate more on this in Appendix.

% Although it seems that the generalization of SGD only depends on the trace of the gradient noise covariance and does not depend on the non-diagonal elements in the covariance, those non-diagonal elements indeed have huge impact on the training trajectory \cite{haochen2021shape}, and thus, can affect the distribution of $W_{t-1}$ for every $t$. In fact, manipulating the gradient noise has been widely used to improve the generalization \citep{zhu2019anisotropic,xie2021positive}.

% By using the inequality $\log{x}\leq x-1$, we have
% \[
% d\log{\frac{A_1(t)}{d}}-A_2(t)\leq A_1(t)-d-A_2(t)=\ex{}{tr\{C_t-\log{C_t}-\mathrm{I}_d\}}+\ex{}{(\left|\left|G_t-\tilde{g}_t\right|\right|^2},
% \]




% \begin{rem}
% The bound above reflects the fact that the gradient noise covariance plays a key role in SGD's generalization performance. It's important to note that there is a trade-off for the strength of the gradient noise level. Specifically, if $tr\left\{ C_t\right\}$ is too small, then $-tr\left\{\log{C_t}\right\}$ will be very large. In fact, manipulating the gradient noise has been widely used to improve the generalization \citep{zhu2019anisotropic,xie2021positive}.
% \end{rem}

As a by-product, we recover previous information-theoretic bounds for the Gradient Langevin dynamics (GLD) with noise distribution $\mathcal{N}(0,\eta^2 \mathrm{I}_d)$ below.
% It is possible to compare Theorem~\ref{thm:isotropic-prior-bound} with some previous  bounds by letting $C_t=\mathrm{I}_d$, in which Eq.~(\ref{eq:sgd-update-gaussian}) reduces to the Gradient Langevin dynamics (GLD) with noise distribution $\mathcal{N}(0,\eta^2 \mathrm{I}_d)$ and the corresponding generalization bound is stated below.
\begin{cor}
\label{cor:langevin-dynamic}
If $C_t=\mathrm{I}_d$, then
\[
\mathcal{E}_{\mu}(\mathcal{A})\leq\sqrt{\frac{R^2d}{n}\sum_{t=1}^T{{\mathbb{E}_{W_{t-1}}{\log\left({\mathbb{E}_{S}^{W_{t-1}}{\left|\left|G_t-\tilde{g}_t\right|\right|^2}}/{d}+1\right)}}}}.
\]
\end{cor}
% It is easy to verify 
Note that the bound in Corollary~\ref{cor:langevin-dynamic} can recover the bound in \citet[Proposition~3]{neu2021information} by using the inequality
% the inequality 
$\log(x+1)\leq x$. 
% for $x>0$ 
Furthermore, it can also recover the bound in \citet{pensia2018generalization} because we use a state-dependent quantity $\cex{S}{w_{t-1}}{||G_t-\tilde{g}_t||^2}$, 
% rather than 
which is smaller than the global Lipschitz constant used in \citet{pensia2018generalization}.

%Intuitively, if the gradient noise distribution is an isotropic Gaussian, independent of the parameter and the distribution, then our optimal isotropic Gaussian prior used in Lemma \ref{lem:cmi-golden formula} will match the posterior distribution better, leading to a tightest bound. 

% It's important to note that this does not indicate that GLD can outperform SGD. On the one hand, the isotropic gradient noise also has some implicit impact on the term $\ex{}{||G_t-\tilde{g}_t||^2}$, and on the other hand, Langevin dynamics may require longer training time to converge to a minimum than mini-batch SGD, i.e., having larger $T$.

% Theorem \ref{thm:isotropic-prior-bound} and Corollary \ref{cor:langevin-dynamic} both indicate that one can control the generalization performance via controlling the gradient norm along the entire training trajectories. Note that controlling gradient norm can also control the magnitude of the trace of gradient noise covariance. This is consistent with many previous practical applications, for example, applying gradient clipping \citep{wang2022generalization,geiping2021stochastic} and gradient penalty \citep{jastrzebski2021catastrophic,barrett2020implicit,smith2020origin,geiping2021stochastic} as regularization 
% techniques to improve the generalization performance. 
\begin{figure*}[!ht]
    \centering
    \begin{subfigure}[b]{0.245\textwidth}
\includegraphics[scale=0.28]{figs/cov-plot-svhn-vgg-1.png}    
\caption{VGG on (small) SVHN}            \label{fig:vgg-svhn-cov}
    \end{subfigure}
\begin{subfigure}[b]{0.245\textwidth}
\includegraphics[scale=0.28]{figs/cov-plot-cifar10-vgg-1.png}
\caption{VGG on CIFAR10}
    \label{fig:vgg-cifa10-cov}
\end{subfigure}
 \begin{subfigure}[b]{0.245\textwidth}
\includegraphics[scale=0.28]{figs/cov-plot-cifar10-resnetwobn-1.png}
\caption{ResNet on CIFAR10}
\label{fig:resnet-cifa10-cov}
    \end{subfigure}
\begin{subfigure}[b]{0.245\textwidth}
\includegraphics[scale=0.28]{figs/cov-plot-cifa100-resnet-1.png}
\caption{ResNet on CIFAR100}
\label{fig:resnet-cifa100-cov}
\end{subfigure}
\caption{Gradient-related quantities of SGD or its discrete SDE approximation. In (d), since the per-sample gradient is ill-defined when batch normalization is used, we do not track $\tr{\log\pr{\Sigma^{-1}_t\Sigma^{\mu}_t}}$.}\label{fig:Cov-Dynamics}
% \vspace{-4mm}
\end{figure*}
% Arguably, the  upper  bound  in Theorem \ref{thm:isotropic-prior-bound}  has  the  deficiency  of being dependent on an isotropic Gaussian prior. 
While choosing the isotropic Gaussian prior is common in the GLD or SGLD setting, given that we already know $C_t$ is an anisotropic covariance, one can select an anisotropic prior to better incorporate the geometric structure in the prior distribution. A natural choice of the covariance is a scaled population GNC, namely $\tilde{c}_t\Sigma_t^{\mu}$, where $\tilde{c}_t$ is some positive state-dependent scaling factor. Let $\tilde{g}_t = \ex{Z}{\nabla\ell(w_{t-1},Z)}$ be the state-dependent mean. 
% The following bound is achieved 
By optimizing over $\tilde{c}_t$, we have the bound below.



\begin{thm}
\label{thm:anisotropic-prior-bound}
If the conditions of Lemma~\ref{lem:xu's-bound} hold and $C_t$ and $\Sigma^\mu_t$ are positive-definite matrices,
% Let $P_{\widehat{G}_t|w_{t-1}}=\mathcal{N}\pr{\ex{Z}{\nabla\ell(w_{t-1},Z)},\tilde{c}_t\Sigma_t^{\mu}}$, 
then
% \[
% \mathcal{E}_{\mu}(\mathcal{A})\leq\sqrt{\frac{R^2}{n}\sum_{t=1}^T\ex{W_{t-1}}{A_3(t)}+\frac{R^2dT}{n}\log\frac{1}{b}},
% \]
% where $A_3(t)=tr\left\{\log\Sigma^\mu_t-\cex{S}{W_{t-1}}{\log C_t}\right\}$.
\[
\mathcal{E}_{\mu}(\mathcal{A})\leq\sqrt{\frac{R^2}{n}\sum_{t=1}^T\ex{W_{t-1},S}{\tr{\log\frac{\Sigma^\mu_tC_t^{-1}}{b}}}}.
\]
% where $A_2(t)$ is defined as the same in Theorem~\ref{thm:isotropic-prior-bound}.
\end{thm}
\begin{rem}
    If we let the diagonal element of ${\Sigma^\mu_t}$ in dimension $k$ be $\alpha_t(k)$ and let the corresponding diagonal element of $\Sigma_t$ be $\beta_t(k)$, and assume $n\gg b$ (so $\Sigma_t = bC_t$), then $\tr{\log({\Sigma^\mu_tC^{-1}_t}/{b})}=\sum_{k=1}^d\log{\frac{\alpha_t(k)}{\beta_t(k)}}$.
Thus, Theorem~\ref{thm:anisotropic-prior-bound} implies that a favorable alignment between the diagonal values of $\Sigma_t$ and $\Sigma^\mu_t$ will positively impact generalization performance. In other words, the perfect alignment of these two matrices indicates that SGD is insensitive to the randomness of $S$. Recall that the key quantity in Lemma~\ref{lem:xu's-bound}, $I(W;S)$, also measures the dependence of $W$ on the randomness of $S$; the term $\Sigma^{\mu}_{t}\Sigma^{-1}_t$ conveys a similar intuition in this context.
\end{rem}

Compared with Theorem~\ref{thm:isotropic-prior-bound} under the same choice of $\tilde{g}_t$, the main difference is that the term $\tr{\log({\Sigma^\mu_t}/{b})}$, instead of $d\log({\tr{\Sigma^\mu_t}}/{bd})$, appears in the bound of Theorem~\ref{thm:anisotropic-prior-bound}. The following lemma demonstrates that the bound in Theorem~\ref{thm:anisotropic-prior-bound} is tighter than the bound in Theorem~\ref{thm:isotropic-prior-bound}.

\begin{lem}
    \label{lem:compare-iso-noniso}
    For any $t$, we have $\tr{\log\frac{\Sigma^\mu_t}{b}}\!\!\leq\! d\log{\frac{\tr{\Sigma_t^\mu}}{bd}}$, with equality when all the diagonal elements in $\Sigma^\mu_t$ have the same value, i.e., $\alpha_t(1)=\alpha_t(2)=\cdots=\alpha_t(d)$.
\end{lem}


% Let the diagonal element of ${\Sigma^\mu_t}/{b}$ in dimension $k$ be $a_k$, then
% \begin{align*}
%     \sum_{k=1}^d\log a_k\leq
%     (\sum_{k=1}^d 1) \cdot \log{(\sum_{k=1}^d a_k)}/{(\sum_{k=1}^d 1)}=d\log({tr\left\{\Sigma^\mu_t\right\}}/{bd}),
% \end{align*}
% where we invoked a variant of the Log sum inequality \citep[Theorem~2.7.1]{thomas2006elements}, See Lemma~\ref{lem:log-sum-ineq} in Appendix.
% i.e., $\sum_{i=1}^n b_i\log(a_i/b_i)\leq (\sum_{i=1}^n b_i)\log(\sum_{i=1}^n a_i/\sum_{i=1}^n b_i)$ for non-negative numbers $\{a_i\}_{i=1}^n$ and $\{b_i\}_{i=1}^n$ (See Lemma~\ref{lem:log-sum-ineq} in Appendix).
% Thus, the bound in Theorem~\ref{thm:anisotropic-prior-bound} is tighter than the bound in Eq.~(\ref{ineq:iso-pop-bound}), with the equality holds when all the diagonal elements in $\Sigma^\mu_t$ have the same value.

% \vspace{-0.3in}

The trajectory-based bounds in Theorem~\ref{thm:isotropic-prior-bound} and Theorem~\ref{thm:anisotropic-prior-bound} emphasize the significance of gradient-related information along entire trajectories, including metrics such as gradient norm and gradient covariance alignment, in
 understanding the generalization of SGD. In Figure~\ref{fig:Cov-Dynamics}, we visually show that these key gradient-based measures during SDE training closely mirror the dynamics observed in SGD.  
 % This further showcases that we can analyze the generalization of SGD via analyzing SDE.

 % \textcolor{red}{
% \paragraph{Dependency on Time} 
Notably, these trajectory-based information-theoretic bounds are time-dependent, indicating that these bounds may grow with the training iteration number $T$, unless the gradient norm becomes negligible at some point during training. While the stability-based bounds for GD/SGD are also time-dependent \citep{hardt2016train, bassily2020stability} (in the convex learning case), the learning rate in these bounds helps mitigate the growth with $T$. However, the learning rate does not appear in our trajectory-based information-theoretic bounds, making the dependency on $T$ even worse.
% } 

% \textcolor{red}{
Note that \citet{wang2021analyzing} uses the strong data processing inequality to reduce this deficiency, but the bound still increases with $T$. To tackle this weakness, we will invoke some asymptotic SDE results on the terminal parameters of the algorithm,  which will give us a crisp way to characterize the expected generalization gap  without decomposing the mutual information.
% }


% Theorem \ref{thm:isotropic-prior-bound} and Theorem \ref{thm:anisotropic-prior-bound} may grow with the training iteration number $T$ increasing, unless the gradient norm is completely negligible at the end of training. In contrast, the empirical generalization gap between testing loss and training loss will become stable when training loss converges to a minimum, even if the training continues. This contradiction is due to the 
% application of the data processing inequality. Specifically, we upper bound MI of the final output of the algorithm by MI of the full training trajectories term (e.g., Lemma \ref{lem:mi-unroll}). Although data processing inequality and chain rule uncovers some properties of gradient based iterative algorithms like SGD, when $T$ becomes large, using MI  of the full training trajectories as an upper bound may become loose, and such looseness will further increase when $T$ is getting larger. 

% Previous work like \citet{wang2021analyzing} uses the strong data processing inequality to reduce this deficiency, but the bound still increases with $T$. To tackle this weakness, inspired by the literature of PAC-Bayes bounds \cite{xie2021positive},
% % we will connect the SDE approximation and information-theoretic bounds to a PAC-Bayes point of view, 
% we will apply some existing SDE results on the terminal parameters of the algorithm,  which will give us a crisp way to characterize the expected generalization gap  without decomposing the mutual information.

% Until now, we have seen some trajectory-based information-theoretic bounds can indeed provide intuitive insights about the generalization of models trained with SGD. However, we may ask are these bounds, which are both distribution-dependent and algorithm-dependent, really non-vacuous? We now provide a negative answer below.

% \begin{thm}
% \label{thm:lower-bound-traj}
%     Denote the trajectory-based information-theoretic bound in Theorem~\ref{thm:anisotropic-prior-bound} as $\mathrm{TrajMI}_\mu(\mathcal{A})$, then we have $\mathrm{TrajMI}_\mu(\mathcal{A})\geq\Omega\pr{\frac{\sqrt{Tbd}}{n}}$.
%     % \Omega\pr{\sqrt{\frac{dT}{n}\log{\frac{n}{n-b}}}}$.
% \end{thm}

% \paragraph{Dependency on Dimension} Theorem~\ref{thm:lower-bound-traj} indicates that even the dimension $d$ does not explicitly appear in the bound of Theorem~\ref{thm:anisotropic-prior-bound}, it is still dimension-dependent. Due to such dependence, if $d$ grows faster than $n^2$, then the bound of Theorem~\ref{thm:anisotropic-prior-bound} will be vacuous. In other words, a necessary condition to let Theorem~\ref{thm:anisotropic-prior-bound} be non-vacuous is $d\leq \mathcal{O}(n^2)$. On the one hand, this is a negative result for trajectory-based information-theoretic bounds in explaining the success of deep learning, because given a fixed $T,b$ and $n$, deep neural networks with more parameters often give better generalization performance while the lower bound will grow with $d$. On the other hand, some recent works show that many parameters in DNN can be removed without affecting the generalization \citep{frankle2018the}, and GD/SGD may only need to occur at a subspace of $\mathbb{R}^d$ \citep{li2018measuring,gur2018gradient,larsen2022how}. If we use some ``intrinsic dimension'', $d_{\mathrm{int}}$, to replace $d$, then these trajectory-based information-theoretic bounds can be largely tighter. Notice that such $d_{\mathrm{int}}$ itself is distribution-dependent and architecture-dependent \citep{li2018measuring}.

% % Consequently, trajectory-based information-theoretic bounds like Theorem~\ref{thm:anisotropic-prior-bound} and Theorem~\ref{thm:isotropic-prior-bound} cannot explain the success of deep learning.

% \paragraph{Dependency on Time} Clearly, the current trajectory-based information-theoretic bounds are time-dependent, which has been widely known. While the stability-based bounds for GD/SGD are also time-dependent \citep{hardt2016train,bassily2020stability} (in convex learning), the learning rate in these bounds will mitigate the growth of $T$. However, learning rate does not appear in our trajectory-based information-theoretic bounds, making to the dependency on $T$ even worse.

% \paragraph{Dependency on Batch Size} Theorem~\ref{thm:lower-bound-traj} also suggests that large batch size may lead to a worse generalization guarantee. This indeed aligns with the practical evidence that large batch size will degrade the performance \citep{jastrzkebski2017three}. In addition, when batch size is a fixed fraction of sample size, then the trajectory-based information-theoretic bounds have a decaying rate no faster than $\mathcal{O}(1/\sqrt{n})$ even for constant $T$ and $d$, which is known as a slower rate.

% One may argue that this negative result of trajectory-based information-theoretic bounds may come from 1) the SDE approximation in Eq.~(\ref{eq:sgd-update-gaussian}) is problematic or inaccurate 2) the information-theoretic bound in Lemma~\ref{lem:xu's-bound} itself is weak, and such limitation may not exist in more advanced input-output information-theoretic bounds; 3) unrolling the mutual information term in Lemma~\ref{lem:mi-unroll}. 

% For the first point, we have already discussed the theoretical justification for the validation of Eq.~(\ref{eq:sgd-update-gaussian}), we will also empirically verify this. Intuitively, we believe modelling the gradient noise as the Gaussian random variable can only reduce the dependence between $W$ and $S$ (smaller $I(W;S)$), that is, Eq.~(\ref{eq:sgd-update-gaussian}) is already an optimistic approximation, the original SGD may have an even worse lower bound. For the second point, we can also prove such limitation still exists for a variant of Lemma~\ref{lem:data-dependent-prior} and the individual mutual information bound in \cite{bu2019tightening}. These results are defered to Appendix~\ref{sec:other bounds}.


% \begin{rem}
% Inconspicuously, this bound indeed suggests that large batch size may lead to a worse generalization gurantee performance. To see this, we need an inequality below
% \begin{align}
% \label{ineq:bs-large}
%     % tr\left\{\log\Sigma^\mu_t\!-\!\cex{S}{W_{t-1}}{\log C_t}\right\}+d\log\frac{1}{b}\!\!\geq\!\! d\log\frac{n}{n-b}.
%     tr\left\{\log({\Sigma^\mu_t}/{b})\right\}-A_2(t)\geq d\log\frac{n}{n-b}.
% \end{align}
% The derivation can be found in Appendix. Thus, when batch size $b$ is close to $n$, then the generalization bound would be looser. Notice that we cannot show small batch size will necessarily give better generalization from Eq.~(\ref{ineq:bs-large}).
% \end{rem}



% In addition, if we let the diagonal element of $\Sigma_t$ in dimension $k$ be $b_k$ and assume $n\gg b$ (so $\Sigma_t = bC_t$), then
% \[
% tr\left\{\log({\Sigma^\mu_t}/{b})\right\}-A_2(t)=\mathbb{E}_{S}^{W_{t-1}}\sum\log(a_k/b_k).
% \]
% Thus, Theorem~\ref{thm:anisotropic-prior-bound} suggests that if the population gradient variance is small at $w$ (i.e. small $a_k$), then large gradient noise (i.e. large $b_k$) will benefit the generalization.
% \subsection{Tighter Bound: Data-Dependent Prior}

% Following the setup in \cite{negrea2019information} and \cite{wang2021optimizing}, let $J$ be a random subset uniformly drawn from $\{1,\dots,n\}$ and $|J|=m>b$. Let $S_J = \{Z_i\}_{i\in J}$ and $L_{S_J}(W) = \frac{1}{m}\sum_{i\in J}\ell(W,Z_i)$. Typically, we choose $m=n-1$, and the following result is a simple extension of \cite[Theorem~2.5]{negrea2019information} and \citet[Theorem~1.]{wang2021optimizing}.

% \begin{lem}
% \label{lem:data-dependent-prior}
% Assume the loss $\ell(w,Z)$ is bounded in $[0,M]$, the expected generalization gap is bounded by
% \[
% \mathcal{E}_{\mu}(\mathcal{A})\leq\frac{M}{\sqrt{2}}\mathbb{E}_{S,J}{\sqrt{\min\left\{\mathrm{D_{KL}}(Q_{W|S}||P_{W|S_J}),\mathrm{D_{KL}}(P_{W|S_J}||Q_{W|S})\right\}}}
% \]
% \end{lem}

% The reason of requiring a bounded loss rather than a subgaussian loss is explained in \cite[Remark ~B.2]{negrea2019information}. Succinctly, there is no benefit by using subgaussian assumption here, and this assumption may render the bound worse. 

%  In the same spirit with Lemma \ref{lem:mi-unroll}, to apply Lemma \ref{lem:data-dependent-prior} to iterative algorithms, we also need the lemma below, which using the full training trajectories KL divergence to upper bound the final output KL divergence.
% \begin{lem}[{\citet[Proposition ~2.6.]{negrea2019information}}]
% \label{lem:kl-decomposition}
% Assume that $P_{W_0}=Q_{W_0}$, then $\mathrm{D_{KL}}(P_{W_T}||Q_{W_{T}})\leq\sum_{t=1}^T\ex{W_{0:t-1}}{\mathrm{D_{KL}}(P_{W_{t}|W_{0:t-1}}||Q_{W_{t}|W_{0:t-1}})}.$
% \end{lem}

% The main modification from the previous analysis is that now we are using a subset $S_J$ drawn from the training sample $S$ to conduct a parallel weight process as a prior. Note that $J$ is drawn before the training begins and is independent of $\{W_t\}_{t=0}^T$ in the real SGD process. To be precise, let $G_{Jt}=\nabla L_{S_J}(W_{t-1})$, the SDE approximation of this prior updating is defined as:
% \begin{eqnarray}
% \label{eq:prior-update}
%   W_t = W_{t-1} - \eta G_{Jt}+\widehat{G}_t,
% \end{eqnarray}
% where $\widehat{G}_t$ could be a state and subset of sample dependent Gaussian noise. It may be tempting to think that we can let $\widehat{G}_t=\eta C^{\frac{1}{2}}_{Jt} N_t$ wherein $C_{Jt}  = \frac{1}{b}\left(\frac{1}{m}\sum_{i\in J}\nabla \ell_i\nabla \ell_i^T-G_{Jt}G_{Jt}^T\right)$. However, using this prior would give us a bound that explicitly depends on the model dimension. We elaborate more on this in Appendix. In addition, it's possible to seek an optimal prior in a "greedy" sense. Recently, \citet{wang2021optimizing} shows that, under some certain assumptions, the covariance of the optimal prior for each step $t$ is $C_1(w_{t-1})/tr\{(\Sigma_t^\mu)^{{1}/{2}}\}(\Sigma_t^\mu)^{{1}/{2}}$ where $\Sigma_t^\mu=\ex{Z}{\nabla \ell(w_{t-1},Z)\nabla \ell(w_{t-1},Z)^T}-\ex{Z}{\nabla \ell(w_{t-1},Z)}\ex{Z}{\nabla \ell(w_{t-1},Z)}^T$ and $C_1(w_{t-1})$ is some state dependent constant.
% \textcolor{red}{Since... we also defer this in the Appendix}

% \begin{eqnarray}
% \label{eq:prior-update}
%   W_t = W_{t-1} - \eta G_{Jt}+\eta C^{\frac{1}{2}}_{Jt} N_t,
% \end{eqnarray}
% where $C_{Jt}  = \frac{1}{b}\left(\frac{1}{m}\sum_{i\in J}\nabla \ell_i\nabla \ell_i^T-G_{Jt}G_{Jt}^T\right)$ is the gradient noise covariance of the prior process. In this case, the prior distribution $P_{\widehat{G}_{Jt}|W_{0:t-1}}$ will be an anisotropic Gaussian distribution.

% We denote the difference between $G_t$ and $G_{Jt}$ by
% \begin{eqnarray}
%   \xi_t \triangleq  G_{Jt} - G_t.
% \end{eqnarray}

% To see the relationship between $\xi_t$, $C_{Jt}$ and $C_t$, we present a useful lemma below.
% \begin{lem}
% If $m=n-1$, then the following two equations hold,
% \[\ex{}{\xi_t\xi_t^T} = \frac{b}{(n-1)^2}C_t, \quad \ex{}{C_{Jt}}=\frac{n(n-2)}{(n-1)^2}C_t,\]
% where the expectation is taken over $J$.
% \label{lem:disjoint-var-avg}
% \end{lem}

% To see the relationship between $\xi_t$ and $C_t$, we present a useful lemma below.
% \begin{lem}
% If $m=n-1$, then $\ex{J}{\xi_t\xi_t^T} = \frac{b}{(n-1)^2}C_t$
% \label{lem:disjoint-var-avg}
% \end{lem}

% As introduced in \cite{wang2021optimizing}, the subsequent analysis based on the data-dependent prior bound will rely on an additional assumption.
% \begin{assum}
% \label{ass:invariant}
% When $m=n-1$, given dataset $S=s$, the distribution $P_{W_t|J,S_J}$ is invariant of $J$.
% \end{assum}
% In practice, $n$ is usually very large, so this assumption suggests that changing one instance in $S_{J}$ will not change $P_{W_t|J,S_J}$ by much.

% We are now in a position to state the following theorem.
% \begin{thm}
% \label{thm:data-dependent-bound}
% Assume the loss $\ell(w,Z)$ is bounded in $[0,M]$ and Assumptions \ref{assum-sde} and \ref{ass:invariant} hold. Then the expected generalization gap of SGD is bounded by
% \[
% \mathcal{E}_{\mu}(\mathcal{A})\leq\ex{S}{\sqrt{M^2\sum_{t=1}^T\ex{W_{t-1}}{\left(\frac{(b-1)d}{(n-1)^2}+A_3(t)\right)}}},
% \]
% where $A_3(t) = tr\left\{\log{C_t}- \ex{}{\log{C_{Jt}}}\right\}$.
% \end{thm}

% Clearly, the batch size $b$ appears explicitly in this bound. Taking this to the extreme by letting $b=1$, Theorem \ref{thm:data-dependent-bound} shows that the generalization bound depends only on $A_3(t)$, whereas Theorem \ref{thm:isotropic-prior-bound} does not yield this conclusion. That is to say, the trace of the gradient noise covariance can be the only factor that controls the generalization. Additionally, we notice that, on average, the data-dependent prior technique can make the difference between the mean of the posterior and the mean of the prior a constant value.

%  Arguably, the effect of $tr\{{\log{C_t}}\}$ on the magnitude of the bound can be reduced by the term $tr\{\mathbb{E}_{J}{\log{C_{Jt}}}\}$, which makes the bound much tighter than the previous result. If we further consider the Taylor expansion of the function $\log C_{Jt}$ around $\mathbb{E}_J[C_{Jt}]$, we obtain the well-known approximation
%  \[\ex{}{\log C_{Jt}}\approx\log\ex{}{C_{Jt}}-\mathrm{Var}(C_{Jt})/(2\mathbb{E}^2[C_{Jt}]).\]
%  Thus, recalling Lemma \ref{lem:disjoint-var-avg}, the difference between $tr\{{\log{C_{t}}}\}$ and  $tr\{\mathbb{E}_{J}{\log{C_{Jt}}}\}$ becomes:
% \[
% \log{(1+1/(n^2-2n))}+\mathrm{Var}(C_{Jt})/(2\mathbb{E}^2[C_{Jt}]).
% \]

% When $n\rightarrow \infty$, the first term converges to zero; for the second term, $\mathbb{E}^2[C_{Jt}]$ converges to a constant by Lemma \ref{lem:disjoint-var-avg}, and $\mathrm{Var}(C_{Jt})$ also converges to zero.




% It is important to note that the bounds in both Theorem \ref{thm:isotropic-prior-bound} and Theorem \ref{thm:anisotropic-prior-bound} may grow as the number of training iterations $T$ increases, unless the gradient norm is completely negligible at the end of training. In contrast, the empirical generalization gap between the testing loss and the training loss becomes stable once the training loss converges to a minimum, even if training continues. This contradiction is due to the
% application of the data processing inequality. Specifically, we upper bound the MI of the final output of the algorithm by the MI of the full training trajectories (e.g., Lemma \ref{lem:mi-unroll}). Although the data processing inequality and the chain rule uncover some properties of gradient-based iterative algorithms like SGD, using the MI of the full training trajectories as an upper bound may become loose when $T$ is large, and this looseness increases further as $T$ grows.

% Previous work like \citet{wang2021analyzing} uses the strong data processing inequality to reduce this deficiency, but the bound still increases with $T$. To tackle this weakness, inspired by the literature of PAC-Bayes bounds \cite{xie2021positive},
% % we will connect the SDE approximation and information-theoretic bounds to a PAC-Bayes point of view, 
% we will apply some existing SDE results on the terminal parameters of the algorithm,  which will give us a crisp way to characterize the expected generalization gap  without decomposing the mutual information.


% We now investigate the third point. Specifically, we will apply some existing SDE results on the terminal parameters of the algorithm, which will give us a crisp way to characterize the expected generalization gap without decomposing the mutual information. Additionally, this will also remove the dependency on $T$. We call such a bound a terminal-state-dependent bound.