
\appendix
\title{Supplementary Material}
\maketitle
% \begin{appendices}
% \addcontentsline{toc}{section}{Appendix} % Add the appendix text to the document TOC
% \part{Appendix} % Start the appendix part
% \parttoc


% \addcontentsline{toc}{section}{Appendix} % Add the appendix text to the document TOC
% \part{Appendix} % Start the appendix part
% \parttoc

% \section{Summaries for the Bounds}



% \section{Notation}
% % Unless otherwise noted, a random variable will be denoted by a capitalized letter, and  its realization by the corresponding lower-case letter. 
% The distribution of a random variable $X$ is denoted by $P_X$ (or $Q_X$), and the conditional distribution of $X$ given $Y$ is denoted by $P_{X|Y}$. When conditioning on a specific realization $y$, we use the shorthand $P_{X|Y=y}$ or simply $P_{X|y}$.
% Denote by $\mathbb{E}_{X}$ expectation over $X \sim P_X$, and by $\mathbb{E}_{X|Y=y}$ (or $\mathbb{E}^y_{X}$) expectation over $X \sim P_{X|Y=y}$. We may omit the subscript of the expectation when there is no ambiguity.
% % The entropy of a random variable $X$ is denoted by $H(X)$, and 
% The KL divergence of probability distribution $Q$ with respect to $P$ is denoted by $\mathrm{D_{KL}}(Q||P)$.
% The mutual information (MI) between random variables $X$ and $Y$ is denoted by $I(X;Y)$, and the conditional mutual information between $X$ and $Y$ given $Z$ is denoted by $I(X;Y|Z)$. In addition, for a matrix $A\in\mathbb{R}^{d\times d}$, we let $\tr{A}$ denote the trace of $A$ and we use $\tr{\log{A}}$ to indicate $\sum_{k=1}^d\log{A_{k,k}}$.




\section{Additional Background}
\subsection{Information-Theoretic Bounds for SGD}
\label{sec:IT-SGD}

% \textcolor{red}{
Recently, \citet{neu2021information,wang2022generalization} applied information-theoretic analysis to the generalization of models trained with SGD by invoking an auxiliary weight process (AWP). We denote this auxiliary weight process by $\mathcal{A}_{AWP}$ and the original SGD algorithm by $\mathcal{A}_{SGD}$. \citet{neu2021information,wang2022generalization} obtain generalization bounds via the following construction:
\begin{align}
\mathcal{E}_{\mu}\pr{\mathcal{A}_{SGD}}=&\mathcal{E}_{\mu}\pr{\mathcal{A}_{SGD}}+\mathcal{E}_{\mu}\pr{\mathcal{A}_{AWP}}-\mathcal{E}_{\mu}\pr{\mathcal{A}_{AWP}}\notag\\
    \leq& \underbrace{\mathcal{O}\pr{\sqrt{\frac{I(W_{\rm AWP};S)}{n}}}}_{\text{Lemma~\ref{lem:xu's-bound}}}+\underbrace{\abs{\mathcal{E}_{\mu}\pr{\mathcal{A}_{SGD}}-\mathcal{E}_{\mu}\pr{\mathcal{A}_{AWP}}}}_{\text{residual term}}, \label{ineq:sgd-ax}
\end{align}
where $W_{\rm AWP}$ is the output hypothesis by $\mathcal{A}_{AWP}$.


Notably, it remains uncertain whether the residual term is sufficiently small for the information-theoretic bounds of $\mathcal{A}_{AWP}$ to yield meaningful insights into SGD. Although there exists an optimal $\mathcal{A}_{AWP}$ that tightens the bound in Eq.~(\ref{ineq:sgd-ax}), finding such an optimal $\mathcal{A}_{AWP}$ beyond the isotropic Gaussian noise covariance case is challenging. 
It's worth noting that \cite{wang2022generalization} provides an optimal bound for the time-invariant isotropic Gaussian noise case. Nevertheless, our empirical results, as illustrated in Figure~\ref{fig:bounds}, demonstrate that the bounds presented in this paper outperform the isotropic Gaussian noise case.

In this paper, we do not attempt to find an optimal $\mathcal{A}_{AWP}$, but instead,  we invoke the SDE approximation (i.e. Eq.~(\ref{eq:sgd-update-gaussian})), denoted as $\mathcal{A}_{SDE}$. Formally,

\begin{align}
\mathcal{E}_{\mu}\pr{\mathcal{A}_{SGD}}=&\mathcal{E}_{\mu}\pr{\mathcal{A}_{SGD}}+\mathcal{E}_{\mu}\pr{\mathcal{A}_{SDE}}-\mathcal{E}_{\mu}\pr{\mathcal{A}_{SDE}}\notag\\
    \leq& \underbrace{\mathcal{O}\pr{\sqrt{\frac{I(W_{\rm SDE};S)}{n}}}}_{\text{Lemma~\ref{lem:xu's-bound}}}+\underbrace{\abs{\mathcal{E}_{\mu}\pr{\mathcal{A}_{SGD}}-\mathcal{E}_{\mu}\pr{\mathcal{A}_{SDE}}}}_{\text{residual term}}, \label{ineq:sde-res}
\end{align}
where $W_{\rm SDE}$ is the output hypothesis by $\mathcal{A}_{SDE}$.

Empirical evidence from \citep{wu2020noisy,li2021validity} and our Figure~\ref{fig:Acc-Dynamics} suggests that the residual term in Eq.~(\ref{ineq:sde-res}) is small. This observation motivates our investigation into the generalization of SGD using the information-theoretic bounds of SDE directly.

% \subsection{Summary of Results}
% The main generalization bounds obtained in this paper are summarized in Table~\ref{tab:summary}.

% \begin{table*}[bt!]
%     \centering
%     % \vspace{-0.7em}
%     \caption{\small Comparison of the results in this work 
%    % The previous $\Sigma$'s for SGD and SGDM are taken from \cite{Mandt2017}, and the other four previous results are due to \cite{gitman2019understanding}. There is no existing continuous-time result about DNM and NGD.
%     } %\\
%     %$^*$SGD: stochastic gradient descent with learning rate $\lambda$. $^*$SGDM: stochastic gradient descent with momentum hyperparameter $\mu$. $^*$QHM: quasi-hypobolic momentum. $^*$DNM: damped newton's method. $^*$NGD: natural gradient descent.}
%     \label{tab:summary}
%     \vspace{-0.8EM}
%     %\renewcommand\arraystretch{1.2}
%     {\small
%     \resizebox{\textwidth}{!}{
%     {\begin{tabular}{c|c|c}
%     \hline\hline
%     &Bounds& Remarks\\
%     \hline
%     \multicolumn{3}{c}{trajectory-based Bounds. Pros: less assumptions, can track training dynamics; Cro: Time-Dependent}\\
%     \hline%\hline 
%      Theorem~\ref{thm:isotropic-prior-bound} &  $\mathcal{O}\pr{\sqrt{\frac{d}{n}\ex{}{\log{\frac{h_1}{d}}-\frac{h_2}{d}}}}$  
%      & Isotropic covariance for Gaussian prior     \\
%       Corollary~\ref{cor:langevin-dynamic} &  $\mathcal{O}\pr{\sqrt{\frac{d}{n}\sum_{t=1}^T{{\mathbb{E}_{}{\log\left(\frac{\mathbb{E}{\left|\left|G_t-\tilde{g}_t\right|\right|^2}}{d}+1\right)}}}}}$  
%      & Bound for langevin dynamic; tighter than \citet[Prop.~3.]{neu2021information}   \\
%      Theorem~\ref{thm:anisotropic-prior-bound} &  $\mathcal{O}\pr{\sqrt{\frac{1}{n}\sum_{t=1}^T\ex{}{\tr{\log\frac{\Sigma^\mu_tC_t^{-1}}{b}}}}}$ 
%      &  Population GNC for prior;  tighter than  Thm.~\ref{thm:isotropic-prior-bound} \\
%      \hline
%      \multicolumn{3}{c}{Terminal-State-based Bounds. Pro: time-indepedent; Cro: more assumptions, cannot track training dynamics}\\
%     \hline%\hline 
%     % NAG \\
%      Theorem~\ref{thm:opt-state-inde-bound} &  $\mathcal{O}\pr{\sqrt{\frac{1}{n}\ex{}{\tr{\log\pr{\Lambda^{-1}_{W^*_S}\Lambda_{w^*_\mu}}}}}}$ 
%      &  General result; hard to measure in practice \\
%      Corollary~\ref{cor:pacbayes-anisotropic-prior} & $\mathcal{O}\pr{\sqrt{\frac{1}{n\eta}\ex{}{\tr{\log\pr{\br{H_{w^*}C^{-1}_{T}}\Lambda_{w^*_\mu}}}}}}$ & Under conditions: $H_{w^*}\Lambda_{w^*}=\Lambda_{w^*}H_{w^*}$ and $H_{w^*}\Sigma_T=\mathrm{I}_d$\\
%      Corollary~\ref{cor:pacbayes-isotropic-prior} & $\mathcal{O}\pr{\sqrt{\frac{d}{n}\log\left(\frac{b}{\eta d}\mathbb{E}{||W_S^*-\hat{w}||^2}+1\right)}}$ & $\hat{w}$ is flexible; $\frac{2}{\eta}\gg \lambda_1$; other conditions same  as Cor.~\ref{cor:pacbayes-anisotropic-prior} \\
%      Theorem~\ref{thm:pacbayes-data-dependent-prior} & $\mathcal{O}\pr{\mathbb{E}{\sqrt{\frac{M^2b}{\eta}\mathbb{E}_{}{||W^*_{S}-W^*_{S_J}||^2}}}}$ & Bounded loss; $\Lambda(W_{s_j}^*)=\Lambda(W_{s}^*)$; other conditions same as Cor.~\ref{cor:pacbayes-isotropic-prior}  \\
%     \hline\hline
%     \end{tabular}}}}
%     %}
% %\caption*{{\footnotesize } }
% % \vspace{-1.5em}
% \end{table*}

\subsection{Theoretical Validation of SDE}

To theoretically assess the validity of the SDE in approximating SGD, two essential technical definitions are necessary.
\begin{defn}[Test Functions]
\label{defn:test-func}
    Let $\mathcal{F}$ denote the set of continuous functions ($\mathbb{R}^d\to\mathbb{R}$) with polynomial growth, i.e., for every $f \in \mathcal{F}$, there exist constants $K, \kappa > 0$ s.t. $|f(x)|<K(1+|x|^{\kappa})$ for all $x\in\mathbb{R}^d$.
\end{defn}


\begin{defn}[Order-$\alpha$ weak approximation]
\label{defn:weak-approx}
     Let $\eta\in(0,1)$, $T>0$ and $N=\lfloor T/\eta \rfloor$. Let $\mathcal{F}$ be the set of test functions in Definition~\ref{defn:test-func}.
     We say that the SDE in Eq.~(\ref{eq:ito-sde}) is an order $\alpha$ weak approximation of the SGD in Eq.~(\ref{eq:sgd-update}) if for every $f\in\mathcal{F}$, there exists $C>0$, independent of $\eta$, s.t. for all $k=0, 1, \dots, N$, 
     \[
     \abs{\ex{}{f(\omega_{k\eta})}-\ex{}{f(W_{k})}}\leq C\eta^{\alpha}.
     \]
\end{defn}
Below is a classical result.

\begin{lem}[{\citet[Theorem~1]{li2017stochastic}}]
    Assume $\nabla\ell$ is Lipschitz continuous, has at most linear asymptotic growth and has sufficiently high derivatives belonging to $\mathcal{F}$, then SDE in Eq.~(\ref{eq:ito-sde}) is an order $1$ weak approximation of the SGD in Eq.~(\ref{eq:sgd-update}). Or equivalently, for every
$f \in \mathcal{F}$, there exists $C>0$, independent of $\eta$, s.t. $\max_{k=0, 1, \dots, N}\abs{\ex{}{f(\omega_{k\eta})}-\ex{}{f(W_{k})}}\leq C\eta$.
\label{lem:sde-weak}
\end{lem}

This lemma suggests that SGD and SDE closely track each other when they result in similar distributions of outcomes, such as the returned hypothesis $W$. In addition, the closeness of distributions is formulated through expectations of suitable classes of test functions, as defined in Definition~\ref{defn:test-func}. As mentioned in \cite{li2021validity}, of particular interest for machine learning are test functions like generalization error $\mathcal{E}_\mu$, which may not adhere to formal conditions such as differentiability assumed in classical theory but are still valuable for experimental use. Other typical choices of test functions include weight norm, gradient norm, and the trace of noise covariance.

\subsection{Gaussian Distribution around Local Minimum}
\label{sec:Gaussian-local}
A multi-dimensional Ornstein-Uhlenbeck process is defined as
\begin{align}
    dx_t=-\mathbf{H}x_tdt+\mathbf{B}d\theta_t,
    \label{eq: OU-sde}
\end{align}
where $x_t\in\mathbb{R}^d$, $\mathbf{H}$, $\mathbf{B}$ are $d\times d$ matrices and $\theta_t$ is a $d$-dimensional Wiener process.

% The solution is

Denote the density function of $x_t$ as
$P(x, t)$, then the corresponding Fokker-Planck equation describes the evolution of $P(x, t)$:
\[
\frac{\partial P(x,t)}{\partial t}=\sum_{i=1}^d\frac{\partial}{\partial x_i}\pr{P(x,t)\sum_{j=1}^d\mathbf{H}_{i,j}x_j}+\sum_{i=1}^d\sum_{j=1}^d\mathbf{D}_{i,j}\frac{\partial^2 P(x,t)}{\partial x_i \partial x_j},
\]
where $\mathbf{D}={\mathbf{B}\mathbf{B}^{\bf T}}/{2}$.

Moreover, if $\mathbf{H}$ is positive definite, then a stationary solution of $P$ is given by \citep{freidlin2012randomper}:
\begin{align}
\label{eq:OU-stationary}
    P(x)=\frac{1}{\sqrt{\pr{2\pi}^{d}\mathrm{det}\pr{\Sigma}}}\exp\pr{-\frac{1}{2}x^{\bf T}\Sigma^{-1} x},
\end{align}
where $\Sigma=\ex{}{xx^{\bf T}}$ is the covariance matrix of $x$.

When $w$ is close to any local minimum $w^*$, we can use a second-order Taylor expansion to approximate the value of the loss at $w$, 
\begin{eqnarray}
  L_s(w) \approx L_s(w^*) + \frac{1}{2}(w-w^*)^\mathrm{\bf T} H_{w^*}(w-w^*).
  % \label{eq:second-order-taylor}
\end{eqnarray}
In this case, when $w_{t-1}\to w^*$, we have $G_t=\nabla L_s(w_{t-1})=H_{w^*}\pr{w_{t-1}-w^*}$.
Recall Eq.~(\ref{eq:sgd-update-2}), then
\begin{align*}
    w_{t} =& w_{t-1} - \eta G_t + \eta V_t\notag
    =w_{t-1} - \eta H_{w^*}\pr{w_{t-1}-w^*} + \eta V_t.
\end{align*}
Let $W'_t\triangleq W_t-w^*$ and recall Eq.~(\ref{eq: OU-sde}); we thus have the Ornstein-Uhlenbeck process for $x_t=W'_t$ as
\begin{align}
    dW'_t=-\eta H_{w^*}W'_tdt+\eta\sqrt{C_t}d\theta_t.
\end{align}
By Eq.~(\ref{eq:OU-stationary}), we have
\[
P(W')\propto \exp\pr{-\frac{1}{2}W'^{\bf T}\Lambda_{w^*}^{-1} W'}.
\]
Consequently, the stationary distribution of $W$ for a given $w^*$ is $\mathcal{N}(w^*,\Lambda_{w^*})$.


For the discrete case, we have
\begin{align*}
    w'_{t} =&\pr{\mathrm{I}_d - \eta H_{w^*}}w'_{t-1} + \eta V_t\\
    =&\pr{\mathrm{I}_d - \eta H_{w^*}}^2w'_{t-2} +  \eta\pr{\pr{\mathrm{I}_d - \eta H_{w^*}}V_{t-1}+V_t}\\
    \vdots& \\
    =& \bar{H}^tw'_0+\eta\sum_{i=0}^{t-1}\bar{H}^iV_{t-i},
\end{align*}
where $\bar{H}=\mathrm{I}_d - \eta H_{w^*}$.
Notably, when $t$ is sufficiently large, the first term is negligible (especially with a small learning rate), so we have $w_t'=w_t-w^*=\eta\sum_{i=0}^{t-1}\bar{H}^iV_{t-i}$. When $C_t$ does not change in the long time limit, $W_t'$ is a weighted sum of independent Gaussian random variables, which follows a Gaussian distribution, namely $w_t\sim\mathcal{N}(w^*,\Lambda_{w^*})$. We refer readers to \cite[Theorems~1--2]{liu2021noise} for a relaxed analysis in the discrete case.




\section{Some Useful Facts}



We present the variational representation of mutual information below.
\begin{lem}[{\citet[Corollary~3.1.]{polyanskiy2019lecture}}]
\label{lem:mi-center-gravity}
For two random variables $X$ and $Y$, we have
\[
I(X;Y) = \inf_{P} \ex{X}{\mathrm{D_{KL}}(Q_{Y|X}||P)},
\]
where the infimum is achieved at $P=Q_Y$.
\end{lem}

The following lemma is inspired by the classic Log-sum inequality in \citet[Theorem~2.7.1]{cover2012elements}.
\begin{lem}
\label{lem:log-sum-ineq}
For non-negative numbers $\{a_i\}_{i=1}^n$ and $\{b_i\}_{i=1}^n$,
\[
\sum_{i=1}^n b_i\log\frac{a_i}{b_i}\leq \left(\sum_{i=1}^n b_i \right)\log\frac{\sum_{i=1}^n a_i}{\sum_{i=1}^n b_i},
\]
with equality if and only if $\frac{a_i}{b_i}$ is constant over $i$.
\end{lem}
\begin{proof}
Since $\log$ is a concave function, according to Jensen's inequality, we have
\[
\sum_{i=1}^n\alpha_i \log(x_i)\leq \log(\sum_{i=1}^n\alpha_i x_i),
\]
where $\alpha_i\geq 0$ and $\sum_{i=1}^n \alpha_i=1$.

Let $\alpha_i=\frac{b_i}{\sum_{i=1}^n b_i}$ and $x_i=\frac{a_i}{b_i}$, and plugging them into the inequality above, we have
\[
\sum_{i=1}^n\frac{b_i}{\sum_{i=1}^n b_i} \log(\frac{a_i}{b_i})\leq \log\left(\sum_{i=1}^n\frac{b_i}{\sum_{i=1}^n b_i} \frac{a_i}{b_i}\right)=\log\left(\frac{\sum_{i=1}^n a_i}{\sum_{i=1}^n b_i} \right),
\]
which implies
\[
{\sum_{i=1}^n b_i} \log(\frac{a_i}{b_i})\leq\left(\sum_{i=1}^n b_i\right)\log\left(\frac{\sum_{i=1}^n a_i}{\sum_{i=1}^n b_i} \right).
\]
This completes the proof.
\end{proof}


Below is the KL divergence between two Gaussian distributions $p=\mathcal{N}(\mu_p,\Sigma_p)$ and $q=\mathcal{N}(\mu_q,\Sigma_q)$, where $\mu_p,\mu_q\in\mathbb{R}^d$ and $\Sigma_p,\Sigma_q\in\mathbb{R}^{d\times d}$.
\begin{align}
    \mathrm{D_{KL}}(p||q) = \frac{1}{2}\left[\log\frac{\det(\Sigma_q)}{\det(\Sigma_p)} - d + ({\mu_p}-{\mu_q})^{\bf T}\Sigma_q^{-1}({\mu_p}-{\mu_q}) + tr\left\{\Sigma_q^{-1}\Sigma_p\right\}\right].
    \label{eq:kl-two-gau}
\end{align}

\section{Omitted Proofs and Additional Results in Section %``Generalization Bounds Via Full Trajectories''
\ref{sec:itb-sde}
}

% \subsection{Proof of a variant of Log sum inequality}



\subsection{Proof of Lemma~\ref{lem:mi-unroll}}

% We first unroll the terminal parameters' mutual information $I(W_T;S)$ to the full trajectories' mutual information via the lemma below.
% \begin{lem}
% \label{lem:mi-unroll}
% $I(W_T;S)\leq\sum_{t=1}^T I(- G_t + C_t^{1/2}N_t;S|W_{t-1} ).
% $
% \end{lem}

% This lemma can be proved by recurrently applying the data processing inequality (DPI) and chain rule of the mutual information \citep{polyanskiy2019lecture}. 

\begin{proof}
Recalling the SDE approximation of SGD, i.e., Eq.~(\ref{eq:sgd-update-gaussian}), we have
\begin{eqnarray}
  I(W_T;S)&=&I(W_{T-1} - \eta G_T + \eta C_T^{1/2}N_T;S)\notag\\
  &\leq&I(W_{T-1},- \eta G_T + \eta C_T^{1/2}N_T;S )\label{ineq:dpi}\\
  &=&I(W_{T-1};S)+I(- \eta G_T + \eta C_T^{1/2}N_T;S|W_{T-1} )\label{eq:chain-rule}\\
  &\vdots&\notag\\
  &\leq&\sum_{t=1}^T I(- \eta G_t + \eta C_t^{1/2}N_t;S|W_{t-1} )\notag\\
  &=&\sum_{t=1}^T I(- G_t + C_t^{1/2}N_t;S|W_{t-1} ),\notag
\end{eqnarray}
where Eq.~(\ref{ineq:dpi}) is by the data processing inequality (e.g., if $Z - (X,Y) - (X+Y)$ form a Markov chain, then $I(X+Y;Z)\leq I(X,Y;Z)$), Eq.~(\ref{eq:chain-rule}) is by the chain rule of mutual information, and the learning rate $\eta$ is dropped since mutual information is scale-invariant.
\end{proof}

\subsection{Proof of Lemma \ref{lem:cmi-golden formula}}
\begin{proof}
% Given $P_{\tilde{N}_t}=\mathcal{N}(\tilde{g},\sigma^2 \mathrm{I}_d)$,
For any $t\in[T]$, similar to the proof of Lemma \ref{lem:mi-center-gravity} in \cite{polyanskiy2019lecture}:
\begin{eqnarray}
  &&I(- G_t + C_t^{1/2}N_t;S|W_{t-1}=w_{t-1}) \notag\\
  &=& \cex{S}{w_{t-1}}{\mathrm{D_{KL}}(Q_{\widehat{G}_t|w_{t-1},S}||Q_{\widehat{G}_t|w_{t-1}})}\notag\\
  &=&\cex{S}{w_{t-1}}{\mathrm{D_{KL}}(Q_{\widehat{G}_t|w_{t-1},S}||P_{\widehat{G}_t|w_{t-1}})-\mathrm{D_{KL}}(Q_{\widehat{G}_t|w_{t-1}}||P_{\widehat{G}_t|w_{t-1}})}\notag\\
  &\leq&\cex{S}{w_{t-1}}{\mathrm{D_{KL}}(Q_{\widehat{G}_t|w_{t-1},S}||P_{\widehat{G}_t|w_{t-1}})},\label{ineq:kl-nonnegative}
%   \\
%   &\leq&\inf_{\tilde{g}_t,\sigma_t}\ex{S}{\mathrm{D_{KL}}(P_{- G_t + C_t^{1/2}N_t|W_{t-1}=w_{t-1},S=s}||P_{-\tilde{g}_t+\sigma_t N_t|W_{t-1}=w_{t-1}})},\label{ineq:kl-nonnegative}
\end{eqnarray}
where Eq. (\ref{ineq:kl-nonnegative}) is due to the fact that KL divergence is non-negative, and the equality holds when ${P_{\widehat{G}_t|w_{t-1}}}=Q_{\widehat{G}_t|w_{t-1}}$ for $W_{t-1}=w_{t-1}$.

Thus, we conclude that
\[
I(\widehat{G}_t;S|W_{t-1}=w_{t-1})=\inf_{P_{\widehat{G}_t|w_{t-1}}}\cex{S}{w_{t-1}}{\mathrm{D_{KL}}(Q_{\widehat{G}_t|w_{t-1},S}||P_{\widehat{G}_t|w_{t-1}})}.
\]

Taking expectation over $W_{t-1}$ on both sides above, we have
\[
I(\widehat{G}_t;S|W_{t-1})=\ex{W_{t-1}}{\inf_{P_{\widehat{G}_t|W_{t-1}}}\cex{S}{W_{t-1}}{\mathrm{D_{KL}}(Q_{\widehat{G}_t|W_{t-1},S}||P_{\widehat{G}_t|W_{t-1}})}}.
\]
% \[
% I(- G_t + C_t^{1/2}N_t;S|W_{t-1}) \leq \ex{W_{t-1}}{\inf_{\tilde{g}_t,\sigma_t} \ex{S}{\mathrm{D_{KL}}(P_{- G_t + C_t^{1/2}N_t|W_{t-1},S}||P_{-\tilde{g}_t+\sigma_t N_t|W_{t-1}})}}.
% \]
This completes the proof.
\end{proof}

\subsection{Proof of Theorem \ref{thm:isotropic-prior-bound}}

\begin{proof}

We first prove Eq.~(\ref{ineq:iso-gen-bound}).
Recall Lemma \ref{lem:cmi-golden formula} and assume $C_t$ is a positive-definite matrix,  for any $t\in[T]$, we have
\begin{align}
    &I(- G_t + C_t^{1/2}N_t;S|W_{t-1}=w_{t-1}) \notag\\
  \leq&\inf_{\tilde{g}_t,\sigma_t}\cex{S}{w_{t-1}}{\mathrm{D_{KL}}(Q_{- G_t + C_t^{1/2}N_t|w_{t-1},S}||P_{-\tilde{g}_t+\sigma_t N_t|w_{t-1}})}\notag\\
  =&\inf_{\tilde{g}_t,\sigma_t}\cex{S}{w_{t-1}}{\frac{1}{2}\left[\log\frac{\det(\sigma_t^2\mathrm{I}_d)}{\det(C_t)} - d + \frac{1}{\sigma_t^2}((G_t-\tilde{g}_t)^{\bf T} \mathrm{I}_d^{-1}(G_t-\tilde{g}_t)) + \frac{1}{\sigma_t^2}tr\left\{\mathrm{I}_d^{-1} C_t\right\}\right]}\label{eq:kl-gaussian}\\
  =&\frac{1}{2}\inf_{\tilde{g}_t,\sigma_t}\cex{S}{w_{t-1}}{\frac{1}{\sigma_t^2} \left(||G_t-\tilde{g}_t||^2 +tr\left\{C_t\right\}\right)+d \log{\sigma_t^2}-d -tr\left\{\log{C_t}\right\}},\label{eq:trace-form}
\end{align}
where Eq. (\ref{eq:kl-gaussian}) is by Eq.~(\ref{eq:kl-two-gau}),
% \[
% D_{KL}(p||q) = \frac{1}{2}\left[\log\frac{\det(\Sigma_q)}{\det(\Sigma_p)} - k + ({\mu_p}-{\mu_q})^T\Sigma_q^{-1}({\mu_p}-{\mu_q}) + tr\left\{\Sigma_q^{-1}\Sigma_p\right\}\right],
% \]
 Eq. (\ref{eq:trace-form}) is due to the fact that 
 % $G_t^TG_t=tr\{G_tG_t^T\}$ and 
 $\log\det(C_t)=tr\{\log C_t\}$ when $C_t$ is positive definite. 

Recall that $h_1(w) = \cex{S}{w}{ ||G_t-\tilde{g}_t||^2 +tr\left\{C_t\right\}}$ and $h_2(w) = \cex{S}{w}{tr\left\{\log{C_t}\right\}}$, 
% (here we can fix $\tilde{g}_t=\ex{S}{G_t|W_{t-1}=w_{t-1}}$), 
then we have
\begin{eqnarray}
  &&\frac{1}{2}\inf_{\tilde{g}_t,\sigma_t} \frac{1}{\sigma_t^2} \cex{S}{w_{t-1}}{||G_t-\tilde{g}_t||^2 +tr\left\{C_t\right\}}+d \log{\sigma_t^2}-d-\cex{S}{w_{t-1}}{tr\left\{\log{C_t}\right\}}\notag\\
  &\leq& \frac{1}{2}\inf_{\sigma_t>0} \frac{1}{\sigma_t^2}h_1(w_{t-1})+d \log{\sigma_t^2}-d-h_2(w_{t-1})\notag\\
  &=&\frac{1}{2}d\log{\frac{h_1(w_{t-1})}{d}}-\frac{1}{2}h_2(w_{t-1})\notag,
\end{eqnarray}
where we fix an arbitrary $\tilde{g}_t$ and use the optimal $\sigma^*=\sqrt{\frac{h_1(w_{t-1})}{d}}$.

Plugging everything into Lemma \ref{lem:mi-unroll} and Lemma \ref{lem:xu's-bound} will obtain Eq.~(\ref{ineq:iso-gen-bound}).
% Eq. (\ref{ineq:log-x}) is by $log(x+1)\leq x$. In Eq. (\ref{eq:optim-sample-mi}), we let $\tilde{g}_t=\ex{W_{t-1},S}{G_t}$, then optimize over $\sigma^2$.

We then prove the second part. 
% namely Eq.~(\ref{ineq:iso-pop-bound}). 
Let $\tilde{g}_t=\ex{Z}{\nabla\ell(w_{t-1},Z)}$, then 
\begin{align}
    h_1(W_{t-1}) =& \cex{S}{W_{t-1}}{\left|\left|G_t-\tilde{g}_t\right|\right|^2 +tr\left\{C_t\right\}}\notag\\
    =&\cex{S}{W_{t-1}}{tr\left\{(G_t-\tilde{g}_t)(G_t-\tilde{g}_t)^{\bf T}\right\}}+tr\left\{\cex{S}{W_{t-1}}{C_t}\right\}\notag\\
    =&\frac{1}{n}tr\left\{\Sigma_t^\mu\right\}+\frac{n-b}{b(n-1)}tr\left\{\cex{S}{W_{t-1}}{\Sigma_t}\right\}\label{eq:trace-norm-1}\\
    =&\frac{1}{n}tr\left\{\Sigma_t^\mu\right\}+\frac{n-b}{bn}tr\left\{\Sigma_t^\mu\right\}\label{eq:estimate-variance-1}\\
    =&\frac{1}{b}tr\left\{\Sigma_t^\mu\right\},\notag
\end{align}
where Eq.~(\ref{eq:trace-norm-1}) is by $\ex{S}{(G_t-\tilde{g}_t)(G_t-\tilde{g}_t)^{\bf T}}=\frac{1}{n}\Sigma_t^\mu$ for a given $W_{t-1}=w_{t-1}$ and $C_t=\frac{n-b}{b(n-1)}\Sigma_t$, and Eq.~(\ref{eq:estimate-variance-1}) is by $\ex{S}{\Sigma_t}=\frac{n-1}{n}\Sigma_t^\mu$. 
% Plugging the last equation into the bound in Eq.~(\ref{ineq:iso-gen-bound}) will concludes the proof.
This completes the proof.
\end{proof}

% \begin{rem}
% Unlike information-theoretic generalization bounds of SGLD in the literature, learning rate does not explicitly appear in the bound of Theorem~\ref{thm:isotropic-prior-bound}. This is because the noise random variable has a tunable scaling factor in SGLD while the noise random variable has a fixed scaling factor $\eta$. The latter scaling factor is then dropped since mutual information is scale-invariant (see the proof of Lemma~\ref{lem:mi-unroll} for more details).
% \end{rem}

% \subsection{Proof of Corollary~\ref{cor:population-gradient-bound}}
% \begin{proof}
% Recall $A_1(t)$ in Theorem~\ref{thm:isotropic-prior-bound}, when $\tilde{g}_t=\ex{Z}{\nabla\ell(w_{t-1},Z)}$,
% \begin{align}
%     A_1(t) =& \cex{S}{W_{t-1}}{\left|\left|G_t-\tilde{g}_t\right|\right|^2 +tr\left\{C_t\right\}}\notag\\
%     =&\cex{S}{W_{t-1}}{tr\left\{(G_t-\tilde{g}_t)((G_t-\tilde{g}_t)^T\right\}}+tr\left\{\cex{S}{W_{t-1}}{C_t}\right\}\notag\\
%     =&\frac{1}{n}tr\left\{\Sigma_t^\mu\right\}+\frac{n-b}{b(n-1)}tr\left\{\cex{S}{W_{t-1}}{\Sigma_t}\right\}\label{eq:trace-norm-1}\\
%     =&\frac{1}{n}tr\left\{\Sigma_t^\mu\right\}+\frac{n-b}{bn}tr\left\{\Sigma_t^\mu\right\}\label{eq:estimate-variance-1}\\
%     =&\frac{1}{b}tr\left\{\Sigma_t^\mu\right\},\notag
% \end{align}
% where Eq.~(\ref{eq:trace-norm-1}) is by $\ex{S}{(G_t-\tilde{g}_t)((G_t-\tilde{g}_t)^T}=\frac{1}{n}\Sigma_t^\mu$ for a given $W_{t-1}=w_{t-1}$ and $C_t=\frac{n-b}{b(n-1)}\Sigma_t$, and Eq.~(\ref{eq:estimate-variance-1}) is by $\ex{S}{\Sigma_t}=\frac{n-1}{n}\Sigma_t^\mu$. Plugging the last equation into the bound in Theorem~\ref{thm:isotropic-prior-bound} will concludes the proof.
% \end{proof}

\subsection{Proof of Corollary \ref{cor:langevin-dynamic}}

\begin{proof}
Let $C_t=\mathrm{I}_d$, by Theorem \ref{thm:isotropic-prior-bound}, 
\begin{eqnarray}
  \mathcal{E}_{\mu}(\mathcal{A})&\leq&\sqrt{\frac{R^2}{n}\sum_{t=1}^Td\ex{W_{t-1}}{\log{\frac{\cex{S}{W_{t-1}}{ \left|\left|G_t-\tilde{g}_t\right|\right|^2 +tr\left\{C_t\right\}}}{d}}}-\ex{W_{t-1},S}{tr\left\{\log{C_t}\right\}}}\notag\\
  % &=&\sqrt{\frac{R^2}{n}\sum_{t=1}^Td\ex{W_{t-1}}{\log{\frac{\cex{S}{W_{t-1}}{ \left|\left|G_t-\tilde{g}_t\right|\right|^2 +d}}{d}}}}\notag\\
  &=&\sqrt{\frac{R^2}{n}\sum_{t=1}^Td\ex{W_{t-1}}{\log\pr{\frac{\cex{S}{W_{t-1}}{ \left|\left|G_t-\tilde{g}_t\right|\right|^2 }}{d}+1}}}.\notag
\end{eqnarray}
This completes the proof.
% where Eq. (\ref{ineq:sum-root}) is by $\sqrt{\sum_i x_i}\leq \sum_i \sqrt{x_i}$. This completes the proof.
\end{proof}

\subsection{Proof of Theorem~\ref{thm:anisotropic-prior-bound}}
\begin{proof}
Recall Lemma \ref{lem:cmi-golden formula}, we have
\begin{align}
    &I(- G_t + C_t^{1/2}N_t;S|W_{t-1}=w_{t-1}) \notag\\
  \leq&\inf_{\tilde{c}_t}\cex{S}{w_{t-1}}{\mathrm{D_{KL}}(Q_{\widehat{G}_t|w_{t-1},S}||P_{\widehat{G}_t|w_{t-1}})}\notag\\
  =&\inf_{\tilde{c}_t}\cex{S}{w_{t-1}}{\frac{1}{2}\left[\log\frac{\det(\tilde{c}_t\Sigma^\mu_t)}{\det(C_t)} - d + \frac{1}{\tilde{c}_t}((G_t-\tilde{g}_t)^{\bf T} \left(\Sigma^\mu_t\right)^{-1}(G_t-\tilde{g}_t)) + \frac{1}{\tilde{c}_t}tr\left\{\left(\Sigma^\mu_t\right)^{-1} C_t\right\}\right]}\notag\\
  =&\frac{1}{2}\inf_{\tilde{c}_t} \frac{1}{\tilde{c}_t}tr\left\{\left(\Sigma^\mu_t\right)^{-1}\cex{S}{w_{t-1}}{(G_t-\tilde{g}_t)(G_t-\tilde{g}_t)^{\bf T}}\right\}\notag\\
  &\qquad +\frac{1}{\tilde{c}_t}tr\left\{\left(\Sigma^\mu_t\right)^{-1} \cex{S}{w_{t-1}}{C_t}\right\}+tr\left\{\log\Sigma^\mu_t-\cex{S}{w_{t-1}}{\log C_t}\right\}+d \log{\tilde{c}_t}-d\notag\\
  =&\frac{1}{2}\inf_{\tilde{c}_t} \frac{1}{\tilde{c}_tn}tr\left\{\left(\Sigma^\mu_t\right)^{-1}\Sigma^\mu_t\right\}+\frac{n-b}{\tilde{c}_tbn}tr\left\{\left(\Sigma^\mu_t\right)^{-1} \Sigma^\mu_t\right\}+tr\left\{\log\Sigma^\mu_t-\cex{S}{w_{t-1}}{\log C_t}\right\}+d \log{\tilde{c}_t}-d\label{eq:biased-sample-covariance}\\
  =&\frac{1}{2}\inf_{\tilde{c}_t} \frac{d}{\tilde{c}_tn}+\frac{(n-b)d}{\tilde{c}_tbn}+tr\left\{\log\Sigma^\mu_t-\cex{S}{w_{t-1}}{\log C_t}\right\}+d \log{\tilde{c}_t}-d\notag\\
  =&\frac{1}{2}\inf_{\tilde{c}_t} 
  \frac{d}{b\tilde{c}_t}+d \log{\tilde{c}_t}+tr\left\{\log\Sigma^\mu_t-\cex{S}{w_{t-1}}{\log C_t}\right\}-d\notag\\
  =&\frac{d}{2}\log{\frac{1}{b}}+\frac{1}{2}tr\left\{\log\Sigma^\mu_t-\cex{S}{w_{t-1}}{\log C_t}\right\},\notag
\end{align}
where the last equality holds when $\tilde{c}^*_t=1/{b}$ and Eq.~(\ref{eq:biased-sample-covariance}) is by 
\[
\cex{S}{w_{t-1}}{(G_t-\tilde{g}_t)(G_t-\tilde{g}_t)^{\bf T}}=\frac{1}{n}\Sigma_t^\mu, \quad\text{and}
\]
\[
\cex{S}{w_{t-1}}{C_t}=\frac{n-b}{b(n-1)}\cex{S}{w_{t-1}}{\Sigma_t}=\frac{n-b}{b(n-1)}\frac{n-1}{n}\Sigma_t^\mu=\frac{n-b}{bn}\Sigma_t^\mu.
\]
This completes the proof.
\end{proof}

\subsection{Proof of Lemma~\ref{lem:compare-iso-noniso}}
\begin{proof}
Let the diagonal element of ${\Sigma^\mu_t}/{b}$ in dimension $k$ be $a_k$, then
\begin{align*}
    \sum_{k=1}^d\log a_k\leq
    \left(\sum_{k=1}^d 1\right) \cdot \log\frac{\sum_{k=1}^d a_k}{\sum_{k=1}^d 1}=d\log\frac{tr\left\{\Sigma^\mu_t\right\}}{bd},
\end{align*}
where we invoke Lemma~\ref{lem:log-sum-ineq}. 

This completes the proof.
\end{proof}

% Until now, we have seen some trajectory-based information-theoretic bounds can indeed provide intuitive insights about the generalization of models trained with SGD. However, we may ask are these bounds, which are both distribution-dependent and algorithm-dependent, really non-vacuous? We now provide a negative answer below.

% \begin{thm}
% \label{thm:lower-bound-traj}
%     Denote the trajectory-based information-theoretic bound in Theorem~\ref{thm:anisotropic-prior-bound} as $\mathrm{TrajMI}_\mu(\mathcal{A})$, then we have $\mathrm{TrajMI}_\mu(\mathcal{A})\geq\Omega\pr{\frac{\sqrt{Tbd}}{n}}$.
%     % \Omega\pr{\sqrt{\frac{dT}{n}\log{\frac{n}{n-b}}}}$.
% \end{thm}

% \textcolor{red}{
% \paragraph{Dependency on Dimension} Theorem~\ref{thm:lower-bound-traj} indicates that even the dimension $d$ does not explicitly appear in the bound of Theorem~\ref{thm:anisotropic-prior-bound}, it is still dimension-dependent. Due to such dependence, if $d$ grows faster than $n^2$, then the bound of Theorem~\ref{thm:anisotropic-prior-bound} will be vacuous. In other words, a necessary condition to let Theorem~\ref{thm:anisotropic-prior-bound} be non-vacuous is $d\leq \mathcal{O}(n^2)$. On the one hand, this is a negative result for trajectory-based information-theoretic bounds in explaining the success of deep learning, because given a fixed $T,b$ and $n$, deep neural networks with more parameters often give better generalization performance while the lower bound will grow with $d$. On the other hand, some recent works show that many parameters in DNN can be removed without affecting the generalization \citep{frankle2018the}, and GD/SGD may only need to occur at a subspace of $\mathbb{R}^d$ \citep{li2018measuring,gur2018gradient,larsen2022how}. If we use some ``intrinsic dimension'', $d_{\mathrm{int}}$, to replace $d$, then these trajectory-based information-theoretic bounds can be largely tighter. Notice that such $d_{\mathrm{int}}$ itself is distribution-dependent and architecture-dependent \citep{li2018measuring}.}

% % Consequently, trajectory-based information-theoretic bounds like Theorem~\ref{thm:anisotropic-prior-bound} and Theorem~\ref{thm:isotropic-prior-bound} cannot explain the success of deep learning.

% \textcolor{red}{
% \paragraph{Dependency on Time} Clearly, the current trajectory-based information-theoretic bounds are time-dependent, which has been widely known. While the stability-based bounds for GD/SGD are also time-dependent \citep{hardt2016train,bassily2020stability} (in convex learning), the learning rate in these bounds will mitigate the growth of $T$. However, learning rate does not appear in our trajectory-based information-theoretic bounds, making to the dependency on $T$ even worse.}

% \paragraph{Dependency on Batch Size} Theorem~\ref{thm:lower-bound-traj} also suggests that large batch size may lead to a worse generalization guarantee. This indeed aligns with the practical evidence that large batch size will degrade the performance \citep{jastrzkebski2017three}. In addition, when batch size is a fixed fraction of sample size, then the trajectory-based information-theoretic bounds have a decaying rate no faster than $\mathcal{O}(1/\sqrt{n})$ even for constant $T$ and $d$, which is known as a slower rate.

% One may argue that this negative result of trajectory-based information-theoretic bounds may come from 1) the SDE approximation in Eq.~(\ref{eq:sgd-update-gaussian}) is problematic or inaccurate 2) the information-theoretic bound in Lemma~\ref{lem:xu's-bound} itself is weak, and such limitation may not exist in more advanced input-output information-theoretic bounds; 3) unrolling the mutual information term in Lemma~\ref{lem:mi-unroll}. 

% For the first point, we have already discussed the theoretical justification for the validation of Eq.~(\ref{eq:sgd-update-gaussian}), we will also empirically verify this. Intuitively, we believe modelling the gradient noise as the Gaussian random variable can only reduce the dependence between $W$ and $S$ (smaller $I(W;S)$), that is, Eq.~(\ref{eq:sgd-update-gaussian}) is already an optimistic approximation, the original SGD may have an even worse lower bound. For the second point, we can also prove such limitation still exists for a variant of Lemma~\ref{lem:data-dependent-prior} and the individual mutual information bound in \cite{bu2019tightening}. These results are defered to Appendix~\ref{sec:other bounds}.

% \subsection{Derivation of Eq.~(\ref{ineq:bs-large})}
% \subsection{Proof of Theorem~\ref{thm:lower-bound-traj}}
% \begin{proof}
% We derive the lower bound for the key quantity in the bound of Theorem~\ref{thm:anisotropic-prior-bound} as below.
% \begin{align}
% \cex{S}{w_{t-1}}{\tr{\log\pr{\frac{1}{b}\Sigma^\mu_tC_t^{-1}}}}=&
%     tr\left\{\log\Sigma^\mu_t-\cex{S}{w_{t-1}}{\log C_t}\right\}+d\log\frac{1}{b}\notag\\
%     \geq& tr\left\{\log\Sigma^\mu_t-{\log \cex{S}{w_{t-1}}{C_t}}\right\}+d\log\frac{1}{b}\label{ineq:jensen-bs}\\
%     =& tr\left\{\log\Sigma^\mu_t-\log\frac{n-b}{bn}\Sigma^\mu_t\right\}+d\log\frac{1}{b}\label{eq:sample-variance}\\
%     =& tr\left\{\log\frac{bn}{n-b}\mathrm{I}_d\right\}+d\log\frac{1}{b}\notag\\
%     =&d\log\frac{n}{n-b}\notag\\
%     \geq& \frac{db}{n},\notag
% \end{align}
% where Eq.~(\ref{ineq:jensen-bs}) is by Jensen's inequality,  Eq.~(\ref{eq:sample-variance}) is by
%     $\ex{S}{C_t}=\frac{n-b}{bn}\Sigma_t^\mu$ and the last inequality is by $\log{x}\geq 1-\frac{1}{x}$ for $x>0$.

% Plugging the above into the bound of Theorem~\ref{thm:anisotropic-prior-bound}, we have
% \[
% \sqrt{\frac{R^2}{n}\sum_{t=1}^T\ex{W_{t-1},S}{\tr{\log\frac{\Sigma^\mu_tC^{-1}_t}{b}}}}\geq \frac{R\sqrt{Tdb}}{n}.
% \]
% % Furthermore, by $\log{x}>1-\frac{1}{x}$, we have
% % \[
% % d\log\frac{n}{n-b}>\frac{db}{n}.
% % \]
% This completes the proof.
% \end{proof}

\subsection{Additional Result via Data-Dependent Prior}
\label{sec:other bounds}
In the same spirit as Lemma~\ref{lem:mi-unroll}, to apply Lemma~\ref{lem:data-dependent-prior} to iterative algorithms, we also need the lemma below, which uses the KL divergence of the full training trajectories to upper bound the KL divergence of the final output.
\begin{lem}[{\citet[Proposition ~2.6.]{negrea2019information}}]
\label{lem:kl-decomposition}
Assume that $P_{W_0}=Q_{W_0}$, then $\mathrm{D_{KL}}(P_{W_T}||Q_{W_{T}})\leq\sum_{t=1}^T\ex{W_{0:t-1}}{\mathrm{D_{KL}}(P_{W_{t}|W_{0:t-1}}||Q_{W_{t}|W_{0:t-1}})}.$
\end{lem}

Let $G_{Jt}\triangleq\nabla L_{S_J}(W_{t-1})$, the SDE approximation of this prior updating is defined as:
\[
  W_t = W_{t-1} - \eta G_{Jt}+\eta C^{\frac{1}{2}}_{Jt} N_t,
\]
where $C_{Jt}  = \frac{1}{b}\left(\frac{1}{m}\sum_{i\in J}\nabla \ell_i\nabla \ell_i^{\bf T}-G_{Jt}G_{Jt}^{\bf T}\right)$ is the gradient noise covariance of the prior process. In this case, the prior distribution $P_{\mathcal{G}_{Jt}|W_{0:t-1}}$ will be an anisotropic Gaussian distribution. We also assume $n\gg b$, then $C_t=\frac{1}{b}\Sigma_t$.

We denote the difference between $G_t$ and $G_{Jt}$ by
\[
  \xi_t \triangleq  G_{Jt} - G_t.
\]
To see the relationship between $\xi_t$, $C_{Jt}$ and $C_t$, we present a useful lemma below.
\begin{lem}
If $m=n-1$, then the following two equations hold,
\[\ex{}{\xi_t\xi_t^{\bf T}} = \frac{b}{(n-1)^2}C_t, \quad \ex{}{C_{Jt}}=\frac{n(n-2)}{(n-1)^2}C_t,\]
where the expectation is taken over $J$.
\label{lem:disjoint-var-avg}
\end{lem}
% \begin{proof}
% Let $m=n-1$. Let $\{U\}_{i=1}^n$ be mask random variables. If $Z_i$ is in $S_J$ then the corresponding $U_i=1$, otherwise $U_i=0$. 

% We first notice that 
% $\ex{U_i}{U_i^2} = P(U_i=1) = \frac{n-1}{n}$ and $\ex{U_i,U_j}{U_iU_j} = P(U_i=1,U_j=1) = \frac{1}{n}\cdot\frac{1}{n-1}\cdot\binom{n-1}{2}\cdot2!=\frac{n-2}{n}$ for every $i\neq j$. 

% We now let $M_t=\frac{1}{n}\sum^n_{i=1}\nabla\ell(W_{t-1},Z_i)\nabla\ell(W_{t-1},Z_i)^T$, then $\Sigma_t=M_t-G_t G_t^T$.

% Hence, the following can be obtained.
% \begin{align}
%     &\ex{J}{G_{Jt}G_{Jt}^T}\notag\\
%     =&\ex{J}{\left(\frac{1}{n-1}\sum_{i\in J}\nabla\ell(W_{t-1},Z_i)\right)\left(\frac{1}{n-1}\sum_{i\in J}\nabla\ell(W_{t-1},Z_i)\right)^T}\notag\\
%     =& \frac{1}{(n-1)^2}\left[{\sum^n_{i=1 }\nabla\ell(W_{t-1},Z_i)\nabla\ell(W_{t-1},Z_i)^T\ex{U_i}{U_i^2}+\sum_{i\neq j}\nabla\ell(W_{t-1},Z_i)\nabla\ell(W_{t-1},Z_j)^T\ex{U_i,U_j}{U_iU_j}}\right]\notag\\
%     =& \frac{1}{(n-1)^2}\left[{\frac{n-1}{n}\sum^n_{i=1 }\nabla\ell(W_{t-1},Z_i)\nabla\ell(W_{t-1},Z_i)^T+\frac{n-2}{n}\sum_{i\neq j}\nabla\ell(W_{t-1},Z_i)\nabla\ell(W_{t-1},Z_j)^T}\right]\notag\\
%     =& \frac{1}{(n-1)^2}\left[{\frac{1}{n}\sum^n_{i=1 }\nabla\ell(W_{t-1},Z_i)\nabla\ell(W_{t-1},Z_i)^T+\frac{n-2}{n}\sum^n_{i=1 }\sum^n_{j=1 }\nabla\ell(W_{t-1},Z_i)\nabla\ell(W_{t-1},Z_i)^T}\right] \notag\\
%     =& \frac{1}{(n-1)^2}\left[{M_t+\frac{n-2}{n}\sum^n_{i=1 }\sum^n_{j=1 }\nabla\ell(W_{t-1},Z_i)\nabla\ell(W_{t-1},Z_i)^T}\right].\label{eq:GJ-moment}
% \end{align}

% Then, to have the first equation,
% \begin{align*}
%   \ex{J}{\xi_t\xi_t^T}
%   =& \ex{J}{(G_{Jt} - G_t)(G_{Jt} - G_t)^T}\\
% %   =& G_t G_t^T - 2G_t\ex{J}{G_{Jt}^T}+\ex{J}{G_{Jt}G_{Jt}^T}\\
%   =& \ex{J}{G_{Jt}G_{Jt}^T} - G_t G_t^T \\
%   % =& \ex{J}{\left(\frac{1}{n-1}\sum_{i\in J}\nabla\ell(W_{t-1},Z_i)\right)\left(\frac{1}{n-1}\sum_{i\in J}\nabla\ell(W_{t-1},Z_i)\right)^T} - G_t G_t^T\\
% %   =& \frac{1}{(n-1)^2}\ex{J}{\sum_{i\in J}\sum_{j\in J}\nabla\ell(W_{t-1},Z_i)\nabla\ell(W_{t-1},Z_j)^T} - G_t G_t^T\\
% %   =& \frac{1}{(n-1)^2}\ex{J}{\sum_{i\in J}\nabla\ell(W_{t-1},Z_i)\nabla\ell(W_{t-1},Z_i)^T+\sum_{i\neq j}\nabla\ell(W_{t-1},Z_i)\nabla\ell(W_{t-1},Z_j)^T} - G_t G_t^T\\
%   % =& \frac{1}{(n-1)^2}\left[{\sum^n_{i=1 }\nabla\ell(W_{t-1},Z_i)\nabla\ell(W_{t-1},Z_i)^T\ex{U_i}{U_i^2}+\sum_{i\neq j}\nabla\ell(W_{t-1},Z_i)\nabla\ell(W_{t-1},Z_j)^T\ex{U_i,U_j}{U_iU_j}}\right] - G_t G_t^T\\
%   % =& \frac{1}{(n-1)^2}\left[{\frac{n-1}{n}\sum^n_{i=1 }\nabla\ell(W_{t-1},Z_i)\nabla\ell(W_{t-1},Z_i)^T+\frac{n-2}{n}\sum_{i\neq j}\nabla\ell(W_{t-1},Z_i)\nabla\ell(W_{t-1},Z_j)^T}\right] - G_t G_t^T\\
%   =& \frac{1}{(n-1)^2}\left[{M_t+\frac{n-2}{n}\sum^n_{i=1 }\sum^n_{j=1 }\nabla\ell(W_{t-1},Z_i)\nabla\ell(W_{t-1},Z_i)^T}\right] - G_t G_t^T\\
% %   =& \frac{1}{(n-1)^2}\left[{\frac{1}{n}\sum^n_{i=1 }\nabla\ell(W_{t-1},Z_i)\nabla\ell(W_{t-1},Z_i)^T+\frac{n-2}{n}\sum^n_{i=1 }\sum^n_{j=1 }\nabla\ell(W_{t-1},Z_i)\nabla\ell(W_{t-1},Z_i)^T} - (n-1)^2G_t G_t^T\right]\\
%   =&\frac{1}{(n-1)^2}\left[M_t - G_t G_t^T\right]\\
%   =&\frac{1}{(n-1)^2}\Sigma_t.
% \end{align*}

% Finally, to have the second equation, 
% \begin{align*}
%     \ex{J}{C_{Jt}}
%   =&\frac{1}{b}\ex{J}{\frac{1}{n-1}\sum_{i\in J }\nabla\ell(W_{t-1},Z_i)\nabla\ell(W_{t-1},Z_i)^T - G_{Jt} G_{Jt}^T}\\
%   =&\frac{1}{b}\frac{1}{n}\sum^n_{i=1}\nabla\ell(W_{t-1},Z_i)\nabla\ell(W_{t-1},Z_i)^T - \frac{1}{b}\ex{J}{G_{Jt} G_{Jt}^T}\\
%   =& \frac{1}{b}M_t - \frac{1}{b}\frac{1}{(n-1)^2}\left[M_t+\frac{n-2}{n}\sum^n_{i=1 }\sum^n_{j=1 }\nabla\ell(W_{t-1},Z_i)\nabla\ell(W_{t-1},Z_i)^T\right]\\
%   =& \frac{n(n-2)}{(n-1)^2}\frac{1}{b}(M_t - G_tG_t^T)\\
%   =& \frac{n(n-2)}{(n-1)^2} C_t,
% \end{align*}
% which completes the proof.
% \end{proof}

Instead of using Lemma~\ref{lem:data-dependent-prior}, we invoke the following result which is a simple extension of \cite[Theorem~2.5]{negrea2019information}.
\begin{lem}[{\cite[Theorem~1.]{wang2021optimizing}}]
\label{lem:data-dependent-prior-2}
Assume the loss $\ell(w,Z)$ is bounded in $[0,M]$, the expected generalization gap is bounded by
\[
\mathcal{E}_{\mu}(\mathcal{A})\leq\frac{M}{\sqrt{2}}\mathbb{E}_{S,J}{\sqrt{\mathrm{D_{KL}}(P_{W|S_J}||Q_{W|S})}}
\]
\end{lem}
\paragraph{Comparison with the work of  \cite{wang2021optimizing}}
    \cite{wang2021optimizing} studies the algorithm of SGD with anisotropic noise, while our SDE analysis focuses on GD with anisotropic noise. This means that the discrete gradient noise arising from mini-batch sampling still exists in their analyzed algorithm, whereas the gradient noise is fully modeled as Gaussian in our Section~\ref{sec:IT-SGD}. Moreover, \cite{wang2021optimizing} uses matrix analysis tools to optimize the prior distribution. A significant distinction lies in their optimization analysis, which relies on the assumption that the trace of gradient noise covariance remains unchanged during training (see \textbf{Constraint~1} in their paper). Additionally, their final optimal posterior covariance is derived based on the assumption that the posterior distribution of $W$ is invariant to the data index, see Assumption~1 in their paper. In contrast, our Section~\ref{sec:IT-SGD} avoids making these assumptions and demonstrates the superiority of population gradient noise covariance (GNC) in Lemma~\ref{lem:compare-iso-noniso}, by invoking a variant of the log-sum inequality. In summary, our proof is simpler and more straightforward, while \cite{wang2021optimizing} makes a stronger claim about the optimality of population GNC based on their additional assumptions.


As introduced in \cite{wang2021optimizing}, the subsequent analysis based on the data-dependent prior bound will rely on an additional assumption.
\begin{assum}
\label{ass:invariant}
When $m=n-1$, given dataset $S=s$, the distribution $P_{W_t|J,S_J}$ is invariant of $J$.
\end{assum}
In \cite{wang2021optimizing}, the authors mention that, in practice, $n$ is usually very large, so this assumption reflects the intuition that changing one instance in $S_{J}$ will not change $P_{W_t|J,S_J}$ too much.


We are now in a position to state the following theorem.
\begin{thm}
\label{thm:data-dependent-bound}
Assume the loss $\ell(w,Z)$ is bounded in $[0,M]$ and Assumption 
% \ref{assum-sde} and 
\ref{ass:invariant} holds. Then the expected generalization gap of SGD is bounded by
\[
\mathcal{E}_{\mu}(\mathcal{A})\leq\ex{S}{\sqrt{M^2\sum_{t=1}^T\ex{W_{t-1}}{\left(\frac{(b-1)d}{(n-1)^2}+tr\left\{\ex{J}{\log{C_tC^{-1}_{Jt}}}\right\}\right)}}}.
\]
% where $h_3(W_{t-1}) = tr\left\{\log{C_t}- \ex{}{\log{C_{Jt}}}\right\}$.
\end{thm}

\begin{proof}
By Lemma \ref{lem:data-dependent-prior-2} and Lemma \ref{lem:kl-decomposition}, we have
\begin{align}
  \mathcal{E}_{\mu}(\mathcal{A})\leq&\ex{S,J}{\sqrt{\frac{M^2}{2}\sum_{t=1}^T\ex{W_{0:t-1}|S,J}{\mathrm{D_{KL}}(P_{W_t|W_{0:t-1},S_J}||Q_{W_t|W_{0:t-1},S})}}}\notag\\
  \leq&\ex{S}{\sqrt{\frac{M^2}{2}\sum_{t=1}^T\ex{W_{0:t-1}|S,J}{\ex{J}{\mathrm{D_{KL}}(P_{W_t|W_{0:t-1},S_J}||Q_{W_t|W_{0:t-1},S})}}}},\label{ineq:exchange-expectation}
\end{align}
where Eq. (\ref{ineq:exchange-expectation}) is by Jensen's inequality and Assumption \ref{ass:invariant}.

Recall $\Sigma_t=\frac{1}{n}\sum_{i=1}^n\nabla \ell(W_{t-1},Z_i)\nabla \ell(W_{t-1},Z_i)^{\bf T}-\nabla L_S(W_{t-1})\nabla L_S(W_{t-1})^{\bf T}$ and $C_t = \frac{1}{b}\Sigma_t$.

By the KL divergence between two Gaussian distributions, for any $t\in [T]$, we have
\begin{eqnarray}
  &&\ex{J}{{\mathrm{D_{KL}}(P_{W_t|W_{0:t-1},S_J}||Q_{W_t|W_{0:t-1},S})}}\notag\\&=&\ex{J}{\frac{1}{2}\left(\xi_t^{\bf T} C_{t}^{-1} \xi_t+\log{\frac{\det(C_t)}{\det(C_{Jt})}}+tr\{C_t^{-1}C_{Jt}\}-d\right)}\label{eq:exp-j}\\
  &=&\frac{1}{2}\left(tr\{C_{t}^{-1}\ex{J}{\xi_t  \xi_t^{\bf T}}\}+\ex{J}{\log{\frac{\det(C_t)}{\det(C_{Jt})}}}+\ex{J}{tr\{C_t^{-1}C_{Jt}\}}-d\right)\notag\\
  &=&\frac{1}{2}\left(\frac{1}{(n-1)^2}tr\{C_{t}^{-1}\Sigma_t\}+\ex{J}{\log{\frac{\det(C_t)}{\det(C_{Jt})}}}+\ex{J}{tr\{C_t^{-1}C_{Jt}\}}-d\right)\label{eq:variance-disjoint-set}\\
  &=&\frac{1}{2}\left(\frac{b}{(n-1)^2}tr\{\Sigma_{t}^{-1}\Sigma_t\}+\ex{J}{\log{\frac{\det(C_t)}{\det(C_{Jt})}}}+tr\{C_t^{-1}\ex{J}{C_{Jt}}\}-d\right)\notag\\
%   &=&\frac{1}{2}\left(\frac{bd}{(n-1)^2}+\frac{n(n-2)d}{(n-1)^2}+tr\{\log C_t- \ex{J}{\log{C_{Jt}}}-\mathrm{I}_d\}\right)\\
  &=&\frac{1}{2}\left(\frac{bd}{(n-1)^2}+\frac{n(n-2)d}{(n-1)^2}-d+tr\{\log C_t- \ex{J}{\log{C_{Jt}}}\}\right)\label{eq:average-disjoint-set}\\
  &=&\frac{1}{2}\left(\frac{(b-1)d}{(n-1)^2}+tr\{\log C_t- \ex{J}{\log{C_{Jt}}}\}\right)\notag
%   &\leq&\frac{1}{2}\left(\frac{bd}{(n-1)^2}+tr\{C_t+ \ex{J}{{C^{-1}_{Jt}}}+C_t^{-1}\ex{J}{C_{Jt}}-3\mathrm{I}_d\}\right),
\end{eqnarray}
where Eq. (\ref{eq:variance-disjoint-set}) and Eq. (\ref{eq:average-disjoint-set}) are by Lemma \ref{lem:disjoint-var-avg}. This concludes the proof.
\end{proof}
\begin{rem}
    If the bound in \cite{negrea2019information} is used, then the first term in Eq.~(\ref{eq:exp-j}) is $\xi_t^{\bf T}C_{Jt}^{-1}\xi_t$, where both $C_{Jt}$ and $\xi_t$ depend on $J$, making the bound difficult to analyze.
\end{rem}
 The contribution of $tr\{{\log{C_t}}\}$ to the magnitude of the bound is offset by the term $tr\{\mathbb{E}_{J}{\log{C_{Jt}}}\}$. If we further consider the second-order Taylor expansion of the function $\log C_{Jt}$ around $\mathbb{E}_J[C_{Jt}]$, we obtain the well-known approximation
 \[\ex{}{\log C_{Jt}}\approx\log\ex{}{C_{Jt}}-\mathrm{Var}(C_{Jt})/(2\mathbb{E}^2[C_{Jt}]).\]
 Thus, recall Lemma \ref{lem:disjoint-var-avg}, the difference between $tr\{{\log{C_{t}}}\}$ and  $tr\{\mathbb{E}_{J}{\log{C_{Jt}}}\}$ would become:
\[
\log{(1+1/(n^2-2n))}+\mathrm{Var}(C_{Jt})/(2\mathbb{E}^2[C_{Jt}]).
\]
Thus, the generalization gap should be characterized by the second term above.


When $n\rightarrow \infty$, the first term converges to zero. For the second term, $\mathbb{E}^2[C_{Jt}]$ converges to a constant by Lemma~\ref{lem:disjoint-var-avg}, so the bound is governed by $\mathrm{Var}(C_{Jt})$, which also converges to zero.

% \subsection{Proof of Lemma \ref{lem:disjoint-var-avg}}
% 


% \subsection{Proof of Theorem \ref{thm:data-dependent-bound}}



\section{Omitted Proofs, Additional Results and Discussions in Section 
% ``Generalization Bounds Via Terminal State'' 
\ref{sec:pac-bayes}
}

In fact, this section provides a PAC-Bayes type analysis. The connection between information-theoretic bounds and PAC-Bayes bounds has already been discussed in many previous works \citep{bassily2018learners,hellstrom2020generalization,alquier2021user}. Roughly speaking, the most significant component of a PAC-Bayes bound is the KL divergence between the posterior distribution of a randomized algorithm output and a prior distribution, i.e., $\mathrm{D_{KL}}(Q_{W_T|S}||P_{N})$ for some prior $P_N$. In essence, information-theoretic bounds can be viewed as having the same spirit. For concreteness, in Lemma~\ref{lem:xu's-bound},
$I(W_T;S)=\mathbb{E}_S[\mathrm{D_{KL}}(Q_{W_T|S}||P_{W_T})]$, in which case the marginal $P_{W_T}$ is used as a prior of the algorithm output. Furthermore, by using  Lemma \ref{lem:mi-center-gravity}, we have $I(W_T;S) \leq \inf_{P_N} \mathbb{E}_S[\mathrm{D_{KL}}(Q_{W_T|S}||P_N)]$. Hence, Lemma \ref{lem:xu's-bound} can be regarded as a PAC-Bayes bound with the optimal prior. In addition, the PAC-Bayes framework is usually used to provide a high-probability bound, %(with respect to the randomness of $S$), 
while information-theoretic analysis is applied to bounding the expected generalization error. In this sense, information-theoretic framework is closer to another concept called MAC-Bayes \citep{grunwald2021pac}.

\subsection{Proof of Lemma~\ref{lem:stationary-real}}
% We note that this lemma can be recovered from \citet[Theorem~1. and Theorem~4.]{liu2021noise}, we provide a proof here for self-containing. 
\begin{proof}
%     When $w$ is close to any local minimum $w^*$, we can use a second-order Taylor expansion to approximate the value of the loss at $w$, 
% \begin{eqnarray}
%   L_s(w) \approx L_s(w^*) + \frac{1}{2}(w-w^*)^\mathrm{\bf T} H_{w^*}(w-w^*).
%   % \label{eq:second-order-taylor}
% \end{eqnarray}

% Then, when $w_t\to w^*$, we have 
Recall $G_t=\nabla L_s(w_{t-1})=H_{w^*}\pr{w_{t-1}-w^*}$
and Eq.~(\ref{eq:sgd-update-2}), then
\begin{align*}
    w_{t} =& w_{t-1} - \eta G_t + \eta V_t\notag\\
    =&w_{t-1} - \eta H_{w^*}\pr{w_{t-1}-w^*} + \eta V_t.
\end{align*}

Let $W'_t\triangleq W_t-w^*$. Thus, as $T\to \infty$,
\begin{align*}
    &\ex{W'_{T}}{W'_{T}{W'_{T}}^\mathrm{\bf T}}\\
    =&\ex{W'_{T-1}, V_T}{\pr{W'_{T-1}-\eta H_{w^*}W'_{T-1} + \eta V_T}\pr{W'_{T-1}-\eta H_{w^*}W'_{T-1} + \eta V_T}^\mathrm{\bf T}}\\
    =&\ex{W'_{T-1}}{W'_{T-1}{W'}^\mathrm{\bf T}_{T-1}-\eta H_{w^*}W'_{T-1}{W'}^\mathrm{\bf T}_{T-1}-\eta W'_{T-1}{W'}^\mathrm{\bf T}_{T-1}H_{w^*}+\eta^2H_{w^*}W'_{T-1}{W'}^\mathrm{\bf T}_{T-1}H_{w^*}}\\
     &\qquad\qquad\qquad +\eta^2\ex{V_T}{V_T{V_T}^\mathrm{\bf T}},%\label{eq:use-center-mean}
\end{align*}
where the last equation is by $\cex{V_T}{w_{T-1}}{V_{T}}=0$.

Recall that $\ex{V_T}{V_T{V_T}^\mathrm{\bf T}}=C_T$ and notice that $\ex{W'_{T}}{W'_{T}{W'_{T}}^\mathrm{\bf T}}=\ex{W'_{T-1}}{W'_{T-1}{W'_{T-1}}^\mathrm{\bf T}}=\Lambda_{w^*}$ when $T\to\infty$ (i.e. ergodicity), we have
\[
\Lambda_{w^*} H_{w^*} + H_{w^*} \Lambda_{w^*}-\eta H_{w^*} \Lambda_{w^*}H_{w^*} = \eta C_{T}.
\]

Furthermore,  if $H_{w^*}$ and $\Lambda_{w^*}$ commute, namely $\Lambda_{w^*}H_{w^*}=H_{w^*}\Lambda_{w^*}$, we have 
\[
\br{H_{w^*}\pr{2\mathrm{I}_d-\eta H_{w^*}}}\Lambda_{w^*}=\eta C_T, 
\]
which gives us $\Lambda_{w^*}=\eta \br{H_{w^*}\pr{2\mathrm{I}_d-\eta H_{w^*}}}^{-1}C_{T}$.

This completes the proof.
\end{proof}

% \subsection{Proof of Lemma~\ref{lem:solution-stationary}}
% \begin{proof}



%     Although controversy exists \citep{ziyin2022strength}, 
% % the approximation in Eq. (\ref{eq:approx-hessian-gradient}) below, where 
% Hessian is proportional to the GNC near local minima when the loss is the negative log likelihood. To see this, we first note that the remaining analysis is all based on selecting the log-loss, i.e. cross-entropy loss, as the loss function $\ell$. Thus, when $w_t\to w^*$, we have,
% \[
% \Sigma_{w^*}=\frac{1}{n}\sum_{i=1}^n\nabla \ell_i\nabla \ell_i^T-G_tG_t^T\approx \frac{1}{n}\sum_{i=1}^n\nabla \ell_i\nabla \ell_i^T=F_{w^*},
% \]
% where $F_{w^*}$ is the \textit{Fisher information matrix} (FIM). This approximation is true because gradient noise dominates over  gradient mean near local minima. Moreover, FIM is close to the Hessian near local minima with the log-loss \citep[Chapter~8]{pawitan2001all}, namely, $F_{w^*}\approx H_{w^*}$. Let $n\gg b$, we have
% \begin{align}
% \label{eq:approx-hessian-gradient}
%     H_{w^*} \approx \Sigma_{w^*} =  b C_{w^*}.
% \end{align}
% % \textcolor{red}{Need more justification of this approximation. For Cross-entropy loss?}
% Similar approximation is widely used in the literature \citep{jastrzkebski2017three,zhu2019anisotropic,li2020hessian,xie2020diffusion,xie2021positive,liu2021noise}. Therefore, if Eq.~(\ref{eq:approx-hessian-gradient}) holds, then the solution to the equation in Lemma~\ref{lem:stationary-real} is
% \[
%     \Lambda_{w^*}=\frac{\eta}{b}(2\mathrm{I}_d-\eta H_{w^*})^{-1}.
% \]
% This completes the proof.
% \end{proof}


\subsection{Theorem~\ref{thm:state-bound-dis-prior}: A General Bound}

The following bound can be easily proved by using Eq.~(\ref{eq:kl-two-gau}).
\begin{thm}
\label{thm:state-bound-dis-prior}
Under the same conditions in Lemma~\ref{lem:xu's-bound} and Lemma~\ref{lem:stationary-real}, then for any $P_{W_{T}}=\mathcal{N}\pr{\tilde{w}, \widetilde{\Lambda}}$, where $\tilde{w}$ and $\widetilde{\Lambda}$ are independent of $S$,  we have
\[
\mathcal{E}_{\mu}(\mathcal{A})\leq\sqrt{\frac{R^2}{2n}\inf_{\tilde{w}, \widetilde{\Lambda}}\ex{S,W_S^*}{\log\frac{\mathrm{det}\pr{\widetilde{\Lambda}}}{\mathrm{det}\pr{\Lambda_{W^*_{S}}}}+\tr{\widetilde{\Lambda}^{-1}\Lambda_{W^*_{S}}-\mathrm{I}_d}+\mathrm{d}_{\mathrm{M}}^2\pr{W^*_{S},\tilde{w};\widetilde{\Lambda}}}},
\]
where $\mathrm{d}_{\mathrm{M}}\pr{x,y;\Sigma}\triangleq \sqrt{(x-y)^{\bf T}\Sigma^{-1}(x-y)}$ is the Mahalanobis distance.
\end{thm}


\subsection{Proof of Theorem~\ref{thm:opt-state-inde-bound}}
\begin{proof}
    Let $P_{W_T}=\mathcal{N}\pr{w^*_{\mu},\Lambda_{w^*_{\mu}}}$, then
    \begin{align}
        &\ex{S,W_S^*}{\log\frac{\mathrm{det}\pr{\Lambda_{w^*_{\mu}}}}{\mathrm{det}\pr{\Lambda_{W^*_{S}}}}+\tr{\Lambda_{w^*_{\mu}}^{-1}\Lambda_{W^*_{S}}-\mathrm{I}_d}+\pr{W_S^*-w^*_{\mu}}^{\bf T}\Lambda_{w^*_{\mu}}^{-1}\pr{W_S^*-w^*_{\mu}}}\notag\\
        =&\ex{S,W_S^*}{\log\frac{\mathrm{det}\pr{\Lambda_{w^*_{\mu}}}}{\mathrm{det}\pr{\Lambda_{W^*_{S}}}}+\tr{\Lambda_{w^*_{\mu}}^{-1}\Lambda_{W^*_{S}}-\mathrm{I}_d}+\tr{\Lambda_{w^*_{\mu}}^{-1}\pr{W_S^*-w^*_{\mu}}\pr{W_S^*-w^*_{\mu}}^{\bf T}}}\notag\\
        =&\ex{S,W_S^*}{\log\frac{\mathrm{det}\pr{\Lambda_{w^*_{\mu}}}}{\mathrm{det}\pr{\Lambda_{W^*_{S}}}}}+\tr{\Lambda_{w^*_{\mu}}^{-1}\ex{S,W_S^*}{\Lambda_{W^*_{S}}}-\mathrm{I}_d+\Lambda_{w^*_{\mu}}^{-1}\ex{W_S^*}{\pr{W_S^*-w^*_{\mu}}\pr{W_S^*-w^*_{\mu}}^{\bf T}}}.\label{eq:kl-pop-stacov}
    \end{align}

    Denote $\widetilde{\Sigma}_{\mu}\triangleq\ex{S,W_S^*}{\pr{W_S^*-w^*_{\mu}}\pr{W_S^*-w^*_{\mu}}^{\bf T}}=\ex{W_S^*}{W_S^*{W_S^*}^{\bf T}}-w^*_{\mu}{w^*_{\mu}}^{\bf T}$.

    Notice that
    \begin{align*}
        \ex{S,W_S^*}{\Lambda_{W^*_{S}}}=&\ex{S,W_S^*,W_T}{\pr{W_T-W_S^*}\pr{W_T-W_S^*}^{\bf T}}\\
        =&\ex{W_T}{W_T{W_T}^{\bf T}}-\ex{W_S^*}{W_S^*{W_S^*}^{\bf T}}\\
        =&\ex{W_T}{W_T{W_T}^{\bf T}}-w^*_{\mu}{w^*_{\mu}}^{\bf T}-\pr{\ex{W_S^*}{W_S^*{W_S^*}^{\bf T}}-w^*_{\mu}{w^*_{\mu}}^{\bf T}}\\
        =&\Lambda_{w^*_{\mu}}-\widetilde{\Sigma}_{\mu}.
    \end{align*}

    Therefore,
    \begin{align*}
        &\tr{\Lambda_{w^*_{\mu}}^{-1}\ex{S,W_S^*}{\Lambda_{W^*_{S}}}-\mathrm{I}_d+\Lambda_{w^*_{\mu}}^{-1}\ex{W_S^*}{\pr{W_S^*-w^*_{\mu}}\pr{W_S^*-w^*_{\mu}}^{\bf T}}}\\
        =&\tr{\Lambda_{w^*_{\mu}}^{-1}\ex{S,W_S^*}{\Lambda_{W^*_{S}}}-\Lambda_{w^*_{\mu}}^{-1}\Lambda_{w^*_{\mu}}+\Lambda_{w^*_{\mu}}^{-1}\ex{W_S^*}{\pr{W_S^*-w^*_{\mu}}\pr{W_S^*-w^*_{\mu}}^{\bf T}}}\\
        =&\tr{\Lambda_{w^*_{\mu}}^{-1}\pr{\ex{S,W_S^*}{\Lambda_{W^*_{S}}}-\Lambda_{w^*_{\mu}}+\widetilde{\Sigma}_{\mu}}}\\
        =&0.
    \end{align*}

    Plugging this into Eq.~(\ref{eq:kl-pop-stacov}), we have
    \begin{align*}
        &\ex{S,W_S^*}{\log\frac{\mathrm{det}\pr{\Lambda_{w^*_{\mu}}}}{\mathrm{det}\pr{\Lambda_{W^*_{S}}}}+\tr{\Lambda_{w^*_{\mu}}^{-1}\Lambda_{W^*_{S}}-\mathrm{I}_d}+\pr{W_S^*-w^*_{\mu}}^{\bf T}\Lambda_{w^*_{\mu}}^{-1}\pr{W_S^*-w^*_{\mu}}}\\
        &=\ex{S,W_S^*}{\log\frac{\mathrm{det}\pr{\Lambda_{w^*_{\mu}}}}{\mathrm{det}\pr{\Lambda_{W^*_{S}}}}}=\ex{S,W_S^*}{\tr{\log\pr{\Lambda_{W^*_{S}}^{-1}\Lambda_{w^*_{\mu}}}}}.
    \end{align*}
    
    Finally, applying Theorem~\ref{thm:state-bound-dis-prior} will conclude the proof.
\end{proof}


\subsection{Proof of Corollary~\ref{cor:pacbayes-anisotropic-prior}}
\begin{proof}
% The first part in the statement
The proof is straightforward by plugging $\Lambda_{w^*}= \br{H_{w^*}\pr{\frac{2}{\eta}\mathrm{I}_d}}^{-1}C_{T}$ in Theorem~\ref{thm:opt-state-inde-bound}.
\end{proof}

\subsection{Proof of Corollary~\ref{cor:pacbayes-isotropic-prior}}

\begin{proof}
    By Lemma~\ref{lem:log-sum-ineq}, it's easy to obtain the following bound according to Theorem~\ref{thm:opt-state-inde-bound}.
    \[
\mathcal{E}_{\mu}(\mathcal{A})\leq\sqrt{\frac{R^2d}{2n}\log\pr{\frac{\ex{}{\mathrm{d}^2_{\mathrm{M}}\pr{W_S^*,w^*_\mu;\ex{}{\Lambda_{W^*_S}}}}}{d}+1}+\ex{}{\tr{\log\pr{\Lambda_{W^*_S}^{-1}\ex{}{\Lambda_{W^*_S}}}}}}.
\label{ineq:losser-state-bound}
\]

Then, plugging $\Lambda_{W^*_{S}}  = \frac{\eta}{2b} \mathrm{I}_d$ will conclude the proof.
\end{proof}


\subsection{Corollary~\ref{cor:pacbayes-isotropic-prior-init}: Distance to Initialization}

\begin{cor}
\label{cor:pacbayes-isotropic-prior-init}
Under (i-iii) in Lemma~\ref{lem:stationary-real}, then $
\mathcal{E}_{\mu}(\mathcal{A})\leq\sqrt{\frac{dR^2}{n}\log\left(\frac{2b}{\eta d}\mathbb{E}{||W_S^*-W_{0}||^2}+1\right)}
$.
% where $\tilde{w}=w_\mu^* =\ex{}{W_S^*}$ and the optimal $\sigma^*=\sqrt{\ex{}{||W_S^*-\tilde{w}||^2/d+\frac{\eta }{2b}}}$.
\end{cor}
\begin{proof} 
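Notice that $I(W_T;S)\leq\mathbb{E}_{S}{\mathrm{D_{KL}}(Q_{W_T|S}||P_{W_T})}$ holds for any prior $P_{W_T}$, in particular for the isotropic Gaussian prior $P_{\tilde{w}+\sigma N}=\mathcal{N}(\tilde{w},\sigma^2\mathrm{I}_d)$ with any $\sigma>0$. Then, for a given $\tilde{w}$, we have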
\begin{eqnarray}
      I(W_T;S)&=&\inf_{P_{W_T}} \ex{S}{\mathrm{D_{KL}}(Q_{W_T|S}||P_{W_T})}\notag\\ &\leq&\inf_{\sigma}\ex{S}{\mathrm{D_{KL}}(P_{W^*_{S}+\sqrt{\frac{\eta}{2b}}N, W^*_{S}|S}||P_{\tilde{w}+\sigma N})}\label{ineq:kl-chain}\\
      &=&\inf_{\sigma}\ex{S,W^*_{S}}{\mathrm{D_{KL}}(P_{W^*_{S}+\sqrt{\frac{\eta}{2b}}N|S,W^*_{S}}||P_{\tilde{w}+\sigma N})}\notag\\
      &=& \inf_{\sigma} \frac{1}{2}\ex{S,W^*_{S}}{\frac{1}{\sigma^2}(W^*_{S}-\tilde{w})^{\bf T}(W^*_{S}-\tilde{w})+\log\frac{\sigma^{2d}}{(\eta/2b)^d}+tr\{\frac{\eta}{2b\sigma^2}\mathrm{I}_d\}-d}\notag\\
      &=& \frac{1}{2}\inf_{\sigma} \frac{1}{\sigma^2}\ex{S,W^*_{S}}{||W^*_{S}-\tilde{w}||^2+\frac{\eta d}{2b}} + d\log{\sigma^{2}}+ d\log\frac{2b}{\eta}-d\notag\\
      &=&\frac{1}{2} d\log\left(\frac{2b}{\eta d}\ex{S,W^*_{S}}{||W^*_{S}-\tilde{w}||^2}+1\right),
\end{eqnarray}
where Eq.~(\ref{ineq:kl-chain}) is by the chain rule of KL divergence, and the optimal $\sigma$ is $\sigma^*=\sqrt{\ex{S,W^*_{S}}{||W^*_{S}-\tilde{w}||^2/d+\frac{\eta }{2b}}}$. Letting $\tilde{w}=W_0$ concludes the proof.
\end{proof}

% \subsection{Recover Gradient Norm Based Bound from Theorem~\ref{thm:pacbayes-isotropic-prior}}
Additionally, Corollary~\ref{cor:pacbayes-isotropic-prior-init} can be used to recover a trajectory-based bound.
\begin{cor}
\label{cor:pacbayes-gradient}
Let $W_T=W_s^*$, $\tilde{w}=0$ and W.L.O.G, assume $W_0=0$, then
\[
\mathcal{E}_{\mu}(\mathcal{A})\leq\sqrt{\frac{dR^2}{n}\log\left(\frac{4bT\eta}{d}\sum_{t=1}^T\ex{}{||G_t||^2+tr\{C_t\}}+1\right)}.
\]
\end{cor}
\begin{rem}
In Theorem~\ref{thm:isotropic-prior-bound}, by letting $\tilde{g}=0$ and applying Jensen's inequality, we could also move the summation and the factor $T$ inside the square root. The main difference in Corollary~\ref{cor:pacbayes-gradient} is then that $A_2(t)$ is removed from the bound.
% Since $-tr\{\log{C_t}\}$ is usually has very large magnitude in practice, this improvement is significant.
\end{rem}


\begin{proof}
When $W_0 = 0$, we notice that
\[
W_T = \sum_{t=1}^T \left(-\eta G_t + \eta N_{C_t}\right),
\]
where $N_{C_t} = C_t^{1/2} N_t$.

Thus,
\[
||W_T||^2=\left\|\sum_{t=1}^T \left(-\eta G_t + \eta N_{C_t}\right)\right\|^2\leq 2T\eta^2\sum_{t=1}^T\left(||G_t||^2+||N_{C_t}||^2\right).
\]

Let $\tilde{w}=0$, recall the bound in Corollary~\ref{cor:pacbayes-isotropic-prior-init} and plugging the inequality above, we have
\begin{eqnarray}
  \mathcal{E}_{\mu}(\mathcal{A})&\leq&\sqrt{\frac{R^2}{n}d\log\left(\frac{2b}{\eta d}\ex{S,W_T}{||W_T-\tilde{w}||^2}+1\right)}\notag\\
  &\leq&\sqrt{\frac{dR^2}{n}\log\left(4bT\eta/d\ex{S,W_{0:T-1},N_{C_{0:t-1}}}{\sum_{t=1}^T||G_t||^2+||N_{C_t}||^2}+1\right)}\notag\\
  &=&\sqrt{\frac{dR^2}{n}\log\left(\frac{4bT\eta}{d}\sum_{t=1}^T\ex{S,W_{t-1}}{||G_t||^2+tr\{C_t\}}+1\right)}\notag
\end{eqnarray}
This concludes the proof.
\end{proof}


\subsection{Proof of Theorem \ref{thm:pacbayes-data-dependent-prior}}

\begin{proof}
Let $P_{W_T|S_J=s_j} = \mathcal{N}(W^*_{s_j},\frac{\eta}{2b}\mathrm{I}_d)$, then
\begin{eqnarray}
  \mathrm{D_{KL}}(Q_{W_T|S=s}||P_{W_T|S_J=s_j})&=&\mathrm{D_{KL}}(Q_{W^*_{s}+\sqrt{\frac{\eta}{2b}}N|S=s}||P_{W^*_{s_j}+\sqrt{\frac{\eta}{2b}}N|S_J=s_j})\notag\\
  &\leq&\mathrm{D_{KL}}(Q_{W^*_{s}+\sqrt{\frac{\eta}{2b}}N,W^*_{s}|S=s}||P_{W^*_{s_j}+\sqrt{\frac{\eta}{2b}}N,W^*_{s_j}|S_J=s_j})\label{ineq:kl-chain-2}\\
  &=&\ex{W^*_{s},W^*_{s_j}}{\mathrm{D_{KL}}(Q_{W^*_{s}+\sqrt{\frac{\eta}{2b}}N|W^*_{s},S=s}||P_{W^*_{s_j}+\sqrt{\frac{\eta}{2b}}N|W^*_{s_j},S_J=s_j})}\notag\\
  &=&\ex{W^*_{s},W^*_{s_j}}{\frac{b}{\eta}||W^*_{s}-W^*_{s_j}||^2},\label{eq:bound-data-prior}
\end{eqnarray}
where Eq.~(\ref{ineq:kl-chain-2}) is by the chain rule of KL divergence.
Plugging Eq.~(\ref{eq:bound-data-prior}) into Lemma~\ref{lem:data-dependent-prior} yields the final result.
\end{proof}

% \subsection{Proof of Corollary~\ref{cor:IF-pacbayes-data-prior}}
% \begin{proof}
% % The first part in the statement
% The proof is trivial by plugging the influence function into the bound in Theorem~\ref{thm:pacbayes-data-dependent-prior}.

% For the second part, recall the influence function,
% \[
% W^*_{s_j}-W^*_{s}\approx\frac{1}{n}H^{-1}_{W^*_{s}}\nabla \ell(W^*_{s},z_i).
% \]
% Then,
% \begin{align*}
%     ||W^*_{s}-W^*_{s_j}||^2=&tr\{(W^*_{s_j}-W^*_{s})(W^*_{s_j}-W^*_{s})^T\}\notag\\
%     =&tr\left\{(\frac{1}{n}H^{-1}_{W^*_{s}}\nabla \ell(W^*_{s},z_i))(\frac{1}{n}H^{-1}_{W^*_{s}}\nabla \ell(W^*_{s},z_i))^T\right\}\notag\\
%     =&\frac{1}{n^2}tr\left\{H^{-1}_{W^*_{s}}H^{-1}_{W^*_{s}}(\nabla \ell(W^*_{s},z_i)\nabla \ell(W^*_{s},z_i))^T\right\}.\notag\\
% \end{align*}
% Recall Theorem~\ref{thm:pacbayes-data-dependent-prior}, we have
% \begin{align}
%     \mathcal{E}_{\mu}(\mathcal{A})\leq&\ex{S}{\sqrt{\frac{M^2b}{\eta}\cex{J,W^*_{S},W^*_{S_J}}{S}{{||W^*_{S}-W^*_{S_J}||^2}}}}\label{ineq:jensen-sqrt-1}\\
%     =&\ex{S}{\sqrt{\frac{M^2b}{2\eta}\cex{W^*_{S},W^*_{S_J}}{S}{\ex{J}{||W^*_{S}-W^*_{S_J}||^2}}}}\label{eq:invirant-1}\\
%     =&\frac{M}{n}\ex{S}{\sqrt{\frac{b}{2\eta}\cex{W^*_{S},W^*_{S_J}}{S}{tr\left\{H^{-1}_{W^*_{S}}H^{-1}_{W^*_{S}}\ex{J}{\nabla \ell(W^*_{S},Z_i)\nabla \ell(W^*_{S},Z_i)^T}\right\}}}}\notag\\
%     =&\frac{M}{n}\ex{S}{\sqrt{\frac{b}{2\eta}\cex{W^*_{S},W^*_{S_J}}{S}{tr\left\{H^{-1}_{W^*_{S}}H^{-1}_{W^*_{S}}\Sigma(W^*_{S})\right\}}}}\label{eq:sigma}\\
%     =&\frac{M}{n}\ex{S}{\sqrt{\frac{b}{2\eta}\cex{W^*_{S}}{S}{tr\left\{H^{-1}_{W^*_{S}}\right\}}}},\notag
% \end{align}
% where Eq.~(\ref{ineq:jensen-sqrt-1}) is by Jensen's inequality, Eq.~(\ref{eq:invirant-1}) is by the assumption and Eq.~(\ref{eq:sigma}) is by noticing
% \[
% \ex{J}{\nabla \ell(W^*_{s},z_i)\nabla \ell(W^*_{s},z_i)^T}=\frac{1}{n}\sum_{i=1}^n\nabla \ell(W^*_{s},z_i)\nabla \ell(W^*_{s},z_i)^T=\Sigma(W^*_{s}),
% \]
% since $\nabla L(s)$ is zero at the local minimum $W^*_{s}$. This concludes the proof.
% \end{proof}

% \subsection{Additional Result via More Realistic Learning Rate Setting}
% \label{sec:generalize-lr-condition}


% \begin{lem}[\citet{mandt2017stochastic}]
% Let $C_{w^*}$ be the GNC at $w^*$,
% % If Eq. (\ref{eq:second-order-taylor}) holds, 
% then in the long term limit, the covariance $\Lambda_{w^*}$ satisfies
% \[
% \Lambda_{w^*} H_{w^*} + H_{w^*} \Lambda_{w^*} = \eta C_{w^*}.
% \]
% \label{lem:posterior-covariance}
% Furthermore, the solution to this equation is 
%  $\Lambda_{w^*}=\frac{\eta}{2}H_{w^*}^{-1}C_{w^*}=\frac{\eta}{2b}H_{w^*}^{-1}\Sigma_{w^*}$ when $H_{w^*}$ and $C_{w^*}$ commute.
% \end{lem}
% Lemma~\ref{lem:posterior-covariance} relies on unrealistic small learning rate and state-independent gradient noise. To obtain more general result, we utilize a recent result from \cite{liu2021noise}.
% % \begin{lem}
% % \label{lem:stationary-real}
% % The stationary covariance of $W_T$ satisfies
% % \[
% % \Lambda(w^*) H_{w^*} + H_{w^*} \Lambda(w^*)-\eta H_{w^*} \Lambda(w^*)H_{w^*} = \eta C(w^*),
% % \]
% % If Eq.~\ref{eq:approx-hessian-gradient} holds, then
% % $
% % \Lambda(w^*)=\frac{\eta}{b}(2\mathrm{I}_d-\eta H_{w^*})^{-1}.
% % $
% % \end{lem}

% Then, Theorem~\ref{thm:pacbayes-data-dependent-prior} and Corollary~\ref{cor:IF-pacbayes-data-prior} can be modified in the following Theorem.
% \begin{thm}
% \label{thm:pacbayes-data-dependent-modify}
% Under the same conditions in Theorem~\ref{thm:pacbayes-data-dependent-prior}, let $H_{W_{s_j}^*}\approx H_{W_{s}^*}$ for any $j$, and assume  the distribution $P_{W^*_{S_J}|S_J}$ is invariant of $J$, then
% \[
% \mathcal{E}_{\mu}(\mathcal{A})\leq\frac{M}{n}\mathbb{E}_{S}{\sqrt{\cex{W^*_{S}}{S}{tr\left\{\frac{b}{2\eta}H^{-1}_{W^*_{S}}-\frac{b}{4} \mathrm{I}_d\right\}}}}.
% \]
% \end{thm}
% \begin{rem}
% Compared with Corollary~\ref{cor:IF-pacbayes-data-prior}, the bound in Theorem~\ref{thm:pacbayes-data-dependent-modify} is strictly tighter since there is a negative $\frac{bd}{4}$ term inside the square root. However, the gap between the two bounds is a constant so both the two bounds follow the same trend with respect to the change of the generalization gap.
% \end{rem}

% \begin{proof}
% Notice that now
% \begin{eqnarray*}
%   \mathrm{D_{KL}}(Q_{W_T|S=s}||P_{W_T|S_J=s_j})&\leq&\ex{W^*_{s},W^*_{s_j}}{\frac{1}{2}(W^*_{s}-W^*_{s_j})^T\left(\Lambda(W^*_{S})\right)^{-1}(W^*_{s}-W^*_{s_j})},
% \end{eqnarray*}
% and
% \[
% \left(\Lambda(W^*_{S})\right)^{-1}=\frac{b}{\eta}(2\mathrm{I}_d-\eta H_{W^*_{S}}).
% \]

% Then recall Lemma~\ref{lem:data-dependent-prior},
% \begin{align}
%     \mathcal{E}_{\mu}(\mathcal{A})\leq&\frac{M}{\sqrt{2}}\mathbb{E}_{S,J}{\sqrt{\cex{W^*_{S},W^*_{S_J}}{S,J}{\frac{1}{2}(W^*_{S}-W^*_{S_J})^T\left(\Lambda(W^*_{S})\right)^{-1}(W^*_{S}-W^*_{S_J})}}}\notag\\
%     =&\frac{M}{\sqrt{2}}\mathbb{E}_{S,J}{\sqrt{\cex{W^*_{S},W^*_{S_J}}{S,J}{\frac{1}{2}tr\left\{\frac{b}{\eta}(2\mathrm{I}_d-\eta H_{W^*_{S}})(W^*_{S}-W^*_{S_J})(W^*_{S}-W^*_{S_J})^T\right\}}}}\notag\\
%     =&\frac{M}{n\sqrt{2}}\mathbb{E}_{S,J}{\sqrt{\cex{W^*_{S},W^*_{S_J}}{S,J}{\frac{b}{\eta}tr\left\{(\mathrm{I}_d-\frac{\eta}{2} H_{W^*_{S}})H^{-1}_{W^*_{S}}\nabla \ell(W^*_{S},Z_i)(H^{-1}_{W^*_{S}}\nabla \ell(W^*_{S},Z_i))^T\right\}}}}\notag\\
%     =&\frac{M}{n\sqrt{2}}\mathbb{E}_{S,J}{\sqrt{\cex{W^*_{S},W^*_{S_J}}{S,J}{\frac{b}{\eta}tr\left\{(\mathrm{I}_d-\frac{\eta}{2} H_{W^*_{S}})H^{-1}_{W^*_{S}}\nabla \ell(W^*_{S},Z_i)\nabla \ell(W^*_{S},Z_i)^TH^{-1}_{W^*_{S}}\right\}}}}\notag\\
%     \leq&\frac{M}{n\sqrt{2}}\mathbb{E}_{S}{\sqrt{\cex{W^*_{S}}{S}{\frac{b}{\eta}tr\left\{(\mathrm{I}_d-\frac{\eta}{2} H_{W^*_{S}})H^{-1}_{W^*_{S}}\ex{J}{\nabla \ell(W^*_{S},Z_i)\nabla \ell(W^*_{S},Z_i)^T}H^{-1}_{W^*_{S}}\right\}}}}\notag\\
%     =&\frac{M}{n\sqrt{2}}\mathbb{E}_{S}{\sqrt{\cex{W^*_{S}}{S}{\frac{b}{\eta}tr\left\{H^{-1}_{W^*_{S}}(\mathrm{I}_d-\frac{\eta}{2} H_{W^*_{S}})H^{-1}_{W^*_{S}}\Sigma(W^*_{S})\right\}}}}\notag\\
%     =&\frac{M}{n\sqrt{2}}\mathbb{E}_{S}{\sqrt{\cex{W^*_{S}}{S}{\frac{b}{\eta}tr\left\{H^{-1}_{W^*_{S}}(\mathrm{I}_d-\frac{\eta}{2} H_{W^*_{S}})\right\}}}}\notag\\
%     =&\frac{M}{n}\mathbb{E}_{S}{\sqrt{\cex{W^*_{S}}{S}{tr\left\{\frac{b}{2\eta}H^{-1}_{W^*_{S}}-\frac{b}{4} \mathrm{I}_d\right\}}}}.\notag
% \end{align}
% This concludes the proof.
% \end{proof}


% \begin{figure*}[!ht]
%     \centering
%     \begin{subfigure}[b]{0.245\textwidth}
% \includegraphics[scale=0.28]{figs/stable-plot-cifar10-resnetwobn.pdf}    
% \caption{ResNet on CIFAR10}            \label{fig:resnet-cifar10-stable}
%     \end{subfigure}
% \begin{subfigure}[b]{0.245\textwidth}
% \includegraphics[scale=0.28]{figs/stable-plot-cifa100-resnet.pdf}
% \caption{ResNet on CIFAR100}
%     \label{fig:resnet-cifa100-stable}
% \end{subfigure}
%  \begin{subfigure}[b]{0.245\textwidth}
% \includegraphics[scale=0.28]{figs/weight-plot-cifar10-resnetwobn.pdf}
% \caption{ResNet on CIFAR10}
% \label{fig:resnet-cifar10-weight}
%     \end{subfigure}
% \begin{subfigure}[b]{0.245\textwidth}
% \includegraphics[scale=0.28]{figs/weight-plot-cifa100-resnet.pdf}
% \caption{ResNet on CIFAR100}
% \label{fig:resnet-cifar100-weight}
% \end{subfigure}
% \caption{(a-b) The dynamics of $\eta/2-\lambda_1$. Note that learning rate decays by $0.1$ at the $40,000^{\rm th}$ and the $60,000^{\rm th}$ iteration. (c-d) The distance of current model parameters from its initialization.}\label{fig:Sta-Dynamics-2}
% \end{figure*}

% \section{Experiment Details and Additional Results}
% The implementation in this paper is on PyTorch  \citep{paszke2019pytorch}, and all the experiments are carried out on NVIDIA Tesla V100 GPUs (32 GB). Most experiment settings follow \cite{wu2020noisy}, and the code is also based their implementation, which is available at:  \href{https://github.com/uuujf/MultiNoise}{https://github.com/uuujf/MultiNoise}.

% \subsection{Hyperparameters}
% For CIFAR 10, the initial learning rates used for VGG-11 and ResNet-18 are $0.01$ and $0.1$, respectively. For SVHN, the initial learning rate is $0.05$. For CIFAR100, the initial learning rate is $0.1$.
% The learning rate is then decayed by $0.1$ at iteration $40, 000$ and $60, 000$. If not stated otherwise, the batch size of SGD is $100$. 

\subsection{Additional Empirical Results}


\begin{figure*}[!ht]
    \begin{subfigure}[b]{0.32\textwidth}
    \centering
\includegraphics[scale=0.33]{figs/err-plot-svhn-vgg.pdf}    
\caption{VGG on (small) SVHN}            \label{fig:vgg-svhn-err}
    \end{subfigure}
\begin{subfigure}[b]{0.32\textwidth}
\centering
\includegraphics[scale=0.33]{figs/err-plot-cifar10-vgg.pdf}
\caption{VGG on CIFAR10}
    \label{fig:vgg-cifa10-err}
\end{subfigure}
 \begin{subfigure}[b]{0.32\textwidth}
 \centering
\includegraphics[scale=0.33]{figs/err-plot-cifar10-resnet.pdf}
\caption{ResNet on CIFAR10}
\label{fig:resnet-cifar-err}
    \end{subfigure}
% \begin{subfigure}[b]{0.245\textwidth}
% \includegraphics[scale=0.28]{figs/bound-plot-cifar10-resnet-TS.pdf}
% \caption{VGG on CIFAR10}
% \label{fig:vgg-cifar10-TM-bound}
% \end{subfigure}
\caption{Zoomed-in view of the generalization error.}\label{fig:errs}
\end{figure*}


\begin{figure*}[!ht]
    % \centering
    \begin{subfigure}[b]{0.48\textwidth}
    \centering
\includegraphics[scale=0.4]{figs/bound-plot-cifar10-resnet.pdf}    
\caption{ResNet (Traj. Bound)}            \label{fig:resnet-cifar10-bound}
    \end{subfigure}
    \hfill
\begin{subfigure}[b]{0.48\textwidth}
\centering
\includegraphics[scale=0.4]{figs/bound-plot-cifar10-resnet-TS.pdf}
\caption{ResNet (Term. Bound)}
    \label{fig:resnet-cifar10-bound-TS}
\end{subfigure}
\caption{Estimated trajectory-based bound and terminal-state-based bound, with $R$ excluded. Models are trained on CIFAR10.}\label{fig:resnet-bounds}
\end{figure*}

\section{Additional Result: Inverse Population FIM as both Posterior and Prior Covariance}
% Another choice of the posterior covariance is the inverse population Fisher information matrix,
% \textcolor{red}{since the inverse of the FIM is an estimator of the asymptotic covariance matrix}, 
% which has already been treated as the posterior covariance 
Inspired by previous works \citep{achille2019information,harutyunyan2021estimating,wang2022pacbayes}, we can also select the inverse of the population Fisher information matrix (FIM) $F^\mu_{w^*}=\ex{Z}{\nabla \ell(w^*,Z)\nabla \ell(w^*,Z)^{\bf T}}$ as the posterior covariance. Then, 
% recall Theorem~\ref{thm:pacbayes-data-dependent-prior},
the following theorem is obtained.
\begin{thm}
\label{thm:IF-pacbayes-FIM}
Under the same conditions as in Theorem~\ref{thm:pacbayes-data-dependent-prior}, and assuming that the distribution $P_{W^*_{S_J}|S_J}$ is invariant to $J$, we have
\[
\mathcal{E}_{\mu}(\mathcal{A})\leq\frac{M}{2n}\ex{S}{\sqrt{\cex{W^*_{S}}{S}{tr\{H^{-1}_{W^*_{S}}F^\mu_{W_S^*}\}}}}.
\]
% where $F^\mu_{w^*}=\ex{Z}{\nabla \ell(w^*,Z)\nabla \ell(w^*,Z)^T}$ is the population FIM.
\end{thm}
\begin{rem}
Notice that $F^\mu_{W_S^*}\approx H^\mu_{W^*_{S}}\approx \Sigma^\mu(W_S^*)$ near minima \citep[Chapter~8]{pawitan2001all}, so $tr\{H^{-1}_{W^*_{S}}\Sigma^\mu(W_S^*)\}$ is very close to the Takeuchi Information Criterion \citep{takeuchi1976distribution}. In addition, our bound in Theorem~\ref{thm:IF-pacbayes-FIM} is similar to \citet[Theorem~3]{singh2022phenomenology} with the same convergence rate, although strictly speaking, their result is not a generalization bound. Moreover, as also pointed out in \cite{singh2022phenomenology}, here $H^{-1}_{W^*_{S}}$ is evaluated on the training sample, unlike other works that evaluate the inverse Hessian on the testing sample (e.g., \citet{thomas2020interplay}).
\end{rem}

The invariance assumption is also used in \citet{wang2021optimizing}. In practice, $n$ is usually very large; when $m=n-1$, this assumption indicates that replacing one instance in $s_{j}$ does not make $P_{W^*_{s_j}|s_j}$ too different.

% In practice, $n$ is usually very large, when $m=n-1$, the invariance assumption indicates that replacing one instance in $s_{j}$ will not make $P_{W^*_{s_j}|s_j}$ be too different. 

% Note that Theorem~\ref{thm:IF-pacbayes-FIM} is also based on the influence function (Eq.~(\ref{eq:influnce-function})). However, for the deep neural network training, the approximation made by influence function is often erroneous \citep{basu2021influence}. This, unfortunately, limits the practical application of Theorem~\ref{thm:IF-pacbayes-FIM}. 

% \section{Proof of Theorem~\ref{thm:IF-pacbayes-FIM}}
\begin{proof}[Proof of Theorem~\ref{thm:IF-pacbayes-FIM}]
We now use $(F^\mu_{W_S^*})^{-1}$ as both the posterior and the prior covariance (again, we assume $F^\mu_{W_S^*}\approx F^\mu_{W_{S_j}^*}$ for any $j$). Then,
\begin{align*}
    \mathcal{E}_{\mu}(\mathcal{A})\leq&\ex{S}{\sqrt{\frac{M^2}{4}\cex{J,W^*_{S},W^*_{S_J}}{S}{{\left(W^*_{S}-W^*_{S_J}\right)^{\bf T}F^\mu_{W_S^*}\left(W^*_{S}-W^*_{S_J}\right)}}}}\\
    =&\frac{M}{2n}\ex{S}{\sqrt{\cex{W^*_{S},W^*_{S_j}}{S}{tr\left\{F^\mu_{W_S^*}H^{-1}_{W^*_{S}}H^{-1}_{W^*_{S}}\ex{J}{\nabla \ell(W^*_{S},Z_i)\nabla \ell(W^*_{S},Z_i)^{\bf T}}\right\}}}}\\
    =&\frac{M}{2n}\ex{S}{\sqrt{\cex{W^*_{S}}{S}{tr\left\{F^\mu_{W_S^*}H^{-1}_{W^*_{S}}\right\}}}},
\end{align*}
which completes the proof.
\end{proof}
% In the corrupted label experiments, we train all the models until they achieve $100\%$ training accuracy. Other common techniques such as weight decay and batch normalization are not used in all the experiments. 

% \subsection{Estimation of trace of gradient noise covariance, Hessian and inverse Hessian}
% Quantities of $||G_t||^2$, $tr\{C_t\}$, $tr\{H_t\}$ and $tr\{H^{-1}_t\}$ are all evaluated on the full batch data. Specifically, $W_{t-1}$ is updated by mini-batch SGD and we use the full dataset to calculate these quantities so there will be two forward passes for every iteration.

% Notice that $tr\{C_t\}$ or $tr\{\Sigma_t\}$ is the (scaled) empirical gradient variance, $||G_t-\hat{\mathbb{E}}[G_t]||^2$.  To compute the empirical gradient variance, 
% we utilizing a PyTorch library BackPack \citep{dangel2020backpack}. In addition, to compute the trace of Hessian, we use the PyHessian library \citep{yao2020pyhessian}. 

% Notice that computations of the inverse Hessian matrix is only computationally feasible for models containing small number of parameters. Instead, we indeed compute an upper bound of the trace of it. The \textit{condition number} of the Hessian is defined by
% \[
% \kappa_t=\frac{\lambda_{\max}(H_t)}{\lambda_{\min}(H_t)},
% \]
%  where $\lambda_{\max}(H_t)$ and $\lambda_{\min}(H_t)$ are largest eigenvalue and smallest eigenvalue, respectively. Then, we have the inequalities
% \[
% \frac{d^2}{\kappa_t}\leq tr\{H_t\}tr\{H^{-1}_t\}\leq d^2\kappa_t.
% \]
% Thus, we can estimate the trace of inverse Hessian by the trace of Hessian. However, obtaining $\lambda_{\min}(H_t)$ is still very expensive on networks with non-piecewise linear activations when $d$ is large. In most cases, to simplify the computation, we also treat $\kappa_t$ as a constant.

% In Figure~\ref{fig:train-mnist} and \ref{fig:train-cifar}, the trace of Hessian is re-scaled, and in Figure~\ref{fig:norm-mnist}, \ref{fig:norm-cifar}, \ref{fig:inverse-mnist} and \ref{fig:inverse-cifar}, the empirical generalization gap (training and testing accuracy gap) is re-scaled. Finally, in Figure~\ref{fig:weight-mnist} and \ref{fig:weight-cifar}, all the quantities in the bounds, $\sum_{t=1}^T ||G_t||^2$, $||w_T-w_0||^2$ and $||w_s-w_{s_j}||^2$ are re-scaled.

% % \subsection{License of the Assets}
% % MNIST is made available under the terms of the Creative Commons Attribution-Share Alike 3.0 license. CIFAR10 is licensed under the MIT License.
% % Two open source packages used in this paper, BackPACK and PyHessian, are licensed under the MIT License.


% % \cite{wang2022generalization}



% % \bibliographystyle{plain}
% % \bibliography{ref}

% \begin{figure*}[ht!]
% % \vspace{-5pt}
% \centering
% \input{TrajPlot}
% \caption{SGD training dynamics on MNIST (first row) and CIFAR10 (second row). Some quantities in  are re-scaled, see Appendix for more details.
% %(a)(b) show the bound decaying with the network width. (c)(d) show the bound increasing with the noise level.
% }
% \label{fig:train-dynamic}
% % \vspace{-5pt}
% \end{figure*}
% \end{appendices}