% \vspace{-0.1in}
\section{Generalization Bounds Via Terminal State}
\label{sec:pac-bayes}
% The connection between information-theoretic bounds and PAC-Bays bounds have already been discussed in many previous works \citep{bassily2018learners,hellstrom2020generalization,alquier2021user}. Roughly speaking, the most significant component of a PAC-Bayes bound is the KL divergence between the posterior distribution of a randomized algorithm output and a prior distribution, i.e. $\mathrm{D_{KL}}(Q_{W_T|S}||P_{N})$ for some prior $P_N$. In essence, information-theoretic bounds can be view as having the same spirit. For concreteness, in Lemma \ref{lem:xu's-bound}, 
% $I(W_T;S)=\mathbb{E}_S[\mathrm{D_{KL}}(Q_{W_T|S}||P_{W_T})]$, in which case the marginal $P_{W_T}$ is used as a prior of the algorithm output. Furthermore, by using  Lemma \ref{lem:mi-center-gravity}, we have $I(W_T;S) \leq \inf_{P_N} \mathbb{E}_S[\mathrm{D_{KL}}(Q_{W_T|S}||P_N)]$. Hence, Lemma \ref{lem:xu's-bound} can be regarded as a PAC-Bayes bound with the optimal prior. In addition, the PAC-Bayes framework is usually used to provide a high-probability bound, %(with respect to the randomness of $S$), 
% while information-theoretic analysis is applied to bounding the expected generalization error. In this sense, information-theoretic framework is closer to another concept called MAC-Bayes \citep{grunwald2021pac}.
\begin{figure*}[!ht]
% \vspace{-2mm}
    \centering
    \begin{subfigure}[b]{0.245\textwidth}
\includegraphics[scale=0.28]{figs/hess-plot-svhn-vgg-1.png}    
\caption{VGG on (small) SVHN}            \label{fig:vgg-svhn-hess}
    \end{subfigure}
\begin{subfigure}[b]{0.245\textwidth}
\includegraphics[scale=0.28]{figs/hess-plot-cifar10-vgg-1.png}
\caption{VGG on CIFAR10}
    \label{fig:vgg-cifa10-hess}
\end{subfigure}
 \begin{subfigure}[b]{0.245\textwidth}
\includegraphics[scale=0.28]{figs/hess-plot-cifar10-resnetwobn-1.png}
\caption{ResNet on CIFAR10}
\label{fig:resnet-cifa10-hess}
    \end{subfigure}
\begin{subfigure}[b]{0.245\textwidth}
\includegraphics[scale=0.28]{figs/hess-plot-cifa100-resnet-1.png}
\caption{ResNet on CIFAR100}
\label{fig:resnet-cifa100-hess}
\end{subfigure}
\caption{Hessian-related quantities of SGD or its discrete SDE approximation.}\label{fig:Hess-Dynamics}
% \vspace{-4mm}
\end{figure*}

In this section, we directly upper-bound the generalization error by the properties of the terminal state instead of using the full training trajectory information. Particularly, we will first use the stationary distribution of weights at the end of training as $Q_{W_T|S}$. 
To overcome the explicit time-dependence present in the bounds discussed in Section~\ref{sec:itb-sde}, one has to introduce additional assumptions, with these assumptions being the inherent cost. For example, an important approximation used in this section is the quadratic approximation of the loss. Specifically, let $w_s^*$ be a local minimum for a given training sample $S=s$. When $w$ is close to $w_s^*$, we can use a second-order Taylor expansion to approximate the value of the loss at $w$, 
\begin{eqnarray}
  L_s(w) = L_s(w_s^*) + \frac{1}{2}(w-w_s^*)^\mathrm{\bf T} H_{w_s^*}(w-w_s^*),
  \label{eq:second-order-taylor}
\end{eqnarray}
where $H_{w_s^*}$ is the Hessian matrix of $L_s$ at $w_s^*$. Note that in this case, when $w_t\to w_s^*$, we have $G_t=\nabla L_s(w_t)=H_{w_s^*}\pr{w_t-w_s^*}$. Our remaining analysis assumes the validity of Eq.~(\ref{eq:second-order-taylor}).
% two typical choices of the covariance of posterior $Q_{W_T|S}$, the first one is the stationary distribution of weights at the end of training, in which case the model fluctuates around a local minimum, and the second one is the inverse Fisher information matrix (FIM).

% \subsection{Steady-State Covariance as Posterior Covariance}

% Given that SGD can be well approximated by SDE (e.g., Eq.~\ref{eq:ito-sde}, 
In view of Eq.~(\ref{eq:second-order-taylor}), a classical result by \citet{mandt2017stochastic} shows that the posterior distribution $Q_{W|s}$ around $w_s^*$ is a Gaussian distribution $\mathcal{N}(w_s^*, \Lambda_{w_s^*})$, where $\Lambda_{w^*}\triangleq\ex{}{(W-w^*)(W-w^*)^\mathrm{\bf T}}$ is the covariance of the stationary distribution (see Appendix~\ref{sec:Gaussian-local} for an elaboration).  Furthermore, in the context of nonconvex learning, such as deep learning, where multiple local minima exist, we have multiple $w_s^*$ for a given $S=s$. Therefore, it is necessary to treat the local minimum itself as a random variable for a fixed $s$, denoted as $W_s^*\sim Q_{W_s^*|s}$. In this case, we have $Q_{W|s,w_s^*}=\mathcal{N}(w_s^*, \Lambda_{w_s^*})$ and the posterior distribution $Q_{W|s}=\cex{W_s^*}{s}{\mathcal{N}(W_s^*, \Lambda_{W_s^*})}$ should be a mixture of Gaussian distributions. 


In addition, recall that $I(W_T;S)=\inf_{P_{W_{T}}}\mathbb{E}_{S}{\mathrm{D_{KL}}(Q_{W_T|S}||P_{W_{T}})}$ where $P_{W_{T}}=Q_{W_{T}}$ achieves the infimum. Here, the oracle prior $Q_{W_{T}}=\ex{S,W_S^*}{\mathcal{N}(W_S^*, \Lambda_{W_S^*})}$ is also a mixture of Gaussian distributions. From a technical standpoint, given that the KL divergence between two mixtures of Gaussian distributions does not have a closed-form expression, we turn to analyze its upper bound, namely
% we may not be able to direct compute or lower bound $I(W_T;S)$ without further knowledge even if we know $Q_{W|s,w_s^*}$ is a Gaussian distribution. Thus, 
% we turn to analyze its upper bound, namely 
$\inf_{P_{W_{T}}}\mathbb{E}_{S,W_S^*}{\mathrm{D_{KL}}(Q_{W_T|S,W_S^*}||P_{W_{T}})}$.
% which is an upper bound of $I(W_T;S)$. 
When each $s$ has only one local minimum, $I(W;S)$ reaches this upper bound. \looseness=-1

We are ready to give the terminal-state-based bounds.

\begin{thm}
\label{thm:opt-state-inde-bound}
    % Let $P_{W_{T}}=\mathcal{N}\pr{w^*_{\mu}, \Lambda^{\mu}_{w^*_{\mu}}}$, where $w^*_{\mu}=\ex{}{W_S^*}$ is the average ERM solution, and $\Lambda_{w^*_{\mu}}=\ex{W_T}{\pr{W_T-w^*_{\mu}}\pr{W_T-w^*_{\mu}}^T}$ is the population stationary covariance. 
    % Under the conditions in Lemma~\ref{lem:xu's-bound}. 
    Let $w^*_{\mu}=\ex{}{W_S^*}$ be the expected ERM solution and let $\Lambda_{w^*_{\mu}}=\ex{}{\pr{W_T-w^*_{\mu}}\pr{W_T-w^*_{\mu}}^\mathrm{\bf T}}$ be its corresponding stationary covariance, then
    \begin{align*}
        \mathcal{E}_{\mu}(\mathcal{A})\leq\frac{R}{\sqrt{2n}}\sqrt{\ex{S,W_S^*}{\tr{\log\pr{\Lambda^{-1}_{W^*_S}\Lambda_{w^*_\mu}}}}}. 
        % \label{ineq:state-bound-1}
        % \\
        % &\left.\sqrt{d\log\pr{\frac{\ex{}{\mathrm{d}^2_{\mathrm{M}}\pr{W_S^*,w^*_\mu;\ex{}{\Lambda_{W^*_S}}}}}{d}+1}+\ex{}{\tr{\log\pr{\Lambda_{W^*_S}^{-1}\ex{}{\Lambda_{W^*_S}}}}}}\right\},\label{ineq:state-bound-2}
    \end{align*}
    % where $\mathrm{d}_{\mathrm{M}}\pr{x,y;\Sigma}\triangleq \sqrt{(x-y)^T\Sigma^{-1}(x-y)}$ is the Mahalanobis distance, and all the expectation above are taken over $\pr{S,W_S^*}\sim Q_{S,W_S^*}$.
\end{thm}
This result bears resemblance to Theorem~\ref{thm:anisotropic-prior-bound} since both involve the alignment between a population covariance matrix and a sample (or batch) covariance matrix.

Note that $\Lambda_{w^*_{\mu}}\!=\!\ex{}{\pr{W^*_S-w^*_{\mu}}\pr{W^*_S-w^*_{\mu}}^\mathrm{\bf T}}\!+\!\ex{}{\Lambda_{W^*_S}}$. By Jensen's inequality, we can move the expectation over $W_s^*$  inside the logarithmic function. Additionally, if $\ex{W^*_s}{\Lambda_{W^*_s}^{-1}\ex{}{\Lambda_{W^*_s}}}$ is close to the identity matrix---especially evident in scenarios where each $s$ has only one minimum, as in convex learning---we obtain the upper bound 
$
\mathcal{O}\pr{\sqrt{{\ex{}{\mathrm{d}^2_{\mathrm{M}}\pr{W_S^*,w^*_\mu;{\Lambda_{W^*_S}}}}}/{n}}},
$
where $\mathrm{d}_{\mathrm{M}}\pr{x,y;\Sigma}\triangleq \sqrt{(x-y)^{\bf T}\Sigma^{-1}(x-y)}$ is the Mahalanobis distance. Intuitively, this quantity measures the sensitivity of a local minimum to the combined randomness introduced by both the algorithm and the training sample, relative to its local geometry.

In practice, one can estimate $\Lambda_{w^*_{\mu}}$ and $\Lambda_{w^*_{s}}$ by repeatedly conducting training processes and storing numerous checkpoints at the end of each training run. This is still much easier than estimating $I(W;S)$ directly. As an alternative strategy, one may leverage the analytical expression available for $\Lambda_{w_s^*}$.
% Unlike Theorem~\ref{thm:anisotropic-prior-bound}, where we can utilize $\cex{S}{w_{t-1}}{C_t}=\frac{n-b}{bn}\Sigma_t^\mu$ to further develop a dimension-dependent lower bound in Theorem~\ref{thm:lower-bound-traj}. In Theorem~\ref{thm:opt-state-inde-bound}, it's not clear the relationship between $\ex{S,W_S^*}{\Lambda_{W_S^*}}$ and $\Lambda_{w_\mu^*}$. Thus, we need more analytic form for $\Lambda_{W_S^*}$.
\citet{mandt2017stochastic} provide such an analysis and give an equation to solve for $\Lambda_{w_s^*}$. However, the result in \citet{mandt2017stochastic} relies on an unrealistically small learning rate, and the GNC in their analysis is regarded as a state-independent covariance matrix. To overcome these limitations, we give the following result under a quadratic approximation of the loss, which is refined from \citet[Theorem~1]{liu2021noise} by using the state-dependent GNC.
% a more general result is given by \cite{liu2021noise}, as stated below.
% Particularly, 
% when $w$ is close to any local minimum $w^*$, we can use a second-order Taylor expansion to approximate the value of the loss at $w$, 
% \begin{eqnarray}
%   L_s(w) \approx L_s(w^*) + \frac{1}{2}(w-w^*)^T H_{w^*}(w-w^*),
%   \label{eq:second-order-taylor}
% \end{eqnarray}
%  where
% let $H_{w^*}$ be the Hessian matrix of $s$ at $w^*$, 
 % Let $\Lambda(w^*)\triangleq\ex{}{(W-w^*)(W-w^*)^T}$ be defined as the covariance of the stationary distribution of SGD, 
% then the following is given by a classical result:
%  \begin{align}
%      \Lambda(w^*) H_{w^*} + H_{w^*} \Lambda(w^*) = \eta C(w^*). \label{eq:constant-posterior-covariance}
%  \end{align}
% % Note that $\Lambda(w^*)=\frac{1}{2\eta}H_{w^*}^{-1}C(w^*)=\frac{b}{2\eta}H_{w^*}^{-1}\Sigma(w^*)$. 
% However, as also pointed out by the literature, one deficiency of Eq.~(\ref{eq:constant-posterior-covariance}) is that it relies on the unrealistic small learning rate and state-independent gradient noise covariance. Instead of using Eq.~(\ref{eq:constant-posterior-covariance}), we invoke a more realistic estimation of $\Lambda(w^*)$ given in \citet{liu2021noise}, 
% \begin{lem}[{\citet[Theorem~1.]{liu2021noise}}]
% \label{lem:stationary-real}
% The stationary covariance of $W_T$ satisfies
% \[
% \Lambda(w^*) H_{w^*} + H_{w^*} \Lambda(w^*)-\eta H_{w^*} \Lambda(w^*)H_{w^*} = \eta C(w^*).
% \]
% % If Eq.~\ref{eq:approx-hessian-gradient} holds, then
% % $
% % \Lambda(w^*)=\frac{\eta}{b}(2\mathrm{I}_d-\eta H_{w^*})^{-1}.
% % $
% \end{lem}
 
%  , the covariance $\Lambda(w^*)\triangleq\ex{}{(W-w^*)(W-w^*)^T}$ of the stationary distribution of SGD is given by a classical result \cite{mandt2017stochastic}:
% \begin{lem}[\citet{mandt2017stochastic}]
% Let $C_{w^*}$ be the GNC at $w^*$,
% % If Eq. (\ref{eq:second-order-taylor}) holds, 
% then in the long term limit, the covariance $\Lambda_{w^*}$ satisfies
% \[
% \Lambda_{w^*} H_{w^*} + H_{w^*} \Lambda_{w^*} = \eta C_{w^*}.
% \]
% \label{lem:posterior-covariance}
% Furthermore, the solution to this equation is 
%  $\Lambda_{w^*}=\frac{\eta}{2}H_{w^*}^{-1}C_{w^*}=\frac{\eta}{2b}H_{w^*}^{-1}\Sigma_{w^*}$ when $H_{w^*}$ and $C_{w^*}$ commute.
% \end{lem}


\begin{lem}
\label{lem:stationary-real}
% Let $H_{w^*}$ be the Hessian matrix of $s$ at $w^*$. If $ L_s(w) \approx L_s(w^*) + \frac{1}{2}(w-w^*)^T H_{w^*}(w-w^*)$ holds when $w$ is close to any local minimal $w^*$,
% If Eq. (\ref{eq:second-order-taylor}) holds, 
% then i
In the long term limit, we have
% the covariance $\Lambda_{w^*}$ satisfies
$
\Lambda_{w^*} H_{w^*} + H_{w^*} \Lambda_{w^*}-\eta H_{w^*} \Lambda_{w^*}H_{w^*} = \eta C_{T}.
$
Moreover, consider the following conditions: 
% \begin{itemize}
\begin{enumerate}[topsep=-0.1cm, parsep=-0cm, align=parleft, labelsep=0.6cm, label=(\roman*)]
    \item $H_{w^*}\Lambda_{w^*}=\Lambda_{w^*}H_{w^*}$;
    \item $H_{w^*}^{-1}\Sigma_T= \mathrm{I}_d$;
    \item $\frac{2}{\eta}\gg \lambda_1$, where $\lambda_1$ is the top-$1$ eigenvalue of $H_{w^*}$.
\end{enumerate}
% \end{itemize}
% (i) $H_{w^*}\Lambda_{w^*}=\Lambda_{w^*}H_{w^*}$; 
% % and $\Lambda_{w^*}$ commute; 
% (ii) $H_{w^*}^{-1}\Sigma_T= \mathrm{I}_d$; (iii) $\frac{2}{\eta}\gg \lambda_1$ where $\lambda_1$ is the top-$1$ eigenvalue of $H_{w^*}$. 
Then, given (i), we have $\Lambda_{w^*}\!=\! \br{H_{w^*}\pr{\frac{2}{\eta}\mathrm{I}_d\!-\! H_{w^*}}}^{-1}\!C_{T}$; given (i-ii), we have $\Lambda_{w^*}= \frac{1}{b}(\frac{2}{\eta}\mathrm{I}_d-H_{w^*})^{-1}$; given (i-iii), we have $\Lambda_{w^*}  = \frac{\eta}{2b} \mathrm{I}_d$.
% If Eq.~\ref{eq:approx-hessian-gradient} holds, then
% $
% \Lambda_{w^*}=\frac{\eta}{b}(2\mathrm{I}_d-\eta H_{w^*})^{-1}.
% $
\end{lem}
% \begin{rem}
%     Note that this result does not rely on Eq.~(\ref{eq:sgd-update-gaussian}), and it holds for any type of gradient noise $V_t$ as long as its covariance is $C_t$. 
%     % In addition, the classical result of  a continues analysis of SGD in \cite[Eq.~(13)]{mandt2017stochastic} (see Lemma~\ref{lem:posterior-covariance}) also gives us $\Lambda_{w^*}  = \frac{\eta}{2b} \mathrm{I}_d$. We recover this result under a mild condition instead of assuming $\eta\ll 1$.
% \end{rem}

\begin{figure*}[!ht]
% \vspace{-3mm}
    \centering
    \begin{subfigure}[b]{0.245\textwidth}
\includegraphics[scale=0.28]{figs/stable-plot-svhn-vgg-1.png}    
\caption{VGG on (small) SVHN}            \label{fig:vgg-svhn-stable}
    \end{subfigure}
\begin{subfigure}[b]{0.245\textwidth}
\includegraphics[scale=0.28]{figs/stable-plot-cifar10-vgg-1.png}
\caption{VGG on CIFAR10}
    \label{fig:vgg-cifa10-stable}
\end{subfigure}
 \begin{subfigure}[b]{0.245\textwidth}
\includegraphics[scale=0.28]{figs/weight-plot-svhn-vgg-1.png}
\caption{VGG on (small) SVHN}
\label{fig:vgg-svhn-weight}
    \end{subfigure}
\begin{subfigure}[b]{0.245\textwidth}
\includegraphics[scale=0.28]{figs/weight-plot-cifar10-vgg-1.png}
\caption{VGG on CIFAR10}
\label{fig:vgg-cifar10-weight}
\end{subfigure}
\begin{subfigure}[b]{0.245\textwidth}
\includegraphics[scale=0.28]{figs/stable-plot-cifar10-resnetwobn.pdf}    
\caption{ResNet on CIFAR10}            \label{fig:resnet-cifar10-stable}
    \end{subfigure}
\begin{subfigure}[b]{0.245\textwidth}
\includegraphics[scale=0.28]{figs/stable-plot-cifa100-resnet.pdf}
\caption{ResNet on CIFAR100}
    \label{fig:resnet-cifa100-stable}
\end{subfigure}
 \begin{subfigure}[b]{0.245\textwidth}
\includegraphics[scale=0.28]{figs/weight-plot-cifar10-resnetwobn.pdf}
\caption{ResNet on CIFAR10}
\label{fig:resnet-cifar10-weight}
    \end{subfigure}
\begin{subfigure}[b]{0.245\textwidth}
\includegraphics[scale=0.28]{figs/weight-plot-cifa100-resnet.pdf}
\caption{ResNet on CIFAR100}
\label{fig:resnet-cifar100-weight}
\end{subfigure}
\caption{(a-b,e-f) The dynamics of $2/\eta-\lambda_1$. Note that the learning rate decays by $0.1$ at the $40,000^{\rm th}$ and the $60,000^{\rm th}$ iteration. (c-d,g-h) The distance of the current model parameters from their initialization.}\label{fig:Sta-Dynamics}
% \vspace{-4mm}
\end{figure*}
% \begin{figure*}[!ht]
%     \centering
%     \begin{subfigure}[b]{0.245\textwidth}
% \includegraphics[scale=0.28]{figs/stable-plot-cifar10-resnetwobn.pdf}    
% \caption{ResNet on CIFAR10}            \label{fig:resnet-cifar10-stable}
%     \end{subfigure}
% \begin{subfigure}[b]{0.245\textwidth}
% \includegraphics[scale=0.28]{figs/stable-plot-cifa100-resnet.pdf}
% \caption{ResNet on CIFAR100}
%     \label{fig:resnet-cifa100-stable}
% \end{subfigure}
%  \begin{subfigure}[b]{0.245\textwidth}
% \includegraphics[scale=0.28]{figs/weight-plot-cifar10-resnetwobn.pdf}
% \caption{ResNet on CIFAR10}
% \label{fig:resnet-cifar10-weight}
%     \end{subfigure}
% \begin{subfigure}[b]{0.245\textwidth}
% \includegraphics[scale=0.28]{figs/weight-plot-cifa100-resnet.pdf}
% \caption{ResNet on CIFAR100}
% \label{fig:resnet-cifar100-weight}
% \end{subfigure}
% \caption{(a-b) The dynamics of $\eta/2-\lambda_1$. Note that learning rate decays by $0.1$ at the $40,000^{\rm th}$ and the $60,000^{\rm th}$ iteration. (c-d) The distance of current model parameters from its initialization.}\label{fig:Sta-Dynamics-2}
% \end{figure*}

Notably, all the conditions in Lemma~\ref{lem:stationary-real} are only discussed in the context of the terminal state of SGD training. Regarding condition (ii), it is widely assumed in the literature \citep{jastrzkebski2017three,zhu2019anisotropic,li2020hessian,xie2020diffusion,xie2021positive,liu2021noise} that the Hessian is proportional to the GNC near local minima when the loss is the negative log-likelihood, i.e., the cross-entropy loss. To see this, when $w_t\to w^*$, we have $
\Sigma_{w^*}=\frac{1}{n}\sum_{i=1}^n\nabla \ell_i\nabla \ell_i^\mathrm{\bf T}-G_tG_t^\mathrm{\bf T}\approx \frac{1}{n}\sum_{i=1}^n\nabla \ell_i\nabla \ell_i^\mathrm{\bf T}=F_{w^*}$, where $F_{w^*}$ is the \textit{Fisher information matrix} (FIM). This approximation is true because gradient noise dominates over gradient mean near local minima. Moreover, FIM is close to the Hessian near local minima with the log-loss \citep[Chapter~8]{pawitan2001all}, namely, $F_{w^*}\approx H_{w^*}$. Let $n\gg b$, we have
$H_{w^*} \approx \Sigma_{w^*} =  b C_{w^*}$. Consequently, when $\Sigma_{T}$ is sufficiently close to $\Sigma_{w^*}$, condition (ii) is satisfied. It's important to note that the debate surrounding $H_{w^*}\approx F_{w^*}$ arises when the loss function deviates from cross-entropy \citep{ziyin2022strength}.
% Similar approximation is widely used in the literature \citep{jastrzkebski2017three,zhu2019anisotropic,li2020hessian,xie2020diffusion,xie2021positive,liu2021noise}. 


For condition (iii), the initial learning rate is typically set at a high value, and this condition may not be satisfied until the learning rate undergoes decay in the later stages of SGD training. This observation is evident in Figures~\ref{fig:vgg-svhn-stable}--\ref{fig:vgg-cifa10-stable}, where the condition becomes easily met at the terminal state following the learning rate decay. Moreover, the interplay between $\frac{2}{\eta}$ and $\lambda_1$ is extensively explored in the context of the {\em edge of stability} \citep{wu2018sgd,cohen2021gradient,arora2022understanding}, which suggests that during the training of GD, $\lambda_1$ approaches $\frac{2}{\eta}$ and hovers just above it in the ``edge of stability'' regime. 
In the context of Theorem~\ref{thm:opt-state-inde-bound},  as indicated by Lemma~\ref{lem:stationary-real}, the diagonal elements of $\Lambda_{w_s^*}$ tend to be close to zero before reaching the ``edge of stability'' regime. Consequently, the bound presented in Theorem~\ref{thm:opt-state-inde-bound} diverges to infinity.
% In this case, as indicated by Lemma~\ref{lem:stationary-real}, the diagonal elements of $\Lambda_{w_s^*}$ tend to be close to zero before reaching the ``edge of stability''. Consequently, the bound presented in Theorem~\ref{thm:opt-state-inde-bound} diverges to infinity. 
This, as a by-product, provides an alternative explanation for the failure mode of $I(W;S)\to\infty$ for deterministic algorithms 
% aligns with the fact that $I(W;S)$ may approach infinity for deterministic algorithms, 
(e.g., GD with a fixed initialization).


% \begin{lem}
%     \label{lem:solution-stationary}
%     Let $\ell$ be log-loss, i.e. cross-entropy loss and assume $n \gg b$. For a local minimum $w^*$, we have $H_{w^*}\approx F_{w^*}\approx \Sigma_{w^*}$, where $F_{w^*}=\frac{1}{n}\sum_{i=1}^n\nabla\ell_i\nabla\ell^T_i$ is the \textit{Fisher information matrix} (FIM).  If we further assume $\Sigma_{w^*}\approx b C_{T}$ and $\eta/2>\lambda^{\rm max}_{w^*}$, where $\lambda^{\rm max}_{w^*}$ is the largest eigenvalue of $H_{w^*}$. Thus, the covariance matrix of SGD stationary distribution is
% \begin{align}
%     \Lambda_{w^*}=\frac{\eta}{b}(2\mathrm{I}_d-\eta H_{w^*})^{-1}.
%     \label{eq:solution-stationary}
% \end{align}
% \end{lem}


% \[
% \sqrt{d\log\pr{\frac{\ex{}{\mathrm{d}^2_{\mathrm{M}}\pr{W_S^*,w^*_\mu;\ex{}{\Lambda_{W^*_S}}}}}{d}+1}+\ex{}{\tr{\log\pr{\Lambda_{W^*_S}^{-1}\ex{}{\Lambda_{W^*_S}}}}}}
% \]

% \begin{rem}
% As pointed out by the literature, one deficiency of Lemma~\ref{lem:posterior-covariance} is that it relies on the small learning rate and state-independent gradient noise covariance. By invoking a more realistic estimation of $\Lambda(w^*)$ given in \cite[Theorem~1.]{liu2021noise}, it is a simple matter to generalize the remaining analysis in this paper. We will elaborate more on this in Appendix.
% \end{rem}

% \textcolor{red}{Modify this to realistic learning rate case.}

% Follow the empirical risk minimization principal, 
% Normally, the algorithm will output $W_T$ around a local minimum, $W_T\sim \mathcal{N}(W_s^*, \Lambda(W_s^*))$. Then recall 
% Notice that $I(W_T;S)=\inf_{P_{W_{T}}}\mathbb{E}_{S}{\mathrm{D_{KL}}(Q_{W_T|S}||P_{W_{T}})}$ where the prior distribution $P_{W_{T}}=Q_{W_{T}}$ achieves the infimum (see Lemma~\ref{lem:mi-center-gravity}). Given that we now know $Q_{W_T|s} = \mathcal{N}(W_s^*,\frac{\eta}{2b}H_{W_s^*}^{-1}\Sigma_{W_s^*})$,  we can choose the isotropic Gaussian distribution as  $P_{W_{T}}$. For example, let $P_{W_{T}}=\mathcal{N}(\tilde{w}, \sigma^2 \mathrm{I}_d)$ wherein $\tilde{w}$ and $\sigma$ can be optimized and are independent of $S$. 
% could be any constant vector that does not depend on $S$
% using the population covariance of stationary distribution, $\ex{S}{\Lambda(W_S^*)}$, will make the bound too complicated. Then 
% the isotropic Gaussian distribution, $\mathcal{N}(\tilde{w}, \sigma^2 \mathrm{I}_d)$ wherein $\tilde{w}$ could be any constant vector that does not depend on $S$, is suitable if $\Lambda(W_s^*)$ is also isotropic.

% In fact, although controversy exists \citep{ziyin2022strength}, 
% % the approximation in Eq. (\ref{eq:approx-hessian-gradient}) below, where 
% Hessian is proportional to the GNC near local minima when the loss is the negative log likelihood. To see this, we first note that the remaining analysis is all based on selecting the log-loss, i.e. cross-entropy loss, as the loss function $\ell$. Thus, when $w_t\to w^*$, we have,
% \[
% \Sigma_{w^*}=\frac{1}{n}\sum_{i=1}^n\nabla \ell_i\nabla \ell_i^T-G_tG_t^T\approx \frac{1}{n}\sum_{i=1}^n\nabla \ell_i\nabla \ell_i^T=F_{w^*},
% \]
% where $F_{w^*}$ is the \textit{Fisher information matrix} (FIM). This approximation is true because gradient noise dominates over  gradient mean near local minima. Moreover, FIM is close to the Hessian near local minima with the log-loss \citep[Chapter~8]{pawitan2001all}, namely, $F_{w^*}\approx H_{w^*}$. Let $n\gg b$, we have
% \begin{align}
% \label{eq:approx-hessian-gradient}
%     H_{w^*} \approx \Sigma_{w^*} =  b C_{w^*}.
% \end{align}
% % \textcolor{red}{Need more justification of this approximation. For Cross-entropy loss?}
% Similar approximation is widely used in the literature \citep{jastrzkebski2017three,zhu2019anisotropic,li2020hessian,xie2020diffusion,xie2021positive,liu2021noise}. Therefore, if Eq.~(\ref{eq:approx-hessian-gradient}) holds, then the solution to the equation in Lemma~\ref{lem:stationary-real} is
% \begin{align}
%     \Lambda_{w^*}=\frac{\eta}{b}(2\mathrm{I}_d-\eta H_{w^*})^{-1}.
%     \label{eq:solution-stationary}
% \end{align}
% % plugging Eq.~(\ref{eq:approx-hessian-gradient}) into the solution in Lemma~\ref{lem:posterior-covariance}, we have
% % In this case, the solution of the posterior covariance matrix at $w^*$ becomes
% \begin{rem}
%     In a continues analysis of SGD, where $\eta\ll 1$, we have $\Lambda_{w^*}  = \frac{\eta}{2b} \mathrm{I}_d$ by Eq.~(\ref{eq:solution-stationary}). This recovers the solution to a classical result in \cite[Eq.~(13)]{mandt2017stochastic} (see Lemma~\ref{lem:posterior-covariance}).
% \end{rem}



%  Given that $Q_{W_T|s} = \mathcal{N}\pr{W_s^*, \frac{\eta}{b}(2\mathrm{I}_d-\eta H_{{W_s^*}})^{-1}}$,  
% we can construct an oracle prior distribution as $P_{W_{T}}=\mathcal{N}\pr{w^*_{\mu}, \Lambda^{\mu}_{w^*_{\mu}}}$, where $w^*_{\mu}=\ex{}{W_S^*}$ is the average ERM solution, and $\Lambda^{\mu}_{w^*_{\mu}}=\ex{S,W_s^*}{\Lambda_{W^*_{S}}}$ is the average stationary covariance. 

% we can choose the isotropic Gaussian distribution as  $P_{W_{T}}$. For example, let $P_{W_{T}}=\mathcal{N}(\tilde{w}, \sigma^2 \mathrm{I}_d)$ wherein $\tilde{w}$ and $\sigma$ can be optimized and are independent of $S$. 

% This justifies the selection of isotropic Gaussian as prior. Then 

% \begin{thm}
%     Assume $\lambda_{\rm max}<\frac{2}{\eta}$, where $\lambda_{\rm max}$ is the largest eigenvalue of $H^\mu_{w^*_{\mu}}$. Let $P_{W_T}$ be the oracle prior defined above, and denote the terminal state-dependent bound in Theorem~\ref{thm:state-bound-dis-prior} as $\mathrm{TersMI}_\mu(\mathcal{A})$, then we have 
%     \[
%     \mathrm{TersMI}_\mu(\mathcal{A})\geq \max\left\{\Omega\pr{\sqrt{\frac{d}{n}}}, \Omega\pr{\sqrt{\frac{b(2/\eta-\lambda_{\rm max})\ex{}{||W^*_{S}-w^*_\mu||^2}}{n}}}\right\}.
%     \]
% \end{thm}


The following results can be obtained by combining Theorem~\ref{thm:opt-state-inde-bound} and Lemma~\ref{lem:stationary-real}.
% gives a special case that Theorem~\ref{thm:opt-state-inde-bound} is dimension-independent.
\begin{cor}
\label{cor:pacbayes-anisotropic-prior}
Under conditions (i) and (iii) in Lemma~\ref{lem:stationary-real}, we have \[
\mathcal{E}_{\mu}(\mathcal{A})\leq\frac{R}{\sqrt{n\eta}}\sqrt{\ex{}{\tr{\log\pr{\br{H_{w^*}C^{-1}_{T}}\Lambda_{w^*_\mu}}}}}.
\]
% where $\tilde{w}=w_\mu^* =\ex{}{W_S^*}$ and the optimal $\sigma^*=\sqrt{\ex{}{||W_S^*-\tilde{w}||^2/d+\frac{\eta }{2b}}}$.
\end{cor}

\begin{cor}
\label{cor:pacbayes-isotropic-prior}
Under conditions (i)--(iii) in Lemma~\ref{lem:stationary-real}, we have 
\[
\mathcal{E}_{\mu}(\mathcal{A})\leq\sqrt{\frac{dR^2}{n}\log\left(\frac{2b}{\eta d}\mathbb{E}{||W_S^*-w^*_{\mu}||^2}+1\right)}.
\]
% where $\tilde{w}=w_\mu^* =\ex{}{W_S^*}$ and the optimal $\sigma^*=\sqrt{\ex{}{||W_S^*-\tilde{w}||^2/d+\frac{\eta }{2b}}}$.
\end{cor}

By $\log(x+1)\leq x$, the bound in Corollary~\ref{cor:pacbayes-isotropic-prior} is dimension-independent if the weight norm does not grow with $d$. Furthermore, the information-theoretic bound becomes a norm-based bound in Corollary~\ref{cor:pacbayes-isotropic-prior}, which is widely studied in the generalization literature \citep{bartlett2017spectrally,neyshabur2018pac}. In fact, $w^*_\mu$ can be replaced by any data-independent vector, for example, the initialization, $w_0$ (see Corollary~\ref{cor:pacbayes-isotropic-prior-init}). In this case,
% when $\tilde{w}$ is the initialization of the parameters, $w_0$, then
the corresponding bound suggests that generalization performance can be characterized by the ``distance from initialization'', namely, given that SGD achieves satisfactory performance on the training data, a shorter distance from the initialization tends to yield better generalization. \citet{nagarajan2019generalization} also derived a ``distance from initialization'' based generalization bound by using Rademacher complexity, and \citet{Hu2020Simple} use ``distance from initialization'' as a regularizer to improve the generalization performance on noisy data.\looseness=-1
% Additionally, Corollary~\ref{cor:pacbayes-isotropic-prior} can be used to recover a trajectory-based bound, see Corollary~\ref{cor:pacbayes-gradient} in Appendix.

% \begin{rem}
% Intuitively, the norm of $W_T$ can be controlled via controlling the gradient norm during the entire training, so it's indeed possible to use Theorem \ref{thm:pacbayes-isotropic-prior} to recover a gradient norm and gradient noise trace based bound by constantly unrolling $||W_S^*-\tilde{w}||$ (See Corollary~\ref{cor:pacbayes-gradient} in Appendix).
% \end{rem}



% It turns out that information-theoretic bounds become a norm-based bound, which is widely studied in the generalization literature \citep{bartlett2017spectrally,neyshabur2018pac}. Specifically, when $\tilde{w}$ is the initialization of the parameters, $w_0$, then Theorem~\ref{thm:pacbayes-isotropic-prior} suggests that generalization performance can be characterized by the ``distance from initialization''. \citet{nagarajan2019generalization} also derived a ``distance from initialization'' based generalization bound by using Rademacher complexity, and \citet{Hu2020Simple} use ``distance from initialization'' as a regularizer to improve the generalization performance on noisy data.
% However, let $\tilde{w}=0$, making the bound completely depend on $||W_T||$, will render the bound algorithm-independent. This is because, $\ex{}{||W_T||^2}=\ex{}{W_T^TW_T}=tr\{\ex{}{W_TW_T^T}\}=tr\{\Sigma_{\rm SDE}^*\}=\frac{\eta}{2b}d$. Then, as formally stated in the following corollary,
% \begin{cor}
% \label{cor:vc-bound}
% Let $\tilde{w}=0$, then
% $
% \mathcal{E}_{\mu}(\mathcal{A})\leq c\sqrt{\frac{d}{n}},
% $ where constant $c=R\sqrt{\log 2}$.
% \end{cor}
% This bound becomes a parameter-counting bound and as we already know, will not become a meaningful proxy of generalization in deep learning. Nevertheless, an interesting observation is that Corollary \ref{cor:vc-bound} matches the classic VC-dimension based bound in the case where VC-dimension of the hypothesis class is equal to the number of free parameters\footnote{Note that this is not always true, for example, there exists the
% hypothesis class with one parameter but an infinite VC-dimension}.

% Intuitively, controlling the gradient norm and noise level for the full training trajectories will control the norm of $||W_T||$. With this as a preamble, let us begin the more formal treatment. The following corollary shows that it's possible to use Theorem \ref{thm:pacbayes-isotropic-prior} to provide a gradient norm and noise trace based bound, which is similar to the bound in Theorem \ref{thm:isotropic-prior-bound} that is derived by the information-theoretic analysis.
% \begin{cor}
% \label{cor:pacbayes-gradient}
% Let $W_T=W_s^*$, $\tilde{w}=0$ and W.L.O.G, assume $W_0=0$, then
% \[
% \mathcal{E}_{\mu}(\mathcal{A})\leq\sqrt{\frac{dR^2}{n}\log\left(\frac{4bT\eta}{d}\sum_{t=1}^T\ex{}{||G_t||^2+tr\{C_t\}}+1\right)},
% \]
% \end{cor}
% In Theorem \ref{thm:isotropic-prior-bound}, let $\tilde{g}=0$  and by applying Jensen's inequality, we could also let the summation and factor $T$ move inside the square root. Then the most different part in Corollary \ref{cor:pacbayes-gradient} is that $A_2(t)$ is now removed from the bound. Since $-tr\{\log{C_t}\}$ is usually has very large magnitude in practice, this improvement is significant.

\begin{table*}[bt!]
    \centering
    % \vspace{-0.7em}
    \caption{\small Comparison of the results in this work 
   % The previous $\Sigma$'s for SGD and SGDM are taken from \cite{Mandt2017}, and the other four previous results are due to \cite{gitman2019understanding}. There is no existing continuous-time result about DNM and NGD.
    } %\\
    %$^*$SGD: stochastic gradient descent with learning rate $\lambda$. $^*$SGDM: stochastic gradient descent with momentum hyperparameter $\mu$. $^*$QHM: quasi-hypobolic momentum. $^*$DNM: damped newton's method. $^*$NGD: natural gradient descent.}
    \label{tab:summary}
    \vspace{-0.8EM}
    %\renewcommand\arraystretch{1.2}
    {\small
    \resizebox{\textwidth}{!}{
    {\begin{tabular}{c|c|c}
    \hline\hline
    &Bounds& Remarks\\
    \hline
    \multicolumn{3}{c}{Trajectory-based Bounds. Pros: fewer assumptions, can track training dynamics; Con: time-dependent}\\
    \hline%\hline 
     Theorem~\ref{thm:isotropic-prior-bound} &  $\mathcal{O}\pr{\sqrt{\frac{d}{n}\ex{}{\log{\frac{h_1}{d}}-\frac{h_2}{d}}}}$  
     & Isotropic covariance for Gaussian prior     \\
      Corollary~\ref{cor:langevin-dynamic} &  $\mathcal{O}\pr{\sqrt{\frac{d}{n}\sum_{t=1}^T{{\mathbb{E}_{}{\log\left(\frac{\mathbb{E}{\left|\left|G_t-\tilde{g}_t\right|\right|^2}}{d}+1\right)}}}}}$  
     & Bound for Langevin dynamics; tighter than \citet[Prop.~3]{neu2021information}   \\
     Theorem~\ref{thm:anisotropic-prior-bound} &  $\mathcal{O}\pr{\sqrt{\frac{1}{n}\sum_{t=1}^T\ex{}{\tr{\log\frac{\Sigma^\mu_tC_t^{-1}}{b}}}}}$ 
     &  Population GNC for prior;  tighter than  Thm.~\ref{thm:isotropic-prior-bound} \\
     \hline
     \multicolumn{3}{c}{Terminal-State-based Bounds. Pro: time-independent; Cons: more assumptions, cannot track training dynamics}\\
    \hline%\hline 
    % NAG \\
     Theorem~\ref{thm:opt-state-inde-bound} &  $\mathcal{O}\pr{\sqrt{\frac{1}{n}\ex{}{\tr{\log\pr{\Lambda^{-1}_{W^*_S}\Lambda_{w^*_\mu}}}}}}$ 
     &  General result; hard to measure in practice \\
     Corollary~\ref{cor:pacbayes-anisotropic-prior} & $\mathcal{O}\pr{\sqrt{\frac{1}{n\eta}\ex{}{\tr{\log\pr{\br{H_{w^*}C^{-1}_{T}}\Lambda_{w^*_\mu}}}}}}$ & Under conditions: $H_{w^*}\Lambda_{w^*}=\Lambda_{w^*}H_{w^*}$ and $H_{w^*}\Sigma_T=\mathrm{I}_d$\\
     Corollary~\ref{cor:pacbayes-isotropic-prior} & $\mathcal{O}\pr{\sqrt{\frac{d}{n}\log\left(\frac{b}{\eta d}\mathbb{E}{||W_S^*-\hat{w}||^2}+1\right)}}$ & $\hat{w}$ is flexible; $\frac{2}{\eta}\gg \lambda_1$; other conditions same  as Cor.~\ref{cor:pacbayes-anisotropic-prior} \\
     Theorem~\ref{thm:pacbayes-data-dependent-prior} & $\mathcal{O}\pr{\mathbb{E}{\sqrt{\frac{M^2b}{\eta}\mathbb{E}_{}{||W^*_{S}-W^*_{S_J}||^2}}}}$ & Bounded loss; $\Lambda(W_{s_j}^*)=\Lambda(W_{s}^*)$; other conditions same as Cor.~\ref{cor:pacbayes-isotropic-prior}  \\
    \hline\hline
    \end{tabular}}}}
    %}
%\caption*{{\footnotesize } }
% \vspace{-1.5em}
\end{table*}

In the sequel, we use the data-dependent prior bound, namely, Lemma~\ref{lem:data-dependent-prior}, to derive new results.
% Note that the LOO prior process in Lemma~\ref{lem:data-dependent-prior} is also approximated by SDE with the same learning rate and batch size, so using $S_J$ to train a model will return the same steady-state covariance $\Lambda(W_{S}^*)$ of the final solution by combining Lemma \ref{lem:posterior-covariance} and Eq~(\ref{eq:approx-hessian-gradient}). 

% In the LOO training, to be consistent with the notations used in the algorithmic stability literature, we let $S^{\setminus i}=S_j$ where $i=[n]\setminus j$ is the instance index that is not selected in $j$. 
% Then, we assume the steady-state covariance of SGD remains constant after removing one training instance (\textcolor{red}{Who did this also?}). 
% Hence $\Lambda(W_{S}^*)\approx\Lambda(W_{S_j}^*)$ for any $j$. 
% Let $P_{W_T|S_J=s_j} = \mathcal{N}(W^*_{s_j},\Lambda(W_{s_j}^*))$ where $W^*_{s_j}$ is the local minimum found by the LOO training.
\begin{thm}
\label{thm:pacbayes-data-dependent-prior}
Let $P_{W_T|S_J=s_j} = \mathcal{N}(W^*_{s_j},\Lambda(W_{s_j}^*))$ where $W^*_{s_j}$ is the local minimum found by the LOO training.
Under the same conditions as in Lemma~\ref{lem:data-dependent-prior} and conditions (i)--(iii) of Lemma~\ref{lem:stationary-real}, and assuming $\Lambda(W_{s_j}^*)=\Lambda(W_{s}^*)$ for a given $s$, we have
% Assume Eq~(\ref{eq:approx-hessian-gradient}) holds, then
\[
\mathcal{E}_{\mu}(\mathcal{A})\leq\mathbb{E}_{S,J}{\sqrt{\frac{M^2b}{2\eta}\mathbb{E}_{W^*_{S},W^*_{S_J}}^{S,J}{||W^*_{S}-W^*_{S_J}||^2}}}.
\]
\end{thm}
% \begin{rem}
This bound implies a strong connection between generalization and the algorithmic stability exhibited by SGD. Specifically, if the hypothesis output does not change much (in the squared $L_2$ distance sense) upon the removal of a single training instance, the algorithm is likely to generalize effectively. In fact, $\mathbb{E}_{W^*_{S},W^*_{S_J}}^{S,J}{||W^*_{S}-W^*_{S_J}||^2}$ can be regarded as an average version of squared \emph{argument stability} \citep{liu2017algorithmic}. Moreover, stability-based bounds often demonstrate a fast decay rate in convex learning settings \citep{hardt2016train,bassily2020stability}. It is worth noting that if argument stability achieves the fast rate, e.g., $\sup_{s,j}||w^*_{s}-w^*_{s_j}||\leq\mathcal{O}(1/n)$, then Theorem~\ref{thm:pacbayes-data-dependent-prior} can also achieve the same rate. In addition, note that stability-based bounds usually contain a Lipschitz constant, while the bound in Theorem~\ref{thm:pacbayes-data-dependent-prior} does not involve such an undesired constant.


% \end{rem}
% \begin{rem}
% While the ratio $b/{\eta}$ explicitly appears in both Theorem \ref{thm:pacbayes-isotropic-prior} and Theorem \ref{thm:pacbayes-data-dependent-prior}, it may be tempting to assert that large learning rate and small batch size will improve the generalization performance. Although this argument is consistent with many empirical observations such as \cite{jastrzkebski2017three}, it's worth mentioning that this ratio also has some implicit impact on the norm of the terminal parameters, so it's sill unclear on the role of this ratio in generalization from Theorem~\ref{thm:pacbayes-data-dependent-prior}.
% \end{rem}

% \begin{figure*}[ht!]
% % \vspace{-5pt}
% \centering
% \input{TrajPlot}
% \caption{SGD training dynamics on MNIST (first column) and CIFAR10 (second column). Some quantities in  are re-scaled, see Appendix for more details.
% %(a)(b) show the bound decaying with the network width. (c)(d) show the bound increasing with the noise level.
% }
% \label{fig:train-dynamic}
% % \vspace{-5pt}
% \end{figure*}



Ideally, to estimate the squared distance $||w^*_{s}-w^*_{s_j}||^2$, one can use the influence function \citep{hampel1974influence,cook1982residuals,koh2017understanding}, namely $w^*_{s_j}-w^*_{s}\approx\frac{1}{n}H^{-1}_{W^*_{s}}\nabla\ell(w^*_{s},z_i)$,
where $i$ is the instance index that is not selected in $j$. However, for deep neural network training, the approximation made by the influence function is often erroneous \citep{basu2021influence}. While this presents a challenge, it motivates further 
% exploration and 
refinement, seeking to enhance the practical application of Theorem~\ref{thm:pacbayes-data-dependent-prior} in deep learning.


The main generalization bounds obtained in this paper are summarized in Table~\ref{tab:summary}. In the remainder of this paper, we will empirically verify our theoretical results.

% Thus, the bound in Theorem \ref{thm:pacbayes-data-dependent-prior} is
% % becomes tractable without rerunning the algorithm for $n$ times in practice, as formally 
% re-stated in the following corollary.
% \begin{cor}
% \label{cor:IF-pacbayes-data-prior}
% Under the same condition in Theorem \ref{thm:pacbayes-data-dependent-prior}, then
% % and assume 
% % the distribution of $W^*_{S_J}$ given $S_J$ 
% % $P_{W^*_{S_J}|S_J}$ is invariant of $J$, then
% % \[
% % \mathcal{E}_{\mu}(\mathcal{A})\leq\frac{M}{n}\ex{S}{\sqrt{\frac{b}{2\eta}\cex{W^*_{S}}{S}{tr\{H^{-1}_{W^*_{S}}\}}}}.
% % \]
% \[
% \mathcal{E}_{\mu}(\mathcal{A})\leq\frac{M}{n}\mathbb{E}_{S,J}{\sqrt{\frac{b}{2\eta}\mathbb{E}_{W^*_{S},W^*_{S_J}}^{S,J}{||H^{-1}_{W^*_{S}}\nabla \ell(W^*_{S},Z_i)||^2}}}.
% \]
% % If we further assume the distribution $P_{W^*_{S_J}|S_J}$ is invariant of $J$, then
% % \begin{align}
% %     \mathcal{E}_{\mu}(\mathcal{A})\leq\frac{M}{n}\mathbb{E}_{S}{\sqrt{\frac{b}{2\eta}\mathbb{E}_{W^*_{S}}^{S}{tr\{H^{-1}_{W^*_{S}}\}}}}.\label{ineq:trace-inverse-hessian}
% % \end{align}
% % where $i=[n]\setminus J$.
% \end{cor}
% \begin{rem}
% \label{rem:invariance-bound}
% If we further assume the distribution $P_{W^*_{S_J}|S_J}$ is invariant of $J$, then the bound becomes $\frac{M}{n}\mathbb{E}_{S}{\sqrt{\frac{b}{2\eta}\mathbb{E}_{W^*_{S}}^{S}{tr\{H^{-1}_{W^*_{S}}\}}}}$. The invariance assumption is also used in \cite{wang2021optimizing}. In practice, $n$ is usually very large, when $m=n-1$, this assumption indicates that replacing one instance in $s_{j}$ will not make $P_{W^*_{s_j}|s_j}$ be too different. 
% The bound of Eq.~(\ref{ineq:trace-inverse-hessian}) indicates that the relationship between generalization and the ratio of learning rate and batch size is still rather complicated. Specifically, previous empirical observation suggests that large learning rate and small batch size will drive SGD to a flat minima, e.g., small $tr\{H_{W^*_{S}}\}$. In this case, small $b/\eta$ gives large $tr\{H^{-1}_{W^*_{S}}\}$, then it's still unclear whether $\frac{b}{\eta}tr\{H^{-1}_{W^*_{S}}\}$ becomes small or not.
% In practice, $n$ is usually very large, when $m=n-1$, this assumption indicates that replacing one instance in $s_{j}$ will not make $P_{W^*_{s_j}|s_j}$ be too different.
% \end{rem}

% The bound of Eq.~(\ref{ineq:trace-inverse-hessian}) indicates that the relationship between generalization and the ratio of learning rate and batch size is still rather complicated. Specifically, previous empirical observation suggests that large learning rate and small batch size will drive SGD to a flat minima, e.g., small $tr\{H_{W^*_{S}}\}$. In this case, small $b/\eta$ gives large $tr\{H^{-1}_{W^*_{S}}\}$, then it's still unclear whether $\frac{b}{\eta}tr\{H^{-1}_{W^*_{S}}\}$ becomes small or not.

% As pointed out by the literature, one deficiency of Lemma~\ref{lem:posterior-covariance} is that it relies on the small learning rate and state-independent gradient noise covariance. By invoking 
% % a more realistic estimation of $\Lambda(w^*)$ 
% a recent result in \citet[Theorem~1.]{liu2021noise}, it is a simple matter to generalize our results above (see Appendix~\ref{sec:generalize-lr-condition}).
% We will elaborate more on this in Appendix.

% In general, it's unclear now that whether the bound in Theorem \ref{thm:pacbayes-data-dependent-prior} is tighter than the bound in Theorem \ref{thm:pacbayes-isotropic-prior} or not. Since the current gradient noise is an isotropic Gaussian (e.g., $\Sigma_{SDE}^*=\frac{\eta}{2b}\mathrm{I}_d$), using the isotropic Gaussian prior to fit the posterior is reasonable and should work well. In this case, the data dependent prior could not give better control on the KL divergence with respect to the covariance shape, and studying the connection between $W_T$ and $W_{JT}$ would be a promising direction, which is likely related the algorithm stability. 

% Lemma~\ref{lem:posterior-covariance} relies on unrealistic small learning rate and state-independent gradient noise. To obtain more general result, we utilize a recent result from \cite{liu2021noise}.
% \begin{lem}
% \label{lem:stationary-real}
% The stationary covariance of $W_T$ satisfies
% \[
% \Sigma_{\rm SDE}^* H_{w^*} + H_{w^*} \Sigma_{\rm SDE}^*-\eta H_{w^*} \Sigma_{\rm SDE}^*H_{w^*} = \eta C(w^*),
% \]
% If Eq.~\ref{eq:approx-hessian-gradient} holds, then
% $
% \Sigma_{\rm SDE}^*=\frac{\eta}{b}(2\mathrm{I}_d-\eta H_{w^*})^{-1}.
% $
% \end{lem}

% Then, Theorem~\ref{thm:pacbayes-data-dependent-prior} is modified in the following Theorem.
% \begin{thm}
% \label{thm:pacbayes-data-dependent-modify}
% Under the same conditions in Theorem~\ref{thm:pacbayes-data-dependent-prior}, let $H_{W_{s^{\setminus i}}^*}\approx H_{W_{s}^*}$ for any $i\in [n]$, then
% \[
% \mathcal{E}_{\mu}(\mathcal{A})\leq\mathbb{E}{\sqrt{\frac{4M^2}{b}\ex{}{tr\left\{\left(\eta H_{W_S^*}^{-1}-2\mathrm{I}_d\right)^{-1}\right\}}}}.
% \]
% \end{thm}

% \begin{proof}
% Recall Lemma~\ref{lem:data-dependent-prior}
% \begin{align}
%     \mathcal{E}_{\mu}(\mathcal{A})\leq&\frac{M}{\sqrt{2}}\mathbb{E}_{S,J}{\sqrt{\mathrm{D_{KL}}(Q_{W|S}||P_{W|S_J})}}\notag\\
%     =&\frac{M}{\sqrt{2}}\mathbb{E}_{S,J}{\sqrt{\frac{1}{2}\left((W_{S}^*-W_{S^{\setminus I}}^*)^T\Sigma_{*}^{-1}(W_{S}^*-W_{S^{\setminus I}}^*)\right)}}\notag\\
%     =&\frac{M}{2}\mathbb{E}_{S,J}{\sqrt{tr\left\{\Sigma_{*}^{-1}(W_{S}^*-W_{S^{\setminus I}}^*)(W_{S}^*-W_{S^{\setminus I}}^*)^T\right\}}}\notag\\
%     =&\frac{M}{2}\mathbb{E}_{S,J}{\sqrt{tr\left\{\Sigma_{*}^{-1}(\frac{1}{n}H^{-1}_{W^*_{S}}\nabla \ell(W^*_{S},Z_I))(\frac{1}{n}H^{-1}_{W^*_{S}}\nabla \ell(W^*_{S},Z_I))^T\right\}}}\notag\\
%     \leq&\frac{M}{2n}\mathbb{E}_{S}{\sqrt{\mathbb{E}_J tr\left\{\Sigma_{*}^{-1}H^{-1}_{W^*_{S}}\nabla \ell(W^*_{S},Z_I)\nabla \ell(W^*_{S},Z_I)^T H^{-1}_{W^*_{S}}\right\}}}\\
%     =&\frac{M}{2n}\mathbb{E}_{S}{\sqrt{ tr\left\{H^{-1}_{W^*_{S}}\Sigma_{*}^{-1}H^{-1}_{W^*_{S}} \ex{I}{\nabla\ell(W^*_{S},Z_I)\nabla \ell(W^*_{S},Z_I)^T} \right\}}}\\
%     =&\frac{M}{2n}\mathbb{E}_{S}{\sqrt{ tr\left\{H^{-1}_{W^*_{S}}\Sigma_{*}^{-1}H^{-1}_{W^*_{S}} C(W^*_{S}) \right\}}}\\
%     =&\frac{M}{2n}\mathbb{E}_{S}{\sqrt{\frac{1}{b} tr\left\{H^{-1}_{W^*_{S}}\Sigma_{*}^{-1} \right\}}}
%     \\
%     =&\frac{M}{2n}\mathbb{E}_{S}{\sqrt{\frac{1}{b} tr\left\{H^{-1}_{W^*_{S}}\Sigma_{*}^{-1} \right\}}}
% \end{align}
% \end{proof}



% \subsection{Inverse Population FIM as both Posterior and Prior Covariance}
% Another choice of covariance of the posterior is the inverse population Fisher information matrix,
% % \textcolor{red}{since the inverse of the FIM is an estimator of the asymptotic covariance matrix}, 
% which has already been treated as the posterior covariance in the literature \cite{achille2019information,harutyunyan2021estimating,wang2022pacbayes}. Then, 
% % recall Theorem~\ref{thm:pacbayes-data-dependent-prior},
% the following Theorem is obtained.

% \begin{thm}
% \label{thm:IF-pacbayes-FIM}
% With the same conditions in Corollary~\ref{cor:IF-pacbayes-data-prior},
% \[
% \mathcal{E}_{\mu}(\mathcal{A})\leq\frac{M}{2n}\ex{S}{\sqrt{\cex{W^*_{S}}{S}{tr\{H^{-1}_{W^*_{S}}F^\mu_{W_S^*}\}}}}.
% \]
% where $F^\mu_{w^*}=\ex{Z}{\nabla \ell(w^*,Z)\nabla \ell(w^*,Z)^T}$ is the population FIM.
% \end{thm}
% \begin{rem}
% Notice that  $F^\mu_{W_S^*}\approx H^\mu_{W^*_{S}}\approx \Sigma^\mu(W_S^*)$ near minima \cite[Chapter~8]{pawitan2001all}, then $tr\{H^{-1}_{W^*_{S}}\Sigma^\mu(W_S^*)\}$ is very close to the Takeuchi Information Criterion \cite{takeuchi1976distribution}. In addition, our bound in Theorem~\ref{thm:IF-pacbayes-FIM} is similar to \cite[Theorem~3.]{singh2022phenomenology} with the same convergence rate, although strictly speaking, their result is not a generalization bound. Moreover, as also pointed out in \cite{singh2022phenomenology}, here $H^{-1}_{W^*_{S}}$ is evaluated on the training sample unlike other works that evaluates the inverse Hessian on the testing sample, i.e. evaluating on the distribution $\mu$, (e.g., \citet{thomas2020interplay}). 
% \end{rem}

% It is important to note that Theorem~\ref{thm:IF-pacbayes-FIM} is also based on the influence function used in Eq.~(\ref{eq:influnce-function}). However, for deep neural network training, the 
% approximation made by influence function is often erroneous \cite{basu2021influence}. This, unfortunately, limits the practical application of Theorem~\ref{thm:IF-pacbayes-FIM} and Corollary~\ref{cor:IF-pacbayes-data-prior}. 


% Obtaining these generalization bounds enable us to figure out which quantities are key to the mystery of the good generalization performance of SGD. Subsequently, we will conduct some experiments and show how the generalization behavior of SGD is related to these quantities.