
% \begin{figure*}[ht!]
% % \vspace{-5pt}
% \centering
% \input{TrajPlot}
% \caption{SGD training dynamics on MNIST (first row) and CIFAR10 (second row). Some quantities in  are re-scaled, see Appendix for more details.
% %(a)(b) show the bound decaying with the network width. (c)(d) show the bound increasing with the noise level.
% }
% \label{fig:train-dynamic}
% % \vspace{-5pt}
% \end{figure*}

% \vspace{-0.1in}


% \vspace{-0.1in}
\section{Empirical Study}
% \vspace{-3mm}
In this section, we present some empirical results including tracking training dynamics of SGD and SDE, along with the estimation of several obtained generalization bounds.
% the evolution of some key quantities in our bounds. Additionally, we also empirically compare the trajectory-based bound with the bound in \cite{wang2022generalization}, and also estimate the terminal-based bound. 
\begin{figure*}[!ht]
    \centering
    \begin{subfigure}[b]{0.245\textwidth}
\includegraphics[scale=0.28]{figs/bound-plot-svhn-vgg-1.png}    
\caption{VGG on (small) SVHN}            \label{fig:vgg-svhn-bound}
    \end{subfigure}
\begin{subfigure}[b]{0.245\textwidth}
\includegraphics[scale=0.28]{figs/bound-plot-cifar10-vgg-1.png}
\caption{VGG on CIFAR10}
    \label{fig:vgg-cifa10-bound}
\end{subfigure}
 \begin{subfigure}[b]{0.245\textwidth}
\includegraphics[scale=0.28]{figs/bound-plot-svhn-vgg-TS-1.png}
\caption{VGG on (small) SVHN}
\label{fig:vgg-svhn-TM-bound}
    \end{subfigure}
\begin{subfigure}[b]{0.245\textwidth}
\includegraphics[scale=0.28]{figs/bound-plot-cifar10-resnet-TS-1.png}
\caption{VGG on CIFAR10}
\label{fig:vgg-cifar10-TM-bound}
\end{subfigure}
\caption{Estimated trajectory-based bound and terminal-state-based bound, with $R$ excluded. Zoomed-in figures of the generalization error are given in Figure~\ref{fig:errs} in the Appendix.}\label{fig:bounds}
% \vspace{-3mm}
\end{figure*}



\paragraph{Implementation and Hyperparameters}
The implementation in this paper is based on PyTorch~\citep{paszke2019pytorch}, and all the experiments are carried out on NVIDIA Tesla V100 GPUs (32 GB). Most experiment settings follow \cite{wu2020noisy}, and the code is also based on their implementation, which is available at \href{https://github.com/uuujf/MultiNoise}{https://github.com/uuujf/MultiNoise}. For CIFAR10, the initial learning rates used for VGG-11 and ResNet-18 are $0.01$ and $0.1$, respectively. For SVHN, the initial learning rate is $0.05$. For CIFAR100, the initial learning rate is $0.1$.
The learning rate is then decayed by a factor of $0.1$ at iterations $40{,}000$ and $60{,}000$. Unless stated otherwise, the batch size of SGD is $100$.

% \vspace{-1mm}
\paragraph{SGD and SDE Training Dynamics} We implement the SDE training by following the same algorithm given in \cite[Algorithm~1]{wu2020noisy}. Our experiments involved training a VGG-11 architecture without BatchNormalization on a subset of SVHN (containing $25$k training images) and CIFAR10. Additionally, we trained a ResNet-18 on both CIFAR10 and CIFAR100. Data augmentation is only used in the experiments related to CIFAR100. We ran each experiment with ten different random seeds, maintaining a fixed initialization of the model parameters.
Further details about the experimental setup can be found in \cite{wu2020noisy}.
The results are depicted in Figure~\ref{fig:Acc-Dynamics}. As mentioned earlier, SDE exhibits performance dynamics akin to those of SGD, reinforcing the similarities in their training behaviors.

% \vspace{-0.15in}

% \vspace{-1mm}
\paragraph{Evolution of Key Quantities for SGD and SDE} We show $\|G_t\|^2$ and $\tr{\log\pr{\Sigma_t^{-1}\Sigma_{\mu}}}$ in Figure~\ref{fig:Cov-Dynamics}. Recognizing the computational challenges associated with computing $\tr{\log\pr{\Sigma_t^{-1}\Sigma_{\mu}}}$, we opted to draw estimates based on $100$ training and $100$ testing samples. Notably, both SGD and SDE exhibit similar behaviors in these gradient-based metrics. It is noteworthy that despite the absence of the learning rate in the trajectory-based bounds, we observed that modifications to the learning rate at the $40{,}000^{\mathrm{th}}$ and $60{,}000^{\mathrm{th}}$ steps had discernible effects on these gradient-based quantities. Additionally, in Figure~\ref{fig:Hess-Dynamics}, we examine the trace of the Hessian and its largest eigenvalue during training, leveraging the PyHessian library \citep{yao2020pyhessian}. Note that, for efficiency, we still use only $100$ training samples to estimate the Hessian. Notice that the Hessian-related quantities of SGD and SDE are nearly perfectly matched in the terminal state of training. Furthermore, Figures~\ref{fig:vgg-svhn-weight}--\ref{fig:vgg-cifar10-weight} illustrate the ``distance to initialization'', revealing a consistent trend shared by both SGD and SDE.


% \vspace{-1mm}
\paragraph{Bound Comparison} We vary the size of the training sample and empirically estimate several of our bounds in Figure~\ref{fig:bounds}, with the subgaussian variance proxy $R$ excluded for simplicity. Thus, the estimated values in Figure~\ref{fig:bounds} do not accurately represent the true order of the bounds. Despite the generally unbounded nature of the cross-entropy loss, common training strategies, such as proper weight initialization, training techniques, and appropriate learning rate selection, ensure that the cross-entropy loss remains bounded in practice. Therefore, it is reasonable to assume subgaussian behavior of the cross-entropy loss under SGD training. In Figures~\ref{fig:vgg-svhn-bound}--\ref{fig:vgg-cifa10-bound}, we compare our Theorem~\ref{thm:anisotropic-prior-bound} with \citet[Theorem~2]{wang2022generalization}. Since both bounds incorporate the same $R$, the results in Figures~\ref{fig:vgg-svhn-bound} to~\ref{fig:vgg-cifa10-bound} show that our Theorem~\ref{thm:anisotropic-prior-bound} outperforms \citet[Theorem~2]{wang2022generalization}. This aligns with expectations (see Appendix~\ref{sec:IT-SGD} for additional discussions), considering that the isotropic Gaussian used in the auxiliary weight process of \citet[Theorem~2]{wang2022generalization} is suboptimal, as demonstrated in Lemma~\ref{lem:compare-iso-noniso}. Moreover, Figures~\ref{fig:vgg-svhn-TM-bound} to~\ref{fig:vgg-cifar10-TM-bound} hint that the norm-based bounds in Corollary~\ref{cor:pacbayes-isotropic-prior} (and Corollary~\ref{cor:pacbayes-isotropic-prior-init}) grow with $n$, a phenomenon also observed in \cite{nagarajan2019uniform}. In contrast, Corollary~\ref{cor:pacbayes-anisotropic-prior} effectively captures the trend of generalization error, emphasizing the significance of the geometric properties of local minima. Additionally, while trajectory-based bounds may appear tighter, terminal-state-based bounds seem to have a faster decay rate.
% \subsection{SGD Training Dynamics}
% \paragraph{SGD Training Dynamics}
% We record the SGD trajectories by training a MLP (with one hidden layer of 512 neurons) and a LeNet \citep{lecun1998gradient} on MNIST \cite{lecun2010mnist} and CIFAR10 \citep{krizhevsky2009learning}, respectively. 
% To simplify estimation, we only use $1/10$ of the entire dataset size. 
% We use the cross-entropy loss as our objective function. Although in general, the cross-entropy loss is unbounded, but in practice, given the current weight initialization and training technique (e.g., selecting a proper learning rate), the cross-entropy loss will not go to infinity.
% Thus, it’s reasonable to assume the cross-entropy loss is bounded under the SGD training. 

% In addition, we use fixed learning rate 0.05 and 0.005 for MLP and LeNet, respectively, and the batch size is set to $60$. 
% We train each model until the training accuracy is greater than $0.9999$.


% In Figure \ref{fig:train-dynamic}, we show the dynamics of full batch gradient norm ($||G_t||$), trace of SGD noise covariance ($tr\{\Sigma_t\}$), trace of Hessian ($tr\{H_t\}$), trace of inverse Hessian ($tr\{H^{-1}_t\}$), squared $L_2$ distance from current weights to the initialization ($||w_t-w_0||^2$) and the squared norm of weights ($||w_t||^2$). \looseness=-1

% As suggested by Theorem \ref{thm:isotropic-prior-bound} and Theorem \ref{thm:anisotropic-prior-bound}, the gradient norm and trace of gradient noise covariance are crucial to the generalization performance. 
% From Figure \ref{fig:train-mnist} and \ref{fig:train-cifar}, we find that during training, $||G_t||$, $tr\{\Sigma_t\}$ and $tr\{H_t\}$ have very similar behavior. That is, if the model is well trained (e.g., Figure \ref{fig:train-mnist}), these three quantities will monotonically decrease with nearly the same rate. If the model is forced to memorize the training sample during training and 
% does not generalize well on the unseen data (e.g., Figure \ref{fig:train-cifar}), then these three quantities will have a growing phase during training, which  is aligned with the previous work of \citet{wang2022generalization},  in which they show a "double-descent" curve of gradient dispersion. 
% In addition, 
% we see that during the whole training phase, 
% the value of Hessian trace is very close to the value of gradient noise covariance trace, which further justifies the correctness of $H_T\approx\Sigma_T$. 
% Clearly, the trace of inverse Hessian is also correlated to the empirical generalization gap in Figure~\ref{fig:inverse-mnist} and \ref{fig:inverse-cifar}. Note that Eq.~(\ref{eq:second-order-taylor}) only holds when weights are close to a local minimum so we only plot the curve in the latter phase of the training (e.g., $80\%$ training accuracy is achieved) in Figure~\ref{fig:inverse-cifar}.

% but seems not follow the same tendency of the gap in Figure~\ref{fig:inverse-cifar}. Bear in mind that Corollary~\ref{cor:IF-pacbayes-data-prior} requires Eq.~(\ref{eq:second-order-taylor}) to hold. Thus, the trace of inverse Hessian can only characterize the generalization when weights are close to a local minimum. Unlike MLP on MNIST, the model is not close to a local minimum on CIFAR10 at the beginning so $tr\{H^{-1}_t\}$ could not reflect the generalization gap in this case.\looseness=-1
% Also recall $C_t = 1/b \Sigma_t$, we can observe that $tr\{C_t\}\ll d$ in practice, which means that $-tr\{\log{C_t}\}$ is very large.

% Theorem \ref{thm:pacbayes-isotropic-prior} assures us that the squared $L_2$ distance of final weights with some other reference weights is likely to provide some clue about the model generalization performance. 
% Additionally, we show three different $L_2$ distance in Figure \ref{fig:norm-mnist} and \ref{fig:norm-cifar}. Clearly, computing $\tilde{w}=\ex{}{W_t}$ for each step $t$ is very complicated. Instead, since $\tilde{w}$ in Theorem \ref{thm:pacbayes-isotropic-prior} can be randomly chosen (because it is the mean of an arbitrary Gaussian prior), we simply let $\tilde{w}_t=0$ and $\tilde{w}_t=w_0$ where $w_0$ is the random initialization of the weights. In addition, we also randomly choose $J=j$ as the indices of the training  data subset, then we record the squared $L_2$ distance between the SGD weights and the weights trained by the subset.  
% Results presented in Figure \ref{fig:norm-mnist} and \ref{fig:norm-cifar} show that  $L_2$ distance of the weights will monotonically increase and converge to a stable value in the end. Particularly, if the final model generalizes well, the weight norm or distance will increase with a faster rate at the early phase of training (Figure~\ref{fig:norm-mnist}). In contrast, if the final model generalizes poorly, the weight norm will slowly grow at the early phase (Figure~\ref{fig:norm-cifar}). 
% Notably, the weight norm will not increase when the local minimum is found so this verifies that the bound in Theorem \ref{thm:pacbayes-isotropic-prior} will not grow with $T$ increasing at the end of training.


% \begin{wrapfigure}{r}{0.51\textwidth} %this figure will be at the right
% % \vspace{-10pt}
%     \centering
%     \input{NormPlot}
% \caption{Corrupted MNIST and CIFAR10.
% %(a)(b) show the bound decaying with the network width. (c)(d) show the bound increasing with the noise level.
% }
% \label{fig:l2-norm}
% % \vspace{-20pt}
% \end{wrapfigure}

% \begin{figure}[ht!]
% \centering
%     \input{NormPlot}
% \caption{Corrupted MNIST and CIFAR10. Some quantities are re-scaled.
% %(a)(b) show the bound decaying with the network width. (c)(d) show the bound increasing with the noise level.
% }
% \label{fig:l2-norm}
% % % \vspace{-5pt}
% \end{figure}

% \subsection{Generalization on Corrupted Data}
% \paragraph{Generalization on Corrupted Data} 
% We also perform experiments on datasets with varying levels of label corruption. Specifically, the noise level $\epsilon$ refers to the setting where we replace the labels of  $\epsilon$ fraction of the training and testing instances with random labels. We compare the empirical generalization gap (w.r.t. $0/1$ loss) with summation of all the full batch gradient norms during training $\sum_{t=1}^T ||G_t||^2$ and distance from initialization $||w_T-w_0||^2$. In addition, estimating the nearly accurate trace of inverse Hessian on corrupted dataset is rather challenging, here we only show $||w_s-w_{s_j}||^2$ for randomly chosen $j$ as a reference. 
% All of these quantities are scaled in Figure~\ref{fig:l2-norm}.
% The $L_2$ distance of the final weights is compared against the empirical generalization gap with respect to $0/1$ loss, namely, the difference between the training accuracy and testing accuracy, and is shown in Figure \ref{fig:l2-norm}.\looseness=-1
% From Figure~\ref{fig:weight-mnist} and \ref{fig:weight-cifar}, we find that all three quantities can well characterize the generalization behavior of SGD on the corrupted datasets. 
% This further verifies our Theorem~\ref{thm:isotropic-prior-bound}, Theorem~\ref{thm:pacbayes-isotropic-prior} and Theorem~\ref{thm:pacbayes-data-dependent-prior}. 
