% \documentclass{uai2024} % for initial submission
\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\input{headers}
\title{Two Facets of SDE Under an Information-Theoretic Lens: Generalization of SGD via Training Trajectories and via Terminal States}
% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<zwang286@uottawa.ca>?Subject=Your UAI 2024 paper}{Ziqiao~Wang}}
\author[1]{\href{mailto:<ymao@uottawa.ca>?Subject=Your UAI 2024 paper}{Yongyi~Mao}}
% \author[1,2]{Further~Coauthor}
% \author[3]{Further~Coauthor}
% \author[1]{Further~Coauthor}
% \author[3]{Further~Coauthor}
% \author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
School of Electrical Engineering and Computer Science\\

University of Ottawa\\

    Ottawa, Ontario, Canada
    % Computer Science Dept.\\
    % Cranberry University\\
    % Pittsburgh, Pennsylvania, USA
}
% \affil[2]{%
%     Second Affiliation\\
%     Address\\
%     …
% }
% \affil[3]{%
%     Another Affiliation\\
%     Address\\
%     …
%   }
  
  \begin{document}
\maketitle

\begin{abstract}
  Stochastic differential equations (SDEs) have recently been shown to characterize well the dynamics of training machine learning models with SGD. When the generalization error of the SDE approximation closely aligns with that of SGD in expectation, it 
  % this
  provides two opportunities for understanding better the generalization behaviour of SGD through its SDE approximation. 
Firstly, viewing SGD as full-batch gradient descent with Gaussian gradient noise allows us to obtain a trajectory-based generalization bound using the information-theoretic bound from \citet{xu2017information}. Secondly, assuming mild conditions, we estimate the steady-state weight distribution of the SDE and use information-theoretic bounds from \citet{xu2017information} and \citet{negrea2019information} to establish terminal-state-based generalization bounds. 
Our proposed bounds have some advantages: notably, the trajectory-based bound outperforms the results in \cite{wang2022generalization}, and the terminal-state-based bound exhibits a fast decay rate comparable to stability-based bounds. 
\end{abstract}

\input{intro}

\input{preliminary}

\input{ITBound}

\input{PacBayes}

\input{experiment}

\section{Other Related Literature}
% \section{Other Related Literature}
% Information-theoretic generalization bounds are typically useful to noisy iterative algorithms.  For example,  \citet{pensia2018generalization} first apply the information-theoretic bound given by \citet{xu2017information} to analyze the generalization property of SGLD. Since the noise used in SGLD is usually an isotropic Gaussian, by utilizing the closed form of KL divergence between two Gaussian distributions, the information-theoretic generalization bound for SGLD is shown to have a tractable form. Their result is then improved by stronger bounds in  \citep{bu2019tightening,negrea2019information,haghifam2020sharpened,wang2022generalization}. 

% MI bounds, however, can not be directly used to analyze the generalization property of vanilla SGD since the MI term in the bound may go to infinity in this case and one could not obtain a tractable form by using following the approach of \cite{pensia2018generalization}. 
% Recently,  \cite{neu2021information} and \cite{wang2022generalization} have studied the generalization of models trained with SGD and obtained new MI bounds, using a technique via constructing an auxiliary perturbed weight process; additional complexity must be delt with in that analysis. Thus there appears significant room for improved understanding of the generalization of SGD.
% \textcolor{red}{
Recently, \cite{simsekli2019tail,nguyen2019first,simsekli2020hausdorff,meng2020dynamic}, and \cite{gurbuzbalaban2021heavy} challenge the traditional assumption that gradient noise is Gaussian and argue that the noise is heavy-tailed (e.g., L\'{e}vy noise). In contrast, \citet{xie2020diffusion} and \citet{li2021validity} 
claim that non-Gaussian noise is not essential to SGD performance, and SDE with Gaussian gradient noise can well characterize the behavior of SGD. They also 
argue that the empirical evidence shown in \cite{simsekli2019tail} relies on a hidden strong assumption that gradient noise is isotropic and each dimension  has the same distribution. 
For other works on SGD and SDE, see \citep{hoffer2017train,xing2018walk,panigrahi2019non,wu2020noisy,zhu2019anisotropic,li2020hessian,ziyin2022strength}.

In addition, there are some generalization bounds using fractal dimensions \citep{simsekli2020hausdorff,camuto2021fractal,dupuis2023generalization}, which are also trajectory-based generalization bounds. Notably, \cite{dupuis2023generalization} improve upon previous works by removing the Lipschitz continuity assumption, yet direct comparison of our results with theirs remains challenging. Specifically, one notable difference is that in both Section~\ref{sec:itb-sde} and Section~\ref{sec:pac-bayes} of our work, we provide in-expectation generalization bounds, while they present high-probability generalization guarantees, which require additional developments for comparison. Moreover, some key components in their bounds, such as their upper and lower box-counting dimensions, are not directly comparable to our gradient noise covariance or Hessian-based quantities. On the one hand, we believe our results have several advantages. For instance, while the boundedness of loss is essential in their work, we can relax it to a sub-Gaussian condition in ours. Additionally, our bound is easier to estimate for more complex models. On the other hand, their utilization of intrinsic dimension in the analysis is inspiring, and it may be possible to incorporate it into our analysis to obtain better results. 

\section{Limitations and Future Work}
\label{sec:concluds}
% In this paper, we invoke the SDE approximation of SGD so that information-theoretic generalization bounds are directly applicable to SGD with two opportunities. First, dynamics characterized by SDE enable us to obtain trajectory-based bounds by the step-wise analysis of mutual information. 
% These results mainly suggest that the trace of gradient noise covariance is significant for studying the generalization ability of SGD. 
% In addition, with some mild assumptions, we also 
% % apply the PAC-Bayes-like approach to 
% obtain some new bounds based on the terminal state of SGD. 

 While our current work exhibits certain limitations, such as the requirement of positive definiteness for $C_t$ in our trajectory-based bounds, it is worth noting that recent studies \citep{frankle2018the,li2018measuring,gur2018gradient,larsen2022how} indicate that many parameters in deep neural networks might be dispensable without affecting generalization. This implies that GD/SGD could potentially occur in a subspace of $\mathbb{R}^d$ whose dimension $d_{\mathrm{int}}$ is termed the ``intrinsic dimension''. Defining $C_t$ within this invertible subspace, utilizing $d_{\mathrm{int}}$, could potentially overcome our current limitations. Theoretical characterization of the intrinsic dimension, however, remains an open problem, and further exploration in this direction is poised to significantly improve our work. In addition, there are also some other promising directions for further improving this work, for example, 
% via developing new theoretic tools to analyzing non-Gaussian type gradient noise (e.g., heavy tailed noise) and 
via using other posterior and prior covariances instead of the steady-state covariance (e.g., we also give one in Theorem~\ref{thm:IF-pacbayes-FIM} in the Appendix), and via extending the analytical approach used in this work to other optimizers (e.g., Adam and Adagrad). 

\begin{acknowledgements} % will be removed in pdf for initial submission,
						 % (without ‘accepted’ option in \documentclass)
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
    This work is supported partly by an NSERC Discovery
grant. Ziqiao Wang is also supported in part by the NSERC CREATE program through the Interdisciplinary Math and Artificial Intelligence (INTER-MATH-AI) project. The authors would like to thank all the anonymous reviewers and the ACs for their careful reading and valuable suggestions.
\end{acknowledgements}

% References
\bibliography{ref}

\newpage

\onecolumn

\input{Appendix}
% \title{Title in Title Case\\(Supplementary Material)}
% \maketitle



% This Supplementary Material should be submitted together with the main paper.

% \appendix


\end{document}
