% \documentclass{uai2025} % for initial submission
\documentclass[accepted]{uai2025} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2025} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2025} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% Added extra package
\usepackage{amsmath,amsthm} % Math packages which might be useful for equations
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{subcaption,wrapfig}
\usepackage{hyperref}
\usepackage{url}
\usepackage{amsfonts}

\newcommand{\YM}[1]{{\color{purple}[YM: #1]}}





%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Adversarial Training May Induce Deteriorating Distributions}

% The standard author block has changed for UAI 2025 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:rtian081@uottawa.ca}{Runzhi Tian}{}}
\author[1]{\href{mailto:ymao@uottawa.ca}{Yongyi Mao}{}}

\affil[1]{%
    Dept of EECS\\
    University of Ottawa\\
    Canada
}

  \begin{document}
\maketitle



\begin{abstract}
  The interactions between the update of model parameters and the update of perturbation operators complicate the dynamics of adversarial training (AT). This paper reveals a surprising behavior in AT, namely that the distribution induced by adversarial perturbations during AT becomes progressively more difficult to learn. We derive a generalization bound to theoretically attribute this behavior to the increase of a quantity associated with the perturbation operator, namely, its local dispersion.  We corroborate this explanation with concrete experimental validations and show that this deteriorating behavior of the induced distributions is correlated with robust overfitting of AT. Code is available at \url{https://github.com/rzTian/AT-Deteriorating-Distributions}.
\end{abstract}


\section{Introduction}

Despite their outstanding performance, deep neural networks (DNNs) are known to be vulnerable to adversarial attacks, where a carefully designed perturbation of input may cause the network to make a wrong prediction \citep{szegedy2014intriguing, goodfellow2015explaining}. Such perturbed inputs are termed adversarial examples. The existence of adversarial examples raises great concerns when DNNs are applied to decision-critical tasks such as autonomous driving and facial recognition \citep{eykholt2018robustphysicalworldattacksdeep, sharif2016accessorize}. Many methods have been proposed to improve the robustness of DNNs against adversarial perturbations \citep{madry2019deep, DBLP:journals/corr/abs-1901-08573, DBLP:journals/corr/abs-2010-09670}, among which the framework, known as adversarial training (AT) \citep{madry2019deep}, is arguably the most effective \citep{DBLP:journals/corr/abs-1802-00420, dong2020benchmarking}. 

In a nutshell, AT may be regarded as stochastic gradient descent (SGD) on an adversarially perturbed version of the training set at each iteration.  Specifically, at each gradient descent iteration, each input instance in a training batch is first perturbed to maximize the training loss with respect to the current model parameter, and then gradient descent is performed to update the model parameter. The maximization of the training loss prior to gradient descent is constrained on a maximum allowable perturbation radius; in other words, this maximization is equivalent to an adversarial attack to the model with current parameter setting. The most popular method to solve this maximization problem is the Projected Gradient Descent (PGD) \citep{madry2019deep}.


Although AT has been shown to greatly improve the robustness of the learned model against adversarial attacks on the training set, the recent work of \citet{DBLP:journals/corr/abs-2002-11569} has revealed that models trained by AT may still be vulnerable to adversarial attacks on unseen data. Specifically, after training, even though the robust error (i.e., error probability in the predicted label for adversarially perturbed instances) is nearly zero on the training set, it may remain very high on the testing set. For example, on the testing set of CIFAR-10 \citep{krizhevsky2009learning}, the robust error of an AT-trained model can be as large as 44.19\%. This contrasts significantly with the typical observations in standard training: on CIFAR-10, when the standard error (i.e., the error probability in the predicted label for non-perturbed instances) is nearly zero on the training set, its value on the testing set is only about 4\%. This unexpected phenomenon is often referred to as robust overfitting.

Since its discovery, a great deal of research effort has been spent on understanding the cause of robust overfitting. Various perspectives have been exploited in this research direction. For instance, \citet{DBLP:journals/corr/abs-2004-05884, DBLP:journals/corr/abs-2104-04448, chen2021robust, smoothness} study the properties of the landscape of the adversarial loss; the authors of \citet{DBLP:journals/corr/abs-2102-07861} investigate the curvature of the activation functions used in the neural networks; \citet{DBLP:journals/corr/abs-2110-03135} attempt to relate robust overfitting to potential label noises in AT; \citet{xing2021on, xiao2022stability} look into the training trajectories of AT through the lens of algorithmic stability.


Despite partial answers provided by these works, the cause of robust overfitting remains largely elusive. Arguably this is due to the significant challenges posed by the complex dynamics of AT. In particular, this complexity arises from the convoluted interaction between the update of model parameter along AT iterations and the update of the adversarial perturbations in the inner maximization step. More concretely, when the model parameter gets updated, the adversarial perturbation is updated to one that attacks the updated model, and the updated adversarial perturbation in turn governs the next update of the model parameter. It is then conceivable that understanding the generalization behavior of AT requires a deep understanding of the interaction between the model updates and perturbation updates, even ``untangling'' the convoluted interaction along the training trajectory. 
This philosophy is behind the motivation of this work. 

A key observation of this paper is the recognition that 
in each AT iteration, the perturbation operator effectively induces a new data distribution and that the model update may be viewed as the standard training on data drawn from this induced distribution. Since the perturbation in each AT iteration has a small magnitude, the induced distribution is provably close to the original data distribution. However, a surprising finding in this work is that these induced distributions behave quite differently from the original distribution: as AT progresses, they may become increasingly difficult to learn. The experiments supporting this finding were conducted as follows: for a checkpoint of AT, we extract the perturbation operator and use it to perturb both the training set and test set; we then train a model from scratch on the perturbed training set, using standard training, until the (standard) training loss is effectively zero; we then evaluate the learned model on the perturbed testing set to obtain its classification error. We call such an experiment an ``induced distribution experiment'' or IDE. When conducting IDEs on datasets such as CIFAR-10, we usually observe large testing errors, particularly when the checkpoint is near the end of AT. In fact, on such datasets, the generalization gap for models learned from the induced distribution appears to progressively increase as AT proceeds.

To understand the deteriorating behavior of the induced distribution along AT, we derive a uniform-convergence upper bound of the generalization gap for models learned on the induced distributions. The key quantity in the bound is a term we call ``local dispersion'' of the perturbation operator. Our bound suggests that only when the perturbation operator has small local dispersion can a good generalization guarantee be obtained for models learned on the distribution induced by the operator. Through experiments, we show that local dispersion is indeed indicative of the generalization gap of models learned on the induced distribution and can be used to explain the deteriorating behavior of the induced distribution along the AT trajectories, as observed in our IDEs.

In summary, in this work we discover an interesting phenomenon in AT, namely, that the induced distributions by the perturbation operator in AT are progressively more difficult to learn. We prove a generalization bound as a  theoretical explanation for this phenomenon and corroborate it with experimental validations. 
Our results shed new light on the complex AT dynamics and the interaction therein between model updates and perturbation updates. Although there have been previous works examining AT trajectories, very few actually zoom into the properties of the perturbation operator. The only work that we are aware of in this direction is a recent paper of \citet{tian2025algorithmic}, where a notion of expansiveness is introduced for the perturbation operator and subsequently used to analyze robust generalization via algorithmic stability. Like that work, this paper highlights the importance of investigations from this angle in paving the way towards understanding robust generalization. This importance is further manifested by our additional experimental observation presented at the end of the paper, where we show that the deteriorating behavior of the induced distributions correlates with robust overfitting.


\section{Related works}

\paragraph{Adversarial examples}

Existing studies have uncovered intriguing properties of adversarial examples, such as their transferability across different models \citep{goodfellow2015explaining, papernot2016transferabilitymachinelearningphenomena, tramèr2017spacetransferableadversarialexamples} and their distinct geometric characteristics compared to clean examples  \citep{ma2018characterizingadversarialsubspacesusing, fawzi2018adversarialvulnerabilityclassifier}. The work in \citet{ilyas2019adversarialexamplesbugsfeatures} reveals that adversarial examples generated w.r.t a model trained via standard training may still contain useful features. Specifically, they demonstrate that a classifier trained on mislabeled adversarial examples can achieve remarkable generalization performance on unseen clean data. Theoretical explanations for this finding are then provided in \citet{kumano2024theoreticalunderstandinglearningadversarial, kumano2025widetwolayernetworkslearn}. Additionally, the work in \citet{zhang2022adversarialnoiseslinearlyseparable} presents another intriguing finding that adversarial perturbations for two-layer neural networks with random weights are linearly separable, suggesting structural properties of adversarial perturbations exist. 

Unlike \citet{ilyas2019adversarialexamplesbugsfeatures} and \citet{zhang2022adversarialnoiseslinearlyseparable}, who focus on adversarial examples for models trained via standard training or with random weights, our work explores adversarial examples along AT trajectories, providing new insights into how features of adversarial examples evolve throughout the training process.

\paragraph{Adversarial Robustness} A growing body of work has investigated the underlying causes of adversarial vulnerability, especially in linear and high-dimensional settings. \citet{tanay2016boundary} offered a geometric perspective, suggesting that adversarial examples arise when the decision boundary extends beyond the data manifold; in such regions, the boundary may lie close to data points, even if it remains distant within the manifold itself. \citet{tanner2024high} analyzed adversarial training for margin-based linear classifiers in high dimensions, highlighting how the interplay between data geometry and attack direction influences robustness. \citet{ribeiro2023regularization} examined adversarial training in linear regression, showing that it induces different forms of implicit regularization depending on whether the model is overparameterized or underparameterized. Similarly, \citet{javanmard2020precise} studied the trade-off between robustness and standard accuracy using linear regression with Gaussian features, providing precise theoretical characterizations in the high-dimensional regime. 

\paragraph{Robust generalization}
Different from standard generalization, robust generalization for deep neural networks \textemdash especially on high-dimensional data \textemdash appears significantly more challenging. Various works have attempted to understand the reason behind this. \citet{DBLP:journals/corr/abs-1804-11285} proves that in simple data models such as the Gaussian and Bernoulli models, robust generalization requires significantly higher sample complexity than standard generalization. The sample complexity of robust generalization has been further analyzed using classical statistical learning tools, including Rademacher complexity \citep{khim2019adversarial, DBLP:journals/corr/abs-1810-11914, DBLP:journals/corr/abs-2004-13617, xiao2022adversarial, DBLP:journals/corr/abs-1810-02180}, VC dimension \citep{DBLP:journals/corr/abs-1902-04217} and algorithmic stability analysis \citep{xing2021on, xiao2022stability}, as well as the PAC learning frameworks \citep{cullina2018paclearning, DBLP:journals/corr/abs-1906-05815}.

Beyond sample complexity, several theoretical perspectives have been explored. The work of \citet{li2022robust} analyzes robust generalization through the lens of neural networks' expressive power, showing that practical models may lack sufficient capacity to achieve low robust test error. The authors of \citet{DBLP:journals/corr/abs-1906-02931} investigate the inductive bias of gradient descent for AT, while another line of research connects AT with distributionally robust optimization (DRO) \citep{kuhn2019wasserstein, sinha2020certifying}. The works of \citet{staib2017distributionally} and \citet{bui2022unified} demonstrate that different AT schemes can be reformulated as special cases in DRO.  \citet{bennouna2023certified} further show that, under a saddle-point assumption, AT inevitably leads to a larger generalization gap than directly solving empirical risk minimization using adversarially perturbed data.
Numerous endeavors have been undertaken to address the challenge of robust overfitting with various empirical training algorithms proposed. \citet{bai2021recent} and \citet{qian2022survey} provide a comprehensive overview of the latest developments in empirical research in this field.


\section{Preliminaries and problem setup}

We consider a classification setting with input space ${\mathcal X}\subseteq \mathbb{R}^{d}$ and  label space ${\mathcal Y}:=\{1,2,\cdots, K\}$. We use ${\cal D}$ to denote a distribution on $\mathcal{X}\times\mathcal{Y}$ and denote ${\cal D}^{\cal X}$ as the marginal distribution of ${\cal D}$ on ${\cal X}$. 
Let $\Theta$ be the parameter space of a parameterized model of interest, and for each  $\phi\in \Theta$, let $f_{\phi}:\mathcal{X}\times\mathcal{Y}\to \mathbb{R}_{+}$ denote a model which consists of a loss function (e.g., the cross-entropy loss or 0-1 loss) and a classifier $h_\phi$ with parameter $\phi$.


For any data distribution $\mathcal {D}$ and any model $f_\phi$, we define the model's \textit{standard population risk}
$R_{\cal D}(\phi)$ as 
\begin{equation}
\label{def:standard pop risk}
R_{\mathcal{D}}(\phi):=\mathbb{E}_{(x, y)\sim \mathcal{D}}\left[f_{\phi}(x,y)\right]
\end{equation}

For a set of $m$ samples $S:=\{(x_i, y_i)\}_{i=1}^{m}$ drawn i.i.d.
from ${\cal D}$, we define the model's standard empirical risk $R_{S}(\phi)$ as
\begin{equation}
\label{def:standard emp risk}
R_{S}(\phi):=\frac{1}{m}\sum_{i=1}^{m}f_{\phi}(x_i,y_i)
\end{equation}

The standard generalization performance of the model $f_\phi$ is then measured by the \textit{standard generalization gap}: 
\begin{equation}
\label{def: std gap}
 {\rm GG}_{m}(\phi, S; {\cal D}):= |R_{\mathcal{D}}(\phi)-R_{S}(\phi)|   
\end{equation}


\paragraph{Adversarial perturbations} Let $\mathbb{B}_{\infty}(x,\epsilon)$ denote the $\ell_\infty$-norm ball centered at $x$ with radius $\epsilon$, i.e., $\mathbb{B}_{\infty}(x,\epsilon):=\{t\in\mathbb{R}^{d}: \|t-x\|_{\infty}\le \epsilon\}$. Given any instance-label pair $(x,y)\in {\cal X}\times{\cal Y}$ and a target model $f_{\phi}$ parameterized by $\phi$,
we define the {\em $\epsilon$-adversarial perturbation of $x$ with respect to $f_{\phi}$} as
\begin{equation}
    {\cal Q}_{\phi}(x,y) := \arg\max_{v
\in \mathbb{B}_{\infty}(x,\epsilon)} f_{\phi}(v, y)
\end{equation}
Clearly the operator ${\cal Q}_\phi$ also depends on the allowable perturbation magnitude $\epsilon$, but we suppress such dependency in our notations throughout the paper for simplicity. 


\paragraph{Adversarial risks} Given a data distribution ${\cal D}$ and its i.i.d.\ samples $S$, we define the \textit{adversarial population risk} $R_{\mathcal{D}}^{\rm adv}(\phi)$ and the \textit{adversarial empirical risk}  $R_{S}^{\rm adv}(\phi)$ of a model $f_\phi$ respectively as
\begin{equation}
\label{def:rob pop risk}
R_{\mathcal{D}}^{\rm adv}(\phi):=\mathbb{E}_{(x, y)\sim \mathcal{D}}f_{\phi}({\cal Q}_{\phi}(x,y),y)
\end{equation}
and
\begin{equation}
\label{eq:rob emp risk} 
R_{S}^{\rm adv}(\phi):=\frac{1}{m}\sum\limits_{i=1}^{m}f_{\phi}({\cal Q}_{\phi}(x_i,y_i),y_i)
\end{equation}


\paragraph{Adversarial training} Given a training set $S$, at the $t^{\rm th}$ iteration of adversarial training (AT), 
where the model parameter is $\phi_t$, the model parameter is updated, with learning rate $\eta$, by
\begin{equation}
\label{eq:PGD_update}
    \phi_{t+1} = \phi_{t}-\eta\nabla_{\phi_t}\left[\frac{1}{m}\sum\limits_{i=1}^{m} f_{\phi_t}\left({\cal Q}_{\phi_t}(x_i, y_i), y_i\right)\right]
\end{equation}

We note that when optimizing $f_{\phi}({\cal Q}_{\phi}(x,y), y)$ using gradient descent, although ${\cal Q}_{\phi}$ is also a function of $\phi$, the gradient does not propagate through the perturbation operator ${\cal Q}_{\phi}$, a choice consistent with the standard AT implementation as in \citet{madry2019deep, DBLP:journals/corr/abs-2002-11569}.

Notably, the update equation (\ref{eq:PGD_update}) of AT results in complex dynamics: the update of $\phi$ causes the update of the perturbation operator ${\cal Q}_{\phi}$, and the update of ${\cal Q}_{\phi}$ in turn influences the next update of $\phi$. This complex interaction between the model parameter and the perturbation operator makes analyzing AT trajectories very difficult.

One key perspective of this work is recognizing that at training iteration $t$, the perturbation operator 
${\cal Q}_{\phi_t}$ essentially induces a different distribution and that the AT step in (\ref{eq:PGD_update}) may be seen as a one-step gradient descent on the standard empirical risk of training data drawn from this induced distribution. We next make this precise. 


\paragraph{Perturbation induced distribution} 

Let $(X, Y)$ be drawn from ${\cal D}$. Given an adversarial perturbation ${\cal Q}_{\phi}$, the \textit{perturbation induced distribution} (or simply induced distribution) is defined as the joint distribution of $({\cal Q}_{\phi}(X,Y), Y)$ and is denoted by $\tilde{\cal D}_{\phi}$. 
For a given training set $S=\{(x_i, y_i)\}_{i=1}^m$, 
denote $\tilde{S}_{\phi}:=\{(v_i, y_i)\}_{i=1}^{m}$, where
$v_i:={\cal Q}_{\phi}(x_i,y_i)$. It is clear that the samples $\tilde{S}_{\phi}$ are drawn from the induced distribution $\tilde{\cal D}_\phi$.

Since each perturbed instance ${\cal Q}_{\phi}(x, y)$ lies within a small neighborhood of $x$ (i.e., $\|{\cal Q}_{\phi}(x, y)- x\|_{\infty}\le \epsilon$), it follows immediately that for any $\phi$, the Wasserstein $p$-distance (denoted by ${\cal W}_p(\cdot,\cdot)$) between ${\cal D}$ and $\tilde{\cal D}_\phi$ satisfies
\begin{equation}
\label{ineq: wassterstein}
    {\cal W}_{p}(\tilde{\cal D}_{\phi}, {\cal D})\le \epsilon
\end{equation}
for any $p\in [1, +\infty]$. Here the metric, say $d$, on ${\cal X}\times {\cal Y}$ by which the Wasserstein distance is defined,  is
\[
d((x, y), (x', y')):= \Vert x-x'\Vert_\infty + d_{\cal Y}(y, y')
\]
where $d_{\cal Y}$ is an arbitrary metric on ${\cal Y}$.

Notably, in the context of adversarial training, the maximum perturbation magnitude $\epsilon$ is usually small. Then by inequality (\ref{ineq: wassterstein}), the distribution $\tilde{\cal D}_\phi$ induced by the perturbation operator ${\cal Q}_\phi$ during AT is very close to the original data distribution ${\cal D}$. However, a surprising observation in this work is that models trained (via standard training) on ${\cal D}$ and on $\tilde{\cal D}_\phi$ may have very different behaviors.

It is also worth noting that $R_{\mathcal{D}}^{\rm adv}(\phi)=R_{\tilde{\cal D}_{\phi}}(\phi)$ and $R_{S}^{\rm adv}(\phi)=R_{\tilde{S}_{\phi}}(\phi)$ \textemdash the adversarial risks of $\phi$ can be treated as the standard population (resp. empirical) risk of $\phi$ measured on the induced distribution (resp. the samples drawn from the induced distribution) generated by $\phi$. 

Following the definition of generalization gap in (\ref{def: std gap}), the notations ${\rm GG}_{m}(\phi, \tilde{S}_{\phi}; \tilde{\cal D}_{\phi})$ and 
${\rm GG}_m(\theta, \tilde{S}_{\phi}; \tilde{\cal D}_{\phi})$ are both well defined, where the former is
the \textit{robust generalization gap} of an arbitrary model $f_\phi$ and 
the latter is the standard generalization gap of an arbitrary model $f_\theta$ measured with respect to a given induced distribution 
$\tilde{\cal D}_\phi$ and its samples $\tilde{S}_\phi$.


\section{Learning on the induced distributions}
\label{sec: experiments}

\begin{figure*}
    \centering
    \subfigure[Clean dataset]{\includegraphics[width=0.24\textwidth]{uai2025-template/training dynamic/train_curve_cifar10/clean.pdf}}
    \subfigure[$\phi$=AT(0)]{\includegraphics[width=0.24\textwidth]{uai2025-template/training dynamic/train_curve_cifar10/chkpt0.pdf}}
    \subfigure[$\phi$=AT(80)]{\includegraphics[width=0.24\textwidth]{uai2025-template/training dynamic/train_curve_cifar10/chkpt80.pdf}}
    \subfigure[$\phi$=AT(200)]{\includegraphics[width=0.24\textwidth]{uai2025-template/training dynamic/train_curve_cifar10/chkpt200.pdf}}
    \caption{Learning curves of standard training on the clean CIFAR-10 dataset and IDEs w.r.t various $\phi$. In each training, the learning rate is decayed at the $100^{\rm th}$ epoch.}
     \label{fig: train curves-cifar10}
\end{figure*}

In this section, we experimentally study the problem of learning on the induced distribution $\tilde{\cal D}_{\phi}$, where $\phi$ is the parameter of a model being trained during AT. 

\paragraph{Induced distribution experiment}
Let $S$ and $T$ be the training set and testing set of a classification task. We perform AT for a neural network model using $S$. Let ${\rm AT}(t)$ denote that model's parameter obtained by performing AT for $t$ epochs.  For some choice of $t$, we obtain model parameter $\phi={\rm AT}(t)$. We then perturb $S$ and $T$ using ${\cal Q}_\phi$, and obtain the perturbed training and testing datasets $\tilde{S}_\phi$ and $\tilde{T}_\phi$ respectively. A new model (with the same architecture) is then trained from scratch (namely, starting from random initialization of its parameters) on $\tilde{S}_\phi$ \textbf{using standard training}, and we denote the learned model parameter by $\theta$. This model $\theta$ is evaluated on $\tilde{T}_\phi$. For ease of reference, we call such an experiment an ``induced distribution experiment'' (IDE).

In our IDEs, ${\cal Q}_{\phi}$ is taken as the Projected Gradient Descent (PGD) attack \citep{madry2019deep}, which is used both for AT and for generating the perturbed datasets. Other details of the experiments are given below.



\paragraph{Datasets}
The experiments are conducted on CIFAR-10 and CIFAR-100 \citep{krizhevsky2009learning}. We also conduct experiments on a ``scaled-down'' version of the ImageNet dataset \citep{russakovsky2015imagenet}, which we call 
Reduced ImageNet, drawing inspiration from a 
similar approach in \citet{tsipras2019robustness} for reduced training complexity. Reduced ImageNet aggregates several subsets of the original ImageNet and comprises 10 classes, each containing 5000 training samples and approximately 1000 testing samples per class. More details concerning this dataset are given in Appendix \ref{section: setup}.


\paragraph{Settings for AT and PGD} 
On CIFAR-10 and Reduced ImageNet we perform AT to train the pre-activation ResNet (PRN) model \citep{DBLP:journals/corr/HeZR016} with 18 and 50 layers respectively. On CIFAR-100 we train the Wide ResNet (WRN) model with 34 layers \citep{DBLP:journals/corr/ZagoruykoK16}. We use 5-step PGD with $\epsilon=4/255$ for Reduced ImageNet and 10-step PGD with $\epsilon=8/255$ for CIFAR-10 and CIFAR-100, following \citet{DBLP:journals/corr/abs-2002-11569}. We set the PGD step size to $\lambda=2/255$ on CIFAR-10 and CIFAR-100, and to $\lambda=0.9/255$ on Reduced ImageNet. More details concerning the hyper-parameter settings are given in Appendix \ref{section: setup}.


\paragraph{Experimental results} 
Let $\phi={\rm AT}(0)$ denote a randomly initialized model. Figure \ref{fig: train curves-cifar10}(b)-(d) presents the learning curves of IDEs conducted on the CIFAR-10 datasets for $\phi$ obtained after AT for different numbers of epochs, while Figure 
\ref{fig: train curves-cifar10}(a) shows the learning
curves of standard training on the clean CIFAR-10 dataset for comparison. The green and red curves respectively represent the training and testing error recorded along the training process. In all cases, the model is trained to achieve zero training error. However, the testing error varies significantly in different IDEs. On the clean dataset, the model attains a testing error as low as 4.13\%; a similar performance is observed on the IDE with $\phi={\rm AT}(0)$, where the testing error reaches around 6.06\%. In contrast, for $\phi={\rm AT}(80)$, the learned model shows a reduced generalization performance, with the testing error increasing to 11.38\%. A more significant rise in the testing error occurs when a model is trained on the perturbed dataset generated by $\phi={\rm AT}(200)$, where the testing error increases to 24.89\%. Similar results are also observed on CIFAR-100 and Reduced ImageNet (see Figures~\ref{fig: train curves-cifar100} and~\ref{fig: train curves-ResImg} in Appendix~\ref{Appendix: omit figs}).

For IDE with $\phi={\rm AT}(200)$, a large generalization gap \textemdash the gap between the red and green curves \textemdash emerges in the early phase of the training (around the $20^{\rm th}$ training epoch). After the drop of learning rate (at the $100^{\rm th}$ training epoch), the training error quickly reduces to zero, yet the generalization gap remains nearly unchanged, resulting in a high final testing error. This is in contrast to the learning behavior observed on the clean dataset and the IDE with $\phi={\rm AT}(0)$, where a small generalization gap is established at the early phase of training and is consistently preserved along the training. 

These experiments reveal a rather surprising  phenomenon: despite $\tilde{\cal D}_{\phi}$ being very close to ${\cal D}$, the model's learning performance on the induced distribution $\tilde{\cal D}_\phi$ can be significantly different from that on ${\cal D}$. In particular, as AT proceeds, the induced distribution  $\tilde{\cal D}_\phi$ may deteriorate, in the sense that it becomes increasingly more difficult to generalize, as signified by the increasing  generalization gap. 


\section{Theoretical analysis}

In this section, we provide a theoretical analysis to explain the deteriorating learning behavior of the induced distribution along AT. Specifically, we derive an upper bound for the ``worst-case'' generalization gap $\sup_{\theta\in\Theta}{\rm GG}_{m}(\theta, \tilde{S}_{\phi}; \tilde{\cal D}_{\phi})$. 

\begin{assumption}[Anchored data model]
\label{assumption: anchor}
    We assume that underlying the data distribution 
    ${\cal D}$, there is a latent distribution, or ``anchor distribution'',  ${\cal D}_*$ on ${\cal X} \times {\cal Y}$. ${\cal D}_*$ is specified by its marginal ${\cal D}_*^{\cal X}$ on ${\cal X}$ and a classifier  $h^{*}:{\cal X}\to{\cal Y}$ (which assigns every sample drawn from ${\cal D}_*^{\cal X}$ a label in ${\cal Y}$). The data distribution ${\cal D}$ of interest is a ``smoothed'' version of ${\cal D}_*$ as follows: Draw an ``anchor variable'' $T$ from ${\cal D}_{*}^{\cal X}$. Then draw a noise $\rho$ independent of $T$ from a distribution $\pi$ (on 
    ${\mathbb R}^d$) with zero mean and a finite variance in each dimension (recall that ${\cal X}$ is a subset of ${\mathbb R}^d$) \textemdash we assume the variance in each dimension is small.  The distribution of 
    $(T+\rho, h^*(T))$ is then the distribution ${\cal D}$.

\end{assumption}



\begin{remark}
    In this anchored data model, the true input variable $X$ is treated as a noise-perturbed version of an anchor variable $T\sim {\cal D}_{*}^{\cal X}$. Such an assumption is widely used in various machine learning contexts, for example, in the VAE model \citep{Kingma_2019} where the reconstruction loss adopts the squared error loss.
    
    On the other hand, the assumption that $X=T+\rho$ shares the same label as $T$ is sensible, since one expects that within a small neighborhood of $T$, the class label remains unchanged.
    
\end{remark}


Given a model class ${\cal F}:=\{f_{\theta}: \theta\in\Theta\}$,
we now study its generalization performance w.r.t the induced distributions. Specifically, we will derive an upper bound for the generalization gap ${\rm GG}_{m}(\theta, \tilde{S}_{\phi}; \tilde{\cal D}_{\phi})$ for all $\theta \in \Theta$.  As it turns out, a key quantity governing the upper bound is a local property of the perturbation map ${\cal Q}_{\phi}$ that induces $ \tilde{\cal D}_{\phi}$.

\begin{definition}[Local dispersion]

For any $(x,y)\in {\cal X}\times {\cal Y}$, we define the local dispersion $\tilde{\gamma}_{\phi}(x, y)$ of the perturbation mapping ${\cal Q}_{\phi}$ at 
 $(x,y)$ as
\begin{equation}
\label{eq:localDispersion}
\tilde{\gamma}_{\phi}(x, y):=\mathbb{E}_{\rho, \rho'} \|\mathcal{Q}_{\phi}(x+\rho, y)-\mathcal{Q}_{\phi}(x+\rho', y)\|_{2}^{2},
\end{equation}
where $\rho$ and $\rho'$ are drawn independently from 
$\pi$.
\end{definition}


\begin{remark}
We refer to this quantity as the {\em local dispersion} of ${\cal Q}_{\phi}$, as it measures 
how far apart the operator ${\cal Q}_{\phi}$ disperses two noise-perturbed versions of $(x, y)$.
In fact, one may verify that $\tilde{\gamma}_{\phi}(x, y)$ can be expressed as 
\begin{equation}
\label{eq: trace}
    \tilde{\gamma}_{\phi}(x, y)= 2 \cdot {\rm Trace}\left({\rm COV}_{\rho}({\cal Q}_{\phi}(x+\rho, y))\right)
\end{equation}
where $\rho$ is drawn from $\pi$ and ${\rm COV}_{\rho}({\cal Q}_{\phi}(x+\rho, y))$ denotes the covariance matrix. That is, $\tilde{\gamma}_{\phi}(x, y)$ also measures how far ${\cal Q}_{\phi}$ spreads a randomly perturbed version of $(x, y)$.  We defer the proof of (\ref{eq: trace}) to Appendix \ref{section: proof of 10}.
\end{remark}


One may argue intuitively that smaller local dispersion of ${\cal Q}_{\phi}$ may allow the model to generalize better when learning on the distribution $\tilde{\cal D}_{\phi}$: consider an instance $(T, Y)$ drawn from the anchor distribution ${\cal D}_*$, and two observed data points
$(T+\rho, Y)$ and $(T+\rho', Y)$ (with $\rho$ and $\rho'$ drawn independently from $\pi$).  Suppose that $(T+\rho, Y)$ is included in the training set and $(T+\rho', Y)$
is included in the testing set. When the local dispersion is small, the perturbed version of the training point $({\cal Q}_{\phi}(T+\rho,Y), Y)$ and that of the testing point $({\cal Q}_{\phi}(T+\rho',Y), Y)$ (both of which are realizations from $\tilde{\cal D}_{\phi}$) are close, allowing the model's prediction on the latter to behave similarly as that on the former. 

We now rigorously formalize this intuition, under the following assumptions. 
\begin{itemize}
\item (Lipschitzness of $f_{\theta}$ over ${\cal X}$)
For any $y\in\mathcal{Y}$ and any $\theta\in\Theta$,  $|f_{\theta}(x,y)-f_{\theta}(x',y)|\le \beta\|x-x'\|_2$ for all $x,x'\in \mathcal{X}$. 

\item (Boundedness) $\sup\limits_{(x,y)\in\mathcal{X}\times{\mathcal{Y}}}|f_{\theta}(x,y)|\le B<\infty$ for any $\theta\in\Theta$. 


\end{itemize}


The generalization gap (\ref{def: std gap}) then has the following uniform convergence result:
\begin{lemma}
\label{lemma: uniform convergence}
    Consider the model class ${\cal F}$ where each $f_{\theta}\in {\cal F}$ satisfies the above boundedness condition.
    For any $\phi$ (or $\tilde{\cal D}_{\phi}$), with probability at least $1-\tau$ over drawing $\tilde{S}_{\phi}$ from $\tilde{\cal D}_{\phi}$, we have 
    \begin{align}
    \label{eq: bound-lemma}
         &\sup_{\theta\in\Theta}{\rm GG}_{m}(\theta, \tilde{S}_{\phi}; \tilde{\cal D}_{\phi}) \nonumber\\
         &\le \mathbb{E}_{\tilde{S}_{\phi}\sim \tilde{\cal D}^{m}_{\phi}} \sup_{\theta\in\Theta}{\rm GG}_{m}(\theta, \tilde{S}_{\phi}; \tilde{\cal D}_{\phi}) +  2B\sqrt{\frac{\log\frac{1}{\tau}}{2m}}  
    \end{align}
   
\end{lemma}

The proof of the lemma is deferred to Appendix \ref{section: proof-lemma}. 
Building upon Lemma~\ref{lemma: uniform convergence}, we now derive an upper bound for $\mathbb{E}_{\tilde{S}_{\phi}\sim \tilde{\cal D}^{m}_{\phi}} \sup_{\theta\in\Theta}{\rm GG}_{m}(\theta, \tilde{S}_{\phi}; \tilde{\cal D}_{\phi})$ where the local dispersion of ${\cal Q}_{\phi}$ plays a role.



\begin{theorem} 
\label{thm-main}
Consider the model class ${\cal F}$ where each $f_{\theta}\in {\cal F}$ satisfies the above Lipschitzness and boundedness conditions. Consider the data distribution ${\cal D}$ which satisfies Assumption~\ref{assumption: anchor}. Let $\tilde{\mathcal{D}}_{\phi}$ denote the induced distribution of ${\cal D}$, generated by a perturbation ${\cal Q}_{\phi}$. We have 

\begin{align}
\label{eq: bound-thm}
   & \mathbb{E}_{\tilde{S}_{\phi}\sim \tilde{\cal D}^{m}_{\phi}} \sup_{\theta\in\Theta}{\rm GG}_{m}(\theta, \tilde{S}_{\phi}; \tilde{\cal D}_{\phi})
      \nonumber\\
    & \le \frac{2\beta}{\sqrt{m}}\sqrt{\mathbb{E}_{(x, y)\sim {\mathcal{D}_{*}}}\tilde{\gamma}_{\phi}(x,y)} + \frac{2(\beta\sqrt{d}\epsilon+B)}{\sqrt{m}}
\end{align}
\label{main theorem}
\end{theorem}

We defer the proof of the theorem to Appendix \ref{section: proof-thm-main}. Combining (\ref{eq: bound-thm}) with (\ref{eq: bound-lemma}) immediately gives an upper bound for the generalization gap (\ref{def: std gap}) that applies for any $\theta\in\Theta$. 

\begin{remark}
    The derivation of Theorem \ref{thm-main} is based on a modification of the Rademacher complexity analysis. 
    It is worth noting that any direct application of Rademacher complexity 
    to establish a learning bound requires certain restriction on the hypothesis class ${\cal F}$, thus suffering from a loss of generality.
\end{remark}


The theorem suggests that the generalization gap of any $f_{\theta}$ w.r.t.\ the distribution $\tilde{\mathcal{D}}_{\phi}$ is affected by the 
expected local dispersion (ELD)
$\mathbb{E}_{{\mathcal{D}_{*}}}\tilde{\gamma}_{\phi}(x,y)$ 
of ${\cal Q}_{\phi}$ and that a small generalization gap  can be uniformly attained\textemdash for every $f_{\theta}\in{\cal F}$ \textemdash with high probability when ELD $\mathbb{E}_{{\mathcal{D}_{*}}}\tilde{\gamma}_{\phi}(x,y)$ is small. 

An interpretation of this theorem is that the learning difficulty of the induced distribution $\tilde{\cal D}_\phi$ may be attributed to the ELD 
$\mathbb{E}_{{\mathcal{D}_{*}}}\tilde{\gamma}_{\phi}(x,y)$ of the perturbation operator ${\cal Q}_\phi$.  But since the theorem only provides an upper bound,  such an interpretation is only valid if the upper bound in the theorem is indicative of the true generalization gap.  We next report experimental measurements to show this is indeed the case. 


\begin{figure*}[!t]
    \centering
    \subfigure[ELDs for different $\sigma$]{\includegraphics[width=0.24\textwidth]{uai2025-template/training dynamic/disper_figs/cifar10/different_std.pdf}} 
    \subfigure[$\sigma=0.001$]{\includegraphics[width=0.24\textwidth]{uai2025-template/training dynamic/disper_figs/cifar10/ELD_Gaussian_std0.001.pdf}}         
    \subfigure[$\sigma=0.005$]{\includegraphics[width=0.24\textwidth]{uai2025-template/training dynamic/disper_figs/cifar10/LD_Gaussian_hist_0.005.pdf}}
    \subfigure[$\sigma=0.001$]{\includegraphics[width=0.24\textwidth]{uai2025-template/training dynamic/disper_figs/cifar10/LD_Gaussian_hist_0.001.pdf}} 
     \caption{Local dispersion measured on the CIFAR-10 test set. (a) ELDs estimated using different $\sigma$ values. For different choice of $\sigma$, the estimated ELDs fall within different ranges. To clearly compare the trends of ELD across different $\sigma$, we plot all estimations in the same graph and position their respective vertical axes on the sides of the figure. (b) ELD (green curve) of ${\cal Q}_\phi$ for different $\phi$ in comparison  to the generalization gap achieved on $\tilde{\cal D}_\phi$. (c) and (d): histograms of $\tilde{\gamma}_{\phi}(x,y)$ for three distinct $\phi$.}
\label{fig: disper}
\end{figure*}


\section{Experimental Validation}

We conducted experiments to estimate the ELD of ${\cal Q}_{\phi}$ for $\phi={\rm AT}(t)$ with various $t$ values along the AT trajectory. Note that the expectation here is over the distribution ${\cal D}_*$, from which no samples are available. However, due to the relationship between ${\cal D}^{\cal X}$ and ${\cal D}_{*}^{\cal X}$, namely that ${\cal D}^{\cal X}$ is  merely a slightly smoothed version of ${\cal D}_{*}^{\cal X}$ (since $\pi$ has small variances), one expects that when we draw $x$ from ${\cal D}^{\cal X}$, ${\cal D}^{\cal X}(x) \approx {\cal D}_{*}^{\cal X}(x)$ with high probability. As a consequence, 
${\mathbb E}_{{\cal D}_*} \tilde{\gamma}_\phi(x, y) 
\approx {\mathbb E}_{{\cal D}} \tilde{\gamma}_\phi(x, y)$ with high probability. But the latter can be estimated using the i.i.d. samples from ${\cal D}$. This gives us the following estimation formula for ELD:
 \[
 {\mathbb E}_{{\cal D}_*} \tilde{\gamma}_{\phi}(x, y) \approx \frac{1}{m} \sum_{i=1}^m  \tilde{\gamma}_{\phi}(x_i, y_i),
 \]
 where $\{(x_i, y_i)\}_{i=1}^{m}$ are drawn from ${\cal D}$. 
 
 

Estimating the local dispersion $\tilde{\gamma}_{\phi}(x_i, y_i)$ requires the knowledge of $\pi$, which is unfortunately unavailable to us. In our experiments, we take $\pi$ as a spherical Gaussian, with variance in each dimension equal to $\sigma^2$.  Various values of $\sigma^2$ are considered in our experiments. The estimation of each $\tilde{\gamma}_{\phi}(x_i, y_i)$ is done by Monte-Carlo approximation via sampling 250 pairs of $(\rho, \rho')$ from $\pi$. The expectation in (\ref{eq:localDispersion}) is then approximated using the sample mean. 

\paragraph{Same trend of ELD estimated from different $\sigma$} 
Figure \ref{fig: disper}(a) shows the estimated ELD values with $\phi={\rm AT}(t)$ using $\sigma = 0.001$, $0.005$, and $0.01$, respectively. In the figure, the three curves, each corresponding to a different $\sigma$ value, have very similar trends. In fact, when adjusting the range of vertical axes, the three curves closely align with each other. 


\paragraph{ELD as an indicator of generalization gap} 
Figure \ref{fig: disper} (b) presents the generalization gaps of the models learned on various $\tilde{\cal D}_{\phi}$ (red curve) and the estimated ELD values of the corresponding ${\cal Q}_{\phi}$ (green curve). In the experiments, we set $\sigma=0.01$ for ELD estimation. In each IDE, the model is trained to achieve zero training error, hence the generalization gaps in Figure \ref{fig: disper} (b) correspond directly to the testing errors of the learned models. As shown in the figure, when the ELD of ${\cal Q}_{\phi}$ is small, the model learned on the corresponding $\tilde{\cal D}_{\phi}$ tends to achieve a smaller generalization gap. This empirical observation aligns with the theoretical findings in Theorem \ref{main theorem}. The positive correlation between the red and green curves in Figure \ref{fig: disper}(b) suggests that the local dispersion of the perturbation operator significantly affects the generalization performance of the models learned on the induced distribution. This also validates the usefulness of Theorem \ref{thm-main}, corroborating ELD as an indicator of the generalization gap for the induced distributions. 





\paragraph{Increasing dispersiveness along AT} 
Since in our experiments $\phi$ is obtained at different AT epochs, the upward trend in the green curve of Figure \ref{fig: disper}(b) and that of all the three curves in Figure \ref{fig: disper}(a) suggest that performing AT for more iterations tends to make the perturbation operator ${\cal Q}_{\phi}$ increasingly dispersive. To further illustrate this trend, Figure \ref{fig: disper} (c) and (d) respectively plot the histograms of $\tilde{\gamma}_{\phi}(x,y)$ for $\phi={\rm AT}(20),{\rm AT}(100), {\rm AT}(200)$, estimated using different $\sigma$ values. As shown in both figures, the histograms shift progressively to the right as AT is performed for more iterations, indicating that the perturbation operator ${\cal Q}_{\phi}$ becomes more locally dispersive as ${\phi}$ evolves in AT. Similar experimental results are also observed on CIFAR-100 and Reduced ImageNet (see Figures \ref{fig: disper-cifar100} and \ref{fig: disper-resimg} in Appendix \ref{Appendix: omit figs}).

\paragraph{Summary} From Theorem \ref{thm-main} and these experiments, one may conclude that the deteriorating learning performance on the induced distribution along the AT trajectory can be attributed to the progressive increase of local dispersions of the perturbation operators. It remains unclear what causes perturbation operators in AT to become increasingly dispersive. Nonetheless, this study may shed new light on the complex dynamics of AT. In particular, we show next that the deterioration of the induced distribution along the AT trajectory is correlated with robust overfitting. 





\section{Correlation with Robust generalization}

\begin{figure*}
    \centering
    \subfigure[CIFAR-10]{\includegraphics[width=0.24\textwidth]{uai2025-template/training dynamic/fig_cifar10/PreResNet_linf.pdf}}
    \subfigure[CIFAR-100]{\includegraphics[width=0.24\textwidth]{training dynamic/fig_cifar100/WideResNet_linf.pdf}}
    \subfigure[Reduced ImageNet]{\includegraphics[width=0.24\textwidth]{training dynamic/resimg.pdf}}
    \subfigure[MNIST]{\includegraphics[width=0.24\textwidth]{training dynamic/fig_mnist/l_inf.pdf}}
    \caption{Robust generalization gap of $\phi={\rm AT}(t)$ in comparison to the IDE test error w.r.t.\ $\phi$.
    Note that since models in each IDE are trained to achieve zero training error, the IDE test error effectively represents the standard generalization gap achieved on the induced distribution.
    The trend of the red curves matches that of the yellow curves in each sub-figure, demonstrating a compelling correlation between these two quantities.}
     \label{fig:IDE-robust gengap}
\end{figure*}
We now explore if the (standard) generalization performance of models learned on the induced distribution $\tilde{\cal D}_{\phi}$ along the AT trajectory has any connection to the {\em robust generalization} performance of $\phi$ on original data distribution ${\cal D}$. 

We conduct extra IDEs for $\phi$ collected along AT at various epochs and compare the IDE testing errors with the robust generalization performance of the corresponding $\phi$. AT and each IDE are repeated five times with different random seeds. 

The experimental results on CIFAR-10 and CIFAR-100 are shown in Figure \ref{fig:IDE-robust gengap}(a) and (b), where the green and yellow curves respectively report the adversarial training error and the robust generalization gap of $\phi$ (i.e., $R_{S}^{\rm adv}(\phi)$ and ${\rm GG}_{m}(\phi, \tilde{S}_{\phi}; \tilde{\cal D}_{\phi})$). The two curves
illustrate a phenomenon known as robust overfitting  \citep{DBLP:journals/corr/abs-2002-11569}: after a certain point in AT, the robust generalization gap steadily increases while the adversarial training error constantly decreases. The red curves in the figures depict the standard testing errors achieved in each IDE (i.e., $R_{\tilde{\cal D}_{\phi}}(\theta)$ with $\theta$ learned on $\tilde{S}_{\phi}$). Notably, a significant rise in the IDE testing error is observed when $\phi$ is taken between ${\rm AT}(80)$ and ${\rm AT}(120)$, increasing from 3.6\% to 27.68\% for CIFAR-10 and from 19\% to 48.99\% for CIFAR-100. Furthermore, this shift coincides with the onset of robust overfitting, where a significant rise in ${\rm GG}_{m}(\phi, \tilde{S}_{\phi}; \tilde{\cal D}_{\phi})$ is also observed. 


These results further demonstrate that $\tilde{\mathcal{D}}_{\phi}$ becomes harder to learn as AT progresses. More importantly, it shows that the appearance of this deteriorating induced distribution is closely linked to the onset of the robust overfitting phenomenon, revealing a correlation between the two. This correlation is further demonstrated by experimental results on Reduced ImageNet (see Figure \ref{fig:IDE-robust gengap} (c)), where robust overfitting emerges at an earlier training stage and simultaneously a rise in $R_{\tilde{\cal D}_{\phi}}(\theta)$ occurs. This increment in $R_{\tilde{\cal D}_{\phi}}(\theta)$ is also substantial, with an averaged error of 21.65\% at ${\rm AT}(20)$ elevating to 38.52\% at ${\rm AT}(60)$.
\begin{figure}[!b]
    \centering
    \includegraphics[width=0.32\textwidth]{training dynamic/fig_cifar10/WideResNet_linf_weightdecay.pdf}
    \caption{AT with various weight decay rates and the test error achieved in IDEs for each of the AT variants. The blue curves are reproduced from Figure \ref{fig:IDE-robust gengap}(a), serving as a reference for a clear comparison. The results further solidify the correlation between the robust generalization and the generalization performance on the induced distribution.}
    \label{fig:IDE mnist & wd}
\end{figure}




Our experiments on MNIST \citep{lecun1998gradient} (see Figure \ref{fig:IDE-robust gengap} (d)) exhibit a scenario where a good robust generalization is achieved.\footnote{Experimental settings on MNIST are shown in Appendix \ref{section: setup}.}  Interestingly, a small testing error $R_{\tilde{\cal D}_{\phi}}(\theta)$ is maintained throughout the evolution of $\tilde{\mathcal{D}}_{\phi}$ in the absence of robust overfitting. Figure \ref{fig:IDE mnist & wd}  shows results from additional experiments on CIFAR-10. In these experiments, we perform AT with different levels of weight decay to control the robust generalization gap. Subsequently, IDEs are conducted for each such variant of AT. In Figure \ref{fig:IDE mnist & wd}, each distinct color corresponds to a different weight decay factor utilized in AT. Within each color category, the dashed curves and the corresponding solid lines represent, respectively, ${\rm GG}_{m}(\phi, \tilde{S}_{\phi}; \tilde{\cal D}_{\phi})$ and $R_{\tilde{\cal D}_{\phi}}(\theta)$ with $\phi$ trained by that specific AT variant. From these results, we see that increasing the weight decay factor results in a notable reduction in ${\rm GG}_{m}(\phi, \tilde{S}_{\phi}; \tilde{\cal D}_{\phi})$, while conversely, decreasing the weight decay factor leads to the opposite effect. This is shown by the downward shift in the dashed curves across the three color categories. More noteworthy is a clear synchronization observed between each pair of dashed and solid curves (of the same color), with lower dashed curves consistently corresponding to lower solid curves in the same color category.


All these results suggest a strong correlation between $R_{\tilde{\cal D}_{\phi}}(\theta)$ and the robust generalization gap ${\rm GG}_{m}(\phi, \tilde{S}_{\phi}; \tilde{\cal D}_{\phi})$. Although, by construction, the robust generalization gap can be written as ${\rm GG}_{m}(\phi, \tilde{S}_{\phi}; \tilde{\cal D}_{\phi}) = |R_{\tilde{\cal D}_{\phi}}(\phi)-R_{\tilde{S}_{\phi}}(\phi)|$, since $R^{\rm adv}_{\mathcal{D}}(\phi)= R_{\tilde{\cal D}_{\phi}}(\phi)$ and $R^{\rm adv}_{S}(\phi)= R_{\tilde{S}_{\phi}}(\phi)$, such a correlation is still quite surprising. This is because the learning of the parameter $\theta$ starts from a completely random initialization, and one would not expect the resulting parameter $\theta$ to be linked to the parameter $\phi$ in any obvious way, even though the latter contributes to shaping the distribution $\tilde{\cal D}_{\phi}$. 


A novel observation in this work, this correlation is certainly curious in its own right and deserves further investigation. At this point, it has at least highlighted the impact of the dynamics of AT on robust overfitting, beyond the static quantities, such as loss landscape, while also paving a way for developing deeper understanding of how AT results in robust overfitting. 


\section{Conclusion}

In this paper, we show that the distribution induced by the perturbation operator in AT may deteriorate along the trajectory of AT.  In particular, we observe experimentally that as AT progresses, the induced distribution may become harder to learn. Our theoretical analysis suggests that a key factor governing this increasing difficulty of learning is the local dispersion of the perturbation operator that induces the  distribution. Experimental results confirm that as AT proceeds, the perturbation becomes more dispersive, validating our theoretical results. Additionally, we empirically observe a correlation between the deteriorating behavior of the induced distributions and robust overfitting.

The novel observations and our theoretical explanation presented in this paper contribute to better understanding the complex dynamics of AT. Unraveling this complexity is  arguably essential to understanding  robust generalization in AT. 

\paragraph{Limitations \& Future Works} While this paper establishes a connection between local dispersion and the learning difficulty of the induced data distribution, the theoretical framework does not fully explain the underlying causes of increased local dispersion during AT. Nor does it provide improved AT algorithms based on the theoretical insights.

Understanding the mechanism that increases local dispersions during AT remains an open and intriguing direction. Any progress in this direction is likely to improve the practical design of AT algorithms. Not having a concrete answer at present, we speculate that this might be related to the increased complexity of classifier decision boundaries during AT: when the boundaries become more complex, the perturbations pointing to the boundaries are more scattered, thereby increasing the local dispersion. Formalizing this intuition with rigorous analysis is a promising avenue for future research.

Another promising direction is to explore ways to mitigate the deterioration of induced distributions, such as through regularization of the perturbation operator to control local dispersion. 



% \clearpage


% \begin{contributions} % will be removed in pdf for initial submission 
% 					  % (without ‘accepted’ option in \documentclass)
%                       % so you can already fill it to test with the
%                       % ‘accepted’ class option
%     Briefly list author contributions. 
%     This is a nice way of making clear who did what and to give proper credit.
%     This section is optional.

%     H.~Q.~Bovik conceived the idea and wrote the paper.
%     Coauthor One created the code.
%     Coauthor Two created the figures.
% \end{contributions}

% \begin{acknowledgements} % will be removed in pdf for initial submission,
% 						 % (without ‘accepted’ option in \documentclass)
%                          % so you can already fill it to test with the
%                          % ‘accepted’ class option
%     Briefly acknowledge people and organizations here.

%     \emph{All} acknowledgements go in this section.
% \end{acknowledgements}

% References
\bibliography{uai2025-template}


\newpage

\onecolumn

\title{Appendices}
\maketitle

\appendix

\section{Detailed Experimental setup}
\label{section: setup}

Our Reduced ImageNet is made by aggregating several semantically similar subsets of the original ImageNet, resulting in a total of 66,594 images. This dataset is then partitioned into a training set containing 5,000 images per class and a testing set containing approximately 1,000 images per class. Compared to the restricted ImageNet in \cite{russakovsky2015imagenet}, our dataset has a more balanced sample size across classes. Table \ref{Table: resimg} lists the specific classes from the original ImageNet that have been aggregated in our dataset.
\begin{table}[!h]\small
\centering
\setlength{\tabcolsep}{0.5mm}{
\begin{tabular}{lcc}  
\toprule
    & Classes in the reduced ImageNet \quad  & Classes in ImageNet  \\ 
\midrule
      & ``dog''   & 86 to 90  \\
      & ``cat''  & (8,10,55,95,174) \\
      & ``truck''  & 279 to 283  \\
       & ``car''  & 272 to 276   \\
        & ``beetles'' & 623 to 627   \\
       &``turtle''  &458 to 462  \\
       &``crab''   &612 to 616   \\
      & ``fish'' &450 to 454   \\
      &``snake''  & 477 to 481 \\
      & ``spider'' & 604 to 608 \\
\bottomrule
\end{tabular}}
\caption{The left column presents the classes within our reduced ImageNet dataset, with each class being an aggregation of the corresponding classes from the full-scale ImageNet dataset, as depicted in the right column.}
\label{Table: resimg}
\end{table}

For adversarial training (AT), the settings on different datasets are summarized in Table \ref{Table: adv}. Data augmentation is performed on 
these datasets except for MNIST during the training. For CIFAR-10 and CIFAR-100 we follow the data augmentation setting in  \cite{DBLP:journals/corr/abs-2002-11569}. For our reduced ImageNet, we adopt the same data augmentation scheme that is used on the restricted ImageNet in \cite{DBLP:journals/corr/abs-2003-02460}. 

\begin{table}[!htbp]\small
\centering
\setlength{\tabcolsep}{0.5mm}{
\begin{tabular}{lcccc}  
\toprule
                   &MNIST &CIFAR-10 &CIFAR-100 &Reduced ImageNet \\ 
\midrule
model    & small CNN  &PRN-18 &WRN-34  & PRN-50  \\
optimizer            & Adam  & SGD     & SGD & SGD \\
weight decay         & None  & $5\times10^{-4}$ & $5\times10^{-4}$ & None \\
batch size           & 128  & 128  &128 &128 \\
$\epsilon$           & 0.3  & 8/255  &8/255 &4/255 \\
PGD step size       & 0.01  &2/255 &2/255 &0.9/255 \\
number of PGD steps  & 40  & 10  &10  &5 \\
\bottomrule
\end{tabular}}
\caption{Settings in PGD and AT across different datasets}
\label{Table: adv}
\end{table}


For the induced distribution experiments (IDEs) on each dataset, the settings are outlined in Table \ref{Table: IDE}. It is important to note that for all the individual IDEs conducted on the same dataset, we maintain consistent training settings. This includes using the same model architecture with identical model size and the same level of regularization. This ensures a fair comparison of the IDE results obtained from the same dataset. Furthermore, the model is trained to achieve zero training error in all the IDEs, ruling out the possibility that the degradation in model performance is attributable to inadequate training.

\begin{table}[!h]\small
\centering
\setlength{\tabcolsep}{0.5mm}{
\begin{tabular}{lcccc}  
\toprule
                   &MNIST &CIFAR-10 &CIFAR-100 &Reduced ImageNet \\ 
\midrule
model    & small CNN  & PRN-18 &WRN-34  & PRN-50  \\
optimizer            & Adam   & SGD    & SGD & SGD \\
weight decay         & None  & $5\times10^{-4}$ & $5\times10^{-4}$ & $5\times10^{-4}$ \\
batch size           & 128  &128 &128 &128 \\
\bottomrule
\end{tabular}}
\caption{Settings in the IDE across different datasets}
\label{Table: IDE}
\end{table}


\section{Proofs}




\subsection{Proof of (\ref{eq: trace})}
\label{section: proof of 10}

We have that
\begin{align}
    \tilde{\gamma}_{\phi}(x, y) &:=\mathbb{E}_{\rho, \rho'} \|\mathcal{Q}_{\phi}(x+\rho, y)-\mathcal{Q}_{\phi}(x+\rho', y)\|_{2}^{2} \nonumber\\
& = \mathbb{E}_{\rho, \rho'} \left(\mathcal{Q}_{\phi}(x+\rho, y)-\mathcal{Q}_{\phi}(x+\rho', y)\right)^{T} \left(\mathcal{Q}_{\phi}(x+\rho, y)-\mathcal{Q}_{\phi}(x+\rho', y)\right) \\
& = \mathbb{E}_{\rho, \rho'} \left[\Vert\mathcal{Q}_{\phi}(x+\rho, y) \Vert_{2}^{2}+ \Vert\mathcal{Q}_{\phi}(x+\rho', y) \Vert_{2}^{2} - 2\mathcal{Q}_{\phi}(x+\rho', y)^{T}\mathcal{Q}_{\phi}(x+\rho, y)\right]\\
& = 2\mathbb{E}_{\rho}\Vert\mathcal{Q}_{\phi}(x+\rho, y) \Vert_{2}^{2} - 2\Vert \mathbb{E}_{\rho} \mathcal{Q}_{\phi}(x+\rho, y)\Vert_{2}^{2} \label{eq-trace-00}
\end{align}

On the other hand, we have that
\begin{align}
    \tilde{\gamma}_{\phi}(x, y) &:=\mathbb{E}_{\rho, \rho'} \|\mathcal{Q}_{\phi}(x+\rho, y)-\mathcal{Q}_{\phi}(x+\rho', y)\|_{2}^{2} \nonumber\\
    &= \mathbb{E}_{\rho, \rho'} \|\mathcal{Q}_{\phi}(x+\rho, y)- \mathbb{E}_{\rho}\mathcal{Q}_{\phi}(x+\rho, y) - \left(\mathcal{Q}_{\phi}(x+\rho', y) - \mathbb{E}_{\rho'}\mathcal{Q}_{\phi}(x+\rho', y)\right)\|_{2}^{2} \\
    & = 2 \mathbb{E}_{\rho}\Vert  \mathcal{Q}_{\phi}(x+\rho, y)- \mathbb{E}_{\rho}\mathcal{Q}_{\phi}(x+\rho, y)\Vert_{2}^{2} - 2 \Vert \mathbb{E}_{\rho}\left[\mathcal{Q}_{\phi}(x+\rho, y)- \mathbb{E}_{\rho}\mathcal{Q}_{\phi}(x+\rho, y)\right] \Vert_{2}^{2} \label{eq-trace-0}\\
    & = 2 \mathbb{E}_{\rho}\Vert  \mathcal{Q}_{\phi}(x+\rho, y)- \mathbb{E}_{\rho}\mathcal{Q}_{\phi}(x+\rho, y)\Vert_{2}^{2}\\
    & = 2 \mathbb{E}_{\rho} \left[\sum_{i=1}^{d}\left(\mathcal{Q}_{\phi}(x+\rho, y)[i]- \mathbb{E}_{\rho}\mathcal{Q}_{\phi}(x+\rho, y)[i]\right)^2\right]\\
    & = 2 \sum_{i=1}^{d}\mathbb{E}_{\rho}\left(\mathcal{Q}_{\phi}(x+\rho, y)[i]- \mathbb{E}_{\rho}\mathcal{Q}_{\phi}(x+\rho, y)[i]\right)^2\\
    & = 2 {\rm Trace}\left({\rm COV}_{\rho}(\mathcal{Q}_{\phi}(x+\rho, y))\right)
\end{align}
where equality (\ref{eq-trace-0}) is derived by applying the results of (\ref{eq-trace-00}). We use $\mathcal{Q}_{\phi}(x+\rho, y)[i]$ to denote the $i^{\rm th}$ coordinate of the vector $\mathcal{Q}_{\phi}(x+\rho, y)$.

\hfill $\Box$

\subsection{Proof of Lemma \ref{lemma: uniform convergence}}
\label{section: proof-lemma}

With a slight abuse of notation, let $(t,y)$ denote an instance drawn from ${\cal D}_{*}$ and let $(v,y)$ denote an instance drawn from the induced distribution $\tilde{\mathcal{D}}_{\phi}$ associated with a perturbation ${\cal Q}_{\phi}$. For shorter notation, we will denote $z:=(t,y)$, $u:=(v,y)$ and $f(u):=f(v,y)$ and simply write $\mathcal{Q}_{\phi}$ as $\mathcal{Q}$. 

Denote by $g(u_1\cdots u_m):=\sup\limits_{\theta\in\Theta}{\rm GG}_{m}(\theta, \tilde{S}_{\phi}; \tilde{\cal D}_{\phi}) = \sup\limits_{\theta\in\Theta}\left|\frac{1}{m}\sum\limits_{i=1}^{m}f_{\theta}(u_{i})-\mathbb{E}f_{\theta}(u)\right|$. We have for any $1\le j\le m$

\begin{align}
&\sup\limits_{u_1,\cdots ,u_m, u_j'}\left|g(u_1,\cdots ,u_m)-g(u_1,\cdots ,u_j', u_{j+1},\cdots u_m)\right|\\
= & \sup\limits_{u_1,\cdots ,u_m, u_j'}\left|\sup\limits_{\theta\in\Theta}\left|\frac{1}{m}\sum\limits_{i=1}^{m}f_{\theta}(u_{i})-\mathbb{E}f_{\theta}(u)\right|-\sup\limits_{\theta\in\Theta}\left|\frac{1}{m}\left(\sum\limits_{i=1,i\ne j}^{m}f_{\theta}(u_{i}) +f_{\theta}(u_{j}')\right) -\mathbb{E}_{u}f_{\theta}(u)\right|\right|\\
\le & \sup\limits_{u_1,\cdots ,u_m, u_j'}\sup\limits_{\theta\in\Theta}\left|\left|\frac{1}{m}\sum\limits_{i=1}^{m}f_{\theta}(u_{i})-\mathbb{E}f_{\theta}(u)\right|-\left|\frac{1}{m}\left(\sum\limits_{i=1,i\ne j}^{m}f_{\theta}(u_{i}) +f_{\theta}(u_{j}')\right) -\mathbb{E}_{u}f_{\theta}(u)\right|\right| \\
\le & \sup\limits_{u_1,\cdots ,u_m, u_j'}\sup\limits_{\theta\in\Theta}\left|\frac{1}{m}\sum\limits_{i=1}^{m}f_{\theta}(u_{i})-\mathbb{E}_{u}f_{\theta}(u)-\frac{1}{m}\left(\sum\limits_{i=1,i\ne j}^{m}f_{\theta}(u_{i}) +f_{\theta}(u_{j}')\right) + \mathbb{E}_{u}f_{\theta}(u)\right| \label{eq 1} \\ 
=& \sup\limits_{\theta\in\Theta}\sup\limits_{u_j, u_j'}\frac{1}{m}\left|f_{\theta}(u_j)-f_{\theta}(u_j')\right|\\
\le & \frac{1}{m}\sup\limits_{\theta\in\Theta}\sup\limits_{u_j}\left|f_{\theta}(u_j)\right| + \frac{1}{m}\sup\limits_{\theta\in\Theta}\sup\limits_{u_j'}\left|f_{\theta}(u_j')\right| \label{eq 2} \\
\le & \frac{2B}{m} \label{eq 3}
\end{align}



where the inequality (\ref{eq 1}) follows from the reverse triangle inequality. Inequalities (\ref{eq 2}) and (\ref{eq 3}) make use of the triangle inequality and the boundedness condition of $f$.

With the result derived above, by McDiarmid's inequality (with bounded differences $2B/m$ in each of the $m$ coordinates), we have for all $\mu>0$
\[
{\rm Pr}\left[g(u_1\cdots u_m)-\mathbb{E}_{U}g(u_1\cdots u_m)\ge \mu  \right]\le \exp\left(-\frac{m\mu^{2}}{2B^{2}}\right),
\]
where we use $U:=(u_1,\cdots,u_m)$. Setting the right-hand side equal to $\tau$ and solving for $\mu$, this is equivalent to saying that with probability at least $1-\tau$, we have
\begin{equation}
    g(u_1\cdots u_m)\le \mathbb{E}_{U}g(u_1\cdots u_m)+2B\sqrt{\frac{\log\frac{1}{\tau}}{2m}}
    \label{eq: mcdiarmid}
\end{equation}

\hfill $\Box$


\subsection{Proof of Theorem \ref{thm-main}}

\label{section: proof-thm-main}

Following the notations in the proof of Lemma \ref{lemma: uniform convergence}, we now derive an upper bound for the term $\mathbb{E}_{U}g(u_1\cdots u_m)$.

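 For shorter notation, let $Z:=(z_1,\cdots, z_m)$, where the input component of each anchor instance $z_i$ is written as $x_i$ (i.e., $z_i=(x_i,y_i)$, matching the notation of Theorem \ref{thm-main}), $\Gamma:=(\rho_1,\cdots, \rho_m)$ and $F_\theta(Z, \Gamma):=\frac{1}{m}\sum\limits_{i=1}^{m}f_\theta(\mathcal{Q}(x_i+\rho_i, y_i), y_i)$. We have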
\begin{align}
    &\mathbb{E}_{U}g(u_1\cdots u_m)\\
=& \mathbb{E}_{U}\sup\limits_{\theta\in\Theta}\left| \frac{1}{m}\sum\limits_{i=1}^{m}f_{\theta}(u_{i})-\mathbb{E}f_{\theta}(u)\right|\\
=& \mathbb{E}_{U}\sup\limits_{\theta\in\Theta}\left| \frac{1}{m}\sum\limits_{i=1}^{m}f_{\theta}(u_{i})-\mathbb{E}_{\hat{U}}\left[\frac{1}{m}\sum\limits_{i=1}^{m}f_{\theta}(\hat{u}_i)\right]\right|\\
\le & \mathbb{E}_{U}\sup\limits_{\theta\in\Theta}\left[\mathbb{E}_{\hat{U}}\left| \frac{1}{m}\sum\limits_{i=1}^{m}f_{\theta}(u_{i})-\frac{1}{m}\sum\limits_{i=1}^{m}f_{\theta}(\hat{u}_i)\right|\right] \label{eq 4} \\
\le & \mathbb{E}_{U}\mathbb{E}_{\hat{U}}\sup\limits_{\theta\in\Theta}\left| \frac{1}{m}\sum\limits_{i=1}^{m}f_{\theta}(u_{i})-\frac{1}{m}\sum\limits_{i=1}^{m}f_{\theta}(\hat{u}_i)\right|\label{eq 4_extra} \\
=& \mathbb{E}_{Z}\mathbb{E}_{\Gamma}\mathbb{E}_{\hat{Z}}\mathbb{E}_{\hat{\Gamma
}}\sup\limits_{\theta\in\Theta}\left|\frac{1}{m}\sum\limits_{i=1}^{m}f_{\theta}(\mathcal{Q}(x_i+\rho_i,y_i), y_i)-\frac{1}{m}\sum\limits_{i=1}^{m}f_{\theta}(\mathcal{Q}(\hat{x}_i+\hat{\rho}_i, \hat{y}_i), \hat{y}_i) \right|\\
=& \mathbb{E}_{Z}\mathbb{E}_{\Gamma}\mathbb{E}_{\hat{Z}}\mathbb{E}_{\hat{\Gamma
}}\sup\limits_{\theta\in\Theta}\left|F_{\theta}(Z, \Gamma) -
\mathbb{E}_{\bar{\Gamma}}F_{\theta}(Z,\bar{\Gamma})+ \mathbb{E}_{\bar{\Gamma}}F_{\theta}(Z,\bar{\Gamma}) -F_{\theta}(\hat{Z}, \hat{\Gamma})+\mathbb{E}_{\tilde{\Gamma}}F_{\theta}(\hat{Z}, \tilde{\Gamma})-\mathbb{E}_{\tilde{\Gamma}}F_{\theta}(\hat{Z}, \tilde{\Gamma}) \right|\\
\le & \mathbb{E}_{Z}\mathbb{E}_{\Gamma}\sup\limits_{\theta\in\Theta}\left|F_{\theta}(Z, \Gamma) -
\mathbb{E}_{\bar{\Gamma}}F_{\theta}(Z,\bar{\Gamma}) \right| + \mathbb{E}_{\hat{Z}}\mathbb{E}_{\hat{\Gamma
}}\sup\limits_{\theta\in\Theta}\left|F_{\theta}(\hat{Z}, \hat{\Gamma})- \mathbb{E}_{\tilde{\Gamma}}F_{\theta}(\hat{Z}, \tilde{\Gamma}) \right| + \mathbb{E}_{Z}\mathbb{E}_{\hat{Z}}\sup\limits_{\theta\in\Theta}\left|\mathbb{E}_{\bar{\Gamma}}F_{\theta}(Z,\bar{\Gamma})-\mathbb{E}_{\tilde{\Gamma}}F_{\theta}(\hat{Z}, \tilde{\Gamma})\right| \label{eq 4-another} \\
=&\underbrace{2\mathbb{E}_{Z}\mathbb{E}_{\Gamma}\sup\limits_{\theta\in\Theta}\left|F_{\theta}(Z, \Gamma) -
\mathbb{E}_{\bar{\Gamma}}F_{\theta}(Z,\bar{\Gamma}) \right|}_{\textcircled{1}} +\underbrace{\mathbb{E}_{Z}\mathbb{E}_{\hat{Z}}\sup\limits_{\theta\in\Theta}\left|\mathbb{E}_{\bar{\Gamma}}F_{\theta}(Z,\bar{\Gamma})-\mathbb{E}_{\tilde{\Gamma}}F_{\theta}(\hat{Z}, \tilde{\Gamma})\right|}_{\textcircled{2}} \label{eq final}
\end{align}
where (\ref{eq 4}) follows from Jensen's inequality and (\ref{eq 4_extra}) holds because the supremum of an expectation is less than or equal to the expectation of the supremum. The inequality (\ref{eq 4-another}) is derived from the triangle inequality and the fact that the supremum of a sum is less than or equal to the sum of the suprema. The equality (\ref{eq final}) holds because $(Z,\Gamma)$ and $(\hat{Z},\hat{\Gamma})$ are identically distributed. We now individually construct upper bounds for the terms $\textcircled{1}$ and $\textcircled{2}$.

For the term $\textcircled{1}$, we have
\begin{align}
 &2\mathbb{E}_{Z}\mathbb{E}_{\Gamma}\sup\limits_{\theta\in\Theta}\left|F_{\theta}(Z, \Gamma) -
\mathbb{E}_{\bar{\Gamma}}F_{\theta}(Z,\bar{\Gamma}) \right|\nonumber\\
\le & 2\mathbb{E}_{Z}\mathbb{E}_{\Gamma}\mathbb{E}_{\bar{\Gamma}}\sup\limits_{\theta\in\Theta}\left|F_{\theta}(Z, \Gamma)-F_{\theta}(Z,\bar{\Gamma}) \right| \label{eq 5}\\
=& 2\mathbb{E}_{Z}\mathbb{E}_{\Gamma}\mathbb{E}_{\bar{\Gamma}}\sup\limits_{\theta\in\Theta}\left|\frac{1}{m}\sum\limits_{i=1}^{m}f_{\theta}(\mathcal{Q}(x_i+\rho_i, y_i), y_i)-\frac{1}{m}\sum\limits_{i=1}^{m}f_{\theta}(\mathcal{Q}(x_i+\bar{\rho}_i, y_i), y_i) \right|\\
=&\frac{2}{m}\mathbb{E}_{Z}\mathbb{E}_{\Gamma}\mathbb{E}_{\bar{\Gamma}}\sup\limits_{\theta\in\Theta}\mathbb{E}_{ \Sigma}\left|\sum\limits_{i=1}^{m}\sigma_i \left(f_{\theta}(\mathcal{Q}(x_i+\rho_i, y_i), y_i)-f_{\theta}(\mathcal{Q}(x_i+\bar{\rho}_i, y_i), y_i)\right) \right| \label{eq 6} \\
\le & \frac{2}{m}\mathbb{E}_{Z}\mathbb{E}_{\Gamma}\mathbb{E}_{\bar{\Gamma}}\sup\limits_{\theta\in\Theta}\sqrt{\sum\limits_{i=1}^{m}\left|f_{\theta}(\mathcal{Q}(x_i+\rho_i, y_i), y_i)-f_{\theta}(\mathcal{Q}(x_i+\bar{\rho}_i, y_i), y_i)\right|^{2}} \label{eq 7}\\
\le & \frac{2}{m}\mathbb{E}_{Z}\mathbb{E}_{\Gamma}\mathbb{E}_{\bar{\Gamma}}\sqrt{\sum\limits_{i=1}^{m}\beta^{2}\|\mathcal{Q}(x_i+\rho_i, y_i)-\mathcal{Q}(x_i+\bar{\rho}_i, y_i)\|^{2}} \label{eq 8}\\
\le & \frac{2\beta}{m}\mathbb{E}_{Z}\sqrt{\mathbb{E}_{\Gamma}\mathbb{E}_{\bar{\Gamma}}
\left[\sum\limits_{i=1}^{m}\|\mathcal{Q}(x_i+\rho_i, y_i)-\mathcal{Q}(x_i+\bar{\rho}_i, y_i)\|^{2}\right]}  \label{eq 9} \\
= & \frac{2\beta}{m}\mathbb{E}_{Z}\sqrt{
\sum\limits_{i=1}^{m}\mathbb{E}_{\rho}\mathbb{E}_{\bar{\rho}}\|\mathcal{Q}(x_i+\rho_i, y_i)-\mathcal{Q}(x_i+\bar{\rho}_i, y_i)\|^{2}}  \\
=& \frac{2\beta}{m}\mathbb{E}_{Z}\sqrt{\sum\limits_{i=1}^{m}\gamma(x_i,y_i)} \label{eq 10-0} \\
\le & \frac{2\beta}{m}\sqrt{\mathbb{E}_{Z}\left[\sum\limits_{i=1}^{m}\gamma(x_i,y_i)\right]}  \label{eq 10-00} \\
= &\frac{2\beta}{m}\sqrt{\sum\limits_{i=1}^{m}\mathbb{E}_{z_i}\gamma(x_i,y_i)}  \label{eq 11} \\
=&  \frac{2\beta}{\sqrt{m}}\sqrt{\mathbb{E}_{z}\gamma(x,y)}  \label{eq 12}
\end{align}
The inequality (\ref{eq 5}) is derived similarly to inequalities (\ref{eq 4}) and (\ref{eq 4_extra}). In (\ref{eq 6}), we introduce Rademacher variables $\Sigma:=(\sigma_1,\cdots,\sigma_m)$ (i.e., each random variable $\sigma_i$ takes values in $\{-1,+1\}$ independently with equal probability 0.5). The Rademacher variables introduce a random exchange of the corresponding difference terms. Since $\Gamma$ and $\bar{\Gamma}$ are independently sampled from the same distribution, such a swap gives an equally likely configuration. Therefore, the equality (\ref{eq 6}) holds. The inequality (\ref{eq 7}) is given by Khintchine's inequality. The inequality (\ref{eq 8}) makes use of the Lipschitz condition of $f$. (\ref{eq 9}) follows from Jensen's inequality, since the square root is a concave function. (\ref{eq 10-0}) is by the definition of the local dispersion of $\mathcal{Q}$. Again, we apply Jensen's inequality to obtain (\ref{eq 10-00}). Equations (\ref{eq 11}) and (\ref{eq 12}) follow from the assumption that the $z_i=(x_i,y_i)$ are i.i.d.

For the term $\textcircled{2}$, we have
\begin{align}
&\mathbb{E}_{Z}\mathbb{E}_{\hat{Z}}\sup\limits_{\theta\in\Theta}\left|\mathbb{E}_{\bar{\Gamma}}F_\theta(Z,\bar{\Gamma})-\mathbb{E}_{\tilde{\Gamma}}F_\theta(\hat{Z}, \tilde{\Gamma})\right| \nonumber\\
=& \mathbb{E}_{Z}\mathbb{E}_{\hat{Z}}\sup\limits_{\theta\in\Theta}\left| \mathbb{E}_{\bar{\Gamma}} \left[\frac{1}{m}\sum\limits_{i=1}^{m}f_\theta(\mathcal{Q}(x_i+\bar{\rho}_i, y_i), y_i)\right]-   \mathbb{E}_{\tilde{\Gamma}} \left[\frac{1}{m}\sum\limits_{i=1}^{m}f_\theta(\mathcal{Q}(\hat{x}_i+\tilde{\rho}_i, \hat{y}_i), \hat{y}_i)\right]  \right|\\
=& \mathbb{E}_{Z}\mathbb{E}_{\hat{Z}}\sup\limits_{\theta\in\Theta}\left| \frac{1}{m}\sum\limits_{i=1}^{m} \mathbb{E}_{\bar{\rho}_{i}} \left[f_\theta(\mathcal{Q}(x_i+\bar{\rho}_i, y_i), y_i)\right]-   \frac{1}{m}\sum\limits_{i=1}^{m}\mathbb{E}_{\tilde{\rho}_{i}} \left[f_\theta(\mathcal{Q}(\hat{x}_i+\tilde{\rho}_i, \hat{y}_i), \hat{y}_i)\right]  \right|\label{eq 13}\\
=& \mathbb{E}_{Z}\mathbb{E}_{\hat{Z}}\sup\limits_{\theta\in\Theta}\left| \frac{1}{m}\sum\limits_{i=1}^{m} \mathbb{E}_{\rho} \left[f_\theta(\mathcal{Q}(x_i+\rho, y_i), y_i)\right]-   \frac{1}{m}\sum\limits_{i=1}^{m}\mathbb{E}_{\rho} \left[f_\theta(\mathcal{Q}(\hat{x}_i+\rho, \hat{y}_i), \hat{y}_i)\right]  \right|\label{eq 14}\\
=& \frac{1}{m} \mathbb{E}_{Z}\mathbb{E}_{\hat{Z}}\mathbb{E}_{\Sigma}\sup\limits_{\theta\in\Theta}\left|\sum\limits_{i=1}^{m} \sigma_{i}\left(\mathbb{E}_{\rho} \left[f_\theta(\mathcal{Q}(x_i+\rho, y_i), y_i)\right]-\mathbb{E}_{\rho} \left[f_\theta(\mathcal{Q}(\hat{x}_i+\rho, \hat{y}_i), \hat{y}_i)\right]\right)  \right|\label{eq 15}\\
\le &  \frac{1}{m} \mathbb{E}_{Z}\mathbb{E}_{\hat{Z}}\sup\limits_{\theta\in\Theta}\sqrt{\sum\limits_{i=1}^{m} \left|\left(\mathbb{E}_{\rho} \left[f_\theta(\mathcal{Q}(x_i+\rho, y_i), y_i)\right]-\mathbb{E}_{\rho} \left[f_\theta(\mathcal{Q}(\hat{x}_i+\rho, \hat{y}_i), \hat{y}_i)\right]\right)  \right|^{2}}\label{eq 16}
\end{align}
where equations (\ref{eq 13}) and (\ref{eq 14}) hold because the $\bar{\rho}_{i}$ and $\tilde{\rho}_{i}$ are i.i.d. Again, we introduce Rademacher variables at (\ref{eq 15}) and apply Khintchine's inequality to get (\ref{eq 16}). For the term $\left|\left(\mathbb{E}_{\rho} \left[f_{\theta}(\mathcal{Q}(x_i+\rho, y_i), y_i)\right]-\mathbb{E}_{\rho} \left[f_{\theta}(\mathcal{Q}(\hat{x}_i+\rho, \hat{y}_i), \hat{y}_i)\right]\right) \right|^{2}$, we have
\begin{align}
    &\left| \mathbb{E}_{\rho}f_{\theta}(\mathcal{Q}(x_i+\rho, y_i), y_i)-\mathbb{E}_{\rho}f_{\theta}(\mathcal{Q}(\hat{x}_i+\rho,\hat{y}_i), \hat{y}_i) \right|^{2}\\
\le & (\left| \mathbb{E}_{\rho}f_{\theta}(\mathcal{Q}(x_i+\rho, y_i), y_i)\right|+ \left|\mathbb{E}_{\rho} f_{\theta}(\mathcal{Q}(\hat{x}_i+\rho, \hat{y}_i), \hat{y}_i)  \right|)^{2}\\
\le & 2\left| \mathbb{E}_{\rho}f_{\theta}(\mathcal{Q}(x_i+\rho, y_i), y_i)\right|^{2}+2\left|\mathbb{E}_{\rho} f_{\theta}(\mathcal{Q}(\hat{x}_i+\rho, \hat{y}_i), \hat{y}_i)  \right|^{2} \label{eq 17}
\end{align}
where inequality (\ref{eq 17}) is derived by the inequality $(a+b)^{2}\le 2(a^2+b^2)$. We also have that 

\begin{align}
   & \left| \mathbb{E}_{\rho}f_{\theta}(\mathcal{Q}(x+\rho,y), y)\right|^{2} \nonumber\\
  \le & (\mathbb{E}_{\rho}\left| f_{\theta}(\mathcal{Q}(x+\rho, y), y)-f_{\theta}(x+\rho, y)\right|+\mathbb{E}_{\rho}\left|f_{\theta}(x+\rho, y)\right|)^{2}\label{eq 18}\\
  \le & (\mathbb{E}_{\rho}\left|f_{\theta}(\mathcal{Q}(x+\rho,y), y)-f_{\theta}(x+\rho, y)\right|+B)^{2} \label{eq 10}\\
  \le & (\mathbb{E}_{\rho}\beta\|\mathcal{Q}(x+\rho,y)-(x+\rho)\|_2+B)^{2} \label{eq 20}
\end{align}

The inequalities (\ref{eq 18})--(\ref{eq 20}) make use of, in order, the triangle inequality together with Jensen's inequality, the boundedness of $f$, and the Lipschitz condition of $f$.

Returning to (\ref{eq 16}), we then have
\begin{align}
&\frac{1}{m} \mathbb{E}_{Z}\mathbb{E}_{\hat{Z}}\sup\limits_{\theta\in\Theta}\sqrt{\sum\limits_{i=1}^{m} \left|\left(\mathbb{E}_{\rho} \left[f_{\theta}(\mathcal{Q}(x_i+\rho, y_i), y_i)\right]-\mathbb{E}_{\rho} \left[f_{\theta}(\mathcal{Q}(\hat{x}_i+\rho, \hat{y}_i), \hat{y}_i)\right]\right)  \right|^{2}} \notag \\
\le & \frac{1}{m} \mathbb{E}_{Z}\mathbb{E}_{\hat{Z}}\sup\limits_{\theta\in\Theta}\sqrt{\sum\limits_{i=1}^{m} 2\left| \mathbb{E}_{\rho}f_{\theta}(\mathcal{Q}(x_i+\rho, y_i), y_i)\right|^{2}+\sum\limits_{i=1}^{m}2\left|\mathbb{E}_{\rho} f_{\theta}(\mathcal{Q}(\hat{x}_i+\rho, \hat{y}_i), \hat{y}_i)  \right|^{2}}\\
\le & \frac{1}{m} \mathbb{E}_{Z}\mathbb{E}_{\hat{Z}}\sqrt{\sum\limits_{i=1}^{m} 2(\mathbb{E}_{\rho}\beta\|\mathcal{Q}(x_i+\rho,y_i)-(x_i+\rho)\|_2+B)^{2}+\sum\limits_{i=1}^{m}2(\mathbb{E}_{\rho}\beta\|\mathcal{Q}(\hat{x}_i+\rho,\hat{y}_i)-(\hat{x}_i+\rho)\|_2+B)^{2}}\\
\le & \frac{1}{m} \sqrt{\mathbb{E}_{Z}\mathbb{E}_{\hat{Z}}\left[\sum\limits_{i=1}^{m} 2(\mathbb{E}_{\rho}\beta\|\mathcal{Q}(x_i+\rho,y_i)-(x_i+\rho)\|_2+B)^{2}+\sum\limits_{i=1}^{m}2(\mathbb{E}_{\rho}\beta\|\mathcal{Q}(\hat{x}_i+\rho,\hat{y}_i)-(\hat{x}_i+\rho)\|_2+B)^{2}\right]}\\
\le & \frac{2}{\sqrt{m}}\sqrt{\mathbb{E}_{z}(\mathbb{E}_{\rho}\beta\|\mathcal{Q}(x+\rho,y)-(x+\rho)\|_2+B)^{2}}\\
\le & \frac{2(\beta\sqrt{d}\epsilon+B)}{\sqrt{m}} \label{eq 21}
\end{align}

The final line holds because $\|\mathcal{Q}(x+\rho, y)-(x+\rho)\|_{\infty}\le \epsilon$ implies $\|\mathcal{Q}(x+\rho, y)-(x+\rho)\|_{2}\le \sqrt{d}\epsilon$. Combining the bounds on the terms $\textcircled{1}$ and $\textcircled{2}$ gives the final result
$$\mathbb{E}_{U}g(u_1\cdots u_m)\le \frac{2\beta}{\sqrt{m}}\sqrt{\mathbb{E}_{z}\gamma(x,y)} +  \frac{2(\beta\sqrt{d}\epsilon+B)}{\sqrt{m}} $$

 \hfill $\Box$


\section{Omitted Figures}
\label{Appendix: omit figs}

\begin{figure}[!htbp]
    \centering
    \subfigure[$\phi$=AT(0)]{\includegraphics[width=0.24\textwidth]{uai2025-template/training dynamic/train_curve_cifar100/chkpt0.pdf}}
    \subfigure[$\phi$=AT(80)]{\includegraphics[width=0.24\textwidth]{uai2025-template/training dynamic/train_curve_cifar100/chkpt80.pdf}}
    \subfigure[$\phi$=AT(120)]{\includegraphics[width=0.24\textwidth]{uai2025-template/training dynamic/train_curve_cifar100/chkpt120.pdf}}
    \subfigure[$\phi$=AT(200)]{\includegraphics[width=0.24\textwidth]{uai2025-template/training dynamic/train_curve_cifar100/chkpt200.pdf}}
    \caption{Experiments in Figure \ref{fig: train curves-cifar10} reproduced on CIFAR-100.}
     \label{fig: train curves-cifar100}
\end{figure}

\begin{figure}[!htbp]
    \centering
    \subfigure[$\phi$=AT(0)]{\includegraphics[width=0.24\textwidth]{uai2025-template/training dynamic/train_curve_ResImg/chkpt0.pdf}}
    \subfigure[$\phi$=AT(80)]{\includegraphics[width=0.24\textwidth]{uai2025-template/training dynamic/train_curve_ResImg/chkpt80.pdf}}
    \subfigure[$\phi$=AT(120)]{\includegraphics[width=0.24\textwidth]{uai2025-template/training dynamic/train_curve_ResImg/chkpt120.pdf}}
    \subfigure[$\phi$=AT(200)]{\includegraphics[width=0.24\textwidth]{uai2025-template/training dynamic/train_curve_ResImg/chkpt200.pdf}}
    \caption{Experiments in Figure \ref{fig: train curves-cifar10} reproduced on Reduced ImageNet.}
     \label{fig: train curves-ResImg}
\end{figure}

\begin{figure}[!htbp]
    \centering
    \subfigure[ELDs for different $\sigma$]{\includegraphics[width=0.24\textwidth]{uai2025-template/training dynamic/disper_figs/cifar100/different_std.pdf}} 
    \subfigure[$\sigma=0.005$]{\includegraphics[width=0.24\textwidth]{uai2025-template/training dynamic/disper_figs/cifar100/ELD_Gaussian_std0.005.pdf}}  
    \subfigure[$\sigma=0.01$]{\includegraphics[width=0.24\textwidth]{uai2025-template/training dynamic/disper_figs/cifar100/LD_Gaussian_hist_0.01.pdf}} 
    \subfigure[$\sigma=0.005$]{\includegraphics[width=0.24\textwidth]{uai2025-template/training dynamic/disper_figs/cifar100/LD_Gaussian_hist_0.005.pdf}}
     \caption{Experiments in Figure \ref{fig: disper} reproduced on CIFAR-100.}
\label{fig: disper-cifar100}
\end{figure}

\begin{figure}[!htbp]
    \centering
    \subfigure[ELDs for different $\sigma$]{\includegraphics[width=0.24\textwidth]{uai2025-template/training dynamic/disper_figs/ResImg/different_std.pdf}} 
    \subfigure[$\sigma=0.001$]{\includegraphics[width=0.24\textwidth]{uai2025-template/training dynamic/disper_figs/ResImg/ELD_Gaussian_std0.001.pdf}}  \subfigure[$\sigma=0.001$]{\includegraphics[width=0.24\textwidth]{uai2025-template/training dynamic/disper_figs/ResImg/LD_Gaussian_hist_0.001.pdf}} 
    \subfigure[$\sigma=0.0005$]{\includegraphics[width=0.24\textwidth]{uai2025-template/training dynamic/disper_figs/ResImg/LD_Gaussian_hist_0.0005.pdf}}
     \caption{Experiments in Figure \ref{fig: disper} reproduced on Reduced ImageNet.}
\label{fig: disper-resimg}
\end{figure}


\end{document}
