\documentclass[accepted]{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Additional packages needed by the macros and the body below.
\usepackage{xr} % \externaldocument: resolve labels defined in the supplement

% \usepackage{amsmath} % already loaded by mathtools
\usepackage{amssymb} % \mathbb
\usepackage{amsthm}  % \newtheorem
\usepackage{bm}      % \bm
\usepackage{url}
% \usepackage{multirow}
\usepackage{graphicx}
% NOTE(review): subfigure is obsolete (subcaption is its maintained
% replacement), but switching changes the \subfigure syntax, so it is kept.
\usepackage{subfigure}
% \usepackage{makecell}
% \usepackage{booktabs}
% \usepackage{array}
% \usepackage{url}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{dsfont}  % \mathds
% \usepackage{enumerate}

% cleveref has to be loaded after amsthm (and the other packages above) so
% that it can hook into the theorem-like environments; keep it last.
\usepackage{cleveref}
% \wtilde: calligraphic W with a \sim stacked on top of it via \stackrel.
% The W is \smash-ed and a zero-width rule of height 1.1ex is placed next to
% it, so the rule (not the letter) sets the height seen by \stackrel.
\newcommand{\wtilde}{%
  \stackrel{\sim}{\smash{\mathcal{W}}\rule{0pt}{1.1ex}}%
}

%----- bold fonts -----%

% Upright bold (\mathbf) lowercase Latin letters, named \<letter>b.
% Exceptions to the naming scheme: \bbb, \cbb, \nbb and \sbb.
\newcommand*{\ab}{\mathbf{a}}
\newcommand*{\bbb}{\mathbf{b}}
\newcommand*{\cbb}{\mathbf{c}}
\newcommand*{\db}{\mathbf{d}}
\newcommand*{\eb}{\mathbf{e}}
\newcommand*{\fb}{\mathbf{f}}
\newcommand*{\gb}{\mathbf{g}}
\newcommand*{\hb}{\mathbf{h}}
\newcommand*{\ib}{\mathbf{i}}
\newcommand*{\jb}{\mathbf{j}}
\newcommand*{\kb}{\mathbf{k}}
\newcommand*{\lb}{\mathbf{l}}
\newcommand*{\mb}{\mathbf{m}}
\newcommand*{\nbb}{\mathbf{n}}
\newcommand*{\ob}{\mathbf{o}}
\newcommand*{\pb}{\mathbf{p}}
\newcommand*{\qb}{\mathbf{q}}
\newcommand*{\rb}{\mathbf{r}}
\newcommand*{\sbb}{\mathbf{s}}
\newcommand*{\tb}{\mathbf{t}}
\newcommand*{\ub}{\mathbf{u}}
\newcommand*{\vb}{\mathbf{v}}
\newcommand*{\wb}{\mathbf{w}}
\newcommand*{\xb}{\mathbf{x}}
\newcommand*{\yb}{\mathbf{y}}
\newcommand*{\zb}{\mathbf{z}}

% Bold (\bm) lowercase Latin letters, named \b<letter>.
% Exceptions to the naming scheme: \bbf (for f) and \bbm (for m).
\newcommand{\ba}{\bm{a}}
\newcommand{\bb}{\bm{b}}
\newcommand{\bc}{\bm{c}}
\newcommand{\bd}{\bm{d}}
\newcommand{\be}{\bm{e}}
\newcommand{\bbf}{\bm{f}}
\newcommand{\bg}{\bm{g}}
\newcommand{\bh}{\bm{h}}
\newcommand{\bi}{\bm{i}}
\newcommand{\bj}{\bm{j}}
\newcommand{\bk}{\bm{k}}
\newcommand{\bl}{\bm{l}}
\newcommand{\bbm}{\bm{m}}
\newcommand{\bn}{\bm{n}}
\newcommand{\bo}{\bm{o}}
\newcommand{\bp}{\bm{p}}
\newcommand{\bq}{\bm{q}}
\newcommand{\br}{\bm{r}}
\newcommand{\bs}{\bm{s}}
\newcommand{\bt}{\bm{t}}
\newcommand{\bu}{\bm{u}}
\newcommand{\bv}{\bm{v}}
\newcommand{\bw}{\bm{w}}
\newcommand{\bx}{\bm{x}}
\newcommand{\by}{\bm{y}}
\newcommand{\bz}{\bm{z}}




% Upright bold (\mathbf) uppercase Latin letters, named \<Letter>b.
% Exception to the naming scheme: \Sbb (for S).
\newcommand*{\Ab}{\mathbf{A}}
\newcommand*{\Bb}{\mathbf{B}}
\newcommand*{\Cb}{\mathbf{C}}
\newcommand*{\Db}{\mathbf{D}}
\newcommand*{\Eb}{\mathbf{E}}
\newcommand*{\Fb}{\mathbf{F}}
\newcommand*{\Gb}{\mathbf{G}}
\newcommand*{\Hb}{\mathbf{H}}
\newcommand*{\Ib}{\mathbf{I}}
\newcommand*{\Jb}{\mathbf{J}}
\newcommand*{\Kb}{\mathbf{K}}
\newcommand*{\Lb}{\mathbf{L}}
\newcommand*{\Mb}{\mathbf{M}}
\newcommand*{\Nb}{\mathbf{N}}
\newcommand*{\Ob}{\mathbf{O}}
\newcommand*{\Pb}{\mathbf{P}}
\newcommand*{\Qb}{\mathbf{Q}}
\newcommand*{\Rb}{\mathbf{R}}
\newcommand*{\Sbb}{\mathbf{S}}
\newcommand*{\Tb}{\mathbf{T}}
\newcommand*{\Ub}{\mathbf{U}}
\newcommand*{\Vb}{\mathbf{V}}
\newcommand*{\Wb}{\mathbf{W}}
\newcommand*{\Xb}{\mathbf{X}}
\newcommand*{\Yb}{\mathbf{Y}}
\newcommand*{\Zb}{\mathbf{Z}}

% Bold (\bm) uppercase Latin letters, named \b<Letter>.
\newcommand*{\bA}{\bm{A}}
\newcommand*{\bB}{\bm{B}}
\newcommand*{\bC}{\bm{C}}
\newcommand*{\bD}{\bm{D}}
\newcommand*{\bE}{\bm{E}}
\newcommand*{\bF}{\bm{F}}
\newcommand*{\bG}{\bm{G}}
\newcommand*{\bH}{\bm{H}}
\newcommand*{\bI}{\bm{I}}
\newcommand*{\bJ}{\bm{J}}
\newcommand*{\bK}{\bm{K}}
\newcommand*{\bL}{\bm{L}}
\newcommand*{\bM}{\bm{M}}
\newcommand*{\bN}{\bm{N}}
\newcommand*{\bO}{\bm{O}}
\newcommand*{\bP}{\bm{P}}
\newcommand*{\bQ}{\bm{Q}}
\newcommand*{\bR}{\bm{R}}
\newcommand*{\bS}{\bm{S}}
\newcommand*{\bT}{\bm{T}}
\newcommand*{\bU}{\bm{U}}
\newcommand*{\bV}{\bm{V}}
\newcommand*{\bW}{\bm{W}}
\newcommand*{\bX}{\bm{X}}
\newcommand*{\bY}{\bm{Y}}
\newcommand*{\bZ}{\bm{Z}}


%----- calligraphic fonts -----%

% Calligraphic (\mathcal) uppercase letters, named \c<Letter>.
% \cS and \cT wrap the letter in one extra brace group.
\newcommand*{\cA}{\mathcal{A}}
\newcommand*{\cB}{\mathcal{B}}
\newcommand*{\cC}{\mathcal{C}}
\newcommand*{\cD}{\mathcal{D}}
\newcommand*{\cE}{\mathcal{E}}
\newcommand*{\cF}{\mathcal{F}}
\newcommand*{\cG}{\mathcal{G}}
\newcommand*{\cH}{\mathcal{H}}
\newcommand*{\cI}{\mathcal{I}}
\newcommand*{\cJ}{\mathcal{J}}
\newcommand*{\cK}{\mathcal{K}}
\newcommand*{\cL}{\mathcal{L}}
\newcommand*{\cM}{\mathcal{M}}
\newcommand*{\cN}{\mathcal{N}}
\newcommand*{\cO}{\mathcal{O}}
\newcommand*{\cP}{\mathcal{P}}
\newcommand*{\cQ}{\mathcal{Q}}
\newcommand*{\cR}{\mathcal{R}}
\newcommand*{\cS}{{\mathcal{S}}}
\newcommand*{\cT}{{\mathcal{T}}}
\newcommand*{\cU}{\mathcal{U}}
\newcommand*{\cV}{\mathcal{V}}
\newcommand*{\cW}{\mathcal{W}}
\newcommand*{\cX}{\mathcal{X}}
\newcommand*{\cY}{\mathcal{Y}}
\newcommand*{\cZ}{\mathcal{Z}}




%----- blackboard bold fonts-----%

% Blackboard-bold (\mathbb) letters, named by doubling the letter.
% Exception to the naming scheme: \SSS (for S).
\newcommand*{\CC}{\mathbb{C}}
\newcommand*{\EE}{\mathbb{E}}
\newcommand*{\VV}{\mathbb{V}}
\newcommand*{\II}{\mathbb{I}}
\newcommand*{\KK}{\mathbb{K}}
\newcommand*{\LL}{\mathbb{L}}
\newcommand*{\MM}{\mathbb{M}}
\newcommand*{\NN}{\mathbb{N}}
\newcommand*{\PP}{\mathbb{P}}
\newcommand*{\QQ}{\mathbb{Q}}
\newcommand*{\RR}{\mathbb{R}}
\newcommand*{\SSS}{\mathbb{S}}
\newcommand*{\ZZ}{\mathbb{Z}}
\newcommand*{\XX}{\mathbb{X}}
\newcommand*{\YY}{\mathbb{Y}}
% NOTE(review): the amssymb \mathbb alphabet presumably has no Greek capitals,
% so this may not print a blackboard Omega -- check the output before use.
\newcommand*{\OOmega}{\mathbb{\Omega}}




%----- bold greek fonts -----%

% Bold (\bm) lowercase Greek letters, named \b<greek name>.
\newcommand*{\balpha}{\bm{\alpha}}
\newcommand*{\bbeta}{\bm{\beta}}
\newcommand*{\bgamma}{\bm{\gamma}}
\newcommand*{\bepsilon}{\bm{\epsilon}}
\newcommand*{\bvarepsilon}{\bm{\varepsilon}}
\newcommand*{\bzeta}{\bm{\zeta}}
\newcommand*{\btheta}{\bm{\theta}}
\newcommand*{\bvartheta}{\bm{\vartheta}}
\newcommand*{\bkappa}{\bm{\kappa}}
\newcommand*{\blambda}{\bm{\lambda}}
\newcommand*{\bmu}{\bm{\mu}}
\newcommand*{\bnu}{\bm{\nu}}
\newcommand*{\bxi}{\bm{\xi}}
\newcommand*{\bpi}{\bm{\pi}}
\newcommand*{\bvarpi}{\bm{\varpi}}
% NOTE(review): \brho produces the \varrho variant, not \rho -- confirm this
% is intended (the other variant forms have their own \bvar... macros).
\newcommand*{\brho}{\bm{\varrho}}
\newcommand*{\bsigma}{\bm{\sigma}}
\newcommand*{\bvarsigma}{\bm{\varsigma}}
\newcommand*{\btau}{\bm{\tau}}
\newcommand*{\bupsilon}{\bm{\upsilon}}
\newcommand*{\bphi}{\bm{\phi}}
\newcommand*{\bvarphi}{\bm{\varphi}}
\newcommand*{\bchi}{\bm{\chi}}
\newcommand*{\bpsi}{\bm{\psi}}
\newcommand*{\bomega}{\bm{\omega}}

% Bold (\bm) uppercase Greek letters.
\newcommand*{\bGamma}{\bm{\Gamma}}
\newcommand*{\bDelta}{\bm{\Delta}}
\newcommand*{\bTheta}{\bm{\Theta}}
\newcommand*{\bLambda}{\bm{\Lambda}}
\newcommand*{\bXi}{\bm{\Xi}}
\newcommand*{\bPi}{\bm{\Pi}}
\newcommand*{\bSigma}{\bm{\Sigma}}
\newcommand*{\bUpsilon}{\bm{\Upsilon}}
\newcommand*{\bPhi}{\bm{\Phi}}
\newcommand*{\bPsi}{\bm{\Psi}}
\newcommand*{\bOmega}{\bm{\Omega}}


%----- Some standard definitions -----%

% Operator-like words built with \mathop, so sub-/superscripts are set as
% limits in display style.
\newcommand{\argmin}{\mathop{\mathrm{argmin}}}
\newcommand{\argmax}{\mathop{\mathrm{argmax}}}
\newcommand{\minimize}{\mathop{\mathrm{minimize}}}

\newcommand{\sign}{\mathop{\mathrm{sign}}}
\newcommand{\tr}{\mathop{\mathrm{tr}}}

% Statistical operators. \DeclareMathOperator already typesets its text in the
% upright operator font, so no explicit font switch is needed.
% The starred form places sub-/superscripts as limits in display style.
\DeclareMathOperator{\Var}{Var}
\DeclareMathOperator*{\Cor}{Corr}
\DeclareMathOperator*{\Cov}{Cov}
\DeclareMathOperator*{\ind}{\mathds{1}}  % Indicator
% Fraction forced to text style, e.g. for use inside displayed formulas.
\newcommand{\smallfrac}[2]{{\textstyle \frac{#1}{#2}}}

% Bold digits 0 and 1.
\newcommand*{\zero}{{\bm{0}}}
\newcommand*{\one}{{\bm{1}}}

% Upright "diag"; set as an ordinary symbol (no operator spacing).
\newcommand{\diag}{{\mathrm{diag}}}


% Theorem-like environments. Each one is created only if no environment of
% that name exists yet, so definitions coming from a class or package win.

% Fallback proof environment: bold "Proof" heading, closed by a filled square
% pushed to the right margin.
\ifx\proof\undefined
\newenvironment{proof}{\par\noindent\textbf{Proof\ }}{\hfill\rule{1.5ex}{1.5ex}\\[2mm]}\fi

% "theorem" owns the counter shared by most of the environments below.
\ifx\theorem\undefined
\newtheorem{theorem}{Theorem}
\fi
\ifx\example\undefined
\newtheorem{example}[theorem]{Example}
\fi
% "property" is numbered on its own counter.
\ifx\property\undefined
\newtheorem{property}{Property}
\fi
\ifx\lemma\undefined
\newtheorem{lemma}[theorem]{Lemma}
\fi
\ifx\proposition\undefined
\newtheorem{proposition}[theorem]{Proposition}
\fi
\ifx\remark\undefined
\newtheorem{remark}[theorem]{Remark}
\fi
\ifx\corollary\undefined
\newtheorem{corollary}[theorem]{Corollary}
\fi
\ifx\definition\undefined
\newtheorem{definition}[theorem]{Definition}
\fi
\ifx\conjecture\undefined
\newtheorem{conjecture}[theorem]{Conjecture}
\fi
\ifx\fact\undefined
\newtheorem{fact}[theorem]{Fact}
\fi
\ifx\claim\undefined
\newtheorem{claim}[theorem]{Claim}
\fi
\ifx\assumption\undefined
\newtheorem{assumption}[theorem]{Assumption}
\fi

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


%%%%% Norms

% \norm{x}: x between double-bar norm delimiters at normal size.
\newcommand{\norm}[1]{\lVert#1\rVert}
% \bignorm{x}: same, with fixed \bigg-size delimiters.
\newcommand{\bignorm}[1]{\biggl\lVert#1\biggr\rVert}
% \opnorm{A}{s}: A between triple bars with subscript s; the single bars are
% pulled together with negative thin spaces (\!).
\newcommand{\opnorm}[2]{| \! | \! | #1 | \! | \! |_{{#2}}}

%%%%% Dot product
% \dotp{x}{y}: angle-bracket pairing at normal size.
\newcommand{\dotp}[2]{\langle{#1},{#2}\rangle}

%%%%  brackets
% Auto-sized (\left ... \right) delimiters around the argument(s):
% angle brackets, parentheses, square brackets, braces, double bars, bars.
\newcommand{\inner}[2]{\left\langle #1,#2 \right\rangle}
\newcommand{\rbr}[1]{\left(#1\right)}
\newcommand{\sbr}[1]{\left[#1\right]}
\newcommand{\cbr}[1]{\left\{#1\right\}}
\newcommand{\nbr}[1]{\left\|#1\right\|}
\newcommand{\abr}[1]{\left|#1\right|}

%%%%%%%%%  Other commands

% \mcomment{text}: tiny note in the margin.
\newcommand{\mcomment}[1]{%
  \marginpar{\tiny{#1}}%
}
% \fcomment{text}: tiny note in a footnote.
\newcommand{\fcomment}[1]{%
  \footnote{\tiny{#1}}%
}
%\newcommand{\overbar}[1]{\mkern 2mu\overline{\mkern-2mu#1\mkern-2mu}\mkern 2mu}
% \overbar{x}: \overline whose bar is shortened by 1.5mu on each side
% (the kerns inside cancel the ones outside).
\newcommand{\overbar}[1]{%
  \mkern 1.5mu%
  \overline{\mkern-1.5mu#1\mkern-1.5mu}%
  \mkern 1.5mu%
}
% \ud: upright "d".
\newcommand{\ud}{{\mathrm{d}}}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[sep]{a}{b}: prints b, then sep (default "-"), then a. Just an example.
\newcommand{\swap}[3][-]{%
  #3#1#2%
}

\externaldocument{uai2023-supplement}

\title{Benign Overfitting in Adversarially Robust Linear Classification}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
% \author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2023 paper}{Jane~J.~von~O'L\'opez}{}}
% \author[1]{Harry~Q.~Bovik}
% \author[1,2]{Further~Coauthor}
% \author[3]{Further~Coauthor}
% \author[1]{Further~Coauthor}
% \author[3]{Further~Coauthor}
% \author[3,1]{Further~Coauthor}
% % Add affiliations after the authors
% \affil[1]{%
%     Computer Science Dept.\\
%     Cranberry University\\
%     Pittsburgh, Pennsylvania, USA
% }
% \affil[2]{%
%     Second Affiliation\\
%     Address\\
%     …
% }
% \affil[3]{%
%     Another Affiliation\\
%     Address\\
%     …
%   }

% Authors: the bracketed keys refer to the \affil entries below.
\author[1*]{Jinghui Chen}
\author[2*]{Yuan Cao}
\author[3]{Quanquan Gu}
% Add affiliations after the authors
\affil[1]{%
    The Pennsylvania State University, \texttt{jzc5917@psu.edu}
}
\affil[2]{%
    The University of Hong Kong, \texttt{yuancao@hku.hk}
}
\affil[3]{%
    University of California, Los Angeles, \texttt{qgu@cs.ucla.edu}
}
% The * key is the equal-contribution note.
\affil[*]{{Equal contribution}
}

  
  \begin{document}
\maketitle

\begin{abstract}
  ``Benign overfitting'', where classifiers memorize noisy training data yet still achieve good generalization performance, has drawn great attention in the machine learning community. To explain this surprising phenomenon, a series of works have provided theoretical justification for over-parameterized linear regression, classification, and kernel methods. However, it is not clear if benign overfitting can occur in the presence of adversarial examples, i.e., examples with tiny and intentional perturbations to fool the classifiers. In this paper, we show that benign overfitting indeed occurs in adversarial training, a principled approach to defend against adversarial examples, on sub-Gaussian mixture data. In detail, we prove risk bounds for the adversarially trained linear classifier on the mixture of sub-Gaussian data under $\ell_p$ adversarial perturbations. Our result suggests that under moderate perturbations, adversarially trained linear classifiers can achieve the near-optimal standard and adversarial risks, despite overfitting the noisy training data. Numerical experiments validate our theoretical findings. 
\end{abstract}

\section{Introduction}
\label{sec:intro}
Modern machine learning methods such as deep learning have made many breakthroughs in a variety of application domains, including image classification \citep{he2016deep, krizhevsky2012imagenet}, speech recognition \citep{hinton2012deep}, etc. These models are typically over-parameterized: the number of model parameters far exceeds the size of the training samples. One mystery is that these over-parameterized models can memorize noisy training data and yet still achieve quite good generalization performance on the test data \citep{zhang2016understanding}. 
Many efforts have been made to explain this striking phenomenon, which goes against what the classical notion of overfitting might suggest. A line of research works \citep{soudry2018implicit,ji2019implicit,nacson2019stochastic,gunasekar2018implicit,gunasekar2018characterizing} shows that there exists the so-called implicit bias \citep{neyshabur2017implicit}: the training algorithms tend to converge to certain kinds of solutions even with no explicit regularization. Specifically, \citet{soudry2018implicit,ji2019implicit,nacson2019stochastic} demonstrate that gradient descent trained linear classifiers on logistic or exponential loss with no regularization asymptotically converge to the maximum $L_2$ margin classifier. Recent works \citep{bartlett2020benign,chatterji2020finite,cao2021risk,wang2021benign,tsigler2020benign} further show that over-parameterized and implicitly regularized interpolators can indeed achieve small test error, and formulate this phenomenon as ``benign overfitting''.
% Recent works \citep{bartlett2020benign,chatterji2020finite,cao2021risk,wang2021benign,tsigler2020benign} further formulate this phenomenon as ``benign overfitting'' that over-parameterized models can indeed achieve small test error.
More concretely, suppose the classification model $f$ is parameterized by $\btheta \in \bTheta$ and the loss is denoted as $\ell(\cdot)$. The population risk is defined as 
\begin{align*}
    \PP_{(\xb, y)\sim \cD}[f_{\btheta}(\xb) \neq y],
\end{align*}
where the data pair $(\xb, y)$ is generated from a certain data generation model. \citet{chatterji2020finite} show that with sufficient over-parameterization, the gradient descent trained maximum $L_2$ margin classifier can achieve nearly optimal population risk on noisy data for data generated from a sub-Gaussian mixture model. This suggests that the overfitting can be ``benign'' in the over-parameterized setting.


% Despite those remarkable success achieved by modern machine learning techniques, 


Besides these studies on the benign overfitting phenomenon, another well-known feature of modern machine learning methods is that they are vulnerable to adversarial examples.
Recent studies \citep{szegedy2013intriguing, goodfellow6572explaining} show that modern machine learning systems are brittle: a slight input perturbation that is imperceptible to human eyes could mislead a well-trained classifier into a wrong classification result. These malicious inputs are also known as adversarial examples \citep{szegedy2013intriguing, goodfellow6572explaining}. Adversarial examples raise severe trustworthiness issues and security concerns for current machine learning systems, especially in security-critical applications. Various methods \citep{kurakin2016adversarial,madry2017towards, zhang2019theoretically,wang2019convergence,wang2020improving} have been proposed to defend against the threats posed by adversarial examples. One of the notable approaches is adversarial training \citep{madry2017towards}. Specifically, adversarial training solves the following min-max optimization problem,
\begin{align*}
% \label{eq:advtrain}
   \min_{\btheta \in \bTheta} \frac{1}{n} \sum_{i=1}^n \max_{\xb_i' \in \cB^p_{\epsilon}(\xb_i)} \ell(f_{\btheta}(\xb_i'), y_i),
\end{align*}
where $\{(\xb_i, y_i)\}_{i=1}^n$ is the training set and $\cB^p_{\epsilon}(\xb_i) = \{\xb: \|\xb - \xb_i\|_p \leq \epsilon\}$ denotes the $\epsilon$-ball around $\xb_i$ in $\ell_p$ norm ($p \geq 1$).
% The common way to solve \eqref{eq:advtrain} is the gradient descent based approach: one first solves the inner maximization problem and then perform one step gradient update on the outer minimization problem. 
Many empirical and theoretical studies have been conducted to analyze or further improve the robustness of adversarial training \citep{zhang2019theoretically, wang2020improving,carmon2019unlabeled,wang2019convergence,raghunathan2020understanding}. A recent work \citep{sanyal2021how} also pointed out that normally trained interpolators in the presence of label noise are unlikely to be adversarially robust, while adversarially robust classifiers cannot overfit noisy labels under certain conditions. 
\citet{rice2020overfitting} showed that overfitting can hurt robust generalization in adversarial training on several real-world datasets. 
\citet{dong2021exploring} pointed out that robust overfitting in adversarial training is caused by memorizing one-hot labels, which can be relieved by proper regularization.
However, there is still a lack of theoretical understanding of why and when benign overfitting %is no longer ``benign'' for adversarial training. 
%it remains mysterious whether the benign overfitting phenomenon 
can or cannot occur for adversarial training. %for extremely over-parameterized models in the presence of adversarial examples. 
% Inspired from the benign overfitting literature in standard training, the natural question to ask here is 

% \textit{Besides convergence properties, do adversarially trained linear classifiers also generalize well beyond the training set? Moreover, do they generalize well under adversarial perturbations?}

In this paper, we show that benign overfitting can indeed occur in adversarial training under certain data distributions, which largely advances our understanding of overfitting in adversarial training.
We summarize the contributions of this paper as follows:
\begin{itemize}[leftmargin=1em]
    \item We show that the benign overfitting phenomenon can occur in adversarially robust linear classifiers with sufficient over-parameterization for data generated from a sub-Gaussian mixture model. Specifically, under moderate $\ell_p$ norm perturbations, adversarially trained linear classifiers can achieve the near-optimal standard and adversarial risks, in spite of overfitting the noisy training data. 
    
    \item When the perturbation strength $\epsilon$ is set to be $0$, our adversarial risk bound reduces to the standard one. The resulting standard risk bound extends \citet{chatterji2020finite}'s risk bound to further characterize the behavior of the linear classifier trained by $t$-step gradient descent. %(and is also consistent to \citet{chatterji2020finite}'s result when $t \to \infty$).
    
    \item We show that depending on the value of $p$ (perturbation norm), the adversarial risk bound can be different. The higher value of $p$ (typically for $p \geq 2$ case) actually leads to a larger gap between the adversarial risk and the standard risk with the same $\epsilon$. 
\end{itemize}

Complementary to our theory, we also conduct numerical experiments to show that if certain data distribution assumption is violated, overfitting can become harmful.
 

\textbf{Notation.} We use lower case letters to denote scalars and lower case bold face letters to denote vectors. For a vector $\xb \in \RR^d$, we denote the $\ell_p$ norm ($p\geq 1$) of $\xb$ by $\| \xb \|_p = \big(\sum_{i=1}^d |x_i|^p\big)^{1/p}$, and the $\ell_\infty$ norm of $\xb$ by $ \|\xb\|_\infty = \max_{i=1}^d |x_i|$. We denote by $\xb^{\circ p}$ the element-wise $p$-th power of $\xb$. 
The notation $(\xb, y) \sim \cD$ denotes that the data pair $(\xb, y)$ is generated from a distribution $\cD$.
For $p\geq 1$, we denote $\cB^p_{r}(\xb)$  as the $\ell_p$ norm ball of radius $r$ centered at $\xb$.
Given two sequences $\{a_n\}$ and $\{b_n\}$, we write $a_n = O(b_n)$ if there exists a constant $0 < C < +\infty$ such that $a_n \leq C\, b_n$. We denote $a_n = \Omega(b_n)$ if $b_n = O(a_n)$. We denote $a_n = \Theta(b_n)$ if $a_n = O(b_n)$ and $a_n = \Omega(b_n)$. 

%The remainder of the paper is organized as follows. In Section \ref{sec:related}, we briefly talk about the related literature.
%In Section \ref{sec:settings}, we illustrate the problem settings.
%In Section \ref{sec:theorem}, we describe our risk bounds for adversarial training on exponential losses for linear classifiers.
%In Section \ref{sec:proof}, we present the proof of our main theorems.
% present empirical studies in Section \ref{sec:exp}. Finally we discuss and conclude our paper in Section \ref{sec:con}.


\section{Related Work}\label{sec:related}
There exists a large body of works on adversarial training, implicit bias and benign overfitting. In this section, we review the most relevant works to ours.

\textbf{Adversarial Training.} Adversarial training \citep{madry2017towards} and its variants \citep{zhang2019theoretically,wang2019convergence,wang2020improving} are currently the most effective type of approaches to empirically defend against adversarial examples \citep{szegedy2013intriguing, goodfellow6572explaining}. Many attempts have been made to understand its empirical success.
\citet{charles2019convergence,Li2020Implicit} showed that the adversarially trained linear classifier directionally converges to the maximum margin classifier. 
\citet{gao2019convergence,zhang2020over} showed that adversarial training with neural networks can achieve low robust training loss. Yet these conclusions cannot explain the test (population) performances.
Another line of research focuses on the generalization performance of adversarial training and the number of training samples. \citet{schmidt2018adversarially} showed that adversarial models require more data than standard models to achieve certain test accuracy.
\citet{chen2020more} showed that more data may actually
increase the gap between the generalization error of adversarially-trained models and standard models.
\citet{yin2019rademacher,cullina2018pac} studied the adversarial Rademacher complexity and VC-dimensions. 
Some other works focus on the trade-off between robustness and natural accuracy \citep{zhang2019theoretically,tsipras2018robustness,wu2020wider,raghunathan2020understanding,yang2020closer,dobriban2020provable,javanmard2020precise}, adversarial model complexity lower bound \citep{allen2020feature}, as well as the provable robustness upper bound \citep{fawzi2018adversarial,zhang2020understanding}. \citet{liu2021impact} studied the impact of hard training
instances on adversarially trained model’s overfitting behavior.


Recently, some works also focus on studying the learning of robust halfspaces and linear models.
\citet{montasser2020efficiently} studied the conditions on the adversarial perturbation sets under which halfspaces are robustly learnable in the presence of random label noise. 
\citet{diakonikolas2020complexity} studied the computational complexity of adversarially robust halfspaces under $\ell_p$ norm perturbations. \citet{zou2021provable} showed that adversarially trained halfspaces are provably robust with low robust classification error in the presence of noise. \citet{dan2020sharp} proposed an adversarial signal to noise ratio and studied the excess risk lower/upper bounds for learning Gaussian mixture models. \citet{taheri2020asymptotic,javanmard2020precise} studied adversarial learning of linear models on Gaussian mixture data, where the data dimension and the number of training data points have a fixed ratio.

\textbf{Implicit Bias.} Several recent works studied the implicit bias of various training algorithms in over-parameterized models. \citet{soudry2018implicit} studied the implicit bias of gradient descent trained on linearly separable data while \citet{ji2019implicit} studied non-separable cases.
\citet{gunasekar2018characterizing} studied the implicit bias of various optimization methods in linear regression and classification problems. \citet{ji2018gradient} studied the implicit bias for deep linear networks and \citet{arora2019implicit,gunasekar2018implicit} studied the implicit bias for matrix factorization. \citet{Lyu2020Gradient} studied implicit regularization of homogeneous neural networks with exponential loss and logistic loss.

\noindent\textbf{Benign Overfitting and Double Descent.} A series of recent works have studied the ``benign overfitting'' phenomenon \citep{bartlett2020benign} that when training over-parameterized models, classifiers can still achieve good population risk even when overfitting the noisy training data. \citet{bartlett2020benign,tsigler2020benign} studied the risk bounds for over-parameterized linear (ridge) regression and showed that under certain settings, the interpolating linear model with minimum parameter norm can have asymptotically optimal risk. \citet{belkin2018understand,belkin2019reconciling,belkin2019two,hastie2019surprises,wu2020optimal} quantified the dependency curve between the population risk and the degree of over-parameterization and showed that the curve has a double-descent shape. 
\citet{chatterji2020finite} studied the risk bounds in over-parameterized linear logistic regression
with label flipping noises. \citet{cao2021risk} further tightened the risk bound of \citet{chatterji2020finite} in low SNR settings. \citet{zou2021benign} studied benign overfitting of stochastic gradient descent for linear regression. \citet{shamir2022implicit} studied benign overfitting for linear predictors using a generic data model. Recently, \citet{frei2022benign,cao2022benign} studied benign overfitting in two-layer fully-connected neural networks or CNNs.
% \cite{chatterji2022foolish,wald2022malign} showed some negative results and suggests that the phenomenon of benign overfitting might not favorably extend to settings in which robustness or fairness are desirable.
\citet{chatterji2022foolish} showed a negative result for basis pursuit and compared it with ordinary least squares. \citet{wald2022malign} showed some negative results and suggested that the phenomenon of benign overfitting might not favorably extend to settings in which robustness or fairness are desirable.

% \textbf{Benign Overfitting and Double Descent.} A series of recent works have studied the ``benign overfitting'' phenomenon \citet{bartlett2020benign} that when training over-parameterized models, classifiers can still achieve good population risk even when overfitting the noisy training data. \citet{bartlett2020benign,tsigler2020benign} studied the risk bounds for over-parameterized linear (ridge) regression and showed that under certain settings, the interpolating linear model with minimum parameter norm can have asymptotically optimal risk.  \citet{chatterji2020finite,cao2021risk,wang2021benign} studied the risk bounds in linear logistic regression and linear support vector machines. \citet{belkin2018understand,belkin2019reconciling,belkin2019two,hastie2019surprises,wu2020optimal} further quantified the dependency curve between the population risk and the degree of over-parameterization and showed that the curve has a double-descent shape. 

\section{Problem Setting and Preliminaries}\label{sec:settings}
In order to properly characterize the benign overfitting phenomenon in adversarial training, we also define the population adversarial risk, which is the counterpart for population risk in the standard training scenario:
\begin{align*}
    \PP_{(\xb, y)\sim \cD}\big[ \exists \xb' \in \cB^p_{\epsilon}(\xb) \ \text{s.t.} \ f_{\btheta}(\xb') \neq y \big].
\end{align*}
The adversarial risk measures the misclassification rate of the target classifier in the presence of $\ell_p$-norm adversarial perturbations. It is easy to observe that the adversarial risk is never smaller than the standard risk, as it requires the classifier to correctly classify the data examples within the entire local $\ell_p$ norm ball.


We consider a sub-Gaussian mixture data generation model in our work. 
Specifically, the clean data $(\tilde\xb, \tilde y) \sim \tilde\cD$ is generated such that, for each data point $(\tilde\xb, \tilde y) \in \RR^d \times  \{\pm 1\}$, we have $\tilde y \sim \text{Unif}(\{\pm 1\})$ and $\tilde \xb = \tilde y\bmu + \bxi$ where $\bxi \in \RR^d$ and $\xi_1, \xi_2, \ldots, \xi_d$ are i.i.d. zero-mean sub-Gaussian variables with sub-Gaussian norm at most $1$.
The actual data examples are sampled from a noisy distribution $\cD$ which is close to the clean distribution $\tilde\cD$. Specifically, $\cD$ can be any distribution over $\RR^d \times \{\pm 1\}$ that has the same marginal distribution on $\RR^d$ as $\tilde\cD$ and satisfies $d_{\text{TV}}(\cD,\tilde\cD) \leq \eta$, where $d_{\text{TV}}$ denotes the total variation distance and $\eta$ denotes the noise level. 


Note that our data generation model is standard for studying the population risk of over-parameterized linear classification. In fact, it is exactly the same as the one studied by \citet{chatterji2020finite}. In this model, by the standard coupling lemma \citep{lindvall2002lectures}, there always exists a joint distribution on the original data and the noisy data $((\tilde\xb, \tilde y),(\xb, y))$ such that the marginal distribution of $(\tilde\xb, \tilde y)$ is $\tilde\cD$, the marginal distribution of $(\xb, y)$ is $\cD$, $\PP[\xb = \tilde\xb] = 1$ and $\PP[y \neq \tilde y] \leq \eta$. 
% \CCC{A similar data generation model is studied in \citet{dan2020sharp}, where the authors developed risk lower bounds and upper bounds for the setting where $\eta = 0$. Our setting with the presence of label flipping noise is more challenging, as  overfitting classifiers may behave drasticly differently under the presence of label flipping noises.}

In this paper, we study the problem of robust binary classification with training data $\{(\xb_i, y_i)\}_{i=1}^n$ drawn i.i.d. from the distribution $\cD$.
We denote the set of ``clean'' sample indices by $\cC:=\{k: y_k = \tilde y_k\}$ and the set of ``noisy'' sample indices by $\cN:=\{k: y_k \neq \tilde y_k\}$.
We consider the adversarially trained linear classifier under exponential loss. In such cases, the adversarial loss can be explicitly written as 
\begin{align}\label{eq:adv_loss}
    L(\btheta) = \sum_{i=1}^n \max_{\xb_i' \in \cB^p_{\epsilon}(\xb_i)} \exp(-y_i \btheta^\top \xb_i' ).
\end{align} 
In the gradient descent adversarial training algorithm, the adversarial loss $L(\btheta)$ is minimized iteratively: in each iteration, we first solve the inner maximization problem in \eqref{eq:adv_loss} with respect to the current model parameter $\btheta_{t-1}$, and then obtain $\btheta_t$ by performing one gradient descent step on the adversarial loss. We summarize the training procedure for gradient descent adversarial training\footnote{Note that in practice people often initialize $\btheta_0$ by a small random vector (e.g., Xavier initialization \citep{glorot2010understanding}), while we follow \citet{Li2020Implicit} and set $\btheta_0=\zero$ for the ease of theoretical analysis.} in Algorithm~\ref{alg:adv}. 
\setlength{\textfloatsep}{4pt}
\begin{algorithm}[t]
	\caption{Gradient Descent Adversarial Training}
	\label{alg:adv}
	\begin{algorithmic}[1]
		\STATE \textbf{input:} Training data $\{\xb_i, y_i\}_{i=1}^n$, 
		number of training iterations $T$, maximum perturbation strength $\epsilon$, training step sizes $\alpha_t$;
		\STATE initialize model parameter $\btheta_0 = \zero$
 		\FOR {$t = 1,\ldots, T$}
		      \FOR{each $\{\xb_i, y_i\}$}
		        \STATE $\xb_i' = \argmax_{\xb_i' \in \cB^p_{\epsilon}(\xb_i)} \exp(-y_i \btheta_{t-1}^\top \xb_i' ) $
		      \ENDFOR
		      \STATE $\btheta_t = \btheta_{t-1} - \alpha_{t-1} \cdot \nabla_{\btheta} L(\btheta_{t-1}) $
		\ENDFOR    
 	\end{algorithmic}
\end{algorithm}
Note that in the linear classifier setting, the inner maximization problem in \eqref{eq:adv_loss} has the following property
\begin{align}\label{eq:closeform}
    \argmax_{\xb_i' \in \cB^p_{\epsilon}(\xb_i)} \exp(-y_i \btheta^\top \xb_i' ) &=\argmax_{ \ub_i \in \cB^p_{\epsilon}(\zero)} \exp(-y_i \btheta^\top (\xb_i + \ub_i) ) \notag\\
    &= \argmin_{\|\ub_i\|_p \leq \epsilon}  y_i \btheta^\top \ub_i.
\end{align}
By H\"{o}lder's inequality, it is easy to observe that the optimal adversarial loss and the corresponding gradient can be written as 
\begin{align}
    &L(\btheta) = \sum_{i=1}^n \exp(-y_i \btheta^\top \xb_i + \epsilon \|\btheta\|_q), \notag\\
    &\nabla_{\btheta}L(\btheta)\notag\\  
    &= -\sum_{i=1}^n (y_i \xb_i - \epsilon\cdot
    \partial\|\btheta\|_q)\exp(-y_i \btheta^\top \xb_i + \epsilon \|\btheta\|_q),\nonumber
\end{align}
where $1/p + 1/q =1$.
Also note that in the over-parameterized settings, the training examples drawn from our data generation model are linearly separable with high probability (see Lemma~\ref{lemma:innerproduct_bound} in Section~\ref{sec:proof}).
Linear separability ensures that the training samples have a positive margin (with high probability). Following \citet{Li2020Implicit}, we also define the standard and adversarial margins as
\begin{align}\label{eq:max-margin}
    &\bar\gamma:=\max_{\|\btheta\|_q =1 } \min_{i\in[n]}  y_i \btheta^\top \xb_i, \quad \notag\\
    &\gamma := \max_{\|\btheta\|_2 =1 } \min_{i\in[n]} \min_{\xb_i' \in \cB^p_{\epsilon}(\xb_i)}  y_i \btheta^\top \xb_i', 
    % \quad \wb:=\argmax_{\|\btheta\|_2 =1 } \min_{i\in[n]} \min_{\xb_i' \in \cB^p_{\epsilon}(\xb_i)}  y_i \btheta^\top \xb_i',
\end{align}
which are useful in our later analysis. We also denote by $\wb$ the unique linear classifier $\btheta$ that achieves the adversarial margin $\gamma$ defined above.

% \begin{align}\label{eq:max-margin}
%     \gamma := \max_{\|\btheta\|_2 =1 } \min_{i\in[n]} \min_{\xb_i' \in \cB^p_{\epsilon}(\xb_i)}  y_i \btheta^\top \xb_i', \quad \wb:=\argmax_{\|\btheta\|_2 =1 } \min_{i\in[n]} \min_{\xb_i' \in \cB^p_{\epsilon}(\xb_i)}  y_i \btheta^\top \xb_i',
% \end{align}

 
\section{Main Results}\label{sec:theorem}
In this section, we study both the behavior of the population risk and the population adversarial risk for adversarially trained linear classifiers.


\begin{assumption}\label{assumption:radius}
% The adversarial perturbation radius $\epsilon$ is up a constant level and is smaller than the adversarial margin, i.e., $\epsilon \leq R  \min\{1, \gamma\}$. 
The adversarial perturbation radius $\epsilon$ is upper bounded by a constant $R$ and is smaller than the $\ell_p$ data margin $\bar\gamma$, i.e., $\epsilon \leq   \min\{R, \bar\gamma  \}$. 
% \CC{should just be $\gamma$? because in our definition $\gamma$ is based on our sacling of $\xb$}
\end{assumption}
% In practice, we consider adversarial training mainly because we know that the classier should be able to do correct labeling up to the adversarial perturbations. (For example, we commonly use $\ell_\infty$ adversarial perturbations in image classification tasks because we know $\ell_\infty$  perturbations with reasonable scaling will be ignored by human beings.)
% Assumption~\ref{assumption:radius} is exactly to ensure that our adversarial training is  indeed within such a reasonable setting.
The goal of adversarial training is to obtain high-accuracy classifiers that are also robust to small input perturbations which can be ignored by human beings (e.g., small $\ell_\infty$-norm perturbations that are invisible to human eyes). Therefore, Assumption~\ref{assumption:radius} is reasonable by constraining the maximum allowable perturbation magnitude.

\begin{assumption}\label{assumption:noise_lowerbound}
The noise $\bxi$ in the data generation model satisfies that $\EE[\|\bxi\|_2^2] \geq \kappa d$ for some constant $\kappa$.%$\kappa\in (0,1)$.
\end{assumption}
Assumption~\ref{assumption:noise_lowerbound} is a common condition that has also been considered in \citet{chatterji2020finite}. It ensures that the summation of the variances of the data input increases in the order of $\Theta(d)$. Clearly, this assumption covers the most common setting where the entries of $\bxi$ are i.i.d. and have a variance larger than or equal to $\kappa$.



\begin{assumption}\label{assumption:gradient_descent}
The gradient descent starts at $\mathbf{0}$, and the step sizes are set as $\alpha_0 = 1/(Gdn)$, $\alpha_t = \alpha \leq 1/(GdnM)$ for 
% $M = \max\{[2d+\epsilon (q-1)\sqrt{d}/\gamma]\exp(-\gamma^2/(Gd) + G\epsilon), 1\}$ 
$M = \max\{[2d+\epsilon (q-1) d^{\frac{3q-2}{2q-2}}/\gamma]\exp(-\gamma^2/(Gd) + \epsilon /G), 1\}$ and a constant $G$.
\end{assumption}
Assumption~\ref{assumption:gradient_descent} summarizes our assumptions about the gradient descent algorithm on the adversarial loss. The learning rate conditions here ensure the convergence of adversarial training, and are inspired by \citet{Li2020Implicit}.


We first present our theorem for the standard risk of the adversarial training method (Algorithm~\ref{alg:adv}).
\begin{theorem}[Standard Risk of Adversarial Training]\label{theorem:poprisk}
% There exists absolute constants $C,C'$ such that the following result hold. 
For any $p \in [1,+\infty)$, suppose that Assumptions~\ref{assumption:radius}, \ref{assumption:noise_lowerbound} and~\ref{assumption:gradient_descent} hold with $\kappa\in(0,1]$ and large enough constants $R$ and $G$. Moreover, for any $\delta \in (0,1)$, suppose the number of training samples $n \geq C\log(1/\delta)$, the dimension $d \geq C\cdot\max\{n\|\bmu\|_2^2, n^2 \log(n/\delta)\}$, the noise level $\eta < 1/C$,  and $\|\bmu\|_2^2 \geq C \max\{ \log(n/\delta), \epsilon \| \bmu \|_q \}$ for a large enough constant $C$. Then with probability at least $1 - \delta$, adversarially trained linear classifier $f_{\btheta_t}$ for sufficiently large $t$ under $\ell_p$-norm $\epsilon$-perturbation satisfies the following standard risk
% \begin{align*}
%     \PP_{(\xb, y)\sim \cD}[f_{\btheta_t}(\xb) \neq y]
%   &\leq \eta + \exp\Bigg( -c \bigg(   \frac{\big(\frac{\|\bmu\|_2^2 }{4} - \epsilon\|\bmu\|_q  \big)}{  (\sqrt{c_0} + \epsilon)\sqrt{d} } - \frac{c_3\|\bmu\|_2\log n}{\log t}\bigg)^2 \Bigg),
% \end{align*}
\begin{align*}
    &\PP_{(\xb, y)\sim \cD}[f_{\btheta_t}(\xb) \neq y]\\
   &\leq \eta + \exp\Bigg( -C' \bigg( \frac{\big(\|\bmu\|_2^2  - 4\epsilon\|\bmu\|_q  \big)}{  (C'' + \epsilon)\sqrt{d} } - \frac{C'''\|\bmu\|_2\log n}{\log t}\bigg)^2 \Bigg),
\end{align*}
where $C',C'',C''' > 0$ are absolute constants, $1/p + 1/q =1$.
\end{theorem}

% \begin{theorem}[Standard Risk of Adversarial Training]\label{theorem:poprisk}
% There exists absolute constants $C,C'$ such that the following result hold. 
% For any $\delta > 0$, suppose the number of training samples $n \geq C\log(1/\delta)$, the dimension $d \geq C\cdot\max\{n\|\bmu\|_2^2, n^2 \log(n/\delta)\}$, the noise level $\eta < 1/C$,  and $\|\bmu\|_2^2 \geq C\log(n/\delta)$ for a large enough constant $C$, $\EE[\|\bxi\|_2^2] \geq \kappa d$ for some $0<\kappa < 1$, step size $\alpha_0 = 1/(c_0Cdn)$, $\alpha_t = \alpha \leq 1/(c_0CdnM)$ for $M = \max\{[2d+\epsilon (q-1)\sqrt{d}/\gamma]\exp(-\gamma^2/c_0Cd + \epsilon /\sqrt{c_0C}), 1\}$, then with probability $1 - \delta$, adversarially trained linear classifier $f$ parameterized by $\btheta_t$ under $\ell_p$-norm $\epsilon$-perturbation satisfies the following standard risk
% \begin{align*}
%     \PP_{(\xb, y)\sim \cD}[f_{\btheta_t}(\xb) \neq y]
%   &\leq \eta + \exp\Bigg( -c \bigg(   \frac{\big(\frac{\|\bmu\|_2^2 }{4} - \epsilon\|\bmu\|_q  \big)}{  (\sqrt{c_0} + \epsilon)\sqrt{d} } - \frac{c_3\|\bmu\|_2\log n}{\log t}\bigg)^2 \Bigg),
% \end{align*}
% where $0 <\delta < 1/C, c_0>1$, $c_3, c$ are absolute constants, $1/p + 1/q =1$.
% \end{theorem}




% \begin{theorem}[Standard Risk of Adversarial Training]\label{theorem:poprisk}
% Suppose the number of training samples $n \geq C\log(1/\delta)$, the dimension $d \geq C\cdot\max\{n\|\bmu\|_2^2, n^2 \log(n/\delta)\}$, noise level $\eta < 1/C$, adversarial perturbation $\epsilon \leq \min\{1/(2c_0^{1.5}), \sqrt{c_0d}\gamma\}$ and $\|\bmu\|_2^2 \geq C\log(n/\delta)$ for a large enough constant $C$, $\EE[\|\bxi\|_2^2] \geq \kappa d$ for some $0<\kappa < 1$, step size $\alpha_0 = 1/(c_0Cdn)$, $\alpha_t = \alpha < 1/(c_0CdnM)$ for $M = \max\{[2d+\epsilon (q-1)/\sqrt{c_0C\gamma^2}]\exp(-\gamma^2 + \epsilon/\sqrt{c_0C}), 1\}$, then with probability $1 - \delta$, adversarially trained linear classifier $f$ parameterized by $\btheta_t$ under $\ell_p$-norm $\epsilon$-perturbation satisfies the following standard risk
% \begin{align*}
%     \PP_{(\xb, y)\sim \cD}[f_{\btheta_t}(\xb) \neq y]
%   &\leq \eta + \exp\Bigg( -c \bigg(   \frac{\big(\frac{\|\bmu\|_2^2 }{4} - \epsilon\|\bmu\|_q  \big)}{  (\sqrt{c_0} + \epsilon)\sqrt{d} } - \frac{c_3\|\bmu\|_2\log n}{\log t}\bigg)^2 \Bigg),
% \end{align*}
% where $0 <\delta < 1/C, c_0>1$, $c_3, c$ are absolute constants, $1/p + 1/q =1$.
% \end{theorem}

\begin{remark}
Theorem~\ref{theorem:poprisk} presents the standard risk of adversarial training under $\ell_p$ norm perturbations. Note that the adversarially trained linear classifier enjoys a population risk bound which decreases as the number of training iterations $t$ increases. Specifically, when $t \to \infty$, we have
% \begin{corollary}
% For any $\delta \in (0,1)$, under the same conditions as in Theorem \ref{theorem:poprisk}, with probability at least $1 - \delta$, when $t \to \infty$, we have the following standard risk
% \begin{align*}
%     \PP_{(\xb, y)\sim \cD}[f_{\btheta_t}(\xb) \neq y]
%   &\leq \eta + \exp\Bigg( -C' \bigg( \frac{\big(\|\bmu\|_2^2  - 4\epsilon\|\bmu\|_q  \big)}{  (C'' + \epsilon)\sqrt{d} } - \frac{C'''\|\bmu\|_2\log n}{\log t}\bigg)^2 \Bigg),
% \end{align*}
\begin{align}\label{eq:converge_risk}
    &\lim_{t \to \infty} \PP_{(\xb, y)\sim \cD}[f_{\btheta_t}(\xb) \neq y]\notag\\
    &\leq \eta + \exp\Bigg( -C' \bigg( \frac{\big(\|\bmu\|_2^2  - 4\epsilon\|\bmu\|_q  \big)}{  (C'' + \epsilon)\sqrt{d} } \bigg)^2 \Bigg).
\end{align}
% \begin{align}\label{eq:converge_risk}
%     \lim_{t \to \infty} \PP_{(\xb, y)\sim \cD}[f_{\btheta_t}(\xb) \neq y]
%     &\leq \eta + \exp\Bigg( -c \cdot   \frac{\big(\frac{\|\bmu\|_2^2 }{4} - \epsilon\|\bmu\|_q  \big)^2}{  (\sqrt{c_0} + \epsilon)^2d }   \Bigg).
% \end{align}
% \end{corollary}
\end{remark}

\begin{remark}\label{remark:std}
For \eqref{eq:converge_risk}, consider the case where the sample size $n$ is fixed while the dimension $d$ and $\|\bmu\|_2$ are growing; we discuss the conditions under which the standard risk approaches its minimum, the noise level $\eta$. Note that when $1 \leq p \leq 2$ we have $q \geq 2$ and $\|\bmu\|_q \leq \|\bmu\|_2$. In this case, if $\|\bmu\|_2 = \Omega(d^{1/4})$, the standard risk will come close to the noise level $\eta$ when $d$ is sufficiently large.  When $p > 2$ and therefore $q < 2$, we have $\|\bmu\|_q \leq d^{1/q -1/2}\|\bmu\|_2$. In this case, if $\|\bmu\|_2 = \Omega(d^{1/4})$ and $\epsilon = O(\|\bmu\|_2/d^{1/q -1/2})$, the standard risk will come close to the noise level $\eta$ with sufficiently large $d$.
Note that our theorem condition also requires that $\|\bmu\|_2 = O(\sqrt{d})$. Therefore, in order to reach the standard risk of $\eta$, we need $\|\bmu\|_2 = \Theta(d^r)$ for some $r \in (1/4, 1/2]$. 
\end{remark}

\begin{remark}
Choosing $\epsilon =0$ will reduce to the standard training case. Specifically, if we set $\epsilon =0$ in \eqref{eq:converge_risk}, it reduces to the same conclusion as Theorem 3.1 in \citet{chatterji2020finite}. However, our result is more general, as it covers the setting of adversarial training and gives risk bounds for the linear model obtained with a finite number of gradient descent iterations.
\end{remark}


% \begin{remark}
% Theorem \ref{theorem:poprisk} works for adversarial training with $1\leq p < \infty$. However, by taking $p \rightarrow \infty$ and $\alpha \rightarrow 0$, our result also implies the risk bound of the linear model trained by gradient flow on the robust empirical loss \eqref{eq:robustloss}. A similar argument has been used in the proof of Theorem D.1 in \citet{Li2020Implicit}.
% \end{remark}



\begin{theorem}[Adversarial Risk of Adversarial Training]\label{theorem:popadvrisk}
For any $\delta \in (0,1)$, under the same conditions as in Theorem~\ref{theorem:poprisk}, with probability at least $1- \delta$, the adversarially trained linear classifier $f_{\btheta_t}$ for sufficiently large $t$ under $\ell_p$-norm $\epsilon$-perturbation satisfies the following adversarial risk if $1 \leq p \leq 2$
{\scriptsize
\begin{align*}
    &\PP_{(\xb, y)\sim \cD}\big[ \exists \xb' \in \cB^p_{\epsilon}(\xb) \ s.t., \ f_{\btheta_t}(\xb') \neq y \big] \\
    &\leq \eta + \exp\Bigg( -C' \bigg( \frac{ \big(\|\bmu\|_2^2  - 4\epsilon\|\bmu\|_q  \big)}{ (C'' + \epsilon) \sqrt{d} } - \frac{C'''\|\bmu\|_2\log n}{\log t} - \epsilon \bigg)^2 \Bigg),
\end{align*}
}
and if $p > 2$,
{\scriptsize
\begin{align*}
    &\PP_{(\xb, y)\sim \cD}\big[ \exists \xb' \in \cB^p_{\epsilon}(\xb) \ s.t., \ f_{\btheta_t}(\xb') \neq y \big] \\
    &\leq \eta + \exp\Bigg( -C' \bigg( \frac{ \big(\|\bmu\|_2^2  - 4\epsilon\|\bmu\|_q  \big)}{ (C'' + \epsilon) \sqrt{d} } - \frac{C'''\|\bmu\|_2\log n}{\log t} - \epsilon d^{\frac{1}{q} -\frac{1}{2}} \bigg)^2 \Bigg),
    % &\qquad\leq \eta + \exp\Bigg( -c \bigg( \frac{ \big(\frac{\|\bmu\|_2^2 }{4} - \epsilon\|\bmu\|_q  \big)}{ (\sqrt{c_0} + \epsilon) \sqrt{d} } - \frac{c_3\|\bmu\|_2\log n}{\log t} - \epsilon d^{\frac{1}{q} -\frac{1}{2}}\bigg)^2 \Bigg),
\end{align*}
}
where $C',C'',C'''>0$ are absolute constants, $1/p + 1/q =1$.
\end{theorem}

% \begin{theorem}[Adversarial Risk of Adversarial Training]\label{theorem:popadvrisk}
% Under the same conditions as in Theorem \ref{theorem:poprisk}, the adversarially trained linear classifier $f$ parameterized by $\btheta_t$ under $\ell_p$-norm $\epsilon$-perturbation satisfies the following adversarial risk if $1 \leq p \leq 2$
% \begin{align*}
%     &\PP_{(\xb, y)\sim \cD}\big[ \exists \xb' \in \cB^p_{\epsilon}(\xb) \ s.t., \ f_{\btheta}(\xb') \neq y \big] \\
%     &\qquad\leq \eta + \exp\Bigg( -c \bigg( \frac{ \big(\frac{\|\bmu\|_2^2 }{4} - \epsilon\|\bmu\|_q  \big)}{ (\sqrt{c_0} + \epsilon) \sqrt{d} } - \frac{c_3\|\bmu\|_2\log n}{\log t} - \epsilon \bigg)^2 \Bigg),
% \end{align*}
% and if $p > 2$, we have
% \begin{align*}
%     &\PP_{(\xb, y)\sim \cD}\big[ \exists \xb' \in \cB^p_{\epsilon}(\xb) \ s.t., \ f_{\btheta}(\xb') \neq y \big] \\
%     &\qquad\leq \eta + \exp\Bigg( -c \bigg( \frac{ \big(\frac{\|\bmu\|_2^2 }{4} - \epsilon\|\bmu\|_q  \big)}{ (\sqrt{c_0} + \epsilon) \sqrt{d} } - \frac{c_3\|\bmu\|_2\log n}{\log t} - \epsilon d^{\frac{1}{q} -\frac{1}{2}}\bigg)^2 \Bigg),
% \end{align*}
% where $\delta, c_0, c_3$ and $c$ are absolute constants, $1/p + 1/q =1$.
% \end{theorem}

\begin{remark}
Theorem \ref{theorem:popadvrisk} shows the adversarial risk of adversarial training under $\ell_p$ norm perturbations. The major difference from the standard risk (Theorem \ref{theorem:poprisk}) lies in the additional $\epsilon$ or $\epsilon d^{1/q-1/2}$ term in the exponential function. This aligns with the common sense that adversarial risk should always be larger than the standard risk. 
This also suggests that for larger $p$-norm ($p > 2$) perturbation, the same magnitude of perturbation would lead to a larger gap between the adversarial risk and the standard risk.
In terms of the perturbation strength, we can also observe that with a larger $\epsilon$, adversarially trained classifiers obtain worse adversarial risk. This has been verified by many empirical observations of adversarial training \citep{madry2017towards, zhang2019theoretically}.
\end{remark}

% The following Corollary gives the adversarial risk bound when $t \to \infty$.
% \begin{corollary}
% Under the same conditions as in Theorem \ref{theorem:poprisk}, with probability $1 - \delta$, when $t \to \infty$, we have the following adversarial risk if $1 \leq p \leq 2$
% \begin{align*}
%     &\lim_{t \to \infty}\PP_{(\xb, y)\sim \cD}\big[ \exists \xb' \in \cB^p_{\epsilon}(\xb), f_{\btheta}(\xb') \neq y \big] \leq \eta + \exp\Bigg( -C' \bigg( \frac{ \big(\|\bmu\|_2^2  - 4\epsilon\|\bmu\|_q  \big)}{ (C'' + \epsilon) \sqrt{d} } - \epsilon \bigg)^2 \Bigg),
% \end{align*}
% and if $p > 2$, we have
% \begin{align*}
%     &\lim_{t \to \infty}\PP_{(\xb, y)\sim \cD}\big[ \exists \xb' \in \cB^p_{\epsilon}(\xb), f_{\btheta}(\xb') \neq y \big] \leq \eta + \exp\Bigg( -C' \bigg( \frac{ \big(\|\bmu\|_2^2  - 4\epsilon\|\bmu\|_q  \big)}{ (C'' + \epsilon) \sqrt{d} } - \epsilon d^{\frac{1}{q} -\frac{1}{2}} \bigg)^2 \Bigg).
% \end{align*}
% \end{corollary}

\begin{remark}\label{remark:benign_adv}
Note that when $t \to \infty$, if $1 \leq p \leq 2$, we have the following adversarial risk bound:
\begin{align*}
    &\lim_{t \to \infty}\PP_{(\xb, y)\sim \cD}\big[ \exists \xb' \in \cB^p_{\epsilon}(\xb), f_{\btheta_t}(\xb')
    \neq y \big] \\
    &\leq \eta + \exp\Bigg( -C' \bigg( \frac{ \big(\|\bmu\|_2^2  - 4\epsilon\|\bmu\|_q  \big)}{ (C'' + \epsilon) \sqrt{d} } - \epsilon \bigg)^2 \Bigg),
\end{align*}
and if $p > 2$, we have
\begin{align*}
    &\lim_{t \to \infty}\PP_{(\xb, y)\sim \cD}\big[ \exists \xb' \in \cB^p_{\epsilon}(\xb), f_{\btheta_t}(\xb') \neq y \big] \\
    &\leq \eta + \exp\Bigg( -C' \bigg( \frac{ \big(\|\bmu\|_2^2  - 4\epsilon\|\bmu\|_q  \big)}{ (C'' + \epsilon) \sqrt{d} } - \epsilon d^{\frac{1}{q} -\frac{1}{2}} \bigg)^2 \Bigg).
\end{align*}
Similar to the standard risk case (Remark \ref{remark:std}), when $1 \leq p \leq 2$, if $\|\bmu\|_2 = \Theta(d^r)$ for some $r \in (1/4, 1/2]$, the adversarial risk will also come close to the noise level $\eta$ with sufficiently large $d$.  When $p > 2$, if we have $\|\bmu\|_2 = \Theta(d^r)$ for some $r \in (1/4, 1/2]$ and $\epsilon = O(\|\bmu\|_2/d^{1/q})$, the adversarial risk will be close to $\eta$ with sufficiently large $d$. Note that compared to the standard risk, this requirement on $\epsilon$ is slightly stronger.
\end{remark}


\begin{remark}
Note that our results in Theorem~\ref{theorem:popadvrisk} imply that overfitting in adversarial training can be benign for certain distributions (e.g., sub-Gaussian mixture data). %Specifically, Remark \ref{remark:benign_adv} shows that for linear models with sub-Gaussian mixture data, the overfitting effect is indeed benign. 
This is later empirically verified in the experiments for both linear and neural network models. 
\end{remark}

%  a striking fact that unlike those observed in previous studies (e.g., \citet{rice2020overfitting} showed that overfitting leads to worse empirical robustness on real image distributions such as CIFAR-10 \citep{krizhevsky2009learning} data),

% Theorem~\ref{theorem:popadvrisk} shows that benign overfitting indeed occurs in adversarial training when data is generated from a Gaussian mixture model. In contrast, recent studies such as \citet{rice2020overfitting} showed that overfitting hurts generalization for adversarial training on training real-world data such as CIFAR-10 \citep{krizhevsky2009learning}. Here we would like to emphasize that our result does not contradict with existing results. 
% However, this conclusion seemingly contradicts with recent studies such as \citet{rice2020overfitting}, where it is shown that overfitting hurts generalization for adversarial training on training real-world data such as CIFAR-10 \citep{krizhevsky2009learning}. Here we would like to emphasize that our result does not contradict with existing results. 
% The reasons are as follows.
% \begin{itemize}[leftmargin = *]
%     \item \citet{rice2020overfitting} showed that the overfitting robust classifier performs worse than the  robust classifier trained with early stopping. This does not contradict with out result, as our result does NOT show that the overfitting classifier is better or similar to classifiers trained with optimal early stopping or regularization. Our result only shows that the risk achieved by the overfitting classifier (possibly worse than optimal for finite $\| \bmu \|_2$) can still be relatively small and asymptotically optimal.
%     This is also the nature of existing works on benign overfitting  \citep{bartlett2020benign,chatterji2020finite,cao2021risk}.
%     % we are not showing that the overfitting classifier is better or similar to classifiers trained with optimal early stopping or regularization. 
%     % This is the nature of existing benign overfitting results \citep{bartlett2020benign,chatterji2020finite,cao2021risk} as well. This paper and the recent results on benign overfitting are all to show that, although overfitting predictors perform worse than optimally regularized classifiers, the risk achieved by the overfitting predictors may still be relatively small and asymptotically optimal. 
%     \item Our results are under the assumption that the data follows a sub-Gaussian mixture model. Moreover, the benign overfitting phenomenon only occurs when the two Gaussian clusters are fairly separated (i.e., $\| \bmu \|_2$ is large enough) so that the risk bound in Theorem~\ref{theorem:popadvrisk} can be close to $\eta$ (note that the data points are still mixed because of the noises). In practice, the learning problems may be more challenging and our theoretical assumptions cannot be exactly matched, which indicates that benign overfitting may be harder to occur in those problems. In Section~\ref{sec:experiment_realdata}, we further demonstrate that overfitting robust classifiers can achieve a much higher accuracy when the data is filtered by a GMM. 
% \end{itemize}






% , and the overfitting classifier are still quite different from the Bayes optimal classifier because it overfits the data with flipped labels as well
% However, with the experiments given in Figure~\ref{fig:cifar10}, we argue that our result does not contradict with existing works. The reasons are as follows.
% \begin{itemize}
%     \item 
% \end{itemize}


% \begin{itemize}[leftmargin = *]
%     \item \citet{rice2020overfitting} showed that the overfitting robust classifier performs worse than the  robust classifier trained with early stopping. We clarify that by demonstrating the benign overfitting phenomenon, we are not showing that the overfitting classifier is better or similar to classifiers trained with optimal early stopping or regularization. This is the nature of existing benign overfitting results \citep{bartlett2020benign,chatterji2020finite,cao2021risk} as well. This paper and the recent results on benign overfitting are all to show that, although overfitting predictors perform worse than optimally regularized classifiers, the risk achieved by the overfitting predictors may still be relatively small and asymptotically optimal. 
%     \item Our theoretical guarantees of benign overfitting is under the assumption that the data follows a sub-Gaussian mixture model, and the benign overfitting phenomenon only occurs when the two Guassian clusters are fairly separated (i.e., $\| \bmu \|_2$ is large enough) so that the risk bound can be close to $\eta$ (Note that even if the Gaussian clusters are seperated, the data points from different classes are NOT separated because of the noises, and the overfitting classifier are still quite different from the Bayes optimal classifier because it overfits the data with flipped labels as well). In practice, the learning problems may be more challenging and our theoretical assumptions cannot be exactly matched, which indicates that benign overfitting may be harder to occur in those problems. In Section~\ref{sec:experiment_realdata}, we further demonstrate that overfitting robust classifiers can achieve a much higher accuracy when the data is filtered by a GMM. 
% \end{itemize}


\section{Proof Outline of the Main Results}\label{sec:proof}
In this section, we present the proof of our main theorems, which consists of three main steps.  %Before we jump into the main proof, let us first lay out some of the key technical lemmas used to prove our main result.

\noindent\textbf{Statistical properties of the training data points.} We first list some basic properties of the training data points based on our data model defined in Section~\ref{sec:settings}. 
\begin{lemma}[Lemma 4.7 in \citet{chatterji2020finite}]\label{lemma:innerproduct_bound}
Let $\zb_k=y_k\xb_k$. There exist absolute constants $R$, $\kappa$, $G$ and $C$, such that if the assumptions in Theorem~\ref{theorem:poprisk} hold, then with probability at least $1 - \delta$, 
\begin{align}
    &\frac{d}{c_0} \leq \|\zb_k\|_2^2 \leq c_0 d \text{ for all } k \in [n], \label{eq:bound1}\\ 
    &|\zb_i^\top \zb_j| \leq c_0\big(\|\bmu\|_2^2 + \sqrt{d   \log(n/\delta)}\big) \text{ for all } i\neq j,  \label{eq:bound2}\\
    &|\bmu^\top \zb_k - \|\bmu\|_2^2| \leq \|\bmu\|_2^2/2 \text{ for all } k \in \cC, \label{eq:bound3}\\ 
    &|\bmu^\top \zb_k - (-\|\bmu\|_2^2)| \leq \|\bmu\|_2^2/2 \text{ for all } k \in \cN, \label{eq:bound4}
\end{align} 
the number of noisy samples $|\cN| \leq (\eta + c_1)n$, and all training samples are linearly separable,
where $c_0 > 1$ and $c_1 > 0$ are absolute constants. 
\end{lemma}

Lemma \ref{lemma:innerproduct_bound} directly follows Lemma 4.7 in \citet{chatterji2020finite}. It provides direct high probability bounds for $\|\zb_k\|_2$ and $\bmu^\top\zb_k$ and also suggests that $\zb_k$ vectors are nearly pairwise orthogonal in over-parameterized settings. It also guarantees that training examples are linearly separable with high probability.


\noindent\textbf{Landscape properties of the training objective function.} Given the properties of the training data points, we proceed to establish the landscape properties of the objective function $ L(\btheta_1)$. 
The following lemma bounds the loss for the adversarially trained classifier in step $1$.

% \begin{lemma}\label{lemma:loss_theta0_bound}
% Under the same conditions as in Theorem~\ref{theorem:poprisk}, with probability at least $1 - \delta$, we have
% \begin{align*}
%     L(\btheta_1) \leq 2n.
% \end{align*}
% \end{lemma}


\begin{lemma}[Theorem 3.4 in \citet{Li2020Implicit}]\label{lemma:bound_loss}
% Suppose $\epsilon \leq \gamma$, $M = \max\{[2d+\epsilon (q-1) d^{\frac{3q-2}{2q-2}}/\gamma]\exp(-\gamma^2/(c_0d) + \epsilon /\sqrt{c_0}), 1\}$, step size $\alpha_0 < 1/(c_0dn)$, $\alpha_t = \alpha \leq 1/(c_0dnM)$, then 
Under the same conditions as in Theorem~\ref{theorem:poprisk}, 
with probability at least $1 - \delta$, we have $L(\btheta_1) \leq 2n$, and 
\begin{align}
    &L(\btheta_{t+1}) \leq L(\btheta_{t}), \label{eq:tuo1}\\
    &1 - \frac{\btheta_t^\top\wb}{\|\btheta_t\|_2} \leq \frac{c_3\log n}{\log t}\label{eq:tuo2}
\end{align}
% \begin{align*}
%     L(\btheta_{t+1}) \leq L(\btheta_{t}), \ L(\btheta_t) \leq \frac{c_3\log^2 t}{c_0\alpha  d\gamma^2 t},
% \end{align*}
for all $t > 0$, where $c_3$ is an absolute constant.
\end{lemma}

% Combining Lemma \ref{lemma:loss_theta0_bound} and 
By Lemma \ref{lemma:bound_loss}, one can easily observe that the adversarial training loss is bounded by $2n$ along the entire training trajectory. Lemma \ref{lemma:bound_loss} also suggests that when $t \to \infty$, the adversarially trained classifier $\btheta_t$ will converge in direction to the max adversarial margin classifier $\wb$ defined in \eqref{eq:max-margin}.

\noindent\textbf{Length and direction of the adversarial training iterates $\btheta_t$.} We also establish the properties of the adversarial training iterates $\btheta_t$. We have the following lemmas.
\begin{lemma}\label{lemma:bound_vt}
Under the same conditions as in Theorem~\ref{theorem:poprisk}, for all adversarial training iterations $t>0$, with probability at least $1 - \delta$, we have
$
    \|\btheta_{t+1}\|_2 \leq  (\sqrt{c_0} + \epsilon)\sqrt{d} \sum_{m=0}^t \alpha_m  L(\btheta_m),
$
where $c_0$ is the absolute constant in Lemma \ref{lemma:innerproduct_bound}.
\end{lemma}

Lemma~\ref{lemma:bound_vt} upper bounds the $\ell_2$ norm of the adversarially trained classifier $\btheta_t$ by the summation of training losses along the training trajectory.

\begin{lemma}\label{lemma:loss_range}
Let $\zb_k = y_k\xb_k$. Under the same conditions as in Theorem~\ref{theorem:poprisk}, for all adversarial training iterations $t \geq 0$, with probability at least $1- \delta$, we have
$
     \max_{k=1}^n \exp(- \btheta_t^\top \zb_k)  \leq c_3 \min_{k=1}^n \exp(- \btheta_t^\top \zb_k),
$
where $c_3>0$ is an absolute constant. 
\end{lemma}

Lemma~\ref{lemma:loss_range} provides a way to control the loss of the noisy examples during the training procedure. Note that if $\max_{k=1}^n \exp(- \btheta_t^\top \zb_k) \leq c_3 \min_{k=1}^n \exp(- \btheta_t^\top \zb_k)$, we also have $\max_{k=1}^n \exp(- \btheta_t^\top \zb_k + \epsilon\|\btheta_t\|_q) \leq c_3 \min_{k=1}^n \exp(- \btheta_t^\top \zb_k + \epsilon\|\btheta_t\|_q)$. Therefore, the worst example training loss can be bounded via the best example training loss and further be bounded by the average training loss $L(\btheta_t)$. In this way, we can guarantee that those noisy examples will not have major influence on model training even in later training stages.


By Lemmas \ref{lemma:innerproduct_bound}-\ref{lemma:loss_range}, we establish the following key lemma for our main theorems.

\begin{lemma}\label{lemma:lastlemma}
Under the same conditions as in Theorem~\ref{theorem:poprisk}, with probability at least $1-\delta$, the adversarially trained linear model parameter $\btheta_t$ satisfies
\begin{align*}
    \frac{\bmu^\top\btheta_{t}}{\|\btheta_{t}\|_2} \geq \bigg(\frac{\|\bmu\|_2^2 }{4} - \epsilon\|\bmu\|_q  \bigg)  \frac{1}{  (\sqrt{c_0} + \epsilon)\sqrt{d} } - \frac{c_3\|\bmu\|_2\log n}{\log t},
\end{align*}
where $c_0$ is the absolute constant in Lemma~\ref{lemma:innerproduct_bound}.
\end{lemma}

Lemma \ref{lemma:lastlemma} provides the lower bound for the inner product of $\bmu$ and the direction of $\btheta_t$. This lemma extends Lemma 4.4 in \citet{Li2020Implicit} by considering the training iteration $t$ rather than just the converged classifier $\wb$, and also extends to the adversarial training setting. Notice that this lower bound actually gets larger with the increase of iteration $t$.

\noindent\textbf{Finalizing the proof.} We now present the proof for Theorems \ref{theorem:poprisk} and \ref{theorem:popadvrisk}.
% \subsection{Proof of Theorem \ref{theorem:poprisk}}

 
\begin{proof}[Proof of Theorem \ref{theorem:poprisk}]
First, by the standard coupling lemma \citep{lindvall2002lectures}, there always exists a joint distribution on original data and noisy data $((\tilde\xb, \tilde y),(\xb, y))$ such that the marginal distribution for $(\tilde\xb, \tilde y)$ is $\tilde\cD$, the marginal distribution for $(\xb, y)$ is $\cD$, $\PP[\xb = \tilde\xb] = 1$ and $\PP[y \neq \tilde y] \leq \eta$.
Notice that the standard population risk can be written as
\begin{align}\label{eq:th1}
    \PP_{(\xb, y)\sim \cD}[f_{\btheta_t}(\xb) \neq y]
    &= \PP_{(\xb, y)\sim \cD}[y \cdot\btheta_t^\top \xb < 0]\notag\\
    &\leq \eta + \PP_{(\xb, y)\sim \cD}[y \cdot\btheta_t^\top \xb < 0, y=\tilde y] \notag\\
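    &\leq \eta + \PP_{(\xb, y)\sim \cD}[\tilde  y \cdot\btheta_t^\top \xb < 0],
\end{align}
% \vspace{-10pt}
where the first inequality holds since $\PP[y \neq \tilde y] \leq \eta$, and the second holds since $y = \tilde y$ on the event considered, so that dropping the constraint $y = \tilde y$ can only enlarge the event.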
Since $\tilde y$ is the clean label for $\xb$, $\tilde y \xb$ follows the same distribution as $\bxi + \bmu$ and $\EE[\tilde y \cdot \btheta_t^\top \xb] = \btheta_t^\top \bmu$.
Therefore, \eqref{eq:th1} can be further written as
\begin{align}\label{eq:th2}
   &\PP_{(\xb, y)\sim \cD}[f_{\btheta_t}(\xb) \neq y]\notag\\
   &\leq \eta + \PP_{(\xb, y)\sim \cD}\big[\tilde  y \cdot\btheta_t^\top \xb - \EE[\tilde  y \cdot\btheta_t^\top \xb] < -\btheta_t^\top \bmu \big]\notag\\
   &= \eta + \PP_{(\xb, y)\sim \cD}\big[\btheta_t^\top\big(\tilde  y \xb - \EE[\tilde  y \xb]  \big)< -\btheta_t^\top \bmu \big]\notag\\
   &\leq \eta + \exp\bigg( -c \frac{(\btheta_t^\top \bmu)^2}{\|\btheta_t\|_2^2} \bigg),
\end{align}
where the last inequality holds by applying a Hoeffding-type concentration inequality (Theorem \ref{lemma:vershynin5.10}) with $t=(\btheta_t^\top \bmu)^2$. This bound in \eqref{eq:th2} enables the application of Lemma \ref{lemma:lastlemma} which characterizes how the direction of $\btheta_t$ aligns with $\bmu$ during training. 
% Now by Lemma \ref{lemma:lastlemma} we obtain
By direct calculation, we have
\begin{align*}
   &\PP_{(\xb, y)\sim \cD}[f_{\btheta_t}(\xb) \neq y]\\
   &\leq \eta + \exp\Bigg( -c \bigg(   \frac{\big(\frac{\|\bmu\|_2^2 }{4} - \epsilon\|\bmu\|_q  \big)}{  (\sqrt{c_0} + \epsilon)\sqrt{d} } - \frac{c_3\|\bmu\|_2\log n}{\log t}\bigg)^2 \Bigg).
\end{align*}
This concludes the proof.
\end{proof}

% \subsection{Proof of Theorem \ref{theorem:popadvrisk}}
\begin{proof}[Proof of Theorem \ref{theorem:popadvrisk}]
Similar to the proof of Theorem~\ref{theorem:poprisk}, we start by calculating an upper bound of the population risk based on the formulation of the label noise. By the definition of the adversarial risk, we have
% note that the adversarial risk can be written as follows.
\begin{align}\label{eq:th3}
    &\PP_{(\xb, y)\sim \cD}\big[ \exists \xb' \in \cB^p_{\epsilon}(\xb) \ \text{s.t.} \ f_{\btheta_t}(\xb') \neq y \big]\notag\\
    &= \PP_{(\xb, y)\sim \cD}[\exists \xb' \in \cB^p_{\epsilon}(\xb) \ \text{s.t.} \ y \cdot\btheta_t^\top \xb' < 0]\notag\\
    &\leq \eta + \PP_{(\xb, y)\sim \cD}[\exists \xb' \in \cB^p_{\epsilon}(\xb) \ \text{s.t.} \ y \cdot\btheta_t^\top \xb' < 0, y=\tilde y] \notag\\
    % &= \eta + \PP_{(\xb, y)\sim \cD}[\exists \xb' \in \cB^p_{\epsilon}(\xb) \ s.t., \ \tilde  y \cdot\btheta_t^\top \xb' < 0]\notag\\
    &= \eta + \PP_{(\xb, y)\sim \cD}\Big[\min_{\ub \in \cB^p_{\epsilon}(\zero)} \tilde  y \cdot\btheta_t^\top (\xb+\ub) < 0\Big]\notag\\
    &=\eta + \PP_{(\xb, y)\sim \cD}\Big[ \tilde  y \cdot\btheta_t^\top \xb - \epsilon\|\btheta_t\|_q< 0\Big],
\end{align}
where the inequality holds in the same way as in \eqref{eq:th1}.
Since $\tilde y$ is the clean label for $\xb$, $\tilde y \xb$ follows the same distribution as $\bxi + \bmu$ and $\EE[\tilde y \cdot \btheta_t^\top \xb] = \btheta_t^\top \bmu$.
Therefore, \eqref{eq:th3} can be further written as
\begin{align}\label{eq:th4}
   &\PP_{(\xb, y)\sim \cD}\big[ \exists \xb' \in \cB^p_{\epsilon}(\xb) \ \text{s.t.} \ f_{\btheta_t}(\xb') \neq y \big]\notag\\
   &\leq \eta + \PP_{(\xb, y)\sim \cD}\big[\tilde  y \cdot\btheta_t^\top \xb - \EE[\tilde  y \cdot\btheta_t^\top \xb] < -\btheta_t^\top \bmu + \epsilon\|\btheta_t\|_q\big]\notag\\
   &= \eta + \PP_{(\xb, y)\sim \cD}\big[\btheta_t^\top\big(\tilde  y \xb - \EE[\tilde  y \xb]  \big)< -\btheta_t^\top \bmu + \epsilon\|\btheta_t\|_q\big]\notag\\
   &\leq \eta + \exp\bigg( -c \frac{(\btheta_t^\top \bmu - \epsilon\|\btheta_t\|_q)^2}{\|\btheta_t\|_2^2} \bigg),
%   &\leq \eta + \exp\bigg( -c \frac{(\btheta_t^\top \bmu - \epsilon\sqrt{d}\|\btheta_t\|_2)^2}{\|\btheta_t\|_2^2} \bigg)\notag\\
%   &= \eta + \exp\Bigg( -c \bigg(\frac{\btheta_t^\top \bmu }{\|\btheta_t\|_2} - \epsilon\frac{\|\btheta_t\|_q}{\|\btheta_t\|_2}  \bigg)^2 \Bigg)
\end{align}
where the second inequality holds by applying the Hoeffding-type concentration inequality (Theorem \ref{lemma:vershynin5.10}) with $t=(\btheta_t^\top \bmu- \epsilon\|\btheta_t\|_q)^2$. Based on \eqref{eq:th4} and Lemma~\ref{lemma:lastlemma}, we can further give the bounds of the adversarial risk. We consider the two settings $1 \leq p \leq 2$ and $2 < p < \infty $ separately.

When $1 \leq p \leq 2$, we have $q \geq 2$ and $\|\btheta\|_q \leq \|\btheta\|_2$. In this case, by Lemma \ref{lemma:lastlemma} we obtain
{\footnotesize
\begin{align*}
    &\PP_{(\xb, y)\sim \cD}\big[ \exists \xb' \in \cB^p_{\epsilon}(\xb) \ \text{s.t.} \ f_{\btheta_t}(\xb') \neq y \big]\\
    &\leq \eta + \exp\Bigg( -c \bigg( \frac{ \big(\frac{\|\bmu\|_2^2 }{4} - \epsilon\|\bmu\|_q  \big)}{ (\sqrt{c_0} + \epsilon) \sqrt{d} } - \frac{c_3\|\bmu\|_2\log n}{\log t} -  \epsilon \bigg)^2 \Bigg).
\end{align*}
}
When $p > 2$ and therefore $q < 2$, we have $\|\btheta\|_q \leq d^{1/q -1/2}\|\btheta\|_2$. In this case, by Lemma \ref{lemma:lastlemma} we obtain
{\scriptsize
\begin{align*}
    &\PP_{(\xb, y)\sim \cD}\big[ \exists \xb' \in \cB^p_{\epsilon}(\xb) \ \text{s.t.} \ f_{\btheta_t}(\xb') \neq y \big]\\
    &\leq \eta + \exp\Bigg( -c \bigg( \frac{ \big(\frac{\|\bmu\|_2^2 }{4} - \epsilon\|\bmu\|_q  \big)}{ (\sqrt{c_0} + \epsilon) \sqrt{d} } - \frac{c_3\|\bmu\|_2\log n}{\log t} -  \epsilon d^{\frac{1}{q} -\frac{1}{2}} \bigg)^2 \Bigg).
\end{align*}
}
This concludes the proof.
\end{proof}


\vspace{-5pt}
\section{Experiments}\label{sec:exp}

\begin{figure*}[ht!]
\centering
\subfigure[$\ell_2$ perturbation]{\includegraphics[width=0.245\textwidth]{figures/L2_nat_d.pdf}}
\subfigure[$\ell_2$ perturbation]{\includegraphics[width=0.245\textwidth]{figures/L2_adv_d.pdf}}
\subfigure[$\ell_\infty$ perturbation]{\includegraphics[width=0.245\textwidth]{figures/Linf_nat_d.pdf}}
\subfigure[$\ell_\infty$ perturbation]{\includegraphics[width=0.245\textwidth]{figures/Linf_adv_d.pdf}}
% \setlength{\belowcaptionskip}{-5pt}
% \setlength{\abovecaptionskip}{-1pt}
\caption{Risk and adversarial risk of adversarially trained linear classifiers versus the dimension $d$ under different scalings of $\bmu$. (a)(b) show the results for $\ell_2$ perturbation with $\epsilon=0.1$ and (c)(d) show the results for $\ell_\infty$ perturbation with $\epsilon=0.01$. 
The training error reaches $0$ for all experiments.
}
\label{fig:risk-vs-dimension}
\end{figure*}
In this section, we experimentally study the behavior of adversarially trained classifiers in the over-parameterized regime on both synthetic and real data.

\subsection{Synthetic Data Experiments}
 We generate $50$ training samples and $2000$ test samples and set the label noise ratio $\eta = 0.1$ for all experiments. Each clean sample $(\tilde\xb, \tilde y)$ is drawn from a Gaussian mixture model such that $\tilde y \sim \text{Unif}(\{\pm 1\})$ and $\tilde\xb = \tilde y\bmu + \bxi$ where $\bxi \in \RR^d$ and $\xi_1, \xi_2, \ldots, \xi_d$ are i.i.d.\ standard Gaussian variables and $\bmu$ simply shares the same direction as an all-one vector but has different magnitudes. This aligns with our model assumptions in Section \ref{sec:settings}. For the adversarial training algorithm, we directly follow Algorithm \ref{alg:adv} except using a more practical Xavier normal initialization \citep{glorot2010understanding}, i.e., sampling $\btheta_0$ i.i.d.\ from $\cN(0, 1/\sqrt{d})$. We set the learning rate $\alpha_t = 0.001$ and the total number of iterations $T=1000$ for all experiments.
All results are obtained by averaging over $10$ independent runs (both data sampling and training).

In the first set of experiments, we verify our main conclusions in this paper that benign overfitting can occur in adversarial training. Figure \ref{fig:risk-vs-dimension} illustrates the risk and adversarial risk of adversarially trained linear classifiers versus the dimension $d$ under different scalings of $\bmu$ for both $\ell_2$-norm and $\ell_\infty$-norm perturbations. We can observe that when $\|\bmu\|_2 = d^{0.2}$, the (adversarial) risk starts to increase as the dimension $d$ increases after an initial dive for both $\ell_2$-norm and $\ell_\infty$-norm perturbations. While for cases where $\|\bmu\|_2 = d^{0.3}$ and $\|\bmu\|_2 = d^{0.4}$, we can observe that the (adversarial) risk decreases steadily to the optimal risk $\eta$ as the dimension $d$ increases. This result backs up our theory in Section \ref{sec:theorem} that the optimal risk is achievable when $\|\bmu\|_2 = \Theta(d^{r})$ and $r \in (1/4, 1/2]$. Note that the training error reaches 0 for all settings in Figure \ref{fig:risk-vs-dimension}.

% \setlength{\textfloatsep}{12pt}
% \begin{figure*}[ht!]
% \centering
% \subfigure[$\ell_2$ perturbation]{\includegraphics[width=0.32\textwidth]{figures/L2_nat_d.pdf}}
% \subfigure[$\ell_2$ perturbation]{\includegraphics[width=0.32\textwidth]{figures/L2_adv_d.pdf}}
% \subfigure[$\ell_\infty$ perturbation]{\includegraphics[width=0.32\textwidth]{figures/Linf_nat_d.pdf}}
% \subfigure[$\ell_\infty$ perturbation]{\includegraphics[width=0.32\textwidth]{figures/Linf_adv_d.pdf}}
% \subfigure[$\ell_2$ perturbation]{\includegraphics[width=0.325\textwidth]{figures/L2_adv_risk_d200n50mu3.pdf}}
% \subfigure[$\ell_\infty$ perturbation]{\includegraphics[width=0.325\textwidth]{figures/Linf_adv_risk_d200n50mu3.pdf}}
% % \setlength{\belowcaptionskip}{-5pt}
% % \setlength{\abovecaptionskip}{-1pt}
% \caption{(a-d) Risk and adversarial risk of adversarially trained linear classifiers versus the dimension $d$ under different scalings of $\bmu$. (a)(b) show the results for $\ell_2$ perturbation with $\epsilon=0.1$ and (c)(d) show the results for $\ell_\infty$ perturbation with $\epsilon=0.01$. 
% % The label noise level is set as $\eta=0.1$ and the training set size $n=50$. 
% (e-f) Adversarial risk of adversarially trained linear classifiers versus the training iterations $t$ for different $\epsilon$ with $d=200$ and $\|\bmu\|_2 = d^{0.3}$. 
% The training error reaches $0$ for all experiments.
% }
% \label{fig:risk-vs-dimension}
% \end{figure*}



\begin{figure}[ht!]
\centering
\subfigure[$\ell_2$ perturbation]{\includegraphics[width=0.238\textwidth]{figures/L2_adv_risk_d200n50mu3.pdf}}
\subfigure[$\ell_\infty$ perturbation]{\includegraphics[width=0.238\textwidth]{figures/Linf_adv_risk_d200n50mu3.pdf}}
% \setlength{\belowcaptionskip}{-5pt}
% \setlength{\abovecaptionskip}{-1pt}
\caption{Adversarial risk of adversarially trained linear classifiers versus the training iterations $t$ for different $\epsilon$ with $d=200$ and $\|\bmu\|_2 = d^{0.3}$. 
The training error reaches $0$ for all experiments.
}
\label{fig:risk-vs-iter}
\end{figure}
 
% \begin{figure}[t!]
% \centering
% \subfigure[$\ell_2$ perturbation]{\includegraphics[width=0.42\textwidth]{Adv/NeurIPS2021-PopAdvRisk/figures/L2_nat_d.pdf}}
% \subfigure[$\ell_2$ perturbation]{\includegraphics[width=0.42\textwidth]{Adv/NeurIPS2021-PopAdvRisk/figures/L2_adv_d.pdf}}
% \subfigure[$\ell_\infty$ perturbation]{\includegraphics[width=0.42\textwidth]{Adv/NeurIPS2021-PopAdvRisk/figures/Linf_nat_d.pdf}}
% \subfigure[$\ell_\infty$ perturbation]{\includegraphics[width=0.42\textwidth]{Adv/NeurIPS2021-PopAdvRisk/figures/Linf_adv_d.pdf}}
% \setlength{\belowcaptionskip}{-10pt}
% % \setlength{\abovecaptionskip}{-2pt}
% \caption{Risk and adversarial risk of adversarially trained linear classifiers versus the dimension $d$ under different scalings of $\bmu$. (a)(b) show the results for $\ell_2$ perturbation with $\epsilon=0.1$ and (c)(d) show the results for $\ell_\infty$ perturbation with $\epsilon=0.01$. 
% % The label noise level is set as $\eta=0.1$ and the training set size $n=50$. 
% The training error reaches $0$ for all experiments.
% }
% \label{fig:risk-vs-dimension}
% \end{figure}


% \begin{figure}[t!]
% \centering
% \subfigure[$\ell_2$ perturbation]{\includegraphics[width=0.42\textwidth]{Adv/NeurIPS2021-PopAdvRisk/figures/L2_adv_risk_d200n50mu3.pdf}}
% \subfigure[$\ell_\infty$ perturbation]{\includegraphics[width=0.42\textwidth]{Adv/NeurIPS2021-PopAdvRisk/figures/Linf_adv_risk_d200n50mu3.pdf}}
% \setlength{\belowcaptionskip}{-10pt}
% % \setlength{\abovecaptionskip}{-2pt}
% \caption{Adversarial risk of adversarially trained linear classifiers versus the training iterations $t$ for different $\epsilon$
% % The label noise level is set as $\eta=0.1$, the training set size $n=50$, 
% with $d=200$ and $\|\bmu\|_2 = d^{0.3}$. 
% The training error reaches $0$ for all experiments.
% }
% \label{fig:risk-vs-t}
% \end{figure}


In Figure \ref{fig:risk-vs-iter}, we present the adversarial risk\footnote{Here we omit the plot for standard risk as the curves are essentially overlapping with each other.} of adversarially trained linear classifiers versus the training iterations $t$ with different $\epsilon$ but fixed dimension $d$ and $\|\bmu\|_2$ for both $\ell_2$-norm and $\ell_\infty$-norm perturbations. We can also observe that in general, a larger $\epsilon$ leads to a worse adversarial risk of the adversarially trained classifier. This also backs up our theory in Theorem \ref{theorem:popadvrisk}.

As our ultimate goal is to study the benign overfitting phenomenon in real-world adversarial training settings, we also conducted experiments on 2-layer neural networks with ReLU activation functions. In fact, the results on the 2-layer ReLU network show trends very similar to those of the linear model. Due to the space limit, we display these results in the supplemental materials.
\begin{figure}[ht!]
\centering
\subfigure[2-class GMM filtered data]{\includegraphics[width=0.235\textwidth]{figures/cifar2class_gmm.pdf}}
\subfigure[2-class original data]{\includegraphics[width=0.235\textwidth]{figures/cifar2class_all.pdf}}
\subfigure[10-class GMM filtered data]{\includegraphics[width=0.235\textwidth]{figures/cifar10class_gmm.pdf}}
\subfigure[10-class original data]{\includegraphics[width=0.235\textwidth]{figures/cifar10class_all.pdf}}
% \setlength{\belowcaptionskip}{-5pt}
% \setlength{\abovecaptionskip}{-1pt}
\caption{The learning curves for adversarial training \citep{madry2017towards} on CIFAR-10 data using GMM filtered data and the original data. (a)(b) show the results for 2-class classification (airplane vs automobile) and (c)(d) show the results for 10-class classification. 
}
\label{fig:cifar10}
\end{figure}


\subsection{Real-World Data Verification}\label{sec:experiment_realdata}
\citet{rice2020overfitting} showed that overfitting in adversarial training can lead to worse empirical robustness on empirical image distributions such as CIFAR-10 \citep{krizhevsky2009learning} data. 
We want to ensure that our result does not contradict their results, since they are testing on empirical image distributions while our analysis is based on subGaussian mixture data, which CIFAR-10 data does not satisfy. 

We conduct experiments to show that even for CIFAR-10 data, the overfitting effect can be much less severe (or even benign) on robust classifiers when we first filter the input data by a Gaussian mixture model (GMM). 
Specifically, we craft a new dataset by fitting the original CIFAR-10 data via a Gaussian mixture model. The new dataset will only keep the data points which have high probabilities to follow the Gaussian mixture distribution. We conduct two sets of adversarial training experiments using a ResNet-18 model \citep{he2016identity}: one picking only 2 classes (airplane vs automobile) from CIFAR-10 and the other picking all 10 classes in CIFAR-10. The results are given in Figure~\ref{fig:cifar10}. 
% \begin{wrapfigure}{r}{0.5\textwidth}

From Figure \ref{fig:cifar10}, we can observe that for models trained on GMM filtered data, the overfitting issue is much less severe compared to the model trained on the original data. Specifically, for 2-class experiments, the overfitting is essentially benign for GMM filtered data.
This partially backs up our theoretical results of benign overfitting for adversarial classifiers trained on subGaussian mixture data, and when such data distribution assumption is violated, overfitting can become harmful.
Furthermore, while \citet{rice2020overfitting} only presents the negative result on empirical data distributions, we actually present a positive result that benign overfitting can occur in adversarial training for certain data distributions. We believe that subGaussian mixtures would not be the only distribution that could lead to benign overfitting in robust classifiers, yet our study certainly advances the understanding of overfitting in adversarial settings.


% \end{wrapfigure}
% In Section \ref{sec:exp}, we have shown that benign overfitting indeed occurs in adversarial training when data is generated from a Gaussian mixture model. On the other hand, empirical studies such as \citet{rice2020overfitting} have shown that overfitting hurts generalization for adversarial training on training real-world data such as CIFAR-10 \citep{krizhevsky2009learning}. In this section, we show that these two results do not refute each other as the data assumptions are different. 




% On the other hand, empirical studies such as \citet{rice2020overfitting} have shown that overfitting hurts generalization for adversarial training on training real-world data such as CIFAR-10 \citep{krizhevsky2009learning}. In this section, we show that these two results do not refute each other as the data assumptions are different. 


\vspace{-5pt}

\section{Conclusions and Future Work}\label{sec:con}
In this paper, we show that the benign overfitting phenomenon can also occur in adversarial training. Specifically, we derive the risk bounds of the adversarially trained linear classifiers and show that under moderate $\ell_p$-norm perturbations, they can achieve the near-optimal standard and adversarial risks, despite overfitting the noisy training data. The numerical experimental results also validate our theoretical findings.
%Here we also discuss the potential limitations for our analysis: 
Our current analysis is limited to linear classifiers, while in practice, adversarial training is commonly used with neural networks. %In fact, this linear classifier setting stops us from further showing the strong robustness advantages of adversarial training. Previous studies \citep{madry2017towards,sanyal2021how} have shown that adversarial training requires more complex decision boundaries for establishing strong model robustness. 
We believe our work is the first step towards analyzing benign overfitting in adversarially trained neural networks. 
Yet extending our current analysis to 
adversarially trained neural networks is highly non-trivial and we leave it as future work.


%   , a principled approach to defend against adversarial examples
 
 

 

% In the unusual situation where you want a paper to appear in the
% references without citing it in the main text, use \nocite
% \nocite{langley00}
 % \setlength{\bibsep}{2pt}

% {\small

% \bibliography{uai2023-template}


% \begin{contributions} % will be removed in pdf for initial submission 
% 					  % (without ‘accepted’ option in \documentclass)
%                       % so you can already fill it to test with the
%                       % ‘accepted’ class option
%     We thank the anonymous reviewers for their helpful comments. QG is supported in part by the National Science Foundation CAREER Award 1906169 and IIS-2008981, and the Sloan Research Fellowship. The views and conclusions contained in this paper are those of the authors and should not be interpreted as representing any funding agencies.
% \end{contributions}

\begin{acknowledgements} % will be removed in pdf for initial submission,
						 % (without ‘accepted’ option in \documentclass)
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
    We thank the anonymous reviewers for their helpful comments. QG is supported in part by the National Science Foundation CAREER Award 1906169 and IIS-2008981, and the Sloan Research Fellowship. The views and conclusions contained in this paper are those of the authors and should not be interpreted as representing any funding agencies.
\end{acknowledgements}

% References
\bibliography{uai2023-template}
\end{document}
