\documentclass[accepted]{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
\usepackage{cleveref}


\RequirePackage{amssymb}
\RequirePackage{amsthm}
\RequirePackage{bm} 
\RequirePackage{url}
\usepackage{natbib}
% \usepackage{multirow}
\usepackage{graphicx}
\usepackage{subfigure}
% \usepackage{makecell}
% \usepackage{booktabs}
% \usepackage{array}
% \usepackage{url}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{dsfont}
% \usepackage{enumerate}
\RequirePackage{bm} 
\newcommand\wtilde{\stackrel{\sim}{\smash{\mathcal{W}}\rule{0pt}{1.1ex}}}

%----- bold fonts -----%

\newcommand{\ab}{\mathbf{a}}
\newcommand{\bbb}{\mathbf{b}}
\newcommand{\cbb}{\mathbf{c}}
\newcommand{\db}{\mathbf{d}}
\newcommand{\eb}{\mathbf{e}}
\newcommand{\fb}{\mathbf{f}}
\newcommand{\gb}{\mathbf{g}}
\newcommand{\hb}{\mathbf{h}}
\newcommand{\ib}{\mathbf{i}}
\newcommand{\jb}{\mathbf{j}}
\newcommand{\kb}{\mathbf{k}}
\newcommand{\lb}{\mathbf{l}}
\newcommand{\mb}{\mathbf{m}}
\newcommand{\nbb}{\mathbf{n}}
\newcommand{\ob}{\mathbf{o}}
\newcommand{\pb}{\mathbf{p}}
\newcommand{\qb}{\mathbf{q}}
\newcommand{\rb}{\mathbf{r}}
\newcommand{\sbb}{\mathbf{s}}
\newcommand{\tb}{\mathbf{t}}
\newcommand{\ub}{\mathbf{u}}
\newcommand{\vb}{\mathbf{v}}
\newcommand{\wb}{\mathbf{w}}
\newcommand{\xb}{\mathbf{x}}
\newcommand{\yb}{\mathbf{y}}
\newcommand{\zb}{\mathbf{z}}

\newcommand{\ba}{\bm{a}}
\newcommand{\bb}{\bm{b}}
\newcommand{\bc}{\bm{c}}
\newcommand{\bd}{\bm{d}}
\newcommand{\be}{\bm{e}}
\newcommand{\bbf}{\bm{f}}
\newcommand{\bg}{\bm{g}}
\newcommand{\bh}{\bm{h}}
\newcommand{\bi}{\bm{i}} % bold-italic i via \bm, matching the neighbouring aliases
\newcommand{\bj}{\bm{j}}
\newcommand{\bk}{\bm{k}}
\newcommand{\bl}{\bm{l}}
\newcommand{\bbm}{\bm{m}}
\newcommand{\bn}{\bm{n}}
\newcommand{\bo}{\bm{o}}
\newcommand{\bp}{\bm{p}}
\newcommand{\bq}{\bm{q}}
\newcommand{\br}{\bm{r}}
\newcommand{\bs}{\bm{s}}
\newcommand{\bt}{\bm{t}}
\newcommand{\bu}{\bm{u}}
\newcommand{\bv}{\bm{v}}
\newcommand{\bw}{\bm{w}}
\newcommand{\bx}{\bm{x}}
\newcommand{\by}{\bm{y}}
\newcommand{\bz}{\bm{z}}




\newcommand{\Ab}{\mathbf{A}}
\newcommand{\Bb}{\mathbf{B}}
\newcommand{\Cb}{\mathbf{C}}
\newcommand{\Db}{\mathbf{D}}
\newcommand{\Eb}{\mathbf{E}}
\newcommand{\Fb}{\mathbf{F}}
\newcommand{\Gb}{\mathbf{G}}
\newcommand{\Hb}{\mathbf{H}}
\newcommand{\Ib}{\mathbf{I}}
\newcommand{\Jb}{\mathbf{J}}
\newcommand{\Kb}{\mathbf{K}}
\newcommand{\Lb}{\mathbf{L}}
\newcommand{\Mb}{\mathbf{M}}
\newcommand{\Nb}{\mathbf{N}}
\newcommand{\Ob}{\mathbf{O}}
\newcommand{\Pb}{\mathbf{P}}
\newcommand{\Qb}{\mathbf{Q}}
\newcommand{\Rb}{\mathbf{R}}
\newcommand{\Sbb}{\mathbf{S}}
\newcommand{\Tb}{\mathbf{T}}
\newcommand{\Ub}{\mathbf{U}}
\newcommand{\Vb}{\mathbf{V}}
\newcommand{\Wb}{\mathbf{W}}
\newcommand{\Xb}{\mathbf{X}}
\newcommand{\Yb}{\mathbf{Y}}
\newcommand{\Zb}{\mathbf{Z}}

\newcommand{\bA}{\bm{A}}
\newcommand{\bB}{\bm{B}}
\newcommand{\bC}{\bm{C}}
\newcommand{\bD}{\bm{D}}
\newcommand{\bE}{\bm{E}}
\newcommand{\bF}{\bm{F}}
\newcommand{\bG}{\bm{G}}
\newcommand{\bH}{\bm{H}}
\newcommand{\bI}{\bm{I}}
\newcommand{\bJ}{\bm{J}}
\newcommand{\bK}{\bm{K}}
\newcommand{\bL}{\bm{L}}
\newcommand{\bM}{\bm{M}}
\newcommand{\bN}{\bm{N}}
\newcommand{\bO}{\bm{O}}
\newcommand{\bP}{\bm{P}}
\newcommand{\bQ}{\bm{Q}}
\newcommand{\bR}{\bm{R}}
\newcommand{\bS}{\bm{S}}
\newcommand{\bT}{\bm{T}}
\newcommand{\bU}{\bm{U}}
\newcommand{\bV}{\bm{V}}
\newcommand{\bW}{\bm{W}}
\newcommand{\bX}{\bm{X}}
\newcommand{\bY}{\bm{Y}}
\newcommand{\bZ}{\bm{Z}}


%----- calligraphic fonts -----%

\newcommand{\cA}{\mathcal{A}}
\newcommand{\cB}{\mathcal{B}}
\newcommand{\cC}{\mathcal{C}}
\newcommand{\cD}{\mathcal{D}}
\newcommand{\cE}{\mathcal{E}}
\newcommand{\cF}{\mathcal{F}}
\newcommand{\cG}{\mathcal{G}}
\newcommand{\cH}{\mathcal{H}}
\newcommand{\cI}{\mathcal{I}}
\newcommand{\cJ}{\mathcal{J}}
\newcommand{\cK}{\mathcal{K}}
\newcommand{\cL}{\mathcal{L}}
\newcommand{\cM}{\mathcal{M}}
\newcommand{\cN}{\mathcal{N}}
\newcommand{\cO}{\mathcal{O}}
\newcommand{\cP}{\mathcal{P}}
\newcommand{\cQ}{\mathcal{Q}}
\newcommand{\cR}{\mathcal{R}}
\newcommand{\cS}{{\mathcal{S}}}
\newcommand{\cT}{{\mathcal{T}}}
\newcommand{\cU}{\mathcal{U}}
\newcommand{\cV}{\mathcal{V}}
\newcommand{\cW}{\mathcal{W}}
\newcommand{\cX}{\mathcal{X}}
\newcommand{\cY}{\mathcal{Y}}
\newcommand{\cZ}{\mathcal{Z}}




%----- blackboard bold fonts-----%

\newcommand{\CC}{\mathbb{C}}
\newcommand{\EE}{\mathbb{E}}
\newcommand{\VV}{\mathbb{V}}
\newcommand{\II}{\mathbb{I}}
\newcommand{\KK}{\mathbb{K}}
\newcommand{\LL}{\mathbb{L}}
\newcommand{\MM}{\mathbb{M}}
\newcommand{\NN}{\mathbb{N}}
\newcommand{\PP}{\mathbb{P}}
\newcommand{\QQ}{\mathbb{Q}}
\newcommand{\RR}{\mathbb{R}}
\newcommand{\SSS}{\mathbb{S}}
\newcommand{\ZZ}{\mathbb{Z}}
\newcommand{\XX}{\mathbb{X}}
\newcommand{\YY}{\mathbb{Y}}
\newcommand{\OOmega}{\mathbb{\Omega}}




%----- bold greek fonts -----%

\newcommand{\balpha}{\bm{\alpha}}
\newcommand{\bbeta}{\bm{\beta}}
\newcommand{\bgamma}{\bm{\gamma}}
\newcommand{\bepsilon}{\bm{\epsilon}}
\newcommand{\bvarepsilon}{\bm{\varepsilon}}
\newcommand{\bzeta}{\bm{\zeta}}
\newcommand{\btheta}{\bm{\theta}}
\newcommand{\bvartheta}{\bm{\vartheta}}
\newcommand{\bkappa}{\bm{\kappa}}
\newcommand{\blambda}{\bm{\lambda}}
\newcommand{\bmu}{\bm{\mu}}
\newcommand{\bnu}{\bm{\nu}}
\newcommand{\bxi}{\bm{\xi}}
\newcommand{\bpi}{\bm{\pi}}
\newcommand{\bvarpi}{\bm{\varpi}}
\newcommand{\brho}{\bm{\varrho}}
\newcommand{\bsigma}{\bm{\sigma}}
\newcommand{\bvarsigma}{\bm{\varsigma}}
\newcommand{\btau}{\bm{\tau}}
\newcommand{\bupsilon}{\bm{\upsilon}}
\newcommand{\bphi}{\bm{\phi}}
\newcommand{\bvarphi}{\bm{\varphi}}
\newcommand{\bchi}{\bm{\chi}}
\newcommand{\bpsi}{\bm{\psi}}
\newcommand{\bomega}{\bm{\omega}}

\newcommand{\bGamma}{\bm{\Gamma}}
\newcommand{\bDelta}{\bm{\Delta}}
\newcommand{\bTheta}{\bm{\Theta}}
\newcommand{\bLambda}{\bm{\Lambda}}
\newcommand{\bXi}{\bm{\Xi}}
\newcommand{\bPi}{\bm{\Pi}}
\newcommand{\bSigma}{\bm{\Sigma}}
\newcommand{\bUpsilon}{\bm{\Upsilon}}
\newcommand{\bPhi}{\bm{\Phi}}
\newcommand{\bPsi}{\bm{\Psi}}
\newcommand{\bOmega}{\bm{\Omega}}


%----- Some standard definitions -----%

\newcommand{\argmin}{\mathop{\mathrm{argmin}}}
\newcommand{\argmax}{\mathop{\mathrm{argmax}}}
\newcommand{\minimize}{\mathop{\mathrm{minimize}}}

\newcommand{\sign}{\mathop{\mathrm{sign}}}
\newcommand{\tr}{\mathop{\mathrm{tr}}}

% Named operators. \DeclareMathOperator already typesets the name in the
% upright operator font, so no explicit font switch is needed in the body.
% The starred form places sub/superscripts as limits in display style.
\DeclareMathOperator{\Var}{Var}
\DeclareMathOperator*{\Cor}{Corr}
\DeclareMathOperator*{\Cov}{Cov}
\DeclareMathOperator*{\ind}{\mathds{1}}  % Indicator
\newcommand{\smallfrac}[2]{{\textstyle \frac{#1}{#2}}}  
                                                        
\newcommand*{\zero}{{\bm 0}}
\newcommand*{\one}{{\bm 1}}

% Upright "diag" for use in math mode; the outer braces keep it a single atom.
\newcommand{\diag}{{\mathrm{diag}}}


% Fallback proof environment, installed only when \proof is still undefined
% at this point in the preamble.
% NOTE(review): \BlackBox is not defined anywhere in the visible preamble;
% confirm the class or a package provides it before relying on this fallback.
\ifx\proof\undefined
\newenvironment{proof}{\par\noindent{\bf Proof\ }}{\hfill\BlackBox\\[2mm]}\fi

\ifx\theorem\undefined
\newtheorem{theorem}{Theorem}
\fi
\ifx\example\undefined
\newtheorem{example}[theorem]{Example}
\fi
\ifx\property\undefined
\newtheorem{property}{Property}
\fi
\ifx\lemma\undefined
\newtheorem{lemma}[theorem]{Lemma}
\fi
\ifx\proposition\undefined
\newtheorem{proposition}[theorem]{Proposition}
\fi
\ifx\remark\undefined
\newtheorem{remark}[theorem]{Remark}
\fi
\ifx\corollary\undefined
\newtheorem{corollary}[theorem]{Corollary}
\fi
\ifx\definition\undefined
\newtheorem{definition}[theorem]{Definition}
\fi
\ifx\conjecture\undefined
\newtheorem{conjecture}[theorem]{Conjecture}
\fi
\ifx\fact\undefined
\newtheorem{fact}[theorem]{Fact}
\fi
\ifx\claim\undefined
\newtheorem{claim}[theorem]{Claim}
\fi
\ifx\assumption\undefined
\newtheorem{assumption}[theorem]{Assumption}
\fi

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


%%%%% Norms

% \norm{x}: norm of x, set with the double-bar delimiter \| instead of two
% adjacent single bars (which are typeset as two separate symbols).
\newcommand{\norm}[1]{\|#1\|}
% \bignorm{x}: the same, with fixed \bigg-sized delimiters for tall arguments.
\newcommand{\bignorm}[1]{\bigg\|#1\bigg\|}
\newcommand{\opnorm}[2]{| \! | \! | #1 | \! | \! |_{{#2}}}

%%%%% Dot product
\newcommand{\dotp}[2]{\langle{#1},{#2}\rangle}

%%%%  brackets
\newcommand{\inner}[2]{\left\langle #1,#2 \right\rangle}
\newcommand{\rbr}[1]{\left(#1\right)}
\newcommand{\sbr}[1]{\left[#1\right]}
\newcommand{\cbr}[1]{\left\{#1\right\}}
\newcommand{\nbr}[1]{\left\|#1\right\|}
\newcommand{\abr}[1]{\left|#1\right|}

%%%%%%%%%  Other commands

\newcommand{\mcomment}[1]{\marginpar{\tiny{#1}}}
\newcommand{\fcomment}[1]{\footnote{\tiny{#1}}}
%\newcommand{\overbar}[1]{\mkern 2mu\overline{\mkern-2mu#1\mkern-2mu}\mkern 2mu}
\newcommand{\overbar}[1]{\mkern 1.5mu\overline{\mkern-1.5mu#1\mkern-1.5mu}\mkern 1.5mu}
\newcommand{\ud}{{\mathrm{d}}}


\externaldocument{uai2023-template}









%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Benign Overfitting in Adversarially Robust Linear Classification\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
% \author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2023 paper}{Jane~J.~von~O'L\'opez}{}}
% \author[1]{Harry~Q.~Bovik}
% \author[1,2]{Further~Coauthor}
% \author[3]{Further~Coauthor}
% \author[1]{Further~Coauthor}
% \author[3]{Further~Coauthor}
% \author[3,1]{Further~Coauthor}
% % Add affiliations after the authors
% \affil[1]{%
%     Computer Science Dept.\\
%     Cranberry University\\
%     Pittsburgh, Pennsylvania, USA
% }
% \affil[2]{%
%     Second Affiliation\\
%     Address\\
%     …
% }
% \affil[3]{%
%     Another Affiliation\\
%     Address\\
%     …
%   }


\author[1*]{Jinghui Chen} 
\author[2*]{Yuan Cao}
\author[3]{Quanquan Gu}
% Add affiliations after the authors
\affil[1]{%
  The Pennsylvania State University, \texttt{jzc5917@psu.edu}
}
\affil[2]{%
    The University of Hong Kong, \texttt{yuancao@hku.hk}
}
\affil[3]{%
    University of California, Los Angeles,  \texttt{qgu@cs.ucla.edu}
  }
\affil[*]{{Equal contribution}
  }
  \begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle

% This Supplementary Material should be submitted as a separate file. Please do not append the Supplementary Material to the main paper. 

% Fig. \ref{fig:pitt} and Eq \ref{eq:example} in the main paper can be cross referenced using \texttt{xr}. 

\appendix


\section{Comparison with Dan et al. (2020), Taheri et al. (2020) and Javanmard \& Soltanolkotabi (2020)}
\citet{dan2020sharp} proposed an adversarial signal to noise ratio and studied the excess risk lower/upper bounds for learning Gaussian mixture models. Compared to the setting studied in \citet{dan2020sharp}, our setting covers additional label flipping noises. More importantly, we study an estimator found by gradient descent that overfits the training data, while \citet{dan2020sharp} studied a specific plug-in estimator which does not overfit the training data. Due to these differences, there is a discrepancy in the risk bounds derived in both papers. 
% Note that although the data generation models are very similar, \citet{dan2020sharp} and us are studying different linear classifiers.
% Specifically, we are studying the overfitted linear classifiers on Gaussian data with additional label flipping noises. Note that our setting is heavily overparameterized, meaning that the model overfits the training data. Thus the resulting overfitted classifier under additional label flipping noises will be significantly different from the optimal classifiers (which does not need to overfit those label flipping noises) in \citet{dan2020sharp}. 
% % We believe this is the reason why our adversarial risk bound is slightly different from the result in \citet{dan2020sharp} in non-leading terms.


% \citet{taheri2020asymptotic,javanmard2020precise} studied adversarial learning of linear models on Gaussian mixture data under the setting where the data dimension and the number of training data points have a fixed ratio (i.e., $d / n = O(1)$). 
\citet{taheri2020asymptotic,javanmard2020precise} studied adversarial learning of linear models in the proportional limit setting, i.e., $d / n = O(1)$. In this setting, the data Gram matrix and the sample covariance matrix can be studied based on random matrix theory/Gaussian comparison inequalities/convex Gaussian min-max theorem. In contrast, in our setting where $d > \tilde{O} (n^2 )$, the sample covariance matrix is singular but the $n \times n$ Gram matrix concentrates around its expectation. Therefore, our  setting is different from the proportional limit setting in \citet{taheri2020asymptotic,javanmard2020precise}, and these results are not directly comparable.
% \citet{taheri2020asymptotic,javanmard2020precise} studied adversarial learning of linear models on Gaussian mixture data under the setting where the data dimension and the number of training data points have a fixed ratio. Note that the setting in \citet{taheri2020asymptotic,javanmard2020precise} are actually different from ours. Specifically, \citet{taheri2020asymptotic,javanmard2020precise} studied in the $d / n = O(1)$ setting where the data Gram matrix and the sample covariance matrix can be studied based on random matrix theory/Gaussian comparison inequalities. In comparison, the $d > \tilde{O} (n^2 )$ setting we consider essentially is the setting where the ``data Gram matrix'' concentrates towards its expectation. 


\section{Proof of Key Technical Lemmas}
% \subsection{Proof of Lemma \ref{lemma:loss_theta0_bound}}
% \begin{proof}
% It is easy to observe that $\btheta_1 = \alpha_0\sum_{k=1}^n \zb_k$, we have
% \begin{align*}
%     L(\btheta_1) &= \sum_{k=1}^n \exp(-\btheta_1^\top\zb_k + \epsilon\|\btheta_1\|_q)\\
%     &= \sum_{k=1}^n \exp\bigg(-\alpha_0 \sum_{i=1}^n \zb_i^\top\zb_k + \alpha_0\epsilon \Big\| \sum_{i=1}^n \zb_i\Big\|_q\bigg)\\
%     &\leq \sum_{k=1}^n \exp\bigg(\alpha_0 n \Big( c_0\big(\|\bmu\|_2^2 + \sqrt{d   \log(n/\delta)}\big) + \epsilon\sqrt{c_0}d \Big)\bigg)\\
%     &\leq \sum_{k=1}^n \exp(1/16) \leq 2n,
% \end{align*}
% where the first equality holds due to Lemma \ref{lemma:innerproduct_bound} and the fact that for any $\ub\in \RR^d, \|\ub\|_q \leq \|\ub\|_1 \leq \sqrt{d}\|\ub\|_2$, while the second inequality is by the choice of sufficiently small $\alpha_0$ in the theorem condition.
% \end{proof}

\subsection{Proof of Lemma \ref{lemma:bound_loss}}
\begin{proof} 
We first prove that $L(\btheta_1) \leq 2n$. To show this, we  observe that $\btheta_1 = \alpha_0\sum_{k=1}^n \zb_k$. Therefore
\begin{align*}
    L(\btheta_1) &= \sum_{k=1}^n \exp(-\btheta_1^\top\zb_k + \epsilon\|\btheta_1\|_q)\\
    &= \sum_{k=1}^n \exp\bigg(-\alpha_0 \sum_{i=1}^n \zb_i^\top\zb_k + \alpha_0\epsilon \Big\| \sum_{i=1}^n \zb_i\Big\|_q\bigg)\\
    &\leq \sum_{k=1}^n \exp\bigg(\alpha_0 n \Big( c_0\big(\|\bmu\|_2^2 + \sqrt{d   \log(n/\delta)}\big) + \epsilon\sqrt{c_0}d \Big)\bigg)\\
    &\leq \sum_{k=1}^n \exp(1/16) \leq 2n,
\end{align*}
where the first inequality holds due to Lemma \ref{lemma:innerproduct_bound} and the fact that for any $\ub\in \RR^d, \|\ub\|_q \leq \|\ub\|_1 \leq \sqrt{d}\|\ub\|_2$, while the second inequality is by the choice of sufficiently small $\alpha_0$ and the assumptions that $d \geq Cn \|\bmu\|_2^2$ and $\epsilon\leq R$ for some absolute constants $C$ and $R$.  %in the theorem condition.


The remaining part of Lemma~\ref{lemma:bound_loss} summarizes parts of the results in \citet{Li2020Implicit}. However, the results in \citet{Li2020Implicit} are derived under the setting that $\| \xb_i \|_2 \leq 1$. Therefore,
to prove Lemma~\ref{lemma:bound_loss}, we re-scale our data and model parameters and convert our setting to the setting in \citet{Li2020Implicit}. 

By Lemma~\ref{lemma:innerproduct_bound}, with probability at least $1 - \delta$, $\|\xb_i\|_2^2 \leq c_0 d$ for all $i \in [n]$. We therefore denote $B:=\sqrt{c_0 d}$, and then $\tilde\xb_i := \xb_i / B$ has $\ell_2$-norm less than or equal to one. 
% The proof directly follows the proof of Theorem 3.3 and Theorem 3.4 in \citet{Li2020Implicit}. Note that since they assume $\|\zb_k\|_2 \leq 1$, which is different from our data assumption. In order to apply Theorem 3.3 in \citet{Li2020Implicit}, we need to first re-scale our data and model parameters.
Further denote by $\bbeta_t $ the linear model parameters in \citet{Li2020Implicit}'s algorithm, $\tilde\zb_i = y_i \tilde\xb_i$, $\eta_t$ as their step sizes, $\tilde\epsilon$ as their perturbation strength, and 
\begin{align*}
    \tilde\gamma :=  \max_{\|\btheta\|_2 =1 } \min_{i\in[n]} y_i \btheta^\top \tilde\xb_i
\end{align*}
% $\tilde\gamma := \max_{\|\btheta\|_2 =1 } \min_{i\in[n]} \min_{\tilde\xb_i' \in \cB^p_{\epsilon}(\tilde\xb_i)}  y_i \btheta^\top \xb_i'$ 
as the $\ell_p$ margin. Then the adversarial training update rule in \citet{Li2020Implicit} is 
\begin{align*}
    \bbeta_{t+1} = \bbeta_t -  \frac{\eta_t}{n}  \sum_{i=1}^n \nabla_{\bbeta} \exp (-\bbeta_t^\top \tilde\zb_i + \tilde\epsilon\|\bbeta_t\|_q).
\end{align*}
Note that our update rule is 
\begin{align*}
    \btheta_{t+1} = \btheta_t - \alpha_t \sum_{k=1}^n \nabla_{\btheta} \exp (-\btheta_t^\top \zb_k + \epsilon\|\btheta_t\|_q).
\end{align*}
Now, in order to apply the results in \citet{Li2020Implicit}, we convert our parameters to match their scaling. Note that
\begin{align*}
    \btheta_{t+1} &= \btheta_t - \alpha_t \sum_{k=1}^n \nabla_{\btheta}  \exp (-B\btheta_t^\top \zb_k/B + \epsilon\|B\btheta_t\|_q/B)\\
    &= \btheta_t - \frac{nB\alpha_t}{n} \sum_{k=1}^n \nabla_{(B \btheta)} \exp (-B\btheta_t^\top \zb_k/B + \epsilon\|B\btheta_t\|_q/B).
\end{align*}
Therefore
\begin{align*}
    B\btheta_{t+1} = B\btheta_t - \frac{nB^2\alpha_t}{n} \sum_{k=1}^n \nabla_{(B \btheta)} \exp (-B\btheta_t^\top \zb_k/B + \epsilon\|B\btheta_t\|_q/B).
\end{align*}
It is easy to observe that we can now apply Theorem 3.3 and Theorem 3.4 in \citet{Li2020Implicit} by setting $\bbeta_t = B\btheta_t, \eta_t = nB^2\alpha_t, \tilde\epsilon = \epsilon/B$. Moreover, by $\tilde\xb_i = \xb_i /B$, $\tilde\epsilon = \epsilon/B$ and the definition of $\tilde\gamma$, we have $\tilde\gamma = \bar\gamma / B$. Based on these relations, it is easy to see that under the conditions of Lemma \ref{lemma:bound_loss}, $\tilde\xb_i$, $\eta_t$, $\tilde\epsilon$, $\tilde\gamma$ satisfy the assumptions of Theorems~3.3 and 3.4 in \citet{Li2020Implicit}. 
Now
\eqref{eq:tuo1} is an intermediate result of the proof of Theorem 3.3 in \citet{Li2020Implicit}, and \eqref{eq:tuo2} follows by Theorem 3.4 in \citet{Li2020Implicit}.
% Note that in \citet{Li2020Implicit}, Theorem 3.3 and Theorem 3.4 do not directly applicable to $L_\infty$ norm case due to non-smoothness of $L_1$ norm. The $L_\infty$ norm perturbation analyses are presented in Theorem D.1 and Theorem D.2 by using a smoothed-$L_1$-norm argument. We neglect the details of this part and refer the readers to the appendix in \citet{Li2020Implicit}.
\end{proof}





\subsection{Proof of Lemma \ref{lemma:bound_vt}}
\begin{proof}
We have 
\begin{align*}
    \|\btheta_{t+1}\|_2 &= \bigg\|\sum_{m=0}^t \alpha_m \cdot \nabla  L(\btheta_m)\bigg\|_2\\
    &\leq  \sum_{m=0}^t \alpha_m \|\nabla  L(\btheta_m)\|_2\\
    &\leq \sum_{m=0}^t \alpha_m \bigg\|\sum_{k=1}^n \big(\zb_k - \epsilon\cdot\partial\|\btheta_m\|_q\big)\cdot \exp\big(-\zb_k^\top \btheta_m + \epsilon \|\btheta_m\|_q\big)\bigg\|_2,
\end{align*}
where the first inequality holds by the triangle inequality and the second follows by substituting the expression of $\nabla L(\btheta_m)$.
By Lemma \ref{lemma:partial_norm_bound}, we have 
\begin{align*}
    \|\btheta_{t+1}\|_2  
    &\leq  \sum_{m=0}^t \alpha_m \sum_{k=1}^n (\|\zb_k\|_2 + \epsilon \sqrt{d}) \cdot \exp\big(-\zb_k^\top \btheta_m + \epsilon \|\btheta_m\|_q\big) \\
    &\leq  (\sqrt{c_0} + \epsilon )\sqrt{d} \sum_{m=0}^t \alpha_m \sum_{k=1}^n \exp\big(-\zb_k^\top \btheta_m + \epsilon \|\btheta_m\|_q\big)\\
    &=  (\sqrt{c_0} + \epsilon)\sqrt{d} \sum_{m=0}^t \alpha_m L(\btheta_m),
\end{align*}
where the second inequality is due to Lemma \ref{lemma:innerproduct_bound}. 
\end{proof}


\subsection{Proof of Lemma \ref{lemma:loss_range}}
\begin{proof}
Denote $E_k^t=\exp(- \btheta_t^\top \zb_k)$ and $A_t^{i,j} = E_i^t / E_j^t$. The goal is to show that $\max_{i,j} A_t^{i,j} \leq c_3$ for some constant $c_3= 5c_0^2$. We prove this by induction.  

For the base case ($t=0$), we have $E_k^0 = \exp(0) = 1$. Therefore we have $\max_{i,j} A_0^{i,j} = 1 \leq 5c_0^2$.





% \begin{align*}
%     E_k^0 = \exp(- \btheta_0^\top \zb_k) =  \exp\bigg(- \sum_{i=1}^n \zb_i^\top \zb_k\bigg) = \exp\bigg(- \|\zb_k\|_2^2 -  \sum_{i\neq k} \zb_i^\top \zb_k\bigg).
% \end{align*}
% By Lemma \ref{lemma:innerproduct_bound}, we have
% \begin{align*}
%     &E_1^0 \leq \exp\Big(- d /c_0 + nc_0\big(\|\bmu\|_2^2 + \sqrt{d   \log(n/\delta)}\big) \Big),\\
%     &E_2^0 \geq \exp\Big(- c_0 d - nc_0\big(\|\bmu\|_2^2 + \sqrt{d   \log(n/\delta)}\big) \Big).
% \end{align*}
% Hence we have,
% \begin{align*}
%     A_0 &= \frac{\exp\Big(- d/c_0 + nc_0\big(\|\bmu\|_2^2 + \sqrt{d   \log(n/\delta)}\big) \Big)}{\exp\Big(-  c_0 d  - nc_0\big(\|\bmu\|_2^2 + \sqrt{d   \log(n/\delta)}\big) \Big)}\\
%     &=  \exp\Big((c_0 - 1/c_0) d + 2nc_0\big(\|\bmu\|_2^2 + \sqrt{d   \log(n/\delta)}\big) \Big) .
% \end{align*}


For $t>0$, to simplify the notation, let $E_1^t$ and $E_2^t$ denote the values for the first and second samples, and let $A_t := E_1^t/E_2^t$ denote their ratio. We want to show that $A_{t+1} \leq 5c_0^2$ (note that the same argument applies to any distinct pair of samples, and thus the maximum also satisfies the bound).

Notice that 
\begin{align}\label{eq:at1}
    A_{t+1} &= \frac{\exp(- \btheta_{t+1}^\top \zb_1)}{\exp(- \btheta_{t+1}^\top \zb_2)}  = \frac{\exp(- \btheta_{t}^\top \zb_1)}{\exp(- \btheta_{t}^\top \zb_2)}\cdot\frac{\exp(\alpha_t\nabla L(\btheta_{t})^\top \zb_1)}{\exp(\alpha_t\nabla L(\btheta_{t})^\top \zb_2)} \notag\\
    &= A_t \cdot \frac{\exp(-\alpha_t \sum_{k=1}^n (\zb_k - \epsilon \partial\|\btheta_t\|_q)^\top \zb_1 \cdot \exp(-\btheta_t^\top \zb_k+ \epsilon\|\btheta_t\|_q))}{\exp(-\alpha_t \sum_{k=1}^n (\zb_k - \epsilon \partial\|\btheta_t\|_q)^\top \zb_2 \cdot \exp(-\btheta_t^\top \zb_k + \epsilon\|\btheta_t\|_q))} \notag\\
    &=  A_t \cdot \underbrace{\frac{\exp(-\alpha_t  (\zb_1 - \epsilon \partial\|\btheta_t\|_q)^\top \zb_1 \cdot \exp(-\btheta_t^\top \zb_1+ \epsilon\|\btheta_t\|_q))}{\exp(-\alpha_t (\zb_2 - \epsilon \partial\|\btheta_t\|_q)^\top \zb_2 \cdot \exp(-\btheta_t^\top \zb_2 + \epsilon\|\btheta_t\|_q))}}_{I_1} \notag\\
    &\qquad\cdot \underbrace{\frac{\exp(-\alpha_t \sum_{k\neq 1}^n (\zb_k - \epsilon \partial\|\btheta_t\|_q)^\top \zb_1 \cdot \exp(-\btheta_t^\top \zb_k+ \epsilon\|\btheta_t\|_q))}{\exp(-\alpha_t \sum_{k \neq 2}^n (\zb_k - \epsilon \partial\|\btheta_t\|_q)^\top \zb_2 \cdot \exp(-\btheta_t^\top \zb_k + \epsilon\|\btheta_t\|_q))}}_{I_2}.
\end{align}
For term $I_1$, note that by Lemma \ref{lemma:innerproduct_bound} we have
\begin{align*}
\sqrt{\frac{d}{c_0}} \leq \|\zb_k\|_2 \leq \sqrt{c_0 d}.
\end{align*}
Also since by Lemma \ref{lemma:partial_norm_bound}, we have $\big\|\partial\|\btheta_t\|_q \big\|_p = 1$, 
\begin{align}\label{eq:signproduct}
  |\zb_k^\top \partial\|\btheta_t\|_q| \leq \|\zb_k\|_q \cdot \big\|\partial\|\btheta_t\|_q \big\|_p = \|\zb_k\|_q \leq \|\zb_k\|_1 \leq  \sqrt{d}\|\zb_k\|_2 \leq \sqrt{c_0}d.
\end{align}
Therefore, we have
\begin{align}\label{eq:i1}
    I_1 &\leq \exp\bigg(-\alpha_t  \Big(\frac{d}{c_0} - \epsilon\sqrt{c_0}d \Big)\exp(-\btheta_t^\top \zb_1 + \epsilon\|\btheta_t\|_q) + \alpha_t \Big(c_0 d + \epsilon\sqrt{c_0}d \Big)\exp(-\btheta_t^\top \zb_2 + \epsilon\|\btheta_t\|_q) \bigg) \notag\\
    &= \exp\Bigg(-\alpha_t E_2^t\bigg(  \Big(\frac{d}{c_0} - \epsilon\sqrt{c_0}d \Big) A_t - \Big(c_0 d + \epsilon\sqrt{c_0}d \Big)  \bigg) \exp \big( \epsilon\|\btheta_t\|_q \big) \Bigg).
\end{align}
For term $I_2$, by \eqref{eq:bound2} and \eqref{eq:signproduct} we have 
\begin{align}\label{eq:i2}
    I_2 &\leq  \exp\bigg(\alpha_t  \Big(c_0\big(\|\bmu\|_2^2 + \sqrt{d   \log(n/\delta)}\big) + \epsilon \sqrt{c_0}d \Big) \Big(\sum_{k \neq 1}^n \exp(-\btheta_t^\top \zb_k + \epsilon\|\btheta_t\|_q)  + \sum_{k \neq 2}^n \exp(-\btheta_t^\top \zb_k + \epsilon\|\btheta_t\|_q)  \Big)  \bigg) \notag\\
    &\leq \exp\bigg(2 \alpha_t  L(\btheta_t)  \Big(c_0\big(\|\bmu\|_2^2 + \sqrt{d   \log(n/\delta)}\big) + \epsilon \sqrt{c_0 }d \Big)  \bigg).
\end{align}
Substituting \eqref{eq:i1} and \eqref{eq:i2} into \eqref{eq:at1}, we have
\begin{align}\label{eq:at2}
    A_{t+1} &\leq A_t \cdot \exp\Bigg(-\alpha_t E_2^t\bigg(  \Big(\frac{d}{c_0} - \epsilon\sqrt{c_0}d \Big) A_t - \Big(c_0 d + \epsilon\sqrt{c_0}d \Big)  \bigg) \exp \big( \epsilon\|\btheta_t\|_q \big) \Bigg) \notag\\
    &\qquad \cdot \exp\bigg(2 \alpha_t  L(\btheta_t)  \Big(c_0\big(\|\bmu\|_2^2 + \sqrt{d   \log(n/\delta)}\big) + \epsilon \sqrt{c_0}d \Big)  \bigg).
\end{align}
Let us consider two cases here. If $(d/c_0 - \epsilon\sqrt{c_0}d ) A_t - (c_0 d + \epsilon\sqrt{c_0}d) > c_0 d$, i.e., $A_t > (2c_0 + \epsilon\sqrt{c_0})/(1/c_0 - \epsilon\sqrt{c_0})$, we further have
\begin{align*}
    A_{t+1} &\leq A_t \cdot \exp\Big(-\alpha_t E_2^t c_0 d \exp \big( \epsilon\|\btheta_t\|_q \big) \Big)  \cdot \exp\bigg(2 \alpha_t  L(\btheta_t)  \Big(c_0\big(\|\bmu\|_2^2 + \sqrt{d   \log(n/\delta)}\big) + \epsilon \sqrt{c_0}d \Big)  \bigg) \\
    &\leq A_t \cdot \exp\Big(-\alpha_t E_2^t c_0 d \exp \big( \epsilon\|\btheta_t\|_q \big) \Big)\\
    &\qquad\cdot \exp\bigg(2 \alpha_t n E_2^t \Big(c_0\big(\|\bmu\|_2^2 + \sqrt{d   \log(n/\delta)}\big) + \epsilon \sqrt{c_0}d \Big)\exp \big( \epsilon\|\btheta_t\|_q \big)  \bigg)\\
    &= A_t \cdot \exp\Big(-\alpha_t E_2^t c_0 \big(d - 2n\|\bmu\|_2^2 - 2n\sqrt{d   \log(n/\delta)} - 2n\epsilon \sqrt{c_0} \big) \exp \big( \epsilon\|\btheta_t\|_q \big) \Big) \\
    &\leq A_t,
\end{align*}
where the second inequality is due to the fact that $L(\btheta_t) = \sum_{k=1}^n E_k^t \exp \big( \epsilon\|\btheta_t\|_q \big)$ and $E_2^t = \max_k E_k^t$ while the last inequality holds since $d \geq C\cdot\max\{n\|\bmu\|_2^2, n^2 \log(n/\delta)\}$. 

On the other hand, if $A_t \leq (2c_0 + \epsilon\sqrt{c_0})/(1/c_0 - \epsilon\sqrt{c_0})$, we have
\begin{align*}
    A_{t+1} &\leq A_t \cdot \exp\Big(\alpha_t E_2^t \big(c_0 d + \epsilon\sqrt{c_0}d \big)   \exp \big( \epsilon\|\btheta_t\|_q \big) \Big)  \\
    &\qquad \cdot \exp\bigg(2 \alpha_t  L(\btheta_t)  \Big(c_0\big(\|\bmu\|_2^2 + \sqrt{d   \log(n/\delta)}\big) + \epsilon \sqrt{c_0}d \Big)  \bigg)\\
    &\leq A_t \cdot \exp\Big(\alpha_t L(\btheta_t)  \big(c_0 d + \epsilon\sqrt{c_0}d \big)    \Big)  \cdot \exp\bigg(2 \alpha_t  L(\btheta_t)  \Big(c_0\big(\|\bmu\|_2^2 + \sqrt{d   \log(n/\delta)}\big) + \epsilon \sqrt{c_0}d \Big)  \bigg)\\
    &\leq A_t \cdot \exp\bigg(2\alpha_t  n \Big(c_0\big(2\|\bmu\|_2^2 + 2\sqrt{d   \log(n/\delta)} + d\big) + 3\epsilon \sqrt{c_0}d \Big)  \bigg)\\
    &\leq (2c_0 + \epsilon\sqrt{c_0})/(1/c_0 - \epsilon\sqrt{c_0}) \cdot \exp(1/8) \\
    &\leq 5c_0^2,
\end{align*}
where the first inequality is due to the fact that $A_t > 0$, the third inequality holds by Lemma \ref{lemma:bound_loss}, the fourth inequality is because $\alpha_t \leq 1/(c_0Cnd)$ and $d \geq C\cdot\max\{n\|\bmu\|_2^2, n^2 \log(n/\delta)\}$ and the last inequality is because $\epsilon < C'$ and $C'$ can be chosen such that $C' \leq 1/(2c_0^{1.5})$ and we have $ 1/c_0 - \epsilon\sqrt{c_0} > 1/(2c_0)$.

This concludes the proof.
\end{proof}
 

\subsection{Proof of Lemma \ref{lemma:lastlemma}}
\begin{proof}
Note that 
\begin{align}\label{eq:tmp}
    \bmu^\top \btheta_{t+1} &= \bmu^\top \Big(\btheta_t + \alpha_t \sum_{k=1}^n \big(\zb_k - \epsilon \partial\|\btheta_t\|_q\big) \exp(-\btheta_t^\top \zb_k + \epsilon \|\btheta_t\|_q) \Big) \notag\\
    &= \bmu^\top \btheta_t - \alpha_t \epsilon \cdot \bmu^\top  \partial\|\btheta_t\|_q \cdot L(\btheta_t) + \alpha_t \sum_{k=1}^n \big(\bmu^\top \zb_k \big) \exp(-\btheta_t^\top \zb_k + \epsilon \|\btheta_t\|_q ) \notag\\
    &\geq \bmu^\top \btheta_t - \alpha_t \epsilon \|\bmu\|_q \cdot L(\btheta_t) + \alpha_t \sum_{k \in \cC} \big(\bmu^\top \zb_k \big) \exp(-\btheta_t^\top \zb_k + \epsilon \|\btheta_t\|_q) \notag\\
    &\qquad+ \alpha_t \sum_{k \in \cN} \big(\bmu^\top \zb_k \big) \exp(-\btheta_t^\top \zb_k + \epsilon \|\btheta_t\|_q),
\end{align}
where the inequality holds in the same way as in \eqref{eq:signproduct}. By Lemma \ref{lemma:innerproduct_bound} (\eqref{eq:bound3} and \eqref{eq:bound4}), we further bound \eqref{eq:tmp} by
\begin{align}\label{eq:tmp1}
    \bmu^\top \btheta_{t+1} 
    &\geq \bmu^\top \btheta_t - \alpha_t \epsilon \|\bmu\|_q \cdot L(\btheta_t) + \frac{\alpha_t}{2} \sum_{k \in \cC} \|\bmu\|_2^2 \exp(-\btheta_t^\top \zb_k + \epsilon \|\btheta_t\|_q) \notag\\
    &\qquad- \frac{3\alpha_t}{2} \sum_{k \in \cN} \|\bmu\|_2^2 \exp(-\btheta_t^\top \zb_k + \epsilon \|\btheta_t\|_q ) \notag\\
    &= \bmu^\top \btheta_t - \alpha_t \epsilon \|\bmu\|_q \cdot L(\btheta_t) + \frac{\alpha_t}{2}  \|\bmu\|_2^2 L(\btheta_t) - 2\alpha_t\|\bmu\|_2^2 \sum_{k \in \cN}  \exp(-\btheta_t^\top \zb_k + \epsilon \|\btheta_t\|_q).
\end{align}
Note that we have
\begin{align*}
    \sum_{k \in \cN}  \exp(-\btheta_t^\top \zb_k + \epsilon \|\btheta_t\|_q) &= \sum_{k \in \cN}  \exp(-\btheta_t^\top \zb_k ) \cdot\exp(\epsilon \|\btheta_t\|_q) \\
    &\leq c_3(\eta + c_1)n \cdot 
    \Big(\max_k E_k^t \Big)\cdot\exp(\epsilon \|\btheta_t\|_q)\\
    &\leq  c_3(\eta + c_1) L(\btheta_t)\\
    &\leq \frac{1}{8} L(\btheta_t),
\end{align*}
where the first inequality is due to Lemma \ref{lemma:bound_loss} and the last inequality is because $\eta < 1/C$ and $c_1$ can be chosen arbitrarily small given sufficiently large $C$.
Therefore, \eqref{eq:tmp1} can be further written as
\begin{align}\label{eq:tmp2}
    \bmu^\top \btheta_{t+1} 
    &\geq \bmu^\top \btheta_t - \alpha_t \epsilon \|\bmu\|_q \cdot L(\btheta_t) + \frac{\alpha_t}{2}  \|\bmu\|_2^2 L(\btheta_t) - \frac{\alpha_t}{4}\|\bmu\|_2^2  L(\btheta_t)  \notag\\
    &= \bmu^\top \btheta_t + \alpha_t\bigg(\frac{\|\bmu\|_2^2 }{4} - \epsilon\|\bmu\|_q  \bigg)    \cdot L(\btheta_t) \notag\\
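    &\geq  \bigg(\frac{\|\bmu\|_2^2 }{4} - \epsilon\|\bmu\|_q  \bigg)   \cdot \sum_{m=0}^t \alpha_m L(\btheta_m),
\end{align}
where the last inequality follows by applying the same bound recursively to $\bmu^\top \btheta_t, \ldots, \bmu^\top \btheta_1$ and using the fact that $\btheta_0 = \zero$.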
Now we multiply both sides of \eqref{eq:tmp2} by $\|\wb\|_2 / \|\btheta_{t+1}\|_2$ and take $t \to \infty$:
\begin{align*} 
    \lim_{t\to \infty} \frac{\|\wb\|_2 (\bmu^\top \btheta_{t+1})}{\|\btheta_{t+1}\|_2}
    &\geq \lim_{t\to \infty} \bigg(\frac{\|\bmu\|_2^2 }{4} - \epsilon\|\bmu\|_q  \bigg)  \frac{\|\wb\|_2}{\|\btheta_{t+1}\|_2} \cdot \sum_{m=0}^t \alpha_m L(\btheta_m).
\end{align*}
Since $\|\wb\|_2 = 1$ and, by Lemma \ref{lemma:bound_loss}, $\wb = \lim_{t \to \infty} \btheta_t / \| \btheta_t\|_2$, we have
\begin{align*} 
    \bmu^\top \wb
    &\geq  \bigg(\frac{\|\bmu\|_2^2 }{4} - \epsilon\|\bmu\|_q  \bigg) \cdot  \lim_{t\to \infty}    \frac{\sum_{m=0}^t \alpha_m L(\btheta_m)}{\|\btheta_{t+1}\|_2}\\
    &\geq \bigg(\frac{\|\bmu\|_2^2 }{4} - \epsilon\|\bmu\|_q  \bigg)  \frac{1}{  (\sqrt{c_0} + \epsilon)\sqrt{d} },
\end{align*}
where the last inequality is due to Lemma \ref{lemma:bound_vt}.
Since Lemma \ref{lemma:bound_loss} also implies that $\|\btheta_t/\|\btheta_t\|_2 - \wb\|_2 \leq c_3 \log n/ \log t$, we have
\begin{align*} 
    \bmu^\top\wb  &= \bmu^\top\bigg(\wb - \frac{\btheta_{t}}{\|\btheta_{t}\|_2} +\frac{\btheta_{t}}{\|\btheta_{t}\|_2} \bigg) \\
    &\leq \|\bmu\|_2\cdot\bigg\|\wb - \frac{\btheta_{t}}{\|\btheta_{t}\|_2}\bigg\|_2 + \frac{\bmu^\top\btheta_{t}}{\|\btheta_{t}\|_2} \\
    &\leq \frac{c_3\|\bmu\|_2\log n}{\log t} + \frac{\bmu^\top\btheta_{t}}{\|\btheta_{t}\|_2}.
\end{align*}
Therefore, 
\begin{align*} 
    \frac{\bmu^\top\btheta_{t}}{\|\btheta_{t}\|_2} \geq \bmu^\top\wb - \frac{c_3\|\bmu\|_2\log n}{\log t} \geq \bigg(\frac{\|\bmu\|_2^2 }{4} - \epsilon\|\bmu\|_q  \bigg)  \frac{1}{  (\sqrt{c_0} + \epsilon)\sqrt{d} } - \frac{c_3\|\bmu\|_2\log n}{\log t}. \qedhere
\end{align*}
\end{proof}




\section{Auxiliary Lemmas}
\begin{theorem}[Proposition 5.10 in~\citet{vershynin2010introduction}]
\label{lemma:vershynin5.10}
Let $X_1,X_2,\ldots,X_n$ be independent centered sub-Gaussian random variables, and let $K=\max_i\|X_i\|_{\psi_2}$. Then for every $a=(a_1,a_2,\ldots,a_n) \in \RR^n$ and for every $t>0$, we have
\begin{align*}
\PP\bigg(\bigg|\sum_{i=1}^n a_i X_i\bigg|>t\bigg) \le e \cdot \exp\Big(-\frac{Ct^2}{K^2\|a\|_2^2}\Big),
\end{align*}
where $C>0$ is a constant.
\end{theorem} 
 
% \begin{theorem}[Proposition 5.16 in~\citet{vershynin2010introduction}]
% \label{lemma:vershynin5.16}
% Let $X_1,X_2,\ldots,X_n$ be independent centered sub-exponential random variables, and let $K=\max_i\|X_i\|_{\psi_1}$. Then for every $a=(a_1,a_2,\ldots,a_n) \in \RR^n$ and for every $t>0$, we have
% \begin{align*}
% \PP\bigg(\bigg|\sum_{i=1}^n a_i X_i\bigg|>t\bigg) \le 2\exp\Big[-C \min\Big(\frac{t^2}{K^2\|a\|_2^2}, \frac{t}{K\|a\|_\infty}\Big)\Big],
% \end{align*}
% where $C>0$ is a constant.
% \end{theorem}
 
 
\begin{lemma}\label{lemma:partial_norm_bound}
For any $\btheta \in \RR^d$, 
\begin{align*}
&\big\|\partial\|\btheta\|_q \big\|_2 \leq \sqrt{d}, \ \big\|\partial\|\btheta\|_q \big\|_p = 1.
\end{align*}
\end{lemma}
\begin{proof}
Note that we have 
\[(\partial\|\btheta\|_q)_i = \frac{|\theta_i|^{q-1} }{\|\btheta\|_q^{q-1}}\cdot \sign(\theta_i),\]
and since for any vector $\ub \in \RR^d$, $\|\ub\|_q \geq \|\ub\|_\infty, \|\ub\|_2 \leq \sqrt{d}\|\ub\|_\infty$, we have
\begin{align*}
    \big\|\partial\|\btheta\|_q \big\|_2 = \frac{\big\|\btheta^{ \circ (q-1)} \big\|_2}{\| \btheta\|_q^{q-1}} \leq \frac{\sqrt{d} \|\btheta\|_\infty^{q-1}}{\| \btheta\|_q^{q-1}} \leq \sqrt{d},
\end{align*}
where $\circ$ denotes element-wise power. This concludes the first part of the lemma.
For the second part, by the definition of the $\ell_p$ norm and the identity $(q-1)p = q$ (which holds for conjugate exponents $1/p + 1/q = 1$), we have
\begin{align*}
    \big\|\partial\|\btheta\|_q \big\|_p = \frac{\big\|\btheta^{ \circ (q-1)} \big\|_p}{\| \btheta\|_q^{q-1}}  = \frac{1}{\| \btheta\|_q^{q-1}}   \Big(\sum_{i=1}^d (|\theta_i|^{q-1})^p \Big)^{1/p} = \frac{1}{\| \btheta\|_q^{q-1}}  \bigg( \Big(\sum_{i=1}^d |\theta_i|^{q} \Big)^{1/q} \bigg)^{q-1} = 1.
\end{align*}
\end{proof}
 
 
\section{Additional Experiments}
In this section, we present additional experiments covering more settings as well as more complex models such as 2-layer neural networks.

\subsection{Adversarially Trained Linear Classifier Under Various Settings}

In Figures~\ref{fig:risk-vs-t-d200-mu4}, \ref{fig:risk-vs-t-d1000-mu3}, and~\ref{fig:risk-vs-t1000-mu4}, we plot the adversarial risk of adversarially trained linear classifiers versus the training iterations $t$ for different perturbation levels $\epsilon$ and various combinations of dimension $d$ and $\|\bmu\|_2$. Specifically, in Figure~\ref{fig:risk-vs-t1000-mu4}, we can observe that with moderate perturbations and sufficient over-parameterization, adversarially trained linear classifiers can achieve near-optimal adversarial risk.


\begin{figure}[t!]
\centering
\subfigure[$\ell_2$ perturbation]{\includegraphics[width=0.45\textwidth]{figures/L2_adv_risk_d200n50mu4.pdf}}
\subfigure[$\ell_\infty$ perturbation]{\includegraphics[width=0.45\textwidth]{figures/Linf_adv_risk_d200n50mu4.pdf}}
\setlength{\belowcaptionskip}{-10pt}
%\setlength{\abovecaptionskip}{-5pt}
\caption{Risk and adversarial risk of adversarially trained linear classifiers versus the training iterations $t$ for different perturbation levels $\epsilon$. The label noise level is set as $\eta=0.1$, the training set size $n=50$, dimension $d=200$ and $\|\bmu\|_2 = d^{0.4}$. The training error reaches $0$ for all experiments.
}
\label{fig:risk-vs-t-d200-mu4}
\end{figure}

\begin{figure}[t!]
\centering
\subfigure[$\ell_2$ perturbation]{\includegraphics[width=0.45\textwidth]{figures/L2_adv_risk_d1000n50mu3.pdf}}
\subfigure[$\ell_\infty$ perturbation]{\includegraphics[width=0.45\textwidth]{figures/Linf_adv_risk_d1000n50mu3.pdf}}
\setlength{\belowcaptionskip}{-10pt}
%\setlength{\abovecaptionskip}{-5pt}
\caption{Risk and adversarial risk of adversarially trained linear classifiers versus the training iterations $t$ for different perturbation levels $\epsilon$. The label noise level is set as $\eta=0.1$, the training set size $n=50$, dimension $d=1000$ and $\|\bmu\|_2 = d^{0.3}$. The training error reaches $0$ for all experiments.
}
\label{fig:risk-vs-t-d1000-mu3}
\end{figure}


\begin{figure}[t!]
\centering
\subfigure[$\ell_2$ perturbation]{\includegraphics[width=0.45\textwidth]{figures/L2_adv_risk_d1000n50mu4.pdf}}
\subfigure[$\ell_\infty$ perturbation]{\includegraphics[width=0.45\textwidth]{figures/Linf_adv_risk_d1000n50mu4.pdf}}
\setlength{\belowcaptionskip}{-10pt}
%\setlength{\abovecaptionskip}{-5pt}
\caption{Risk and adversarial risk of adversarially trained linear classifiers versus the training iterations $t$ for different perturbation levels $\epsilon$. The label noise level is set as $\eta=0.1$, the training set size $n=50$, dimension $d=1000$ and $\|\bmu\|_2 = d^{0.4}$. The training error reaches $0$ for all experiments.
}
\label{fig:risk-vs-t1000-mu4}
\end{figure}


\subsection{Adversarially Trained 2-layer Neural Networks}\label{sec:nn}
We have also conducted extra experiments on 2-layer neural networks with ReLU activation functions (one extra fixed-dimension hidden layer). The data generation process is the same as in our linear experiments. Note that in this setting, we no longer have a closed-form solution to the inner maximization problem. Therefore, we follow \citet{madry2017towards} and use $10$-step Projected Gradient Descent to obtain the inner maximizer.



\begin{figure}[t!]
\centering
\subfigure[$\ell_2$ perturbation]{\includegraphics[width=0.42\textwidth]{figures/NN_L2_nat_d.pdf}}
\subfigure[$\ell_2$ perturbation]{\includegraphics[width=0.42\textwidth]{figures/NN_L2_adv_d.pdf}}
\subfigure[$\ell_\infty$ perturbation]{\includegraphics[width=0.42\textwidth]{figures/NN_Linf_nat_d.pdf}}
\subfigure[$\ell_\infty$ perturbation]{\includegraphics[width=0.42\textwidth]{figures/NN_Linf_adv_d.pdf}}
\setlength{\belowcaptionskip}{-10pt}
% \setlength{\abovecaptionskip}{-2pt}
\caption{Risk and adversarial risk of adversarially trained 2-layer ReLU network versus the dimension $d$ under different scalings of $\bmu$. (a)(b) show the results for $\ell_2$ perturbation with $\epsilon=0.1$ and (c)(d) show the results for $\ell_\infty$ perturbation with $\epsilon=0.01$. 
The training error reaches $0$ for all experiments.
}
\label{fig:risk-vs-dimension-nn}
\end{figure}

As can be seen from Figure~\ref{fig:risk-vs-dimension-nn}, the empirical results on 2-layer ReLU networks exhibit trends very similar to those of the linear classifier for both the adversarial risk and the standard risk. This further supports our theoretical conclusions.



\bibliography{uai2023-template}

\end{document}
