% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    %\bibliographystyle{plainnat}
    \bibliographystyle{abbrvnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example


%%%%%%%%%%%%%%%
\usepackage{times}  % DO NOT CHANGE THIS
\usepackage{helvet}  % DO NOT CHANGE THIS
\usepackage{courier}  % DO NOT CHANGE THIS
%\usepackage[hyphens]{url}  % DO NOT CHANGE THIS
%\usepackage{graphicx} % DO NOT CHANGE THIS
%\urlstyle{rm} % DO NOT CHANGE THIS
%\def\UrlFont{\rm}  % DO NOT CHANGE THIS
%\usepackage{natbib}  % DO NOT CHANGE THIS AND DO NOT ADD ANY OPTIONS TO IT
\usepackage{caption} % DO NOT CHANGE THIS AND DO NOT ADD ANY OPTIONS TO IT
%\DeclareCaptionStyle{ruled}{labelfont=normalfont,labelsep=colon,strut=off} % DO NOT CHANGE THIS
%\frenchspacing  % DO NOT CHANGE THIS
%\setlength{\pdfpagewidth}{8.5in}  % DO NOT CHANGE THIS
%\setlength{\pdfpageheight}{11in}  % DO NOT CHANGE THIS
%
% These are recommended to typeset algorithms but not required. See the subsubsection on algorithms. Remove them if you don't have algorithms in your paper.
\usepackage{algorithm}
%%%%%%%%%%% SL's copies%%%%%%%%%%

% Recommended, but optional, packages for figures and better typesetting:
% \usepackage{microtype}
% \usepackage{graphicx}
% \usepackage{subfigure}
% \usepackage{booktabs} % for professional tables
% \usepackage{hyperref}

% Attempt to make hyperref and algorithmic work together better:
%\newcommand{\theHalgorithm}{\arabic{algorithm}}


% if you need to pass options to natbib, use, e.g.:
  %   \PassOptionsToPackage{numbers, compress}{natbib}
% before loading neurips_2021


\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors

\usepackage{multirow}
\usepackage{graphicx}
%\usepackage{subcaption}
\usepackage{pifont}
%%%%%%%%%%%%%%%%%%% customized by SL
\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{hyperref}
\usepackage{amsfonts}
\usepackage{amssymb,color}%,amsthm}
\usepackage{bbm}
\usepackage{mathrsfs}
\usepackage{amsmath}
\usepackage{amssymb,amsthm}
\usepackage{xr}
% Cross-document references via xr: helper macros that import the labels of
% an external document and record the files involved.
% \makeatletter is needed because \@addtofilelist contains `@'.
%
% \addFileDependency{<file>}: writes "(<file>)" to the log/terminal, appends
% <file> to LaTeX's file list, and writes "No file <file>." if it is missing.
% NOTE(review): presumably this exists so build tools (e.g. latexmk) pick up
% the external files as dependencies -- confirm.
\makeatletter
\newcommand*{\addFileDependency}[1]{% argument=file name and extension
  \typeout{(#1)}
  \@addtofilelist{#1}
  \IfFileExists{#1}{}{\typeout{No file #1.}}
}
\makeatother
% \myexternaldocument{<base>}: calls \externaldocument{<base>} and then
% registers <base>.tex and <base>.aux through \addFileDependency.
\newcommand*{\myexternaldocument}[1]{%
    \externaldocument{#1}%
    \addFileDependency{#1.tex}%
    \addFileDependency{#1.aux}%
}
% Import the labels of the supplementary material.
\myexternaldocument{zhang_207-supp}
%%%%% NEW MATH DEFINITIONS %%%%%

\usepackage{amsmath,amsfonts,bm}

% Mark sections of captions for referring to divisions of figures
\newcommand{\figleft}{{\em (Left)}}
\newcommand{\figcenter}{{\em (Center)}}
\newcommand{\figright}{{\em (Right)}}
\newcommand{\figtop}{{\em (Top)}}
\newcommand{\figbottom}{{\em (Bottom)}}
\newcommand{\captiona}{{\em (a)}}
\newcommand{\captionb}{{\em (b)}}
\newcommand{\captionc}{{\em (c)}}
\newcommand{\captiond}{{\em (d)}}

% Highlight a newly defined term
\newcommand{\newterm}[1]{{\bf #1}}


% Figure reference, lower-case.
\def\figref#1{figure~\ref{#1}}
% Figure reference, capital. For start of sentence
\def\Figref#1{Figure~\ref{#1}}
\def\twofigref#1#2{figures \ref{#1} and \ref{#2}}
\def\quadfigref#1#2#3#4{figures \ref{#1}, \ref{#2}, \ref{#3} and \ref{#4}}
% Section reference, lower-case.
\def\secref#1{section~\ref{#1}}
% Section reference, capital.
\def\Secref#1{Section~\ref{#1}}
% Reference to two sections.
\def\twosecrefs#1#2{sections \ref{#1} and \ref{#2}}
% Reference to three sections.
\def\secrefs#1#2#3{sections \ref{#1}, \ref{#2} and \ref{#3}}
% Reference to an equation, in parentheses.
% amsmath (pulled in by mathtools earlier in this preamble) already provides
% \eqref, which keeps the parentheses and the number upright even inside
% italic text such as theorem bodies. \providecommand leaves that definition
% in place and only supplies a fallback if \eqref is not defined.
\providecommand*{\eqref}[1]{\textup{(\ref{#1})}}
% Reference to an equation, upper case
\def\Eqref#1{Equation~\ref{#1}}
% A raw reference to an equation---avoid using if possible
\def\plaineqref#1{\ref{#1}}
% Reference to a chapter, lower-case.
\def\chapref#1{chapter~\ref{#1}}
% Reference to a chapter, upper case.
\def\Chapref#1{Chapter~\ref{#1}}
% Reference to a range of chapters.
% The tie keeps "chapters" and the first number separated and on one line
% (the space was previously missing, giving e.g. "chapters1--3").
\def\rangechapref#1#2{chapters~\ref{#1}--\ref{#2}}
% Reference to an algorithm, lower-case.
\def\algref#1{algorithm~\ref{#1}}
% Reference to an algorithm, upper case.
\def\Algref#1{Algorithm~\ref{#1}}
\def\twoalgref#1#2{algorithms \ref{#1} and \ref{#2}}
\def\Twoalgref#1#2{Algorithms \ref{#1} and \ref{#2}}
% Reference to a part, lower case
\def\partref#1{part~\ref{#1}}
% Reference to a part, upper case
\def\Partref#1{Part~\ref{#1}}
\def\twopartref#1#2{parts \ref{#1} and \ref{#2}}

\def\ceil#1{\lceil #1 \rceil}
\def\floor#1{\lfloor #1 \rfloor}
\def\1{\bm{1}}
\newcommand{\train}{\mathcal{D}}
\newcommand{\valid}{\mathcal{D_{\mathrm{valid}}}}
\newcommand{\test}{\mathcal{D_{\mathrm{test}}}}

\def\eps{{\epsilon}}


% Random variables
\def\reta{{\textnormal{$\eta$}}}
\def\ra{{\textnormal{a}}}
\def\rb{{\textnormal{b}}}
\def\rc{{\textnormal{c}}}
\def\rd{{\textnormal{d}}}
\def\re{{\textnormal{e}}}
\def\rf{{\textnormal{f}}}
\def\rg{{\textnormal{g}}}
\def\rh{{\textnormal{h}}}
\def\ri{{\textnormal{i}}}
\def\rj{{\textnormal{j}}}
\def\rk{{\textnormal{k}}}
\def\rl{{\textnormal{l}}}
% rm is already a command, just don't name any random variables m
\def\rn{{\textnormal{n}}}
\def\ro{{\textnormal{o}}}
\def\rp{{\textnormal{p}}}
\def\rq{{\textnormal{q}}}
\def\rr{{\textnormal{r}}}
\def\rs{{\textnormal{s}}}
\def\rt{{\textnormal{t}}}
\def\ru{{\textnormal{u}}}
\def\rv{{\textnormal{v}}}
\def\rw{{\textnormal{w}}}
\def\rx{{\textnormal{x}}}
\def\ry{{\textnormal{y}}}
\def\rz{{\textnormal{z}}}

% Random vectors
\def\rvepsilon{{\mathbf{\epsilon}}}
\def\rvtheta{{\mathbf{\theta}}}
\def\rva{{\mathbf{a}}}
\def\rvb{{\mathbf{b}}}
\def\rvc{{\mathbf{c}}}
\def\rvd{{\mathbf{d}}}
\def\rve{{\mathbf{e}}}
\def\rvf{{\mathbf{f}}}
\def\rvg{{\mathbf{g}}}
\def\rvh{{\mathbf{h}}}
% Bold i. (This entry was mistakenly named \rvu, which a later line in this
% list redefines as bold u, so \rvi was never defined.)
\def\rvi{{\mathbf{i}}}
\def\rvj{{\mathbf{j}}}
\def\rvk{{\mathbf{k}}}
\def\rvl{{\mathbf{l}}}
\def\rvm{{\mathbf{m}}}
\def\rvn{{\mathbf{n}}}
\def\rvo{{\mathbf{o}}}
\def\rvp{{\mathbf{p}}}
\def\rvq{{\mathbf{q}}}
\def\rvr{{\mathbf{r}}}
\def\rvs{{\mathbf{s}}}
\def\rvt{{\mathbf{t}}}
\def\rvu{{\mathbf{u}}}
\def\rvv{{\mathbf{v}}}
\def\rvw{{\mathbf{w}}}
\def\rvx{{\mathbf{x}}}
\def\rvy{{\mathbf{y}}}
\def\rvz{{\mathbf{z}}}

% Elements of random vectors
\def\erva{{\textnormal{a}}}
\def\ervb{{\textnormal{b}}}
\def\ervc{{\textnormal{c}}}
\def\ervd{{\textnormal{d}}}
\def\erve{{\textnormal{e}}}
\def\ervf{{\textnormal{f}}}
\def\ervg{{\textnormal{g}}}
\def\ervh{{\textnormal{h}}}
\def\ervi{{\textnormal{i}}}
\def\ervj{{\textnormal{j}}}
\def\ervk{{\textnormal{k}}}
\def\ervl{{\textnormal{l}}}
\def\ervm{{\textnormal{m}}}
\def\ervn{{\textnormal{n}}}
\def\ervo{{\textnormal{o}}}
\def\ervp{{\textnormal{p}}}
\def\ervq{{\textnormal{q}}}
\def\ervr{{\textnormal{r}}}
\def\ervs{{\textnormal{s}}}
\def\ervt{{\textnormal{t}}}
\def\ervu{{\textnormal{u}}}
\def\ervv{{\textnormal{v}}}
\def\ervw{{\textnormal{w}}}
\def\ervx{{\textnormal{x}}}
\def\ervy{{\textnormal{y}}}
\def\ervz{{\textnormal{z}}}

% Random matrices
\def\rmA{{\mathbf{A}}}
\def\rmB{{\mathbf{B}}}
\def\rmC{{\mathbf{C}}}
\def\rmD{{\mathbf{D}}}
\def\rmE{{\mathbf{E}}}
\def\rmF{{\mathbf{F}}}
\def\rmG{{\mathbf{G}}}
\def\rmH{{\mathbf{H}}}
\def\rmI{{\mathbf{I}}}
\def\rmJ{{\mathbf{J}}}
\def\rmK{{\mathbf{K}}}
\def\rmL{{\mathbf{L}}}
\def\rmM{{\mathbf{M}}}
\def\rmN{{\mathbf{N}}}
\def\rmO{{\mathbf{O}}}
\def\rmP{{\mathbf{P}}}
\def\rmQ{{\mathbf{Q}}}
\def\rmR{{\mathbf{R}}}
\def\rmS{{\mathbf{S}}}
\def\rmT{{\mathbf{T}}}
\def\rmU{{\mathbf{U}}}
\def\rmV{{\mathbf{V}}}
\def\rmW{{\mathbf{W}}}
\def\rmX{{\mathbf{X}}}
\def\rmY{{\mathbf{Y}}}
\def\rmZ{{\mathbf{Z}}}

% Elements of random matrices
\def\ermA{{\textnormal{A}}}
\def\ermB{{\textnormal{B}}}
\def\ermC{{\textnormal{C}}}
\def\ermD{{\textnormal{D}}}
\def\ermE{{\textnormal{E}}}
\def\ermF{{\textnormal{F}}}
\def\ermG{{\textnormal{G}}}
\def\ermH{{\textnormal{H}}}
\def\ermI{{\textnormal{I}}}
\def\ermJ{{\textnormal{J}}}
\def\ermK{{\textnormal{K}}}
\def\ermL{{\textnormal{L}}}
\def\ermM{{\textnormal{M}}}
\def\ermN{{\textnormal{N}}}
\def\ermO{{\textnormal{O}}}
\def\ermP{{\textnormal{P}}}
\def\ermQ{{\textnormal{Q}}}
\def\ermR{{\textnormal{R}}}
\def\ermS{{\textnormal{S}}}
\def\ermT{{\textnormal{T}}}
\def\ermU{{\textnormal{U}}}
\def\ermV{{\textnormal{V}}}
\def\ermW{{\textnormal{W}}}
\def\ermX{{\textnormal{X}}}
\def\ermY{{\textnormal{Y}}}
\def\ermZ{{\textnormal{Z}}}

% Vectors
\def\vzero{{\bm{0}}}
\def\vone{{\bm{1}}}
\def\vmu{{\bm{\mu}}}
\def\vtheta{{\bm{\theta}}}
\def\va{{\bm{a}}}
\def\vb{{\bm{b}}}
\def\vc{{\bm{c}}}
\def\vd{{\bm{d}}}
\def\ve{{\bm{e}}}
\def\vf{{\bm{f}}}
\def\vg{{\bm{g}}}
\def\vh{{\bm{h}}}
\def\vi{{\bm{i}}}
\def\vj{{\bm{j}}}
\def\vk{{\bm{k}}}
\def\vl{{\bm{l}}}
\def\vm{{\bm{m}}}
\def\vn{{\bm{n}}}
\def\vo{{\bm{o}}}
\def\vp{{\bm{p}}}
\def\vq{{\bm{q}}}
\def\vr{{\bm{r}}}
\def\vs{{\bm{s}}}
\def\vt{{\bm{t}}}
\def\vu{{\bm{u}}}
\def\vv{{\bm{v}}}
\def\vw{{\bm{w}}}
\def\vx{{\bm{x}}}
\def\vy{{\bm{y}}}
\def\vz{{\bm{z}}}

% Elements of vectors
\def\evalpha{{\alpha}}
\def\evbeta{{\beta}}
\def\evepsilon{{\epsilon}}
\def\evlambda{{\lambda}}
\def\evomega{{\omega}}
\def\evmu{{\mu}}
\def\evpsi{{\psi}}
\def\evsigma{{\sigma}}
\def\evtheta{{\theta}}
\def\eva{{a}}
\def\evb{{b}}
\def\evc{{c}}
\def\evd{{d}}
\def\eve{{e}}
\def\evf{{f}}
\def\evg{{g}}
\def\evh{{h}}
\def\evi{{i}}
\def\evj{{j}}
\def\evk{{k}}
\def\evl{{l}}
\def\evm{{m}}
\def\evn{{n}}
\def\evo{{o}}
\def\evp{{p}}
\def\evq{{q}}
\def\evr{{r}}
\def\evs{{s}}
\def\evt{{t}}
\def\evu{{u}}
\def\evv{{v}}
\def\evw{{w}}
\def\evx{{x}}
\def\evy{{y}}
\def\evz{{z}}

% Matrix
\def\mA{{\bm{A}}}
\def\mB{{\bm{B}}}
\def\mC{{\bm{C}}}
\def\mD{{\bm{D}}}
\def\mE{{\bm{E}}}
\def\mF{{\bm{F}}}
\def\mG{{\bm{G}}}
\def\mH{{\bm{H}}}
\def\mI{{\bm{I}}}
\def\mJ{{\bm{J}}}
\def\mK{{\bm{K}}}
\def\mL{{\bm{L}}}
\def\mM{{\bm{M}}}
\def\mN{{\bm{N}}}
\def\mO{{\bm{O}}}
\def\mP{{\bm{P}}}
\def\mQ{{\bm{Q}}}
\def\mR{{\bm{R}}}
\def\mS{{\bm{S}}}
\def\mT{{\bm{T}}}
\def\mU{{\bm{U}}}
\def\mV{{\bm{V}}}
\def\mW{{\bm{W}}}
\def\mX{{\bm{X}}}
\def\mY{{\bm{Y}}}
\def\mZ{{\bm{Z}}}
\def\mBeta{{\bm{\beta}}}
\def\mPhi{{\bm{\Phi}}}
\def\mLambda{{\bm{\Lambda}}}
\def\mSigma{{\bm{\Sigma}}}

% Tensor
\DeclareMathAlphabet{\mathsfit}{\encodingdefault}{\sfdefault}{m}{sl}
\SetMathAlphabet{\mathsfit}{bold}{\encodingdefault}{\sfdefault}{bx}{n}
\newcommand{\tens}[1]{\bm{\mathsfit{#1}}}
\def\tA{{\tens{A}}}
\def\tB{{\tens{B}}}
\def\tC{{\tens{C}}}
\def\tD{{\tens{D}}}
\def\tE{{\tens{E}}}
\def\tF{{\tens{F}}}
\def\tG{{\tens{G}}}
\def\tH{{\tens{H}}}
\def\tI{{\tens{I}}}
\def\tJ{{\tens{J}}}
\def\tK{{\tens{K}}}
\def\tL{{\tens{L}}}
\def\tM{{\tens{M}}}
\def\tN{{\tens{N}}}
\def\tO{{\tens{O}}}
\def\tP{{\tens{P}}}
\def\tQ{{\tens{Q}}}
\def\tR{{\tens{R}}}
\def\tS{{\tens{S}}}
\def\tT{{\tens{T}}}
\def\tU{{\tens{U}}}
\def\tV{{\tens{V}}}
\def\tW{{\tens{W}}}
\def\tX{{\tens{X}}}
\def\tY{{\tens{Y}}}
\def\tZ{{\tens{Z}}}


% Graph
\def\gA{{\mathcal{A}}}
\def\gB{{\mathcal{B}}}
\def\gC{{\mathcal{C}}}
\def\gD{{\mathcal{D}}}
\def\gE{{\mathcal{E}}}
\def\gF{{\mathcal{F}}}
\def\gG{{\mathcal{G}}}
\def\gH{{\mathcal{H}}}
\def\gI{{\mathcal{I}}}
\def\gJ{{\mathcal{J}}}
\def\gK{{\mathcal{K}}}
\def\gL{{\mathcal{L}}}
\def\gM{{\mathcal{M}}}
\def\gN{{\mathcal{N}}}
\def\gO{{\mathcal{O}}}
\def\gP{{\mathcal{P}}}
\def\gQ{{\mathcal{Q}}}
\def\gR{{\mathcal{R}}}
\def\gS{{\mathcal{S}}}
\def\gT{{\mathcal{T}}}
\def\gU{{\mathcal{U}}}
\def\gV{{\mathcal{V}}}
\def\gW{{\mathcal{W}}}
\def\gX{{\mathcal{X}}}
\def\gY{{\mathcal{Y}}}
\def\gZ{{\mathcal{Z}}}

% Sets
\def\sA{{\mathbb{A}}}
\def\sB{{\mathbb{B}}}
\def\sC{{\mathbb{C}}}
\def\sD{{\mathbb{D}}}
% Don't use a set called E, because this would be the same as our symbol
% for expectation.
\def\sF{{\mathbb{F}}}
\def\sG{{\mathbb{G}}}
\def\sH{{\mathbb{H}}}
\def\sI{{\mathbb{I}}}
\def\sJ{{\mathbb{J}}}
\def\sK{{\mathbb{K}}}
\def\sL{{\mathbb{L}}}
\def\sM{{\mathbb{M}}}
\def\sN{{\mathbb{N}}}
\def\sO{{\mathbb{O}}}
\def\sP{{\mathbb{P}}}
\def\sQ{{\mathbb{Q}}}
\def\sR{{\mathbb{R}}}
\def\sS{{\mathbb{S}}}
\def\sT{{\mathbb{T}}}
\def\sU{{\mathbb{U}}}
\def\sV{{\mathbb{V}}}
\def\sW{{\mathbb{W}}}
\def\sX{{\mathbb{X}}}
\def\sY{{\mathbb{Y}}}
\def\sZ{{\mathbb{Z}}}

% Entries of a matrix
\def\emLambda{{\Lambda}}
\def\emA{{A}}
\def\emB{{B}}
\def\emC{{C}}
\def\emD{{D}}
\def\emE{{E}}
\def\emF{{F}}
\def\emG{{G}}
\def\emH{{H}}
\def\emI{{I}}
\def\emJ{{J}}
\def\emK{{K}}
\def\emL{{L}}
\def\emM{{M}}
\def\emN{{N}}
\def\emO{{O}}
\def\emP{{P}}
\def\emQ{{Q}}
\def\emR{{R}}
\def\emS{{S}}
\def\emT{{T}}
\def\emU{{U}}
\def\emV{{V}}
\def\emW{{W}}
\def\emX{{X}}
\def\emY{{Y}}
\def\emZ{{Z}}
\def\emSigma{{\Sigma}}

% entries of a tensor
% Same font as tensor, without \bm wrapper
\newcommand{\etens}[1]{\mathsfit{#1}}
\def\etLambda{{\etens{\Lambda}}}
\def\etA{{\etens{A}}}
\def\etB{{\etens{B}}}
\def\etC{{\etens{C}}}
\def\etD{{\etens{D}}}
\def\etE{{\etens{E}}}
\def\etF{{\etens{F}}}
\def\etG{{\etens{G}}}
\def\etH{{\etens{H}}}
\def\etI{{\etens{I}}}
\def\etJ{{\etens{J}}}
\def\etK{{\etens{K}}}
\def\etL{{\etens{L}}}
\def\etM{{\etens{M}}}
\def\etN{{\etens{N}}}
\def\etO{{\etens{O}}}
\def\etP{{\etens{P}}}
\def\etQ{{\etens{Q}}}
\def\etR{{\etens{R}}}
\def\etS{{\etens{S}}}
\def\etT{{\etens{T}}}
\def\etU{{\etens{U}}}
\def\etV{{\etens{V}}}
\def\etW{{\etens{W}}}
\def\etX{{\etens{X}}}
\def\etY{{\etens{Y}}}
\def\etZ{{\etens{Z}}}

% The true underlying data generating distribution
\newcommand{\pdata}{p_{\rm{data}}}
% The empirical distribution defined by the training set
\newcommand{\ptrain}{\hat{p}_{\rm{data}}}
\newcommand{\Ptrain}{\hat{P}_{\rm{data}}}
% The model distribution
\newcommand{\pmodel}{p_{\rm{model}}}
\newcommand{\Pmodel}{P_{\rm{model}}}
\newcommand{\ptildemodel}{\tilde{p}_{\rm{model}}}
% Stochastic autoencoder distributions
\newcommand{\pencode}{p_{\rm{encoder}}}
\newcommand{\pdecode}{p_{\rm{decoder}}}
\newcommand{\precons}{p_{\rm{reconstruct}}}

\newcommand{\laplace}{\mathrm{Laplace}} % Laplace distribution

\newcommand{\E}{\mathbb{E}}
\newcommand{\Ls}{\mathcal{L}}
\newcommand{\R}{\mathbb{R}}
\newcommand{\emp}{\tilde{p}}
\newcommand{\lr}{\alpha}
\newcommand{\reg}{\lambda}
\newcommand{\rect}{\mathrm{rectifier}}
\newcommand{\softmax}{\mathrm{softmax}}
\newcommand{\sigmoid}{\sigma}
\newcommand{\softplus}{\zeta}
\newcommand{\KL}{D_{\mathrm{KL}}}
\newcommand{\Var}{\mathrm{Var}}
\newcommand{\standarderror}{\mathrm{SE}}
\newcommand{\Cov}{\mathrm{Cov}}
% Wolfram Mathworld says $L^2$ is for function spaces and $\ell^2$ is for vectors
% But then they seem to use $L^2$ for vectors throughout the site, and so does
% wikipedia.
\newcommand{\normlzero}{L^0}
\newcommand{\normlone}{L^1}
\newcommand{\normltwo}{L^2}
\newcommand{\normlp}{L^p}
\newcommand{\normmax}{L^\infty}

\newcommand{\parents}{Pa} % See usage in notation.tex. Chosen to match Daphne's book.

\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}

\DeclareMathOperator{\sign}{sign}
\DeclareMathOperator{\Tr}{Tr}
\let\ab\allowbreak

\usepackage{adjustbox}
\usepackage{lipsum}
\usepackage{wrapfig}
\usepackage{booktabs}
\usepackage{multirow,mathtools} 
\usepackage{algorithm,algpseudocode}
\usepackage{threeparttable}


 % "for each" loop for algpseudocode: \ForEach{<cond>} ... \EndForEach.
 % \algorithmicforeach holds the bold keyword text "for each".
 % The opening line typesets \algorithmicforeach, the argument, then
 % \algorithmicdo; the closing line typesets \algorithmicend followed by
 % \algorithmicforeach.
 \algnewcommand{\algorithmicforeach}{\textbf{for each}}
\algdef{SE}[FOR]{ForEach}{EndForEach}[1]
  {\algorithmicforeach\ #1\ \algorithmicdo}% \ForEach{#1}
  {\algorithmicend\ \algorithmicforeach}% \EndForEach

\usepackage{times}

% Theorem-like environments with an explicitly bold heading name. Each
% environment has its own independent counter.
% \textbf replaces the obsolete two-letter switch \bf, which takes no
% argument (\bf{...} only worked because the braces formed a group).
%\newtheorem{proof}{\textbf{Proof}}
\newtheorem{myprop}{\textbf{Proposition}}
\newtheorem{mycor}{\textbf{Corollary}}
\newtheorem{mythr}{\textbf{Theorem}}
\newtheorem{mylemma}{\textbf{Lemma}}
\newtheorem{myremark}{\textbf{Remark}}
\DeclareMathOperator{\tr}{tr}
\DeclareMathOperator{\card}{card}
\DeclareMathOperator{\cov}{cov}
\DeclareMathOperator{\diag}{diag}
\DeclareMathOperator*{\minimize}{\text{minimize}}
\DeclareMathOperator*{\maximize}{\text{maximize}}
\DeclareMathOperator*{\st}{\text{subject to}}
\DeclareMathAlphabet\mathbfcal{OMS}{cmsy}{b}{n}
\newcommand{\Def}[0]{\mathrel{\mathop:}=}
\newcommand{\Deff}[0]{=\mathrel{\mathop:}}

 % Inline reviewer notes (\SL, \Gaoyuan) and revision highlighting (\revision),
 % all rendered as colored text.
 % `salmon' is not one of xcolor's base color names and xcolor is loaded
 % without options in this preamble, so define it here unless something
 % loaded earlier already provides it; otherwise \Gaoyuan stops with an
 % undefined-color error.
 \providecolor{salmon}{rgb}{0.98,0.50,0.45}
 \newcommand{\SL}[1]{\textcolor{red}{SL: #1}}
 \newcommand{\Gaoyuan}[1]{\textcolor{salmon}{Gaoyuan: #1}}
 \newcommand{\revision}[1]{\textcolor{blue}{#1}}


% \remark: ad-hoc numbered remark. Increments the `remark' counter (declared
% just below with \newcounter), makes its value the text picked up by a
% following \label, and prints "Remark N." in emphasis.
% \makeatletter is required here: without it \@currentlabel is tokenized as
% \@ followed by the letters "currentlabel", so the label text is never set
% and \@ itself is redefined whenever \remark runs.
\makeatletter
\def\remark{\addtocounter{remark}{1}\def\@currentlabel{\theremark}%
\emph{Remark~\theremark}. }
\makeatother
\newcommand{\ubar}[1]{\underaccent{\bar}{#1}}
\newcommand{\overbar}[1]{\mkern 1.5mu\overline{\mkern-1.5mu#1\mkern-1.5mu}\mkern 1.5mu}
\newcounter{remark}
% Calligraphic L. \mathcal replaces the obsolete \cal font switch.
% NOTE(review): this overrides LaTeX's text command \L (the letter L with
% stroke), so e.g. bibliography entries that use {\L} will break -- consider
% renaming.
\def\L{{\mathcal L}}
% Bold 1. TeX reads this as a definition of \b with the delimiter "1", so the
% macro must always be written exactly as \b1.
% NOTE(review): this also overrides the text accent \b (bar-under) -- confirm
% that nothing in the document or bibliography relies on it.
\def\b1{{\boldsymbol 1}}
\def\ba{\mathbf a}
\def\abx{\overbar{\bx}}
\def\aby{\overbar{\by}}
\def\azeta{\overbar{\zeta}}
\def\axi{\overbar{\xi}}
\def\abg{\overbar{g}}
\newcommand{\CT}[1]{\boldsymbol{\mathscr{\MakeUppercase{#1}}}}
\newcommand{\TU}{\CT{U}}
\def\balpha{\boldsymbol{\alpha}}
\def\btheta{\boldsymbol{\theta}}
\def\bdelta{\boldsymbol{\delta}}
\def\bbeta{\boldsymbol{\beta}}
\def\bseta{\boldsymbol{\eta}}
\def\bdelta{\boldsymbol{\delta}}
\def\bH{\boldsymbol{\mathcal{H}}}

\def\hbx{\widehat{\bx}}

\def\hbx{\widehat{\mathbf x}}
\def\hbv{\widehat{\mathbf v}}
\def\byt{\widehat{\mathbf y}}
\def\wde{\widehat{\delta}}
\def\hcA{\widehat{\mathcal A}}
\def\bYt{\widehat{\mathbf Y}}
\def\hphi{\widehat{\phi}}
\def\htheta{\widehat{\theta}}
\def\bgo{\mathbf{g}_t}

\def\tbh{\widetilde{\mathbf h}}
\def\wbx{\widetilde{\mathbf x}}
\def\wx{\widetilde{x}}
\def\wbu{\widetilde{\mathbf u}}
\def\wsigma{\widetilde{\sigma}}
\def\tbw{\widetilde{\mathbf w}}
\def\tby{\widetilde{\mathbf y}}
\def\wbv{\widetilde{\mathbf v}}
\def\wzeta{\widetilde{\zeta}}
\def\tDelta{\widetilde{\Delta}}
\def\tbH{\widetilde{\boldsymbol{\mathcal{H}}}}

% Short names for bold (vector) lowercase letters: \bx -> bold x, etc.
% Exceptions: \bg is a hatted bold g with subscript t, and \bd is d with an
% arrow accent rather than a bold d.
\def\bg{\hat {\mathbf g}_t}
\def\bd{\vec d}
\def\be{\mathbf e}
\def\bx{\mathbf x}
\def\bh{\mathbf h}
% NOTE(review): \bm is the bold-math command of the bm package, which is
% loaded earlier in this preamble and used by the \bm{...}-based macros
% defined there (\1, \vzero ... \vz, \mA ... \mSigma, \tens). After this
% line, \bm{x} expands to "\mathbf m" followed by "{x}", so all of those
% macros print a bold m in front of their symbol. \boldsymbol is not touched
% by this line. Consider renaming this shorthand -- needs a check of every
% use of \bm in the document body first.
\def\bm{\mathbf m}
\def\bn{\mathbf n}
\def\bw{\mathbf w}
\def\by{\mathbf y}
\def\bu{\mathbf u}
\def\bv{\mathbf v}
\def\bp{\mathbf p}
\def\bq{\mathbf q}
\def\bz{\mathbf z}
\def\br{\mathbf r}
\def\bA{\mathbf A}
\def\bB{\mathbf B}
\def\bC{\mathbf C}
\def\bD{\mathbf D}
\def\bE{\mathbf E}
\def\bF{\mathbf F}
\def\bG{\mathbf G}
%\def\bH{\mathbf H}
\def\bI{\mathbf I}
\def\bK{\mathbf K}
\def\bL{\mathbf L}
\def\bM{\mathbf M}
\def\bN{\mathbf N}
\def\bP{\mathbf P}
\def\bQ{\mathbf Q}
\def\bS{\mathbf S}
\def\bT{\mathbf T}
\def\bU{\mathbf U}
\def\bV{\mathbf V}
\def\bW{\mathbf W}
\def\bX{\mathbf X}
\def\bY{\mathbf Y}
\def\bLambda{\mathbf \Lambda}
\def\bOmega{\mathbf \Omega}
\def\bZ{\mathbf Z}
\def\cA{\mathcal A}
\def\cB{\mathcal B}
\def\cC{\mathcal C}



\newcommand{\T}{\scriptscriptstyle T}
\def\adag{\alpha^\dag}
\def\rhomax{\rho_\text{max}}
\def\maximize{\mathop{\text{maximize}}}
\def\minimize{\mathop{\text{minimize}}}
\def\deref#1{Definition~\ref{#1}}
\def\secref#1{Section~\ref{#1}}
\def\leref#1{Lemma~\ref{#1}}
\def\conref#1{Condition~\ref{#1}}
\def\thref#1{Theorem~\ref{#1}}
\def\remref#1{Remark~\ref{#1}}
\def\coref#1{Corollary~\ref{#1}}
\def\figref#1{Figure~\ref{#1}}
\def\figtab#1{Table~\ref{#1}}
\def\algref#1{Algorithm~\ref{#1}}
\def\asref#1{Assumption~\ref{#1}}
\def\bydef{\triangleq}
\newtheorem{lemma}{Lemma}
\newtheorem{condition}{Condition}
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}{Corollary}
\newtheorem{definition}{Definition}
\newtheorem{assumption}{Assumption}


%% This part goes in preamble
% \dummyfig{<text>}: placeholder figure -- a centered frame of fixed size
% (half the text width, 20% of the text height) with <text> centered inside.
% Line ends are commented out so that no stray spaces end up in the output,
% in particular inside the \fbox, where they would pad the frame unevenly.
% \centering is a declaration, so it is written without an argument.
\newcommand{\dummyfig}[1]{%
  \centering
  \fbox{%
    \begin{minipage}[c][0.2\textheight][c]{0.5\textwidth}%
      \centering #1%
    \end{minipage}%
  }%
}

\usepackage{color, colortbl}
\definecolor{Gray}{gray}{0.9}
\definecolor{Orange}{rgb}{1,0.5,0}
%\rowcolor{Gray}

% Per-author inline review notes, each printed in its own color with the
% author's initials as a prefix.
% NavyBlue is only predefined when xcolor is loaded with the dvipsnames
% option, which this preamble does not request; \providecolor defines it
% (with the dvipsnames CMYK values) only if it is still undefined, so \LH no
% longer stops with an undefined-color error.
\providecolor{NavyBlue}{cmyk}{0.94,0.54,0,0}
\newcommand{\PY}[1]{\textcolor{green}{PY: #1}}
\newcommand{\GY}[1]{\textcolor{Orange}{GY: #1}}
\newcommand{\LH}[1]{\textcolor{NavyBlue}{LH: #1}}
\newcommand{\MH}[1]{\textcolor{red}{MH: #1}}
\newcommand{\XC}[1]{\textcolor{magenta}{XC: #1}}

\makeatletter
\newcommand*{\rom}[1]{\expandafter\@slowromancap\romannumeral #1@}
\makeatother
\newcommand{\mycomment}[1]{}

%%%%%%%%% notations %%%%%%%%%%%%%%
\newcommand{\layernum}{h}
\newcommand{\convacc}{\xi}
\newcommand{\layerscale}{\tau}
\newcommand{\modeldim}{d}
\newcommand{\increase}[1]{\textcolor{red}{{#1}}}
\newcommand{\decrease}[1]{\textcolor{blue}{{#1}}}
\newcommand{\high}[1]{\textcolor{purple}{{#1}}}

\newcommand{\DAT}{{\texttt{DAT}}}
\newcommand{\AT}{{\texttt{AT}}}



\title{Distributed Adversarial Training to Robustify Deep Neural Networks at Scale}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors


% Gaoyuan Zhang ( IBM Research ) < Gaoyuan.Zhang@ibm.com> 
% Songtao Lu ( IBM Thomas J. Watson Research Center ) < songtao@ibm.com> 
% Yihua Zhang 
% Xiangyi Chen ( University of Minnesota ) < chen5719@umn.edu> 
% Pin-Yu Chen ( IBM Research ) < pin-yu.chen@ibm.com> 
% Quanfu Fan ( IBM Research ) < qfan@us.ibm.com> 
% Lee Martie ( IBM Research ) < lee.martie@ibm.com> 
% Mingyi Hong ( University of Minnesota ) < mhong@umn.edu> 
% Sijia Liu ( Michigan State University ) < liusiji5@msu.edu> 


\author[1,*]{Gaoyuan Zhang}
\author[1,*]{Songtao Lu}
\author[2]{Yihua Zhang}
\author[3]{Xiangyi Chen}
\author[1]{Pin-Yu Chen}
\author[1]{Quanfu Fan}
\author[1]{Lee Martie}
\author[1]{Lior Horesh}
\author[3]{Mingyi Hong}
\author[1,2]{Sijia Liu}
% Add affiliations after the authors
\affil[1]{%
    IBM Research
    % Yorktown Heights, NY 10598
}
\affil[2]{%
    Michigan State University
    % East Lansing, MI 48824
}
\affil[3]{%
    University of Minnesota
    % Minneapolis, MN 55455
  }
  
\affil[*]{%
    Equal Contribution
  }
  
  \begin{document}
\maketitle

\begin{abstract}
{Current} deep neural networks (DNNs) are vulnerable to adversarial attacks, where adversarial perturbations to the inputs can change or manipulate  classification. %drastically change or direct 
% \PY{(what does drastically direct mean here)} 
% classification.
 To defend against such attacks, an effective and popular approach, known as \textit{adversarial training (AT)}, has been shown to mitigate the {negative} impact of adversarial attacks by virtue of a min-max robust training method. 
 While AT is effective, it remains unclear whether it can be successfully adapted to the distributed learning context.
 The power of distributed optimization over multiple machines 
  enables us to scale up robust training over large models and  datasets. 
  %by exploiting the power of distributed learning.
 %this approach is difficult to scale well to large models on large datasets (e.g., ImageNet) in general. 
 %To tackle this {problem}, 
 Spurred by that, 
 we propose
 %introduce a novel framework called 
 \textit{distributed adversarial training ({DAT})},
 a \textit{large-batch} adversarial training framework implemented over multiple machines. We show that {DAT} is general: it supports training over labeled and unlabeled data,
multiple types of attack generation methods, and gradient compression operations favored for distributed optimization.
%gradient quantization, and 
 %\LEE{removed "as well as" since that can not replace "and"} 
% training over labeled and unlabeled data.
 %In particular,
%  \MH{{DAT} is a large-batch adversarial training framework implemented over multiple machines; %by providing \MH{[enabling?]} large-batch optimization,
%  it supports one-shot and iterative attack generation methods, gradient quantization, as well as training over labeled and unlabeled data.}
 Theoretically, under standard conditions in optimization theory, we establish the convergence rate of {DAT} to first-order stationary points in general non-convex settings. Empirically, we demonstrate that {DAT} either matches or outperforms state-of-the-art robust accuracies and achieves a graceful training
 %\LEE{We need a more technical term for graceful. Can we say 5x (or the min/max?)} 
 speedup (e.g., on ResNet-50 under ImageNet). Code is available at \url{https://github.com/dat-2022/dat}.
\end{abstract}



\section{Introduction}

% \PY{(Some thoughts: Should we use "machine" or "(compute) node"? I feel machine can mean either CPU or GPU or Cluster)}

% \MH{In the abstract, shall we emphasize the key challenges of directly extending the current adversarial training to large problems?}

% \LEE{Comment: Reworked the intro into below paragraphs. Let me know if this works or helps.}

%\LEE{
The rapid increase of research in DNNs and their adoption in practice is, in part, owed to the significant breakthroughs made with DNNs in computer vision \citep{alom2018history}.  
%\LEE{reworded previous sentence}
% The impact is easily visible from applications in facial recognition [], autonomous vehicles [], and image understanding [].
Yet, despite the apparent power of DNNs, there remains a serious weakness in robustness. That is, DNNs can easily be manipulated (by an adversary) to output drastically different classifications, and this can be done in a controlled and directed way.
%That is\Gaoyuan{remove that is?}, by feeding images with imperceptible perturbations to a DNN, an attacker can make the DNN “see” what the attacker wants. 
This process is known as an adversarial attack and is considered one of the major hurdles to using DNNs in security-critical and real-world applications
 \citep{Goodfellow2015explaining,szegedy2013intriguing,carlini2017towards,papernot2016cleverhans,kurakin2016adversarial,xu2019evading}. 
%\LEE{Note: I fixed the tense of the previous sentence.}
%  Adversarial attacks are 
% a major hurdle in using DNNs in security critical and real-world applications.
%}




Methods to train DNNs to be robust against adversarial attacks are now a major focus of research \citep{xu2019adversarial}. However, most of them are far from satisfactory \citep{athalye2018obfuscated}, with the exception of the adversarial training ({AT}) approach \citep{madry2017towards}.
%\LH{PDENet and Stable PDE architectures (Haber and Ruthotto) also showed great promise}
{AT} is a min-max robust training method that minimizes the worst-case training loss at adversarially perturbed examples. {AT} has inspired a wide range of state-of-the-art defenses \citep{zhang2019theoretically,sinha2018certifying,boopathy2020visual,carmon2019unlabeled,shafahi2019adversarial,zhang2019you}, which ultimately resort to min-max optimization. However,
in contrast to standard training, AT is more computationally intensive and is difficult to scale.

   % \begin{figure}[htb]
\paragraph{Motivation and challenges.} 
\textit{First}, although a `fast' version of AT (which we call Fast AT) was developed by \citet{Wong2020Fast}, where an iterative inner maximization solver is replaced by a simplified (single-step) solution, it may suffer from several problems compared to AT: unstable robust learning performance \citep{li2020towards}, over-sensitivity to the learning rate schedule \citep{rice2020overfitting}, and catastrophic forgetting of robustness against strong attacks \citep{andriushchenko2020understanding}. As a result, AT is still the dominant robust training protocol across applications. Spurred by that, we propose DAT, a new approach to speed up AT by allowing the batch size to scale with distributed machines.
\textit{Second}, existing AT-type methods
 are generally built on \textit{centralized} optimization.
 The need for AT in a \textit{distributed} setting arises when centralized robust training becomes infeasible or ineffective.
For example,
%from at least the following two  aspects: 
training data are distributed because they cannot be stored centrally on a single machine due to their size or privacy constraints.
 Alternatively, computing units are distributed because they
%expands the individual capability of data storage or data privacy. 2)  Computing units are often distributed, provided by distributed machines,  
%which enables 
allow large-batch optimization to improve the   scalability of training.
 
% If private data is required for distributed storage or the data batch exceeds the storage capacity of a single machine, then the centralized method \textit{cannot} be applied.
% Thus, it is important to design a   distributed {AT} solution  that can bypass the limitations of centralized {AT}. 
%to the distributed learning regime.

%  \begin{figure}[htb] 
% \vspace*{-0mm}
% \centerline{
% \includegraphics[width=.42\textwidth,height=!]{Figures/fig1.pdf} 
% }
% \vspace*{-3mm}
% \caption{\footnotesize{
% Robust accuracy (RA) and standard test accuracy (TA) of  Fast AT \citep{Wong2020Fast} versus our proposed method DAT-FGSM 
%  versus the scaled batch size 
%  under (ResNet-18, CIFAR-10). Training and evaluation details can be found in Sec.\,\ref{sec: exp}.
%  \SL{Fast AT-ImageNet-vs batch size}
%  % in the setup of 
% %  \GY{1, 1, 3 machines,} respectively.
% % along 
% % with the increasing number of  machines \SL{(xx, xxx, xxx, xxx machines)}.
% %\SL{Update legends: AT, DAT-PGD, DAT-FGSM}
% }}
% \vspace*{-0mm}
% \label{fig: motivating_example}
% %\vspace{-2em}
% \end{figure}

    % \begin{wrapfigure}{r}{36mm}
%     \begin{figure}[htb]
%     \vspace*{-0mm}
% \centerline{
% \hspace*{-5mm}
% \begin{tabular}{c}
% \hspace*{-0.1in}
% \includegraphics[width=.43\textwidth,height=!]{Figures/plotf2.pdf} 
% %& \hspace*{-0.2in}
% %\includegraphics[width=.29\textwidth,height=!]{Figures/DAT_LSGD_ImageNet.pdf}  
% %\\
% %\footnotesize{(a)} &   \footnotesize{(b)}
% \end{tabular}}
% \vspace*{-4mm}
% \caption{\small{{{
% Robust accuracy (RA) and standard test accuracy (TA) of  AT 
%  vs.   scaled batch size 
%  under (ImageNet, ResNet-50) using distributed machines. 
%  %\SL{[use new figure]}
%  %\SL{[@GZ; update or Table, 2 batch size settings]}
% }}
% }}
%   \label{fig: AT_large_batch_ImageNet}
%   \vspace*{-5mm}
% \end{figure}
%\end{wrapfigure}
%\paragraph{Challenges}

   % \begin{wrapfigure}{r}{35mm}
  \begin{figure}[htb]
\centerline{
\hspace*{-1mm}
\begin{tabular}{c}
\hspace*{-5mm}
\includegraphics[width=.33\textwidth,height=!]{Figures/plotf2.pdf} 
%& \hspace*{-0.2in}
%\includegraphics[width=.29\textwidth,height=!]{Figures/DAT_LSGD_ImageNet.pdf}  
%\\
%\footnotesize{(a)} &   \footnotesize{(b)}
\end{tabular}}
% \vspace*{-3mm}
\caption{\small{{{
Robust accuracy (RA) and standard test accuracy (TA) of  AT 
 vs.   scaled batch size 
 under (ImageNet, ResNet-50) using distributed machines. 
 %\SL{[use new figure]}
 %\SL{[@GZ; update or Table, 2 batch size settings]}
}}
}}
  \label{fig: AT_large_batch_ImageNet}
%   \vspace*{-3mm}
\end{figure}
% \end{wrapfigure} 
While designing a distributed solution is important,  doing so effectively   is non-trivial.  Figure\,\ref{fig: AT_large_batch_ImageNet} demonstrates an example:
When scaling batch size with   the number of computing nodes,  the conventional AT method
yields a large performance drop in both robust    and standard accuracies. 
%(e.g., more than $30\%$
%drops in test accuracy  and $10\%$ drops in robust accuracy for ResNet-18 on CIFAR-10 under batch size $6144$). 
% By contrast, our proposed distributed algorithm  maintains accuracies as the batch size increases.  
Thus, the adaptation of AT to distributed learning
%and 
leaves many unanswered questions. 
% {A few works made empirical efforts to scale AT up by simply using multiple computing nodes \citep{xie2019feature,kang2019testing,qin2019adversarial}, they were limited to specific use cases and  lacked a thorough study on when and how distributed learning helps,  either in theory or in practice.}
{In this work,}
%To answer this question, 
we aim to design a principled and theoretically-grounded (large-batch) DAT %\textit{distributed (large-batch) adversarial training} (DAT) 
framework by making full use of the computing capability of multiple data-locality (distributed) machines, and show that DAT expands the capacity of data storage and the computational scalability. Furthermore, since many variants of AT exist, distributed AT requires
 a careful and systematic study of its formulation, methodology, theory, and performance evaluation.

%general distributd AT framework 

% it requires careful studies of distributed AT in formulation, algorithm, theory and experiments, which can cover a broad range of distributed versions of AT variants, such as supervised AT, semi-supervised AT,  FGSM-based AT, and quantization-tolerant AT. This is the first thorough study on distributed robust training.

\mycomment{
First, if the direct solution does not allow for scaling batch size with machines, then it does not speed the process up and leads to a significant amount of communication costs (considering that the number of training iterations is not reduced over a fixed number of epochs). 
Second, without proper design, the {direct} application of a {large batch} size to distributed adversarial training introduces a significant loss in both normal accuracy and adversarial robustness (e.g., more than $10\%$ performance drops for ResNet-18 on CIFAR-10
%for ResNet-50 on ImageNet 
shown by our experiments). Third, the direct approach does not confer a general algorithmic framework, which is needed in order to support different variants of AT, large-batch optimization, and efficient communication. 
%\MH{naive --> direct? also refer readers to xxx for these claims?}
}

\mycomment{Although  DAT is challenging, it can immediately offer two   
% Different from centralized learning, a \textit{distributed}  learning solution could offer
   benefits. First, it  expands the computational scalability by making full use of  multiple machines. For example, it only takes \SL{xxx hours} for training a robust  ResNet-50 on ImageNet, versus \SL{xxx hours} used by AT at single machine with \SL{6} GPUs.  
   Second, it expands the capacity of data storage and allows to   train a DNN over distributedly stored  data. We summarize our contributions as below. 
   %For example, DAT with additional unlabeled data achieves more than $5\%$ improvement over AT on CIFAR-10 with \SL{xx} 
   }

   
\mycomment{
%In this paper, we ask: 
{Taking all factors into consideration, a question that naturally arises is:}
\textit{Can we speed up AT by leveraging distributed learning with full utility of multiple computing nodes (machines), even when each only has access to limited GPU resources?}  
{Although a few works made empirical efforts to scale AT up by simply using multiple computing nodes \citep{xie2019feature,kang2019testing,qin2019adversarial}, they were limited to specific use cases and  lacked a thorough study on when and how distributed learning helps,  either in theory or in practice.}
{By contrast,}
%To answer this question, 
we propose a principled and theoretically-grounded \textit{distributed (large-batch) adversarial training} (DAT) framework by making full use of the computing capability of multiple data-locality (distributed) machines, and show that DAT expands the capacity of data storage and the computational scalability. 
%\SL{For example, it only takes {$16.3$ hours} for training a robust  ResNet-50 on ImageNet, versus {$50$ hours} used by AT on a single machine with {$6$} GPUs.}
We summarize our main contributions as below.
}
% We also show that  DAT
% is able to expand the capacity of data storage and allow training 
% %a DNN
% many variants of AT
% over distributed data.


\mycomment{\paragraph{Contributions} %We summarize our contributions as below. 
\textit{(i)} In this work, we provide a general problem formulation of   DAT, which extends multiple variants of AT in the distributed setting. 
%both supervised and semi-supervised learning settings. 
\textit{(ii)}
We propose a unified   algorithmic framework for DAT, which different from AT,  supports {large-batch} DNN training (without losing performance
    over a {fixed} number of epochs), and  allows the transmission of {compressed} gradients for efficient communication.
    \textit{(iii)}
    We provide the convergence analysis of the proposed DAT algorithm, showing its $O({1}/{\sqrt{T}})$ convergence rate to first-order stationary points, where $T$ is total number of iterations. 
    %\SL{@Songtao} convergence rate to a first-order stationary point.
    %{\color{blue}We can show that when the batch size is large enough, DAT is able to converge to an $\epsilon$ first-order stationary points in a rate of $\mathcal{O}(1/\epsilon^4)$, matching the standard convergence rate of stochastic gradient descent (SGD) for nonconvex problems.}
    \textit{(iv)} 
     We make a  comprehensive empirical study on DAT, showing that   
     its speedup in training large models at large datasets, and 
       matches (and even exceeds) state-of-the-art robust accuracies in different attacking and learning scenarios.  
    %  and  significantly speeds up the training with the aid of  computing capability of distributed machines. For example, 
We show that     DAT on ImageNet with use of $6 \times 6$ (GPUs per machine $\times$ machines) yields $38.45\%$ robust accuracy (comparable to $40.01\%$ from AT)  but only requires \SL{$xxx$} training time, \SL{$xxx$} times faster than AT. \SL{@Gaoyuan, please fill in.}
}

%\vspace{-0.2cm}
\paragraph{Contributions.}
We list our  main contributions   below.


%We summarize our contributions as below. 
{\textbf{\textit{(i)}}} We provide a general algorithmic framework for DAT, which {supports}  multiple (large-batch) distributed variants of AT, e.g., supervised AT and semi-supervised AT. %\underline{\textbf{\textit{(ii)}}}
%And we propose a principled algorithmic framework for DAT, which  supports {large-batch} min-max optimization based robust  DNN training.

%(without losing performance
    %over a {fixed} number of epochs) 
    % and 
    % allows the transmission of {compressed} gradients for efficient communication.
    {\textbf{\textit{(ii)}}}
     In theory, we quantify how descent errors from multiple sources (gradient estimation, quantization, adaptive learning rate, and inner maximization oracle) affect the convergence of DAT. We prove that DAT converges to first-order stationary points in general non-convex settings at a rate of  $O({1}/{\sqrt{T}})$, where $T$ is the total number of iterations. This result matches the standard convergence rate of classic training algorithms, e.g., stochastic gradient descent (SGD), for minimization-only problems.
    
    {\textbf{\textit{(iii)}}} 
    In practice, we make a comprehensive empirical study on DAT, showing its effectiveness in (1) robust training over ImageNet,
     (2) provably robust training by  randomized smoothing, 
     (3) robust training with unlabeled data,   (4) robust pretraining + finetuning, and (5) robust training across different computing and communication configurations.
%      We can show that DAT
% not only speeds up in training large models on large datasets but also matches 
%     %  %(and even exceeds)
%     state-of-the-art robust accuracies in different attacking and learning scenarios. 
    %{For example, DAT on ImageNet with $6 \times 6$ (machines $\times$ GPUs per machine) yields $38.45\%$ robust accuracy (comparable to $40.38\%$ from AT)  but only requires {$16.3$ hours}  training time ($6$ times larger batch size allowed in DAT), exhibiting {3.1} times faster than AT {on a single machine of 6 GPUs}.} 
    %  {We also make a large effort to evaluate the empirical performance of DAT across different computing configurations and learning regimes.}


% We have made a significant effort to evaluate the empirical performance of DAT in 
% defense  
%  against PGD and C\&W attacks,
% scalability
% %of computation and communication 
% across different computing configurations (AllReduce, parameter-server and HPC setups; see  Sec.\,5 \& Appendix\,3.5), 
% advantage of using unlabeled data, and   transferability  of robustness  from DAT pre-training to fine-tuning.
% \citep{madry2017towards}, the first known min-max optimization based defense, has inspired a wide range of other effective defenses. Examples include   input gradient or curvature regularization \citep{ross2018improving,moosavi2019robustness}, trade-off between between robustness and accuracy (TRADES) \citep{zhang2019theoretically}, distributionally robust training \citep{sinha2018certifying}, robust input attribution regularization \citep{chen2019robust,boopathy2020visual}, certifiably robust training \citep{wong2017provable,dvijotham2018training},     semi-supervised robust training \citep{stanforth2019labels,carmon2019unlabeled}, and   robust training with accelerated inner maximization oracles 
%   \citep{shafahi2019adversarial,zhang2019you,Wong2020Fast}.
  
% Some recent works  proposed   \textit{fast but approximate} AT algorithms, such as `free' AT \citep{shafahi2019adversarial},  you only propagate once (YOPO) \citep{zhang2019you}, and fast gradient sign method (FGSM) based AT (FGSM-AT) \citep{Wong2020Fast}. 
% These algorithms achieve speedup in training  by 
% simplifying the inner maximization step of AT, yet  they  are designed for \textit{centralized} training  at single {machine} (computing node) and thus limited to per-node computing resources (e.g., GPUs).  


%\vspace{-0.2cm}
% \begin{itemize}
%      \item We formulate the DAT problem for the first time, which covers both supervised and semi-supervised learning settings. 
%     \item We propose a unified  framework of the DAT algorithm, which different from AT,  supports \textit{large-batch} DNN training (without losing robust accuracy 
%     over a {fixed} number of epochs), and  allows the transmission of \textit{compressed} gradients (with respect to model parameters) to reduce in-network communication overhead.
%     \item We provide the convergence analysis of the proposed DAT algorithm, showing its \SL{@Songtao} convergence rate to a first-order stationary point. 
    
%     {\color{blue}We can show that when the batch size is large enough, DAT is able to converge to an $\epsilon$ first-order stationary points in a rate of $\mathcal{O}(1/\epsilon^4)$, matching the standard convergence rate of stochastic gradient descent (SGD) for nonconvex problems.}
    
%     \item We make a  comprehensive experimental study, showing that      DAT matches and even exceeds state-of-the-art robust accuracies in different attacking and learning scenarios, and  significantly speeds up the training with the aid of  computing capability of distributed machines. For example, DAT on ImageNet at the use of \SL{$xxx \times xxx$} (GPUs per machine $\times$ machines) yields \SL{$xxx\%$} robust accuracy (\SL{$xxx$} smaller or larger than that from AT) within \SL{$xxx$} hours (\SL{$xxx$} times faster than At in single machine). \SL{@Gaoyuan, please fill in.}
% \end{itemize}
%\vspace{-0.2cm}


\section{Related Work}
%\vspace{-0.3cm}
% Spurred by that, we ask:



\paragraph{Training robust classifiers.}
% The lack of robustness of DNNs  has promoted  a rapid expansion  of defenses against adversarial attacks, ranging from heuristic defenses  to robust (min-max) optimization based approaches. However, 
% many heuristic strategies 
%   are   easily bypassed by stronger adversaries due to the presence of obfuscated gradients \citep{athalye2018obfuscated}. By contrast, the min-max optimization-based training methods are generally able to offer  significant gains in robustness. 
  AT \citep{madry2017towards}, the first known min-max optimization-based defense, has inspired a wide range of other effective defenses. Examples include adversarial logit pairing \citep{kannan2018adversarial},   input gradient or curvature regularization \citep{ross2018improving,moosavi2019robustness}, trade-off between  robustness and accuracy (TRADES) \citep{zhang2019theoretically}, distributionally robust training \citep{sinha2018certifying},
  dynamic adversarial training \citep{wang2019convergence},
  robust input attribution regularization \citep{boopathy2020visual}, certifiably robust training \citep{wong2017provable},    and semi-supervised robust training \citep{stanforth2019labels,carmon2019unlabeled}.
  
  In particular,
   some recent works  proposed   \textit{fast but approximate} AT algorithms, such as `free' AT \citep{shafahi2019adversarial},  you only propagate once (YOPO) \citep{zhang2019you}, and fast gradient sign method (FGSM) based AT  \citep{Wong2020Fast}. These algorithms achieve speedup in training  by 
simplifying the inner maximization step of AT, but are designed for centralized model training.  
%   robust training with accelerated inner maximization oracles 
%   \citep{shafahi2019adversarial,zhang2019you,Wong2020Fast}
{Although a few  works made empirical efforts to scale AT up by  using multiple computing nodes \citep{xie2019feature,kang2019testing,qin2019adversarial}, they were limited to specific use cases and  lacked a thorough study on when and how distributed learning helps,  either in theory or in practice.}

%   Although there is vast literature on min-max optimization based robust training, it is designed for centralized model training  and   %[encounters] 
%   {without care about the scalability issue (such as distributed data storage) in AT.} %action it faces} the single-machine computing  limitation.
%   {In \citep{xie2019feature}, although
%   a  distributed version of vanilla  AT was implemented via the large-batch SGD algorithm \citep{goyal2017accurate},  it is significantly different from our proposal. Compared to  \citep{xie2019feature},    we adopt a different distributed training recipe (layer-wise adaptive learning rate method vs. SGD), and  make a more thorough and in-depth study in both theory and practice.  
%   %in-depth theoretical analysis of quantifying the convergence rate of DAT. 
%   %and generalize DAT to  various distributed learning setups.
%   }
  
  % To circumvent such a limitation, DAT is proposed for the first time.
%   \PY{(Should we mention some theoretic work on AT here? Like Quanquan's work)}
% \SL{In the theory session, we should make the difference clearer. I add this work in related work.}

%\vspace{-0.4cm}
\paragraph{Distributed model training.}
Distributed optimization has been found to be effective 
  for the standard training of machine learning models \citep{dean2012large,goyal2017accurate,you2019large,chia20}. 
  In contrast to centralized optimization, distributed learning enables increasing the batch size in proportion to the number of computing nodes/machines. However, it is challenging to train a model via large-batch optimization without incurring   accuracy loss compared to the standard training  with the same number of epochs \citep{krizhevsky2014one,keskar2016large}.
  To tackle this challenge, it was shown in
     \citep{you2017scaling,you2018imagenet,you2019large}  that adaptation of learning rates to   the increase of the batch size is an essential means to boost the performance of large-batch optimization. A layer-wise adaptive learning rate strategy was then proposed  to speed up the training as well as preserve the accuracy. Although these works    have  witnessed several successful applications of distributed learning in training \textit{standard} image classifiers, %they leave the question of {DAT} open to build \textit{robust} DNNs.
     they leave the question of how to build \textit{robust} DNNs with {DAT} open.
     In this paper, 
     we   show that the power of layer-wise adaptive learning rate also applies to DAT.
     Since distributed learning introduces  machine-machine communication overhead,
another line of work
\citep{alistarh2017qsgd,yu2019double, bernstein2018signsgd,wangni2018gradient,stich2018sparsified,
wang2019slowmo} focused on the design of communication-efficient distributed optimization algorithms.
    %  Gradient compression is a commonly-used technique to 
    %  reduce the communication cost, such as
    %  gradient quantization \citep{alistarh2017qsgd,yu2019double, bernstein2018signsgd} and gradient sparsification \citep{wangni2018gradient,stich2018sparsified}. We will show that DAT allows for not only large-batch optimization but also   gradient compression.
    
     The study on distributed learning is extensive, but the problem of distributed min-max optimization is less explored, {with some exceptions \citep{srivastava2011distributed,notarnicola2018duality,tsaknakis2020decentralized,liu2019decentralized,liu_aryan2019decentralized}}. A key difference from our work is that 
     none of the aforementioned literature studied
     the \textit{large-batch min-max optimization} with its  applications to training \textit{robust} DNNs, either theoretically or empirically.
     %but only a few work focused on {distributed min-max} optimization algorithms \citep{srivastava2011distributed,notarnicola2018duality,hanada2017simple,tsaknakis2020decentralized}. A significant difference between our work and theirs 
    %  none of the aforementioned literature studied  the notion of {distributed min-max} optimization algorithm with its  applications to training robust DNNs, neither theoretically nor empirically.
     %The work in \citep{liu2019decentralized,liu_aryan2019decentralized} proposed algorithms   for training Generative Adversarial Nets (GANs).%
     {While there are recently proposed algorithms for training Generative Adversarial Nets (GANs) \citep{liu2019decentralized,liu_aryan2019decentralized}, training robust DNNs against adversarial examples is intrinsically different from GAN training. In particular, training robust DNNs requires inner maximization with respect to each training sample rather than empirical maximization with respect to model parameters.} Such an essential difference leads to different optimization goals, algorithms, convergence analyses and implementations.
     
     \mycomment{ The  work \citep{liu2019decentralized,liu_aryan2019decentralized} proposed algorithms   for training Generative Adversarial Nets (GANs).
     %which also adopt a min-max problem formulation. 
     However, training robust DNNs against adversarial examples is intrinsically different from   GAN training, where the former requires inner maximization with respect to each training data rather than empirical maximization with respect to model parameters. }
     
     %which also adopt a min-max problem formulation. 
     %However, training robust DNNs against adversarial examples is intrinsically different from   GAN training,%
     
     %\SL{@xiangyi@songtao, please check distributed min-max literature and state differences with ours if necessary.}

%\MH{[distributed min-max exisits; but not in the large-batch setting. I think we need to make this clear.]}
     
     %\citep{liu2019decentralized,liu_aryan2019decentralized,tsaknakis2020decentralized,notarnicola2016duality}. 
 %%%% Gradient compression 
%%%% In this paper, DAT
%%%%% distributed min-max

%\SL{[refs]}      
 % Gradient compression techniques have been proposed for the standard distributed training case
  %(i.e., machines). 
  
%   The {unbiased} gradient compression operations often include  a) gradient randomized quantization \citep{alistarh2017qsgd,yu2019double} and b) randomized sparsification \citep{wangni2018gradient,yu2019double}.
%   \citep{bernstein2018signsgd}, and b) top-$k$ sparsification
% \citep{stich2018sparsified}
  
%%% distributed standard training, 
%%%% communication-efficient distributed training 
%%%% distributed gAN
%%% . Another line of work

%\paragraph{Large-batch optimization}


% \quad \quad \textit{Can we make full use of the computing capability of distributed machines to speed up AT?}

%  A {distributed}  learning solution offers two main benefits. First, it enables to expand the computational scalability with the use of   multiple machines, even if each of which only has stringent GPU resources. Second, it enables to jointly train a DNN over distributedly stored (labeled and unlabeled) data.  Recent works \SL{[xxx]} have witnessed many successful applications of distributed learning in training \textit{standard} image classifiers \SL{[refs]}, however, it leaves the question of \textit{distributed adversarial training} open to build robust DNNs. 
%  %keep training data locally stored to each participating machine, and thus, is 
 
%  To the best of our knowledge, it was anopen question whether any convergence rate analysis canbe established for black-box min-max (bi-level) optimiza-tion.
%  has recently been found effective in speeding up standard training \SL{[xxx]}, since it enables to expand the computational scalability with the use of   multiple machines, even if each of which only has stringent GPU resources. Additionally, 
% distributed learning  
%  data-locality, the ability to perform
% joint training while keeping each part of the training data local to each participating device.

% Distributed machine learning—i.e. the training of machine learning models using distributed optimization
% algorithms—has recently enabled many successful applications in research and industry.
% Such methods offer two of the key success factors: 1) computational scalability by leveraging the
% simultaneous computational power of many devices, and 2) data-locality, the ability to perform
% joint training while keeping each part of the training data local to each participating device.

%despite the recent progress in accelerating AT 
%%%% Adversarial robustness and training bottleneck %%%%%%%%%%%%

%%% large-batch standard training: 
%%% scaling DNN training to larger numbers of processors
%%% focus on developing training algorithms that enable increasing the batch size in data-parallel synchronous stochastic gradient descent without losing accuracy over a fixed number of epochs.
%%% LAMB: Bert
%%%% non AT, and non distributed setting 

%%%%% Fast adversarial training: simple inner maximization. rather than make full use of the computing capability of distributed machines even if each of which ...

%%%%%% distributed learning: communication bottleneck rather than large-batch setting, (Multiple approaches have been proposed to reduce the communication overhead in distributed training,)
%%% non min-max (focusing only on standard training) for GAN non-AT,

%%%% decentralized schemes can be as efficient as the centralized approaches;to reduce the amount of data that has to be sent over each communication link in the network.
%%%%%%%%% DECENTRALIZED DEEP LEARNING WITH ARBITRARY COMMUNICATION COMPRESSION
%%%%%%%%%% SLOWMO: IMPROVING COMMUNICATION-EFFICIENT DISTRIBUTED SGD WITH SLOW MOMENTUM



\section{Problem Formulation}
In this section, we first review the standard  setup of    adversarial training (AT)  \citep{madry2017towards}, and then 
%We then motivate the need of  distributed AT (DAT)     and 
propose a  general    min-max    setup for distributed AT (DAT).    

\paragraph{Adversarial training.}
AT \citep{madry2017towards} is a  min-max  optimization method for training   robust ML/DL models against adversarial examples \citep{Goodfellow2015explaining}. 
% namely, crafted inputs with imperceptible perturbations, which make the model misbehaved in terms of erroneous prediction/classification. 
Formally, AT solves the %following min-max optimization
problem 

{
% \small
% \vspace*{-5mm}
{\begin{align}
 \label{eq: adv_train}
    \begin{array}{l}
\displaystyle\minimize_{\boldsymbol{\theta}} ~\mathbb E_{(\mathbf x,  y) \in \mathcal D} \left [   \maximize_{ \| \boldsymbol{\delta} \|_\infty \leq \epsilon }  ~  \ell(\boldsymbol{\theta},  \mathbf x + \boldsymbol{\delta}; y) \right ], 
    \end{array}
\end{align}} }%
where $\boldsymbol \theta \in \mathbb R^{\modeldim}$ denotes the vector of model parameters, $\boldsymbol \delta \in \mathbb R^n$ is the vector of input perturbations within an $\ell_\infty$ ball of the given radius $\epsilon$, namely, $\| \boldsymbol{\delta} \|_\infty \leq \epsilon$, $(\mathbf x, y) \in \mathcal D$ corresponds to the training example $\mathbf x$ with label $y$ in the dataset $\mathcal D$, and $\ell$ represents a pre-defined training loss, e.g., the cross-entropy (CE) loss. The rationale behind problem \eqref{eq: adv_train} is that the model $\boldsymbol \theta$ is robustly trained against the \textit{worst-case}  loss induced by the   adversarially perturbed samples. 
%$\{ \mathbf x + \boldsymbol \delta (\mathbf x)\}_{\mathbf x \in \mathcal D}$, where $\boldsymbol \delta (\mathbf x) \Def \maximize_{ \| \boldsymbol{\delta} \|_\infty \leq \epsilon }  ~  \ell(\boldsymbol{\theta},  \mathbf x + \boldsymbol{\delta}; y)$. 
%denotes the solution to the inner maximization problem of \eqref{eq: adv_train}.
It is  worth noting that the AT  problem \eqref{eq: adv_train}   is \textit{different} from   conventional stochastic min-max optimization problems, e.g.,  GAN training \citep{goodfellow2014generative}. Note that in \eqref{eq: adv_train}, the stochastic sampling corresponding to the expectation over
$(\mathbf x, y) \in \mathcal D$  is conducted \textit{prior to} %\PY{(behind is not clear here. Maybe use "conducted prior to the inner maximization operation")} 
%\SL{rather than prior to}
the inner maximization operation. Such a difference leads to the \textit{sample-specific} adversarial perturbation $\boldsymbol \delta (\mathbf x) \Def \operatorname*{arg\,max}_{ \| \boldsymbol{\delta} \|_\infty \leq \epsilon }  ~  \ell(\boldsymbol{\theta},  \mathbf x + \boldsymbol{\delta}; y)$. 
%rather than the single (universal) perturbation   $\boldsymbol \delta$ against the  dataset $\mathcal D$. 
% As will be evident later, this difference will  make our theoretical analysis for the AT-type min-max problems   more challenging than the analysis for conventional  min-max  problems \SL{[xxx]}.
% \SL{[Is this true? @Songtao @xiangyi. Please also add some refs.]}
 
%of the generic form $\min_{\mathbf u} \max_{\mathbf v} \mathbb E_{\boldsymbol{\xi}} [\ell(\mathbf u, \mathbf v; \boldsymbol \xi)] $ 


\paragraph{Distributed AT (DAT).}
% The need of AT in a \textit{distributed} setting arises when centralized robust training becomes infeasible or ineffective. 
% For example,
% %from at least the following two  aspects: 
% training data are distributed as they cannot centrally be stored at a single machine due to  their size or privacy. 
%  Or computing units are   distributed as they
% %expands the individual capability of data storage or data privacy. 2)  Computing units are often distributed, provided by distributed machines,  
% %which enables 
% allow large-batch optimization to improve the   scalability of training.
%for ease of using large data batch and thus accelerating AT during optimization.
%In this paper, we consider the popular parameter server model of distributed learning \SL{[xxx]}.
%\PY{(Not sure we want to mention private datasets here. Since our experiments only use common dataset)}
 Let us consider a popular parameter-server model of distributed learning \citep{dean2012large}.
Formally, there exist $M$ workers each of which has access to a local dataset $\mathcal D^{(i)}$, and thus 
  $\mathcal D = \bigcup_{i=1}^M \mathcal D^{(i)}$. There also exists a server/master node (e.g., one of the workers could act as the server), which collects   local information (e.g., individual gradients)  from the other workers to  update the model parameters $\boldsymbol\theta$. 
  Spurred by  \eqref{eq: adv_train}, 
DAT   solves  problems of the following   generic form,

 {
%  \small
%  \vspace*{-5mm}
 \begin{align}\label{eq: prob_DAT}
%\hspace*{-3mm}  
\begin{array}{l}
\displaystyle\minimize_{\boldsymbol{\theta}}  ~ \frac{1}{M}\sum_{i=1}^M  f_i(\boldsymbol \theta; \mathcal D^{(i)}), %\text{~~where}\\
\\
f_i \Deff
%\underbrace{
%\left \{ 
 \mathbb E_{(\mathbf x, y) \in \mathcal D^{(i)}} \left [ \lambda \ell(\boldsymbol \theta; \mathbf x, y) + 
 \max_{\| \boldsymbol{\delta} \|_\infty \leq \epsilon} \phi(\boldsymbol \theta, \boldsymbol \delta ; \mathbf x, y)
 \right ] %\right.
%  f_i \Deff
% %\underbrace{
% %\left \{ 
%  \mathbb E_{(\mathbf x, y) \in \mathcal D^{(i)}} \left [ \lambda \ell(\boldsymbol \theta; \mathbf x, y) + 
%  \max_{\| \boldsymbol{\delta} \|_\infty \leq \epsilon} \phi(\boldsymbol \theta, \boldsymbol \delta ; \mathbf x, y)
%  \right ] %\right.
%\\
% \hspace*{15mm} 
 %\left. 
%  + \mathbb E_{(\mathbf x, y) \in \mathcal D^{(i)}} \left [ \max_{\| \boldsymbol{\delta} \|_\infty \leq \epsilon} \phi(\boldsymbol \theta, \boldsymbol \delta ; \mathbf x, y) \right ]
%\right \}
%}_\text{$\Deff f_i(\boldsymbol \theta; \mathcal D^{(i)}) $}
    \end{array}
    %\hspace*{-3mm}
\end{align}}%
where $f_i$ denotes the local cost function at the $i$th worker,  $\phi$ is a robustness regularizer against the input perturbation $\boldsymbol \delta$, and $\lambda \geq 0$ is a regularization parameter that strikes a balance between the training loss  and the worst-case robustness regularization. %\LH{maybe worthwhile explicitly explaining the notation $\sim$ associated with the domain $\mathcal{D}_i$?} . 
In \eqref{eq: prob_DAT}, if $M = 1$, $\mathcal D^{(1)} = \mathcal D$, $\lambda = 0$ and $\phi = \ell$, then the DAT problem reduces to the  AT problem  \eqref{eq: adv_train}.
We cover two categories of  \eqref{eq: prob_DAT}.
\ding{172} DAT with {l}abeled {d}ata: 
In  \eqref{eq: prob_DAT}, we consider %$\lambda = 0$ and 
$\phi(\boldsymbol \theta, \boldsymbol \delta ; \mathbf x, y)  = \ell (\boldsymbol \theta, \mathbf x+ \boldsymbol \delta ;  y) $ with labeled training data $(\mathbf x, y) \in \mathcal D^{(i)}$ for $i \in [M]$. Here $[M]$ denotes the integer set $\{ 1,2,\ldots, M\}$. 
%\LH{I assume that the same framework would also hold for regression, where $M\in \mathbb{R}$ ?}.
\ding{173} DAT with {u}nlabeled {d}ata:
In \eqref{eq: prob_DAT}, 
%we consider %$\lambda > 0$, and
different from DAT with labeled data,  we augment $\mathcal D^{(i)}$ with an unlabeled dataset,
%$\mathcal U^{(i)}$ (namely, $\mathcal U^{(i)} \subseteq \mathcal D^{(i)}$),
and define the robustness regularizer 
  $\phi$ as the pseudo-labeled worst-case CE loss \citep{carmon2019unlabeled} or the TRADES regularizer 
  \citep{stanforth2019labels,zhang2019theoretically}.
  
% {\small 
% \vspace*{-5mm}
% \begin{align}\label{eq: reg_ud}
%         \phi(\boldsymbol \theta, \boldsymbol \delta; \mathbf x) = \mathrm{CE}(\mathbf z(\mathbf x+\boldsymbol \delta ; \boldsymbol\theta),
%         \mathbf z(\mathbf x ; \boldsymbol\theta)
%         ).
%     \end{align}}%
%     Here $\mathbf{z}(\mathbf x; \boldsymbol \theta)$ represents  the  probability distribution   over    class labels predicted by  the  model $\boldsymbol \theta$, and $\mathrm{CE}$ denotes the %Kullback–Leibler divergence 
%     cross-entropy
%     %\LH{I assume that KL does decent job and there is no need to change to symmetric measures as JS?} 
%     function. 
    % between the prediction at the perturbed sample $(\mathbf x + \boldsymbol \delta)$ and that at the original sample $\mathbf x$. 
%\PY{The DAT-UD is missing the robust loss of the labeled data part. It seems incomplete}    
% where the robustness regularizer $\phi$ is  independent of the data label $y$, $\mathbf x$ can be drawn either from $\mathcal D^{(i)}$ or from an additional unlabeled dataset $\mathcal U^{(i)}$ per worker, 
% $\mathbf{z}(\mathbf x; \boldsymbol \theta)$ represents  the  probability distribution   over    class labels predicted by  the learning model $\boldsymbol \theta$, and $\mathrm{KL}$ denotes the Kullback–Leibler divergence function between the prediction at the perturbed sample $(\mathbf x + \boldsymbol \delta)$ and that at the original sample $\mathbf x$. 
% The effectiveness of  \eqref{eq: reg_ud} has recently been demonstrated by \citep{stanforth2019labels,zhang2019theoretically} for training robust models by leveraging unlabeled data in a centralized manner. 
\mycomment{
The   robustness regularizer \eqref{eq: reg_ud} can also be specified in another form \citep{stanforth2019labels,carmon2019unlabeled}, 
$
\phi(\boldsymbol \theta, \boldsymbol \delta; \mathbf x) = \mathrm{KL}(\mathbf z(\mathbf x+\boldsymbol \delta ; \boldsymbol\theta),
        \mathbf z(\mathbf x ; \boldsymbol\theta_{\mathrm{base}})
        )
$, where $\boldsymbol\theta_{\mathrm{base}}$ denotes  a well-trained standard based model to generate  some pseudo-labels of unlabeled  samples, and $\mathrm{KL}$ represents the Kullback–Leibler divergence function.
}
%Note that in \citep{stanforth2019labels}, the unlabeled data  with those pseudo-labels  are  used  in the first term (i.e., the standard training loss term)  of \eqref{eq: prob_DAT}.

% In the rest of the paper, we will develop 
%  the algorithm and the theoretical analysis that are  applicable to   the general formulation of DAT given by  \eqref{eq: prob_DAT}.
 
 
%  \SL{Addressed the following comments in Introduction.}
% {\color{red} Goes back to the key challenges. What do we need to address the key challenges. 
 
%  From the discussion above, we know that existing state-of-the-art algorithm are not yet scalable for training large models because(?): 1) centralized training is infeasible for these models; 2) direct implementation of these methods in a decentralized setting does not provide the desired speedup, becuase: 1) if we use the same mini-batch sample size as centralized method, then the computation of adversarial input might be speedup because of the use of multiple machines, but those are typically simple operatsions, and the constant communication will be a bottle neck (i.e., by using small local sample and constantly transmitting), 2) if fewer communication is used, then we will run into the issue of how to deal with large mini-batch  (?)}
 
 %\SL{Please be more specific here. What do you mean properties?}
 


%\section{A Unified Algorithmic Framework for DAT}



\section{Methodologies}
At first glance, distributed learning seems to be naturally applicable since
 problem \eqref{eq: prob_DAT} is decomposable over multiple workers.
Yet, the actual case is much more complex.
\textbf{First}, in contrast to standard AT, DAT %(Algorithm\,\ref{alg: DAT})
allows for using an $M$ times larger     batch size    to update the model parameters $\boldsymbol \theta$ in \eqref{eq: prob_DAT}.   Thus, given the same number of epochs, DAT takes $M$ times fewer gradient updates  than AT.
Although there exist some large-batch model training techniques for  solving \textit{min-only} problems \citep{you2017large,you2017scaling,you2018imagenet,you2019large,goyal2017accurate,keskar2016large}, it remains unclear if they are effective for DAT due to its   \textit{min-max} optimization nature. 
%This has not been carefully studied both empirically and theoretically.
\textbf{Second}, AT and distributed learning each have their own challenges. In AT, for ease of attack generation, i.e., conducting inner maximization of \eqref{eq: prob_DAT}, the fast gradient sign method (FGSM) was leveraged to improve computational efficiency
\citep{Wong2020Fast}. In distributed learning, 
 gradient compression   \citep{alistarh2017qsgd,yu2019double} was   used for  
reducing communication overhead. Thus, it  also
remains unclear
%important to study
whether these customizations  are adaptable to DAT.  
In a nutshell, the distributed min-max optimization-based robust training algorithm has not been well studied previously, particularly in the use of different types of attack generators (inner maximization oracles), gradient quantization, large batch sizes, and adaptive learning rates. Although each of these techniques has been studied separately, justifying that their coherent integration `actually works' (both    practically and theoretically) is quite demanding.
%When taking into account effects of both AT and distributed learning,  it remains unclear if DAT
%For example, inner maximization of AT takes computationally-intensive iterative optimization solver 
%since DAT integrates AT with distributed learning, 
%to improve efficiency of either AT or distributed learning
%unexplored 


%\SL{by integrating state-of-the-art AT techniques with} 
%We then dissect it to the key components that enable  scalable training and efficient worker-server communications. 


% \begin{wrapfigure}{r}{0.01\textwidth}
% \vspace{-8mm}
% \begin{small}
% % \resizebox{0.5\textwidth}{!}{
% \begin{minipage}{0.5\textwidth}
% \begin{algorithm}[H]
% \caption{xxxx}
% \label{alg: DAT_Meta}
% \begin{algorithmic}[1]
%   \State {\bfseries Input:} Initial $\boldsymbol{\theta}_1$,  dataset $\mathcal D^{(i)}$  for each of $M$ workers,   and $T$ iterations 
% \State{\textbf{for} Iteration $t =  1,2,\ldots, T$}
%   \end{algorithmic}
% % ---------------
% % Now insert all to comments that we desire on specific line numbers:
% % \AddNote[blue]{4}{7}{Worker}
% % \AddNote[blue]{9}{10}{Server}
% % ---------------
% \end{algorithm}
% \end{minipage}
% % }
% \end{small}
% \vspace{-2mm}
% \end{wrapfigure}



% \begin{wrapfigure}{R}{0.5\textwidth}
% \vspace*{-8mm}
%     \begin{minipage}{0.5\textwidth}
%       \begin{algorithm}[H]
%           \caption{Meta-version of DAT 
%         (Alg.\,\ref{alg: DAT} in Supplement)}
%         \begin{algorithmic}[1]
%           \For{Worker $i = 1,2, \ldots, M$} \hfill \Comment{\textcolor{blue}{Block 1}}
%              \State   Sample-wise attack generation \eqref{eq: inner_max_alg}
%     \State Local gradient computation \eqref{eq: stoch_grad_batch} 
% \State Worker-server communication 
% \EndFor
%   \State Gradient aggregation at server \eqref{eq: grad_agg}  
% \hfill\Comment{\textcolor{red}{Block 2}}
%     \State Server-worker communication 
%              %\FOR{$i=1$ {\bfseries to} $m-1$}
%             \For{Worker $i = 1,2, \ldots, M$}
%           %\FOR{Worker $i = 1,2, \ldots, M$} 
%           \hfill \Comment{\textcolor{blue}{Block 3}}
%           \State Model parameter update \eqref{eq: outer_min}
%  %    \hspace*{4.5mm}
%   %   update model via SGD calling for \eqref{eq: GD_upper}, 
%     %  \begin{align}
%     %      \btheta_{t+1} = \btheta_t - \alpha \frac{d \ell_{\mathrm{tr}}(\boldsymbol \theta_t, \boldsymbol \delta^*(\boldsymbol \theta_t))}{d \btheta }
%     %      \label{eq: SGD}
%     %  \end{align}
% %     \Comment{Other optimizers can also be applied.}
%       \EndFor
%         \end{algorithmic}
%   \label{alg: DAT_meta_form}
%       \end{algorithm}
%     \end{minipage}
%   \end{wrapfigure}
\paragraph{Algorithmic framework of DAT.}
DAT follows the   framework of distributed learning   with  a parameter server. In what follows, we elaborate on its key components
through its meta-form shown by Algorithm\,\ref{alg: DAT_meta_form} (see its detailed version in Algorithm\,\ref{alg: DAT}).  DAT contains three algorithmic blocks. In the \textit{first} block, 
%(Steps\,3-8 of Algorithm\,\ref{alg: DAT}), 
every distributed worker    calls for a maximization oracle to obtain  the adversarial perturbation for each sample within a   data batch, then computes the gradient of the local cost function $f_i$ in \eqref{eq: prob_DAT} with respect to (w.r.t.) model parameters $\boldsymbol \theta$. And every worker is 
allowed to quantize/compress the local gradient prior to transmission to the  server.
In the \textit{second} block,
%(Steps\,9-10 of Algorithm\,\ref{alg: DAT}), 
the server aggregates the local gradients and transmits the aggregated gradient (or its quantized version) to the workers.
In the \textit{third} block,
%(Steps\,11-13 of Algorithm\,\ref{alg: DAT}), 
the model parameters are eventually updated by a  minimization oracle at each worker based on the received gradient information from the server.
 
      
    %   \begin{algorithm}[htb]
    %     \caption{Meta-version of DAT %
    %     (Algorithm\,\ref{alg: DAT})
    %     }%
    %           \label{alg: DAT_meta_form}
    %     \begin{algorithmic}[1]
    %       \FOR{Iteration $t =  1,2,\ldots, T$}
    %       \STATE{\hspace*{0.0in}\textbf{for} Worker $i = 1,2, \ldots, M$}  \hfill\COMMENT{\textcolor{blue}{Block 1}}
    %       \STATE{\hspace*{0.2in} Sample-wise attack generation Eq.\,\eqref{eq: inner_max_alg}} 
    %       \STATE{\hspace*{0.2in} Local gradient computation Eq.\,\eqref{eq: stoch_grad_batch}} 
    %           \STATE{\hspace*{0.2in} Worker-server communication (optional)} 
    %          \STATE{\hspace*{0.0in}\textbf{end for}}
    %          \STATE{Gradient aggregation at server Eq.\,\eqref{eq: grad_agg}   }
    %          \hfill\COMMENT{\textcolor{red}{Block 2}}
    %          \STATE{Server-worker communication (optional)} 
    %          \STATE{\hspace*{0.0in}\textbf{for} Worker $i = 1,2, \ldots, M$} \hfill\COMMENT{\textcolor{blue}{Block 3}}
    %       \STATE{\hspace*{0.2in} Model parameter update Eq.\,\eqref{eq: outer_min}} 
    %          \STATE{\hspace*{0.0in}\textbf{end for}}
    %          \ENDFOR
    %     \end{algorithmic}
    %   \end{algorithm}
    
%       \begin{algorithm}[tb]
%   \caption{Meta-version of DAT 
%         (Algorithm\,\ref{alg: DAT})}
%   \label{alg: DAT_meta_form}
% \begin{algorithmic}
%               \FOR{Iteration $t =  1,2,\ldots, T$}
%           \FOR{Worker $i = 1,2, \ldots, M$}  %\hfill\COMMENT{\textcolor{blue}{Block 1}}
%           \STATE Sample-wise attack generation Eq.\,\eqref{eq: inner_max_alg}
%           \STATE Local gradient computation Eq.\,\eqref{eq: stoch_grad_batch} 
%               \STATE Worker-server communication (optional) 
%              \ENDFOR
%              \STATE Gradient aggregation at server Eq.\,\eqref{eq: grad_agg}   
%              %\hfill\COMMENT{\textcolor{red}{Block 2}}
%              \STATE Server-worker communication (optional) 
%              \FOR{Worker $i = 1,2, \ldots, M$} %\hfill\COMMENT{\textcolor{blue}{Block 3}}
%           \STATE Model parameter update Eq.\,\eqref{eq: outer_min}
%             %  \STATE{\textbf{end for}}
%              \ENDFOR
%              \ENDFOR
% \end{algorithmic}
% \end{algorithm}





 
% \mycomment{Algorithm\,\ref{alg: DAT} summarizes the meta-form of DAT, which consists of  three   blocks. 
% At Steps\,3-8 (Block 1) of Algorithm\,\ref{alg: DAT}, every distributed worker    calls for a maximization oracle to obtain  the adversarial perturbation for each sample within a   data batch. It then   computes the gradient of the local cost function $f_i$ in \eqref{eq: prob_DAT} with respect to (w.r.t.) model parameters $\boldsymbol \theta$. And every worker is 
% allowed to quantize the local gradient prior to transmission to a  server.
% At Steps\,9-10 (block 2) of Algorithm\,\ref{alg: DAT}, the server aggregates the local gradients, and transmits the aggregated gradient (or the quantized gradient) to the other workers. At Steps\,11-13 (block 3) of Algorithm\,\ref{alg: DAT}, 
% the model parameters are eventually updated by a  minimization oracle at each worker based on the received gradient information from the server.}
% %each worker calls for a  minimization oracle to update the model parameters based on the received gradient information from the server. 
% \mycomment{
% \begin{minipage}{0.95\textwidth}
% %\centering

% \begin{algorithm}[H]
% \caption{Distributed adversarial training (DAT) for solving problem \eqref{eq: prob_DAT}}
% \label{alg: DAT}
% \begin{algorithmic}[1]
%   \State {\bfseries Input:} Initial $\boldsymbol{\theta}_1$,  dataset $\mathcal D^{(i)}$  for each of $M$ workers,   and $T$ iterations 
% \State{\textbf{for} Iteration $t =  1,2,\ldots, T$}

% \hspace*{-0.598in}\vbox{\colorbox{non-photoblue}{\vbox{
%  \State{\hspace*{0.2in}\textbf{for} Worker $i = 1,2, \ldots, M$} \Comment{\textcolor{blue}{Worker}}
% \State {\hspace*{0.4in} Draw a finite-size data batch  $\mathcal B_{t}^{i} \subseteq \mathcal D^{(i)} $} 
% \State {\hspace*{0.4in} For each data sample  $\mathbf x \in \mathcal B_{t}^{i}$, call for an \textit{inner maximization oracle}:
% {\small\begin{align}\label{eq: inner_max_alg}
% \boldsymbol{\delta}_t^{(i)}(\mathbf x) \Def \argmax_{ \| \boldsymbol{\delta} \|_\infty \leq \epsilon }  ~  \phi(\boldsymbol{\theta}_{t}, \boldsymbol{\delta}; \mathbf  x),
% \end{align}}%
% \hspace*{0.4in} where we omit the label or possible pseudo-label $y$ of $\mathbf x$ for brevity %if it is used
% }  % \MH{``if it is used" what does it mean?}
% \State {\hspace*{0.4in} Computing local gradient of $f_i$ in \eqref{eq: prob_DAT} with respect to $\boldsymbol \theta$ given perturbed samples: 
% {\small\begin{align}\label{eq: stoch_grad_batch}
%     \mathbf g_t^{(i)} =  \lambda \mathbb E_{\mathbf x \in \mathcal B_t^{(i)}}  [ \nabla_{\boldsymbol \theta}\ell(\boldsymbol \theta_{t}; \mathbf x)  ] + \mathbb E_{\mathbf x \in \mathcal B_t^{(i)}}  [ \nabla_{\boldsymbol \theta} \phi(\boldsymbol \theta_{t}; \mathbf x + \boldsymbol \delta_t^{(i)}(\mathbf x) )  ]
% \end{align}}}%
% \State {\hspace*{0.4in} (\textit{Optional}) Call for \textit{gradient quantizer} $Q(\cdot)$ and transmit
%   $ Q(\mathbf g_t^{(i)})$ to   server}
%   \State {\hspace*{0.2in}\textbf{end for}}
%   }}}%
  
%   \hspace*{-0.598in}\vbox{
%   \colorbox{babypink}
%   {\vbox{
%   \State {\hspace*{0.2in}Gradient aggregation at  server: \Comment{\textcolor{red}{Server}}
% %   $\hat {\mathbf g}_t = \frac{1}{M} \sum_{i=1}^M Q(\mathbf g_t^{(i)})$
%  {\small \begin{align}\label{eq: grad_agg}
%   \hat {\mathbf g}_t = \textstyle \frac{1}{M} \sum_{i=1}^M Q(\mathbf g_t^{(i)})
%   \end{align}}%
%   }
%   \State {\hspace*{0.2in}(\textit{Optional}) Call for \textit{gradient quantizer}   $\hat{\mathbf g}_t \leftarrow Q(\hat{\mathbf g}_t) $,
%   and transmit  
%   $ \hat {\mathbf g}_t$ to workers: 
%   }
%   }}}%
  
%   \hspace*{-0.598in}\vbox{\colorbox{non-photoblue}{\vbox{
% \State{\hspace*{0.2in}\textbf{for} Worker $i = 1,2, \ldots, M$}\Comment{
% \textcolor{blue}{Worker}}
%   \State {\hspace*{0.4in} Call for an \textit{outer minimization oracle} $\mathcal A(\cdot)$ to update $\boldsymbol \theta$:  
%  { \small \begin{align}\label{eq: outer_min}
%       \boldsymbol \theta_{t+1} = \mathcal A(\boldsymbol \theta_{t}   \hat {\mathbf g}_t, \eta_t), \quad \quad  \text{$\eta_t$ is learning rate}
%   \end{align}}%
%   %\hspace*{0.4in}
%   %\MH{[shall we indicating the dependency on local data?]} 
%  % where $\eta_t$ denotes a   constant learning rate at $t$
%  }
%   \State{\hspace*{0.2in}\textbf{end for}}
%   }}}%
  
%   \State{ \textbf{end for}}
%   \end{algorithmic}
% % ---------------
% % Now insert all to comments that we desire on specific line numbers:
% % \AddNote[blue]{4}{7}{Worker}
% % \AddNote[blue]{9}{10}{Server}
% % ---------------
% \end{algorithm}
% \end{minipage}
% }


% \mycomment{
% In contrast to standard AT, DAT %(Algorithm\,\ref{alg: DAT})
% allows for using a $M$ times larger     batch size    to update the model parameters $\boldsymbol \theta$.   Thus, given the same number of epochs, DAT takes $M$ fewer gradient updates  than AT. %less %total number of iterations than AT.
% In addition,  distributed learning  introduces   communication  overhead. To address this issue, it is optional to perform gradient quantization at both worker and server sides when a   very large model is possibly trained.
% %e.g., VGG-19 \SL{[ref]} contains \SL{xxx} MB parameters.  \SL{[@GZ, please fill the previous information.]}
% % It is also worth mentioning that different from many distributed optimization methods, 
% % \SL{[@SL@XC@MH, Please add some differences with existing distributed min-max.]} 
% %%% TMb/D = epochs , centralized TMb/D * D/b = TM iterations 
% We elaborate on DAT in what follows.
% }

      \begin{algorithm}[H]
          \caption{Meta-version of DAT 
        (Alg.\,\ref{alg: DAT} in Supplement)}
        \begin{algorithmic}[1]
          \For{Worker $i = 1,2, \ldots, M$} \hfill \Comment{\textcolor{blue}{Block 1}}
             \State   Sample-wise attack generation \eqref{eq: inner_max_alg}
    \State Local gradient computation \eqref{eq: stoch_grad_batch} 
\State Worker-server communication 
\EndFor
  \State Gradient aggregation at server \eqref{eq: grad_agg}  
\hfill\Comment{\textcolor{red}{Block 2}}
    \State Server-worker communication 
             %\FOR{$i=1$ {\bfseries to} $m-1$}
            \For{Worker $i = 1,2, \ldots, M$}
          %\FOR{Worker $i = 1,2, \ldots, M$} 
          \hfill \Comment{\textcolor{blue}{Block 3}}
          \State Model parameter update \eqref{eq: outer_min}
 %    \hspace*{4.5mm}
  %   update model via SGD calling for \eqref{eq: GD_upper}, 
    %  \begin{align}
    %      \btheta_{t+1} = \btheta_t - \alpha \frac{d \ell_{\mathrm{tr}}(\boldsymbol \theta_t, \boldsymbol \delta^*(\boldsymbol \theta_t))}{d \btheta }
    %      \label{eq: SGD}
    %  \end{align}
%     \Comment{Other optimizers can also be applied.}
      \EndFor
        \end{algorithmic}
  \label{alg: DAT_meta_form}
      \end{algorithm}

     

\paragraph{Large-batch challenge in DAT and a
{l}ayerwise adaptive learning rate (LALR) solution.}
%\SL{[I rewrote the following section. Please check]}
%   \paragraph{Outer minimization by {l}ayerwise adaptive learning rate (LALR)}
In DAT, the aggregated gradient (Step\,6 in Algorithm\,1)  
 is built on a data batch that is $M$ times larger than that of standard AT. This leads to a large-batch challenge in min-max optimization.
% However, the large-batch training often leads to a significant loss in accuracy  if it runs for the same number of epochs as the small-batch training, since the former takes a less number of iterations than the latter \citep{krizhevsky2014one,you2018imagenet}.
% Recall from Figure\,\ref{fig: motivating_example} that the direct distributed implementations of AT   using FGSM  \citep{Wong2020Fast}  causes a performance loss in both accuracy and robustness when the batch size increases.
This  challenge  can also be verified 
from Fig.\,\ref{fig: AT_large_batch_ImageNet}.
%when 
%applying the   \underline{l}arge-batch \underline{SGD} (LSGD) method to robust training in a distributed   setup  \citep{goyal2017accurate,xie2019feature}.
%We show that
%     \begin{wrapfigure}{r}{40mm}
%     %\begin{figure}
%     \vspace*{-0.05in}
% \centerline{
% \hspace*{-5mm}
% \begin{tabular}{c}
% \hspace*{-0.1in}
% \includegraphics[width=.26\textwidth,height=!]{Figures/plotf1.pdf} 
% %& \hspace*{-0.2in}
% %\includegraphics[width=.29\textwidth,height=!]{Figures/DAT_LSGD_ImageNet.pdf}  
% %\\
% %\footnotesize{(a)} &   \footnotesize{(b)}
% \end{tabular}}
% \vspace*{-0.13in}
% \caption{\small{{{RA and TA of DAT-FGSM (our proposal using LALR) and DAT-LSGD versus number of computing nodes, each of which uses $2048$ batch size in the setting of (CIFAR-10, ResNet-18). 
% \SL{[Delete]}
% %Here 
% % a DAT implementation using LSGD \citep{xie2019feature} refers to DAT-LSGD and our proposal using FGSM  and  LALR refers to DAT-FGSM.
% %. Here each worker uses $2048$ batch size.
% %; see detailed experiment setting in Sec.\,\ref{sec: exp}.
% %\SL{[update legends]}
% }}
% % Here the fine-tuning is conducted by AT at a single computing node, and the target datasets are CIFAR-10 and CIFAR-100.
% %   Accuracy (TA or RA)  of the fine-tuned model is compared to that of    the end-to-end  training at CIFAR by DAT-PGD from scratch using  the same number of epochs   as the  fine-tuning. \SL{Update legends: pre-training on ImageNet, no pre-training}
% %(a) Fine-tuning over CIFAR-10. (b):  Fine-tuning over CIFAR-100.
% % pre-trained xxx \SL{[model name]} over dataset $\mathcal A$ using DAT. Left: RA against PGD attacks of different perturbation sizes during testing. Right:  RA against PGD attacks of different steps during testing.
% }}
%   \label{fig: LSGD_DAT_CIFAR10}
%   \vspace*{-0.1in}
% %\end{figure}
% \end{wrapfigure}
% As demonstrated by Figure\,\ref{fig: LSGD_DAT_CIFAR10}, 
To overcome the large-batch challenge, 
% the DAT implementation using LSGD (we call DAT-LSGD) also incurs a   performance drop in   large-batch scenarios.
% By contrast, the studied 
we adopt the technique of 
layerwise adaptive learning rate (LALR), backed up by   the recent successful applications to the standard  training of large-scale image classification and language modeling networks with large data batch \citep{you2019large,you2017scaling}.
%     \begin{figure}
%     %\begin{figure}
%     %\vspace*{-0.1in}
% \centerline{
% \hspace*{-6mm}
% \begin{tabular}{c}
% \hspace*{-0.1in}
% \includegraphics[width=.35\textwidth,height=!]{Figures/plotf1.pdf} 
% %& \hspace*{-0.2in}
% %\includegraphics[width=.29\textwidth,height=!]{Figures/DAT_LSGD_ImageNet.pdf}  
% %\\
% %\footnotesize{(a)} &   \footnotesize{(b)}
% \end{tabular}}
% \vspace*{-0.13in}
% \caption{\small{{{Robust and standard accuracy comparison between a DAT implementation using LSGD \citep{xie2019feature} and proposed DAT implementation using FGSM and  LALR
% on (CIFAR-10, ResNet-18). Here TA represents standard test accuracy and RA denotes robust accuracy; see detailed experiment setting in Sec.\,\ref{sec: exp}.
% %\SL{[update legends]}
% }}
% % Here the fine-tuning is conducted by AT at a single computing node, and the target datasets are CIFAR-10 and CIFAR-100.
% %   Accuracy (TA or RA)  of the fine-tuned model is compared to that of    the end-to-end  training at CIFAR by DAT-PGD from scratch using  the same number of epochs   as the  fine-tuning. \SL{Update legends: pre-training on ImageNet, no pre-training}
% %(a) Fine-tuning over CIFAR-10. (b):  Fine-tuning over CIFAR-100.
% % pre-trained xxx \SL{[model name]} over dataset $\mathcal A$ using DAT. Left: RA against PGD attacks of different perturbation sizes during testing. Right:  RA against PGD attacks of different steps during testing.
% }}
%   \label{fig: LSGD_DAT_CIFAR10}
%   \vspace*{-0.1in}
% %\end{figure}
% \end{figure}


 To be more specific,
 the model training recipe 
 %(i.e.,  $\mathcal A$ in Eq.\,\eqref{eq: outer_min})
 using LALR becomes
 %in Eq.\,\eqref{eq: outer_min} is then given by 
%as the gradient descending step, 
%integrated with  the laywerwise adaptive learning rate: 

{
% \small 
% \vspace*{-5mm}
\begin{align}\label{eq: ada_learn}
     \boldsymbol \theta_{t+1,i} = \boldsymbol \theta_{t,i}  -    \frac{ \layerscale(\|  \boldsymbol \theta_{t,i} \|_2) \cdot \eta_t }{\| \mathbf u_{t,i} \|_2} \cdot \mathbf u_{t,i},
     %\frac{\mathbf u_{t,i}}{\| \mathbf u_{t,i} \|_2},
     \quad \forall i \in [\layernum],
 \end{align}}%
 where   \mycomment{{\color{red}Songtao: change $\btheta_{t-1}$ to $\btheta_t$ and $\btheta_t$ to $\btheta_{t+1}$ because you didn't change $\bu_t$}}
 $\boldsymbol \theta_{t,i}$ denotes the $i$th-layer  parameters at iteration $t$, with $\boldsymbol \theta_t = [ \boldsymbol \theta_{t,1}^{\top}, \ldots, \boldsymbol \theta_{t,\layernum}^{\top}  ]^{\top}$, $\layernum$ is the number of layers, $\mathbf u_t$ is a descent direction computed based on the first-order gradient w.r.t. model parameters $\btheta_t$, %first-order information $Q(\hat {\mathbf g}_t)$,
 $\layerscale(\| \boldsymbol \theta_{t,i} \|_2) = \min\{ \max \{\| \boldsymbol \theta_{t,i} \|_2, c_l\}, c_u \}$ is a \textit{layerwise} scaling factor of the \textit{adaptive} learning rate $\frac{\eta_t}{\| \mathbf u_{t,i} \|_2}$,  and  {$c_l = 0$ and $c_u = 10$ are set in our experiments (see Appendix\,\ref{app: train_setting} for some ablation studies on hyperparameter selection).} 
% \SL{@GZ, please confirm the choice of $\sigma$ in implementation.}

  In \eqref{eq: ada_learn}, the specific form of the   descent direction $\mathbf u_t$ is determined by the optimizer employed. For example, if the adaptive momentum (Adam) method is used, then $\mathbf u_t$  is given by the exponential
moving average of past gradients scaled by the square root of the exponential
moving averages of squared past gradients \citep{reddi2019convergence,chen2018convergence}. Such a variant of \eqref{eq: ada_learn} that uses Adam as the base algorithm is also known as LAMB  \citep{you2019large} in standard training. 
{However, it has remained elusive whether the advantage of LALR is preserved in large-batch min-max optimization.}
As will be evident later, the effectiveness of LALR in DAT can be justified from both theoretical and empirical perspectives.
The rationale is that  the  layerwise adaptive learning rate   smooths the optimization trajectory so that  a larger learning rate can be used without causing sharp optima even in distributed min-max optimization.
%can  boost the performance of DAT with large data batch in both  theory and practice.   %large-batch training. 
%%% large batch size can lead to a significant loss in accuracy,
%\SL{discussion server.}


% \SL{Inspired by the recent success of 
% LALR in  training large-scale image classification and language modeling networks with large data batch \citep{you2019large,you2017scaling}, 
% }
% The recent works \citep{you2019large,you2017scaling} showed that the use of LALR is the key to succeed in  training standard DNNs with large data batch. Spurred by that, we incorporate LALR in DAT. 


% \revision{
% We remark that none of  the aforementioned techniques (attack generation, randomized  gradient quantization and LALR) is of high complexity. By contrast, the computation gain from each aspect is apparent: FGSM simplifies inner maximization, gradient quantization reduces communication cost, and LALR supports the use of large data batches (and thus enables training over a small number of iterations). Their usage reduces the total training cost as verified in our  experiments while preserving the model convergence as supported in our theoretical analysis  introduced later.
% }


% \SL{Add `overview' paragraph.}

% \SL{Add `large-batch optimization challenge' and solution paragraph.}
% \begin{itemize}
%     \item Baseline setting as bag of tricks.
%     \item Small batch size: LALR hurts? (No); large-batch size: non-LALR, LALR; nontrivalness of LALR in large-batch setting. AT versus batch size; AT + LALR versus batch size; DAT versus batch size; DAT + LALR versus batch size
    
%     \SL{[Organize CIFAR-10 results here; please align experiments with baseline setting at page 9 \url{https://arxiv.org/pdf/2010.00467.pdf}.]}
% \end{itemize}


% \SL{Add `other add-ons to improve computation and communication efficiency'}

\paragraph{Other add-ons for DAT.}
In what follows, we illustrate two add-ons to improve computation and communication efficiency of DAT.  


{\ding{226}}
\textit{Inner maximization: Iterative vs. one-shot solution.}
%  \paragraph{Inner maximization: Iterative and one-shot solutions}
In DAT,  each worker   calls for an inner maximization oracle to generate adversarial perturbations (Step\,2 of Algorithm\,1). 
% given by Eq.\,\eqref{eq: inner_max_alg} in Appendix\,\ref{app: alg_DAT}. 
We specify two solvers of perturbation generation: iterative projected gradient descent (PGD) and one-shot (projected) FGSM  \citep{Goodfellow2015explaining,Wong2020Fast}.
\mycomment{leading to
%We specify attack generation %\eqref{eq: inner_max_alg}  
  the unified form 
% \begin{align}
%   &  \boldsymbol \delta_t^{(i)}(\mathbf x) =  \left \{
%     \begin{array}{ll}
%       \mathbf z_K   & \text{iterative PGD for $K > 1$}  \\
%         \mathbf z_1 &  \text{one-shot FGSM}
%     \end{array} \right. \label{eq: inner_PGD_FGSM}\\
% &  \mathbf z_k =  \Pi_{[-\epsilon, \epsilon]^d} [ \mathbf z_{k-1} + \alpha \cdot  \mathrm{sign}( \nabla_{\boldsymbol \delta} \phi(\boldsymbol \theta_{t}, \mathbf z_{k-1}; \mathbf x) ) ], ~ k \in [K], \nonumber 
% \end{align}
\begin{align}
   &  \boldsymbol \delta_t^{(i)}(\mathbf x) = \mathbf z_K, \quad \mathbf z_k =  \Pi_{[-\epsilon, \epsilon]^d} [ \mathbf z_{k-1} + \alpha \cdot  \mathrm{sign}( \nabla_{\boldsymbol \delta} \phi(\boldsymbol \theta_{t}, \mathbf z_{k-1}; \mathbf x) ) ], ~ k \in [K],
   \label{eq: inner_PGD_FGSM}
%   \\
% &  \mathbf z_k =  \Pi_{[-\epsilon, \epsilon]^d} [ \mathbf z_{k-1} + \alpha \cdot  \mathrm{sign}( \nabla_{\boldsymbol \delta} \phi(\boldsymbol \theta_{t}, \mathbf z_{k-1}; \mathbf x) ) ], ~ k \in [K], \nonumber 
\end{align}
%\PY{I don't  see why we need seperate $z_1$ from other $z_K$ for $K>1$ here? Can't we just use $z_k$?}
where $K$ is the total number of iterations in the inner loop,
the cases of $K = 1$ and $K > 1$ correspond to the one-shot FGSM attack and the iterative PGD attack, respectively,
$\mathbf z_k$ denotes the PGD update of $\boldsymbol \delta$  at the $k$th iteration,  $\mathbf z_0$ is a given initial point, 
$\Pi_{[-\epsilon, \epsilon]^d} (\cdot)$ denotes the projection    onto the box constraint  $  [-\epsilon, \epsilon]^d$, $\alpha > 0$ is a given step size, and $\mathrm{sign}(\cdot)$ denotes the element-wise sign operation.
%It is clear from \eqref{eq: inner_PGD_FGSM} that the computation time of DAT with iterative PGD is $K$ times more than that of DAT with FGSM. 
The recent work \citep{Wong2020Fast} showed that if FGSM is conducted with   random initialization $\mathbf z_0$ and a proper step size, e.g., $\alpha = 1.25 \epsilon$, then FGSM  can be  as effective    as iterative PGD in robust training. Indeed, we will  show in Sec.\,\ref{sec: exp} that the effectiveness of our proposed DAT-FGSM algorithm echoes the finding in \citet{Wong2020Fast}.}
%is also   echoed by DAT. 
Our experiments will show that FGSM together with LALR works well in DAT.
We remark that other   techniques \citep{shafahi2019adversarial,zhang2019you}  can also be used to simplify inner maximization; however,   we   focus on FGSM since it is  computationally the lightest. 

{\ding{226}} \textit{Gradient quantization.}
 % \paragraph{Gradient quantization}
  In contrast to standard AT, DAT may call for   worker-server communications (Steps 4 and 7 of Algorithm\,1). That is,
  if a single-precision floating-point data type is used, then DAT needs to transmit $32\modeldim$ bits per worker-server communication at each iteration. Recall that $\modeldim$ is the dimension of   $\boldsymbol{\theta}$. In order to reduce the communication cost, DAT has the option to quantize the transmitted gradients using a 
 fixed number of bits fewer than $32$.  
 {We specify  the gradient quantization operation  
 %in Algorithm\,\ref{alg: DAT} 
 as the   \textit{randomized quantizer}   \citep{alistarh2017qsgd,yu2019double}.
  In Sec.\,\ref{sec: exp} we will show that DAT, combined with gradient quantization, still leads to a competitive  performance. For example, the robust accuracy of ResNet-50 trained by an $8$-bit DAT
(performing quantization at Step\,4 of Algorithm\,1)
for ImageNet is just $0.55\%$  lower than the   robust accuracy achieved by the $32$-bit DAT. 
%however, the communication cost is reduced by \SL{xxx} times.  
It is also worth mentioning that 
  the All-reduce communication protocol can be regarded as a special case of the parameter-server setting  in Algorithm\,1
  when   every worker performs as a server. In this case,  the communication network becomes fully connected and  only
% Lastly, we note that  if every worker performs as a server in DAT, then 
the worker-server communication   (Step\,4 of Algorithm\,1) is needed. Please refer to Appendix\,\ref{app: alg_DAT} for more details on gradient quantization.
%In this case, the communication network becomes fully connected. With synchronized communication, this is favored  for   training DNNs under the All-reduce operation.
 }
 % adopt a \textit{quantized} low-precision   representation of transmitted data based only on a limited number of bits.
%  When the number of model parameter $d$ is large, 
  
  %Quantized low-precision representation stores numbers
%using limited number of bits, contrast to the $32$-bit full-precision. 

\mycomment{
Let $b$ denote the number of bits ($b \leq 32$), and  thus there exists $s = 2^b$  quantization levels. We specify the gradient quantization operation $Q(\cdot)$ in Algorithm\,\ref{alg: DAT} as the   \textit{randomized quantizer}   \citep{alistarh2017qsgd,yu2019double}.  Formally,
the  quantization operation at the $i$th coordinate of a vector $\mathbf g$ is given by \citep{alistarh2017qsgd}
{
% \small
\begin{align}\label{eq: rand_q}
    Q( g_i) = \| \mathbf g \|_2 \cdot \mathrm{sign}(g_i) \cdot \xi_i(g_i,s),  \quad \forall i \in \{ 1,2, \ldots, \modeldim \}.
\end{align}}%
In \eqref{eq: rand_q},  $\xi_i(g_i,s)$ is a random number drawn as follows. Given $|g_i|/\| \mathbf g \|_2 \in [l/s, (l+1)/s]$ for some $l \in \mathbb N^+$ and $0 \leq l < s$, we  then  have
{
% \small
\begin{align}\label{eq: xi}
\xi_i(g_i,s) = \left \{ 
    \begin{array}{ll}
      l/s   & \text{with probability $1 - (s |g_i|/\| \mathbf g \|_2 - l)$}  \\
      (l+1)/s   &  \text{with probability $ (s |g_i|/\| \mathbf g \|_2 - l)$},
    \end{array}
    \right.
\end{align}}%
where $|a|$ denotes the absolute value of a scalar $a$, and  $\| \mathbf a \|_2$ denotes the $\ell_2$ norm of a vector $\mathbf a$.
The rationale behind using \eqref{eq: rand_q} is that  $Q(g_i)$ is an \textit{unbiased} estimate of $g_i$, namely,
$
\mathbb E_{\xi_i(g_i, s)}[Q(g_i)] =  g_i
$, with bounded variance. Moreover, we at most  need
 $(32 + \modeldim + b \modeldim  )$ bits to transmit the quantized  $Q(\mathbf g)$, where $32$ bits for $\| \mathbf g \|_2$, $1$ bit for sign of $g_i$ and $b$ bits for $\xi_i(g_i,s)$,
whereas it needs $32\modeldim$ bits for a single-precision %\Gaoyuan{single-precision} \LH{(double)} 
$\mathbf g$. Clearly, a small $b$ saves the communication cost.
We will show in Sec.\,\ref{sec: exp} that   DAT, combined with gradient quantization, still leads to a competitive  performance. For example, the robust accuracy of ResNet-50 trained by $8$-bit DAT
(performing quantization at Step\,7 of Algorithm\,\ref{alg: DAT})
for ImageNet is just $0.55\%$  lower than the   robust accuracy achieved by the $32$-bit DAT. 
%however, the communication cost is reduced by \SL{xxx} times.  
Lastly, we note that  if every worker performs as a server in DAT, then the quantization operation at Step\,10 of Algorithm\,\ref{alg: DAT} is no longer needed. In this case, the communication network becomes fully connected. With synchronized communication, this is favored  for   training DNNs under the All-reduce operation.
}


% We also note that if $s = 1$, then $\xi_i(g_i,s) \in \{ 0, 1\}$ and thus $Q( g_i) / \| \mathbf g \|_2 \in   \{ -1, 0, 1\} $.
% The statistics of randomized quantization have been derived in \citep[Lemma\,3.1]{alistarh2017qsgd}.

\mycomment{
  \paragraph{Outer minimization by {l}ayerwise adaptive learning rate (LALR)}
In DAT, the aggregated gradient (Step\,7 in Algorithm\,1)  used for updating  model parameters (Step\,10 in Algorithm\,1)
 is built on the data batch that is $M$ times larger than standard AT. 
% However, the large-batch training often leads to a significant loss in accuracy  if it runs for the same number of epochs as the small-batch training, since the former takes a less number of iterations than the latter \citep{krizhevsky2014one,you2018imagenet}.
The recent works \citep{you2019large,you2017scaling} showed that the use of LALR is the key to succeed in  training standard DNNs with large data batch. Spurred by that, we incorporate LALR in DAT. Specifically, 
 the parameter updating operation $\mathcal A$ in Eq.\,\eqref{eq: outer_min} is given by 
%as the gradient descending step, 
%integrated with  the laywerwise adaptive learning rate: 
{
% \small 
\begin{align}\label{eq: ada_learn}
     \boldsymbol \theta_{t+1,i} = \boldsymbol \theta_{t,i}  -    \frac{ \layerscale(\|  \boldsymbol \theta_{t,i} \|_2) \cdot \eta_t }{\| \mathbf u_{t,i} \|_2} \cdot \mathbf u_{t,i},
     %\frac{\mathbf u_{t,i}}{\| \mathbf u_{t,i} \|_2},
     \quad \forall i \in [\layernum],
 \end{align}}%
 where   \mycomment{{\color{red}Songtao: change $\btheta_{t-1}$ to $\btheta_t$ and $\btheta_t$ to $\btheta_{t+1}$ because you didn't change $\bu_t$}}
 $\boldsymbol \theta_{t,i}$ denotes the $i$th-layer  parameters, $\layernum$ is the number of layers, $\mathbf u_t$ is a descent direction computed based on the first-order information $Q(\hat {\mathbf g}_t)$,
 $\layerscale(\| \boldsymbol \theta_{t,i} \|_2) = \min\{ \max \{\| \boldsymbol \theta_{t,i} \|_2, c_l\}, c_u \}$ is a \textit{layerwise} scaling factor of the \textit{adaptive} learning rate $\frac{\eta_t}{\| \mathbf u_{t,i} \|_2}$, {$c_l = 0$ and $c_u = 10$ are set in our experiments (\revision{see Appendix\,\ref{app: train_setting} for results on tuning $c_u$}), and $\boldsymbol \theta_t = [ \boldsymbol \theta_{t,1}^{\top}, \ldots, \boldsymbol \theta_{t,\layernum}^{\top}  ]^{\top}$.} 
% \SL{@GZ, please confirm the choice of $\sigma$ in implementation.}
  In \eqref{eq: ada_learn}, the specific form of the   descent direction $\mathbf u_t$ is determined by the optimizer employed. For example, if the adaptive momentum (Adam) method is used, then $\mathbf u_t$  is given by the exponential
moving average of past gradients scaled by  square root of exponential
moving averages of squared past gradients \citep{reddi2019convergence,chen2018convergence}. Such a variant of \eqref{eq: ada_learn} that uses Adam as the base algorithm is also known as LAMB  \citep{you2019large} in standard training. 
{However, it was elusive if the advantage of LALR is preserved in large-batch min-max optimization.}
We show in both theory and practice that the use of LALR can significantly boost the performance of DAT with large data batch.   %large-batch training. 
%%% large batch size can lead to a significant loss in accuracy,
%\SL{discussion server.}

% \revision{
% We remark that none of  the aforementioned techniques (attack generation, randomized  gradient quantization and LALR) is of high complexity. By contrast, the computation gain from each aspect is apparent: FGSM simplifies inner maximization, gradient quantization reduces communication cost, and LALR supports the use of large data batches (and thus enables training over a small number of iterations). Their usage reduces the total training cost as verified in our  experiments while preserving the model convergence as supported in our theoretical analysis  introduced later.
% }
}



\section{Convergence Analysis  of DAT}
%\SL{Re-shape it as a seperate section.}
Although standard AT has been proved  with convergence guarantees \citep{wang2019convergence,gao2019convergence}, 
%To the best of our knowledge, 
no existing work has addressed the convergence of DAT or taken into account LALR and gradient quantization, even in the standard AT setup.
%, although 
 %AT has been proved  with convergence guarantees \citep{wang2019convergence,gao2019convergence}.
Different from AT,
DAT needs to quantify the descent errors from multiple sources (such as gradient estimation, quantization, adaptive learning rate, and  inner maximization oracle). Before showing the challenges of proving the convergence rate guarantees, we first give the following assumptions.

 
\paragraph{Assumptions.}
Defining $  \Psi(\btheta) \Def \frac{1}{M}\sum^M_{i=1}
f_i(\btheta; \mathcal D^{(i)})
$ in \eqref{eq: prob_DAT}, we measure the convergence of DAT  by the first-order stationarity  of $\Psi$.
%then a solution $\btheta^{\star}$ from DAT is  an $\convacc$-accuracy first-order stationary point  if   $\|\nabla \Psi(\btheta^{\star})\|\le\convacc$.
Prior to  convergence analysis, we impose the following assumptions: ($\mathcal{A}1$) $\Psi(\btheta)$ has layer-wise  Lipschitz  continuous gradients; ($\mathcal A2$) $\phi$ in \eqref{eq: prob_DAT} is strongly concave with respect to $\boldsymbol \delta$ and has Lipschitz continuous gradients within the perturbation constraint; ($\mathcal A3$) the stochastic gradient
%in Eq.\,\eqref{eq: stoch_grad_batch} 
is unbiased and, for each worker, has variance bounded by $\sigma^2$.
%\MH{I think we still need somehow to justify the strong concavity assumption, explicitly.} 
%The assumptions A1-A3 are standard in convergence analysis, following  \citep{wang2019convergence,you2019large}.
Note that the validity of ($\mathcal A2$)
could be justified from  \citep{sinha2018certifying,wang2019convergence} by imposing a strongly convex regularization into the  neighborhood of $\bdelta$. $\mathcal A2$ is   needed for tractability of analysis.  We refer readers to Appendix\,\ref{app: assumption} for more justifications on our assumptions ($\mathcal A1$)-($\mathcal A3$).  
%is necessary  due to not only the tractability of analysis but also its validity justified by  
\mycomment{In our analysis, we 
focus on the 1-sided quantization, namely, Step\,10 of Algorithm\,\ref{alg: DAT} is omitted, and
specify $\mathcal A$ by LAMB used in \citep{you2019large}.}  



\paragraph{Technical challenges.}
%  {In theory}, the   fundamental challenge is {how \textit{descent errors from coupled        sources} (gradient estimation, quantization, and LALR) are co-evolved with \textit{min-max optimization} in DAT}. We have proposed
%  a new descent lemma ({Lemma\,2})  to measure  the decrease of the objective value in the   context of alternative optimization, and showed that the bias error   resulted from the layer-wise normalization can    be compensated by  large-batch training (Theorem\,2). Prior to our work, 
%  we are   \textit{not} aware of any established convergence   analysis for   large-batch min-max optimization.
 {In theory},  the incorporation of LALR makes the analysis of min-max optimization highly non-trivial. 
The fundamental challenge lies in the nonlinear coupling between the biased adaptive gradient estimate resulting from LALR and the additional error generated by the alternating updates in DAT. From \eqref{eq: ada_learn}, we can see that the update of $\btheta$ is based on the normalized gradient, whereas if we carry out the convergence analysis by applying the gradient Lipschitz continuity, the descent of the objective is measured by $\nabla \Psi(\btheta_t)$. This mismatch in magnitude results in a bias term. The situation here is even worse: since the maximization problem cannot be solved exactly, the size of the bias depends on how close the output of the oracle is to the optimal solution w.r.t. $\bdelta$ given $\btheta$.

We have proposed
 a new descent lemma ({Lemma\,\ref{le.desc}} in Appendix) to measure the decrease of the objective value in the context of alternating optimization, and showed that the bias error resulting from the layer-wise normalization can be compensated by large-batch training (Theorem\,\ref{th:main_simplify}). Prior to our work,
 we are   \textit{not} aware of any established convergence   analysis for   large-batch min-max optimization.
 
 
\paragraph{Convergence rate.}
% In our theoretical results, we first show that the inner product between the gradient of the objective function and estimated gradient with bias can be upper bounded by the size of $\btheta$  plus bias from the maximization step and gradient estimate error from the minimization step, which is shown as in \leref{le.desc}
% {\bf Lemma 2}
% %\label{le.desc}
%  Under A1--A3,  suppose that sequence $\{\btheta_t\}$ is generated by DAT. Then, we have
% \begin{equation}
% \mathbb{E}[-\langle\nabla \Psi(\btheta_t),\bg\rangle] \le-\frac{\mathbb{E}\|\nabla\Psi(\btheta_t)\|^2}{2}+\varepsilon+\frac{(1+\lambda)\sigma^2}{MB}\label{eq.deskey}.
% \end{equation}
% We will show
%  that even in the case where the adaptive gradient estimate is a function of the DAT variables, 
% %under certain conditions, 
% the  estimate bias resulted from the layer-wise normalization can still be compensated by increasing the batch-size. (Please see \eqref{eq.bdw} and detailed proof in the appendix.)
% As a result,  DAT eventually achieves a linear speedup of reducing gradient estimate error w.r.t. the increasing number of computing nodes.
In Theorem\,\ref{th:main_simplify}, we  present the sub-linear rate of DAT.  
\begin{theorem}\label{th:main_simplify}
Suppose that assumptions $\mathcal A1$-$\mathcal A3$ hold, the inner maximizer of DAT provides an $\varepsilon$-approximate solution 
{(i.e., the $\ell_2$-norm of inner gradient is upper bounded by $ \varepsilon$)}, 
  and the learning rate is set by   $\eta_t\sim\mathcal{O}(1/\sqrt{T})$, then $\{\btheta_t\}_{t=1}^{T}$   generated by DAT 
yields the   convergence rate

{
% \small
% \vspace*{-5mm}
\begin{align}\label{eq: thr_rate}
    &\frac{1}{T}\sum^{T}_{t=1}\mathbb{E}\|\nabla_{\btheta} \Psi (\btheta_t)\|_2^2 \nonumber  \\
    = &  \mathcal O\left ( \frac{1}{\sqrt{T}} + %\frac{\sigma^2}{MB} + 
    \frac{\sigma}{\sqrt{MB}} + \min\left\{ \frac{d}{4^b}, \frac{\sqrt{d}}{2^b} \right\} + \varepsilon \right ),
\end{align}}%
where $b$ denotes the number of quantization bits, and $B =\min\{|\mathcal{B}_t^{(i)}|,\forall t,i\}$ stands for the  smallest batch size per worker.
% over $T$ iterations, where $\btheta_t$ is updated by LAMB. When $c_l\le\layerscale(v)\le c_u,\forall v$, $\kappa=c_u/c_l$, and the problem dimension at each layer is $d_i=d/h$,
% \begin{align}
% \frac{1}{T}\sum^{\top}_{t=1}\mathbb{E}\|\nabla_{\btheta} \Psi(\btheta_t)\|^2\le&\frac{\Delta_{\Psi}}{\eta_t c_l CT}+2\left(\varepsilon+\frac{(1+\lambda)\sigma^2}{M|\mathcal{B}|}\right)+4\delta^2 +
% \frac{\kappa\sqrt{3}}{C}\|\boldsymbol{\chi}\|_1+\frac{\eta_t c_u\kappa \|L\|_1}{2C}.
% \end{align}
% where $\Delta_{\Psi}:=\mathbb{E}[\Psi(\btheta_1)]-\mathbb{E}[\Psi(\btheta_{T+1})]$, $\boldsymbol{\chi}$ is a vector that represents the error terms and its the $(ih+j)$th entry is $\sqrt{\frac{(1+\lambda)\sigma^2_{ij}}{M|\mathcal{B}|}+\varepsilon_{ij}+\delta^2_{ij}}$, $L=[L_1,\ldots,L_h]$, $C=\sqrt{\frac{h(1-\beta_2)}{G^2d}}\frac{1}{4}$, $0<\beta_2<1$, $\beta_1=0$ in LAMB, $|\mathcal{B}|=\min\{|\mathcal{B}^{(i)}|,\forall i\}$, and $\varepsilon$ is the predefined error threshold that the inner maximization oracle needs to achieve.
\end{theorem}
%\vspace{-0.2cm}
\textbf{Proof}: Please see Appendix\,\ref{app: analysis}. \hfill $\square$

%\MH{Define $\epsilon$-accuracy.}
%\MH{Maybe need to have better explain the term $1/\sqrt{MB}$? why there is this constant? compared with which type of algorithm, large batch/large \# nodes can be beneficial?}

% {Note that it is known that the layer-wise normalization of the gradient will cause a biased term in the convergence result, which is quantified by $\mathcal{O}(/1\sqrt{MB})$. When the batch size is large, i.e., $B$ is large, this term will be shrunk. In DAT, by leveraging the multiple computing nodes, this bias can be further reduced in terms of $\sqrt{M}$. }

% reduced bias (resp. variance) of adaptive gradient by factor 1/\sqrt{MB} (reps 1/(MB)), where 1/\sqrt{M} (1/M) corresponds to the linear speedup by M machines

The  error rate given by  \eqref{eq: thr_rate} involves four terms.  The term $\mathcal O(1/\sqrt{MB})$ characterizes the benefit of using the large per-worker  batch size $B$ and $M$ computing nodes in DAT. It is introduced since the variance of adaptive gradients (i.e., $\sigma^2$) is reduced by a factor of $1/(MB)$, where $1/M$ corresponds to the linear speedup by $M$ machines. In \eqref{eq: thr_rate},
the term $\min\{ \frac{d}{4^b}, \frac{\sqrt{d}}{2^b} \} $ arises due to the variance of compressed gradients, %\citep{alistarh2017qsgd}, 
and the other two terms imply the dependence on the number of iterations $T$ as well as the $\varepsilon$-accuracy of the inner maximization oracle.  
{We highlight that  our convergence  analysis (Theorem\,\ref{th:main_simplify}) is not merely a  combination of LALR-enabled   standard training  analysis \citep{you2019large,you2017scaling} and adversarial training  convergence analysis \citep{wang2019convergence,gao2019convergence}.
%The fundamental challenge lies in how errors from multiple sources are coupled and evolved with alternating updates in min-max optimization. 
Different from the previous work, we address the
  fundamental challenges in (a) quantifying the descent property of the objective value in the presence of multi-source errors during alternating min-max optimization, and (b) deriving the theoretical relationship between large data batch (across distributed machines) and the eventual convergence error of DAT. 
%   As presented in Appendix\,\ref{app: thr_1} and \ref{app: analysis}, a new descent lemma (Lemma\,\ref{le.desc}) is provided to    deal with the nonlinear coupling between the multi-source error and the true gradient. 
%   %For outer minimization, the descent bias resulted from the layer-wise normalization  further contributes to the convergence rate in a square root of the oracle error ($\varepsilon$) along with stochastic gradient estimate and quantization error. 
%   We prove that under some mild assumptions the bias resulting from the layer-wise normalization can still be compensated by increasing the batch size in the order of $O(M)$.
}


\section{Experiments 
%DAT for Robust Image Classification
}\label{sec: exp}
We empirically evaluate DAT and show its success  in training robust DNNs across multiple applications, which include \ding{172} adversarially robust ImageNet training, \ding{173} provably robust training by randomized smoothing, \ding{174}  semi-supervised robust training with unlabeled data, \ding{175} robust transfer learning, and \ding{176} DAT using different   communication protocols. 
%over ImageNet and   CIFAR    datasets. 
% We measure the performance of DAT in the following four aspects: a)  accuracies  against benign and adversarial test inputs, b) scalability to multiple computing nodes, c) incorporation of unlabeled data, and d) transferability of  pre-trained model by DAT. 



\subsection{Experiment setup}
\paragraph{DNN models and datasets.} 
We use  Pre-act {ResNet-18}  \citep{he2016identity}  and   {ResNet-50} \citep{he2016deep} for image classification, where the former is shortened as ResNet-18.  
And we use ImageNet \citep{deng2009imagenet}  for supervised DAT and augmented CIFAR-10 \citep{carmon2019unlabeled}  for semi-supervised DAT. 
In the latter setup, 
% We train these models under  ImageNet \citep{deng2009imagenet} or augmented CIFAR-10 dataset \citep{carmon2019unlabeled} to demonstrate the  large-batch advantage. 
% %but preserve {ResNet-18} for CIFAR-10 only. 
% In the semi-supervised DAT setup, 
CIFAR-10 is augmented with
 unlabeled data drawn 
from $80$ Million Tiny Images.
When studying pre-trained model's  transferability, 
CIFAR-100 is used   as a   target dataset for down-stream classification. 

\paragraph{Computing resources.}
We train a DNN using $p$ computing nodes, each of which contains $q$  GPUs (Nvidia V100 or P100). Nodes are connected with 1\,Gbps Ethernet.
\textit{A configuration of   computing resources is denoted by $p \times q$.}
%) is provided in  experiments. 
If $p > 1$, then the training is conducted in a \textit{distributed} manner.
And we split training data  into $p$ subsets, each of which is stored at a local node.  
%The total number of GPUs that we can use is $pq = 36$. 
% Based on our resources, in the CIFAR experiments, we  consider $p \in\{1,6, 18, 24 \}$ machines, each of which has $1$ GPU. 
% In the ImageNet experiments, we consider $p \in \{ 1,6\}$ machines, each of which has $6$ GPUs.
%We remark that due to the  computing resource budget, we are not able to use as many GPUs as \citep{xie2019feature,kannan2018adversarial}. 
%However, as will be evident later, our proposals have advantages  in applicability  and  scalability across various computing configurations and adversarial scenarios. 
We   note that the  batch size  $6 \times 512 = 3072$  is used for ImageNet over  $36$ GPUs.
%By contrast,  the work \citep{xie2019feature}
%used the batch size $4096$ across $128$ GPUs.
Unless specified otherwise, 
DAT is conducted using Ring-AllReduce, which requires one-sided quantization in Step\,4 of Algorithm\,\ref{alg: DAT_meta_form}. 

%\Gaoyuan{The computing node for ImageNet has dual 22-core CPU, 512GB RAM and 6 Nvidia V100 GPUs while the computing node for CIFAR-10 has 16 core CPU, 128GB RAM and 1 Nvidia P100 GPU.}
% \SL{Unless specified otherwise, 
% CIFAR experiments and ImageNet experiments are conducted using $p \times 1$ and $p \times 6$  configurations, respectively.%$p \times 1$ computing resources for $p \in \{ 1, 6, 18\}$
% }
%\SL{[also need to mention unlabled data.]}

% \SL{Missing AutoAttack (AA) measure}.

% \SL{Single CIFAR-10 dataset (least emphasis)}

% \SL{CIFAR-10 + Unlabeled data}



\paragraph{Baseline methods.}
%\SL{Discussion needed on DAT-FGSM vs. Fast AT.}
We consider 2 \textit{variants} of  DAT: \underline{1)  {\textit{DAT-PGD}}}, namely, DAT using (iterative) PGD as the inner maximization oracle; and \underline{2) {\textit{DAT-FGSM}}}, namely, DAT using one-step (projected) FGSM \citep{Wong2020Fast} as the inner maximization oracle. 
Additionally, 
we consider $4$ training \textit{baselines}: \underline{1)  \textit{AT}}  \citep{madry2017towards}; \underline{2) \textit{Fast AT}} \citep{Wong2020Fast};   {\underline{3)
\textit{DAT w/o LALR}}}, namely, a   distributed implementation of AT or Fast AT
%, which is in the form of   DAT-PGD or DAT-FGSM
but \textit{without} considering LALR; 
and \underline{4)
\textit{DAT-LSGD}} \citep{xie2019feature}, namely, 
a {d}istributed implementation of {l}arge-batch {SGD} (LSGD) 
for  AT.
We remark that conventional AT and Fast AT are   centralized training  methods. 
%In our setup, the number of GPUs is limited  to $6$  at a single machine, and thus the largest batch size that the centralized method can use is around $2048$ for CIFAR-10 and $85$ for ImageNet. 
\mycomment{We also find that the direct implementation of Fast-AT  in a distributed way leads to a quite poor scalability versus the growth of   batch size, and thus  a worse distributed baseline than DAT-FGSM w/o LALR. 
Lastly, we remark that the work \citep{xie2019feature} proposed modifying a model architecture  by incorporating feature denoising. In contrast, DAT does not call for architecture modification. Thus, to enable a fair comparison, we use the same training recipe LSGD as \citep{xie2019feature}  in the DAT setting, leading to the considered distributed training baseline  DAT-LSGD.
}


\begin{table}[htb]
%\begin{wraptable}{r}{95mm}
% \vspace*{-1mm}
\begin{center}
\caption{
{DAT  (in gray color) on (ImageNet, ResNet-50), compared with baselines, in TA (\%), RA (\%), AA (\%), communication time per epoch (seconds), and total training time (including communication time)  per epoch (seconds). For brevity, `$p \times q$' represents `\# nodes $\times$  \# GPUs per node', 
`C' represents communication time in seconds, and `T' represents training time in seconds. 
%\SL{AA}
%SL{we need to show relative improvement compared to AT and Fast AT, like fine-tuning example.}
}
%All the training methods are conducted in \SL{xxx [@Gaoyuan]} epochs .
} 
% \vspace{-2.5mm}
\label{table: overall}
\begin{threeparttable}
\resizebox{0.48\textwidth}{!}{
\begin{tabular}{c|c|c|c|c|c|c|c}
\hline
\hline
\multirow{2}{*}{\begin{tabular}[c]{@{}c@{}}Method\end{tabular}} & \multicolumn{7}{c}{\textbf{ImageNet, ResNet-50}} \\ 
\cline{2-8}  & \begin{tabular}[c]{@{}c@{}}
$p \times q$
%Nodes $\times$\\GPUs per node
\end{tabular}
& Batch size
& TA (\%) & RA (\%) & AA (\%) & \begin{tabular}[c]{@{}c@{}}C (s)
%\\per epoch (s)
\end{tabular} & \begin{tabular}[c]{@{}c@{}}
T  (s)
%Training time\\per epoch (s)
\end{tabular}
 \\ \hline
  AT & $1\times 6$  & $512$ & 62.70 & 40.38 & 37.46 & NA & 6022  \\
% \mycomment{{DAT-PGD w/o LALR} & $6\times 6$  & $512$ & 62.36 %\decrease{(0.34)}
% %65.06 %\SL{check?} 
% &  %39.28
% 39.86 % \decrease{(0.52)}
% & \high{4324}  & \high{5663}\\}
DAT-PGD w/o LALR & $6\times 6$  & $6 \times 512$ & 
57.09   %\decrease{(5.61)}
%60.09 
%\SL{check?} 
&  34.02 %35.02 
%\decrease{(6.36)}
& 30.98
& 865  & 1932\\
\rowcolor{Gray}
{DAT-PGD} & $6\times 6$  & $6 \times 512$ & 63.75 %\increase{(1.05)}
& 38.45 
%\decrease{(1.93)}
& 36.04
& 898  & 1960\\
{Fast AT} & $1\times 6$  & $512$ & 58.99  % \decrease{(3.71)}
& 40.78 % \increase{(0.4)}
& 37.18 & NA & 1544  \\
{DAT-FGSM w/o LALR} & $6\times 6$  & $6 \times 512$ & 55.04 %\decrease{(7.66)}
%57.04 \SL{check?} 
&  35.03 %39.03 \SL{check?} 
%\decrease{(5.35)}
& 32.16
& 863  & 1080\\
\rowcolor{Gray}
DAT-FGSM & $6\times 6$  & $6 \times 512$ & 58.02  
& 40.27
& 36.02
& 859  & 1109\\
\hline
% \rowcolor{Gray}
% {DAT-FGSM (Warmup)} & $6\times 6$  & $6 \times 512$ & xx  %\decrease{(4.38)}
% & xxx
% %\increase{(1.1)}
% & xxx  & xxx\\
% \hline
\hline
\end{tabular}}
\end{threeparttable}
\end{center}
% \vspace*{-1mm}
\end{table}

\paragraph{Training setting.}
Unless specified otherwise, we choose the training perturbation size    $\epsilon = 8/255$ and $2/255$  for  CIFAR and ImageNet, respectively; recall that $\epsilon$ was defined in \eqref{eq: adv_train}. We also choose 
$10$ steps  and $4$ steps for PGD attack generation in DAT (and its variants) under CIFAR and ImageNet, respectively. The number of training epochs is given by $100$  for CIFAR-10 and $30$ for ImageNet. Such   training settings are consistent with previous state-of-the-art \citep{zhang2019you,Wong2020Fast}.  
To implement DAT-FGSM, we find that the use of  cyclic learning rate suggested by Fast AT \citep{Wong2020Fast} becomes over-sensitive to the increase of batch size; see Appendix\,\ref{app: add_results}. Thus, we   adopt  the standard piecewise decay step size   and an early-stop strategy \citep{rice2020overfitting} in DAT. 
%We refer readers to {Appendix\,\ref{app: train_setting} [for ImageNet]}  for more implementation details.


% The number of training epochs is given by $100$  for CIFAR-10 and $30$ for ImageNet.
% Note that AT or Fast AT
% %the adversarially robust deep learning
% could be sensitive to the step size (learning rate) choice \citep{rice2020overfitting}. For example, the use of a cyclic learning rate trick can further accelerate  the Fast  AT algorithm \citep{Wong2020Fast}. However, such a trick   becomes less effective
% when the batch size becomes   larger: see Appendix\,\ref{app: add_results}. Meanwhile,  
% %the sensitivity of  adversarially model training to   step size  
% such a sensitivity
% can be mitigated 
% by using   early-stop  remedy  due to the existence of   robust overfitting  %in adversarially model training  
% \citep{rice2020overfitting}. Spurred by that, we   use  the standard piecewise decay step size   and an early-stop strategy during robust training.
% % Note that the FGSM based algorithms  require an early stop to achieve the best robustness accuracy 
% % %\LEE{First time RA is used, think you need to spell it out as robust test accuracy...} 
% % \citep{Wong2020Fast}. 
% %We  training over CIFAR-10 and ImageNet is conducted in $100$ and $30$ epochs, respectively. 
% We refer readers to {Appendix\,\ref{app: train_setting}}  for more implementation details.



% \begin{wraptable}{r}{95mm}
%  \vspace{-9mm}
% \begin{center}
% \caption{\small{Overall performance of DAT  (in gray color), compared with baselines, in TA (\%), RA (\%), communication time per epoch (seconds), and total training time (including communication time)  per epoch (seconds). For brevity, `$p \times q$' represents `\# nodes $\times$  \# GPUs per node', 
% `Comm.' represents communication cost, and `Tr. Time' represents training time.
% \mycomment{In the columns `TA' and `RA', we present the relative \increase{improvement} (\%)  or \decrease{degeneration} (\%) upon   the performance of AT (first row). In the columns `Comm.' and `Tr. time', we {highlight} the \high{worst} communication and training time used in  \textit{distributed} setting.} 
% %SL{we need to show relative improvement compared to AT and Fast AT, like fine-tuning example.}
% }
% %All the training methods are conducted in \SL{xxx [@Gaoyuan]} epochs .
% } 
% % \vspace{-2.5mm}
% \label{table: overall}
% \begin{threeparttable}
% \resizebox{0.68\textwidth}{!}{
% \begin{tabular}{c|c|c|c|c|c|c}
% \hline
% \hline
% \multirow{2}{*}{\begin{tabular}[c]{@{}c@{}}Method\end{tabular}} & \multicolumn{6}{c}{\textbf{CIFAR-10, ResNet-18}} \\ 
% \cline{2-7}  & \begin{tabular}[c]{@{}c@{}}
% $p \times q$
% %Nodes $\times$\\GPUs per node
% \end{tabular}
% & Batch size
% & TA (\%) & RA (\%) & \begin{tabular}[c]{@{}c@{}}Comm. 
% %\\per epoch (s)
% \end{tabular} & \begin{tabular}[c]{@{}c@{}}
% Tr. time
% %Training time\\per epoch (s)
% \end{tabular}
%  \\ \hline
% AT & $1\times 1$ & $2048$ & 82.94 & 38.54 & NA & {218}  \\
% {Fast AT} & $1\times 1$  & $2048$ & 81.58 % \decrease{(1.36)}
% & 38.34  %\decrease{(0.20)}
% & NA & 52  \\
% \mycomment{ 
% {DAT-PGD w/o LALR} & $6\times 1$  & {$2048$} & 82.02 \decrease{(0.92)}  & 38.18 \decrease{(0.36)} & \high{40.9}  & \high{87}\\
% }
% \mycomment{
% DAT-PGD w/o LALR & $6\times 1$  & $6 \times 2048$ &  69.30  \decrease{(13.64)} %% 74.29
% & 33.86 \decrease{(4.68)} & 8.5  & 42\\}
% DAT-PGD w/o LALR & $18\times 1$  & $18 \times 2048$ &  55.59   %% 74.29
% & 26.83    & 3.4  & 22\\
% \mycomment{
% DAT-FGSM w/o LALR & $6\times 1$  & $6 \times 2048$ & 64.46 \decrease{(18.48)} & 33.96 \decrease{(4.58)}  & 8.5  & 14\\}
% DAT-FGSM w/o LALR & $18\times 1$  & $18 \times 2048$ & 52.35   & 28.90 
% & 3.1  & 8\\
% {DAT-LSGD} & $18\times 1$  &   $18 \times 2048$ & 64.15  & 34.12  
% & 3.2 & 22  \\
% \mycomment{\rowcolor{Gray}
% {DAT-PGD} & $6\times 1$  & $6 \times 2048$ & 80.38 \decrease{(2.56)}  & 38.94 \increase{(0.40)}  & 8.5  & 42\\}
% \rowcolor{Gray}
% {DAT-PGD} & $18\times 1$  & $18 \times 2048$ & 80.28   & 38.44
% & 3.4  & 22\\
% \mycomment{\rowcolor{Gray}
% {DAT-FGSM} & $6\times 1$  & $6 \times 2048$ & 75.58   \decrease{(7.36)} & 40.92
% \increase{(2.38)}
% & 8.5  & 14\\}
% \rowcolor{Gray}
% {DAT-FGSM} & $18\times 1$  & $18 \times 2048$ & 73.42  & 38.55
% & 3.1  & 8\\
% \rowcolor{Gray}
% {DAT-FGSM} & $24\times 1$  & $24 \times 2048$ & 72.76 & 39.82
% & 2.0  & 5\\
% % \hline
% %   & \multicolumn{6}{c}{\textbf{CIFAR-10, ResNet-50} \SL{Appendix}} 
% % %   \\ 
% % % \cline{2-7}  &  \begin{tabular}[c]{@{}c@{}}GPUs per node\\ $\times$ nodes\end{tabular}
% % % & Batch size
% % % & TA (\%) & RA (\%) & \begin{tabular}[c]{@{}c@{}}Communication\\per epoch (s)\end{tabular} & \begin{tabular}[c]{@{}c@{}}Computation\\per epoch (s)\end{tabular}
% %  \\ \hline
% % AT & $1\times 1$  & $256$ &85.94   & 43.06 & NA &  894  \\
% % {Fast AT} &$1\times 1$ & $256$ & 75.28 & 40.48 & NA & 288  \\
% % DAT-PGD w/o LALR & $6\times 1$ & $6 \times 256$ & 74.45 & 33.35 & 68  & 236\\
% % \rowcolor{Gray}
% % DAT-PGD & $6\times 1$  & $6 \times 256$ & 84.79 & 42.16 & 68  & 236\\
% % \rowcolor{Gray}
% % DAT-FGSM & $6\times 1$  & $6 \times 256$ & 75.72 & 40.09 & 68  & 116\\
% \hline
%   & \multicolumn{6}{c}{\textbf{ImageNet, ResNet-50}} \\
%   \hline
%   AT & $1\times 6$  & $512$ & 62.70 & 40.38 & NA & 6022  \\
% {Fast AT} & $1\times 6$  & $512$ & 58.99  % \decrease{(3.71)}
% & 40.78 % \increase{(0.4)}
% & NA & 1544  \\
% \mycomment{{DAT-PGD w/o LALR} & $6\times 6$  & $512$ & 62.36 %\decrease{(0.34)}
% %65.06 %\SL{check?} 
% &  %39.28
% 39.86 % \decrease{(0.52)}
% & \high{4324}  & \high{5663}\\}
% DAT-PGD w/o LALR & $6\times 6$  & $6 \times 512$ & 
% 57.09   %\decrease{(5.61)}
% %60.09 
% %\SL{check?} 
% &  34.02 %35.02 
% %\decrease{(6.36)}
% & 865  & 1932\\
% {DAT-FGSM w/o LALR} & $6\times 6$  & $6 \times 512$ & 55.04 %\decrease{(7.66)}
% %57.04 \SL{check?} 
% &  35.03 %39.03 \SL{check?} 
% %\decrease{(5.35)}
% & 863  & 1080\\
% \rowcolor{Gray}
% {DAT-PGD} & $6\times 6$  & $6 \times 512$ & 63.75 %\increase{(1.05)}
% & 38.45 
% %\decrease{(1.93)}
% & 898  & 1960\\
% \rowcolor{Gray}
% {DAT-FGSM} & $6\times 6$  & $6 \times 512$ & 58.32  %\decrease{(4.38)}
% & 41.48
% %\increase{(1.1)}
% & 859  & 1109\\
% \hline
% % \rowcolor{Gray}
% % {DAT-FGSM (Warmup)} & $6\times 6$  & $6 \times 512$ & xx  %\decrease{(4.38)}
% % & xxx
% % %\increase{(1.1)}
% % & xxx  & xxx\\
% % \hline
% \hline
% \end{tabular}}
% \end{threeparttable}
% \end{center}
% \vspace{-5mm}
% \end{wraptable}

%\end{wraptable}
\paragraph{Evaluation setting.}
Unless specified otherwise, we report \underline{r}obust test \underline{a}ccuracy (\textbf{RA}) of a learned model against PGD attacks \citep{madry2017towards}.
%{and C\&W attack \citep{carlini2017towards}}. % for consistency with baselines
Unless specified otherwise, the perturbation size used in evaluation is set equal to the training $\epsilon$, and the number of PGD steps is set to $20$ and $10$ for CIFAR and ImageNet, respectively. We also measure RA against AutoAttack, and the resulting robust accuracy is denoted by \textbf{AA}.
Further, we   measure the standard \underline{t}est \underline{a}ccuracy (\textbf{TA}) of a model against  normal examples.
{All experiments are run   $3$ times with different random seeds, and the mean metrics are reported.}
We consider three different communication protocols: Ring-AllReduce (with one-sided quantization), parameter-server (with double quantization), and a high-performance computing (HPC) setting (without quantization).
To measure the communication time, we use the \textsc{torch.distributed} package with gloo and nccl as communication backends.\footnote{\url{https://pytorch.org/docs/stable/distributed.html}}
We then measure the time of required worker-server communications per epoch.
%We use a time module to measure communication time with  \textsc{torch.distributed.barrier} to synchronize all processes on each node.
 



% \begin{table}[b]
% %\begin{wraptable}{r}{95mm}
% \vspace*{-7mm}
% \begin{center}
% \caption{\small{DAT  (in gray color) on (ImageNet, ResNet-50), compared with baselines, in TA (\%), RA (\%), AA (\%), communication time per epoch (seconds), and total training time (including communication time)  per epoch (seconds). For brevity, `$p \times q$' represents `\# nodes $\times$  \# GPUs per node', 
% `C' represents communication cost, and `T' represents training time in seconds.  
% %\SL{AA}
% %SL{we need to show relative improvement compared to AT and Fast AT, like fine-tuning example.}
% }
% %All the training methods are conducted in \SL{xxx [@Gaoyuan]} epochs .
% } 
% % \vspace{-2.5mm}
% \label{table: overall}
% \begin{threeparttable}
% \resizebox{0.5\textwidth}{!}{
% \begin{tabular}{c|c|c|c|c|c|c|c}
% \hline
% \hline
% \multirow{2}{*}{\begin{tabular}[c]{@{}c@{}}Method\end{tabular}} & \multicolumn{6}{c}{\textbf{ImageNet, ResNet-50}} \\ 
% \cline{2-8}  & \begin{tabular}[c]{@{}c@{}}
% $p \times q$
% %Nodes $\times$\\GPUs per node
% \end{tabular}
% & Batch size
% & TA (\%) & RA (\%) & AA (\%) & \begin{tabular}[c]{@{}c@{}}Comm. 
% %\\per epoch (s)
% \end{tabular} & \begin{tabular}[c]{@{}c@{}}
% Tr. time (s)
% %Training time\\per epoch (s)
% \end{tabular}
%  \\ \hline
% \mycomment{AT & $1\times 1$ & $2048$ & 82.94 & 38.54 & NA & {218}  \\
% {Fast AT} & $1\times 1$  & $2048$ & 81.58 % \decrease{(1.36)}
% & 38.34  %\decrease{(0.20)}
% & NA & 52  \\
% \mycomment{ 
% {DAT-PGD w/o LALR} & $6\times 1$  & {$2048$} & 82.02 \decrease{(0.92)}  & 38.18 \decrease{(0.36)} & \high{40.9}  & \high{87}\\
% }
% \mycomment{
% DAT-PGD w/o LALR & $6\times 1$  & $6 \times 2048$ &  69.30  \decrease{(13.64)} %% 74.29
% & 33.86 \decrease{(4.68)} & 8.5  & 42\\}
% DAT-PGD w/o LALR & $18\times 1$  & $18 \times 2048$ &  55.59   %% 74.29
% & 26.83    & 3.4  & 22\\
% \mycomment{
% DAT-FGSM w/o LALR & $6\times 1$  & $6 \times 2048$ & 64.46 \decrease{(18.48)} & 33.96 \decrease{(4.58)}  & 8.5  & 14\\}
% DAT-FGSM w/o LALR & $18\times 1$  & $18 \times 2048$ & 52.35   & 28.90 
% & 3.1  & 8\\
% {DAT-LSGD} & $18\times 1$  &   $18 \times 2048$ & 64.15  & 34.12  
% & 3.2 & 22  \\
% \mycomment{\rowcolor{Gray}
% {DAT-PGD} & $6\times 1$  & $6 \times 2048$ & 80.38 \decrease{(2.56)}  & 38.94 \increase{(0.40)}  & 8.5  & 42\\}
% \rowcolor{Gray}
% {DAT-PGD} & $18\times 1$  & $18 \times 2048$ & 80.28   & 38.44
% & 3.4  & 22\\
% \mycomment{\rowcolor{Gray}
% {DAT-FGSM} & $6\times 1$  & $6 \times 2048$ & 75.58   \decrease{(7.36)} & 40.92
% \increase{(2.38)}
% & 8.5  & 14\\}
% \rowcolor{Gray}
% {DAT-FGSM} & $18\times 1$  & $18 \times 2048$ & 73.42  & 38.55
% & 3.1  & 8\\
% \rowcolor{Gray}
% {DAT-FGSM} & $24\times 1$  & $24 \times 2048$ & 72.76 & 39.82
% & 2.0  & 5\\
% % \hline
% %   & \multicolumn{6}{c}{\textbf{CIFAR-10, ResNet-50} \SL{Appendix}} 
% % %   \\ 
% % % \cline{2-7}  &  \begin{tabular}[c]{@{}c@{}}GPUs per node\\ $\times$ nodes\end{tabular}
% % % & Batch size
% % % & TA (\%) & RA (\%) & \begin{tabular}[c]{@{}c@{}}Communication\\per epoch (s)\end{tabular} & \begin{tabular}[c]{@{}c@{}}Computation\\per epoch (s)\end{tabular}
% %  \\ \hline
% % AT & $1\times 1$  & $256$ &85.94   & 43.06 & NA &  894  \\
% % {Fast AT} &$1\times 1$ & $256$ & 75.28 & 40.48 & NA & 288  \\
% % DAT-PGD w/o LALR & $6\times 1$ & $6 \times 256$ & 74.45 & 33.35 & 68  & 236\\
% % \rowcolor{Gray}
% % DAT-PGD & $6\times 1$  & $6 \times 256$ & 84.79 & 42.16 & 68  & 236\\
% % \rowcolor{Gray}
% % DAT-FGSM & $6\times 1$  & $6 \times 256$ & 75.72 & 40.09 & 68  & 116\\
% \hline
%   & \multicolumn{6}{c}{\textbf{ImageNet, ResNet-50}} \\
%   \hline}
%   AT & $1\times 6$  & $512$ & 62.70 & 40.38 & 37.46 & NA & 6022  \\
% {Fast AT} & $1\times 6$  & $512$ & 58.99  % \decrease{(3.71)}
% & 40.78 % \increase{(0.4)}
% & 37.18 & NA & 1544  \\
% % \mycomment{{DAT-PGD w/o LALR} & $6\times 6$  & $512$ & 62.36 %\decrease{(0.34)}
% % %65.06 %\SL{check?} 
% % &  %39.28
% % 39.86 % \decrease{(0.52)}
% % & \high{4324}  & \high{5663}\\}
% DAT-PGD w/o LALR & $6\times 6$  & $6 \times 512$ & 
% 57.09   %\decrease{(5.61)}
% %60.09 
% %\SL{check?} 
% &  34.02 %35.02 
% %\decrease{(6.36)}
% & 30.98
% & 865  & 1932\\
% {DAT-FGSM w/o LALR} & $6\times 6$  & $6 \times 512$ & 55.04 %\decrease{(7.66)}
% %57.04 \SL{check?} 
% &  35.03 %39.03 \SL{check?} 
% %\decrease{(5.35)}
% & 32.16
% & 863  & 1080\\
% \rowcolor{Gray}
% {DAT-PGD} & $6\times 6$  & $6 \times 512$ & 63.75 %\increase{(1.05)}
% & 38.45 
% %\decrease{(1.93)}
% & 36.04
% & 898  & 1960\\
% \rowcolor{Gray}
% {DAT-FGSM} & $6\times 6$  & $6 \times 512$ & 58.32  %\decrease{(4.38)}
% & 41.48
% %\increase{(1.1)}
% & 37.32
% & 859  & 1109\\
% \hline
% % \rowcolor{Gray}
% % {DAT-FGSM (Warmup)} & $6\times 6$  & $6 \times 512$ & xx  %\decrease{(4.38)}
% % & xxx
% % %\increase{(1.1)}
% % & xxx  & xxx\\
% % \hline
% \hline
% \end{tabular}}
% \end{threeparttable}
% \end{center}
% \vspace*{-5mm}
% \end{table}

% \begin{table*}[htb]
% %\begin{wraptable}{r}{95mm}
% \vspace*{-1mm}
% \begin{center}
% \caption{\small{DAT  (in gray color) on (ImageNet, ResNet-50), compared with baselines, in TA (\%), RA (\%), AA (\%), communication time per epoch (seconds), and total training time (including communication time)  per epoch (seconds). For brevity, `$p \times q$' represents `\# nodes $\times$  \# GPUs per node', 
% `C' represents communication time in seconds, and `T' represents training time in seconds. 
% \SL{[mixed-precision/Fast-AT] [Question on DAT-FGSM performance over multiple trials.]}
% %\SL{AA}
% %SL{we need to show relative improvement compared to AT and Fast AT, like fine-tuning example.}
% }
% %All the training methods are conducted in \SL{xxx [@Gaoyuan]} epochs .
% } 
% % \vspace{-2.5mm}
% \label{table: overall}
% \begin{threeparttable}
% \resizebox{0.47\textwidth}{!}{
% \begin{tabular}{c|c|c|c|c|c|c|c}
% \hline
% \hline
% \multirow{2}{*}{\begin{tabular}[c]{@{}c@{}}Method\end{tabular}} & \multicolumn{6}{c}{\textbf{ImageNet, ResNet-50}} \\ 
% \cline{2-8}  & \begin{tabular}[c]{@{}c@{}}
% $p \times q$
% %Nodes $\times$\\GPUs per node
% \end{tabular}
% & Batch size
% & TA (\%) & RA (\%) & AA (\%) & \begin{tabular}[c]{@{}c@{}}C (s)
% %\\per epoch (s)
% \end{tabular} & \begin{tabular}[c]{@{}c@{}}
% T  (s)
% %Training time\\per epoch (s)
% \end{tabular}
%  \\ \hline
%   AT & $1\times 6$  & $512$ & 62.70 & 40.38 & 37.46 & NA & 6022  \\
% % \mycomment{{DAT-PGD w/o LALR} & $6\times 6$  & $512$ & 62.36 %\decrease{(0.34)}
% % %65.06 %\SL{check?} 
% % &  %39.28
% % 39.86 % \decrease{(0.52)}
% % & \high{4324}  & \high{5663}\\}
% DAT-PGD w/o LALR & $6\times 6$  & $6 \times 512$ & 
% 57.09   %\decrease{(5.61)}
% %60.09 
% %\SL{check?} 
% &  34.02 %35.02 
% %\decrease{(6.36)}
% & 30.98
% & 865  & 1932\\
% \rowcolor{Gray}
% {DAT-PGD} & $6\times 6$  & $6 \times 512$ & 63.75 %\increase{(1.05)}
% & 38.45 
% %\decrease{(1.93)}
% & 36.04
% & 898  & 1960\\
% {Fast AT} & $1\times 6$  & $512$ & 58.99  % \decrease{(3.71)}
% & 40.78 % \increase{(0.4)}
% & 37.18 & NA & 1544  \\
% {DAT-FGSM w/o LALR} & $6\times 6$  & $6 \times 512$ & 55.04 %\decrease{(7.66)}
% %57.04 \SL{check?} 
% &  35.03 %39.03 \SL{check?} 
% %\decrease{(5.35)}
% & 32.16
% & 863  & 1080\\
% \rowcolor{Gray}
% {DAT-FGSM} & $6\times 6$  & $6 \times 512$ & 58.32  %\decrease{(4.38)}
% & 41.48
% %\increase{(1.1)}
% & 37.32
% & 859  & 1109\\
% \hline
% % \rowcolor{Gray}
% % {DAT-FGSM (Warmup)} & $6\times 6$  & $6 \times 512$ & xx  %\decrease{(4.38)}
% % & xxx
% % %\increase{(1.1)}
% % & xxx  & xxx\\
% % \hline
% \hline
% \end{tabular}}
% \end{threeparttable}
% \end{center}
% \vspace*{-5mm}
% \end{table*}

\subsection{Experiment results}

\paragraph{Adversarial training on ImageNet.}
In Table\,\ref{table: overall}, we show an overall performance comparison on ImageNet between our 
proposed DAT variants and baselines 
%, we evaluate  our proposals and baselines 
in TA, RA, communication and computation efficiency. 
Note that AT and Fast AT are centralized training baselines using   the same number of epochs as distributed training.
\textbf{First}, we observe that the direct extension from AT (or Fast AT) to its distributed counterpart (namely, DAT-PGD w/o LALR or DAT-FGSM w/o LALR) leads to a large degradation of both RA and TA. 
% For example, 
% as the 
% $6$ times  larger   batch size is used, DAT-PGD w/o LALR yields more than $5\%$ drop in TA and $6\%$ drop in RA compared to the best centralized AT using a smaller batch size.
  \textbf{Second},
DAT-PGD (or DAT-FGSM) is able to achieve  competitive      performance to AT (or Fast AT)
%(e.g., within {$1.93\%$} variation  in RA) 
and enables a graceful training speedup.
   %  ($3$ times using $6$ machines    ImageNet). 
In practice, DAT is not able to achieve linear speed-up %(e.g., $3$ times using $6$ machines) 
mainly because of the communication cost. For example, when comparing the computation time of DAT-PGD (batch size $6 \times 512$) with that of AT (batch size $512$), the computation speed-up (by excluding the communication cost) is given by $(6022)/(1960-898) = 5.67$, consistent with the ideal computation gain using $6\times$ larger  batch size in DAT-PGD.
\textbf{Third},  when comparing DAT-FGSM with DAT-PGD, we   observe that the former leads to 
%a robustness improvement   (around 3\% in PGD attack and 1\% in AA), but  it is accompanied by 
a larger loss in standard accuracy (around 5\%).
% the former    introduces a    TA loss, %This phenomenon also holds for Fast AT versus AT, 
%   e.g., $0.4\%$ RA improvement vs. $3.71\%$ TA drop for ImageNet.
  Moreover, similar to Fast AT, we observe a larger RA variance for DAT-FGSM (around 1.5\%) than for DAT-PGD (around 0.5\%). Thus, FGSM-based training is less stable than PGD-based training, consistent with \citet{li2020towards}.

%   we found that similar to Fast ATT-FGSM may also suffer  the issue of robustness forgetting  \citep{andriushchenko2020understanding} when large perturbation radii are used for training and evaluation (see Figure\,\SL{xxx}).


   %   \begin{wrapfigure}{r}{55mm}
     \begin{figure}[htb]
    % \vspace*{-1mm}
\centerline{
\begin{tabular}{c}
%\hspace*{-0.1in} %\includegraphics[width=.29\textwidth,height=!]{Figures/plotf1.pdf} & \hspace*{-0.2in}
\includegraphics[width=.39\textwidth,height=!]{Figures/DAT_LSGD_ImageNet.pdf}  
%\\
%\footnotesize{(a)} &   \footnotesize{(b)}
\end{tabular}}
% \vspace*{-3mm}
\caption{\small{{{TA/RA comparison between     {DAT-FGSM} and    DAT-LSGD   vs. node-GPU configurations on (ImageNet, ResNet-50).}}
% Here the fine-tuning is conducted by AT at a single computing node, and the target datasets are CIFAR-10 and CIFAR-100.
%   Accuracy (TA or RA)  of the fine-tuned model is compared to that of    the end-to-end  training at CIFAR by DAT-PGD from scratch using  the same number of epochs   as the  fine-tuning. \SL{Update legends: pre-training on ImageNet, no pre-training}
%(a) Fine-tuning over CIFAR-10. (b):  Fine-tuning over CIFAR-100.
% pre-trained xxx \SL{[model name]} over dataset $\mathcal A$ using DAT. Left: RA against PGD attacks of different perturbation sizes during testing. Right:  RA against PGD attacks of different steps during testing.
}}
  \label{fig: scalability}
%  \vspace*{-1mm}
%\end{wrapfigure}
\end{figure}

\begin{figure}[htb]
\centerline{
\begin{tabular}{cc}
%\hspace*{-1mm}
\includegraphics[width=.23\textwidth,height=!]{Figures/pltfigure10imagenet.pdf} &
\includegraphics[width=.23\textwidth,height=!]{Figures/pltfigure11imagenet.pdf}\\
 (a) & (b)
%\\
%\footnotesize{(a)} &   \footnotesize{(b)}
\end{tabular}}
% \vspace*{-3mm}
\caption{{RA against 
PGD attacks for    models
 trained by DAT-PGD, DAT-FGSM, and AT  following (ImageNet, ResNet-50) in Table\,\ref{table: overall}.
  %using $6 \times 1$ computing resources and $2048 \times 6$ batch size.
(a) RA versus different perturbation sizes (over the divisor $255$). (b)  RA versus different steps.
%\SL{Update legends: AT, DAT-PGD, DAT-FGSM}
}}
% \vspace*{-1mm}
\label{fig: robust_PGD_imagenet}
% \vspace*{-3mm}
%\end{wrapfigure}
\end{figure}


  In Figure\,\ref{fig: scalability}, we further compare our proposed DAT with the  DAT-LSGD baseline \citep{xie2019feature} in terms of TA/RA versus the number of computing nodes. Clearly, our approach scales more gracefully  than the baseline, without losing much performance as the batch size increases along with the number of computing nodes.
  It is worth noting that
  our DAT setup is more challenging than that of \citet{xie2019feature}, which used 128 GPUs with a per-GPU batch size of 32. By contrast, although we only use 36 GPUs, the per-GPU batch size is 85. The use of a larger batch size per GPU makes distributed robust training useful when having access to a limited computing budget.
Besides, \citet{xie2019feature} used PGD attack generation with a quite large number of attack steps (30) at training time. This makes the computation time dominate the node-wise communication time. However, we used a smaller number of PGD attack steps. In this scenario, the communication time cannot be neglected and prevents the practical distributed implementation from achieving the linear speed-up. For example, when comparing the computation time of DAT-PGD with that of AT in Table\,\ref{table: overall}, the computation speed-up (by excluding   communication cost) is given by $6022/(1960-898) =  5.67 $, close to the linear rate (6$\times$).

%\subsection*{Results}
% \paragraph{Overall performance of DAT}
% %  in TA, RA, and communication and computation costs
% Table\,\ref{table: overall} and Figure\,\ref{fig: scalability} provide an overall performance comparison on ImageNet between our 
% proposed DAT variants and baselines 
% %, we evaluate  our proposals and baselines 
% in TA, RA, communication and computation efficiency. 
% Note that AT and Fast AT are centralized training methods in single node   under the same number of epochs as distributed training.

% %\SL{Additional results are presented in Appendix\,\ref{app: overall-cifar10}.}



%   % \begin{wrapfigure}{r}{80mm}
% %\end{figure}
% In Table\,\ref{table: overall},
% we observe that the direct extension from AT (or Fast AT) to its distributed counterpart (namely, DAT-PGD w/o LALR or DAT-FGSM w/o LALR) leads to a degradation of both RA and TA.
% %leads to significantly poor TA and RA.
% % is far from satisfactory in 
% % %both small-batch and 
% % the
% % large-batch setting. 
% % \mycomment{As we can see, if the batch size of DAT-PGD w/o LALR is set the \textit{same} as AT,  then there is \textit{no} significant speedup in training  (see highlights by purple color) although the resulting TA and RA  are comparable to those of AT.} 
% % speedup is less significant than the proposed  DAT-PGD algorithm (around \Gaoyuan{$2$} times  faster under CIFAR-10 and \Gaoyuan{almost same under ImageNet}),
% %meanwhile the communication cost increases by nearly \Gaoyuan{5} times. 
% As the 
% $6$ times  larger   batch size is used, DAT-PGD w/o LALR yields more than $5\%$ drop in TA and $6\%$ drop in RA compared to the best centralized AT using a smaller batch size.
% % a large   drop in both TA and RA, e.g., more than $10\%$   drop  is observed in  CIFAR-10 when $18$ nodes are used. 
% % We find that
% %   the performance of DAT-PGD w/o LALR    rapidly degrades as the number of computing nodes increases.
%   The similar conclusion holds for  DAT-FGSM w/o LALR vs. Fast AT.
%   When comparing DAT-FGSM with DAT-PGD, we   observe that the former is capable of offering satisfactory (and even better) RA, but inevitably introduces a    TA loss. This phenomenon also holds for Fast AT versus AT, e.g., $0.4\%$ RA improvement vs. $3.71\%$ TA drop for ImageNet. 




% Moreover, as shown in Table\,\ref{table: overall}, 
% DAT-PGD (or DAT-FGSM) is able to achieve  competitive      performance to AT (or Fast AT)
% %(e.g., within {$1.93\%$} variation  in RA) 
% and enables a graceful training speedup
%      ($3$ times using $6$ machines    ImageNet). 
% In practice, DAT is not able to achieve linear speed-up mainly because of the communication cost. For example, when comparing the computation time of DAT-PGD (batch size $6 \times 512$) with that of AT (batch size $512$), the computation speed-up (by excluding the communication cost) is given by $(6022)/(1960-898) = 5.67$, consistent with the ideal computation gain using $6\times$ larger  batch size in DAT-PGD.




% %  \revision{We also note that 
% %  the per-epoch communication time decreases when the  more GPU machines ($24$) are used, since a larger batch size allows a smaller number of iterations per epoch, leading to less frequent communications  among machines. %Thus, the total communication cost per epoch, given by the number of iterations multiplying the communication cost per iteration, decreases.
% %  }
% %  In   {Appendix\,\ref{app: overall-cifar10}}, we present   additional results on CIFAR-10 using ResNet-50.
% %   In Figure\,\ref{fig: scalability},
% % we observe that DAT-PGD outperforms DAT-LSGD \citep{xie2019feature}
% % %with $16.13\%$ and $4.32\%$ improvement  
% % in both TA and RA. 
%   %Note that   we do not impose any additional warm-up learning strategy in both methods for fair comparison.
  
%       \begin{wrapfigure}{r}{55mm}
%   %  \begin{figure}[htb]
%     \vspace*{-5mm}
% \centerline{
% \begin{tabular}{c}
% %\hspace*{-0.1in} %\includegraphics[width=.29\textwidth,height=!]{Figures/plotf1.pdf} & \hspace*{-0.2in}
% \includegraphics[width=.4\textwidth,height=!]{Figures/DAT_LSGD_ImageNet.pdf}  
% %\\
% %\footnotesize{(a)} &   \footnotesize{(b)}
% \end{tabular}}
% \vspace*{-0.1in}
% \caption{\small{{{TA/RA comparison between     {DAT-FGSM} and    DAT-LSGD   vs. node-GPU configurations on (ImageNet, ResNet-50).}}
% % Here the fine-tuning is conducted by AT at a single computing node, and the target datasets are CIFAR-10 and CIFAR-100.
% %   Accuracy (TA or RA)  of the fine-tuned model is compared to that of    the end-to-end  training at CIFAR by DAT-PGD from scratch using  the same number of epochs   as the  fine-tuning. \SL{Update legends: pre-training on ImageNet, no pre-training}
% %(a) Fine-tuning over CIFAR-10. (b):  Fine-tuning over CIFAR-100.
% % pre-trained xxx \SL{[model name]} over dataset $\mathcal A$ using DAT. Left: RA against PGD attacks of different perturbation sizes during testing. Right:  RA against PGD attacks of different steps during testing.
% }}
%   \label{fig: scalability}
%  \vspace*{-5mm}
% \end{wrapfigure}
%   In Figure\,\ref{fig: scalability}, we further compare our proposed DAT with the  DAT-LSGD baseline \citep{xie2019feature} in terms of TA/RA versus the number of computing nodes. Clearly, our approach scales more gracefully  than the baseline, without losing much performance as the batch sizes increases along with the number of computing nodes.

  
% %   \SL{[Note that warm-up is used in xx. If further adding warm-up around additional $2\%$ improvement. Improvement independent of warm-up.
% %   ]}

% % \mycomment{as the $6$ or
% % $18$ times  larger   batch size is used, DAT-PGD w/o LALR then leads to a large   drop in both TA and RA, e.g., more than $10\%$   drop  is observed in  CIFAR-10 when $18$ nodes are used. This implies that the performance of DAT-PGD w/o LALR    rapidly degrades as the number of computing nodes increases.
% % %Additional results in Appendix\,\ref{app:RA_nodes}  show that as the number of computing nodes increases  (along with the batch size increases),  the performance of DAT-PGD w/o LALR    rapidly degrades. 
% % Similar observations can be made from Table\,\ref{table: overall}  for DAT-FGSM w/o LALR versus Fast AT and DAT-FGSM.
% % %(roughly $6\%$ drop for CIFAR-10 and $10\%$ drop for ImageNet) compared with AT.
% % }


% % Furthermore, we observe that when the largest batch size ($24 \times 2048$) is used,   DAT-FGSM takes only ($500$ seconds) to obtain satisfactory robustness. %in contrast  to its centralized baseline Fast AT  under CIFAR-10.
 
 
%  %we further show TA/RA of DAT-FGSM against the   number of computing nodes $p \in \{ 6, 12, 24, 36\}$ for CIFAR-10  in Figure\,xxx.  
% %\begin{wrapfigure}{r}{79mm}
% %\vspace*{-5mm}
 
% %in the large-batch setting.
% % In CIFAR-10 experiments, we also conduct DAT-PGD using $18$ times larger   batch size, {and obtain $10$ times speedup with just $0.1\%$ performance variation.}
% % and  yields quiet competitive  performance compared to AT.
% % given the same per-node GPU resources, 
% % DAT-PGD and DAT-FGSM supports the largest batch size and achieves competitive      performance. 
% %For example, DAT-PGD achieves training speedup by $10$ times using $18$ machines without losing robustness. 
% %enable  the use of $6$ times larger   batch size  and  yield quiet competitive  performance. They
% \mycomment{require the least number of iterations  and, compared to AT, accelerate the training by \SL{10} times for CIFAR-10 and \SL{xxx} times for ImageNet.} 
% % However, the direct extension from AT  to DAT  (without using LALR) in the large-batch setting  
% %   leads to a large performance drop in both TA and RA compared with AT (roughly $6\%$ drop for CIFAR-10 and $10\%$ drop for ImageNet). By contrast, the proposed DAT and Fast DAT (with use of LALR)  can match   and even exceed the performance of centralized baselines.
% % Second, since DAT and Fast DAT enable increasing    batch size (leading to   reduced number of iterations), we achieve around $5$ times 
% % speedup (in terms of computation time) by $6$ times computing resources. Besides computation cost,  distributed training introduces a certain amount of communication cost. It will be shown in Table\,\ref{table: quadtization} that the communication overhead can be  reduced by resorting to gradient quantization.   \SL{We also note that if the communication infrastructure xxx is used, then the communication cost becomes negligible  to the computation cost. @Gaoyuan.}
% %when  larger batch sizes are used (corresponding to  less iterations given the same number of epochs).

 








% %\vspace*{-0.1in}
% \paragraph{Robustness against perturbation sizes and attack steps}
In Figure\,\ref{fig: robust_PGD_imagenet}, we evaluate the performance of DAT  against   PGD attacks of different steps
and perturbation sizes (i.e., values of $\epsilon$).
We  observe that DAT matches  robust accuracies of standard AT even against PGD attacks at different values of $\epsilon$ and steps. 
%Specifically,  DAT has slightly smaller   RA than AT when facing  weak  PGD attacks with $\epsilon$ less than $(5/255)$ and steps less than $5$. Moreover, 
% We also note that although DAT-FGSM has the worst TA ($\epsilon = 0$), it yields slightly better robustness as attack steps increase. 
%The similar results can be found in Appendix\,\ref{app:RA_pgd} for (CIFAR-10, ResNet-18) against PGD (and C\&W) attacks.

 

% \mycomment{
%   \begin{figure}[htb]
%     \vspace*{-0.0in}
% \centerline{
% \begin{tabular}{cc}
% \includegraphics[width=.35\textwidth,height=!]{Figures/pltfigure10imagenet.pdf}  &
% \includegraphics[width=.35\textwidth,height=!]{Figures/pltfigure11imagenet.pdf}
% %\\
% %\footnotesize{(a)} &   \footnotesize{(b)}
% \end{tabular}}
% \caption{\footnotesize{RA against 
% PGD attacks for   the model
%  trained by DAT-PGD,     DAT-FGSM, and AT  following  (ImageNet, ResNet-50) of Table\,\ref{table: overall}.
%   %using $6 \times 1$ computing resources and $2048 \times 6$ batch size.
% (Left) RA against PGD attacks with different perturbation sizes (over the divisor $255$). (Right)  RA against PGD attacks with different steps.
% %\SL{Update legends: AT, DAT-PGD, DAT-FGSM}
% }}
%   \label{fig: robust_PGD_imagenet}
%   \vspace*{-0.00in}
% \end{figure}
% }


\paragraph{{Adversarially
trained smooth classifier}.}
DAT also provides us an effective way to speed up the smooth adversarial training (Smooth-AT) \citep{salman2019provably}  for certified robustness. 
Different from AT, Smooth-AT augments a single training sample with  \textit{multiple}
Gaussian noisy copies so as to  train a Gaussian smoothing-aware  classifier.    Thus, Smooth-AT requires $N\times$ the data batch size and storage capacity of AT, where $N$ is the number of noisy copies per sample. In the centralized training regime, $N$ can only be set to a small value, e.g., $N = 1$ or $2$. However, DAT is able to scale up Smooth-AT  with a large value of $N$, e.g., $N = 20$ in Table\,\ref{table: RS}. %Table\,\ref{table: RS} shows that DAT-enabled Smooth-AT with {$N = 20$}  yields improved certified robust accuracy over the conventional Smooth-AT with $N = 2$.

\begin{table}[htb]
%\begin{wraptable}{r}{95mm}
% \vspace*{-1mm}
\begin{center}
\caption{{Certified accuracy of smooth classifiers   
trained by Smooth-DAT on (CIFAR-10, ResNet-18) versus   $\ell_2$ radii. Here smooth classifiers are achieved at two Gaussian noise levels, $\sigma = 0.12$ and $\sigma = 0.25$, following \citep{salman2019provably}. Smooth-AT is implemented using the baseline approach with $N = 2$ and the DAT approach with $N=20$, respectively. 
%\SL{AA}
%SL{we need to show relative improvement compared to AT and Fast AT, like fine-tuning example.}
}
%All the training methods are conducted in \SL{xxx [@Gaoyuan]} epochs .
} 
% \vspace{-2.5mm}
\label{table: RS}
\begin{threeparttable}
\resizebox{0.47\textwidth}{!}{
\begin{tabular}{c|c|c|c|c|c|c|c}
\hline
\hline
\multirow{2}{*}{\begin{tabular}[c]{@{}c@{}}Method\end{tabular}} & \multicolumn{7}{c}{{Smooth classifier ($\sigma = 0.12$)}} \\ 
\cline{2-8}  & $r = 0.05$
& $r = 0.1$
& $r = 0.15$ & $r = 0.2$ & $r = 0.3$ & $r = 0.4$  & $r = 0.5$
 \\ \hline
  Baseline ($N = 2$) & 0.832  & 0.804  &  0.762  & 0.728  & 0.654  & 0.545 & 0  \\
\rowcolor{Gray} DAT ($N=20$) & 0.838  & 0.812  &  0.784 & 0.748  & 0.661  & 0.550 & 0  \\
\hline
\multirow{2}{*}{\begin{tabular}[c]{@{}c@{}}Method\end{tabular}} & \multicolumn{7}{c}{{Smooth classifier ($\sigma = 0.25$)}} \\ 
\cline{2-8}  & $r = 0.05$
& $r = 0.1$
& $r = 0.15$ & $r = 0.2$ & $r = 0.3$ & $r = 0.4$  & $r = 0.5$
 \\ \hline
  Baseline ($N = 2$) & 0.752  & 0.730  &  0.708 & 0.678  & 0.625  & 0.562 & 0.498  \\
\rowcolor{Gray} DAT ($N=20$) & 0.764  & 0.748  &  0.716 & 0.688  & 0.632  & 0.566  & 0.514   \\
\hline
\hline
\end{tabular}}
\end{threeparttable}
\end{center}
%\vspace*{-1mm}
\end{table}


%Different empirical defense such as AT, 
Smooth-AT can produce a provably robust classifier \citep{cohen2019certified}.
To be more specific, let $f(\mathbf x)$ denote a classifier (with input $\mathbf x$) trained by Smooth-AT. Then its Gaussian smoothing version, given by 
$f_{\mathrm{smooth}}(\mathbf x)\Def 
\argmax_{c} \mathbb P_{\boldsymbol \delta \sim \mathcal N(\mathbf 0, \sigma^2 \mathbf I)}[  f(\mathbf x+\bdelta)  = c ]
$, can achieve certified  robustness, where $c$ is a class label,   $\boldsymbol \delta \sim \mathcal N(\mathbf 0, \sigma^2 \mathbf I)$  denotes zero-mean Gaussian noise with variance $\sigma^2$, and $\mathbb P$ signifies the majority vote-based prediction probability over multiple noisy samples. 
%In practice,  $N$ noise copies to make the probability-based majority vote. 
The resulting smooth classifier $f_{\mathrm{smooth}}$ can then be evaluated  at 
certified accuracy   \citep{cohen2019certified}, a provable robustness guarantee at a given $\ell_2$ perturbation radius $r$.


In Table\,\ref{table: RS}, we present the certified accuracy (CA) of a $\sigma$-specified smooth classifier, obtained by either the conventional Smooth-AT approach (baseline using $N = 2$) or the DAT-enabled Smooth-AT method (DAT using $N = 20$). We evaluate CA at different $\ell_2$-radii. As we can see, DAT  yields improved certified robustness over the conventional Smooth-AT with $N = 2$. This demonstrates the advantage of DAT in training provably robust classifiers: The use of a large number of Gaussian noisy samples becomes   feasible through distributed training. 
We also observe that CA drops if the perturbation $\ell_2$-radius $r$ increases. This is not surprising since CA is derived by sanity checking if the certified $\ell_2$ perturbation radius of a smooth classifier can cover a given $r$.
Besides, a smooth classifier constructed using Gaussian noise of large variance $\sigma^2$ tends to be more robust against a larger $\ell_2$ perturbation radius, but may hamper the accuracy against perturbations of small $\ell_2$ radius. This is consistent with \citet{cohen2019certified} and reveals the tradeoff between accuracy and certified robustness for a $\sigma$-specific smooth classifier.  
%To expand the capacity of data storage, we  propose to use the Smooth-DAT algorithm.  



% \begin{table}[htb]
% \vspace*{-0mm}
% \begin{center}
% \caption{\footnotesize{
% Certified accuracy (\%) of smooth classifiers   
%  trained by Smooth-DAT on \SL{(CIFAR-10, ResNet-18)} versus   $\ell_2$ radii. Here smooth classifiers are achieved at two Gaussian noise variance levels, $\sigma = 0.12$ and $\sigma = 0.25$, following \citep{salman2019provably}. And Smooth-AT is implemented using the baseline approach with $N = 2$ and the DAT approach with $N=20$, respectively. 
% %The relative improvement over RA  or TA obtained in    supervised learning (CIFAR-10 only) is marked by \textcolor{red}{red} color.
% %\SL{RA}.
% %\Gaoyuan{unlabeled data from tiny imagenet. obtained from } 
% }
% %Unlabeled data,  GPUs per node $\times$ nodes: $1\times 6$, batch size $12288$
% } 
%  \vspace*{-2.5mm}
% \label{table: RS}
% \begin{threeparttable}
% \resizebox{0.45\textwidth}{!}{
% \begin{tabular}{c|c|c|c|c|c}
% \hline
% \hline
% \multirow{2}{*}{\begin{tabular}[c]{@{}c@{}}Method\end{tabular}} & \multicolumn{5}{c}{\textbf{Smooth classifier ($\sigma= xxx $) vs. $\ell_2$ radius $r$}} \\ 
% \cline{2-6}  
% %& \begin{tabular}[c]{@{}c@{}}GPUs\end{tabular}
% %& Batch size
% & $r = $ & $r = $ & $r = $ & $r = $ & $r = $
%  \\ \hline
% % DAT (CIFAR-10 only) 
% % %& $1\times 6$  & 12288 
% % & 79.77 & 38.93 & 8.5 & 42  \\
% % Fast DAT (CIFAR-10 only) 
% % %& $1\times 6$  & 12288 
% %  & 75.58 & 40.91 & 8.5  & 14  \\
% Baseline  %& $1\times 6$  & 12288 
% &  x
% & x % (\textcolor{red}{$\uparrow$ 8.40}) 
% & x %
% & x & x\\
% DAT 
% %& $1\times 6$  & 12288 
% & x % (\textcolor{red}{$\uparrow$ 12.42}) 
% & x % (\textcolor{red}{$\uparrow$ 4.92})
% & x 
% & x  & x\\
% \hline
% % \hline\\
% % \multirow{2}{*}{\begin{tabular}[c]{@{}c@{}}Method\end{tabular}} 
% & \multicolumn{5}{c}{\textbf{Smooth classifier ($\sigma= xxx $) vs. $\ell_2$ radius $r$}} \\ 
% \hline
% % \cline{2-6}  
% % %& \begin{tabular}[c]{@{}c@{}}GPUs\end{tabular}
% % %& Batch size
% % & $r = $ & $r = $ & $r = $ & $r = $ & $r = $
% %  \\ \hline
% % DAT (CIFAR-10 only) 
% % %& $1\times 6$  & 12288 
% % & 79.77 & 38.93 & 8.5 & 42  \\
% % Fast DAT (CIFAR-10 only) 
% % %& $1\times 6$  & 12288 
% %  & 75.58 & 40.91 & 8.5  & 14  \\
% Baseline  %& $1\times 6$  & 12288 
% &  x
% & x % (\textcolor{red}{$\uparrow$ 8.40}) 
% & x %
% & x & x\\
% DAT 
% %& $1\times 6$  & 12288 
% & x % (\textcolor{red}{$\uparrow$ 12.42}) 
% & x % (\textcolor{red}{$\uparrow$ 4.92})
% & x 
% & x  & x\\
% \hline
% \hline
% \end{tabular}
% }
% \end{threeparttable}
% \end{center}
% \vspace*{-0mm}
% %\end{wraptable}
% \end{table}


\begin{table}[htb]
%\begin{wraptable}{r}{95mm}
% \vspace*{-1mm}
\begin{center}
\caption{{DAT with semi-supervision   using  ResNet-18  or {Wide ResNet-28-10} under CIFAR-10 + 500K unlabeled Tiny Images. 
%The relative improvement over RA  or TA obtained in    supervised learning (CIFAR-10 only) is marked by \textcolor{red}{red} color.
%\SL{RA}.
%\Gaoyuan{unlabeled data from tiny imagenet. obtained from } 
}
%Unlabeled data,  GPUs per node $\times$ nodes: $1\times 6$, batch size $12288$
} 
%  \vspace*{-1mm}
\label{table: unlabel}
\begin{threeparttable}
\resizebox{0.45\textwidth}{!}{
\begin{tabular}{c|c|c|c|c|c}
\hline
\hline
\multirow{2}{*}{\begin{tabular}[c]{@{}c@{}}Method\end{tabular}} & \multicolumn{5}{c}{{ResNet-18, batch size $12 \times 2048$}} \\ 
\cline{2-6}  & TA (\%) & RA (\%) & AA (\%) & C(s) & T(s)\\ \hline
DAT-PGD  %& $1\times 6$  & 12288 
& 87.00 % (\textcolor{red}{$\uparrow$ 7.62})
& 47.34 % (\textcolor{red}{$\uparrow$ 8.40}) 
& 45.23 %
& 86  & 451\\
DAT-FGSM 
%& $1\times 6$  & 12288 
& 88.00 % (\textcolor{red}{$\uparrow$ 12.42}) 
& 45.84 % (\textcolor{red}{$\uparrow$ 4.92})
& 43.19%
& 86  & 124\\
\hline
\multirow{2}{*}{\begin{tabular}[c]{@{}c@{}}Method\end{tabular}} & \multicolumn{5}{c}{{{Wide ResNet-28-10}, batch size $12 \times 128$}} \\ 
\cline{2-6}  & TA (\%) & RA (\%) & AA (\%) & C(s) & T(s)\\ \hline
DAT-PGD  %& $1\times 6$  & 12288 
& 89.37 % (\textcolor{red}{$\uparrow$ 7.62})
& 62.06 % (\textcolor{red}{$\uparrow$ 8.40}) 
& 58.35 %
& 302  & 1020 \\
DAT-FGSM 
%& $1\times 6$  & 12288 
& 89.52 % (\textcolor{red}{$\uparrow$ 12.42}) 
& 61.24 % (\textcolor{red}{$\uparrow$ 4.92})
& 57.65 %
& 302  & 674 \\
\hline
\hline
\end{tabular}
}
\end{threeparttable}
\end{center}
% \vspace*{-1mm}
\end{table}


%\begin{wraptable}{r}{80mm}
%\vspace*{-0.1in}

%\begin{wraptable}{r}{80mm}
% \begin{table}[htb]
% \vspace*{-0mm}
% \begin{center}
% \caption{\small{DAT  in semi-supervised learning  under ResNet-18  with batch size $18 \times 2048$. 
% \SL{[Update to Wide ResNet 28-10; DAT-FGSM is strange]}
% %The relative improvement over RA  or TA obtained in    supervised learning (CIFAR-10 only) is marked by \textcolor{red}{red} color.
% %\SL{RA}.
% %\Gaoyuan{unlabeled data from tiny imagenet. obtained from } 
% }
% %Unlabeled data,  GPUs per node $\times$ nodes: $1\times 6$, batch size $12288$
% } 
%  \vspace*{-2.5mm}
% \label{table: unlabel}
% \begin{threeparttable}
% \resizebox{0.48\textwidth}{!}{
% \begin{tabular}{c|c|c|c|c|c}
% \hline
% \hline
% \multirow{2}{*}{\begin{tabular}[c]{@{}c@{}}Method\end{tabular}} & \multicolumn{5}{c}{\textbf{CIFAR-10 + 500K unlabeled Tiny Images,   ResNet-18}} \\ 
% \cline{2-6}  
% %& \begin{tabular}[c]{@{}c@{}}GPUs\end{tabular}
% %& Batch size
% & TA (\%) & RA (\%) & AA (\%) & \begin{tabular}[c]{@{}c@{}}C (s) %\\per epoch (s)
% \end{tabular} & \begin{tabular}[c]{@{}c@{}}T (s)
% %\\per epoch (s)
% \end{tabular}
%  \\ \hline
% % DAT (CIFAR-10 only) 
% % %& $1\times 6$  & 12288 
% % & 79.77 & 38.93 & 8.5 & 42  \\
% % Fast DAT (CIFAR-10 only) 
% % %& $1\times 6$  & 12288 
% %  & 75.58 & 40.91 & 8.5  & 14  \\
% DAT-PGD  %& $1\times 6$  & 12288 
% & 87.00 % (\textcolor{red}{$\uparrow$ 7.62})
% & 47.34 % (\textcolor{red}{$\uparrow$ 8.40}) 
% & 45.23 %
% & 86  & 451\\
% DAT-FGSM 
% %& $1\times 6$  & 12288 
% & 88.00 % (\textcolor{red}{$\uparrow$ 12.42}) 
% & 45.84 % (\textcolor{red}{$\uparrow$ 4.92})
% & 43.19%
% & 86  & 124\\
% \hline
% \hline
% \end{tabular}
% }
% \end{threeparttable}
% \end{center}
% \vspace*{-0mm}
% %\end{wraptable}
% \end{table}

  \begin{figure}[htb]
    % \vspace*{-0.0in}
\centerline{
\begin{tabular}{cc}
\hspace*{-3mm} \includegraphics[width=.23\textwidth,height=!]{Figures/pltfigure2.pdf} & \hspace*{-3mm}
\includegraphics[width=.23\textwidth,height=!]{Figures/pltfigure21.pdf}
 %\vspace*{-1.5mm}
\\
%\vspace*{1mm}
%  \\ 
% \includegraphics[width=.39\textwidth,height=!]{Figures/pltfigure21.pdf}
%  \vspace*{-1.5mm}
% \\
\hspace*{-3mm} {\small (a) Fine-tuning over CIFAR-10} & \hspace*{-3mm}
{\small (b) Fine-tuning over CIFAR-100}
%\vspace*{1mm}
\end{tabular}}
% \vspace*{-2mm}
\caption{{Fine-tuning  ResNet-50 (pre-trained on ImageNet)  under CIFAR-10 (a) and CIFAR-100 (b). 
For comparison, adversarial training on CIFAR datasets from scratch (no pre-training) is also presented.
Here DAT-PGD is   used for both  pre-training and fine-tuning at $6$ computing nodes.
%with   batch size $6 \times 128$. 
% Here the fine-tuning is conducted by AT at a single computing node, and the target datasets are CIFAR-10 and CIFAR-100.
%   Accuracy (TA or RA)  of the fine-tuned model is compared to that of    the end-to-end  training at CIFAR by DAT-PGD from scratch using  the same number of epochs   as the  fine-tuning. \SL{Update legends: pre-training on ImageNet, no pre-training}
%(a) Fine-tuning over CIFAR-10. (b):  Fine-tuning over CIFAR-100.
% pre-trained xxx \SL{[model name]} over dataset $\mathcal A$ using DAT. Left: RA against PGD attacks of different perturbation sizes during testing. Right:  RA against PGD attacks of different steps during testing.
}}
  \label{fig: transfer}
%   \vspace*{-0.00in}
\end{figure}

\paragraph{DAT under unlabeled data} 
In Table\,\ref{table: unlabel}, we report TA, RA, and AA of DAT in the semi-supervised setting \citep{carmon2019unlabeled} with the use of 500K unlabeled images mined from Tiny Images \citep{carmon2019unlabeled}.
As we can see, both DAT-PGD and DAT-FGSM scale well even if a $12 \times$ batch size is used across $12$ machines, each of which has a single GPU.  
%Compared to the DAT supervised learning results    at (CIFAR-10, ResNet-18) in Table\,\ref{table: overall_supplement}, 
% We observe that although the communication and computation costs increase due to the use  of additional unlabeled images, both TA and RA are significantly improved. In particular, the performance of DAT-FGSM matches that of DAT-PGD. This suggests that unlabeled data might provide a solution to compensate the TA loss induced by FGSM-based   robust training algorithms. 
We also compare DAT-PGD with \citet{carmon2019unlabeled}, following the latter's architecture, Wide ResNet-28-10.
%and using a \textit{$6$ times larger batch size} ($6 \times 256$). 
We note that  DAT-PGD yields $89.37\%$ TA and $58.35\%$ AA. This is close to the reported  $89.69 \%$ TA and $59.53\%$ AA in RobustBench \citep{croce2020robustbench} built upon small-batch adversarial training.
Although the use of a large data batch may cause a performance loss due to the reduced number of training iterations, the use of data augmentation serves as a remedy for such loss.



%  \mycomment{
% \begin{table}[ht]
% \begin{center}
% \caption{\footnotesize{DAT  in semi-supervised learning with   unlabeled data, where the   computing resource configuration and batch size are set the same as    the $8$th  row of (CIFAR-10, ResNet-18) in Table\,\ref{table: overall}. The relative improvement over RA  or TA obtained in    supervised learning (CIFAR-10 only) is marked by \textcolor{red}{red} color. %\Gaoyuan{unlabeled data from tiny imagenet. obtained from } 
% }
% %Unlabeled data,  GPUs per node $\times$ nodes: $1\times 6$, batch size $12288$
% } 
% % \vspace{-2.5mm}
% \label{table: unlabel}
% \begin{threeparttable}
% \resizebox{0.8\textwidth}{!}{
% \begin{tabular}{c|c|c|c|c}
% \hline
% \hline
% \multirow{2}{*}{\begin{tabular}[c]{@{}c@{}}Method\end{tabular}} & \multicolumn{4}{c}{\textbf{CIFAR-10 + 500K unlabeled Tiny Images,   ResNet-18}} \\ 
% \cline{2-5}  
% %& \begin{tabular}[c]{@{}c@{}}GPUs\end{tabular}
% %& Batch size
% & TA (\%) & RA (\%) & \begin{tabular}[c]{@{}c@{}}Communication\\per epoch (s)\end{tabular} & \begin{tabular}[c]{@{}c@{}}Training time\\per epoch (s)\end{tabular}
%  \\ \hline
% % DAT (CIFAR-10 only) 
% % %& $1\times 6$  & 12288 
% % & 79.77 & 38.93 & 8.5 & 42  \\
% % Fast DAT (CIFAR-10 only) 
% % %& $1\times 6$  & 12288 
% %  & 75.58 & 40.91 & 8.5  & 14  \\
% DAT-PGD  %& $1\times 6$  & 12288 
% & 87.00 (\textcolor{red}{$\uparrow$ 7.23}) & 47.34 (\textcolor{red}{$\uparrow$ 8.41}) & 86  & 451\\
% DAT-FGSM 
% %& $1\times 6$  & 12288 
% & 88.00  (\textcolor{red}{$\uparrow$ 11.82})  & 45.83 (\textcolor{red}{$\uparrow$ 4.92}) & 86  & 124\\
% \hline
% \hline
% \end{tabular}}
% \end{threeparttable}
% \end{center}
% \vspace{-3mm}
% \end{table}
% }




\paragraph{DAT from pre-training to fine-tuning.}
In Figure\,\ref{fig: transfer}, we investigate if a DAT pre-trained  model (ResNet-50) over a source dataset (ImageNet)  can offer a fast fine-tuning to a down-stream target dataset (CIFAR-10/100).
Here we up-sample a CIFAR image to the same dimension as an ImageNet image before feeding it into the pre-trained model \citep{Shafahi2020Adversarially}.
Compared with the direct application of DAT to the target dataset (without pre-training), the pre-training enables a fast adaptation to the down-stream CIFAR task in both TA and RA within just $3$ epochs. Thus, the scalability of DAT to large datasets and multiple nodes offers great potential to rapidly initialize an adversarially robust base model in the `pre-training + fine-tuning' paradigm.
%The similar results can be found in Appendix\,\ref{app:dat_pretrain} on CIFAR-10.
%This suggest the usefulness of a pre-trained robust model learnt by    DAT at   superbly large datasets  with the aid of a large number of distributed machines.   


% \begin{figure}[htb]
%   \dummyfig{Placeholder} 
%   \caption{TA and RA   versus different total time for different methods.}
%   \label{fig:dummy2}
% \end{figure}

 

% In Figure\,\ref{fig: transfer}, we investigate if a DAT pre-trained  model (ResNet-50) over a source dataset (ImageNet)  can offer a fast fine-tuning to a down-stream target dataset (CIFAR-10 or CIFAR-100).
% Here  we up-sample  a  CIFAR image to the same dimension of an ImageNet image before feeding them into the pre-trained model \citep{Shafahi2020Adversarially}.
% Compared with the direct application of DAT to the target dataset (without pre-training), the pre-training enables a fast adaption  to the down-stream CIFAR task in both TA and RA within just  $5$ epochs. Thus, the scalability of DAT to large datasets and multiple   nodes offers a great potential in the  pre-training + fine-tuning paradigm. 
%This suggest the usefulness of a pre-trained robust model learnt by    DAT at   superbly large datasets  with the aid of a large number of distributed machines.   


% \begin{figure}[htb]
%   \dummyfig{Placeholder} 
%   \caption{TA and RA   versus different total time for different methods.}
%   \label{fig:dummy2}
% \end{figure}



% \begin{table}[ht]
% \begin{center}
% \caption{Pre-training and fine-tuning} 
% % \vspace{-2.5mm}
% \label{table: transfer}
% \begin{threeparttable}
% \resizebox{0.6\textwidth}{!}{
% \begin{tabular}{c|c|c|c|c|c|c}
% \hline
% \hline
% \multicolumn{2}{c}{Pre-training}
% &\multicolumn{3}{c}{Fine-tuning}
%  TA (\%) & RA (\%) \\
% \cline{1-2}  \cline{3-5}
% % \multirow{2}{*}{\begin{tabular}[c]{@{}c@{}}Method\end{tabular}} & \multicolumn{6}{c}{\textbf{CIFAR-10 + unlabeled Tiny Imagenet,   Resnet18}} \\ 
% % \cline{2-7}  
% % %& \begin{tabular}[c]{@{}c@{}}GPUs\end{tabular}
% % %& Batch size
% % & TA (\%) & RA (\%) & \begin{tabular}[c]{@{}c@{}}Communication\\per epoch (s)\end{tabular} & \begin{tabular}[c]{@{}c@{}}Computation\\per epoch (s)\end{tabular}
% %  \\ \hline
% DAT (labeled only) 
% %& $1\times 6$  & 12288 
% & 79.77 & 38.93 & 8.5 & 42  \\
% DAT  %& $1\times 6$  & 12288 
% & 87.00 & 47.34 & 86  & 451\\
% Fast DAT 
% %& $1\times 6$  & 12288 
% & 86.60 & 1.49 & 86  & 124\\
% \hline
% \hline
% \end{tabular}}
% \end{threeparttable}
% \end{center}
% \vspace{-3mm}
% \end{table}
%\vspace*{-0.1in}

%\paragraph{Ablation studies on LALR effect}


%   \begin{figure*}[htb]
%     \vspace*{-0.0in}
% \centerline{
% \begin{tabular}{cc}
% \includegraphics[width=.39\textwidth,height=!]{Figures/pltfigure2.pdf}  & \includegraphics[width=.39\textwidth,height=!]{Figures/pltfigure21.pdf}
%  %\vspace*{-1.5mm}
% \\
% \small{(a) Finetuning over CIFAR-10}  &
% %\vspace*{1mm}
% %  \\ 
% % \includegraphics[width=.39\textwidth,height=!]{Figures/pltfigure21.pdf}
% %  \vspace*{-1.5mm}
% % \\
% \small{(b) Finetuning over CIFAR-100}
% %\vspace*{1mm}
% \end{tabular}}
% \vspace*{-2mm}
% \caption{\small{Fine-tuning  ResNet-50 (pre-trained on ImageNet)  under CIFAR-10 (a) and CIFAR-100 (b). 
% For compassion, adversarial training on CIFAR datasets from scratch (no pretrain) is also presented.
% Here DAT-PGD is   used for both  pre-training and fine-tuning at $6$ nodes with   batch size $6 \times 128$. 
% % Here the fine-tuning is conducted by AT at a single computing node, and the target datasets are CIFAR-10 and CIFAR-100.
% %   Accuracy (TA or RA)  of the fine-tuned model is compared to that of    the end-to-end  training at CIFAR by DAT-PGD from scratch using  the same number of epochs   as the  fine-tuning. \SL{Update legends: pre-training on ImageNet, no pre-training}
% %(a) Fine-tuning over CIFAR-10. (b):  Fine-tuning over CIFAR-100.
% % pre-trained xxx \SL{[model name]} over dataset $\mathcal A$ using DAT. Left: RA against PGD attacks of different perturbation sizes during testing. Right:  RA against PGD attacks of different steps during testing.
% }}
%   \label{fig: transfer}
%   \vspace*{-0.00in}
% \end{figure*}


%\vspace*{-0.1in}
%end{figure}
%\vspace*{-0.1in}


 %\begin{wraptable}{r}{80mm}

\paragraph{Quantization effect in various communication protocols}
 %\SL{[Check the paragraph and the table.]}
%\SL{[I am considering to put this Table to appendix.]}
In %Appendix\,\ref{app:quantization}, 
Table\,\ref{table: quadtization_imagenet},
we present how DAT is affected by gradient quantization. 
% In Table\,\ref{table: quadtization}, we present the performance of DAT  by making use of gradient quantization.  Two quantization scenarios are covered: 1) quantization is conducted  at each  worker (Step\,7 of Algorithm\,\ref{alg: DAT}), and 2) quantization is conducted at  both worker and server sides (Step\,7 and 10 of Algorithm\,\ref{alg: DAT}).
As we can see, when the number of bits is reduced, both the communication cost and the amount of transmitted data are
reduced. However, the use of an aggressive gradient quantization introduces a performance loss. 
For example, compared with
the case of using $32$ bits, 
the most aggressive quantization scheme  ($8$-bit $2$-sided quantization in Steps 4 and 7 of Algorithm\,\ref{alg: DAT_meta_form}) yields an RA drop around $4\%$
and $7\%$ for DAT-PGD and   DAT-FGSM, respectively. 
In particular,
 DAT-FGSM is more sensitive to the effect of gradient quantization than DAT-PGD. 
%However, 8-bit 2-sided quantization transmitted the least amount of data per iteration. 
% We find that when the number of bits is reduced from $32$ to $8$, {the communication cost is saved by \Gaoyuan{2} times.} 
% We find that when the number of bits is reduced from $32$ to $8$, the resulting TA and RA becomes  worse than the best $32$-bit case. For example, in the {worst case} (8-bit 2-sided quantization)  of CIFAR-10, TA drops $1.52\%$ and $6.32\%$ for DAT-PGD and   DAT-FGSM, respectively. And RA drops  $4.74\%$ and $5.58\%$, respectively. 
%Even in the centralized setting, the use of   8-bit quantization can lead to a non-trivial drop in TA (see Table\,\ref{table: centralized_8bit}).
% However, the use of   quantization  reduces the amount of data transmission   per iteration.
It is worth noting that 
our main communication configuration used in previous experiments is   Ring-AllReduce that calls for 1-sided (rather than 2-sided) quantization.
We further show that if a high-performance computing (HPC) cluster of nodes (with NVLink high-speed GPU interconnect \citep{foley2017ultra}) is used, the communication cost can be further reduced without causing a performance loss. 



 \begin{table}[htb]
%  \vspace*{-1mm}
\begin{center}
\caption{{Effect of gradient quantization on the performance of DAT  for various numbers of bits. 
The training and evaluation settings on (ImageNet, ResNet-50) are consistent with     Table\,\ref{table: overall}. The new performance metric `Data trans. (MB)' represents data transmitted per iteration in the unit MB. 
}
} 
% \vspace{-2.5mm}
\label{table: quadtization_imagenet}
\begin{threeparttable}
\resizebox{0.45\textwidth}{!}{
\begin{tabular}{c|c|c|c|c|c}
\hline
\hline
\mycomment{\multirow{2}{*}{\begin{tabular}[c]{@{}c@{}}Method\end{tabular}} & \multicolumn{5}{c}{\textbf{CIFAR-10, ResNet-18}
%\SL{Appendix}
%under $6 \times 1$ computing resources and $6 \times 2048$ batch size
}
\\ 
\cline{2-6}  
%& \begin{tabular}[c]{@{}c@{}}GPUs per node\\ $\times$ nodes\end{tabular}
%& Batch size
& \# bits
& TA (\%) & RA (\%) & \begin{tabular}[c]{@{}c@{}}Comm.\\per epoch (s)\end{tabular} & \begin{tabular}[c]{@{}c@{}}Data transmitted
\\ per iteration (MB)\end{tabular}
 \\ \hline
DAT-PGD & $32$  & 
%$1\times 6$   & $2048\times 6$ & 
80.38 & 38.94 & 8.5  & 1278\\
DAT-PGD & $16$ 
& 
%$1\times 6$  & $2048\times 6$ & 
79.38 & 38.32 & 8.3  & 639\\
DAT-PGD & $8$  & 
%$1\times 6$  & $2048\times 6$ & 
78.18 & 37.34 & 4.3  & 320\\
DAT-PGD & $8$  ($2$-sided) & 
%$1\times 6$  & $2048\times 6$ & 
78.86 & 34.2 & 5.0  & 107\\
DAT-FGSM &  $32$  & 
%$1\times 6$  & $2048\times 6$  &
75.58 & 40.92 & 8.5  & 1278\\
DAT-FGSM & $16$  & 
%$1\times 6$  & $2048\times 6$ & 
75.74 & 40.86 & 8.3 & 639  \\
DAT-FGSM &  $8$  & 
%$1\times 6$  & $2048\times 6$ & 
72.48 & 38.98 & 4.3 & 320  \\
DAT-FGSM & $8$ ($2$-sided) &
%$1\times 6$  & $2048\times 6$ &
69.26 & 35.34 & 5.0 & 107  \\
\hline 
}
\multirow{2}{*}{\begin{tabular}[c]{@{}c@{}}Method\end{tabular}} & \multicolumn{5}{c}{\textbf{ImageNet, ResNet-50} 
%under $6 \times 6$ computing resources and $6 \times 512$ batch size
} \\ 
\cline{2-6}  
%&  \begin{tabular}[c]{@{}c@{}}GPUs per node\\ $\times$ nodes\end{tabular}
%& Batch size
& \# bits
& TA (\%) & RA (\%) & \begin{tabular}[c]{@{}c@{}}C (s)\end{tabular} & \begin{tabular}[c]{@{}c@{}}Data\\ trans.  (MB)\end{tabular}
 \\ \hline
DAT-PGD & $32$  % & $6\times 6$  & $512\times 6$ 
& 63.75 & 38.45 & 898  & 2924\\
DAT-PGD & $16$  % & $6\times 6$  & $512\times 6$
& 61.77 & 38.40 & 850  & 1462\\
DAT-PGD & $8$ % & $6\times 6$  & $512\times 6$ 
& 56.53 & 37.90 & 592  & 731\\
DAT-PGD & $8$ ($2$-sided)  % & $6\times 6$  & $512\times 6$
& 53.09 & 34.59 & 1091  & 244\\
DAT-PGD (HPC) & $32$  % & $6\times 6$  & $512\times 6$ 
& 63.43 & 38.55 & 15  & 1074\\
\hline
DAT-FGSM & 32 %& $6\times 6$  & $512\times 6$ 
& 58.02 & 40.27 & 859  & 2924\\
DAT-FGSM & 16 % & $6\times 6$  & $512\times 6$
& 54.71 & 39.29 & 849 & 1462  \\
DAT-FGSM & 8 % & $6\times 6$ & $512\times 6$ 
& 50.11 & 36.38 & 594 & 731  \\
DAT-FGSM & $8$ ($2$-sided) % & $6\times 6$ & $512\times 6$ 
& 48.27 & 33.20 & 1013 & 244  \\
DAT-FGSM (HPC) & 32 %& $6\times 6$  & $512\times 6$ 
& 57.60 & 41.70 & 15  & 310\\
\hline
\hline 
\end{tabular}}
\end{threeparttable}
\end{center}
% \vspace{-3mm}
%\end{wraptable}
\end{table}




%\vspace*{-1mm}
\section{Conclusions}
%\vspace*{-3mm}
We proposed   {d}istributed {a}dversarial {t}raining (DAT)  to  scale up the training of adversarially robust DNNs over multiple machines. 
% In contrast to standard AT, DAT allows for large-batch min-max optimization and gradient compression to reduce  communication overhead.   
We   showed that   DAT   is general in  that it
enables large-batch min-max optimization and supports gradient compression and different learning regimes. 
% supports different 
% types of attack generation oracles (iterative or one-shot attack generation)  and  different types of training data   (labeled or unlabeled).
We proved that under mild conditions, DAT  is guaranteed to converge to a first-order stationary point with a sub-linear rate. Empirically, we provided comprehensive experimental results to demonstrate the effectiveness and the usefulness of DAT in training robust DNNs with large datasets and multiple machines. %\LEE{Note: I would list the 4 metrics you used in your empirical evaluation.}
In the future, it will be worthwhile to examine the speedup achieved by DAT in the extreme training cases, e.g., using a significantly large number of  attack steps and distributed machines, and extremely large models. 
%We will also plan to investigate if DAT can be further integrated with randomized smoothing (which calls for Gaussian data augmentation during training) to achieve certified robustness. 
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Construction by Songtao ######################
% %\newpage 

\begin{acknowledgements} 
Y. Zhang and S. Liu are supported by the Cisco Research grant CG\# 70614511. P. Khanduri and M. Hong are supported in part by the NSF grants 1910385 and 1727757. We also thank Dr.~Cho-Jui Hsieh for the helpful discussion on early ideas of this paper.
\end{acknowledgements}
% will be removed in pdf for initial submission,
%                          % so you can already fill it to test with the
%                          % ‘accepted’ class option
%     Briefly acknowledge people and organizations here.

%     \emph{All} acknowledgements go in this section.
% \end{acknowledgements}


\newpage
\clearpage
{
% \small 
{{
\bibliography{zhang_207}
}}
}

% \bibliography{refs}


% \appendix
% % NOTE: necessary when ptmx or no mathfont class option is given
% \providecommand{\upGamma}{\Gamma}
% \providecommand{\uppi}{\pi}
% \section{Math font exposition}
% How math looks in equations is important:
% \begin{equation*}
%   F_{\alpha,\beta}^\eta(z) = \upGamma(\tfrac{3}{2}) \prod_{\ell=1}^\infty\eta \frac{z^\ell}{\ell} + \frac{1}{2\uppi}\int_{-\infty}^z\alpha \sum_{k=1}^\infty x^{\beta k}\mathrm{d}x.
% \end{equation*}
% However, one should not ignore how well math mixes with text:
% The frobble function \(f\) transforms zabbies \(z\) into yannies \(y\).
% It is a polynomial \(f(z)=\alpha z + \beta z^2\), where \(-n<\alpha<\beta/n\leq\gamma\), with \(\gamma\) a positive real number.

% \newpage
% \clearpage
% \include{zhang_207-supp}

\end{document}
