% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
\externaldocument{uai2023-template}


\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsthm}
{
\theoremstyle{plain}
      \newtheorem{asm}{Assumption}
  }
\usepackage{eepic,epic}
\usepackage{epsfig}
\usepackage{graphicx}
%\usepackage[notcite,notref]{showkeys}


% THEOREMS -----------------------------------------------
\newtheorem{thm}{Theorem}[section]
\newtheorem{cor}[thm]{Corollary}
\newtheorem{lem}[thm]{Lemma}
\newtheorem{prop}[thm]{Proposition}
\newtheorem{clm}[thm]{Claim}
\theoremstyle{definition}
\newtheorem{defn}[thm]{Definition}
\newtheorem{hyp}[thm]{Hypothesis}
\theoremstyle{remark}
\newtheorem{rem}[thm]{Remark}
\numberwithin{equation}{section}

% to compile a preprint version, e.g., for submission to arXiv, add the
% [preprint] option:
%     \usepackage[preprint]{neurips_2021}

% to compile a camera-ready version, add the [final] option, e.g.:
%     \usepackage[final]{neurips_2021}

% to avoid loading the natbib package, add option nonatbib:
%    \usepackage[nonatbib]{neurips_2021}

\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors
% \usepackage{kotex}

% Additional packages (yeachan)
\usepackage{multirow}
\usepackage{lipsum}
\usepackage{adjustbox}
\usepackage{placeins}
\usepackage{wrapfig}
\usepackage{subcaption}
\usepackage{stackengine}

\usepackage{algpseudocode}
\usepackage{algorithm}
% Keyword text used by the \ForEach loop header defined on the next line.
\algnewcommand\algorithmicforeach{\textbf{for each}}
% \ForEach{<cond>}: typesets the \algorithmicforeach keyword, then <cond>,
% then \algorithmicdo. It is declared on the existing FOR block, so it is
% presumably closed with \EndFor like a regular \For -- confirm against the
% algorithmicx manual.
\algdef{S}[FOR]{ForEach}[1]{\algorithmicforeach\ #1\ \algorithmicdo}

\usepackage{scalerel,stackengine}
\stackMath
% \reallywidehat{<math>}: typesets <math> with a circumflex accent stacked on
% top of it, where the accent is scaled so that its width is limited by the
% natural width of <math> (see the \widthof{...} bound below).
% Relies on scalerel (\scalerel, \scaleto, \stretchto) and stackengine
% (\savestack, \stackon); \stackMath above makes the stacked material math.
\newcommand\reallywidehat[1]{%
% Step 1: build the accent and store it in \tmpbox.
% The 2.4ex target height looks hand-tuned -- TODO confirm before changing.
\savestack{\tmpbox}{\stretchto{%
  \scaleto{%
    \scalerel*[\widthof{\ensuremath{#1}}]{\kern.1pt\mathchar"0362\kern.1pt}%
    {\rule{0ex}{\textheight}}%WIDTH-LIMITED CIRCUMFLEX
  }{\textheight}% 
}{2.4ex}}%
% Step 2: put the stored accent on top of the argument. The negative gap
% (-6.9pt) pulls the accent down towards <math>; presumably hand-tuned too.
\stackon[-6.9pt]{#1}{\tmpbox}%
}
\parskip 1ex


% \delequal: "equal by definition" relation symbol, built as an equals sign
% with a script-size \Delta stacked 1pt above it (stackengine's \stackon).
% \mathrel gives the result the spacing of a relation; \ensurestackMath
% makes the stacked pieces be set in math mode.
% Declared with \newcommand* instead of the \def primitive so that a clash
% with an already existing \delequal is reported as an error rather than
% silently overwriting it.
\newcommand*{\delequal}{\mathrel{\ensurestackMath{\stackon[1pt]{=}{\scriptstyle\Delta}}}}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>}: typesets <b><sep><a>, i.e. the two mandatory
% arguments in reverse order joined by the optional separator (default: -).
\newcommand{\swap}[3][-]{#3#1#2} % just an example

% \title{Title in Title Case\\(Supplementary Material)}

\title{Phase-shifted Adversarial Training (Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<yeachan@deargem.me>?Subject=Your UAI 2023 paper}{Yeachan Kim}{}}
\author[2]{\href{mailto:<synkim@kias.re.kr>?Subject=Your UAI 2023 paper}{Seongyeon Kim}{}\thanks{Co-corresponding authors who equally contributed to this work.}}
% \CoAuthorMark: emits \footnotemark with the current value of the footnote
% counter, i.e. it repeats an existing footnote mark instead of creating a
% new footnote. NOTE(review): presumably meant to reuse the mark of the
% \thanks note on the preceding author -- confirm that the class numbers
% \thanks with this counter.
\newcommand\CoAuthorMark{\footnotemark[\arabic{footnote}]}

\author[3]{\href{mailto:<ihseo@skku.edu>?Subject=Your UAI 2023 paper}{Ihyeok Seo}{}\protect\CoAuthorMark}
\author[4]{\href{mailto:<bonggun.shin@deargem.me>?Subject=Your UAI 2023 paper}{Bonggun Shin}{}\protect\CoAuthorMark}
% Add affiliations after the authors
\affil[1]{%
    Deargen Inc., Seoul, Republic of Korea
}
\affil[2]{%
    School of Mathematics, Korea Institute for Advanced Study, Seoul, Republic of Korea
}
\affil[3]{%
    Department of Mathematics, Sungkyunkwan University, Suwon, Republic of Korea
}
\affil[4]{%
    Deargen USA Inc., Atlanta, GA
}
  
  \begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle

\section{Filtering Method for Frequency Analysis}

Motivated by the examination of F-principle \citep{xu2020frequency}, we use the filtering method to analyze the behavior of the neural networks in adversarial training. The idea is to split the frequency domain into two parts, i.e., low-frequency and high-frequency parts. However, the Fourier transform for high-dimensional data requires high computational costs and large memory footprints. As an alternative, we use the Fourier transform of a Gaussian function $\hat{G}$.

Let the original dataset be $\{x_j, y_j\}_{j=0}^{N-1}$, and the network output for $x_j$ be $\mathcal{T}_j$. The low frequency part of the training dataset can be derived by
\begin{equation}
    y_{j}^{low, \delta} = \frac{1}{C_j} \sum_{m=0}^{N-1}y_{m} G^{\delta}(x_j - x_m)
\end{equation}
where $C_j = \sum_{m=0}^{N-1}G^{\delta}(x_j - x_m)$ is a normalization factor, and $\delta$ is the variance of the Gaussian function (we fix $\delta$ to 3). The Gaussian function can be represented as 
\begin{equation}
    G^{\delta}(x_j - x_m) = \exp(-\vert x_j - x_m\vert^2/(2\delta)).
\end{equation}
Then, the high-frequency part can be derived by $y_{j}^{high, \delta} \delequal y_j - y_{j}^{low, \delta}$. We also compute the frequency components for the networks, i.e., $\mathcal{T}_{j}^{low, \delta}$ and $\mathcal{T}_{j}^{high, \delta}$, by replacing $y_j$ with the outputs of the networks, i.e., $\mathcal{T}_j$. Lastly, we calculate the following errors to quantify the convergence in terms of the low- and high-frequency components:
\begin{equation}
    e_{low} = \left(\frac{\sum_{j}\left\vert y_{j}^{low,\delta} - \mathcal{T}_j^{low,\delta} \right\vert^2}{\sum_{j} \left\vert y_{j}^{low,\delta}\right\vert^2} \right)^{\frac{1}{2}}
\end{equation}

\begin{equation}
    e_{high} = \left(\frac{\sum_{j}\left\vert y_{j}^{high,\delta} - \mathcal{T}_j^{high,\delta} \right\vert^2}{\sum_{j} \left\vert y_{j}^{high,\delta}\right\vert^2} \right)^{\frac{1}{2}}
\end{equation}


\section{Iterative-version of PhaseAT}


\begin{algorithm}[h] \caption{Phase-shifted Adversarial Training (Iterative version)} \label{algo}
\begin{algorithmic}[1]
\Require Training epochs $T$, Dataset size $N$, PGD steps $P$, Perturbation size $\epsilon$, Perturbation step $\alpha$, Trainable networks $\mathcal{T}$, Cosine similarity function $CS(\cdot, \cdot)$
\For {$t= 1$ $...$ $T$} 
    \For {$j= 1$ $...$ $N$}
        \State $\delta = $ Uniform$(-\epsilon, \epsilon)$
        \For {$k= 1$ $...$ $P$} \Comment{Multiple updates of perturbations}
            \If {$j$ \% 2 == 0} \Comment{Alternate training on mini-batches }
            \State $\delta = \delta + \alpha \cdot$ sign($\nabla_{\delta}\ell(\mathcal{T}(x_j + \delta),y_j)$)
            \Else
            \State $\delta = \delta + \alpha \cdot$ sign($\nabla_{\delta}\ell(\mathcal{T}_{0}(x_j + \delta),y_j)$)
            \EndIf
            \State $\delta = $ max(min($\delta, \epsilon$), $-\epsilon$) 
        \EndFor
        \State $\theta = \theta - \nabla_{\theta}[\ell(\mathcal{T}(x_j + \delta), y_j) + CS(\mathcal{T}(x_j + \delta), \mathcal{T}_{0}(x_j + \delta))]$
    \EndFor
\EndFor
% Fourier updates 
% Frequency selection 
\end{algorithmic}
\label{alg:main2}
\end{algorithm}


For training efficiency, we design PhaseAT as a non-iterative method based on the FGSM perturbation \citep{wong2019fast}. To confirm the effect of stronger attacks in the training process of PhaseAT, we additionally introduce an iterative version of PhaseAT. Since PhaseAT is not closely related to the perturbation generation, we replace the FGSM perturbation with the perturbation generated from PGD \citep{madry2017towards}. The overall algorithm is shown in Algorithm~\ref{alg:main2}.

\section{Details about Evaluation}
\subsection{Attack Configuration}
In our work, we mainly adopt the projected gradient descent (PGD) \citep{madry2017towards} and auto-attack (AA) \citep{croce2020reliable} to evaluate baselines. PGD is constructed by multiple updates of adversarial perturbations, and AA is the ensemble of strong attacks including the variants of PGD. Typically, AA is considered one of the strongest attacks. The details about each attack of AA are as follows:
\begin{itemize}

    \item Auto-PGD (APGD) \citep{croce2020reliable}: This is a parameter-free adversarial attack that adaptively changes the step size by considering the optimization of the perturbations. APGD has three variations depending on loss functions: APGD$_{ce}$, APGD$_{dlr}$, and APGD$_t$\footnote{Subscripts $ce$ and $dlr$ on APGD indicate the \textit{cross-entropy loss} and \textit{difference of logits ratio}, respectively, and $t$ stands for targeted attacks. The attacks without $t$ subscripts are non-targeted attacks.}.
    
    \item FAB \citep{croce2020minimally}: This attack minimizes the norm of the perturbation necessary to achieve a misclassification. FAB has two variants, FAB and FAB$_{t}$.
    
    \item Square \citep{andriushchenko2020square}: Compared to the others, this attack is a black-box attack and is also known as a score-based attack. It iteratively inserts an artificial square into the inputs to search for optimal perturbations that cause large changes in the predictions.
    
\end{itemize}
We set the hyper-parameters for each attack based on the \textit{standard} version of AA in the \textit{robust-bench} framework \citep{croce2021robustbench}. Note that we exclude the Square attack from AA because the stochastic process in PhaseAT can be robust against Square \citep{qin2021random}, which prevents a fair comparison with other baselines that do not include a stochastic process. We thus move the results of the Square attack to Supplementary Section~\ref{sect:black}.


\subsection{Dataset Information}
We evaluate each baseline on two benchmark datasets, CIFAR-10 and ImageNet. CIFAR-10 \citep{krizhevsky2009learning} consists of 60,000 images of $32\times32\times3$ size for 10 classes, and ImageNet contains 1.2M images of $224\times224\times3$ size for 1,000 classes. Instead of the original ImageNet, we use the smaller version of ImageNet that is used in recent baselines \citep{sriramanan2020guided,sriramanan2021towards}, which contains 120K images of $224\times224\times3$ size for 100 classes\footnote{Selected classes are listed in \url{https://github.com/val-iisc/GAMA-GAT}.}.

\subsection{Baseline Setting}
PhaseAT is compared to both non-iterative (FBF, GAT, and NuAT) and iterative (PGD, TRADES, and AWP) methods. The hyper-parameter settings of each baseline are listed in Table~\ref{tab:param}. Since the evaluation results on ImageNet come from previous works \citep{sriramanan2020guided,sriramanan2021towards}, the table only includes the parameters reported in these works (unknown parameters are denoted with $-$).

\section{Additional Evaluation}
\subsection{Different Architectures}

\begin{table}[t]
\centering
\caption{Performance evaluation on CIFAR-10 dataset. The backbone networks are \textbf{WideResNet-34-10}. Best and second best results are highlighted in boldface and underline, respectively.}
\begin{adjustbox}{max width=\textwidth}
\begin{tabular}{@{}lccc@{}}
\toprule
                                 Method & Standard accuracy  & PGD$_{50}$ & AA \\ \midrule
                                 FBF \citep{wong2019fast}  & 82.1  & 54.4    & 51.3  \\
                                 GAT \citep{sriramanan2020guided}   & 84.7   & 56.1    & 52.1  \\
                                 NuAT \citep{sriramanan2021towards}   & 85.1   & 54.6    & 53.4  \\
                                 PhaseAT (Ours.)  & 88.8   & 62.3    & 59.2  \\ \bottomrule
\end{tabular}
\end{adjustbox}
\label{sup:exp:architecture}
\end{table}

We conduct additional experiments by scaling the PhaseAT backbone networks to verify the effectiveness of PhaseAT on different architectures. We use the WideResNet-34-10 architecture instead of PreActResNet-18 to evaluate each baseline on CIFAR-10. The comparison results are listed in Table~\ref{sup:exp:architecture}. Similar to the main experiment, we see that PhaseAT achieves the best results amongst all non-iterative methods, demonstrating that PhaseAT can be well scaled to the larger networks.

\subsection{Adversarial Robustness against Black-box Attacks}\label{sect:black}


\begin{table}[t]
\centering
\caption{Performance evaluation on CIFAR-10 dataset against two different black-box attacks.}
\begin{tabular}{@{}lcccc@{}}
\toprule
\multirow{2}{*}{Method} & \multirow{2}{*}{Standard accuracy} & \multicolumn{2}{c}{Transfer-based attack} & \multirow{2}{*}{Score-based attack} \\ \cmidrule(lr){3-4}
                  &                                    & VGG-11                 & ResNet-18            &                                     \\ \midrule
FBF \citep{wong2019fast} & 84.0       & 80.5   & 80.6           & 53.5      \\
GAT \citep{sriramanan2020guided}   & 80.5             & 79.8   & 80.3            & 54.1      \\
NuAT \citep{sriramanan2021towards}   & 81.6             & 79.5   & 80.5            & 56.7      \\
PhaseAT (Ours.)             & 86.2             & 83.8   & 85.0           & 76.5      \\  \bottomrule
\end{tabular}
\label{exp:black_cifar}
\end{table}


As DNN models are often hidden from users in real-world applications, the robustness against black-box attacks is also crucial. Among the different kinds of black-box attacks, we consider transfer-based \citep{liu2016delving,papernot2017practical} and score-based attacks. For transfer-based attacks, we use VGG-11 and ResNet-18 as substitute models and construct the attacks using seven steps of PGD \citep{madry2017towards}. For score-based attacks, we adopt square attack \citep{andriushchenko2020square} with 5,000 query budgets, which is a gradient-free attack and one of the strongest attacks in black-box attacks.

Table~\ref{exp:black_cifar} shows the robust accuracy against black-box attacks. Similar to white-box attacks, PhaseAT shows better accuracy against both transfer-based and score-based attacks in comparison to other non-iterative methods. In score-based attacks, the difference in performance between others and PhaseAT is particularly noticeable. This can be explained by the stochastic process of PhaseAT because \citet{qin2021random} demonstrate that randomized defense (e.g., Gaussian noise in the inputs) can robustly protect the model from score-based attacks. This is why we exclude the Square attack from AA for a fair comparison with other baselines that do not include the stochastic process. Note that the stochastic property cannot be circumvented in the black-box scenario because it is infeasible to design adaptive attacks (i.e., EOT attacks) as in the white-box scenario. Comprehensive results show that PhaseAT could be a robust defense strategy against both white-box and black-box attacks.


% \section{Filtering Method for Frequency Analysis}
% Motivated by the examination of F-principle \cite{xu2020frequency}, we use the filtering method to analyze the behavior of the neural networks in adversarial training. The idea is to split the frequency domain into two parts, i.e., low-frequency and high-frequency parts. However, the Fourier transform for high-dimensional data requires high computational costs and memory footprints. Hence we alternatively use the Fourier transform of a Gaussian function $\hat{G}$ where $\delta$ is the variance of the Gaussian function $G$. 

% Let the original dataset be $\{x_j, y_j\}_{j=0}^{N-1}$, and the network output for $x_j$ be $\mathcal{T}_j$. The low frequency part of the training dataset can be derived by
% \begin{equation}
%     y_{j}^{low, \delta} = \frac{1}{C_j} \sum_{m=0}^{N-1}y_{j} G^{\delta}(x_j - x_m)
% \end{equation}
% where $C_j = \sum_{m=0}^{N-1}G^{\delta}(x_j - x_m)$ is a normalization factor, and $\delta$ is the variance of the Gaussian function. The Gaussian function can be represented as 
% \begin{equation}
%     G^{\delta}(x_j - x_m) = \exp(-\vert x_j - x_m\vert^2/(2\delta)).
% \end{equation}
% Then, the high-frequency part can be derived by $y_{i}^{high, \delta} \delequal y_i - y_{i}^{low, \delta}$. We also compute the frequency for the networks, i.e, $\mathcal{T}_{i}^{low, \delta}, \mathcal{T}_{i}^{high, \delta}$ by replacing $y_j$ with the outputs of networks, i.e., $\mathcal{T}_j$. Lastly, we calculate the errors to quantify the convergence in terms of low- and high-frequency.
% \begin{equation}
%     e_{low} = \left(\frac{\sum_{j}\left\vert y_{j}^{low,\delta} - \mathcal{T}_j^{low,\delta} \right\vert^2}{\sum_{j} \left\vert y_{j}^{low,\delta}\right\vert^2} \right)^{\frac{1}{2}}
% \end{equation}

% \begin{equation}
%     e_{high} = \left(\frac{\sum_{j}\left\vert y_{j}^{high,\delta} - \mathcal{T}_j^{high,\delta} \right\vert^2}{\sum_{j} \left\vert y_{j}^{high,\delta}\right\vert^2} \right)^{\frac{1}{2}}
% \end{equation}

% \section{Iterative version of PhaseAT}

% \section{Different Architecture for Evaluations}

% \begin{table}[t]
% \centering
% \caption{Performance evaluation on CIFAR-10 dataset. The backbone networks are \textbf{WideResNet-34-10}. Best and second best results are highlighted in boldface and underline, respectively.}
% \begin{adjustbox}{max width=\textwidth}
% \begin{tabular}{@{}lccc@{}}
% \toprule
%                                  Method & Standard accuracy  & PGD$_{50}$ & AA \\ \midrule
%                                  FBF \cite{wong2019fast}  & 82.1  & 0.0    & 0.0  \\
%                                  GAT \cite{sriramanan2020guided}   & 84.7   & 56.1    & 52.1  \\
%                                  NuAT \cite{sriramanan2021towards}   & 85.1   & 54.6    & 53.4  \\
%                                  PhaseAT (Ours.)  & 88.8   & 62.3    & 59.2  \\ \bottomrule
% \end{tabular}
% \end{adjustbox}
% \label{sup:exp:architecture}
% \end{table}

% To verify the general applicability to different architecture, we conduct additional experimnets by scaling the backbon networks of PhaseAT. Instead of ResNet-18, we train WideResNet-34-10 architecture to evaluate each baselines on CIFAR-10. The comparison results are listed in Table \ref{sup:exp:architecture}. Similar to the main experiment, we see that PhaseAT achieves the best results amonst all non-iterative methods, which demonstrates that PhaseAT can be well scaled to the larger networks.


% \section{Training Details}
% \lipsum[1]

\begin{table}[ht]
\centering
\caption{Hyper-parameter setting for all baselines.}
\begin{adjustbox}{max width=\textwidth}
\begin{tabular}{@{}clccc@{}}
\toprule
Method                    & Hyper-parameters       & CIFAR-10 (PreActResNet-18)  & ImageNet-100 (ResNet-18) \\ \midrule
\multirow{5}{*}{FBF}      & perturbation $\epsilon$         & 0.031                 & 0.031                           \\
                          & perturbation step size & 0.039                 & 0.039                           \\
                          & learning rate          & 0.1                   & -                           \\
                          & epoch                  & 30                    & -                           \\
                          & batch size             & 256                   & -                           \\ \midrule
\multirow{5}{*}{GAT}      & perturbation $\epsilon$         & 0.031                 & 0.031                           \\
                          & perturbation step size & 0.031                    & 0.031                           \\
                          & learning rate          & 0.1                  & 0.1                           \\
                          & epoch                  & 100                   & 100                           \\
                          & batch size             & 64                   & 64                           \\ \midrule
\multirow{5}{*}{NuAT}     & perturbation $\epsilon$         & 0.031                 & 0.031                           \\
                          & perturbation step size & 0.031                    & 0.031                           \\
                          & learning rate          & 0.1                  & 0.1                           \\
                          & epoch                  & 100                   & 100                           \\
                          & batch size             & 64                   & 64                           \\ \midrule \midrule
\multirow{6}{*}{PGD}      & perturbation $\epsilon$         & 0.031                 & 0.031                           \\
                          & perturbation step size & 0.039                 & 0.039                           \\
                          & number of iterations   & 7                     & -                           \\
                          & learning rate          & 0.1                   & -                           \\
                          & epoch                  & 30                    & -                           \\
                          & batch size             & 256                  & -                           \\ \midrule
\multirow{6}{*}{TRADES}   & perturbation $\epsilon$         & 0.031                & 0.031                           \\
                          & perturbation step size & 0.007                & -                           \\
                          & beta                   & 6.0                  & -                           \\
                          & learning rate          & 0.1                  & -                           \\
                          & epoch                  & 100                  & -                           \\
                          & batch size             & 128                  & -                           \\ \midrule \midrule
\multirow{7}{*}{PhaseAT} & perturbation $\epsilon$         & 0.031                & 0.031                           \\
                          & perturbation step size & 0.039                & 0.039                          \\
                          & frequency range        & {[}0, 50000{)}       & {[}0, 50000{)}                           \\
                          & number of heads        & 3                    & 3                           \\
                          & learning rate          & 0.1                  & 0.1                           \\
                          & epoch                  & 30                   & 50                           \\
                          & batch size             & 256                  & 128                           \\ \bottomrule
\end{tabular}
\end{adjustbox}
\label{tab:param}
\end{table}

% \section{Consideration of Adaptive Attacks.}
% % adaptive attacks; 더 강한 attack 을 만들 수 있었다.
% % EOT 보다는 frequency 선택이 더 강한 adaptive attacks 
% Compared to previous methods, PhaseDNN involves stochastic process to select frequencies, which prevents reliable evaluation of the adversarial robustness. 
% \begin{wraptable}{r}{6cm}
% % \begin{table}[t]
% \caption{Performance evaluation of the different designs of adaptive attacks on CIFAR-10 dataset.}\label{wrap-tab:1}
% \begin{adjustbox}{max width=\textwidth}
% \begin{tabular}{@{}lcc@{}}\\\toprule  
% Attack  & Accuracy \\\midrule 
% PGD$_{50}$                      &       61.5 \small{$\pm 0.0$} \\  \midrule
% PGD$_{50}$ + EOT                &       60.9 \small{$\pm 0.0$} \\  
% PGD$_{50}$ + Frequency          &       59.7 \small{$\pm 0.0$} \\  
% PGD$_{50}$ + EOT + Frequency    &       59.5 \small{$\pm 0.0$} \\  \bottomrule
% \end{tabular}
% \end{adjustbox}
% \label{tab:adaptive_attack}
% \end{wraptable} Following the guideline \cite{tramer2018ensemble}, we carefully design the adaptive attack for our method. First, to circumvent the stochastic process, the adversary performs the expectation over transformation (EOT) \cite{athalye2018synthesizing} to approximate the true gradient of the inference model. Second, we assume that the adversary has the full knowledge about the learning algorithm. 
% Hence the adversary could mimic the strategy of frequency selection (in Eq. \ref{eq:freq_select}) used in the training/inference phase. We tabulate how these adaptive attacks affect our method in Table \ref{tab:adaptive_attack}. 
% As can be seen from the table, reducing the effects of the stochastic properties by EOT decreases the performance of our method. Particularly, mimicking the frequency selection strategy further reduces the performance. However, note that PhaseAT still reveals strong adversarial robustness even though the adaptive attacks are applied, which requires ten times more costs to generate the attacks compared to other baselines. Such adaptive attacks are applied to our method for fair comparison, and additional but failed trials are listed in Supplementary material.

% \subsection{Adversarial Robustness against Black-box Attacks}

% \begin{table}[t]
% \centering
% \caption{Performance evaluation on CIFAR-10 dataset against two different black-box attacks.}
% \begin{tabular}{@{}lcccc@{}}
% \toprule
% \multirow{2}{*}{Method} & \multirow{2}{*}{Standard accuracy} & \multicolumn{2}{c}{Transfer-based attack} & \multirow{2}{*}{Score-based attack} \\ \cmidrule(lr){3-4}
%                   &                                    & VGG-11                 & ResNet-18            &                                     \\ \midrule
% FBF \cite{wong2019fast} & 84.0       & 80.5   & 80.6           & 53.5      \\
% GAT \cite{sriramanan2020guided}   & 80.5             & 79.8   & 80.3            & 54.1      \\
% NuAT \cite{sriramanan2021towards}   & 81.6             & 79.5   & 80.5            & 56.7      \\
% PhaseAT (Ours.)             & 86.2             & 83.8   & 85.0           & 76.5      \\  \midrule \midrule
% RND \cite{qin2021random}             & 93.0             & -  & -          & 82.9      \\  \bottomrule
% \end{tabular}
% \label{exp:black_cifar}
% \end{table}


% As the DNN models are often hidden from users in real-world applications, their robustness against black-box attacks is also crucial. Among the different kinds of black-box attacks, we consider transfer-based \citep{liu2016delving,papernot2017practical} and score-based attacks. For transfer-based attacks, we use VGG-11 and ResNet-18 as substitute models and construct the attacks using seven steps of PGD \cite{madry2017towards}. For score-based attacks, we adopt square attack \citep{andriushchenko2020square} with 5,000 query budgets, which is a gradient-free attack and one of the strongest attacks in black-box attacks.

% % 두 attack 모두에 대해서 강하다
% % Square -> randomness 를 포함하고 있기에 black-box 에 대해서도 강할 수 있다. RAND method 언급
% Table \ref{exp:black_cifar} shows the robust accuracy against black-box attacks. Similar to white-box attacks, PhaseAT shows better accuracy against both transfer-based and score-based attacks in comparison to other non-iterative methods. In score-based attacks, the difference in performance between others and PhaseAT is particularly noticeable. This can be explained by the stochastic process of PhaseAT because Qin et al. \cite{qin2021random} demonstrate that randomized defense (e.g., Gaussian noise in the inputs) can robustly prevent the model from score-based attacks. This is why we exclude square attack from the AA attack for a fair comparison. As PhaseAT has the stochasticity in the outputs instead of inputs, we further extend their findings by showing that stochastic property in the outputs could bring the robustness against score-based attacks. Note that the stochastic property can not be circumvented in the black-box scenario because it is infeasible to design adaptive attacks (i.e., EOT attacks) as in the white-box scenario. Comprehensive results show that PhaseAT could be a robust defense against both white-box and black-box attacks. 

\section{Proofs of Theorems 3.1 and 3.2}

\subsection{Preliminaries}
Before proving Theorems 3.1 and 3.2 in this section, we start with a detailed explanation of DNNs, and then introduce the mathematical tools required for the proofs, which can be found in standard references \citep[e.g.,][]{stein,wolff,Schlag,Evans}.

\paragraph{Deep Neural Networks.} A DNN with $K$-hidden layers and general activation functions is a vector-valued function $\mathcal T_{\theta}(x): \mathbb{R}^d \rightarrow \mathbb{R}^{m_{K+1}}$
where $m_k$ denotes the number of nodes in the $k$-th layer (with $m_0=d$ for the input layer).
For $1\leq k \leq K+1$, we set ${\boldsymbol W}^{(k)}\in\mathbb{R}^{m_k \times m_{k-1}}$ and ${\boldsymbol b}^{(k)}\in\mathbb{R}^{m_k}$ as the matrices whose entries consist of the weights and biases called parameters.
The parameter vector $\theta$ is then defined as $$\theta=\big(\textrm{vec}({\boldsymbol W}^{(1)}), \textrm{vec}({\boldsymbol b}^{(1)}),\cdots,\textrm{vec}({\boldsymbol W}^{(K+1)}),\textrm{vec}({\boldsymbol b}^{(K+1)})\big)\in\mathbb{R}^M,$$
where $M=\sum_{k=1}^{K+1}(m_{k-1}+1)m_k$ is the number of the parameters. 
Given $\theta\in\mathbb{R}^M$ and an activation function $\sigma:\mathbb{R}\rightarrow \mathbb{R}$, the DNN output $\mathcal T_\theta^{(K+1)}(x):\mathbb{R}^d \rightarrow \mathbb{R}^{m_{K+1}}$ is expressed in terms of composite functions;
setting $\mathcal T_\theta^{(0)}(x)=x$, $\mathcal T_\theta^{(k)}(x):\mathbb{R}^d \rightarrow \mathbb{R}^{m_k}$ is defined recursively as
\begin{equation*}
(\mathcal T_\theta^{(k)}(x))_i = \sigma \big(({\boldsymbol W}^{(k)}\mathcal T_\theta^{(k-1)}(x) + {\boldsymbol b}^{(k)})_i\big), \,\, 1\leq i \leq m_k,\,\, 1\leq k\leq K.
\end{equation*}
We denote the DNN output $\mathcal T_\theta^{(K+1)}(x)={\boldsymbol W}^{(K+1)} \mathcal T_\theta^{(K)}(x) + {\boldsymbol b}^{(K+1)}$ by $\mathcal T_\theta(x)$.

\paragraph{The Basic Properties of Fourier Transforms.}
Let $f\in L^1(\mathbb{R}^d)$. 
The Fourier transform of $f$ is defined by
\begin{equation*}
\widehat{f}(\xi)=\int_{\mathbb{R}^d} e^{-2\pi i x\cdot \xi} f(x) dx.
\end{equation*}
Then clearly
\begin{equation}\label{fou}
    \|\widehat f\|_{L^{\infty}} \leq \|f\|_{L^1}.
\end{equation}
Additionally if $\widehat {f} \in L^1(\mathbb{R}^d)$, 
the Fourier inversion holds:
\begin{equation}\label{inv}
f(x)=\int_{\mathbb{R}^d} e^{2\pi i x\cdot \xi} \widehat{f}(\xi) d\xi.
\end{equation}
If $f,g\in L^1 (\mathbb{R}^d)$, then $f\ast g\in L^1(\mathbb{R}^d)$ and
\begin{equation}\label{con}
\widehat{f\ast g} = \widehat{f}\,\widehat{g}.
\end{equation}
For a $d$-tuple $\alpha=(\alpha_1, \cdots , \alpha_d)$ of nonnegative integers, we denote
\[D^\alpha = \prod_{j=1}^{d} \frac{\partial^{\alpha_j}}{\partial x_j^{\alpha_j}} \quad \textnormal{and} \quad |\alpha|=\sum_{j=1}^d \alpha_j.\]
Then, if $D^\alpha f \in L^1(\mathbb{R}^d)$ whenever $0\leq|\alpha|\leq s$, 
\begin{equation}\label{dif}
\widehat{D^\alpha f} (\xi) = (2\pi i)^{|\alpha|} \xi^\alpha \widehat f(\xi).
\end{equation}

\paragraph{Sobolev Spaces and Gaussian Weights.}
For $s \in \mathbb{N}$, the Sobolev space $W^{s,\infty}(\mathbb{R}^{d})$ is defined as
$$W^{s,\infty}(\mathbb{R}^{d}) = \{ f \in L^{\infty}(\mathbb{R}^d) : D^{\alpha}f \in L^{\infty}(\mathbb{R}^d) \, \textnormal{for all} \, 0\leq|\alpha| \leq s\}$$ equipped with the norm
\begin{equation*}
\| f \|_{W^{s,\infty}(\mathbb{R}^{d})} = \sum_{|\alpha| \leq s} \| D^{\alpha}f\|_{L^\infty (\mathbb{R}^{d})}.
\end{equation*}
We also introduce a Gaussian weight $G_\varepsilon(x)=\varepsilon^{-d}e^{-\pi \varepsilon^{-2} |x|^2}$ for any $\varepsilon>0$ on which the Fourier transform has an explicit form,
\begin{equation}\label{Ga}
\widehat{G_\varepsilon} (\xi) = e^{-\pi \varepsilon^2 |\xi|^2}.    
\end{equation}
The final observation is that $G_\varepsilon$ is an approximate identity with respect to the limit $\varepsilon\rightarrow0$ as in the following well-known lemma:
\begin{lem}\label{delta}
Let $f\in C(\mathbb{R}^d)\cap L^\infty(\mathbb{R}^d)$. Then
\begin{equation}\label{approx}
\lim_{\varepsilon\rightarrow0}\int_{\mathbb{R}^d}
G_\varepsilon(x-y)f(y)dy=f(x)
\end{equation}
for all $x\in\mathbb{R}^d$.
\end{lem}

\subsection{Proof of Theorem 3.1}\label{sec:pfthm}
In what follows we may consider a compact domain $\Omega$ instead of $\mathbb{R}^d$ because the input data $\{x_j\}_{j=0}^{N-1}$ used for training is sampled from a bounded region. 

For the discrete input data $\{x_j\}_{j=0}^{N-1}$, we now recall the total loss in adversarial training from Section 3.1:
\begin{equation}\label{total0}
L(\theta) = \frac{1}{N} \sum_{j=0}^{N-1} \ell(\mathcal{T}_{\theta}\circ\mathcal A,g)(x_j).
\end{equation}
From the continuity of $\mathcal T_\theta$ and $g$ in the compact domain $\Omega$, we note that $\ell(\mathcal T_\theta \circ \mathcal A,g)$ is continuous and bounded for general loss functions such as mean-squared error loss and cross-entropy loss.
Then we can apply Lemma \ref{delta} to deduce
\begin{align}\label{total}
\nonumber
L(\theta)&=\lim_{\varepsilon \rightarrow 0}\frac{1}{N} \sum_{j=0}^{N-1}\int_{\mathbb{R}^d} G_{\varepsilon}(x_j-x) \ell(\mathcal{T}_{\theta}\circ\mathcal A,g)(x)dx\\
&=\lim_{\varepsilon \rightarrow 0}\frac{1}{N} \sum_{j=0}^{N-1} \big(G_{\varepsilon} \ast\ell(\mathcal{T}_{\theta}\circ\mathcal A,g)\big)(x_j).
\end{align}
Using the properties $G_{\varepsilon} \in L^1(\mathbb{R}^d)$ and $\ell(\mathcal T_{\theta} \circ \mathcal A,g) \in L^1(\mathbb{R}^d)$, we then derive from Eq.~\ref{con} and Eq.~\ref{Ga} that 
\begin{equation*}
G_{\varepsilon}\ast\ell(\mathcal{T}_{\theta}\circ\mathcal A,g) \in L^1(\mathbb{R}^d)
\end{equation*}
and
\begin{equation}\label{df}
\reallywidehat{G_{\varepsilon}\ast\ell(\mathcal{T}_{\theta}\circ\mathcal A,g)} (\xi) =e^{-\pi \varepsilon^2 |\xi|^2}\, \reallywidehat{\ell(\mathcal{T}_{\theta}\circ\mathcal A,g)}(\xi).
\end{equation}
Note here that by Eq.~\ref{fou}
\begin{equation*}
\|e^{-\pi \varepsilon^2 |\xi|^2} \, \reallywidehat{\ell(\mathcal{T}_{\theta}\circ\mathcal A,g)}(\xi)\|_{L^1} \leq \|\reallywidehat{\ell(\mathcal{T}_{\theta}\circ\mathcal A,g)}\|_{L^{\infty}} \|e^{-\pi \varepsilon^2 |\xi|^2}\|_{L^1} \leq C \|\ell(\mathcal{T}_{\theta}\circ\mathcal A,g)\|_{L^1} < \infty. 
\end{equation*}
Hence the Fourier inversion Eq.~\ref{inv} together with Eq.~\ref{df} implies
\begin{equation}\label{conv}
\big(G_{\varepsilon}\ast\ell(\mathcal{T}_{\theta}\circ\mathcal A,g)\big)(x_j)
=\int_{\mathbb{R}^d}e^{2\pi i x_j \cdot \xi}e^{-\pi \varepsilon^2 |\xi|^2}\, \reallywidehat{\ell(\mathcal{T}_{\theta}\circ\mathcal A,g)}(\xi) d\xi.
\end{equation}
Substituting Eq.~\ref{conv} into the right-hand side of Eq.~\ref{total}, 
we immediately obtain 
\begin{equation}\label{arg}
L(\theta)=\lim_{\varepsilon \rightarrow 0} \frac{1}{N} \sum_{j=0}^{N-1}\int_{\mathbb{R}^d} e^{2\pi i x_j \cdot\xi }\, e^{-\pi \varepsilon^2 |\xi|^2} \reallywidehat{\ell\big(\mathcal T_{\theta}\circ \mathcal{A},g\big)}(\xi) d\xi,
\end{equation}
as desired. This completes the proof.


\subsection{Proof of Theorem 3.2}
\paragraph{Representing $\nabla_\theta L(\theta)$ in the frequency domain.}
To begin with, we represent $\nabla_\theta L(\theta)$ in the frequency domain in the same way as in Section \ref{sec:pfthm}.
By differentiating both sides of Eq.~\ref{total0} with respect to $\theta$ and using Lemma \ref{delta}, we first see  
\begin{align}\label{gra}
\nonumber
\nabla_\theta L(\theta) &= \frac{1}{N} \sum_{j=0}^{N-1} \nabla_\theta\ell(\mathcal{T}_{\theta}\circ\mathcal A,g)(x_j)\\
&=\lim_{\varepsilon\rightarrow 0}\frac{1}{N} \sum_{j=0}^{N-1}\int_{\mathbb{R}^d} G_\varepsilon (x_j-x)\nabla_{\theta}\ell(\mathcal{T}_{\theta}\circ\mathcal A,g)(x) dx
\end{align}
if $\nabla_\theta \ell(\mathcal T_\theta \circ \mathcal A,g)$ is continuous and bounded.
Since $\ell(\mathcal T_\theta \circ \mathcal A,g)$ is differentiable with respect to the first argument (as mentioned in Section 3.1) and $\mathcal T_\theta$ is differentiable with respect to $\theta$ for general activation functions such as ReLU, eLU, tanh and sigmoid, the continuity generally holds, and the boundedness then follows from the compactness of the domain $\Omega$.
In fact, the ReLU activation function is not differentiable at the origin and neither is $\mathcal T_\theta$ on a certain union of hyperplanes;
for example, when considering $1$-hidden layer neural network with $m_1$ nodes and $1$-dimensional output, the output is 
$$\mathcal T_\theta (x) = \sum_{i=1}^{m_1} w_i^{(2)} \sigma (\boldsymbol W_i^{(1)} \cdot x + \boldsymbol b_i^{(1)}),\quad w_i^{(2)},b_i^{(1)} \in \mathbb{R}, \,\boldsymbol W_i^{(1)}\in \mathbb{R}^d$$
%$(\mathcal T_\theta(x))_i = w \sigma\big((\boldsymbol W^{(1)} \cdot x + \boldsymbol b^{(1)})_i \big)+b$ 
and the set of non-differentiable points is a union of hyperplanes given by $\bigcup_{i=1}^{m_1}\{x\in \mathbb{R}^d:\boldsymbol W_i^{(1)} \cdot x + \boldsymbol b_i^{(1)}=0\}$. But the $d$-dimensional volume of such thin sets is zero and thus they may be excluded from the integration region in Eq.~\ref{approx} when applying Lemma \ref{delta} to obtain Eq.~\ref{gra} for the case of ReLU.

Just by replacing $L(\theta)$ with $\nabla_\theta L(\theta)$ in the argument employed for the proof of Eq.~\ref{arg} and repeating the same argument, it follows now that
\begin{equation}\label{loss} 
\nabla_\theta L(\theta)=\lim_{\varepsilon \rightarrow 0} \frac{1}{N} \sum_{j=0}^{N-1}\int_{\mathbb{R}^d} e^{2\pi i x_j \cdot\xi }\, e^{-\pi \varepsilon^2 |\xi|^2} \reallywidehat{\nabla_\theta \ell\big(\mathcal T_{\theta}\circ \mathcal{A},g\big)}(\xi) d\xi.
\end{equation}
We then pull $\nabla_\theta$ to the outside of the integration in Eq.~\ref{loss}, and recall $L_{\leq\eta}(\theta)$ and $L_{\geq\eta}(\theta)$ 
from Section 3.1, contributed by low and high frequencies in the loss, to see
\begin{equation}\label{app}
\nabla_\theta L(\theta) \approx \nabla_\theta L_{\leq\eta}(\theta) + \nabla_\theta L_{\geq\eta}(\theta).
\end{equation}
This approximation becomes more and more accurate as $\varepsilon$ decreases in Eq.~\ref{loss}, and the size of $\varepsilon$ will later be determined to be inversely proportional to the number of data points $N$ or the dimension $d$, in order to consider a natural approximation reflecting the discrete experimental setting.

\paragraph{Estimating $\nabla_\theta L_{\geq\eta}(\theta)$ in terms of $\eta$.}
Now we show that for the $i$-th element of $\nabla_\theta L_{\geq\eta}(\theta)$
\begin{equation}\label{el}
\Big|\frac{\partial L_{\geq\eta}(\theta)}{\partial\theta_i}\Big| \leq C\max(N, d^d)\,\eta^{-2s}
\end{equation}
which implies
\begin{equation*}
|\nabla_{\theta}L_{\geq\eta}(\theta)|= \Big(\sum_{i=1}^{M} \Big|\frac{\partial L_{\geq\eta}(\theta)}{\partial\theta_i}\Big|^2\Big)^{1/2} \leq C \max(N, d^d)\, \eta^{-2s}.
\end{equation*}
By Eq.~\ref{app} and this bound, we get 
\begin{equation*}
|\nabla_\theta L(\theta) - \nabla_\theta L_{\leq\eta}(\theta)|\approx  |\nabla_\theta L_{\geq\eta}(\theta)|\leq C \max(N, d^d)\, \eta^{-2s}
\end{equation*}
which completes the proof of Theorem 3.2.

To show Eq.~\ref{el}, we first use the chain rule to calculate 
\begin{align*}
\frac{\partial L_{\geq\eta}(\theta)}{\partial \theta_i}= \frac{1}{N} \sum_{j=0}^{N-1} \int_{|\xi|\geq \eta} e^{2\pi i x_j \cdot \xi} e^{-\pi \varepsilon^2 |\xi|^2}\reallywidehat{\nabla_{\mathcal T_\theta}\ell(\mathcal T_{\theta}\circ \mathcal A,g)\cdot \frac{\partial(\mathcal T_\theta\circ \mathcal A)}{\partial\theta_i}}(\xi) d\xi.
\end{align*}
Since $\eta \leq \langle\xi\rangle:=\sqrt{1+|\xi|^2}$ for all $0<\eta \leq |\xi|$, we then see that for $s\in\mathbb{N}$
\begin{align}\label{1}
\nonumber
\bigg|\frac{\partial L_{\geq\eta}(\theta)}{\partial \theta_i}\bigg| &\leq \frac{1}{N} \sum_{j=0}^{N-1} \eta^{-2s}\int_{|\xi|\geq \eta} e^{-\pi \varepsilon^2 |\xi|^2}\Big|\langle\xi\rangle^{2s}\reallywidehat{\nabla_{\mathcal T_\theta}\ell(\mathcal T_{\theta}\circ \mathcal A,g)\cdot \frac{\partial(\mathcal T_\theta\circ \mathcal A)}{\partial\theta_i}}(\xi)\Big|d\xi\\
&\leq \frac{1}{N} \sum_{j=0}^{N-1} \eta^{-2s}\Big\|\langle\xi\rangle^{2s}\reallywidehat{\nabla_{\mathcal T_\theta}\ell(\mathcal T_{\theta}\circ \mathcal A,g)\cdot \frac{\partial(\mathcal T_\theta\circ \mathcal A)}{\partial\theta_i}}(\xi)\Big\|_{L^{\infty}} \|e^{-\pi \varepsilon^2 |\xi|^2}\|_{L^1}.
\end{align}
By a change of variables $\varepsilon \xi \rightarrow \xi$, we note
\begin{equation*}
\|e^{-\pi \varepsilon^2|\xi|^2}\|_{L^1} = \varepsilon^{-d}\int_{\mathbb{R}^d} e^{-\pi |\xi|^2} d\xi \leq C \varepsilon^{-d}.
\end{equation*}
Hence, if we show that the $L^\infty$-norm in Eq.~\ref{1} is finite, then
\begin{equation*}
\bigg|\frac{\partial L_{\geq\eta}(\theta)}{\partial \theta_i}\bigg|\leq C \eta^{-2s}\varepsilon^{-d}.
\end{equation*}
Finally, if we take $\varepsilon= \min\{1/\sqrt[d]{N}, 1/ d\}$ for large $N, d$, we conclude  
\begin{equation}\label{still}
\bigg|\frac{\partial L_{\geq\eta}(\theta)}{\partial \theta_i}\bigg|\leq C \max\{N, d^d\}\eta^{-2s}
\end{equation}
as desired.

Now all we have to do is to bound the $L^\infty$-norm in Eq.~\ref{1}. 
Using the simple inequalities
\[\langle\xi\rangle\leq 1+|\xi|,\quad(1+|\xi|)^n \leq C \sum_{|\alpha|\leq n} |\xi^\alpha| \quad (n\in\mathbb{N}),\] and Eq.~\ref{dif}, Eq.~\ref{fou} in turn, we first see 
\begin{align*}
\Big\|\langle\xi\rangle^{2s}\reallywidehat{\nabla_{\mathcal T_{\theta}}\ell(\mathcal T_{\theta}\circ \mathcal A,g)\cdot \frac{\partial(\mathcal T_\theta\circ \mathcal A)}{\partial\theta_i}}\Big\|_{L^{\infty}} &\leq \Big\|(1+|\xi|)^{2s}\reallywidehat{\nabla_{\mathcal T_{\theta}}\ell(\mathcal T_{\theta}\circ \mathcal A,g)\cdot \frac{\partial(\mathcal T_\theta\circ \mathcal A)}{\partial\theta_i}}\Big\|_{L^{\infty}}\\
\nonumber
&\leq C\sum_{|\alpha|\leq 2s}\Big\|\xi^{\alpha}\reallywidehat{\nabla_{\mathcal T_{\theta}}\ell(\mathcal T_{\theta}\circ \mathcal A,g)\cdot \frac{\partial(\mathcal T_\theta\circ \mathcal A)}{\partial\theta_i}}\Big\|_{L^{\infty}}\\
\nonumber
&\leq C\sum_{|\alpha|\leq 2s}\Big\|\reallywidehat{D^{\alpha}\big(\nabla_{\mathcal T_{\theta}}\ell(\mathcal T_{\theta}\circ \mathcal A,g)\cdot \frac{\partial(\mathcal T_\theta\circ \mathcal A)}{\partial\theta_i}\big)}\Big\|_{L^{\infty}}\\
&\leq C\sum_{|\alpha|\leq 2s}\Big\|D^{\alpha}\big(\nabla_{\mathcal T_{\theta}}\ell(\mathcal T_{\theta}\circ \mathcal A,g)\cdot \frac{\partial(\mathcal T_\theta\circ \mathcal A)}{\partial\theta_i}\big)\Big\|_{L^1}.
\end{align*}
By Leibniz's rule we then bound the $L^1$-norm in the above as 
\begin{equation*}
\Big\|D^{\alpha}\big(\nabla_{\mathcal T_{\theta}}\ell(\mathcal T_{\theta}\circ \mathcal A,g)\cdot \frac{\partial(\mathcal T_\theta\circ \mathcal A)}{\partial\theta_i}\big)\Big\|_{L^1} \leq C \sum_{\substack{|\alpha_1|+|\alpha_2|\\=|\alpha|}}\Big\|D^{\alpha_1}\nabla_{\mathcal T_{\theta}}\ell(\mathcal T_{\theta}\circ \mathcal A,g)\cdot D^{\alpha_2}\frac{\partial(\mathcal T_\theta\circ \mathcal A)}{\partial\theta_i} \Big\|_{L^1}.
\end{equation*}

When $|\alpha|\leq s-1$, we have $|\alpha_1|\leq s$ and $|\alpha_2|+1\leq s$, 
and then the $L^1$-norm in the right-hand side is generally finite since $\sigma\in W^{s,\infty}(\mathbb{R})$, $g \in W^{s,\infty}(\mathbb{R}^d)$, and the $L^1$-norm may be taken over the compact domain $\Omega$;
for example, $\ell(\mathcal T_\theta \circ \mathcal A,g)(x)=|(\mathcal T_\theta \circ \mathcal A)(x)-g(x)|^2$ and 
\begin{equation*}
\nabla_{\mathcal T_\theta} \ell(\mathcal T_\theta \circ \mathcal A,g)(x)=2((\mathcal T_\theta \circ \mathcal A)(x)-g(x))\end{equation*}
for mean-squared error loss.
Since $(\mathcal T_\theta \circ \mathcal A)(x)$ is expressed as compositions of $\sigma$, the regularity of $\nabla_{\mathcal T_\theta} \ell (\mathcal T_\theta \circ \mathcal A, g)(x)$ is exactly determined by that of $\sigma$ and $g$.
Namely, 
\begin{equation}\label{theat}
\nabla_{\mathcal T_\theta} \ell (\mathcal T_\theta \circ \mathcal A, g)(x)\in W^{s,\infty}(\mathbb{R}^d), \quad  (\mathcal T_\theta \circ \mathcal A)(x) \in W^{s,\infty}(\mathbb{R}^d),
\end{equation}
from which the $L^1$-norm taken over the compact domain $\Omega$ is finite
since $|\alpha_1|\leq s$ and $|\alpha_2|+1\leq s$.

On the other hand, when $s\leq|\alpha|\leq 2s$ we set $|\alpha|=s+j$ with $0\leq j \leq s$.
Firstly, if $0\leq|\alpha_1|\leq s$ (and so $j\leq|\alpha_2|\leq s+j$ since $|\alpha|=|\alpha_1|+|\alpha_2|$), then we bound
\begin{align*}
\Big\|D^{\alpha_1} \nabla_{\mathcal T_{\theta}}\ell(\mathcal T_{\theta}\circ \mathcal A,g)\cdot& D^{\alpha_2} \frac{\partial(\mathcal T_{\theta}\circ \mathcal A)}{\partial\theta_i}\Big\|_{L^1}\leq \big\|D^{\alpha_1} \nabla_{\mathcal T_{\theta}}\ell(\mathcal T_{\theta}\circ \mathcal A,g)\big\|_{L^\infty} \Big\|D^{\alpha_2} \frac{\partial(\mathcal T_{\theta}\circ \mathcal A)}{\partial\theta_i}\Big\|_{L^1}.
\end{align*}
Here, by Eq.~\ref{theat}, the $L^\infty$-norm in the right-hand side is finite since $|\alpha_1|\leq s$, while the finiteness of $L^1$-norm follows from the fact that $D^{\beta}\sigma \in L^1(\mathbb{R})$ where $|\beta|=|\alpha_2|+1$. This fact is indeed valid for general activation functions such as ReLU, eLU, tanh and sigmoid since the $L^1$-norm may be taken over the compact domain $\Omega$.
Finally, if $s+1\leq|\alpha_1|\leq s+j$ (and so $0\leq|\alpha_2|\leq j-1$), then we bound this time
\begin{align*}
\Big\|D^{\alpha_1} \nabla_{\mathcal T_{\theta}}\ell(\mathcal T_{\theta}\circ \mathcal A,g)\cdot& D^{\alpha_2} \frac{\partial(\mathcal T_{\theta}\circ \mathcal A)}{\partial\theta_i}\Big\|_{L^1}\leq \big\|D^{\alpha_1} \nabla_{\mathcal T_{\theta}}\ell(\mathcal T_{\theta}\circ \mathcal A,g)\big\|_{L^1} \Big\|D^{\alpha_2} \frac{\partial(\mathcal T_{\theta}\circ \mathcal A)}{\partial\theta_i}\Big\|_{L^\infty}.
\end{align*}
Here, the $L^\infty$-norm in the right-hand side is finite by Eq.~\ref{theat} since $|\alpha_2|+1\leq j\leq s$.
The finiteness of $L^1$-norm also comes from $D^{\beta}\sigma\in L^1(\mathbb{R})$ and $D^{\beta}g \in L^1(\mathbb{R}^d)$ with $s+1\leq|\beta|\leq 2s$.
Here, the condition $D^{\beta}\sigma\in L^1(\mathbb{R})$ is valid generally as above. Moreover, the bound Eq.~\ref{still} remains valid with $\eta^{-s}$ in place of $\eta^{-2s}$ even without requiring the condition $D^{\beta}g \in L^1(\mathbb{R}^d)$; 
this corresponds to the case $j=0$ in the proof.

\bibliography{kim_707}

\end{document}
