\documentclass[accepted]{uai2022}

\usepackage{microtype}
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage[colorinlistoftodos,textsize=scriptsize]{todonotes}
\usepackage{marginnote}
\usepackage{amsthm}
\usepackage{booktabs}
\usepackage{multirow}
\usepackage{float}
\usepackage{subcaption}
\usepackage{wrapfig}
\graphicspath{{./Figures/}}

% \usepackage{xr}
% \externaldocument{appendix.tex}

\linepenalty=1000


\usepackage{nameref}
\usepackage{zref-xr}
\zxrsetup{toltxlabel}
% \zexternaldocument*{appendix}
\zexternaldocument*{maini_322}
% \externaldocument{../maini_322-supp}


\newcommand{\theHalgorithm}{\arabic{algorithm}}
\newtheorem{theorem}{Theorem}

\def\ours{\textsc{Protector}}
\def\re{\text{ReColor}}
\def\st{\text{StAdv}}
\def\mp{M_\mathcal{A}}


\definecolor{darkgreen}{rgb}{0,0.3,0}
\definecolor{darkblue}{rgb}{0,0,0.5}
\definecolor{darkorange}{rgb}{0.9,0.4,0}
\newcommand{\eat}[1]{}

\usepackage[utf8]{inputenc} %
\usepackage[T1]{fontenc}    %
\usepackage{url}            %
\usepackage{booktabs}       %
\usepackage{amsfonts}       %
\usepackage{nicefrac}       %
\usepackage{microtype}      %
\usepackage{xcolor}         %
\usepackage[american]{babel}
\usepackage{natbib} %
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}


\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}

\usepackage{mathtools} %
\usepackage{booktabs} %
\usepackage{tikz} %


\title{Perturbation Type Categorization for Multiple Adversarial Perturbation Robustness: Supplementary Material}

\author[1]{\href{mailto:<pratyushmaini@cmu.edu>?Subject=Your UAI 2022 paper}{Pratyush Maini}{}}
\author[2]{Xinyun Chen}
\author[3]{Bo Li}
\author[2]{Dawn Song}

\affil[1]{%
    Carnegie Mellon University
}
\affil[2]{%
    University of California, Berkeley
}
\affil[3]{%
    University of Illinois at Urbana-Champaign
  }
  
  \begin{document}

\onecolumn
\maketitle
\appendix
\section{Problem Setting: Theoretical Analysis}
% \subsection{Problem Setting}
\label{app:sep-proof-setting}
% In this section, we formally define the problem setting and motivate the distinctions made with respect to the problem studied by \citet{ilyas2019adversarial}.
The classification problem consists of two tasks: \textbf{(1)} Predicting the correct class label of an adversarially perturbed (or benign) image using adversarially robust classifier $\mp{}$; and \textbf{(2)} Predicting the type of adversarial perturbation that the input image was subjected to, using attack classifier $C_{adv}$.

\paragraph{Setup.} We consider data whose inputs are sampled from two multivariate Gaussian distributions, such that the input-label pairs $(x,y)$ can be described as:
\begin{align}
    \centering
    \begin{split}
        y &\stackrel{u.a.r}{\sim}\{-1,+1\}, 
        \\
        x_0 {\sim} \mathcal{N}(y\alpha, \sigma^2),
        &\quad
        x_1, \dots, x_d \stackrel{i.i.d}{\sim} \mathcal{N}(y\eta, \sigma^2),
    \end{split}
\end{align}

where the input $x \sim \mathcal{N}(y\boldsymbol\mu, \mathbf{\Sigma}) \in \mathcal{R}^{(d+1)}$; $\eta = \alpha/\sqrt{d}$ for some positive constant $\alpha$; $\boldsymbol\mu = [\alpha, \eta,\dots,\eta] \in \mathcal{R^+}^{(d+1)}$ and $\mathbf{\Sigma} = \sigma^2 \mathbf{I} \in \mathcal{R^+}^{(d+1) \times (d+1)}$.  We can assume, without loss of generality, that the means of the two distributions have the same absolute value, since for any two distributions with means $ \boldsymbol\mu_1,  \boldsymbol\mu_2$, we can translate the origin to $\frac { \boldsymbol\mu_1 +  \boldsymbol\mu_2}{2}$. 
This setting demonstrates the distinction between an input feature $x_0$ that is strongly correlated with the input label and $d$ weakly correlated features that are normally distributed (independently) with mean $y\eta$ and variance $\sigma^2$ each. We adapt this setting from \citet{ilyas2019adversarial} who used a stochastic feature $x_0 = y$ with probability $p$, as opposed to a normally distributed input feature as in our case. All our findings hold in the other setting as well, however, the chosen setting better represents true data distribution, with some features that are strongly correlated to the input label, while others that have only a weak correlation.


\section{Separability of perturbation types (Theorem~\ref{thm:separability})}
\label{app:sep-proof}
% ~\xinyun{TODO: re-arrange the proofs of Theorem 1 and 2, add another section to summarize the preliminaries for both theorems.}
Our goal is to
evaluate whether the optimal perturbations confined within different $\ell_p$ balls have different distributions and whether they are separable. We do so by
developing a bound on the maximum error in classification of the perturbation types. The goal of the adversary is to fool a standard (non-robust) classifier $M$. $C_{adv}$ aims to predict the perturbation type based on \textbf{only} viewing the adversarial image, and not the perturbation $\delta$ itself.

First, in Appendix~\ref{app:sec:gaussian_classifier} we define a binary Gaussian classifier that is trained on the given task. Given the weights of the binary classifier, we then identify the optimal adversarial perturbation for each of the $\ell_1, \ell_2, \ell_\infty$ attack types in Appendix~\ref{app:sep:perturb}. In Appendix~\ref{app:sep:cadv} we define the difference between the adversarial input distribution for different $\ell_p$ balls. Finally, we calculate the error in classification of these adversarial input types in Appendix~\ref{app:sep:error} to conclude the proof of Theorem~\ref{thm:separability}.

\begin{figure*}[h]
\centering
  \includegraphics[width=0.42\linewidth]{figures/binary_gaussian.pdf}
\caption{\textbf{Simulation}: Decision boundary (solid green line) of binary Gaussian classifier. $x_M = \frac{1}{\sqrt{d}} \sum_{i=1}^{d} x_i$ represents a meta feature, and $x_0$ is the first dimension of the input.}
 \label{fig:binary_gaussian}
 \vspace{-1em}
\end{figure*}
\subsection{Binary Gaussian Classifier}
\label{app:sec:gaussian_classifier}
We assume that we have enough input data to be able to empirically estimate the parameters $\mu, \sigma$ of the input distribution via sustained sampling. The multivariate Gaussian representing the input data is given by:

\begin{align}
\label{eqn:multi_variate}
p(x|y=y_i)=\frac{1}{\sqrt{(2\pi)^{d+1}|\boldsymbol\Sigma|}}
\exp\left(-\frac{1}{2}({x}-y_i \boldsymbol\mu)^\top{\boldsymbol\Sigma}^{-1}({x}-y_i \boldsymbol\mu)
\right), \quad \forall y_i \in \{-1,1\}.
\end {align}

\subsection*{}
We want to find $p(y = y_i|x)$ $\forall y_i \in \{-1, +1\}$. From Bayesian Decision Theory, the optimal decision rule for separating the two distributions is given by:
\begin{align}
\label{eqn:bayesian}
\begin{split}
    p(y=1) p(x|y=1) \stackrel{y=1}{>} p(y=-1) p(x|y=-1);  \\
    p(y=1) p(x|y=1) \stackrel{y=-1}{<} p(y=-1) p(x|y=-1). 
\end{split}
\end{align}

Therefore, for two Gaussian Distributions $\mathcal{N}( \boldsymbol \mu_1,\,\boldsymbol\Sigma_1)$, $\mathcal{N}( \boldsymbol \mu_2,\,\boldsymbol\Sigma_2)$, we have:

\begin{align}
\label{eqn:master_eqn}
    \begin{split}
        0 &\stackrel{y=1}{>} x^\top A x -2 b^\top x + c;\\
        A &= \boldsymbol\Sigma_{1}^{-1} - \boldsymbol\Sigma_{2}^{-1}; \\
        b &= \boldsymbol\Sigma_{1}^{-1} \boldsymbol\mu_{1} - \boldsymbol\Sigma_{2}^{-1} \boldsymbol\mu_{2}; \\
        c &= \boldsymbol\mu_{1}^\top \boldsymbol\Sigma_{1}^{-1} \boldsymbol\mu_{1} - \boldsymbol\mu_{2}^{\top} \boldsymbol\Sigma_{2}^{-1} \boldsymbol\mu_{2} + \log \frac{|\boldsymbol\Sigma_1|}{|\boldsymbol\Sigma_2|} - 2 \log \frac{p(y=1)}{p(y=-1)}.
    \end{split}
\end{align}


Substituting (\ref{eqn:multi_variate}) and (\ref{eqn:bayesian}) in (\ref{eqn:master_eqn}), we find that the optimal Bayesian decision rule for our problem is given by:  
\begin{align}
\label{eqn:simplified_bayesian_eqn}
    \begin{split}
        x^\top  \boldsymbol\mu &\stackrel{y=1}{>}  0 ,
        \end{split}
\end{align}

which means that the label for the input can be predicted with the information of the sign of $x^\top \boldsymbol\mu$ alone. We can define the parameters $\mathbf{W} \in \mathcal{R}^{d+1} $ of the optimal binary Gaussian classifier $M^W$ , such that $\|\mathbf{W}\|_2 = 1$ as:
\begin{align}
\label{eqn:params_vanilla_model}
    \begin{split}
        \mathbf{W}_0  &= \frac{\alpha}{\sqrt{2}}, \quad
        \quad \mathbf{W}_i = \frac{\alpha}{\sqrt{2d}} \quad \forall i \in \{1,\dots,d\}; \\
        M^W(x) &= x^\top W.%\operatorname{sgn}(x^\top W)
        \end{split}
\end{align}

The same is also verified via a simulation in Figure~\ref{fig:binary_gaussian}.

\subsection{Optimal Adversarial Perturbation against $M^W$} 
\label{app:sep:perturb}
Now, we calculate the optimal perturbation $\delta$ that is added to an input by an adversary in order to fool our model. For the purpose of this analysis, we only aim to fool a model trained on the standard classification metric as discussed in Section~\ref{section:multiple-perturb} (and not an adversarially robust model). The parameters of our model are defined in (\ref{eqn:params_vanilla_model}). 

The objective of any adversary $\delta \in \Delta$ is to maximize the loss of the label classifier $M^{W}$. 
We assume that the classification loss is given by $-y\times M^{W}(x + \delta)$.
% For simplicity, we discuss the case where an adversary attempts to flip the label corresponding to $y=1$. 
The objective of the adversary is to find $\delta^{*}$ such that:

\begin{align}
    \begin{split}
        \ell (x+\delta,y;M^W) 
        &= -y \times M^{W}(x + \delta) = -y (x + \delta)^\top \mathbf{W};\\
        \delta^{*} 
        &= \text{arg}\max_{\delta \in \Delta} \ell (x+\delta,y;M^W),  \\
        &= \text{arg}\max_{\delta \in \Delta} -y(x + \delta)^{\top}\mathbf{W} 
        = \text{arg}\max_{\delta \in \Delta}  -y\delta^{\top}\mathbf{W}. 
    \end{split}
\end{align}

We will now calculate the optimal perturbation in the $\ell_p$ balls $\forall p \in \{1,2,\infty\}$. For the following analyses, we restrict the perturbation region $\Delta$ to the corresponding $\ell_p$ ball of radius $\{\epsilon_1, \epsilon_2, \epsilon_\infty\}$ respectively. We also note that the optimal perturbation exists at the boundary of the respective $\ell_p$ balls. Therefore, the constraint can be re-written as :
\begin{align}
\label{eqn:optimal-delta}
    \begin{split}
        \delta^{*} 
        &= \text{arg}\max_{\|\delta\|_p = \epsilon_p} -y\delta^{\top}\mathbf{W}. 
    \end{split}
\end{align}


We use the following properties in the individual treatment of $\ell_p$ balls:
\begin{align}
\label{eqn:ball_properties}
    \begin{split}
        \|\delta\|_p  
        &= \left(\sum_i \vert \delta_i \vert^p\right)^{\frac{1}{p}},\\
        \partial_j \|\delta\|_p 
        &= \frac{1}{p} \left(\sum_i \vert \delta_i \vert^p\right)^{\frac{1}{p}-1} \cdot p \vert \delta_j \vert^{p-1} \operatorname{sgn}(\delta_j) =  \left(\frac{\vert \delta_j \vert}{\|\delta\|_p}\right)^{p-1} \operatorname{sgn}(\delta_j).
    \end{split}
\end{align}


\paragraph{p = 2} Making use of Lagrange multipliers to solve
(\ref{eqn:optimal-delta}), we have:
\begin{align}
    \begin{split}
        \nabla_{\delta} (-y\delta^{\top}\mathbf{W})
         &= \lambda \nabla_{\delta} (\|\delta\|_{2}^{2} - \epsilon_{2}^{2}), \\
        -y \mathbf{W} &= \lambda^{'} \|\delta\|_{2} \nabla_{\delta} (\|\delta\|_{2}).
    \end{split}
\end{align}
Combining the results from (\ref{eqn:ball_properties}) and replacing $\delta$ with $\delta_2$ we obtain:
\begin{align}
    \begin{split}
        -y \mathbf{W} &= \lambda^{'} \|\delta_2\|_{2}  \left(\frac{\vert \delta_2 \vert}{\|\delta_2\|_2}\right) \operatorname{sgn}(\delta_2);\\
        \delta_2
        &= -y\epsilon_2 \left( \frac{\mathbf{W}}{\|\mathbf{W}\|_{2}} \right) = -y\epsilon_2 \mathbf{W}.
    \end{split}
\end{align}

\paragraph{p = $\infty$} Recall that the optimal perturbation is given by :

\begin{align}
\label{eqn:l_inf_reparam}
    \begin{split}
    \delta^{*} 
        &= \text{arg}\max_{\|\delta\|_\infty = \epsilon_\infty} -y \delta^{\top} \mathbf{W}, \\
        &= \text{arg}\max_{\|\delta\|_\infty = \epsilon_\infty} -y \sum_{i=0}^{d} \delta_i \mathbf{W}_i .
    \end{split}
\end{align}
Since $\|\delta\|_\infty = \epsilon_\infty$, we know that $\max_i |\delta_i| = \epsilon_\infty$. Therefore (\ref{eqn:l_inf_reparam}) is maximized when each $\delta_i = -y \epsilon_\infty \operatorname{sgn}(\mathbf{W}_i) \quad \forall i \in \{0,\dots,d\}$. Further, since the weight vector only contains positive elements ($\alpha$ is a positive constant), we can conclude that the optimal perturbation is given by:

\begin{align}
\label{eqn:l_inf_opt}
    \begin{split}
        \delta_\infty 
        &= -y\epsilon_\infty \mathbf{1}.
    \end{split}
\end{align}


\paragraph{p = 1}
% We make the following additional assumptions in the data distribution to be able to better represent the effect of different perturbations (more evidently in the $\ell_1$ case). Assume that among all the $d$ dimensions in the input feature space, there are exactly $k < d$ dimensions that are strongly correlated with the actual label prediction. This means that the weights $\mathbf{W}_j = (\Sigma^{-1}\mu)_j$ of classifier $\mp{}$ have a higher magnitude for these k-dimensions than the remaining ones. For simplicity, we assume $|\mathbf{W}_j| = \mathbf{W}_{max}$  $\forall j \in \{1\dots k\}$. 
% 
% \textbf{Note 1:} The proofs are general otherwise, but better resemble real data distributions with the added assumptions.
% 
We attempt an analytical solution for the optimal perturbation $\delta_1$. Recall that the optimal perturbation is given by :

\begin{align}
\label{eqn:l_1_reparam}
    \begin{split}
    \delta^{*} 
        &= \text{arg}\max_{\|\delta\|_1 = \epsilon_1} - y\sum_{i=0}^{d} \delta_i \mathbf{W}_i, \\
        &= \text{arg}\max_{\|\delta\|_1 = \epsilon_1} 
        - y\delta_0 \mathbf{W}_0
        - y\sum_{i=1}^{d} \delta_i \mathbf{W}_i, \\
        &= \text{arg}\max_{\|\delta\|_1 = \epsilon_1} 
        - y\delta_0 \frac{\alpha}{\sqrt{2}}
        - y\sum_{i=1}^{d} \delta_i \frac{\alpha}{\sqrt{2d}}.
    \end{split}
\end{align}
% where $\mathbf{W} = \boldsymbol\Sigma^{-1}\mu$. 
Since $\|\delta\|_1 = \epsilon_1$, (\ref{eqn:l_1_reparam}) is maximized when:
\begin{align}
\label{eqn:l_1_opt}
\begin{split}
    \delta_0 &= - y\epsilon_1 \operatorname{sgn} (\alpha) = -y\epsilon_1,
    \quad \quad\delta_i =0 \quad \forall i \in \{1\dots d\}. 
\end{split}
\end{align}
% While there may exist other solutions giving complete weight to only one of the top-k dimensions, we proceed with this solution and justify the choice later. Hence, we can conclude that the optimal perturbation is given by:

% \begin{align}
% \label{eqn:l_1_opt}
%         \delta_i^{*}
%         =
%         \begin{cases}
%             - \frac{\epsilon_1}{k} \operatorname{sgn} (\Sigma^{-1}\mu)_i,& \text{if } i\leq k\\
%             0,              & \text{otherwise}
%         \end{cases}
% \end{align}


% \textbf{Note 2:} We assume that all the weights for the important dimensions are equal because in real data settings the maximum perturbation $\delta_j$ in any dimension is usually bounded according to the domain (for instance between 0 and 1 in case of images). However, we deal with an unbounded scenario in this work. Without the equality assumption the $\ell_1$ metric would suggest that the perturbation $\delta_j$ corresponding to the dimension $j$ with maximum classifier weight $\mathbf{W}_j$ should contain all the weight, and all other $\delta_j$ to be exactly zero.  


\paragraph{Combining the results.} From the preceding discussion, it may be noted that the new distribution of inputs within a given label changes by a different amount $\delta$ depending on the perturbation type. Moreover, if the mean and variance of the distribution of a given label are known (which implies that the corresponding true data label is also known), the optimal perturbation is independent of the input itself, and only dependent on the respective class statistics (Note that the input is still important in order to understand the true class).

\subsection{Perturbation Classification by $C_{adv}$}
\label{app:sep:cadv}
Now we aim to verify if it is possible to accurately separate the optimal adversarial inputs crafted within different $\ell_p$ balls. For the purposes of this discussion, we only consider the problem of classifying perturbation types into $\ell_1$ and $\ell_\infty$, but the same analysis may also be extended more generally to any number of perturbation types.

We will consider the problem of classifying the correct attack label for inputs from true class $y=1$ for this discussion.
Note that the original distribution:
$$X_{true} \sim \mathcal{N}(y . \boldsymbol\mu,\,\boldsymbol\Sigma).$$

Since the perturbation value $\delta_p$ is fixed for all inputs corresponding to a particular label, the new distributions of perturbed inputs $X_1$ and $X_\infty$ in case of $\ell_1$ and $\ell_\infty$ attacks respectively (for $y = 1$) are given by:

\begin{equation}
    \begin{split}
        X_1 &\sim \mathcal{N}( \boldsymbol \mu + \delta_1,\,\boldsymbol\Sigma); \\
        X_\infty &\sim \mathcal{N}( \boldsymbol \mu + \delta_\infty,\,\boldsymbol\Sigma).
    \end{split}
\end{equation}

We now try to evaluate the conditions under which we can separate the two Gaussian distributions with an acceptable worst-case error.

% \subsection{Error Bound}
% \label{app:sep:error}


% A classification error occurs if a data vector x belongs to one class but falls in the decision region of the other class. That is in (\ref{eqn:bayesian}) the decision rule indicates the incorrect class. (This can be understood through the existence of outliers.)
% \begin{align}
%     \begin{split}
%         P_e &= \int P(\text{error}|x)p(x) dx \\
%             &= \int \text{min} \left[ p(y=\ell_1|x)p(x), p(y=\ell_\infty|x)p(x) \right]  dx
%     \end{split}
% \end{align}

% We use the Chernoff and Bhattacharya bounds \citep{chernoff1952measure,bhattacharyya1943measure} to get an upperbound on the error. (Note that in the case of shared covariance matrices, the bound is tight)

% \begin{align}
%     \begin{split}
%         P_e &\leq \sqrt{p(y=\ell_1) p(y = \ell_\infty)} \exp (-B) = \frac{1}{2} \exp (-B)\\
%         B &= \frac{1}{8} ( \boldsymbol\mu_1 - \boldsymbol\mu_\infty)^{\top} \left(\frac{ \boldsymbol\Sigma_{1} + \boldsymbol\Sigma_{\infty}}{2}\right)^{-1} ( \boldsymbol\mu_1 - \boldsymbol\mu_\infty) + 
%         \frac{1}{2} \log \frac{\left|\frac{ \boldsymbol\Sigma_{1} + \boldsymbol\Sigma_{\infty}}{2}\right|}
%         {\sqrt{| \boldsymbol\Sigma_{1}| | \boldsymbol\Sigma_{\infty}|}}
%     \end{split}
% \end{align}

% Using the result that $\mu_p =  \boldsymbol\mu + \delta_p, \boldsymbol\Sigma_p = \boldsymbol\Sigma$ from 
% % \ref{eqn:l_inf_delta} and \ref{eqn:l_1_delta},
% we simplify :
% \begin{align}
% \label{eqn:error_bound}
%     \begin{split}
%         P_e &\leq \frac{1}{2} 
%         \exp \left(- \frac{1}{8} (\delta_1 - \delta_\infty)^{\top} \boldsymbol\Sigma^{-1} (\delta_1 - \delta_\infty) \right)
%     \end{split}
% \end{align}

% Note that for the classification by $C_{adv}$, we are only interested if the regions in the perturbation balls that are exclusive to the two balls are correctly classified (since for the common regions, either of the second level models $\mp{}$ may be able to accurately classify the perturbed image).


% % \subsection{Visualizing the error Bound}
% % For the calculation of the optimal perturbation $\delta_1$, we had assumed that exactly $k<d$ dimensions have a higher importance in the weight matrix $\mathbf{W} = \boldsymbol\Sigma^{-1}\mu$ learned by the classifier $\mp{}$. It may be observed that this can occur either when the mean of all inputs pertaining to that label is high in that dimension, or the variance is low, or a combination of the two. While either of the mechanisms does not make a difference, we assume that all dimensions have the same variance and $\Sigma = I$ to demonstrate the maximum error. Similarly without loss of generality, we can ensure that $\mu$ is positive in all dimensions by flipping the sign of all inputs of any dimension $j$ where $\mu_j$ is negative.
% % 
% % Combining these assumptions with optimal perturbation values from (\ref{eqn:l_inf_opt}) and (\ref{eqn:l_1_opt}), we have:
% % 
% % \begin{align}
% %     \begin{split}
% %         \delta_\infty 
% %         &= -\epsilon_\infty \operatorname{sgn}
% %         \left(\Sigma^{-1}  \boldsymbol\mu \right) \\
% %         &= -\epsilon_\infty \mathbf{1} \\
% %         \delta_1(i)^{*}
% %         &=
% %         \begin{cases}
% %             - \frac{\epsilon_1}{k} \operatorname{sgn} (\Sigma^{-1}\mu)_i,& \text{if } i\leq k\\
% %             0,              & \text{otherwise}
% %         \end{cases} \\
% %         &=
% %         \begin{cases}
% %             - \left(\frac{\epsilon_1}{k}\right), & \text{if } i\leq k\\
% %             0,              & \text{otherwise}
% %         \end{cases}
% %     \end{split}
% % \end{align}

% We simplify (\ref{eqn:error_bound}) using (\ref{eqn:l_inf_opt}) and (\ref{eqn:l_1_opt}) and substituting $\boldsymbol\Sigma = \sigma^2 \boldsymbol I$:

% \begin{equation}
% \label{eqn:p_e_simplified}
% \begin{split}
%     P_e 
%     &\leq \frac{1}{2} 
%     \exp \left\{
%             - \frac{\sigma^2}{8} \sum_{i=0}^{d} \left(\delta_1(i) - \delta_\infty(i)\right)^{2}
%         \right\} \\
%     &= \frac{1}{2} 
%     \exp \left\{
%             - \frac{\sigma^2}{8} \left(
%                     \left(\delta_1(0) - \delta_\infty(0)\right)^{2}
%                     +
%                     \sum_{i=1}^{d} \left(\delta_1(i) - \delta_\infty(i)\right)^{2}
%                     \right)
%         \right\} \\
%     &= \frac{1}{2} 
%     \exp \left\{
%             - \frac{\sigma^2}{8} \left(
%                     \left(-\epsilon_1 + \epsilon_\infty\right)^{2}
%                     +
%                     d \cdot \left(\epsilon_\infty\right)^{2}
%                     \right)
%         \right\} \\
%     &= \frac{1}{2} 
%     \exp \left\{
%             - \frac{\sigma^2}{8} \left(
%                     (d+1) \epsilon_\infty^{2} + \epsilon_1^2 -2\epsilon_\infty\epsilon_1
%                     \right)
%         \right\}
%     = \frac{1}{2} 
%     \exp \left\{
%             - \frac{\epsilon_\infty^{2}\sigma^2}{8} \left(
%                     (d+1) + r^2 -2r
%                     \right)
%         \right\} \\
% \end{split}
% \end{equation}
% % \todo{Need to incorporate $\sigma$ here}
% where $r = \left(\frac{\epsilon_1}{\epsilon_\infty} \right)$

% % \todo{Alternate option: (I would prefer this) We can avoid the whole 'equal volumes' approach and rather just say that we use the following values of the $\epsilon_\infty = \eta$ and $\epsilon_1 = \alpha$ such that the $\ell_\infty$ adversary can make all the weakly correlated labels meaningless by changing the new expected value to 0 and the $\ell_1$ adversary can make $x_0$ meaningless but not flip all the weakly correlated ones. In this case $r = \sqrt{d}$}

% \paragraph{Perturbation Size} We set the radius of the $\ell_\infty$ ball, $\epsilon_\infty = \eta$ and the radius of the $\ell_1$ ball, $\epsilon_1 = \alpha$. We further extend the discussion about suitable perturbation sizes in Appendix~\ref{app:subsec:perturbation_size}. These values ensure that the $\ell_\infty$ adversary can make all the weakly correlated labels meaningless by changing the expected value of the adversarial input to less than 0  ($\mathbf{E}[x_i+\delta_\infty(i)] \quad \forall i>0$), 
% while the $\ell_1$ adversary can make the strongly correlated feature $x_0$ meaningless by changing its expected value to less than 0 ($\mathbf{E}[x_0+\delta_1(0)]$). However, neither of the two adversaries can flip all the features together. This gives us that $r = \sqrt{d}$.

% % \paragraph{Volumes of $\ell_p$ balls} For an $\ell_p$ ball of radius $R$, and dimension $d$, the volume of the ball is given by:
% % \begin{align}
% %     \begin{split}
% %         V_p^{d}(R) &= \frac{ \left(2 \Gamma\left(\frac{1}{p} + 1\right) R\right)^d}{ \Gamma\left(\frac{d}{p} + 1\right)} \\
% %         V_\infty^{d}(R) = (2R)^d, \quad \quad
% %         V_1^{d}(R) &= \frac{ \left(2 R\right)^d}{d!}, \quad \quad
% %         V_2^{d}(R) = \frac{ \pi^{\frac{d}{2}} R^d}{ \Gamma\left(\frac{d}{2} + 1\right)} 
% %     \end{split}
% % \end{align}

% % Consider the scenario of classifying perturbation within $\ell_p$ balls of equal volume. For the $\ell_1$ and $\ell_\infty$ case,  we have:

% % \begin{align}
% % \label{eqn:r_value}
% %     \begin{split}
% %         (\epsilon_\infty)^d &= \frac{(\epsilon_1)^d}{d!} \\
% %         \left(\frac{\epsilon_1}{\epsilon_\infty} \right) 
% %         &= r = (d!)^{1/d}
% %     \end{split}
% % \end{align}

% Combining the value of $r$ in (\ref{eqn:p_e_simplified}) and substituting for $\epsilon_\infty$, we obtain the final error bound as follows:
% \begin{equation}
% \label{eqn:p_e_final}
% \begin{split}
%     P_e 
%     &\leq  \frac{1}{2} 
%     \exp \left\{
%             - \frac{\sigma^2\alpha^{2}}{4d} \left(
%                     d-\sqrt{d}
%                     \right)
%         \right\} 
% \end{split}
% \end{equation}

% For $\frac{\alpha}{\sigma} > 10, \sigma = 1$ and $d\sim100$, we have:
% \begin{equation}
%     P_e \leq 0.05
% \end{equation}
% % \todo{Correct this after incorporating $\sigma$}
% % \todo{Alternate: The theorem statement can rather be made that as limit $d$ tends to infinity, $P_e$ tends to 0. the choice of d and epsilon seem rather arbitrary and bad for a theorem statement}
% which suggests that the distributions are significantly distinct and can be easily separated. This concludes the proof for Theorem~\ref{thm:separability}.

% \textbf{Note:} We can extend the same analysis to other $\ell_p$ balls as well, but we consider the case of $\ell_1$ and $\ell_\infty$ for simplicity.


\subsection{Calculating a bound on the error}
\label{app:sep:error}
\paragraph{Classification Error.} A classification error occurs if a data vector $x$ belongs to one class but falls in the decision region of the other class; that is, the decision rule in (\ref{eqn:bayesian}) indicates the incorrect class. (This can be understood through the existence of outliers.)
\begin{align}
    \begin{split}
        P_e &= \int P(\text{error}|x)p(x) dx ,\\
            &= \int \text{min} \left[ p(y=\ell_1|x)p(x), p(y=\ell_\infty|x)p(x) \right]  dx .
    \end{split}
\end{align}

\paragraph{Perturbation Size.} We set the radius of the $\ell_\infty$ ball, $\epsilon_\infty = \eta$ and the radius of the $\ell_1$ ball, $\epsilon_1 = \alpha$. We further extend the discussion about suitable perturbation sizes in Appendix~\ref{app:subsec:perturbation_size}. These values ensure that the $\ell_\infty$ adversary can make all the weakly correlated features meaningless by reducing the expected value of the adversarial input to $0$ ($\mathbf{E}[x_i+\delta_\infty(i)] = 0 \quad \forall i>0$), 
while the $\ell_1$ adversary can make the strongly correlated feature $x_0$ meaningless by reducing its expected value to $0$ ($\mathbf{E}[x_0+\delta_1(0)] = 0$). However, neither of the two adversaries can flip all the features together. 

\paragraph{Translating the axes.} We can translate the axis of reference by $\left(-\boldsymbol\mu -\left(\frac{\delta_1 + \delta_\infty}{2}\right)\right)$ and define $\boldsymbol \mu_{adv} = \left(\frac{\delta_1 - \delta_\infty}{2}\right)$, such that:
\begin{equation}
    \begin{split}
        X_1 &\sim \mathcal{N}( \boldsymbol \mu_{adv},\,\boldsymbol\Sigma); \\
        X_\infty &\sim \mathcal{N}( -\boldsymbol \mu_{adv},\,\boldsymbol\Sigma).
    \end{split}
\end{equation}

We can once again combine this with the simplified Bayesian model in (\ref{eqn:simplified_bayesian_eqn}) to obtain the classification rule:
\begin{align}
\label{eqn:bayesian_lp}
    \begin{split}
        x^\top  \boldsymbol\mu_{adv} &\stackrel{p=1}{>}  0 .
        \end{split}
\end{align}

Combining the optimal perturbation definitions in (\ref{eqn:l_inf_opt}) and (\ref{eqn:l_1_opt}), we obtain $\boldsymbol \mu_{adv} =   \left(\frac{\delta_1 - \delta_\infty}{2}\right) = \frac{1}{2}\left[-\epsilon_1+\epsilon_\infty,\epsilon_\infty,\dots,\epsilon_\infty \right]$. We can further substitute $\epsilon_1 = \alpha$ and $\epsilon_\infty = \eta = \frac{\alpha}{\sqrt{d}}$. 
Notice that $\boldsymbol \mu_{adv}(i)>0$ $\forall i > 0$. Without loss of generality, to simplify further discussion we can flip the sign of the coordinate $x_0$, since all dimensions are independent of each other. Therefore,  $\boldsymbol \mu_{adv}  = \frac{\alpha}{2\sqrt{d}}\left[\sqrt{d}-1,1,\dots,1\right]$.
Consider a new variable $x_z$ such that:

\begin{align}
\label{eqn:xz}
    \begin{split}
        x_z = x_0 \cdot \left(1 - \frac{1}{\sqrt{d}}\right) + \frac{1}{\sqrt{d}}\sum_{i=1}^{d} x_i = \frac{2}{\alpha} \left(x^\top \boldsymbol \mu_{adv}\right).
        \end{split}
\end{align}
Since each $x_i$, $\forall i \geq 0$, is independently distributed, the new feature $x_z \sim \mathcal{N}(\mu_z, \sigma^2_z)$, where

\begin{align}
\label{eqn:xzy}
    \begin{split}
        \mu_z &= \alpha\left(1 - \frac{1}{\sqrt{d}}\right) + \frac{1}{\sqrt{d}}\sum_{i=1}^{d} \frac{\alpha}{\sqrt{d}} 
        = 2\alpha - \frac{\alpha}{\sqrt{d}},\\
        \sigma_z^2 &= \sigma^2 \left(1 + \frac{1}{d} -2\frac{1}{\sqrt{d}} + \sum_{i=1}^{d} \frac{1}{d} \right)
        \\
        &= \sigma^2 \left(2 + \frac{1}{d} -2\frac{1}{\sqrt{d}} \right).
        \end{split}
\end{align}

Therefore, the problem simplifies to calculating the probability that the meta-variable $x_z > 0$. 

For $\frac{\alpha}{\sigma} > 10$ and $d>1$, the corresponding $z$-score satisfies $z = \frac{\mu_z}{\sigma_z} > 10$, and from the $z$-table we have:

\begin{align}
\label{eqn:xzq}
    \begin{split}
        P_e &\leq 10^{-24},
    \end{split}
\end{align}


% \todo{Correct this after incorporating $\sigma$}
% \todo{Alternate: The theorem statement can rather be made that as limit $d$ tends to infinity, $P_e$ tends to 0. the choice of d and epsilon seem rather arbitrary and bad for a theorem statement}
which suggests that the distributions are significantly distinct and can be easily separated. This concludes the proof for Theorem~\ref{thm:separability}.

\textbf{Note:} We can extend the analysis to other $\ell_p$ balls as well, but we consider $\ell_1$ and $\ell_\infty$ for simplicity.


\section{Robustness of the {\ours} Pipeline (Theorem~\ref{thm:trade-off})}
\label{app:trade-off-proof}
% \paragraph{Idea} Show that the adversary which tries to fool the pipeline will have to face a trade-off between fooling the top classifier and fooling the bottom robust models. 

% Use this trade-off to prove the worst case error of the entire pipeline in a dynamic setting.
In the previous section, we showed that it is indeed possible to distinguish between the distributions of inputs of a given class that were subjected to $\ell_1$ and $\ell_\infty$ perturbations over a standard classifier. Now, we aim to develop further understanding of the robustness of our two-stage pipeline in a dynamic attack setting with multiple labels to distinguish among. The first stage is a preliminary classifier $C_{adv}$ that classifies the perturbation type and the second stage consists of multiple models $\mp{}$ that were specifically trained to be robust to perturbations to the input within the corresponding $\ell_p$ ball. 

First, in Appendix~\ref{app:subsec:binary-gaussian-m_p}, we calculate the optimal weights for a binary Gaussian classifier $\mp{}$, trained on dataset $\mathcal{D}$ to be robust to adversaries within the $\ell_p$ ball $\forall p \in \{1,\infty\}$. Based on the weights of the individual model, we fix the perturbation size $\epsilon_p$ to be only as large as is required to fool the alternate model with high probability. Here, by `alternate' we mean that for an $\ell_q$ attack, the prediction should be made by the $M_{\ell_p,\epsilon_p}$ model, where $p,q\in \{1,\infty\}; p\neq q$. In Appendix~\ref{app:subsec:robust_acc} we calculate the robustness of individual $\mp{}$ models to $\ell_p$ adversaries, given the perturbation size $\epsilon_p$ as defined in Appendix~\ref{app:subsec:perturbation_size}. In Appendix~\ref{app:subsec:decision-rule}, we analyze the modified distributions of the perturbed inputs after different $\ell_p$ attacks. Based on this analysis, we construct a simple decision rule for the perturbation classifier $C_{adv}$. Finally, in Appendix~\ref{app:subsec:trade-off} we determine the perturbation induced by the worst-case adversary that has complete knowledge of both $C_{adv}$ and $M_{\ell_p,\epsilon_p}$, $\forall p \in \{1,\infty\}$. We show how there exists a trade-off between fooling the perturbation classifier (to allow the alternate $M_{\ell_p,\epsilon_p}$ model to make the final prediction), and fooling the alternate $M_{\ell_p,\epsilon_p}$ model itself. 


\paragraph{Perturbation Size.} We set the radius of the $\ell_\infty$ ball, $\epsilon_\infty = \eta + \zeta_\infty$ and the radius of the $\ell_1$ ball, $\epsilon_1 = \alpha + \zeta_1$, where $\zeta_p$ are some small positive constants that we calculate in Appendix~\ref{app:subsec:perturbation_size}. These values ensure that the $\ell_\infty$ adversary can make all the weakly correlated features meaningless by changing the expected value of the adversarial input to less than 0 ($\mathbf{E}[x_i+\delta_\infty(i)] \quad \forall i>0$), 
while the $\ell_1$ adversary can make the strongly correlated feature $x_0$ meaningless by changing its expected value to less than 0 ($\mathbf{E}[x_0+\delta_1(0)]$). However, neither of the two adversaries can flip all the features together. The exact values of $\zeta_p$ determine the exact success probability of the attacks. We defer this calculation to later when we have calculated the weights of the models $\mp{}$. For the following discussion, it may be assumed that $\zeta_p\to0$ $\forall p \in$ $\{1,\infty\}$.



\subsection{Binary Gaussian Classifier $\mp{}$}
\label{app:subsec:binary-gaussian-m_p}
Extending the discussion in Appendix~\ref{app:sec:gaussian_classifier}, we now examine the learned weights of a binary Gaussian classifier $\mp{}$ that is trained to be robust against perturbations within the corresponding $\ell_p$ ball of radius $\epsilon_p$. 
% We (a) calculate the optimal perturbation $\ell_p$ for a given perturbation ball, and (b) calculate the learned weights of a classifier $\mp{}$ that is robust against $\epsilon_p$ bounded perturbations. 
The optimization equation for the classifier can be formulated as follows:
% \label{subsec:opt-perturb-2}
\begin{equation}
        \min_{\mathbf{W}} 
        \mathbb{E} 
        \left[- yx^{\top} \mathbf{W} \right] + 
            \frac{1}{2} \lambda {||\mathbf{W}||}_{2}^{2},
\end{equation}
where $\lambda$ is tuned in order to make the $\ell_2$ norm of the optimal weight distribution, $||\mathbf{W}^{*}||_{2} = 1$. 
Following the symmetry argument in Lemma D.1 \citep{tsipras2018robustness} we extend for the binary Gaussian classifier that:
\begin{equation}
\label{eqn:symmetric weights}
\mathbf{W}^{*}_i = \mathbf{W}^{*}_{j} = \mathbf{W_M} \quad \forall i, j \in \{1, \dots, d \}.
\end{equation}

% Similar to \citep{tsipras2018robustness}, we define a meta-feature $x_M$ as:
% $$ x_M = \frac{1}{\sqrt{d}} \sum_{i=1}^{d} x_i ,$$ which is distributed as :
% $$x_M \sim \mathcal{N}(y\eta \sqrt{d}, 1)$$
% The optimization equation for the classifier can then be simplified as 
% \begin{equation}
%         \min_{\mathbf{W}} 
%         \mathbb{E} 
%         \left[- y(\mathbf{W}_0 \cdot x_0 + \mathbf{W}_M \cdot       x_M \right] +  \frac{1}{2} \lambda \|\mathbf{W}\|_2^2
% \end{equation}

% \subsubsection{Optimal $\ell_p$ perturbations}
% The optimal perturbation $\delta^{*}$ for a given learned SVM parametrized by $w = [\mathbf{W}_0, \mathbf{W}_M]$ is given by:
% \begin{equation}
% \begin{split}
%         \delta^{*} 
%         &= \text{arg}\max_{|\delta|_p \leq \epsilon_p}
%         1 - y\left(\mathbf{W}_0 \cdot (x_0 + \delta_0)  + \frac{\mathbf{W}_M}{\sqrt{d}} \cdot \sum_{i=1}^{d}(x_i + \delta_i)\right)  \\
%         &= \text{arg}\min_{|\delta|_p \leq \epsilon_p}
%         y \mathbf{W}_0 \cdot \delta_0 +  \frac{y \mathbf{W}_M}{\sqrt{d}} \cdot \sum_{i=1}^{d} \delta_i
% \end{split}
% \end{equation}


% \paragraph{p = $\infty$}
% \begin{equation}
% \begin{split}
%         \delta^{*}_{\infty} 
%         &= \text{arg}\min_{|\delta|_\infty \leq \epsilon_\infty}
%         y \mathbf{W}_0 \cdot \delta_0 +  \frac{y \mathbf{W}_M}{\sqrt{d}} \cdot \sum_{i=1}^{d} \delta_i
%         = - y \epsilon_\infty \mathbf{1}
% \end{split}
% \end{equation}
% % Lemma D.5
% It is worth noting that $\epsilon_\infty$ is larger than $\eta$ and can flip the sign of input in any dimension $i \in \{1,\dots,d\}$.


% \paragraph{p = $1$}
% \begin{equation}
%         \delta^{*}_{1} 
%         = \text{arg}\min_{|\delta|_1 \leq \epsilon_1}
%         y \mathbf{W}_0 \cdot \delta_0 +  \frac{y \mathbf{W}_M}{\sqrt{d}} \cdot \sum_{i=1}^{d} \delta_i
% \end{equation}

% The constraint $|\delta|_1 \leq \epsilon_1$ implies that $\sum_{i=0}^{d} |\delta_i| \leq \epsilon_1$. It can be seen that to minimize the objective, all values of $\delta_i$ should be negative. We can re-write the equation in terms of the meta-feature as:
% \begin{equation}
% \begin{split}
%         \delta_{M} &= \frac{1}{\sqrt{d}} \sum_{i=1}^{d} \delta_i, 
%         \quad \delta_i \times \delta_j \geq  0 \quad \forall j,k > 0\\
%         \delta^{*}_{1} 
%         &= \text{arg}\min_{|\delta_0| + |\delta_M|  \leq \epsilon_1}
%         y \mathbf{W}_0 \cdot \delta_0 +  y \mathbf{W}_M \cdot \delta_M
%         % = - y \epsilon_\infty \mathbf{1}
% \end{split}
% \end{equation}

% which gives us that $\delta_1^{*} = [\delta_0, \delta_M] = [\epsilon_1, 0]$ if $\mathbf{W}_0 > \mathbf{W}_{M}$ else $\delta_1^{*} = [0,\epsilon_1]$. In the case of equal weights, any distribution of $\delta$ such that $|\delta|_1 = \epsilon_1$ returns the same loss. From Lemma D.2 \citep{tsipras2018robustness} we know that for any standard classifier $\mathbf{W}_M \geq \mathbf{W}_{0}$ (as long as $p \leq 0.975$). Therefore, $\delta_1^{*} = [0,\epsilon_1]$ (barring the special equality case). 

% Note that the distribution of weights among various $\delta_i \forall i \in \{1,\dots,d\}$ does not change the final classifier output. For simplicity, we can assume that the weight is equally distributed among all dimensions, but alternately even if all the weight is attributed to only one dimension, the final classifier output $\operatorname{sgn}(\mathbf{W}_{0} \cdot x_{0} + \mathbf{W}_{M} \cdot x_{M})$ remains unchanged.

% \subsubsection{Optimal $\mp{}$ weights}
% Now we develop the characteristics of SVM classifiers trained to be robust to attacks within a given $\epsilon_p$ norm.
We deal with the cases pertaining to $p\in\{\infty, 1\}$ in this section. For both the cases, we consider existential solutions for the classifier $\mp{}$ to simplify the discussion. This gives us lower bounds on the performance of the optimal robust classifier. The robust objective under adversarial training can be defined as:
\begin{equation}
\label{eqn:robust_objective_initial}
\begin{split}
         &\min_{\mathbf{W}} \max_{\|\delta\|_p \leq \epsilon_p}
        \mathbb{E} 
        \left[- y \left(\mathbf{W}_0 \cdot (x_0 + \delta_0) + \mathbf{W_M} \cdot \sum_{i=1}^{d}(x_i + \delta_i)\right)
        \right] + \frac{1}{2} \lambda \|\mathbf{W}\|_2^2; \\
        &\min_{\mathbf{W}} \left\{
        -1\left(\mathbf{W_0}\alpha + d\times\mathbf{W_M}\frac{\alpha}{\sqrt{d}}\right)
         +  \frac{1}{2} \lambda \|\mathbf{W}\|_2^2
         +\max_{\|\delta\|_p \leq \epsilon_p}\mathbb{E} 
        \left[- y
        \left(\mathbf{W}_0 \delta_0 + \mathbf{W_M} \sum_{i=1}^{d}\delta_i
        \right)
        \right]
         \right\}   
\end{split}
\end{equation}

Further, since the $\lambda$ constraint only ensures that  $||\mathbf{W}^{*}||_{2} = 1$,  we can simplify the optimization equation by substituting $\mathbf{W_0} = \sqrt{1 - d\cdot \mathbf{W_M}^2}$ as follows,

\begin{equation}
\label{eqn:robust_objective}
\begin{split}
        &\min_{\mathbf{W_M}} \left\{
        -1\left(\alpha\sqrt{1 - d \cdot \mathbf{W_M}^2} + d\times\mathbf{W_M}\frac{\alpha}{\sqrt{d}}\right)
         +\max_{\|\delta\|_p \leq \epsilon_p}\mathbb{E} 
        \left[- y
        \left(\delta_0\sqrt{1 - d \cdot \mathbf{W_M}^2}  + \mathbf{W_M} \sum_{i=1}^{d}\delta_i
        \right)
        \right]
         \right\}.   
\end{split}
\end{equation}

\paragraph{p = $\infty$}
% From Lemma D.5 \citep{tsipras2018robustness}, minimizing the adversarial variant of the loss results in 
As discussed in (\ref{eqn:l_inf_opt}) the optimal perturbation $\delta_\infty$ is given by $-y\epsilon_\infty\mathbf{1}$. The optimization equation is simplified to:
\begin{equation}
\label{eqn:robust_objective_l_inf}
\begin{split}
        &\min_{\mathbf{W_M}} \left\{
        (\epsilon_\infty - \alpha)\sqrt{1 - d \cdot \mathbf{W_M}^2} + d\times\mathbf{W_M}\left(\epsilon_\infty - \frac{\alpha}{\sqrt{d}}\right)
         \right\}.   \\
\end{split}
\end{equation}

Recall that $\epsilon_\infty = \frac{\alpha}{\sqrt{d}} + \zeta_\infty$. To simplify the following discussion we use the weights of a classifier trained to be robust against perturbations within the $\ell_\infty$ ball of radius $\epsilon_\infty = \frac{\alpha}{\sqrt{d}}$. The optimal solution is then given by:

\begin{equation}
\label{eqn:wd_m_inf}
\begin{split}
        \lim_{\zeta_\infty\to0} \mathbf{W_M} = 0.
\end{split}
\end{equation}

Therefore, the classifier weights are given by $\mathbf{W} = [\mathbf{W}_0,\mathbf{W}_1,\dots,\mathbf{W}_d] = [1,0,\dots,0]$. We also show later in Appendix~\ref{app:subsec:robust_acc} that the model achieves greater than 99\% accuracy against $\ell_\infty$ adversaries for the chosen values of $\zeta_\infty$.

\paragraph{p = 1}
We consider an analytical solution to yield optimal weights for this case.
Recall from (\ref{eqn:l_1_opt}) that the optimal perturbation $\delta_1$ depends on the weight distribution of the classifier.
Therefore, if $\mathbf{W}_0 > \mathbf{W_M}$ the optimization equation can be simplified to
\begin{equation}
\label{eqn:robust_objective_l_1}
        \min_{\mathbf{W}} \left\{
        \mathbf{W_0}(\epsilon_1 - \alpha) - d\times\mathbf{W_M} \frac{\alpha}{\sqrt{d}}
         +  \frac{1}{2} \lambda \|\mathbf{W}\|_2^2
         \right\}  , 
\end{equation}

and if $\mathbf{W_M} > \mathbf{W}_0$
\begin{equation}
        \min_{\mathbf{W}} \left\{
        -\mathbf{W_0}\alpha - \mathbf{W_M}\left( \sqrt{d}\alpha - \epsilon_1 \right)
         +  \frac{1}{2} \lambda \|\mathbf{W}\|_2^2
         \right\}   .
\end{equation}
Recall that $\epsilon_1 = \alpha + \zeta_1$. 
Once again, to simplify the discussion that follows, we will lower bound the robust accuracy of the classifier $M_{\ell_1}$ by considering the optimal solution when $\zeta_1 = 0$.
The optimal solution is then given by:

\begin{equation}
\label{eqn:wd_M_l_1}
\begin{split}
        \lim_{\zeta_1\to0} \mathbf{W_M} = \frac{1}{\sqrt{d}} .
\end{split}
\end{equation}
For the robust classifier $M_{\ell_1}$, the weights $\mathbf{W} = [\mathbf{W}_0,\mathbf{W}_1,\dots,\mathbf{W}_d] = [0,\frac{1}{\sqrt{d}},\frac{1}{\sqrt{d}},\dots,\frac{1}{\sqrt{d}}]$. While this may not be the optimal solution for all values of $\zeta_1$, we are only interested in a lower bound on the final accuracy and the classifier described by weights $\mathbf{W}$ simplifies the discussion hereon. We also show later in Appendix~\ref{app:subsec:robust_acc} that the model achieves greater than 99\% accuracy against $\ell_1$ adversaries for the chosen values of $\zeta_1$.

\subsection{Perturbation Sizes for Fooling $\mp{}$ Models}
\label{app:subsec:perturbation_size}
Now that we exactly know the weights of the learned robust classifiers $M_{\ell_1}$ and $M_{\ell_\infty}$, we can move towards calculating values $\zeta_1$ and $\zeta_\infty$ for the exact radius of the perturbation regions for the $\ell_1$ and $\ell_\infty$ metrics.
We set the radii of these regions in such a way that an $\ell_1$ adversary can fool the model $M_{\ell_\infty}$ with probability $\sim$ 98\% (corresponding to $z = 2$ in the z-table for normal distributions), and similarly, the success of $\ell_\infty$ attacks against the $M_{\ell_1}$ model is $\sim$ 98\%.

Let $P_{p_1,p_2}$ represent the probability that model $M_{\ell_{p_1}}$ correctly classifies an adversarial input in the $\ell_{p_2}$ region. For $p_1 = \infty$ and $p_2 = 1$,

\begin{equation}
    \begin{split}
        P_{\infty,1} 
        &= \mathbb{P}_{x\sim\mathcal{N}(y\boldsymbol\mu, \boldsymbol\Sigma)}[y\cdot M_{\ell_\infty}(x + \delta_1)>0], \\
        &= \mathbb{P}_{x\sim\mathcal{N}(y\boldsymbol\mu, \boldsymbol\Sigma)}[y\cdot(x+\delta_1)^\top \mathbf{W} > 0], \\
        &\geq \mathbb{P}_{x\sim\mathcal{N}(\boldsymbol\mu, \boldsymbol\Sigma)}[x_0 > \epsilon_1]; 
        \\
        z &= \frac{\epsilon_1 - \alpha}{\sigma} = 
        \frac{\alpha + \zeta_1 - \alpha}{\sigma} = \frac{\zeta_1}{\sigma} = 2;\\
        \zeta_1 &= 2\sigma; \\
        \epsilon_1 &= \alpha + 2\sigma.
    \end{split}
\end{equation}

To simplify the discussion for the $M_{\ell_1}$ model, we define a meta-feature $x_M$ as:
\begin{equation}
    \label{eqn:meta-feature}
    x_M = \frac{1}{\sqrt{d}} \sum_{i=1}^{d} x_i ,
\end{equation}
which is distributed as
\[
x_M \sim \mathcal{N}(y\eta \sqrt{d}, \sigma^2) \overset{d}{=} \mathcal{N}(y\alpha, \sigma^2).
\]
For $p_1 = 1$ and $p_2 = \infty$,
\begin{equation}
    \begin{split}
        P_{1,\infty} 
        &= \mathbb{P}_{x\sim\mathcal{N}(y\boldsymbol\mu, \boldsymbol\Sigma)}[y\cdot M_{\ell_1}(x + \delta_\infty)>0], \\
        &= \mathbb{P}_{x\sim\mathcal{N}(y\boldsymbol\mu, \boldsymbol\Sigma)}[y\cdot(x+\delta_\infty)^\top \mathbf{W} > 0], \\
        &= \mathbb{P}_{x\sim\mathcal{N}(y\boldsymbol\mu, \boldsymbol\Sigma)}[y\cdot\frac{1}{\sqrt{d}}\sum_{i=1}^{d}(x_i+\delta_\infty(i)) > 0], \\
        &= \mathbb{P}_{x\sim\mathcal{N}(y\boldsymbol\mu, \boldsymbol\Sigma)}[y\cdot (x_M - \sqrt{d}\cdot \epsilon_\infty) > 0], \\
        &\geq \mathbb{P}_{x\sim\mathcal{N}(\boldsymbol\mu, \boldsymbol\Sigma)}\left[x_M > \sqrt{d}\cdot \epsilon_\infty\right]; 
        \\
        z &= \frac{\sqrt{d}\cdot\epsilon_\infty - \alpha}{\sigma} = 
        \frac{\alpha + \sqrt{d}\cdot\zeta_\infty - \alpha}{\sigma} = \frac{\sqrt{d}\cdot\zeta_\infty}{\sigma} = 2;\\
        \zeta_\infty &= \frac{2\sigma}{\sqrt{d}}; \\
        \epsilon_\infty &= \frac{\alpha+ 2\sigma}{\sqrt{d}} ;
    \end{split}
\end{equation}


\begin{figure*}[t]
\centering
\begin{subfigure}[t]{0.59\linewidth}
  \includegraphics[width=\linewidth]{figures/linf.pdf}
  \caption{}
  \label{fig:pipeline-a}
\end{subfigure}
\begin{subfigure}[t]{0.40\linewidth}
   \includegraphics[width=\linewidth]{figures/l1.pdf}
   \caption{}
  \label{fig:tradeoff}
\end{subfigure}
\caption{\textbf{Simulation:} Decision boundary (solid green line) and robustness of individual $\mp{}$ models to different $\ell_p$ attacks. $x_M$ represents the meta feature as defined in Equation~\ref{eqn:meta-feature} and $x_0$ is the first dimension of the input. Notice how the distribution of perturbed samples varies according to the change in model architecture (scatter plots in the same color in the two graphs represent the same distribution). (a) The $M_{\ell_\infty}$ model is able to correctly classify all benign and $\ell_\infty$ perturbed samples. However, the $\ell_1$ adversary is able to successfully flip the decision of most data points. (b) The same illustration is repeated for the $M_{\ell_1}$ model. In this case, while the model is robust to $\ell_1$ attacks, it fails against an $\ell_\infty$ adversary.}
 \label{fig:robustness_gaussian}
 \vspace{-1em}
\end{figure*}

\subsection{Robustness of individual $\mp{}$ models}
\label{app:subsec:robust_acc}

\paragraph{Additional assumptions.} We add the following assumptions: (1) the dimensionality parameter $d$ of input data is at least 100; and (2) the ratio of the mean to the standard deviation for feature $x_0$ is at least 10. (These assumptions were also made when introducing the problem in the main paper.)


\begin{equation}
\label{eqn:assumptions}
        d \geq 100,
        \quad \quad
        \frac{\alpha}{\sigma} \geq 10.
\end{equation}

We define $P_{p,p}$ as the probability that, for any given input $x\sim\mathcal{N}(y\boldsymbol\mu, \boldsymbol\Sigma)$, the classifier $\mp{}$ outputs the correct label $y$ for the input $x+\delta_p$.

\paragraph{p = $\infty$}
% Moreover, for a given input-label pair $(x+\delta_\infty,y)$, since $x_0 {\sim} \mathcal{N}(y\alpha, \sigma)$ and 
% $\epsilon_\infty < \frac{2\alpha}{\sqrt{d}}$:\todo{Correct this}
\begin{equation}
    \begin{split}
        P_{\infty,\infty} 
        &= \mathbb{P}_{x\sim\mathcal{N}(y\boldsymbol\mu, \boldsymbol\Sigma)}[y\cdot M_{\ell_\infty}(x + \delta_\infty)>0], \\
        &= \mathbb{P}_{x\sim\mathcal{N}(y\boldsymbol\mu, \boldsymbol\Sigma)}[y\cdot(x+\delta_\infty)^\top \mathbf{W} > 0], \\
        &= \mathbb{P}_{x\sim\mathcal{N}(y\boldsymbol\mu, \boldsymbol\Sigma)}[y\cdot(x_0+\delta_\infty(0)) > 0], \\
        &\geq \mathbb{P}_{x\sim\mathcal{N}(\boldsymbol\mu, \boldsymbol\Sigma)}[x_0 > \epsilon_\infty]; 
        \\
        z &= \frac{\epsilon_\infty - \alpha}{\sigma} = 
        \frac{\alpha}{\sigma} \left(\frac{1}{\sqrt{d}} - 1\right) + \frac{2}{\sqrt{d}}.
    \end{split}
\end{equation}
using the assumptions in (\ref{eqn:assumptions}),
\begin{equation}
\label{eqn:linf_robust_acc}
    P_{\infty,\infty} \geq 0.999.
\end{equation}

\paragraph{p = 1}

\begin{equation}
    \begin{split}
        P_{1,1} 
        &= \mathbb{P}_{x\sim\mathcal{N}(y\boldsymbol\mu, \boldsymbol\Sigma)}[y\cdot M_{\ell_1}(x + \delta_1)>0], \\
        &= \mathbb{P}_{x\sim\mathcal{N}(y\boldsymbol\mu, \boldsymbol\Sigma)}[y\cdot(x+\delta_1)^\top \mathbf{W} > 0], \\
        &= \mathbb{P}_{x\sim\mathcal{N}(y\boldsymbol\mu, \boldsymbol\Sigma)}[y\cdot\frac{1}{\sqrt{d}}\sum_{i=1}^{d}(x_i+\delta_1(i)) > 0], \\
        &= \mathbb{P}_{x\sim\mathcal{N}(y\boldsymbol\mu, \boldsymbol\Sigma)}[y\cdot (x_M + \delta_M) > 0], \\
        &\geq \mathbb{P}_{x\sim\mathcal{N}(\boldsymbol\mu, \boldsymbol\Sigma)}\left[x_M > \frac{\epsilon_1}{\sqrt{d}}\right]; 
        \\
        z &= \frac{\frac{\epsilon_1}{\sqrt{d}} - \alpha}{\sigma} = 
        \frac{\alpha}{\sigma} \left(\frac{1}{\sqrt{d}} - 1\right) + \frac{2}{\sqrt{d}}.
    \end{split}
\end{equation}
using the assumptions in (\ref{eqn:assumptions}),
\begin{equation}
\label{eqn:l1_robust_acc_m1}
    P_{1,1} \geq 0.999.
\end{equation}
% \todo{correct this for l1}
% Recall that for $\epsilon_1 = 2, \epsilon_\infty = 2\eta$, we violate the invariance condition, which means that the robust accuracy of the classifier is strictly larger. 
% Moreover, this is a lower bound on the performance of the optimal classifier since we show robustness of one existential classifier and not the optimal one. 

\subsection{Decision rule for $C_{adv}$}
\label{app:subsec:decision-rule}
We aim to provide a lower bound on the worst-case accuracy of the entire pipeline, through the existence of a simple decision tree $C_{adv}$. For given perturbation budgets $\epsilon_1$ and $\epsilon_\infty$, we aim to understand the range of values that can be taken by the adversarial input. Consider the scenarios described in Table~\ref{tab:adv-values} below. The same is also corroborated via the empirical experiments shown in Figure~\ref{fig:robustness_gaussian}.

\begin{table}[htb]
\centering
\caption{The table shows the range of the values that the mean can take depending on the decision taken by the adversary. $\mu_0^{adv}$ and $\mu_M^{adv}$ represent the new mean of the distribution of features $x_0$ and $x_M$ after the adversarial perturbation.}
\label{tab:adv-values}
\small
\scalebox{0.83}{
\begin{tabular}{@{}llrrlrr@{}}
\toprule
\multicolumn{1}{c}{\multirow{2}{*}{\begin{tabular}[c]{@{}c@{}}Attack \\ Type\end{tabular}}} &  & \multicolumn{2}{c}{$\mu_0^{adv}$}                                                                               &                       & \multicolumn{2}{c}{$\mu_M^{adv}$}                                                                                                                           \\ \cmidrule(lr){3-4} \cmidrule(l){6-7} 
\multicolumn{1}{c}{}                                                                        &  & y = 1                                                 & y = -1                                                  &                       & y = 1                                                                       & y = -1                                                                        \\ \midrule
\multicolumn{1}{l}{None}                                                                   &  & $\alpha$                                              & $-\alpha$                                               & \multicolumn{1}{l}{} & $\eta \sqrt{d}$                                                             & $- \eta \sqrt{d}$                                                             \\
\multicolumn{1}{l}{$\ell_\infty$}                                                          &  & $\{\alpha-\epsilon_\infty, \alpha+\epsilon_\infty \}$ & $\{-\alpha-\epsilon_\infty, -\alpha+\epsilon_\infty \}$ & \multicolumn{1}{l}{} & $\{\eta \sqrt{d} + \epsilon_\infty \sqrt{d}, \eta \sqrt{d} - \epsilon_\infty \sqrt{d} \}$ & $\{-\eta \sqrt{d} + \epsilon_\infty \sqrt{d}, -\eta \sqrt{d} - \epsilon_\infty \sqrt{d} \}$ \\
\multicolumn{1}{l}{$\ell_1$}                                                               &  & $\{\alpha-\epsilon_1, \alpha+\epsilon_1\}$            & $\{-\alpha-\epsilon_1, -\alpha+\epsilon_1\}$            & \multicolumn{1}{l}{} & $\{\eta \sqrt{d}  + \epsilon_1/\sqrt{d}, \eta \sqrt{d}  - \epsilon_1/\sqrt{d} \}$               & $\{-\eta \sqrt{d}  + \epsilon_1/\sqrt{d}, -\eta \sqrt{d}  - \epsilon_1/\sqrt{d} \}$               \\ \bottomrule
\end{tabular}}
\end{table}

Note that any adversary that moves the perturbation away from the y-axis is uninteresting for our comparison, since irrespective of a correct perturbation type prediction by $C_{adv}$, either of the two second-level models naturally obtains a high accuracy on such inputs. Hence, we define the following decision rule with all the remaining cases mapped to $\ell_1$ perturbation type.
\begin{align}
\label{eqn:c_adv_decision}
        C_{adv} (x)
        =
        \begin{cases}
            % \ell_1,& \text{if } \quad |x_0| \in [0,1-\epsilon_\infty] \cup [1+\epsilon_\infty, \infty) \\
            1,& \text{if } \quad ||x_0|-\alpha| < \epsilon_\infty + \frac{\alpha}{2}  \\
            % &\& \quad |x_M| \in [0,(\eta-\epsilon_1/d) \sqrt{d}] \cup [(\eta+\epsilon_1/d) \sqrt{d}, \infty) \\
            0,              & \text{otherwise} \\
        \end{cases}
\end{align}
where the output 1 corresponds to the classifier predicting the presence of $\ell_\infty$ perturbation in the input, while an output of 0 suggests that the classifier predicts the input to contain perturbations of the $\ell_1$ type.

If we consider a black-box setting where the adversary has no knowledge of the classifier $C_{adv}$, and can only attack $\mp{}$, it is easy to see that the proposed pipeline obtains a high adversarial accuracy against the union of $\ell_1$ and $\ell_\infty$ perturbations (since the given decision rule correctly classifies known examples as simulated in Figure~\ref{fig:robustness_gaussian}).


Note: (1) There exists a single model that can also achieve robustness against the union of $\ell_1$ and $\ell_\infty$ perturbations, however, learning this model may be more challenging in real data settings. (2) The classifier need not be perfect.

\begin{figure*}[t]
\centering
  \includegraphics[width=0.6\linewidth]{figures/two_stage.pdf}
\caption{\textbf{Simulation}: Decision boundary of the overall two stage classifier. $x_M$ represents the meta feature as defined in Equation~\ref{eqn:meta-feature} and $x_0$ is the first dimension of the input.}
 \label{fig:perturb_classifier_gaussian}
 \vspace{-1em}
\end{figure*}

\subsection{Trade-off between attacking $\mp{}$ and $C_{adv}$}
\label{app:subsec:trade-off}
To obtain true robustness it is important that the entire pipeline is robust against adversarial attacks. More specifically, in this section we demonstrate the natural tension that exists between fooling the top level attack classifier (by making an adversarial attack less representative of its natural distribution) and fooling the bottom level adversarially robust models (requiring stronger attacks leading to a return to the attack's natural distribution).

The accuracy of the pipelined model $f$ against any input-label pair $(x,y)$ sampled through some distribution $\mathcal{N}(y\boldsymbol\mu_{adv}, \boldsymbol\Sigma)$ (where $\boldsymbol\mu_{adv}$ incorporates the change in the input distribution owing to the adversarial perturbation) is given by:
\begin{equation}
    \begin{split}
        \mathbb{P} \left[f(x) = y \right] 
         &=
         \mathbb{P}_{x\sim\mathcal{N}(y\boldsymbol\mu_{adv}, \boldsymbol\Sigma)} \left[C_{adv}(x)\right]\mathbb{P}_{x\sim\mathcal{N}(y\boldsymbol\mu_{adv}, \boldsymbol\Sigma)} \left[y\cdot M_{\ell_\infty}(x) > 0 | C_{adv}(x) \right] \\
         &\quad\quad\quad\quad+ (1 - \mathbb{P}_{x\sim\mathcal{N}(y\boldsymbol\mu_{adv}, \boldsymbol\Sigma)} \left[C_{adv}(x)\right])
         \mathbb{P}_{x\sim\mathcal{N}(y\boldsymbol\mu_{adv}, \boldsymbol\Sigma)} \left[y\cdot M_{\ell_1}(x) > 0 | \lnot C_{adv}(x) \right], \\
         &=
         \mathbb{P}_{x\sim\mathcal{N}(\boldsymbol\mu_{adv}, \boldsymbol\Sigma)}\left[C_{adv}(x)\right]\mathbb{P}_{x\sim\mathcal{N}(\boldsymbol\mu_{adv}, \boldsymbol\Sigma)} \left[ M_{\ell_\infty}(x) > 0 | C_{adv}(x) \right]
         \\
         &\quad\quad\quad\quad+ (1 - \mathbb{P}_{x\sim\mathcal{N}(\boldsymbol\mu_{adv}, \boldsymbol\Sigma)}\left[C_{adv}(x)\right])
         \mathbb{P}_{x\sim\mathcal{N}(\boldsymbol\mu_{adv}, \boldsymbol\Sigma)} \left[ M_{\ell_1}(x) > 0 | \lnot C_{adv}(x) \right]. \\
    \end{split}
\end{equation}

\paragraph{$\ell_\infty$ adversary.}
To simplify the analysis, we consider loose lower bounds on the accuracy of the model $f$ against the $\ell_\infty$ adversary. 
Recall that the decision of the attack classifier is only dependent on the input $x_0$. 
Irrespective of the input features $x_i \forall i> 0$,  it is always beneficial for the adversary to perturb the input by $\mu_i = -\epsilon_\infty$. However, the same does not apply for the input $x_0$.
Analyzing for the scenario when the true label $y=1$, if the input $x_0$ lies within $\frac{\alpha}{2} + \epsilon_\infty$ of the mean $\alpha$, irrespective of the perturbation, the output of the attack classifier $C_{adv} = 1$. The $M_{\ell_\infty}$ model then always correctly classifies these inputs. The overall robustness of the pipeline requires analysis for the case when input lies outside $\frac{\alpha}{2} + \epsilon_\infty$ of the mean as well. However, we consider that the adversary always succeeds in such a case in order to only obtain a loose lower bound on the robust accuracy of the pipeline model $f$ against $\ell_\infty$ attacks.

\begin{equation}
    \begin{split}
        \mathbb{P} \left[f(x) = y \right] 
         &=
         \mathbb{P}_{x\sim\mathcal{N}(\boldsymbol\mu_{adv}, \boldsymbol\Sigma)}\left[C_{adv}(x)\right]\mathbb{P}_{x\sim\mathcal{N}(\boldsymbol\mu_{adv}, \boldsymbol\Sigma)} \left[ M_{\ell_\infty}(x) > 0 | C_{adv}(x) \right]
         \\
         &\quad\quad\quad\quad+ (1 - \mathbb{P}_{x\sim\mathcal{N}(\boldsymbol\mu_{adv}, \boldsymbol\Sigma)}\left[C_{adv}(x)\right])
         \mathbb{P}_{x\sim\mathcal{N}(\boldsymbol\mu_{adv}, \boldsymbol\Sigma)} \left[ M_{\ell_1}(x) > 0 | \lnot C_{adv}(x) \right], \\
         &\geq 
         \mathbb{P}_{x\sim\mathcal{N}(\boldsymbol\mu_{adv}, \boldsymbol\Sigma)}\left[C_{adv}(x)\right]\mathbb{P}_{x\sim\mathcal{N}(\boldsymbol\mu_{adv}, \boldsymbol\Sigma)} \left[ M_{\ell_\infty}(x) > 0 | C_{adv}(x) \right],\\
         &\geq 
         \mathbb{P}_{x\sim\mathcal{N}(\boldsymbol\mu, \boldsymbol\Sigma)}\left[ |x_0 - \alpha| \leq \frac{\alpha}{2} - \epsilon_\infty \right],\\
         &\geq 
         1 - 2\mathbb{P}_{x\sim\mathcal{N}(\boldsymbol\mu, \boldsymbol\Sigma)}\left[ x_0 \leq \alpha - \frac{\alpha}{2} + \epsilon_\infty \right],\\
    z &= \frac{(\alpha - \frac{\alpha}{2} + \epsilon_\infty) - \alpha}{\sigma} = -\frac{\alpha}{2\sigma} + \frac{\alpha + 2\sigma}{\sigma\sqrt{d}}.
    \end{split}
\end{equation}
using the assumptions in (\ref{eqn:assumptions}),
\begin{equation}
\label{eqn:l1_robust_acc_final}
     \mathbb{P} \left[f(x) = y \right]  \sim 0.99.
\end{equation}

\paragraph{$\ell_1$ adversary.}
It may be noted that a trivial way for the $\ell_1$ adversary to fool the attack classifier is to return a perturbation $\delta_1 = 0$. In such a scenario, the classifier predicts that the adversarial image was subjected to an $\ell_\infty$ attack. The label prediction is hence made by the $M_{\ell_\infty}$ model. But we know from (\ref{eqn:linf_robust_acc}) that the $M_{\ell_\infty}$ model predicts benign inputs correctly with a probability $P_{\infty,\infty}>0.99$, hence defeating the adversarial objective of misclassification.
To achieve misclassification over the entire pipeline, the optimal perturbation decision for the $\ell_1$ adversary is as follows: when $x_0 \in \left[- \alpha - \frac{\alpha}{2} - \epsilon_1, -\alpha + \frac{\alpha}{2} + \epsilon_1\right]$, the adversary can fool the pipeline by ensuring that $C_{adv}(x) = 1$. However, in all the other cases, irrespective of the perturbation, either $C_{adv} = 0$ or the input feature $x_0$ has the same sign as the label $y$. Since $P_{1,1} > 0.99$ for the $M_{\ell_1}$ model, for all the remaining inputs $x_0$ the model correctly predicts the label with probability greater than $0.99$ (approximate lower bound). 
We formulate this trade-off to elaborate upon the robustness of the proposed pipeline.

\begin{equation}
    \begin{split}
        \mathbb{P} \left[f(x) = y \right] 
         &=
         \mathbb{P}_{x\sim\mathcal{N}(\boldsymbol\mu_{adv}, \boldsymbol\Sigma)}\left[C_{adv}(x)\right]\mathbb{P}_{x\sim\mathcal{N}(\boldsymbol\mu_{adv}, \boldsymbol\Sigma)} \left[ M_{\ell_\infty}(x) > 0 | C_{adv}(x) \right]
         \\
         &\quad\quad\quad\quad+ (1 - \mathbb{P}_{x\sim\mathcal{N}(\boldsymbol\mu_{adv}, \boldsymbol\Sigma)}\left[C_{adv}(x)\right])
         \mathbb{P}_{x\sim\mathcal{N}(\boldsymbol\mu_{adv}, \boldsymbol\Sigma)} \left[ M_{\ell_1}(x) > 0 | \lnot C_{adv}(x) \right], \\
         &\geq 
         \mathbb{P}_{x\sim\mathcal{N}(\boldsymbol\mu, \boldsymbol\Sigma)}
         \left[ -\alpha -\frac{\alpha}{2} -\epsilon_1\leq x_0 \leq -\alpha + \frac{\alpha}{2} + \epsilon_1 \right] \\
         &\quad \quad \quad \quad + 0.999(\mathbb{P}_{x\sim\mathcal{N}(\boldsymbol\mu, \boldsymbol\Sigma)}
         \left[ x_0 < -\alpha -\frac{\alpha}{2} -\epsilon_1 \text{ or } x_0 > -\alpha + \frac{\alpha}{2} + \epsilon_1 \right] ),\\
         &\geq 
         0.999(\mathbb{P}_{x\sim\mathcal{N}(\boldsymbol\mu, \boldsymbol\Sigma)}
         \left[ x_0 < -\alpha -\frac{\alpha}{2} -\epsilon_1  \text{ or } x_0 > -\alpha + \frac{\alpha}{2} + \epsilon_1 \right] ).
    % z &= \frac{(\alpha - \frac{\alpha}{2} + \epsilon_\infty) - \alpha}{\sigma} = -2 + \frac{\alpha + \frac{\alpha}{2}}{\sigma\sqrt{d}}
    \end{split}
\end{equation}
using the assumptions in (\ref{eqn:assumptions}),
\begin{equation}
\label{eqn:l1_robust_acc}
     \mathbb{P} \left[f(x) = y \right]  \sim 0.99.
\end{equation}

This concludes the proof for Theorem~\ref{thm:trade-off}, showing that an adversary can hardly stage successful attacks on the entire pipeline and faces a natural tension between attacking the label predictor and the attack classifier. We verify these results via a simulation in Figure~\ref{fig:perturb_classifier_gaussian}. We emphasize that these accuracies are lower bounds on the actual robust accuracy, and the objective of this analysis is not to find the optimal solution to the problem of multiple perturbation adversarial training, but to elucidate the trade-off between attacking the two~pipeline~stages.


\section{Model Architecture}
\label{app:architecture}
\paragraph{Second-level $\mp{}$ models.} A key advantage of {\ours} is that we can build upon existing defenses against individual perturbation type. Specifically, for MNIST, we use the same CNN architecture as \citet{zhang2019theoretically} for our $\mp{}$ models, and we train these models using their proposed TRADES loss. For CIFAR-10, we use the same training setup and model architecture as \citet{carmon2019unlabeled}, which is based on a robust self-training algorithm that utilizes unlabeled data to improve the model robustness. 

\paragraph{Perturbation classifier $C_{adv}$.} For both MNIST and CIFAR-10 datasets, the architecture of the perturbation classifier $C_{adv}$ is similar to the individual $\mp{}$ models. Specifically, for MNIST, we use the CNN architecture  in \citet{zhang2019theoretically} with four convolutional layers, followed by two fully-connected layers. For CIFAR-10, $C_{adv}$ is a WideResNet \citep{zagoruyko2016wide} model with depth 16 and widening factor of 2 (WRN-16-2). The architectures for classifying $\ell_p$ perturbations and common corruptions are largely the same, except that the final classification layers have different dimensions due to the different label set sizes.

\section{Training Details}
\label{app:hp_used}
\subsection{Specialized Robust Predictors $\mp{}$}
\paragraph{MNIST.}
We use the Adam optimizer \citep{kingma2014adam} along with a piece-wise linearly varying learning rate schedule \citep{smith2018disciplined} to train our models with a maximum learning rate of $10^{-3}$. The base models $M_{\ell_1}, M_{\ell_2}, M_{\ell_\infty}$ are trained using the TRADES algorithm for 20 iterations, and step sizes $\alpha_1$ = 2.0, $\alpha_2$ = 0.3, and $\alpha_\infty$ = 0.05 for the $\ell_1, \ell_2, \ell_\infty$ attack types within perturbation radii $\epsilon_1$ = 10.0, $\epsilon_2$ = 2.0, and $\epsilon_\infty$ = 0.3 respectively.\footnote{We use the Sparse $\ell_1$ descent \citep{tramer2019adversarial} for the PGD attack in the $\ell_1$ constraint.}

\paragraph{CIFAR10.}
The individual $\mp{}$ models are trained to be robust against $\{\ell_\infty, \ell_1, \ell_2\}$ perturbations of $\{\epsilon_\infty, \epsilon_1, \epsilon_2\} = \{0.03, 10.0, 0.5\}$ respectively.
For CIFAR10, the attack step sizes $\{\alpha_\infty, \alpha_1, \alpha_2\} = \{0.005, 2.0, 0.1\}$ respectively. The training of the individual $\mp{}$ models is directly based on the work of \citet{carmon2019unlabeled}.

\subsection{Perturbation classifier $C_{adv}$} 
\paragraph{MNIST.} 
We train the model for 5 epochs using the SGD optimizer with
weight decay as $5\times 10^{-4}$. We used a variation of the learning
rate schedule from \citet{smith2018disciplined}, which is piecewise linear
from $5\times 10^{-4}$ to $10^{-3}$ over the first 2 epochs, and down to 0 till the end. The batch size is set to 100 for all experiments.

\paragraph{CIFAR10.}
We train the model for 5 epochs using the SGD optimizer with weight decay as $5\times 10^{-4}$. We used a variation of the learning rate schedule from \citet{smith2018disciplined}, which is piecewise linear from $5\times 10^{-3}$ to $10^{-2}$ over the first 2 epochs, and down to 0 till the end. The batch size is set to 100 for all experiments.

\paragraph{Creating the Adversarial Perturbation Dataset.}
We create a static dataset of adversarially perturbed images and their corresponding attack label for training the perturbation classifier $C_{adv}$. For generating adversarial images, we perform weak adversarial attacks that are faster to compute. In particular, we perform 10 iterations of the PGD attack. 
For MNIST, the attack step sizes $\{\alpha_\infty, \alpha_1, \alpha_2\} = \{0.05, 2.0, 0.3\}$ respectively. For CIFAR10, the attack step sizes $\{\alpha_\infty, \alpha_1, \alpha_2\} = \{0.005, 2.0, 0.1\}$ respectively. Note that we perform the Sparse-$\ell_1$ or the top-k PGD attack for the $\ell_1$ perturbation ball, as introduced by \citet{tramer2019adversarial}. We set the value of k to 10, that is we move by a step size $\frac{\alpha_1}{k}$ in each of the top 10 directions with respect to the magnitude of the gradient.

\paragraph{CIFAR10-C.}
We use a dropout value of 0.3 along with the same optimizer (SGD). We use a learning rate of 0.01 for 5 epochs, with linear rate decay to 0.001 between the second epoch and the fifth epoch.
For experiments on classifying corruptions of severity 1, we find that the model takes longer to train. Hence, we train the model for 10 epochs, whereas all other models (at other severity levels) were trained for 5 epochs.

\section{Attacks Used for Evaluation}
\label{app:attacks_used}
A description of all the attacks used for evaluation of the models is presented here. 
% From the Foolbox library\citep{rauber2017foolbox}, apart from $\ell_1, \ell_2$ and $\ell_\infty$ PGD adversaries, we also evaluate the following attacks for different perturbation types.
% (1) For $\ell_1$ perturbations, we include the Salt \& Pepper Attack (SAPA) \citep{rauber2017foolbox} and Pointwise Attack (PA) \citep{schott2018towards}. 
% (2) For $\ell_2$ perturbations, we include the Gaussian noise attack \citep{rauber2017foolbox}, Boundary Attack \citep{brendel2018decision}, DeepFool \citep{moosavi2016deepfool}, 
% Pointwise Attack (PA) \citep{schott2018towards}, 
% DDN attack \citep{rony2019decoupling}, and C\&W attack \citep{carlini2017towards}. 
% (3) For $\ell_\infty$ perturbations, we include FGSM attack \citep{goodfellow2014explaining} and the Momentum Iterative Method~\citep{Dong_2018_CVPR}. 
From the AutoAttack library \citep{croce2020reliable}, we make use of all the three variants of the Adaptive PGD attack (APGD-CE, APGD-DLR, APGD-T) along with the targeted and standard version of Fast Adaptive Boundary Attack (FAB, FAB-T) \citep{croce2019minimally} and the Square Attack \citep{andriushchenko2019square}. We utilize the AA$^+$ version in the auto-attack library for stronger attacks. 

\paragraph{Attack Hyperparameters.}
For the attacks in the AutoAttack library we use the default parameter setting in the strongest available mode (such as AA$^+$). For the custom PGD attacks, we evaluate the models with 10 restarts and 200 iterations of the PGD attack. The step sizes of the $\{\ell_\infty, \ell_1, \ell_2\}$ PGD attacks are set as follows. For MNIST, the attack step sizes $\{\alpha_\infty, \alpha_1, \alpha_2\} = \{0.01, 1.0, 0.1\}$ respectively. For CIFAR10, the attack step sizes $\{\alpha_\infty, \alpha_1, \alpha_2\} = \{0.003, 1.0, 0.02\}$ respectively.

Further, in line with previous work \citep{tramer2019adversarial, maini2019adversarial} we evaluate our models on the first 1000 images of the test set of MNIST and CIFAR-10, since many of the attacks employed are extremely computationally expensive and slow to run. Specifically, on a single GPU, the entire evaluation for a single model against all the attacks discussed with multiple restarts will take nearly 1 month, and is not feasible.

\begin {figure}[t]
\begin{subfigure}[b]{\linewidth}
      \centering
            \includegraphics[width=0.7\linewidth]{figures/mnist_fourier.jpg}
        \caption{MNIST dataset}\label{fig:fourier-mnist}
      \end{subfigure}\\
      \begin{subfigure}[b]{\linewidth}
      \centering
            \includegraphics[width=0.7\linewidth]{figures/cifar_fourier.jpg}
        \caption{CIFAR10 dataset}\label{fig:fourier-cifar10}
      \end{subfigure}
      
      \caption{We present the Fourier spectrums of various attacks on a vanilla model trained on (a) MNIST and (b) CIFAR10 datasets by averaging the per-pixel DFT over the entire test set, i.e., over the $\ell_\infty, \ell_1, \ell_2$ adversarial examples corresponding to each image in the test set.}
      \label{fig:fourier}
\end{figure}

\section{Fourier Features}
\label{app:fourier}
\citet{yin2019fourier} studied various perturbations in their Fourier domain. Their work mainly focused on studying the Fourier spectrum of various common corruptions, and they showed how model robustness was affected by the data augmentation scheme used. In particular, they found that certain augmentation strategies benefit robustness to perturbations in the high frequency domain.

On the contrary, in our work, we use Fourier features to classify perturbation types. While \citet{yin2019fourier} directly studied only the perturbation ($\delta$) added to the image, we visualize the Fourier transform of the actual perturbed image ($\mathbf{x}+\delta$). This makes it more challenging to distill the perturbation from the original image. Secondly, we study the Fourier transform of various adversarially crafted examples. In what follows, we will first provide a visual example to justify how adversarial examples crafted by different attack types, have different Fourier spectrums. We then utilise this property to use Fourier features as an input to the perturbation classifier for classifying the perturbation type. 

\paragraph{Fourier Spectrum.} We follow the same naming convention as \citet{yin2019fourier}. For an input image $\mathbf{x} \in \mathbb{R}^{d_1 \times d_2}$, we will represent the 2-dimensional discrete  Fourier transform (DFT) by $\mathcal{F}: \mathbb{R}^{d_1\times d_2} \rightarrow \mathbb{C}^{d_1\times d_2}$.  $\mathcal{F}^{-1}$ represents the inverse DFT.
% We shift the low frequency components to the center of the spectrum to aid visualization. 
Since the Fourier transform belongs to the complex plane, we estimate $\mathbb{E}\left[|\mathcal{F}(\mathbf{x}_{adv})[i,j]|\right]$ by averaging over adversarial examples generated for each image in the test set.

Note that \citet{yin2019fourier} had estimated only the perturbation ($\mathbb{E}\left[|\mathcal{F}(\mathbf{x}_{adv}-\mathbf{x})[i,j]|\right]$) and not the perturbed image in their work. However, since at test time we do not have access to the original image, we only perform our analysis based on the perturbed input.

We present the Fourier spectrums in Figure~\ref{fig:fourier}. While adversarial examples typically have an imperceptible amount of perturbation for the human eye, the visualization of these adversarial examples through the Fourier spectrums helps us visually distinguish between them. We also note that the Fourier spectrum for each attack does not show similar characteristics across different datasets (MNIST and CIFAR10). However, the characteristics stay consistent when independently attacking a given model on the same dataset. 

We use this observation to augment \ours{} with an ensemble of diverse perturbation classifiers. We do so by training another model $C_{adv}$ for which the inputs are \emph{only} the Fourier features of the corresponding adversarial examples. The training process and architecture for such a classifier stay identical to those of one that classifies adversarial examples in their image domain. 



\section{Perturbation Categorization}
\subsection{Empirical Perturbation Overlap}
\label{app:perturb_overlap_stats}
While we justify the choice of perturbation sizes in our theoretical proofs in Appendix~\ref{app:sep:error} and~\ref{app:subsec:perturbation_size}, in this section we demonstrate the empirical agreement of the choices of perturbation sizes we make for our results on MNIST and CIFAR10 datasets. To measure how often adversarial perturbations of different attacks overlap, we empirically quantify the overlapping regions by attacking a benign model with PGD attacks. In Table~\ref{table:app:vanilla-perturb-stats} we report the range of the norm of perturbations in the alternate perturbation region for any given attack type. The observed overlap is exactly 0\% in all cases and the observation is consistent across MNIST and CIFAR10 datasets.

\begin{table}[t]
  \caption{\textbf{Vanilla Model:} Empirical overlap of $\ell_{p,\epsilon_p}$ attack perturbations in different $\ell_{q,\epsilon_q}$ regions for (a) MNIST $(\epsilon_1, \epsilon_2, \epsilon_\infty) = (10,2.0,0.3)$; (b) CIFAR-10 $(\epsilon_1, \epsilon_2, \epsilon_\infty) = (10,0.5,0.03)$. Each column represents the range (min - max) of $\ell_q$ norm for perturbations generated using $\ell_p$ PGD attack.}
  \label{table:app:vanilla-perturb-stats}
  \centering
  \scalebox{0.94}{
  \begin{tabular}{l|rrr|rrr}
    \hline
    \textbf{Attack} &\multicolumn{3}{c}{\textbf{MNIST}} & \multicolumn{3}{c}{\textbf{CIFAR10}}\\
    \cline{2-7}
    & $\ell_\infty < 0.3$ & $\ell_2 < 2.0$ & $\ell_1 < 10$  
    & $\ell_\infty < 0.03$ & $\ell_2 < 0.5$ &    $\ell_1 < 10$\\
    \hline 
PGD $\ell_\infty$ 
& $\leq$ 0.3 & (3.67 - 6.05)     & (54.8 - 140.9)   &  
$\leq$ 0.03 & (1.33 - 1.59)      & (62.7 - 85.5)    \\
PGD $\ell_2$      
& (0.40 - 0.86)      & $\leq$ 2.0 & (11.2 - 24.1)    
& (0.037 - 0.10)      & $\leq$ 0.5 & (15.4 - 20.9)    \\
Sparse $\ell_1$   
& (0.70 - 1.0)       & (2.08 - 2.92)     & $\leq$ 10.0 
& (0.27 - 0.77)       & (1.32 - 1.88)      & $\leq$ 10.0 \\
\hline
\end{tabular}}
\end{table}

\begin{table}[t]
  \caption{\textbf{\ours :} Empirical overlap of $\ell_{p,\epsilon_p}$ attack perturbations in different $\ell_{q,\epsilon_q}$ regions for (a) MNIST $(\epsilon_1, \epsilon_2, \epsilon_\infty) = (10,2.0,0.3)$; (b) CIFAR-10 $(\epsilon_1, \epsilon_2, \epsilon_\infty) = (10,0.5,0.03)$. Each column represents the range (min - max) of $\ell_q$ norm for perturbations generated using $\ell_p$ PGD attack.}
  \label{table:app-protector-perturb-stats}
  \centering
  \scalebox{0.94}{
  \begin{tabular}{l|rrr|rrr}
    \hline
    \textbf{Attack} &\multicolumn{3}{c}{\textbf{MNIST}} & \multicolumn{3}{c}{\textbf{CIFAR10}}\\
    \cline{2-7}
    & $\ell_\infty < 0.3$ & $\ell_2 < 2.0$ & $\ell_1 < 10$  
    & $\ell_\infty < 0.03$ & $\ell_2 < 0.5$ &    $\ell_1 < 10$\\
    \hline 
PGD $\ell_\infty$ 
& $\leq$ 0.3  & (5.03-6.12) & (100.40-138.52) 
& $\leq$ 0.03  & (1.46-1.69) & (73.15-93.26) \\
PGD $\ell_2$      
& (0.35-0.95) & $\leq$2.0   & (17.06-27.88)   
& (0.036-0.29) & $\leq$0.5   & (5.83-21.21)  \\
Sparse $\ell_1$   
& (0.81-1.0)  & (2.13-2.98) & $\leq$10.0      
& (0.42-1.0)   & (1.50-2.91) & $\leq$10.0 \\
\hline
\end{tabular}}
\end{table}


\begin{table*}[h]
  \caption{Perturbation type classification accuracy for different perturbation types. The perturbation classifier $C_{adv}$ is trained on adversarial examples against two $\mp{}$ models. Each column represents the model used to create the transfer-based attack via the attack type in the corresponding row. The represented accuracy is an aggregate over 1000 randomly sampled attacks of the $\ell_\infty,\ell_2,\ell_1$ types for the corresponding algorithms (and datasets).}
%   .~\xinyun{TODO: add numbers for 2 and 3 base models.}.\pratyush{Update numbers once known}}
  \label{table:c-adv-transfer}
  \centering
  \begin{tabular}{l|rrrrrr}
    \hline
                                & $M_{\ell_\infty}$ & $M_{\ell_2}$ & $M_{\ell_1}$ & MAX & AVG & MSD \\
    \hline
    MNIST-PGD               & 100\% & 100\% & 99.3\% & 99.0\% & 99.6\% & 99.1\% \\
    MNIST-AutoAttack                          & 100\%&	100\%&	99.0\% &	99.5\%&	100\%&	100\% \\
    CIFAR10-PGD & 99.9\%&	99.5\%&	100\%&	100\%&	98.7\%&	95.7\% \\
    CIFAR10-AutoAttack & 99.9\%&	99.9\%&	100\%&	100\%&	99.7\%&	99.7\% \\
    \hline
  \end{tabular}
\end{table*}

To contrast the results with that of attacking a vanilla model, we also present results on the perturbation overlap when we attack \ours{} with PGD attacks (in Table~\ref{table:app-protector-perturb-stats}). It is noteworthy that the presence of a perturbation classifier forces the adversaries to generate such attacks that increase the norm of the perturbations in alternate $\ell_q$ region. Secondly, we also observe that in the case of CIFAR10, the $\ell_2$ PGD attack has a large overlap with the $\ell_1$ norm of radius 10. However, recall that in case of $\ell_2$ attacks for CIFAR10, both the base models $M_{\ell_1}$ and $M_{\ell_\infty}$ were satisfactorily robust. Hence, the attacker has no incentive to reduce the perturbation radius for an $\ell_q$ norm since the perturbation classifier only performs a binary classification between $\ell_1$ and $\ell_\infty$ attacks. 


\subsection{Robustness of $C_{adv}$}
\label{app:subsec:c-adv}
In this section, we
present the results of the perturbation type classifier $C_{adv}$ against transfer adversaries. The results for the robustness of the perturbation classifier $C_{adv}$ in the presence of adaptive adversaries are presented in Table~\ref{table:c-adv-transfer}.
Note that $C_{adv}$ transfers well across the board, even if the adversarial examples are generated against new models that are unseen for $C_{adv}$ during training, achieving extremely high test accuracy.
Further, even if the adversarial attack was generated by a different algorithm such as from the AutoAttack library, the transfer success of $C_{adv}$ still holds up. 
In particular, the obtained accuracy is $>95\%$ across all the individual test sets created. 
The attack classification accuracy is in general highest against those generated by attacking $M_{\ell_1}$ or $M_{\ell_\infty}$ for CIFAR10, and $M_{\ell_2}$ or $M_{\ell_\infty}$ for MNIST. This is an expected consequence of the nature of generation of the static dataset for training the perturbation classifier $C_{adv}$ as described in Section~\ref{subsec:dataset-creation}.


\begin{table}[h]
  \caption{Classification accuracy for common corruptions at different severity levels. The task is a 19 class classification problem. In the training setting ``Combined'', all images of different severity levels are used for training. The model predicts the corruption type among the 19 possible corruptions.}
  \label{table:app:corruptions-19}
  \centering
  \begin{tabular}{l|rrrrr}
    \hline
    & \multicolumn{5}{c}{Tested on} \\
    \cline{2-6}
    Training         & Level 1 & Level 2 & Level 3 & Level 4 & Level 5  \\
    \hline
    Level Specific             & 87.2\% & 97.7\% & 97.0\% & 98.7\% & 99.5\% \\
    Combined             & 85.4\% & 96.2\% & 97.2\% & 98.1\% & 99.1\%\\
    \hline
  \end{tabular}
\end{table}

\subsection{More Results on Common Corruptions}
\label{app:common-corruptions}
% \paragraph{Classifying Corruption Types.}
For each image in the original CIFAR-10 test set, CIFAR-10-C includes corrupted images of 19 different corruption types at 5 severity levels. In this section, we present results on corruption classification at different severity levels. Specifically, we train a single model on images of all severity levels. Then to evaluate on each of the 5 severity levels, we also train another model on corrupted images of the same level. As mentioned in Section~\ref{subsubsec:common-corruptions}, each corruption type has 9K training samples at each severity level, and 1K for testing. We ensure that all corrupted samples of the same original CIFAR-10 image are in the same data split, so that no sample in the test split corresponds to the same original image in the training split.

We present the corruption type classification accuracies at different severity levels in Table~\ref{table:app:corruptions-19}. We observe that the classification accuracy is around 90\% for all severity levels, even when the severity level is low and the corruptions are hard to notice for the human eye. Note that for a 19-class classification problem, random guessing would only yield about 5\% accuracy. Further, the test accuracy increases as the severity of the corruption increases. This can be explained due to the fact that increasing the magnitude of corruptions makes them more representative and easier to be distinguished from others. Note that models trained on standard image classification tasks are typically more resilient to corruptions at a lower severity, and images with a high corruption severity can be detrimental to the prediction performance of standard classifiers. Therefore, it is important to correctly identify such highly corrupted images. We also note that a combined model trained on multiple corruption severity levels does not have a significant trade-off in test accuracy to those trained on the specific levels. Specifically, the drop in test set accuracy varies between 0.4\% and 1.8\% across various severity levels, and the decrease is much less noticeable when the severity level becomes large.
% \paragraph{Generalization to unseen corruptions.}

% We further evaluate the generalization of the perturbation classifier to unseen corruption types. Specifically, different from the above setting of classifying different corruption types, this time our classifier categorizes all corruption types into 4 categories --- noise, blur, digital, and weather. These 4 categories are defined in the CIFAR-10-C benchmark, and all categories roughly include the same number of corruption types.
% We evaluate the model performance on 4 held-out corruption types, 1 for each category, and we select these corruption types following the model validation setting in~\citet{hendrycks2019benchmarking}.
% From the remaining 15 corruption types, we vary the number of corruptions included for training, and present the results in Table~\ref{tab:corruptions-generalization}. 
% We observe that even if we do not train the perturbation classifier on the same corruption types for testing, the classifier still obtains a high generalization accuracy, i.e., higher than 90\%. These results demonstrate that our perturbation classification approach is still effective even for unseen perturbations.
% % \pratyush{Add results for training for few corruptions and testing on extra validation corruptions.}




% \section{Uniform Average over all $\mp{}$}
% \label{app:uniform-average}


\section{Adaptive Attacks}
\label{app:adaptive}

\subsection{Aggregating predictions from different $\mp{}$ at Inference}
\label{app:inference-exp}
In all our experiments in this work the adversary constructs adversarial examples using the softmax based adaptive strategy for aggregating predictions from different $\mp{}$ models, as described in Equation~\ref{eqn:adaptive_softmax} for the column `Ours' and using the `max' strategy (Equation~\ref{eqn:ctp}) for results described in the column `Ours*'. 

However, for consistency of our defense strategy irrespective of the attacker's strategy, the defender only utilizes predictions from the specialized model $\mp{}$ corresponding to the most-likely attack (Equation~\ref{eqn:ctp}) to provide the final prediction (only forward propagation) for generated adversarial examples. In our evaluation, we found a negligible impact of changing this aggregation to the `softmax' strategy for aggregating the predictions. For example, we show representative results in case of the APGD ($\ell_\infty$, $\ell_2$) attacks on the CIFAR10 dataset in Table~\ref{table:app:inference}.


\begin{table*}[t]
  \caption{Comparison between using a `softmax' based aggregation of predictions from different specialized models versus using the prediction from the model corresponding to the most likely attack (only at inference time). Results are presented for APGD $\ell_2,\ell_\infty$ attacks on the CIFAR10 dataset.}
  \label{table:app:inference}
  \centering
  \begin{tabular}{l|rr}
    \hline
              Attack                              & Max-approach (Eq.~\ref{eqn:ctp}) & Softmax-approach (Eq.~\ref{eqn:adaptive_softmax})\\
    \hline
    APGD-CE $\ell_2$ ($\epsilon_2 = 0.5$)    & 75.7\% & 75.6\%  \\
    APGD-DLR $\ell_2$ ($\epsilon_2 = 0.5$)    & 76.5\% & 76.7\%  \\
    APGD-CE $\ell_\infty$ ($\epsilon_\infty = 0.03$)    & 86.9\% & 86.9\%  \\
    APGD-DLR $\ell_\infty$ ($\epsilon_\infty = 0.03$)    & 91.8\% & 91.2\%  \\
    \hline
  \end{tabular}
\end{table*}

\begin{table*}[t]
  \caption{Performance of Adaptive attacks that attempt to separately fool the perturbation classifier and the alternate specialized robust model. The corresponding objective functions for each attack are specified in Appendix~\ref{app:adaptive}.}
  \label{table:app:adaptive_dual}
  \centering
  \begin{tabular}{l|rr}
    \hline
              Attack                              & Dual Attack (Eq.~\ref{eqn:adaptive_dual_1}) & Binary Attack (Eq.~\ref{eqn:adaptive_dual_2})\\
    \hline
    PGD $\ell_\infty$ ($\epsilon_\infty = 0.03$)    & 69.3\% & 73.2\%  \\
    PGD $\ell_2$ ($\epsilon_2 = 0.5$)    & 72.1\% & 74.8\%  \\
    Sparse PGD $\ell_1$ ($\epsilon_1 = 10$)    & 64.7\% & 59.1\%  \\
    \hline
  \end{tabular}
\end{table*}

\subsection{Trade-off between fooling $\mp{}$ and $C_{adv}$}
The adversary chooses the strongest attack over a set of adaptive attacks targeted at each $\mp{}$. For any data point $(x,y)$, each targeted attack optimises the following constraint:
\begin{equation}
\label{eqn:adaptive_dual}
\begin{split}
    \min_{\delta_p} \ell_p&(x + \delta_p) \\
    \text{s.t.} \quad \mp{}(x + \delta_p) \neq y; &\quad C_{adv}(x + \delta_p) = p 
\end{split}
\end{equation}
We perform the attack for each of the PGD attacks for $p\in\{1,2,\infty\}$. To design the exact objective function for optimization of Equation~\ref{eqn:adaptive_dual}, we take inspiration from a similar exploration by~\citet{carlini2017towards}.

\textbf{First}, we combine a dual loss function for individually fooling the $\mp$ model and the perturbation classifier $C_{adv}$ by giving different importance to each of them using a parameter $\lambda$. More specifically, for an input $(x,y)$, the objective for finding an adversarial example of type $\mathcal{A}\in\mathcal{S}$ can be written as:

\begin{equation}
\label{eqn:adaptive_dual_1}
\begin{split}
    \mathcal{L}_{(x,y,\mathcal{A})} = -1 \cdot \text{CrossEntropyLoss}(C_{adv}(x), \mathcal{A}) + \lambda\cdot\text{CrossEntropyLoss}(\mathcal{M_B}(x), y)
\end{split}
\end{equation}

where $\mathcal{B} = \argmax{C_{adv}(x)}$. We experiment with values of $\lambda \in \{10^{-1}, 1, 10, 100\}$ and report the worst adversarial example in each case.

\textbf{Secondly}, we design an alternate approach where the adversary is constrained to fool the perturbation classifier (owing to a strong binary misclassification loss). It then attempts to fool the alternate $\mp$ model under this constraint. More specifically, if $\mathcal{B} = \argmax{C_{adv}(x)}$, then

\begin{equation}
\label{eqn:adaptive_dual_2}
\begin{split}
    \mathcal{L}_{(x,y,\mathcal{A})} = -1 \cdot (\mathcal{A} =\mathcal{B}) + \lambda\cdot\text{CrossEntropyLoss}(\mathcal{M_B}(x), y)
\end{split}
\end{equation}

We perform the above optimization for the PGD attacks in the $\ell_\infty, \ell_1, \ell_2$ perturbation radius constraints. In case of the $\ell_1$ attack, we optimize using the stronger Sparse-$\ell_1$ attack~\citep{tramer2019adversarial}. The adversarial robustness of \ours{} (on CIFAR10) to these attacks is reported in Table~\ref{table:app:adaptive_dual}. We note that the formulation used in the main paper (Equation~\ref{eqn:adaptive_softmax}) that uses a `softmax' bridge between the two levels of the pipeline performs better than the attacks outlined above. In particular, we observe that adversaries find it difficult to balance the two losses separately in order to satisfy the dual constraint.

\section{Breakdown of Complete Evaluation}
\label{app:res-breakdown}
Now we present a breakdown of results of the adversarial robustness of baseline approaches and {\ours} against all the attacks in our suite. We also report the worst case performance against the union of all attacks. 

\begin{table}
\caption{Attack-wise breakdown of adversarial robustness on the MNIST dataset. \emph{Ours} represents the \ours{} method against the adaptive attack strategy described in Section~\ref{subsec:adaptive}, and~\emph{Ours*} represents the standard attack setting.}
  \centering
\begin{tabular}{l|rrrrrrrr}
\hline
                                         & $M_{\ell_\infty}$ & $M_{\ell_2}$ & $M_{\ell_1}$ & MAX    & AVG    & MSD    & Ours    & Ours*   \\
\hline
Benign Accuracy                        & 99.2\%            & 98.7\%     & 98.8\%     & 98.6\% & 99.1\% & 98.3\% & 98.9\% & 98.9\% \\
\hline
PGD-$\ell_\infty$                      & 92.8\%            & 6.2\%      & 0.0\%      & 50.0\% & 64.8\% & 65.7\% & 83.5\% & 89.1\% \\
APGD-CE                                & 91.5\%            & 3.6\%      & 0.0\%      & 41.0\% & 59.1\% & 65.2\% & 84.3\% & 84.6\% \\
APGD-DLR                               & 91.8\%            & 8.0\%      & 0.0\%      & 43.9\% & 61.9\% & 66.0\% & 88.6\% & 88.4\% \\
APGD-T                                 & 91.9\%            & 2.9\%      & 0.0\%      & 39.6\% & 59.0\% & 64.4\% & 88.0\% & 88.6\% \\
FAB-T                                  & 92.5\%            & 5.0\%      & 0.0\%      & 48.8\% & 64.3\% & 65.5\% & 99.0\% & 98.6\% \\
SQUARE                                 & 90.3\%            & 7.6\%      & 0.0\%      & 45.9\% & 65.1\% & 68.2\% & 93.0\% & 93.3\% \\
$\ell_\infty$ attacks $(\epsilon=0.3)$ & 90.2\%            & 2.6\%      & 0.0\%      & 39.0\% & 57.8\% & 63.5\% & 78.1\% & 79.0\% \\
\hline
PGD-$\ell_2$                           & 84.9\%            & 74.9\%     & 51.6\%     & 63.6\% & 69.5\% & 71.7\% & 73.0\% & 75.5\% \\
DDN                                    & 42.3\%            & 76.0\%     & 53.1\%     & 62.2\% & 64.6\% & 70.1\% & 87.5\% & 94.3\% \\
APGD-CE                                & 78.9\%            & 74.0\%     & 50.7\%     & 61.9\% & 65.0\% & 69.6\% & 72.2\% & 76.4\% \\
APGD-DLR                               & 79.3\%            & 75.2\%     & 54.1\%     & 63.2\% & 65.1\% & 70.9\% & 74.4\% & 78.2\% \\
APGD-T                                 & 80.7\%            & 73.8\%     & 48.0\%     & 61.0\% & 63.9\% & 69.6\% & 70.8\% & 74.3\% \\
FAB-T                                  & 12.2\%            & 74.8\%     & 49.4\%     & 62.5\% & 63.7\% & 69.1\% & 86.9\% & 96.3\% \\
SQUARE                                 & 25.6\%            & 82.3\%     & 66.6\%     & 71.7\% & 71.8\% & 75.0\% & 96.9\% & 96.6\% \\
$\ell_2$ attacks $(\epsilon=2.0)$      & 9.5\%             & 72.3\%     & 47.8\%     & 58.5\% & 58.6\% & 65.7\% & 66.6\% & 72.3\% \\
\hline
PGD-$\ell_1$                           & 72.5\%            & 74.6\%     & 78.5\%     & 52.9\% & 59.3\% & 67.9\% & 73.8\% & 79.4\% \\
FAB-T                                  & 20.0\%            & 71.6\%     & 77.6\%     & 43.9\% & 51.2\% & 67.5\% & 74.3\% & 85.0\% \\
\hline
$\ell_1$ attacks $(\epsilon=10)$       & 18.8\%            & 70.6\%     & 77.5\%     & 41.8\% & 46.1\% & 64.3\% & 68.1\% & 72.5\% \\
\hline
All Attacks                            & 7.3\%             & 2.6\%      & 0.0\%      & 29.1\% & 37.1\% & 57.2\% & 63.6\% & 67.2\% \\
Average All Attacks                    & 69.8\%            & 47.4\%     & 35.3\%     & 54.1\% & 63.2\% & 68.4\% & 83.1\% & 86.6\% \\
\hline
\end{tabular}
\label{table:mnist_breakup}
\end{table}
\subsection{MNIST}
\label{app:mnist}
In Table~\ref{table:mnist_breakup}, we provide a breakdown of the adversarial accuracy of all the baselines, individual $\mp{}$ models and the \ours{} method, with both the adaptive and standard attack variants on the MNIST dataset. 
% The `all attacks' metric represents the worst case performance of the models against the union of all attacks discussed. That is, we consider the union of all misclassifications to report the minimum accuracy assuming that any image can be attacked by all the attacks in the suite. Apart from the `all attacks' metric, 
% We evaluate \ours{} against a stronger adaptive adversary, in terms of the~\emph{all attacks} accuracy, 
\ours{} outperforms prior baselines by $6.4\%$ on the MNIST dataset. 
It is important to note that \ours{} shows significant improvements against most attacks in the suite. 
Compared to the previous state-of-the-art defense against multiple perturbation types (MSD), 
% the accuracy gain on $\ell_\infty$ attacks is especially notable, i.e., greater than $15\%$. In particular, 
if we compare the performance gain on each individual attack algorithm, the improvement is significant, with an average accuracy increase of $14.7\%$ on the MNIST dataset. These results demonstrate that {\ours} considerably mitigates the trade-off in accuracy against individual attack types.


\subsection{CIFAR-10}
\label{app:cifar10}
In Table~\ref{table:cifar10_breakup}, we provide a breakdown of the adversarial accuracy of all the baselines, individual $\mp{}$ models and the \ours{} method, with both the adaptive and standard attack variants on the CIFAR-10 dataset. 
% The `all attacks' metric represents the worst case performance of the models against the union of all attacks discussed. That is, we consider the union of all misclassifications to report the minimum accuracy assuming that any image can be attacked by all the attacks in the suite.
\ours{} outperforms prior baselines by $10\%$. 
Once again, note that \ours{} shows significant improvements against most attacks in the suite. 
Compared to the previous state-of-the-art defense against multiple perturbation types (MSD), 
% the accuracy gain on $\ell_\infty$ attacks is especially notable, i.e., greater than $15\%$. In particular, 
if we compare the performance gain on each individual attack algorithm, the improvement is significant, with an average accuracy increase of $14.2\%$ on the CIFAR-10 dataset. These results demonstrate that {\ours} considerably mitigates the trade-off in accuracy against individual attack types. Further, {\ours} also retains a higher accuracy on benign images, as opposed to past defenses that have to sacrifice the benign accuracy for the robustness on multiple perturbation types. The clean accuracy of {\ours} is over $7\%$ higher than such existing defenses on CIFAR-10, and the accuracy is close to $\mp{}$ models trained for a single perturbation type.
\begin{table}
\caption{Attack-wise breakdown of adversarial robustness on CIFAR-10. \emph{Ours} represents \ours{} against the adaptive attack strategy described in Section~\ref{subsec:adaptive}, and~\emph{Ours*} represents the standard attack setting.}
  \centering
\begin{tabular}{l|rrrrrrrr}
\hline
                                         & $M_{\ell_\infty}$ & $M_{\ell_2}$ & $M_{\ell_1}$ & MAX    & AVG    & MSD    & Ours    & Ours*   \\
\hline
Benign Accuracy                          & 89.5\%            & 93.9\%     & 89.0\%     & 81.0\% & 84.6\% & 81.7\% & 89.0\% & 89.0\% \\
\hline
PGD-$\ell_\infty$                        & 62.3\%            & 36.2\%     & 36.0\%     & 43.2\% & 41.1\% & 46.6\% & 62.3\% & 62.3\% \\
APGD-CE                                  & 62.1\%            & 35.5\%     & 35.9\%     & 38.5\% & 41.1\% & 46.3\% & 62.2\% & 63.9\% \\
APGD-DLR                                 & 60.9\%            & 38.0\%     & 37.7\%     & 39.1\% & 43.3\% & 46.6\% & 59.1\% & 63.8\% \\
APGD-T                                   & 59.4\%            & 34.9\%     & 35.0\%     & 36.5\% & 39.7\% & 43.8\% & 58.7\% & 62.3\% \\
FAB-T                                    & 59.9\%            & 35.9\%     & 35.4\%     & 40.8\% & 40.2\% & 44.0\% & 79.1\% & 84.7\% \\
SQUARE                                   & 67.2\%            & 57.7\%     & 50.5\%     & 51.8\% & 50.8\% & 52.1\% & 85.6\% & 80.3\% \\
\hline
$\ell_\infty$ attacks $(\epsilon=0.03)$  & 59.3\%            & 34.8\%     & 35.0\%     & 34.9\% & 39.7\% & 43.7\% & 56.1\% & 58.4\% \\
\hline
PGD-$\ell_2$                             & 66.5\%            & 77.5\%     & 72.4\%     & 64.4\% & 67.7\% & 66.2\% & 69.4\% & 69.6\% \\
DDN                                      & 66.9\%            & 77.5\%     & 72.6\%     & 64.5\% & 67.7\% & 66.2\% & 83.1\% & 85.2\% \\
APGD-CE                                  & 66.3\%            & 77.4\%     & 72.3\%     & 64.4\% & 67.2\% & 66.1\% & 71.1\% & 70.8\% \\
APGD-DLR                                 & 65.6\%            & 77.6\%     & 72.0\%     & 63.0\% & 66.0\% & 65.3\% & 70.5\% & 70.6\% \\
APGD-T                                   & 65.1\%            & 77.3\%     & 71.5\%     & 62.1\% & 65.5\% & 64.5\% & 69.4\% & 69.6\% \\
FAB-T                                    & 65.0\%            & 77.4\%     & 71.7\%     & 62.7\% & 65.7\% & 64.5\% & 88.7\% & 90.4\% \\
SQUARE                                   & 81.2\%            & 86.2\%     & 81.7\%     & 72.0\% & 77.1\% & 72.2\% & 90.2\% & 92.1\% \\
\hline
$\ell_2$ attacks $(\epsilon=0.5)$        & 64.6\%            & 77.2\%     & 71.5\%     & 61.8\% & 65.5\% & 64.5\% & 69.3\% & 69.4\% \\
\hline
PGD-$\ell_1$                             & 30.2\%            & 48.5\%     & 62.5\%     & 50.8\% & 61.0\% & 58.2\% & 59.8\% & 64.1\% \\
FAB-T                                    & 35.0\%            & 47.2\%     & 61.3\%     & 48.3\% & 63.8\% & 57.7\% & 65.5\% & 69.3\% \\
\hline
$\ell_1$ attacks $(\epsilon=10)$         & 27.6\%            & 45.3\%     & 60.9\%     & 43.7\% & 60.0\% & 56.1\% & 57.9\% & 59.5\% \\
\hline
All Attacks                              & 27.6\%            & 32.9\%     & 35.0\%     & 31.5\% & 39.3\% & 43.5\% & 53.5\% & 54.9\% \\
Average All Attacks                      & 60.9\%            & 59.0\%     & 57.9\%     & 53.5\% & 57.2\% & 57.4\% & 71.6\% & 73.3\% \\
\hline
\end{tabular}
\label{table:cifar10_breakup}
\end{table}


\begin{table}[t]
\centering
\caption{Effect of the number ($n$) of specialized robust predictors $\mp{}$ in \ours{}($n$) on CIFAR-10. The analysis was performed for an architecture that only utilizes the raw input, and not the Fourier features.}
 \label{tab:ablation-summary}
 \scalebox{0.95}{
\begin{tabular}{l|rr}
\hline
& \ours{}(2)   & \ours{}(3)    \\ \hline
Clean accuracy                          & 90.8\%   & 92.2\%          \\
APGD $\ell_\infty$  $(\epsilon=0.03)$   & 64.8\%   & 56.3\%          \\
APGD $\ell_2$  $(\epsilon=0.5)$         & 68.8\%   & 69.2\%          \\
Sparse $\ell_1$ $(\epsilon=10)$     & 55.9\%   & 52.3\%          \\ \hline
\end{tabular}}
\end{table}

\subsection{Different number of second-level $\mp{}$ predictors} 
\label{app:subsec:number-of-mp}
We also evaluate {\ours} with three second-level predictors, i.e., $M_{\ell_1}$, $M_{\ell_2}$ and $M_{\ell_\infty}$. The results are presented in Table~\ref{tab:ablation-summary}. This alternative design reduces the overall accuracy of the pipeline model.
We hypothesize that this happens because
% The main reason could be that 
the $M_{\ell_1}$ model is already reasonably robust against the $\ell_2$ attacks, as shown in Table~\ref{tab:res-cifar10}. However, having both $M_{\ell_1}$ and $M_{\ell_2}$ models 
allows adaptive adversaries to find larger regions for fooling both $C_{adv}$ and $\mp{}$,
% poses the extra difficulties for the perturbation classifier,
thus hurting the overall performance against adaptive adversaries.


\vfill

\bibliography{paper}

\end{document}