% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{graphicx}  
\usepackage{subcaption}
\usepackage{bm}
\usepackage{multirow}
\usepackage{algorithm,algorithmic}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\newcommand\blfootnote[1]{%
  \begin{NoHyper}%
  \renewcommand\thefootnote{}\footnote{#1}%
  \addtocounter{footnote}{-1}%
  \end{NoHyper}%
}

\title{MFA: Multi-layer Feature-aware Attack for Object Detection}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<cwen1998@hust.edu.cn>?Subject= }{Wen Chen}{}} 
\author[3]{\href{mailto:<155252942@qq.com>?Subject= }{Yushan Zhang}{}} 
\author[1]{\href{mailto:<zhli_kaifeng@hust.edu.cn>?Subject= }{Zhiheng Li}{}} 
\author[1,2*]{\href{mailto:<yuehwang@hust.edu.cn>?Subject= }{Yuehuan Wang}{}} 
% Add affiliations after the authors
\affil[1]{%
    School of Artificial Intelligence and Automation \\
    Huazhong University of Science and Technology, Wuhan, China
}
\affil[2]{%
    National Key Lab of Science and Technology on Multi-spectral Information Processing \\
    Wuhan, China
}
\affil[3]{%
    Shanghai Institute of  Satellite Engineering \\
    Shanghai, China
}
  
  \begin{document}
\maketitle

\begin{abstract} 
	Physical adversarial attacks can mislead detectors in real-world scenarios and have attracted increasing attention. However, most existing works manipulate the detector’s final outputs as attack targets while ignoring the inherent characteristics of objects. This can result in attacks being trapped in model-specific local optima and reduced transferability. To address this issue, we propose a \emph{Multi-layer Feature-aware Attack} (MFA) that considers the importance of multi-layer features and disrupts critical object-aware features that dominate decision-making across different models. Specifically, we leverage the location and category information of detector outputs to assign attribution scores to different feature layers. Then, we weight each feature according to their attribution results and design a pixel-level loss function in the opposite optimized direction of object detection to generate adversarial camouflages. We conduct extensive experiments in both digital and physical worlds on ten outstanding detection models and demonstrate the superior performance of MFA in terms of attacking capability and transferability. Our code is available at: \url{https://github.com/ChenWen1997/MFA}.
\end{abstract}

\section{INTRODUCTION}\label{chap1}
\blfootnote{$^*$Corresponding Author}
Deep neural networks (DNNs) have achieved impressive performance in object detection \citep{35,34,37,38}. However, they are found to be vulnerable to adversarial examples \citep{1}, which are elaborately crafted to fool DNNs. The effectiveness of adversarial examples has also been demonstrated in object detection \citep{2}.

\begin{figure}[!htb]
	\centering
	\begin{minipage}[c]{0.45\linewidth}
		\centering
		\includegraphics[width=\linewidth]{Figure/Figure1_a}
		\subcaption{}
		\label{fig1_a}
	\end{minipage} 
  \begin{minipage}[c]{0.45\linewidth}
		\centering
		\includegraphics[width=\linewidth]{Figure/Figure1_b}
		\subcaption{}
		\label{fig1_b}
	\end{minipage} \\
	\begin{minipage}[c]{0.45\linewidth}
		\centering
		\includegraphics[width=\linewidth]{Figure/Figure1_c(1)}
		\subcaption{}
		\label{fig1_c}
	\end{minipage} 
	\begin{minipage}[c]{0.45\linewidth}
		\centering
		\includegraphics[width=\linewidth]{Figure/Figure1_d(1)}
		\subcaption{}
		\label{fig1_d}
	\end{minipage}
	\caption{Attention maps on yolov3 obtained by performing feature attribution on the target object marked by the red box \citep{15}. \textbf{(a)} and \textbf{(b)} show attention maps of different feature layers when the detector detects objects of different scales in the same picture. \textbf{(c)} shows an attention map attributed with category information. \textbf{(d)} shows an attention map attributed with both location and category information.}
\label{fig1}
\end{figure} 

Many adversarial attack methods have been proposed for object detection, which can generally be divided into two categories: \textbf{1) Digital attacks}, which modify the pixels of input images directly in the digital space \citep{10,21,22}, and \textbf{2) Physical attacks}, which perform attacks on physical objects before camera imaging \citep{13,11,8}. Physical attacks typically generate adversarial perturbations in the digital world and then apply them to real objects through painting or direct creation of perturbed objects. In this paper, we focus on physical attacks as they have more practical significance for deployed deep learning applications.

However, existing works always ignore the inherent characteristics of objects, resulting in subpar attack ability and transferability. In particular, these limitations can be summarized as follows: \textbf{(1)} Existing works have yet to attempt to disrupt multi-layer features, which play a significant role in object detection. As illustrated in Figures~\ref{fig1_a} and~\ref{fig1_b}, the detection of different objects with significant scale differences on the same image is performed on different feature layers. \textbf{(2)} Most current methods directly take the final outputs of the model as attack targets~\citep{3,8,16}, which can easily overfit the source model and reduce transferability. The Dual Attention Suppression (DAS) Attack~\citep{5} exploits attention maps to generate perturbations but has certain limitations. Firstly, DAS only utilizes category information to attack a detector, which cannot accurately assign attribution scores to non-target regions, especially when the target is relatively small in the image. As shown in Figure~\ref{fig1_c}, the non-target regions of the attention map have a relatively strong response. Secondly, DAS will compromise the accuracy of importance estimation since using a method similar to Grad-CAM~\citep{28} to average each channel as weight will not be able to distinguish the attribution scores within each channel.

To address the mentioned issues, we propose the Multi-layer Feature-aware Attack (MFA) by distorting critical object-aware features at different layers. Specifically, we first attempt to attribute the location and category information of outputs to different feature layers. As shown in Figure \ref{fig1_d}, the attribution with location and category would assign high attribution scores to the target regions but low attribution scores to the non-target regions. The attribution results can accurately reflect the attribution of each activation of the feature map to the outputs. Subsequently, we take into consideration both the polarity and magnitude of the attribution results and weight each feature accordingly. Finally, we model the generation of the camouflage texture as an optimization problem and optimize in the opposite direction of object detection. Comprehensive experiments confirm that our proposed MFA outperforms state-of-the-art methods.

In summary, our main contributions are as follows.

\begin{itemize}
  \item [1.] 
  We leverage the location and category information of the detector’s outputs to assign attribution scores to features, capturing critical multi-layer object-aware features of target objects.
  \item [2.]
  We propose taking multi-layer features as the attack target and disrupting them at the pixel level, improving the attacking ability and transferability of attacks.
  \item [3.]
  Extensive experiments demonstrate the superior attacking ability and transferability of adversarial examples generated by the proposed MFA compared to state-of-the-art physical attack methods.
\end{itemize}


\section{RELATED WORK} 

\begin{figure*}[!htb]
	\centering
	\includegraphics[width=\textwidth]{Figure/Figure2}   
	\caption{Overview of MFA. Given a training set (\textbf{X}, \textbf{Y}, \textbf{E}) with corresponding binary mask \emph{m} and a 3D vehicle model (\textbf{M}, \textbf{T}), the camouflaged vehicle image \textbf{I} is the rendered result with environmental condition $\emph{e}$ from a renderer $\mathcal{R}$. Next, we use a physical transformation function \bm{$\Phi$} to transfer the camouflaged vehicle into the different physical scenarios and feed it into the detector. Then, we can obtain the importance of multi-layer features by backpropagating the ultimate outputs filtered by post-processing (NMS and comparison with ground truth). Finally, the adversarial camouflage is updated through backpropagation with our devised loss function.}
	\label{fig2}
\end{figure*} 

\paragraph{Physical Attacks on Object Detection}
Physical attack aims to generate adversarial perturbations by modifying the visual characteristics of the real object in the physical world. A simple method is adversarial patch\citep{47}, which is often stuck to a planar object. 
\cite{11} generated a planar adversarial stop sign to fool detectors.
\cite{3} trained a printable patch that can successfully hide a person from a person detector.
\cite{4} adopted a set of transformations to generate adversarial camouflage for non-rigid or non-flat objects.
However, these methods can only attack at certain viewing angles. The more recent approaches involve manipulating the color texture patterns of target 3D objects.
\cite{7} proposed CAMOU to hide vehicles from detectors by training a clone network that simulates applying camouflage to vehicles.
\cite{6} generated an adversarial patch and then repeated and enlarged the patches until they covered the vehicle surface.
Besides, there is a rising trend of leveraging differentiable neural renderers for adversarial camouflage generation.
\cite{5} proposed DAS to generate natural adversarial camouflage using a neural renderer\citep{33} by suppressing the model and human attention.
\cite{52dta} proposed DTA to learn the expected transformation of a rendered object, which can gain both the advantages of the various physical-world transformations and white-box access.
\cite{8} and \cite{17} achieved more robust attacks in multi-view, long-distance, and partial occlusions situations by utilizing a renderer to generate full-coverage camouflage texture.

\paragraph{Feature-level Attacks}
Since the most critical features are shared among different DNNs\citep{20,48}, feature-level attacks have shown promise in synthesizing more transferable adversarial samples.
\cite{18} maximized the feature distance between clean images and adversarial examples in the intermediate layers.
\cite{19} improved the transferability of black-box attacks by increasing the perturbation strength of the feature layer.
\cite{20} used the average activation values to distinguish the positive and negative polarity of the feature.
\cite{21} introduced aggregated gradients to suppress the model-specific features and preserve important features.
\cite{22} weighed neurons using neuron attribution, considering the importance of different neurons.
The above feature-level attacks are digital attacks for classification tasks and are difficult to implement in the physical world.


\paragraph{Feature Attribution Methods}
Feature attribution methods are popular in interpretable machine learning~\citep{23}. These methods accept model inputs and assign attribution scores to input features based on the feature's contribution to the model outputs. There is no consensus on the definition of ``attribution''. In various works, the notion of attribution has been defined as sensitivity~\citep{24}, relevance~\citep{25}, local influence~\citep{26}, Shapley values~\citep{27}, or filter activations~\citep{28}.

\section{APPROACH} \label{chap3}

\subsection{Preliminaries}\label{chap3_1}
Given a training set (\textbf{X}, \textbf{Y}, \textbf{E}), where \textbf{X}, \textbf{Y}, and \textbf{E} are the sampled images, the ground truth and the sampling environmental condition (e.g., transformation and location), let (\textbf{M}, \textbf{T}) denote a 3D real object with a mesh tensor \textbf{M} and a texture tensor \textbf{T}. The image \textbf{I} is the rendered result of the real object (\textbf{M}, \textbf{T}) with environmental condition $\emph{e}\in\textbf{E}$ from a renderer $\mathcal{R}$ by $\textbf{I}=\mathcal{R}((\textbf{M}, \textbf{T}),e)$. To perform physical attacks, we use a transformation function \bm{$\Phi$} to transfer the rendered image to different environment scenarios. The physical transformation will be discussed in depth in Section~\ref{chap3_3}. Then, we generate the input image of the detector $\textbf{I}_{adv}=\bm{\Phi}(\mathcal{R}((\textbf{M}, \textbf{T}_{adv}),e), m, x)$ by replacing the original \textbf{T} with an adversarial texture tensor $\textbf{T}_{adv}$. Now we can obtain the detector's outputs $O=\mathcal{F}(\textbf{I}_{adv}; \theta_f)$, where $\mathcal{F}$ is the detector with parameters $\theta_f$. Taking the yolov3 detector as an example, each anchor point in the output grid contains a vector $[x_{\rm offset}, y_{\rm offset}, w, h, p_{\rm obj}, p_{\rm cls1}, \cdots, p_{\rm clsn}]$ with bounding boxes containing different aspect ratios. $x_{\rm offset}$ and $y_{\rm offset}$ are the positions of the center of the bounding box compared to the current anchor point, $w$ and $h$ are the width and height of the bounding box, $p_{\rm obj}$ is the probability that this anchor point contains an object, and $p_{\rm cls1}$ through $p_{\rm clsn}$ are the class probability scores of the object.

Our attack objective is to generate the adversarial camouflage texture, which can be painted on the surface of the 3D object and hide the object from being detected. We treat the adversarial texture generation as an optimization problem, and our objective function is expressed as follows:
\begin{equation}\label{eqn1}
	\mathop{\arg\max}\limits_{\textbf{T}_{adv}}   J(\mathcal{F}(\bm{\Phi}(\mathcal{R}((\textbf{M}, \textbf{T}_{adv}),e), m, x);\theta_f), \textbf{Y}),
\end{equation}
where $J(\cdot,\cdot)$ measures the distance between the ground truth and the predicted results of the model. We can obtain the adversarial camouflage texture by solving the above optimization problem.

\subsection{Multi-layer Feature-aware Attack}\label{chap3_2}
The key to crafting feature-level attacks is to find a proper way of measuring the importance of each feature. Let $\mathcal{F}$ denote the source model with parameters $\theta_f$. The feature map from the $k$-th layer is expressed as $\mathcal{F}_k(\textbf{I}_{adv}; \theta_f)$ for the input image $\textbf{I}_{adv}$. Since the attribution scores reflect how the features contribute to the final decision, an intuitive strategy is obtaining the gradient commonly used in feature attribution methods~\citep{28}. The attribution scores are therefore written as follows.
\begin{equation}\label{eqn2}
	\Delta_k^{\textbf{I}_{adv}}=\frac{\partial \mathcal{P}(O, y)}{\partial \mathcal{F}_k(\textbf{I}_{adv}; \theta_f)}
\end{equation}
where $y$ is the ground truth and $\mathcal{P}(\cdot,\cdot)$ is post-processing which includes NMS~\citep{50nms} and comparison with ground truth.

The original outputs $O$ contain many more predicted bounding boxes than the actual number of targets. Attributing all predicted bounding boxes is not meaningful due to the high redundancy between them and would result in extensive computation consumption. Therefore, we employ NMS to eliminate redundant predicted bounding boxes and then compare them with the ground truth to filter out the attacked objects.

Utilizing the aforementioned attribution scores $\Delta_k^{\textbf{I}_{adv}}$ as the measurement to weight each feature, reflecting the feature's real influence on the output, we design the loss function to guide the generation of the adversarial camouflage texture. Intuitively, the essential features will yield relatively higher intensity, indicating the efforts of correcting the features to approach the true label, and the sign provides the correcting direction. In the object detection task, the positive features will be corrected in the positive direction and the negative features will be corrected in the negative direction. The objective of generating transferable adversarial examples is exactly the opposite of the correction direction of the object detection task. In other words, we aim to guide the positive features to be corrected in the negative direction and the negative features to be corrected in the positive direction. Therefore, our objective function should be designed to manipulate the features in the opposite direction of the object detector's correction direction. Accordingly, our attack loss function can be written as
\begin{equation}\label{eqn3} 
	L_{adv}=\sum^{H_k}\sum^{W_k} \lvert \Delta_k^{\textbf{I}_{adv}} \odot \mathcal{F}_k(\textbf{I}_{adv}; \theta_f) \rvert 
\end{equation}
where $\odot$ means pixel-wise multiplication, and $H_k$ and $W_k$ denote the height and width of the $k$-th layer feature map.

Additionally, empirical studies from most DNN-based detectors have shown that low-level features have high resolution and contain more location and detail information, and high-level features have a lower resolution but more robust semantic information~\citep{51fpn}. Detectors often use multi-scale features to achieve better performance. Therefore, adversarial attacks should also consider the destruction of multi-scale features. To sum up, Eq.~\ref{eqn3} can be rewritten as
\begin{equation}\label{eqn4}
	L_{adv}=\sum_{k \in \rm K}\sum^{H_k}\sum^{W_k} \lvert \Delta_k^{\textbf{I}_{adv}} \odot \mathcal{F}_k(\textbf{I}_{adv}; \theta_f) \rvert
\end{equation}
where $\rm K$ is the set of target feature layers to attack.

To suppress high-frequency noise and ensure the smoothness of the generated adversarial camouflage, we utilize the smooth loss~\citep{29} to reduce the squared difference
between adjacent pixels. For a rendered vehicle image \textbf{I}, the calculation of smooth loss $L_{smooth}$ can be written as
\begin{equation}\label{eqn5}
	L_{smooth}=\sum_{i,j}\left[(x_{i,j}-x_{i+1,j})^2+(x_{i,j}-x_{i,j+1})^2\right]
\end{equation}
where $x_{i,j}$ is the pixel value of image at coordinate $(i, j)$.


\subsection{Physical Transformation}\label{chap3_3}
To bridge the gap between the digital and physical world, we follow the transformation function \bm{$\Phi$} of \citet{8} to transfer the rendered vehicle to different environment scenarios. However, we discovered that despite preserving the location and rotation information during the sampling stage of the photo-realistic images, the rendered vehicle cannot fully cover the vehicle in the sampled image, resulting in an unnatural appearance. As shown in Figure~\ref{fig_add_a}, the upper outline of the vehicle has an unnatural black edge, and the lower outline is not fully displayed. To address this issue, we introduce a simple but effective method. Specifically, the binary mask is obtained by segmentation from the original photo-realistic image, so we extract the outline of the vehicle in the mask and rendered image, scale and shift the rendered images to align the car's outline, and then feather the binary mask for softer boundaries and more realistic transformation. The visualization of our physical transformation can be seen in Figure~\ref{fig_add_b}.

\begin{equation}
	\mathbf{I}_{adv}=\mathbf{\Phi}\left(\mathbf{I}, m, x\right)=m \cdot \mathcal{T}\left(\mathbf{I}\right)+(1-m) \cdot x
\end{equation}
where $\mathcal{T}$ represents the scaling, translation and other operations performed on the rendered image.

\begin{figure}[!htb]
	\centering
	\begin{minipage}[c]{0.38\linewidth}
		\centering
		\includegraphics[width=\linewidth]{Figure/before.png}
		\subcaption{FCA}
		\label{fig_add_a}
	\end{minipage} 
  \begin{minipage}[c]{0.38\linewidth}
		\centering
		\includegraphics[width=\linewidth]{Figure/after.png}
		\subcaption{Ours}
		\label{fig_add_b}
	\end{minipage}
	\caption{The results of different physical transformations.}
\label{fig_add}
\end{figure} 

\begin{algorithm}[!htb]
    \renewcommand{\algorithmicrequire}{\textbf{Input:}}
	\renewcommand{\algorithmicensure}{\textbf{Output:}}
	\caption{Multi-layer Feature-aware Attack (MFA)}
    \label{algorithm1}
    \begin{algorithmic}[1] % the optional argument controls whether lines are numbered
        \REQUIRE  training set (\textbf{X}, \textbf{Y}, \textbf{E}) with corresponding binary mask \emph{m}, 3D object model (\textbf{M}, \textbf{T}), neural renderer $\mathcal{R}$, physical transformation function \bm{$\Phi$}, object detector $\mathcal{F}$ 
	    \ENSURE adversarial texture $\textbf{T}_{adv}$ % the algorithm's output
        \STATE Initialize $\textbf{T}_{adv}$ with random noise $\textbf{T}_{0} \sim U(0,1)$
        \FOR {$i=1$ \textbf{to} $\mathit{max\_iteration}$}
		\STATE select \emph{minibatch} sample $(x,y,e)$ from training set(\textbf{X}, \textbf{Y}, \textbf{E})
		\STATE $\textbf{I} \leftarrow \mathcal{R}((\textbf{M}, \textbf{T}_{adv}),e)$
		\STATE $\textbf{I}_{adv} \leftarrow$ \bm{$\Phi$} $(\textbf{I}, m, x)$
		\STATE $O \leftarrow \mathcal{F}(\textbf{I}_{adv}; \theta_f)$
		\STATE calculate $\Delta_k^{\textbf{I}_{adv}}$ by Eq. \ref{eqn2}
		\STATE calculate $L_{adv}$ and $L_{smooth}$ by Eq.~\ref{eqn4} and Eq.~\ref{eqn5}
		\STATE optimize the $\textbf{T}_{adv}$ by Eq. \ref{eqn6}
		\ENDFOR
    \end{algorithmic}
\end{algorithm}

\subsection{Overall Optimization Process}\label{chap3_4}
Overall, we generate the adversarial camouflage by jointly optimizing the multi-layer feature attack loss $L_{adv}$ and smooth loss $L_{smooth}$. Substituting Eq.~\ref{eqn4} and Eq.~\ref{eqn5} into Eq.~\ref{eqn1}, we obtain the proposed objective for MFA:
\begin{equation}\label{eqn6}
	\mathop{\arg\min}\limits_{\textbf{T}_{adv}}   L_{adv}+ \lambda L_{smooth}
\end{equation}
where $\lambda$ controls the contribution of the term $L_{smooth}$.
The overall training algorithm for the generation of adversarial camouflage can be described as Algorithm \ref{algorithm1}.

\begin{table*}
    \centering
    \caption{The comparison result of adversarial attacks in the digital space.}\label{tab1}
    \begin{tabular}{cccccccccccc}
      \toprule % from booktabs package
	  \multirow{2}{*}{\textbf{Method}} & \multicolumn{11}{c}{ASR(\%)} \\
      \cmidrule(lr){2-12} & SSD & Faster & Mask & Corner & FCOS & Swin & TOOD & VFNet & yolov5 & yolov7 & \bfseries mASR\\
      \midrule % from booktabs package 
	  	UPC	  & 49.11 & 71.93 & 56.77 & 42.88 & 63.11 & 42.76 & 55.67 & 45.60 & 40.91 & 26.40 & 49.51 \\
		DTA   & 44.32 & 82.08 & 72.21 & 42.20 & 71.16 & 47.84 & 69.48 & 52.65 & 37.91 & 31.09 & 55.09 \\
		ER	  & 45.68 & 88.18 & 63.92 & 48.39 & 60.89 & 43.84 & 74.89 & 65.03 & 44.17 & 43.06 & 57.81 \\
		CAMOU & 49.76 & 81.72 & 76.02 & 47.61 & 72.45 & 49.75 & 70.51 & 60.81 & 49.10 & 38.88 & 59.66 \\
		DAS	  & 90.89 & 87.00 & 78.26 & 61.01 & 82.35 & 64.56 & 81.23 & 81.60 & 46.19 & 51.36 & 72.45 \\
		FCA	  & 86.98 & 75.77 & 81.13 & 62.77 & 88.29 & 71.71 & 73.37 & 64.28 & 75.30 & 72.29 & 75.19 \\
		MFA	  & \textbf{96.39} & \textbf{92.69} & \textbf{92.98} & \textbf{85.62} & \textbf{98.86} & \textbf{87.81} & \textbf{94.01} & \textbf{82.64} & \textbf{95.99} & \textbf{89.96} & \textbf{91.70}\\

      \bottomrule % from booktabs package
    \end{tabular}
\end{table*}

\section{EXPERIMENTS}\label{chap4}
\subsection{Experimental Settings}\label{chap4_1}


\paragraph{Datasets} To compare with previous works, we use the same dataset provided by \cite{5}, which was sampled from CARLA~\citep{30}, a prevalent open-source simulator for autonomous driving research. The CARLA simulator provides a variety of high-fidelity digital scenarios (e.g., modern urban) based on Unreal Engine 4. The training set consists of 12,500 high-resolution images, and the testing set has 3,000 high-resolution images sampled from different angles and distances. The dataset also provides corresponding masks of the vehicle targets for the training and testing sets.

\paragraph{Evaluation Metrics}
To evaluate the performance of our proposed method, we select the commonly used Attack Success Rate (ASR)~\citep{32} as our first evaluation metric, which is defined as the percentage of the target vehicles detected before perturbation and not detected or falsely detected after perturbation. Further, we average the attack success rate of multiple models and call it the \emph{mean Attack Success Rate} (mASR) to better evaluate the cross-model transferability. In addition, we adopt the P@0.5 following \citep{17,8} as our second evaluation metric, which is defined as the percentage of target vehicles correctly detected when the detection IoU threshold is set to 0.5.

\paragraph{Compared methods}
We choose several state-of-the-art works in the 3D attack and physical attack literature, including CAMOU~\citep{7}, ER~\citep{6}, UPC~\citep{4}, DAS~\citep{5}, FCA~\citep{8}, and DTA~\citep{52dta}. Note that UPC and DAS paint the adversarial camouflage only on part of the vehicle model. For a fair comparison, we reimplement them with full-coverage camouflage. The adversarial examples of different methods are shown in Figure~\ref{fig3}.

\begin{figure}[!htb]
	\centering
	\begin{minipage}[c]{0.24\linewidth}
		\centering
		\includegraphics[width=\linewidth]{Figure/CAMOU}
		\subcaption{CAMOU}
		\label{fig3_a}
	\end{minipage} 
  	\begin{minipage}[c]{0.24\linewidth}
		\centering
		\includegraphics[width=\linewidth]{Figure/ER}
		\subcaption{ER}
		\label{fig3_b}
	\end{minipage} 
	\begin{minipage}[c]{0.24\linewidth}
		\centering
		\includegraphics[width=\linewidth]{Figure/UPC}
		\subcaption{UPC}
		\label{fig3_c}
	\end{minipage} 
	\begin{minipage}[c]{0.24\linewidth}
		\centering
		\includegraphics[width=\linewidth]{Figure/DAS}
		\subcaption{DAS}
		\label{fig3_d}
	\end{minipage} 
	\\
	\begin{minipage}[c]{0.24\linewidth}
		\centering
		\includegraphics[width=\linewidth]{Figure/FCA}
		\subcaption{FCA}
		\label{fig3_e}
	\end{minipage}
	\begin{minipage}[c]{0.24\linewidth}
		\centering
		\includegraphics[width=\linewidth]{Figure/DTA}
		\subcaption{DTA}
		\label{fig3_f}
	\end{minipage} 
	\begin{minipage}[c]{0.24\linewidth}
		\centering
		\includegraphics[width=\linewidth]{Figure/MFA}
		\subcaption{MFA}
		\label{fig1_g}
	\end{minipage} 
	\caption{Adversarial examples of different methods}
	\label{fig3}
\end{figure} 

\paragraph{Target models}
We select ten different commonly used model architectures for experiments. Specifically, we use SSD~\citep{49ssd}, Faster RCNN~\citep{35}, Mask RCNN~\citep{34}, Cornernet~\citep{36}, FCOS~\citep{37}, Swin Transformer~\citep{38}, TOOD~\citep{39}, VFNet~\citep{40}, yolov5\footnote{\url{https://github.com/ultralytics/yolov5}\label{foot1}} and yolov7~\citep{42}. In our experiments, all models are the official implementation version of MMDetection~\citep{46}, except for yolov5\footref{foot1} and yolov7\footnote{\url{https://github.com/WongKinYiu/yolov7}\label{foot2}}.

\paragraph{Implementation details}
We train the adversarial camouflage texture on yolov3~\citep{15}. All experiments are under black-box settings. We adopt an Adam optimizer with a learning rate of 0.01. We empirically set $\lambda=10^{-4}$ and train for a maximum of 5 epochs. The other hyperparameters are set as provided by the yolov3 implementation. We conduct the experiment on an NVIDIA RTX 1080Ti 12GB GPU, and all codes are implemented in PyTorch. For all the models, we use the pre-trained version on COCO.

\begin{table*}
    \centering
    \caption{The comparison result of adversarial attacks in the physical space.}\label{tab2}
    \begin{tabular}{cccccccccccc}
      \toprule % from booktabs package
      \multirow{2}{*}{\textbf{Method}} & \multicolumn{11}{c}{P@0.5(\%)} \\
      \cmidrule(lr){2-12} & SSD & Faster & Mask & Corner & FCOS & Swin & TOOD & VFNet & yolov5 & yolov7 & \bfseries Average\\
      \midrule % from booktabs package 
	  RAW	& 90.28 & 99.31 &100.00 & 92.36 & 98.61 & 97.22 & 97.92 &100.00 & 93.75 & 99.31 & 96.88 \\
	  CAMOU	& 70.14 & 33.33 & 69.44 & 68.06 & 31.94 & 70.83 & 72.22 & 77.78 & 56.25 & 71.53 & 62.15 \\
	  DAS	& 47.22 & 48.61 & 54.86 & 34.72 & 32.64 & 70.14 & 66.67 & 61.11 & 57.64 & 65.28 & 53.89 \\
	  FCA	& 45.14 & 47.92 & 56.94 & 36.11 & 22.22 & 54.86 & 71.53 & 64.58 & 48.61 & 57.64 & 50.56 \\
	  MFA	& \textbf{22.92} & \textbf{24.31} & \textbf{36.81} & \textbf{22.22} & \textbf{9.03}  & \textbf{40.28} & \textbf{46.53} & \textbf{47.92} & \textbf{34.72} & \textbf{43.06} & \textbf{32.78} \\
      \bottomrule % from booktabs package
    \end{tabular}
\end{table*}

\subsection{Digital World Attack}\label{chap4_2}
In this section, we evaluate the performance of our generated adversarial camouflages on the vehicle in the digital world under black-box settings. We report the ASR for the detection of the target vehicle. More experimental results can be found in the Supplementary Material.

The comparison results are outlined in Table~\ref{tab1}. Our adversarial camouflage outperforms other methods across all the detectors. Specifically, our adversarial camouflage achieves the highest mASR at \textbf{91.70\%}, and the ASR of each detector exceeds 80\%. Six detectors (SSD, Faster-RCNN, Mask-RCNN, FCOS, TOOD and yolov5) are highly vulnerable to our proposed MFA, with ASR surpassing 90\%. The ASRs of the other four detectors (Cornernet, Swin Transformer, VFNet and yolov7) range between 80\% and 90\%, which may be due to special designs that make them more robust against adversarial attacks on object detection. For example, the backbone of Cornernet comes from the Hourglass Network of pose estimation, and the backbone of Swin Transformer is a novel vision Transformer.

In addition, our proposed MFA improves the mASR by \textbf{19.25\%} against DAS, indicating that our attack can more accurately capture inherent conducive characteristics of objects and successfully paralyze the vehicle detection system. The mASR of MFA is \textbf{16.51\%} higher than that of FCA, which suggests that attacking intermediate features is more transferable than directly attacking the final output layer.

We provide some adversarial camouflage vehicle examples in different scenarios. As illustrated in Figure~\ref{fig4}, with yolov7 as the detector, the vehicle before being painted with adversarial camouflage is correctly detected as a car with high detection confidence. However, after being painted with our adversarial camouflage texture, the target vehicle turns out to be incorrectly detected or undetected, while the vehicles not painted with our adversarial camouflage texture are correctly detected.

\begin{figure}
	\centering
	\includegraphics[width=\linewidth]{Figure/Figure4}   
	\caption{The detection result of the vehicle before and after our attack in the digital world.}
	\label{fig4}
\end{figure} 

\subsection{Physical World Attack}\label{chap4_3}
As for the physical world attack, we conduct several experiments to validate the practical effectiveness of our generated adversarial camouflage. Because it is difficult to guarantee that all other elements except the adversarial camouflages are preserved consistently before and after the attack, we report the P@0.5 for the detection of the target vehicle.

For simplicity, we compare three attack methods that are more robust in digital adversarial attacks (i.e., CAMOU, DAS, FCA). Due to the limitation of funds and conditions, we follow \cite{5} and \cite{8} to print adversarial camouflages by a Xerox Color 550 printer and crop the camouflage parts, then stick them on a 1:32 scale model of an Audi Q5 with different backgrounds to mimic the real car painting in the physical world. To show the efficiency of our adversarial camouflage under various scenarios, we captured 144 pictures of the painted car in different settings (i.e., 8 directions \{left, right, front, back and their corresponding intersection directions\}, 2 angles \{0° and 45°\}, 3 distances \{long, middle, and short distance\} and 3 surroundings) with a Xiaomi 12S phone. The visualization of our generated adversarial camouflages can be found in Figure \ref{fig5}.

\begin{figure}
	\centering
	\includegraphics[width=\linewidth]{Figure/Figure5}   
	\caption{The detection result of the toy cars before and after our attack in the physical world.}
	\label{fig5}
\end{figure} 

The experimental results are shown in Table \ref{tab2}. Each detector correctly detects almost all raw toy cars, with their P@0.5 over 90\%. Compared with other methods, the MFA shows competitively transferable attacking ability in the physical world. Its average P@0.5 is the lowest at \textbf{32.78\%}, significantly better than the compared baselines (e.g., 62.15\% on CAMOU, 53.89\% on DAS, and 50.56\% on FCA, respectively). VFNet is the most robust against adversarial attacks, and Swin Transformer, TOOD and yolov7 also exhibit strong robustness. The conclusion is consistent with the results for digital attacks except for TOOD. TOOD is more robust in the physical world than in the digital world, which is worth further study. Besides, FCOS is the most vulnerable with a maximum drop of 89.58\%. This may be because FCOS is an anchor-free, one-stage model with a relatively simple structure. 

We provide some detection result examples of attacking toy cars in the physical world on yolov7. As shown in Figure \ref{fig5}, the toy cars painted with our adversarial camouflage texture are hidden and undetected. 

To sum up, the experimental results demonstrate that our adversarial camouflages have strong transferable attacking ability in the physical world.

\subsection{Effect of different layer feature to attack}\label{chap4_5}
We take different feature layers as attack objects and observe the attack effect of various layer features. The source model yolov3 has three detection layers, which are called the low layer, middle layer and high layer in this paper for convenience.

First, we compare the impact of attacking a detection layer versus a non-detection layer. We take the layer preceding each of yolov3's detection layers as the non-detection layer to carry out comparative experiments. The results are shown in Table \ref{tab3}. For each adjacent pair of detection layer and non-detection layer, the mASR of the detection layer is higher than that of the non-detection layer. The main reason may be that the detection layers fuse different features, which is more conducive to object detection. So we select the detection layers as the attack target in the rest of this paper.

Furthermore, we evaluate single-layer and multi-layer feature attacks. As shown in Figure \ref{fig6}, multi-layer attacks significantly outperform single-layer attacks (e.g., for the average mASR, 76.24\% on a single layer, 84.98\% on two layers and 91.7\% on three layers, respectively). On the other hand, the attack on the middle layer is better than on the low or high layer. The same conclusion can be drawn from Table \ref{tab3}. The reason might be that low layers have not learned salient features and semantic concepts, while high layers are model-specific and it is easy to get trapped in a local optimum of the source model. By contrast, middle layers have well-separated representations and they are not highly correlated to the model architecture.

\begin{table}[!htb]
    \centering
    \caption{The mASR performance of MFA under different target layer settings.}\label{tab3}
    \begin{tabular}{cc|cc|cc}
      \toprule % from booktabs package
      \multicolumn{2}{c|}{low} & \multicolumn{2}{c|}{middle} & \multicolumn{2}{c}{high} \\
	  \midrule
      non-Det & Det & non-Det & Det & non-Det & Det\\
      \midrule % from booktabs package 
	  57.07 & 79.78 & 75.28 & 84.06 & 63.65 & 64.88\\
      \bottomrule % from booktabs package
    \end{tabular}
\end{table}


\begin{figure}[!htb]
	\centering
	\includegraphics[width=1.0\linewidth]{Figure/Figure6}   
	\caption{The mASR performance for attacks at different layer features.}
	\label{fig6}
\end{figure} 



\subsection{Effect of different outputs for attribution}\label{chap4_4}
Object detection is a multi-output task, and we investigate the influence of using different outputs of the object detection model for attribution in this part. As mentioned in Section \ref{chap3_1}, there are two scores for each anchor point in yolov3: the object score, which can reflect the location information, and the class score, which is the probability of the most likely category of the object.

Figure \ref{fig7} shows the results of attribution using different outputs. The OBJ-CLS approach utilizes the product of object and class scores for attribution, while OBJ only uses the object score and CLS only the class score. The MFA using both object and class scores for attribution yields better results than using either object or class score alone for each model. Specifically, the ASR of OBJ-CLS is higher than that of CLS or OBJ for almost every detector, and the mASR is 91.7\% for OBJ-CLS, 83.92\% for CLS, and 84.64\% for OBJ. This confirms our earlier analysis that using only category information for attribution cannot accurately assign attribution scores to features. The object-aware/important features can be captured by using category and location information to attribute, guiding the generation of more transferable adversarial camouflage. 
\begin{figure}[!htb]
	\centering
	\includegraphics[width=1.0\linewidth]{Figure/Figure7}   
	\caption{The mASR performance of attribution using different outputs.}
	\label{fig7}
\end{figure} 

\subsection{Effect of hyperparameters}\label{chap4_6}
In this section, we conduct several experiments to further investigate the effect of loss function items and the confidence thresholds of NMS.

\paragraph{The effect of hyper-parameter $\lambda$}
The hyper-parameter $\lambda$ controls the contribution of the term $L_{smooth}$. As we can observe from Table \ref{tab5}, when $\lambda$ is between 0 and $10^{-4}$, $L_{adv}$ dominates the optimization direction and $L_{smooth}$ is negligible, resulting in strong attack ability. In particular, the highest mASR of 91.83\% is achieved when $\lambda=10^{-5}$, but the adversarial camouflage appears unnatural. As $\lambda$ continues to increase, $L_{smooth}$ will dominate the optimization direction, thus reducing the attacking ability (e.g., 78.64\% when $\lambda=10^{-3}$, 53.27\% when $\lambda=10^{-2}$, respectively).

\paragraph{The effect of NMS confidence thresholds}
As mentioned in Section \ref{chap3_2}, the NMS will be used to filter the outputs to remove redundant predicted bounding boxes. We compare the effect of different confidence thresholds of NMS on attacks in this part. As we can observe from Table \ref{tab4}, the mASR performance is optimal when the threshold is 0.25, which is the default threshold of yolov3. When the threshold increases, the mASR will decrease, primarily due to the exclusion of certain targets. For instance, when the threshold is set to 0.35, targets with confidence scores between 0.25 and 0.35 are discarded, even though they are meaningful positive targets. Conversely, when the threshold decreases, the mASR decreases slightly, possibly due to the introduction of negative targets that disrupt the optimization direction.

\begin{table}
    \centering
    \caption{The mASR performance for hyper-parameter $\lambda$}\label{tab5}
    \begin{tabular}{cccccc}
      \toprule % from booktabs package
      $\lambda$ & $0$ & $10^{-5}$ & $10^{-4}$ & $10^{-3}$ & $10^{-2}$\\
      \midrule % from booktabs package 
	  mASR & 90.98 & \textbf{91.83} & 91.70 & 78.64 & 53.27 \\
      \bottomrule % from booktabs package
    \end{tabular}
\end{table}

\begin{table}
    \centering
    \caption{The mASR performance for thresholds of NMS}\label{tab4}
    \begin{tabular}{cccccc}
      \toprule % from booktabs package
      threshold & 0.05 & 0.15 & 0.25 & 0.35 & 0.45\\
      \midrule % from booktabs package 
	  mASR & 90.46 & 89.21 & \textbf{91.70} & 86.53 & 85.70\\
      \bottomrule % from booktabs package
    \end{tabular}
\end{table}

\subsection{Interpretability of the Adversarial Camouflage}\label{chap4_7}
In this part, we adopt model attention visualization to conduct qualitative analysis to further validate our MFA attack's effectiveness. The regions the models pay attention to can be deemed discriminative. Because the vehicle with adversarial camouflage texture will not be detected correctly when it is sent to the detector, its attention maps cannot be obtained; we therefore follow \citet{5} and \citet{8} to generate the attention maps of the vehicle from different viewpoints on the ResNet50~\citep{43} model using a commonly used model-agnostic attention map technique~\citep{28}. Figure \ref{fig8} shows the original vehicle, virtual adversarial vehicle, and their attention maps for the ``car'' class label. We can observe that the MFA attack distracts the attention maps from the vehicle body to other uncamouflaged regions, suggesting that the model's decision evidence has been changed. 

\begin{figure}
	\centering
	\includegraphics[width=\linewidth]{Figure/Figure8}   
	\caption{The attention maps of the vehicle under different view angles before and after our attack in the digital space.}
	\label{fig8}
\end{figure} 


\section{CONCLUSION AND FUTURE WORK}\label{chap5}
In this paper, we investigate the problem of generating robust adversarial examples in the physical world for object detectors. We propose the Multi-layer Feature-aware Attack (MFA) method, which improves the attacking ability and transferability of adversarial attacks by distorting important features at different layers. Specifically, we first use location and category information to assign attribution scores to different feature layers and utilize their amplitude and polarity to weight each feature. Finally, we optimize the generation problem of the camouflage texture in the opposite direction of the object detection. Comprehensive experiments confirm the superiority of our method. 

In the future, we are interested in investigating the attack abilities of our adversarial camouflage using a real vehicle in a real-world scenario; we could paint our camouflage on a real-world vehicle by projection or 3D printing. Additionally, we would like to investigate how to make the appearance of our generated camouflage more visually unsuspicious and natural.


% References
\bibliography{uai2023-template}
\end{document}
