% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} 
%% In your camera-ready you should use the 'accepted' parameter. This shows the authors and how an accepted paper will look like. The footer is 'Acccepted for X'. In the final version, the proceedings chairs will add the page numbers for PMLR and the final footer will be 'Proceedings of X'.
%
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% add
% \usepackage{amsmath}
\usepackage{booktabs, multirow, tabularx}
% \usepackage{subcaption}
% \usepackage[switch]{lineno}
\usepackage{boldline}
\usepackage{cellspace}
\usepackage{makecell}
% \usepackage{setspace}
\usepackage{graphicx}
\usepackage{xspace}
\usepackage{algorithm}
\usepackage{algorithmicx}
\usepackage{algpseudocode}
\usepackage{subcaption}
% \usepackage{newfloat}
% \usepackage{listings}
\usepackage{colortbl}
% \usepackage{enumitem}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

%% addnew
\newcommand{\origWord}[1]{\textcolor{green}{#1}}
\newcommand{\advAttack}[2]{\textcolor{red}{\textbf{#1}} (\origWord{#2})}

\makeatletter
\DeclareRobustCommand\onedot{\futurelet\@let@token\@onedot}
\def\@onedot{\ifx\@let@token.\else.\null\fi\xspace}
\newcommand{\name}[0]{RS\&V\xspace}
\def\eg{\emph{e.g}\onedot} \def\Eg{\emph{E.g}\onedot}
\def\ie{\emph{i.e}\onedot} \def\Ie{\emph{I.e}\onedot}
\def\cf{\emph{c.f}\onedot} \def\Cf{\emph{C.f}\onedot}
\def\etc{\emph{etc}\onedot} \def\vs{\emph{vs}\onedot}
\def\wrt{w.r.t\onedot} \def\dof{d.o.f\onedot}
\def\etal{\emph{et al}\onedot}
\def\st{\emph{s.t}\onedot}
\makeatother
\newcommand\setrow[1]{\gdef\rowmac{#1}#1\ignorespaces}
\newcommand\clearrow{\global\let\rowmac\relax}
\clearrow
% color for table cells
\definecolor{Gray}{gray}{0.95}
%\newcolumntype{g}{>{\columncolor{Gray}}c}

% color for the text
\newcommand{\Red}[1]{\textcolor{red}{\textbf{#1}}}
% \newcommand{\Red}[1]{\colorbox{red}{\underline{#1}}}
\newcommand{\Blue}[1]{\textcolor{blue}{\textbf{#1}}}
\newcommand{\Green}[1]{\textcolor{green}{#1}}
\newtheorem{definition}{Definition}

% argmax and argmin
\DeclareMathOperator*{\argmax}{argmax}
\DeclareMathOperator*{\argmin}{argmin}




\title{Detecting Textual Adversarial Examples through\\ Randomized Substitution and Vote}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
% 
% Important: in case of equal contributions, we strongly recommend to NOT show it in this part of the paper, but rather describe it in the appropriate section at the end of the paper "Author Contribution", where you have more space to describe how each author contributed.
%
% Add authors
% Remember to use the order convention "First/Given name" "Last/Family name", e.g. John Smith, Hanako Yamada, Marco Rossi, Wei Zhang
\author[ ]{Xiaosen Wang\thanks{Equal Contribution}}
\author[ ]{Yifeng Xiong\footnote[1]{}}
\author[ ]{Kun He\thanks{Corresponding author.}}
% Add affiliations after the authors
\affil[ ]{%
    School of Computer Science and Technology\\
    Huazhong University of Science and Technology\\
    Wuhan\\
    China 
}
\affil[ ]{
\texttt{ \{xiaosen,xiongyf,brooklet60\}@hust.edu.cn}
}
  \begin{document}
\maketitle

\begin{abstract}
A line of work has shown that natural text processing models are vulnerable to adversarial examples. Correspondingly, various defense methods are proposed to mitigate the threat of textual adversarial examples, \eg, adversarial training, input transformations, detection, \etc. In this work, we treat the optimization process for synonym substitution based textual adversarial attacks as a specific sequence of word replacement, in which each word mutually influences other words. We identify that we could destroy such mutual interaction and eliminate the adversarial perturbation by randomly substituting a word with its synonyms. Based on this observation, we propose a novel textual adversarial example detection method, termed \textit{Randomized Substitution and Vote} (\name), which votes the prediction label by accumulating the logits of $k$ samples generated by randomly substituting the words in the input text with synonyms. The proposed \name is generally applicable to any existing neural networks without modification on the architecture or extra training, and it is orthogonal to prior work on making the classification network itself more robust. Empirical evaluations on three benchmark datasets demonstrate that our \name could detect the textual adversarial examples more successfully than the existing detection methods while maintaining the high classification accuracy on benign samples.
\end{abstract}

\section{Introduction}
Deep Neural Networks (DNNs) are known to be vulnerable to adversarial examples~\citep{szegedy2014intriguing,goodfellow2015explaining,alzantot2018generating,wang2021enhancing}, in which human-imperceptible modifications on the benign samples could mislead the model prediction. More seriously, adversarial examples have been found in a variety of deep learning tasks, including Computer Vision (CV)~\citep{goodfellow2015explaining,madry2017towards} and Natural Language Processing (NLP)~\citep{papernot2016crafting,alzantot2018generating}, leading to 
great threats to the security of numerous real-world applications, \eg, spam filtering, malware detection, \etc. Consequently, it has gained broad attention on generating or defending adversarial examples for various deep learning tasks.

In the field of NLP, numerous adversarial attack methods have been proposed recently, which could be roughly split into three categories, namely character-level attacks~\citep{gao2018black,ebrahimi2017hotflip,li2019textbugger}, word-level attacks~\citep{papernot2016crafting,alzantot2018generating,ren2019generating,garg2020bae,zang2020word,wang2021Adversarial,maheshwary2021generating}, and sentence-level attacks~\citep{liang2017deep,ribeiro2018semantically,zhang2019paws,wang2019t3}. Among these methods, synonym substitution based attacks that belong to word-level attacks, are widely investigated because they naturally satisfy the lexical, grammatical and semantic constraints of natural texts. In this work, we focus on effectively detecting the textual adversarial examples generated by synonym substitution based attacks.


To mitigate the threat of textual adversarial examples, some researchers propose to enhance the model to resist adversarial examples, such as input pre-processing~\citep{wang2021Natural,zhou2021defense}, adversarial training~\citep{wang2021Adversarial,dong2021towards}, certified defense~\citep{jia2019certified,huang2019achieving,shi2020robustness}, \etc. Meanwhile, another line of work~\citep{zhou2019learning,mozes2021frequency} focuses on detecting the adversarial examples so that we could further reject the detected adversarial examples or recover the benign samples before feeding them into the models. Detection methods usually utilize an extra detection module that is generally applicable to any target model and is very popular in industrial applications~\citep{SafetyNet} due to the minor decay on clean accuracy. Currently, however, there exists little attention on textual adversarial example detection. Moreover, the two lines of defense are complementary to each other and could be integrated together to further enhance the model robustness.



In this work, we aim to improve the adversarial example detection accuracy for text classification tasks against synonym substitution based attacks. We treat the optimization process for synonym substitution based attacks as a specific sequence for word replacement, denoted as replacement sequence, in which each word mutually influences other words. Inspired by the existing input pre-processing based defenses~\citep{wang2021Natural,zhou2021defense}, we observe that randomized synonym substitution could destroy such mutual interaction and eliminate the adversarial perturbation with high probability. To this end, we propose a novel textual adversarial example detection method, termed \textit{Randomized Substitution and Vote} (\name). Specifically, given an input text $x$, we first generate a set of samples $\{x_1, x_2,\cdots, x_k\}$ by randomly substituting some words $w_i\in x$ with their arbitrary synonyms $\hat{w}_i$. Then we accumulate the logits output of the $k$ processed samples and treat $x$ as an adversarial example if the prediction for $x$ is inconsistent with the prediction voted by the $k$ samples. 

Our main contributions are as follows:

\begin{itemize}[itemsep=2pt,topsep=0pt,parsep=0pt]
    \item We identify that randomized synonym substitution could destroy the mutual interaction among the words in the replacement sequence.
    \item We propose a simple yet effective adversarial example detection method \name against synonym substitution based attack, which is rarely investigated but very significant in real-world applications.
    \item Empirical evaluations demonstrate that \name outperforms the state-of-the-art baselines no matter whether the attackers could access the detection module and maintain the high accuracy on benign samples.
    \item \name utilizes the model generalization and robustness to randomized synonym substitution to detect the adversarial examples without any additional training or modification on the architecture and is generally applicable to any existing neural networks.
\end{itemize}



\section{Related Work}
Recently, adversarial examples for text classification tasks, \ie textual adversarial examples, have attracted great interest, especially on textual adversarial attacks ~\citep{papernot2016crafting,ebrahimi2017hotflip,ribeiro2018semantically,alzantot2018generating,ren2019generating,zang2020word}, followed by an increasing attention on textual adversarial defenses~\citep{jia2019certified,liu2020joint,jones2020robust,wang2021Adversarial,mozes2021frequency,Yang2022Robust}.


\textbf{Textual Adversarial Attacks.} According to the different strategies to generate adversarial examples, we could roughly divide the existing textual adversarial attacks into three categories: 1) Character-level attacks usually swap, flip, remove or insert characters in the input text to generate the adversary, \eg, DeepWordBug~\citep{gao2018black}, HotFlip~\citep{ebrahimi2017hotflip}, TextBugger~\citep{li2019textbugger}. 2) Word-level attacks replace the words with other unrelated words in the dictionary~\citep{papernot2016crafting} or only substitute the words with its synonyms~\citep{alzantot2018generating}. The latter kind of word-level attacks could preserve the semantic consistency and have been widely accepted~\citep{alzantot2018generating,ren2019generating,li2020bert,zang2020word,jin2020bert,wang2021Natural,maheshwary2021generating,guo2021gradient,yuan2021transferability}. 3) Sentence-level attacks insert a parenthesis~\citep{liang2017deep,wang2019t3} or rephrase the original sentence without changing the original meaning~\citep{iyyer2018adversarial,zhang2019paws}. Some works~\citep{liang2017deep,li2019textbugger} also utilize both character-level and word-level attacks to achieve better attack performance.

\textbf{Textual Adversarial Defenses.} To mitigate the threat of textual adversarial examples, various defense methods have also been proposed, such as spell checker~\citep{pruthi2019combating}, input pre-processing~\citep{wang2021Natural,zhou2021defense}, adversarial training~\citep{wang2021Adversarial,dong2021towards,SAE}, certified defense~\citep{jia2019certified,huang2019achieving,shi2020robustness} and adversarial example detection~\citep{zhou2019learning,mozes2021frequency}. For the detection methods, \citet{zhou2019learning} propose \textit{Learning to \underline{Dis}criminate \underline{P}erturbation} (DISP), which trains a perturbation discriminator and embedding estimator to detect the adversarial examples and recover the benign samples. \citet{mozes2021frequency} propose \textit{Frequency-Guided Word Substitutions} (FGWS) that substitutes all the low frequency words in the input text with the most frequent synonyms to eliminate the adversarial perturbation. Compared with the defense methods that enhance the model robustness, detection methods usually adopt an extra detection module which leads to little decay on clean accuracy, making it popular in real-world applications.

In this work, we propose a simple yet effective method, called Randomized Substitution and Vote (\name), to detect the synonym substitution based textual adversarial examples without modifying the architecture or training process, which is generally applicable to any model. 


\section{Methodology}
In this section, we first introduce the preliminary of textual adversarial attacks and our motivation. Then we give a detailed description on the proposed Randomized Substitution and Vote (\name) method. Code is available at \url{https://github.com/JHL-HUST/RSV}.

\subsection{Preliminary}
Let $\mathcal{X}$ be the set of input texts and $\mathcal{Y}=\{y_1, y_2, \cdots, y_N\}$ be the set of corresponding labels. 
Given an input text $x=\langle w_1, w_2, \cdots, w_n \rangle \in \mathcal{X}$ with the ground-truth label $y_{true} \in \mathcal{Y}$, a well-trained natural language model $f:\mathcal{X}\to \mathcal{Y}$ predicts its label $\bar{y}=y_{true}$ with high probability, using the maximum posterior probability:
\begin{equation}
    \bar{y} = \mathop{\arg\max}_{y_i \in \mathcal{Y}}  f(y_i|x).
\end{equation}
The attacker perturbs the natural text $x$ slightly to generate a textual adversarial example $x^{adv}$ which misleads the model prediction:
\begin{equation}
    \mathop{\arg\max}_{y_i \in \mathcal{Y}} f(y_i|x^{adv}) \neq y_{true}.
\end{equation}
In this work, we focus on detecting the synonym substitution based adversarial examples generated by substituting a few words, in which each word $w_i \in x$ is substituted with one of its synonym $\hat{w}_i$, so as to maintain the semantic consistency while attacking. 


For the task of adversarial example detection, given an input text $x$, the detector $D$ should determine whether $x$ is benign sample or adversarial example. A stronger detector would also contain a corrector $E$, which could restore the correct label for the input adversarial example:
\begin{equation}
    \bar{y}_{restore} =\left\{
        \begin{array}{ll}
        \bar{y} &  \text{if } D(x)=\text{False};\\
        E(x)     &  \text{if } D(x)=\text{True}. \\
        \end{array} 
        \right.
\end{equation}

\input{figs/unk}
\input{figs/frame}



\subsection{Motivation}
\label{sec:motivation}
The optimization process of synonym substitution based textual adversarial attacks could be regarded as searching a specific sequence for word replacement, termed replacement sequence, in which the words mutually influence each other and contribute together to mislead the target classifier. Intuitively, we postulate that we could eliminate the perturbation if we could successfully break the mutual interaction of the words in the replacement sequence. 
    % To verify this hypothesis, we conduct a validation experiment as follows.
 
    To validate our hypothesis, %we utilize the Textfooler attack~\citep{jin2020bert} to generate adversarial examples on a Word-CNN model ~\cite{kim2014convolutional}  for $1,000$ randomly sampled correctly classified texts from the AG's News test set~\cite{Zhang2015Dataset}.
    we first generate adversarial examples using the Textfooler attack~\citep{jin2020bert} on $1,000$ randomly sampled texts from the AG's News test set~\citep{Zhang2015Dataset} that are correctly classified by Word-CNN~\citep{kim2014convolutional}.
    To break the interaction among words in the replacement sequence, we first randomly mask words in the benign samples or adversarial examples as unknown (``UNK'') token, denoted as \textit{UNK Benign} or \textit{UNK Adversarial}, with different rates and feed these processed texts to the model. As shown in Figure~\ref{fig:unk}, the classification accuracy is still over 90\% when we mask 40\% words in the benign text, showing the stability and robustness of model to such randomized mask. In contrast, the classification accuracy on adversarial examples increases remarkably when we mask more words and we could recover 50\% adversarial examples when masking 40\% words. This validates our hypothesis that we could eliminate the adversarial perturbation if we successfully break the mutual interaction of words in the replacement sequence.
 
    However, since we randomly substitute the meaningful words with meaningless ``UNK'' token, the semantic meaning of the text would be destroyed significantly when masking a certain number of words, leading to poor performance on benign samples and limited improvement on adversarial examples. Inspired by the existing synonym substitution based attacks, we attempt to randomly substitute the words with its synonyms to break the interaction of words without destroying the original semantic meaning. As shown in Figure~\ref{fig:unk}, we find that randomly substituting words with its synonyms in the benign samples or adversarial examples, denoted as \textit{SYN Benign} or \textit{SYN Adversarial}, could consistently and significantly improve the robust accuracy against adversarial examples at the same time maintaining the high accuracy on benign samples under various substitution rates, which further validates our hypothesis.
    
\par 

\input{algorithm}
    Based on the above observation, we propose a novel textual adversarial example detection method called \name, which randomly substitutes the words in the input text with their synonyms to effectively detect the adversarial examples and restore the correct label for the input adversarial example. 

\subsection{Randomized Substitution and Vote}
Motivated by the observation that randomly substituting the words with their synonyms could eliminate the adversarial perturbation remarkably while maintaining the high clean accuracy, we propose a novel adversarial example detection method called Randomized Substitution and Vote (\name), to detect textual adversarial examples. As shown in Figure~\ref{fig:RS}, given an input text $x$, \name contains two main stages, \ie randomized substitution and vote \& detection, for detecting whether $x$ is an adversarial example and restoring its correct label if it is adversarial.

\textbf{Randomized Substitution.} Given an input text $x=\langle w_1, w_2, \cdots w_n\rangle$, we first construct the synonym set $\mathcal{S}(w_i)=\{\hat{w}_i^1, \hat{w}_i^2, \cdots, \hat{w}_i^k\}$ for each word $w_i \in x$ by combining the synonyms from WordNet~\citep{Miller1995WordNet} and neighbor words within a given Euclidean distance in the GloVe embedding space post-processed by counter-fitting~\citep{mrkvsic2016counter}. Then we randomly sample $n\cdot p$ words \{$w_i \in x$\} and substitute each selected word with its arbitrary synonym $\hat{w}_i^j \in \mathcal{S}(w_i)$ to break the interaction among words in the replacement sequence for adversarial examples. Moreover, we find that substituting some high-frequency words (\eg, ``a'', ``the'', ``hello'', \etc) is not only useless to mitigate the adversarial effect, but also has negative impact on classifying benign samples. 
Inspired by the Term Frequency–Inverse Document Frequency (TF-IDF) technique, we set the words with the top $s$ frequency among the training set as stopwords, where $s$ is a hyper-parameter, and ignore these words for randomized sampling. Such randomized substitution would repeat for $k$ times to generate $k$ different converted texts $\{x_1, x_2, \cdots, x_k\}$.

\textbf{Vote \& Detection.} Although we have shown that randomized substitution could remarkably mitigate the adversarial effect with an acceptable decay on the clean accuracy, we seek to further improve the recovery ability of randomized substitution and decrease the negative impact on benign samples. To eliminate the variance introduced by randomized substitution and stabilize the detection, we feed $k$ various converted texts $\{x_1, x_2, \cdots, x_k\}$ and accumulate the logits output on these texts to vote for the final label $\argmax \sum_{i=1}^k f(x_i)$. Note that the method is different from the naive approach depicted in Section~\ref{sec:motivation}. Our \name would detect the input text $x$ as an adversarial example if the voted \name label is inconsistent with the prediction label $\argmax f(x)$ and output the \name label as the prediction result for $x$.

\input{tabs/dataset}

In summary, as depicted in Figure~\ref{fig:RS}, the proposed \name first generates multiple samples by randomly substituting words in text with synonyms and accumulates the logits of these samples to vote the final label. If the voted label is not consistent with the prediction label for the input text, \name 
would treat the input text as adversarial example and output the voted label. The overall algorithm of \name is summarized in Algorithm \ref{algorithm1}.    


\section{Experiments}
To validate the effectiveness of the proposed \name, we conduct extensive evaluations on three benchmark datasets for three models against five popular adversarial attacks. In this section, we first specify the experimental setup, then we compare the detection performance of \name with two detection baselines, and demonstrate that \name could achieve better detection performance in two settings and maintain the high accuracy on benign samples. 
Finally, we investigate the impact of three hyper-parameters, \ie substitution rate $p$, number of votes $k$, and stopword selection portion $s$.




\subsection{Experimental Setup}
    % \textbf{Datasets and Models.} 
    % We adopt three widely investigated benchmark datasets, \ie IMDB~\citep{imdb}, AG's News, Yahoo! Answers~\citep{Zhang2015Dataset}, whose details are summarized in Table~\ref{tab:dataset}. To validate the generalization of \name, we consider three popular deep neural models that exhibit state-of-the-art performance on text classification tasks, including Word-CNN~\citep{kim2014convolutional} and two pre-trained language models, \ie BERT~\citep{devlin-etal-2019-bert} and RoBERTa~\citep{RoBERTa}.
    
    \textbf{Datasets and Models.} We adopt three widely investigated benchmark datasets, \ie IMDB~\citep{imdb}, AG's News, Yahoo! Answers~\citep{Zhang2015Dataset}, including sentiment analysis and news or topic classification. Details about the datasets are summarized in Table~\ref{tab:dataset}. To validate the generalization to model architectures of our \name, we adopt several popular deep learning models that exhibit state-of-the-art performance on text classification tasks, including Word-CNN~\citep{kim2014convolutional} and two pre-trained language models, BERT~\citep{devlin-etal-2019-bert} and RoBERTa~\citep{RoBERTa}.
    
    \textbf{Evaluation Setup.}  We consider static attack evaluation (SAE) and targeted attack evaluation (TAE) \citep{SAE}. In SAE setting, the attacker does not know the detection module and generates adversarial examples on the original model. In TAE setting, the attacker could access and attack the detection module simultaneously when attacking target model. We evaluate the detection performance against various attacks, \ie GA~\citep{alzantot2018generating}, PWWS~\citep{ren2019generating}, PSO~\citep{zang2020word}, Textfooler~\citep{jin2020bert} and HLA~\citep{maheshwary2021generating}, using F1 score and detection accuracy, \ie classification accuracy on samples recovered by detection modules.
    %evaluate the detection performance in a general setting that the attacker does not know there is a detection module, as well as a more rigorous setting that the attacker could access and attack the detection module simultaneously when attacking the target model.
    % \ADD{SAE}}
  
    % \DEL{We report the detection accuracy and F1 score on the adversarial examples generated by various attacks, \ie GA~\cite{alzantot2018generating}, PWWS~\cite{ren2019generating}, PSO~\cite{zang2020word}, Textfooler~\cite{jin2020bert} and HLA~\cite{maheshwary2021generating}.}
    % \ADD{ We test the detect methods on the adversarial examples generated by various attacks, \ie GA~\cite{alzantot2018generating}, PWWS~\cite{ren2019generating}, PSO~\cite{zang2020word}, Textfooler~\cite{jin2020bert} and HLA~\cite{maheshwary2021generating}. We evaluate the performance of detector by F1 scores and evaluate the overall end-to-end performance by classification accuracy     that the detector recover.}
    
     \input{tabs/detection}
    \textbf{Baselines.}  We compare our method with normally trained models and two recently proposed detection methods designed for synonym substitution based attacks:
    \begin{itemize}[itemsep=2pt,topsep=0pt,parsep=0pt]
        \item DISP~\citep{zhou2019learning} trains a perturbation discriminator to identify the perturbed tokens and an embedding estimator to reconstruct the original text.
        \item FGWS~\citep{mozes2021frequency} replaces the low-frequency words with their most frequent synonyms in the dictionary to eliminate the adversarial perturbation.
    \end{itemize}
    
    
    \textbf{Hyperparameters.} For our \name, the number of votes is set to 25, the substitution rate $p$ is set to $80\%$ on IMDB and $60\%$ on AG's News as well as Yahoo! Answers based on the text length, and the stopword selection portion $s$ is set to $2\%$. 
    %We construct the synonym set containing the synonyms from WordNet~\citep{Miller1995WordNet} and at most 6 closest words on the GloVe vectors post-processed by counter-fitting~\citep{mrkvsic2016counter} evaluated by the Euclidean distance. We adopt BERT as the base model for DISP and the same strategy to tune the hyper-parameters using the validation set on various models for FGWS.
    % \ADD{more implementation details can be viewed in the appendix }.
    We mix up the synonyms from WordNet~\citep{Miller1995WordNet} and embedding space as synonyms set, where the embedding space synonyms were calculated by ranking the Euclidean distance between words based on GloVe vectors processed by counter-fitting~\citep{mrkvsic2016counter}.
    We select at most 6 words on embedding space to keep the diversity of the synonyms set, and the portion $s$ for stopwords is set to 5 for IMDB and 2 for AG's News and Yahoo base on the frequency drops,
    For the baselines, we use the same way in the original paper to tune hyper-parameters on the validation set.
\input{tabs/sample}

%\subsection{Evaluations on Detection in General Setting}
\subsection{Evaluation in the SAE Setting}
\label{sec:eva_detection}
To validate the effectiveness of \name, we evaluate the detection accuracy, the classification accuracy on the recovered samples by the detection methods and F1 score against adversarial attacks on three datasets, and do comparison with DISP and FGWS. Due to the high computational cost of generating adversarial examples for the attack methods, we randomly select 1,000 samples on each dataset and adopt the attacks to generate adversarial examples for each model. The results are summarized in Table~\ref{tab:detection}.

We could observe that \name generally exhibits superior detection performance against various adversarial attacks with high classification accuracy on benign samples. Specifically, all the normally trained models achieve the lowest classification accuracy on three datasets against five attacks. Compared with two detection baselines, \name consistently achieve much better detection accuracy and F1 score on three datasets for various models against five adversarial attacks, while maintaining the high classification accuracy on benign samples. For instance, on Yahoo! Answers dataset for RoBERTa model, \name exhibits even better (1.3\% higher) classification accuracy on benign samples than normally trained model and averagely outperforms DISP and FGWS with a clear margin of $16.1\%$ and $3.0\%$, respectively. This convincingly validates the high effectiveness of \name. 


To further gain insight on the performance improvement of \name, we present a randomly sampled adversarial example for Word-CNN from AG's News generated by Textfooler and the recovered examples for three detection methods in Table~\ref{tab:sample}. As we can see, 
%either DISP which predicts and recovers the perturbed word or FGWS or \name which substitute the word with synonyms using different strategies, could not really recover the benign sample as we expect, 
none of DISP, FGWS, and \name could really recover the benign sample as we expect, but all of them could lead to correct classification result. The results also show the fragility of textual adversarial example and might inspire more powerful detection and defense methods by pre-processing the input text without extra training or modification on the architecture in the future.    
    

\input{tabs/defense}


%\subsection{Evaluation on Detection in More Rigorous Setting}
\subsection{Evaluation in TAE Setting}
The SAE setting, where the attacker does not know there is a detection module, is the only case considered by the existing textual adversarial example detection methods. 
To further validate the effectiveness of \name, we consider a more rigorous setting of TAE, in which the attacker could access and attack the detection module to generate adversarial examples. Specifically, we generate adversarial examples on AG's News for Word-CNN in the TAE setting. %Due to the high computational cost of inferring the extra models adopted in DISP for the attack baselines, here we do not consider DISP as the baseline.
Due to the high computational cost of DISP in inferring the extra models for the attacks, here we do not consider DISP as the detection baseline.


As shown in Table~\ref{tab:defense}, FGWS exhibits little effectiveness against various attacks under the rigorous setting when compared with the normally trained model. One possible reason for the poor effectiveness of FGWS would be that the attacker could easily bypass the low-frequency words and generate adversarial examples by just using the high-frequency words. In contrast, \name could maintain the high effectiveness against various attacks. We postulate that the uncertainty of \name at each iteration makes it harder for the attacker to find good local minima to craft adversarial examples, leading to high robustness of \name. This further verifies the high effectiveness and superiority of \name.

% \subsection{Comparisons}
% \ADD{}

\input{figs/ablation}
\subsection{Ablation Study}
We also conduct a series of ablation experiments to investigate the impact of hyper-parameters in \name, namely the substitution rate $p$ in the randomized substitution, the number of votes $k$ during the vote process, and the stopword selection portion $s$. All the experiments are conducted on AG's News for Word-CNN against Textfooler using 1,000 randomly sampled texts that are correctly classified. To eliminate the variance of randomness, we repeat the experiments five times and report the average detection accuracy. The default values are  $p=60\%$, $k=25$ and $s=2\%$.

\textbf{On the effectiveness of substitution rate.} In Figure~\ref{fig:ablation:sub_ratio}, we study the impact of substitution rate $p$ on the detection accuracy. When $p=0$, \name degenerates to the normally trained model without the detection module, which exhibits high classification accuracy on benign samples but low detection accuracy on adversarial examples. With the increment on the value of $p$, more words in the input text would be substituted, but the accuracy on the benign sample is stable with slight decay, indicating the model's high robustness against such randomized synonym substitution. In contrast, the detection accuracy on adversarial examples increases significantly when we increase the value of $p$ till $p=60\%$, which is intuitive that replacing more words in the input text with synonyms would be more likely to eliminate the adversarial perturbation. Thus, we set $p=60\%$ in the main experiments.

\textbf{On the effectiveness of number of votes.} %Then we study the impact of vote number $k$ on the detection accuracy.
As shown in Figure~\ref{fig:ablation:voting_num}, when the number of votes $k=1$, there would be only a single input text and the vote cannot make any difference, hence, \name exhibits the lowest accuracy on either benign samples or adversarial examples. %When the value of $k$ is increased, 
With the increment on the value of $k$, the accuracy on both benign samples and adversarial examples increases significantly, especially for the detection accuracy. With a large value of $k \geq 10$, \name maintains high and stable performance, supporting that our vote could eliminate the variance of randomized substitution and stabilize the detection process. In the main experiments, we simply select a large value of $k = 25$ on all datasets.


\textbf{On the effectiveness of stopword selection portion.} %Finally, 
We continue to investigate the impact of the stopword selection portion $s$ on the detection accuracy. As illustrated in Figure~\ref{fig:ablation:stopwords}, the classification accuracy on benign samples is generally stable for various values of $s$, as our vote strategy has mitigated the variance introduced by randomness, leading to high and stable performance. The detection accuracy, however, increases when $s<2\%$ and decreases significantly when more words in the input text cannot be substituted and thus could not eliminate the adversarial perturbation effectively. In the main experiments, we set $s=2\%$ for better performance.

In summary, the substitution rate $p$, number of votes $k$ and stopword selection portion $s$ significantly influence the performance when they are small ($p \leq 60\%$, $k \leq 10$ or $s\leq2\%$), while the performance becomes stable for larger $p$ and $k$ but decreases for larger $s$. In our experiments, we adopt $p=60\%$, $k=25$ and $s=2\%$ for high and stable performance.

\section{Conclusion}
In this work, we identify that randomized synonym substitution could destroy the mutual interaction among words in the replacement sequence for adversarial attacks. Based on this observation, we propose a novel adversarial example detection method called Randomized Substitution and Vote (\name) to effectively detect the textual adversarial examples. \name adopts randomized synonym substitution to eliminate the adversarial perturbation and utilizes the accumulated votes to mitigate the variance introduced by randomness. Our method is generally applicable to any neural models without additional training or modification on the models. Empirical evaluations demonstrate that \name could achieve better detection performance no matter whether the attacker could access the detection module than existing baselines, at the same time \name maintains the high accuracy on benign samples. Moreover, \name identifies the fragility of textual adversarial examples, which might inspire more defense and detection methods by pre-processing the input text without sacrificing the classification accuracy on clean data.

\begin{acknowledgements} 
    This work is supported by National Natural Science Foundation (62076105) and Hubei International Cooperation Foundation of China (2021EHB011).
\end{acknowledgements}

\bibliography{wang_244}

% \appendix
% % NOTE: necessary when ptmx or no mathfont class option is given
% \providecommand{\upGamma}{\Gamma}
% \providecommand{\uppi}{\pi}
% \section{Math font exposition}
% How math looks in equations is important:
% \begin{equation*}
%   F_{\alpha,\beta}^\eta(z) = \upGamma(\tfrac{3}{2}) \prod_{\ell=1}^\infty\eta \frac{z^\ell}{\ell} + \frac{1}{2\uppi}\int_{-\infty}^z\alpha \sum_{k=1}^\infty x^{\beta k}\mathrm{d}x.
% \end{equation*}
% However, one should not ignore how well math mixes with text:
% The frobble function \(f\) transforms zabbies \(z\) into yannies \(y\).
% It is a polynomial \(f(z)=\alpha z + \beta z^2\), where \(-n<\alpha<\beta/n\leq\gamma\), with \(\gamma\) a positive real number.

\end{document}
