% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example
\usepackage{xspace}
\newcommand{\name}[0]{FTML\xspace}
\usepackage{multirow}
\usepackage{booktabs}
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{bbm}
\usepackage{subfigure}
\usepackage{color}
\newcommand{\Blue}[1]{\textcolor{blue}{#1}}

\newcommand{\HK}[1]{\textcolor{red}{[HK: #1]}}

%\title{Fast Triplet Metric Learning against Word-level Adversarial Attacks}
%\title{Fast Triplet Metric Learning for Robust Textual Embedding}
\title{Robust Textual Embedding against Word-level Adversarial Attacks}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[ ]{Yichen Yang\thanks{Equal Contribution.}}
\author[ ]{Xiaosen Wang\footnote[1]{}}
\author[ ]{Kun He\thanks{Corresponding author.}}
% Add affiliations after the authors
\affil[ ]{%
    School of Computer Science and Technology\\
    Huazhong University of Science and Technology\\
    Wuhan\\
    China 
}
\affil[ ]{
\texttt{ \{yangyc,xiaosen,brooklet60\}@hust.edu.cn}

}
  \begin{document}
\maketitle

\begin{abstract}
  We attribute the vulnerability of natural language processing models to the fact that similar inputs are converted to dissimilar representations in the embedding space, leading to inconsistent outputs, and we propose a novel robust training method, termed \textit{Fast Triplet Metric Learning (\name)}. 
  Specifically, we argue that, for better robustness, the original sample should have a representation that is similar to those of its adversarial counterparts and distinct from those of other samples. To this end, we incorporate triplet metric learning into standard training to pull words closer to their positive samples (\textit{i.e.}, synonyms) and push away their negative samples (\textit{i.e.}, non-synonyms) in the embedding space. Extensive experiments demonstrate that \name can significantly improve model robustness against various advanced adversarial attacks while keeping competitive classification accuracy on original samples. Besides, our method is efficient as it only needs to adjust the embedding and introduces very little overhead to standard training. Our work shows the great potential of improving textual robustness through robust word embeddings.
\end{abstract}

%Keywords
%Adversarial learning, textual adversarial defense, robust textual embedding, triplet metric learning
%"Too Long; Didn't Read": a short sentence describing your paper:
%We propose a novel and fast adversarial defense method that adopts the triplet metric learning into the standard training to generate robust textual embeddings. 
%pull the words closer to their positive samples and push away their negative samples in the embedding space. 

\section{Introduction}

Deep learning models have achieved impressive performance on various machine learning tasks~\citep{krizhevsky2012imagenet,jacob2019bert}. However, recent studies have shown their vulnerability to adversarial examples crafted by exerting elaborate and imperceptible perturbations on the original input data. %~\citep{christian14intriguing,nicolas2016crafting}. 
Adversarial examples were first found 
in models for image classification tasks~\citep{christian14intriguing,goodfellow2014explaining}, and have recently attracted tremendous attention in various natural language processing (NLP) tasks~\citep{nicolas2016crafting,wang22randomized}.

For adversarial attacks on text classification, the character-level perturbations~\citep{ji18blackbox,javid18hotflip} could be easily eliminated by a spell checker~\citep{danish19combating}, while the sentence-level attacks~\citep{mohit18adversarial,wang2019t3} based on rephrasing can hardly preserve the original semantics. 
In contrast, the word-level attacks~\citep{shuhuai19generating,alzantot18generating,yuan20pso,wang21adversarial,rishabh21generating} based on synonym substitutions have become the most widely adopted approach, as they can craft adversarial examples with a high success rate while maintaining grammatical correctness and semantic consistency, which makes them more challenging to defend against.
%This work will focus on defending such challenging adversarial attacks based on synonym substitutions. 

To improve model robustness against word-level adversarial attacks, researchers have implemented adversarial training by crafting adversarial examples with their proposed attacks and incorporating them into the training set~\citep{shuhuai19generating, alzantot18generating}. However, due to the discrete input space of texts, adversarial attacks on texts take much longer than those on images and cannot generate sufficient adversarial examples at each training epoch. To address this issue, \citet{wang21adversarial} and \citet{dong21towards} design fast white-box adversarial attacks to speed up adversarial training, yet it still takes dozens of times longer than standard training. Certified defense methods based on interval bound propagation (IBP)~\citep{jia19certified,huang19achieving} can provide a theoretical lower bound on the robustness. However, due to the heavy computing overhead and strict constraints, it is hard to extend the certified methods to large-scale datasets and complex models such as BERT. Meanwhile, a recent work~\citep{wang21natural} maps synonyms to the same code to eliminate perturbations in the input space, which is efficient and easy to apply. 

% In this paper, by regarding the samples that can be mutually converted by synonym substitutions as similar samples, we attribute the vulnerability of deep models to diverse representations given similar inputs. Among similar inputs, the ones that ultimately lead to the wrong output of the model are adversarial examples. To ameliorate such vulnerability, we need to force the model to have the similar latent representations between original samples and their similar counterparts. The similar samples around an original input are the combinations of words and their synonyms, and they are hard to exhaust. Therefore, we turn to word-level solution. Specifically, we propose a triplet loss to force each word in the input text to attract its synonyms and repel non-synonyms in the embedding space. Then, any sample will be close to all its potential adversarial examples crafted by synonym substitutions and far away from other samples in the embedding space. Incorporating the triplet loss with standard training, we could train a robust model, which is able to eliminate the threat of adversarial examples while keeping good performance on original samples.

In this work, 
% we focus on defending against the synonym substitutions based textual adversarial attacks. 
%We define the texts obtained by synonym substitutions on the original input sample as \textit{similar samples} of the original input sample.
we regard the texts obtained by synonym substitutions on the original sample as \textit{similar samples}. We attribute the vulnerability of deep learning models to the dissimilar representations of similar input samples, among which those ultimately leading to wrong predictions are adversarial examples. 
%To ameliorate such vulnerability, 
For robustness, we propose \textit{Fast Triplet Metric Learning (\name)}
to force each word in the input text to be close to its synonyms and far away from its non-synonyms in the embedding space. In this way, any original sample has a representation that is similar to those of its adversarial counterparts crafted by synonym-substitution-based attacks and distinct from those of other samples in the embedding space. By incorporating triplet metric learning into standard training, we can train a model that is robust against synonym-substitution-based adversarial attacks.
% that is able to eliminate\HK{We did not eliminate, but alleviate ... considerably/significantly } the threat of adversarial examples.
%while keeping good performance on original samples.

Our main contributions are summarized as follows:
\begin{itemize}
    \item \name is the first adversarial defense approach that focuses on robust word embedding. It reveals a new perspective of enhancing the NLP model robustness, highlighting the difference of adversarial defense between the text domain and image domain. 
    \item In \name, we propose a general idea to learn a robust word embedding by pulling words closer to their synonyms while pushing non-synonyms further away in the embedding space. Such a general idea of how to learn a robust word embedding is important for defending against word-level adversarial attacks.
    \item Extensive experiments demonstrate that \name could significantly promote the model robustness against various advanced adversarial attacks while keeping high accuracy on original data across multiple datasets and models, including CNN, LSTM and BERT.
    \item \name is also efficient, because it introduces only a little overhead to the standard training for adjusting the embedding, facilitating its application to large-scale datasets and complex models.
\end{itemize}

\section{Related Work}
%This section provides a brief overview on  adversarial attack and defense methods for NLP models.
%\vspace{-0.5em}
\subsection{Adversarial Attacks}

According to different types of perturbation, existing adversarial attacks fall into three categories: (a) Character-level attacks usually utilize character insertion, modification, or deletion to craft adversarial perturbation~\citep{ji18blackbox,javid18hotflip,li19textbugger}. (b) Sentence-level attacks are based on rephrasing~\citep{mohit18adversarial,zhang19paws} or inserting short related sentences~\citep{wang2019t3,liang18deep}. (c) Word-level attacks substitute words in the input text with their synonyms according to different strategies~\citep{nicolas2016crafting,zhang19generating,wang21adversarial,meng20a,di20is}. Probability Weighted Word Saliency (PWWS)~\citep{shuhuai19generating} greedily determines the substitutions based on both the output probability change and the word saliency. Genetic Attack (GA)~\citep{alzantot18generating} and Particle Swarm Optimization (PSO)~\citep{yuan20pso} employ population-based optimization algorithms to search for adversarial examples. By only querying the predicted label, Hard Label Attack (HLA)~\citep{rishabh21generating} crafts adversarial examples through random substitutions and reduces the perturbation using a genetic algorithm. 

%\vspace{-0.5em}
\subsection{Adversarial Defenses}

Three categories of textual defense methods have been proposed to boost the model robustness against word-level attacks: adversarial training based methods, certified defense methods, and input transformation based methods. 

Adversarial training (AT) is one of the most popular defense methods~\citep{goodfellow2014explaining,madry18towards,alzantot18generating,shuhuai19generating,ivgi21achieving}. Adversarial Training with FGPM enhanced by Logit pairing (ATFL)~\citep{wang21adversarial} utilizes their proposed FGPM to generate adversarial examples and injects them into the training set. Adversarial Sparse Convex Combination (ASCC)~\citep{dong21towards} utilizes a weighted combination of the word vectors of synonyms to replace the original word vector, and then optimizes the weights by a gradient optimization to craft virtual adversarial examples in the embedding space for adversarial training. 

Certified defense methods provide the models with provable robustness to all possible word substitutions~\citep{zeng21certified, wang21certified,huang19achieving}. Given the interval of input,  \citet{jia19certified} utilize Interval Bound Propagation (IBP) to calculate the upper and lower bound of the output layer by layer, and then minimize the worst-case loss that any combination of the word substitutions can induce to achieve certified robustness. %However, due to the heavy computational overhead and strict constraints, IBP based methods are hard to scale to large datasets and complex models. 

Input transformation based methods eliminate the adversarial perturbations in the input space. \citet{wang21natural} propose the Synonym Encoding Method (SEM), which inserts an encoder before the input layer to map all the synonyms to the same code. SEM is efficient and easy to apply, as it involves neither the model architecture nor the training process. 
% Nevertheless, the coding erases subtle differences of the synonyms, causing some accuracy decay on original samples. 

% \input{figures/sem_ftml}

%Different from the input transformation methods such as SEM~\citep{wang21natural}, 
Different from SEM~\citep{wang21natural} that assigns the same code to synonyms in the input space, 
our work delicately captures the synonymous and non-synonymous relations of words by adjusting the distances among the words in the embedding space. We will provide a detailed analysis of the differences between SEM and our work in Section~\ref{subsec:further}. Besides, since we do not need to craft adversarial examples during the training process and only introduce a little extra computation, our method is easier to extend to large-scale datasets and complex models than typical adversarial training or certified defense methods.

% \vspace{-0.11em}
\section{Methodology}
This section first formulates the adversarial examples for text classification, then introduces our motivation, and finally describes the proposed defense method. %, Fast Triplet Metric Learning (\name).

% \vspace{-0.5em}
\subsection{Preliminary}
\label{subsection: task}
% \vspace{-0.25em}
Let \(\mathcal{X}\) denote the input space containing all the texts, and \(\mathcal{W}\) the dictionary containing all legal words in the input texts. Let \(\mathcal{Y}=\{y_1, y_2, \cdots, y_c\}\) denote the output space containing all the classification labels. Given an input text with \(n\) words \(x=\left \langle w_1, w_2, \cdots, w_n \right \rangle\) where \(w_i \in \mathcal{W}\), a classifier \(f: \mathcal{X} \rightarrow \mathcal{Y}\) firstly encodes \(x\) into the sequence of word vectors denoted as \(v(x) = \left \langle v(w_1), v(w_2), \cdots, v(w_n)\right \rangle\) in the embedding layer, and then feeds the representations \(v(x)\) into subsequent layers to output the prediction label \(f(x)\), which is expected to be the true label \(y\). 

Next, we define the synonyms and the adversarial examples based on synonym substitutions. Following the previous works~\citep{jia19certified, wang21natural,dong21towards}, we define the synonym set \(\mathcal{S}(w)\) as no more than \(k\) nearest words of \(w\) within the Euclidean distance \(\delta\) in the third-party GloVe embedding space post-processed by counter-fitting technique~\citep{mrksic16counter}. 

To achieve the semantic consistency, the attacker adopts synonym substitutions to craft the adversarial example \(x'=\left \langle w'_1, w'_2, \cdots, w'_n \right \rangle, w'_i \in \mathcal{S}(w_i) \cup \{ w_i \} \), such that: 
\begin{equation}
    f(x') \neq f(x) = y, \quad \text{s.t.}~~ R(x, x') \leq \epsilon,
\end{equation}
where \(R(x, x')\) is the distance metric of two texts in the input space, and $\epsilon$ is a small constant used to constrain the distance between \(x\) and \(x'\). For the word-level adversarial examples, 
we adopt the word substitution ratio as the distance metric \(R(x, x')\) in the input space:
\begin{equation}
    R(x, x') = \frac{1}{n} \sum_{i = 1}^n \mathbbm{1}_{w_i \neq w_i'}(w_i, w_i'),
\end{equation}
where \(\mathbbm{1}\) is the indicator function.
% Let \(S_\delta(w)\) denote the set consisting of all the words inside the \(\delta\)-ball around \(w\):
% \begin{equation*}
%     S_\delta(w) = \{\hat{w} \in \mathcal{W} | \ d(w,\hat{w}) \leq \delta\}.
% \end{equation*}
% Then, we can formulate \(S(w)\) based on \(S_\delta(w)\):
% \begin{gather*}
%     S(w) = \mathop{\arg\max}_{S \subseteq S_\delta(w)}|S| , \\s.t.~~|S(w)| \!\leq\! k, \forall \hat{w} \!\in\! S_\delta(w)\! - \! S(w), w' \!\in\! S(w), \\
%     d(w, w') \leq d(w, \hat{w}).
% \end{gather*}

% \textbf{Textual Adversarial Example}.
% To achieve the semantic consistency, the attacker adopts synonym substitutions to craft the adversarial example \(x'=\left \langle w'_1, w'_2, \cdots, w'_n \right \rangle, w'_i \in S(w_i) \cup \{ w_i \} \), where \(S(w_i) \) is the synonym set of word \(w_i\), such that \(f(x') \neq f(x) = y\). Besides, \(x'\) also needs to satisfy the substitution ratio \(R(x, x') = \frac{1}{n} \sum_{i = 1}^n \mathbbm{1}_{w_i \neq w_i'}(w_i, w_i') \leq r, 0\leq r \leq 1\), where \(\mathbbm{1}\) is the indicator function.


\subsection{Motivation}

Although the adversarial example \(x'\) is close to the original sample \(x\) in the input space, it succeeds in deceiving the classifier. We attribute the vulnerability of the classifier to the fact that \(x\) and \(x'\) have rather remote representations, leading to inconsistent outputs. In contrast, a robust classifier should be able to extract similar representations when fed with similar input samples.

% In order to train such a robust classifier, we could adopt the triplet metric learning to force the latent representations of the original input sample and its adversarial counterparts to be closer. Triplet loss is commonly used for machine learning algorithms where an input as the anchor sample is compared to its positive samples and negative samples, such as learning the face embeddings~\citep{schroff15facenet}. Given a triplet \(\langle a, p, n\rangle\), the triplet loss attempts to attract the positive sample \(p\) close to the anchor sample \(a\) and repel the negative sample \(n\) away from the anchor sample \(a\), which could be formulated as:
% \begin{equation}
% \label{eq:triplet}
%     T(a, n, p) = \mathop{max}\left\{d(a,p)-d(a,n)+\alpha, 0\right\},
% \end{equation}
% where \(d(\cdot, \cdot)\) denotes the distance metric between two samples, and \(\alpha\) is a margin between positive and negative pairs. 
% A straightforward approach is to regard the original text \(x\) as the anchor sample \(a\), the adversarial example \(x'\) as the positive sample \(p\), and other text sampled from the dataset as the negative sample \(n\). Then we use the \(p\)-norm distance between the latent representations of pairwise samples as the distance metric \(d(\cdot, \cdot)\).
% \begin{align*}
% \small
% \begin{split}
%     &T(x, \mathcal{P}, \mathcal{N}) = \\ &\mathop{max}\left \{\frac{1}{|\mathcal{P}|}\sum_{x'\in \mathcal{P}}d(x,x')-\frac{1}{|\mathcal{N}|}\sum_{\hat{x}\in \mathcal{N}}d(x,\hat{x})+\alpha, 0 \right \},
% \end{split}
% \end{align*}
% where \(\mathcal{P}\) denotes the set consisting of adversarial examples of \(x\), \(\mathcal{N}\) the set of randomly sampled texts other than \(x\) from dataset, and \(d(\cdot, \cdot)\) are instantiated as the \(p\)-norm distance between latent features such as logits between two samples.
% \begin{equation*}
%     d(a, b) = \|f_l(a) - f_l(b)\|_p,
% \end{equation*}
% where \(f_l(\cdot)\) denotes the latent features extracted by the model.

To this end, we adopt the triplet metric learning, which is commonly used for machine learning algorithms where an input as the anchor sample is compared to its positive samples and negative samples. 
%, such as learning the face embeddings~\citep{schroff15facenet}. 
A straightforward way is to regard the original text \(x\) as the anchor sample, the similar texts \(x'\) obtained by synonym substitutions  %\HK{check, why need to be adversarial?} 
as the positive sample, and other text \(\tilde{x}\) sampled from the dataset as the negative sample. Given a triplet \(\langle x, x', \tilde{x} \rangle\), the triplet loss is formulated as:
\begin{equation}
\label{eq:triplet}
    \mathcal{L}(x, x', \tilde{x}) = \max\left\{d(x,x')-d(x,\tilde{x})+\alpha, 0\right\},
\end{equation}
where \(d(\cdot, \cdot)\) denotes the distance metric between the representations of two samples, and \(\alpha\) is a margin between the distance of positive and negative pairs. 
Minimizing \(\mathcal{L}(x, x', \tilde{x})\) forces the model to extract a representation for the original sample \(x\) that is similar to those of its similar samples \(x'\) but dissimilar from those of other samples \(\tilde{x}\). 

% However, the above approach has a critical drawback concerning the positive samples, \textit{i.e.}, adversarial examples. Exploring the adversarial examples is a combinatorial optimization problem, and the time complexity grows exponentially with the text length.
However, exploring the positive samples is a combinatorial optimization problem, and the time complexity grows exponentially with the text length. To solve this issue, we turn to a word-level solution. The combinations of words are hard to exhaust, but the synonyms of words are limited. In the embedding space, if words are forced to be close to their synonyms and far away from non-synonyms, then any input text will have a representation that is similar to those of its potential adversarial examples crafted by synonym substitutions and distinct from those of other samples in the dataset. Thus, we propose Fast Triplet Metric Learning (\name), which uses a triplet loss to adjust the word embeddings for robustness. 

% However, the above approach has a critical drawback concerning the positive samples, \ie adversarial examples. There may exist many adversarial examples around the input text. Exploring all the adversarial examples is a combinatorial optimization problem, and the time complexity grows exponentially with the text length.
%Moreover, it would be inefficient to generate adversarial examples during the training process, making it unpractical to large-scale datasets.

% We turn to a word-level solution to address the aforementioned issues based on the following two reasons. First, the combinations of words are hard to exhaust, but the synonyms of words are limited. Second, the embedding layer of the classifier maps the input word sequence into the word vectors word by word, where each word is independent and not associated with other words. Intuitively, in the embedding space, if words are forced to be close to their synonyms and far away from non-synonyms, then any input text will have similar representations with all its potential adversarial examples crafted by synonym substitutions and distinguish their representations from that of other samples in the dataset. Thus, we propose to use a triplet loss in the word level for metric learning to implement this solution.


% \vspace{-0.5em}
\subsection{Fast Triplet Metric Learning}

We first formulate the word-level triplet loss, and then describe how to incorporate the triplet loss with standard training to train a robust model.

\textbf{Word-level Triplet Loss}. For two words \(w_a\) and \(w_b\), we use the \(\ell_p\)-norm distance of their word vectors in the embedding space as the distance metric between them:
\begin{equation}
\label{eq:distance}
    d(w_a, w_b) = \| v(w_a) - v(w_b) \|_p.
\end{equation}
In this work, we adopt the Euclidean distance, \textit{i.e.}, \(p=2\).
Then, we design the triplet loss for a word \(w\) as follows:
\begin{align}
\label{eq:wtl}
\small
    \begin{split}
        \mathcal{L}_{tr}(w, \mathcal{S}(w), \mathcal{N}) = 
        & \frac{1}{|\mathcal{S}(w)|}  \sum_{w' \in \mathcal{S}(w)} d(w, w') - \\
        &\frac{1}{|\mathcal{N}|} \sum_{\tilde{w} \in \mathcal{N}}  \min (d(w, \tilde{w}), \alpha) +  \alpha,
    \end{split}
\end{align}
where \(\mathcal{S}(w)\) denotes the synonym set of word \(w\), and \(\mathcal{N}\) the set containing words randomly sampled from the dictionary.  
The number of randomly sampled words is the same as the maximum number of synonyms, namely \(k\).
% This triplet loss minimizes the distances between a word and its synonyms, and hence shorten the distances between positive pairs\HK{changed, check}, and maximizes the distances between the word and its non-synonyms (negative pairs\HK{not among negative pairs, check}) in the embedding space. Note that in order to prevent the distance 
% %of negative pairs\HK{not correct}
% of the word to its negative samples 
% from increasing indefinitely, we set a hyper-parameter of upper limit \(\alpha\). It is easy to verify that Eq.  \ref{eq:wtl} is always greater than 0, and thus the outermost \(\mathop{max}\{\cdot, 0\}\) in Eq.~\ref{eq:triplet} could be removed.
We minimize \(\mathcal{L}_{tr}(w, \mathcal{S}(w), \mathcal{N})\) to decrease the distances between the word \(w\) and its synonyms (positive samples) and increase the distances between \(w\) and its non-synonyms (negative samples) in the embedding space. In addition, to prevent the distances of positive pairs and negative pairs from increasing simultaneously without limit, the negative pairs are no longer pushed away once their distance exceeds \(\alpha\).
%we set the margin hyper-parameter \(\alpha\) as the upper bound\HK{?, not really an upper bound} of the distance between negative pairs as well.

\textbf{Overall Training Objective}. Given a text \(x=\langle w_1, w_2, \cdots, w_n\rangle\) with the ground-truth class label \(y\) as the current input, we formulate the overall training objective as follows:
\begin{align}
\small
\label{eq:objective}
    \begin{split}
        \mathcal{L}(x,y)= \mathcal{L}_{ce}(f(x), y) + 
          \beta \cdot \frac{1}{n} \sum_{i=1}^n \mathcal{L}_{tr}(w_i, \mathcal{S}(w_i), \mathcal{N}_i),
    \end{split}
\end{align}
where \(\mathcal{L}_{ce}(\cdot, \cdot)\) denotes the cross-entropy loss, and \(\beta\) is a hyper-parameter to control the weight of the triplet loss. The first term \(\mathcal{L}_{ce}\) is used to train the layers subsequent to the first embedding layer for classification capability. The second term \(\mathcal{L}_{tr}\) is designed to train a robust word embedding, where each word in the input text is forced to be close to its synonyms and far away from its non-synonyms in the embedding space. In this way, we could train a robust model that has similar representations for similar input samples and distinguishes them from those of other samples in the dataset, alleviating the vulnerability exploited by attackers while maintaining good classification performance.

Note that the metric learning is performed only in the first embedding layer of NLP models, regardless of the subsequent architecture of models. For BERT models that incorporate various input representations, including word embeddings, segment embeddings, and position embeddings, we only perform our metric learning on word embeddings. Furthermore, \name is fast because adjusting the embedding introduces only a little overhead to the standard training. Hence, in theory, \name is applicable to any NLP model.



\section{Experiments}
% \input{tables/defense_imdb}
% \input{tables/defense_yelp_yahoo}

\begin{table*}[t]
    \centering

    \caption{The classification accuracy (\%) against various adversarial attacks on three datasets for CNN and LSTM. The columns of \textit{Clean} denote the classification accuracy on the entire original testing set. The highest accuracy against the corresponding attack in each column is highlighted in \textbf{bold}, while the second highest is \underline{underlined}. The last row of each block indicates the accuracy gain of \name over the best baseline. }
   
    \begin{tabular}{llcccccccccc}
        \toprule
         \multirow{2}{*}{Dataset} & \multirow{2}{*}{Defense} & \multicolumn{5}{c}{CNN} & \multicolumn{5}{c}{LSTM}\\
         \cmidrule(lr){3-7} \cmidrule(lr){8-12}
          ~ & ~ & Clean & PWWS & GA & PSO & HLA & Clean & PWWS & GA & PSO & HLA \\
         \midrule
         \multirow{7}{*}{\textit{IMDB}} &
         Standard & 89.7 & ~~0.6 & ~~2.6 & ~~1.4 &  17.7 & 89.1 & ~~0.2 & ~~1.6 &  ~~0.3 & ~~8.7 \\
         &IBP & 81.7 & \underline{75.9} & \underline{76.0} & \underline{75.9} &  76.6 & 77.6 & 67.5 & 67.8 & 67.6 & 68.2 \\
         & ATFL & 85.0 & 63.6 & 66.8 & 64.7 & 72.8 & 85.1 & 72.2 & 75.5 & 74.0 & 77.7\\
         & SEM & 87.6 & 62.2 & 63.5 & 61.5 & 70.5 & 86.8 & 61.9 & 63.7 & 62.2 & 70.8\\
         & ASCC & 84.8 & 74.0 & 75.5 & 74.5 & \underline{77.6} & 84.3 & \underline{74.2} & \underline{76.8} & \underline{75.5} & \underline{79.5}\\
         & \name & 88.1 & \textbf{81.1} & \textbf{81.4} & \textbf{81.1}  & \textbf{82.4} & 87.2 & \textbf{79.0} & \textbf{79.2} & \textbf{78.8}  & \textbf{79.7} \\
         & & & \Blue{\(\uparrow\)5.2} & \Blue{\(\uparrow\)5.4} & \Blue{\(\uparrow\)5.2}  & \Blue{\(\uparrow\)4.8} & & \Blue{\(\uparrow\)4.8} & \Blue{\(\uparrow\)2.4} & \Blue{\(\uparrow\)3.3} &  \Blue{\(\uparrow\)0.2} \\
         \midrule
         \multirow{7}{*}{\textit{Yelp-5}} &
         Standard & 62.7 & ~~1.1 & ~~1.3 & ~~0.8 &  ~~1.1 & 64.8 & ~~0.5 & ~~0.9 & ~~0.4 &  ~~0.5 \\
         & IBP & 52.1 & 47.8 & 47.8 & 47.7  & 47.6 & 42.6 & 42.1 & 40.7 & 40.6  & 40.2 \\
         & ATFL & 61.4 & \underline{50.0} & \underline{51.7} & \underline{50.2} &  \underline{53.9} & 62.4 & 48.0 & 48.8 & 46.9  & 51.9 \\
         & SEM & 60.1 & 34.9 & 33.8 & 32.4 & 37.2 & 61.9 & 35.5 & 34.3 & 33.7 & 37.6 \\
         & ASCC & 58.9 & 47.3 & 49.3 & 47.4  & 50.6 & 59.9 & \underline{48.5} & \underline{50.5} & \underline{49.6} & \underline{52.5} \\ 
         & \name & 59.9 & \textbf{56.7} & \textbf{56.7} & \textbf{56.6} & \textbf{56.5} & 61.9 & \textbf{57.5} & \textbf{57.6} & \textbf{57.5}  & \textbf{57.6} \\
         & & & \Blue{\(\uparrow\)6.7} & \Blue{\(\uparrow\)5.0} & \Blue{\(\uparrow\)6.4}  & \Blue{\(\uparrow\)2.6} & & \Blue{\(\uparrow\)9.0} & \Blue{\(\uparrow\)7.1} & \Blue{\(\uparrow\)7.9}  & \Blue{\(\uparrow\)5.1} \\
         \midrule
         \multirow{7}{*}{\textit{\shortstack{Yahoo!\\ Answers}}} &
         Standard & 72.6 & ~~6.8 & ~~7.2 & ~~4.9 &  ~~7.0 & 74.7 & 12.2 & ~~9.6 & ~~6.5 & 10.4 \\
         & IBP & 63.1 & 54.9 & 54.9 & 54.8  & 55.0 & 54.3 & 47.3 & 47.6 & 47.0  & 47.3 \\
         & ATFL & 72.5 & \underline{62.5} & \underline{63.1} & \underline{62.5} &  \textbf{65.0} & 73.6 & \underline{61.7} & 60.8 & 60.3 & 63.1\\ 
         & SEM & 70.1 & 53.8 & 52.4 & 51.9  & 54.6 & 72.3 & 57.0 & 56.1 & 55.4 & 56.8 \\
         & ASCC & 69.0 & 58.4 & 59.6 & 58.5 & 59.9 & 70.7 & \underline{61.7} & \underline{62.3} & \underline{61.9} & \underline{63.2}\\
         & \name & 69.4 & \textbf{65.1} & \textbf{65.1} & \textbf{65.0} &  \underline{64.9} & 71.4 & \textbf{67.8} & \textbf{67.8} & \textbf{67.8}  & \textbf{67.9} \\
         &  & & \Blue{\(\uparrow\)2.6} & \Blue{\(\uparrow\)2.0} & \Blue{\(\uparrow\)2.5}  & ~\Blue{\(\downarrow\)0.1} & & \Blue{\(\uparrow\)6.1} & \Blue{\(\uparrow\)5.5} & \Blue{\(\uparrow\)5.9} & \Blue{\(\uparrow\)4.7}  \\ 
         \bottomrule
    \end{tabular}
    
    \label{tab:defense_cnn_lstm}
\end{table*}

This section evaluates \name with four defense baselines against various attacks on three benchmark datasets involving CNN, LSTM and BERT models. 
Code is available at  \url{https://github.com/JHL-HUST/FTML}.

% \vspace{-0.5em}
\subsection{Experimental Setup}
\label{subsection: setup}
\textbf{Datasets.} We evaluate \name on three benchmark datasets, namely \textit{IMDB}~\citep{maas11learning}, \textit{Yelp-5} and \textit{Yahoo! Answers}~\citep{zhang15character}. \textit{IMDB} is a binary sentiment classification dataset containing 25,000 movie reviews for training and 25,000 for testing. \textit{Yelp-5} has 640,000 training samples and 50,000 testing samples with five labels. \textit{Yahoo! Answers} is a large-scale topic classification dataset with 10 classes, consisting of 1,400,000 training samples and 50,000 testing samples. 

\textbf{Models}. We replicate CNN~\citep{kim14conv} and bidirectional LSTM~\citep{liu2016recurrent} from \citet{wang21adversarial}. For brevity, we denote the bidirectional LSTM as LSTM. We use the 300-dimensional GloVe word vectors~\citep{pennington14glove} to initialize the embedding layer of the CNN and LSTM models. We also fine-tune the base-uncased pre-trained BERT~\citep{jacob2019bert} as another target model. 

\textbf{Attack Methods.} To thoroughly evaluate the defense efficacy of \name, we adopt four advanced adversarial attacks, namely GA~\citep{alzantot18generating}, PWWS~\citep{shuhuai19generating}, PSO~\citep{yuan20pso}, and HLA~\citep{rishabh21generating}. 
Due to the inefficiency of textual adversarial attacks, we attack each model using 1000 randomly sampled testing data from each dataset to craft adversarial examples.

\textbf{Defense Baselines}. We compare \name with standard training and four state-of-the-art adversarial defense baselines, including a certified defense: IBP~\citep{jia19certified}, an input transformation based defense: SEM~\citep{wang21natural}, and two adversarial training based defenses: ATFL~\citep{wang21adversarial} and ASCC~\citep{dong21towards}. However, as indicated by~\citet{shi20robustness}, BERT models are too challenging to be tightly verified with current IBP technologies, and \citet{wang21adversarial} do not extend their adversarial training method ATFL to BERT. Thus, we omit IBP and ATFL from the defense baselines on BERT models. 

\textbf{Evaluation Settings}. For the synonym definition in Section~\ref{subsection: task}, we follow \citet{jia19certified} and \citet{dong21towards} and set \(k=8\) and \(\delta=0.5\) for all the experiments to have a fair comparison. For the hyper-parameters in \name, we set the margin \(\alpha=0.7\alpha_0\) in Eq.~\ref{eq:wtl}, where \(\alpha_0\) is the average word distance of the initial word embeddings before training. \(\alpha_0\) is \(8.54\) for CNN and LSTM models initialized by GloVe word vectors and \(1.48\) for the base-uncased pre-trained BERT model. We set the weight \(\beta=1\) in Eq.~\ref{eq:objective} to achieve a proper trade-off between the standard training loss and the triplet metric loss. A hyper-parameter study exploring their sensitivity is provided in Section~\ref{subsec:further}. We train our models for \(20\) epochs on \textit{IMDB}, and for \(5\) epochs on each of \textit{Yelp-5} and \textit{Yahoo! Answers}, as the models converge faster on large datasets.

\subsection{Evaluation on Defense Efficacy}
\label{subsec:evaluation}
\textbf{Performance on CNN and LSTM.} 
%To evaluate the defense performance, we attack the models on 1000 randomly sampled testing samples from each dataset. 
We compare \name with standard training (Standard) and four defense baselines, IBP, ATFL, SEM and ASCC, using the original samples (Clean) or the adversarial examples crafted by different attack methods. 
The comparison results are presented in Table~\ref{tab:defense_cnn_lstm}.
%shows the classification accuracy.
The more effective the defense method, the higher the classification accuracy under various attacks.
Meanwhile, we also want the performance on the original samples not to decay much compared with standard training. 

%From Table~\ref{tab:defense_cnn_lstm}, 
From the results, we can observe that \name achieves dominant robustness across all datasets under various adversarial attacks with clear margins. 
% Especially on the LSTM model of \textit{Yelp-5} dataset, \name has gained the most benefits.
Since HLA attack is based on the hard-label setting and is easier to defend against, the superiority of \name is relatively insignificant when compared to other defense methods under HLA attack.
%For instance, the standard training on CNN model without defenses almost loses the classification ability completely under the adversarial attacks, only achieving an average accuracy of \(4.7\%\) on \textit{IMDB} dataset. After enhanced by \name, the model's accuracy increases to \(82.5\%\) on average, outperforms all other defense methods significantly. 
In addition, among the defense methods, \name yields the best clean accuracy on \textit{IMDB} dataset and is close to the best clean accuracy on the other two datasets. 

\textbf{Performance on BERT.} Before using \name to fine-tune the BERT model, we need to make some modifications to the vocabulary of the pre-trained BERT model. Some words in the training set do not exist in the vocabulary, and will be separated into multiple tokens by the WordPiece tokenizer adopted by the BERT model, making it hard to calculate the distance involving these words. Thus, we add these words to the vocabulary of BERT, and initialize their embedding vectors by the average pre-trained embedding vectors of the tokens that they would have been split into, so that each word corresponds to a single embedding vector. We will investigate the effect of supplementing BERT's vocabulary on 
vocabulary size, robustness, and generalization of BERT models in Section~\ref{subsec:further}. 

The results on three datasets are shown in Table~\ref{tab:defense_bert}. Similar to the performance on CNN and LSTM, \name exhibits the best robustness over the state-of-the-art defense baselines by a large margin. Besides, for the clean accuracy, \name decays the least on \textit{IMDB}, and is only inferior to SEM by less than \(1\%\) on the other two datasets. Note that we evaluate the defenses under a more aggressive attack setting in which the nearest \(k=8\) words satisfying the distance constraint \(\delta\) are regarded as synonyms, and therefore the robustness results of our reproduction are lower than those reported in the SEM paper~\citep{wang21natural} where \(k=4\).

\begin{table}[tb]
    \centering
    \setlength\tabcolsep{3pt}
    \caption{The classification accuracy (\%) against various adversarial attacks on three datasets for BERT models. As explained in subsection~\ref{subsection: setup}, we omit IBP and ATFL as the baselines on BERT models.}
    \resizebox{\columnwidth}{!}{
    \begin{tabular}{ccccccc}
        \toprule
         Dataset & Defense & Clean & PWWS & GA & PSO & HLA \\
         \midrule
         \multirow{5}{*}{\textit{IMDB}} & Standard & 92.4 & 16.6 & ~~8.1 & ~~1.9 & ~~8.2 \\
         &SEM & 89.9 & \underline{72.3} & \underline{70.5} & \underline{69.2} & \underline{75.2} \\
         &ASCC & 81.3 & 65.1 & 65.4 & 63.1 & 69.5 \\
         &\name & 91.3 & \textbf{81.2} & \textbf{81.5} & \textbf{80.0} & \textbf{83.1}  \\
         & & & \Blue{\(\uparrow\)8.9} & \Blue{\(\uparrow\)11.0~~} & \Blue{\(\uparrow\)10.8~~} & \Blue{\(\uparrow\)7.9} \\
         \midrule
         \multirow{5}{*}{\textit{Yelp-5}} & Standard & 65.7 & ~~2.4 & ~~1.1 & ~~0.7 & ~~1.3 \\
         &SEM &  63.7 & 39.9 & 37.0 & 36.9 & 39.4 \\
         &ASCC &  63.4 & \underline{50.0} & \underline{50.8} & \underline{49.5} & \underline{54.8} \\
         &\name & 63.0 & \textbf{55.4} & \textbf{55.1} & \textbf{55.0} & \textbf{55.2}  \\
         & & & \Blue{\(\uparrow\)5.4} & \Blue{\(\uparrow\)4.3} & \Blue{\(\uparrow\)5.5} & \Blue{\(\uparrow\)0.4} \\
         \midrule
         \multirow{5}{*}{\textit{\shortstack{Yahoo!\\ Answers}}} & 
         Standard & 77.0 & 20.7 & 10.3 & ~~7.3 & 10.0 \\
         & SEM &  75.6 & 64.6 & 62.0 & 61.9 & 63.6 \\
         & ASCC & 75.2 & \underline{66.4} & \underline{67.5} & \underline{66.6} & \underline{68.0} \\
         &\name & 74.8 & \textbf{70.0} & \textbf{70.0} & \textbf{70.0} & \textbf{70.0} \\
         & & & \Blue{\(\uparrow\)3.6} & \Blue{\(\uparrow\)2.5} & \Blue{\(\uparrow\)3.4} & \Blue{\(\uparrow\)2.0} \\         
         \bottomrule
    \end{tabular}
    }
    
    \label{tab:defense_bert}
\end{table}

\begin{table*}[tb]
% \setlength\tabcolsep{3pt}

    \centering
    \caption{Evaluation on the training time %of one
    per epoch (in minutes) for the models with various defenses. 
    %\name needs slightly longer time than SEM, as \name needs to learn the embedding during the training while SEM only does the coding, and they are both faster than other defense methods. We highlight the best defense efficiency in \textbf{bold} and indicate the second one in \underline{underline}.
    }
    % \resizebox{\textwidth}{!}{
    \begin{tabular}{lccccccccc}
        \toprule
        \multirow{2}{*}{Defense} & \multicolumn{3}{c}{\textit{IMDB}} & \multicolumn{3}{c}{\textit{Yelp-5}} & \multicolumn{3}{c}{\textit{Yahoo! Answers}}\\
        \cmidrule(lr){2-4} \cmidrule(lr){5-7} \cmidrule(lr){8-10}
         & CNN & LSTM & BERT & CNN & LSTM & BERT & CNN & LSTM & BERT\\
         \midrule
         Standard & ~~~~1 & ~~~~1 & ~~11 & ~~~~4 & ~~~~6 & 178 & ~~~~9 & ~~13 & 371 \\
         IBP & ~~~~1 & ~~48 & N/A & ~~12 & 610 & N/A & ~~26 & 953 & N/A \\
         ATFL & ~~15 & ~~22 & N/A & 203 & 290 & N/A & 444 & 573 & N/A \\
         SEM & ~~~~1 & ~~~~1 & ~~11 & ~~~~4 & ~~~~6 & 178 & ~~~~9 & ~~16 & 371 \\
         ASCC & ~~~~2 & ~~~~6 & 100 & ~~32 & ~~71 & 1638~~ & ~~55 & 125 & 2523~~\\
         \name & ~~~~1 & ~~~~1 & ~~12 & ~~12 & ~~16 & 183 & ~~22 & ~~29 & 384 \\
         \bottomrule
    \end{tabular}
    % }
    
    \label{tab:time}
\end{table*}

%\vspace{-0.5em}
\subsection{Evaluation on Defense Efficiency}

The efficiency is also crucial for evaluating the defense methods, especially when a defense is applied on large-scale datasets and complex models, such as BERT model on \textit{Yahoo! Answers} dataset. 
The training time cost per epoch for the models with various defense methods is shown in Table~\ref{tab:time}. 
The IBP defense heavily slows the training on LSTM models due to the high %computational 
overhead for certified constraints. 
Although designed with fast white-box adversarial attacks, the two adversarial training based methods, ATFL and ASCC, are still not easy to scale to large datasets and complex models. 
SEM is the most efficient approach because it only needs to transform the input text based on a synonym coding. Since we introduce a little overhead for adjusting the word embeddings on the standard training, our proposed \name 
needs slightly longer time for the training than SEM, but ours is at least four times faster than other defense methods on LSTM and BERT models. Especially on the BERT model, the extra cost of adjusting the embedding is less than 10\% of the time spent on standard training, which is almost negligible. It is worth consuming a little longer time for gaining a much better defense efficacy. 

%the most efficient one among the IBP and adversarial training based methods on LSTM models, while achieving the leading defense capability.

\begin{table}[tb]
    \centering
    % \setlength\tabcolsep{3pt}
    \caption{The classification accuracy (\%) of models initialized and frozen with different word vectors without any other defenses on \textit{IMDB} dataset.}
    \resizebox{\columnwidth}{!}{
    \begin{tabular}{ccccccc}
        \toprule
         Model & Embedding & Clean & PWWS & GA & PSO & HLA \\
         \midrule
         \multirow{3}{*}{CNN} & GloVe & 89.3 & ~~0.9 & ~~2.8 & ~~1.5 & 18.3 \\
         & CNN-RV & 87.3 & 79.9 & 80.7 & 80.0 & 81.1 \\
         & LSTM-RV & 87.0 & 79.2 & 79.6 & 79.2 & 80.2 \\
         \midrule
         \multirow{3}{*}{LSTM} & GloVe & 89.4 & ~~1.9 & ~~4.0 & ~~1.4 & 17.5 \\
         & CNN-RV & 87.9 & 80.2 & 80.8 & 80.3 & 81.9 \\
         & LSTM-RV & 87.8 & 80.7 & 81.0 & 80.9 & 81.8 \\
         \midrule
         \multirow{2}{*}{BERT} 
         & BERT-V & 92.5 & 22.0 & 12.6 & ~~8.2 & ~~7.0 \\
         & BERT-RV & 90.3 & 79.7 & 80.4 & 79.0 & 82.5 \\
         \bottomrule
    \end{tabular}}
    
    \label{tab:embedding_study}
\end{table}

\begin{figure}
\begin{center}
\includegraphics[width=\columnwidth]{figures/sem_ftml_v3.pdf}
\end{center}
\caption{Illustration of the differences between SEM and \name. Since the word \textit{substantial} is polysemous, both synonym sets \(A\) and \(B\) contain \textit{substantial}, and they are semantically related but not really synonymous. Synonym set \(C\) is not semantically related to the previous two synonym sets. 
%(a) In the input space, SEM distributes \textit{substantial} to the left green cluster and loses the synonymous relationship between \textit{substantial} and the other words in synonym set \(B\) (\ie, the right cluster). (b) In the embedding space, \name captures more delicately the semantic relationship between words by adjusting the word distances.
}
\label{figs:sem_ftml}
\end{figure}

\begin{table}[tb]
    \centering
    % \setlength\tabcolsep{3pt}
    \caption{The classification accuracy (\%) against various adversarial attacks of models trained with contrastive metric learning (CML) on \textit{IMDB} dataset.} % CML denotes a method obtained by replacing triplet loss in \name with contrastive loss.}
    \resizebox{\columnwidth}{!}{
    \begin{tabular}{ccccccc}
        \toprule
         Model & Method & Clean & PWWS & GA & PSO  & HLA \\
         \midrule
         \multirow{2}{*}{CNN} & CML & 88.3 & 78.6 & 79.1 & 78.8 & 81.1  \\
         & \name & 88.1 & 81.1 & 81.4 & 81.1 & 82.4 \\
         \midrule
         \multirow{2}{*}{LSTM} & CML & 87.2 & 76.5 & 76.5 & 76.0 & 78.2 \\
         & \name & 87.2 & 79.0 & 79.2 & 78.8 & 79.7 \\
         \midrule
         \multirow{2}{*}{BERT} & CML &  92.1 & 71.9 & 67.3 & 64.1 & 63.3 \\
         & \name & 91.3 & 81.2 & 81.5 & 80.0 & 83.1 \\
         \bottomrule
    \end{tabular}}
    \label{tab:abulation_study}
\end{table}

\begin{table}[tb]
    \centering
    % \setlength\tabcolsep{3pt}
    \caption{The classification accuracy (\%) of models trained with \name involving different \(\ell_p\)-norm distance metrics on the \textit{IMDB} dataset.}
    \resizebox{\columnwidth}{!}{
    \begin{tabular}{ccccccc}
        \toprule
         Model & Distance & Clean & PWWS & GA & PSO & HLA \\
         \midrule
         \multirow{3}{*}{CNN}
         & \(p=1\) & 88.5 & 76.9 & 77.9 & 77.6 & 80.1 \\
         & \(p=2\) & 88.1 & 81.1 & 81.4 & 81.1 & 82.4 \\
         & \(p=\infty\) & 88.8 & 29.0 & 39.4 & 30.4 & 58.1 \\
         \midrule
         \multirow{3}{*}{LSTM}
         & \(p=1\) & 88.0 & 75.8 & 76.5 & 75.8 & 77.7 \\
         & \(p=2\) & 87.2 & 79.0 & 79.2 & 78.8 & 79.7 \\
         & \(p=\infty\) & 88.3 & 35.2 & 34.1 & 25.1 & 49.0 \\
         \midrule
         \multirow{3}{*}{BERT} 
         & \(p=1\) & 91.6 & 80.3 & 80.4 & 79.0 & 83.4\\
         & \(p=2\) & 91.3 & 81.2 & 81.5 & 80.0 & 83.1 \\
         & \(p=\infty\) & 92.6 & 57.5 & 45.2 & 40.9 & 22.2\\
         \bottomrule
    \end{tabular}}
    
    \label{tab:p_norm}
\end{table}

% \vspace{-0.5em}
\subsection{Evaluation on Word Embedding}

To validate that \name learns a robust word embedding, we apply the standard training on the models using frozen word embeddings initialized by three types of pre-trained word vectors respectively: (a) GloVe word vectors~\citep{pennington14glove}. (b) Word vectors of base-uncased pre-trained BERT without fine-tuning~\citep{jacob2019bert}, denoted as BERT-V. (c) Robust word vectors trained by \name on three models using the corresponding dataset, denoted as CNN-RV, LSTM-RV, and BERT-RV, respectively. Note that during the training process, we freeze the initialized word vectors and only do standard training to check whether our trained word embeddings are beneficial to the model robustness. 

As shown in Table~\ref{tab:embedding_study}, the standard trained models initialized with our robust word vectors can effectively block the attacks without any other defense mechanisms, and even perform better than all the defense baselines (see results in Table~\ref{tab:defense_cnn_lstm} and Table~\ref{tab:defense_bert}), manifesting the great potential of improving the robustness of NLP models through word embeddings. Furthermore, our robust word embeddings exhibit good defense transferability across the models. For instance, CNN-RV pre-trained on the CNN model yields high robustness on the LSTM model, and vice versa for LSTM-RV. The transferability indicates that once we have trained a robust word embedding, we can easily apply it to other models, and only standard training is required to obtain the classification capability.

% \vspace{-0.5em}
\subsection{Further Analysis}
\label{subsec:further}
%In this subsection, 
Here we provide further analysis %and discussion
on the differences between \name and SEM, as well as a variant of \name based on contrastive learning. We also discuss the influence of different \(\ell_p\)-norm distance metrics and hyper-parameters. 
%In addition, we investigate the impact of supplementing BERT's vocabulary with actual words when applying \name on BERT. 
In addition, we investigate the effect of supplementing BERT's vocabulary with actual words on robustness and generalization. 

\textbf{Analysis on \name and SEM.} 
Among the baselines, SEM is the most similar approach to ours. 
However, SEM directly maps the synonyms to the same code in the input space, while \name adjusts the representations by metric learning in the embedding space. For SEM, there is no discrimination for words assigned the same code, and no relation between words with different codes. On the contrary, \name could use the distance metric in the embedding space to delicately capture the semantic relationship among the words.

For the example in Figure~\ref{figs:sem_ftml}~(a) for SEM, all words inside the green cluster are mapped to one code, while all words inside the red cluster are mapped to another code, and there is no relation between the two codes. SEM assigns \textit{substantial} to the green cluster and loses the synonymous relationship between \textit{substantial} and the other words in synonym set $B$. 

As illustrated in Figure~\ref{figs:sem_ftml}~(b) for \name, after the training, the word \textit{substantial} lies closest to the other words in synonym set \(A\), with an average distance of $0.0004$ in the embedding space. Nevertheless, \name has also forced \textit{substantial} to be closer to the words in the other synonym set \(B\), with an average distance of $3.85$, than to the semantically unrelated synonym set \(C\), with an average distance of $6.69$. Benefiting from the above property, \name performs better than SEM empirically.

% For instance, if $w_j$ is polysemous, $(w_i, w_j)$ and $(w_j, w_k)$ are different synonymous pairs, yet $w_i$ and $w_k$ are not really synonymous. In this case, SEM has three possible clustering schemes: (a) assign the same code to $w_i$ and $w_j$, and send another code to $w_k$; (b) assign the same code to $w_j$ and $w_k$, and send another code to $w_i$; (c) assign the same code to the three words. The first two clustering schemes cannot capture the synonymous relationship of either $(w_i, w_j)$ or $(w_j, w_k)$. Figure~\ref{figs:sem_ftml} (a) illustrates a concrete example on this issue. All words inside the green 
% cluster is mapped to one code, while all words inside the red cluster is mapped to another code, and there is no relations between the two codes.

% In contrast, our method can bring the two pairs of synonyms closer simultaneously, but also enable the model to distinguish $w_i$ and $w_k$. For the example in Figure~\ref{figs:sem_ftml} (b), after the training with \name, word \textit{substantial} is more closer to other words in the synonym set \(A\) with an average distance of $0.0004$ in the embedding space. Compared to the average distance of $6.69$ between \textit{substantial} and the semantically unrelated synonym set \(C\), \name has also forced \textit{substantial} to be closer to other words in the other synonym set \(B\) with the average distance of $3.85$. Benefiting from the above property, \name performs better than SEM empirically.



\textbf{Variant based on Contrastive Metric Learning.} Contrastive learning has recently become popular as a way of pulling the representations of similar samples closer together. 
To verify whether contrastive learning can also enhance the robustness by pulling words closer to their synonyms and pushing away their non-synonyms in the embedding space, we replace the triplet loss \(\mathcal{L}_{tr}\) with the contrastive loss \(\mathcal{L}_{ct}\), which could be formulated as follows: 
\begin{align}
\scriptsize
% \small
    \begin{split}
        \mathcal{L}_{ct}(w, \mathcal{S}(w), \mathcal{N}) = 
        -\log \frac{\sum_{w'\in \mathcal{S}(w)}\exp(-d(w, w')/\tau)}{\sum_{\tilde{w}\in \mathcal{S}(w)\cup \mathcal{N}}\exp(-\min(d(w, \tilde{w}), \alpha)/\tau)},
    \end{split}
\end{align}
where \(\tau\) is the temperature hyper-parameter. We test the \(\tau\) in \(\{1, 5, 10, 15, 20, 25, \cdots, 55, 60\}\) on CNN model with other settings being unchanged, and choose \(\tau=20\) for the best robustness. We denote this variant as Contrastive Metric Learning (CML). As shown in Table~\ref{tab:abulation_study}, CML is inferior to \name on the three models consistently, especially on the BERT model. Note that CML also significantly boosts the model's robustness compared to the standard trained models, and has competitive performance with the defense baselines (see results in Table~\ref{tab:defense_cnn_lstm} and Table~\ref{tab:defense_bert}), indicating that our motivation of adjusting word distances in the embedding space is sound.

\begin{figure*}[tb]
    \centering
    \subfigure[ \textit{IMDB}]{\label{fig:alpha:first}\includegraphics[width=.3\textwidth]{figures/imdb_alpha_pwws.pdf}}
    \subfigure[ \textit{Yelp-5}]{\label{fig:alpha:second}\includegraphics[width=.3\textwidth]{figures/yelp_alpha_pwws.pdf}}
    \subfigure[\textit{Yahoo! Answers}]{\label{fig:alpha:third}\includegraphics[width=.3\textwidth]{figures/yahoo_alpha_pwws.pdf}}
    \caption{The impact of hyper-parameter \(\alpha\) on the performance of \name on CNN models across the three datasets.}
    \label{fig:alpha}
\end{figure*}

\begin{figure*}[tb]
    \centering
    \subfigure[ \textit{IMDB}]{\label{fig:beta:first}\includegraphics[width=.3\textwidth]{figures/imdb_beta_pwws.pdf}}
    \subfigure[ \textit{Yelp-5}]{\label{fig:beta:second}\includegraphics[width=.3\textwidth]{figures/yelp_beta_pwws.pdf}}
    \subfigure[\textit{Yahoo! Answers}]{\label{fig:beta:third}\includegraphics[width=.3\textwidth]{figures/yahoo_beta_pwws.pdf}}
    \caption{The impact of hyper-parameter \(\beta\) on the performance of \name on CNN models across the three datasets. 
    }
    \label{fig:beta}
\end{figure*}

\textbf{Analysis on Distance Metric}. We adopt the Euclidean distance of word vectors by default to define the word distance, that is, \(p=2\) in Eq.~\ref{eq:distance}. To explore the effect of different \(\ell_p\)-norm distance metrics on \name, we also evaluate the defense efficacy of \name with Manhattan distance (\(p=1\)) or Chebyshev distance (\(p=\infty\)). The results are presented in Table~\ref{tab:p_norm}. \name with Manhattan distance achieves competitive robustness to \name with Euclidean distance on the three models. \name with Chebyshev distance is inferior to the models trained with the other two distance metrics, but still enhances the robustness compared to the standard trained models (see results in Table~\ref{tab:defense_cnn_lstm} and Table~\ref{tab:defense_bert}). The reason is that Manhattan distance and Euclidean distance consider the difference of word vectors in all dimensions, while \name with Chebyshev distance only updates the dimension with the largest difference in each training iteration, making the models hard to converge.



\textbf{Hyper-parameter Study.} \name involves two hyper-parameters. \(\alpha\) is used for constraining the distance between the anchor word and its non-synonyms in the word-level triplet loss. \(\beta\) controls the weight of the word-level triplet loss in the overall training objective function. 

In Figure~\ref{fig:alpha}, the \textit{clean accuracy} denotes the classification accuracy (\%) on the entire original testing set, while the \textit{robust accuracy} denotes the classification accuracy (\%) under the PWWS attack. With \(\beta\) being fixed to \(1\), we vary \(\alpha/\alpha_0\) from \(0.0\) to \(1.2\) to investigate how \(\alpha\) influences the performance of \name. When \(\alpha=0\), \name only forces the words to be close to their synonyms without considering the non-synonyms, and the clean accuracy is the lowest. Intuitively, the reason is that the existence of polysemous words will cause all the words that have at least one same meaning to be compressed together, making the model unable to distinguish them. As \(\alpha\) increases, the clean accuracy of the models rises considerably, while the robust accuracy fluctuates but, in general, first increases and then declines. 
%on the whole. 
Hence, we choose \(\alpha = 0.7\alpha_0\) for the three datasets to achieve a proper trade-off on the clean accuracy and the robust accuracy. 

Similarly, in Figure~\ref{fig:beta}, we vary parameter \(\beta=10^i\) (\(-3\leq i \leq 3\)) to explore its sensitivity with the other parameter \(\alpha\) being fixed to \(0.7\alpha_0\).
We can observe that the clean accuracy tends to decay slightly with the increment of \(\beta\). 
%We observe that
Also, \name performs stably under the PWWS attack for a wide range of \(\beta\) in \([10^{0},10^{3}]\) over all the three datasets. 
Therefore, we choose \(\beta = 1\) to have a proper trade-off on the clean accuracy and the robust accuracy. 

\textbf{Analysis on Supplementing BERT's Vocabulary.} 
For the recent pre-trained NLP models based on sub-words such as BERT, some actual words that do not exist in the model's vocabulary would be divided into several sub-words, making it hard to calculate the distance between the words in the embedding space for \name. To solve this problem, we add some actual words into the vocabulary during the fine-tuning phase without affecting the pre-training phase.

The number of common words is limited. Besides, many actual words are already included in the original vocabulary as sub-words. Thus, the vocabulary size does not increase much. In our experiments, we only supplement BERT's vocabulary to contain the top 50,000 words with the highest frequency in the dataset, which increases the vocabulary size from 30,522 to 59,734. To investigate the influence of the number of actual words on the performance of \name, we fine-tune BERT by \name with various numbers of actual words included in the vocabulary. We report the robust accuracy of \name trained models against the PWWS attack on the \textit{IMDB} dataset. The results are summarized in Table~\ref{tab:robust_various_actual_words}. When we supplement the vocabulary to contain 30,000 actual words, the vocabulary size is about 1.4 times the original one (43,549 vs.\ 30,522), and the robustness (64.7\%) is already significantly improved compared with the normally fine-tuned BERT (16.6\%). As the vocabulary size increases further, the robustness of the \name trained model keeps improving, reaching 81.2\% with 50,000 actual words.

\begin{table}[tb]
\setlength\tabcolsep{3pt}
\small
    \centering
    \caption{The influence of the number of actual words included in the vocabulary on BERT fine-tuning. \textit{Robust accuracy} denotes the accuracy of \name trained BERT models against the PWWS attack. \textit{Clean accuracy} denotes the accuracy of normally trained BERT models on the original test set.
    }
    \begin{tabular}{lccccc}
        \toprule
         \#~actual words & 50,000 & 45,000 & 40,000 & 35,000 & 30,000 \\
         \midrule
         Vocabulary size & 59,734 &	55,523 & 51,402 & 47,387 & 43,549 \\
         Robust accuracy (\%) & 81.2 & 77.3 & 75.7 & 68.9 & 64.7 \\
         Clean accuracy (\%) & 92.2 & 92.2 & 92.4 & 92.5 & 92.4 \\
         \bottomrule
    \end{tabular}
    % }
    
    \label{tab:robust_various_actual_words}
\end{table}

To investigate the influence of the number of added words on the generalization, we adopt the expanded vocabulary to fine-tune the BERT model without FTML on the IMDB dataset. As shown in Table~\ref{tab:robust_various_actual_words}, the classification performance of BERT models fine-tuned with the expanded vocabulary varies from 92.2\% to 92.5\%, which is very close to the one with the original vocabulary (92.4\%). It indicates that adding some additional words to the vocabulary during the fine-tuning stage does not affect the generalization of the BERT model. Interestingly, when we supplement the vocabulary to contain 35,000 actual words, the model even performs slightly better than the one with the original vocabulary.

\section{Conclusion}


In this work, we introduce a novel method termed \textit{Fast Triplet Metric Learning (\name)} to train models with robust word embeddings. Specifically, we incorporate the standard training with a word-level triplet loss, which pulls the words to be closer to their synonyms and pushes away their non-synonyms in the embedding space. Extensive experiments demonstrate that \name achieves much higher robustness against various attacks than existing state-of-the-art baselines. \name is efficient, making it easy to extend to large-scale datasets and complex models. 
%Furthermore, by using our robust embedding as the frozen initial embedding, the models could achieve great robustness simply with standard training.   
% Besides, by using our robust embedding as the frozen initial embedding, the models with standard training could also achieve high robustness and exhibit good defense transferability. %, showing the great potential of robust embedding for natural language processing. 
Note that the idea behind learning a robust word embedding in \name applies to any language. Once we have prepared a synonym dictionary for the given language, we can directly apply \name to pull the words closer to their synonyms and push away their non-synonyms in the embedding space to train an adversarially robust model.

Our work exhibits great potential of improving the textual robustness through robust word embedding, which challenges the mainstream view of enhancing the robustness of the overall model against adversarial attacks and highlights the difference of model robustness on texts and images. We hope our work could inspire more works in this direction by considering the speciality of natural languages. 

% \begin{contributions} % will be removed in pdf for initial submission,
%                       % so you can already fill it to test with the
%                       % ‘accepted’ class option
%     Briefly list author contributions.
%     This is a nice way of making clear who did what and to give proper credit.

%     H.~Q.~Bovik conceived the idea and wrote the paper.
%     Coauthor One created the code.
%     Coauthor Two created the figures.
% \end{contributions}

\begin{acknowledgements} % will be removed in pdf for initial submission,
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
This work is supported by the National Natural Science Foundation of China (62076105) and the International Cooperation Foundation of Hubei Province, China (2021EHB011).
\end{acknowledgements}

\bibliography{yang_243}

% \appendix
% % NOTE: necessary when ptmx or no mathfont class option is given
% \providecommand{\upGamma}{\Gamma}
% \providecommand{\uppi}{\pi}
% \section{Math font exposition}
% How math looks in equations is important:
% \begin{equation*}
%   F_{\alpha,\beta}^\eta(z) = \upGamma(\tfrac{3}{2}) \prod_{\ell=1}^\infty\eta \frac{z^\ell}{\ell} + \frac{1}{2\uppi}\int_{-\infty}^z\alpha \sum_{k=1}^\infty x^{\beta k}\mathrm{d}x.
% \end{equation*}
% However, one should not ignore how well math mixes with text:
% The frobble function \(f\) transforms zabbies \(z\) into yannies \(y\).
% It is a polynomial \(f(z)=\alpha z + \beta z^2\), where \(-n<\alpha<\beta/n\leq\gamma\), with \(\gamma\) a positive real number.

\end{document}
