% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
% \usepackage[american]{babel}
\usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
\bibliographystyle{plainnat}
\usepackage{placeins}
\renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{multirow}
\usepackage{booktabs}
\usepackage[normalem]{ulem}
\usepackage{latexsym}
\usepackage{amssymb}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)





%%%
%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\newcommand{\hq}[1]{\textcolor{black}{#1}}%pink %revise 
\newcommand{\lin}[1]{\textcolor{black}{#1}}%pink %revise

\newcommand{\HQ}[1]{\textcolor{black}{#1}}%pink %revise 

\title{Addressing Token Uniformity in Transformers \\via Singular Value Transformation \\(Supplementary Material)}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:Hanqi.yan@warwick.ac.uk}{Hanqi Yan}{}}
\author[1]{\href{mailto:Lin.gui@warwick.ac.uk}{Lin Gui}{}}
\author[2]{\href{mailto:cswjli@comp.polyu.edu.hk}{Wenjie Li}{}}
\author[1,3]{\href{mailto:Yulan.He@warwick.ac.uk}{Yulan He}{}}
% Add affiliations after the authors
\affil[1]{%
    Department of Computer Science, University of Warwick, United Kingdom
}
\affil[2]{%
        Department of Computing, The Hong Kong Polytechnic University, China
}
  
\affil[3]{%
    The Alan Turing Institute, United Kingdom
}
\begin{document}
\maketitle

\renewcommand\thesection{\alph{section}}
% \renewcommand{\thetable}{A\arabic{table}}
% \setcounter{table}{0}
% \renewcommand{\thetable}{A\arabic{table}}
% \setcounter{figure}{0}
% \renewcommand{\thefigure}{A\arabic{figure}}


\section{Proof of Theorem in Section 3}

\noindent \textbf{Theorem:} $\forall x \in X^{l}$, $\exists x' \in \mathcal{S}_{[1,k]}^{l}$, where the subspace $\mathcal{S}_{[1,k]}^{l}$ is defined based on $\lambda_k \geq C \geq \lambda_{k+1}$, such that $\lVert x-x' \rVert_2 \leq C$.

\noindent \textbf{Proof:} We assume that $X^{l}$ can be represented as an $n_l \times m$ matrix:

\begin{gather}
X^l = 
\left[
\begin{matrix}
 \Vec{x}_1   \\
 \Vec{x}_2   \\
 \vdots  \\
 \Vec{x}_{n_l} \\
\end{matrix} 
\right], \notag
\label{eq:disc-joint1}
\end{gather}

\noindent where $\Vec{x}_i \in \mathbb{R}^m$ is an $m$-dimensional embedding of a token in the output of the $l$-th layer. After performing SVD on $X^{l}$, we have:

\begin{gather}
\left[
\begin{matrix}
 \Vec{x}_1   \\
 \Vec{x}_2   \\
 \vdots  \\
 \Vec{x}_{n_l} \\
\end{matrix} 
\right] = 
\left[
\begin{matrix}
 \Vec{u}_1   \\
 \Vec{u}_2   \\
 \vdots \\
 \Vec{u}_m \\
 \vdots \\
 \Vec{u}_{n_l} \\
\end{matrix} 
\right] \cdot
\left[
\begin{matrix}
 \lambda_1 & 0 & \cdots & 0   \\
 0 & \lambda_2  & \cdots & 0   \\
 \vdots & \vdots & \ddots & \vdots \\
  0 & 0 & \cdots & \lambda_m \\
  \vdots & \vdots & \ddots & \vdots \\
  0 & 0 & \cdots & 0 \\
\end{matrix} 
\right]  \cdot
\left[
\begin{matrix}
 \Vec{v}_1   \\
 \Vec{v}_2   \\
 \vdots  \\
 \Vec{v}_{m} \\
\end{matrix} 
\right], \notag
\label{eq:disc-joint2}
\end{gather}

where the unitary matrices $U = [\Vec{u}^{\top}_1,\Vec{u}^{\top}_2,\dots,\Vec{u}^{\top}_{n_l}]^{\top}$ and $V = [\Vec{v}^{\top}_1,\Vec{v}^{\top}_2,\dots,\Vec{v}^{\top}_{m}]^{\top}$ are the $n_l \times n_l$ left singular matrix and the $m \times m$ right singular matrix, respectively. Therefore, the two collections of vectors, i.e., $\Vec{u}_i = \{u_{i1},u_{i2},\dots,u_{in_l}\}$ and $\Vec{v}_i = \{v_{i1},v_{i2},\dots,v_{im}\}$, are two subsets of basis for the $m$-dimensional vector space ($m \ll n_l$). Without loss of generality, we assume $\Vec{x}_i \in X^l$ can be represented by its corresponding left singular vector, singular values, and the right singular matrix $V$, which yields:

\begin{align}
\Vec{x}_i &= 
\Vec{u}_i \cdot
\left[
\begin{matrix}
 \lambda_1 & 0 & \cdots & 0   \\
 0 & \lambda_2  & \cdots & 0   \\
 \vdots & \vdots & \ddots & \vdots \\
  0 & 0 & \cdots & \lambda_m \\
  \vdots & \vdots & \ddots & \vdots \\
  0 & 0 & \cdots & 0 \\
\end{matrix} 
\right]  \cdot
\left[
\begin{matrix}
 \Vec{v}_1   \\
 \Vec{v}_2   \\
 \vdots  \\
 \Vec{v}_{m} \\
\end{matrix} 
\right] \notag \\
& =\left[
\begin{matrix}
\lambda_1 \cdot u_{i1}, & \lambda_2 \cdot u_{i2}, & \cdots, & \lambda_m \cdot u_{im}
\end{matrix}
\right]
\cdot
\left[
\begin{matrix}
 \Vec{v}_1   \\
 \Vec{v}_2   \\
 \vdots  \\
 \Vec{v}_{m} \\
\end{matrix}
\right].
\label{eq:combination}
\end{align}

If we separate the singular values into two parts by $C$, where $\lambda_k \geq C \geq \lambda_{k+1} \geq 0$, we can rewrite Eq.~\eqref{eq:combination} as:

\begin{align}
    \Vec{x}_i &= \Sigma_{j=1}^{m}\lambda_j \cdot u_{ij} \cdot \Vec{v}_j \notag \\
    &=\Sigma_{j=1}^{k}\lambda_j \cdot u_{ij} \cdot \Vec{v}_j + \Sigma_{j=k+1}^{m}\lambda_j \cdot u_{ij} \cdot \Vec{v}_j  \notag
\end{align}

\noindent By defining $\Vec{x}'_i = \Sigma_{j=1}^{k}\lambda_j \cdot u_{ij} \cdot \Vec{v}_j$, where singular values are taken from %belong to 
the larger group, %singular values group, due to $\lambda_k \geq C  \geq \lambda_{k+1} \geq 0$, 
we have:
\begin{align}
    ||\Vec{x}_i - \Vec{x}'_i ||&=  ||\Sigma_{j=k+1}^{m}\lambda_j \cdot u_{ij} \cdot \Vec{v}_j ||  \notag \\
    &= |<\Vec{\lambda}^{[k+1,m]} \otimes\Vec{u}^{[k+1,m]}_i,V^{(m-k-1) \times m}>| \notag
\end{align}

\noindent where $||\cdot||$ is the norm, $\otimes$ is the pairwise product, and $|<\cdot,\cdot>|$ is the inner product in a vector space; $\Vec{\lambda}^{[k+1,m]}$ and $\Vec{u}^{[k+1,m]}_i$ are the sub-vectors of the singular values and of $\Vec{u}_i$ from the $(k+1)$-th to the $m$-th dimension, respectively, and $V^{(m-k-1) \times m}$ is the corresponding right singular sub-matrix. According to H\"older's inequality, we have:
\begin{align}
    ||\Vec{x}_i - \Vec{x}'_i || \leq ||\Vec{\lambda}^{[k+1,m]} \otimes \Vec{u}^{[k+1,m]}_i|| \cdot ||V^{(m-k-1) \times m}|| \notag
\end{align}

Since $V$ is a unitary matrix, $V^{\top} \cdot V=I$, which yields $||V^{(m-k-1) \times m}|| = 1$. Hence,
\begin{align}
    ||\Vec{x}_i - \Vec{x}'_i || &\leq ||\Vec{\lambda}^{[k+1,m]} \otimes \Vec{u}^{[k+1,m]}_i||  \notag \\
    &=  \sqrt{\Sigma_{j=k+1}^m \lambda_j^2 \cdot u_{ij}^2}. \notag
\end{align}

Considering $||\Vec{u}_i|| = 1$ and $\lambda_{k+1} \leq C$, we have $||\Vec{u}^{[k+1,m]}_i|| \leq 1$ and $\lambda_j \leq C$ for all $j \geq k+1$. Therefore,
\begin{align}
    ||\Vec{x}_i - \Vec{x}'_i || &\leq C \cdot \sqrt{\Sigma_{j=k+1}^m  u_{ij}^2}  \leq C \notag 
\end{align}
$\Box$

\textbf{A case study where the vectors in the unitary matrix $U$ follow a uniform distribution in an $L_2$-norm based metric space}

The \textbf{theorem} states that the learned features from a transformer-based language model can be represented as a closure which is defined as a $C$-neighbour of a $k$-dimensional space. Here, we present a case study, assuming that the vectors $\Vec{u}_i$ in the unitary matrix $U$ follow a uniform distribution within an $L_2$-norm based metric space.

Under such an assumption, the probability of $P(\Sigma_{j=k+1}^{m}\sqrt{u_{ij}^2}\leq d)$ is the integral of the probability density function in the corresponding area of a $n$-sphere, denoted as $S_{n-1}$,  defined by $\Sigma_{j=k+1}^m  \sqrt{u_{ij}^2}$. It is clear that  ${\rm{P}}(\Sigma_{j=k+1}^{m}\sqrt{u_{ij}^2} \leq d)  \geq 0$. Hence, we only discuss the upper boundary of $P$ in the following. We denote %the area of whole $n$-sphere as $S_{n-1}$, and 
the sub-area of $\Sigma_{j=k+1}^m  \sqrt{u_{ij}^2} \leq d$ as $S_{\phi}$. To simplify the notation, without loss of generality, we re-order the elements in $\Vec{u}_i\in\mathbb{R}^n$ %by taking the dimensions of $\Vec{u}_i$ as $n$ and letting 
such that its last $k$ dimensions correspond to the small singular values. Then, we have
\begin{small}
\begin{align}
    &{\rm P}(\Vec{u}_i \in S_{\phi}) \notag \\
    =& \int_{S_{\phi}} \frac{\Gamma(n/2)}{2\pi^{n/2}} \Pi^{n-2}_{i=1} {\rm sin}^{n-1-i}(\psi_i) d{\phi_1}...d{\phi_{(n-1)}} \notag \\
     \leq& \frac{\Gamma(n/2)}{2\pi^{n/2}} \cdot d^{k} \int_{S_{\phi}} \Pi^{n-k}_{i=1} {\rm sin}^{n-k-i}(\psi_i) d{\phi_1}...d{\phi_{(n-k+1)}} \cdot \notag \\
    & \int_{S_{\phi}} \Pi^{k}_{i=1} {\rm sin}^{k-i}(\psi_{n-k-1+i}) d{\phi_k}...d{\phi_{(n-1)}} \notag \\
     \leq& \frac{\Gamma(n/2)}{2\pi^{n/2}} \cdot \frac{2\pi^{(n-k)/2}}{\Gamma((n-k)/2+1)} \cdot \frac{2\pi^{k/2}}{\Gamma(k/2+1)}\cdot (1 - d^{n-k}) \cdot d^{2k} \notag \\
     \leq& \frac{2}{k(n-k)} \cdot\frac{\Gamma(n/2)}{\Gamma((n-k)/2)\Gamma(k/2)}  \cdot d^{2k}(1-d^{n-k}) \notag \\
     =& \frac{2}{k(n-k)\cdot {\rm B}(k/2,(n-k)/2)} \cdot  d^{2k}(1-d^{n-k}), 
    \label{eq:bound}
\end{align}
\end{small}
\noindent where ${\rm B}(\cdot,\cdot)$ is the beta function. The result shows that the probability of a singular vector residing in the sub-area $S_{\phi}$ converges to 0 exponentially with the growth of $k$. %$k$ is the size of small singular values,  
As such, when $k$, the number of smaller singular vectors, is large, the distance between the embedding space and the subspace spanned by the larger singular vectors is bounded by $C$, the smallest value in the larger singular value group.
%According to the inequality shown in Eq. (\ref{eq:bound}), there is a very low probability to find a small $d \leq 1$ as a boundary to measure the distance between the embedding space and the subspace spanned by the larger singular vectors when $k$, the number of smaller singular vectors, is large.


\section{Model Configurations and Training Details}
% \label{sup:model_train}
\paragraph{Unsupervised Setting}

In the unsupervised setting on the STS task, we use the datasets processed by~\citet{DBLP:journals/corr/abs-2104-01767} and follow their evaluation pipeline by replacing their Whitening function with our \texttt{SoftDecay} function in their released code\footnote{\url{https://github.com/Jun-jie-Huang/WhiteningBERT}}. We do not use any dataset to train the transformation function; instead, we choose a fixed $\alpha$ empirically ($\alpha$ is the hyper-parameter in Eq.~(3)). As we did not see significant changes across different values of $\alpha$, we set $\alpha$ to $-0.6$ for all the datasets and PTLMs. For metric calculation, we use $t=0.5$ in RBF$_{dis}$ and we choose the nearest 12 points to reconstruct the query point in \textit{LSDS}.

\paragraph{Supervised Setting}

We apply \texttt{SoftDecay} to the output of the last layer of a PTLM provided by Hugging Face, before layer normalisation. %We evaluate our method on sentence-level classification tasks, including grammar acceptability assessment on the Corpus of Linguistic Acceptability (CoLA)~\citep{DBLP:journals/tacl/WarstadtSB19}, sentiment classification on the Stanford Sentiment Treebank (SST2)~\citep{DBLP:conf/emnlp/SocherPWCMNP13}, paraphrase detection on the Microsoft Research Paraphrase Corpus (MRPC)~\citep{DBLP:conf/acl-iwp/DolanB05}, natural language inference on the Question-Answering NLI (QNLI) data and the Recognizing Textual Entailment (RTE) data, all from the GLUE datasets~\citep{DBLP:conf/iclr/WangSMHLB19}. 
%In supervised settings on GLUE datasets, i.e.,  text-classification (SST2, MRPC, QNLI, CoLA, RTE)~\footnote{You can access these dataset through this link: \url{https://github.com/huggingface/transformers/tree/master/examples/pytorch/text-classification}}.
We use the default parameters configured in BERT-base-uncased\footnote{%BERT configuration can be found here 
\url{https://huggingface.co/docs/transformers/master/en/model_doc/bert}}, 
ALBERT-base-v1\footnote{
\url{https://huggingface.co/docs/transformers/master/en/model_doc/albert}}, 
RoBERTa-base\footnote{
\url{https://huggingface.co/docs/transformers/master/en/model_doc/roberta}} and DistilBERT-base-uncased\footnote{
\url{https://huggingface.co/docs/transformers/master/en/model_doc/distilbert}} as the baselines. 
For the hyper-parameter setting, we search the initial $\alpha$ for different datasets from $[-0.2,-0.5,-0.8]$, and set different learning rates from $[2\times 10^{-3}, 2\times 10^{-5}]$ for the transformation layer and the pretrained models.\footnote{As SVD generates an error in the RoBERTa-base model, we exclude it from the GLUE evaluation.}

% \begin{figure*}[t]
%     \centering
%     \includegraphics[trim=150 260 200 60,clip,width=0.90\textwidth]{templates/latex/appendix_bert_colarte.pdf}
%     \caption{CDF for CoLA and RTE dataset. The upper results are from the vanilla BERT, the bottom are from \texttt{BERT+SoftDecay}. Large singular values are more predominant as layers go deeper except for the last layer. Comparing to the dash and solid red line, we notice that \texttt{SoftDecay} can greatly improve the anisotropy in BERT features.}
%     \label{fig:bert_colarte}
% \end{figure*}

\begin{table*}[th]
\centering
\resizebox{0.78\textwidth}{!}{
\begin{tabular}{ll|rr|rr|rr}
\toprule[1pt]
 &  & BERT & +SoftDecay & ALBERT & +SoftDecay & DistilBERT & +SoftDecay \\
 \hline
 \multirow{3}{*}{STS-B} & Evs & 0.6259 & 0.0252 & 0.6987 & 0.0326 &0.7301 & 0.0341 \\
 & RBF$_{dis}$ & -1.4624 & -3.8534 & -1.1602 & -3.8016  & -1.0549 & -3.8052 \\
 & TokenUni & 0.6195 & 0.0274 &0.6983 & 0.036  & 0.7282 & 0.037 \\
 \hline
\multirow{3}{*}{SICK} & Evs & 0.7383 & 0.0212 &  0.7711 & 0.0274  & 0.8135 & 0.0289 \\
 &RBF$_{dis}$ & -1.0323 & -3.8671 & -0.8979 & -3.8268  & -0.7367 & -3.8241 \\
 & TokenUni & 0.7361 & 0.023 & 0.7706 & 0.0295 &0.8130 & 0.0311 \\
  \hline
\multirow{3}{*}{STS-12} & Evs & 0.6219 & 0.0182 &0.7052 & 0.0247  & 0.7321 & 0.0245 \\
 & RBF$_{dis}$ & -1.4785 & -3.8717 & -1.4785 & -1.1438 & -3.8308& -3.8381  \\
 & TokenUni & 0.6193 & 0.0203 & 0.7058 & 0.0273& 0.7021 & 0.0329 \\
  \hline
\multirow{3}{*}{STS-13} & Evs & 0.5823 & 0.0221 &  0.6632 & 0.0287  & 0.7015 & 0.0302 \\
 & RBF$_{dis}$ & -1.6189 & -3.8706 & -1.3032 & -3.8258  & -1.1594 & -3.8262 \\
 & TokenUni & 0.5817 & 0.024 & 0.6637 & 0.031 & 0.7021 & 0.0329 \\
  \hline
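\multirow{3}{*}{STS-14} & Evs & 0.5933 & 0.0151 & 0.6729 & 0.0204 & 0.712 & 0.0202 \\ % TODO: values reordered to match the column headers (cf. the TokenUni row); confirm against the raw results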
 & RBF$_{dis}$ & -1.593 & -3.9124 & -1.2712 &-3.8787  & -1.1288 & -3.8855 \\
 & TokenUni & 0.5929 & 0.016 &0.6743 & 0.0217 & 0.7127 & 0.0215 \\
  \hline
\multirow{3}{*}{STS-15} & Evs & 0.6072 & 0.0183 &0.6827 & 0.0239& 0.7225 & 0.0248 \\
 & RBF$_{dis}$ & -1.5177 & -3.8706 & -1.2178 & -3.8379  & -1.0772 & -3.8313 \\
 & TokenUni & 0.6057 & 0.0216 & 0.6848 & 0.0273   & 0.7228 & 0.0291 \\
  \hline
\multirow{3}{*}{STS-16} & Evs & 0.6049 & 0.0267 &0.6824 & 0.0333& 0.7190 & 0.0363 \\
 &RBF$_{dis}$ & -1.5262 & -3.8375 & -1.5262 & -1.2095 & -3.7952 & -3.7869 \\
 & TokenUni & 0.6054 & 0.0286 & 0.6864 & 0.0360 & 0.7201 & 0.0390 \\
 \bottomrule[1pt]
\end{tabular}
}
\caption{Uniformity metrics (\textit{EVs}, \textit{TokenUni}, \textit{RBF$_\text{dis}$}) evaluate the isotropy of the transformed feature space compared with the vanilla PTLM features. Smaller values mean that the features are more uniformly distributed. It can be seen that \texttt{SoftDecay} can greatly improve the uniformity.}
\label{tab:sts_uniformity}
\end{table*}


\section{Additional results on Semantic Textual Similarity Dataset}

In this section, we first examine the potential reasons for the improvement by comparing the learnt representations from baseline models (i.e., vanilla PTLMs and WhiteningBERT) and our proposed \texttt{SoftDecay} through quantitative evaluation results and visualisation results (see \S\ref{sec:rep_metric} and \S\ref{sec:rep_vis}). We then discuss a comparison between \texttt{SoftDecay} and a representative contrastive learning method, \texttt{SimCSE}~\citep{DBLP:conf/emnlp/GaoYC21}, which also aims to alleviate the anisotropy problem in language representations. % as an example.

\subsection{Feature Evaluation Results on STS Datasets}
\label{sec:rep_metric}
%To further examine the improvements source, 
We show in Table~\ref{tab:sts_uniformity} and Figure~\ref{fig:structureloss_bert} both the uniformity and local neighborhood preservation evaluation results of different methods over the seven STS datasets. %Although the sentence features derived from \texttt{BERT} and \texttt{BERT+SoftDecay} look similar as an imperfect visualization result of TSNE algorithm, the noticeable improvements 
The lower scores returned by \texttt{SoftDecay} in Table~\ref{tab:sts_uniformity} in comparison to the base PTLMs verify its capability of alleviating the anisotropic feature space derived from \texttt{BERT}. In Figure~\ref{fig:structureloss_bert}, \texttt{SoftDecay} preserves the local neighbourhood structure better across all the datasets, which explains its superior performance compared with \texttt{Whitening}, which ignores the original local manifold structure.


% \begin{figure}[ht]
%     \centering
%     \includegraphics[trim=11 60 18 20,clip,width=0.48\textwidth]{templates/latex/structure-loss_bert_v2.pdf}
%     \caption{Structure-Invariance for \texttt{Whitening} and \texttt{Soft-Decay} transformed Representations. Larger value structure-invariance means better preserving the original structure information learnt in pretrained model. }
%     \label{fig:structureloss_bert}
% \end{figure}

\begin{figure}[thb]
    \centering
    \includegraphics[trim=11 30 18 20,clip,width=0.48\textwidth]{sst_lsds.pdf}
    \caption{Local Structure Discrepancy Score (\textit{LSDS}) for \texttt{Whitening} and \texttt{SoftDecay} transformed Representations. Smaller scores are preferred as the original local neighborhood information learnt in the pretrained model is preserved better. }
    \label{fig:structureloss_bert}
\end{figure}

\subsection{Visualisation of Features in STS Datasets}
\label{sec:rep_vis}
We show the representations of sentence pairs generated from \texttt{BERT}, with \texttt{Whitening} and with \texttt{SoftDecay} via tSNE for the remaining five STS datasets in Figure~\ref{fig:appendix_bertsst}. In STSB, STS13 and STS16, the representation mapping results in \texttt{Whitening} are not unit Gaussian due to some \textit{abnormal} data points. %The Uniformity and LSDS metrics are shown in Figure~\ref{fig:structureloss_bert} and Table~\ref{tab:sts_uniformity}. 
Our proposed method \texttt{SoftDecay} gives better uniformity scores than vanilla BERT and better \textit{LSDS} than \texttt{WhiteningBERT}, as shown in Figure~\ref{fig:structureloss_bert} and Table~\ref{tab:sts_uniformity}.

%The sentence features derived from ALBERT and DistilBERT are similar in shape when mapping to the 2D plane, and it lacks quantity information, so we don't display the the TSNE results for these two models. Instead, we calculate the \textit{Uniformity} and \textit{structure-invariance} for the two models.

\begin{figure*}[thb]
    \centering
    \includegraphics[trim= 75 90 75 25, clip,width=0.99\textwidth]{appendix_bert_sts.pdf}
    \caption{The tSNE visualisation of representations of sentence pairs in datasets SICKR, STSB, STS12--16 (except STS15) in different columns. These representations from top to bottom are derived from vanilla \texttt{BERT}, \texttt{BERT+Whitening} and \texttt{BERT+SoftDecay}. For each sentence pair, the two sentences are denoted by different colours, e.g., black and red in \texttt{BERT}. We can see clear clusters in \texttt{BERT} and \texttt{BERT+SoftDecay} for the STS-B, STS-12 and STS-14 datasets.}
    \label{fig:appendix_bertsst}
\end{figure*}

\subsection{Comparison with Contrastive Learning on STS}
\HQ{The objective of contrastive learning methods is to align semantically-related positive data pairs and make the learned representations evenly distributed in the resulting embedding space~\citep{DBLP:conf/icml/0001I20}. The latter property naturally addresses the token uniformity issue. Therefore, we further compare \texttt{SoftDecay} with a representative contrastive learning method, \texttt{SimCSE}~\citep{DBLP:conf/emnlp/GaoYC21}, on STS. As \texttt{SimCSE} needs to be trained on datasets to fine-tune its parameters, we conduct experiments using \texttt{SimCSE} following its original setup: (1) \emph{Unsupervised}. Train the model on 1 million sentences sampled from English Wikipedia\footnote{\href{https://huggingface.co/datasets/princeton-nlp/datasets-for-simcse/resolve/main/wiki1m_for_simcse.txt}{Download link for Sampled English Wikipedia dataset}} and pass the same sentence twice to a pre-trained encoder with standard dropout to generate two different sentence embeddings as positive pairs. Other sentences in the same mini-batch are taken as negative pairs; % use the data pairs after being applied with two different dropout masks on the same data sample as positive pairs, while the two different data samples as negative pair. 
(2) \emph{Supervised}. Train the model on natural language inference datasets, MNLI and SNLI \footnote{\href{https://huggingface.co/datasets/princeton-nlp/datasets-for-simcse/resolve/main/nli_for_simcse.csv}{Download link for the combined NLI dataset}}, and use the annotated entailment and contradictory pairs as positive and negative sentence pairs, respectively. The results are shown in Table~\ref{tab:compare_cl}. It can be observed that \texttt{SoftDecay} outperforms \texttt{SimCSE} in general, %The results are overall better, 
especially under the supervised setting. The end goal of our approach (via increasing the weights of small singular values in the output embedding space) is similar to \texttt{SimCSE} (via random dropout masks) under the unsupervised setting, as both aim to learn an isotropic embedding distribution. However, in the supervised \texttt{SimCSE}, its contrastive loss is calculated on a subset of training pairs, % and their corresponding labels. 
as such, it is relatively difficult to achieve the universal isotropy, which is not the case in our approach.}

\begin{table*}[h!]
\centering
\resizebox{0.72\textwidth}{!}{%
\begin{tabular}{llllllll}
\toprule[1pt]
  \textbf{Model} &STSB & STS-12 &STS-13&STS-14&STS-15&STS-16&SICK-R \\
  \hline
        \multicolumn{8}{c}{\textit{Trained on wiki-text (unsupervised)}} \\
    \texttt{SimCSE}~\citep{DBLP:conf/emnlp/GaoYC21}     & 74.48 & \textbf{66.01} &\textbf{81.48} &\textbf{71.77} &77.55 &76.53 &69.36 \\
    \texttt{SoftDecay} & \textbf{75.81} & 63.25& 78.67& 70.41 &\textbf{79.37}& \textbf{77.69}& \textbf{71.15}\\
\hline
      \multicolumn{8}{c}{\textit{Trained on MNLI and SNLI datasets (supervised)}} \\
    \texttt{SimCSE}~\citep{DBLP:conf/emnlp/GaoYC21}     & 82.26 & \textbf{77.37}& 78.12& 77.81& 84.65& 81.10& 78.73\\
\texttt{SoftDecay} & \textbf{83.51} & 75.31&	\textbf{81.70}&	\textbf{79.88}&	\textbf{86.33}&	\textbf{81.37}&	\textbf{79.04} \\
\bottomrule[1pt]
    \end{tabular}
}
    \caption{Comparison with the contrastive learning method \texttt{SimCSE}. Our method demonstrates overall better results under the supervised setting.}
    \label{tab:compare_cl}
\end{table*}

\section{Additional Results on GLUE datasets}
In this section, we first show the results of comparing \texttt{SoftDecay} with another method, which applies regularisation during training to alleviate the anisotropy issue. Then, we display the Cumulative distribution function (CDF) of singular value distributions before and after applying \texttt{SoftDecay}. 

\subsection{Comparing with  another singular value transformation function}

In addition to Sentence-BERT (\texttt{S-BERT} for short) ~\citep{DBLP:conf/emnlp/ReimersG19} and \texttt{BERT-CT}~\citep{DBLP:conf/iclr/CarlssonGGHS21}, we also compare with another method which applies regularisation on the output embedding matrix with an exponentially decayed singular value prior distribution during training (\texttt{ExpDecay} for short)~\citep{DBLP:conf/iclr/Wang0HHWG20}.

\texttt{ExpDecay} is designed for an encoder-decoder architecture in language generation.
%task, in their paper, machine translation task and they derive 
The singular value distribution of the output embedding matrix is derived from the decoder. This approach is not directly applicable to our setup since we do not use the encoder-decoder architecture here. Nevertheless, we modify our training objective by adding a term on the singular values $\{\lambda_{k}\}_{k=1}^{K}$ of the output feature $X$: $\gamma_{e}\sum_{k=1}^{K}(\lambda_{k}-c_{1}e^{-c_{2}k^{\gamma}})$, where $\gamma_{e}$ is a hyperparameter used to adjust the weight of the added term, and $c_1$, $c_2$ and $\gamma$ are hyperparameters in the desirable exponential prior term of the singular values. We empirically set $c_{1}=c_{2}=1$, $\gamma=2$ and $\gamma_{e}=10^{-4}$.

\HQ{By comparing with the results of \texttt{ExpDecay} in Table~\ref{tab:ExpDecay}, we do not see substantial improvement using the fixed exponential decay term. It can be explained by 1) the difficulty of balancing two losses when adding the exponential decay term to the training objective function; 2) the sensitivity of the hyper-parameters in the prior decay term in \texttt{ExpDecay}. In our method, we only have a single parameter $\alpha$ in Eq. (2) and its value can be automatically adjusted during training to fit the downstream tasks under the supervised setting.} %Note that our method can be used under the unsupervised setting on the STS dataset (i.e., the sentence representations can be directly transformed using our method without any training).}

\begin{table}[h]
    \centering
    \resizebox{0.49\textwidth}{!}{
    \begin{tabular}{lcll}
\toprule[1pt]
\\
    [-1em]
  Dataset (\hq{size})  & BERT & +\texttt{SoftDecay}($\Delta$\%)& +\texttt{ExpDecay}($\Delta$\%) \\
    [-1em]
    \\
    \midrule CoLA(8.5k)&59.57&\textbf{59.84}*($\uparrow$0.45)& 59.37($\downarrow$0.34)\\
    SST2(67k)&92.32&\textbf{93.12}**($\uparrow$0.87)&\textbf{92.43}($\uparrow$1.19)\\
    \midrule
MRPC-Acc(3.7k)&84.00&\textbf{85.20}**($\uparrow$1.43)&83.25($\downarrow$0.89)\\
    MRPC-F1(3.7k)&89.50&\textbf{89.65}($\uparrow$0.17)&87.92($\downarrow$1.21)\\
% QQP-acc(364k) &91.06&\textbf{91.11}($\uparrow$0.05)& & & & \\
% QQP-f1(364k) &87.96&\textbf{88.07}($\uparrow$0.06)& & & \\
    \midrule
QNLI(105k)&91.25&\textbf{91.98}**($\uparrow$0.80)&89.21($\downarrow$2.23)\\
% MNLI-m(393k)& 84.39 & 84.36($\downarrow$0.03) & & &\\
% MNLI-mm(393k)& 84.70 & \textbf{84.82}($\uparrow$2.50) & & & \\
    RTE(2.5k)& 64.98&\textbf{68.23}**($\uparrow$5.00)&64.98($\uparrow$0.00)\\
    % \hline
    % Avg.$\Delta$\%& -&$\uparrow$1.43&-&$\uparrow$1.18&-&$\uparrow$1.10\\
\bottomrule[1pt]
    \end{tabular}
    }
    \caption{Sentence-level classification results on five representative GLUE validation datasets. Matthews correlation is used to evaluate CoLA, Accuracy/F1 is used in other datasets. $\Delta\%$ represents the relative improvement over the baseline. Better results than BERT are in bold. No substantial improvements are observed using \texttt{ExpDecay}.}
    \label{tab:ExpDecay}
\end{table}


\subsection{Singular Value Distribution}
\label{appe:glue_sv}
\paragraph{The effects of dataset size on NLI dataset}
We highlight the different singular value distributions in QNLI and RTE, two datasets for the language inference task \HQ{(see Figure~\ref{fig:qnli_rte})}.

\begin{figure*}[th]
    \centering
    \includegraphics[width=0.68\textwidth,trim={100 350 380 120},clip]{uai2022-template/inference_bert_cdf.pdf}
    \caption{\hq{The CDF of singular values in the QNLI (left) and RTE (right) datasets derived from vanilla \texttt{BERT}}. For the same percentage 0.8, the larger QNLI dataset has a smaller $\Delta L_{i}$ across all the layers, which indicates a more serious token uniformity issue.}
    \label{fig:qnli_rte}
    % \vspace{-10pt}
\end{figure*}


\paragraph{BERT-Based Model Results}
For BERT-based model, we show the CDF of singular values on all the evaluated datasets in Figure~\ref{fig:appendix_bert_glue}. We observe that by applying \texttt{SoftDecay} (bottom row of Figure~\ref{fig:appendix_bert_glue}), the CDF of singular values in the last layer becomes more flattened compared to that in vanilla BERT (top row of Figure~\ref{fig:appendix_bert_glue}). %the larger singular value distributions derived from the last layer suddenly become less predominant comparing to layer 9. 

\paragraph{ALBERT-Based and DistilBERT-based Model Results}
We also show the results for \texttt{ALBERT} (Figure~\ref{fig:albert_glue1} and Figure~\ref{fig:albert_glue2}) and \texttt{DistilBERT} (Figure~\ref{fig:distilbertglue1} and Figure~\ref{fig:distilbertglue2}). By comparing with the vanilla PTLMs (the top row of each figure), %curves in the first row of \texttt{ALBERT} are more gathering means that the singular values distribution derived from different ALBERT layers are more similar. By comparing the gap between red solid and dash line in the bottom row of each figure, 
we notice that the application of \texttt{SoftDecay} has a larger impact on ALBERT compared to DistilBERT, especially on the CoLA dataset. For \texttt{DistilBERT}, its feature space becomes anisotropic gradually as layers go deeper. % without a sudden drop that are seen in BERT and ALBERT. 

\begin{figure*}[tb]
    \centering
    \includegraphics[trim=30 305 135 25,clip,width=0.98\textwidth]{appendix_bert_gluecdf.pdf}
    \caption{Cumulative distribution function (CDF) of singular value distributions. The upper ones are from vanilla \texttt{BERT}, bottom ones are from \texttt{BERT+SoftDecay}. From left to right, the evaluation datasets are SST-2, MRPC, QNLI and CoLA. Different curves represent distributions derived from different model layers. The x-axis represents the normalised singular values sorted in an ascending order. %If a curve reaches $F(x)=1.0$ quicker means that the largest singular value is more predominant, i.e., the EV value is larger and the \textit{uniformity} is worse. It is noticeable that our proposed 
    \texttt{SoftDecay} adjusts the anisotropy of the feature space with the effect more %through the starting point and its increase speed, its adjusting varies from different tasks: most 
    noticeable in MRPC and less obvious in QNLI.}
    \label{fig:appendix_bert_glue}
\end{figure*}

\begin{figure*}[th]
    \centering
    \includegraphics[trim= 30 245 200 120,clip, width=0.98\textwidth]{appendix_albert_nocompare1.pdf}
    \caption{CDF of SST-2, MRPC and QNLI datasets. The upper row results are from the vanilla \texttt{ALBERT}, the bottom ones are from \texttt{ALBERT+SoftDecay}.}
    \label{fig:albert_glue1}
        \vspace{5pt}
    \includegraphics[trim= 40 280 350 30,clip, width=0.65\textwidth]{appendix_albert_gluenocompare2.pdf}
    \caption{CDF of CoLA and RTE datasets. The upper row results are from the vanilla ALBERT, the bottom ones are from \texttt{ALBERT+SoftDecay}.}
    \label{fig:albert_glue2}
\end{figure*}

% \begin{figure*}[t]
%     \centering
%     \includegraphics[trim= 40 280 350 30,clip, width=0.65\textwidth]{appendix_albert_gluenocompare2.pdf}
%     \caption{CDF of CoLA and RTE datasets. The upper row results are from the vanilla ALBERT, the bottom ones are from \texttt{ALBERT+SoftDecay}.}
%     
% \end{figure*}

\begin{figure*}[th]
    \centering
    \includegraphics[trim= 130 175 50 160, clip,width=0.98\textwidth]{appendix_distilbert_glue1.pdf}
    \caption{CDF of SST-2, MRPC and QNLI datasets. The upper row results are from the vanilla DistilBERT, the bottom ones are from \texttt{DistilBERT+SoftDecay}.}
    \label{fig:distilbertglue1}
    \vspace{5pt}
     \includegraphics[trim= 90 210 180 160, clip,width=0.88\textwidth]{appendix_distilbert_glue2.pdf}
      \caption{CDF of CoLA and RTE datasets. The upper row results are from the vanilla DistilBERT, the bottom ones are from \texttt{DistilBERT+SoftDecay}.}
    
     \label{fig:distilbertglue2}
\end{figure*}

% \begin{figure*}[t]
%     \centering
%     \includegraphics[trim= 90 210 180 160, clip,width=0.88\textwidth]{appendix_distilbert_glue2.pdf}
%     \caption{CDF of CoLA and RTE datasets. The upper row results are from the vanilla DistilBERT, the bottom ones are from \texttt{DistilBERT+SoftDecay}.}
%     \label{fig:distilbertglue2}
% \end{figure*}



\FloatBarrier
\clearpage
\bibliography{yan_670}

\end{document}
