\documentclass[accepted]{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{hyperref}
\usepackage{url}
\usepackage{color}
\usepackage{tcolorbox}
\usepackage{CJK}
\usepackage{adjustbox}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{threeparttable}
\usepackage{siunitx}
\usepackage{lipsum}
\usepackage{multirow}
\usepackage{bm}
\usepackage{amssymb}
\usepackage{amsmath,amsthm,mathtools}
\usepackage[normalem]{ulem}
% \usepackage{subcaption}
\usepackage{commath}
\usepackage{wrapfig,lipsum}
\usepackage{enumitem}
\usepackage{subfig}

\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\predictor}{predictor}

% \makeatletter
% \newcommand*{\addFileDependency}[1]{% argument=file name and extension
% \typeout{(#1)}% latexmk will find this if $recorder=0
% % however, in that case, it will ignore #1 if it is a .aux or 
% % .pdf file etc and it exists! If it doesn't exist, it will appear 
% % in the list of dependents regardless)
% %
% % Write the following if you want it to appear in \listfiles 
% % --- although not really necessary and latexmk doesn't use this
% %
% \@addtofilelist{#1}
% %
% % latexmk will find this message if #1 doesn't exist (yet)
% \IfFileExists{#1}{}{\typeout{No file #1.}}
% }\makeatother

% \newcommand*{\myexternaldocument}[1]{%
% \externaldocument{#1}%
% \addFileDependency{#1.tex}%
% \addFileDependency{#1.aux}%
% }

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
% \usepackage{xr} 
% \myexternaldocument{li_516}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{\texttt{CUE}: An Uncertainty Interpretation Framework for Text Classifiers Built on Pre-Trained Language Models\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{Jiazheng Li}
\author[2]{Zhaoyue Sun}
\author[3]{Bin Liang}
\author[1]{Lin Gui}
\author[1,2,4]{Yulan He}
% Add affiliations after the authors
\affil[1]{%
 Department of Informatics, King’s College London, UK
}
\affil[2]{%
    Department of Computer Science, University of Warwick, UK
}
\affil[3]{%
    Joint Lab of HITSZ-CMS, Harbin Institute of Technology, Shenzhen, China
  } 
\affil[4]{%
    The Alan Turing Institute, UK
  }
  
  \begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle

\section{Derivations}

% \subsection{Notations}
% We restate our notations for readability. Under the pre-train and fine-tune paradigm, the input document of a PLM is denoted as $\bm{x}$, and the gold label is denoted as $y$, where $y\in\mathcal{Y}$, and $\mathcal{Y}$ is the label set. Let $\bm{\vartheta}$ be the parameters of the PLM layers, $\bm{\eta}$ be the parameters of the task-specific classification layer, and $\bm{e}$ be the text representation generated by the PLM. Then the prediction of the fine-tuned model can be represented as:
% We denote the parameters of the PLM as $\bm{\vartheta}$ and the parameters of the task-specific classification layer as $\bm{\eta}$. 
% Let $\bm{e}$ be the text representation generated by the PLM
%the PLM layer $\bm{\vartheta}$ are pre-trained and fine-tuned to learn the the distribution $p_{\bm{\vartheta}}(\bm{e} | \bm{x})$, and the classification layer $\bm{\eta}$ is fine-tuned to learn the distribution $p_{\bm{\eta}}(y | \bm{e})$.
%The PLM $\bm{\vartheta}$ is fine-tuned to learn the distribution over input text $\bm{x}$ to generate text representation $\bm{e}$, this process is denoted as $p_{\bm{\vartheta}}(\bm{e} | \bm{x})$. 
% The text classifier $\bm{\eta}$ is fine-tuned to learn the distribution over text representation $\bm{e}$ and to give the gold label $y$, this process is denoted as $p_{\bm{\eta}}(y | \bm{e})$. 
% The fine-tuned model is responsible to make a prediction based on the given text $\bm{x}$, and the classification result $\hat{y}$ is given by:
% \begin{equation}
%     \hat{y}=\argmax_{y\in\mathcal{Y}} p(y | \bm{x}; \bm{\vartheta}, \bm{\eta}) = \argmax_{y\in\mathcal{Y}} p_{\bm{\eta}}(y | \bm{e}) p_{\bm{\vartheta}}(\bm{e} | \bm{x})  \notag  
% \end{equation}
% Once a classifier built on a PLM is fine-tuned on a target dataset, we freeze the parameters of the PLM and the classification layer and then insert a VAE between the PLM last layer and the task-specific classification layer. 
% Recall that the plug-in VAE framework proposed in our paper contains an encoder network and a decoder network.
% First, the PLM-encoded representation $\bm{e}$ for input text $\bm{x}$ is mapped to a latent vector, denoted by $\bm{z}$, via a VAE which consists of two networks. 
% The encoder network learns the distribution of a lower dimensional latent variable $\bm{z}$ given the PLM-encoded representation $\bm{e}$: $q_{\bm{\phi}}(\bm{z}|\bm{e})$, while the decoder network reconstructs the input $\bm{e}'$ given the latent variable $\bm{z}$: $p_{\bm{\theta}}(\bm{e}'|\bm{z})$, where $\bm{\phi}$ and $\bm{\theta}$ are the sets of parameters in the encoder and decoder respectively. The classifier's prediction on the reconstructed representation $\bm{e}'$ is denoted as $\hat{y}'$. The derivation of this paper follows the following assumption:

% \begin{assumption}
% \label{app:assum1}\textit{Latent vector $\bm{z}$ is distributed as a multivariate Gaussian and dimensions of variable $\bm{z}$ are independent of each other, i.e., $\bm{z} \sim \mathcal{N}(\bm{\mu},\rm \diag(\bm{\sigma}^2))$}. %, $\bm{z} = q_\phi(\bm{z}|\bm{e}) = \mu(e) +\epsilon \cdot \sigma(e)$.}
% \end{assumption}

% Here $\bm{\mu}$ is the mean vector and $\diag(\bm{\sigma}^2)$, which stands for the diagonal matrix of the vector $\bm{\sigma^2}$, is the covariance matrix of the Gaussian distribution. 
% % We use $\mathbb{E}_{q_{\phi}(\bm{z}|\bm{e})}[\bm{z}]=\mu_{\phi}(\bm{z}|\bm{e})$ and $\mathbb{E}_{p_{\theta}(\bm{e}'|\bm{z})}[\bm{e}']=\mu_{\theta}(\bm{e}'|\bm{z})$ to denote these two networks' predictive means. 

% In addition, we use $[n]$ to represent $\{1,2,...,n\}$, 
% % and $|S|$ to stand for the size of a set $S$. 
% $\mathbb{E}$ to represent the mathematical expectation and $\mathbb{D}$ to represent the  variance. We use $||\cdot||$ to denote the operator/spectral norm of matrices and $\mathcal{L}_{2}$-norm for vectors.

\subsection{Decomposition of the Predictive Uncertainty}
We show how we decompose the Mean Squared Error (MSE) based predictive uncertainty into the epistemic uncertainty and the aleatoric uncertainty mentioned in \textsection{3.2}. 
%Here $y$ is the gold classification label from a dataset, while $\hat{y}$ is the predicted class label.
% \begin{align}
%  \mathbb{E}\big[\big(y_i - \hat{y}_i\big)^2\big] \nonumber 
%     & = \underbrace{\mathbb{E}[(y_i - \mathbb{E}[y])^2]}_{\rm aleatoric\,uncertainty} + \underbrace{\mathbb{E}\big[\mathbb{E}[y] - \hat{y}_i)^2\big]}_{\rm epistemic\,uncertainty}
% \end{align}
\begin{align}
    \mathbb{E}\big[(y_i - \hat{y}_i)^2\big] 
    &= \mathbb{E}\big[(y_i - \mathbb{E}[y] + \mathbb{E}[y] - \hat{y}_i)^2\big] \notag\\
    &=\mathbb{E}\big[(y_i - \mathbb{E}[y])^2\big] + \mathbb{E}\big[(\mathbb{E}[y] - \hat{y}_i)^2\big] + 2\mathbb{E}\big[(y_i - \mathbb{E}[y])(\mathbb{E}[y] - \hat{y}_i)\big] \notag\\
    &= \mathbb{E}[(y_i - \mathbb{E}[y])^2] + \mathbb{E}[(\mathbb{E}[y] - \hat{y}_i)^2] + 2(\mathbb{E}[y_i] - \mathbb{E}[y])(\mathbb{E}[y] - \mathbb{E}[\hat{y}_i]) \notag\\ 
    & = \underbrace{\mathbb{E}[(y_i - \mathbb{E}[y])^2]}_{\rm aleatoric\,uncertainty} + \underbrace{\mathbb{E}[(\mathbb{E}[y] - \hat{y}_i)^2]}_{\rm epistemic\,uncertainty} \notag
\end{align}
Here, $\mathbb{E}[y]$ denotes the expectation of the ground truth label distribution. Since the first term, $\mathbb{E}[(y_i - \mathbb{E}[y])^2]$, contains the observed $y_i$, it can be defined as the aleatoric uncertainty. The second term represents the epistemic uncertainty since it contains the predicted $\hat{y}_i$.
% \begin{align}
% \notag
%     \underbrace{\mathbb{E}[(y-\hat{y})^2]}_{\rm total\,uncertainty} &= \mathbb{E}[(y - \mathbb{E}[y] + \mathbb{E}[y] - \hat{y})^2] \\ 
% \notag
%     &=\mathbb{E}[(y - \mathbb{E}[y])^2] + \mathbb{E}[(\mathbb{E}[y] - \hat{y})^2] + 2\mathbb{E}[(y - \mathbb{E}[y])(\mathbb{E}[y] - \hat{y})] \\ 
% \notag
%     &= \mathbb{E}[(y - \mathbb{E}[y])^2] + \mathbb{E}[(\mathbb{E}[y] - \hat{y})^2] + 2(\mathbb{E}[{y}] - \mathbb{E}[y])(\mathbb{E}[y] - \mathbb{E}[\hat{y}]) \\ 
% \notag
%     &= \underbrace{\mathbb{E}[(y - \mathbb{E}[y])^2]}_{\rm aleatoric\,uncertainty}+\underbrace{\mathbb{E}[(\mathbb{E}[y] - \hat{y})^2]}_{\rm epistemic\,uncertainty} \nonumber
% \end{align}
% Rearranging the above equation, we have:
% \begin{align}
% \notag
%      \underbrace{\mathbb{E}[(y - \mathbb{E}[y])^2]}_{\rm aleatoric\,uncertainty}  &= \underbrace{\mathbb{E}[(y-\hat{y})^2]}_{\rm total\,uncertainty} - \underbrace{\mathbb{E}[(\mathbb{E}[y] - \hat{y})^2]}_{\rm epistemic\,uncertainty} \notag
%     %  \mathbb{D}[y] &= \mathbb{E}[(y-\hat{y})^2] - \mathbb{E}[(\mathbb{E}[y] - \hat{y})^2] \notag
% \end{align}
% Since the total uncertainty is fixed. We can therefore cause aleatoric uncertainty fluctuation to estimate the epistemic uncertainty.

% \subsection{Derivation of the Predictive Entropy Increment Upper-bound}
\subsection{Interpreting Entropy Change with Reconstruction Difference}
% Bounding loss for an arbitrary variable}
% In the following, we show that the increment of the predictive entropy caused by the representation perturbation has an upper bound and is proportional to $||\bm{e}'-\bm{e}||^2$, which is mentioned in Section 4.1.
% \newpage
% % Considering the reparameterisation trick with
% According to Assumption \ref{app:assum1}, using the reparameterisation trick with the latent code $\bm{z}\sim\mathcal{N}(\bm{\mu},\rm diag(\bm{\sigma}^2))$, 
% % , where $\mu$ and $\sigma$ are the mean value and covariance matrix respectively. 
% we can reconstruct the representation $\bm{e}'$ from a given text representation $\bm{e}$ by:
% \begin{align}
%     \bm{z} &= \bm{\mu}(\bm{e}) +\bm{\epsilon} \cdot \bm{\sigma}(\bm{e})\notag \\ 
%     % e' &= f(z) \\ \notag
%     \bm{e}' &= p_{\bm{\theta}}(\bm{z})\notag \\
%     % y' &= {\rm pred}(e') \\ \notag
%     \hat{y}' &= p_{\bm{\eta}}(\bm{e}') \notag
% \end{align}
% In our learning objective, we aim to estimate the uncertainty by adding the noise to increase the predictive entropy while keeping the classification results unchanged. % the entropy in prediction without changing it. 
% Then, we have 
% \begin{equation}
%     0 \leq H_{(\bm{e})}(\hat{y}) \leq H_{(\bm{e}')}(\hat{y}') \leq {\rm log}K  \notag
% \end{equation}
% Assuming that the difference between text representation $\bm{e}$ and the reconstructed representation $\bm{e}'$ is $\bm{u}$, $\bm{u} = \bm{e}' - \bm{e} $, and the prediction is obtained from the Softmax function. Let $\bm{U}$ be the maximum distance of $\bm{e}'-\bm{e}$ that causes the new $\bm{e}'$ confuse the classifier (i.e. when the predictive class probability of any of the $K$ classes is $\frac{1}{K}$). Then, according to the Jensen inequality, the prediction is bounded by: 
% \begin{align}
% \notag
%     \hat{y}' &\leq t \cdot {\rm softmax}(\bm{e}) + (1-t){\rm softmax}(\bm{e}+\bm{U}) \\ \notag
%     & =   t \cdot {\rm softmax}(\bm{e}) + (1-t)\frac{1}{K} \notag
% \end{align}
% where $0 \leq t \leq 1$ and $H_{(\bm{e}+\bm{U})}(\hat{y}') = {\rm log}K$. Considering the convexity of entropy, we have:
% \begin{align}
% \notag
%     \Delta \mathcal{H} &= \mathcal{H}_{\bm{e}'}(\hat{y}') - \mathcal{H}_{\bm{e}}(\hat{y})\\ \notag
%     & \leq - (t \cdot p(\hat{y}) +  \frac{(1-t)}{K}) {\rm log} (t \cdot p(\hat{y}) + \frac{(1-t)}{K}) + p(\hat{y}){\rm log}p(\hat{y})\\ \notag
%     & \leq - t \cdot p(\hat{y}) {\rm log} p(\hat{y}) - (1-t) \cdot \frac{1}{K} {\rm log} (\frac{1}{K}) + p(\hat{y}){\rm log}p(\hat{y})\\ \notag
%     & = (1-t)({\log} K - \mathcal{H}_{\bm{e}}(\hat{y}) ) \\\notag
%     & = \frac{||\bm{e}'-\bm{e}||^2 \cdot ({\log} K - \mathcal{H}_{\bm{e}}(\hat{y}) )}{||U||^2} \\
%     &\propto ||\bm{e}'-\bm{e}||^2   \notag
% \end{align}
% Therefore, our loss function is a convex function and its upper bound is correlated with the $\mathcal{L}_{2}$-norm between $\bm{e}$ and $\bm{e}'$.
In this subsection, we provide detailed derivation corresponding to the predictive entropy upper-bound mentioned in \textsection{4.1}.

In our learning objective, we aim to estimate the uncertainty by adding the noise to increase the predictive entropy while keeping the classification results unchanged. For a Softmax-based classifier, the predictive uncertainty reaches the maximum when probabilities are uniformly distributed (i.e. when the predictive class probability of any of the $K$ classes is $\frac{1}{K}$).  % the entropy in prediction without changing it. 
Then, we have 
\begin{equation}
    0 \leq \mathcal{H}_{\bm{e}_i}(\hat{y}_i) \leq \mathcal{H}_{\bm{e}'_i}(\hat{y}'_i) \leq {\rm log}K  \notag
\end{equation}
Assume that the difference between the text representation $\bm{e}_i$ and the reconstructed representation $\bm{e}'_i$ is $\bm{u} = \bm{e}'_i - \bm{e}_i$, and that the prediction is obtained from the Softmax function. Let $\bm{U}$ be the maximum distance of $\bm{e}'_i-\bm{e}_i$ that causes the new $\bm{e}'_i$ to confuse the classifier (i.e., when the predictive class probability equals $\frac{1}{K}$). Then, according to Jensen's inequality, the prediction is bounded by: 
\begin{align}
\notag
    \hat{y}_i' &\leq t \cdot {\rm softmax}(\bm{e}_i) + (1-t){\rm softmax}(\bm{e}_i+\bm{U}) \\ \notag
    & =   t \cdot {\rm softmax}(\bm{e}_i) + (1-t)\frac{1}{K} \notag
\end{align}
We thus have $\bm{e}_i \leq \bm{e}_i' \leq (\bm{e}_i + \bm{U})$, where $0 \leq t \leq 1$ and $\mathcal{H}_{(\bm{e}_i+\bm{U})}(\hat{y}_i') = {\rm log}K$. Considering the convexity of entropy, we have:
\begin{small}
\begin{align}
    \Delta \mathcal{H} &= \mathcal{H}_{\bm{e}_i'}(\hat{y}_i') - \mathcal{H}_{\bm{e}_i}(\hat{y}_i) \notag\\
    &\leq - (t \cdot p(\hat{y}_i) +  \frac{(1-t)}{K}) {\rm log} (t \cdot p(\hat{y}_i) + \frac{(1-t)}{K}) + p(\hat{y}_i){\rm log}p(\hat{y}_i) \notag\\
    & \leq - t \cdot p(\hat{y}_i) {\rm log} p(\hat{y}_i) - (1-t) \cdot \frac{1}{K} {\rm log} (\frac{1}{K}) + p(\hat{y}_i){\rm log}p(\hat{y}_i) \notag\\
    & = (1-t)({\log} K - \mathcal{H}_{\bm{e}_i}(\hat{y}_i) ) \notag\\
    & = \frac{||\bm{e}_i'-\bm{e}_i||^2 \cdot ({\log} K - \mathcal{H}_{\bm{e}_i}(\hat{y}_i) )}{||\bm{U}||^2} \label{eq:entropy}\\
    &\propto ||\bm{e}_i'-\bm{e}_i||^2 \notag
\end{align}
\end{small}

The above derivation demonstrates that the generated perturbation guarantees an upper bound on the predictive entropy difference $\Delta\mathcal{H}$, and that the variation of the entropy $\Delta\mathcal{H}$ is proportional to the reconstruction error $||\bm{e}'_i-\bm{e}_i||^2$, which can thus be used to interpret the predictive uncertainty.

% Hence, we can prove our framework guarantee the upper bound of predictive entropy difference $\Delta\mathcal{H}$, which also directs the learning objective of our loss function. As shown above, the variation of the entropy, $\Delta\mathcal{H}$, is proportional to the $||\bm{e}'-\bm{e}||^2$, which can thus be used to interpret the uncertainty in section 4.2.

% \subsection{Interpreting the Functionality of Loss}
% % We interpret the functionality of the combination of our loss functions and demonstrate the objective of our loss function in here as mentioned in the paper Section 4.1.
% Our training process is equivalent to using the multi-Gaussian process to learn the training data's Probability Density Function (PDF). As has been discussed by \citet{hullermeier2021aleatoric}, for an unseen data point, the reconstruction error can be used as a measure to quantify if the data point falls within the underlying PDF. 
% % The aforementioned training process is essentially equivalent to performing perturbation in the latent space to generate a semi-factual such that it resides on the original data manifold but causes epistemic uncertainty.

% Given an input representation $\bm{e}_i$, we can sample a reconstructed representation $\bm{e}'$ from our proposed framework. Then the conditional probability of $P(\bm{e}_i'|\bm{e}_i)$ can be given by Nadaraya-Watson estimator \citep{bierens1988nadaraya}:
% \begin{align}
% \notag
%      P(\bm{e}_i'|\bm{e}_i) &= \frac{P(\bm{e}_i',\bm{e}_i)}{P(\bm{e}_i)} \\ 
% \notag
%      &= \frac{\frac{1}{N} \Sigma_{j=1}^{N} \kappa(\frac{\bm{e}_i'-\bm{e}_j}{h}) \cdot \kappa(\frac{\bm{e}+i-\bm{e}_j}{h})}{\frac{1}{N}  \Sigma_{j=1}^{N} \kappa(\frac{\bm{e}_i-\bm{e}_j}{h})} 
% \notag
% \end{align}
% where $\kappa$ is a kernel function with parameter $h$, $\{\bm{e}_j\}_{j=1}^N$ is the set of samples from the training set. 
% Since the input feature for ${\bm{e}_j}$ is fixed, we only care about the updating of reconstruction through encoder-decoder architecture. Thus we have:
% \begin{align}
% \notag
%      P(\bm{e}_i'|\bm{e}_i) &\propto \frac{1}{N} \Sigma_{j=1}^{N} \kappa(\frac{\bm{e}_i'-\bm{e}_j}{h}) \cdot \kappa(\frac{\bm{e}_i-\bm{e}_j}{h})
% \end{align}
% To simplify the above estimation, we apply the RBF-kernel and triangle inequality in estimation and have:
% \begin{align}
% \notag
%      {\rm log} P(\bm{e}_i'|\bm{e}_i) &\propto \frac{1}{N} \Sigma_{j=1}^{N} \kappa(\frac{\bm{e}_i'-\bm{e}_j}{h}) \cdot \kappa(\frac{\bm{e}_i-\bm{e}_j}{h}) \\ \notag
%      &={\rm log}\frac{1}{N}\sum_{j=1}^N {\rm exp}(- ||\bm{e}_i'-\bm{e}_j||^2) \cdot {\rm exp}(- ||\bm{e}_i-\bm{e}_j||^2) \\ \notag
%    &={\rm log}\frac{1}{N}\sum_{j=1}^N {\rm exp}(- ||\bm{e}_i'-\bm{e}_j||^2 - ||\bm{e}_i-\bm{e}_j||^2) \\ \notag
%    &\leq {\rm log}\frac{1}{N}\sum_{j=1}^N {\rm exp}(- ||\bm{e}_i'-\bm{e}_i||^2) \\ \notag
%    &={\rm log}[{\rm exp}(- ||\bm{e}_i'-\bm{e}_i||^2)] \\ \notag
% \end{align}
% Thus, we are able to optimise ${\rm log} P(\bm{e}_i'|\bm{e}_i) $ by the mean squared error if we choose the natural logarithm function:
% \begin{align}
% \notag
%      {\rm ln} P(\bm{e}_i'|\bm{e}_i) \leq - ||\bm{e}_i'-\bm{e}_i||^2
% \end{align}
% % The reconstruct based predictive probability of $\hat{y}'$ 
% Considering to re-write the conditional predictive probability of $\hat{y}_i'$ with the reconstructed $\bm{e}_i'$:
% \begin{align}
% \notag
%      P(\hat{y}_i'|\bm{e}_i) = P(\hat{y}_i'|\bm{e}_i') \cdot P(\bm{e}_i'|\bm{e}_i) 
% \end{align}
% By taking a logarithm at both sides, we have:
% \begin{align}
% \notag
%      {\rm ln}P(\hat{y}_i'|\bm{e}_i) &= {\rm ln} P(\hat{y}_i'|\bm{e}_i') \cdot P(\bm{e}_i'|\bm{e}_i) \\
% \notag
%      &\leq -||\bm{e}_i'-\bm{e}_i||^2 + {\rm ln} P(\hat{y}_i'|\bm{e}_i') 
% \notag
% \end{align}
% Therefore, for a given $\bm{e}_i$ and the reconstructed $\bm{e}_i'$, the difference between the original predictive probability $\hat{y}_i$ and the reconstruction based predictive probability $\hat{y}_i'$ can be measured by KL-divergence as:
% \begin{align}
% \notag
%      {\rm KL}_e(\hat{y}_i',\hat{y}_i) &= p(\hat{y}_i|\bm{e}_i){\rm log}\frac{p(\hat{y}_i|\bm{e}_i)}{p(\hat{y}_i'|\bm{e}_i)} \\ \notag
%      &= p(\hat{y}_i|\bm{e}_i){\rm log}p(\hat{y}_i|\bm{e}_i) - p(\hat{y}_i|\bm{e}_i){\rm log}p(\hat{y}_i'|\bm{e}_i) \\ \notag
%      &= p(\hat{y}_i|\bm{e}_i){\rm log}p(\hat{y}_i|\bm{e}_i) - p(\hat{y}_i|\bm{e}_i)\cdot(-||\bm{e}_i'-\bm{e}_i||^2 + {\rm log} P(\hat{y}_i'|\bm{e}_i')) \\ \notag
%      &= p(\hat{y}_i|\bm{e}_i) \cdot ||\bm{e}_i'-\bm{e}_i||^2 + p(\hat{y}_i|\bm{e}_i){\rm log}p(\hat{y}_i|\bm{e}_i) - p(\hat{y}_i|\bm{e}_i){\rm log}P(\hat{y}_i'|\bm{e}_i') \\ \notag
%      &= p(\hat{y}_i|\bm{e}_i) \cdot ||\bm{e}_i'-\bm{e}_i||^2 + p(\hat{y}_i|\bm{e}_i){\rm log} \frac{p(\hat{y}_i|\bm{e}_i)}{p(\hat{y}_i'|\bm{e}_i')} \notag
% \end{align}
% Here, the $p(\hat{y}_i|\bm{e}_i)$ is fixed in the training progress. Then we can obtain that: 
% \begin{align}
% \notag
%      {\rm KL}_e(\hat{y}_i',\hat{y}_i) &\propto ||\bm{e}_i'-\bm{e}_i||^2 + {\rm KL}(p(\hat{y}_i|\bm{e}_i),p(\hat{y}_i'|\bm{e}_i'))
% \end{align}
% Therefore, in the loss function in our proposed method, we are able to use the above term to measure the similarity between the prediction based on the reconstructed representation and the prediction based on the original input representation. It is worth noting that both the terms of $\Delta \mathcal{H}$ and ${\rm KL}_{\bm{e}_i}(\hat{y}_i',\hat{y}_i)$ are related to $||\bm{e}_i'-\bm{e}_i||^2$ but with opposite directions for optimisation. That is, we need to make a trade-off on the optimisation of $||\bm{e}_i'-\bm{e}_i||^2$ to guarantee that the difference between $\bm{e}_i'$ and $\bm{e}_i$ should be significant but without changing the prediction result. That is why we introduce an entropy term into the loss function with a weight of $\lambda$ to control the influence of this term. 

% This conclusion might be related to the theory of information bottleneck \citep{Bang_Xie_Lee_Wu_Xing_2021} where  $||\bm{e}_i'-\bm{e}_i||^2$ reflects the information bottleneck and ${\rm KL}(p(\hat{y}_i|\bm{e}_i),p(\hat{y}_i'|\bm{e}_i'))$ represent the decoding result.
\section{Uncertain Feature Identification Algorithm}
In this section, we provide the detailed implementation of the Uncertain Feature Identification algorithm built in the \texttt{CUE} framework corresponding to \textsection{4.2}. Intuitively, we use greedy search to find a locally optimal solution by identifying the most influential latent dimensions of $\bm{z}_i$ first and then estimating the influential score for each token (see Algorithm~\ref{alg:UFI_alg}). That is, we can identify input tokens that are most similar to the influential representation vector ${\bm{r}_{z_i}}_d$ as the ones which cause predictive uncertainty by the inner product in the metric space. More concretely, assuming the PLM-encoded representation for token $j$ is ${\bm{e}_{i}}_j$, we can compute each token's importance score by ${\rm{token}^j}_{\rm score}= \langle {\bm{r}_{z_i}}_d,{\bm{e}_{i}}_j  \rangle$. By sorting ${\rm{token}^j}_{\rm score}$ in descending order, we can identify input tokens that cause predictive uncertainty.

The identification of the source of the uncertainty highly relies on the influence of latent dimensions. In practice, we use a threshold $\alpha$ to select the most similar dimensions of $\Delta \bm{e}_i$ to construct a combination of the most influential uncertain representation ${\bm{r}_{z_i}}_D$. The threshold $\alpha$ can be defined with the help of the average entropy curve and ECE histograms from the dimension importance analysis described in \textsection{5.2}.

\begin{algorithm}[h]
\small
\caption{Uncertain Feature Identification}
\label{alg:UFI_alg}
\textbf{Input:} original text representation $\bm{e}_i$, reconstructed text representation $\bm{e}'_i$, \texttt{CUE} decoder $\mu_{\bm{\theta}}$, token representations $\{{\bm{e}_{i}}_1,{\bm{e}_{i}}_2,\cdots,{\bm{e}_{i}}_n\}$, tokens $\{\mathrm{token}_1,\mathrm{token}_2,\cdots,\mathrm{token}_n\}$, threshold $\alpha$.
\normalsize
\begin{algorithmic}
\STATE $\Delta \bm{e}_i = \bm{e}'_i - \bm{e}_i$
\FOR {the $d$th dimension ${\bm{z}_{i}}_d$ in $\bm{z}_i$}
    \STATE ${\bm{r}_{z_i}}_d = \mu_{\bm{\theta}}({\bm{z}_{i}}_d)$ 
    \STATE ${\rm dim}^{d}_{\rm score}= \langle \Delta \bm{e}_i, {\bm{r}_{z_i}}_d \rangle$
\ENDFOR
\FOR {$\mbox{sort}({\bm{r}_{z_i}}_d, key=\phi({\rm dim}^{d}_{\rm score}, {\bm{r}_{z_i}}_d))[:\alpha]$}
    \STATE ${\bm{r}_{z_i}}_D \mathrel{+}= {\bm{r}_{z_i}}_d$
\ENDFOR
\FOR {${\bm{e}_{i}}_j$ in $\{{\bm{e}_{i}}_1,{\bm{e}_{i}}_2,\cdots,{\bm{e}_{i}}_n\}$}
    \STATE ${\rm{token}^j}_{\rm score}=  \langle {\bm{r}_{z_i}}_D, {\bm{e}_{i}}_j  \rangle$
\ENDFOR
\STATE \textbf{return} $\mbox{sort}(\rm{token}_j, key=\phi({\rm{token}^j}_{\rm score}, \rm{token}_j))$
\end{algorithmic}
% \small
% \textbf{Output:} Sorted tokens by the influential score in decreasing order.
\end{algorithm}

\section{Experimental Setup}
In this section, we provide detailed dataset statistics, baseline setup, evaluation metrics and hyperparameter settings as mentioned in \textsection{5}.

\paragraph{Datasets}

We evaluate our proposed framework on four datasets for \emph{linguistic acceptability classification}, \emph{natural language inference}, and \emph{emotion classification}. The dataset statistics are shown in Table \ref{table:dataset}.

\begin{table}[h]
\centering
% \resizebox{\columnwidth}{!}{
\begin{tabular}{lrrrr}
\toprule
Datasets &   CoLA & MultiNLI & Emotion & GoEmotions \\ \midrule
Classes  &  2 & 3 & 6 & 27  \\   \midrule
 Train   &   8,551   &  392,702     &   16,000 &   43,410 \\ 
 Dev     &   1,043   &    20,000    &   2,000  &   5,427  \\ 
 Test    &   1,043   &    20,000    &   2,000  &   5,426  \\ \midrule
Total    &   10,637  &    432,702   &   20,000 &   54,263 \\
\bottomrule
\end{tabular}
\caption{Statistics of the datasets.}
\label{table:dataset}
\end{table}
\noindent\underline{Linguistic Acceptability Classification}. The CoLA (Corpus of Linguistic Acceptability) %is from the GLUE benchmark 
\citep{cola} %,wang2018glue}. This dataset 
contains sentences %from 23 linguistics publications %from books and journal articles, which are 
annotated as \emph{grammatically acceptable} or \emph{not}.
%to judge whether the sentence is a grammatical sentence. %We use this dataset to exam the ability of our latent variable model on the ability of grammatical classification and binary classification.

\noindent\underline{Natural Language Inference}. The MultiNLI \citep{dataset_multinli} %,wang2018glue} 
dataset contains annotations for relations of \emph{entailment}, \emph{contradiction}, and \emph{neutrality} between sentence pairs.%from the GLUE benchmark to verify our method's effectiveness on classification for sentence pairs, in which each 
% contains sentence pairs, %each of which is annotated with one of the three categories, \emph{entailment}, \emph{contradictory}, and \emph{neutral}. %Training and testing split is the same as in the original dataset release. All the result we report for Multi NLI are test on the matched testset.

\noindent\underline{Emotion Classification}. %We use two popular multi-class emotion classification datasets, GoEmotions \citep{dataset_goemotions} and Emotion \citep{dataset_emotion}. 
The GoEmotions \citep{dataset_goemotions} dataset annotates Reddit comments with twenty-seven emotion labels (e.g., \emph{fear} and \emph{admiration}). %is a multi-label emotion classification dataset consisting of 54k Reddit comments categorised into 27 emotion labels.
The Emotion \citep{dataset_emotion} dataset classifies English tweets into six emotion classes (e.g., \emph{sadness} and \emph{joy}). Note that the GoEmotions dataset allows a multi-label setting in which a sentence can be annotated with more than one emotion label. In our experimental setup, we focus only on multi-class classification, and we thus filtered out those instances annotated with multiple labels in GoEmotions.



\paragraph{Baselines}

We compare our method with the following baselines:

\noindent\underline{Label Smoothing} \citep{ Gupta_Kvernadze_Srikumar_2021} is commonly used to deal with overfitting %or overconfident problem 
when using cross-entropy loss on classification tasks. It smooths the label distribution towards the uniform distribution to encourage small logit gaps and has been shown to be effective in calibrating PLM-based classifiers. %existing researches  %have proved its capability on PLM calibration. 

\noindent\underline{MC Dropout} \citep{monte_carlo_dropout} %, Monte-Carlo (MC) dropout 
is an uncertainty estimation technique that performs multiple stochastic forward passes by randomly switching neurons off to generate an ensemble of predictions. We follow the implementation of \citet{acl2022_uncertainty_transformers} in our experiments. 
% is a popular regularisation technique by applying Monte Carlo samples from the space of available models by randomly switching neurons off to generate different outputs. This method is similar to approximating variational inference in a deep Gaussian process %, it uses the approximating variational distribution with Bernoulli variables related to network units 
% \citep{acl2022_uncertainty_transformers}.

\noindent\underline{Bayesian Neural Network (BNN)} \citep{8371683} assumes weights of neural networks are random variables with a prior distribution, is thus able to obtain more robust predictions by sampling the network weights during inference, and is often used for uncertainty estimation. Motivated by \citet{getting_clue}, we also implemented a BNN plug-in framework as a comparison with our \texttt{CUE} framework. Specifically, we use a Bayesian linear layer\footnote{\url{https://github.com/piEsposito/blitz-bayesian-deep-learning}} as the encoder and a linear layer as the decoder, and then insert them between the PLM-encoding layer and the classification layer, similar to the way we ensemble the \texttt{CUE}.
%\citet{getting_clue,8371683} BNN is typical way to learn a set of prior parameters from training data that gives predictions  distribution to model the epistemic uncertainty and doing Bayesian inference on its weights. This enables the predictor fits on the training data and reasons about the uncertainty of its own prediction on test data. 
% The output of the BNN is based on distribution of sampled historical prior information. 
%In the experiments, we implemented this method with a Bayesian linear layer\footnote{\url{https://github.com/piEsposito/blitz-bayesian-deep-learning}} as encoder and adopt a classic linear layer as decoder, insert between %We insert a BNN layer between 
%the PLM-encoding layer and the classification layer. 
%The purpose is to introduce uncertainty to the network parameters. This allows us to additionally reduce the variance of model predictions apart from minimising the cross-entropy loss during training so as to obtain a more robust model. We implement the BNN layer using the  Blitz library\footnote{\url{https://github.com/piEsposito/blitz-bayesian-deep-learning}}.

% \noindent\underline{SNGP} \citep{sngp} A Spectral-normalized Neural Gaussian Processes approach to attain an uncertainty performance.

\paragraph{Evaluation Metrics}

We use accuracy (Acc) and macro-averaged F1 (F1) to evaluate the classification results, and Expected Calibration Error (ECE) \citep{desai-durrett-2020-calibration} calculated on predictive probabilities during inference to measure model calibration. 
% Details of the ECE calculation can be found in Appendix \ref{sec:exp-setup}. 
% %For model uncertainty measurement, we follow  \citet{desai-durrett-2020-calibration} to adopt expected calibration error (ECE) calculated on predictive probabilities during inference. Detail of ECE calculation can be found in Appendix \ref{sec:exp-setup}.
% \paragraph{ECE calculation}
For ECE implementation, we use the formula provided by \citet{softmax_uncertain} as follows:
\begin{small}
\begin{gather}
    \text{acc}(B_m) = \frac{1}{\lvert B_m\rvert} \sum_{i\in B_m} \mathbf{1}(\hat{y}_i = y_i), \quad\quad
    \text{conf}(B_m) = \frac{1}{\lvert B_m\rvert} \sum_{i\in B_m} \hat{p}_i, \notag\\
    \text{ECE} = \sum_{m=1}^M \frac{\lvert B_m\rvert}{n} \bigl\lvert\text{acc}(B_m)-\text{conf}(B_m)\bigr\rvert. \notag
\end{gather}
\end{small}

Predictions of $n$ samples are grouped into $M$ interval bins and the accuracy is calculated for each bin. $B_m$ is the set of indices of samples whose prediction confidence falls into the $m$-th interval bin. The ECE formula calculates the weighted average of the absolute difference between the accuracy of each bin, $\text{acc}(B_m)$, and the average confidence, $\text{conf}(B_m)$, within bin $B_m$. 
In our experiments, we set the number of bins to $M = 9$. 


\paragraph{Hyperparameters Settings} 

We adopted the Pytorch-Transformers package\footnote{\url{https://github.com/huggingface/pytorch-transformers}} for the implementation of all our Transformer-based language models. For each model, we chose its corresponding base model with the following parameter size: ALBERT-base-v2 (11M), distilBERT-base-uncased (66M), BERT-base-uncased (110M), and RoBERTa-base (125M). %Our experiments used two types of graphic cards: Nvidia Titan RTX and Quadro RTX 6000. 
We fine-tuned all these base models for 20 epochs with a batch size of 16 on each target dataset; these fine-tuned models serve as the base models in our comparisons. For the Label Smoothing and MC Dropout baselines, the methods directly modify the PLM-based models and were fine-tuned together with the PLM for 20 epochs with a batch size of 16. For the BNN and our \texttt{CUE} plug-in methods, we first fine-tuned the base models for 20 epochs, then froze the PLM encoding and classifier parameters and fine-tuned the BNN and \texttt{CUE} modules for a further 50 epochs (with a batch size of 16 for both modules).
% We fine-tuned our base models and baselines for 20 epochs with batch size of 16, and 50 epochs with batch size 16 on our plugged in BNN and VAE modules. 
A learning rate of $2\times 10^{-5}$ and early stopping were applied in all training runs. %To exam the effectiveness of methods, 
Each model has been trained 5 times with different random seeds. %: $0$, $1$, $2$, $3$, $4$. 
For each model, we report the mean and standard deviation of the evaluation results obtained by the five trained models on test sets. %value of 5 times of experiments with deviation from mean. For the chose of base models on BNN and VAE methods, we selected the base models that reported the median ECE value than other fine-tuned models. 
% Because of our interpretation method aims to provide explanation for cases, the implementation of our interpretation part only supports batch size of 1.

% For our base models and baselines fine-tuning process, we set the learning rate to $2e-5$ and used batch size of 16 trained for 20 epochs. For BNN and VAE fine-tuning processes, we trained our models with 16 batch size for 50 epochs. All the models trained with learning rate of $2e-5$ and applied early stop strategy, all the models are trained 
% we used  We applied the early stop strategy for the base model fine-tuning process on all three datasets. We validated our methods with batch size of 8, 16 and 32 on a Nvidia 3090 GPU. The token uncertainty decomposition and analysis process require the batch size of 1. 
% \begin{align}
%     \bm{u} &= \bm{e}' - \bm{e} \\
%     \bm{u} &= \mathcal{N}(0,\Sigma) \\
%     \Delta \mathcal{H} &= \mathcal{H}' - \mathcal{H}\\
%     &= - \sum_{k=1}^K p(\hat{y}'_k|\bm{e}')\log p(\hat{y}'_k|\bm{e}') - (- \sum_{k=1}^K p(\hat{y}_k|\bm{e})\log p(\hat{y}_k|\bm{e}))\\
%     &= \sum_{k=1}^K p(\hat{y}_k|\bm{e})\log p(\hat{y}_k|\bm{e}) - \sum_{k=1}^K p(\hat{y}'_k|\bm{e}')\log p(\hat{y}'_k|\bm{e}') \\
%     % &= \sum_{k=1}^K p(\hat{y}_k|\bm{e})\log p(\hat{y}_k|\bm{e}) - \sum_{k=1}^K p(\hat{y}'_k|\bm{e} + \bm{u})\log p(\hat{y}'_k|\bm{e} + \bm{u}) \\
%     &= \sum_{k=1}^K p(\hat{y}_k|\bm{e})\log p(\hat{y}_k|\bm{e}) - \sum_{k=1}^K p(\hat{y}'_k|\bm{e})p(\hat{y}'_k|\bm{u})\log p(\hat{y}'_k|\bm{e})p(\hat{y}'_k|\bm{u}) \\
%     &= \sum_{k=1}^K p(\hat{y}_k|\bm{e})\log p(\hat{y}_k|\bm{e}) - \sum_{k=1}^K p(\hat{y}'_k|\bm{e})p(\hat{y}'_k|\bm{u})\log p(\hat{y}'_k|\bm{e}) - \sum_{k=1}^K p(\hat{y}'_k|\bm{e})p(\hat{y}'_k|\bm{u})\log p(\hat{y}'_k|\bm{u}) \\
%     % &= \sum_{k=1}^K p(\hat{y}_k|\bm{e})\log p(\hat{y}_k|\bm{e}) - \sum_{k=1}^K p(\hat{y}'_k|\bm{e})\log p(\hat{y}'_k|\bm{e}) \sum_{k=1}^K p(\hat{y}'_k|\bm{u})\log p(\hat{y}'_k|\bm{e}) - \sum_{k=1}^K p(\hat{y}'_k|\bm{e})p(\hat{y}'_k|\bm{u})\log p(\hat{y}'_k|\bm{u}) \\
%     &= \sum_{k=1}^K p(\hat{y}_k|\bm{e})\log p(\hat{y}_k|\bm{e}) - \sum_{k=1}^K p(\hat{y}'_k|\bm{e}')\log p(\hat{y}'_k|\bm{e}) - \sum_{k=1}^K p(\hat{y}'_k|\bm{e})p(\hat{y}'_k|\bm{u})\log p(\hat{y}'_k|\bm{u}) \\
%     &= \sum_{k=1}^K p(\hat{y}_k|\bm{e})\log p(\hat{y}_k|\bm{e}) - \sum_{k=1}^K p(\hat{y}'_k|\bm{e}')\log p(\hat{y}'_k|\bm{e}) - \sum_{k=1}^K p(\hat{y}'_k|\bm{e})\log p(\hat{y}'_k|\bm{u}) \sum_{k=1}^K p(\hat{y}'_k|\bm{u})\log p(\hat{y}'_k|\bm{u}) \\
%     &= \sum_{k=1}^K p(\hat{y}_k|\bm{e})\log p(\hat{y}_k|\bm{e}) - \sum_{k=1}^K p(\hat{y}'_k|\bm{e}')\log p(\hat{y}'_k|\bm{e}) - \sum_{k=1}^K p(\hat{y}'_k|\bm{e})\log p(\hat{y}'_k|\bm{u}) \sum_{k=1}^K p(\hat{y}'_k|\bm{u})\log p(\hat{y}'_k|\bm{u}) \\
% \end{align}
% \begin{align}
%     KL(\hat{y}'|\hat{y}) &= \sum_{k=1}^K p(\hat{y}'_k|\bm{e}')\log \frac{p(\hat{y}'_k|\bm{e}')}{p(\hat{y}_k|\bm{e}) }\\
%     &\geq (\sum_{k=1}^K p(\hat{y}'_k|\bm{e}'))\log \frac{p(\hat{y}'_k|\bm{e}')}{p(\hat{y}_k|\bm{e}) }\\
%     &\geq (\sum_{k=1}^K p(\hat{y}'_k|\bm{e}'))\log p(\hat{y}'_k|\bm{e}') - (\sum_{k=1}^K p(\hat{y}'_k|\bm{e}'))\log{p(\hat{y}_k|\bm{e}) }\\
%     &\geq \sum_{k=1}^K p(\hat{y}'_k|\bm{e}')\log p(\hat{y}'_k|\bm{e}') - (\sum_{k=1}^K p(\hat{y}'_k|\bm{e}'))\log{p(\hat{y}_k|\bm{e}) }\\
% \end{align}
% \begin{align}
%     &\leq - t \cdot p(y) {\rm log}(t \cdot p(y)) - t \cdt p(y) {\rm log} ((1-t) \cdot \frac{1}{K}) - (1-t) \cdot \frac{1}{K} {\rm log}(t \cdot p(y)) - (1-t) \cdot \frac{1}{K} {\rm log} ((1-t) \cdot \frac{1}{K} {\rm log}) +  p(y){\rm log}p(y) \\
%     &= H(t \cdot p(y)) + KL(t\cdt p(y)| (1-t)/K) + KL( (1-t)/K | t\cdt p(y)) + H((1-t)/K) - H_e{Y}
% \end{align}

\section{Further Experimental Results}\label{sec:futher_experimental_results}

% As mentioned in the \textbf{paper Section 5}, we provide further experimental results on latent dimension removal on other PLMs, the ablation study, including stability of various additional training loss terms and latent space orthogonality.

\begin{figure}[h!]
\centering
\subfloat[ALBERT \texttt{CUE} on CoLA.]{
  \centering
  \includegraphics[width=0.35\columnwidth]{resources/new_latent_dim/albert-cola.png}}
%
\subfloat[ALBERT \texttt{CUE} on GoEmotions.]{
  \centering
  \includegraphics[width=0.35\columnwidth]{resources/new_latent_dim/albert-goemo.png}}\\
\subfloat[ALBERT \texttt{CUE} on Emotion.]{
  \centering
  \includegraphics[width=0.35\columnwidth]{resources/new_latent_dim/albert-emo.png}}
%
\subfloat[ALBERT \texttt{CUE} on MultiNLI.]{
  \includegraphics[width=0.35\columnwidth]{resources/new_latent_dim/albert-nli.png}}\\
\subfloat[DistilBERT \texttt{CUE} on CoLA.]{
  \includegraphics[width=0.35\columnwidth]{resources/new_latent_dim/distilbert-cola.png}}
%
\subfloat[DistilBERT \texttt{CUE} on GoEmotions.]{
  \includegraphics[width=0.35\columnwidth]{resources/new_latent_dim/distilbert-goemo.png}}\\
\subfloat[DistilBERT \texttt{CUE} on Emotion.]{
  \includegraphics[width=0.35\columnwidth]{resources/new_latent_dim/distilbert-emo.png}}
%
\subfloat[DistilBERT \texttt{CUE} on MultiNLI.]{
  \includegraphics[width=0.35\columnwidth]{resources/new_latent_dim/distilbert-nli.png}}\\
\subfloat[RoBERTa \texttt{CUE} on CoLA.]{
  \includegraphics[width=0.35\columnwidth]{resources/new_latent_dim/roberta-cola.png}}
%
\subfloat[RoBERTa \texttt{CUE} on GoEmotions.]{
  \includegraphics[width=0.35\columnwidth]{resources/new_latent_dim/roberta-goemo.png}}\\
\subfloat[RoBERTa \texttt{CUE} on Emotion.]{
  \includegraphics[width=0.35\columnwidth]{resources/new_latent_dim/roberta-emo.png}}
%
\subfloat[RoBERTa \texttt{CUE} on MultiNLI.]{
  \includegraphics[width=0.35\columnwidth]{resources/new_latent_dim/roberta-nli.png}}
     \caption{Evaluation results by removing latent dimensions. The $x$-axis represents the index of \textbf{removed} dimensions ranked by their relevance to $\Delta \bm{e}_i$; a smaller index number indicates that the latent dimension is more similar. Histograms show the ECE scores after removing the corresponding latent dimensions. The blue curve shows the predictive entropy. The green and red curves show classification accuracy and F1, respectively.}
    \label{fig:latent_ablation}
\end{figure}

\subsection{Results with Latent Dimension Removal} 
As mentioned in \textsection{5.2}, we study the impact of removing ranked latent variable dimensions on the other three base models: ALBERT, DistilBERT and RoBERTa. As shown in Figure~\ref{fig:latent_ablation}, we can observe the same trend of ECE score and average entropy increasing when removing latent dimensions ranked by their influential scores on almost all the models, while accuracy and macro F1 scores remain almost unchanged. This shows that our \texttt{CUE} framework can be generalized to interpret the uncertainty via latent dimensions on various models and different datasets. On the CoLA dataset, both ALBERT and RoBERTa exhibit a similar pattern compared with the BERT model; we can also observe a peak for the average entropy.  
The graphs show our \texttt{CUE} can effectively distinguish the importance between latent dimensions, and thus we can use those dimensions to interpret token level uncertainty as discussed in \textsection{4.2}.

\subsection{Ablation Study}

%In this section, 
As mentioned at the end of \textsection{5}, we present an ablation study to investigate the contribution of various components in our framework.

\paragraph{Stability of training loss}
 
As discussed in \textsection{4.1}, our learning objective is implemented with four loss terms.
% that are implemented by three loss functions, i.e., the reconstruction loss, the reconstructed cross-entropy loss, and the orthogonality loss that encourages the orthogonality of the dimensions of the latent vector $\bm{z}_i$. 
We investigate the training stability benefits from orthogonal regularisation by replacing the orthogonality loss with either a KL-divergence loss or a Wasserstein loss, where the KL-divergence loss encourages the distribution of latent variables to follow the prior standard Gaussian distribution and is widely used in general Variational Auto-encoders \citep{vae,scholar_vae}, while the Wasserstein loss enforces the latent variables to follow a Dirichlet distribution and is used in Wasserstein Auto Encoder (WAE) \citep{nan-etal-2019-topic, tolstikhin2018wasserstein}.
% , we fine-tuned two additional BERT models, either with the orthogonality loss term replaced by a KL-divergence loss \citep{vae,scholar_vae} or a Wasserstein loss \citep{nan-etal-2019-topic, tolstikhin2018wasserstein}, to investigate the training stability. %, we implemented a KL divergence loss \citep{scholar_vae} and a Wasserstein loss \citep{nan-etal-2019-topic, tolstikhin2018wasserstein}. 
% The KL-divergence loss encourages the distribution of latent variables %encoder's mean and variance parameters 
% to follow the prior standard Gaussian distribution, while the Wasserstein loss enforces the latent variables to follow a Dirichlet distribution. 
The total loss (including the reconstruction loss and the cross-entropy loss) curves during training are shown in Figure~\ref{fig:loss_stability}. We observe that the total loss with the orthogonality loss replaced by either the KL-divergence loss or the Wasserstein loss exhibits %Both loss terms are highlighted in red color, and we can observe a clear 
fluctuation during the training process across all datasets. %Mean while, the different value of pairwise distance also indicates other loss term could potentially impact the ability of pairwise loss to reduce the distance between $\bm{e}'$ and $\bm{e}$. 
In contrast, the loss with orthogonal regularisation is very stable. We further show the evaluation results with various loss terms in Table~\ref{table:loss}.\footnote{The reported results are from single runs, which we used to generate the loss stability graphs.} It can be observed that our proposed framework with the orthogonality loss achieves better ECE results compared to using KL or Wasserstein loss. %All of these experiments examined this tendency and displayed the capability of orthogonality can efficiently force the dimensions to be orthogonal and help to enhance the interpretability.

\begin{figure*}[h!]
  \centering
  \includegraphics[width=\columnwidth]{resources/loss_visuals/loss_graph.pdf}
\caption{Comparison of the stability of the total loss for three loss terms trained with the BERT model on four datasets. Red: total loss with KL divergence loss; Green: total loss with Wasserstein loss; Blue: total loss with Orthogonality loss.}
\label{fig:loss_stability}
\end{figure*} 

% \begin{table*}[h!]
% \centering
% \resizebox{0.85\columnwidth}{!}{
% \begin{tabular}{llllllllll}
% \toprule
%                  & \multicolumn{3}{c}{BERT VAE w/  Orthogonality}                                                              & \multicolumn{3}{c}{BERT VAE w/ KL}                                                              &\multicolumn{3}{c}{BERT VAE w/ Wasserstein }                          \\
% \cmidrule(lr){2-4} \cmidrule(lr){5-7}   \cmidrule(lr){8-10}
% Datasets             &  Acc    &  F1   & ECE$\downarrow$    &  Acc    &  F1    & ECE$\downarrow$   &  Acc    &  F1    & ECE$\downarrow$  \\ \midrule
% CoLA& 0.8130&0.7459&\textbf{0.0640}&0.8072&0.7300&0.1090&0.8044&0.7240&0.1111\\
% GoEmotions&0.6298&0.4661&\textbf{0.0321}&0.6298&0.4752&0.0695&0.6263&0.4608&0.0600\\
% Emotion&0.9255&0.8827&\textbf{0.0322}&0.9270&0.8847&0.0431&0.9275&0.8853&0.0441\\
% MultiNLI&0.8284&0.8278&\textbf{0.0272}&0.8294&0.8290&0.0418&0.8291&0.8286&0.0344\\
% \bottomrule
% \end{tabular}}

% \caption{Comparison of the performance of the BERT model fine-tuned with different loss terms on four datasets. }%We only reported each single model's result, of which we used to generate loss stability graphs.}
% \label{table:loss}
% \end{table*}
\begin{table*}[h!]
\centering
\resizebox{\columnwidth}{!}{
\begin{tabular}{lllllllllllll}
\toprule
                 & \multicolumn{4}{c}{BERT \texttt{CUE} w/  Orthogonality}                                                              & \multicolumn{4}{c}{BERT \texttt{CUE} w/ KL}                                                              &\multicolumn{4}{c}{BERT \texttt{CUE} w/ Wasserstein }                          \\
\cmidrule(lr){2-5} \cmidrule(lr){6-9}   \cmidrule(lr){10-13}
Datasets & Acc    &  F1  & $\mathcal{H}$  & ECE$\downarrow$&Acc    &  F1  & $\mathcal{H}$  & ECE$\downarrow$&Acc    &  F1  & $\mathcal{H}$  & ECE$\downarrow$\\ \midrule
CoLA& 0.8130&0.7459&0.4986&\textbf{0.0640}&0.8072&0.7300&0.3407&0.1090&0.8044&0.7240&0.3499&0.1111\\
GoEmotions&0.6298&0.4661&0.4333&\textbf{0.0321}&0.6298&0.4752&0.3345&0.0695&0.6263&0.4608&0.3437&0.0600\\
Emotion&0.9255&0.8827&0.0984&\textbf{0.0322}&0.9270&0.8847&0.0518&0.0431&0.9275&0.8853&0.0510&0.0441\\
MultiNLI&0.8284&0.8278&0.3650&\textbf{0.0272}&0.8294&0.8290&0.3194&0.0418&0.8291&0.8286&0.3343&0.0344\\
\bottomrule
\end{tabular}}

\caption{Comparison of the performance of the BERT model fine-tuned with different loss terms on four datasets. }%We only reported each single model's result, of which we used to generate loss stability graphs.}
\label{table:loss}
\end{table*}
 

%\subsection{Visualisation of loss stability}
\paragraph{Training Loss Stability with Additional Loss Terms} 
We further examine the training loss stability when adding the KL or Wasserstein distance loss terms to our framework. We fine-tuned two BERT-base-uncased \texttt{CUE} models on the Emotion dataset. It can be observed in Figure~\ref{fig:loss_compare_individual} that the pairwise distance (i.e., the reconstruction loss) is very unstable and keeps fluctuating during training, while the orthogonality loss shows a stable decreasing trend and converges quickly. If we only compare the KL loss with the Wasserstein loss, we can see that the Wasserstein loss is more stable than the KL loss. %The fluctuation on pairwise loss can be impact by the distribution loss terms. Because the KL loss is obviously more unstable, the models tend to have higher average pairwise distance. This may also 
Our visualisation results show that the prior distributions assumed by the KL or the Wasserstein loss may not be suitable for reconstructing PLM-encoded representations, thus leading to higher ECE results compared to using the orthogonality constraint. %could not lead to a more calibrated model.
\begin{figure*}[ht]
\centering
\subfloat[Additional KL divergence loss term.]{
    \includegraphics[width=0.4\columnwidth]{resources/kl_loss.png}}
%
\subfloat[Additional Wasserstein loss term.]{
    \includegraphics[width=0.4\columnwidth]{resources/Wasserstein.png}}
\caption{Comparison of BERT models trained on Emotion with an additional loss term. Blue: Reconstruction loss; Red: KL loss in (a) and Wasserstein loss in (b); Green: Orthogonality loss.}
\label{fig:loss_compare_individual}
\end{figure*}
%  \begin{figure}[h!]
% \centering
% \begin{subfigure}{.4\columnwidth}
%   \centering
%   \includegraphics[width=\columnwidth]{resources/kl_loss.png}
%   \caption{Additional KL divergence loss term.}
% \end{subfigure}%
% \begin{subfigure}{.4\columnwidth}
%   \centering
%   \includegraphics[width=\columnwidth]{resources/Wasserstein.png}
%   \caption{Additional Wasserstein loss term.}
% \end{subfigure}
% \caption{Comparison of BERT models trained on Emotions with additional loss term. Blue: Reconstruction loss; Red: KL loss in (a) and Wasserstein loss in (b); Green: Orthogonality loss.}
% \label{fig:loss_compare_individual}
% \end{figure}

\paragraph{Latent Space Orthogonality}
\begin{table*}[h!]
\centering
\resizebox{\columnwidth}{!}{
\begin{tabular}{lcccccccc}
\toprule
                 & \multicolumn{4}{c}{BERT \texttt{CUE} w/  Orthogonality}                                                              & \multicolumn{4}{c}{BERT \texttt{CUE} w/o Orthogonality}                             \\
\cmidrule(lr){2-5} \cmidrule(lr){6-9}   
Datasets &  Acc    &  F1  & $\mathcal{H}$  & ECE$\downarrow$    &  Acc    &  F1  & $\mathcal{H}$  & ECE$\downarrow$   \\ \midrule
CoLA      & 0.8123\small{±0.0012}&0.8762\small{±0.0007}&0.4991\small{±0.0032}&\textbf{0.0677}\small{±0.0056}& 0.8042\small{±0.0011}&0.7230\small{±0.0024}&0.3458\small{±0.0047}&0.1121\small{±0.0020} \\
GoEmotions& 0.6282\small{±0.0029}&0.4712\small{±0.0087}&0.4433\small{±0.0159}&\textbf{0.0326}\small{±0.0013}& 0.6291\small{±0.0019}&0.4652\small{±0.0037}&0.3432\small{±0.0014}&0.0615\small{±0.0023} \\
Emotion   & 0.9259\small{±0.0009}&0.8850\small{±0.0015}&0.1031\small{±0.0082}&\textbf{0.0289}\small{±0.0043}& 0.9268\small{±0.0003}&0.8848\small{±0.0006}&0.0519\small{±0.0009}&0.0430\small{±0.0016} \\
MultiNLI  & 0.8283\small{±0.0005}&0.8277\small{±0.0005}&0.3665\small{±0.0030}&\textbf{0.0262}\small{±0.0021}& 0.8281\small{±0.0009}&0.8277\small{±0.0009}&0.3317\small{±0.0009}&0.0370\small{±0.0010} \\
\bottomrule
\end{tabular}}
\caption{Comparison of results on BERT models trained with/without latent space orthogonality.}
\label{table:orthogonal}
\end{table*}
\begin{figure}[h!]
    \small
  \centering
  \includegraphics[width=0.6\columnwidth]{resources/new_latent_dim/bert-non_orth.png}
  \caption{ECE and average entropy with latent dimension removal from the model trained without the orthogonality regulariser. The $x$-axis represents the index of \textbf{removed} dimensions ranked by their relevance to $\Delta \bm{e}_i$; a smaller index number indicates that the latent dimension is more similar. Histograms show the ECE scores after removing the corresponding latent dimensions. The blue curve shows the predictive entropy. The green and red curves show the classification accuracy and F1, respectively.}
  \label{fig:non_ort}
\end{figure} 

As explained in \textsection{4.1}, the orthogonality regulariser facilitates a better interpretation of the latent space. In Table~\ref{table:orthogonal}, we compare the overall performance of BERT models trained with and without latent space orthogonality. The PLMs fine-tuned with Eq.~(8) outperform their counterparts without the orthogonality regularisation in ECE and average entropy on all four datasets. It is also interesting to find that the F1 scores slightly decrease for the models trained without the orthogonality regularisation on almost all the datasets. Therefore, the orthogonality constraint ensures that the decoder network maintains the same distribution on the latent space when generating reconstructed representations that lead to uncertain predictions. 
% better perturbations that make uncertain predictions. 

% \begin{table*}[h!]
% \centering
% \resizebox{0.9\columnwidth}{!}{
% \begin{tabular}{lllllll}
% \toprule
%                  & \multicolumn{3}{c}{BERT VAE w/  Orthogonality}                                                              & \multicolumn{3}{c}{BERT VAE w/o Orthogonality}                             \\
% \cmidrule(lr){2-4} \cmidrule(lr){5-7}   
% Datasets             &  Acc    &  F1   & ECE$\downarrow$    &  Acc    &  F1    & ECE$\downarrow$    \\ \midrule
% CoLA& 0.8123±0.0012	&0.8762±0.0007&\textbf{0.0677±0.0056}&0.8042±0.0011&0.7230±0.0024&0.1121±0.0020 \\
% GoEmotions& 0.6282±0.0029 & 0.4712±0.0087& \textbf{0.0326±0.0013} & 0.6291±0.0019	& 0.4652±0.0037 & 0.0615±0.0023 \\
% Emotion& 0.9259±0.0009&	0.8850±0.0015& \textbf{0.0289±0.0043} & 0.9268±0.0003	& 0.8848±0.0006	& 0.0430±0.0016 \\
% MultiNLI& 0.8283±0.0000 &	0.8277±0.0005 & \textbf{0.0262±0.0021} & 0.8281±0.0009	& 0.8277±0.0009& 0.0370±0.0010 \\
% \bottomrule
% \end{tabular}}
% \caption{Comparison of results on BERT models trained with/without latent space orthogonality.}
% \label{table:orthogonal}
% \end{table*}

We performed a further ablation study to examine the interpretability of the latent space without being regularised by orthogonality. As shown in Figure~\ref{fig:non_ort}, without the orthogonality loss term, there is no clear relationship between the tendency of ECE scores and the average entropy during the removal of latent dimensions ranked by their influential scores. Without orthogonality, we are not able to maintain the distributional consistency from the latent representation space; hence, we can see an obvious fluctuation in accuracy and F1. Without a consistent tendency, it is thus %the average entropy and ECE score doesn't share the same tendency. This described disorder makes it 
difficult to investigate each latent dimension's importance and interpret the impact of each latent dimension on model predictive uncertainty. %Hence, without formula \ref{eq:orthogonality}, we cannot interpret the decision making process and find out possible uncertain words with latent dimensions.

% \bibliography{uai2023-template}
\bibliography{li_516-supp}

\end{document}
