%%%%%%%% ICML 2023 EXAMPLE LATEX SUBMISSION FILE %%%%%%%%%%%%%%%%%

% \documentclass{article}

% % Recommended, but optional, packages for figures and better typesetting:
% \usepackage{microtype}
% \usepackage{graphicx}
% \usepackage{subfigure}
% \usepackage{booktabs} % for professional tables

% % hyperref makes hyperlinks in the resulting PDF.
% % If your build breaks (sometimes temporarily if a hyperlink spans a page)
% % please comment out the following usepackage line and replace
% % \usepackage{icml2023} with \usepackage[nohyperref]{icml2023} above.
% \usepackage{hyperref}


% % Attempt to make hyperref and algorithmic work together better:
% \newcommand{\theHalgorithm}{\arabic{algorithm}}

% % Use the following line for the initial blind version submitted for review:
% \usepackage{icml2023}

% % If accepted, instead use the following line for the camera-ready submission:
% % \usepackage[accepted]{icml2023}

% % For theorems and such
% \usepackage{hyperref}
% \usepackage{url}
% \usepackage{color}
% \usepackage{tcolorbox}
% \usepackage{CJK}
% \usepackage{adjustbox}
% % \usepackage{algorithm}
% % \usepackage{algpseudocode}
% \usepackage{graphicx}
% \usepackage{booktabs}
% \usepackage{threeparttable}
% \usepackage{siunitx}
% \usepackage{lipsum}
% \usepackage{multirow}
% \usepackage{bm}
% \usepackage{amssymb}
% \usepackage{amsmath,amsthm,mathtools}
% \usepackage[normalem]{ulem}
% \usepackage{subcaption}
% \usepackage{commath}
% \usepackage{wrapfig,lipsum}
% \usepackage{enumitem}
% \usepackage{subfig}


% \DeclareMathOperator*{\diag}{diag}

% % if you use cleveref..
% \usepackage[capitalize,noabbrev]{cleveref}

% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% % THEOREMS
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% \theoremstyle{plain}
% \newtheorem{theorem}{Theorem}[section]
% \newtheorem{proposition}[theorem]{Proposition}
% \newtheorem{lemma}[theorem]{Lemma}
% \newtheorem{corollary}[theorem]{Corollary}
% \theoremstyle{definition}
% \newtheorem{definition}[theorem]{Definition}
% \newtheorem{assumption}[theorem]{Assumption}
% \theoremstyle{remark}
% \newtheorem{remark}[theorem]{Remark}

% % Todonotes is useful during development; simply uncomment the next line
% %    and comment out the line below the next line to turn off comments
% %\usepackage[disable,textsize=tiny]{todonotes}
% \usepackage[textsize=tiny]{todonotes}

% \newcommand{\gl}[1]{\textcolor{magenta}{#1}}

% \linespread{0.97}

% \DeclareMathOperator*{\argmax}{arg\,max}
% \DeclareMathOperator*{\argmin}{arg\,min}
% \DeclareMathOperator*{\predictor}{predictor}


% % The \icmltitle you define below is probably too long as a header.
% % Therefore, a short form for the running title is supplied here:
% \icmltitlerunning{Submission and Formatting Instructions for ICML 2023}

% \begin{document}

\documentclass[accepted]{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{abbrvnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
% \usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
% For theorems and such
\usepackage{hyperref}
\usepackage{url}
\usepackage{color}
\usepackage{tcolorbox}
\usepackage{CJK}
\usepackage{adjustbox}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{threeparttable}
\usepackage{siunitx}
\usepackage{lipsum}
\usepackage{multirow}
\usepackage{bm}
\usepackage{amssymb}
\usepackage{amsmath,amsthm,mathtools}
\usepackage[normalem]{ulem}
% \usepackage{subcaption}
\usepackage{commath}
\usepackage{wrapfig,lipsum}
\usepackage{enumitem}
\usepackage{subfig}

\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\predictor}{predictor}


% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% % THEOREMS
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

%\linespread{0.98}

\title{\texttt{CUE}: An Uncertainty Interpretation Framework for Text Classifiers Built on Pre-Trained Language Models}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:jiazheng.li@kcl.ac.uk?Subject=UAI 2023 paper}{Jiazheng Li}}
\author[2]{Zhaoyue Sun}
\author[3]{Bin Liang}
\author[1]{Lin Gui}
\author[1,2,4]{Yulan He}
% Add affiliations after the authors
\affil[1]{%
 Department of Informatics, King’s College London, UK
}
\affil[2]{%
    Department of Computer Science, University of Warwick, UK
}
\affil[3]{%
   Joint Lab of HITSZ-CMS, Harbin Institute of Technology, Shenzhen, China
  } 
\affil[4]{%
    The Alan Turing Institute, UK
  }

\setlength{\abovedisplayskip}{0pt}
\setlength{\belowdisplayskip}{0pt}

\begin{document}
\maketitle

\begin{abstract}
Text classifiers built on Pre-trained Language Models (PLMs) have achieved remarkable progress in various tasks including sentiment analysis, natural language inference, and question-answering. However, the occurrence of uncertain predictions by these classifiers poses a challenge to their reliability when deployed in practical applications. Much effort has been devoted to designing various probes in order to understand what PLMs capture. But few studies have delved into factors influencing PLM-based classifiers' predictive uncertainty. In this paper, we propose a novel framework, called \texttt{CUE}, which aims to interpret uncertainties inherent in the predictions of PLM-based models. In particular, we first map PLM-encoded representations to a latent space via a variational auto-encoder. We then generate text representations by perturbing the latent space which causes fluctuation in predictive uncertainty. By comparing the difference in predictive uncertainty between the perturbed and the original text representations, we are able to identify the latent dimensions responsible for uncertainty and subsequently trace back to the input features that contribute to such uncertainty. Our extensive experiments on four benchmark datasets encompassing linguistic acceptability classification, emotion classification, and natural language inference show the feasibility of our proposed framework. Our source code is available at \url{https://github.com/lijiazheng99/CUE}.
\end{abstract}

\section{Introduction}
% Predictive uncertainty is one of the aspects that resides in model trustworthiness and transparency that have been popularly researched due to the increasing complexity of deep neural networks and the potential risks that could bring without understanding black-box models' decision-making process. Recent progress in computer vision has undercovered the source of the uncertainty for image classifiers via perturbations made by Variational Autoencoders. However, due to the discreteness of textual data and the complex structure of \textbf{P}re-trained \textbf{L}anguage \textbf{M}odels (PLMs), identifying the source of text classifiers built on PLMs is particularly difficult and less sufficient research has been carried out. Although PLM classifiers have made remarkable progress on various Natural Language Processing (NLP) tasks \citep{bert,roberta,albert,distilbert}, the aforementioned issue has become a barrier to applying them to sensitive domains such as medicine and finance \citep{uncertainty_medical,uncertainty_finance}.
% The predictive uncertainty is generally believed to include two aspects - \emph{aleatoric uncertainty} and \emph{epistemic uncertainty}, where the aleatoric uncertainty measures the data uncertainty due to inherent random effects and is irreducible, while the epistemic uncertainty measures the uncertainty caused by the lack of knowledge from data and is reducible \citep{predictive_uncertainty}. Common approaches to estimating the uncertainty focused on investigating the mismatch of distribution between the training and testing sets. 
% The common approach to study classifiers is to make perturbations on input representation and 
% this is particularly difficult for \textbf{P}re-trained \textbf{L}anguage \textbf{M}odels (PLMs).
% Rather than directly perturbing the input features, in computer vision, perturbations can be done in the latent space to generate counterfactuals such that the modified input would still reside on the data manifold while the model's prediction on the modified input becomes more confident \citep{getting_clue}

Text classifiers built on Pre-trained Language Models (PLMs) have made remarkable progress on various Natural Language Processing (NLP) tasks \citep{bert,roberta,albert,distilbert}. However, their deployment in practical applications still faces significant challenges. Of particular concern, these models 
% are either unable to provide measures of uncertainty in their predictions or 
tend to make over-confident predictions in uncertain cases \citep{softmax_uncertain, he-etal-2020-towards,predictive_uncertainty}. Since PLMs have been widely used in various applications, such issues raise concerns about model trustworthiness and transparency, which become a barrier to deploying PLMs in sensitive domains such as medicine and finance.  %\citep{uncertainty_medical,uncertainty_finance}.
\begin{figure*}[!t]
  \centering
  \includegraphics[width=\linewidth]{resources/fig1.pdf}
  \caption{Illustration of the proposed framework: \textbf{(a)} The original representations learned from a Pre-trained Language Model (PLM) and the decision boundary separating two classes; \textbf{(b)} We can freeze the PLM-classifier parameters and perturb the PLM-encoded representations in the latent space to increase the aleatoric (data) uncertainty along some isotropic directions while preserving the predictive labels. This would help determine the uncertain areas; \textbf{(c)} Using the representations reconstructed from the perturbed latent space, the uncertainty for out-of-distribution (OOD) areas can be estimated.}
  \label{fig:exampleIllustration}
\end{figure*} 

Predictive uncertainty is generally believed to include two aspects---\emph{aleatoric uncertainty} and \emph{epistemic uncertainty}, where the aleatoric uncertainty measures the data uncertainty due to inherent random effects and is irreducible, while the 
epistemic uncertainty measures the uncertainty caused by the lack of knowledge from data and is reducible \citep{predictive_uncertainty}.
% Predictive uncertainty is defined as the probability of mis-classification \citep{sullivan2015introduction}. 
Numerous approaches have been proposed to estimate the predictive uncertainty of deep neural models, such as Deep Ensemble models \citep{deep_ensemble}, Bayesian Neural Networks (BNN) \citep{bnn_wight_uncertainty} and Monte-Carlo (MC) Dropout \citep{monte_carlo_dropout}. 
%Previous uncertainty estimation methods either require a large memory footprint to train ensemble models or are hard to interpret the uncertainty caused by the out-of-distribution (OOD) data. Therefore, taking advantage uniform distribution of Gaussian mixture generative models \citep{wang2018glue}, the uncertainty induced by dataset shift\citep{10.5555/1462129} can be estimated via more controllable constraints (variance). Thus, can be used to interpret the source of uncertainty in deterministic classifiers built on PLMs. %or focus on estimating the uncertainty caused by out-of-distribution (OOD) data \citep{acl2022_uncertainty_transformers}.  but ignore the uncertainty induced by dataset shift \citep{10.5555/1462129} - the distributional mismatch between training and test set. Research on prior neural networks \cite{predictive_uncertainty,wang2018glue} enables the study of predictive uncertainty with lower computational resources \cite{acl2022_uncertainty_transformers}.  
%Common approaches on estimate uncertainty in Deep learning has been widely researched, including Deep Ensemble models \citep{deep_ensemble}, Bayesian Neural Network \citep{bnn_wight_uncertainty}, and MC Dropout \citep{monte_carlo_dropout}. 
Similar ideas have been applied to PLMs in recent years to study the uncertainty of text classifiers.
In particular, quantifying uncertainty in PLM-based classifiers can be done by incorporating weight uncertainty into the PLM architecture. However, weight uncertainty can only be introduced into a limited number of layers (e.g., the last layer of the PLM feature extractor and/or the classification layer) due to the large number of PLM layers and parameters. Alternatively, one could use deep ensembles by aggregating classification results generated from multiple PLM classifiers trained with different initialisations \citep{deep_ensemble}, or apply MC dropout in the inference stage to estimate the uncertainty of PLMs \citep{acl2022_uncertainty_transformers}.

Previous studies \citep{vulic-etal-2020-probing, clark-etal-2019-bert, yang-etal-2021-exploring} have also been devoted to designing various probes in order to understand what PLMs capture. Nevertheless, they largely ignore the interpretation of the source of the uncertainty, i.e., identifying the input features which cause classification uncertainty, which can be crucial for understanding the model and adopting appropriate mitigation strategies. In text classification, recent research has tried to identify word tokens that lead to uncertainty via perturbations on input sequences \citep{token_level_uncertainty,token_level_classification_interpret}. However, due to the discreteness of textual data, token replacement or removal would require a large search space on the input sequence and incur expensive computational costs. %Therefore, a lightweight interpretation method to investigate PLM classifiers' uncertainty caused by distributional mismatch is urgently needed by the research community.
% Specifically, predictive uncertainty is generally believed to include two aspects - \emph{aleatoric uncertainty} and \emph{epistemic uncertainty}, where the aleatoric uncertainty measures the data uncertainty due to inherent random effects and is irreducible, while the epistemic uncertainty measures the uncertainty caused by the lack of knowledge from data and is reducible \citep{predictive_uncertainty}. 
% Corresponding to text classification, identifying the word tokens that lead to uncertainty could help us to decide whether to increase appropriate training data and what data should be added to reduce epistemic uncertainty. On the other hand, interpreting the uncertainty of PLM might also shed a light on whether to trust a model prediction.
%With the help of uncertainty interpretation, researchers can find out the reason that caused predictive uncertainty and decide when to trust in their model's prediction.

% In this paper, we consider generating perturbations on text representations to induce aleatoric uncertainty under the supervision of original text classifier to study the epistemic uncertainty.
% In this paper, we aim to interpret the epistemic uncertainty by generating perturbations on text representations which induce aleatoric uncertainty while maintaining prediction boundaries.
% In this paper, we aims to interpret and identify the cause of the epistemic uncertainty on PLM text classifiers at both sentence- and token-level.

In this paper, we aim to interpret the predictive uncertainty of PLM-based text classifiers %from both \emph{aleatoric uncertainty} and \emph{epistemic uncertainty} perspectives 
by identifying the input tokens that cause the uncertainty.
% In this paper, we focus on the use of distance difference between PLM-encoded text representations to represent the predictive uncertainty change on PLM classifiers and thus study the epistemic uncertainty.
% we aim to investigate the epistemic uncertainty via text representations perturbation on PLM classifiers.
% In this paper, we propose to interpret the predictive uncertainty by comparing the original PLM learned representation with semi-factuals which are perturbations to the text representation but maintaining prediction boundaries.
% with the supervision of Bregman Divergences \citet{banerjee2005clustering}
We propose a novel PLM \textbf{C}lassifier \textbf{U}ncertainty \textbf{E}xplanation (\texttt{CUE}) framework built on a Variational Auto-encoder (VAE) \citep{vae,scholar_vae} that generates perturbations on latent text representations to induce uncertain predictions.
%\emph{semi-factuals} \citep{Kenny_Keane_2021}, which stands for perturbed text representations that maintaining prediction boundaries, to represent the \emph{aleatoric uncertainty}. 
As shown in Figure~\ref{fig:exampleIllustration}, we can perturb the PLM-encoded representations in the latent space to increase the aleatoric uncertainty (data uncertainty) along some isotropic directions while preserving the predictive labels. As will be shown in \textsection{\ref{sec:uncertain_estimation}}, this is equivalent to decreasing the predictive epistemic uncertainty. By examining the difference between the original and the perturbed text representations, a subset of input features (i.e., word tokens) can be identified as the interpretation of the original model's predictive uncertainty. 
%We then subtract the aleatoric uncertainty from the mean squared error to represent the \emph{epistemic uncertainty}. Furthermore, the difference between the reconstructed representation and the original PLM-encoded representation is compared to trace back to the corresponding tokens that cause the uncertainty, thus interpreting the predictive uncertainty.
%Therefore, we can trace back the cause of the epistemic uncertainty via latent dimensions importance from the VAE.
% To achieve this, we propose a two-stage VAE framework \citep{vae,scholar_vae} to study the text classifier's predictive uncertainty by perturbing the latent feature space of PLM-encoded text representations. 
% The resulting representations are called the \emph{semi-factual representations} \citep{Kenny_Keane_2021} that when they are fed to the classifier, the model's prediction is not significantly changed. 
We compared our framework with existing approaches addressing the predictive uncertainty problem on three classification tasks across four benchmark datasets. Extensive experimental results show that our proposed method can identify the source of epistemic uncertainty and calibrate text representations from four commonly used PLMs.  
% Moreover, our proposed framework allows the identification of input token which cause classification uncertainty and offering uncertainty interpretations. 

In summary, our contributions are:
%\begin{itemize}[noitemsep,nolistsep]
  %\item 
  \textbf{(1)} We propose a novel framework \texttt{CUE} to induce perturbations on PLM-encoded representations for uncertainty interpretation of the PLM-based text classifiers.
  %\item 
  \textbf{(2)} We propose an uncertainty feature identification algorithm to identify token-level features which lead to model predictive uncertainty.
  %\item 
  \textbf{(3)} We validate the effectiveness of our proposed framework by conducting extensive experiments using various classifiers built on four commonly used PLMs on three different tasks and four datasets with class numbers ranging from 2 to 27. The results show that our proposed framework achieves lower expected calibration errors compared to existing approaches such as label smoothing, MC dropout, and BNN. 
%\end{itemize}
% To the best of our knowledge, our framework is the first to generate semi-factual perturbations on PLM-encoded text representations to study the text classifier uncertainty and provide uncertainty interpretations. 
To the best of our knowledge, our framework is the first to study the token interpretation of PLM-based classifiers' predictive uncertainty from the representation space, without editing the semantic meaning of the original input text.
% To the best of our knowledge, our framework is the first to interpret PLM text classifiers' uncertainty on tokens via text representations, without editing the semantic meaning of the original sentence.
% We will release our code on GitHub.

% To tackle the , in this work, we focus on studying the classifier predictive uncertainty by perturbing the latent feature space of PLM-encoded text representations. The resulting representations are called the \emph{semi-factual representations} \citep{Kenny_Keane_2021} that when they are fed to the classifier, the model's prediction is not significantly changed.  %obtaining a more calibrated model with representation-level perturbation. 
% We use our reconstruction loss to determine the perturbed semi-factual as far as possible. With this new framework, we can study the classifier's decision boundary by learning the Probability Density Function (PDF) of trainign data.
% % uncertainty with the change on predictive probabilities based on the original PLM-encoded representation and the perturbed representation (a.k.a. the semi-factual representation). 
% We further study the predictive uncertainty change by identifying the most important latent dimensions in the latent feature space so as to retrieve their corresponding input tokens which lead to classification uncertainty, as will be described in Section \ref{sec:inputFeatureIdentification}. %,


% Text classifiers built on Pre-trained Language Models (PLMs) have made remarkable progress on various Natural Language Processing (NLP) tasks including sentiment analysis and natural language inference \citep{bert,roberta,albert,distilbert}. However, their deployment in practical applications still faces significant challenges. Of a particular concern, these models tend to make 
% uncertain predictions \citep{softmax_uncertain, he-etal-2020-towards}. Such an issue has raised the concern of model trustworthiness and transparency \citep{uncertainty_medical,uncertainty_finance} and could cause a barrier of applying the deep neural language models to sensitive domains (e.g., medical and finance domains). Therefore, it is necessary to develop tools for better understanding predictive uncertainty from token level and explain the decision boundary (kernel density) of text classifiers. With the help of uncertainty interpretation, researchers can find out the 
% reason that caused predictive uncertainty and decide when to trust in their model's prediction.

% The predictive uncertainty is defined as the probability of classification error \citet{sullivan2015introduction}. Researchers have found that part of the uncertainty for NLP tasks comes from the complexity of deep neural models, which tend to output overconfident probabilities that lead to unreliable predictions \citep{softmax_uncertain}. Recent research \citep{Kaushik2020Learning} revealed that the use of counterfactual texts can improve the text classifier robustness by removing or replacing some input tokens. However, PLM-based classifiers tend to highly rely on the PLM-encoded text representations which are obtained from models involving millions and even billions of parameters. Tracking the change between input and output in the discrete input token space might not reveal the mechanism of the black box. It is an emerging area to instead analyse and interpret text classifier behaviours in a continuous latent representation space.   

% %Various methods have been explored to investigate PLMs to understand what properties PLM learned during the fine-tuning process from different perspectives. Recent research \cite{yang-etal-2021-exploring} found out that the use of counterfactual texts can improve the text classifier robustness. However, unlike statistical language models, PLM classifiers tend to highly rely on its latent representations which is obtained from a complicated learning architecture. It is an emerging area to generate counterfactual representation perturbation to analysis and interpret text classifier behaviours.  

% Quantifying uncertainties in PLM-based classifiers can be done by incorporating weight uncertainty into the PLM architecture. However, since PLMs may have many layers and millions of parameters, it is only feasible to induce the uncertainty to a certain number of layers (e.g., the last layer of the PLM feature extractor) and/or the classification layer. Alternatively, one could use deep ensembles by aggregating classification results generated from multiple PLM classifiers trained with different initialisation \citep{deep_ensemble}, or apply Monte-Carlo dropout during the inference stage \citep{monte_carlo_dropout,acl2022_uncertainty_transformers}. Nevertheless, none of these approaches can interpret the source of the uncertainty, i.e., identifying the input features which cause classification uncertainties. 

% To address the above issues, in this work, we focus on studying the classifier predictive uncertainty by perturbing the latent feature space of PLM-encoded text representations. The resulting representations are called the \emph{semi-factual representations} \citep{Kenny_Keane_2021} that when they are fed to the classifier, the model's prediction is not significantly changed.  %obtaining a more calibrated model with representation-level perturbation. 
% We use our reconstruction loss to determine the perturbed semi-factual as far as possible. With this new framework, we can study the classifier's decision boundary by learning the Probability Density Function (PDF) of trainign data.
% % uncertainty with the change on predictive probabilities based on the original PLM-encoded representation and the perturbed representation (a.k.a. the semi-factual representation). 
% We further study the predictive uncertainty change by identifying the most important latent dimensions in the latent feature space so as to retrieve their corresponding input tokens which lead to classification uncertainty, as will be described in Section \ref{sec:inputFeatureIdentification}. %,

% %Uncertainties can be category in to two types -- Aleatoric uncertainty and Epistemic uncertainty \citep{sullivan2015introduction}: Aleatoric uncertainty often refers to the uncertainty that incorporate from the data, such as uncertainty brought by poor quality of labeling data. Epistemic uncertainty stands for those uncertainty caused by models due to lack of knowledge. For example, models give uncertain predictions from instances due to limited amount of training data. In our work we mainly focus on the epistemic uncertainty. Particularly we define the type of our uncertainty as $P(\hat{y}!=y)$, \textbf{which stands for the type of the predictive uncertainty that models gives incorrect predictions}. 

% % \begin{figure*}[htbp]
% %   \centering
% %   \includegraphics[width=0.5\columnwidth]{Untitled.pdf}
% % \caption{Our framework target on estimate the uncertain predictions where the wrong predictions locates.}
% % \label{fig:pipeline}
% % \end{figure*} 

% %Common approaches to estimate epistemic uncertainty are through the injection of stochastic noise and study the predictive distribution \citep{8371683,bnn_wight_uncertainty,sngp}. Unlike images, textual data are discrete and in a non-numerical form that models can hardly to study its distribution directly. To deal with this issue PLMs relies on textual data encoded at a high-dimensional continuous representation space, that pre-trained on huge amount of textual data. Tracking the change between input/output in the discrete feature space might not reveal the mechanism of the black box. Therefore, we cannot easily capture those prior distributions like research did on the statistical word embeddings \citep{scholar_VAE}. Some research achieved to estimates PLM uncertainty via deterministic networks such as BNN, SNGP and MCDropout \citep{bnn_wight_uncertainty, sngp,monte_carlo_dropout, acl2022_uncertainty_transformers}. Those approaches either assumes models' weights follow a Gaussian or a mixture of Gaussian distribution or adopt concretely Bernoulli distributed random variables to approximate the exact posterior distribution \citep{8371683,monte_carlo_dropout,10.5555/3045118.3045290}. 

% %Nevertheless, all these approaches cannot identify the input features which causes classification uncertainties and requires additional training or modifying on the origin model structure. Some methods have also been examined that tend to make over-confident predictions \citep{mc_overcofident}. For a precise and sensitive model like PLMs, modification on models or change on input data could potentially cause difference on the original textual representation. Therefore, their analysis may result in different object from the representation space. 
% % To tackle this issue, in this paper, we present a solution to use latent variable model perform the uncertainty estimation on PLMs.

% % If we assume the distribution of input golden label is $P(y|x)$, current researches mainly focus on investigate the predictive distributions on a trained model, which is $P(\hat{y}|x)$. The uncertainty they analysed is the distribution of wrong predictions, which is $P(\hat{y}!=y|x)$. However, due to the modification on the models, they tried to produce a new distribution $P(\hat{y}'|x)$ that approach the $P(y|x)$ to interpret the behaviour of $P(\hat{y}|x)$. For representation based models this is always hard to ensure all those distributions shares the same space when their parameter changes. Hence, in our approach, we trying to use our $P(\hat{y}'|x)$ to approach of the distribution of $P(\hat{y}|x)$. Since we kept the original models' structure and freeze their parameters, plus the restriction with our objective functions, our framework ensures this approximation stays in the same dimensional space. 


% %Without the requirement of additional training on original model or modifying the model structure, our framework tend to perform perturbation on model's original representation space and generate semi-factual representation points to find a more calibrated distribution based on its original classifier \citep{Kenny_Keane_2021}. We did not assume a strong distribution to enforce the data points as Gaussian distribution or other formats. Instead, as explained in section \ref{sec:counterfactual}, our framework tend to learn the distribution $P(\hat{y}'|x)$ under the supervision of original models' classifier $P(\hat{y}|x)$. We further proved the difference on perturbed text representation can be seen as interpolation on the $e$'s space and represent different input $x$s. This difference can be then used to estimate the change of prediction, $P(\hat{y}'!=\hat{y}|x)$. Then, we further explored the capability of our framework on provide uncertainty interpretation at token level in section \ref{sec:inputFeatureIdentification}.

% Extensive experimental results show that our proposed method is sufficient to calibrate various classifiers built on four commonly used PLMs, achieving lower expected calibration errors on three tasks and across four benchmark datasets compared to existing approaches addressing the prediction overconfidence problem. Moreover, our proposed framework allows the identification of input token which cause classification uncertainty, offering uncertainty interpretations. %estimate uncertainty and calibrate its representation to find a distribution under original classifier's supervision. 
% In summary, our contributions are:
% \begin{itemize}
%   \item We propose a novel latent variable framework %\citep{VAE,scholar_VAE} 
%   to generate semi-factual perturbations on PLM-encoded representations for classifier uncertainty analysis.
%   %\item We investigate the capability of our latent variable's on interpretation for uncertainty.
%   \item We propose an uncertain feature identification algorithm to identify token-level features which lead to model predictive uncertainty. % sequence-level predictive uncertainty to token-level.
%   \item We validate the effectiveness of our proposed framework by conducting extensive experiments using various classifiers built on four commonly-used PLMs on three different tasks and four datasets with class numbers ranging from 2 to 27. The results show that our proposed framework achieves lower expected calibration errors compared to existing approaches such as label smoothing, Monte-Carlo dropout or Bayesian neural networks. 
% %   \item We use our framework to interpret PLMs predictive uncertainty both at the sentence- and the token-level.
% \end{itemize}
% To the best of our knowledge, our framework is the first to generate semi-factual interpolation for uncertainty analysis on PLM-induced text representations to offer interpretations of predictive uncertainty. % from sentences representations. 
% We will release our code on GitHub.

% Original introduction

% Researchers have recently found that part of the (predictive) uncertainty for NLP tasks comes from the complexity of deep neural models, which tends to output overconfident probabilities that lead to unreliable predictions \citep{softmax_uncertain}.  Recent research \citep{yang-etal-2021-exploring} found out that the use of counterfactual texts can improve the text classifier robustness by removing/editing input tokens. However, unlike statistical language models, PLM classifiers tend to highly rely on their latent representations which are obtained from a complicated learning architecture. Tracking the change between input/output in the discrete feature space might not reveal the mechanism of the black box. It is an emerging area to generate a counterfactual representation for analyzing and interpreting text classifier behaviours in a continuous representation space.

% Quantifying uncertainties in PLM-based classifiers can be done by incorporating weight uncertainty into the PLM architecture. However, since PLM may have many layers and millions of parameters, it is only feasible to induce the uncertainty to a certain number of layers (e.g., the last layer of the PLM feature extractor) and/or the classification layer. Alternatively, one could use deep ensembles by aggregating classification results generated from multiple PLM classifiers trained with different initialisation \citep{deep_ensemble}, or apply Monte-Carlo dropout during the inference stage \citep{monte_carlo_dropout,acl2022_uncertainty_transformers}. Nevertheless, all these approaches cannot identify the input features which causes classification uncertainties.

% \begin{figure*}[htbp]
%   \centering
%   \includegraphics[width=\columnwidth]{pipeline.pdf}
% \caption{Our framework requires two phases of training: \textbf{Phase 1.} Fine-tune a classifier built on a Pre-trained Language Model (PLM) on a target dataset; \textbf{Phase 2.} Freeze the PLM and classification layer parameters and plug-in a VAE and train the VAE that perturbs the latent representation $\bm{z}$ to generate counterfactual $\bm{e}'$ which leads to predictive uncertainty fluctuation.}
% \label{fig:pipeline}
% \end{figure*} 

% To address the above issues, in this work, we focus on studying the classifier predictive uncertainty by obtaining a more calibrated model with representation-level perturbation. With this new framework, we can study the classifier uncertainty with the change on predictive probabilities based on the original representation and the perturbed representation. We study the predictive uncertainty change by identifying the most important latent dimension so as to retrieve its corresponding input tokens which lead to uncertainty, as will be %mapping it on the latent space 
% described in Section \ref{sec:inputFeatureIdentification}.

% Extensive experimental result shows that our proposed method is efficient to calibrate model with reasonable predictions. In summary, our contribution can be summarized as:
% \begin{itemize}
%   \item We proposed a Variational Autoencoder (VAE) \citep{VAE,scholar_VAE} framework to generate counterfactual perturbation on PLM encoded representations for classifier uncertainty analysis.
%   \item We proposed an uncertain feature identification algorithm to map sequence-level predictive uncertainty to token-level.
%   \item We use our framework to interpret PLMs predictive uncertainty both at the sentence- and the token-level.
% \end{itemize}
% To the best of our knowledge, our framework is the first to generate counterfactual perturbation for uncertainty analysis on text representations to identify token-level uncertainty from sentences. We will
% release our code on GitHub.

\section{Related work}
Our work is related to two lines of research: the interpretation of PLMs and uncertainty estimation in ML.
\paragraph{Interpretation of PLMs}
Transformer-based language models have achieved impressive performance across various NLP tasks \citep{bert,roberta}. However, the complex structure of these models has raised concerns about model transparency and reliability. Thus, there has been growing interest in developing methods to interpret PLMs. For example, \citet{clark-etal-2019-bert} proposed an attention-based visualisation method to interpret the model parameters by probing the feature space to determine the potential influence of the model output. \citet{Brunner2020On} studied the identifiability of attention weights in the BERT model and found that the distribution of self-attentions cannot be directly used as an interpretation. There has also been work focusing on interpreting the representations from PLMs \citep{zhou-srikumar-2021-directprobe} and attention weights \citep{sun-marasovic-2021-effective,marecek-rosa-2019-balustrades}.
\paragraph{Uncertainty Estimation}
As the interpretation of PLMs cannot provide prediction confidence directly, much effort has been devoted to developing approaches for uncertainty estimation of neural models. A straightforward approach to uncertainty estimation is %Recent research of 
using deep ensemble models %on uncertainty estimation 
\citep{deep_ensemble}. %attracts researchers start to study the predictive uncertainty in deep neural networks \citep{yarin-blackbox,latent_gussian_process}. Most of the researchers propose 
Various Bayesian inference methods have also been developed to prevent overfitting  %achieve a lower predictive uncertainty by adding variance to a complex network architecture for an interpolatable estimation of unseen data, through re-parameterising 
by attaching distributions to parameters in standard networks and estimating parameters via posterior inference \citep{bnn_wight_uncertainty}. Alternatively, uncertainty estimation can be performed using MC dropout \citep{monte_carlo_dropout}, which performs multiple stochastic forward passes with dropout in a network during the inference stage to produce an ensemble of predictions. %This enables massive research to investigate the uncertainty in the classification for different applications, such as the image classification 
Other approaches to uncertainty estimation include prior networks \citep{predictive_uncertainty}. %, and classification with discrete feature space \citep{getting_clue}. 
Taking advantage of the development of the Transformer \citep{transformer}, there has been increasing interest in investigating classification uncertainty of language models \citep{quantify_uncertainty_nlp,desai-durrett-2020-calibration}. Various methods have been developed, partly inspired by the research in computer vision, from uncertainty quantification via input marginalization \citep{token_level_uncertainty,token_level_classification_interpret} to MC dropout and Bayesian inference methods such as SNGP \citep{how_certain_transformer,acl2022_uncertainty_transformers,sngp}. %However, as a sequential model, simply removing tokens from an input not only changes the original content, but also the word order. Hence, 
Nevertheless, the aforementioned approaches cannot identify the cause of the uncertainty. %The Bayesian inference based methods, on the contrary, could generate an explainable uncertainty estimation statistically at the sentence level, but are unable to identify input tokens leading to predictive uncertainty.

To overcome the limitations of existing methods, we propose an uncertainty analysis framework \texttt{CUE} built on a VAE \citep{vae,scholar_vae,gaussian_process_uncertainty}, in which noise can be generated by perturbing the latent representation space. This allows us to disentangle the source of uncertainty via text representation dimensions and study PLM-based classifiers' predictive uncertainty at both the sequence- and the token-level.

\section{Background}
\begin{figure*}[!t]
  \centering
  \includegraphics[width=0.84\linewidth]{resources/pipeline_cue.pdf}
\caption{\texttt{CUE} requires two phases of training: \textbf{Phase 1}: Fine-tune a classifier built on a PLM on a target dataset; \textbf{Phase 2}: Freeze the PLM and classification layer parameters and plug in the \texttt{CUE} module to train the model that perturbs the latent representation $\bm{z}_i$ to generate $\bm{e}'_i$ which leads to uncertainty fluctuation.}
\label{fig:pipeline}
\end{figure*} 

\subsection{Problem setup}

We are given a labelled text classification dataset, where $X$ is the input text and $Y$ is the label set. Each pair $(x_i, y_i) \in X \times Y$, $i = 1, 2, \dots, N$, is an i.i.d. realisation of the random variables, with $P(Y|X) \sim \mathbb{D}$, where $\mathbb{D}$ is the unknown ground truth conditional distribution of class labels. To train a text classifier built on a PLM, we need to find an optimal feature extraction function $f$ and a classification layer $g$ with trainable parameters $\bm{\vartheta}$ and $\bm{\eta}$, respectively: $x_i \stackrel{f_{\bm{\vartheta}}(\cdot)}{\longrightarrow} \bm{e}_i \stackrel{g_{\bm{\eta}}(\cdot)}{\longrightarrow} \hat{y}_i$, which first encodes text into a representation $\bm{e}_i$ and then outputs a probability distribution over the label set with the predicted output close to the desired true label $y_i$. In this work, we take one step further to analyse the potential uncertainty in the two stages of the learning process: 1) in $\bm{\eta}$: $\bm{e}_i \stackrel{g_{\bm{\eta}}(\cdot)}{\longrightarrow} \hat{y}_i$, which dimension(s) in $\bm{e}_i$ are the source of uncertainty in prediction; and 2) in $\bm{\vartheta}$: $x_i \stackrel{f_{\bm{\vartheta}}(\cdot)}{\longrightarrow} \bm{e}_i$, which input tokens cause the uncertainty. Before we detail our proposed uncertainty estimation approach, we give the formal definition of uncertainty first.
%and $y_t \in \mathcal{Y}$ 
%$\mathcal{X}_{\rm train}$, $\mathcal{Y}_{\rm train}\in \mathcal{X} \times \mathcal{Y}$ and $x_{t}\in \mathcal{X}_{\rm train}$, $y_{t}\in \mathcal{Y}_{\rm train}$ for $t = 1,2, ...,T$ of i.i.d. realizations of random variables $\mathcal{X}_{\rm train},\mathcal{Y}_{\rm train} \sim \mathbb{D}_{\rm train}$, where $\mathbb{D}_{\rm train}$ is the unknown ground truth distribution of the training data. Given $x_t$, a pre-trained language model $\bm{\vartheta}, \bm{\eta}: \mathcal{X}_{\rm train} \rightarrow \bm{e} \rightarrow (\mathcal{Y}_{\rm train} \rightarrow [0,1])$ outputs a probability distribution $F_t(y)$ targeting the label $y_t$.
% PLMs can be fine-tuned with a task-specific classification layer on a training set for a downstream task. Taking text classification as an example, the input document of a PLM is denoted as $\bm{x}$, and the gold label is denoted as $y$, where $y\in\mathcal{Y}$, and $\mathcal{Y}$ is the label set. Let $\bm{\vartheta}$ be the parameters of the PLM layers, $\bm{\eta}$ be the parameters of the task-specific classification layer, and $\bm{e}$ be the text representation generated by the PLM. Each token has its representation generated from the last layer of the PLM, without further specifications, the $\bm{e}$ generally indicates the representation of the sentence-level classification token. Then the prediction of the fine-tuned model can be represented as:
% \begin{equation}
% \small
%     \hat{y}=\argmax_{y\in\mathcal{Y}} p(y | \bm{x}; \bm{\vartheta}, \bm{\eta}) = \argmax_{y\in\mathcal{Y}} p_{\bm{\eta}}(y | \bm{e}) p_{\bm{\vartheta}}(\bm{e} | \bm{x})  
% \end{equation}
% A latent multi-Gaussian process model for reconstructing text representations contains an encoder network and a decoder network.
% \begin{assumption}
% \label{ass:gaussian}
% We assume the longitudinal data are collected from M correlated tasks, over an arbitrary representation space $\tau$. We define a latent process common to all input data, such that output for the i-th data can be expressed as:
% \begin{equation}
% \small
%     \bm{z}_i(\bm{e}) = \bm{\mu}(\bm{e}) +\bm{f}_i (\bm{e}), \forall \bm{e} \in \tau, \notag
% \end{equation}
% where: $\bm{\mu}(\cdot) \sim \mathcal{GP}(m_0(\cdot),K_{\theta_{0}}(\cdot,\cdot))$ is the common mean process, $\bm{f}_i(\cdot) \sim \mathcal{GP}(0,\Sigma_{\gamma_{i}}(\cdot,\cdot))$ is the i-th task-specific process. 
% Under the hypothesis of independence between all pairwise elements, we remark that, conditionally to $\mu$, all $\bm{z}_i$ are independent and the conditional likelihood remains Gaussian:
% \begin{equation}
% \small
%     \bm{z}_i(\cdot)|\bm{\mu}(\cdot) \sim \mathcal{GP}(\bm{\mu}(\cdot), \Sigma_{\gamma_{i}} (\cdot,\cdot)) \notag
% \end{equation}
% \end{assumption}
% while the decoder network reconstructs the input $\bm{e}'$ given the latent variable $\bm{e}' &= p_{\bm{\theta}}(\bm{z})$, where $\bm{\phi}$ and $\bm{\theta}$ are the sets of parameters in the encoder and decoder respectively. The classifier's prediction on the reconstructed representation $\bm{e}'$ is denoted as $\hat{y}'$, $\hat{y}' &= p_{\bm{\eta}}(\bm{e}')$. 
% Here $\bm{\mu}$ is the mean vector and $\diag(\bm{\sigma}^2)$, which stands for the diagonal matrix of the vector $\bm{\sigma}^2$, is the covariance matrix of the Gaussian distribution. 
% In addition, we use $[n]$ to represent $\{1,2,...,n\}$, $K$ to represent the labels set, 
% $\mathbb{E}$ to represent the mathematical expectation and $\mathbb{D}$ to represent the  variance. We use $||\cdot||$ to denote the operator/spectral norm of matrices and $\mathcal{L}_{2}$-norm for vectors.
\subsection{Uncertainty Estimation}
\label{sec:uncertain_estimation}
According to established definitions found in prior literature, uncertainty can be defined based on the probability of predictive error \citep{sullivan2015introduction}, the mean squared error (MSE) \citep{mse_uncertainty}, or the conditional entropy \citep{entropy_uncertainty}. %Since correlations among these definitions are positive, 
% In our work, 
We adopt the MSE-based definition as a representative measure of uncertainty, which is chosen without compromising the generality of our approach. 
\begin{definition}
$\forall P(y|x) \in \mathbb{D}$, the predictive epistemic uncertainty can be defined by $\mathbb{E}\big[\big(\mathbb{E}[y] - \hat{y}\big)^2\big]$.
%\begin{equation}
%    P(\hat{y} = \predictor(x|\bm{\vartheta}, \bm{\eta})\in \mathcal{Y}_{\rm fail})
%\end{equation}
\end{definition}
Here, $\hat{y}$ is the class label for input $x$ predicted by the trained classifier. $\mathbb{E}[y]$ is the expectation of the ground truth label distribution, which is however unknown to the learner, making it impossible to calculate the epistemic uncertainty based on predictive variance directly. %$\mathbb{E}[\cdot]$ denotes the mathematical expectation of a random variable. The above definition states that the predictive epistemic uncertainty is defined by the predictive variance based on the true distribution of $\mathbb{D}$. However, the true distribution of $\mathbb{D}$ is unknown to the learner which is the main challenge in  uncertainty quantification according to our problem setup. 
Therefore, we propose to estimate the uncertainty by decomposing the variance based on the observed training data, $\{x_i,y_i\}_{i=1}^N$, %$$ P(y_i|x_i) \in \mathbb{D}$, 
which yields:
\begin{equation}
\small
    \mathbb{E}[(y_i - \hat{y}_i)^2] = \underbrace{\mathbb{E}[(y_i - \mathbb{E}[y])^2]}_{\rm aleatoric\,uncertainty} + \underbrace{\mathbb{E}[(\mathbb{E}[y] - \hat{y}_i)^2]}_{\rm epistemic\,uncertainty}
    \label{eq:predVariance}
\end{equation}
%     & \mathbb{E}\big[\big(y_i - \hat{y}_i\big)^2\big] \nonumber \\
%     & = \underbrace{\mathbb{E}[(y_i - \mathbb{E}[y])^2]}_{\rm aleatoric\,uncertainty} + \underbrace{\mathbb{E}\big[(\mathbb{E}[y] - \hat{y}_i)^2\big]}_{\rm epistemic\,uncertainty}
%     \label{eq:predVariance}
% \end{align}
%Here, $\mathbb{E}[y]$ denotes the expectation of the ground truth label distribution. 
Since the first term, $\mathbb{E}[(y_i - \mathbb{E}[y])^2]$, contains the observed $y_i$, it can be defined as the aleatoric uncertainty. The detailed derivation of Eq. (\ref{eq:predVariance}) can be found in our Supplementary Material Section 1.1. Similar to the setup of \citet{heiss2023nomu}, we assume that the conditional distribution of class labels follows a Gaussian distribution:
\begin{assumption}
\label{assum:gaussian}
    $\forall P(y_i|x_i) \in \mathbb{D}$, the true label distribution for a given data point $x_i$ follows a Gaussian-noise-based generating process: $P(y_i|x_i) = \bm{\mu}_y + \bm{\varepsilon}_{y_i}$, %where distribution $\mathcal{N}(\bm{\mu}_y,\bm{\sigma}_y^2)$ 
    where $\bm{\mu}_y = \mathbb{E}[y] $ and the noise $\bm{\varepsilon}_{y_i}$ follows a Gaussian distribution of $\mathcal{N}(0,\bm{\sigma}_y^2)$ and $\bm{\sigma}_y^2 = \mathbb{E}[(\mathbb{E}[y] - y_i)^2] $.
\end{assumption}
Then, the epistemic uncertainty can be written as:
\begin{equation}
    \mathbb{E}\big[(\mathbb{E}[y] - \hat{y}_i)^2\big] = \mathbb{E}[(y_i - \hat{y}_i)^2] - \bm{\sigma}_y^2.
    \label{eq:epistemic}
\end{equation}
Here, the term $\mathbb{E}[(y_i - \hat{y}_i)^2]$ in Eq. (\ref{eq:epistemic}) is the empirical MSE on the training data, which can be optimised in the training process. The term $\bm{\sigma}_y^2$ is based on the true label distribution, which is unseen to the learner. We need to clarify that $P(y_i|x_i)$ can be larger than $1$ under this assumption. Therefore, it is necessary to stack a normalisation layer before the prediction to guarantee that the sum of predictive probabilities for different class labels is $1$. 

We assume that the empirical MSE has been minimised by the trained PLM-based classifier with parameters $\bm{\vartheta}$ and $\bm{\eta}$. To minimise the epistemic uncertainty given by Eq. (\ref{eq:epistemic}) for a given $x_i$ and its corresponding representation $\bm{e}_i$, we have to increase $\bm{\sigma}_y^2$, which however cannot be calculated directly. We propose to use a VAE-based generative model parameterised by $\bm{\omega}$ to reconstruct $\bm{e}_i$ by adding Gaussian noise while preserving the predictive label, resulting in $\bm{e}'_i$. The reconstructed representation should be similar to the original input representation, $\bm{e}'_i\approx \bm{e}_i$, and the predictive class label distribution from $\bm{e}'_i$, $\hat{y}'_i = g_{\bm{\eta}}(\bm{e}'_i)$, should be close to $\hat{y}_i= g_{\bm{\eta}}(\bm{e}_i)$, $\hat{y}'_i \approx \hat{y}_i$. %and $ g_{\bm{\eta}}(\bm{e}'_i) \sim g_{\bm{\eta}}(\bm{e}_i)$. 
%Thus we are going to have two different predicted label distributions, the original label distribution $\mathcal{\hat{Y}}_i = p(\hat{y}_i|x_i)$ and reconstructed label distribution $\mathcal{\hat{Y}}'_i = p(\hat{y}'_i|x_i)$.
This allows us to manipulate the latent code of the VAE to increase the variance of the Gaussian noise, which brings the resulting label distribution %a more general assumption that is 
closer to a uniform distribution in the out-of-distribution (OOD) area, thus achieving a lower epistemic uncertainty. Accordingly, we define the learning objective function as:

%generative learning architecture to approximate the output by $\bm{\vartheta}$ and $\bm{\eta}$, named as $\bm{\vartheta}'$ and $\bm{\eta}'$. Meanwhile, an estimator target to enlarge the $\bm{\sigma}_y^2$ without changing the prediction will be deployed to guarantee the minimising of epistemic uncertainty. Since we assume the the variable $y$ follows Gaussian distribution $\mathcal{N}(\bm{\mu}_y,\bm{\sigma}_y^2)$, we use the $\mathcal{H}(y)$ in our optimising which is proportional to $\bm{\sigma}_y^2$: $\mathcal{H}(y) = \frac{1}{2} \rm ln 2\pi\bm{\sigma_y^2}  + \frac{1}{2} \propto \bm{\sigma_y^2}$.
% \begin{equation}
%     \mathcal{H}(y) = \frac{1}{2} \rm ln 2\pi\bm{\sigma_y^2}  + \frac{1}{2} \propto \bm{\sigma_y^2} 
% \end{equation}
%Then, the learning objective can be defined as:
\textbf{Learning objective:} $\forall P(y_i|x_i) \in \mathbb{D}$ under Assumption~\ref{assum:gaussian}, the learning objective is to:
\begin{equation}
\small
% \label{eq:learningobjective}
     \min_{\bm{\omega}} \mathbb{E}[(y_i - \hat{y}_i)^2] - \mathcal{H}_{e'}(\hat{y}),  \quad
    \text{s.t.} \quad \bm{e}_i \approx \bm{e}'_i, \quad 
    \hat{y}_i \approx \hat{y}'_i,
    %g_{\bm{\eta}}(\bm{e}'_i) \approx g_{\bm{\eta}}(\bm{e}_i)
\end{equation}
where $\bm{\omega}$ denotes the parameters of the VAE, $\bm{e}_i=f_{\bm{\vartheta}}(x_i)$, $\bm{e}'_i=f_{\bm{\omega}}(\bm{e}_i)$, $\hat{y}_i= g_{\bm{\eta}}(\bm{e}_i)$, $\hat{y}'_i = g_{\bm{\eta}}(\bm{e}'_i)$, and $\mathcal{H}_{e'}(\hat{y})$ is the entropy estimated from the predictive label distribution of the reconstructed $\bm{e}'_i$, which approximates the variance of the true label distribution. The above learning objective can be formulated using the method of Lagrange multipliers: %to optimize the following multiple constraints Lagrange multipliers:
% \paragraph{Learning objective:} $\forall \{x_i,y_i\} \in \mathbb{D}$, $y \sim \mathcal{N}(\mu_y,\sigma_y^2)$, the learning objective is to optimize the following Lagrangian function:
{\small 
\begin{align}
    \mathcal{L}(x_i,y_i) = &- \mathbb{E}[ \mathcal{H}_{e'}(\hat{y}_i)]     + \lambda_1 \mathbb{E}[(f_{\bm{\vartheta}}(x_i)-f_{\bm{\omega}}(f_{\bm{\vartheta}}(x_i)))^2] \notag \\
    &+ \lambda_2 \mbox{KL}\big( \hat{y}'_i || \hat{y}_i \big)
    \label{eq:learningobjective}
\end{align}}
Therefore, by optimising Eq. (\ref{eq:learningobjective}), we can obtain an alternative representation of $\bm{e}'_i$ with the predictive distribution of $\hat{y}'_i$ using the parameters $\bm{\omega}$, where the lower bound of the epistemic uncertainty can be obtained by increasing the aleatoric uncertainty defined by $\mathcal{H}_{e'}(\hat{y}_i)$. %In the experimental setup, we will take the expected calibration error to measure the estimation result. 
In the next section, we show how each term in Eq. (\ref{eq:learningobjective}) can be defined in our VAE-based uncertainty interpretation framework \texttt{CUE}.

%However, the optimising targets regarding to the two terms in Eq. \ref{eq:learningobjective} lead to different directions in the hyperspace. Therefore, to determine the boundary where the entropy from the aleatoric uncertainty has been increased but without changing the prediction, we propose a VAE-based, Gaussian mixture process \cite{vae, getting_clue} framework \texttt{CUE} to approximate the learning objective -- Since mixture Gaussian can be used to disentangle the latent representations into independent codes and corresponding vectors, especially the OOD area with mean class ratio\citep{zhang2022ood}, and quantify the contribution from different codes to the final outputs, including the uncertainty estimation on both dimension level and token level, by the latent perturbations proposed in our next section.

%approximate any continuous functions, we can then generate perturbations via a latent variable and reconstruct the text representations to approximate the functionality of the aforementioned $\bm{\vartheta}'$. The first property of Eq.\ref{eq:learningobjective}, can thus be achieved by generating noisy perturbations via maximising the variance in MGP.

%However, in the existing pre-trained language model, it usually assumes that the model is well-trained on the large scale of training corpus and no different between the training and true distribution. As a result
% where the $\mathcal{Y}_{\rm fail}$ is identified as a ‘failure set’ and $\predictor(\cdot)$ gives the output of a classifier. The main challenge to estimating the above probability is that the ‘failure set’ is unseen to the trainer. Therefore, we cannot simply take the conditional probability given by the predictor for estimation. More concretely, if we assume the true distribution of $y_{\rm train}$ has the expected value $\mathbb{E}[y_{\rm train}]$, without loss of generality, we are able to obtain the following by decomposing the Mean Squared Error (MSE) based predictive uncertainty:
% \begin{align}\small
%     & \mathbb{E}[(y_{\rm train}-\hat{y})^2] \\
%     &= \underbrace{\mathbb{E}[(y_{\rm train} - \mathbb{E}[y_{\rm train}])^2]}_{\rm aleatoric\,uncertainty}+\underbrace{\mathbb{E}[(\mathbb{E}[y_{\rm train}] - \hat{y})^2]}_{\rm epistemic\,uncertainty} \nonumber \\
%     &= \mathbb{V}[y_{\rm train}] + \mathbb{E}[(\mathbb{E}[y_{\rm train}] - \hat{y})^2]
% \label{eq:total_uncertainty}
% \end{align}
% The above total uncertainty can be decomposed into aleatoric and epistemic uncertainty, where the aleatoric uncertainty refers to the variability of outcome due to inherent random effect and is irreducible, while the epistemic uncertainty is caused by the lack of knowledge about the best model and is reducible. Therefore, %Assuming $\Lambda=\{\bm{\vartheta}, \bm{\eta}\}$, the aleatoric uncertainty can be estimated by $\mathbb{E}_{p(\Lambda|\mathcal{D})}\big[ H(y|\bm{x},\Lambda) \big]$. 
% the epistemic uncertainty can be derived by subtracting the aleatoric uncertainty from the total prediction error. 
% To estimate the aleatoric uncertainty, which can be represented as the variance of the label $y$, we propose to make perturbations on the input representation to increase the predictive entropy of $y$ by injecting a Gaussian noise sampled from the data as well as keeping the original prediction results. 

%\subsection{Learning Objective}
% During the training process, the trainer tends to learn the distribution of the train set:$\mathcal{Y}_{\rm train}=P_{\mathbb{D}_{\rm train}}(\mathcal{X}_{\rm train})$ during the training process. However, it is hard to probe the classifier and investigate its decision boundary. We can controllably generate perturbation on each $x_t$ by minimizing the MSE, but this may not be equivalent to optimizing the predicted $\hat{y}$. 
% We assume the distribution follows the Gaussian distribution, 
% As it is unclear to define the decision boundary of the text classifier, we are hard to observe the distribution of $\mathcal{Y}$ in the model.
% \begin{definition}
% Each instance used to train the PLM are i.i.d samples from the data generating process $y_{\rm train} = f(x_{\rm train})+\varepsilon$, where $\varepsilon|x_{\rm train}\sim\mathcal{N}(\mu,\sigma^{2}_{n}(x_{\rm train}))$. 
% \end{definition}
% We use $\sigma_{n}$ to denote the data noise (aleatoric uncertainty). 


%we aim to fix the total uncertainty $\mathbb{E}[(y_i - g_{\eta}(f_{\vartheta}(x_i)))^2]$ and enlarge the aleatoric uncertainty $\bm{\sigma_y^2}$.
% that is let $P(\hat{y}) \simeq P(\hat{y}')$. This is equivalent to optimizing the generating process $\delta f(y_{\rm train})$.
%Under the Gaussian distribution in assumption \ref{assum:gaussian}, $\mathcal{H}(y)$ is propotional to $\bm{\sigma_y^2}$:
%\begin{equation}
%    \mathcal{H}(y) = \frac{1}{2} \rm ln 2\pi\bm{\sigma_y^2}  + \frac{1}{2} \propto \bm{\sigma_y^2} 
%\end{equation}
%Increasing the entropy of $y_i$ is equivalent to increasing the aleatoric uncertainty. 
% We assume that the distribution of label $Y$ is under the Gaussian distribution. For any $Y$ in $\mathcal{N}(\mu,\sigma^2)$, aleatoric uncertainty $= \mathbb{V}[y_{\rm train}] \propto H(y_{\rm train})$.
% Under the above assumption, 
%Therefore optimising the epistemic uncertainty is equivalent to making perturbations on the text representation to increase the aleatoric uncertainty. We can conclude the below theorem:
% $H(y_{\rm train})$ by injecting a Gaussian noise sampled from the data as well as keeping the original prediction results, that is $P(\hat{y}) \simeq P(\hat{y}')$.
%\begin{theorem}
%\label{theo:learning_object}
%$\forall x,y \in \mathbb{D}$, $y \sim \mathcal{N}(\mu_y,\sigma_y^2)$, increasing $\bm{\sigma_y^2}$ is equivalent to make $\mathcal{H}(y) > \mathcal{H}(y_i)$ and $y = y_i$.
% If any $Y$ in $\mathcal{N}(\mu,\sigma^2)$, 1.optimize $P(\hat{y}) \sim P(\hat{y}')$ and 2.increase $H(y_{\rm train})$ is equivalent to enlarge the $\mathbb{V}[y_{\rm train}]$.
%\end{theorem}

% Intuitively, the high variance always indicates the high entropy in Gaussian distribution. Therefore, to simplify the computation in the following sections, we compute the bound of entropy variation to represent the aleatoric uncertainty, and further estimate and interpret the epistemic uncertainty.
%we propose to use the VAE based reconstruction by injecting a Gaussian noise that increase classifier's aleatoric uncertainty without changing the prediction results to estimate the epistemic uncertainty.
% According to \citep{Malinin2018PredictiveUE}, total uncertainty in prior networks can be divided into three components: Model uncertainty, Expected Data Uncertainty and Distributional Uncertainty. Where model uncertainty is caused by the lack of knowledge about the best model, expected data uncertainty describes an irreducible uncertainty caused by the natural complexity of the data. In the previous work, distributional uncertainty is used to describe the uncertainty difference between different sources of data (e.g. in-domain or Out-of-Domain). In this work, we aim to use distributional uncertainty to estimate the model uncertainty and examine the model robustness.
% Current PLMs fine-tuning  process learns the distribution of $\mathbb{E}[y|x]$. However, due to limited in-domain training and testing data, we are hard to estimate the model uncertainty and exam the classifier robustness based on the observation. Therefore, in this work, we adopt prior networks to study the distributional uncertainty based under different supervision. The original trained PLM can be seen as the best model that makes the distribution of $\mathbb{E}[\hat{y}|e,e|x]$ closer to the distribution of $\mathbb{E}[y|x]$, aka under the supervision of data. The additional prior networks aims to produce a new distribution that $\mathbb{E}[\hat{y}'|e']$ closer to $\mathbb{E}[\hat{y}|e]$ but keep uncertainty as high as possible, aka under the supervision of the trained classifier.
% \begin{align}
%     \textrm{PLM\,total\,uncertainty} &= \textrm{expected\,data\,uncertainty} + \textrm{model\,uncertainty}\\
%     \textrm{PLM\,with\,Prior\,total\,uncertainty} &= \textrm{expected\,data\,uncertainty} + \textrm{model\,uncertainty} + \textrm{distributional\,uncertainty}\\
%     \textrm{Prior} - \textrm{PLM\,total\,uncertainty} &= \textrm{distributional\,uncertainty} = \textrm{model\,uncertainty}
% \end{align}
% The prior network generated representation $\bm{e}'$ is aim to enlarge the predictive uncertainty while keeping the new prediction $\mathbb{E}(\hat{y}')$ follow the original distribution $\mathbb{E}(\hat{y})$. In short, $\mathbb{E}(\hat{y}'|e')-\mathbb{E}(\hat{y}|e)$ represents the distributional predictive uncertainty difference of $\bm{e}$ and $\bm{e}'$, which can be understand as the measure of model uncertainty.

\section{Uncertainty Interpretation}

In this paper, we are interested in interpreting model uncertainty, that is, what input features lead to the predictive uncertainty. %One simple way is to mask out some input word tokens and observe the change in predictive uncertainties \citep{token_level_uncertainty,token_level_classification_interpret}. However, due to the complexity of PLMs, making perturbation on input text to study the classifier uncertainty has proved to be difficult due to the huge search space of possible candidate words and combinations to be perturbed. 
% % It is also time-consuming to study the classifier behaviour as the generated perturbations would require human verification to ensure they are plausible and meaningful \citep{human_generate_token_adv}.
% Rather than directly perturbing the input features, in computer vision, perturbations can be done in the latent space to generate counterfactuals such that the modified input would still reside on the data manifold while the model's prediction on the modified input becomes more confident \citep{getting_clue}. 
% By examining the difference between the original and the perturbed image, a subset of input features which are responsible for uncertainty can be identified on latent (or conceptual) representations \citep{getting_clue} and introduce minimum perturbation in the latent space which can lead to a change of predictive uncertainty. 
% It is, however, difficult to trace back input word tokens leading to the change in the latent space due to discrete token sequences. Plus the unexplainable nature of PLM encoding layers, it is also hard to find out the proportion of each token representations included in the classification representation that causes sentence-level predictive uncertainty.  
To this end, we propose a VAE-based uncertainty interpretation framework \texttt{CUE}, as shown in Figure~\ref{fig:pipeline}. Rather than directly perturbing the input features, \texttt{CUE} applies perturbations in the latent space to generate a modified input representation that still resides on the original data manifold while the model's predictive epistemic uncertainty on the modified input is reduced. By examining the difference between the original and the perturbed text representations, a subset of input features (i.e., word tokens) can be identified as the interpretation of the original model's predictive uncertainty. 

We first present how to generate perturbations in the latent space that change the predictive uncertainty (\textsection{\ref{sec:counterfactual}}). We then describe how to identify the input features that lead to the original predictive uncertainty, in order to facilitate the interpretation of model predictive uncertainty (\textsection{\ref{sec:inputFeatureIdentification}}).

\subsection{Latent Space Perturbation for Epistemic Uncertainty Reduction}\label{sec:counterfactual}
% Each longitudinal textual data $x_i$ from the training set $X$ can be mapped by a PLM text encoder over an arbitrary representation space $\tau$ as a sentence-level representation $\bm{e}$. The representation $\bm{e}$ is a weighted sum of token representations ${\bm{e}_1, ...,\bm{e}_n}$ via unexplainable process, where $n$ is the number of tokens. 
Once a classifier built on a PLM is fine-tuned on a target dataset, we freeze the parameters of the PLM and the classification layer and then insert the \texttt{CUE} between the PLM last layer and the task-specific classification layer. 
The PLM-encoded representation $\bm{e}_i$ is mapped to a latent vector, denoted by $\bm{z}_i$, via \texttt{CUE} which consists of two networks. 

\textbf{The \emph{encoder} network $\bm{\phi}$} learns the distribution of a lower-dimensional latent variable $\bm{z}_i$ given the PLM-encoded representation $\bm{e}_i$, where $\bm{z}_i$ is sampled using a random Gaussian noise $\epsilon$: $\bm{z}_i=\bm{\mu_{\phi}}(\bm{e}_i)+\epsilon\cdot\bm{\sigma_{\phi}}(\bm{e}_i)$, i.e., $\bm{z}_i \sim \mathcal{N}(\bm{\mu_{\phi}},\bm{\sigma_{\phi}}^2)$. 
% Under the hypothesis of independence between all pairwise elements conditionally to $\bm{\mu_{\phi}}$, all $\bm{z}_i$ are independent, and the conditional likelihood remains Gaussian.

\textbf{The \emph{decoder} network $\bm{\theta}$} reconstructs the text representation given the latent variable $\bm{z}_i$, defined as $\bm{e}_i' = p_{\bm{\theta}}(\bm{z}_i)$. Although $p_{\bm{\theta}}(\bm{z}_i)$ can be any decoding network, our implementation utilises a linear mapping $\bm{W_{\theta}}$ without a bias term. The benefit is that $\bm{W_{\theta}}$ can be treated as a set of learnable vectors, and the reconstructed text representation $\bm{e}_i'$ can be written as a linear combination of the decoded outputs generated from each of the latent dimensions of $\bm{z}_i$. As will be discussed in \textsection{\ref{sec:inputFeatureIdentification}}, such a decomposed form of decoding, as illustrated in Eq.~(\ref{eq:GMPR}), allows the identification of the latent dimensions of $\bm{z}_i$ that cause predictive uncertainty. 
%it can be treated as a set of learnable vectors.  and their corresponding linear combinations, which are weighted by the latent representation $\bm{z}_i$, to quantify uncertainty on the dimension of the latent representation. The details of this decoding form will be given in \textsection{\ref{sec:inputFeatureIdentification}}, Eq. (\ref{eq:GMPR}).



% where $\bm{\phi}$ and $\bm{\theta}$ are the sets of parameters in the encoder and decoder respectively.
% % We use $\mathbb{E}_{q_{\phi}(\bm{z}|\bm{e})}[\bm{z}]=\mu_{\phi}(\bm{z}|\bm{e})$ and $\mathbb{E}_{p_{\theta}(\bm{e}'|\bm{z})}[\bm{e}']=\mu_{\theta}(\bm{e}'|\bm{z})$ to denote these two networks' predictive means. 
% The reconstructed input $\bm{e}'_i$ is then fed to the classification layer to generate a new prediction $\hat{y}'_i$, where $\bm{e}_i' \sim p(\bm{e}_i|\bm{p_\theta}(\bm{z}_i))$. 
% Once a classifier built on a PLM is fine-tuned on a target dataset, we freeze the parameters of the PLM and the classification layer. Then we insert the VAE model between the PLM last layer and the task-specific classification layer, to maximize the aleatoric uncertainty by generating semi-factual representation $\bm{e}_i'$ based on each text representation $\bm{e}_i$. We construct the following modelling assumption to generate perturbed latent variable $\bm{z}_i$ by sampling data noise from the representation distribution:
% \begin{assumption}
% \label{ass:gaussian}
%     $\forall P(y_i|x_i) \in \mathbb{D}$, $x_i \stackrel{f_{\bm{\vartheta}} (\cdot) }{\longrightarrow} \bm{e}_i$. $\bm{z}_i=\bm{\mu_{\phi}}(\bm{e_i})+\epsilon\cdot\bm{\sigma_{\phi}}(\bm{e_i})$, where $\bm{z}_i \sim \mathcal{N}(\bm{\mu_{\phi_z}},\bm{\sigma_{\phi_z}}^2)$. $\bm{e}_i' = \bm{p_\theta}(\bm{z}_i)$, where $\bm{e}_i' \sim p(\bm{e}_i|\bm{p_\theta}(\bm{z}_i))$
% \end{assumption}
% Under the hypothesis of independence between all pairwise elements conditionally to $\bm{\mu_{\phi}}$, all $\bm{z}_i$ are independent, and the conditional likelihood remains Gaussian.
% where $\mu(\cdot) \sim \mathcal{GP}(\bm{e}_t(\cdot),K_{\theta_{0}}(\cdot,\cdot))$ is the common mean process, $f(\cdot) \sim \mathcal{GP}(u,\sigma(\cdot,\cdot))$ is the sampled noise.
% Under the hypothesis of independence between all pairwise elements, we remark that, conditionally to $\mu$, all $\bm{z}$ are independent and the conditional likelihood remains Gaussian:
% \begin{equation}
% \small
%     \bm{z}(\cdot)|\mu(\cdot) \sim \mathcal{GP}(\mu(\cdot), \Sigma_{\gamma_{i}} (\cdot,\cdot)) \notag
% \end{equation}
% The decoder network reconstructs the input $\bm{e}'_i$ given the latent variable $\bm{e}'_i = p_{\bm{\theta}}(\bm{z}_i)$, we use $\bm{\phi}$ and $\bm{\theta}$ to denote the sets of parameters in the encoder and decoder respectively. 
The VAE parameters are denoted as $\bm{\omega} = \{\bm{\phi}, \bm{\theta}\}$. The classifier's prediction on the reconstructed representation $\bm{e}'_i$ is denoted as $\hat{y}'_i = g_{\bm{\eta}}(\bm{e}'_i)$. Here, we choose to use a Softmax-based prediction layer to normalise the predictive probability, but the representation $\bm{e}'_i$ before the normalisation should follow a Gaussian distribution since it is obtained as a linear combination of Gaussians. 
% Based on the first objective in Eq.\ref{eq:learningobjective}, the maximum entropy for a softmax-based classifier is ${\rm log}K$, where $K$ is the size of the label set. 
% \begin{equation}
%     0 \leq \mathcal{H}_{(\bm{e}_i)}(g_{\eta}(\bm{e}_i)) \leq \mathcal{H}_{(\bm{e}_i')}(g_{\eta}(\bm{e}_i')) \leq {\rm log}K  \notag
% \end{equation}
% We can then assume that for each pair of text representation $\bm{e}_i$ and the reconstructed representation $\bm{e}_i'$, there exists a maximum distance $\bm{U}$ that causes the new $\bm{e}_i'$ confuse the classifier (i.e. when the predictive class probability of any classes is $\frac{1}{K}$).
% \begin{assumption}
%     $\forall P(\bm{e}'_i|\bm{e}_i) \in \mathbb{D}$, $\exists \bm{U} = \mathop{max}(\bm{e}'_i - \bm{e}_i)$ where $\forall j,t \in {1,2,...,K}, j \neq t, P(\hat{y}_j = g_{\eta}(\bm{e}'_i)) = P(\hat{y}_t = g_{\eta}(\bm{e}'_i))$.
%     \label{ass:classifier}
% \end{assumption}
% Given the above modelling assumptions, the semi-factual representation generated by the MGP latent model can guarantee an upper bound of predictive entropy difference $\Delta \mathcal{H}$. The $\Delta \mathcal{H}$ is proportional to the reconstruction error $||\bm{e}'_i-\bm{e}_i||^2$, which can thus be used to interpret the uncertainty. We provide detail prove in the supplementary material.
% \begin{remark}
% \label{theo:delta_h}
% Under our modelling Assumption \ref{ass:gaussian} and \ref{ass:classifier}, $\Delta \mathcal{H} = \mathcal{H}(g_{\eta}(\bm{e}'_i)) - \mathcal{H}(g_{\eta}(\bm{e}_i)) \propto ||\bm{e}'_i-\bm{e}_i||^2$
% \end{remark}
%Then the latent variable model can be used to generate perturbations 
In addition, the latent representation $\bm{z}_i$ can be perturbed, which leads to more uncertain predictions whose predictive entropy is upper-bounded by the entropy of the uniform distribution, $\log K$ (where $K$ is the label set size)\footnote{The proof is shown in Supplementary Material Section 1.2.}. %, based on Softmax-based prediction.
For the training of the \texttt{CUE} model, we define various loss terms in Eq. (\ref{eq:learningobjective}) below:
%to preserve the following properties and define the loss terms accordingly:



\textbf{Minimum change on both the perturbed representation and the model prediction}. The reconstructed $\bm{e}'_i$ should be similar to the original $\bm{e}_i$.
\begin{equation}
\small
    \mathcal{L}_r = \norm{\bm{e}'_i-\bm{e}_i}^2 \label{eq:reconstruction}
\end{equation}
The prediction, $p(\hat{y}'_i|\bm{e}'_i)$, based on the reconstructed $\bm{e}'_i$, should be close to the original prediction $p(\hat{y}_i|\bm{e}_i)$.
\begin{equation}
\small
\mbox{KL}(\hat{y}'_i || \hat{y}_i) = \sum_{k=1}^K p(\hat{y}'_{i_k}|\bm{e}'_i)\log \frac{p(\hat{y}'_{i_k}|\bm{e}'_i)}{p(\hat{y}_{i_k}|\bm{e}_i) },
\end{equation}
where $K$ denotes the size of the class label set. 

\textbf{Predictive Entropy Increment}.
% to preserve the first property mentioned in Eq.\ref{eq:learningobjective} and fulfil the assumption 4.1. 
We need to increase the predictive entropy $\mathcal{H}_{e'}(\hat{y}'_i)$ calculated based on the reconstructed input representation $\bm{e}'_i$, which approximates the variance of the true label distribution, $\bm{\sigma}_y^2$, in order to decrease the model epistemic uncertainty defined in Eq. (\ref{eq:epistemic}). 
% We can define the following loss terms accordingly, where $K$ represents the label set:
\begin{equation}
\small
    \mathcal{H}_{e'}(\hat{y}'_i) = -\sum_{k=1}^K p(\hat{y}'_{i_k}|\bm{e}'_i)\log p(\hat{y}'_{i_k}|\bm{e}'_i)
\end{equation}
% \paragraph{Consistency of Prediction} according to the second property in Eq.\ref{eq:learningobjective}, the prediction, $p(\hat{y}'_i|\bm{e}'_i)$, based on the reconstructed representation $e'_i$, should be close to the origin prediction $p(\hat{y}_i|\bm{e}_i)$.
% \begin{align}
% KL(\hat{y}'_i|\hat{y}_i) = \sum_{k=1}^K p(\hat{y}'_i_k|\bm{e}'_i)\log \frac{p(\hat{y}'_i_k|\bm{e}'_i)}{p(\hat{y}_i_k|\bm{e}_i) }
% \end{align}
% \textbf{Lipschitz Constant Consistency} In order to facilitate the same distribution of the reconstructed text representation as the latent space, it is desirable to keep the Lipschitz constant consistent, i.e., $\bm{e}_i'$ should follow the same Gaussian distribution of the latent vector $\bm{z}_i$. We thus include an additional orthogonality constraint, which can upper-bound the Lipschitz constant of linear transformations to achieve better numerical stability \citep{zhang-etal-2021-orthogonality}. Specifically, we regularise the dimensions of the decoder $W_{\theta}$ to be orthogonal to each other:
In addition, %to normalize decoder parameters and maintain the consistency of the Gaussian process, 
we incorporate an orthogonality constraint within the decoder to encourage independence among dimensions of the latent variable: %to be orthogonal to each other.
% Therefore, we regularise the dimensions of the latent vector $\bm{z}$ are orthogonal to each other:
\begin{equation}
\small
    \mathcal{L}_o = \|\mathbf{I} - \bm{W_{\theta}}\bm{W_{\theta}}^{\mathsf{T}}\|\label{eq:orthogonality}
\end{equation}
\noindent where $\mathbf{I}$ is the identity matrix and $\bm{W_{\theta}}$ denotes the weights of the decoder. %We explore the influence of this 
%In implementation, we adopt a cross-entropy loss $H(\hat{y}'_i|\hat{y}_i)$ to represent $KL(\hat{y}'_i|\hat{y}_i) - H_{\bm{e}'_i}(\hat{y}'_i)$. 
The final objective function is then defined as:
\begin{equation}
\small
    %\mathcal{L} = \gamma_r\mathcal{L}_r + \gamma_{H} H(\hat{y}'_i|\hat{y}_i) + \gamma_o \mathcal{L}_o
     \mathcal{L} = \gamma_1\mathcal{L}_r + \gamma_2\mbox{KL}(\hat{y}'_i || \hat{y}_i ) - \gamma_3\mathcal{H}_{e'}(\hat{y}'_i) + \gamma_4 \mathcal{L}_o
     \label{eq:finalObjective}
\end{equation}
% The reconstruction loss $\mathcal{L}_r$ is used to minimise the distance between the original and the reconstructed representations to encourage sparse explanations. The cross entropy loss is used to minimise the distance between the predicted $\hat{y}$ using the original representation $\bm{e}$ and $\hat{y}'$ using the reconstructed representation $\bm{e}'$.
% The reconstruction loss $\mathcal{L}_r$ is used to minimise the distance between the original and the reconstructed representations to encourage sparse explanations. 
% The KL loss aims to minimise the difference between the original prediction distribution $p(\hat{y}_k|\bm{e})$ and the perturbed semi-factual $p(\hat{y}'_k|\bm{e}')$. The entropy aims to enforce our perturbation create a more uncertain prediction $\hat{y}'$. The combination of the KL and entropy terms ensures our semi-factual generates uncertain predictions while keeping the distribution of the perturbed label similar.
% The combination of $\mathcal{L}_r$ and $KL(\hat{y}'|\hat{y})$ is a Bregman Divergences \citep{banerjee2005clustering}, which keep the distribution of perturbed representation $\bm{e}'$ and its prediction $\hat{y}'$ similar to the original representation $\bm{e}$ and prediction $\hat{y}$. Minimise a negative entropy term $H_{\bm{e}'}(\hat{y}')$ can enlarge the predictive uncertainty of $\hat{y}'$ and induce the aleatoric uncertainty to find the equivalent epistemic uncertainty. 

Here, the coefficients $\gamma_1,\dots,\gamma_4$ are used to balance the various loss terms. Minimising the loss function defined in Eq.~(\ref{eq:finalObjective}) is equivalent to introducing perturbations in the latent space so as to increase the predictive entropy. We can use the reconstruction error $\norm{\bm{e}'_i-\bm{e}_i}^2$ to represent the perturbation noise that leads to the predictive uncertainty difference $\Delta \mathcal{H}$. As will be shown in Supplementary Material Section 1.2, $\Delta\mathcal{H}$ is proportional to the reconstruction error $\norm{\bm{e}'_i-\bm{e}_i}^2$. As such, the reconstruction error can be used to interpret the predictive uncertainty. 
%, which can be understood as the possible maximum aleatoric uncertainty to the classifier. We present the detailed derivation in \textbf{Supplementary Material Section 1.2}.
By retracing alterations made in the input feature space, we can effectively identify features which cause the uncertainty. To the best of our knowledge, we are the first to apply perturbations in the latent representation space to interpret the predictive uncertainty associated with PLM-based classifiers.
% The step above is equivalent to using the multi-Gaussian process to learn the training data's Probability Density Function (PDF). As has been discussed by \citet{hullermeier2021aleatoric}, for an unseen data point, the reconstruction error can be used as a measure to quantify if the data point falls within the underlying PDF. 
% The aforementioned training process is essentially equivalent to performing perturbation in the latent space to generate noise such that it resides on the original data manifold but causes aleatoric uncertainty. If we can trace back what has been changed in the input feature space, we can then identify features which cause the uncertainty. To the best of our knowledge, we are the first to apply perturbations in the latent representation space to interpret the predictive uncertainty on PLM-based classifiers. 
\subsection{Input Feature Identification for Uncertainty Interpretation}
\label{sec:inputFeatureIdentification}

% In this subsection, we discuss how to quantify the prediction uncertainty that caused by the input features based on the semi-factual representation generated via latent space perturbation in \textsection{\ref{sec:counterfactual}}. During the inference stage, we identify possible features that caused predictive uncertainty by our proposed Uncertain Feature Identification shown in Algorithm \ref{alg:UFI_alg}. For a given input, we can retrieve three different representations from our framework, the original PLM-encoded representation $\bm{e}_i$, the VAE reconstructed representation (i.e., semi-factual) $\bm{e}'_i$, and the difference between them, $\Delta e_i = \bm{e}'_i - \bm{e}_i$. 

% Sequence-level representation $\bm{e}_i$ in PLMs are aggregated by each token's representation $\bm{e}_{w_j}$ weighted by its corresponding self-attention $\alpha_j$ \cite{bert}. Similarly, reconstructed representation $\bm{e}'_i$ can be seen as the weighted sum of each latent dimension $\bm{z}_i$.
% \begin{equation}
%     \bm{e}_i = \Sigma_{j=1}^l \bm{\alpha}_j \bm{e}_{w_j},\quad\bm{e}'_i = \Sigma_{i=1}^k \bm{\theta}_i \bm{z}_i
% \end{equation}
% where $l$ and $k$ represent the input sequence length and latent dimension size. $\theta_i$ represents the decoder weights correspond to the $i$-th dimension of latent variable $\bm{z}$. Therefore, we can write the reconstruction difference $\Delta\bm{e}_i$ as:
% \begin{align}
%     \Delta\bm{e}_i &= \bm{e}'_i - \bm{e}_i = \Sigma_{i=1}^k \bm{\theta}_i \bm{z}_i -\Sigma_{j=1}^l \bm{\alpha}_j \bm{e}_{w_j}\notag\\
%     &= \bm{\theta}_1 \bm{z}_1 +\bm{\theta}_2 \bm{z}_2 + ... + \bm{\theta}_k \bm{z}_k - \Sigma_{j=1}^l \bm{\alpha}_j \bm{e}_{w_j}
% \end{align}
% To find out the most influential latent dimension towards the uncertainty fluctuation, we remove each dimension by letting $\bm{z}_i=0$. The new representation can be written as:
% \begin{align}
%     \Delta\bm{e}'_i = \Delta\bm{e}_i - \bm{\theta}_i \bm{z}_i
% \end{align}
% As mentioned in \textsection{\ref{sec:counterfactual}}, $\Delta \mathcal{H}$ has an upper bound and is proportional to the reconstruction error $||\bm{e}'_i-\bm{e}_i||^2$. We thus use the norm (calculated as the inner product) of the reconstruction error, $\langle \Delta e'_i, \Delta e_i \rangle $, to measure the entropy change as: 
% \begin{align}
%     \langle \Delta \bm{e}'_i, \Delta \bm{e}_i \rangle 
%     =&  \langle \Delta\bm{e}_i - \bm{\theta}_i \bm{z}_i, \Delta e_i \rangle \notag \\
%     =& \langle \Delta\bm{e}_i, \Delta \bm{e}_i \rangle  - \langle \bm{\theta}_i \bm{z}_i, \Delta e \rangle \notag\\
%      \propto& - \langle \bm{\theta}_i \bm{z}_i, \Delta e \rangle ,
% \end{align}
% Then we can find out the most influential latent dimension $\bm{z}_i$ that contributes to the overall reconstruction difference $\Delta \bm{e}_i$. Therefore, the influence on prediction uncertainty changes $\Delta \mathcal{H}$ of the $j$-th word is decided by the generative probability of the multi-Gaussian encoder and the inner product $\langle \bm{\theta}_i \bm{z}_i,\bm{e}_{w_j} \rangle$.
% \begin{align}
%     \Delta\bm{e}_i &= \bm{e}'_i - \bm{e}_i = \Sigma_{i=1}^k \bm{\theta}_i \bm{z}_i -\Sigma_{j=1}^l \bm{\alpha}_j \bm{e}_{w_j}\notag\\
%     &= \bm{\theta}_1 \bm{z}_1 +\bm{\theta}_2 \bm{z}_2 + ... + \bm{\theta}_k \bm{z}_k - \Sigma_{j=1}^l \bm{\alpha}_j \bm{e}_{w_j}
% \end{align}
% % Knapsack problem
% % \begin{equation}
% %     r_z(i) = \mathop{max}_{0\leq k\leq\lfloor \rfloor}[ z_k + r_z(i-)]
% % \end{equation}

% ---

\begin{table*}[!ht]
% \resizebox{columnwidth}{!}{}
% \small
\centering
\resizebox{\linewidth}{!}{
% \begin{tabular}{lllllllll}
\begin{tabular}{lcccccccc}
\toprule
                 & \multicolumn{4}{c}{CoLA}                                                              & \multicolumn{4}{c}{GoEmotions}                                                                       \\
\cmidrule(lr){2-5} \cmidrule(lr){6-9}  
Model             &  Acc    &  F1  & $\mathcal{H}$  & ECE$\downarrow$    &  Acc    &  F1  & $\mathcal{H}$  & ECE$\downarrow$   \\ \midrule
% SNGP                 & 56.49 & 0.44 & 0.501 & 0.0379  & 90.60  & 0.85 & 0.073 & 0.0409  & 79.30  & 0.79 & 0.396 & 0.0365\\
\midrule
ALBERT (11M)          &0.7923\small{±0.0192}&0.8624\small{±0.0106}&0.4650\small{±0.0695}&0.0834\small{±0.0179} & 0.6193\small{±0.0051}&0.4545\small{±0.0106}&0.3574\small{±0.0042}&0.0446\small{±0.0080}   \\  
ALBERT Label Smoothing&0.7699\small{±0.0452}&0.8508\small{±0.0195}&0.5712\small{±0.1834}&0.0625\small{±0.0345} & 0.6219\small{±0.0025}&0.4579\small{±0.0133}&0.3651\small{±0.0163}&0.0353\small{±0.0146} \\  
ALBERT MC Dropout     &0.7893\small{±0.0109}&0.8583\small{±0.0065}&0.4575\small{±0.0589}&0.0854\small{±0.0223} & 0.6152\small{±0.0055}&0.4448\small{±0.0144}&0.3697\small{±0.0188}&\textbf{0.0345}\small{±0.0076} \\   
ALBERT w/ BNN         &0.7973\small{±0.0011}&0.8647\small{±0.0006}&0.4180\small{±0.0015}&0.0936\small{±0.0016} & 0.6187\small{±0.0009}&0.4396\small{±0.0015} &0.3170\small{±0.0003}&0.0864\small{±0.0011}  \\  
ALBERT w/ \texttt{CUE} (Ours) &0.8038\small{±0.0005}&0.8668\small{±0.0004}&\underline{0.5771}\small{±0.0004}&\textbf{0.0444}\small{±0.0031} & 0.6176\small{±0.0021}&0.4567\small{±0.0046}&\underline{0.3814}\small{±0.0294}&0.0395\small{±0.0098}  \\   \midrule
DistilBERT (66M)          &0.7634\small{±0.0032}&0.8479\small{±0.0019}&0.5412\small{±0.0151}&0.0842\small{±0.0060}&0.6231\small{±0.0018}&0.4637\small{±0.0047}&0.3312\small{±0.0024}&0.0566\small{±0.0039}\\   
DistilBERT Label Smoothing&0.7632\small{±0.0033}&0.8477\small{±0.0018}&0.5620\small{±0.0154}&0.0765\small{±0.0063}&0.6233\small{±0.0021}&0.4643\small{±0.0042}&0.3412\small{±0.0033}&0.0520\small{±0.0035} \\  
DistilBERT MC Dropout     &0.7787\small{±0.0241}&0.8559\small{±0.0125}&0.4773\small{±0.0864}&0.0897\small{±0.0110}&0.6224\small{±0.0023}&0.4670\small{±0.0054}&0.3246\small{±0.0033}&0.0623\small{±0.0025} \\  
DistilBERT w/ BNN         &0.7659\small{±0.0016}&0.8491\small{±0.0010}&0.5133\small{±0.0002}&0.0966\small{±0.0021}&0.6237\small{±0.0009}&0.4550\small{±0.0016}&0.3080\small{±0.0002}&0.0802\small{±0.0005} \\ 
DistilBERT w/ \texttt{CUE} (Ours) &0.7831\small{±0.0012}&0.8540\small{±0.0010}&\underline{0.8362}\small{±0.0008}&\textbf{0.0738}\small{±0.0029}&0.6253\small{±0.0017}&0.4517\small{±0.0022}&\underline{0.4457}\small{±0.0004}&\textbf{0.0208}\small{±0.0031}   \\ \midrule
BERT (110M)         & 0.8000\small{±0.0072}	&0.8696\small{±0.0043}&0.4026\small{±0.0317}&0.0995\small{±0.0057} &0.6266\small{±0.0032} &	0.4829\small{±0.0075} &0.3342\small{±0.0049} &	0.0537\small{±0.0047}\\ %\cline{2-13}& 
BERT Label Smoothing& 0.8036\small{±0.0085}	&0.8717\small{±0.0044}&0.4099\small{±0.0843}&0.0967\small{±0.0185} &0.6268\small{±0.0040}&	0.4819\small{±0.0086} &0.3444\small{±0.0046} &	0.0492\small{±0.0020}\\ %\cline{2-13&}
BERT MC Dropout     & 0.8008\small{±0.0054}	&0.8703\small{±0.0037}&0.4023\small{±0.0321}&0.0987\small{±0.0100} &0.6266\small{±0.0026} &	0.4889\small{±0.0085} &0.3337\small{±0.0056} &	0.0548\small{±0.0053}\\ %\cline{2-13&}
BERT w/ BNN         & 0.6104\small{±0.1689}	&0.6545\small{±0.3509}&\underline{0.8837}\small{±0.0732}&0.1095\small{±0.1391} &0.6296\small{±0.0008} &	0.4855\small{±0.0008}         &0.3102\small{±0.0001} &	0.0775\small{±0.0014}\\ %\cline{2-13&}
BERT w/ \texttt{CUE} (Ours)        & 0.8123\small{±0.0012}	&0.8762\small{±0.0007}&0.4991\small{±0.0032}&\textbf{0.0677}\small{±0.0056}&0.6282\small{±0.0029} &	0.4712\small{±0.0087} &\underline{0.4433}\small{±0.0159} &\textbf{0.0326}\small{±0.0013}\\ \midrule
RoBERTa (125M)         &0.8050\small{±0.0142}&0.8721\small{±0.0072} &0.3310\small{±0.0472}&0.1100\small{±0.0190}& 0.6226\small{±0.0051}&0.4877\small{±0.0095}   &0.3310\small{±0.0472}&0.0602\small{±0.0073}\\
RoBERTa Label Smoothing&0.8165\small{±0.0128}&0.8788\small{±0.0058}&0.3091\small{±0.0415}&0.1079\small{±0.0120}& 0.6215\small{±0.0021}&0.4866\small{±0.0098}&0.3091\small{±0.0415}&0.0554\small{±0.0084}\\
RoBERTa MC Dropout     &0.8056\small{±0.0056}&0.8724\small{±0.0023}&0.3340\small{±0.0452}&0.1074\small{±0.0165}& 0.6217\small{±0.0034}&0.4907\small{±0.0082}&0.3340\small{±0.0452}&0.0604\small{±0.0060}\\
RoBERTa w/ BNN  &0.7992\small{±0.0022}&0.8699\small{±0.0014}&0.3069\small{±0.0010}&0.1228\small{±0.0021}&0.6227\small{±0.0002}&0.4686\small{±0.0056} &0.3069\small{±0.0010}&0.0881\small{±0.0002}\\
RoBERTa w/ \texttt{CUE} (Ours)          &0.8075\small{±0.0087}&0.8744\small{±0.0045}&\underline{0.6077}\small{±0.0595}&\textbf{0.0465}\small{±0.0075}&0.6255\small{±0.0005}&0.4540\small{±0.0013}&\underline{0.6077}\small{±0.0595}&\textbf{0.0316}\small{±0.0024}\\
\midrule
                 & \multicolumn{4}{c}{Emotion}                                                                 & \multicolumn{4}{c}{MultiNLI}                                                                     \\
\cmidrule(lr){2-5} \cmidrule(lr){6-9} 
Model             &  Acc    &  F1  & $\mathcal{H}$  & ECE$\downarrow$    &  Acc    &  F1  & $\mathcal{H}$  & ECE$\downarrow$   \\ \midrule
\midrule
ALBERT (11M)          &0.9284\small{±0.0037}&0.8862\small{±0.0031}&0.8862\small{±0.0031}&0.0348\small{±0.0059} &0.8362\small{±0.0018}&0.8358\small{±0.0018}  &0.8358\small{±0.0018}&0.0465\small{±0.0036} \\  
ALBERT Label Smoothing&0.9310\small{±0.0023}&0.8897\small{±0.0043} &\underline{0.8897}\small{±0.0043}&\textbf{0.0231}\small{±0.0018} &0.8327\small{±0.0020}&0.8317\small{±0.0020}   &0.8317\small{±0.0020}&0.0364\small{±0.0034} \\  
ALBERT MC Dropout     &0.9331\small{±0.0033}&0.8927\small{±0.0036}&0.8827\small{±0.0036}&0.0326\small{±0.0032} &0.8367\small{±0.0013}&0.8361\small{±0.0019}  &\underline{0.8361}\small{±0.0019}&0.0470\small{±0.0039} \\   
ALBERT w/ BNN         &0.9265\small{±0.0008}&0.8862\small{±0.0015}&0.8862\small{±0.0015}&0.0411\small{±0.0007} &0.8339\small{±0.0001}&0.8338\small{±0.0001}  &0.8338\small{±0.0001}&0.0527\small{±0.0001} \\  
ALBERT w/ \texttt{CUE}  (Ours)       &0.9269\small{±0.0020}&0.8897\small{±0.0044} &\underline{0.8897}\small{±0.0044}&0.0282\small{±0.0018}&0.8331\small{±0.0003}&0.8329\small{±0.0003}&0.8329\small{±0.0003}&\textbf{0.0338}\small{±0.0007}	 \\   \midrule
DistilBERT (66M)         &0.9287\small{±0.0031}&0.8886\small{±0.0062} &0.0441\small{±0.0044}&0.0393\small{±0.0030} &0.8067\small{±0.0014}	&0.8059\small{±0.0012} &0.3737\small{±0.0058}&0.0376\small{±0.0035} \\   
DistilBERT Label Smoothing&0.9264\small{±0.0031}&0.8841\small{±0.0056}&0.0716\small{±0.0051}&0.0353\small{±0.0020} &0.8049\small{±0.0012}	&0.8040\small{±0.0011} &0.3994\small{±0.0070}&0.0319\small{±0.0044} \\  
DistilBERT MC Dropout     &0.9298\small{±0.0018}&0.8886\small{±0.0025}&0.0432\small{±0.0051}&0.0388\small{±0.0020} &0.8066\small{±0.0020}	&0.8058\small{±0.0019} &0.3734\small{±0.0121}&0.0383\small{±0.0066} \\  
DistilBERT w/ BNN         &0.9315\small{±0.0008}&0.8931\small{±0.0009}&0.0413\small{±0.0001}&0.0406\small{±0.0000} &0.8059\small{±0.0002}	&0.8052\small{±0.0002} &0.3673\small{±0.0000}&0.0424\small{±0.0002}\\ 
DistilBERT w/ \texttt{CUE}  (Ours)       &0.9295\small{±0.0010}&0.8911\small{±0.0011} &\underline{0.0900}\small{±0.0002}&\textbf{0.0265}\small{±0.0005}&0.8058\small{±0.0003}	&0.8051\small{±0.0003}&\underline{0.4600}\small{±0.0022}&\textbf{0.0229}\small{±0.0005} \\ \midrule
BERT (110M)         & 0.9296\small{±0.0030}	&0.8871\small{±0.0057}	&0.0523\small{±0.0011}&0.0335\small{±0.0020}&0.8286\small{±0.0029}&0.8281\small{±0.0027} &0.3361\small{±0.0071}&0.0321\small{±0.0033} \\ %\cline{2-13} 
BERT Label Smoothing& 0.9295\small{±0.0042}	&0.8862\small{±0.0074}	&0.0759\small{±0.0024}&\textbf{0.0289}\small{±0.0061}&0.8265\small{±0.0016}&0.8261\small{±0.0017}&0.3513\small{±0.0116}&0.0317\small{±0.0043} \\ %\cline{2-13}
BERT MC Dropout     & 0.9285\small{±0.0027}	&0.8872\small{±0.0048}	&0.0527\small{±0.0026}&0.0325\small{±0.0037}&0.8287\small{±0.0030}&0.8281\small{±0.0028} &0.3363\small{±0.0071}&0.0315\small{±0.0032} \\ %\cline{2-13}
BERT w/ BNN         & 0.9274\small{±0.0008}	&0.8853\small{±0.0011}	&0.0497\small{±0.0001}&0.0402\small{±0.0006}&0.3469\small{±0.0194}&0.1862\small{±0.0179}&\underline{0.9245}\small{±0.0835}&0.1456\small{±0.1111} \\ %\cline{2-13}
BERT w/ \texttt{CUE}  (Ours)       & 0.9259\small{±0.0009}&	0.8850\small{±0.0015}&\underline{0.1031}\small{±0.0082}&\textbf{0.0289}\small{±0.0043}&0.8283\small{±0.0005}&0.8277\small{±0.0005}&0.3665\small{±0.0030}&\textbf{0.0262}\small{±0.0021}\\ \midrule
RoBERTa (125M)         & 0.9279\small{±0.0033}&0.8821\small{±0.0062}&0.0448\small{±0.0043}&0.0384\small{±0.0050}&0.8569\small{±0.0043}&0.8563\small{±0.0044}&0.2628\small{±0.0127}&0.0368\small{±0.0074} \\
RoBERTa Label Smoothing& 0.9301\small{±0.0024}&0.8896\small{±0.0051}&0.0675\small{±0.0026}&0.0341\small{±0.0049}&0.8551\small{±0.0022}&0.8546\small{±0.0020}&0.3029\small{±0.0082}&\textbf{0.0255}\small{±0.0033} \\
RoBERTa MC Dropout     & 0.9305\small{±0.0034}&0.8919\small{±0.0059}&0.0514\small{±0.0068}&0.0315\small{±0.0046}&0.8586\small{±0.0062}&0.8581\small{±0.0061}&0.2516\small{±0.0309}&0.0403\small{±0.0112}\\
RoBERTa w/ BNN         & 0.9290\small{±0.0006}&0.8923\small{±0.0005}&0.0478\small{±0.0002}&0.0412\small{±0.0007}&0.8528\small{±0.0001}&0.8527\small{±0.0001}&0.2620\small{±0.0000}&0.0416\small{±0.0002}\\
RoBERTa w/ \texttt{CUE} (Ours)        & 0.9286\small{±0.0005}&0.8927\small{±0.0008}&\underline{0.0944}\small{±0.0004}&\textbf{0.0313}\small{±0.0033}&0.8526\small{±0.0003}&0.8526\small{±0.0003}&\underline{0.3179}\small{±0.0003}&0.0262\small{±0.0004}\\
\bottomrule
\end{tabular}}

\caption{Results for \texttt{CUE} compared against baseline methods on four language models trained on four datasets. Values shown in parentheses indicate the model size. The highest entropy values are underlined and the lowest ECE values are in bold.}
\label{table:results}
\end{table*}

In this subsection, we discuss how to quantify the predictive uncertainty that is caused by the input features, based on the latent space perturbation described in \textsection{\ref{sec:counterfactual}}. The discussion is built on an inner product space defined by our noise generation method. 
% In general, we first fine-tune a PLM classifier on a dataset for a downstream task, then we freeze the parameters of the PLM and the classification layer to train a multi-Gaussian model in order to learn to generate semi-factual latent representations.
% Therefore we designed an algorithm to retrieve input features that lead to uncertainty in PLM's original prediction output. 
During the inference stage, we identify the possible features that cause predictive uncertainty using our proposed Uncertain Feature Identification (UFI) algorithm\footnote{We provide the UFI algorithm implementation in Supplementary Material Section 2.}. For a given input, we can retrieve three different representations from the \texttt{CUE} framework: the original PLM-encoded representation $\bm{e}_i$, the reconstructed representation $\bm{e}'_i$, and the difference between the two representations, $\Delta \bm{e}_i = \bm{e}'_i - \bm{e}_i$.
% \begin{algorithm}[h]
% \small
% \caption{Uncertain Feature Identification}
% \label{alg:UFI_alg}
% \textbf{Input:} $\bm{e}_i$, $\bm{e}'_i$, $\mu_{\bm{\theta}}$, token representations $\{{\bm{e}_{i}}_1,{\bm{e}_{i}}_2,\cdots,{\bm{e}_{i}}_n\}$, threshold $\alpha$.
% \normalsize
% \begin{algorithmic}
% \STATE $\Delta \bm{e}_i = \bm{e}'_i - \bm{e}_i$
% \FOR {the $d$th dimension ${\bm{z}_{i}}_d$ in $\bm{z}_i$}
%     \STATE ${\bm{r}_{z_i}}_d = \mu_{\theta}({\bm{z}_{i}}_d)$ 
%     \STATE ${\rm dim}^{d}_{\rm score}= \langle \Delta \bm{e}_i, {\bm{r}_{z_i}}_d \rangle$
% \ENDFOR
% \FOR {$\mbox{sort}({\bm{r}_{z_i}}_d, key=\phi({\rm dim}^{d}_{\rm score}, {\bm{r}_{z_i}}_d))[:\alpha]$}
%     \STATE ${\bm{r}_{z_i}}_D += {\bm{r}_{z_i}}_d$
% \ENDFOR
% \FOR {${\bm{e}_{i}}_j$ in $\{{\bm{e}_{i}}_1,{\bm{e}_{i}}_2,\cdots,{\bm{e}_{i}}_n\}$}
%     \STATE ${\rm{token}^j}_{\rm score}=  \langle {\bm{r}_{z_i}}_D, {\bm{e}_{i}}_j  \rangle$
% \ENDFOR
% \STATE \textbf{return} $\mbox{sort}(\rm{token}_j, key=\phi({\rm{token}^j}_{\rm score}, \rm{token}_j))$
% \end{algorithmic}
% % \small
% % \textbf{Output:} Sorted tokens by the influential score in decreasing order.
% \end{algorithm}
The reconstructed representation $\bm{e}'_i$ can be rewritten as the weighted sum of each latent dimension from $\bm{z}_i$, where the weight is given by the decoder:
% According to our modelling assumption \ref{ass:gaussian}, the dimensions should be independent between all pairwise elements. Therefore, the covariance between any two dimensions in the latent space is 0. The reconstructed $\bm{e}'$ from the latent vector $\bm{z}$ is a weighted sum of the reconstructed representation for each dimension of $\bm{z}$, where the weight is given by the encoder model:
\begin{equation}
\small
    \bm{e}'_i = \Sigma_{d=1}^{\rm dim} {q_{\phi}({\bm{z}_i}_d|\bm{e}_i)} \cdot {\bm{r}_{z_i}}_d, \quad {\bm{r}_{z_i}}_d = \mu_{\theta}({\bm{z}_i}_d),
    \label{eq:GMPR}
\end{equation}
\noindent where $\rm dim$ is the size of the latent space and ${\bm{r}_{z_i}}_d$ denotes the representation generated via the $d$-th dimension's code corresponding to the latent vector $\bm{z}_i$ from the decoder. As mentioned in \textsection{\ref{sec:counterfactual}}, $\Delta \mathcal{H}$ %s bounded by uniform distribution and 
is proportional to the reconstruction error $||\bm{e}'_i-\bm{e}_i||^2$. We thus use the norm (calculated as the inner product) of the reconstruction error, $||\Delta \bm{e}_i||^2 = \langle \Delta \bm{e}_i, \Delta \bm{e}_i \rangle $, to measure the entropy change as: 
{\small
\begin{align}
\label{eq:detalNorm}
    \langle \Delta \bm{e}_i, \Delta \bm{e}_i \rangle 
    =&  \langle \Sigma_{d=1}^{\rm dim} {q_{\phi}({\bm{z}_i}_d|\bm{e}_i)} \cdot {\bm{r}_{z_i}}_d - \bm{e}_i, \Delta \bm{e}_i \rangle \notag \\
    =& \langle \Sigma_{d=1}^{\rm dim} {q_{\phi}({\bm{z}_i}_d|\bm{e}_i)} \cdot {\bm{r}_{z_i}}_d, \Delta \bm{e}_i \rangle  - \langle \bm{e}_i, \Delta \bm{e}_i \rangle, 
    %=& \langle \Sigma_{d=1}^{\rm dim} {q_{\phi}({\bm{z}_i}_d|\bm{e}_i)} \cdot {\bm{r}_{z_i}}_d, \Delta \bm{e}_i \rangle
     % \approx& \Sigma_{i=1}^d {q_{\phi}(\bm{z_i}|\bm{e})} \cdot \langle  \bm{r}_{z_i}, \Delta e \rangle 
\end{align}}
\noindent where $\langle \cdot,\cdot \rangle$ denotes the inner product. 
In the first line of Eq. (\ref{eq:detalNorm}), we substitute the first $\Delta \bm{e}_i$ with $\bm{e}'_i -\bm{e}_i$, and further substitute $\bm{e}'_i$ with Eq. (\ref{eq:GMPR}). When determining the relative importance of each latent dimension with respect to the predictive entropy change, $\langle \bm{e}_i, \Delta \bm{e}_i\rangle$ can be ignored as it is the same for all latent dimensions. Therefore, the inner product $\langle {\bm{r}_{z_i}}_d, \Delta \bm{e}_i \rangle$, which dominates the norm value of $||\Delta \bm{e}_i||$ in the $d$-th dimension, can be used to measure the predictive uncertainty caused by each dimension from the latent space $\bm{r}_{z_i}$, and thus to determine each dimension's importance.
% which can be used to measure the dimension importance   to determine the important dimension $z_i$ in the latent code $\bm{z}$ which causes uncertainty of the prediction.

On the other hand, the input text representation $\bm{e}_i$ output by the PLM at layer-$L$ can be written as a Softmax-based weighted sum of each token's representation from the previous layer $L-1$ by\footnote{Note that all representations in the RHS are from Layer $L-1$. We drop the superscript $L-1$ to simplify the notations.}:
{\small
\begin{align}
    \bm{e}_i^L  &= \Sigma_{j=1}^n {\rm Softmax}(\langle \bm{e}_i, {\bm{e}_{i}}_j \rangle) \cdot {\bm{e}_{i}}_j \nonumber\\
    &\propto \Sigma_{j=1}^n {\rm exp}(\langle \bm{e}_i,{\bm{e}_{i}}_j \rangle) \cdot {\bm{e}_{i}}_j,
\end{align}}
\noindent where ${\bm{e}_{i}}_j$ denotes the representation of the $j$-th input token. We assume that the adjacent layers in the transformer share similar representations. Then, $\bm{e}_i$ at layer $L$ is: 
%{\small
%\vspace*{-0.54cm}
{\small
\begin{align}
    \bm{e}_i^L \propto& \Sigma_{j=1}^n {\rm exp}(\langle \Sigma_{d=1}^{\rm dim} {q_{\phi}({\bm{z}_i}_d|\bm{e}_i)} \cdot {\bm{r}_{z_i}}_d,{\bm{e}_{i}}_j \rangle) \cdot {\bm{e}_{i}}_j \notag \\
   % \propto& \Sigma_{j=1}^n {\rm exp}( \Sigma_{d=1}^{\rm dim} {q_{\phi}({\bm{z}_i}_d|\bm{e}_i)} \cdot \langle {\bm{r}_{z_i}}_d,{\bm{e}_{i}}_j \rangle) \cdot {\bm{e}_{i}}_j \notag \\
    \propto & \Sigma_{j=1}^n ( \Sigma_{d=1}^{\rm dim} {q_{\phi}({\bm{z}_i}_d|\bm{e}_i)} \cdot \langle {\bm{r}_{z_i}}_d,{\bm{e}_{i}}_j \rangle) \cdot {\bm{e}_{i}}_j 
\end{align}}%}
Therefore, the influence of the $j$-th token on the change in prediction uncertainty $\Delta \mathcal{H}$ is determined by the generative probability of the encoder and the inner product $\langle {\bm{r}_{z_i}}_d,{\bm{e}_{i}}_j \rangle$. However, seeking the optimal ${\bm{z}_i}_d$ by minimizing the reconstruction loss in Eq. (\ref{eq:reconstruction}) is a typical knapsack problem, which is NP-complete. Hence, intuitively, we use greedy search to find a locally optimal solution by identifying the most influential latent dimensions of $\bm{z}_i$ first and then estimating the influential score for each token. %The UFI algorithm is shown in the \textbf{Supplementary Material Section 2}.

%(see in Algorithm \ref{alg:UFI_alg}). That is, we can identify input tokens that are most similar to the influential representation vector ${\bm{r}_{z_i}}_d$ as the ones which cause predictive uncertainty by the inner product in the metric space. More concretely, assuming the PLM-encoded representation for token $j$ is ${\bm{e}_{i}}_j$, we can compute each token's importance score by ${\rm{token}^j}_{\rm score}= \langle {\bm{r}_{z_i}}_d,{\bm{e}_{i}}_j  \rangle$. By sorting ${\rm{token}^j}_{\rm score}$ in descending order, we can identify input tokens that cause predictive uncertainty.
% The identification of the source of the uncertainty highly relies on the influence of latent dimensions. In practice, we use a threshold $\alpha$ to select the most similar dimensions of $\Delta \bm{e}_i$ to construct a combination of the most influential uncertain representation ${\bm{r}_{z_i}}_D$. The threshold $\alpha$ can be defined with the help of the entropy curve from the dimension importance analysis described in Section \ref{sec:label_removal}.
% \begin{table*}[!ht]
% % \resizebox{columnwidth}{!}{}
% % \small
% \centering
% \resizebox{0.8\columnwidth}{!}{
% \begin{tabular}{lllllll}
% \toprule
%                  & \multicolumn{3}{c}{CoLA}                                                              & \multicolumn{3}{c}{GoEmotions}                                                                       \\
% \cmidrule(lr){2-4} \cmidrule(lr){5-7}  
% Model             &  Acc    &  F1    & ECE$\downarrow$    &  Acc    &  F1    & ECE$\downarrow$   \\ \midrule
% % SNGP                 & 56.49 & 0.44 & 0.501 & 0.0379  & 90.60  & 0.85 & 0.073 & 0.0409  & 79.30  & 0.79 & 0.396 & 0.0365\\
% \midrule
% ALBERT (11M)          &0.7923±0.0192&0.8624±0.0106&0.0834±0.0179 & 0.6193±0.0051&0.4545±0.0106&0.0446±0.008   \\  
% ALBERT Label Smoothing&0.7699±0.0452&0.8508±0.0195&0.0625±0.0345 & 0.6219±0.0025&0.4579±0.0133&0.0353±0.0146 \\  
% ALBERT MC Dropout     &0.7893±0.0109&0.8583±0.0065&0.0854±0.0223 & 0.6152±0.0055&0.4448±0.0144&0.0345±0.0076 \\   
% ALBERT w/ BNN         &0.7973±0.0011&0.8647±0.0006&0.0936±0.0016 & 0.6187±0.0009&0.4396±0.0015&0.0864±0.0011  \\  
% ALBERT w/ VAE         &0.8038±0.0005&0.8668±0.0004&\textbf{0.0444±0.0031} & 0.6176±0.0021&0.4567±0.0046&\textbf{0.0395±0.0098}  \\   \midrule
% DistilBERT (66M)          &0.7634±0.0032&0.8479±0.0019&0.0842±0.006& 0.6231±0.0018&0.4637±0.0047&0.0566±0.0039 \\   
% DistilBERT Label Smoothing&0.7632±0.0033&0.8477±0.0018&0.0765±0.0063&0.6233±0.0021&0.4643±0.0042&0.052±0.0035  \\  
% DistilBERT MC Dropout     &0.7787±0.0241&0.8559±0.0125&0.0897±0.011&0.6224±0.0023&0.467±0.0054&0.0623±0.0025  \\  
% DistilBERT w/ BNN         &0.7659±0.0016&0.8491±0.001&0.0966±0.0021&0.6237±0.0009&0.455±0.0016&0.0802±0.0005  \\ 
% DistilBERT w/ VAE         &0.7831±0.0012&0.854±0.001&\textbf{0.0738±0.0029}&0.6253±0.0017&0.4517±0.0022&\textbf{0.0208±0.0031}   \\ \midrule
% BERT (110M)         & 0.8±0.0072	   & 0.8696±0.0043&0.0995±0.0057 &0.6266±0.0032 &	0.4829±0.0075  &	0.0537±0.0047\\ %\cline{2-13} 
% BERT Label Smoothing& 0.8036±0.0085	&0.8717±0.0044&0.0967±0.0185 &0.6268±0.004 &	0.4819±0.0086  &	0.0492±0.002 \\ %\cline{2-13}
% BERT MC Dropout     & 0.8008±0.0054	&0.8703±0.0037&0.0987±0.01 &0.6266±0.0026 &	0.4889±0.0085  &	0.0548±0.0053\\ %\cline{2-13}
% BERT w/ BNN         & 0.6104±0.1689	&0.6545±0.3509&0.1095±0.1391 &0.6296±0.0008 &	0.4855±0.0008  &	0.0775±0.0014\\ %\cline{2-13}
% BERT w/ VAE         & 0.8123±0.0012	&0.8762±0.0007&\textbf{0.0677±0.0056}&0.6282±0.0029 &	0.4712±0.0087  &\textbf{0.0326±0.0013}\\ \midrule
% RoBERTa (125M)         &0.805±0.0142&0.8721±0.0072	&0.11±0.019& 0.6226±0.0051&0.4877±0.0095&0.0602±0.0073\\
% RoBERTa Label Smoothing&0.8165±0.0128&0.8788±0.0058&0.1079±0.012& 0.6215±0.0021&0.4866±0.0098&0.0554±0.0084\\
% RoBERTa MC Dropout     &0.8056±0.0056&0.8724±0.0023&0.1074±0.0165& 0.6217±0.0034&0.4907±0.0082&0.0604±0.006\\
% RoBERTa w/ BNN         &0.7992±0.0022&0.8699±0.0014&0.1228±0.0021&0.6227±0.0002&0.4686±0.0056&0.0881±0.0002\\
% RoBERTa w/ VAE         &0.8075±0.0087&0.8744±0.0045&\textbf{0.0465±0.0075}&0.6255±0.0005&0.454±0.0013&\textbf{0.0316±0.0024}\\
% \midrule
%                  & \multicolumn{3}{c}{Emotion}                                                                 & \multicolumn{3}{c}{MultiNLI}                                                                     \\
% \cmidrule(lr){2-4} \cmidrule(lr){5-7} 
% Model             &  Acc    &  F1     & ECE$\downarrow$    &  Acc    &  F1       & ECE$\downarrow$ \\ \midrule
% \midrule
% ALBERT (11M)         &0.9284±0.0037&0.8862±0.0031&0.0348±0.0059 &0.8362±0.0018&0.8358±0.0018 &0.0465±0.0036 \\  
% ALBERT Label Smoothing&0.931±0.0023&0.8897±0.0043&0.0231±0.0018  &0.8327±0.002&0.8317±0.002   &0.0364±0.0034 \\  
% ALBERT MC Dropout     &0.9331±0.0033&0.8927±0.0036&0.0326±0.0032 &0.8367±0.0013&0.8361±0.0019 &0.047±0.0039 \\   
% ALBERT w/ BNN         &0.9265±0.0008&0.8862±0.0015&0.0411±0.0007 &0.8339±0.0001&0.8338±0.0001&0.0527±0.0001 \\  
% ALBERT w/ VAE         &0.9269±0.002&0.8897±0.0044&\textbf{0.0282±0.0018}&0.8331±0.0003&0.8329±0.0003&\textbf{0.0338±0.0007}	 \\   \midrule
% DistilBERT (66M)         &0.9287±0.0031&0.8886±0.0062&0.0393±0.003 &0.8067±0.0014	&0.8059±0.0012&0.0376±0.0035 \\   
% DistilBERT Label Smoothing&0.9264±0.0031&0.8841±0.0056&0.0353±0.002 &0.8049±0.0012	&0.804±0.0011&0.0319±0.0044 \\  
% DistilBERT MC Dropout     &0.9298±0.0018&0.8886±0.0025&0.0388±0.002 &0.8066±0.002	&0.8058±0.0019&0.0383±0.0066 \\  
% DistilBERT w/ BNN         &0.9315±0.0008&0.8931±0.0009&0.0406±0.000 &0.8059±0.0002	&0.8052±0.0002&0.0424±0.0002\\ 
% DistilBERT w/ VAE         &0.9295±0.001&0.8911±0.0011&\textbf{0.0265±0.0005}&0.8058±0.0003	&0.8051±0.0003&\textbf{0.0229±0.0005} \\ \midrule
% BERT (110M)         & 0.9296±0.003	&0.8871±0.0057	&0.0335±0.002&0.8286±0.0029&0.8281±0.0027&0.0321±0.0033 \\ %\cline{2-13} 
% BERT Label Smoothing& 0.9295±0.0042	&0.8862±0.0074	&0.0289±0.0061&0.8265±0.0016&0.8261±0.0017	&0.0317±0.0043 \\ %\cline{2-13}
% BERT MC Dropout     & 0.9285±0.0027	&0.8872±0.0048	&0.0325±0.0037&0.8287±0.003&0.8281±0.0028&0.0315±0.0032 \\ %\cline{2-13}
% BERT w/ BNN         & 0.9274±0.0008	&0.8853±0.0011	&0.0402±0.0006&0.3469±0.0194&0.1862±0.0179&0.1456±0.1111 \\ %\cline{2-13}
% BERT w/ VAE         & 0.9259±0.0009&	0.885±0.0015	&\textbf{0.0289±0.0043}&0.8283±0.0005&0.8277±0.0005&\textbf{0.0262±0.0021}\\ \midrule
% RoBERTa (125M)         & 0.9279±0.0033&0.8821±0.0062&0.0384±0.005&	0.8569±0.0043&0.8563±0.0044&0.0368±0.0074 \\
% RoBERTa Label Smoothing& 0.9301±0.0024&0.8896±0.0051&0.0341±0.0049&	0.8551±0.0022&0.8546±0.002&0.0255±0.0033 \\
% RoBERTa MC Dropout     & 0.9305±0.0034&0.8919±0.0059&0.0315±0.0046&0.8586±0.0062&0.8581±0.0061&0.0403±0.0112\\
% RoBERTa w/ BNN         & 0.929±0.0006&0.8923±0.0005 &0.0412±0.0007&0.8528±0.0001&0.8527±0.0001&0.0416±0.0002\\
% RoBERTa w/ VAE         & 0.9286±0.0005&0.8927±0.0008&\textbf{0.0313±0.0033}&0.8526±0.0003&0.8526±0.0003&\textbf{0.0262±0.0004}\\
% \bottomrule
% \end{tabular}}

% \caption{Results for BNN and VAE uncertainty analysis methods on four language models trained on three datasets. Values shown in parentheses indicate the model size. The lowest ECE values are in bold.}
% \label{table:results}
% \end{table*}



\begin{figure*}[ht]
\centering
\subfloat[BERT \texttt{CUE} on CoLA.]{
    \includegraphics[width=\columnwidth]{resources/new_latent_dim/bert-cola.png}}
%
\subfloat[BERT \texttt{CUE} on GoEmotions.]{
    \includegraphics[width=\columnwidth]{resources/new_latent_dim/bert-goemo.png}}\\
\subfloat[BERT \texttt{CUE} on Emotion.]{
    \includegraphics[width=\columnwidth]{resources/new_latent_dim/bert-emo.png}}
%
\subfloat[BERT \texttt{CUE} on MultiNLI.]{
    \includegraphics[width=\columnwidth]{resources/new_latent_dim/bert-mnli.png}}
\caption{Evaluation results by removing latent dimensions. The $x$-axis represents the index of \textbf{removed} dimensions ranked by their relevance to $\Delta \bm{e}_i$; a smaller index number indicates higher relevance. Histograms show the ECE scores after removing the corresponding latent dimensions. The blue curve shows the predictive entropy. The green and red curves show classification accuracy and F1, respectively.}
    \label{fig:latent_ablation}
\end{figure*}

\section{Experiments} \label{sec:experiments}
We first present the experimental setup followed by evaluation results.
% \subsection{Experimental Setup}

\paragraph{Datasets}
We evaluate our proposed framework on four datasets for \emph{linguistic acceptability classification}, \emph{natural language inference}, and \emph{emotion classification}. 

% We evaluate our proposed framework on four datasets below. %for \emph{linguistic acceptability classification}, \emph{natural language inference}, and \emph{emotion classification}. %The dataset statistics are shown in Table \ref{table:dataset}.
%\begin{table}[h!]
% \begin{wraptable}{r}{6cm}
% \centering
% \resizebox{\columnwidth}{!}{
% \begin{tabular}{lrrrr}
% \toprule
% Datasets &   CoLA & MultiNLI & Emotion & GoEmotions \\ \midrule
% Classes  &  2 & 3 & 6 & 27  \\   \midrule
%  Train   &   8,551   &  392,702     &   16,000 &   43,410 \\ 
%  Dev     &   1,043   &    20,000    &   2,000  &   5,427  \\ 
%  Test    &   1,043   &    20,000    &   2,000  &   5,426  \\ \midrule
% Total    &   10,637  &    432,702   &   20,000 &   58,009 \\
% \bottomrule
% \end{tabular}}
% \caption{Statistic of the datasets.}
% \label{table:dataset}
% \end{wraptable}
% \noindent\underline{Linguistic Acceptability Classification}. The CoLA (Corpus of Linguistic Acceptability) %is from the GLUE benchmark 
% \citep{cola} %,wang2018glue}. This dataset 
% contains sentences %from 23 linguistics publications %from books and journal articles, which are 
% annotated as \emph{grammatically acceptable} or \emph{not}.
%to judge whether the sentence is a grammatical sentence. %We use this dataset to exam the ability of our latent variable model on the ability of grammatical classification and binary classification.
% \noindent\underline{Natural Language Inference}. The MultiNLI \citep{dataset_multinli} %,wang2018glue} 
% dataset contains annotations for relations of \emph{entailment}, \emph{contradiction}, and \emph{neutrality} between sentence pairs. %from the GLUE benchmark to verify our method's effectiveness on classification for sentence pairs, in which each 
% contains sentence pairs, %each of which is annotated with one of the three categories, \emph{entailment}, \emph{contradictory}, and \emph{neutral}. %Training and testing split is the same as in the original dataset release. All the result we report for Multi NLI are test on the matched testset.
% \noindent\underline{Emotion Classification}. %We use two popular multi-class emotion classification datasets, GoEmotions \citep{dataset_goemotions} and Emotion \citep{dataset_emotion}. 
% The GoEmotions \citep{dataset_goemotions} dataset annotates Reddit comments with twenty-seven emotion labels (e.g., \emph{fear} and \emph{admiration}). %is a multi-label emotion classification dataset consisting of 54k Reddit comments categorised into 27 emotion labels.
% The Emotion \citep{dataset_emotion} dataset classifies English tweets into six emotion classes (e.g., \emph{sadness} and \emph{joy}). Note that the GoEmotions dataset allows multi-label settings to annotate a sentence with more than one emotion label. In our experimental setup, we only focus on multi-class classification, and we thus filtered out those instances annotated with multiple labels in the GoEmotions dataset.
% In our experiments, we chose four popular PLMs base model with the following parameter size: ALBERT-base-v2 (11M), distilBERT-base-uncased (66M), BERT-base-uncased (110M), and RoBERTa-base (125M).
\paragraph{Baselines}
We compare our method with three baselines: Label Smoothing \citep{Gupta_Kvernadze_Srikumar_2021}, MC Dropout \citep{monte_carlo_dropout} and Bayesian Neural Network (BNN). Label Smoothing and MC Dropout are implemented in PLMs and directly fine-tuned on the target datasets. The BNN works as a plug-in component, same as \texttt{CUE}, for which the base PLM encoding and the classification layer are firstly fine-tuned and then parameters are frozen for the plug-in layer training. 
%These methods are evaluated on four datasets for \emph{linguistic acceptability classification} (CoLA, \citet{cola}), \emph{natural language inference} (MultiNLI, \citet{dataset_multinli}), and \emph{emotion classification} (Emotion, \citet{dataset_emotion}; GoEmotions, \citet{dataset_goemotions}). 

\paragraph{Evaluation Metrics}
Accuracy (Acc), macro-averaged F1 (F1), average entropy ($\mathcal{H}$), and Expected Calibration Error (ECE) are used as metrics for classification performance, uncertainty and model calibration measurement. 
% \begin{figure}[htbp]
%   \centering
%   \includegraphics[width=\columnwidth]{resources/ece_entropy_picture.pdf}
%   \caption{Lower entropy and lower ECE is the most ideal situation, however this is quite hard to achieve. Therefore, our model tend to search by the boundary and find a state that achieves lower ECE with higher average entropy. }
% \end{figure} 

%Due to page limits, we provide 
More details on dataset statistics, baseline setup, evaluation metrics and hyperparameter settings are in Supplementary Material Section 3.

% In all our experiments, language models are firstly fine-tuned on the CoLA, GoEmotions, Emotion and MultiNLI datasets, respectively. Afterwards, the BNN or VAE uncertainty analysis component is inserted between a PLM's encoder and its classification layer and is trained while keeping all the other parameters frozen. In this work, we focus on studying the model behaviour with uncertainty embedded, aiming at adjusting model's uncertain predictions, while keeping the model's prediction accuracy and Macro F1 scores unchanged as much as possible.

\newcommand\blue[1]{\textcolor{blue}{\emph{#1}}}
\newcommand\red[1]{\textcolor{red}{#1}}

\begin{table*}[htb]
\begin{center}
\resizebox{\linewidth}{!}{
\begin{tabular}{p{0.95\textwidth}p{0.15\textwidth}p{0.1\textwidth}}
\toprule
\textbf{Examples} & \textbf{Predicted} & \textbf{True} \\
\cmidrule(lr){1-1} \cmidrule(lr){2-2} \cmidrule(lr){3-3}
\multicolumn{3}{c}{\textbf{GoEmotions}} \\ \midrule
Despite having lived here for 10 years, I've never been to portillos, and given this, it's \blue{somewhat unlikely} I start going now...   & Disapproval\newline 0.42 $\rightarrow$ 0.35 & Neutral  \\
%Ours came today too : ). \blue{We got 22 h bu?} & Curiosity\newline 0.45$\rightarrow$ 0.35 & Neutral \\
%\blue{Oh gosh [name]} had \blue{issues} as well with someone and \blue{his leg} I forgot. & Disappointment\newline 0.32 $\rightarrow$ 0.22 & Neutral \\
Somehow I got banned for replying to a troll. The mods over there have \blue{itchy trigger fingers}. & Disappointment\newline 0.28 $\rightarrow$ 0.19 & Disapproval \\
Boundaries. Have some boundaries. \blue{Say no}. \blue{Don't go}. This is frustrating to read, honestly. Don't do things that you hate doing. & Disgust \newline 0.34 $\rightarrow$ 0.25 & Fear \\\midrule
\multicolumn{3}{c}{\textbf{Emotion}} \\
\midrule
I were to \blue{go overseas or cross the border} then I become a foreigner and will feel that way but never in my \blue{beloved land}. & Joy\newline 0.51 $\rightarrow$ 0.43 & Love \\
I started feeling a little stressed about leaving on time and making sure we \blue{got the getting ready pictures} I wanted but everything seemed to work out perfectly. & Sadness\newline 0.59 $\rightarrow$ 0.40 & Anger \\
I \blue{wont} lie this week has been \blue{abit of a difficult} week for me ive been feeling very stressed and anxious this week plus i think im coming down with the flu but it \blue{has definately} helped me to appreciate the little things. & Sadness\newline  0.64 $\rightarrow$ 0.44 & Anger \\\midrule
\multicolumn{3}{c}{\textbf{MultiNLI}} \\
\midrule
\textbf{P:} There are no shares of a stock that might someday come back, just piles of options \blue{as worthless as} those shares of cook's american business alliance. & Neutral\newline 0.43 $\rightarrow$ 0.40 & Contradiction  \\
\textbf{H:} Cook's american business alliance caused shares of stock to come back. & & \\
\textbf{P:} Until all \blue{members} of our \blue{society} are afforded that access, this promise of our government will continue to be \blue{unfulfilled}. & Entailment\newline 0.48 $\rightarrow$ 0.43 & Neutral\\
\textbf{H:} the government is flawed and \blue{unfulfilled}. & &\\
\textbf{P:} Here you'll find the finest leather goods and of - the - moment fashions from all the predictable high - \blue{priests} (valentino, armani, versace, gucci, missoni, etc.). A number of \blue{classic} men's clothing meccas such as cucci (with a c), brioni, and battistoni are still \blue{going strong}. & Contradiction\newline 0.35 $\rightarrow$ 0.33 & Entailment \\
\textbf{H:} You will find only the highest quality goods, be they \blue{high - fashion icons} or top - \blue{notch designer clothing} here. & &\\
\bottomrule
\end{tabular}}
\caption{Visualisation of token-level uncertainty interpretation. The `\textbf{Predicted}' column denotes the incorrect predictions made by the original fine-tuned model. Values below each predicted label denote the predictive probability change after applying our framework. The `\textbf{True}' column denotes the gold-standard class labels. Italic text highlighted in blue marks the word tokens identified by the UFI algorithm as causing predictive uncertainty.}
\label{tab:sen_examples}
\end{center}
\end{table*}


\subsection{Overall Comparison}

Table \ref{table:results} presents the performance of methods with four state-of-the-art PLMs, namely,  BERT \citep{bert}, ALBERT \citep{albert}, DistilBERT \citep{distilbert} and RoBERTa \citep{roberta}, as backbones. 
%Table \ref{table:results} presents the performance on four datasets of four state-of-the-art PLMs, BERT \citep{bert}, ALBERT \citep{albert}, DistilBERT \citep{distilbert} and RoBERTa \citep{roberta}.
Our framework with a plug-in \texttt{CUE} module obtains the lowest ECE scores and highest average predictive entropy on all tasks and with different base model choices while maintaining a comparable level of Acc/F1 scores as the original model. Although the BNN model achieves the highest entropy with BERT on the MultiNLI and CoLA datasets, we can observe a significant drop in its Acc/F1 scores. This indicates that the BNN encoder hardly generates reliable perturbations that keep the predicted labels unchanged.
Interestingly, while the classification performance of all compared models shows slight degradation with the injection of uncertainty into the PLMs, our framework achieves steady accuracy gains on the CoLA dataset. %Furthermore, our framework achieves lower ECE with higher entropy. 
% We speculate that %the increased entropy is a result of the decrement of model confidence, while 
% the decreased ECE shows the model tends to reduce the confidence of more `ambiguous' instances and improve the confidence of the more `certain' instances.
% We also show the results of SNGP in the top row in Table \ref{table:results}. Unlike other three approaches, SNGP method modified the BERT classifier \footnote{\url{https://www.tensorVAE.org/text/tutorials/uncertainty_quantification_with_sngp_bert}}. In general, it performs worse in terms of classification accuracy and F1 scores compared to PLM-based approaches. It tends to give higher predictive entropy. SNGP is also built on BERT encoder, and yet it is inferior to BERT w/ VAE on ECE across all three datasets.  
\subsection{Effectiveness of the Uncertainty Feature Identification Algorithm} %Ability on Latent Variable Dimensions}
\label{sec:effectiveness_experiments}
\paragraph{Results with Latent Dimension Removal}\label{sec:label_removal}
As presented in \textsection{\ref{sec:inputFeatureIdentification}}, we can use the \texttt{CUE}'s reconstruction difference $\Delta{\bm{e}_i}$, to disentangle the most influential latent dimensions ${\bm{z}_i}_d$s which cause predictive uncertainty. Since each latent dimension is associated with an influential score, we can sort the latent dimensions accordingly. We speculate that by removing latent dimensions with higher influence scores, we should be able to observe a reduction in predictive uncertainty. As shown in Figure \ref{fig:latent_ablation}, we visualise the evaluation results by removing latent dimensions from $\bm{z}_i$ according to their relevance to $\Delta \bm{e}_i$ (the rank is shown on the $x$-axis) on BERT models. 
In our experiments, the latent vector $\bm{z}_i$ has 100 dimensions; we thus sort them into 10 bins in descending order based on their influential scores. In practice, the latent dimension removal is achieved by assigning $0$ as the value of the dimension on $\bm{z}_i$ to create a modified latent variable $\bm{z}_i'$; a new prediction is then made with $\hat{y}_i' = g_{\bm{\eta}}(p_{\bm{\theta}}(\bm{z}_i'))$.  %Since the bias term of the linear layer has been disabled in our decoder, we can ensure that assigning $0$ would not influence the output of other dimensions. 

We can observe a remarkable increasing trend of ECE (the histograms) and average entropy (the blue curve) when removing the most influential latent dimensions of $\bm{z}_i$ on the GoEmotions, Emotion and MultiNLI datasets. This indicates that the top-ranked dimensions (smaller index numbers) contribute more to increasing the predictive uncertainty and reducing overconfident predictions, % on reconstruct representation $e'$, 
while lower-ranked dimensions have less effect. Therefore, we can select the appropriate threshold for each dataset during token-level uncertainty identification by observing the trend of ECE and entropy.
% This is also evident by observing the average predictive entropy values (the blue curves) that removing the most influential latent dimensions leads to the largest drop in predictive entropy. %share the similar trend. We can notice Accuracy and F1 Score stays stable for all the dimensions. 
We also notice that, across all datasets, removing any latent dimensions does not much affect the classification accuracy (the green curve) and macro-F1 (the red curve). However, we did not observe a similar trend of ECE and entropy on the CoLA dataset. We suspect this is due to the relatively simple setup in CoLA, as it is only a binary classification task. For datasets with more classes, such as GoEmotions with 27 classes, the trend of ECE with latent dimension removal becomes more obvious. %this is due to the size of the labels, we can see as the label size grow the tendency becomes more obvious. Therefore, we are able to identify the token-level uncertainty according to the inner product between a token and the latent dimensions learned by VAE. 
We also performed the same analysis and observed similar phenomena on other PLMs, DistilBERT, ALBERT and RoBERTa, in Supplementary Material Section 4.1.  
% we can see that the removed dimensions not only contribute to the correctness of the predictive output, but also provide a stable quantification of confidence in the predictive probabilistic space. Specifically, in this case the first 40 latent dimensions may be the most influential dimensions for current model. 
% In practice, we disable the bias in the VAE's decoder to get more stable results on the scores. It is interesting to find that the accuracy keeps decreasing by removing the latent dimensions ranked from the top to the bottom. As a consequence, after removing the latent dimensions from the bottom of the list, both ECE and entropy increase rapidly, which shows that the removed dimensions not only contribute to the correctness of the predictive output, but also provide a stable quantification of confidence in the predictive probabilistic space. 


\paragraph{Case Study of Token-Level Uncertainty Identification}

In this subsection, we demonstrate the effectiveness of our uncertainty identification algorithm by visualising the tokens that our framework finds contributing to predictive uncertainty. We present several examples in which our framework reduces overconfident predictions %(i.e., lower down the incorrect prediction's probability) 
in Table \ref{tab:sen_examples}. We only show the results with BERT as the base model due to page limits. %All the examples are obtained from VAE models fine-tuned on BERT.
Tokens coloured in blue are the influential tokens\footnote{For words split into subword tokens, we take the average importance score of the constituent subword tokens.} identified by the UFI Algorithm.

% For the case from GoEmotions, the emotion label of the sentence ``\textit{I have no faith, but the theodicy problem has multiple answers, I'm not too concerned with it.}'' should be ``\textit{Neutral}'', but it is wrongly classified as ``\textit{Disapproval}'' with high confidence of 0.70. With our framework, the prediction confidence has been decreased to 0.57, which indicates our framework could mitigate the problem of \emph{overconfidence} prediction. By tracing back to the tokens which cause uncertainty by perturbing the latent space, our framework found the phrase ``\textit{concerned with}'' contributes the most to the uncertainty. This is also in line with human perception that ``\textit{concerned with}'' may lead people to mistakenly believe that the speaker disapproves something. Similarly, we could also find that words such as ``\textit{stressed}'', ``\textit{heavenly father}'' confuse the model to give a wrong label of ``\textit{Sadness}'' in the example from the Emotion dataset. For the case of NLI, words like ``\textit{high priority for}'' without a deep understanding of the context could easily lead to an erroneous prediction of``\textit{Entailment}''.
% For the first case from GoEmotions, our framework identifies "somewhat unlikely" as the key phrase that confused the classifier, if we only read the sub-sentence that contains this phrase we may think the sentence emotion as a Disapproval, which is an incorrect label. However, if we combine the whole sentence together, this sentence should be a Neutral, cause the author does not have negative emotion towards go to portillos. The second and third cases have been classified as an incorrect labels mainly because the partial emotion's influence. The question from the second sentence and "Oh gosh [name] had issues" made the classifier confused about the emotion overall, hence the original model mis-classified these two sentences as Curiosity and Disappointment. The fourth case identifies a slang that hard to be understand by the language model. The author's impression is more disagree with the situation rather than disappointed. Similarly, we can see the emotion of Fear from the influential phrase that confused the classifier from the fifth case.

% For the Emotion dataset, the first case 

For emotion classification, we found that classifiers tend to be confused by idioms or phrases carrying emotions different from the true emotion labels. For example, the second sentence in GoEmotions contains a metaphorical phrase, `\emph{itchy trigger finger}', making it a tricky case for emotion classification. We conducted additional experiments in which we substituted the phrase `\emph{itchy trigger finger}' with either the \texttt{\small{[MASK]}} token or commonly used words expressing the same meaning. In both cases, the model uncertainty is reduced. Replacing the original phrase with the mask tokens leads to label switching, while replacing it with more commonly used words increases the predictive probability and leads to a more confident prediction. These results verify the validity of our approach for identifying words/phrases causing predictive uncertainties. The first and last sentences in GoEmotions and also the last sentence in Emotion contain phrases which are somewhat more closely related to the incorrectly predicted labels than the true labels, misleading the classifier into generating wrong predictions. %may confused on overall prediction by those partial phrases with strong emotion features. For first three cases from the GoEmotions and last case from the Emotion dataset, classifier identify those sentence parts that reflects strong emotion on the incorrect predict label. In some cases, the classifier may also affect by the slang or oral languages that hard to be understand. For the last two cases from the GoEmotions and second case from the Emotion dataset, we can notice those description infers the correct labels' emotions. The classifiers' uncertainty on those these cases indicates current PLM have weakness on understanding those type of languages. Limitations on tokenizer may also cause text classifier uncertainty. 
Tokenisation may also cause a problem. For example, the word `\emph{beloved}' in the first sentence in Emotion is split into three parts after tokenisation, making it difficult for the classifier to recognise %. The missing of this word makes those unrecognised tokens hard to represent 
the `Love' emotion. For the natural language inference task, we found that classifiers tend to make overconfident predictions when the same words are found in both the premise and the hypothesis. For example, the second instance in MultiNLI has the word `\emph{unfulfilled}' in both its premise and hypothesis. This leads to the wrong prediction of `Entailment'. In the last instance, %due to uncertain on conjunction words or understanding on the consistent meaning. For example, 
the classifier wrongly inferred that `\emph{classic men's clothing}' contradicts `\emph{high - fashion icons}' and thus failed to recognise the `Entailment' relation. %. However, the author's statement is not using another group of brands to compare with ``high preiests brands", instead ``classic clothing" is a sub category belongs to those types. 
Nevertheless, in all these cases, our proposed framework managed to increase the predictive entropy by reducing the confidence of predictions, alleviating the overconfidence problem.%\footnote{We provide further experimental results on the ablation study, including stability of various additional training loss terms and latent space orthogonality in the Supplementary Material Section 4.2.} %, and the influence of non-orthogonality on selecting important latent dimensions 

We provide further experimental results and an ablation study, including the stability of various additional training loss terms and latent space orthogonality, in Section 4.2 of the Supplementary Material.
% in Appendix \ref{sec:futher_experimental_results}.
% For the first three cases from the GoEmotions dataset, we can see the classifier is confused by those strong emotions partial phrases on making prediction towards overall emotions. This behaviour tend to really common on instances under the Neutral category. Classifier uncertainty could also influenced by complexity of using slang and wording. As shown in the last two examples from the GoEmotions dataset, we can see phrases identified are very oral languages that reflects to the correct labels. 
% \subsection{Ablation Study}
% In this section, we conduct ablation study to investigate the contribution of various components in our framework.
% \paragraph{Stability of training loss}
% As discussed in \ref{sec:counterfactual}, our objective function contains three loss terms, the reconstruction loss, the cross-entropy loss between the original prediction and the prediction using the reconstructed text representation, and the orthogonality loss to encourage the orthogonality of latent dimensions of the latent vector $\bm{z}$. Here, we fine-tuned two additional BERT models, each with the orthogonality loss term replaced by a KL divergence loss \citep{scholar_VAE} or a Wasserstein loss \citep{nan-etal-2019-topic, tolstikhin2018wasserstein}, to investigate the training stability. %, we implemented a KL divergence loss \citep{scholar_VAE} and a Wasserstein loss \citep{nan-etal-2019-topic, tolstikhin2018wasserstein}. 
% The KL divergence loss encourages the distribution of latent variables %encoder's mean and variance parameters 
% follows the prior standard Gaussian distribution, while the Wasserstein loss  enforces the latent variables generated by the encoder to follow a Dirichlet distribution. The results are shown in Figure \ref{fig:loss_stability}. We observe that both KL loss and Wasserstein loss exhibits %Both loss terms are highlighted in red color, and we can observe a clear 
% fluctuation during training across all datasets. %Mean while, the different value of pairwise distance also indicates other loss term could potentially impact the ability of pairwise loss to reduce the distance between $\bm{e}'$ and $\bm{e}$. 
% On the contrary, the orthogonal regularisation is very stable compared with other two loss terms. We further show the evaluation results with various loss terms in Table \ref{table:loss}. It can be observed that our proposed framework with orthogonality loss gives better ECE results compared to using KL or Wasserstein loss. %All of these experiments examined this tendency and displayed the capability of orthogonality can efficiently force the dimensions to be orthogonal and help to enhance the interpretability.
% \begin{figure*}[h!]
%   \centering
%   \includegraphics[width=\columnwidth]{loss_visuals/loss_graph.pdf}
% \caption{Comparison of the stability of three loss terms trained with BERT model on four datasets. Red: KL divergence loss; Green: Wasserstein loss; Blue: Orthogonality loss.}
% \label{fig:loss_stability}
% \end{figure*} 
% \begin{table*}[h!]
% \centering
% \resizebox{0.85\columnwidth}{!}{
% \begin{tabular}{llllllllll}
% \toprule
%                  & \multicolumn{3}{c}{BERT VAE w/  Orthogonality}                                                              & \multicolumn{3}{c}{BERT VAE w/ KL}                                                              &\multicolumn{3}{c}{BERT VAE w/ Wasserstein }                          \\
% \cmidrule(lr){2-4} \cmidrule(lr){5-7}   \cmidrule(lr){8-10}
% Datasets             &  Acc    &  F1   & ECE$\downarrow$    &  Acc    &  F1    & ECE$\downarrow$   &  Acc    &  F1    & ECE$\downarrow$  \\ \midrule
% CoLA& 0.8130&0.7459&\textbf{0.0640}&0.8072&0.7300&0.1090&0.8044&0.7240&0.1111\\
% GoEmotions&0.6298&0.4661&\textbf{0.0321}&0.6298&0.4752&0.0695&0.6263&0.4608&0.0600\\
% Emotion&0.9255&0.8827&\textbf{0.0322}&0.9270&0.8847&0.0431&0.9275&0.8853&0.0441\\
% MNLI&0.8284&0.8278&\textbf{0.0272}&0.8294&0.8290&0.0418&0.8291&0.8286&0.0344\\
% \bottomrule
% \end{tabular}}

% \caption{Comparison of the performance of the BERT model fine-tuned with different loss terms on four datasets. }%We only reported each single model's result, of which we used to generate loss stability graphs.}
% \label{table:loss}
% \end{table*}
% \paragraph{Latent Space Orthogonality}
% As explained in Section \ref{sec:counterfactual}, the orthogonality regulariser facilitates a better interpretation of the latent space. Table~\ref{table:orthogonal} compares the overall performance of BERT models trained with and without latent space orthogonality. The PLMs fine-tuned with Equation (\ref{eq:orthogonality}) significantly outperform their counterparts without the orthogonality regulariser in ECE on all four datasets. Therefore, the orthogonality of the decoder network helps the classifier to make more certain predictions. 
% \begin{table*}[h!]
% \centering
% \resizebox{0.9\columnwidth}{!}{
% \begin{tabular}{lllllll}
% \toprule
%                  & \multicolumn{3}{c}{BERT VAE w/  Orthogonality}                                                              & \multicolumn{3}{c}{BERT VAE w/o Orthogonality}                             \\
% \cmidrule(lr){2-4} \cmidrule(lr){5-7}   
% Datasets             &  Acc    &  F1   & ECE$\downarrow$    &  Acc    &  F1    & ECE$\downarrow$    \\ \midrule
% CoLA& 0.8123±0.0012	&0.8762±0.0007&\textbf{0.0677±0.0056}&0.8042±0.0011&0.7230±0.0024&0.1121±0.0020 \\
% GoEmotions& 0.6282±0.0029 & 0.4712±0.0087& \textbf{0.0326±0.0013} & 0.6291±0.0019	& 0.4652±0.0037 & 0.0615±0.0023 \\
% Emotion& 0.9259±0.0009&	0.8850±0.0015& \textbf{0.0289±0.0043} & 0.9268±0.0003	& 0.8848±0.0006	& 0.0430±0.0016 \\
% MNLI& 0.8283±0.0000 &	0.8277±0.0005 & \textbf{0.0262±0.0021} & 0.8281±0.0009	& 0.8277±0.0009& 0.0370±0.0010 \\
% \bottomrule
% \end{tabular}}
% \caption{Comparison of results on BERT models trained with/without latent space orthogonality.}
% \label{table:orthogonal}
% \end{table*}
\section{Conclusion}
In this paper, we have proposed a new framework \texttt{CUE} for uncertainty interpretation of PLM classifiers. %by generating perturbation that causes aleatoric uncertainty at the text representation level to disentangle the change of predictive uncertainty in order to interpret the epistemic uncertainty and investigate classifier behaviour both at the sequence- and the token-level. 
By comparing our method with previous solutions, we show that \texttt{CUE} can achieve lower expected calibration errors across four datasets. In some cases, it can also reduce the confidence of previously wrong predictions. Further experiments and case studies demonstrate that \texttt{CUE} is effective in identifying tokens/latent dimensions that could potentially cause predictive uncertainty. % through our proposed uncertain feature identification algorithm. 
Our work sheds light on a new direction of uncertainty interpretation for PLMs in various NLP tasks.

\begin{acknowledgements} % will be removed in pdf for initial submission,
						 % (without ‘accepted’ option in \documentclass)
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
This work was supported in part by the UK Engineering and Physical Sciences Research Council (grant nos.\ EP/T017112/2, EP/V048597/1, EP/X019063/1). YH is supported by a Turing AI Fellowship funded by UK Research and Innovation (grant no.\ EP/V020579/2). The authors would like to thank Yuxiang Zhou, Hanqi Yan and Xingwei Tan for their invaluable feedback on this paper. 
\end{acknowledgements}

\bibliography{li_516}
% \bibliographystyle{icml2023}

% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% % APPENDIX
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% \newpage
% \appendix
% \onecolumn

% \section{Preliminaries}

% % \subsection{Notations}
% % We restate our notations for readability. Under the pre-train and fine-tune paradigm, the input document of a PLM is denoted as $\bm{x}$, and the gold label is denoted as $y$, where $y\in\mathcal{Y}$, and $\mathcal{Y}$ is the label set. Let $\bm{\vartheta}$ be the parameters of the PLM layers, $\bm{\eta}$ be the parameters of the task-specific classification layer, and $\bm{e}$ be the text representation generated by the PLM. Then the prediction of the fine-tuned model can be represented as:
% % We denote the parameters of the PLM as $\bm{\vartheta}$ and the parameters of the task-specific classification layer as $\bm{\eta}$. 
% % Let $\bm{e}$ be the text representation generated by the PLM
% %the PLM layer $\bm{\vartheta}$ are pre-trained and fine-tuned to learn the the distribution $p_{\bm{\vartheta}}(\bm{e} | \bm{x})$, and the classification layer $\bm{\eta}$ is fine-tuned to learn the distribution $p_{\bm{\eta}}(y | \bm{e})$.
% %The PLM $\bm{\vartheta}$ is fine-tuned to learn the distribution over input text $\bm{x}$ to generate text representation $\bm{e}$, this process is denoted as $p_{\bm{\vartheta}}(\bm{e} | \bm{x})$. 
% % The text classifier $\bm{\eta}$ is fine-tuned to learn the distribution over text representation $\bm{e}$ and to give the gold label $y$, this process is denoted as $p_{\bm{\eta}}(y | \bm{e})$. 
% % The fine-tuned model is responsible to make a prediction based on the given text $\bm{x}$, and the classification result $\hat{y}$ is given by:
% % \begin{equation}
% %     \hat{y}=\argmax_{y\in\mathcal{Y}} p(y | \bm{x}; \bm{\vartheta}, \bm{\eta}) = \argmax_{y\in\mathcal{Y}} p_{\bm{\eta}}(y | \bm{e}) p_{\bm{\vartheta}}(\bm{e} | \bm{x})  \notag  
% % \end{equation}
% % Once a classifier built on a PLM is fine-tuned on a target dataset, we freeze the parameters of the PLM and the classification layer and then insert a VAE between the PLM last layer and the task-specific classification layer. 
% % Recall that the plug-in VAE framework proposed in our paper contains an encoder network and a decoder network.
% % First, the PLM-encoded representation $\bm{e}$ for input text $\bm{x}$ is mapped to a latent vector, denoted by $\bm{z}$, via a VAE which consists of two networks. 
% % The encoder network learns the distribution of a lower dimensional latent variable $\bm{z}$ given the PLM-encoded representation $\bm{e}$: $q_{\bm{\phi}}(\bm{z}|\bm{e})$, while the decoder network reconstructs the input $\bm{e}'$ given the latent variable $\bm{z}$: $p_{\bm{\theta}}(\bm{e}'|\bm{z})$, where $\bm{\phi}$ and $\bm{\theta}$ are the sets of parameters in the encoder and decoder respectively. The classifier's prediction on the reconstructed representation $\bm{e}'$ is denoted as $\hat{y}'$. The derivation of this paper follows the following assumption:

% % \begin{assumption}
% % \label{app:assum1}\textit{Latent vector $\bm{z}$ is distributed as a multivariate Gaussian and dimensions of variable $\bm{z}$ are independent of each other, i.e., $\bm{z} \sim \mathcal{N}(\bm{\mu},\rm \diag(\bm{\sigma}^2))$}. %, $\bm{z} = q_\phi(\bm{z}|\bm{e}) = \mu(e) +\epsilon \cdot \sigma(e)$.}
% % \end{assumption}

% % Here $\bm{\mu}$ is the mean vector and $\diag(\bm{\sigma}^2)$, which stands for the diagonal matrix of the vector $\bm{\sigma^2}$, is the covariance matrix of the Gaussian distribution. 
% % % We use $\mathbb{E}_{q_{\phi}(\bm{z}|\bm{e})}[\bm{z}]=\mu_{\phi}(\bm{z}|\bm{e})$ and $\mathbb{E}_{p_{\theta}(\bm{e}'|\bm{z})}[\bm{e}']=\mu_{\theta}(\bm{e}'|\bm{z})$ to denote these two networks' predictive means. 

% % In addition, we use $[n]$ to represent $\{1,2,...,n\}$, 
% % % and $|S|$ to stand for the size of a set $S$. 
% % $\mathbb{E}$ to represent the mathematical expectation and $\mathbb{D}$ to represent the  variance. We use $||\cdot||$ to denote the operator/spectral norm of matrices and $\mathcal{L}_{2}$-norm for vectors.

% \subsection{Decomposition of the Predictive Uncertainty}

% In this subsection, we show how we decompose the Mean Squared Error (MSE) based predictive uncertainty into the epistemic uncertainty and the aleatoric uncertainty mentioned in Section 3. 
% %Here $y$ is the gold classification label from a dataset, while $\hat{y}$ is the predicted class label.
% \begin{align}
%     \mathbb{E}[(y_i - g_{\eta}(f_{\vartheta}(x_i)))^2] 
%     &= \mathbb{E}[(y_i - \mathbb{E}[y] + \mathbb{E}[y] - g_{\eta}(f_{\vartheta}(x_i)))^2] \notag\\
%     &=\mathbb{E}[(y_i - \mathbb{E}[y])^2] + \mathbb{E}[(\mathbb{E}[y] - g_{\eta}(f_{\vartheta}(x_i)))^2] + 2\mathbb{E}[(y_i - \mathbb{E}[y])(\mathbb{E}[y] - g_{\eta}(f_{\vartheta}(x_i)))] \notag\\
%     &= \mathbb{E}[(y_i - \mathbb{E}[y])^2] + \mathbb{E}[(\mathbb{E}[y] - g_{\eta}(f_{\vartheta}(x_i)))^2] + 2(\mathbb{E}[y_i] - \mathbb{E}[y])(\mathbb{E}[y] - \mathbb{E}[g_{\eta}(f_{\vartheta}(x_i))]) \notag\\ 
%     & = \underbrace{\mathbb{E}[(y_i - \mathbb{E}[y])^2]}_{\rm aleatoric\,uncertainty} + \underbrace{\mathbb{E}[(\mathbb{E}[y] - g_{\eta}(f_{\vartheta}(x_i)))^2]}_{\rm epistemic\,uncertainty} \notag
% \end{align}
% % \begin{align}
% % \notag
% %     \underbrace{\mathbb{E}[(y-\hat{y})^2]}_{\rm total\,uncertainty} &= \mathbb{E}[(y - \mathbb{E}[y] + \mathbb{E}[y] - \hat{y})^2] \\ 
% % \notag
% %     &=\mathbb{E}[(y - \mathbb{E}[y])^2] + \mathbb{E}[(\mathbb{E}[y] - \hat{y})^2] + 2\mathbb{E}[(y - \mathbb{E}[y])(\mathbb{E}[y] - \hat{y})] \\ 
% % \notag
% %     &= \mathbb{E}[(y - \mathbb{E}[y])^2] + \mathbb{E}[(\mathbb{E}[y] - \hat{y})^2] + 2(\mathbb{E}[{y}] - \mathbb{E}[y])(\mathbb{E}[y] - \mathbb{E}[\hat{y}]) \\ 
% % \notag
% %     &= \underbrace{\mathbb{E}[(y - \mathbb{E}[y])^2]}_{\rm aleatoric\,uncertainty}+\underbrace{\mathbb{E}[(\mathbb{E}[y] - \hat{y})^2]}_{\rm epistemic\,uncertainty} \nonumber
% % \end{align}
% % Rearranging the above equation, we have:
% % \begin{align}
% % \notag
% %      \underbrace{\mathbb{E}[(y - \mathbb{E}[y])^2]}_{\rm aleatoric\,uncertainty}  &= \underbrace{\mathbb{E}[(y-\hat{y})^2]}_{\rm total\,uncertainty} - \underbrace{\mathbb{E}[(\mathbb{E}[y] - \hat{y})^2]}_{\rm epistemic\,uncertainty} \notag
% %     %  \mathbb{D}[y] &= \mathbb{E}[(y-\hat{y})^2] - \mathbb{E}[(\mathbb{E}[y] - \hat{y})^2] \notag
% % \end{align}
% % Since the total uncertainty is fixed. We can therefore cause aleatoric uncertainty fluctuation to estimate the epistemic uncertainty.

% % \subsection{Derivation of the Predictive Entropy Increment Upper-bound}
% \subsection{Interpreting Entropy Change with Reconstruction Difference}
% % Bounding loss for an arbitrary variable}
% % In the following, we show that the increment of the predictive entropy caused by the representation perturbation has an upper bound and is proportional to $||\bm{e}'-\bm{e}||^2$, which is mentioned in Section 4.1.
% % \newpage
% % % Considering the reparameterisation trick with
% % According to Assumption \ref{app:assum1}, using the reparameterisation trick with the latent code $\bm{z}\sim\mathcal{N}(\bm{\mu},\rm diag(\bm{\sigma}^2))$, 
% % % , where $\mu$ and $\sigma$ are the mean value and covariance matrix respectively. 
% % we can reconstruct the representation $\bm{e}'$ from a given text representation $\bm{e}$ by:
% % \begin{align}
% %     \bm{z} &= \bm{\mu}(\bm{e}) +\bm{\epsilon} \cdot \bm{\sigma}(\bm{e})\notag \\ 
% %     % e' &= f(z) \\ \notag
% %     \bm{e}' &= p_{\bm{\theta}}(\bm{z})\notag \\
% %     % y' &= {\rm pred}(e') \\ \notag
% %     \hat{y}' &= p_{\bm{\eta}}(\bm{e}') \notag
% % \end{align}
% % In our learning objective, we aim to estimate the uncertainty by adding the noise to increase the predictive entropy while keeping the classification results unchanged. % the entropy in prediction without changing it. 
% % Then, we have 
% % \begin{equation}
% %     0 \leq H_{(\bm{e})}(\hat{y}) \leq H_{(\bm{e}')}(\hat{y}') \leq {\rm log}K  \notag
% % \end{equation}
% % Assuming that the difference between text representation $\bm{e}$ and the reconstructed representation $\bm{e}'$ is $\bm{u}$, $\bm{u} = \bm{e}' - \bm{e} $, and the prediction is obtained from the Softmax function. Let $\bm{U}$ be the maximum distance of $\bm{e}'-\bm{e}$ that causes the new $\bm{e}'$ confuse the classifier (i.e. when the predictive class probability of any of the $K$ classes is $\frac{1}{K}$). Then, according to the Jensen inequality, the prediction is bounded by: 
% % \begin{align}
% % \notag
% %     \hat{y}' &\leq t \cdot {\rm softmax}(\bm{e}) + (1-t){\rm softmax}(\bm{e}+\bm{U}) \\ \notag
% %     & =   t \cdot {\rm softmax}(\bm{e}) + (1-t)\frac{1}{K} \notag
% % \end{align}
% % where $0 \leq t \leq 1$ and $H_{(\bm{e}+\bm{U})}(\hat{y}') = {\rm log}K$. Considering the convexity of entropy, we have:
% % \begin{align}
% % \notag
% %     \Delta \mathcal{H} &= \mathcal{H}_{\bm{e}'}(\hat{y}') - \mathcal{H}_{\bm{e}}(\hat{y})\\ \notag
% %     & \leq - (t \cdot p(\hat{y}) +  \frac{(1-t)}{K}) {\rm log} (t \cdot p(\hat{y}) + \frac{(1-t)}{K}) + p(\hat{y}){\rm log}p(\hat{y})\\ \notag
% %     & \leq - t \cdot p(\hat{y}) {\rm log} p(\hat{y}) - (1-t) \cdot \frac{1}{K} {\rm log} (\frac{1}{K}) + p(\hat{y}){\rm log}p(\hat{y})\\ \notag
% %     & = (1-t)({\log} K - \mathcal{H}_{\bm{e}}(\hat{y}) ) \\\notag
% %     & = \frac{||\bm{e}'-\bm{e}||^2 \cdot ({\log} K - \mathcal{H}_{\bm{e}}(\hat{y}) )}{||U||^2} \\
% %     &\propto ||\bm{e}'-\bm{e}||^2   \notag
% % \end{align}
% % Therefore, our loss function is a convex function and its upper bound is correlated with the $\mathcal{L}_{2}$-norm between $\bm{e}$ and $\bm{e}'$.
% Given the Assumption \ref{ass:classifier}, according to the Jensen inequality, the prediction is bounded by: 
% \begin{align}
% \notag
%     \hat{y}' &\leq t \cdot {\rm softmax}(\bm{e}) + (1-t){\rm softmax}(\bm{e}+\bm{U}) \\ \notag
%     & =   t \cdot {\rm softmax}(\bm{e}) + (1-t)\frac{1}{K} \notag
% \end{align}
% Since $\bm{e} \leq \bm{e}' \leq \bm{e} + \bm{U}$, we let $0 \leq t \leq 1$ and $H_{(\bm{e}+\bm{U})}(\hat{y}') = {\rm log}K$. Considering the convexity of entropy, we have:
% \begin{small}
% \begin{align}
%     \Delta \mathcal{H} &= \mathcal{H}_{\bm{e}'}(\hat{y}') - \mathcal{H}_{\bm{e}}(\hat{y})\\ \notag
%     &\leq - (t \cdot p(\hat{y}) +  \frac{(1-t)}{K}) {\rm log} (t \cdot p(\hat{y}) + \frac{(1-t)}{K}) + p(\hat{y}){\rm log}p(\hat{y})\\ \notag
%     & \leq - t \cdot p(\hat{y}) {\rm log} p(\hat{y}) - (1-t) \cdot \frac{1}{K} {\rm log} (\frac{1}{K}) + p(\hat{y}){\rm log}p(\hat{y})\\ \notag
%     & = (1-t)({\log} K - \mathcal{H}_{\bm{e}}(\hat{y}) ) \\\notag
%     & = \frac{||\bm{e}'-\bm{e}||^2 \cdot ({\log} K - \mathcal{H}_{\bm{e}}(\hat{y}) )}{||U||^2} \\
%     &\propto ||\bm{e}'-\bm{e}||^2   \notag
% \label{eq:entropy}
% \end{align}
% \end{small}
% Therefore, we proved the generated semi-factual can guarantee the upper bound of predictive entropy difference $\Delta\mathcal{H}$. As shown above, the variation of the entropy $\Delta\mathcal{H}$, is proportional to the reconstruction error $||\bm{e}'-\bm{e}||^2$, which can thus be used to interpret the uncertainty.

% % Hence, we can prove our framework guarantee the upper bound of predictive entropy difference $\Delta\mathcal{H}$, which also directs the learning objective of our loss function. As shown above, the variation of the entropy, $\Delta\mathcal{H}$, is proportional to the $||\bm{e}'-\bm{e}||^2$, which can thus be used to interpret the uncertainty in section 4.2.

% \subsection{Interpreting the Functionality of Loss}
% % We interpret the functionality of the combination of our loss functions and demonstrate the objective of our loss function in here as mentioned in the paper Section 4.1.

% Given an input representation $\bm{e}$, we can sample a reconstructed representation $\bm{e}'$ from our proposed framework. Then the conditional probability $P(\bm{e}'|\bm{e})$ can be given by the Nadaraya--Watson estimator~\citep{bierens1988nadaraya}:
% \begin{align}
% \notag
%      P(\bm{e}'|\bm{e}) &= \frac{P(\bm{e}',\bm{e})}{P(\bm{e})} \\ 
% \notag
%      &= \frac{\frac{1}{N} \Sigma_{i=1}^{N} \kappa(\frac{\bm{e}'-\bm{e}_i}{h}) \cdot \kappa(\frac{\bm{e}-\bm{e}_i}{h})}{\frac{1}{N}  \Sigma_{i=1}^{N} \kappa(\frac{\bm{e}-\bm{e}_i}{h})} 
% \notag
% \end{align}
% where $\kappa$ is a kernel function with parameter $h$, $\{\bm{e}_i\}_{i=1}^N$ is the set of samples from the training set. 
% Since the input feature for ${\bm{e}_i}$ is fixed, we only care about the updating of reconstruction through encoder-decoder architecture. Thus we have:
% \begin{align}
% \notag
%      P(\bm{e}'|\bm{e}) &\propto \frac{1}{N} \Sigma_{i=1}^{N} \kappa(\frac{\bm{e}'-\bm{e}_i}{h}) \cdot \kappa(\frac{\bm{e}-\bm{e}_i}{h})
% \end{align}
% To simplify the above estimation, we apply the RBF-kernel and triangle inequality in estimation and have:
% \begin{align}
% \notag
%      {\rm log} P(\bm{e}'|\bm{e}) &\propto {\rm log}\frac{1}{N} \Sigma_{i=1}^{N} \kappa(\frac{\bm{e}'-\bm{e}_i}{h}) \cdot \kappa(\frac{\bm{e}-\bm{e}_i}{h}) \\ \notag
%      &={\rm log}\frac{1}{N}\sum_{i=1}^N {\rm exp}(- ||\bm{e}'-\bm{e}_i||^2) \cdot {\rm exp}(- ||\bm{e}-\bm{e}_i||^2) \\ \notag
%    &={\rm log}\frac{1}{N}\sum_{i=1}^N {\rm exp}(- ||\bm{e}'-\bm{e}_i||^2 - ||\bm{e}-\bm{e}_i||^2) \\ \notag
%    &\leq {\rm log}\frac{1}{N}\sum_{j=1}^N {\rm exp}(- ||\bm{e}'-\bm{e}||^2) \\ \notag
%    &={\rm log}[{\rm exp}(- ||\bm{e}'-\bm{e}||^2)]
% \end{align}
% Thus, we are able to optimise ${\rm log} P(\bm{e}'|\bm{e}) $ by the mean squared error if we choose the natural logarithm function:
% \begin{align}
% \notag
%      {\rm ln} P(\bm{e}'|\bm{e}) \leq - ||\bm{e}'-\bm{e}||^2
% \end{align}
% % The reconstruct based predictive probability of $\hat{y}'$ 
% Considering to re-write the conditional predictive probability of $\hat{y}'$ with the reconstructed $\bm{e}'$:
% \begin{align}
% \notag
%      P(\hat{y}'|\bm{e}) = P(\hat{y}'|\bm{e}') \cdot P(\bm{e}'|\bm{e}) 
% \end{align}
% By taking a logarithm at both sides, we have:
% \begin{align}
% \notag
%      {\rm ln}P(\hat{y}'|\bm{e}) &= {\rm ln} \bigl(P(\hat{y}'|\bm{e}') \cdot P(\bm{e}'|\bm{e})\bigr) \\
% \notag
%      &\leq -||\bm{e}'-\bm{e}||^2 + {\rm ln} P(\hat{y}'|\bm{e}') 
% \notag
% \end{align}
% Therefore, for a given $\bm{e}$ and the reconstructed $\bm{e}'$, the difference between the original predictive probability $\hat{y}$ and the reconstruction based predictive probability $\hat{y}'$ can be measured by KL-divergence as:
% \begin{align}
% \notag
%      {\rm KL}_e(\hat{y}',\hat{y}) &= p(\hat{y}|\bm{e}){\rm log}\frac{p(\hat{y}|\bm{e})}{p(\hat{y}'|\bm{e})} \\ \notag
%      &= p(\hat{y}|\bm{e}){\rm log}p(\hat{y}|\bm{e}) - p(\hat{y}|\bm{e}){\rm log}p(\hat{y}'|\bm{e}) \\ \notag
%      &= p(\hat{y}|\bm{e}){\rm log}p(\hat{y}|\bm{e}) - p(\hat{y}|\bm{e})\cdot(-||\bm{e}'-\bm{e}||^2 + {\rm log} P(\hat{y}'|\bm{e}')) \\ \notag
%      &= p(\hat{y}|\bm{e}) \cdot ||\bm{e}'-\bm{e}||^2 + p(\hat{y}|\bm{e}){\rm log}p(\hat{y}|\bm{e}) - p(\hat{y}|\bm{e}){\rm log}P(\hat{y}'|\bm{e}') \\ \notag
%      &= p(\hat{y}|\bm{e}) \cdot ||\bm{e}'-\bm{e}||^2 + p(\hat{y}|\bm{e}){\rm log} \frac{p(\hat{y}|\bm{e})}{p(\hat{y}'|\bm{e}')} \notag
% \end{align}
% Here, the $p(\hat{y}|\bm{e})$ is fixed in the training progress. Then we can obtain that: 
% \begin{align}
% \notag
%      {\rm KL}_e(\hat{y}',\hat{y}) &\propto ||\bm{e}'-\bm{e}||^2 + {\rm KL}(p(\hat{y}|\bm{e}),p(\hat{y}'|\bm{e}'))
% \end{align}
% Therefore, in the loss function in our proposed method, we are able to use the above term to measure the similarity between the prediction based on the reconstructed representation and the prediction based on the original input representation. It is worth noting that both the terms of $\Delta \mathcal{H}$ and ${\rm KL}_{\bm{e}}(\hat{y}',\hat{y})$ are related to $||\bm{e}'-\bm{e}||^2$ but with opposite directions for optimisation. That is, we need to make a trade-off on the optimisation of $||\bm{e}'-\bm{e}||^2$ to guarantee that the difference between $\bm{e}'$ and $\bm{e}$ should be significant but without changing the prediction result. That is why we introduce an entropy term into the loss function with a weight of $\lambda$ to control the influence of this term. 

% This conclusion might be related to the theory of information bottleneck \citep{Bang_Xie_Lee_Wu_Xing_2021} where  $||\bm{e}'-\bm{e}||^2$ reflects the information bottleneck and ${\rm KL}(p(\hat{y}|\bm{e}),p(\hat{y}'|\bm{e}'))$ represent the decoding result.

% \section{Experimental Setup}

% \paragraph{Datasets}

% We evaluate our proposed framework on four datasets for \emph{linguistic acceptability classification}, \emph{natural language inference}, and \emph{emotion classification}. The dataset statistics are shown in Table \ref{table:dataset}.

% \begin{table}[h]
% \centering
% % \resizebox{\columnwidth}{!}{
% \begin{tabular}{lrrrr}
% \toprule
% Datasets &   CoLA & MultiNLI & Emotion & GoEmotions \\ \midrule
% Classes  &  2 & 3 & 6 & 27  \\   \midrule
%  Train   &   8,551   &  392,702     &   16,000 &   43,410 \\ 
%  Dev     &   1,043   &    20,000    &   2,000  &   5,427  \\ 
%  Test    &   1,043   &    20,000    &   2,000  &   5,426  \\ \midrule
% Total    &   10,637  &    432,702   &   20,000 &   58,009 \\
% \bottomrule
% \end{tabular}
% \caption{Statistics of the datasets.}
% \label{table:dataset}
% \end{table}
% \noindent\underline{Linguistic Acceptability Classification}. The CoLA (Corpus of Linguistic Acceptability) %is from the GLUE benchmark 
% \citep{cola} %,wang2018glue}. This dataset 
% contains sentences %from 23 linguistics publications %from books and journal articles, which are 
% annotated as \emph{grammatically acceptable} or \emph{not}.
% %to judge whether the sentence is a grammatical sentence. %We use this dataset to exam the ability of our latent variable model on the ability of grammatical classification and binary classification.

% \noindent\underline{Natural Language Inference}. The MultiNLI \citep{dataset_multinli} %,wang2018glue} 
% dataset contains annotations for relations of \emph{entailment}, \emph{contradiction}, and \emph{neutrality} between sentence pairs.%from the GLUE benchmark to verify our method's effectiveness on classification for sentence pairs, in which each 
% % contains sentence pairs, %each of which is annotated with one of the three categories, \emph{entailment}, \emph{contradictory}, and \emph{neutral}. %Training and testing split is the same as in the original dataset release. All the result we report for Multi NLI are test on the matched testset.

% \noindent\underline{Emotion Classification}. %We use two popular multi-class emotion classification datasets, GoEmotions \citep{dataset_goemotions} and Emotion \citep{dataset_emotion}. 
% The GoEmotions \citep{dataset_goemotions} dataset annotates Reddit comments with twenty-seven emotion labels (e.g., \emph{fear} and \emph{admiration}). %is a multi-label emotion classification dataset consisting of 54k Reddit comments categorised into 27 emotion labels.
% The Emotion \citep{dataset_emotion} dataset classifies English tweets into six emotion classes (e.g., \emph{sadness} and \emph{joy}). Note that the GoEmotions dataset allows multi-label settings that a sentence can be annotated with more than one emotion label. In our experimental setup, we only focus on multi-class classification, and we thus filtered out those instances annotated with multiple labels in the GoEmotions.



% \paragraph{Baselines}

% We compare our method with the following baselines:

% \noindent\underline{Label Smoothing} \citep{ Gupta_Kvernadze_Srikumar_2021} is commonly used to deal with overfitting %or overconfident problem 
% when using cross-entropy loss on classification tasks. It aims to uniform the distribution of labels to encourage small logit gaps and has been shown effective in calibrating PLM-based classifiers. %existing researches  %have proved its capability on PLM calibration. 

% \noindent\underline{MC Dropout} \citep{monte_carlo_dropout} %, Monte-Carlo (MC) dropout 
% is an uncertainty estimation technique that performs multiple stochastic forward passes by randomly switching neurons off to generate an ensemble of predictions. We follow the implementation of \citet{acl2022_uncertainty_transformers} in our experiments. 
% % is a popular regularisation technique by applying Monte Carlo samples from the space of available models by randomly switching neurons off to generate different outputs. This method is similar to approximating variational inference in a deep Gaussian process %, it uses the approximating variational distribution with Bernoulli variables related to network units 
% % \citep{acl2022_uncertainty_transformers}.

% \noindent\underline{Bayesian Neural Network (BNN)} \citep{8371683} assumes weights of neural networks are random variables with a prior distribution, is thus able to obtain more robust predictions by sampling the network weights during inference, and is often used for uncertainty estimation. Motivated by \citet{getting_clue}, we also implemented a BNN plug-in framework as a comparison with our MGP-based framework. Specifically, we use a Bayesian linear layer\footnote{\url{https://github.com/piEsposito/blitz-bayesian-deep-learning}} as the encoder and a linear layer as the decoder, and then insert them between the PLM-encoding layer and the classification layer, similar to the way we ensemble the MGP.
% %\citet{getting_clue,8371683} BNN is typical way to learn a set of prior parameters from training data that gives predictions  distribution to model the epistemic uncertainty and doing Bayesian inference on its weights. This enables the predictor fits on the training data and reasons about the uncertainty of its own prediction on test data. 
% % The output of the BNN is based on distribution of sampled historical prior information. 
% %In the experiments, we implemented this method with a Bayesian linear layer\footnote{\url{https://github.com/piEsposito/blitz-bayesian-deep-learning}} as encoder and adopt a classic linear layer as decoder, insert between %We insert a BNN layer between 
% %the PLM-encoding layer and the classification layer. 
% %The purpose is to introduce uncertainty to the network parameters. This allows us to additionally reduce the variance of model predictions apart from minimising the cross-entropy loss during training so as to obtain a more robust model. We implement the BNN layer using the  Blitz library\footnote{\url{https://github.com/piEsposito/blitz-bayesian-deep-learning}}.

% % \noindent\underline{SNGP} \citep{sngp} A Spectral-normalized Neural Gaussian Processes approach to attain an uncertainty performance.

% \paragraph{Evaluation Metrics}

% We use accuracy (Acc) and macro-averaged F1 (F1) to evaluate the classification results, and Expected Calibration Error (ECE) \citep{desai-durrett-2020-calibration} calculated on predictive probabilities during inference to measure model calibration. 
% % Details of the ECE calculation can be found in Appendix \ref{sec:exp-setup}. 
% % %For model uncertainty measurement, we follow  \citet{desai-durrett-2020-calibration} to adopt expected calibration error (ECE) calculated on predictive probabilities during inference. Detail of ECE calculation can be found in Appendix \ref{sec:exp-setup}.
% % \paragraph{ECE calculation}
% For ECE implementation, we use the formula provided by \citet{softmax_uncertain} as follows:
% \begin{small}
% \begin{gather}
%     \text{acc}(B_m) = \frac{1}{|B_m|} \sum_{i\in B_m} 1(\hat{y}_i = y_i), \quad\quad
%     \text{conf}(B_m) = \frac{1}{|B_m|} \sum_{i\in B_m} \hat{p}_i, \\
%     \text{ECE} = \sum_{m=1}^M \frac{|B_m|}{n} \left|\text{acc}(B_m)-\text{conf}(B_m)\right|.
% \end{gather}
% \end{small}

% Predictions of $n$ samples are grouped into $M$ interval bins and the accuracy is calculated for each bin. $B_m$ is the set of indices of samples whose prediction confidence falls into the $m$-th interval bin. The ECE is the weighted average, over all bins, of the absolute difference between the accuracy $\text{acc}(B_m)$ and the average confidence $\text{conf}(B_m)$ within bin $B_m$. 
% In our experiments, we used 9 interval bins for our prediction results. 


% \paragraph{Hyperparameters Settings} 

% We adopted the Pytorch-Transformers package\footnote{\url{https://github.com/huggingface/pytorch-transformers}} for the implementation of all our Transformer-based language models. For each model, we chose its corresponding base model with the following parameter size: ALBERT-base-v2 (11M), distilBERT-base-uncased (66M), BERT-base-uncased (110M), and RoBERTa-base (125M). %Our experiments used two types of graphic cards: Nvidia Titan RTX and Quadro RTX 6000. 
% We fine-tuned all these base models for 20 epochs with a batch size of 16 on each target dataset and used them as the base models for comparison. For the Label Smoothing and MC Dropout baselines, the methods directly modify the PLM-based models and were fine-tuned together with the PLM for 20 epochs with a batch size of 16. For the BNN and our MGP plug-in methods, we first fine-tuned the base models for 20 epochs, then froze the PLM encoding and classifier parameters and fine-tuned the BNN or MGP module for a further 50 epochs (with a batch size of 16 for both modules).
% % We fine-tuned our base models and baselines for 20 epochs with batch size of 16, and 50 epochs with batch size 16 on our plugged in BNN and VAE modules. 
% A learning rate of $2 \times 10^{-5}$ and an early-stopping strategy were applied in all training runs. %To exam the effectiveness of methods, 
% Each model was trained 5 times with different random seeds. %: $0$, $1$, $2$, $3$, $4$. 
% For each model, we report the mean and standard deviation of the evaluation results obtained by the five trained models on test sets. %value of 5 times of experiments with deviation from mean. For the chose of base models on BNN and VAE methods, we selected the base models that reported the median ECE value than other fine-tuned models. 
% % Because of our interpretation method aims to provide explanation for cases, the implementation of our interpretation part only supports batch size of 1.

% % For our base models and baselines fine-tuning process, we set the learning rate to $2e-5$ and used batch size of 16 trained for 20 epochs. For BNN and VAE fine-tuning processes, we trained our models with 16 batch size for 50 epochs. All the models trained with learning rate of $2e-5$ and applied early stop strategy, all the models are trained 
% % we used  We applied the early stop strategy for the base model fine-tuning process on all three datasets. We validated our methods with batch size of 8, 16 and 32 on a Nvidia 3090 GPU. The token uncertainty decomposition and analysis process require the batch size of 1. 
% % \begin{align}
% %     \bm{u} &= \bm{e}' - \bm{e} \\
% %     \bm{u} &= \mathcal{N}(0,\Sigma) \\
% %     \Delta \mathcal{H} &= \mathcal{H}' - \mathcal{H}\\
% %     &= - \sum_{k=1}^K p(\hat{y}'_k|\bm{e}')\log p(\hat{y}'_k|\bm{e}') - (- \sum_{k=1}^K p(\hat{y}_k|\bm{e})\log p(\hat{y}_k|\bm{e}))\\
% %     &= \sum_{k=1}^K p(\hat{y}_k|\bm{e})\log p(\hat{y}_k|\bm{e}) - \sum_{k=1}^K p(\hat{y}'_k|\bm{e}')\log p(\hat{y}'_k|\bm{e}') \\
% %     % &= \sum_{k=1}^K p(\hat{y}_k|\bm{e})\log p(\hat{y}_k|\bm{e}) - \sum_{k=1}^K p(\hat{y}'_k|\bm{e} + \bm{u})\log p(\hat{y}'_k|\bm{e} + \bm{u}) \\
% %     &= \sum_{k=1}^K p(\hat{y}_k|\bm{e})\log p(\hat{y}_k|\bm{e}) - \sum_{k=1}^K p(\hat{y}'_k|\bm{e})p(\hat{y}'_k|\bm{u})\log p(\hat{y}'_k|\bm{e})p(\hat{y}'_k|\bm{u}) \\
% %     &= \sum_{k=1}^K p(\hat{y}_k|\bm{e})\log p(\hat{y}_k|\bm{e}) - \sum_{k=1}^K p(\hat{y}'_k|\bm{e})p(\hat{y}'_k|\bm{u})\log p(\hat{y}'_k|\bm{e}) - \sum_{k=1}^K p(\hat{y}'_k|\bm{e})p(\hat{y}'_k|\bm{u})\log p(\hat{y}'_k|\bm{u}) \\
% %     % &= \sum_{k=1}^K p(\hat{y}_k|\bm{e})\log p(\hat{y}_k|\bm{e}) - \sum_{k=1}^K p(\hat{y}'_k|\bm{e})\log p(\hat{y}'_k|\bm{e}) \sum_{k=1}^K p(\hat{y}'_k|\bm{u})\log p(\hat{y}'_k|\bm{e}) - \sum_{k=1}^K p(\hat{y}'_k|\bm{e})p(\hat{y}'_k|\bm{u})\log p(\hat{y}'_k|\bm{u}) \\
% %     &= \sum_{k=1}^K p(\hat{y}_k|\bm{e})\log p(\hat{y}_k|\bm{e}) - \sum_{k=1}^K p(\hat{y}'_k|\bm{e}')\log p(\hat{y}'_k|\bm{e}) - \sum_{k=1}^K p(\hat{y}'_k|\bm{e})p(\hat{y}'_k|\bm{u})\log p(\hat{y}'_k|\bm{u}) \\
% %     &= \sum_{k=1}^K p(\hat{y}_k|\bm{e})\log p(\hat{y}_k|\bm{e}) - \sum_{k=1}^K p(\hat{y}'_k|\bm{e}')\log p(\hat{y}'_k|\bm{e}) - \sum_{k=1}^K p(\hat{y}'_k|\bm{e})\log p(\hat{y}'_k|\bm{u}) \sum_{k=1}^K p(\hat{y}'_k|\bm{u})\log p(\hat{y}'_k|\bm{u}) \\
% %     &= \sum_{k=1}^K p(\hat{y}_k|\bm{e})\log p(\hat{y}_k|\bm{e}) - \sum_{k=1}^K p(\hat{y}'_k|\bm{e}')\log p(\hat{y}'_k|\bm{e}) - \sum_{k=1}^K p(\hat{y}'_k|\bm{e})\log p(\hat{y}'_k|\bm{u}) \sum_{k=1}^K p(\hat{y}'_k|\bm{u})\log p(\hat{y}'_k|\bm{u}) \\
% % \end{align}
% % \begin{align}
% %     KL(\hat{y}'|\hat{y}) &= \sum_{k=1}^K p(\hat{y}'_k|\bm{e}')\log \frac{p(\hat{y}'_k|\bm{e}')}{p(\hat{y}_k|\bm{e}) }\\
% %     &\geq (\sum_{k=1}^K p(\hat{y}'_k|\bm{e}'))\log \frac{p(\hat{y}'_k|\bm{e}')}{p(\hat{y}_k|\bm{e}) }\\
% %     &\geq (\sum_{k=1}^K p(\hat{y}'_k|\bm{e}'))\log p(\hat{y}'_k|\bm{e}') - (\sum_{k=1}^K p(\hat{y}'_k|\bm{e}'))\log{p(\hat{y}_k|\bm{e}) }\\
% %     &\geq \sum_{k=1}^K p(\hat{y}'_k|\bm{e}')\log p(\hat{y}'_k|\bm{e}') - (\sum_{k=1}^K p(\hat{y}'_k|\bm{e}'))\log{p(\hat{y}_k|\bm{e}) }\\
% % \end{align}
% % \begin{align}
% %     &\leq - t \cdot p(y) \log(t \cdot p(y)) - t \cdot p(y) \log ((1-t) \cdot \frac{1}{K}) - (1-t) \cdot \frac{1}{K} \log(t \cdot p(y)) - (1-t) \cdot \frac{1}{K} \log ((1-t) \cdot \frac{1}{K}) +  p(y)\log p(y) \\
% %     &= H(t \cdot p(y)) + KL(t\cdot p(y)| (1-t)/K) + KL( (1-t)/K | t\cdot p(y)) + H((1-t)/K) - H_e{Y}
% % \end{align}

% \section{Further Experimental Results}\label{sec:futher_experimental_results}

% \subsection{Ablation Study}

% %In this section, 
% We conduct an ablation study to investigate the contribution of various components in our framework.

% \paragraph{Stability of training loss}
 
% As discussed in Section 4.1, our objective function contains four loss terms that are implemented by three loss functions, i.e., the reconstruction loss, the reconstructed cross-entropy loss, and the orthogonality loss that encourages the orthogonality of the dimensions of the latent vector $\bm{z}$. We investigate how orthogonal regularisation benefits training stability by replacing the orthogonality loss with either a KL-divergence loss or a Wasserstein loss, where the KL-divergence loss encourages the distribution of latent variables to follow the prior standard Gaussian distribution and is widely used in general MGP \citep{vae,scholar_vae}, while the Wasserstein loss encourages the latent variables to follow a Dirichlet distribution and is used in the Wasserstein Auto-Encoder (WAE) \citep{nan-etal-2019-topic, tolstikhin2018wasserstein}.
% % , we fine-tuned two additional BERT models, either with the orthogonality loss term replaced by a KL-divergence loss \citep{vae,scholar_vae} or a Wasserstein loss \citep{nan-etal-2019-topic, tolstikhin2018wasserstein}, to investigate the training stability. %, we implemented a KL divergence loss \citep{scholar_vae} and a Wasserstein loss \citep{nan-etal-2019-topic, tolstikhin2018wasserstein}. 
% % The KL-divergence loss encourages the distribution of latent variables %encoder's mean and variance parameters 
% % to follow the prior standard Gaussian distribution, while the Wasserstein loss enforces the latent variables to follow a Dirichlet distribution. 
% The total loss (including the reconstruction loss and the cross-entropy loss) curves during training are shown in Figure \ref{fig:loss_stability}. We observe that the total loss replaced by either the KL-divergence loss or the Wasserstein loss exhibits %Both loss terms are highlighted in red color, and we can observe a clear 
% fluctuation during the training process across all datasets. %Mean while, the different value of pairwise distance also indicates other loss term could potentially impact the ability of pairwise loss to reduce the distance between $\bm{e}'$ and $\bm{e}$. 
% In contrast, the loss with orthogonal regularisation is very stable. We further show the evaluation results with various loss terms in Table~\ref{table:loss}.\footnote{The reported results are from the single runs that we used to generate the loss stability graphs.} It can be observed that our proposed framework with the orthogonality loss achieves better ECE results than with the KL or Wasserstein loss. %All of these experiments examined this tendency and displayed the capability of orthogonality can efficiently force the dimensions to be orthogonal and help to enhance the interpretability.

% \begin{figure*}[h!]
%   \centering
%   \includegraphics[width=\columnwidth]{resources/loss_visuals/loss_graph.pdf}
% \caption{Comparison of the stability of the total loss for three loss terms trained with BERT model on four datasets. Red: total loss with KL divergence loss; Green: total loss with Wasserstein loss; Blue: total loss with Orthogonality loss.}
% \label{fig:loss_stability}
% \end{figure*} 

% % \begin{table*}[h!]
% % \centering
% % \resizebox{0.85\columnwidth}{!}{
% % \begin{tabular}{llllllllll}
% % \toprule
% %                  & \multicolumn{3}{c}{BERT VAE w/  Orthogonality}                                                              & \multicolumn{3}{c}{BERT VAE w/ KL}                                                              &\multicolumn{3}{c}{BERT VAE w/ Wasserstein }                          \\
% % \cmidrule(lr){2-4} \cmidrule(lr){5-7}   \cmidrule(lr){8-10}
% % Datasets             &  Acc    &  F1   & ECE$\downarrow$    &  Acc    &  F1    & ECE$\downarrow$   &  Acc    &  F1    & ECE$\downarrow$  \\ \midrule
% % CoLA& 0.8130&0.7459&\textbf{0.0640}&0.8072&0.7300&0.1090&0.8044&0.7240&0.1111\\
% % GoEmotions&0.6298&0.4661&\textbf{0.0321}&0.6298&0.4752&0.0695&0.6263&0.4608&0.0600\\
% % Emotion&0.9255&0.8827&\textbf{0.0322}&0.9270&0.8847&0.0431&0.9275&0.8853&0.0441\\
% % MultiNLI&0.8284&0.8278&\textbf{0.0272}&0.8294&0.8290&0.0418&0.8291&0.8286&0.0344\\
% % \bottomrule
% % \end{tabular}}

% % \caption{Comparison of the performance of the BERT model fine-tuned with different loss terms on four datasets. }%We only reported each single model's result, of which we used to generate loss stability graphs.}
% % \label{table:loss}
% % \end{table*}
% \begin{table*}[h!]
% \centering
% \resizebox{\columnwidth}{!}{
% \begin{tabular}{lllllllllllll}
% \toprule
%                  & \multicolumn{4}{c}{BERT MGP w/  Orthogonality}                                                              & \multicolumn{4}{c}{BERT MGP w/ KL}                                                              &\multicolumn{4}{c}{BERT MGP w/ Wasserstein }                          \\
% \cmidrule(lr){2-5} \cmidrule(lr){6-9}   \cmidrule(lr){10-13}
% Datasets & Acc    &  F1  & $\mathcal{H}$  & ECE$\downarrow$&Acc    &  F1  & $\mathcal{H}$  & ECE$\downarrow$&Acc    &  F1  & $\mathcal{H}$  & ECE$\downarrow$\\ \midrule
% CoLA& 0.8130&0.7459&0.4986&\textbf{0.0640}&0.8072&0.7300&0.3407&0.1090&0.8044&0.7240&0.3499&0.1111\\
% GoEmotions&0.6298&0.4661&0.4333&\textbf{0.0321}&0.6298&0.4752&0.3345&0.0695&0.6263&0.4608&0.3437&0.0600\\
% Emotion&0.9255&0.8827&0.0984&\textbf{0.0322}&0.9270&0.8847&0.0518&0.0431&0.9275&0.8853&0.0510&0.0441\\
% MultiNLI&0.8284&0.8278&0.3650&\textbf{0.0272}&0.8294&0.8290&0.3194&0.0418&0.8291&0.8286&0.3343&0.0344\\
% \bottomrule
% \end{tabular}}

% \caption{Comparison of the performance of the BERT model fine-tuned with different loss terms on four datasets. }%We only reported each single model's result, of which we used to generate loss stability graphs.}
% \label{table:loss}
% \end{table*}
 

% %\subsection{Visualisation of loss stability}
% \paragraph{Training Loss Stability with Additional Loss Terms} 
% We further examine the training loss stability when adding the KL or Wasserstein distance loss terms to our framework. We fine-tuned two BERT-base-uncased MGP models on the Emotion dataset. It can be observed in Figure~\ref{fig:loss_compare_individual} that the pairwise distance (i.e., the reconstruction loss) is very unstable and keeps fluctuating during training, while the orthogonality loss shows a stable decreasing trend and converges quickly. If we only compare the KL loss with the Wasserstein loss, we can see that the Wasserstein loss is more stable than the KL loss. %The fluctuation on pairwise loss can be impact by the distribution loss terms. Because the KL loss is obviously more unstable, the models tend to have higher average pairwise distance. This may also 
% Our visualisation results show that the prior distributions assumed by the KL or the Wasserstein loss may not be suitable for reconstructing PLM-encoded representations, thus leading to higher ECE results compared to using the orthogonality regulariser. %could not lead to a more calibrated model.
% \begin{figure*}[ht]
% \centering
% \subfloat[Additional KL divergence loss term.]{
%     \includegraphics[width=0.5\columnwidth]{resources/kl_loss.png}}
% %
% \subfloat[Additional Wasserstein loss term.]{
%     \includegraphics[width=0.5\columnwidth]{resources/Wasserstein.png}}
% \caption{Comparison of BERT models trained on Emotions with additional loss term. Blue: Reconstruction loss; Red: KL loss in (a) and Wasserstein loss in (b); Green: Orthogonality loss.}
% \label{fig:loss_compare_individual}
% \end{figure*}
% %  \begin{figure}[h!]
% % \centering
% % \begin{subfigure}{.4\columnwidth}
% %   \centering
% %   \includegraphics[width=\columnwidth]{resources/kl_loss.png}
% %   \caption{Additional KL divergence loss term.}
% % \end{subfigure}%
% % \begin{subfigure}{.4\columnwidth}
% %   \centering
% %   \includegraphics[width=\columnwidth]{resources/Wasserstein.png}
% %   \caption{Additional Wasserstein loss term.}
% % \end{subfigure}
% % \caption{Comparison of BERT models trained on Emotions with additional loss term. Blue: Reconstruction loss; Red: KL loss in (a) and Wasserstein loss in (b); Green: Orthogonality loss.}
% % \label{fig:loss_compare_individual}
% % \end{figure}

% \paragraph{Latent Space Orthogonality}
% \begin{table*}[h!]
% \centering
% \resizebox{\columnwidth}{!}{
% \begin{tabular}{lllllllll}
% \toprule
%                  & \multicolumn{4}{c}{BERT MGP w/  Orthogonality}                                                              & \multicolumn{4}{c}{BERT MGP w/o Orthogonality}                             \\
% \cmidrule(lr){2-5} \cmidrule(lr){6-9}   
% Datasets &  Acc    &  F1  & $\mathcal{H}$  & ECE$\downarrow$    &  Acc    &  F1  & $\mathcal{H}$  & ECE$\downarrow$   \\ \midrule
% CoLA& 0.8123±0.0012	&0.8762±0.0007&0.4991±0.0032&\textbf{0.0677±0.0056}&0.8042±0.0011&0.7230±0.0024&0.3458±0.0047&0.1121±0.0020 \\
% GoEmotions& 0.6282±0.0029 &	0.4712±0.0087 &0.4433±0.0159 &\textbf{0.0326±0.0013} & 0.6291±0.0019	& 0.4652±0.0037 &0.3432±0.0014& 0.0615±0.0023 \\
% Emotion& 0.9259±0.0009&	0.885±0.0015&0.1031±0.0082&\textbf{0.0289±0.0043}& 0.9268±0.0003	& 0.8848±0.0006	& 0.0519±0.0009&0.0430±0.0016 \\
% MultiNLI& 0.8283±0.0005&0.8277±0.0005&0.3665±0.003&\textbf{0.0262±0.0021} & 0.8281±0.0009	& 0.8277±0.0009 &0.3317±0.0009& 0.0370±0.0010 \\
% \bottomrule
% \end{tabular}}
% \caption{Comparison of results on BERT models trained with/without latent space orthogonality.}
% \label{table:orthogonal}
% \end{table*}
% \begin{figure}[h!]
%     \small
%   \centering
%   \includegraphics[width=0.6\columnwidth]{resources/new_latent_dim/bert-non_orth.png}
%   \caption{ECE and average entropy with latent dimension removal from the model trained without the orthogonality regulariser. The $x$-axis represents the index of \textbf{removed} dimensions ranked by their relevance to $\Delta e$; a smaller index number indicates that the latent dimension is more similar to $\Delta e$. Histograms show the ECE scores after removing the corresponding latent dimensions. The blue curve shows the predictive entropy. The green and red curves show the classification accuracy and F1, respectively.}
%   \label{fig:non_ort}
% \end{figure} 

% As explained in Section 4.1, the orthogonality regulariser facilitates a better interpretation of the latent space. In Table~\ref{table:orthogonal}, we compare the overall performance of BERT models trained with and without latent space orthogonality. The PLMs fine-tuned with Eq. (7) significantly outperform their counterparts without the orthogonality regularisation in ECE and average entropy on all four datasets. It is also interesting to find that the F1 scores slightly decrease for the models trained without the orthogonality regularisation on almost all the datasets. Therefore, the orthogonality loss term helps the decoder network to facilitate the same Gaussian distribution on the latent space to generate better semi-factuals that make uncertain predictions. 

% % \begin{table*}[h!]
% % \centering
% % \resizebox{0.9\columnwidth}{!}{
% % \begin{tabular}{lllllll}
% % \toprule
% %                  & \multicolumn{3}{c}{BERT VAE w/  Orthogonality}                                                              & \multicolumn{3}{c}{BERT VAE w/o Orthogonality}                             \\
% % \cmidrule(lr){2-4} \cmidrule(lr){5-7}   
% % Datasets             &  Acc    &  F1   & ECE$\downarrow$    &  Acc    &  F1    & ECE$\downarrow$    \\ \midrule
% % CoLA& 0.8123±0.0012	&0.8762±0.0007&\textbf{0.0677±0.0056}&0.8042±0.0011&0.7230±0.0024&0.1121±0.0020 \\
% % GoEmotions& 0.6282±0.0029 & 0.4712±0.0087& \textbf{0.0326±0.0013} & 0.6291±0.0019	& 0.4652±0.0037 & 0.0615±0.0023 \\
% % Emotion& 0.9259±0.0009&	0.8850±0.0015& \textbf{0.0289±0.0043} & 0.9268±0.0003	& 0.8848±0.0006	& 0.0430±0.0016 \\
% % MultiNLI& 0.8283±0.0000 &	0.8277±0.0005 & \textbf{0.0262±0.0021} & 0.8281±0.0009	& 0.8277±0.0009& 0.0370±0.0010 \\
% % \bottomrule
% % \end{tabular}}
% % \caption{Comparison of results on BERT models trained with/without latent space orthogonality.}
% % \label{table:orthogonal}
% % \end{table*}

% We performed a further ablation study to examine the interpretability of the latent space without being regularised by orthogonality. As shown in Figure \ref{fig:non_ort}, without the orthogonality loss term, there is no clear relationship between the tendency of ECE scores and the average entropy during the removal of latent dimensions ranked by their influential scores. As described in Section 4.1, without orthogonality we are not able to maintain the Lipschitz constant consistently, hence we can see an obvious fluctuation in Accuracy and F1. Without a consistent tendency, it is thus %the average entropy and ECE score doesn't share the same tendency. This described disorder makes it 
% difficult to investigate each latent dimension's importance and interpret the impact of each latent dimension on model predictive uncertainty. %Hence, without formula \ref{eq:orthogonality}, we cannot interpret the decision making process and find out possible uncertain words with latent dimensions.


% \begin{figure}[h!]
% \centering
% \subfloat[ALBERT MGP on CoLA.]{
%   \centering
%   \includegraphics[width=0.4\columnwidth]{resources/new_latent_dim/albert-cola.png}}
% %
% \subfloat[ALBERT MGP on GoEmotions.]{
%   \centering
%   \includegraphics[width=0.4\columnwidth]{resources/new_latent_dim/albert-goemo.png}}\\
% \subfloat[ALBERT MGP on Emotion.]{
%   \centering
%   \includegraphics[width=0.4\columnwidth]{resources/new_latent_dim/albert-emo.png}}
% %
% \subfloat[ALBERT MGP on MultiNLI.]{
%   \includegraphics[width=0.4\columnwidth]{resources/new_latent_dim/albert-nli.png}}\\
% \subfloat[DistilBERT MGP on CoLA.]{
%   \includegraphics[width=0.4\columnwidth]{resources/new_latent_dim/distilbert-cola.png}}
% %
% \subfloat[DistilBERT MGP on GoEmotions.]{
%   \includegraphics[width=0.4\columnwidth]{resources/new_latent_dim/distilbert-goemo.png}}\\
% \subfloat[DistilBERT MGP on Emotion.]{
%   \includegraphics[width=0.4\columnwidth]{resources/new_latent_dim/distilbert-emo.png}}
% %
% \subfloat[DistilBERT MGP on MultiNLI.]{
%   \includegraphics[width=0.4\columnwidth]{resources/new_latent_dim/distilbert-nli.png}}\\
% \subfloat[RoBERTa MGP on CoLA.]{
%   \includegraphics[width=0.4\columnwidth]{resources/new_latent_dim/roberta-cola.png}}
% %
% \subfloat[RoBERTa MGP on GoEmotions.]{
%   \includegraphics[width=0.4\columnwidth]{resources/new_latent_dim/roberta-goemo.png}}\\
% \subfloat[RoBERTa MGP on Emotion.]{
%   \includegraphics[width=0.4\columnwidth]{resources/new_latent_dim/roberta-emo.png}}
% %
% \subfloat[RoBERTa MGP on MultiNLI.]{
%   \includegraphics[width=0.4\columnwidth]{resources/new_latent_dim/roberta-nli.png}}
%      \caption{Evaluation results by removing latent dimensions. The $x$-axis represents the index of \textbf{removed} dimensions ranked by their relevance to $\Delta e$; a smaller index number indicates that the latent dimension is more similar to $\Delta e$. Histograms show the ECE scores after removing the corresponding latent dimensions. The blue curve shows the predictive entropy. The green and red curves show classification accuracy and F1, respectively.}
%     \label{fig:latent_ablation}
% \end{figure}

% \subsection{Results with Latent Dimension Removal} 
% We study the impact of removing ranked latent variable dimensions on the other three models: ALBERT, DistilBERT and RoBERTa. As shown in Figure~\ref{fig:latent_ablation}, on almost all the models we observe the same trend: the ECE score and average entropy increase when latent dimensions ranked by their influential scores are removed, while accuracy and macro F1 scores remain almost unchanged. This demonstrates that our MGP framework can be generalised to interpret uncertainty via latent dimensions across various models and datasets. On the CoLA dataset, both ALBERT and RoBERTa exhibit a pattern similar to that of the BERT model; we can also observe a peak in the average entropy.
% Our method effectively distinguishes the importance between latent dimensions, and thus we can use those dimensions to interpret token level uncertainty as discussed in the paper Section 5.3.
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


\end{document}


% This document was modified from the file originally made available by
% Pat Langley and Andrea Danyluk for ICML-2K. This version was created
% by Iain Murray in 2018, and modified by Alexandre Bouchard in
% 2019 and 2021 and by Csaba Szepesvari, Gang Niu and Sivan Sabato in 2022.
% Modified again in 2023 by Sivan Sabato and Jonathan Scarlett.
% Previous contributors include Dan Roy, Lise Getoor and Tobias
% Scheffer, which was slightly modified from the 2010 version by
% Thorsten Joachims & Johannes Fuernkranz, slightly modified from the
% 2009 version by Kiri Wagstaff and Sam Roweis's 2008 version, which is
% slightly modified from Prasad Tadepalli's 2007 version which is a
% lightly changed version of the previous year's version by Andrew
% Moore, which was in turn edited from those of Kristian Kersting and
% Codrina Lauth. Alex Smola contributed to the algorithmic style files.
