% \documentclass{uai2024} % for initial submission
\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% \usepackage{hyperref}
% \usepackage[colorlinks=true,
%             citecolor=blue!50!black,
%             linkcolor=red!50!black,
%             urlcolor=green!50!black]{hyperref}
\usepackage{url}          
\usepackage{amsfonts}      
\usepackage{nicefrac}      
\usepackage{graphicx}
\usepackage{doi}

\usepackage{amsmath,amsthm}
\usepackage{graphicx}       % graphics
\usepackage{csquotes}
\usepackage{xcolor}
\usepackage{bbm}
\usepackage{bm}
\usepackage{subcaption}
\usepackage{caption}
\usepackage{enumitem}

\usepackage{multirow}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>} typesets <b>, then <sep> (optional, default "-"),
% then <a>, i.e. the two mandatory arguments in reversed order.
\newcommand{\swap}[3][-]{#3#1#2} % just an example



%% Our stuff
% Math operators for the probabilistic / information-theoretic notation.
%
% Operator names are wrapped in \textup rather than \text: \text inherits the
% shape of the surrounding text, so inside italic theorem-like environments
% (amsthm's default "plain" style; no \theoremstyle is set in this preamble)
% an operator such as \Var would be typeset in italics. \textup keeps the
% current text font but forces the upright shape.
%
% \prob : blackboard-bold P
% \lab  : the label space (calligraphic Y)
\DeclareMathOperator*{\prob}{\mathbb{P}}
\DeclareMathOperator*{\lab}{\mathcal{Y}}

% \plab  : P(Y)
% \pplab : P(P(Y))
\DeclareMathOperator*{\plab}{\prob(\mathcal{Y})}
\DeclareMathOperator*{\pplab}{\prob(\prob(\mathcal{Y}))}
% \Dir presumably denotes the Dirichlet distribution -- TODO confirm at use site.
\DeclareMathOperator*{\Dir}{\textup{Dir}}
\DeclareMathOperator*{\Var}{\textup{Var}}
\DeclareMathOperator*{\Cov}{\textup{Cov}}
% \ent : entropy H;  \mi : mutual information I;  \kl : D_{KL}
\DeclareMathOperator*{\ent}{\textup{H}}
\DeclareMathOperator*{\mi}{\textup{I}}
\DeclareMathOperator*{\kl}{D_{KL}}

% Theorem-like environments.
% Every environment below has its own counter, reset with each \section, so
% numbers take the form <section>.<n> independently per environment type.
% No \theoremstyle is selected, so all of them use the default style.
\newtheorem{theorem}{Theorem}[section]
% \newtheorem*{theorem*}{Theorem}
\newtheorem{lemma}{Lemma}[section]
\newtheorem{prop}{Proposition}[section]
\newtheorem{corollary}{Corollary}[section]
\newtheorem{result}{Result}[section]
\newtheorem{definition}{Definition}[section]
\newtheorem{claim}{Claim}[section]
\newtheorem{remark}{Remark}[section]
\newtheorem{example}{Example}[section]
%\newtheorem{proof}{Proof}[section]
% NOTE(review): "defi" prints the same heading as "definition" above but counts
% separately; mixing both would yield duplicate "Definition x.y" numbers.
\newtheorem{defi}{Definition}[section]

% Shorthands for total (TU), epistemic (EU), and aleatoric (AU) uncertainty.
% \textup (instead of \text) keeps the labels upright even when they appear in
% math inside italic text, e.g. in theorem-like environments, where \text
% would inherit the italic shape of the surrounding text.
\newcommand{\TU}{\textup{TU}}
\newcommand{\EU}{\textup{EU}}
\newcommand{\AU}{\textup{AU}}

% \mathsymbol{\cmd}{name}: defines \cmd as the math-italic word "name", usable
% in both text and math mode (via \ensuremath).
% NOTE(review): the generated command calls \xspace, but no \usepackage{xspace}
% is visible in this preamble -- confirm the class provides it before use.
\newcommand{\mathsymbol}[2]{\newcommand{#1}{\ensuremath{\mathit{#2}}\xspace}}
% Vectors are typeset as bold symbols instead of with an arrow accent.
\renewcommand{\vec}[1]{\boldsymbol{#1}}
% Bold theta: a first-order distribution (point in the simplex).
\newcommand{\vtheta}{{\vec{\theta}}}
% Conditioning bar surrounded by thin spaces.
\newcommand{\given}{\, | \,}
% Long right arrow used for mappings.
\newcommand{\fromto}{\longrightarrow}

% \ksimplex: \Delta_K.  \ksimplextwo[<n>]: \Delta_<n>^{(2)}, with <n> defaulting
% to K.
\newcommand{\ksimplex}{\Delta_K}
\newcommand{\ksimplextwo}[1][K]{\Delta_{#1}^{(2)}}

% \defeq: "is defined as" relation, built from two stacked dots followed by "=".
\newcommand*{\defeq}{\mathrel{\vcenter{\baselineskip0.5ex \lineskiplimit0pt
			\hbox{\footnotesize.}\hbox{\footnotesize.}}}%
=}

% Define colors
% Named RGB colours (0-255 per channel). PythonRed and PythonBlue are used in
% the caption of Fig. 1 to mark aleatoric and epistemic uncertainty,
% respectively.
\definecolor{darkolivegreen}{RGB}{85, 107, 47}
\definecolor{maroon}{RGB}{128, 0, 0}
\definecolor{PythonRed}{RGB}{255, 0, 0} 
\definecolor{PythonBlue}{RGB}{31, 119, 180}
\definecolor{Green}{RGB}{0, 128, 0}
\definecolor{Orange}{RGB}{255, 165, 0}


\title{Label-wise Aleatoric and Epistemic Uncertainty Quantification}

% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1,3]{Yusuf~Sale}
\author[1,3]{Paul~Hofman}
\author[1,3]{Timo~L\"ohr}
\author[2,3]{Lisa~Wimmer}
\author[2,3]{Thomas~Nagler}
\author[1,3]{Eyke~H\"ullermeier}
% \author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
    Institute of Informatics\\
    LMU Munich\\
    Munich, Germany
}
\affil[2]{%
    Department of Statistics\\
    LMU Munich\\
    Munich, Germany
}
\affil[3]{%
    Munich Center for Machine Learning
  }
  
  \begin{document}
\maketitle

%Uncertainty quantification is a critical aspect of machine learning models, providing important insights into the reliability of predictions and aiding the decision-making process in real-world applications. 
%
%allows for reasoning about uncertainty at individual class levels.
%
%By leveraging entropy- and variance-based measures, our proposal 

\begin{abstract}
We present a novel approach to uncertainty quantification in classification tasks based on label-wise decomposition of uncertainty measures. This label-wise perspective allows uncertainty to be quantified at the individual class level, thereby improving cost-sensitive decision-making and helping understand the sources of uncertainty. Furthermore, it allows total, aleatoric, and epistemic uncertainty to be defined on the basis of non-categorical measures such as variance, going beyond common entropy-based measures. In particular, variance-based measures address some of the limitations associated with established methods that have recently been discussed in the literature. We show that our proposed measures adhere to a number of desirable properties. Through empirical evaluation on a variety of benchmark data sets -- including applications in the medical domain where accurate uncertainty quantification is crucial -- we establish the effectiveness of label-wise uncertainty quantification. 
\end{abstract}


% Section 1
\section{Introduction}
\label{sec:intro}
%
Thanks to methods of unprecedented predictive power, machine learning (ML) is becoming more and more ingrained into people's lives.
It increasingly supports human decision-making processes in fields ranging from healthcare \citep{lambrou2010reliable, senge_2014_ReliableClassificationLearning, yang2009using, mobiny_2021_DropConnectEffectiveModeling} and autonomous driving \citep{michelmore_2018_EvaluatingUncertaintyQuantification} to socio-technical systems \citep{varshney2016engineering,varshney2017safety}.
The safety requirements of such applications trigger an urgent need to report \textit{uncertainty} alongside model predictions \citep{hullermeier2021aleatoric}.
Meaningful uncertainty estimates are indispensable for trust in ML-assisted decisions as they signal when a prediction is not confident enough to be relied upon.
%
 % 	\begin{figure}[t!]
	% 	\centering
	% 	\includegraphics[width=1\linewidth]{Figures/mnist_label_02.png}
 %        % \includegraphics[width=1\linewidth]{Figures/mnist_label_02.png}
	% 	\caption{Label-wise \textit{\textcolor{PythonRed}{
 %        aleatoric}} and \textit{\textcolor{PythonBlue}{epistemic}} uncertainties for MNIST instances.} 
	% 	\label{fig1}
 %    \vspace{-0.75cm}
	% \end{figure}

    \begin{figure}[t!]
\centering
\begin{subfigure}{.5\linewidth}
  \centering
  \includegraphics[width=1\linewidth]{figures_uai/ml_data/mnist_tu_0.pdf}
\end{subfigure}%
\begin{subfigure}{.5\linewidth}
  \centering
  \includegraphics[width=1\linewidth]{figures_uai/ml_data/mnist_tu_2.pdf}
\end{subfigure}
\caption{Label-wise \textit{\textcolor{PythonRed}{
        aleatoric}} and \textit{\textcolor{PythonBlue}{epistemic}} uncertainties for MNIST instances.} 
\label{fig1}
\vspace{-0.5cm}
\end{figure}
 
In order to address predictive uncertainty about a query instance $\boldsymbol{x}$ (e.g., an image like in Fig.~\ref{fig1}), it is often crucial to identify its source.
For one, uncertainty can arise through inherent stochasticity of the data-generating process, omitted variables or measurement errors \citep{gruber2023sources}.
As such, \emph{aleatoric} uncertainty (AU) is a fixed but unknown quantity.
In addition, a lack of knowledge about the best way to model the data-generating process induces \emph{epistemic} uncertainty (EU).
Under the assumption that the model class is correctly specified, collecting enough information will reduce the EU until it vanishes in the limit of infinite data \citep{hullermeier2021aleatoric}.
The attribution of uncertainty to its sources can inform decisions in various ways.
For instance, it might help practitioners realize that gathering more data will be futile when only AU is present, or guide sequential learning processes like active learning \citep{shelmanov_2021_ActiveLearningSequence, nguyen2022measure} and Bayesian optimization \citep{HOFFER2023109279, stanton2023bayesian} by seeking out promising parts of the search space that can be explored to reduce EU while avoiding uninformative areas with high AU. 
 

Quantifying both AU and EU necessitates a meaningful uncertainty \emph{representation}.
In supervised learning, we consider hypotheses in the form of probabilistic classifiers $h$ that map a query instance $\boldsymbol{x}$ to a probability distribution $p = h(\boldsymbol{x})$ on the label space. 
%, where $\lab$ denotes the outcome space and $\sigma(\mathcal{Y})$ a suitable $\sigma$-algebra on $\lab$.
This prediction provides an estimate of the ground-truth (conditional) probability  $p^*(\cdot \, | \, \boldsymbol{x})$, i.e., $p(y)$ estimates the true probability $p^*(y \, | \, \boldsymbol{x})$ of observing class label $y$ as outcome given $\boldsymbol{x}$. 
% on $(\lab, \sigma(\lab))$.
When predicting a single %numerical value or 
class label in a deterministic way, $p$ will be a Dirac measure.
The case of probabilistic classification, which we study in this work, is more informative in the sense that a (posterior) probability is associated with all possible class labels, giving rise to a natural notion of AU around the observed outcome $y$. 
However, such probabilistic expressions are point predictions (in the space of probability distributions) derived from a single hypothesis $h$ learned on the training data.
Since all other candidates in the hypothesis space are discarded in the process, $p$ cannot, by design, represent EU \citep{hullermeier2021aleatoric}.

Expressing EU requires a further level of uncertainty representation. 
A straightforward approach is to impose a \emph{second-order} distribution, effectively assigning a probability (density) to each candidate first-order distribution $p$, and equate the dispersion of this distribution with EU.
Both the classical Bayesian paradigm \citep{gelman2013bayesian} and evidential deep learning (EDL) methods \citep{UlmerHF23} follow this idea.
As an alternative, methods founded on more general theories of probability, such as imprecise probabilities  \citep{walley1991,augustin2014introduction}, have been considered \citep{corani2012bayesian, sale2023volume}.

In recent years, probabilistic classification has increasingly embraced a bi-level distributional approach, with a predominant reliance on \textit{Shannon entropy} to dissect uncertainty into its aleatoric and epistemic components \citep[e.g.,][]{kendall2017uncertainties, smithGal2018, Charpentier2022NaturalPN}.
%
In this approach, the entropy of the categorical output distribution over class labels is associated with the total predictive uncertainty for a query instance $\boldsymbol{x}$. By a well-known result from information theory \citep{cover1999elements}, this quantity decomposes additively into \textit{conditional entropy} (representing AU) and \textit{mutual information} (representing EU). While this set of measures may seem concise and intuitive, \citet{wimmer2023quantifying} recently pointed out that it does not fulfill certain properties that one would naturally expect to hold. Whereas these methods solely focus on quantifying uncertainty at a \textit{global} level, we argue that this perspective may not suffice for all decision-making scenarios. To address this gap, we propose an approach centered on \textit{label-wise} uncertainty quantification. This approach allows for a more nuanced understanding of uncertainty, enabling decision-makers to evaluate the uncertainty associated with individual class predictions. 

By adopting a label-wise perspective, our method facilitates more informed decision-making, especially in contexts where the consequences of incorrect predictions---as in medical scenarios---differ between classes. \newpage This perspective on uncertainty quantification not only preserves the global perspective inherent in traditional approaches, but also enhances it by providing insights at the class level. Moreover, since a label-wise decomposition of uncertainty measures effectively amounts to reducing multinomial to binary classification, our approach is no longer restricted to uncertainty measures for categorical variables, such as entropy. Instead, it is amenable to a much broader class of measures, including variance as arguably the most common statistical measure of dispersion. 

Our contributions are as follows: 
\begin{itemize}
    \item[\textbf{(1)}] We propose a \textit{label-wise} perspective enabling reasoning about uncertainty at the individual class level, aiding decision-making especially in scenarios where the stakes of incorrect predictions vary across classes. To this end, we leverage entropy- and variance-based measures for label-wise uncertainty quantification. 
    \item[\textbf{(2)}] We showcase that adopting a label-wise perspective retains the \textit{global} perspective at the same time. In this regard, we demonstrate that the proposed measures satisfy a set of desirable properties, enhancing their theoretical appeal. In particular, we show that our proposed variance-based measures overcome the drawbacks of the entropy-based approach, recently highlighted in the literature, without sacrificing practical applicability.
    \item[\textbf{(3)}] Through empirical evaluation, we validate the efficacy of our approach, demonstrating its competitiveness in (global) downstream tasks such as prediction with abstention and out-of-distribution detection.  Our empirical findings are substantiated across a range of classical machine learning benchmarks and verified in the medical domain, where suitable uncertainty quantification is indispensable. 
\end{itemize}

Proofs of our theoretical results can be found in Appendix~\ref{appendix:proofs}. For experimental details and supplementary experiments, refer to Appendix~\ref{appendix:exp_details} and Appendix~\ref{appendix:exp}, respectively.


%----------------------------------------------------%
% Section 2
\section{Quantifying Second-Order Uncertainty}
\label{sec:uq}
 %
    In the following, we will be concerned with the supervised classification scenario. 
    %
    We refer to $\mathcal{X}$ as \textit{instance} space, and we assume categorical target variables from a finite \textit{label} space $\lab = \{ y_1, \ldots, y_K \}$, where $K \in \mathbb{N}_{\geq 2}$. 
    %
    Thus, each instance $\vec{x} \in \mathcal{X}$ is associated with a conditional distribution on the measurable space $(\lab, 2^{\lab})$, such that $\theta_k \defeq p(y_k \given \boldsymbol{x} )$ is the probability of observing label $y_k \in \mathcal{Y}$ given $\vec{x} \in \mathcal{X}$.
    %
    Further, we note that the set of all probability measures on $(\lab, 2^{\lab})$ can be identified with the $(K-1)$-simplex
    $\ksimplex \defeq \left \{ \vtheta = (\theta_1, \ldots , \theta_K) \in [0,1]^K ~ \given~ \| \vtheta \|_1 = 1 \right \}$.
    %
    Consequently, for each $\vtheta \in \ksimplex$, an associated degree of aleatoric uncertainty can be calculated. \newpage To effectively represent epistemic uncertainty, it is necessary for the learner to express its uncertainty regarding $\vtheta$. This can be achieved by a second-order probability distribution over the first-order distributions $\vtheta$.
    %
    
    Two popular methods to obtain a second-order (predictive) distribution are Bayesian inference and Evidential Deep Learning. 
    %
    In both approaches, we arrive at a second-order predictor $h_2: \mathcal{X} \fromto \ksimplextwo$, where $\ksimplextwo$ denotes the set of all probability measures on $(\ksimplex, \sigma(\ksimplex))$; we call $Q \in \ksimplextwo$ a second-order distribution. For the sake of simplicity, we omit the conditioning on the query instance $\vec{x}$ in the notation. Hence, given an instance $\vec{x}$, the second-order distribution $Q$ represents our probabilistic knowledge about $\vtheta$, i.e., $Q(\vtheta)$ is the probability (density) of $\vtheta \in \ksimplex$.
    %
    In the remainder of this paper, we assume that a second-order distribution $Q$ is already provided.
    %

    Given an uncertainty representation in terms of a second-order distribution $Q \in \ksimplextwo$, the subsequent question is how to suitably quantify total, aleatoric, and epistemic uncertainty. Popular approaches to uncertainty quantification in the literature \citep{houlsby_2011_BayesianActiveLearning, gal_2016_UncertaintyDeepLearning, depeweg2018decomposition, smithGal2018, mobiny_2021_DropConnectEffectiveModeling} rely on information-theoretic measures derived from Shannon entropy \citep{shannon1948mathematical}. In the following section, we will revisit these commonly accepted entropy-based uncertainty measures and discuss meaningful properties that any uncertainty measure should possess.

%%
\subsection{Entropy-Based Measures}
\label{subsec:entropy}
 %
	We begin by revisiting the arguably most common entropy-based approach in machine learning for quantifying predictive uncertainty represented by a second-order distribution $Q$.
	%
    This approach leverages (Shannon) entropy and its connection to mutual information and conditional entropy to quantify total, aleatoric, and epistemic uncertainty associated with $Q$.

    Shannon entropy for a (first-order) probability distribution $\vtheta \in \ksimplex$ is given by
    \begin{align}
		\ent(\vtheta) \defeq - \sum_{k = 1}^{K} \theta_k \log_2 \theta_k \, .
		\label{eq:entropy}
	\end{align}
    Now, let $Y: \Omega \fromto \lab$ be a (discrete) random variable, and denote by $\vtheta_{Y}$ its corresponding distribution on the measurable space $(\lab, 2^{\lab})$. Then, we can analogously define the entropy of the random variable $Y$ by simply replacing $\theta_k$ in \eqref{eq:entropy} by the respective distribution of $Y$.
    Entropy has established itself as an accepted uncertainty measure due to both appealing theoretical properties and the intuitive interpretation as a measure of uncertainty. In particular, it measures the uniformity degree of the distribution of a random variable. 
    %
    
	Subsequently, following the notation of \citet{wimmer2023quantifying}, we assume that $\Theta \sim Q$. Therefore, $\Theta: \Omega \fromto \ksimplex$ is a random first-order distribution which is distributed according to a second-order distribution $Q$, and consequently takes values $\Theta(\omega) = \vtheta$ in the $(K-1)$-simplex $\ksimplex$.
    %
    

 Given a second-order distribution $Q$, we can consider its expectation given by 
	%
	\begin{align}
		\bar{\vtheta} \defeq \mathbb{E}_Q[\Theta] = \int_{\ksimplex} \vtheta \; \mathrm{d}Q(\vtheta) \, ,
		\label{eq:aggregation}
	\end{align} 
	%
	which yields a probability distribution $\bar{\vtheta}$ on $(\lab, 2^{\lab})$. This measure corresponds to the distribution of $Y$ when we view it as generated from first sampling $\Theta \sim Q$ and then $Y$ according to $\Theta$. Then, it is natural to define the measure of total uncertainty (TU) as the entropy \eqref{eq:entropy} of $\bar{\vtheta}$:
    %
	\begin{align}
		\TU(Q) \defeq  \ent \left( \mathbb{E}_Q[\Theta]   \right) \, .
		\label{tu:entropy}
	\end{align}
   % with $\ent(\cdot)$ as defined in \eqref{eq:entropy}.
    Similarly, aleatoric uncertainty (AU) can be defined in terms of \textit{conditional entropy} $\ent(Y\, | \, \Theta)$:
	\begin{align}
		\AU(Q) \defeq  \mathbb{E}_Q[ \ent(Y\, | \, \Theta) ] = \int_{\ksimplex} \ent(\vtheta) \; \mathrm{d}Q(\vtheta) \, .
		\label{au:entropy}
	\end{align}
    By fixing a first-order distribution $\vtheta \in \ksimplex$, all EU is essentially removed and only AU remains. However, as $\vtheta$ is not precisely known, we take the expectation with respect to the second-order distribution.
	%
    The measure of epistemic uncertainty is particularly inspired by the widely known additive decomposition of entropy into \textit{conditional entropy} and \textit{mutual information} (see also Section 2.4 in \cite{cover1999elements}). This is expressed as follows:
	\begin{align}
		\underbrace{\ent(Y)}_{\textnormal{entropy}} = \underbrace{\ent(Y\, | \, \Theta)}_{\textnormal{conditional entropy}}+ \underbrace{\mi(Y, \Theta)}_{\textnormal{mutual information}} \, .
		\label{eu:entropy}
	\end{align}
	Rearranging \eqref{eu:entropy} for mutual information yields a measure of epistemic uncertainty
	\begin{align} \label{eu:entropy_2}
		\EU(Q) \defeq \mi(Y, \Theta) = \ent(Y)- \ent(Y\, | \, \Theta) \, .
	\end{align}
	% where $D_{KL}(\cdot \, \| \, \cdot)$ denotes the Kullback-Leibler (KL) divergence \citep{kullback1951information}.
    %
    While entropy, mutual information, and conditional entropy provide meaningful interpretations for quantifying uncertainties  within first-order predictive distributions, the suitability of these entropy-based measures for second-order quantification has been challenged by \cite{wimmer2023quantifying}.
    %
    This criticism was substantiated on the basis of a set of desirable properties, which will be discussed next. 

%%
\subsection{Desirable Properties}
\label{subsection:axioms}
    %
    In this section we discuss desirable properties that any suitable uncertainty measure should fulfill. 
    In the (uncertainty) literature it is standard practice to establish measures based on a set of axioms \citep{pal1993uncertainty, bronevich2008axioms}. Such an axiomatic approach was also adopted in the recent machine learning literature \citep{hullermeier2022quantification, sale2023secondorder}.
    %
    To this end, we revisit the axioms outlined by \cite{wimmer2023quantifying}, while also taking into account recently proposed properties that further refine the understanding of what constitutes a suitable measure of second-order uncertainty \citep{sale2023secondorder}.
    %
    Before discussing the proposed axioms, we first provide some mathematical preliminaries.
    \begin{definition} \label{def:shifts}
    Let $\vec{\Theta} \sim Q,\, \vec{\Theta}^{\prime} \sim Q^{\prime}$ be two random vectors, where $Q, Q^{\prime} \in \ksimplextwo$. Denote by $\sigma(\vec{\Theta})$ the $\sigma$-algebra generated by the random vector $\vec{\Theta}$.
    Then we call $Q^{\prime}$
    \begin{itemize}
        \item[(i)] a mean-preserving spread of $Q$, iff $\vec{\Theta}^\prime \overset{d}{=} \vec{\Theta} + \vec{Z}$, for some random vector $\vec{Z}$ with $\mathbb{E}[\vec{Z} \given \sigma(\vec{\Theta})] = 0$ almost surely (a.s.) and $ \max_k \Var(Z_k) > 0$;
        \item[(ii)] a spread-preserving location shift of $Q$, iff $\vec{\Theta}^\prime \overset{d}{=} \vec{\Theta} + \vec{z}$, where $\vec{z} \neq 0$ is a constant;
        \item[(iii)] a spread-preserving center-shift of $Q$, iff it is a spread-preserving location shift with $\mathbb E[\vec{\Theta}'] = \lambda \mathbb E[\vec{\Theta}] + (1 - \lambda) (1/K, \dots, 1/K)^\top$ for some $\lambda \in (0, 1)$.
    \end{itemize}
    Note that for definitions (ii) and (iii) it should be guaranteed that the shifted probability measure $Q^{\prime}$ remains valid within its support. 
    \end{definition}
    %
    Now, let $\TU$, $\AU$, and $\EU$ denote, respectively, measures $\ksimplextwo \to \mathbb{R}_{\geq 0}$ of total, aleatoric, and epistemic uncertainty associated with a second-order uncertainty representation $Q \in \ksimplextwo$.
    %
    \cite{wimmer2023quantifying} propose that any uncertainty measure should fulfill (at least) the following set of axioms:

\begin{itemize}
    \item[A0] $\TU$, $\AU$, and $\EU$ are non-negative.
    \item[A1] $\EU(Q) = 0$, if and only if $Q = \delta_{\vtheta}$, where $\delta_{\vtheta}$ denotes the Dirac measure on some $\vtheta \in \ksimplex$.
    \item[A2] $\EU$ and $\TU$ are maximal for $Q$ being the uniform distribution on $\ksimplex$.
    \item[A3] If $Q'$ is a mean-preserving spread of $Q$, then $\EU(Q') \geq \EU(Q)$ (weak version) or $\EU(Q') > \EU(Q)$ (strict version); the same holds for $\TU$.
    \item[A4] If $Q'$ is a spread-preserving center-shift of $Q$, then $\AU(Q') \geq \AU(Q)$ (weak version) or $\AU(Q') > \AU(Q)$ (strict version), the same holds for $\TU$.
    \item[A5] If $Q'$ is a spread-preserving location shift of $Q$, then $\EU(Q') = \EU(Q)$.
\end{itemize}
%
Axiom A0 is an obvious requirement, ensuring that such measures reflect a degree of uncertainty without implying the absence of information or negative uncertainty, which would be conceptually unsound. 
%
Axiom A1 addresses the behavior of $\EU$ in the context of Dirac measures, where a Dirac measure $\delta_{\vtheta}$ represents a scenario of complete certainty about $\vtheta \in \ksimplex$. The vanishing of $\EU$ in this context aligns with the intuitive understanding that epistemic uncertainty should be zero when there is absolute certainty about the true underlying model.
%
Further, Axiom A2 considers the condition under which $\EU$ and $\TU$ attain their maximal values, specifically when $Q$ is the uniform distribution on $\ksimplex$. This reflects situations of maximum uncertainty or ignorance, where the lack of knowledge about any specific outcome $\vtheta \in \ksimplex$ leads to the highest level of uncertainty. As we will discuss later, this axiom is not without controversy, particularly in the fields of statistics and decision theory.
%
Axiom A3 encapsulates the idea that spreading a distribution while preserving its mean should not reduce, and might increase, the epistemic (and thus, total) uncertainty. It underscores the notion that increased dispersion (while maintaining the mean) is associated with higher uncertainty, a concept that is central in statistics.
% \footnote{According to \cite{wimmer2023quantifying}, this axiom is violated by the entropy-based approach. As we show in Proposition \ref{corrigendum}, however, this claim seems to be incorrect.}
%
Conversely, leaving the dispersion constant but shifting the distribution closer to the barycenter of the simplex, thereby expressing a belief about $\vtheta$ that is closer to uniform, should be reflected by an increase in AU (Axiom A4).
%
Lastly, Axiom A5 asserts that a spread-preserving location shift, which alters the distribution's location without affecting its spread, should leave the epistemic uncertainty unchanged. This property highlights the distinct nature of epistemic uncertainty, which is sensitive to the spread of the distribution rather than its location \citep{hullermeier2022quantification}. 

Taking into consideration recently proposed criteria for measures of second-order uncertainty \citep{sale2023secondorder}, we expand the existing set of Axioms A0--A5 by introducing two additional properties. For the set of all mixtures of second-order Dirac measures on first-order Dirac measures we write
 \begin{align*} 
\Delta_{\delta_m} = \Big\{ \delta_m \in  \ksimplextwo \, : \,  
\delta_m =   \sum_{y \in \lab} \lambda_y \cdot \delta_{\delta_{y}}, \, \sum_{y \in \lab} \lambda_y = 1 \Big\} \,,
\end{align*}
where $\delta_{\delta_y}$ denotes the second-order Dirac measure on $\delta_y \in \ksimplex$ for $y \in \lab$. 
Each element in this set should arguably have no aleatoric uncertainty, such that we postulate the following Axiom A6. 
\begin{itemize}
    \item[A6] $\AU(\delta_{m}) = 0$ holds for any $\delta_m \in \Delta_{\delta_m}$.
\end{itemize}
%
Now, let $\lab_1$ and $\lab_2$ be partitions of $\lab$ and $Q \in \ksimplextwo$; further denote by $Q_{|\lab_i}$ the corresponding marginalized distribution for $i \in \{1, 2\}$. 

\begin{itemize}
\item[A7] $\TU_{\lab}(Q) \leq \TU_{\lab_1}(Q_{|\lab_1}) + \TU_{\lab_2}(Q_{|\lab_2})$, and the same holds for $\AU$ and $\EU$.
\end{itemize}

Axiom A7 guarantees that the total uncertainty of a second-order distribution is bounded by the sum of total uncertainties of its corresponding marginalizations.


\section{Label-wise Uncertainty Quantification}
\label{sec:label}
%
In this section, we propose to measure uncertainty in a label-wise manner, and to obtain the overall uncertainty associated with a prediction by aggregating the uncertainties across the individual labels. This approach allows us to adopt a \textit{label-wise} perspective while retaining the \textit{global} one.

Let us emphasize again that the label-wise perspective is particularly useful in settings where decisions following the prediction of different labels are associated with unequal costs. For instance, when predicting the sub-type of a certain medical condition, with costly treatment administered at occurrence of one of the sub-types, the marginal uncertainty about this category might be of particular interest. We present some experimental results on medical images in Section~\ref{sub:classwise}. The global view, %captured by measures (\ref{tu:label_general}--\ref{eu:label_general}), 
on the other hand, is crucial in scenarios where understanding the overall uncertainty is key to making informed decisions. For instance, $\TU$ serves as an indicator for the overall reliability of the model for the given observation. Meanwhile, $\AU$ and 
$\EU$ distinguish between the uncertainty arising from the data's inherent variability and that stemming from the model's knowledge limitations, respectively. 


We denote by $\vec{Y}: \Omega \fromto \{0,1\}^K$ the $K$-dimensional random vector indicating the presence or absence of a particular label $y_k \in \mathcal{Y}$ for $k \in \{1, \dots, K\}$. Further, define $\Theta_k \defeq P(Y_k = 1)$ and assume that the random vector $\vec{\Theta} = (\Theta_1, \dots, \Theta_K)$ is distributed according to a second-order distribution $Q \in \ksimplextwo$, i.e., $\vec{\Theta} \sim Q$. Moreover, let $Q_k$ be the marginal distribution of the random variable $\Theta_k$, such that $\Theta_k \sim Q_k$, and denote its expectation by $\bar{\theta}_k = \mathbb{E}[\Theta_k]$. 
%This allows us now to define suitable measures of total, aleatoric, and epistemic uncertainty associated with each marginal distribution $Q_k$, effectively quantifying (predictive) uncertainty of each label.


Our general approach to label-wise uncertainty quantification adheres to the following template: First, we define \emph{local} measures of total, aleatoric, and epistemic uncertainty per label: $\TU(Q_k)$, $\AU(Q_k)$, $\EU(Q_k)$ for $k \in \{1, \ldots, K \}$. One way to define these measures in a meaningful way is to adopt a loss-based perspective: Consider a learner making probabilistic predictions $\hat{\theta}$ for a (binary) outcome $Y_k$, which are penalized with a loss $\phi(  \hat{\theta} , Y_k)$. If $Y_k$ is distributed according to $\theta_k$, then the expected loss is given by
\begin{equation}\label{eq:epsr}
\phi(\hat{\theta}, \theta_k) \defeq \mathbb{E}_{Y_k \sim \theta_k} \, \phi(  \hat{\theta} , Y_k) \, .
\end{equation}
In our case, $\theta_k$ itself is presumably distributed according to the second-order distribution $Q_k$, so the prediction $\hat{\theta}$ induces the expected loss
\begin{equation}\label{eq:etl}
\mathbb{E}_{\theta_k \sim Q_k} \, \phi( \hat{\theta} , \theta_k ) = \mathbb{E}_{\theta_k \sim Q_k} \, \mathbb{E}_{Y_k \sim \theta_k} \, \phi(  \hat{\theta} , Y_k) \, .
\end{equation}
Broadly speaking, the idea is as follows: If the expected loss (\ref{eq:etl}) can be kept small, by virtue of an appropriate prediction $\hat{\theta}$, then this signifies a situation of low (total) uncertainty. Otherwise, if this is not possible, then the uncertainty is high. More specifically, we suggest the following definitions for the three types of uncertainty: 
\begin{itemize}
\item Total uncertainty is the minimum of (\ref{eq:etl}), i.e., the expected loss of the risk-minimizing prediction $\hat{\theta}$ given knowledge of the second-order distribution $Q_k$:
% (i.e., knowing that $\theta$ is drawn from $Q$ and then $Y$ according to $\theta$):
\begin{equation}\label{eq:lwt}
\TU(Q_k) \defeq  \min_{\hat{\theta}} \, \mathbb{E}_{\theta_k \sim Q_k} \, \phi( \hat{\theta} , \theta_k ) 
\end{equation}

\item Aleatoric uncertainty is the expected loss of the risk-minimizing prediction $\hat{\theta}$ given knowledge about the true $\theta_k$ (sampled from $Q_k$). 
\begin{equation}\label{eq:lwa}
\AU(Q_k) \defeq  \mathbb{E}_{\theta_k \sim Q_k} \, \min_{\hat{\theta}} \, \phi( \hat{\theta} , \theta_k ) 
\end{equation}

\item Epistemic uncertainty is the difference between these two, i.e., the extra loss that is caused by the lack of knowledge about the true $\theta_k$:
 \begin{equation}\label{eq:lwe}
\EU(Q_k) \defeq  \TU(Q_k)  - \AU(Q_k) 
\end{equation}

\end{itemize}
%
In particular, total uncertainty reflects an optimistic perspective inherent in the idea of quantifying uncertainty in terms of \textit{unavoidable loss}. To illustrate, let us consider the following: Given a second-order distribution $Q_k$, from which a distribution $\theta_k$ will be sampled, one aims to predict $\hat{\theta}$ and will then incur the loss $\phi(\hat{\theta}, \theta_k)$. The objective is to minimize the expected loss, hence the minimum in \eqref{eq:lwt}. Success in minimizing this expected loss implies that $Q_k$ is ``peaked'' or close to a Dirac measure, indicating low uncertainty. Conversely, if $Q_k$ is widely spread and not very informative, the uncertainty is high, and even the optimal prediction $\hat{\theta}$ cannot ensure a low loss. This explains the rationale behind total uncertainty \eqref{eq:lwt}.

The additive relationship of the (global) entropy-based measures has been a subject of debate in the literature \citep{wimmer2023quantifying}. In our framework, it can be justified as follows: As discussed before, $\TU$ represents the unavoidable loss in predicting $\theta_k$, incorporating an epistemic component since the true data-generating process $\theta_k$ is unknown and only characterized by $Q_k$. This epistemic uncertainty would vanish if $\theta_k$ were known, leaving only aleatoric uncertainty. With $\theta_k$ known, the best prediction aligns with $\hat{\theta} = \theta_k$, resulting in a loss $\phi(\theta_k, \theta_k)$; for instance, in the case of log-loss, this equates to Shannon entropy. Therefore, $\AU$ is defined as the expectation with respect to $Q_k$ of this residual loss, as per Equation~\eqref{eq:lwa}. Consequently, $\EU$ is measured by the difference between $\TU$ and $\AU$, indicating the extent (in expectation) to which the unavoidable loss can be mitigated by eliminating epistemic uncertainty.
%


% Then, based on the local measures, we define corresponding \emph{global} uncertainty measures by summing over all label-wise uncertainties:
Nevertheless, certain scenarios call for a \textit{global} perspective on predictive uncertainty. To obtain corresponding measures, the most obvious idea is to define total, aleatoric, and epistemic uncertainty associated with a second-order distribution $Q \in \ksimplextwo$ by summing over all label-wise uncertainties:
\begin{align}
\TU(Q) &\coloneqq \sum_{k = 1}^{K} \TU(Q_k)
\label{tu:label_general}\\
\AU(Q) &\coloneqq \sum_{k = 1}^{K} \AU(Q_k)
\label{au:label_general}\\
\EU(Q) &\coloneqq \sum_{k = 1}^{K} \EU(Q_k) 
\label{eu:label_general}
\end{align}



As mentioned, one advantage of the label-wise decomposition, which goes hand in hand with a binarization of the problem (the $Y_k$ are binary outcomes), is that it broadens the scope of measures that can be applied. 
%Although the label-wise perspective is of course not reserved for any specific uncertainty measure, 
Our approach as outlined above is ``parameterized'' by the loss function $\phi$. Natural candidates for this loss are (strictly) proper scoring rules \citep{gnei_sp05}, which have the meaningful property that the risk-minimizer $\hat{\theta}$ in (\ref{eq:epsr}) coincides with $\theta_k$ itself; therefore, total and aleatoric uncertainty become, respectively,
\begin{align}
\TU(Q_k) & =   \phi( \bar{\theta}_k , \bar{\theta}_k )  =  \phi(  \mathbb{E}[\Theta_k] ,  \mathbb{E}[\Theta_k] ) \label{eq:lwts} \\[0.1cm]
\AU(Q_k) & =   \mathbb{E}_{\theta_k \sim Q_k} \, \phi( \theta_k , \theta_k )  \label{eq:lwas}
\end{align}

Our construction extends the well-established information-theoretic decomposition of Shannon entropy into conditional entropy and mutual information, which are the most widely used measures for uncertainty quantification in machine learning. Specifically, when the loss function $\phi$ is the log-loss, entropy-based measures are recovered as a special case. Our generalization allows for the use of losses other than log-loss, such as the squared-error loss (which gives rise to variance-based measures), thereby broadening the scope and applicability of our framework. Furthermore, we demonstrate desirable properties of this generalization.

In the following, we propose two concrete instantiations: The log-loss $\phi(\hat{\theta} , Y) = - ( Y \log(\hat{\theta})+ (1-Y) \log(1- \hat{\theta}))$, 
which leads to entropy as (total) uncertainty, and the squared-error loss $\phi(\hat{\theta} , Y) = (\hat{\theta} - Y)^2$, which leads to variance as uncertainty measure. 
% $\phi(\hat{\theta} , Y) = - \log( Y \hat{\theta} + (1-Y) (1- \hat{\theta}))$,
%The common entropy-based approach is an instantiation of this decomposition for the case of log-loss, the variance-based approach for the case of squared-error loss. 

We note that variance is an example of a measure that can be applied to the binary case but not to the categorical case in general. However, our label-wise approach addresses this issue, enabling the effective use of variance-based uncertainty measures for classification purposes. 

% For entropy, a binarization is in principle not needed. We nevertheless include it as another example, due its prevalence in the literature, and also to compare it with its original (non-binarized) version. 
% For entropy, a binarization is in principle not need. 


% Label-wise Entropy
\subsection{Entropy-based Measures}
%
%Given the prevalence of global entropy-based measures in uncertainty quantification tasks, we first propose uncertainty measures to employ entropy on a \textit{per-label} basis. Building upon the binarization of the categorical label variable $Y$, as outlined earlier, we can now assess uncertainty with respect to a single label.

In complete analogy to global entropy-based measures \eqref{tu:entropy}, \eqref{au:entropy}, and \eqref{eu:entropy}, we can define the corresponding label-wise counterparts for all $k \in \{1,\dots,K\}$:

\begin{itemize}
    \item Label-wise total uncertainty (\ref{eq:lwts}) is given by $\ent(\bar{\theta}_k) = \ent(\mathbb{E}[\Theta_k])$. 
    %reflects the overall uncertainty associated with label $Y_k$, encapsulating both the aleatoric and epistemic components. 
    \item Label-wise aleatoric uncertainty (\ref{eq:lwas}) is given by expected conditional entropy $\mathbb{E}[\ent(Y_k \given \Theta_k)]$.
    %pertains to the inherent variability or randomness within the outcome of $Y_k$ given $\theta_k$. Since $\theta_k$ is not precisely known, we take the expectation with respect to our current best knowledge of $\theta_k$, which is encapsulated in the second-order distribution $Q_k$.
    \item Label-wise epistemic uncertainty is given by the expected KL-divergence $\mathbb{E}[\kl(\Theta_k \, || \, \bar{\theta}_k)]$.
\end{itemize}
The corresponding global measures (\ref{tu:label_general}--\ref{eu:label_general}) are then given as follows:
%Nevertheless, certain scenarios call for a global perspective on predictive uncertainty.  To obtain corresponding measures, the most obvious idea is to define total, aleatoric, and epistemic uncertainty associated with a second-order distribution $Q \in \ksimplextwo$ by summing over all label-wise uncertainties:
\begin{align}
\TU(Q) &\coloneqq \sum_{k = 1}^{K} \ent(\mathbb{E}[\Theta_k])
\label{tu:label_entropy}\\
\AU(Q) &\coloneqq \sum_{k = 1}^{K} \mathbb{E}[\ent(Y_k\given\Theta_k)]
\label{au:label_entropy}\\
\EU(Q) &\coloneqq \sum_{k = 1}^{K} \mathbb{E}[\kl(\Theta_k \, || \, \bar{\theta}_k)]
\label{eu:label_entropy}
\end{align}
%

In the following, we demonstrate which of the properties discussed in Section~\ref{subsection:axioms} are fulfilled by the entropy-based measures constructed in a label-wise manner. 

\begin{theorem}
\label{thm:entropy_axioms}
Entropy-based measures \eqref{tu:label_entropy}, \eqref{au:label_entropy}, and \eqref{eu:label_entropy} satisfy Axioms A0, A1, A2 (only for $\TU$), A3 (strict version), A4 (strict version, only for $\TU$), A6, and A7. 
\end{theorem}


% \newpage 
% Variance-based Measures
\subsection{Variance-based Measures}
\label{sec:proposal}
%
%Variance represents an alternative and natural measure of uncertainty, yet, in contrast to entropy, it is not directly applicable to the case of a categorical variables. 
%
%However, our label-wise approach addresses this issue by employing the binarization of categorical variables, enabling the effective use of variance-based uncertainty measures for classification purposes. 

%2
Here, we leverage the \textit{law of total variance}: for any random variable  $X \in L^2(\Omega, \mathcal{A}, P)$ and sub-$\sigma$-algebra  $\mathcal{F} \subseteq \mathcal{A}$, 
\begin{align*}
    \Var(X) = \mathbb{E}[ \underbrace{\mathbb{E}[(X - \mathbb{E}[X \given \mathcal{F}])^2 \given \mathcal{F}]}_{\eqqcolon \Var(X \given \mathcal{F}) }]   + \Var(\mathbb{E}[X \given \mathcal{F}])   \, .
    % \label{eq:lawvariance}
\end{align*}
%
%
Then, %by the law of total variance and 
observing that $\sigma(\Theta_k) \subseteq \mathcal{A}$ for any $k \in \{1,\dots,K\}$, we get
\begin{align*}
 \Var(Y_k)&= \mathbb{E}[\Var(Y_k \given \sigma(\Theta_k)) ] + \Var(\mathbb{E}[Y_k \given \sigma(\Theta_k)] ) \\[0.2cm]
   &= \mathbb{E}[\Theta_k \cdot (1 - \Theta_k)] + \Var(\Theta_k) \, .
\end{align*}
%
This equality suggests an alternative definition of total uncertainty and its (additive) decomposition into an aleatoric and an epistemic part:
\begin{itemize}
    \item Label-wise total uncertainty is given by $\Var(Y_k)$ and is obtained as an instantiation of  (\ref{eq:lwts}) with $\phi$ the squared-error loss. 
    %It takes the role of Shannon entropy in the entropy-based approach. Just like the latter corresponds to the expected log-loss (of the risk-minimizing prediction $\mathbb{E}[\Theta_k]$), $\Var(Y_k)$ is the expected squared-error loss. 
    
    \item Label-wise aleatoric uncertainty (\ref{eq:lwas}) is captured by $ \mathbb{E}[\Theta_k \cdot (1 - \Theta_k)]$, reflecting the inherent randomness in the outcome of each $Y_k$.
    %it reflects how much the occurrence of each label is subject to chance, particularly in cases where predicting the label is inherently uncertain due to the variability of the data-generating process itself. 
    Just like conditional entropy, it can be seen as the (expected) ``conditional variance'' of $Y_k$ and corresponds to the expected squared-error loss provided the true value of $\Theta_k$ is given. 

    \item Label-wise epistemic uncertainty is quantified by $\Var(\Theta_k)$. 
    %It captures the dispersion of $\Theta_k$, indicating a level of uncertainty due to limited data or imperfect modeling. 
    Just like mutual information corresponds to the expected reduction in log-loss achieved by the knowledge of $\Theta_k$, $\Var(\Theta_k)$ is the expected reduction of squared-error loss.
\end{itemize}
%
% We note that the label-wise perspective is particularly useful in settings where decisions following the prediction of different labels are associated with unequal costs. For instance, when predicting the sub-type of a certain medical condition, with costly treatment administered at occurrence of one of the sub-types, the marginal uncertainty about this category might be of particular interest. We show some illustrative experimental results on medical images in Section~\ref{sub:classwise}.

%
 	\begin{figure*}[ht!]
		\centering
		\includegraphics[width=0.9\linewidth]{figures_uai/med_data/image_label_uncertainty_tu_neg_4_class_LUNG_CANCER_pred_0.pdf}
		\caption{Coronal 2D image from a patient with malignant lesions. Left: CT, middle: PET with \textit{
        \textcolor{Orange}{segmentation}} by medical experts, right: corresponding \textit{\textcolor{PythonRed}{
        aleatoric}} and \textit{\textcolor{PythonBlue}{epistemic}} uncertainties, \textit{\textcolor{Green}{ground truth class}} and \textit{\underline{predicted class}}.}
		\label{fig:medical_example}
    \vspace{-0.25cm}
	\end{figure*}
%

% Nevertheless, certain scenarios call for a global perspective on predictive uncertainty. 
% To obtain corresponding measures, the most obvious idea is to define total, aleatoric, and epistemic uncertainty associated with a second-order distribution $Q \in \ksimplextwo$ by summing over all label-wise uncertainties, taking their relative importance into account:
% \begin{align}
% \TU(Q) &\coloneqq \sum_{k = 1}^{K} w_k \Var(Y_k)
% \label{tu:variance}\\
% \AU(Q) &\coloneqq \sum_{k = 1}^{K} w_k \mathbb{E}[\Theta_k \cdot (1 - \Theta_k)]
% \label{au:variance}\\
% \EU(Q) &\coloneqq \sum_{k = 1}^{K} w_k \Var(\Theta_k),
% \label{eu:variance}
% \end{align}
% with importance weights $w_1, \dots, w_K \ge 0$.
% %

% This global view is crucial in scenarios where understanding the overall uncertainty is key to making informed decisions. For instance, $\TU$ serves as an indicator for the overall reliability of the model. Meanwhile, $\AU$ and 
% $\EU$ distinguish between the uncertainty arising from the data's inherent variability and that stemming from the model's knowledge limitations, respectively. 

% We now demonstrate that variance-based measures \eqref{tu:variance}, \eqref{au:variance}, and \eqref{eu:variance} satisfy a number of important properties.

% \begin{theorem}
% \label{thm:axioms}
% Variance-based measures \eqref{tu:variance}, \eqref{au:variance}, and \eqref{eu:variance} satisfy Axioms A0, A1, A3 (strict version), A5--A7 for any $w_1, \dots, w_K > 0$, and A4 (strict version) if additionally $w_1 = \dots = w_K$. 
% \end{theorem}

% Axiom A4 is only satisfied for equal weights. This is a feature, not a deficiency: the `center' $(1 / K, \dots, 1 / K)$ of the simplex loses its meaning as the canonical point of maximal AU when we weigh uncertainty about individual labels differently. For potentially unequal weights, we can prove a more general property, however.
% \begin{lemma} \label{lem:concavity}
%     For any $w_1, \dots, w_K > 0$, the function $V(\vec{\theta}) = \sum_{k = 1}^K w_k \theta_k(1 - \theta_k)$ is strictly concave on $\Delta_K$ with unique maximizer $\vec{\beta}$ defined by
%     \begin{align*}
%         \beta_k = \frac 1 2\biggl(1 - \frac{(K - 2) / w_k}{ \sum_{k = 1}^K 1/ w_k }\biggr).
%     \end{align*}
% \end{lemma}


% \begin{corollary} \label{cor:A4}
%     Let $w_1, \dots, w_K > 0$ and $\vec{\beta} = \arg\max_{\vec{\theta} \in \Delta_K} \sum_{k = 1} w_k \theta_k(1 - \theta_k)$ and define $Q'$ as a spread-preserving location shift such that $\mathbb E[\vec{X}'] = \lambda \mathbb E[\vec{X}] + (1 - \lambda) \vec{\beta}$ for some $\lambda \in (0, 1)$.
%     Then $\mathrm{AU}(Q') > \mathrm{AU}(Q)$ and $\mathrm{TU}(Q') > \mathrm{TU}(Q)$.
% \end{corollary}
% The inequality for TU follows directly from Lemma \ref{lem:concavity}. The inequality for AU is then implied by $\mathrm{EU}(Q') = \mathrm{EU}(Q)$ (Axiom A5).
% One can verify that for $w_1 = \dots = w_K$, we have $\vec{\beta} = (1, \dots, 1)/ K$, so $Q'$ is a spread-preserving center-shift in the sense of Definition \ref{def:shifts}.

% Nevertheless, certain scenarios call for a global perspective on predictive uncertainty. 
% To obtain corresponding measures, the most obvious idea is to define total, aleatoric, and epistemic uncertainty associated with a second-order distribution $Q \in \ksimplextwo$ 
Summing over all label-wise uncertainties yields global measures of total, aleatoric, and epistemic uncertainty:
\begin{align}
\TU(Q) &\coloneqq \sum_{k = 1}^{K} \Var(Y_k)
\label{tu:variance}\\
\AU(Q) &\coloneqq \sum_{k = 1}^{K} \mathbb{E}[\Theta_k  (1 - \Theta_k)]
\label{au:variance}\\
\EU(Q) &\coloneqq \sum_{k = 1}^{K} \Var(\Theta_k)
\label{eu:variance}
\end{align}

%
% This global view is crucial in scenarios where understanding the overall uncertainty is key to making informed decisions. For instance, $\TU$ serves as an indicator for the overall reliability of the model. Meanwhile, $\AU$ and 
% $\EU$ distinguish between the uncertainty arising from the data's inherent variability and that stemming from the model's knowledge limitations, respectively. 

Finally, we demonstrate that variance-based measures \eqref{tu:variance}, \eqref{au:variance}, and \eqref{eu:variance} satisfy a number of desirable properties, discussed in depth in Section~\ref{subsection:axioms}.

\begin{theorem}
\label{thm:axioms}
Variance-based measures \eqref{tu:variance}, \eqref{au:variance}, and \eqref{eu:variance} satisfy Axioms A0, A1, A2 (only for $\TU$), A3 (strict version), A4 (strict version) and A5--A7.
\end{theorem}

Let us highlight that, while entropy-based measures do not fulfill property A5, as previously pointed out by \citet{wimmer2023quantifying}, this property is now met by the variance-based measures.

On a further note, let us remark that the idea of using variance-based uncertainty measures is not completely new. In particular, a decomposition derived from the law of total variance has been used in regression problems for quite some time \citep{depeweg2018decomposition}.
Moreover, \citet{duan2023evidential} introduce variance-based uncertainty measures for classification, yet they motivate this from the EDL paradigm and do not discuss any theoretical properties.

% Experimental section
\section{Experiments}
%
In this experimental section, our aim is twofold. Firstly, we empirically illustrate the practical efficacy of the \textit{label-wise} uncertainty quantification approach, as motivated in the preceding sections. This is achieved through experiments conducted on medical data sets, where uncertainty quantification is deemed particularly critical. Our results not only reinforce the theoretical underpinnings discussed earlier but also highlight the importance of reliable uncertainty quantification in high-stakes medical applications.
% Our code is published in a GitHub repository\footnote{Experimental code is available \href{https://github.com/YSale/uq-variance.git}{here}. The code will be made available in a public repository upon publication. \textcolor{blue}{!Change!}} 

Secondly, additional experiments on common benchmark data sets are designed to illustrate that adopting a label-wise perspective does not come at the expense of a \textit{global} viewpoint. 
Due to the fundamental lack of a ground truth in studying uncertainty (as opposed to predictive performance where ground-truth labels are usually available), it is challenging to assess the quality of the uncertainty estimates. 

As such, we study the (global) effectiveness of the proposed measures in two different tasks: prediction with abstention and out-of-distribution (OoD) detection.

Details on model architecture and training setup as well as additional experiments can be found in Appendix \ref{appendix:exp_details} and Appendix \ref{appendix:exp}, respectively. The code is available in a public repository (\url{https://github.com/YSale/label-uq}). 


\subsection{Label-wise Uncertainties}
\label{sub:classwise}
%
For the evaluation in the medical domain, we use a data set of Positron Emission Tomography/Computed Tomography (PET/CT) images which comprises 501 full-body scans from patients with malignant lymphoma, melanoma, and lung cancer, as well as 513 scans from individuals without malignant lesions (negative controls) \citep{Datasetgatidis2022whole}. Each scan is annotated with a tumor segmentation performed by a medical expert. We extract from each 3D CT and PET volume multiple 2D images which are used to train a deep neural network ensemble and evaluate the label-wise uncertainty quantification. 

Figure~\ref{fig:medical_example} depicts a qualitative example of a 2D PET/CT image from the data set with the corresponding label-wise uncertainties from our evaluation. We observe low aleatoric uncertainty and negligible epistemic uncertainty for the melanoma class. This implies that the approximation of the aleatoric uncertainty is reliable. In contrast, the classes lung cancer and lymphoma are associated with high aleatoric and high epistemic uncertainty, suggesting that the prediction with respect to these classes may not be accurate. This observation is plausible from a medical perspective as we observe multiple tumors in the image which are not limited to the lung area and thus might indicate a different tumor class as well. In this instance, we could request a medical expert to annotate additional data for the classes lung cancer and lymphoma, thereby diminishing the epistemic uncertainty associated with these classes. Here, a global measure of uncertainty would only give epistemic uncertainty with respect to all classes, meaning a doctor would have to annotate data for all classes.
% We can observe a low uncertainty for the negative class corresponding to a high certainty that the patient shows some malignant lesions. While predicting the lung cancer label, the high epistemic uncertainty for the lung cancer and lymphoma label indicates a high model uncertainty between those two classes. This observation is plausible from a medical perspective as we observe multiple tumors in the image which are not limited to the lung area and therefore might indicate a different tumor class as well. 

Having a more detailed understanding of the label-wise uncertainties is crucial for the decision-making in medical applications supporting a given diagnosis. Moreover, it helps allocate resources to the relevant classes and saves valuable examination time and costs.

% To illustrate the usefulness of a label-wise uncertainty measure, Figure \ref{fig:mnistmain} shows six representative high-uncertainty instances from the MNIST data set. 

In Appendix \ref{app:label}, we provide further examples of images with the highest total, aleatoric, and epistemic uncertainty.

% Accuracy Rejection Curves
\subsection{Accuracy-Rejection Curves}
\label{subsec:arcs}
%
We generate Accuracy-Rejection Curves (ARCs) by rejecting the predictions for instances on which the predictor is most uncertain and computing the accuracy on the remaining subset \citep{huhn2009}. Given a good uncertainty quantification method, the accuracy should monotonically increase with the percentage of rejected instances, because the model misclassifies instances with low uncertainty less often than instances with high uncertainty. %This is a very relevant use-case in medical scenarios as it implies that a high classification accuracy can be reached by rejecting a portion of the data. In a practical setting, this could mean that for some portion of the data, a medical expert has to examine the images, while on the remaining data, a machine learning model can make predictions with high accuracy.


To approximate the second-order distribution, we train an ensemble of five neural networks on the CIFAR10 data set \citep{krizhevsky2009learning}. %Further experiments using additional data sets can be found in Appendix \ref{appendix:exp}. 
% 
We compare the proposed label-wise entropy- and variance-based uncertainty measures to the entropy-based baseline (cf.\ Section~\ref{subsec:entropy}) as used in the Bayesian setting. Figure~\ref{fig:mlarcs} shows the ARCs for the CIFAR10 data set. The accuracies are reported as the mean over five independent runs and the standard deviation is depicted by the shaded area.

The ARCs for all uncertainty measures closely align with the entropy-based baseline and exhibit similar qualitative behaviors. This highlights the effectiveness of the global measure derived from the local (label-wise) measures.

In this regard, let us note that our goal is \textit{not} to demonstrate that label-wise constructed measures always perform better than their entropy equivalents. Instead, we show that
they fulfill many desirable properties and are highly competitive in downstream task applications. In particular, prediction with abstention is a very relevant use case in medical scenarios, as it implies that a high classification accuracy can be reached by rejecting a portion of the data. In practice, this could mean that a machine learning algorithm, supplying predictions for unambiguous cases, can work in tandem with a medical expert, examining images deemed too difficult for machine-based prediction, to achieve high accuracy overall.

% In a practical setting, this could mean that, for some portion of the data, a medical expert has to examine the images, while on the remaining data, a machine learning model can make predictions with high accuracy.


Further experiments using medical and machine learning data sets can be found in Appendix \ref{appendix:exp}. 
\begin{figure}[t!]
\vspace{-0.728cm}
    \centering
    \includegraphics[width=0.5\textwidth]{figures_uai/ml_data/accr_tu_cifar_lent.pdf}
    %
    % \includegraphics[width=0.5\textwidth]{figures_uai/ml_data/accr_tu_cifar_lent.pdf}
    \caption{Accuracy-rejection curves generated on the CIFAR10 data set. We compare entropy ($\TU_{\text{ent}}$), label-wise constructed entropy ($\TU_{\text{lent}}$), and the variance-based ($\TU_{\text{var}}$) measure of total uncertainty.}
    \label{fig:mlarcs}
    \vspace{-0.25cm}
\end{figure}
%OoD experiments
\subsection{Out-of-Distribution Detection}
%
Another way to assess and compare measures of uncertainty is to conduct out-of-distribution detection experiments.
%
We train a model on an in-distribution (iD) data set, and compute uncertainty values on instances of the iD test set. Subsequently, the model is exposed to data from an OoD data set, and we similarly assess the uncertainty for these new instances.
%
The model, which has not previously encountered the OoD data, is expected to exhibit increased epistemic uncertainty for these instances. The ability to distinguish between iD and OoD data is an important requirement for a reliable machine learning model, because accurate predictions cannot be guaranteed on OoD data.


Our approach involves training an ensemble of five neural networks on the FashionMNIST \citep{xiaoFashion2017} data set (iD), using MNIST \citep{lecun1998} and KMNIST \citep{clanuwat2018deep} as our chosen OoD data sets. To determine the effectiveness in distinguishing between iD and OoD instances using epistemic uncertainty, we calculate the \texttt{AUROC} and compute its mean and standard deviation across five independent runs. Similarly, we conduct OoD experiments for CIFAR10 (iD) with SVHN \citep{netzer2011reading} and CIFAR10.2 \citep{luHarderDifferentCloser} as OoD data sets. 


Table~\ref{table:ood1} shows the results for the networks trained on FashionMNIST or CIFAR10. Overall, the compared measures perform well. In particular, the label-wise measures and the entropy-based measures yield similar results, emphasizing again that using the label-wise measures in a global way does not sacrifice performance.

% \begin{figure}[t!]
% \vspace{-0.728cm}
%     \centering
%     \includegraphics[width=0.5\textwidth]{figures_uai/ml_data/accr_tu_cifar_lent.pdf}
%     %
%     % \includegraphics[width=0.5\textwidth]{figures_uai/ml_data/accr_tu_cifar_lent.pdf}
%     \caption{Accuracy-rejection curves generated on the CIFAR10 data set. We compare entropy ($\TU_{\text{ent}}$), label-wise constructed entropy ($\TU_{\text{lent}}$), and the variance-based ($\TU_{\text{var}}$) measure of total uncertainty.}
%     \label{fig:mlarcs}
% \end{figure}

\begin{table}[t!]
\captionof{table}{
  OoD detection performance. \texttt{AUROC} and standard deviation over five runs are reported. $\EU_{\text{ent}}$ denotes mutual information, $\EU_{\text{var}}$ the variance-based measure, and $\EU_{\text{lent}}$ label-wise entropy. Best performance is in \textbf{bold}.
}
\resizebox{0.49\textwidth}{!}{
  \begin{tabular}{@{}ccccc@{}}\toprule
  & \multicolumn{2}{c}{FashionMNIST} & \multicolumn{2}{c}{CIFAR10} \\ \midrule 
  Methods & MNIST & KMNIST & SVHN & CIFAR10.2 \\ \midrule
  $\EU_{\text{var}}$ & .882 $\pm$ .018 & .959 $\pm$ .005 & \textbf{.761 $\pm$ .022} & .999 $\pm$ .001 \\
  $\EU_{\text{lent}}$ & .894 $\pm$ .017 & .967 $\pm$ .004 & .745 $\pm$ .027 & \textbf{1.00 $\pm$ .001} \\
  $\EU_{\text{ent}}$ & \textbf{.895 $\pm$ .017} & \textbf{.969 $\pm$ .004} & .760 $\pm$ .026 & .998 $\pm$ .002 \\
  \bottomrule
  \end{tabular}
}
\label{table:ood1}
% \vspace{-0.25cm}
\end{table}

% Conclusion
\section{Concluding Remarks}
% We proposed variance-based measures for uncertainty quantification in the context of second-order distributions. 
We introduced a novel approach to quantifying uncertainty in classification tasks, adopting a label-wise perspective that allows for reasoning about uncertainty at the level of individual classes. This can be beneficial for decision-making in situations where incorrect predictions have unequal consequences for different classes, and for deciding about the right course of action to reduce uncertainty. 
Addressing criticisms in the recent literature and problems of the commonly used information-theoretic (entropy-based) measures, we showed that our measures satisfy many desirable properties. 
%Moreover, our measures allow for reasoning about the uncertainty associated with individual classes, something that can aid in decision-making in situations where incorrect predictions have different consequences for different classes. 
We also presented empirical results highlighting the practical usefulness of these measures. Overall, we trust that this work is a step towards a more interpretable representation of uncertainty that will be beneficial for safety-critical applications.

%Let us conclude with a remark on future work related to the loss-based view of the uncertainty measures as briefly touched on in Section 3:
%\begin{itemize}
%\item Total uncertainty is the expected loss of the risk-minimizing prediction $\hat{\theta}$ given knowledge of the second-order distribution $Q$ (i.e., knowing that $\theta$ is drawn from $Q$ and then $Y$ according to $\theta$).
%\item Aleatoric uncertainty is the expected loss of the risk-minimizing prediction $\hat{\theta}$ given knowledge about the true $\theta$ (sampled from $Q$). 
%\item Epistemic uncertainty is the difference between these two, i.e., the extra loss that is caused by the lack of knowledge about the true $\theta$. 
%\end{itemize}
%The common entropy-based approach is an instantiation of this decomposition for the case of log-loss, the variance-based approach for the case of squared-error loss. Thus, an obvious idea is to elaborate on instantiations for other loss functions, too, maybe in combination with the label-wise decomposition of the uncertainty measures. Natural candidates for such losses are (strictly) proper scoring rules \citep{gnei_sp05}, which have the meaningful property that the risk-minimizer $\hat{\theta}$ given $\Theta = \theta$ coincides with $\theta$ itself\,---\,both log-loss and squared-error loss are examples of proper scoring rules.  


Our approach to decomposing uncertainty in a label-wise manner is in direct correspondence with the one-vs-rest decomposition of a multinomial classification problem into several binary classification problems. An interesting idea for future work, therefore, is the use of other decomposition techniques. In any case, thanks to the binarization, our approach is amenable to a very broad class of uncertainty measures.  
Specifically, we presented a framework that is parameterized by a loss function (proper scoring rule) $\phi$. As another direction of future work, we plan to elaborate more deeply on the appropriate choice of this loss, and the effect it has on uncertainty quantification. 



%In this context, our approach to quantifying uncertainty for each label can be seen as analogous to a decomposition strategy in machine learning, reminiscent of the one-versus-all method. Depending on the specific characteristics and requirements of the (classification) problem at hand, other decomposition techniques could be in principle considered. 



% \begin{contributions} % will be removed in pdf for initial submission 
% 					  % (without ‘accepted’ option in \documentclass)
%                       % so you can already fill it to test with the
%                       % ‘accepted’ class option
%     Briefly list author contributions. 
%     This is a nice way of making clear who did what and to give proper credit.
%     This section is optional.

%     H.~Q.~Bovik conceived the idea and wrote the paper.
%     Coauthor One created the code.
%     Coauthor Two created the figures.
% \end{contributions}

\begin{acknowledgements} % will be removed in pdf for initial submission,
						 % (without ‘accepted’ option in \documentclass)
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
    % Briefly acknowledge people and organizations here.

    % \emph{All} acknowledgements go in this section.
Yusuf Sale and Lisa Wimmer are supported by the DAAD program Konrad Zuse Schools of Excellence in Artificial Intelligence, sponsored by the Federal Ministry of Education and Research.
\end{acknowledgements}



% References
\bibliography{references}

\newpage

\onecolumn

\title{Label-wise Aleatoric and Epistemic Uncertainty Quantification\\(Supplementary Material)}
\maketitle



% This Supplementary Material should be submitted together with the main paper.

\appendix



\section{Proofs}
\label{appendix:proofs}
%
\begin{prop}\label{corrigendum}
Let $Q^{\prime} \in \ksimplextwo$ be a mean-preserving spread of $Q \in \ksimplextwo$, i.e., let $\vec{\Theta} \sim Q$, and $\vec{\Theta}^\prime \sim Q^\prime$ be two random vectors such that we have $\vec{\Theta}^\prime \overset{d}{=} \vec{\Theta} + \vec{Z}$, for some random vector $\vec{Z}$ with $\mathbb{E}[\vec{Z} \given \sigma(\vec{\Theta})] = 0$ almost surely. \\[0.2cm]
Now, define 
\begin{align}
\EU(Q) = \mathbb{E}_Q[\kl(\vec{\Theta} \, \| \, \bar{\vec{\theta}})], 
\end{align}
where \(\kl(\vec{\Theta} \, \| \, \bar{\vec{\theta}})\) denotes the Kullback-Leibler (KL) divergence of \(\vec{\Theta}\) from its mean \(\bar{\vec{\theta}}\). Then the claim is that $\EU(Q') > \EU(Q).$    
\end{prop}

\begin{proof}
First, note that $\kl(\vec{\Theta} \, \| \, \bar{\vec{\theta}})$ is a strictly convex function of $\vec{\Theta}$ since $\bar{\vec{\theta}} \in \ksimplex$ is a constant. Given that $\vec{\Theta}^{\prime} \overset{d}{=} \vec{\Theta} + \vec{Z}$ and $\mathbb{E}[\vec{Z} \given \sigma(\vec{\Theta})] = 0$ almost surely, it follows that $\mathbb{E}_{Q^{\prime}}[\vec{\Theta}^{\prime} \given \sigma(\vec{\Theta})] = \vec{\Theta}$.\\[0.1cm]
Jensen's inequality states that for a strictly convex function \(f\) and a non-constant random vector $\vec{Y}$, we have $\mathbb{E}[f(\vec{Y})] > f(\mathbb{E}[\vec{Y}])$. Then Jensen's inequality implies:
\begin{align*}
\mathbb{E}_{Q'}[\kl(\vec{\Theta}' \| \bar{\vec{\theta}}) \given \sigma(\vec{\Theta})] > \underbrace{\kl(\mathbb{E}_{Q'}[\vec{\Theta}'\given \sigma(\vec{\Theta})] \, \| \, \bar{\vec{\theta}})}_{= \kl(\vec{\Theta} \, \| \, \bar{\vec{\theta}})} \quad \textnormal{a.s.}
\end{align*}
By this we get 
\begin{align}
    \mathbb{E}_{Q'}[\kl(\vec{\Theta}' \, \| \,\bar{\vec{\theta}})] = \mathbb{E}_Q[\mathbb{E}_{Q'}[\kl(\vec{\Theta}' \, \| \, \bar{\vec{\theta}}) \given \sigma(\vec{\Theta})]] > \mathbb{E}_{Q}[\kl(\vec{\Theta} \, \| \, \bar{\vec{\theta}})],
    \label{ineq:mps}
\end{align}
 by the law of total expectation. This concludes the proof. 
\end{proof}


% Proof of entropy-based properties
\newpage
\begin{proof}[Proof of Theorem \ref{thm:entropy_axioms}]
We prove that the entropy-based uncertainty measures \eqref{tu:label_entropy}, \eqref{au:label_entropy}, and \eqref{eu:label_entropy} satisfy Axioms A0, A1, A2 (only for $\TU$), A3 (strict version), A4 (strict version, only for $\TU$), A6, and A7 of Section \ref{subsection:axioms}. 
\begin{itemize}
    \item[A0:] This property holds true since entropy $\ent(\cdot)$ and KL-divergence $\kl(\cdot \, || \, \cdot)$ are non-negative.
    %
    \item[A1:] 
    Let $Q = \delta_{\vtheta} \in \ksimplextwo$ be a Dirac measure on $\vtheta \in \ksimplex$. 
    %
    Since $\kl(\theta_k \, || \, \bar{\theta}_k) = 0$ if and only if $\theta_k = \bar{\theta}_k$, and consequently $\mathbb{E}_{Q_k}[\kl(\Theta_k \, || \, \bar{\theta}_k)] = 0$ for all $k \in \{1, \dots, K\}$, the claim holds true.
    %
    % Then $\EU(\delta_{\vtheta}) = 0$ holds true, since $\kl(\theta_k \, || \, \bar{\theta}_k)] = 0$ iff $\theta_k = \bar{\theta}_k$ and thus $\mathbb{E}_{Q_k}[\kl(\Theta_k||\bar{\theta}_k)] = 0$  for all $k \in \{1, \dots, K\}$.  
    %
    \item[A2:] 
    % We have $\TU = \sum_{k=1}^K \ent(\mathbb{E}_{Q_k}[\Theta_k])$. 
    First we show that the function $U(\vtheta) = \sum_{k=1}^K \ent(\theta_k)$ is strictly concave on $\Delta_K$ with unique maximizer $\vec{\beta} = (1/K,\dots,1/K)$. We observe that 
    \begin{align*}
    \nabla^2 U(\vec{\theta}) = 
    \begin{bmatrix}
    -\left[\frac{1}{\theta_1 \log(2)} + \frac{1}{(1-\theta_1) \log(2)}\right] & 0 & \cdots & 0 \\
    0 & -\left[\frac{1}{\theta_2 \log(2)} + \frac{1}{(1-\theta_2) \log(2)}\right] & \cdots & 0 \\
    \vdots & \vdots & \ddots & \vdots \\
    0 & 0 & \cdots & -\left[\frac{1}{\theta_K \log(2)} + \frac{1}{(1-\theta_K) \log(2)}\right]
    \end{bmatrix}
    % \mathrm{diag}(-1/\theta_k(1-\theta_k), \dots, -1/\theta_k(1-\theta_k) ),
    \end{align*}
    is negative definite.
    % First, note that $U(\vtheta) = \sum_{k=1}^K \ent(\theta_k)$ is strictly on $\ksimplex$ concave since 
    %
    % strictly concave as it is the sum of entropy $\ent(\cdot)$, which is strictly concave \textcolor{blue}{(why?)}. 
    To find the maximizer, we consider the Lagrangian dual:
    \begin{align*}
        \max_{\vtheta\in\Delta_K, \lambda} U^*(\vtheta, \lambda) = \max_{\vtheta\in\Delta_K, \lambda} U(\vtheta) + \lambda \left(\sum_{k=1}^K\theta_k - 1 \right).  
    \end{align*}
    The first-order conditions are 
    % \begin{align*}
    %     \frac{\partial U^*(\vtheta, \lambda)}{\partial\theta_k} = \frac{\log(\theta_k) - \log(1-\theta_k)}{\log(2)} + \lambda = 0, k = 1,...,K\ \text{and}\ \sum_{k=1}^K\theta_k = 1,
    % \end{align*}
        \begin{align*}
        \frac{\partial U^*(\vtheta, \lambda)}{\partial\theta_k} = \log_2(1-\theta_k) - \log_2(\theta_k) + \lambda = 0, \, k \in \{1,\dots,K\} \, \text{and} \, \sum_{k=1}^K\theta_k = 1,
    \end{align*}
    which are solved by $\vtheta = (1/K,\dots,1/K)$ and $\lambda = \log_2 \frac{1}{K} - \log_2 \frac{K-1}{K}$. This implies that $\TU$ is maximized for $Q \in \ksimplextwo$ such that $\mathbb{E}[\Theta_k] = 1/K$ for all $k\in\{1,\dots,K\}$. The latter holds true for $Q$ being the uniform distribution on $\ksimplextwo$.

    % \textcolor{blue}{
    % Similarly, one can show that $\EU(Q) \leq \log_2(K) + (K-1)\log_2(K/(K-1))$ for all $Q \in \ksimplextwo$. Now, assume that $\EU$ is maximal for $Q_{\mathrm{Unif}}$ being the uniform distribution on $\ksimplextwo$. Additivity implies $\AU(Q_{\mathrm{Unif}}) = 0$. Since $\AU(Q_{\mathrm{unif}}) > 0$, this proves that $\EU$ cannot be maximal for $Q_{\mathrm{Unif}}$.}
    
    Thus, for any $Q \in \ksimplextwo$ satisfying $\mathbb{E}[\Theta_k] = 1/K$ for all $k \in \{1,\dots,K\}$ we obtain \[\TU(Q) = \log_2(K) + (K-1)\log_2(K/(K-1)).\]
    It is easy to show that the maximum of $\EU$ aligns with that of $\TU$. Assume, for the sake of argument, that $\EU$ is maximal for $Q_{\mathrm{Unif}}$ being the uniform distribution on $\ksimplextwo$. Since $\TU$ decomposes additively in $\AU$ and $\EU$, it follows that $\AU(Q_{\mathrm{Unif}}) = 0$. However, given that $\AU(Q_{\mathrm{Unif}}) > 0$, this leads to a contradiction, demonstrating that $\EU$ cannot be maximal for $Q_{\mathrm{Unif}}$.
    \item[A3:]  Let $Q^{\prime} \in \ksimplextwo$ be a mean-preserving spread of $Q \in \ksimplextwo$, i.e., let $\vec{\Theta} \sim Q, \vec{\Theta}^\prime \sim Q^\prime$ be two random vectors such that $\vec{\Theta}^\prime \overset{d}{=} \vec{\Theta} + \vec{Z}$, for some random vector $\vec{Z}$ with $\mathbb{E}[\vec{Z} \given \sigma(\vec{\Theta})] = 0$ almost surely. Applying Proposition \ref{corrigendum} yields $\mathbb{E}_{Q_k^{\prime}}[\kl(\Theta_k^{\prime} \, \| \, \bar{\theta}_k)] > \mathbb{E}_{Q_k}[\kl(\Theta_k \, \| \, \bar{\theta}_k)]$ for all $k \in \{1,\dots,K\}$
    and with that $\EU(Q') > \EU(Q)$. 
    
    Since we have by definition $\TU(Q') = \sum_{k=1}^K \ent(\mathbb{E}_{Q'_k}[\Theta'_k])$, and $\mathbb{E}_{Q_k^{\prime}}[\Theta_k^{\prime}] = \mathbb{E}_{Q_k}[\Theta_k]$ for all $k \in \{1,\dots,K\}$ by the mean-preserving spread assumption, $\TU(Q') = \TU(Q)$ follows. 
    %
    \item[A4:] Let $Q'$ be a spread-preserving center shift of $Q$ such that $\mathbb{E}[\vec{\Theta}'] = \lambda\mathbb{E}[\vec{\Theta}] + (1-\lambda)(1/K,\dots,1/K)^\top$ for some $\lambda \in (0,1)$. Because also $\vec{\Theta}' = \vec{\Theta} + \vec{z}$ with $\vec{z} \neq 0$, this implies $\mathbb{E}[\vec{\Theta}] \neq (1/K, \dots, 1/K)^\top$. From the proof of A2 we know that $(1/K,\dots,1/K)$ maximizes $\sum_{k=1}^K \ent(\theta_k)$ and concavity of $\ent(\cdot)$ implies $\TU(Q') > \TU(Q)$. 

\iffalse 
    For $\AU$, we start with the following observation: because $\vec{\Theta}' = \vec{\Theta} + \vec{z}$, the equality
    \begin{align*}
        \mathbb{E}[\vec{\Theta}'] = \lambda\mathbb{E}[\vec{\Theta}] + (1-\lambda)(1/K,\dots,1/K)^\top
    \end{align*}
    can be rearranged to
    \begin{align*}
        \vec{z} = (\lambda - 1)\mathbb{E}[\vec{\Theta}] + (1 - \lambda)(1/K,\dots,1/K)^\top,
    \end{align*}
    and, thus,
    \begin{align*}
        \vec{\Theta}' =  \vec{\Theta} + \vec{z} = \lambda \underbrace{\left(\vec{\Theta}/\lambda + (1 - 1/\lambda)\mathbb{E}[\vec{\Theta}]\right)}_{=:\widetilde{\vec{\Theta}}} + (1 - \lambda) (1/K,\dots,1/K)^\top.
    \end{align*}
    We now get
    \begin{align*}
        \sum_{k=1}^{K} \mathbb{E}[\ent(Y_k \given \Theta_k')] 
        &\geq \sum_{k=1}^{K} \lambda \mathbb{E}[\ent(Y_k \given \tilde \Theta_k)] + (1 - \lambda) \mathbb{E}[\ent(Y_k \given (1/K, \dots, 1/K)^\top)]   \\[0.2cm]
        &\geq \sum_{k=1}^{K} \mathbb{E}[\ent(Y_k \given \tilde \Theta)] + (\lambda - 1) \mathbb{E}[\ent(Y_k \given \mathbb{E}[\vec{\Theta}])] + (1 - \lambda)\mathbb{E}[\ent(Y_k \given (1/K, \dots, 1/K)^\top)]  \\[0.2cm]
        &> \sum_{k=1}^{K} \mathbb{E}[\ent(Y_k \given \Theta_k)], 
    \end{align*}
    where the first two inequalities follow from Jensen's inequality for concave functions, and the last from the fact that the entropy is maximal at the barycenter $(1/K, \dots, 1/K)$ (shown in A2).  The inequality is strict because $\mathbb{E}[\vec{\Theta}] \neq (1/K, \dots, 1/K)^\top$ as shown above.
    \fi 
    %
    \item[A6:] 
    Let $\delta_m \in \Delta_{\delta_m}$, such that we have 
    \begin{align*}
    \AU(\delta_m) &= \sum_{k=1}^{K} \mathbb{E} [\ent(Y_k \given \Theta_k)] \\[0.2cm]
    &= \sum_{k=1}^{K} \sum_{y_k \in \{0,1\}} \ent(\delta_{y_k}) \lambda_{y_k}(\delta_{y_k}) \\[0.2cm]
    &= 0,
    \end{align*}
    where the last equation holds true, since $\ent(\delta_{y_k}) = 0$ for all $y_k \in \{0,1\}$ and $k \in \{1,\dots,K\}$.
    %
    \item[A7:] Let $Q \in \ksimplextwo$ and denote by $Q_{|\lab_1}$ and $Q_{|\lab_2}$ the corresponding marginalized distributions, where $\lab_1$ and $\lab_2$ form a partition of $\lab$. It holds that
    \begin{align*}
        \TU_{\lab}(Q) = \sum_{k \in \lab} \ent(\mathbb{E}_{Q_k}[\Theta_k]) &= \sum_{k \in \lab_1} \ent(\mathbb{E}_{Q_k}[\Theta_k]) + \sum_{k \in \lab_2} \ent(\mathbb{E}_{Q_k}[\Theta_k]) \\[0.2cm]
        &= \TU_{\lab_1}(Q_{|\lab_1}) + \TU_{\lab_2}(Q_{|\lab_2}),
    \end{align*}
    and the same holds analogously for $\AU$. Due to the additive decomposition, the claim is also true for $\EU$.
\end{itemize}
This concludes the proof.
\end{proof}


%

\begin{proof}[Proof of Theorem \ref{thm:axioms}]
%
We prove that variance-based uncertainty measures \eqref{tu:variance}, \eqref{au:variance}, and \eqref{eu:variance} satisfy Axioms A0, A1, A2 (only for $\TU$), A3 (strict version), A4 (strict version) and A5--A7 of Section \ref{subsection:axioms}.
\begin{itemize}
    \item[A0:] This property holds trivially true. 
    %
    \item[A1:] Let $Q = \delta_{\vtheta} \in \ksimplextwo$ be a Dirac measure on $\vtheta \in \ksimplex$. Then $\EU(\delta_{\vtheta}) = 0$ holds trivially true, since $\Var_{Q_k}[\Theta_k] = 0$ for all $k \in \{1, \dots, K\}$. The other direction follows similarly. 
    %
    \item[A2:] First we show that the function $V(\vec{\theta}) = \sum_{k = 1}^K \theta_k(1 - \theta_k)$ is strictly concave on $\Delta_K$ with unique maximizer $\vec{\beta} = (1/K,\dots,1/K)$.  It holds 
    \begin{align*}
    \nabla^2 V(\vec{\theta}) = -2 \mathrm{diag}(1, \dots, 1 ),
    \end{align*}
    which is negative definite. To find the maximizer, we consider the Lagrangian dual:
    \begin{align*}
        \max_{\vec{\theta} \in \Delta_K, \lambda} V^*(\vec{\theta}, \lambda) =  \max_{\vec{\theta} \in \Delta_K, \lambda} V(\vec{\theta}) + \lambda \biggl(\sum_{k = 1}^K \theta_k - 1\biggr).
    \end{align*}
    The first-order conditions are
    \begin{align*}
        \frac{\partial  V^*(\vec{\theta}, \lambda)}{\partial \theta_k} = 1 - 2 \theta_k + \lambda = 0,\; k \in \{1, \dots, K\} \; \text{and} \;
        \sum_{k = 1}^K \theta_k = 1,
    \end{align*}
    which are solved by $\vec{\theta} = \vec{\beta}$ and $\lambda = -(K - 2) / K$. 
    This implies that $\TU$ is maximized for any $Q \in \ksimplextwo$ such that $\mathbb{E}[\Theta_k] = 1/K$ for all $k\in\{1,\dots,K\}$. The latter holds true for $Q$ being the uniform distribution on $\ksimplextwo$.

    The proof that $\EU$ is not maximal for $Q_{\mathrm{Unif}}$ being the uniform distribution on $\ksimplextwo$ is completely analogous to the proof of A2 in Theorem \ref{thm:entropy_axioms}.
    %
    \item[A3:] Let $Q^{\prime} \in \ksimplextwo$ be a mean-preserving spread of $Q \in \ksimplextwo$, i.e., let $\vec{\Theta} \sim Q, \vec{\Theta}^\prime \sim Q^\prime$ be two random vectors such that $\vec{\Theta}^\prime \overset{d}{=} \vec{\Theta} + \vec{Z}$, for some random vector $\vec{Z}$ with $\mathbb{E}[\vec{Z} \given \sigma(\vec{\Theta})] = 0$ almost surely. Then, we have the following:
    \begin{align}
    \EU(Q^{\prime}) &=  \sum_{k=1}^{K} \Var(\Theta_k + Z_k) \\
           &= \sum_{k=1}^{K} \bigl( \Var(\Theta_k) + \Var(Z_k) + 2 \Cov(\Theta_k, Z_k) \bigr) \\
           &= \EU(Q) + \sum_{k=1}^{K}  \Var(Z_k) 
           \label{mps:ineq} \\
           &> \EU(Q) \label{mps:ineq2}
    \end{align}
    Note that the equality \eqref{mps:ineq} holds true, since we know that $\Cov(\Theta_k, Z_k) = 0$. To see this, observe that we have $\mathbb{E}[\Theta_k  Z_k] = \mathbb{E}[\mathbb{E}[\Theta_k Z_k \given \sigma(\Theta_k)]] = 0$ due to the mean-preserving spread assumption. Similarly, we know that $\mathbb{E}[Z_k] = 0$, such that we have $\Cov(\Theta_k, Z_k) = \mathbb{E}[\Theta_k  Z_k] - \mathbb{E}[\Theta_k] \mathbb{E}[Z_k] = 0.$ The inequality \eqref{mps:ineq2} is strict since by assumption $\max_k \Var(Z_k) > 0$. 

    Since we have $\mathbb{E}[\vec{\Theta}] = \mathbb{E}[\vec{\Theta}^{\prime}]$ by assumption, the weak version of A3 holds for $\TU$. 
    %
    \item[A4:] Let $Q'$ be a spread-preserving location shift of $Q$ such that $\mathbb E[\vec{\Theta}'] = \lambda \mathbb E[\vec{\Theta}] + (1 - \lambda) (1/K, \dots, 1/K)^\top$ for some $\lambda \in (0, 1)$. From the proof of A2 we know that $\vec{\beta} = (1/K,\dots,1/K)$ maximizes $\sum_{k=1}^{K} \theta_k(1-\theta_k)$, which in turn immediately implies $\TU(Q') > \TU(Q)$. The inequality for $\AU$ is then implied by $\EU(Q') = \EU(Q)$ (Axiom A5).
    %
    \item[A5:] Let $\vec{\Theta} \sim Q$, and $(\vec{\Theta} + \vec{z}) \sim Q^{\prime}$, where $\vec{z} \neq \vec{0}$ is a constant. Then, we observe
% \begin{align*}
%     \EU(Q^{\prime}) &= \mathbb{E}_{Q^{\prime}} \left\lVert (\vec{\Theta} + \vec{z}) - \mathbb{E}_{Q^{\prime}}[\vec{\Theta} + \vec{z}] \right\rVert_2^2 \\[0.1cm]
%     &= \mathbb{E}_{Q^{\prime}} \left\lVert (\vec{\Theta} + \vec{z}) - (\bar{\vec{\vtheta}}+ \vec{z}) \right\rVert_2^2 \\[0.1cm]
%     &= \mathbb{E}_{Q} \left\lVert \vec{\Theta} - \bar{\vec{\vtheta}} \right\rVert_2^2\\[0.1cm]
%     &=\EU(Q) 
% \end{align*}
\begin{align*}
    \EU(Q^{\prime}) &= \sum_{k = 1}^{K} \Var(\Theta_k + z_k) \\[0.1cm]
    &= \sum_{k = 1}^{K}  \mathbb{E}[((\Theta_k + z_k) - \mathbb{E}[\Theta_k + z_k] )^2 ] \\[0.1cm]
    &= \sum_{k = 1}^{K} \mathbb{E}[(\Theta_k - \mathbb{E}[\Theta_k])^2] \\[0.1cm]
    &= \EU(Q). 
\end{align*}
    \item[A6:] With $\delta_m \in \Delta_{\delta_m}$ we have 
    \begin{align*}
        \AU(\delta_m) &= \sum_{k = 1}^{K} \mathbb{E}[\Theta_k  (1 - \Theta_k)] \\[0.2cm]
        &= \sum_{k = 1}^{K} \bigl( \lambda_k \cdot 1 \cdot (1 - 1) + (1 - \lambda_k) \cdot 0 \cdot (1 - 0) \bigr) \\[0.2cm]
        &= 0.
    \end{align*}
    %Since $\AU$ is non-negative, we have $\AU(\delta_{\vtheta}) \geq \AU(\delta_m)$ for any $\vtheta \in \ksimplex$ and any $\delta_m \in \Delta_{\delta_m}$.
    \item[A7:] Let $Q \in \ksimplextwo$ and further denote by $Q_{|\lab_1}$ and $Q_{|\lab_2}$ the corresponding marginalized distributions, where $\lab_1$ and $\lab_2$ form a partition of $\lab$. It holds that
    \begin{align*}
        \TU_{\lab}(Q) = \sum_{k \in \lab}  \Var(Y_k) &= \sum_{k \in \lab_1} \Var(Y_k) + \sum_{k \in \lab_2} \Var(Y_k) \\[0.2cm]
        &= \TU_{\lab_1}(Q_{|\lab_1}) + \TU_{\lab_2}(Q_{|\lab_2}),
    \end{align*}
    and the same holds analogously for $\AU$. Due to the additive decomposition, the claim is also true for $\EU$.
\end{itemize}
This concludes the proof.
\end{proof}






\newpage
\section{Experimental Details}
\label{appendix:exp_details}
%
In this section, we provide a detailed overview of the experimental setup to allow reproduction of the results.
%

The experimental code is written in \texttt{Python 3.9} using the \texttt{PyTorch} \citep{pytorch2019} library. 
%

\subsection{Experiments on ML Data sets}
\paragraph{Data sets. }For all data sets, we use the respective dedicated train-test splits. We only use pre-processing for the CIFAR10 data set. Each image is normalized using the mean and standard deviation per channel of the training set. Additionally, the training images are cropped randomly (while adding $4$ pixels of padding on every border) and randomly flipped horizontally. 
% 
\paragraph{Ensembles. }The ensembles are built using two base models: a Convolutional Neural Network (\texttt{CNN}) and a \texttt{ResNet18} \citep{heResnet2016}. The \texttt{CNN} has two convolutional layers followed by two fully connected layers. The convolutional layers have $32$ and $64$ filters of $5$ by $5$ and the fully connected layers have $512$ and $10$ neurons, respectively. The layers have \texttt{ReLU} activations and the last layer uses a softmax function to output probabilities. The \texttt{ResNet18} model has a fully connected last layer of $10$ units and a softmax function to generate probabilities for $10$ classes. The output of the ensemble is generated by averaging over the outputs of the individual ensemble members.

\subsubsection*{Accuracy-Rejection Curves}
%
We train $5$ \texttt{CNNs} on FMNIST, MNIST and KMNIST and $5$ \texttt{ResNets} on CIFAR10 and SVHN. We use \texttt{Adam} \citep{kingmaAdam2015} with the default hyper-parameters to train the \texttt{CNNs} in $10$ epochs for MNIST and $20$ epochs for FMNIST and KMNIST using a batch size of $256$. We train the \texttt{ResNets} using stochastic gradient descent with weight decay set to $10^{-4}$, momentum at $0.9$, and a learning rate of $0.1$, setting the learning rate to $0.001$ at epoch $20$ and to $0.0001$ at epoch $25$. The models are trained for $30$ epochs in total. 
The ARCs are then generated using the test set.

\subsubsection*{Out-of-Distribution Detection}
We train $5$ \texttt{CNNs} on FMNIST and $5$ \texttt{ResNets} on CIFAR10, using the same setup as for the ARCs. Epistemic uncertainty is computed on the test sets of the corresponding data sets without applying any data augmentation. 

\subsection{Experiments on Medical Images}

\paragraph{Data set.} We use a publicly available PET/CT image data set \citep{Datasetgatidis2022whole} with respective dedicated train-test splits. During the preprocessing, we extract 2D images as coronal slices from the 3D PET and CT image volumes. Furthermore, each image is resized to $400 \times 400$ pixels, normalized using the mean and standard deviation per channel of the training set, and stacked into a three-channel image consisting of a PET, CT, and fusion channel. The final data set consists of $96000$ 2D images coming from $1014$ patients.
% 
\paragraph{Ensembles. }The ensembles are built using a \texttt{ResNet50} \citep{heResnet2016}. The \texttt{ResNet50} model has a fully connected last layer of $4$ units and a softmax function to generate probabilities for $4$ classes. The output of the ensemble is generated by averaging over the outputs of the individual ensemble members. We use Adam \citep{kingmaAdam2015} to train each model with a cross entropy loss for $40$ epochs with a learning rate of $0.001$ and a batch size of $50$. All models are evaluated on a separate test set.

\subsubsection*{Accuracy-Rejection Curves}
\label{app:arcmed}
We train $5$ \texttt{ResNets} on the PET/CT image data set. We use Adam \citep{kingmaAdam2015} to train each model with a cross entropy loss for $40$ epochs with a learning rate of $0.001$ and a batch size of $50$. The ARCs are then generated using the test set.

\newpage
\section{Additional Results}
\label{appendix:exp}
%
In this section, we report on experiments that we perform in addition to the ones presented in the main paper.
%
\subsection{Label-wise Uncertainties}
\label{app:label}
% Medical Images
Figure \ref{fig:medical_examples} presents additional medical images with the highest total, aleatoric, and epistemic uncertainties. Similarly, Figure \ref{fig:tu_add} showcases images with the highest total, aleatoric, and epistemic uncertainties for the MNIST data set. 

\begin{figure}[h!]
		\centering
		\includegraphics[width=0.9\linewidth]{figures_uai/med_data/image_label_uncertainty_tu_1_class_MELANOMA_pred_1.pdf}
        %
  		\includegraphics[width=0.9\linewidth]{figures_uai/med_data/image_label_uncertainty_au_3_class_LYMPHOMA_pred_2.pdf}
        %
  		\includegraphics[width=0.9\linewidth]{figures_uai/med_data/image_label_uncertainty_eu_4_class_LYMPHOMA_pred_2.pdf}
		\caption{Medical image examples with highest total \textit{(top)}, \textit{\textcolor{PythonRed}{
        aleatoric}} \textit{(middle)} and \textit{\textcolor{PythonBlue}{epistemic}} uncertainties \textit{(bottom)} along with their corresponding label-wise uncertainties, \textit{\textcolor{Green}{ground truth class}} and \textit{\underline{predicted class}}.}
		\label{fig:medical_examples}
\end{figure}

\newpage 
\begin{figure}[htbp]
    \centering

    % First row
    \begin{minipage}{\textwidth}
        \centering
        \begin{subfigure}[b]{0.24\textwidth}
            \includegraphics[width=\textwidth]{figures_uai/ml_data/mnist_tu_00.pdf}
        \end{subfigure}
        \hspace{1.5cm}
        \begin{subfigure}[b]{0.24\textwidth}
            \includegraphics[width=\textwidth]{figures_uai/ml_data/mnist_tu_11.pdf}
        \end{subfigure}
        \hspace{1.5cm}
        \begin{subfigure}[b]{0.24\textwidth}
            \includegraphics[width=\textwidth]{figures_uai/ml_data/mnist_tu_22.pdf}
        \end{subfigure}
    \end{minipage}

    % Add space between rows
    \vspace{0.5cm} 

    % Second row
    \begin{minipage}{\textwidth}
        \centering
        \begin{subfigure}[b]{0.24\textwidth}
            \includegraphics[width=\textwidth]{figures_uai/ml_data/mnist_au_0.pdf}
        \end{subfigure}
        \hspace{1.5cm}
        \begin{subfigure}[b]{0.24\textwidth}
            \includegraphics[width=\textwidth]{figures_uai/ml_data/mnist_au_1.pdf}
        \end{subfigure}
        \hspace{1.5cm}
        \begin{subfigure}[b]{0.24\textwidth}
            \includegraphics[width=\textwidth]{figures_uai/ml_data/mnist_au_2.pdf}
        \end{subfigure}
    \end{minipage}

    % Add space between rows
    \vspace{0.5cm} %

    % Third row
    \begin{minipage}{\textwidth}
        \centering
        \begin{subfigure}[b]{0.24\textwidth}
            \includegraphics[width=\textwidth]{figures_uai/ml_data/mnist_eu_0.pdf}
        \end{subfigure}
        \hspace{1.5cm}
        \begin{subfigure}[b]{0.24\textwidth}
            \includegraphics[width=\textwidth]{figures_uai/ml_data/mnist_eu_1.pdf}
        \end{subfigure}
        \hspace{1.5cm}
        \begin{subfigure}[b]{0.24\textwidth}
            \includegraphics[width=\textwidth]{figures_uai/ml_data/mnist_eu_2.pdf}
        \end{subfigure}
    \end{minipage}

    \caption{MNIST instances with highest total \textit{(top)}, \textit{\textcolor{PythonRed}{aleatoric}} \textit{(middle)}, and \textit{\textcolor{PythonBlue}{epistemic}} uncertainties \textit{(bottom)} along with their corresponding label-wise uncertainties.}
    \label{fig:tu_add}
\end{figure}
\clearpage 




%
\newpage
\subsection{Accuracy-Rejection Curves}
\label{app:arc}
%
We train an ensemble of $5$ neural networks on the data sets using the setup outlined in Section \ref{appendix:exp_details}. Figure \ref{fig:arcs_supp} shows the accuracy-rejection curves for the medical data and the FMNIST data set. The accuracies are reported as the mean over five runs and the standard deviation is depicted by the shaded area.

\begin{figure}[htbp]
\centering
\begin{subfigure}{0.45\textwidth}
\includegraphics[width=0.90\linewidth]{figures_uai/med_data/accuracy_rejection_curve_diff_measures_total.pdf}
\caption{PET/CT (TU)}
\end{subfigure}
\hfill
\begin{subfigure}{0.45\textwidth}
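% NOTE(review): the file name refers to CIFAR, but this panel is captioned FMNIST (TU) and the AU/EU panels load fmnist files; verify the intended graphic.
\includegraphics[width=0.90\linewidth]{figures_uai/ml_data/accr_tu_cifar_lent.pdf}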
\caption{FMNIST (TU)}
\end{subfigure}

\begin{subfigure}{0.45\textwidth}
\includegraphics[width=0.90\linewidth]{figures_uai/med_data/accuracy_rejection_curve_diff_measures_aleatoric.pdf}
\caption{PET/CT (AU)}
\end{subfigure}
\hfill
\begin{subfigure}{0.45\textwidth}
\includegraphics[width=0.90\linewidth]{figures_uai/ml_data/accr_au_fmnist_lent.pdf}
\caption{FMNIST (AU)}
\end{subfigure}

\begin{subfigure}{0.45\textwidth}
\includegraphics[width=0.90\linewidth]{figures_uai/med_data/accuracy_rejection_curve_diff_measures_epistemic.pdf}
\caption{PET/CT (EU)}
\end{subfigure}
\hfill
\begin{subfigure}{0.45\textwidth}
\includegraphics[width=0.90\linewidth]{figures_uai/ml_data/accr_eu_fmnist_lent.pdf}
\caption{FMNIST (EU)}
\end{subfigure}

\caption{Accuracy-rejection curves on PET/CT \textit{(left)} and FMNIST \textit{(right)}.}
\label{fig:arcs_supp}
\end{figure}
% We observe similar results as the one presented in the main paper. The label-wise measures show very similar behavior to the (usual) entropy measures, increasing monotonically for almost every measure. 
We observe results consistent with those presented in the main paper. The label-wise measures exhibit behavior very similar to the usual entropy measures, with most measures increasing monotonically.

\newpage
\subsection{Holdout Experiments}
\subsubsection*{Experimental Details}
\textbf{Data sets. } We perform experiments on CIFAR10 and FMNIST data sets with corresponding train-test splits. We only use pre-processing for the CIFAR10 data set. Each image is normalized using the mean and standard deviation per channel of the training set. Additionally, the training images are cropped randomly (while adding $4$ pixels of padding on every border) and randomly flipped horizontally. \\[0.2cm]
\textbf{Ensemble. } The ensembles are built using two base models: a Convolutional Neural Network (\texttt{CNN}) and a \texttt{ResNet18} \citep{heResnet2016}. The \texttt{CNN} has two convolutional layers followed by two fully connected layers. The convolutional layers have $32$ and $64$ filters of $5$ by $5$ and the fully connected layers have $512$ and $10$ neurons, respectively. The layers have \texttt{ReLU} activations and the last layer uses a softmax function to output probabilities. The \texttt{ResNet18} model has a fully connected last layer of $10$ units and a softmax function to generate probabilities for $10$ classes. The output of the ensemble is generated by averaging over the outputs of the individual ensemble members.

\textbf{Experiments. } 
For the CIFAR10 data set, we train the ensemble for $20$ epochs on a small subset of the train data  ($10\%$ of the initial train data, the other $90\%$ is reserved as holdout). 
To prevent class imbalance, we remove instances with the highest EU class (EU per class is computed by first calculating the label-wise $\EU$ for all instances using the variance-based EU measure, and then averaging over instances from the same class) from the train data, and add the same amount of holdout data (from the class, which was identified after the initial $20$ epochs training as highest $\EU$ class). 
%
Although the amount of data for each class remains the same across epochs, the learner is progressively exposed to a broader range of examples from the class with highest EU. In other words, the approach effectively increases the total number of observations the learner encounters from that class over time without leading to class imbalance.
This step is repeated for $20$ epochs of \textit{continued} learning to ensure the model is trained on a diverse set of examples. Finally, we compare the epistemic uncertainty (of both the class with highest $\EU$ and the average of all other classes) \textit{before} and \textit{after} giving the learner access to more data from the class with highest $\EU$. We follow the same procedure for the FMNIST data set (see executed configurations).

Executed configurations:
\begin{itemize}
    \item[(i)] For the CIFAR10 data set:
    \begin{itemize}
        \item The experiment was run with $20$ epochs of \textit{initial} training and $20$ epochs of \textit{continued} training.
        \item  A hold-out rate of $90\%$ was applied, indicating that a large portion of the data was initially withheld.
        \item The experiment was repeated for $5$ runs.
    \end{itemize}
       \item[(ii)] For the FMNIST data set:
    \begin{itemize}
        \item The experiment was run with $5$ epochs of \textit{initial} training and $5$ epochs of \textit{continued} training.
        \item A hold-out rate of $99.5\%$ was used, indicating that a large portion of the data was initially withheld.
        \item The experiment was repeated for $5$ runs.
    \end{itemize}
\end{itemize}


\subsubsection*{Experimental Results}
In Table \ref{table:ood} we present both the absolute and relative changes in the $\EU$ values for each data set. Additionally, we include the changes in $\EU$ for other classes, with the average being reported. For comparison purposes, we also provide the absolute and relative changes in $\EU$ for the class experiencing the second-largest reduction in the $\EU$ values (``Next highest drop'').
\vspace{-0.1cm}
\begin{table}[h!]
\centering
\resizebox{1\textwidth}{!}{
  \begin{tabular}{@{}ccccccc@{}}\toprule
  & \multicolumn{3}{c}{FMNIST} & \multicolumn{3}{c}{CIFAR10} \\ \midrule 
   & Max. EU class & Other classes & Next highest drop & Max. EU class & Other classes  & Next highest drop  \\ \midrule
  Absolute & \textbf{0.0070 $\pm$ 0.0006} & 0.0018 $\pm$ 0.0000 & 0.0029 $\pm$ 0.0000 & \textbf{0.0057 $\pm$ 0.0011} & 0.0023 $\pm$ 0.0000 & 0.0031 $\pm$ 0.0000 \\
  Relative &  \textbf{0.7934 $\pm$ 0.0616} &  0.3872 $\pm$ 0.0000 & 0.5619 $\pm$ 0.0000 & \textbf{0.5815 $\pm$ 0.0304} &  0.3682 $\pm$ 0.0000 & 0.4630 $\pm$ 0.0000 \\
  \bottomrule
  \end{tabular}
}
\captionof{table}{
  Absolute and relative changes in $\EU$. 
}
\label{table:ood}
\end{table}

\newpage 
\begin{figure}[h!]
    \centering
    
    % 
    \begin{subfigure}{\textwidth}
        \centering
        \includegraphics[width=0.8\linewidth]{figures_uai/ml_data/ecdf_cifar.png} 
        \caption{CIFAR10 data set.}
        \label{fig:sub1}
    \end{subfigure}
    
    \vspace{1cm} 
    
    % 
    \begin{subfigure}{\textwidth}
        \centering
        \includegraphics[width=0.8\linewidth]{figures_uai/ml_data/ecdf_fmnist.png} 
        \caption{FMNIST data set.}
        \label{fig:sub2}
    \end{subfigure}
    
    \caption{ECDF of class with maximal $\EU$ \textit{(left)} and ECDF of other classes \textit{(right)}.}
    \label{fig:test}
\end{figure}

Figure~\ref{fig:test} shows the empirical cumulative distribution function (ECDF) of the $\EU$ values that we observe, averaged over $5$ runs of the experiment. On the left we see the ECDF for the class that we identified as having the ``highest $\EU$'' after the \textit{initial} training, and on the right the averaged ECDF of the ``other classes''.


\textbf{Conclusion.} We conclude that providing the learner with more data from the class with the highest $\EU$ decreases $\EU$ for this class the most. While $\EU$ for the other classes will not necessarily remain constant, it is important to note that it does \textit{not} increase for these classes either.


\end{document}

