\documentclass[accepted]{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{abbrvnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{bibentry}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{csquotes}
\usepackage{amsfonts}
\usepackage{amsmath, amsthm, amssymb}
\usepackage{float}
\usepackage{caption}
\usepackage{subcaption}
\newtheorem{proposition}{Proposition}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% Blackboard-bold R. The body must not be \R itself: a self-referential
% definition makes TeX recurse without end the first time \R is expanded.
\newcommand{\R}{\mathbb{R}}
% --- Blackboard-bold letters (\mathbb comes from amsfonts/amssymb) ---
\newcommand{\N}{\mathbb{N}}
\newcommand{\E}{\mathbb{E}}
\newcommand{\V}{\mathbb{V}}
% Upright "d" typeset as an operator (presumably a differential -- confirm usage).
\newcommand{\del}{\operatorname{d}}
% Calligraphic D; produces the same output as \cD defined further below.
\newcommand{\data}{\mathcal{D}}
% NOTE(review): \bm is provided by the bm package, which no \usepackage line
% visible in this file loads -- confirm the class supplies it before using \xpred.
\newcommand{\xpred}{\bm{x}_\ast}
\newcommand{\ypred}{y_\ast}
% --- Probability shorthands ---
% Several bodies below use \prob, \given, \vec, \vtheta and \cY, which are only
% defined later in this list; that works because the names are resolved when
% the macro is used, not when it is defined.
\newcommand{\pgivenx}{\prob( \cdot \given \vec{x})}
\newcommand{\hatpgivenx}{\hat \prob( \cdot \given \vec{x})}
\newcommand{\ymarg}{p(y)}
\newcommand{\ygivent}{p(y \given \vtheta)}
\newcommand{\yrv}{p(Y)}
\newcommand{\yrvt}{p(Y, \Theta)}
% NOTE(review): uses a bare "|" whereas \ygivent uses the padded \given bar.
\newcommand{\yrvgivent}{p(Y | \Theta)}
\newcommand{\levelone}{\mathbb{P}(\cY)}
\newcommand{\leveltwo}{\mathbb{P}(\mathbb{P}(\cY))}
\newcommand{\ksimplex}{\Delta_K}
% Optional argument sets the subscript; it defaults to K.
\newcommand{\ksimplextwo}[1][K]{\Delta_{#1}^{(2)}}
\newcommand{\hypone}{f}
\newcommand{\hyptwo}{F}
% Calligraphic B applied to one argument.
\newcommand{\ber}[1]{\mathcal{B}(#1)}
\newcommand{\ent}{H}
\newcommand{\mi}{I}
\newcommand{\dkl}{D_{\text{KL}}}
\newcommand{\half}{\tfrac{1}{2}}
% Optional argument sets the subscript of \delta; it defaults to 1/2.
\newcommand{\dirac}[1][1/2]{\delta_{#1}}
% Optional argument sets the bracket contents; it defaults to "0, 1".
\newcommand{\unif}[1][0, 1]{\mathcal{U}[#1]}
% --- Macro-defining helpers ---
% \textmacro{\name}{text} and \mathsymbol{\name}{text} each define a new macro \name.
% NOTE(review): \xspace is provided by the xspace package, which no \usepackage
% line visible in this file loads -- confirm it is available before using these.
\newcommand{\textmacro}[2]{\newcommand{#1}{#2\xspace}}
\newcommand{\mathsymbol}[2]{\newcommand{#1}{\ensuremath{\mathit{#2}}\xspace}}
% Replaces the standard arrow accent: \vec now sets its argument in bold.
\renewcommand{\vec}[1]{\boldsymbol{#1}}
% Vertical bar padded with thin spaces on both sides (see also \svert below).
\newcommand{\given}{\, | \,}
% --- Hatted symbols ---
\newcommand{\hath}{\hat{h}}
\newcommand{\hatp}{\hat{p}}
\newcommand{\haty}{\hat{y}}
\newcommand{\sety}{\widehat{Y}}
% --- Upright operator names ---
\newcommand{\cat}{\operatorname{Cat}}
\newcommand{\dir}{\operatorname{Dir}}
% Same body as \E above.
\newcommand{\evalue}{\mathbb{E}}
% --- Bold Greek letters (via the redefined \vec) ---
\newcommand{\vtheta}{\vec{\theta}}
\newcommand{\valpha}{\vec{\alpha}}
\newcommand{\vphi}{\vec{\phi}}
\newcommand{\vthetatrue}{\vtheta^\ast}
\newcommand{\fromto}{\longrightarrow}
% Red inline TODO marker. \textcolor needs colour support; tikz, loaded in the
% preamble, normally pulls in xcolor.
\newcommand{\td}[1]{\textcolor{red}{[TODO: #1]}}
% Relation symbol built from two vertically stacked dots (centred on the math
% axis by \vcenter) followed by "=".
\newcommand*{\defeq}{\mathrel{\vcenter{\baselineskip0.5ex \lineskiplimit0pt
			\hbox{\footnotesize.}\hbox{\footnotesize.}}}%
=}
% Same construction as \given, but using \vert for the bar.
\newcommand{\svert}{\, \vert \, }
% --- Calligraphic letters ---
\newcommand{\cX}{\mathcal{X}}
\newcommand{\cY}{\mathcal{Y}}
\newcommand{\cH}{\mathcal{H}}
\newcommand{\cD}{\mathcal{D}}
% --- Upright labels "EU", "AU" and "U" typeset as operators ---
\newcommand{\eu}{\operatorname{EU}}
\newcommand{\au}{\operatorname{AU}}
\newcommand{\tu}{\operatorname{U}}
\newcommand{\Prob}{P}
\newcommand{\prob}{p}
% Starred \operatorname*: subscripts are placed underneath in display style.
\newcommand{\argmin}{\operatorname*{argmin}}
\newcommand{\argmax}{\operatorname*{argmax}}
% Shorthand for \operatorname.
\newcommand{\on}[1]{\operatorname{#1}}
% --- Review annotations: a blue comment prefixed "Seb:" and red-coloured text ---
\newcommand{\sebcom}[1]{{\color{blue}Seb: #1}}
\newcommand{\sebchang}[1]{{\color{red}#1}}

\title{Quantifying Aleatoric and Epistemic Uncertainty in Machine Learning: \\Are Conditional Entropy and Mutual Information Appropriate Measures? \\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1,3]{\href{mailto:<lisa.wimmer@stat.uni-muenchen.de>?Subject=Your UAI 2023 paper}{Lisa~Wimmer}{}}
\author[2,3]{Yusuf~Sale}
\author[2,3]{Paul~Hofman}
\author[1,3]{Bernd~Bischl}
\author[2,3]{Eyke~H\"ullermeier}
% Add affiliations after the authors
\affil[1]{%
    Department of Statistics\\
    LMU Munich\\
    Germany
}
\affil[2]{%
    Institute of Informatics\\
    LMU Munich\\
    Germany
}
\affil[3]{%
    Munich Center for Machine Learning (MCML)\\
    Germany
}  
  \begin{document}

\maketitle

\begin{NoHyper}

\section{Experimental Details}

In the following, we list the most important training configurations used to generate our results.
The full experimental code is hosted in a public repository\footnote{\href{https://github.com/lisa-wm/entropybaseduq}{\texttt{https://github.com/lisa-wm/entropybaseduq}}}.

\paragraph{Software}
Our codebase is written in \texttt{Python}.
It chiefly relies on the \texttt{PyTorch} \citep{paszke_2019_PyTorchImperativeStyle}, \texttt{PyTorch Lightning} \citep{lightningai_2023_PyTorchLightningTrain}, \texttt{Laplace Redux} \citep{daxberger_2021_LaplaceReduxEffortless}, and \texttt{scikit-learn} \citep{scikit-learn} libraries.

\paragraph{Datasets}
The real-world computer vision tasks are \texttt{CIFAR10} \citep{Krizhevsky2009learning} and \texttt{MNIST} \citep{lecun_1998_GradientBasedLearningApplied}.
Both contain ten balanced classes.
We further synthesize rectangles (white-on-black), where the class label is determined by whether height $>$ width or \textit{vice versa}, and random non-convex polygons (white-on-black) with 3--5 vertices.
These datasets comprise 60k (10k) training (test) samples.
The tabular classification problem is created via \texttt{scikit-learn}'s \texttt{make\_classification} function, using two features and four classes.
Here, we generate 6k (1k) training (test) samples.

\paragraph{Base learners}
Our probabilistic classifiers all combine some base learners into an explicit (deep ensemble, random forest) or implicit (Laplace approximation) ensemble.
We train \texttt{EfficientNet-B7} (approx.\ 64m parameters; \citet{tan_2019_EfficientNetRethinkingModel}) for \texttt{CIFAR10} and a small convolutional network (three convolutional layers with ReLU activation; approx.\ 62k parameters) for \texttt{MNIST} and the rectangle/polygon images.
In the tabular classification problem, we use a random forest with a maximum tree depth of ten as well as single-hidden-layer MLPs with a hidden layer size of ten, adopting the default parameters from \texttt{scikit-learn} unless stated otherwise.
Ensemble size is set to $M = 10$.

\paragraph{Training Configurations}
We use an SGD optimizer (momentum 0.9), a learning rate schedule with cosine annealing, where the initial learning rate is set to $10^{-2}$, and weight decay ($5 \times 10^{-4}$).
Training runs for a maximum of 200 epochs at batch size 256 with early stopping if validation loss does not improve over five consecutive epochs (evaluated on a validation set containing 10\% of the training data).  

\end{NoHyper}

\section{Additional Results}

\subsection{Increasing Data Noise}

Compared to the ensemble of MLPs\footnote{
In the tabular classification task, we bootstrap the data for the MLP ensemble to make it directly comparable to the random forest that relies on this technique.
}, the random forest (Fig.~\ref{fig:uvsdistance_rf}) reacts in both uncertainty components when class overlap is increased.

\begin{figure}[H]
    \centering
    \includegraphics[width=0.4\textwidth]{figures/uvsdistance_rf}
    \caption{Entropy-based uncertainty for increasing class overlap (tabular data).}
    \label{fig:uvsdistance_rf}
\end{figure}

In order to simulate label noise, we randomly change classes for a varying share (1\%--75\%) of observations in the tabular classification task, leading to datasets as depicted in Fig.~\ref{fig:uvsnoise_data}.

\begin{figure}[H]
    \centering
    \includegraphics[width=0.4\textwidth]{figures/uvsnoise_data}
    \caption{Tabular data with two features and four classes for increasing noise level.}
    \label{fig:uvsnoise_data}
\end{figure}


\paragraph{Expected Behavior}
AU picks up with increasing noise level.
Since learner capacity remains fixed, it is reasonable to assume that EU also rises to some extent when the decision boundaries become more complex with mounting degree of dataset contamination.

\paragraph{Observed Behavior}
As observed in the experiments modifying image resolution and class overlap, we find that AU duly increases for a rising noise level, though it remains moderate for the random forest even in the most extreme scenario (Fig.~\ref{fig:uvsnoise}), where three out of four labels are assigned randomly.
EU goes up slightly for the random forest, as presumed, but remains ultra-low for every value of the ablation with the MLP ensemble.

\begin{figure}[H] \centering
    \includegraphics[width=0.4\textwidth]{figures/uvsnoise_rf}
      \includegraphics[width=0.4\textwidth]{figures/uvsnoise_mlp}
  \caption{Entropy-based uncertainty for increasing label noise (tabular data).}
  \label{fig:uvsnoise}
\end{figure}

\subsection{Number of Ensemble Members}

We study for the tabular classification problem how different ensemble sizes (2--50) affect the uncertainty estimates.

\paragraph{Expected Behavior}
There should be no systematic pattern except for possible volatility for very small ensemble sizes, where the finite-ensemble estimator might have larger bias.

\paragraph{Observed Behavior}
The results are indeed fairly stable for different values of $M$ (Fig.~\ref{fig:uvsmembers}).
Again, the overall levels of reported uncertainty differ considerably between the learners.

\begin{figure}[H]
  \centering
      \includegraphics[width=0.4\textwidth]{figures/uvsmembers_rf}
      \includegraphics[width=0.4\textwidth]{figures/uvsmembers_mlp}
  \caption{Entropy-based uncertainty for increasing number of ensemble members (tabular data).}
  \label{fig:uvsmembers}
\end{figure}

For the computer vision tasks, where such large ensembles are prohibitively expensive, we also compute the uncertainty measures that result from using a smaller ensemble of $M = 5$.
Tables~\ref{tab:samples_la_mnist}--\ref{tab:res_de_cifar} show the uncertainty values as an average over all possible five-member ensembles that can be constructed from the ten original predictions (we can compute this \textit{ex post} since ensemble size does not affect training for either of the used probabilistic learners: deep ensembles are trained in parallel with no shared loss propagation, and Laplace approximation is an inherently \textit{ex-post} approach anyway).
The results are quite robust here as well (with some exceptions for the particularly noisy settings, such as 1\% sample size).

\subsection{Base Learner Complexity}

Lastly, we investigate the effect of changing the base learner's capacity in the random forest and ensemble of MLPs.
As a proxy for capacity, we use maximum tree depth and hidden-layer size, respectively.

\paragraph{Expected Behavior}
Initially, AU should decrease when base learners get more capacity so they can fit more varied distributions, express their confidence more adequately and achieve better calibration.
Similarly, the additional complexity might result in higher EU because the base learners have more freedom for disagreement.

\paragraph{Observed Behavior}
We find that AU indeed reduces considerably for more complex base learners (Fig.~\ref{fig:uvscomplexity}), especially for the random forest, which appears to overstate AU when the base learners are very simple (resulting in high calibration error).
The strong effect is quite striking and might be overlooked as performance is relatively stable, again underlining that accuracy, calibration and uncertainty must be considered jointly. 
EU, on the other hand, does not change much -- apparently, the relation between capacity and reported AU is quite consistent across base learners and does not provoke more conflict when the ensemble members obtain more freedom.

\begin{figure}[H]
  \centering
      \includegraphics[width=0.4\textwidth]{figures/uvscomplexity_rf.png}
      \includegraphics[width=0.4\textwidth]{figures/uvscomplexity_mlp.png}
  \caption{Entropy-based uncertainty for increasing base learner complexity (tabular data).}
  \label{fig:uvscomplexity}
\end{figure}

% \section{Proof for Proposition 4}

% In the following, we provide the proof for Proposition 4 (\textit{EU in terms of mutual information
% violates property A3}) for $Q, Q^\prime \in \ksimplextwo$.

% Let $X, Y: \Omega \rightarrow \R$ be two random variables such that $P_X = \frac{1}{2} \dirac[0.25] + \frac{1}{2} \dirac[0.75]$ and $P_Y = \unif[0, 1]$. 
% Then, $P_Y$ is a mean-preserving-spread (MPS) of $P_X$.

% We show that $P_X$ second-order dominates $P_Y$, i.e., $P_X \succeq_2 P_Y$. 
% Denote by $F_X(x) = P_X([-\infty, x])$ the cumulative distribution function (cdf) of the random variable $X$, and $F_Y(x) = P_Y([- \infty, x] )$ the cdf of $Y$, respectively. 
% Note that $\mathbb{E}[X] = \mathbb{E}[Y] = \half$.

% Now, it is sufficient to show that
% \begin{align}
%  \forall t \in \R: \; \int_{- \infty}^{t} F_Y(x) dx \geq \int_{- \infty}^{t} F_X(x) dx. 
%  \label{eq:sod}
%  \end{align}

% First, we have the following cdfs:
% \begin{align*}
%      F_X(x) &= 
%   \begin{cases}
%     0, & \text{for } x < 0.25 \\
%     0.5, & \text{for } 0.25 \leq x < 0.75 \\
%     1, & \text{for }  x \geq 0.75
%   \end{cases}  \\[0.2cm]
%      F_Y(x) &= 
%   \begin{cases}
%     0, & \text{for } x < 0 \\
%     x , & \text{for } 0 \leq x < 1 \\
%     1, & \text{for }  x \geq 1
%   \end{cases}
% \end{align*}
% Then, integration yields for $t \in \R$
% \begin{align*}
%     \int_{- \infty}^{t} F_X(x) dx &= 
%   \begin{cases}
%     0, & \text{for } t < 0 \\
%     0, & \text{for } 0 \leq t < 0.25 \\
%     0.5t - 0.125, & \text{for } 0.25 \leq t < 0.75 \\
%     t - 0.5, & \text{for } 0.75 \leq t < 1 \\
%     t - 0.5, & \text{for } t \geq 1
%   \end{cases}  \\[0.2cm]
%     \int_{- \infty}^{t} F_Y(x) dx &= 
%   \begin{cases}
%     0, & \text{for } t < 0 \\
%     0.5t^2, & \text{for } 0 \leq t < 0.25 \\
%     0.5t^2, & \text{for } 0.25 \leq t < 0.75 \\
%     0.5t^2, & \text{for } 0.75 \leq t < 1 \\
%     t - 0.5, & \text{for } t \geq 1
%   \end{cases}
% \end{align*}
% Thus, inequality~(\ref{eq:sod}) is obviously satisfied for all $t \in \R$. This proves $P_X \succeq_2 P_Y$. 
% Since equality of expected values and second-order dominance constitute a sufficient condition for MPS, this concludes the proof of the claim. 

\onecolumn

\begin{table}
    \centering
    % \small
        \caption{Results for sample size (Laplace approximation, \texttt{MNIST}) with ensemble of $M = 5$. Mean and standard deviation are obtained by aggregating over all possible ensembles of size five that can be sampled from the ten predictions of the original experiment.}
    \label{tab:samples_la_mnist}
\begin{tabular}{|l|l|l|l|l|r|r|r|}
\hline
\textbf{Experiment} & \textbf{Case} & \textbf{Probabilistic learner} & \textbf{Dataset} & \textbf{Measure} & \textbf{Mean} & \textbf{Standard deviation} & $M = 10$\\
\hline
sample size & 1 & Laplace approximation & MNIST & TU & 0.5918 & 0.0451 & 0.7754\\
\hline
sample size & 1 & Laplace approximation & MNIST & AU & 0.0091 & 0.0012 & 0.0091\\
\hline
sample size & 1 & Laplace approximation & MNIST & EU & 0.5827 & 0.0449 & 0.7663\\
\hline
sample size & 2 & Laplace approximation & MNIST & TU & 0.5794 & 0.0436 & 0.7419\\
\hline
sample size & 2 & Laplace approximation & MNIST & AU & 0.0386 & 0.0037 & 0.0388\\
\hline
sample size & 2 & Laplace approximation & MNIST & EU & 0.5408 & 0.0416 & 0.7031\\
\hline
sample size & 5 & Laplace approximation & MNIST & TU & 0.5370 & 0.0416 & 0.6716\\
\hline
sample size & 5 & Laplace approximation & MNIST & AU & 0.0631 & 0.0053 & 0.0634\\
\hline
sample size & 5 & Laplace approximation & MNIST & EU & 0.4738 & 0.0391 & 0.6083\\
\hline
sample size & 10 & Laplace approximation & MNIST & TU & 0.4578 & 0.0364 & 0.5760\\
\hline
sample size & 10 & Laplace approximation & MNIST & AU & 0.0447 & 0.0039 & 0.0449\\
\hline
sample size & 10 & Laplace approximation & MNIST & EU & 0.4131 & 0.0341 & 0.5311\\
\hline
sample size & 50 & Laplace approximation & MNIST & TU & 0.1756 & 0.0247 & 0.2147\\
\hline
sample size & 50 & Laplace approximation & MNIST & AU & 0.0354 & 0.0033 & 0.0355\\
\hline
sample size & 50 & Laplace approximation & MNIST & EU & 0.1402 & 0.0221 & 0.1791\\
\hline
sample size & 100 & Laplace approximation & MNIST & TU & 0.0793 & 0.0116 & 0.0947\\
\hline
sample size & 100 & Laplace approximation & MNIST & AU & 0.0223 & 0.0022 & 0.0224\\
\hline
sample size & 100 & Laplace approximation & MNIST & EU & 0.0570 & 0.0096 & 0.0723\\
\hline
\end{tabular}
\end{table}

\begin{table}
    \centering
    % \small
        \caption{Results for sample size (deep ensemble, \texttt{MNIST}) with ensemble of $M = 5$. Mean and standard deviation are obtained by aggregating over all possible ensembles of size five that can be sampled from the ten predictions of the original experiment.}
    \label{tab:samples_de_mnist}
\begin{tabular}{|l|l|l|l|l|r|r|r|}
\hline
\textbf{Experiment} & \textbf{Case} & \textbf{Probabilistic learner} & \textbf{Dataset} & \textbf{Measure} & \textbf{Mean} & \textbf{Standard deviation} & $M = 10$\\
\hline
sample size & 1 & deep ensemble & MNIST & TU & 0.0487 & 0.0110 & 0.0518\\
\hline
sample size & 1 & deep ensemble & MNIST & AU & 0.0343 & 0.0065 & 0.0344\\
\hline
sample size & 1 & deep ensemble & MNIST & EU & 0.0144 & 0.0049 & 0.0174\\
\hline
sample size & 2 & deep ensemble & MNIST & TU & 0.0381 & 0.0042 & 0.0404\\
\hline
sample size & 2 & deep ensemble & MNIST & AU & 0.0257 & 0.0032 & 0.0258\\
\hline
sample size & 2 & deep ensemble & MNIST & EU & 0.0124 & 0.0016 & 0.0146\\
\hline
sample size & 5 & deep ensemble & MNIST & TU & 0.0212 & 0.0017 & 0.0223\\
\hline
sample size & 5 & deep ensemble & MNIST & AU & 0.0152 & 0.0012 & 0.0153\\
\hline
sample size & 5 & deep ensemble & MNIST & EU & 0.0060 & 0.0011 & 0.0070\\
\hline
sample size & 10 & deep ensemble & MNIST & TU & 0.0137 & 0.0011 & 0.0144\\
\hline
sample size & 10 & deep ensemble & MNIST & AU & 0.0098 & 0.0007 & 0.0099\\
\hline
sample size & 10 & deep ensemble & MNIST & EU & 0.0039 & 0.0007 & 0.0045\\
\hline
sample size & 50 & deep ensemble & MNIST & TU & 0.0082 & 0.0007 & 0.0087\\
\hline
sample size & 50 & deep ensemble & MNIST & AU & 0.0051 & 0.0004 & 0.0051\\
\hline
sample size & 50 & deep ensemble & MNIST & EU & 0.0031 & 0.0004 & 0.0036\\
\hline
sample size & 100 & deep ensemble & MNIST & TU & 0.0067 & 0.0008 & 0.0072\\
\hline
sample size & 100 & deep ensemble & MNIST & AU & 0.0042 & 0.0004 & 0.0042\\
\hline
sample size & 100 & deep ensemble & MNIST & EU & 0.0025 & 0.0004 & 0.0030\\
\hline
\end{tabular}
\end{table}

\begin{table}
    \centering
    % \small
        \caption{Results for sample size (Laplace approximation, \texttt{CIFAR10}) with ensemble of $M = 5$. Mean and standard deviation are obtained by aggregating over all possible ensembles of size five that can be sampled from the ten predictions of the original experiment.}
    \label{tab:samples_la_cifar}
\begin{tabular}{|l|l|l|l|l|r|r|r|}
\hline
\textbf{Experiment} & \textbf{Case} & \textbf{Probabilistic learner} & \textbf{Dataset} & \textbf{Measure} & \textbf{Mean} & \textbf{Standard deviation} & $M = 10$\\
\hline
sample size & 1 & Laplace approximation & CIFAR10 & TU & 0.8171 & 0.0629 & 0.8507\\
\hline
sample size & 1 & Laplace approximation & CIFAR10 & AU & 0.6339 & 0.0587 & 0.6364\\
\hline
sample size & 1 & Laplace approximation & CIFAR10 & EU & 0.1832 & 0.0290 & 0.2143\\
\hline
sample size & 2 & Laplace approximation & CIFAR10 & TU & 0.5742 & 0.0368 & 0.6030\\
\hline
sample size & 2 & Laplace approximation & CIFAR10 & AU & 0.4337 & 0.0277 & 0.4354\\
\hline
sample size & 2 & Laplace approximation & CIFAR10 & EU & 0.1405 & 0.0092 & 0.1676\\
\hline
sample size & 5 & Laplace approximation & CIFAR10 & TU & 0.3823 & 0.0247 & 0.4114\\
\hline
sample size & 5 & Laplace approximation & CIFAR10 & AU & 0.2449 & 0.0158 & 0.2459\\
\hline
sample size & 5 & Laplace approximation & CIFAR10 & EU & 0.1374 & 0.0089 & 0.1655\\
\hline
sample size & 10 & Laplace approximation & CIFAR10 & TU & 0.3000 & 0.0199 & 0.3268\\
\hline
sample size & 10 & Laplace approximation & CIFAR10 & AU & 0.1776 & 0.0117 & 0.1783\\
\hline
sample size & 10 & Laplace approximation & CIFAR10 & EU & 0.1224 & 0.0083 & 0.1485\\
\hline
sample size & 50 & Laplace approximation & CIFAR10 & TU & 0.1327 & 0.0089 & 0.1460\\
\hline
sample size & 50 & Laplace approximation & CIFAR10 & AU & 0.0724 & 0.0048 & 0.0727\\
\hline
sample size & 50 & Laplace approximation & CIFAR10 & EU & 0.0603 & 0.0043 & 0.0733\\
\hline
sample size & 100 & Laplace approximation & CIFAR10 & TU & 0.0690 & 0.0047 & 0.0736\\
\hline
sample size & 100 & Laplace approximation & CIFAR10 & AU & 0.0460 & 0.0030 & 0.0461\\
\hline
sample size & 100 & Laplace approximation & CIFAR10 & EU & 0.0231 & 0.0018 & 0.0275\\
\hline
\end{tabular}
\end{table}

\begin{table}
    \centering
    % \small
        \caption{Results for sample size (deep ensemble, \texttt{CIFAR10}) with ensemble of $M = 5$. Mean and standard deviation are obtained by aggregating over all possible ensembles of size five that can be sampled from the ten predictions of the original experiment.}
    \label{tab:samples_de_cifar}
\begin{tabular}{|l|l|l|l|l|r|r|r|}
\hline
\textbf{Experiment} & \textbf{Case} & \textbf{Probabilistic learner} & \textbf{Dataset} & \textbf{Measure} & \textbf{Mean} & \textbf{Standard deviation} & $M = 10$\\
\hline
sample size & 1 & deep ensemble & CIFAR10 & TU & 0.9022 & 0.0715 & 0.9425\\
\hline
sample size & 1 & deep ensemble & CIFAR10 & AU & 0.7073 & 0.1223 & 0.7100\\
\hline
sample size & 1 & deep ensemble & CIFAR10 & EU & 0.1949 & 0.0770 & 0.2324\\
\hline
sample size & 2 & deep ensemble & CIFAR10 & TU & 0.7189 & 0.0652 & 0.7750\\
\hline
sample size & 2 & deep ensemble & CIFAR10 & AU & 0.4410 & 0.0676 & 0.4428\\
\hline
sample size & 2 & deep ensemble & CIFAR10 & EU & 0.2778 & 0.0404 & 0.3322\\
\hline
sample size & 5 & deep ensemble & CIFAR10 & TU & 0.4204 & 0.0273 & 0.4523\\
\hline
sample size & 5 & deep ensemble & CIFAR10 & AU & 0.2519 & 0.0173 & 0.2529\\
\hline
sample size & 5 & deep ensemble & CIFAR10 & EU & 0.1685 & 0.0113 & 0.1995\\
\hline
sample size & 10 & deep ensemble & CIFAR10 & TU & 0.2826 & 0.0183 & 0.3072\\
\hline
sample size & 10 & deep ensemble & CIFAR10 & AU & 0.1545 & 0.0106 & 0.1551\\
\hline
sample size & 10 & deep ensemble & CIFAR10 & EU & 0.1281 & 0.0082 & 0.1521\\
\hline
sample size & 50 & deep ensemble & CIFAR10 & TU & 0.1318 & 0.0131 & 0.1458\\
\hline
sample size & 50 & deep ensemble & CIFAR10 & AU & 0.0617 & 0.0076 & 0.0620\\
\hline
sample size & 50 & deep ensemble & CIFAR10 & EU & 0.0701 & 0.0057 & 0.0838\\
\hline
sample size & 100 & deep ensemble & CIFAR10 & TU & 0.1064 & 0.0176 & 0.1187\\
\hline
sample size & 100 & deep ensemble & CIFAR10 & AU & 0.0480 & 0.0101 & 0.0482\\
\hline
sample size & 100 & deep ensemble & CIFAR10 & EU & 0.0585 & 0.0078 & 0.0706\\
\hline
\end{tabular}
\end{table}

\begin{table}
    \centering
    % \small
        \caption{Results for image resolution (Laplace approximation, \texttt{MNIST}) with ensemble of $M = 5$. Mean and standard deviation are obtained by aggregating over all possible ensembles of size five that can be sampled from the ten predictions of the original experiment.}
    \label{tab:res_la_mnist}
\begin{tabular}{|l|l|l|l|l|r|r|r|}
\hline
\textbf{Experiment} & \textbf{Case} & \textbf{Probabilistic learner} & \textbf{Dataset} & \textbf{Measure} & \textbf{Mean} & \textbf{Standard deviation} & $M = 10$\\
\hline
image resolution & 5 & Laplace approximation & MNIST & TU & 0.7660 & 0.0571 & 0.8540\\
\hline
image resolution & 5 & Laplace approximation & MNIST & AU & 0.3781 & 0.0387 & 0.3796\\
\hline
image resolution & 5 & Laplace approximation & MNIST & EU & 0.3879 & 0.0418 & 0.4744\\
\hline
image resolution & 10 & Laplace approximation & MNIST & TU & 0.6475 & 0.0463 & 0.6996\\
\hline
image resolution & 10 & Laplace approximation & MNIST & AU & 0.3847 & 0.0348 & 0.3862\\
\hline
image resolution & 10 & Laplace approximation & MNIST & EU & 0.2628 & 0.0317 & 0.3134\\
\hline
image resolution & 25 & Laplace approximation & MNIST & TU & 0.1149 & 0.0099 & 0.1261\\
\hline
image resolution & 25 & Laplace approximation & MNIST & AU & 0.0634 & 0.0048 & 0.0636\\
\hline
image resolution & 25 & Laplace approximation & MNIST & EU & 0.0516 & 0.0057 & 0.0624\\
\hline
image resolution & 50 & Laplace approximation & MNIST & TU & 0.0787 & 0.0087 & 0.0923\\
\hline
image resolution & 50 & Laplace approximation & MNIST & AU & 0.0259 & 0.0024 & 0.0260\\
\hline
image resolution & 50 & Laplace approximation & MNIST & EU & 0.0528 & 0.0065 & 0.0663\\
\hline
image resolution & 100 & Laplace approximation & MNIST & TU & 0.0703 & 0.0108 & 0.0833\\
\hline
image resolution & 100 & Laplace approximation & MNIST & AU & 0.0212 & 0.0024 & 0.0213\\
\hline
image resolution & 100 & Laplace approximation & MNIST & EU & 0.0491 & 0.0085 & 0.0620\\
\hline
\end{tabular}
\end{table}

\begin{table}
    \centering
    % \small
        \caption{Results for image resolution (deep ensemble, \texttt{MNIST}) with ensemble of $M = 5$. Mean and standard deviation are obtained by aggregating over all possible ensembles of size five that can be sampled from the ten predictions of the original experiment.}
    \label{tab:res_de_mnist}
\begin{tabular}{|l|l|l|l|l|r|r|r|}
\hline
\textbf{Experiment} & \textbf{Case} & \textbf{Probabilistic learner} & \textbf{Dataset} & \textbf{Measure} & \textbf{Mean} & \textbf{Standard deviation} & $M = 10$\\
\hline
image resolution & 5 & deep ensemble & MNIST & TU & 0.7635 & 0.0487 & 0.7672\\
\hline
image resolution & 5 & deep ensemble & MNIST & AU & 0.7585 & 0.0484 & 0.7615\\
\hline
image resolution & 5 & deep ensemble & MNIST & EU & 0.0050 & 0.0008 & 0.0056\\
\hline
image resolution & 10 & deep ensemble & MNIST & TU & 0.5378 & 0.0350 & 0.5413\\
\hline
image resolution & 10 & deep ensemble & MNIST & AU & 0.5272 & 0.0344 & 0.5292\\
\hline
image resolution & 10 & deep ensemble & MNIST & EU & 0.0107 & 0.0013 & 0.0121\\
\hline
image resolution & 25 & deep ensemble & MNIST & TU & 0.0519 & 0.0039 & 0.0537\\
\hline
image resolution & 25 & deep ensemble & MNIST & AU & 0.0421 & 0.0030 & 0.0423\\
\hline
image resolution & 25 & deep ensemble & MNIST & EU & 0.0098 & 0.0012 & 0.0114\\
\hline
image resolution & 50 & deep ensemble & MNIST & TU & 0.0088 & 0.0008 & 0.0093\\
\hline
image resolution & 50 & deep ensemble & MNIST & AU & 0.0058 & 0.0004 & 0.0059\\
\hline
image resolution & 50 & deep ensemble & MNIST & EU & 0.0030 & 0.0004 & 0.0035\\
\hline
image resolution & 100 & deep ensemble & MNIST & TU & 0.0074 & 0.0006 & 0.0079\\
\hline
image resolution & 100 & deep ensemble & MNIST & AU & 0.0046 & 0.0004 & 0.0046\\
\hline
image resolution & 100 & deep ensemble & MNIST & EU & 0.0028 & 0.0003 & 0.0033\\
\hline
\end{tabular}
\end{table}

\begin{table}
    \centering
    % \small
        \caption{Results for image resolution (Laplace approximation, \texttt{CIFAR10}) with ensemble of $M = 5$. Mean and standard deviation are obtained by aggregating over all possible ensembles of size five that can be sampled from the ten predictions of the original experiment.}
    \label{tab:res_la_cifar}
\begin{tabular}{|l|l|l|l|l|r|r|r|}
\hline
\textbf{Experiment} & \textbf{Case} & \textbf{Probabilistic learner} & \textbf{Dataset} & \textbf{Measure} & \textbf{Mean} & \textbf{Standard deviation} & $M = 10$\\
\hline
image resolution & 5 & Laplace approximation & CIFAR10 & TU & 0.7137 & 0.0451 & 0.7171\\
\hline
image resolution & 5 & Laplace approximation & CIFAR10 & AU & 0.7093 & 0.0448 & 0.7122\\
\hline
image resolution & 5 & Laplace approximation & CIFAR10 & EU & 0.0043 & 0.0003 & 0.0049\\
\hline
image resolution & 10 & Laplace approximation & CIFAR10 & TU & 0.2676 & 0.0169 & 0.2697\\
\hline
image resolution & 10 & Laplace approximation & CIFAR10 & AU & 0.2593 & 0.0164 & 0.2603\\
\hline
image resolution & 10 & Laplace approximation & CIFAR10 & EU & 0.0083 & 0.0005 & 0.0095\\
\hline
image resolution & 25 & Laplace approximation & CIFAR10 & TU & 0.1151 & 0.0073 & 0.1176\\
\hline
image resolution & 25 & Laplace approximation & CIFAR10 & AU & 0.1007 & 0.0064 & 0.1011\\
\hline
image resolution & 25 & Laplace approximation & CIFAR10 & EU & 0.0144 & 0.0010 & 0.0165\\
\hline
image resolution & 50 & Laplace approximation & CIFAR10 & TU & 0.0835 & 0.0055 & 0.0890\\
\hline
image resolution & 50 & Laplace approximation & CIFAR10 & AU & 0.0545 & 0.0036 & 0.0547\\
\hline
image resolution & 50 & Laplace approximation & CIFAR10 & EU & 0.0290 & 0.0021 & 0.0343\\
\hline
image resolution & 100 & Laplace approximation & CIFAR10 & TU & 0.0690 & 0.0047 & 0.0736\\
\hline
image resolution & 100 & Laplace approximation & CIFAR10 & AU & 0.0460 & 0.0030 & 0.0461\\
\hline
image resolution & 100 & Laplace approximation & CIFAR10 & EU & 0.0231 & 0.0018 & 0.0275\\
\hline
\end{tabular}
\end{table}

\begin{table}
    \centering
    % \small
        \caption{Results for image resolution (deep ensemble, \texttt{CIFAR10}) with ensemble of $M = 5$. Mean and standard deviation are obtained by aggregating over all possible ensembles of size five that can be sampled from the ten predictions of the original experiment.}
    \label{tab:res_de_cifar}
\begin{tabular}{|l|l|l|l|l|r|r|r|}
\hline
\textbf{Experiment} & \textbf{Case} & \textbf{Probabilistic learner} & \textbf{Dataset} & \textbf{Measure} & \textbf{Mean} & \textbf{Standard deviation} & $M = 10$\\
\hline
image resolution & 5 & deep ensemble & CIFAR10 & TU & 0.7351 & 0.0467 & 0.7425\\
\hline
image resolution & 5 & deep ensemble & CIFAR10 & AU & 0.7012 & 0.0446 & 0.7040\\
\hline
image resolution & 5 & deep ensemble & CIFAR10 & EU & 0.0339 & 0.0027 & 0.0386\\
\hline
image resolution & 10 & deep ensemble & CIFAR10 & TU & 0.3727 & 0.0248 & 0.3900\\
\hline
image resolution & 10 & deep ensemble & CIFAR10 & AU & 0.2752 & 0.0197 & 0.2763\\
\hline
image resolution & 10 & deep ensemble & CIFAR10 & EU & 0.0975 & 0.0067 & 0.1137\\
\hline
image resolution & 25 & deep ensemble & CIFAR10 & TU & 0.2084 & 0.0187 & 0.2265\\
\hline
image resolution & 25 & deep ensemble & CIFAR10 & AU & 0.1134 & 0.0126 & 0.1138\\
\hline
image resolution & 25 & deep ensemble & CIFAR10 & EU & 0.0950 & 0.0068 & 0.1127\\
\hline
image resolution & 50 & deep ensemble & CIFAR10 & TU & 0.1174 & 0.0088 & 0.1302\\
\hline
image resolution & 50 & deep ensemble & CIFAR10 & AU & 0.0527 & 0.0045 & 0.0529\\
\hline
image resolution & 50 & deep ensemble & CIFAR10 & EU & 0.0648 & 0.0046 & 0.0773\\
\hline
image resolution & 100 & deep ensemble & CIFAR10 & TU & 0.1045 & 0.0137 & 0.1160\\
\hline
image resolution & 100 & deep ensemble & CIFAR10 & AU & 0.0480 & 0.0084 & 0.0482\\
\hline
image resolution & 100 & deep ensemble & CIFAR10 & EU & 0.0565 & 0.0055 & 0.0679\\
\hline
\end{tabular}
\end{table}



% Make the database entries available (bibentry) without printing a reference
% list. The database is named without its .bib extension, since BibTeX
% implementations may append the extension themselves.
\nobibliography{references}

\end{document}