\documentclass{midl}

\usepackage{bm}
\usepackage{booktabs}
\usepackage{graphicx}
\usepackage{multirow}
\usepackage{float}

\newtheorem{prop}{Proposition}

\newcommand{\STAB}[1]{\begin{tabular}{@{}c@{}}#1\end{tabular}}

\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator{\Forall}{\forall}
\newcommand{\transp}{\mathsf{T}}

\jmlrvolume{XX}
\jmlryear{2020}
\jmlrworkshop{Full Paper -- MIDL 2020}
%\editors{Under Review for MIDL 2020}

\title[Well-Calibrated Regression Uncertainty]{Well-Calibrated Regression Uncertainty\\ in Medical Imaging with Deep Learning}

\midlauthor{\Name{Max-Heinrich Laves\textsuperscript{1}} \Email{laves@imes.uni-hannover.de}\\
\Name{Sontje Ihler\textsuperscript{1}} \Email{ihler@imes.uni-hannover.de}\\
\Name{Jacob F.\ Fast\textsuperscript{1}} \Email{fast@imes.uni-hannover.de}\\
\Name{Lüder A.\ Kahrs\textsuperscript{2,3}} \Email{lakahrs@cs.toronto.edu}\\
\Name{Tobias Ortmaier\textsuperscript{1}} \Email{ortmaier@imes.uni-hannover.de}\\
\addr {}\textsuperscript{1}Institute of Mechatronic Systems, Leibniz Universität Hannover, Hanover, Germany\\
\addr {}\textsuperscript{2}Centre for Image Guided Innovation and Therapeutic Intervention, The Hospital for Sick Children, Toronto, Canada \\
\addr {}\textsuperscript{3}Department of Mathematical and Computational Sciences, University of Toronto Mississauga, Mississauga, Canada
}

\begin{document}

\maketitle

\begin{abstract}
% Purpose
The consideration of predictive uncertainty in medical imaging with deep learning is of utmost importance.
We apply estimation of predictive uncertainty by variational Bayesian inference with Monte Carlo dropout to regression tasks and show why predictive uncertainty is systematically underestimated.
We suggest using $ \sigma $ \emph{scaling} with a single scalar value, a simple yet effective calibration method for both aleatoric and epistemic uncertainty.
The performance of our approach is evaluated on a variety of common medical regression data sets using different state-of-the-art convolutional network architectures.
% Results
In all experiments, $ \sigma $ scaling is able to reliably recalibrate predictive uncertainty.
It is easy to implement and maintains the accuracy.
% Conclusion
Well-calibrated uncertainty in regression allows robust rejection of unreliable predictions or detection of out-of-distribution samples.
%
Our source code is available at:
\href{https://github.com/mlaves/well-calibrated-regression-uncertainty}{github.com/mlaves/well-calibrated-regression-uncertainty}
\end{abstract}

\begin{keywords}
Bayesian approximation, variational inference
\end{keywords}

\section{Introduction}
\label{sec:intro}

%In recent years, deep learning methods have advanced the state-of-the-art in a variety of domains of medical image analysis.
For the task of regression, we aim to estimate a continuous target value $ \bm{y} \in \mathbb{R}^{d} $ given an input image $ \bm{x} $.
Regression in medical imaging with deep learning has been applied to forensic age estimation from hand CT/MRI \cite{Halabi2018,Stern2016}, natural landmark localization \cite{Payer2019}, cell detection in histology \cite{Xie2018}, or instrument pose estimation \cite{Gessert2018}.
By predicting the coordinates of object boundaries, segmentation can also be performed as a regression task.
This has been done for segmentation of pulmonary nodules in CT \cite{Messay2015}, kidneys in ultrasound \cite{Yin2019}, or left ventricles in MRI \cite{Tan2017}.
In registration of medical images, a continuous displacement field is predicted for each coordinate of $ \bm{x} $, which has also recently been addressed by CNNs for regression \cite{Dalca2019}.

In medical imaging it is crucial to consider the predictive uncertainty of deep learning models.
Bayesian neural networks (BNN) and their approximation provide mathematical tools for reasoning about uncertainty \cite{Bishop2006,Kingma2013}.
In general, predictive uncertainty can be split into two types: aleatoric and epistemic uncertainty \cite{Kendall2017}.
Aleatoric uncertainty arises from the data directly; e.\,g.\ sensor noise or motion artifacts.
In regression, it is derived from the conditional log-likelihood under the maximum likelihood framework and can be captured by a deep model directly (see §\,\ref{sec:cond-log-likelihood}).
Epistemic uncertainty is caused by uncertainty in the model parameters due to a limited amount of training data \cite{Bishop2006}.
A well-accepted approach to quantify epistemic uncertainty is variational inference with Monte Carlo (MC) dropout, where dropout is used at test time to sample from the approximate posterior \cite{Gal2016}.

\begin{figure}
    \centering
    \includegraphics[scale=0.9]{opener_results.pdf}
    \caption{Calibration plots and uncertainty calibration error (UCE) for EfficientNet-B4 on BreastPathQ test set. Uncalibrated uncertainty is underestimated and does not correspond well with the model error (left). Uncertainty can be calibrated most effectively with $ \sigma $ scaling (right). Solid lines show the mean and shaded areas show standard deviation from 5 repeated runs. Dashed lines denote perfect calibration.}
    \label{fig:opener}
\end{figure}

However, uncertainty obtained by deep BNNs tends to be miscalibrated, i.\,e.\ it does not correspond well with the model error\,\cite{Laves2019NIPS}.
Fig.\,\ref{fig:opener} shows calibration plots (predictive error vs.\ uncertainty) for uncalibrated and calibrated uncertainty estimates.
The predictive uncertainty (taking into account both epistemic and aleatoric uncertainty) is underestimated and does not allow robust detection of uncertain predictions at test time.

Calibration of uncertainty in regression has been addressed in prior work.
In \cite{Kuleshov2018}, inaccurate uncertainties from Bayesian models for regression are recalibrated using a technique inspired by Platt scaling.
Given a pre-trained, miscalibrated model $ \bm{H} $, an auxiliary model $ \bm{R} : [0,1]^{d} \rightarrow [0,1]^{d} $ is trained that yields a calibrated regressor $ \bm{R} \circ \bm{H} $.
In \cite{Phan2018}, this method was applied to bounding box regression.
However, an auxiliary model with enough capacity will always be able to recalibrate, even if the predicted uncertainty is completely uncorrelated with the real uncertainty.
Furthermore, Kuleshov et al.\ state that calibration via $ \bm{R} $ is possible if enough independent and identically distributed (i.i.d.) data is available.
In medical imaging, large data sets are usually hard to obtain, which can cause $ \bm{R} $ to overfit the calibration set (as we will show later).
This downside was addressed in \cite{Levi2019}, which is most related to our work.
They proposed to scale the standard deviation of a Gaussian model to recalibrate aleatoric uncertainty.
In contrast to our work, they do not take into account epistemic uncertainty, which is an important source of uncertainty, especially when dealing with small data sets in medical imaging.

To the best of our knowledge, calibration of predictive uncertainty for regression tasks in medical imaging has not been addressed. Our main contributions are:
(1) We analyze and provide theoretical background why deep models for regression are miscalibrated with regard to predictive uncertainty,
(2) we suggest using $ \sigma $ \emph{scaling} in a separate calibration phase to tackle underestimation of uncertainty, and
(3) we perform extensive experiments on four different data sets to show the effectiveness of the proposed method.

\section{Methods}

In this section, we discuss estimation of aleatoric and epistemic uncertainty for regression and show why uncertainty is systematically miscalibrated.
We propose to use $ \sigma $ scaling to jointly calibrate aleatoric and epistemic uncertainty.

\subsection{Conditional Log-Likelihood for Regression}
\label{sec:cond-log-likelihood}

We revisit regression under the maximum a posteriori (MAP) framework to derive direct estimation of heteroscedastic aleatoric uncertainty.
The goal of our regression model is to predict a target value $ \bm{y} $ given some new input $ \bm{x} $ and a training set $ \mathcal{D} $ of $ m $ inputs $ \{ \bm{x}_1, \ldots, \bm{x}_m \} $ and their corresponding (observed) target values $ \{ \bm{y}_1, \ldots , \bm{y}_m \} $.
We assume that $ \bm{y} $ has a Gaussian distribution $ \mathcal{N} \left( \bm{y} ; \hat{\bm{y}}(\bm{x}), \hat{\sigma}^{2}(\bm{x}) \right) $ with mean equal to $ \hat{\bm{y}}(\bm{x}) $ and variance $ \hat{\sigma}^{2}(\bm{x}) $.
A neural network with parameters $ \bm{\theta} $
\begin{equation}
    \bm{f}_{\bm{\theta}} \left( \bm{x} \right) = \left[ \hat{\bm{y}}(\bm{x}), \hat{\sigma}^{2} (\bm{x}) \right] , ~ \hat{\bm{y}} \in \mathbb{R}^{d} , ~ \hat{\sigma}^{2} \geq 0
    \label{eq:neural_net}
\end{equation}
outputs these values for a given input.
%We use a Gaussian to model the likelihood and define
%\begin{equation}
%    p \left( \bm{y} \vert \bm{x} \right) = \mathcal{N} \left( \bm{y} ; \hat{\bm{y}} (\bm{x}), \hat{\sigma}^{2}(\bm{x}) \right) ~ .
%    \label{eq:gaussian_likelihood}
%\end{equation}
By assuming a Gaussian prior over the parameters $ \bm{\theta} \sim \mathcal{N}(\bm{\theta} ; \bm{0}, \lambda^{-1} \bm{I}) $, MAP estimation becomes maximum-likelihood estimation with added weight decay \cite{Bishop2006}.
With $ m $ i.i.d.\ random samples, the conditional log-likelihood is given by
\begin{align} 
%    & \sum_{i=1}^{m} \log p \left( \bm{y}^{(i)} \vert \bm{x}^{(i)} ; \hat{\bm{y}}_{\bm{\theta}}^{(i)} , \big( \hat{\sigma}^{(i)}_{\bm{\theta}} \big)^{2} \right) \\
    & \sum_{i=1}^{m} \log \left( \frac{1}{\sqrt{2\pi} \hat{\sigma}^{(i)}_{\bm{\theta}}} \exp \left\{ - \frac{\big\Vert \bm{y}^{(i)} - \hat{\bm{y}}_{\bm{\theta}}^{(i)} \big\Vert^{2}}{2 \big( \hat{\sigma}^{(i)}_{\bm{\theta}} \big)^{2} } \right\} \right) \\
    = & - \dfrac{m}{2} \log \left( 2\pi \right) - \sum_{i=1}^{m} \left[ \log \big( \hat{\sigma}^{(i)}_{\bm{\theta}} \big) + \frac{1}{2 \big( \hat{\sigma}_{\bm{\theta}}^{(i)} \big)^{2} } \big\Vert \bm{y}^{(i)} - \hat{\bm{y}}_{\bm{\theta}}^{(i)} \big\Vert^{2} \right] ~ .
        \label{eq:gaussian_derive}
\end{align}
The dependence on $ \bm{x} $ has been omitted to simplify the notation. 
Maximizing the log-likelihood in Eq.\,(\ref{eq:gaussian_derive}) w.r.t.\ $ \bm{\theta} $ is equivalent to minimizing the negative log-likelihood (NLL), which leads to the following optimization criterion (with weight decay)
\begin{equation}
    \mathcal{L}_{\mathrm{G}}(\bm{\theta}) = \sum_{i=1}^{m} \big( \hat{\sigma}^{(i)}_{\bm{\theta}} \big)^{-2} \big\Vert \bm{y}^{(i)} - \hat{\bm{y}}_{\bm{\theta}}^{(i)} \big\Vert^{2} + \log \big( ( \hat{\sigma}_{\bm{\theta}}^{(i)} )^{2} \big) ~ .
    \label{eq:loss_gaussian}
\end{equation}
%
Here, $ \hat{\bm{y}}_{\bm{\theta}} $ and $ \hat{\sigma}_{\bm{\theta}} $ are estimated jointly by finding $ \bm{\theta} $ that minimizes Eq.\,(\ref{eq:loss_gaussian}).
This can be achieved using gradient descent in a standard training procedure.
In this case, $ \hat{\sigma}_{\bm{\theta}} $ captures the uncertainty that is inherent in the data (aleatoric uncertainty).
To avoid numerical instability due to potential division by zero, we directly estimate $ \log \hat{\sigma}^{2} (\bm{x}) $ and implement Eq.\,(\ref{eq:loss_gaussian}) in similar practice to \citet{Kendall2017}.

\subsection{Biased estimation of \texorpdfstring{$ \sigma $}{Sigma}}

\begin{figure}
    \centering
    \includegraphics[scale=0.9]{sigma_overfit.pdf}
    \caption{Biased estimation of aleatoric uncertainty $ \sigma^{2} $. The deep model overfits estimation of $ \bm{y} $ on the training set. On unseen test data, the MSE of predictive mean is higher and $ \sigma^{2} $ is underestimated. Early stopping (e.g.\ at epoch 50) would result in an unbiased estimator, but this would not be optimal in terms of test MSE.}
    \label{fig:biased_estimation}
\end{figure}
Ignoring the dependence through $ \bm{\theta} $, the solution to Eq.\,(\ref{eq:loss_gaussian}) decouples estimation of $ \hat{\bm{y}} $ and $ \hat{\sigma} $.
In case of a Gaussian likelihood, minimizing (\ref{eq:loss_gaussian}) w.r.t.\ $ \hat{\bm{y}}^{(i)} $ yields
\begin{equation}
    \hat{\bm{y}}^{(i)} = \argmin_{\hat{\bm{y}}^{(i)}} \mathcal{L}_{\mathrm{G}} = \bm{y}^{(i)} ~ \Forall i ~ .
\end{equation}
%Therefore, Eq.\,(\ref{eq:loss_gaussian}) gives an unbiased estimation of $ \bm{y}^{(i)} $.
Minimizing (\ref{eq:loss_gaussian}) w.r.t.\ $ (\hat{\sigma}^{(i)})^{2} $ yields
\begin{equation}
    \big( \hat{\sigma}^{(i)} \big)^{2} = \argmin_{ ( \hat{\sigma}^{(i)} )^{2}} \mathcal{L}_{\mathrm{G}} = \Vert \bm{y}^{(i)} - \hat{\bm{y}}^{(i)} \Vert^{2} ~ \Forall i ~ .
    \label{eq:perfect_argmin}
\end{equation}
That is, estimation of $ \sigma^{2} $ should perfectly reflect the squared error.
However, in (\ref{eq:perfect_argmin}) $ \sigma^{2} $ is estimated relative to the estimated mean $ \hat{\bm{y}} $ and therefore biased.
In fact, the maximum likelihood solution systematically underestimates $ \sigma^{2} $, which is a phenomenon of overfitting the training set \cite{Bishop2006}.
The squared error $ \Vert \bm{y} - \hat{\bm{y}} \Vert^{2} $ will be lower on the training set and $ \hat{\sigma}^{2} $ on new samples will be systematically too low (see Fig.\,\ref{fig:biased_estimation}).
This is a problem especially in deep learning, where large models have millions of parameters and tend to overfit.
Bias in estimation of $ \sigma^{2} $ can be corrected by a scaling factor.
Rescaling for unbiased estimation of the population variance via computing the sample variance is famously known as \emph{Bessel's correction}.
In our case, we introduce a simple learnable scalar parameter to rescale the biased estimation of $ \sigma^{2} $.

\subsection{\texorpdfstring{$ \sigma $}{Sigma} Scaling for Aleatoric Uncertainty}
\label{sec:scaling}

We first derive $ \sigma $ scaling for aleatoric uncertainty.
Using a Gaussian model, we scale the standard deviation $ \sigma $ with a scalar value $ s $ to recalibrate the probability density function
\begin{equation}
    p \left( \bm{y} \vert \bm{x} ; \hat{\bm{y}} (\bm{x}), \hat{\sigma}^{2}(\bm{x})  \right) = \mathcal{N} \left( \bm{y} ; \hat{\bm{y}} (\bm{x}), (s \cdot \hat{\sigma}(\bm{x}))^{2} \right) ~ .
\end{equation}
Now, the conditional log-likelihood is given by
\begin{equation}
    \sum_{i=1}^{m} \log p \left( \bm{y}^{(i)} \vert \bm{x}^{(i)} ; \hat{\bm{y}}_{\bm{\theta}}^{(i)} , \big(s \cdot \hat{\sigma}^{(i)}_{\bm{\theta}}\big)^{2} \right) ~ .
\end{equation}
This results in the following optimization objective
\begin{equation}
    \mathcal{L}_{\mathrm{G}}(s) = m \log(s) + \tfrac{1}{2} s^{-2} \sum_{i=1}^{m} \big( \hat{\sigma}_{\bm{\theta}}^{(i)} \big)^{-2} \big\Vert \bm{y}^{(i)} - \hat{\bm{y}}_{\bm{\theta}}^{(i)} \big\Vert^{2} ~ .
    \label{eq:loss_scaling_gaussian}
\end{equation}
Eq.\,(\ref{eq:loss_scaling_gaussian}) is optimized w.r.t.\ $ s $ with fixed $ \bm{\theta} $ using gradient descent in a separate calibration phase after training to calibrate aleatoric uncertainty measured by $ \hat{\sigma}_{\bm{\theta}}^{2} $.
In case of a single scalar, the solution to Eq.\,(\ref{eq:loss_scaling_gaussian}) can also be written in closed form as
\begin{equation}
    s = \pm \sqrt{\frac{1}{m} \sum_{i=1}^{m} \big( \hat{\sigma}_{\bm{\theta}}^{(i)} \big)^{-2} \big\Vert \bm{y}^{(i)} - \hat{\bm{y}}_{\bm{\theta}}^{(i)} \big\Vert^{2}} ~ .
\end{equation}
We apply $ \sigma $ scaling to jointly calibrate aleatoric and epistemic uncertainty in the next section.

\subsection{Well-Calibrated Estimation of Predictive Uncertainty}
\label{sec:well}

So far we have assumed a MAP point estimate for $ \bm{\theta} $ which does not consider uncertainty in the parameters.
To quantify both aleatoric and epistemic uncertainty, we extend $ \bm{f}_{\bm{\theta}} $ into a fully Bayesian model under the variational inference framework with Monte Carlo dropout \cite{Gal2016}.
In MC dropout, the model $ \bm{f}_{\tilde{\bm{\theta}}} $ is trained with dropout \cite{Srivastava2014} and dropout is applied at test time by performing $ N $ stochastic forward passes to sample from the approximate Bayesian posterior $ \tilde{\bm{\theta}} \sim q (\bm{\theta}) $.
Following \cite{Kendall2017}, we use MC integration to approximate the predictive variance
\begin{equation}
    \hat{\Sigma}^{2} =  \underbrace{ \frac{1}{N} \sum_{n=1}^{N} \left( \hat{\bm{y}}_{n} - \frac{1}{N} \sum_{k=1}^{N} \hat{\bm{y}}_{k} \right)^{2}}_{\mathrm{epistemic}} + \underbrace{ \frac{1}{N} \sum_{n=1}^{N} \hat{\sigma}^{2}_{n} }_{\mathrm{aleatoric}}
    \label{eq:pred_variance}
\end{equation}
and use $ \hat{\Sigma}^{2} $ as a measure of predictive uncertainty.
If the neural network has multiple outputs ($ d > 1 $), the predictive variance is calculated per output and the mean across $ d $ forms the final uncertainty value.
We expect $ \hat{\Sigma}^{2} $ to reflect the squared error of $ \hat{\bm{y}} $ and define perfect calibration of predictive uncertainty as
\begin{equation}
    \mathbb{E}_{\bm{x},\bm{y}} \left[ \Vert \bm{y} - \hat{\bm{y}} \Vert^{2} \, \big\vert \, \hat{\Sigma}^{2} = \Sigma^{2} \right] = \Sigma^{2} \quad \Forall \left\{ \Sigma ^{2}\in \mathbb{R} \, \vert \, \Sigma^{2} \geq 0 \right\} ~ ,
    \label{eq:perfect}
\end{equation}
which extends the definition by \cite{Levi2019}.
For example, in a batch of images all predicted with $ \hat{\Sigma}^{2} = 0.5 $, the expectation of the squared error should equal 0.5.
Eq.\,(\ref{eq:pred_variance}) is an unbiased estimator of the approximate predictive variance (see proof in Appendix\,\ref{app:proof_var}).
However, even in deep learning with Bayesian principles, the approximate posterior predictive distribution can overfit on small data sets.
In practice, this results in underestimation of the predictive uncertainty.

One could regularize overfitting by early stopping at minimal loss (Eq.\,(\ref{eq:loss_gaussian})) on the validation set, which would circumvent underestimation of $ \sigma^{2} $.
However, our experiments show that early stopping is not optimal with regard to the squared error of $ \hat{\bm{y}} $ on both training and testing data (see Fig.\,\ref{fig:biased_estimation}).
In contrast, the model with lowest mean error on the validation set underestimates predictive uncertainty considerably.
Therefore, we apply $ \sigma $ scaling to recalibrate the predictive uncertainty $ \hat{\Sigma}^{2} $.
This allows a lower squared error while reducing underestimation of uncertainty, as shown experimentally in the following section.

\subsection{Expected Uncertainty Calibration Error for Regression}
\label{app:uce}

We extend the definition of miscalibrated uncertainty for classification \cite{Laves2019NIPS} to quantify miscalibration of uncertainty in regression
\begin{equation}
    \mathbb{E}_{\hat{\Sigma}^{2}} \left[ \big\vert \big( \Vert \bm{y} - \hat{\bm{y}} \Vert^{2} \, \big\vert \, \hat{\Sigma}^{2} = \Sigma^{2} \big) - \Sigma^{2} \big\vert \right] \quad \Forall \left\{ \Sigma^{2} \in \mathbb{R} \, \vert \, \Sigma^{2} \geq 0 \right\} ~ .
\end{equation}
On finite data sets, this can be approximated with the expected uncertainty calibration error (UCE) for regression.
Following \cite{Guo2017}, the uncertainty output $ \hat{\Sigma}^{2} $ of a deep model is partitioned into $ M $ bins with equal width.
A weighted average of the difference between the predictive error and uncertainty is used:
\begin{equation}
    \mathrm{UCE} := \sum_{m=1}^{M} \frac{\vert B_{m} \vert}{n} \big\vert \mathrm{err}(B_{m}) - \mathrm{uncert}(B_{m}) \big\vert ~ , 
\end{equation}
with number of inputs $ n $ and set of indices $ B_{m} $ of inputs, for which the uncertainty falls into the bin.
The error per bin is defined as
\begin{equation}
    \mathrm{err}(B_{m}) := \frac{1}{\vert B_{m} \vert} \sum_{i \in B_{m}} \big\Vert \bm{y}_{i} - \hat{\bm{y}}_{i} \big\Vert^{2} ~ ,
\end{equation}
and the uncertainty per bin is defined as
\begin{equation}
    \mathrm{uncert}(B_{m}) := \frac{1}{\vert B_{m} \vert} \sum_{i \in B_{m}} \hat{\Sigma}_{i}^{2} ~ .
\end{equation}
Throughout this work, UCE is given in \%.
Additionally, we plot $ \mathrm{err}(B_{m}) $ vs.\ $ \mathrm{uncert}(B_{m}) $ to create calibration diagrams.

\section{Experiments \& Results}
\label{sec:experiments}

We use four data sets and three different deep network architectures to evaluate recalibration with $ \sigma $ scaling.
The last linear layer of all networks is replaced by two linear layers predicting $ \hat{\bm{y}} $ and $ \hat{\sigma}^{2} $ as described in §\,\ref{sec:cond-log-likelihood}.
For MC dropout, we use dropout before the last linear layers and at different locations, depending on the architecture.
Dropout is additionally added after each of the four layers of stacked residual blocks in ResNet \cite{He2016}.
In DenseNet and EfficientNet, we use the default configuration of dropout during training and testing \cite{Huang2017,Tan2019}.
The networks are trained until no further decrease in mean squared error (MSE) on the validation set can be observed.

The data sets were selected to represent various regression tasks in medical imaging with different dimension $ d $ of target value $ \bm{y} \in \mathbb{R}^{d} $:
(1) tumor cellularity in H\,\&\,E stained whole slides of cancerous breast tissue from BreastPathQ SPIE challenge data set ($ d = 1$) \cite{Martel2019},
(2) hand CT age regression from the RSNA pediatric bone age data set ($ d = 1 $) \cite{Halabi2018},
(3) surgical instrument tracking on endoscopic images from EndoVis endoscopic vision challenge 2015 data set ($ d = 2 $), and
(4) 6DoF needle pose estimation on optical coherence tomography (OCT) scans from our own data set\footnote{Our OCT pose estimation data set is publicly available at \href{https://github.com/mlaves/3doct-pose-dataset}{github.com/mlaves/3doct-pose-dataset}} ($ d = 6 $).
All outputs are normalized such that $ \bm{y} \in [0, 1]^{d}$.
More details on the training procedure can be found in Appendix \ref{app:training}.

\begin{figure}[t]
    \centering
    \includegraphics[scale=0.9]{results_breastpathq_resnet101.pdf} \\
    ~\\
    \includegraphics[scale=0.9]{results_endovis_densenet201.pdf} \\ % was densenet on boneage
    \caption{Calibration plots for ResNet-101 on BreastPathQ (top row) and DenseNet-201 on EndoVis (bottom row). Aux scaling tends to overfit the calibration set, which results in higher UCE compared to simple $ \sigma $ scaling. Dashed lines denote perfect calibration.}
    \label{fig:results}
\end{figure}

Calibration is performed after training in a separate calibration phase using the validation data set.
We plug the predictive uncertainty $ \hat{\Sigma}^{2} $ into Eq.\,(\ref{eq:loss_scaling_gaussian}) and minimize w.r.t.\ $ s $.
Additionally, we compare $ \sigma $ scaling to a more powerful auxiliary recalibration model $ \bm{R} $ consisting of a two-layer fully-connected network with 16 hidden units and ReLU activations (inspired by \cite{Kuleshov2018}, see §\,\ref{sec:intro}).

To quantify miscalibration, we use the proposed expected uncertainty calibration error for regression.
We visualize (mis-)calibration in Fig.\,\ref{fig:opener} and Fig.\,\ref{fig:results} using calibration diagrams, which show predictive uncertainty vs.\ predictive error (MSE). 
The discrepancy to the identity function reveals miscalibration.
Fig.\,\ref{fig:calibrated_estimation} uses intra-training calibration of aleatoric uncertainty to show closing the gap between test MSE and uncertainty.
%However, this is an artificial result as it does not contain epistemic uncertainty and calibration is usually performed post-training.
Tab.\,\ref{tab:results} reports negative log-likelihood and UCE values of all data set/model combinations on the respective test sets.
Fig.\,\ref{fig:result_endovis} shows a practical example from the EndoVis test set.
Figures for all configurations are listed in Appendix \ref{app:figures}.

\begin{figure}
    \centering
    \includegraphics[scale=0.9]{sigma_overfit_revisited.pdf}
    \caption{(Left) Intra-training calibration of aleatoric uncertainty with $ \sigma $ scaling. The deep model no longer underestimates $ \hat{\sigma}^{2} $ on unseen test data. (Right) The MSE of predictive mean is higher and $ \sigma^{2} $ is underestimated. Note: Calibration is only applied at test time.}
    \label{fig:calibrated_estimation}
\end{figure}

\begin{figure}
    \centering
    \includegraphics[scale=0.89]{endovis_results.pdf} \\
    \small pixel coordinates
    \caption{Example result from EndoVis test set. The task is to predict pixel coordinates of the forceps shaft center. Before calibration, the uncertainty is underestimated and the true instrument position $ \bm{y} $ does not fall into the predictive uncertainty region around $ \hat{\bm{y}} $. After calibration with $ \sigma $ scaling, the uncertainty better reflects the predictive error.}
    \label{fig:result_endovis}
\end{figure}

\begin{table}
    \caption{Values of negative log-likelihood of uncalibrated and calibrated models. If the uncalibrated model already achieves low NLL, aux scaling may overfit the calibration set, resulting in worse NLL on the test set.}
    \centering
    \begin{tabular}{ccccc}
    \toprule
    & & uncalibrated & aux scaling & $ \sigma $ scaling \\
    \cmidrule{2-5}
    ResNet-101/ & calibration set NLL & -2.26 & \textbf{-4.92} & -4.88 \\
    BreastPathQ & test set NLL & -2.82 & -4.89 & \textbf{-5.02} \\
    \midrule
    EfficientNet-B4/ & calibration set NLL & -4.92 & \textbf{-5.91} & -5.88 \\
    EndoVis & test set NLL & -5.93 & -6.17 & \textbf{-6.24} \\
    \bottomrule 
    \end{tabular}    
    \label{tab:calib_overfit_app}
\end{table}

\begin{table}
    \caption{Test set results for different datasets and model architectures (averaged over 5 runs). The values of the negative log-likelihood and uncertainty calibration error quantify miscalibration. In addition, the resulting $ s $ for $ \sigma $ scaling is given.}
    \footnotesize
    \centering
    \begin{tabular}{cccccccccc}
    \toprule
     & & & \multicolumn{2}{c}{uncalibrated} & \multicolumn{2}{c}{aux scaling} & \multicolumn{3}{c}{$ \sigma $ scaling} \\
    \cmidrule(lr){4-5} \cmidrule(lr){6-7} \cmidrule(lr){8-10}
    Data Set & Model & MSE & NLL & UCE & NLL & UCE & NLL & UCE & $ s $ \\
    \midrule
                & ResNet-101      & 6.0e-3 & -2.90 & 0.47 & -5.17 & 0.34 & -5.16 & \textbf{0.23} & 2.39 \\
    BreastPathQ & DenseNet-201    & 6.2e-3 & -5.66 & 0.28 & -6.04 & 0.42 & -5.77 & \textbf{0.15} & 1.31 \\
                & EfficientNet-B4 & 5.9e-3 & -4.75 & 0.44 & -6.38 & 0.13 & -5.62 & \textbf{0.11} & 1.79 \\
    \midrule
            & ResNet-101          & 5.1e-3 & -3.99 & 0.28 & -4.34 & \textbf{0.06} & -4.34 & \textbf{0.06} & 1.40 \\
    BoneAge & DenseNet-201        & 3.5e-3 & -0.84 & 0.29 & -4.71 & \textbf{0.04} & -4.71 & \textbf{0.04} & 2.53 \\
            & EfficientNet-B4     & 3.5e-3 &  6.34 & 0.32 & -4.75 & \textbf{0.06} & -4.64 & 0.18 & 3.91 \\
    \midrule
            & ResNet-101          & 4.0e-4 & -3.85 & \textbf{0.03} & -6.79 & 0.04 & -6.73 & 0.04 & 3.46 \\
    EndoVis & DenseNet-201        & 1.1e-3 & -4.97 & 0.08 & -6.01 & \textbf{0.07} & -6.04 & \textbf{0.07} & 2.58 \\
            & EfficientNet-B4     & 8.9e-4 & -5.94 & 0.05 & -6.18 & \textbf{0.04} & -6.17 & \textbf{0.04} & 1.78 \\
    \midrule
        & ResNet-101              & 2.0e-3 & -3.38 & 0.15 & -5.24 & \textbf{0.02} & -5.24 & \textbf{0.02} & 2.13 \\
    OCT & DenseNet-201            & 1.3e-3 & -5.51 & 0.04 & -5.61 & \textbf{0.01} & -5.61 & 0.02 & 1.27 \\
        & EfficientNet-B4         & 1.4e-3 & -4.25 & 0.10 & -5.58 & \textbf{0.01} & -5.57 & \textbf{0.01} & 1.93 \\
    \bottomrule
    \end{tabular}
    \label{tab:results}
\end{table}

\subsection{Detection of Out-of-Distribution Data and Unreliable Predictions}

Deep neural networks only yield reliable predictions for data which follow the same distribution as the training data.
A shift in distribution could occur when a model trained on CT data from a specific CT device is applied to data from another manufacturer's CT device, for example.
To create a distribution shift, we add Gaussian noise with $ \mathcal{N}(c, c^{2}) $ to the BoneAge data and report histograms of the uncertainties for $ c \in \{0.0, 0.1, 0.2\} $ (see Fig.\,\ref{fig:ood}).
\citet{Lakshminarayanan2017} state that deep ensembles provide better-calibrated uncertainty than Bayesian neural networks with MC dropout variational inference.
We therefore train an ensemble of 5 randomly initialized DenseNet-201 and compare Bayesian uncertainty with $ \sigma $ scaling to ensemble uncertainty under distribution shift.

Additionally, we apply the well-calibrated models to detect and reject uncertain predictions, as crucial decisions in medical practice should only be made on the basis of reliable predictions.
An uncertainty threshold $ \Sigma_{\mathrm{max}}^{2} $ is defined and all predictions from the test set are rejected where $ \hat{\Sigma}^{2} > \Sigma_{\mathrm{max}}^{2} $ (see Fig.\,\ref{fig:rejection}).
From this, a decrease in overall MSE is expected.
We additionally compare rejection on the basis of $ \sigma $ scaled uncertainty to uncertainty from the aforementioned ensemble.

\begin{figure}
    \centering
    \includegraphics[scale=.9]{ood_boneage_ensemble_densenet201.pdf} ~ \includegraphics[scale=.9]{ood_boneage_s_densenet201.pdf}
    \caption{Histograms of the uncertainties for out-of-distribution detection with DenseNet-201 on BoneAge test set. (Left) Uncertainties from a non-Bayesian ensemble of five DenseNets and (right) Bayesian uncertainties calibrated with $ \sigma $ scaling. The distribution shift has been created by additive Gaussian noise with $ \mathcal{N}(c, c^{2}) $.}
    \label{fig:ood}
\end{figure}

\begin{figure}
    \centering
    \includegraphics[scale=.9]{rejection_ensemble_densenet201.pdf} ~ \includegraphics[scale=.9]{rejection_s_densenet201.pdf}
    \caption{Rejection of uncertain predictions with DenseNet-201 on BoneAge test set with $ \hat{\Sigma}^{2} > \Sigma^{2}_{\mathrm{max}} $. The shaded area width visualizes the percentage of rejected samples. The dashed line visualizes a linear relationship.}
    \label{fig:rejection}
\end{figure}

\section{Discussion \& Conclusion}

In this paper, well-calibrated predictive uncertainty in medical imaging obtained by variational inference with deep Bayesian models is discussed.
Both calibration methods considerably reduce miscalibration of predictive uncertainty in terms of UCE and NLL.
If the model is already well-calibrated (see BreastPathQ/DenseNet-201 in Tab.\,\ref{tab:results}), aux scaling can slightly increase UCE.
In such cases, we often observe the more powerful auxiliary model $ \bm{R} $ to overfit the calibration set (see Tab.\,\ref{tab:calib_overfit_app}).
This results in aux scaling yielding the lowest NLL on the calibration set, which is however outperformed on the test set by $ \sigma $ scaling, or even by the uncalibrated model.
If the deep model is already well-calibrated, $ \sigma $ scaling does not negatively affect the calibration, which results in $ s \rightarrow 1 $.

Well-calibrated uncertainty from MC dropout is able to reliably detect a shift in the data distribution.
The results are comparable to those from a deep ensemble, but without the need to train multiple individual models on the same data set.
This is in contrast to what was reported by \citet{Lakshminarayanan2017}.
BNNs calibrated with $ \sigma $ scaling even outperform ensembles in the rejection task (see Fig.\,\ref{fig:rejection}).
In case of $ \sigma $ scaling, the test set MSE decreases monotonically as a function of the uncertainty threshold, whereas the ensemble initially shows an increasing MSE.

$ \sigma $ scaling is simple to implement, does not change the predictive mean $ \hat{\bm{y}} $, and does not affect the model accuracy.
It is preferable to regularization (e.\,g.\ early stopping) or more complex recalibration methods in calibrated uncertainty estimation with Bayesian deep learning.
The disconnection between test MSE and test NLL can successfully be closed, which creates highly accurate models with reliable uncertainty estimates.

Predictive uncertainty should be considered in any medical imaging task that is approached with deep learning.
Well-calibrated uncertainty is of great importance for decision-making and is anticipated to increase patient safety.
It allows robust rejection of unreliable predictions or out-of-distribution samples.
However, there are many factors (e.\,g.\ network capacity, weight decay, dropout configuration) influencing the uncertainty that have not been discussed here and will be addressed in future work.

% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{We thank Vincent Modes for his insightful comments. This research has received funding from the European Union as being part of the ERDF OPhonLas project.}

\sloppy
\bibliography{laves20.bib}

\appendix

\pagebreak

\section{Laplacian Model}

Using $ \mathsf{Laplace}(\hat{\bm{y}} (\bm{x}), \hat{\sigma} (\bm{x})) $ as model, the conditional log-likelihood is given by
\begin{align} 
      & \sum_{i=1}^{m} \log \left( \frac{1}{2 \hat{\sigma}^{(i)}_{\bm{\theta}}} \exp \left\{ - \frac{\big| \bm{y}^{(i)} - \hat{\bm{y}}_{\bm{\theta}}^{(i)} \big|}{\hat{\sigma}^{(i)}_{\bm{\theta}}} \right\} \right) ~ ,
%    = & - \sum_{i=1}^{m} \log \big( 2 \hat{\sigma}^{(i)}_{\bm{\theta}} \big) + (\hat{\sigma}_{\bm{\theta}}^{(i)})^{-1} \big| \bm{y}^{(i)} - \hat{\bm{y}}_{\bm{\theta}}^{(i)} \big| ~ ,
        \label{eq:laplacian_derive}
\end{align} 
which results in the following optimization criterion
\begin{equation}
    \mathcal{L}_{\mathrm{L}}(\bm{\theta}) = \sum_{i=1}^{m} \frac{1}{\hat{\sigma}^{(i)}_{\bm{\theta}}} \big| \bm{y}^{(i)} - \hat{\bm{y}}_{\bm{\theta}}^{(i)} \big| + \log \big( \hat{\sigma}_{\bm{\theta}}^{(i)} \big) ~ .
    \label{eq:loss_laplacian}
\end{equation}
Using $ \mathcal{L}_{\mathrm{L}}(\bm{\theta}) $ instead of $ \mathcal{L}_{\mathrm{G}}(\bm{\theta}) $ amounts to applying an L1 metric to the predictive mean.
In some cases, this led to better results.
However, we have not conducted extensive experiments with it and leave it to future work.

\section{Derivation of \texorpdfstring{$\sigma$}{sigma} Scaling}

See §\,\ref{sec:scaling}.
Using a Gaussian model, we scale the standard deviation $ \sigma $ with a scalar value $ s $ to calibrate the PDF
\begin{equation}
    p \left( \bm{y} \vert \bm{x} ; \hat{\bm{y}} (\bm{x}), \hat{\sigma}^{2}(\bm{x})  \right) = \mathcal{N} \left( \bm{y} ; \hat{\bm{y}} (\bm{x}), (s \cdot \hat{\sigma}(\bm{x}))^{2} \right) ~ .
\end{equation}
The conditional log-likelihood is given by
\begin{align}
%    & \sum_{i=1}^{m} \log p \left( \bm{y}^{(i)} \vert \bm{x} ; \hat{\bm{y}}_{\bm{\theta}}^{(i)} , (s \cdot \hat{\sigma}^{(i)}_{\bm{\theta}})^{2} \right) \\
    & \sum_{i=1}^{m} \log \left( \frac{1}{\sqrt{2\pi} s \hat{\sigma}_{\bm{\theta}}^{(i)}} \exp \left( - \frac{\big\Vert \bm{y}^{(i)} - \hat{\bm{y}}^{(i)}_{\bm{\theta}} \big\Vert^{2}}{2 \left( s \hat{\sigma}_{\bm{\theta}}^{(i)} \right)^{2}} \right) \right) \\
    &= - \dfrac{m}{2} \log\left( 2\pi \right) - \sum_{i=1}^{m} \left[ \log \left( s \hat{\sigma}^{(i)}_{\bm{\theta}} \right) + \frac{1}{2} \left( s \hat{\sigma}_{\bm{\theta}}^{(i)} \right)^{-2} \cdot \big\Vert \bm{y}^{(i)} - \hat{\bm{y}}_{\bm{\theta}}^{(i)} \big\Vert^{2} \right] ~ .
\end{align}
This results in the following optimization objective (ignoring constants):
\begin{equation}
    \mathcal{L}_{\mathrm{G}}(s) = m \log(s) + \tfrac{1}{2} s^{-2} \sum_{i=1}^{m} (\hat{\sigma}_{\bm{\theta}}^{(i)})^{-2} \big\Vert \bm{y}^{(i)} - \hat{\bm{y}}_{\bm{\theta}}^{(i)} \big\Vert^{2} ~ .
    \label{eq:loss_scaling_gaussian_app}
\end{equation}
Using a Laplacian model, the optimization criterion follows as
\begin{equation}
    \mathcal{L}_{\mathrm{L}}(s) = m \log(s) + s^{-1} \sum_{i=1}^{m} \frac{1}{\hat{\sigma}_{\bm{\theta}}^{(i)}} \left| \bm{y}^{(i)} - \hat{\bm{y}}_{\bm{\theta}}^{(i)} \right| ~ .
    \label{eq:loss_scaling_laplacian_app}
\end{equation}
Eqs.\,(\ref{eq:loss_scaling_gaussian_app}) and (\ref{eq:loss_scaling_laplacian_app}) are optimized w.r.t.\ $ s $ with fixed $ \bm{\theta} $ using gradient descent in a separate calibration phase after training.
The solution to Eq.\,(\ref{eq:loss_scaling_gaussian_app}) can also be written in closed form as
\begin{equation}
    s_{\mathrm{G}} = \sqrt{\frac{1}{m} \sum_{i=1}^{m} \big( \hat{\sigma}_{\bm{\theta}}^{(i)} \big)^{-2} \big\Vert \bm{y}^{(i)} - \hat{\bm{y}}_{\bm{\theta}}^{(i)} \big\Vert^{2}}
\end{equation}
and the solution to Eq.\,(\ref{eq:loss_scaling_laplacian_app}) follows as
\begin{equation}
    s_{\mathrm{L}} = \frac{1}{m} \sum_{i=1}^{m} \frac{1}{\hat{\sigma}_{\bm{\theta}}^{(i)}} \left| \bm{y}^{(i)} - \hat{\bm{y}}_{\bm{\theta}}^{(i)} \right| ~ ,
\end{equation}
respectively.
We apply $ \sigma $ scaling to jointly calibrate aleatoric and epistemic uncertainty as described in §\,\ref{sec:well}.

\section{Unbiased Estimator of the Approximate Predictive Variance}
\label{app:proof_var}

We show that the expectation of the predictive sample variance from MC dropout, as given in \cite{Kendall2017}, equals the true variance of the approximate posterior predictive distribution.

\begin{prop}    
    Given $ N $ MC dropout samples $ \bm{f}_{\bm{\theta}_{n}} = [ \hat{\bm{y}}_{n}, \hat{\sigma}^{2}_{n} ] $ from our approximate predictive distribution $ p(\bm{y}^{\ast} \vert \bm{x}^{\ast}, \mathcal{D}) = \mathcal{N} ( \bm{y}^{\ast} ; \bm{y}, \Sigma^{2} ) $,
    the predictive sample variance
    \begin{equation}
    \hat{\Sigma}^{2} = \frac{1}{N} \sum_{n=1}^{N} \left( \hat{\bm{y}}_{n} - \frac{1}{N} \sum_{n=1}^{N} \hat{\bm{y}}_{n} \right)^{2} + \frac{1}{N} \sum_{n=1}^{N} \hat{\sigma}^{2}_{n}
\end{equation}
    is an unbiased estimator of the approximate predictive variance.
\end{prop}
\begin{proof}
\begin{flalign}
    \mathbb{E} \left[ \hat{\Sigma}^{2} \right] &= \mathbb{E} \left[ \frac{1}{N} \sum_{n=1}^{N} \left( \hat{\bm{y}}_{n} - \frac{1}{N} \sum_{n=1}^{N} \hat{\bm{y}}_{n} \right)^{2} + \frac{1}{N} \sum_{n=1}^{N} \hat{\sigma}^{2}_{n} \right] &\\
     &= \mathbb{E} \left[ \frac{1}{N} \sum_{n=1}^{N} \left( \hat{\bm{y}}_{n} - \frac{1}{N} \sum_{n=1}^{N} \hat{\bm{y}}_{n} \right)^{2} \right] + \mathbb{E} \left[ \frac{1}{N} \sum_{n=1}^{N} \hat{\sigma}^{2}_{n} \right] & \\
     &\mathrm{with} \quad \frac{1}{N} \sum_{n=1}^{N} \hat{\bm{y}}_{n} = \bar{\bm{y}} \quad \mathrm{follows} &\\
     &= \mathbb{E} \left[ \frac{1}{N} \sum_{n=1}^{N} \left( \hat{\bm{y}}_{n} - \bar{\bm{y}} \right)^{2} \right] + \hat{\sigma}^{2} &\\
     &= \mathbb{E} \left[ \frac{1}{N} \sum_{n=1}^{N} \left( \hat{\bm{y}}_{n} - \bar{\bm{y}} \right)^{2} + \bar{\bm{y}}^{2} - \bar{\bm{y}}^{2} + \bm{y}^{2} - \bm{y}^{2} + 2 \bar{\bm{y}}\bm{y} - 2 \bar{\bm{y}}\bm{y} \right] + \hat{\sigma}^{2} &\\
     &= \mathbb{E} \left[ \frac{1}{N} \sum_{n=1}^{N} \left( \hat{\bm{y}}_{n} - \bm{y} \right)^{2} - \left( \bar{\bm{y}} - \bm{y} \right)^{2} \right] + \hat{\sigma}^{2} &\\
     &= \mathbb{E} \left[ \frac{1}{N} \sum_{n=1}^{N} \left( \hat{\bm{y}}_{n} - \bm{y} \right)^{2} \right] - \mathbb{E} \Bigg[ \left( \bar{\bm{y}} - \bm{y} \right)^{2} \Bigg] + \hat{\sigma}^{2} &\\
     &= \Sigma^{2} - \hat{\sigma}^{2} + \hat{\sigma}^{2} &\\
     \mathbb{E}\left[ \hat{\Sigma}^{2} \right] &= \Sigma^{2} &
\end{flalign}
Note that the predicted aleatoric uncertainty $ \hat{\sigma}^{2} $ equals the expected squared error when $ \bm{f}_{\bm{\theta}} $ is trained by minimizing NLL, thus $ \mathbb{E} [ ( \bar{\bm{y}} - \bm{y} )^{2} ] = \hat{\sigma}^{2} $.
\end{proof}

\section{Training Procedure}
\label{app:training}

The model implementations from PyTorch 1.3 \cite{PyTorch2019} are used and trained with the following settings:
\begin{itemize}
    \itemsep0em
    \item training for 500 epochs with batch size of 16
    \item Adam optimizer with initial learning rate of $ 3 \cdot 10^{-4} $ and weight decay with $ \lambda = 10^{-7} $
    \item reduce-on-plateau learning rate scheduler (patience of 20 epochs) with factor of 0.1
    \item in MC dropout, $ N=25 $ forward passes are performed; dropout with $ p = 0.5 $ is used for ResNet (as described in \cite{Gal2016}), whereas for DenseNet ($ p = 0.2 $) and EfficientNet ($ p = 0.4 $) the standard dropout rate $ p $ of the architecture is used
    \item additional validation and test sets are used if provided by the data sets; otherwise, a train/validation/test split of approx.\ 50\%\,/\,25\%\,/\,25\% is used
\end{itemize}

\section{3D OCT Needle Pose Data Set}

\begin{figure}[h]
    \centering
    \includegraphics[scale=1.0]{oct_datset-crop.pdf} \\
    \small pixel coordinates
    \caption{Example image from OCT data set showing $ \argmax $ projections of a surgical needle tip acquired by optical coherence tomography.}
    \label{fig:oct_dataset}
\end{figure}

\noindent Our data set was created by attaching a surgical needle to a high-precision six-axis hexapod robot (H-826, Physik Instrumente GmbH \& Co.\ KG, Germany) and observing the needle tip with 3D optical coherence tomography (OCS1300SS, Thorlabs Inc., USA).
The data set consists of 5,000 OCT acquisitions with $ 64 \times 64 \times 512 $ voxels, covering a volume of approx.\ $ 3 \times 3 \times 3 $\,$\mathrm{mm}^{3}$.
Each acquisition is taken at a different robot configuration and labeled with the corresponding 6DoF pose.
To process the volumetric data with CNNs for planar images, we calculate 3 planar projections along the spatial dimensions using the $ \argmax $ operator, scale them to equal size and stack them together as a three-channel image (see Fig.\,\ref{fig:oct_dataset}).
A similar approach was presented in \cite{Laves2017} and \cite{Gessert2018}.
The data are characterized by a high amount of speckle noise, which is a typical phenomenon in optical coherence tomography.
The data set is publicly available at \href{https://github.com/mlaves/3doct-pose-dataset}{github.com/mlaves/3doct-pose-dataset}.

\section{Additional Calibration Diagrams}
\label{app:figures}

All test set runs have been repeated 5 times.
Solid lines denote mean and shaded areas denote standard deviation calculated from the repeated runs.

\begin{figure}[h]
    \centering
    \includegraphics[scale=0.9]{results_breastpathq_resnet101.pdf}
    %\caption{ResNet-101 on BreastPathQ test set.}
\end{figure}

\begin{figure}[h]
    \centering
    \includegraphics[scale=0.9]{results_breastpathq_densenet201.pdf}
    %\caption{DenseNet-201 on BreastPathQ test set.}
\end{figure}

\begin{figure}[h]
    \centering
    \includegraphics[scale=0.9]{results_breastpathq_efficientnetb4.pdf}
    %\caption{EfficientNet-B4 on BreastPathQ test set.}
\end{figure}

\pagebreak

\begin{figure}[h]
    \centering
    \includegraphics[scale=0.9]{results_boneage_resnet101.pdf}
    %\caption{ResNet-101 on BoneAge test set.}
\end{figure}

\begin{figure}[h]
    \centering
    \includegraphics[scale=0.9]{results_boneage_densenet201.pdf}
    %\caption{DenseNet-201 on BoneAge test set.}
\end{figure}

\begin{figure}[h]
    \centering
    \includegraphics[scale=0.9]{results_boneage_efficientnetb4.pdf}
    %\caption{EfficientNet-B4 on BoneAge test set.}
\end{figure}

\pagebreak

\begin{figure}[h]
    \centering
    \includegraphics[scale=0.9]{results_endovis_resnet101.pdf}
    %\caption{ResNet-101 on EndoVis test set.}
\end{figure}

\begin{figure}[h]
    \centering
    \includegraphics[scale=0.9]{results_endovis_densenet201.pdf}
    %\caption{DenseNet-201 on EndoVis test set.}
\end{figure}

\begin{figure}[h]
    \centering
    \includegraphics[scale=0.9]{results_endovis_efficientnetb4.pdf}
    %\caption{EfficientNet-B4 on EndoVis test set.}
\end{figure}

\pagebreak

\begin{figure}[h]
    \centering
    \includegraphics[scale=0.9]{results_oct_resnet101.pdf}
    %\caption{ResNet-101 on OCT test set.}
\end{figure}

\begin{figure}[h]
    \centering
    \includegraphics[scale=0.9]{results_oct_densenet201.pdf}
    %\caption{DenseNet-201 on OCT test set.}
\end{figure}

\begin{figure}[h]
    \centering
    \includegraphics[scale=0.9]{results_oct_efficientnetb4.pdf}
    %\caption{EfficientNet-B4 on OCT test set.}
\end{figure}

\end{document}
