\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{booktabs} % to get nicer tables

\hyphenation{TotalSegmentator}

\newcommand{\todo}[1]{\textcolor{red}{\{#1\}}}
\newcommand{\sml}[0]{\footnotesize}

\jmlryear{2024}
\jmlrworkshop{Full Paper -- MIDL 2024}
\jmlrvolume{-- nnn}
\editors{Accepted for publication at MIDL 2024}

\title[Structure Size as Confounder in Uncertainty Based Segmentation Quality Prediction]{Structure Size as Confounder in Uncertainty Based Segmentation Quality Prediction}

% Alternative Titles:
% Pitfalls in Uncertainty Based Segmentation Quality Prediction
% On the structure size as a confounder in uncertainty based segmentation quality prediction
% The structure size is a confounder in uncertainty based segmentation quality prediction
% Confounders in uncertainty based segmentation quality prediction

\midlauthor{\Name{Kai Geißler\nametag{$^{1}$}} \Email{kai.geissler@mevis.fraunhofer.de}\\
\addr $^{1}$ Fraunhofer Institute for Digital Medicine MEVIS, Bremen, Germany\\
\Name{Jochen G. Hirsch\nametag{$^{1}$}} \Email{} \\
\Name{Stefan Heldmann\nametag{$^{1}$}} \Email{} \\
\Name{Hans Meine\nametag{$^{1}$}} \Email{}
}

\begin{document}


\maketitle


\begin{abstract}
Various uncertainty estimation methods have been proposed for deep learning-based image segmentation models.
An uncertainty measure is considered useful if it can be used to accurately predict segmentation quality. Therefore, structure-wise uncertainty measures are frequently correlated with measures like the Dice score.
However, it is known that the Dice score highly depends on the size of the structure of interest. It is less well-known that popular structure-wise uncertainty measures also correlate with structure size.
Therefore, the structure size acts as a confounding variable when trying to quantify the performance of such uncertainty measures via correlation.
We investigate this for the popular uncertainty measures structure-wise epistemic uncertainty, mean pairwise Dice and volume variation coefficient based on test-time augmentation, Monte Carlo Dropout and model ensembles.
We propose to use a partial correlation coefficient to address structure size as a confounding variable and arrive at lower correlation estimates which better reflect the true relationship between segmentation quality and structure-wise uncertainty.
% As we use 28 structure types of differing sizes, we make the same experiment on each individual structure type to exclude it as potential confounding variable. On the structure level, our observations are less severe, but still significant. 
%In addition, we confirm that the average symmetric surface distance has lower correlation with structure size than the Dice score. Therefore it is less severely affected by the confounding nature of structure size when correlating it with uncertainty measures and can be a useful additional metric to assess the quality of uncertainty measures.
\end{abstract}


\begin{keywords}
Uncertainty Quantification, Medical Image Segmentation
\end{keywords}


\section{Introduction}
Estimation of model uncertainty in deep learning based image segmentation can be done with various proposed measures such as mean predictive entropy \citep{gal2017deep}, mutual information \citep{kendall2017uncertainties}, volume variation coefficient (VVC) \citep{roy2018inherent}, or mean pairwise Dice (MPD) \citep{roy2018inherent}.
Given a specific model input, there is also a choice of methods to produce a set of outputs, like test time augmentation (TTA) \citep{wang2019aleatoric}, Monte Carlo dropout (MCD) \citep{gal2015bayesian, gal2017deep} or model ensembles \citep{lakshminarayanan2017simple}, which we will call uncertainty sources. Uncertainty measures then translate such sets of outputs into an uncertainty prediction, i.e. an estimate of the likelihood of making a wrong prediction.
In order to assess an uncertainty measure, it is commonly correlated with a measure for the prediction quality. In the case of image segmentation, this is usually the Dice score, which is known to be higher in general for larger structures \citep{reinke2021common}. If model uncertainty were also influenced by structure size, then the structure size would be a confounding variable, potentially tainting correlation estimates. \filbreak

In this work, we analyse this relationship for the task of segmenting diverse anatomical structures in whole body MRI. Our contributions are threefold: 
\begin{itemize}
  \item We show empirically that the popular uncertainty measures structure-wise epistemic uncertainty, MPD and VVC all correlate notably with structure size.
  \item To compute correlation estimates that correct for this confounding variable, we propose to use partial Spearman's rank correlation \citep{liu2018covariate} and assess how it changes the correlation estimates and the resulting ranking of uncertainty measures.
  \item We also confirm that the average symmetric surface distance has a lower correlation with structure size and is therefore less severely affected by its confounding nature.
\end{itemize}

Our results are achieved in the context of domain transfer between computed tomography (CT), which is our source domain, and magnetic resonance imaging (MRI), which is our target domain. A segmentation model is trained on the TotalSegmentator dataset \citep{wasserthal2023totalsegmentator}, a large collection of CT images with masks for anatomical structures. During training, it utilizes GIN augmentation \citep{ouyang2023} to allow domain transfer. It is evaluated on a subset of the MR Imaging Study within the German National Cohort Study \citep{bamberg2015whole} for which segmentation masks are manually created by radiological technologists.

\begin{figure}
\floatconts
  {fig:graphical_abstract}
  {\caption{A proper uncertainty measure for structure segmentation quality should be well correlated with the segmentation quality. We assess the confounding effect of the structure volume on this relationship.}}
  {\includegraphics[width=1.0\linewidth]{images/graphical_abstract.pdf}\vskip-1em}
\end{figure}


\section{Related Work}
Different ways have been proposed to estimate the uncertainty of deep neural networks, like model ensembles~\citep{lakshminarayanan2017simple}, test time augmentation~\citep{wang2019aleatoric}, using the raw model outputs~\citep{smailagic2018medal}, Monte Carlo dropout \citep{gal2015bayesian,gal2016dropout} or deterministic methods \citep{liu2020simple}. In addition, there are uncertainty-aware models using variational autoencoders to learn the model uncertainty \citep{kohl2018probabilistic} and models based on Dempster-Shafer theory \citep{huang2022lymphoma} or subjective logic \citep{zou2022tbrats} using evidential deep learning to incorporate the model uncertainty.
There are also models which try to disentangle the uncertainty coming from different possible uncertainty sources \citep{shaw2021heteroscedastic}.

\citet{mehta2022qu} conducted the QU-BraTS challenge assessing the uncertainty prediction performance of 14 participating teams and proposing a novel metric to evaluate uncertainty maps. \citet{camarasa2021quantitative} evaluated how to best calculate voxel-wise uncertainty maps in multi-class segmentation settings.

Based on sampled model outputs, there are different ways to compute uncertainties for each structure. One way is to compute the voxel-wise uncertainty of the image, such as entropy, variance or variation ratio over the samples \citep{gal2017deep} and average it over the predicted structure mask.
For segmentation tasks, there are also structure level measures which can be computed directly from a set of masks, like the volume variation coefficient and the mean pairwise Dice score between the predictions \citep{roy2018inherent}.

\citet{kendall2017uncertainties} introduced an uncertainty decomposition into two components:
Aleatoric uncertainty captures uncertainty inherent to the data, like measurement or labeling errors, while epistemic uncertainty captures the uncertainty in the model parameters by computing the mutual information \citep{mukhoti2018evaluating}.
%or the BALD acquisition function \cite{houlsby2011bayesian}.

%Another approach are uncertainty measures based on fuzzy sets  \todo{cite}, which ... \todo{elaborate briefly}.

Many publications use correlations between segmentation quality and uncertainty measures to compare different methods \citep{roy2018inherent,wang2019aleatoric,hoebel2020exploration,lin2022novel,lin2022quality,sahlsten2023application} while others report correlations on their own to assess the quality of the uncertainty measure \citep{hiasa2019automated}.
%As long as there are no confounding variables it is a valid methodology to compare different uncertainty measures or evaluate their quality on their own.
\citet{hoebel2020exploration} found no significant correlation between uncertainty measures and structure size in their experiments on lung nodule segmentation in CT scans, computing uncertainties based on MCD and model ensembles using structure-wise mean entropy over the predicted samples, VVC and MPD. Furthermore, \citet{wang2019aleatoric} claim that the VVC is agnostic to structure size.

In contrast to this, we find non-negligible correlations between the structure size and the uncertainty measures structure-wise epistemic uncertainty, VVC and MPD based on model ensembles, MCD and TTA in our experiments.



\section{Experiments}
\subsection{Data}
For model training, the publicly available TotalSegmentator dataset is used. It contains 1204 CT images with 104 annotated structures \citep{wasserthal2023totalsegmentator}, accounting for 27 organs, 59 bones, 10 muscles, and 8 vessels.

\begin{table}
\caption{Description of study population using mean $\pm$ SD for continuous variables}
\label{tab:study_population}
\begin{center}
\begin{tabular}{cccccc}
\toprule
\rule[-1ex]{0pt}{3.5ex} Cases & Study Centers & Sex & Age [Years] & Weight [kg] & Height [cm] \\
\midrule 
\rule[-1ex]{0pt}{3.5ex} 30 & 5 & 15 male, 15 female & 45.7 $\pm$ 13.6 & 86.9 $\pm$ 21.4 & 173.9 $\pm$ 13.2  \\
\bottomrule
\end{tabular}
\end{center}
\end{table}

Our evaluation dataset consists of 30 selected cases out of 10828 whole body MR volumes obtained as part of the MR Imaging Study within the German National Cohort Study (GNC, 2014--2019) \citep{bamberg2015whole} from volunteers. The data were acquired on MAGNETOM Skyra 3\,T (Siemens Healthineers, syngo VD13C) systems. The MRI sequence is a two-point Dixon volumetric interpolated breath-hold examination (VIBE) with T1 weighting. Axial slices were acquired with a 320$\times$260 in-plane matrix (resolution 1.4$\times$1.4\,mm$^2$) and a slice thickness of 3\,mm. The volume consists of four acquired table positions with a total of 316~slices. The test cases are selected to span a diverse sample of the subject distribution with regard to study center, sex, age, weight and height. Details of the study population are depicted in Table~\ref{tab:study_population}.

Three radiological technologists annotate segmentation masks for 28 anatomical structures, comprising 8 abdominal organs (liver, spleen, kidneys, stomach, pancreas, adrenal glands), 5 thoracic organs (heart, lungs, esophagus, trachea), 12 bones (scapulas, claviculas, hips, sacrum, femurs, vertebrae L1-5, T1-12 and C3-7), 2 muscles (autochthon) and one vessel (aorta) on these cases. The number of annotated structures per structure type is shown in Table~\ref{tab:number_of_structures_and_structure_volumes} in the appendix. The structure types are chosen as a subset of the TotalSegmentator structures. This variety was selected to be able to draw conclusions about anatomical structures of various shapes and sizes.

\subsection{Segmentation Model}
%\todo{Can we cite this from our SPIE paper? It is not published yet though.} - no, also it is more important than for instance the specific imaging protocols above, don't you agree?
A segmentation model is trained from scratch on the TotalSegmentator data using our re-implementation of the nnU-Net framework~\citep{isensee2021nnunet}. As training configuration, we approximate the TotalSegmentator low resolution (3\,mm) model. The images are resampled to a voxel size of 3$\times$3$\times$3\,mm$^3$, a patch size of 80$\times$80$\times$80 is used and the nnU-Net non-CT normalization is performed. The models are trained for 250,000 iterations with a batch size of 2. The model used for MCD has a dropout layer with dropout rate 0.1 after each block of convolution, normalization and non-linearity in the up- and downpath. For basic data augmentation, we utilize the batch generators library \citep{isensee_fabian_2020_3632567}.

In addition, all models are trained using GIN data augmentation \citep{ouyang2023} to allow for domain transfer between CT and MR images. In GIN, a convolutional neural network (CNN) applies a random non-linear intensity value transformation on the training patches. Therefore, the model needs to focus more on the shape and less on the intensity values of structures and generalizes better to different imaging modalities and contrasts. We re-implement GIN augmentation. In our experiments, GIN augmentation is applied with a probability of 0.9 to each patch. The CNN for the augmentation has 4 layers with 2 channels each and uses ReLU activations. In each layer of the GIN augmentation network a random 1$\times$1$\times$1 convolution is sampled. As we train our models on a low resolution, we decided to use only 1$\times$1$\times$1 convolutions to avoid too much smoothing that could arise from larger filters.


\subsection{Uncertainty Quantification}
We assess three alternative uncertainty sources to produce a set of different predictions for the same input image: model ensembles, TTA and MCD.
For model ensembles one trains a set of deep learning models to obtain models with different learned weights for the prediction task.
In MCD this process is approximated by training only one model with dropout layers \citep{srivastava2014dropout}; in our experiments, we use spatial dropout \citep{tompson2015efficient} which is more suited for convolutional neural networks.
During inference, the dropout layers are then activated to use the model as a Bayesian neural network that approximates the model posterior with different predictions for each dropout layer configuration.
In TTA, there is only one fixed model, but for each inference pass the input image is augmented with either reversible geometric augmentations or intensity transformations to obtain a set of different predictions for the same input. In our experiments 3D rotations, scaling, additive and multiplicative brightness, contrast transformation and gamma transformation are used. These are inspired by the data augmentations used in the batch generators library.

The epistemic uncertainty \citep{kendall2017uncertainties,mukhoti2018evaluating} is first computed for each voxel by Equation~\ref{eq:epistemic_uncertainty}, where $p_{ct}(x)$ is the predicted probability for class $c$, voxel $x$ and sample $t$. $T$ is the number of samples used for TTA, MCD and model ensembles, which is set to 10 in our experiments.
\begin{equation}
\label{eq:epistemic_uncertainty}
\mathbb{I}(x) = - \sum_c \left(\frac{1}{T} \sum_t p_{ct}(x) \right) \log \left( \frac{1}{T} \sum_t p_{ct}(x) \right) + \frac{1}{T} \sum_t \sum_c p_{ct}(x) \log p_{ct}(x)
\end{equation}
In order to then get to a structure-wise epistemic uncertainty, we average the epistemic uncertainty over all voxels which are predicted as this structure.

The MPD \citep{roy2018inherent} for class $c$ is computed as the mean of the pairwise Dice scores of all predicted segmentation masks (Eq.~\ref{eq:mean_pairwise_dice}). $m_{ct}$ is the binary mask for class $c$ and sample $t$.
\begin{equation}
\label{eq:mean_pairwise_dice}
\text{MPD}_c = \frac{2}{T(T-1)} \sum_{i>j} \text{Dice}(m_{ci}, m_{cj}) 
\end{equation}

Finally, the VVC \citep{roy2018inherent} for class $c$ is computed as the variance of the predicted structure volumes divided by their mean (Eq.~\ref{eq:volume_variation_coefficient}).
\begin{equation}
\label{eq:volume_variation_coefficient}
\text{VVC}_c = \mathrm{Var}_t(\mathrm{vol}(m_{ct})) \; / \; \mathrm{E}_t(\mathrm{vol}(m_{ct}))
\end{equation}

\subsection{Partial Spearman's Rank Correlation}
When computing the correlation between two variables X and Y, which are both highly correlated with a third variable Z, the correlation between X and Y gets tainted by the confounding nature of Z and potentially overestimated. In our case, X and Y are the segmentation quality and structure-wise uncertainty while the potential confounder Z is the structure size. Partial correlation coefficients allow us to remove the confounding effect of Z when assessing the correlation between X and Y, providing a better estimate of their actual relationship.

%In order to compute more meaningful correlations between segmentation quality and uncertainty measures, we want to control for the structure size when doing so. 
As we do not observe a linear relationship between our segmentation quality metrics and uncertainty measures, we cannot use partial Pearson's correlation for our purpose, but have to rely on rank based correlations.
\citet{kendall1942partial} defined a partial correlation based on Spearman's rank correlation, but stated that it is hard to theoretically justify. It was also criticized because it is nonzero even under conditional independence of the correlated variables given the controlling variable~\citep{korn1984ranges}.
Therefore, we propose to use the partial Spearman's rank correlation defined by \citet{liu2018covariate}, which they base on probability scale residuals. To compute it, we use the R package \texttt{PResiduals} \citep{liu2020presiduals}.

%\begin{equation}
%\label{eq:partial_spearman_correlation}
%\rho_{XY \cdot Z} = \left( \rho_{XY} - \rho_{XZ} \rho_{YZ} \right) \; / \; \sqrt{ (1 - \rho_{XZ}^2) (1 - \rho_{YZ}^2) }
%\end{equation}

% HACK: This is admittedly a little early
\begin{table}
\floatconts
  {tab:correlation_with_structure_volume}
  {\caption{Correlation between uncertainty measures (left) or segmentation quality (right) and structure volume computed over all structures. Signs of some quantities are switched to allow for easier comparison of correlations.}}
  {\begin{tabular}{l@{\hspace{3em}}ccc@{\hspace{3em}}cc}
  \toprule
   & -Ep. Unc.  & MPD & -VVC & Dice & -ASSD \\
  \midrule
  TTA      & 0.627 & 0.776 & 0.782 & 0.804 & 0.246  \\
  Ensemble & 0.807 & 0.757 & 0.735 & 0.776 & 0.193  \\
  MCD      & 0.641 & 0.769 & 0.850 & 0.814 & 0.134  \\
  \bottomrule
  \end{tabular}}
\end{table}


\section{Results}
\subsection{Evaluation Across All Structures}

At first, we consider Spearman's correlation between structure volume and uncertainty, as well as between structure volume and segmentation quality. The results across all structures and patients are presented in Table~\ref{tab:correlation_with_structure_volume}. We observe strong correlation for all tested uncertainty measures and for the Dice score, which is contrary to the observations of \citet{hoebel2020exploration} who observed no significant correlations between structure volume and uncertainty. For the average symmetric surface distance (ASSD), we observe only weaker correlations.

\begin{table}
\floatconts
  {tab:correlation_uncertainty_segmentation_quality}
  {\caption{Correlation between uncertainty measures and segmentation quality. Bold values mark the best method/measure combination for each segmentation quality metric.}}
  {\begin{tabular}{lccc@{\hspace{3em}}ccc}
  \toprule
   & 1-Dice vs. & 1-Dice vs. & 1-Dice vs. & ASSD vs. & ASSD vs. & ASSD vs. \\
   & Ep. Unc.   & 1-MPD      & VVC        & Ep. Unc. & 1-MPD    & VVC  \\
  \midrule
  TTA      & 0.845 & 0.936 & 0.850 & 0.635 & 0.586 & 0.562 \\
  Ensemble & 0.908 & 0.941 & 0.801 & 0.746 & \textbf{0.788} & 0.694 \\
  MCD      & 0.837 & \textbf{0.958} & 0.907 & 0.660 & 0.577 & 0.520 \\
  \bottomrule
  \end{tabular}}
\end{table}

The correlation between uncertainty measures and segmentation quality is shown in Table~\ref{tab:correlation_uncertainty_segmentation_quality}. We observe moderate to high correlations in all cases, ranging from 0.520 to 0.958, again higher when using the Dice score as segmentation quality measure.

As we observed before, there is a high correlation both between segmentation quality and structure volume and between uncertainty and structure volume.
Therefore, one can see the structure volume acting as a confounding variable between segmentation quality and uncertainty.
To correct for this, we use the partial Spearman's rank correlation between segmentation quality and uncertainty, accounting for the structure volume.
It removes the influence of structure volume from the correlation between segmentation quality and uncertainty.
We show the results in Table~\ref{tab:partial_correlation_uncertainty_segmentation_quality}.
One can observe that the correlations drop in all cases, sometimes considerably by up to 0.260. We can also compare Table~\ref{tab:correlation_uncertainty_segmentation_quality} and Table~\ref{tab:partial_correlation_uncertainty_segmentation_quality} to check if the ranking of different uncertainty methods (column wise) or uncertainty measures (row wise) changes when switching from correlation to partial correlation.
This is important, because it means that we potentially arrive at different conclusions about the utility of uncertainty methods/measures when we assess them either with the correlation or partial correlation.
We observe that the ranking of different methods is relatively stable (only one column changes rankings) while for the different measures the rankings differ more (all three rows changing for Dice and one row changing for ASSD).

From these observations we can conclude that the structure volume has a strong confounding effect on the correlation between uncertainty measures and the Dice score and a weak confounding effect on the correlation between uncertainty measures and ASSD. The partial correlation allows us to address this issue and arrive at correlation estimates less tainted by this confounding variable.

\begin{table}
\floatconts
  {tab:partial_correlation_uncertainty_segmentation_quality}
  {\caption{Partial correlation between uncertainty measures and segmentation quality controlling for structure volume. Differences to regular correlation (Table~\ref{tab:correlation_uncertainty_segmentation_quality}) are displayed in gray, measures that rank better (worse) within their column are marked with $\vartriangle(\triangledown)$ and uncertainty sources that changed rank within their row with $\blacktriangle (\blacktriangledown)$. Bold values mark the best method/measure combination for each segmentation quality metric.}}
  {\begin{tabular}{lccc@{\hspace{3em}}ccc}
  \toprule
   & {1-Dice vs.} & 1-Dice vs. & 1-Dice vs. & ASSD vs. & ASSD vs. & ASSD vs. \\
   & {Ep. Unc.} & 1-MPD & VVC & Ep. Unc. & 1-MPD & VVC  \\
  \midrule
  TTA      & {\llap{$^\blacktriangle$}}0.736  & \textbf{0.820}{\rlap{$^\vartriangle$}}  & {\llap{$^\blacktriangledown$}}0.665  & 0.607  & 0.553  & 0.508 \\
  \textcolor{gray}{$\Delta$} & \textcolor{gray}{\llap-0.109} & \textcolor{gray}{\llap-0.116} & \textcolor{gray}{\llap-0.185} & \textcolor{gray}{\llap-0.028} & \textcolor{gray}{\llap-0.033} & \textcolor{gray}{\llap-0.054} \\
  Ensemble & {\llap{$^\blacktriangle$}}0.792  & {\llap{$^\blacktriangledown$}}0.726{\rlap{$^\triangledown$}}  & 0.541  & {\llap{$^\blacktriangle$}}\textbf{0.678}  & {\llap{$^\blacktriangledown$}}0.629  & 0.534 \\
  \textcolor{gray}{$\Delta$} & \textcolor{gray}{\llap-0.116} & \textcolor{gray}{\llap-0.215} & \textcolor{gray}{\llap-0.260} & \textcolor{gray}{\llap-0.068} & \textcolor{gray}{\llap-0.159} & \textcolor{gray}{\llap-0.160} \\
  MCD      & {\llap{$^\blacktriangle$}}0.698  & 0.799{\rlap{$^\triangledown$}}  & {\llap{$^\blacktriangledown$}}0.675  & 0.638  & 0.513  & 0.443 \\
  \textcolor{gray}{$\Delta$} & \textcolor{gray}{\llap-0.139} & \textcolor{gray}{\llap-0.159} & \textcolor{gray}{\llap-0.232} & \textcolor{gray}{\llap-0.022} & \textcolor{gray}{\llap-0.064} & \textcolor{gray}{\llap-0.077} \\
  \bottomrule
  \end{tabular}}
\end{table}


\subsection{Evaluation per Structure Type}
In the previous section we observed a strong correlation between structure-wise uncertainty measures and structure volume when computing it across a wide variety of structure types. All these structure types differ in their typical volume (cf.\ Table~\ref{tab:number_of_structures_and_structure_volumes} in the appendix), which is why the structure type itself could be the confounding variable, with the structure volume being only an intermediate variable that depends on it.
So the remaining question is if the structure volume is the true confounding variable or if it is actually the structure type.

Considering correlation between structure size and uncertainty or segmentation quality, the median, 1\textsuperscript{st} and 3\textsuperscript{rd} quartiles over all 28 structure types are shown in Table~\ref{tab:distribution_correlation_with_structure_volume}.
The correlations between structure volume and uncertainty are weaker when evaluated per structure type, but still many structure types show correlations that deviate considerably from 0.

When correlating the uncertainty measures with the Dice score per structure type, the medians of the distribution of correlations move towards 0 in all nine cases we are assessing when switching from correlation to partial correlation. The lower and upper quartiles also move towards 0 in eight of nine cases each. For the ASSD the medians and quartiles sometimes rise and sometimes fall when switching from correlation to partial correlation.
Figure~\ref{fig:distribution_of_correlations_per_structure_type} in the appendix summarizes the distribution over correlations and partial correlations per structure type and Figure~\ref{fig:correlations_per_structure_type_dice} and Figure~\ref{fig:correlations_per_structure_type_assd} in the appendix provide all the individual correlations.
This shows that the confounding nature of structure volume is still present when assessing individual structure types and the partial correlation should be used. When correlating with the ASSD this effect is much less severe. This highlights its usefulness to assess the quality of uncertainty measures, as it is less affected by the confounding nature of structure volume.


\begin{table}
\floatconts
  {tab:distribution_correlation_with_structure_volume}
  {\caption{Distribution over correlation between uncertainty measures or segmentation quality and structure volume over structure types (median [1\textsuperscript{st} quartile, 3\textsuperscript{rd} quartile])}}
  {\begin{tabular}{lccccc}
  \toprule
   & -Ep. Unc.  & MPD & -VVC & Dice & -ASSD \\
  \midrule
  TTA      & 0.23\sml{} [-0.02,0.37] & 0.29\sml{} [0.17,0.42] & 0.17\sml{} [0.08,0.36] & 0.51\sml{} [0.27,0.68] & 0.25\sml{} [0.13,0.35]  \\
  Ensem. & 0.23\sml{} [0.13,0.51] & 0.30\sml{} [0.07,0.53] & 0.29\sml{} [0.06,0.44] & 0.51\sml{} [0.37,0.69] & 0.25\sml{} [0.18,0.37]  \\
  MCD      & 0.16\sml{} [0.06,0.30] & 0.19\sml{} [0.04,0.38] & 0.20\sml{} [0.11,0.35] & 0.58\sml{} [0.29,0.73] & 0.15\sml{} [0.00,0.40]  \\
  \bottomrule
  \end{tabular}}
\end{table}

\subsection{Statistical Testing for Structure-wise Evaluation}
To test if partial correlation also leads to consistent reductions when considering structure types individually, we perform a one-sided Wilcoxon signed-rank test over all types. The threshold for significance is set to $p<0.0028$ based on Bonferroni correction to achieve a family-wise error rate of 0.05. We find significance for TTA using either MPD or VVC as uncertainty measure and Dice score as segmentation quality metric.
%, as well as for MCD combined with VVC and Dice.
%VVC as uncertainty measure and Dice score as segmentation quality metric.


\section{Conclusion}
We evaluated the influence of structure volume as a confounding variable on the correlation between model uncertainty and segmentation quality in medical image segmentation.
It was confirmed that the Dice score has a strong correlation with structure volume, and in contrast to prior work, we also found model uncertainty to be strongly correlated with structure volume.
This effect was evaluated both across various structure types as well as individually per structure type. It appears stronger in the first case, but is still visible per structure type.

To counteract this issue, we propose to use the partial correlation coefficient when correlating segmentation quality and model uncertainty, which removes the confounding effect of the structure volume from the correlation estimate. In addition, we observe that the average symmetric surface distance suffers less from this issue, as it has only a very weak correlation with structure volume. This makes it a suitable segmentation quality measure that can be used in addition to the Dice score to assess the performance of uncertainty measures.

\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version

\midlacknowledgments{This work was supported within the Fraunhofer and DFG transfer programme. We thank Sophia Winkler, Christiane Engel and Andrea Koller for their help with creating manual annotations. We thank Ole Schwen and Max Westphal for helpful discussions. We also acknowledge the German National Cohort study for collecting the data on which we performed our evaluation.}


\bibliography{midl24_246}

\newpage

\appendix

\section{Number of Masks and Volumes per Structure Type}

\begin{table}[h]
\floatconts
  {tab:number_of_structures_and_structure_volumes}
  {\caption{Volume of individual structure types}}
  {\begin{tabular}{lcc}
    \toprule
    \rule[-1ex]{0pt}{3.5ex} Structure Type & Number of masks & Volume [ml] (mean $\pm$ std. dev.) \\
    \midrule
    \rule[-1ex]{0pt}{3.5ex} Adrenal Gland Left & 8 & 1.13 $\pm$ 0.93 \\
    \rule[-1ex]{0pt}{3.5ex} Adrenal Gland Right & 6 & 0.46 $\pm$ 0.37 \\
    \rule[-1ex]{0pt}{3.5ex} Aorta & 17 & 159.86 $\pm$ 54.09 \\
    \rule[-1ex]{0pt}{3.5ex} Autochthon Left & 18 & 513.04 $\pm$ 193.42 \\
    \rule[-1ex]{0pt}{3.5ex} Autochthon Right & 16 & 487.95 $\pm$ 187.71 \\
    \rule[-1ex]{0pt}{3.5ex} Clavicula Left & 24 & 24.49 $\pm$ 9.67 \\
    \rule[-1ex]{0pt}{3.5ex} Clavicula Right & 24 & 23.89 $\pm$ 8.22 \\
    \rule[-1ex]{0pt}{3.5ex} Esophagus & 17 & 23.37 $\pm$ 6.34 \\
    \rule[-1ex]{0pt}{3.5ex} Femur Left & 18 & 341.03 $\pm$ 52.93 \\
    \rule[-1ex]{0pt}{3.5ex} Femur Right & 17 & 337.94 $\pm$ 48.76 \\
    \rule[-1ex]{0pt}{3.5ex} Heart & 24 & 495.06 $\pm$ 173.17 \\
    \rule[-1ex]{0pt}{3.5ex} Hip Left & 25 & 299.66 $\pm$ 98.55 \\
    \rule[-1ex]{0pt}{3.5ex} Hip Right & 25 & 306.24 $\pm$ 78.32 \\
    \rule[-1ex]{0pt}{3.5ex} Kidney Left & 18 & 145.69 $\pm$ 36.08 \\
    \rule[-1ex]{0pt}{3.5ex} Kidney Right & 18 & 143.43 $\pm$ 34.84 \\
    \rule[-1ex]{0pt}{3.5ex} Liver & 26 & 1660.21 $\pm$ 410.92 \\
    \rule[-1ex]{0pt}{3.5ex} Lung Left & 25 & 1715.26 $\pm$ 437.92 \\
    \rule[-1ex]{0pt}{3.5ex} Lung Right & 25 & 2046.60 $\pm$ 485.89 \\
    \rule[-1ex]{0pt}{3.5ex} Pancreas & 17 & 73.56 $\pm$ 19.35 \\
    \rule[-1ex]{0pt}{3.5ex} Sacrum & 24 & 158.62 $\pm$ 42.40 \\
    \rule[-1ex]{0pt}{3.5ex} Scapula Left & 24 & 71.61 $\pm$ 17.90 \\
    \rule[-1ex]{0pt}{3.5ex} Scapula Right & 24 & 78.51 $\pm$ 27.39 \\
    \rule[-1ex]{0pt}{3.5ex} Spleen & 18 & 211.46 $\pm$ 71.36 \\
    \rule[-1ex]{0pt}{3.5ex} Stomach & 18 & 222.13 $\pm$ 140.41 \\
    \rule[-1ex]{0pt}{3.5ex} Trachea & 18 & 27.16 $\pm$ 9.63 \\
    \rule[-1ex]{0pt}{3.5ex} Urinary Bladder & 19 & 154.18 $\pm$ 121.17 \\
    \rule[-1ex]{0pt}{3.5ex} Vertebrae C & 14 & 18.66 $\pm$ 7.53 \\
    \rule[-1ex]{0pt}{3.5ex} Vertebrae L & 17 & 229.63 $\pm$ 70.20 \\
    \rule[-1ex]{0pt}{3.5ex} Vertebrae T & 17 & 242.51 $\pm$ 94.14 \\
    \bottomrule
  \end{tabular}}
\end{table}

\newpage

\section{Distribution of Correlation between Uncertainty and Segmentation Quality per Structure Type}

\begin{figure}[h]
    \subfigure[TTA]{
    \label{fig:distribution_tta}
    \includegraphics[width=1.0\linewidth]{images/structure_wise_presiduals_partial_correlation_tta.pdf}
    }
    
    \subfigure[Ensemble]{
    \label{fig:distribution_ensemble}
    \includegraphics[width=1.0\linewidth]{images/structure_wise_presiduals_partial_correlation_ensemble.pdf}
    }
    
    \subfigure[MCD]{
    \label{fig:distribution_mcd}
    \includegraphics[width=1.0\linewidth]{images/structure_wise_presiduals_partial_correlation_mcd.pdf}
    }
    \caption{Distribution over (partial) correlation between segmentation quality and uncertainty for individual structure types.}
    \label{fig:distribution_of_correlations_per_structure_type}
\end{figure}

\newpage

\section{Correlation of Uncertainty and Segmentation Quality per Structure Type}

\begin{figure}[h]
    \subfigure[TTA]{
    \label{fig:dice_tta}
    \includegraphics[width=1.0\linewidth]{images/correlations_per_stucture_type_TTA_1 - Dice_2024_03_17.pdf}
    }
    
    \subfigure[Ensemble]{
    \label{fig:dice_ensemble}
    \includegraphics[width=1.0\linewidth]{images/correlations_per_stucture_type_Ensemble_1 - Dice_2024_03_17.pdf}
    }
    
    \subfigure[MCD]{
    \label{fig:dice_mcd}
    \includegraphics[width=1.0\linewidth]{images/correlations_per_stucture_type_MCD_1 - Dice_2024_03_17.pdf}
    }
    \caption{(Partial) correlation between 1 - Dice and uncertainty for individual structure types.}
    \label{fig:correlations_per_structure_type_dice}
\end{figure}

\begin{figure}[h]
    \subfigure[TTA]{
    \label{fig:assd_tta}
    \includegraphics[width=1.0\linewidth]{images/correlations_per_stucture_type_TTA_ASSD_2024_03_17.pdf}
    }
    
    \subfigure[Ensemble]{
    \label{fig:assd_ensemble}
    \includegraphics[width=1.0\linewidth]{images/correlations_per_stucture_type_Ensemble_ASSD_2024_03_17.pdf}
    }
    
    \subfigure[MCD]{
    \label{fig:assd_mcd}
    \includegraphics[width=1.0\linewidth]{images/correlations_per_stucture_type_MCD_ASSD_2024_03_17.pdf}
    }
    \caption{(Partial) correlation between average symmetric surface distance and uncertainty for individual structure types.}
    \label{fig:correlations_per_structure_type_assd}
\end{figure}

\end{document}