\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\jmlrvolume{-- 146}
\jmlryear{2024}
\jmlrworkshop{Full Paper -- MIDL 2024 submission}
\editors{Accepted for publication at MIDL 2024}

% Author packages
\usepackage{verbatim}
\usepackage{float}
\usepackage{bm}
\usepackage{algorithm2e}
\usepackage{algorithmic}
\newcommand{\argmax}[1]{\underset{#1}{\operatorname{arg}\,\operatorname{max}}\;}
\newcommand{\argmin}[1]{\underset{#1}{\operatorname{arg}\,\operatorname{min}}\;}
\DeclareMathOperator{\E}{\mathbb{E}}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{hhline}
\usepackage{caption}
\usepackage{makecell}
\usepackage{float}
\DeclareMathOperator{\diag}{diag}
\usepackage{adjustbox}
\newcommand{\specialcell}[2][c]{%
	\begin{tabular}[#1]{@{}l@{}}#2\end{tabular}}
\newcommand{\specialcellC}[2][c]{%
	\begin{tabular}[#1]{@{}c@{}}#2\end{tabular}}
\usepackage{multirow}
\newcommand{\mycom}[1]{}

\title[Active Learning with the nnUNet]{Active Learning with the nnUNet and Sample Selection with Uncertainty-Aware Submodular Mutual Information Measure}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Bernhard F\"ollmer\nametag{$^{1}$}} \Email{bernhard.foellmer@charite.de}\\
	\Name{Kenrick Schulze\nametag{$^{1}$}} \Email{kenrick.schulze@charite.de}\\
	\Name{Christian Wald\nametag{$^{2}$}} \Email{wald@math.tu-berlin.de}\\
	\Name{Sebastian Stober\nametag{$^{3}$}} \Email{stober@ovgu.de}\\
	%\Name{Jackie Ma\nametag{$^{3}$}} \Email{jackie.ma@hhi.fraunhofer.d}\\
	\Name{Wojciech Samek\nametag{$^{4,5,6}$}} \Email{wojciech.samek@hhi.fraunhofer.de}\\
	\Name{Marc Dewey\nametag{$^{1,7}$}} \Email{marc.dewey@charite.de}\\
	\addr $^{1}$ Department of Radiology, Charité-Universitätsmedizin Berlin, corporate member of Freie Universität Berlin and Humboldt-Universität zu Berlin, Berlin, 10117, Germany \\
	\addr $^{2}$ Institute of Mathematics, Technical University of Berlin, Berlin, Germany \\
	\addr $^{3}$ Artificial Intelligence Lab, Otto-von-Guericke-Universität, Magdeburg, Germany \\
	\addr $^{4}$ Department of Artificial Intelligence, Fraunhofer Heinrich Hertz Institute, Berlin, Germany \\
	\addr $^{5}$ BIFOLD – Berlin Institute for the Foundations of Learning and Data, Berlin, Germany \\
	\addr $^{6}$ Department of Electrical Engineering and Computer Science, Technical University of Berlin, Berlin, Germany \\
	\addr $^{7}$ Berlin Institute of Health and DZHK (German Centre for Cardiovascular Research), partner site Berlin,  Germany \and Deutsches Herzzentrum der Charité (DHZC), Berlin, Germany \\
}

\begin{document}

\maketitle

\begin{abstract}
Annotating medical images for segmentation tasks is a time-consuming process that requires expert knowledge.
Active learning can reduce this annotation cost and achieve optimal model performance by selecting only the most informative samples for annotation.
However, the effectiveness of active learning sample selection strategies depends on the model architecture and training procedure used.
The nnUNet has achieved impressive results in various automated medical image segmentation tasks due to its self-configuring pipeline for automated model design and training.
This raises the question of whether the nnUNet is applicable in an active learning setting to avoid cumbersome manual configuration of the training process and improve accessibility for non-experts in deep learning-based segmentation.
This paper compares various sample selection strategies in an active learning setting in which the self-configuring nnUNet is used as the segmentation model. 
Additionally, we propose a new sample selection strategy for UNet-like architectures: \textbf{USIM} - \textbf{U}ncertainty-Aware \textbf{S}ubmodular Mutual \textbf{I}nformation \textbf{M}easure. 
The method combines uncertainty and submodular mutual information to select batches of uncertain, diverse, and representative samples.
We evaluate the performance gain and labeling costs on three medical image segmentation tasks with different segmentation challenges.
Our findings demonstrate that utilizing nnUNet as the segmentation model in an active learning setting is feasible, and most sampling strategies outperform random sampling. 
Furthermore, we demonstrate that our proposed method yields a significant improvement compared to existing baseline methods.
\end{abstract}

\begin{keywords}
Deep Learning, Active Learning, Medical Image Segmentation, nnUNet, Submodular Subset Selection
\end{keywords}

\section{Introduction}
Segmentation of tumors and surrounding anatomy is an important task in medical imaging for cancer diagnosis using CT and MRI \cite{Hesamian2019}.
However, training segmentation models requires a large number of voxel-wise labeled images. Annotating such datasets is usually time-consuming and requires expert knowledge \cite{Ren2022}.\\
Active learning (AL) can address this issue by iteratively selecting only the most informative samples (batches) from the unlabeled dataset for annotation and training, while still achieving near-optimal model performance.
However, the performance of AL sampling strategies depends heavily on the model architecture and training procedure used \cite{10030143}.
To analyze the gain of AL strategies, it is important to ensure robustness and reproducibility under different experimental conditions \cite{Munjal2022}.
The performance gain and labeling cost reduction of AL strategies depend on the following factors.
First, the training pipeline including data preprocessing (normalization, augmentation), model architecture (patch size, number of layers), and training procedure (batch size, learning rate scheduling). Second, the dataset characteristics such as dataset size, class imbalance, difficulty of the learning task. Third, the labeling budget during each sampling round. \\
The nnUNet \cite{Isensee2021} is a self-configuring pipeline, which allows for the training of 2D and 3D UNet-like models \cite{Ronneberger2015a} without the need for manual adaptation of preprocessing, model architecture, or hyperparameters and has won many segmentation competitions \cite{Antonelli2022}. 
The nnUNet identifies robust design decisions based on multiple tasks \cite{Isensee2021}, ensuring reproducibility and robustness in evaluating AL strategies \cite{Munjal2022, Burmeister2022} by automatically configuring preprocessing, model architecture, and hyperparameters.
This paper investigates whether the use of a self-configuring training pipeline in an AL setting can reduce annotation costs.
This would also increase reproducibility and could facilitate the use of active learning for medical image segmentation.\\
Most deep AL strategies that combine uncertainty, diversity and representativeness in their sampling objective have been developed for classification tasks, and less frequently for segmentation tasks \cite{jimaging7020037}.
Novel AL strategies such as BADGE \cite{Ash2020Deep} select samples based on diverse gradients where gradient length captures the uncertainty.
We explore the adaptation of gradient-based active learning methods to U-Net-like architectures and segmentation tasks with high-dimensional annotations.
To efficiently combine predictive uncertainty and gradient based sample representation, we propose  \textbf{USIM} - \textbf{U}ncertainty-aware \textbf{S}ubmodular mutual \textbf{I}nformation \textbf{M}easure. This approach combines predictive uncertainty-based sampling with diversity and representative sampling in parameter space using submodular mutual information measures \cite{Kothawade2021, 10.1007/978-3-031-19839-7_1}.
The novelty of our approach lies in the selection of the query set based on class-weighted predictive uncertainty using Monte Carlo Dropout and the estimation of representative gradient embeddings based on the bottleneck layer (USIMC) as well as an automated gradient embedding selection based on the Fisher information (USIMF).\\
Our contributions are twofold:
First, we evaluate various active learning strategies using the nnUNet pipeline as the segmentation model in an AL setting.
Second, we propose and evaluate USIM, an AL strategy that combines predictive uncertainty with diversity and representativeness using a submodular mutual information measure.

\section{Related work}
Two main principles for selecting batches of informative samples are uncertainty-based sampling and representation-based sampling.
Various uncertainty-based approaches exist \cite{Pratapa2011,32948baba85d4c19b9d9594dcf57bf03}, such as Monte Carlo dropout-based methods \cite{Gal2016,Kendall2017a}, Bayesian neural networks \cite{pmlr-v70-gal17a}, or ensemble methods \cite{Chitta2018LargeScaleVA}.
However, methods that rely solely on uncertainty are not suitable for large datasets with redundant information.\\
Representation-based sampling methods model the representativeness and diversity of samples within a batch. 
For example, the Core-set approach \cite{sener2018active} estimates distances between samples modeled by the Euclidean distance between feature vectors.	Uncertainty and representation-based methods have been used for classification and segmentation tasks \cite{Burmeister2022}.\\
Hybrid methods aim to combine uncertainty and diversity	in their sampling objectives to select informative samples while avoiding redundancy \cite{Yang2017, 9310359}.
These methods rely on compact image representations, which are typically extracted from the last layer of networks for classification tasks, such as BADGE \cite{Ash2020Deep}.
\citet{Sreenivasaiah} investigated adaptation of this method for segmentation tasks \cite{Aklilu2022}.
MEAL \cite{Sreenivasaiah} extracts embeddings based on Uniform Manifold Approximation to model representativeness of image patches for image segmentation.\\
In addition, semi-supervised approaches offer additional solutions to reduce labeling costs \cite{MB23a, Gaillochet2022}.  
Recent active learning strategies make use of the Fisher information ratio, Hessian, or similarity matrices but were mainly developed for classification tasks \cite{kirsch2022unifying, Kothawade2021, Ash2020Deep, NEURIPS2021_4afe0449, Liu_2021_ICCV} and seldom for segmentation tasks \cite{Al2019, Yu2023}.\\
We explicitly compare practical and user-friendly methods and exclude methods that require design of additional models to evaluate informativeness of unlabeled samples, such as VAAL \cite{Sinha2019} or adaptation of the loss function. 
We further exclude methods that require additional sub-networks with trainable parameters that might influence the automated architecture configuration of the nnUNet due to memory restrictions \cite{Yoo2019LearningLF}.

\section{Method}

\subsection{Active learning and submodular subset selection}
Active learning is the process where the learning algorithm attempts to maximize a model’s performance gain while annotating the fewest samples possible \cite{Ren2022}.

\subsubsection{Active learning for medical image segmentation}
To evaluate AL sampling strategies for medical image segmentation, the dataset is split into training and test set. 
From the training data, we construct an unlabeled pool $\bm{X_U}=\{\bm{x^{(j)}}\}_{j=1}^{U}$ of patches $\bm{x} \in \mathbb{R}^{C \times H \times W}$ with patch width $W$, height $H$ and number of channels $C$.
We randomly select and label a small initial labeled dataset $\bm{X_L}=\{(\bm{x}^{(j)},\bm{y}^{(j)})\}_{j=1}^{L}$ with segmentation masks  $\bm{y} \in \mathbb{R}^{K \times H \times W}$ ($K$ - number of classes) to train the segmentation model.
During multiple sampling rounds, we use one of the sampling strategies to select a batch $\bm{X_B} \subseteq \bm{X_U}$ of $B$ most informative samples, simulate annotation and add the annotated samples to the labeled dataset.
Since retraining from scratch is very time-consuming, we fine-tune the model in each AL round and evaluate the model using the test set.

\subsubsection{Submodular mutual information measure for subset selection}
Submodular functions are a class of functions that can be used to model guided (uncertainty-guided in our approach) data subset selection of representative subsets \cite{10.1007/978-3-031-19839-7_1}.
The submodular mutual information (SMI) function is given by 
$I_f(\bm{X_B} ; \bm{X_Q})=f(\bm{X_B})+f(\bm{X_Q})-f(\bm{X_B} \cup \bm{X_Q})$, where $\bm{X_Q}$ is the query set (target set), $\bm{X_B}$ is the selected subset of patches from the unlabeled data set and $f: 2^{\bm{X}} \rightarrow \mathbb{R}$ a submodular set function defined as $f(\bm{X})=\sum_{x_i \in \bm{\Omega}} \max _{x_j \in \bm{X}} S_{i j}$. 
The term $S_{i j}$ measures the similarity between the elements $x_i$ and $x_j$.
The instantiated submodular function $I_f(\bm{X_B} ; \bm{X_Q})$ is used to select a subset that maximizes the submodular function 
$\bm{X_B} \leftarrow \operatorname{argmax}_{\bm{X_B} \subseteq \bm{X_U},|\bm{X_B}| \leq B} I_f(\bm{X_B} ; \bm{X_Q})$.
Intuitively, SMI models the similarity between $\bm{X_Q}$ and $\bm{X_B}$, and maximizing SMI will select points similar to $\bm{X_Q}$ while being diverse.
We use the mutual information function variant for the facility location (FL) function defined over $\bm{X_Q}$ (FL\textbf{Q}MI) shown in Equation~\ref{eq:submod}. The variant was successfully applied in \citet{10.1007/978-3-031-19839-7_1} and is memory and time efficient. 
The parameter $\eta$ balances query-relevance and diversity. 
\citet{Kothawade2022} and \citet{beck2024theoretical} performed a qualitative and theoretical analysis with respect to query-coverage, query-relevance and diversity, and showed that as $\eta$ is increased, the summary produced by FL\textbf{Q}MI becomes more query-relevant and less diverse.
\begin{equation}
	\label{eq:submod}
	I_f(\bm{X_B} ; \bm{X_Q})=\sum_{x_i \in \bm{X_Q}} \max _{x_j \in \bm{X_B}} S_{i j}+
	\eta \sum_{x_i \in \bm{X_B}} \max _{x_j \in \bm{X_Q}} S_{i j}
\end{equation} 
We further investigate the robustness of our method in a redundancy scenario to confirm the superiority of the submodular information measure for subset selection compared to sampling with a uniform distribution over the query set in Appendix~\ref{redundency}. 

\subsection{Active learning with the self-configuring nnUNet pipeline}
We first run the nnUNet preprocessing and planning function on the raw dataset to generate a dataset fingerprint and automatically configure the model architecture and training procedure.
Based on the automatically configured parameters (patch size, resampling parameter, etc.), we extract overlapping image patches to create the unlabeled pool of samples.
We label a small randomly selected subset and run the self-configuring nnUNet pipeline again to re-configure a subset of parameters for resampling, intensity normalization and train the initial model. 
We used the trained model to select the most informative batch of samples using one of the compared sample selection strategies.
After annotation, we rerun the nnUNet pipeline to update the dataset fingerprint and planning configuration, and fine-tune the model.
Since the characteristics of the image data remain the same and only the annotations change, most model configurations, such as model architecture and learning rate, remain unchanged. The only changes that might occur are annotation resampling and intensity normalization. 
Sampling, annotation and self-configured model fine-tuning are repeated in multiple AL rounds.
The proposed AL framework is shown in Figure~\ref{fig:model}.
\begin{figure}[h]
	\centering
	\vspace{8pt}
	\includegraphics[width=1.0\textwidth]{images/model}
	\caption{Active learning utilizing the self-configuring nnUNet pipeline. The nnUNet-pipeline automatically configures and trains the model in each active learning round. The most informative samples are selected from the unlabeled dataset using one of the compared sampling strategies including our proposed USIMC and USIMF methods. Images are annotated by the radiologist (oracle) and added to the labeled dataset. The procedure is repeated in multiple AL rounds to increase model performance while annotating the fewest samples possible.}
	\vspace{8pt}
	\label{fig:model}
\end{figure}

\subsection{USIM - Uncertainty-aware submodular mutual information measure for subset selection}
We propose USIM, a sample selection strategy that combines predictive uncertainty and submodular mutual information measure for subset selection with UNet-like architectures.
\subsubsection{Predictive Uncertainty as a Measure of Informativeness}
To measure the informativeness of a sample, predictive uncertainty using Monte Carlo dropout is a powerful and simple method \cite{32948baba85d4c19b9d9594dcf57bf03}. 
In our work, the uncertainty of the unlabeled samples is estimated using Monte Carlo dropout and the model $\bm{\theta}_{k-1}$ trained in the previous round.
A subset of $Q$ uncertain samples $\bm{X_Q}$ is sampled (with replacement) from the unlabeled dataset with sampling probability proportional to a weighted uncertainty 
$p(\bm{\theta}_{k-1}, \bm{x})=
\frac{\sum_{c} w_c \cdot \operatorname{Uncert}(\bm{\theta}_{k-1}, \bm{x}, c)}{\sum_{\bm{x}' \in \bm{X_U}} \sum_{c} w_c \cdot \operatorname{Uncert}(\bm{\theta}_{k-1}, \bm{x}', c)}$.
The uncertainty of a foreground class $c$ is weighted by  $w_c$ which is the inverse of the number of annotated voxels of that class in the training set.
The uncertainty-based weighting ensures that the performance is less biased towards classes where more data is presented.
We analyze the influence of query set size $Q$ in Appendix~\ref{influence}.
In our experiments, $Q$ is estimated using the elbow method \cite{Thorndike1953} which plots the sample uncertainties in descending order and takes the elbow of the curve as the number of uncertain samples in the data set. 

\subsubsection{Mutual information based submodular subset selection with gradient embeddings}
To create an informative batch of samples, it is important to consider not only uncertainty but also diversity and representativeness.
Since we select the query set $\bm{X_Q}$ by sampling from the unlabeled dataset with a probability distribution proportional to the class-weighted uncertainty, the query set includes not only the most uncertain samples but also more diverse samples.
To confirm that the proposed sampling strategy of the query set is more efficient than standard active learning (query set is the unlabeled dataset) \cite{Kothawade2021} or sampling the most uncertain samples from the unlabeled dataset, we compare the performance gain for the methods in Appendix~\ref{influence}.
In our approach, we use the mutual information function variant for the facility location (FL) function (FL\textbf{Q}MI) shown in Equation~\ref{eq:submod}.
The method requires a similarity matrix 
$\bm{S_{u q}} \in \mathbb{R}^{|\bm{X_U}| \times|\bm{X_Q}|}$
between samples of the unlabeled dataset and the query set and can be constructed in many ways, e.g. by computing the cosine distance \cite{10.1007/978-3-031-19839-7_1}, Euclidean distance \cite{inproceedings} or Fisher information kernel.
We analyze two variants of similarity matrices using cosine similarity between gradient embeddings of the bottleneck layer defined as \textbf{USIMC} and an approximation of pairwise influence between samples using the Fisher kernel which is defined as \textbf{USIMF}.
Details about the construction of the similarity matrices can be found in Appendix~\ref{fishersim}. 
After construction of the similarity matrix, we instantiate the submodular function $I_f(\bm{X_B} ; \bm{X_Q})$ and use the stochastic greedy method to select the batch $\bm{X_B}$ for labeling.
We use the stochastic greedy optimizer from SUBMODLIB \cite{Kaushal2022} because it has a provably linear running time independent of the budget and is faster than the naive greedy approach.
We summarize the proposed USIM method in Algorithm~\ref{alg:usim}.

\label{algorithm}
\begin{algorithm}[h]
	\caption{USIM - Uncertainty-Aware Submodular Mutual Information Measure}
\label{alg:usim}
\begin{algorithmic}[1]
	\STATE \textbf{Input:} Initial labeled dataset: $\bm{X_L}$, unlabeled dataset: $\bm{X_{U}}$, initial self-configured nnUNet model parameter: $\bm{\theta}_0$, batch size: $B$, number of sampling rounds: $K$
	\FOR {$k=1,2,...,K$} 
	\STATE Estimate sample uncertainty $\operatorname{Uncert}(\bm{\theta}_{k-1}, \bm{x}) \in \mathbb{R}$ for all unlabeled samples $\bm{x} \in \bm{X_U}$
	\STATE Sample a query subset $\bm{X_Q} \subseteq \bm{X_U}$ with sample probability proportional to weighted uncertainty
	\STATE Compute similarity matrix $\bm{S_{u q}}$ with Equation~\ref{eq:cosine} for USIMC or \ref{eq:fishersim} for USIMF
	\STATE Instantiate the submodular function $I_f(\bm{X_B} ; \bm{X_Q})$ based on $\bm{S_{u q}}$ (Equation~\ref{eq:submod})
	\STATE Use stochastic greedy method to select batch $\bm{X_B}$ with\\
	$\bm{X_B} \leftarrow \operatorname{argmax}_{\bm{X_B} \subseteq \bm{X_U},|\bm{X_B}| \leq B} I_f(\bm{X_B} ; \bm{X_Q})$
	\STATE Query the oracle to obtain segmentation masks $\bm{y}(\bm{x}), \forall \bm{x} \in \bm{X_B}$
	\STATE $\bm{X_{L}} \leftarrow \bm{X_{L}} \cup \bm{X_B} $ ; $\bm{X_{U}} \leftarrow \bm{X_{U}} \setminus \bm{X_B} $
	\STATE Train model on $\bm{X_L}$: $\bm{\theta}_k = \argmin{\bm{\theta}} {\E_{\bm{X_L}}[l(\bm{x},\bm{y};\bm{\theta})]}$
	\ENDFOR
	\RETURN Final model $\bm{\theta}_{K}$
\end{algorithmic} 
\end{algorithm}

\section{Experiments and Results}
\label{experiments}

\subsection{Datasets}
We evaluate the AL sampling methods on three medical image segmentation datasets from the Medical Segmentation Decathlon \cite{Antonelli2022} to ensure method evaluation on different target regions, modalities and challenging features. 
For our analysis, we choose the labeled training sets of 1) the Spleen dataset, a small dataset of 41 CT scans with a widely ranging foreground size, 2) the Liver dataset, a large dataset of 131 CT scans with label imbalance between the large liver class and the small tumor class, and 3) the Hippocampus dataset, consisting of 260 MRI scans with the challenge of segmenting two adjacent small structures with high precision.

\subsection{Active learning sampling strategies and implementation}
\textbf{Sampling Strategies:} We compare the following sampling methods in our analysis: (1) Random Sampling, (2) Max Entropy \cite{Shannon1948}, (3) Mean STD \cite{Kendall2017a}, (4) BALD \cite{pmlr-v70-gal17a}, (5) Core-Set \cite{sener2018active}, (6) BADGE(LL) \cite{Ash2020Deep, Aklilu2022}, (7) Stochastic Batches \cite{Gaillochet2023} and (8) USIMC (ours), (9) USIMF (ours).\\
\textbf{Implementation:} For our experiments, we used 2D nnUNet configurations because they have a higher training speed and are less prone to overfitting. 
However, the proposed USIM method can easily be extended to 3D models and will be studied in further research.\\
For all our experiments, we set $\eta=1.0$ and analyzed the influence of the hyperparameter in Appendix~\ref{influence}.
The used hardware configurations and self-configured hyperparameters can be found in
Table~\ref{table:parameter} in Appendix~\ref{configuration}.
The code for training and evaluation is available at \url{https://github.com/Berni1557/ALUNET}.\\
Further details about sampling strategies and implementation can be found in Appendix~\ref{sampling}.

\subsection{Results}
\label{results}
The performance gain in terms of Dice score with respect to the number of annotated samples for all competing strategies is shown in Figure~\ref{fig:performance}. \\
\textbf{Spleen:}
All sampling strategies except Core-Set outperformed random sampling. However, the segmentation task is less challenging  and the performance differences between the strategies are therefore small. 
With less than 8\% of annotated data, USIMF achieved a Dice score similar to that of the model trained with the fully annotated dataset. \\
\textbf{Liver:}
Uncertainty-based methods (BALD, Mean STD) performed better than random sampling.
USIMF and USIMC outperformed most methods and reached near-optimal performance with less than 5\% of the data.\\
\textbf{Hippocampus:}
USIMC and USIMF outperformed all other sampling strategies and reached the performance of the fully annotated dataset with roughly 30\% of the annotated data.
All other strategies outperformed or performed similarly to random sampling.\\
The pairwise penalty matrix \cite{10030143} in Figure~\ref{fig:performance} (D) aggregates results over all conducted experiments. 
The overall performance $\Phi$ is measured by the column-wise average, where lower numbers indicate a higher-performing algorithm.
The results show that USIMC and USIMF have the lowest column-wise averages, indicating that they outperform other strategies.
Figures~\ref{fig:segmentation_Spleen}, \ref{fig:segmentation_Liver}, and \ref{fig:segmentation_Hippocampus} in Appendix~\ref{examples} show examples of AL-based segmentation results for all three datasets.
Figure~\ref{fig:tsne} shows a t-SNE plot of the USIMF gradient embeddings on the Liver dataset.
\begin{figure}[H]
	\centering
	\includegraphics[width=0.75\textwidth]{images/performance.png}
	\caption{
		Performance comparison of active learning strategies (Random sampling, Maximum Entropy, Mean STD, BALD, Core-Set, BADGE(LL), Stochastic Batches, USIMC (ours), USIMF (ours)) and the fully labeled dataset for the Spleen (A), Liver (B) and Hippocampus (C) datasets. A pairwise penalty matrix is shown in (D). Element $(i, j)$ corresponds to the number of times (expressed in percentage) algorithm $i$ outperforms algorithm $j$. Column-wise averages $\Phi$ are given where a lower number corresponds to a higher-performing algorithm.
	}
	\label{fig:performance}
\end{figure}

\section{Discussion and Conclusion}
In this paper, we have investigated the utility of the nnUNet as a self-configuring model in an AL setting to reduce labeling costs.
We additionally proposed USIM, an AL strategy that combines predictive uncertainty and a submodular mutual information measure to select informative, diverse, and representative batches with two similarity matrix variants (USIMF and USIMC).
The experiments confirmed that using the self-configuring nnUNet pipeline in an active learning setting is an effective strategy for reducing labeling costs and facilitating AL by avoiding the cumbersome configuration of the training process.
All methods performed as well as or better than random sampling.
We showed that methods based on uncertainty (BALD, Mean STD) outperformed those based on representation (Core-Set).
Our hybrid method outperformed the compared active learning methods, with USIMF performing slightly better than USIMC.
The proposed method was evaluated only on 2D nnUNets, but it can be extended to 3D approaches. 
However, we refrained from conducting the evaluation due to longer training times, leaving it for future research.
Further research is necessary to prove the effectiveness of active learning for medical image segmentation in real scenarios, rather than relying solely on simulations. 

\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{This work was funded by the German Research Foundation through the graduate program BIOQIC (GRK2260, project-ID: 289347353).}


\bibliography{midl24_146}


\appendix

\section{Similarity matrices for submodular mutual information measure}
\label{fishersim}
We analyze two gradient based similarity matrices for the submodular mutual information measure.\\
\textbf{USIMC}: In USIMC, we use the cosine similarity between gradient embeddings of the bottleneck layer as the similarity measure to construct a similarity matrix $\bm{S_{u q}}$.
The matrix measures similarity between the gradients of the unlabeled dataset $\bm{g_u}$ and the query set $\bm{g_q}$.
\begin{equation}
	\label{eq:cosine}
	\bm{S_{u q}}=\frac{\bm{g_u} \cdot \bm{g_q}}{\lVert \bm{g_u} \rVert \lVert \bm{g_q} \rVert}
\end{equation}
\textbf{USIMF}: In USIMF, we use the approximated pairwise influence (Fisher kernel) as similarity matrix.
The Fisher kernel has already been used in \citet{Liu_2021_ICCV} for influence selection-based active learning and for deep active learning on biased datasets \cite{Gudovskiy2020}.
%The similarity matrix is computed using Equation~\ref{eq:fishersim}.
The similarity matrix $\bm{S_{u q}} \in \mathbb{R}^{|\bm{X_U}| \times|\bm{X_Q}|}$ is constructed between the unlabeled dataset $\bm{X_U}$ and the query data set $\bm{X_Q}$. 
To select only the gradients from important parameters, we use a simple method that selects parameters based on the empirical Fisher information matrix \cite{Tu2016ReducingTM}.
The use of the Fisher matrix to identify model parameters that are important for a learning task was proposed by \citet{Kirkpatrick2017} to avoid catastrophic forgetting. 
We use Equation~\ref{eq:fisher} to approximate the Fisher information matrix based on a randomly sampled subset of the query set $\bm{X_Q}$ with pseudo labels $\bm{\bar{y}}$ and predicted class probability $p_{\bm{\theta}}(\bm{\bar{y}} \vert \bm{x})$.
\begin{equation}
	\label{eq:fisher}
	\bm{\mathcal{F}}(\bm{\theta})=
	\diag(\frac{1}{\lvert \bm{X_Q} \rvert}\sum_{\bm{x} \in \bm{X_Q}}
	\nabla_{\bm{\theta}}\log p_{\bm{\theta}}(\bm{\bar{y}} \vert \bm{x})
	\nabla_{\bm{\theta}}\log p_{\bm{\theta}}(\bm{\bar{y}} \vert \bm{x})^{T})
\end{equation}
The Fisher information matrix is used to select a small subset of important parameters 
$\bm{\hat{\theta}} \subseteq \bm{\theta}$ 
by sampling with probability distribution proportional to the Fisher information 
given by 
$p(\theta_i)=\frac{\bm{\mathcal{F}}(\theta_i)}{\lVert\bm{\mathcal{F}}(\bm{\theta})\rVert}$.
We compute the gradient embeddings (Fisher score) of the unlabeled dataset with $\bm{g_u}=\nabla_{\bm{\hat{\theta}}}\log p_{\bm{\theta}}(\bm{\bar{y}} \vert \bm{x}), \bm{x} \in \bm{X_U}$ 
and the query set with 
$\bm{g_q}=\nabla_{\bm{\hat{\theta}}}\log p_{\bm{\theta}}(\bm{\bar{y}} \vert \bm{x}), \bm{x} \in \bm{X_Q}$.
The similarity matrix is then constructed using Equation~\ref{eq:fishersim}.
\begin{equation}
	\label{eq:fishersim}
	\bm{S_{u q}}=\bm{g_u}^T\bm{\mathcal{F}}^{-1}\bm{g_q}
\end{equation}

\newpage
\section{USIMF-based gradient embedding visualization}
\begin{figure}[h]
	\centering
	\includegraphics[width=0.99\textwidth]{images/strategy.png}
	\caption{Visualization (t-SNE) of gradient embeddings from USIMF. Black dots: labeled dataset, grey dots: unlabeled dataset, blue dots: target dataset, red dots: selected batch samples. 
		The diameter of the dots visualizes the sample uncertainty.
	}
	\label{fig:tsne}
\end{figure}

\newpage

\section{Sampling strategies and implementation details}
\label{sampling}
\textbf{Sampling strategies}
We compare the following sampling methods in our analysis: (1) Random Sampling, (2) Max Entropy \cite{Shannon1948}, (3) Mean STD \cite{Kendall2017a}, (4) BALD \cite{pmlr-v70-gal17a}, (5) Core-Set \cite{sener2018active}, (6) BADGE(LL) \cite{Ash2020Deep, Aklilu2022}, (7) Stochastic Batches \cite{Gaillochet2023} and (8) USIMC (ours), (9) USIMF (ours).\\
For Core-Set, the activation map of the last convolutional layer of the encoder was used as the feature vector to compute Euclidean distances between samples.
BADGE(LL) is an adaptation of BADGE for segmentation tasks. 
The method uses the gradients from the last convolutional layer of the decoder to extract gradient embeddings.
The method is equivalent to the ALGES-img method proposed in \citet{Aklilu2022}.
Stochastic Batches is a method that combines uncertainty and representation by measuring uncertainty at the level of batches instead of samples. 
%The stochastic batch with highest uncertainty is than used for labeling in each sampling round. 
The method depends on the number of batches $Q$ that are generated from the unlabeled dataset; we set $Q=\lfloor |D_U|/B \rfloor$ and used entropy as the uncertainty measure in our experiments. 
For a fair comparison between USIMC and USIMF, and to make the processing tractable, the gradient embeddings are truncated to a length of 10k elements.\\
\textbf{Implementation}
The methods were evaluated on the first two folds of the nnUNet-based self-configured five-fold cross validation. 
The model was initialized with $100$ randomly selected samples (patches) and the labeled dataset was expanded with a budget of $B=100$ samples in each sampling round.
In our experiments, we selected an annotation budget of 100 slices, since it can be considered a realistic budget for practical active learning scenarios.
A larger budget may lead to less cost reduction, while a smaller budget may be impractical in realistic scenarios where radiologists must integrate the annotation process into their daily clinical routine.
However, we also evaluated our proposed strategy for larger budgets by increasing the budget to 900 and 2000 samples for the Hippocampus and Liver datasets, respectively, in the last sampling rounds, even though this might not be reasonable for practical active learning.

\newpage
\section{Analysis of USIM based on query set selection strategy, batch uncertainty, query set size and weighting parameter}
\label{influence}
To further analyze and validate the proposed USIM sampling strategy, we investigated the query set selection strategy, batch uncertainty, influence of the query set size, and weighting parameter $\eta$.
We only conducted experiments for the USIMF method since it can be considered the best performing method without loss of generality.

\textbf{Query set selection:}
We compared our proposed query set selection strategy based on the probability distribution proportional to the weighted uncertainty, with uniform sampling (similar to standard active learning in SIMILAR) \cite{Kothawade2021} and maximum uncertainty-based sampling. 
We compared query set selection strategies for the first two sampling rounds on the Hippocampus and Liver datasets, as shown in Figure~\ref{fig:query_selection}.
We observed that our query set selection strategy (USIMF) outperformed 
uniform sampling $\bm{X_Q} \sim Uniform(\bm{X_U}, Q)$ (USIMF uniform sampling) and sampling based on maximum uncertainty $\bm{X_Q} \leftarrow \operatorname{argmax}_{\bm{X_Q} \subseteq \bm{X_U},|\bm{X_Q}| = Q} \sum_{x_i \in \bm{X_Q}} Uncert(x_i)$ (USIMF max uncertainty sampling). 
%standard active learning $\bm{X_Q} \leftarrow \bm{X_U}$ and sampling based on maximum uncertainty $\bm{X_Q} \leftarrow \operatorname{argmax}_{\bm{X_Q} \subseteq \bm{X_U},|\bm{X_Q}| = Q} \sum_{x_i \in \bm{X_Q}} Uncert(x_i)$(USIMF max uncertainty sampling). 
Similar observations were made in \cite{Gaillochet2023}, where uncertainty-based sampling was performed on a stochastic batch level to overcome the shortcomings of pure uncertainty-based methods in sampling redundant data.
% alpaper_plot11
\begin{figure}[H]
	\centering
	\includegraphics[width=1.0\textwidth]{images/query_selection.png}
	\caption{Performance gain in terms of Dice score for our query set selection strategy (USIMF), query set selection using uniform sampling from the unlabeled dataset (USIMF uniform sampling) and sampling based on maximum uncertainty (USIMF max uncertainty sampling) on the Hippocampus and Liver datasets.}
	\label{fig:query_selection}
\end{figure}


%\textbf{Gradient embedding:}
%We examined the impact of gradient embedding selection and found that our proposed methods, which estimate gradient embeddings from the bottleneck layer (USIMC) or estimate the gradient based on Fisher information (USIMF), produce better representations than last layer gradients usually used active learning for classification tasks \cite{Ash2020Deep}.
%In Figure~\ref{fig:gradient_embedding}, we compare the performance gain in terms of Dice score during the first two sampling rounds for USIMC, USIMF, and USIMC using last layer gradient embeddings (USIMC - last layer gradient embedding). 
%As expected, we observe that the performance gain based on last layer gradient embeddings is much smaller compared to the proposed methods and is in line with the lower performance gain of BADGELL in Figure~\ref{fig:performance}.
%\begin{figure}[H]
%	\centering
%	\includegraphics[width=1.0\textwidth]{images/gradient_embedding.png}
%	\caption{Performance gain in terms of dice score for USIMC, USIMF and USIMC with gradient embedding es extracted from the last layer of the U-Net architecture (USIMC - last layer gradient embedding) on the Hippocampus and Liver dataset.}
%	\label{fig:gradient_embedding}
%\end{figure}
%Uncertainty, diversity, and representativeness are three important properties of an effective sampled set. To analyse to what extend our sampling method meets this three criteria, we analyze uncertainty, diversity and representativeness.
\textbf{Uncertainty:}
We analyzed the amount of uncertainty of the selected batches to confirm that the chosen samples are indeed uncertain. 
We compared in Figure~\ref{fig:uncertainty} the overall uncertainty 
$Uncert(\bm{X_B}) = \sum_{\bm{x} \in \bm{X_B}} \sum_{c} Uncert(\bm{\theta}_{k-1}, \bm{x}, c)$ of the selected batches estimated using Monte Carlo dropout with $c$ being the foreground class and $\bm{\theta}_{k-1}$ the trained model from the previous training round.
The analysis was performed in the first five sampling rounds of the Hippocampus dataset.
We observed that the uncertainty of batches sampled with USIMF and USIMC is lower than that of uncertainty-based methods (Mean STD, ENTROPY, BALD) but higher than that of representativity-based methods such as CORESET, hybrid variants (BADGELL, STB) or Random sampling.
\begin{figure}[H]
	\centering
	\includegraphics[width=0.6\textwidth]{images/bar_uncertainty_hippocampus.png}
	\caption{Overall batch uncertainty of the AL methods in the first five sampling rounds on the Hippocampus dataset.}
	\label{fig:uncertainty}
\end{figure}
To investigate the relationship between the segmentation error and the uncertainty measured by the gradient length and prediction uncertainty, we also performed a correlation analysis during the first sampling round of the Hippocampus dataset.
We analyzed the segmentation error (total number of misclassified pixels) and predictive uncertainty, as well as the gradient length (L2 norms) of the bottleneck layer on a randomly selected subset of the unlabeled dataset. 
The correlation coefficients show that for segmentation tasks with the nnUNet, the predictive uncertainty estimated using Monte Carlo dropout correlates more strongly with the segmentation error ($r=0.84$) than the gradient length does ($r=0.68$), indicating that predictive uncertainty is a better measure of uncertainty.

%Interestingly, for the correlation between gradient length and segmentation error, most samples that do not correlate well are samples with moderate gradient length but small segmentation errors (red circle).
%The results indicate that measuring uncertainty gradient length as a measure uncertainty measure are suboptimal
\begin{figure}[h]
	\centering
	\includegraphics[width=1.0\textwidth]{images/correlation.png}
	\caption{Correlation between the total number of misclassified pixels and the predictive uncertainty (left), and the correlation between the total number of misclassified pixels and the gradient length of the bottleneck layer (right)}
	\label{fig:correlation}
\end{figure}
\newpage

\textbf{Query set size:}
To analyze the influence of the size of the query set $Q$, we compare the performance gain with respect to $Q$ in Figure~\ref{fig:query_size} for USIMF. 
For the Hippocampus dataset, we set $Q \in \{200, 400, 1600, U\}$ with $U$ being the size of the unlabeled dataset and compare it with automated estimation of the query set size (USIMF). 
The query set size for the USIMF method which was estimated with the elbow method \cite{Thorndike1953} was $Q=942$  $(942/7662)$ and $Q=1470$ $(1470/7562)$ for the first and second round, respectively.  
For the Liver dataset, we set the query set size $Q \in \{200, 400, 800, 1600, 10000\}$.
For the USIMF method, the estimated query size was automatically set to $Q=6606$ $(6606/104133)$ and $Q=6852$ $(6852/104033)$ for the first and second rounds, respectively. 
As the query set is sampled with replacement, we assume that the overall uncertainty of the batch should remain similar. It can be observed that the performance gain is not significantly affected by the size of the query set.
% and the performance gain is not significantly affected by the size of the query set. 
%It is evident that the optimal number of query samples is highly dependent on the dataset.
%It can be observed that the performance gain is not significantly affected by the size of the query set. 
%Therefore, we conclude that the elbow method \cite{Thorndike1953} used, which plots the sample uncertainties in descending order and takes the elbow of the curve as the number of uncertain samples in the data set, is a decent method for estimating the query set size.
% alpaper_plot07
\begin{figure}[H]
	\centering
	\includegraphics[width=1.0\textwidth]{images/query_size.png}
	\caption{Performance gain with respect to the size of the query set $Q$ in the first two sampling rounds for the Hippocampus and Liver datasets.}
	\label{fig:query_size}
\end{figure}

\newpage

\textbf{Weighting parameter $\bm{\eta}$:}
We analyze the influence of the weighting parameter $\eta$, which governs the trade-off between query relevance and diversity, for USIMF.
\citet{Kothawade2022} have shown in their experiments that larger weighting parameters $\eta$ tend to increase query relevance while decreasing query coverage and diversity.
We analyzed the performance gain with respect to $\eta \in \{0.0, 1.0, 10.0, 100.0\}$ in the first two sampling rounds and did not observe large performance differences.
%We observe that for the Liver dataset, larger values of $\eta$ seem to slightly improve performance indicating that the query set already includes diverse samples so that query relevance seems to be 
% alpaper_plot08
\begin{figure}[H]
	\centering
	\includegraphics[width=1.0\textwidth]{images/weighting_parameter.png}
	\caption{Performance gain with respect to the weighting parameter $\eta$.}
	\label{fig:weighting_parameter}
\end{figure}

%\begin{figure}[H]
%	\centering
%	\includegraphics[width=1.0\textwidth]{images/analysis.png}
%	\caption{Performance comparison of USIMF with selection of the query set based on a) uniform probability distribution, b) sampling probability proportional to a normalized weighted predictive uncertainty and c) most uncertain samples.}
%	\label{fig:sampling_query}
%\end{figure}

\newpage

\section{Redundancy scenario}
\label{redundency}
To analyze our proposed method under a realistic redundancy scenario, we create a custom unlabeled Hippocampus dataset by duplicating all samples (patches) of one MRI scan (hippocampus\textunderscore172.nii.gz) 100 times after the initial training round. 
After the duplication process, the query set with a query size of $Q=1000$ is sampled with probability distribution proportional to uncertainty.
After the sampling process, the query set consists of $45\%$ samples from the same MRI scan. 
After submodular subset selection, the selected batch $X_B$ includes only $18\% (18/100)$ samples from the duplicated scan, compared to $45\% (45/100)$ with uniform sampling from the query set.
It confirms that the proposed strategy with mutual information based submodular subset selection is robust with respect to redundancy.
We compare the performance gain in terms of dice score based on the redundant unlabeled dataset for USIMF and uniform sampling from the query set in Figure~\ref{fig:redundency}.
\begin{figure}[H]
	\centering
	\includegraphics[width=0.6\textwidth]{images/redundancy.png}
	\caption{Performance comparison of USIMF and uniform sampling from the query set in a redundancy scenario.}
	\label{fig:redundency}
\end{figure}

\newpage

%\section{Correlation between segmentation error, predictive uncertainty and gradient length}
%\label{correlation}
%A exemplary correlation analysis was conducted between the segmentation error (total number of misclassified pixels) and predictive uncertainty, as well as the gradient length (L2-norms) of te bottleneck layer on a randomly selected subset of the unlabeled dataset during the first sampling round of USIMF. 
%The correlation coefficients indicates that for segmentation tasks with the nnUNet, the predictive uncertainty estimated using Monte Carlo dropout correlates stronger with the segmentation error ($r=0.84$) than the gradient length ($r=0.68$).
%The correlation plots are shown in Figure~\ref{fig:correlation}.
%Interestingly, for the correlation between gradient length and segmentation error, most samples that do not correlate well are samples with moderate gradient length but small segmentation errors (red circle).
%The results indicate that measuring uncertainty gradient length as a measure uncertainty measure are suboptimal
%\begin{figure}[h]
%	\centering
%	\includegraphics[width=1.0\textwidth]{images/correlation.png}
%	\caption{Correlation between the total number of misclassified pixels and the predictive uncertainty (left), and the correlation between the total number of falsely predicted pixels and the gradient length of the bottleneck layer (right)}
%	\label{fig:correlation}
%\end{figure}
%\newpage

\section{Configurations and self-configured hyperparameters by the nnUNet pipeline}
\label{configuration}
\begin{table}[H]
	\centering
		\caption{Configurations and self-configured hyperparameters}
			\begin{tabular}{|c|c|}
				\hline
				\multicolumn{2}{|c|}{Spleen dataset} \\
				%\Xhline{1.5pt} 
				\hline
				Hardware & 120GB RAM; NVIDIA A100-PCIE GPU, 40GB  \\
				\hline 
				nnUNet configuration & 2D \\
				\hline
				\# downsampling stages & 8 \\
				\hline 
				\# model parameters & 10.2M \\
				\hline 
				Batch size & 12 \\
				\hline 
				Patch size & 512 x 512 \\
				\hline
				Epochs per training round & 300 \\
				\hline
				Loss function & Compound loss (dice and cross entropy loss)\\
				\hline\hline
				\multicolumn{2}{|c|}{Liver dataset} \\
				\hline
				Hardware & 120GB RAM; NVIDIA A100-PCIE GPU, 40GB \\
				\hline 
				nnUNet configuration & 2D \\
				\hline 
				\# downsampling stages & 8 \\
				\hline 
				\# model parameters & 46.3M \\
				\hline 
				Batch size & 12 \\
				\hline 
				Patch size & 512 x 512 \\
				\hline
				Epochs per training round & 300 \\
				\hline
				Loss function & Compound loss (dice and cross entropy loss)\\
				\hline\hline
				\multicolumn{2}{|c|}{Hippocampus dataset} \\
				\hline
				Hardware & 120GB RAM; NVIDIA V100 GPU, 32GB  \\
				\hline 
				nnUNet configuration & 2D \\
				\hline 
				\# downsampling stages & 4 \\
				\hline 
				\# model parameters & 1.9M \\
				\hline 
				Batch size & 366 \\
				\hline 
				Patch size & 56 x 40 \\
				\hline
				Epochs per training round & 300 \\
				\hline
				Loss function & Compound loss (dice and cross entropy loss)\\
				\hline
			\end{tabular}
			%}
		\label{table:parameter}
		%\end{adjustbox}
	\end{table}	
	
	\newpage

	\newpage
	\section{Examples of active learning for medical image segmentation}
	\label{examples}
	\begin{figure}[H]
		\centering
		\includegraphics[width=1.0\textwidth]{images/Spleen.png}
		\caption{The top row, from left to right, shows a CT slice of the spleen, the ground truth segmentation, and the segmentation result of the model trained on the fully annotated dataset. 
			The second row shows the segmentation results after the first and fifth round of random sampling.
			The third row shows the segmentation results after the first and fifth round of the USIMF sampling method.}
		\label{fig:segmentation_Spleen}
	\end{figure}
	
	\newpage
	
	\begin{figure}[h]
		\centering
		\includegraphics[width=1.0\textwidth]{images/Liver.png}
		\caption{The top row, from left to right, shows a CT slice of the liver, the ground truth segmentation, and the segmentation result of the model trained on the fully annotated dataset. 
			The second row shows the segmentation results after the first and fifth round of random sampling.
			The third row shows the segmentation results after the first and fifth round of the USIMF sampling method.}
		\label{fig:segmentation_Liver}
	\end{figure}
	
	\newpage
	
	\begin{figure}[H]
		\centering
		\includegraphics[width=0.85\textwidth]{images/Hippocampus.png}
		\caption{The top row, from left to right, shows an MRI slice of the hippocampus, the ground truth segmentation, and the segmentation result of the model trained on the fully annotated dataset. 
			The second row shows the segmentation results after the first and fifth round of random sampling.
			The third row shows the segmentation results after the first and fifth round of the USIMF sampling method.}
		\label{fig:segmentation_Hippocampus}
	\end{figure}

\end{document}
