% This is samplepaper.tex, a sample chapter demonstrating the
% LLNCS macro package for Springer Computer Science proceedings;
% Version 2.21 of 2022/01/12
%
\documentclass[runningheads]{llncs}
%
\usepackage[T1]{fontenc}
% T1 fonts will be used to generate the final print and online PDFs,
% so please use T1 fonts in your manuscript whenever possible.
% Other font encodings may result in incorrect characters.
%
\usepackage{graphicx}
% Used for displaying a sample figure. If possible, figure files should
% be included in EPS format.
%
% If you use the hyperref package, please uncomment the following two lines
% to display URLs in blue roman font according to Springer's eBook style:
%\usepackage{color}
%\renewcommand\UrlFont{\color{blue}\rmfamily}
%\urlstyle{rm}
%

% add philippe
\usepackage{xcolor}
\usepackage{todonotes}
\usepackage{comment}
\usepackage{hyperref}

\usepackage{multirow}

\usepackage{cite}
% \usepackage[numbers, sort&compress]{natbib}
% \usepackage[backend=biber, style=numeric-comp, maxbibnames=5, maxcitenames=2]{biblatex}
% \addbibresource{biblio.bib}

\newcommand{\rev}[1]{{\color{black} #1}} 

\begin{document}
%
% \title{Combining SAM and nnU-Net in an Active Learning Framework for 3D Dental Image Segmentation}
\title{From Prediction to Prompt: Leveraging nnU-Net Outputs to Guide SAM for Active Learning in 3D Dental Segmentation}
%
%\titlerunning{Abbreviated paper title}
% If the paper title is too long for the running head, you can set
% an abbreviated paper title here
%
\author{Nicolas Martin\inst{1,2}\orcidID{0000-0002-2788-1042} \and
Jean-Pierre Chevallet\inst{2}\orcidID{0000-0002-5945-9444} \and
Philippe Mulhem\inst{2}\orcidID{0000-0002-3245-6462}}
%
\authorrunning{N. Martin et al.}
% First names are abbreviated in the running head.
% If there are more than two authors, 'et al.' is used.
%
\institute{PEEKTORIA, Grenoble, France \\
\email{nicolas.martin@peektoria.com}
\and
%\and
%Springer Heidelberg, Tiergartenstr. 17, 69121 Heidelberg, Germany
%\email{lncs@springer.com}\\
%\url{http://www.springer.com/gp/computer-science/lncs} \and
Univ. Grenoble Alpes, CNRS, Grenoble INP\footnote{Institute of Engineering Univ. Grenoble Alpes}, LIG, Grenoble, France\\
\email{\{jean-pierre.chevallet,philippe.mulhem\}@univ-grenoble-alpes.fr}}
%
\titlerunning{nnU-Net Predictions as Prompts for SAM-Med3D}
\maketitle              % typeset the header of the contribution
%
\begin{abstract}
%This paper explores the segmentation of 3D dental images (Cone Beam Computed Tomography - CBCT) through active learning (AL) by integrating nnU-Net and a 3D specialized Segment Anything Model (SAM). 
To enhance annotation efficiency in 3D dental Cone Beam Computed Tomography (CBCT) image segmentation, this paper explores an active learning \rev{(AL)} approach that leverages nnU-Net predictions to generate prompts for a specialized 3D Segment Anything Model (SAM).
The objective is to minimize the annotation burden without relying on prompts during the inference phase. 
First, our experiments showed that AL offers similar segmentation performance with less than 20\% of the original annotations. 
Second, random selection offers results similar to those of a more complex sampling method while requiring less computation.
Third, the predictions of nnU-Net on unannotated images provided effective prompts for the SAM model specialized in 3D medical images \rev{(i.e., SAM-Med3D)}. Combining these two approaches reduced the required amount of manual annotation by up to 50\%.
This paper paves the way for more easily obtaining new annotated datasets in the dental domain while simultaneously training a segmentation model, by leveraging SAM-like models.

\keywords{Active Learning \and nnU-Net \and Segment Anything \and Segmentation \and 3D dental CBCT.}
\end{abstract}
%
%

\section{Introduction}

% \todo[inline]{
% -add qualitative visualisation
% - fix results 
% }

% The segmentation of organs is one of the most explored task in computer vision applied to medical images. 
% In dental domain, the wide-spreading of imaging technology inside dental offices, such as cone-beam computed tomography (CBCT) or panoramic X-ray, highlighted the need for a automatic solution to easily exploit this information. Correctly segmenting anatomical structures (e.g., teeth) is usually an essential step in the pipeline of the computer-aided detection system \cite{litjens_survey_2017}.
% Although dental problems affect a very large proportion of the population, computer vision tools applied to dentistry remain less explored, notably due to the low number of annotated datasets outside the recent MICCAI challenges (e.g., ToothFairy \cite{Bolelli_2025_CVPR}, 3DTeethSeg \cite{ben_hamadou_3dteethseg22_2023}).
% As showed by the recent challenges organized at MICCAI (e.g., ToothFairy~\cite{Bolelli_2025_CVPR}, 3DTeethSeg~\cite{ben_hamadou_3dteethseg22_2023}), segmenting dental organs, especially on 3D images remains a strong challenge. It mainly rely on nnUNet model \cite{isensee_nnu_net_2021}, specialized on dental datasets (e.g., \cite{wang_sts_2024, isensee_scaling_2024}).

Organ segmentation is a highly active research area within computer vision for medical imaging. In the dental domain, the widespread adoption of imaging technologies like Cone Beam Computed Tomography (CBCT) and panoramic X-rays in clinical settings has underscored the critical need for automated solutions to effectively leverage this information.
Precisely segmenting anatomical structures (e.g., teeth) is often an essential step for robust computer-aided detection systems \cite{litjens_survey_2017}. 
Despite dental issues affecting a significant global population, dedicated computer vision tools for dentistry remain less developed, largely due to a scarcity of annotated datasets outside the scope of recent MICCAI challenges (e.g., ToothFairy \cite{Bolelli_2025_CVPR}, 3DTeethSeg \cite{ben_hamadou_3dteethseg22_2023}). As highlighted by these challenges, accurately segmenting dental organs, particularly in 3D images, presents a major difficulty and currently often relies on adaptations of the nnU-Net model \cite{isensee_nnu_net_2021} specialized for dental datasets (e.g., \cite{isensee_scaling_2024, wang_sts_2024}).

On the other hand, inspired by the success of large language models (LLMs), which are pre-trained using self-supervised learning (SSL) on very large datasets and fine-tuned to follow instructions (prompt-based models)~\cite{ouyang_training_2022}, the Segment Anything Model (SAM)~\cite{SAMKirillov} has been proposed.
% \rev{On the other hand, large language models (LLMs) are pre-trained using self-supervised learning (SSL) on massive datasets and then fine-tuned to follow instructions (prompt-based models)~\cite{ouyang_training_2022}. Inspired by their success, the Segment Anything Model (SAM)~\cite{SAMKirillov} was proposed.}
The initial SAM model~\cite{SAMKirillov} was trained on approximately one billion image-mask pairs. This attention-based model is designed to be applied to any image, aiming to address nearly any segmentation task. Despite this initial assertion, these models are unable to correctly segment specific image types, such as medical images~\cite{HUANG2024103061}, necessitating fine-tuning (e.g., MedSAM~\cite{MedSAMMa, ma_medsam2_2025}, SAM-Med3D~\cite{wang_sam_med3d_2024}). Furthermore, such SAM-like models heavily rely on ``prompts'' (e.g., bounding boxes, points), which serve as strong indicators for defining the image region to be segmented~\cite{SAMKirillov}.
In practice, in daily clinical routine, the introduction of SAM is nearly impossible, as it requires a precise bounding box or multiple points to perform accurate segmentation \cite{li_optimization_2025}. Consequently, it remains essential to train segmentation models on annotated data.
This paper investigates the integration of Active Learning (AL) with SAM-like models to reduce the expert annotation burden in 3D dental segmentation tasks.

\section{Related work}
Prior studies on active learning (AL) have shown that not all data points are equally informative \cite{ren_survey_2021}. Their annotations can significantly influence both the training process and the final performance of the model \cite{settles_active_2009}. Selecting the most informative images should be more beneficial to model performance than random selection of images \cite{yoo_learning_2019}. This assumption has led to the development of numerous AL methods designed to select the most informative samples for annotation \cite{settles_active_2009, ren_survey_2021}.

In the dental domain, obtaining images for diagnostic or archival purposes has become standard practice, leading to the availability of large datasets \cite{zeng2020generating}. However, these datasets are rarely annotated \cite{dao_comprehensive_2023}. Thus, selecting the most informative images using AL methods presents a valuable opportunity to significantly alleviate the annotation workload for experts, thereby promoting the creation of more efficient medical tools based on deep learning algorithms: see \cite{budd_survey_2021} for a review of AL for medical images.
In the 3D dental domain, Huang et al. \cite{huang_uncertainty_based_2024} and Jung et al. \cite{jung_deep_2021} showed that AL can improve the segmentation performance.

In the context of 2D medical images, Li et al. \cite{li_plugandplay_2025} explored the combination of nnU-Net and a generic SAM model. SAM predictions are directly integrated into the nnU-Net architecture as an external module to enhance segmentation performance. Stock et al. \cite{stock_segment_2025} investigated the integration of nnU-Net with SAM for 3D images. However, due to computational constraints, their approach is applied in a 2D slice-by-slice manner.
On the other hand, interactive annotation methods relying on SAM-like models have been proposed: Isensee et al. \cite{isensee_nninteractive_2025} trained an nnU-Net model on 120+ 3D datasets to produce segmentation masks using prompts.  % But these methods are only adapted for data annotation and not integrated into AL process.

In this paper, we explore the integration of active learning with promptable segmentation models (e.g., SAM-like models). To the best of our knowledge, no prior study has investigated the combination of nnU-Net and SAM for 3D dental image segmentation within an active learning framework.

\section{Method}
This paper investigates two key aspects: (1) the impact of various AL sampling strategies on 3D image segmentation performance and (2) the performance of SAM-like models (i.e., SAM-Med3D~\cite{wang_sam_med3d_2024}) when integrated with nnU-Net-derived prompts during AL training.

\subsection{Datasets}
The ToothFairy2 dataset~\cite{Bolelli_2025_CVPR} was used in the following experiments. It is composed of 480 Cone Beam Computed Tomography (CBCT) volumes annotated with 42 classes.
To reduce computational complexity and focus our analysis, the original anatomical classes were re-categorized into the following 6 broader classes for segmentation:
\begin{itemize}
    \item Background
    \item Jawbones: Lower and Upper
    \item Inferior Alveolar Canal (IAC): Left and Right
    \item Sinus: Left and Right
    \item Pharynx
    \item Teeth (32 classes originally)
\end{itemize}

Due to their sparse representation in the dataset, the Bridge, Crown, Implant, and NA classes were excluded from segmentation and assigned to the background.
% To facilitate the experiments and similary to [REF], the classes have been grouped by main anatomical structure: L/R IAC, L/R Sinus, Teeth, Jawbones, Pharynx, and Others.

\subsection{Metrics}
The segmentation performance was evaluated using the Dice Similarity Coefficient (DSC in \%).
For a given image $i$ and a specific target class $C$, let $Sg_i^C$ represent the set of pixels assigned to class $C$ in the ground truth segmentation, and $Sa_i^C$ denote the corresponding set of pixels predicted by the automatic segmentation model. The $Dice$ score for class $C$ on image $i$ quantifies the overlap between these two segmentations and is defined by equation~(\ref{eq:4b}):

\begin{equation}
\label{eq:4b}
Dice(Sg_i^C, Sa_i^C) = \frac{2| Sg_i^C \cap Sa_i^C|}{|Sg_i^C| + |Sa_i^C |}
\end{equation}

DSC ranges from $0$ to $1$, where $1$ indicates perfect agreement between the predicted and ground truth segmentations for that specific class. The overall performance is typically reported as the mean of these per-image, per-class Dice scores averaged across all relevant classes and images in the dataset.

To evaluate the effectiveness of SAM-Med3D~\cite{wang_sam_med3d_2024} in facilitating annotation, we calculated the Symmetric Difference (SD). This metric quantifies the total volume of discrepancy between two segmentations, representing the exact voxels an expert would need to adjust (either add or remove) to align a prediction with the ground truth. It is defined as the sum of false positives (FP) and false negatives (FN), as shown in Equation~(\ref{eq:5}):

\begin{equation}
\label{eq:5}
SD(A, B) = FP + FN
\end{equation}
This metric is normalized per class by the union of the predicted and ground-truth voxels, resulting in the Normalized Symmetric Difference (NSD). This normalization accounts for differences in organ size and ensures a fair comparison between classes with large regions (e.g., jawbones) and those with small regions (e.g., IAC). NSD ranges from $0$ to $100$, where $0$ indicates perfectly overlapping masks that require no modification.

\subsection{Active Learning sampling methods}
Two AL sampling methods have been evaluated: Naive sampling (random selection) and Least confidence sampling.
Random sampling consists of randomly selecting \textit{N} images at each AL round.
The least confidence \cite{li_hal_ia_2023} approach involves selecting the images for which the model is the least confident. The least confidence score for a single pixel is defined in Equation~(\ref{eq:1}):

\begin{equation}
\label{eq:1}
Uncertainty_{LeastConfidence}(\hat{y}) = |1 - \hat{y}|
\end{equation}
where $\hat{y}$ is the predicted value for pixel $y$ of an input image. The uncertainty score for an entire image is obtained by averaging the individual pixel uncertainty scores across all considered classes.

\subsection{Workflow}

During the AL process (see \figurename~\ref{al_training}), round 0 corresponds to the cold-start and consists of the following:
(1) \textit{N} images are randomly selected for annotation,
(2) a data fingerprint is generated and used to prepare the dataset for nnU-Net, and
(3) the model is trained.

The following steps are performed in each subsequent AL round:
\begin{enumerate}
    \item the informativeness of each unlabeled image is computed using the previously trained model,
    \item the most informative images are selected,
    \item these images are annotated and incorporated into the set of images labeled in previous AL rounds,
    \item the images are prepared for nnU-Net. Following the approach of~\cite{follmer_active_2024}, a fixed data fingerprint (generated in round 0) is reused across iterations to accelerate data preparation,
    \item a new model is fine-tuned, and
    \item the model is evaluated, with the best checkpoint always used to make predictions at each AL round.
\end{enumerate}
This AL process is repeated until the annotation budget is exhausted.

Concerning the SAM predictions, the following steps are performed (see \figurename~\ref{sam_prompting}):
\begin{enumerate}
    \item Predictions are generated using the nnU-Net model.
    \item Prompts (i.e., simulated clicks on relevant areas corresponding to classes) are generated based on these predictions.
    \item The images and prompts are fed into SAM-Med3D to produce 3D segmentations.
\end{enumerate}

\begin{figure}[htb]
    \centering
    \includegraphics[scale=0.38]{active_learning_1.pdf}
    \caption{Active Learning (AL) workflow. \rev{Round 0: A predefined number of images are randomly selected to generate the nnU-Net data fingerprint and train the initial model. Steps in a single AL round: (1) evaluate the informativeness of each image in the unlabeled pool using the current model, (2) select the most informative images, (3) annotate the selected images, (4) prepare the images for nnU-Net using the existing data fingerprint, (5) fine-tune the model with both previously and newly annotated images, and (6) evaluate the updated model. Steps 1--6 are repeated until the annotation budget is exhausted.}}
    \label{al_training}    
\end{figure}

\begin{figure}[htb]
    \centering
    \includegraphics[scale=0.35]{medsam_prediction.pdf}
    \caption{Overview of 3D-assisted annotation using nnU-Net and a SAM-like model. \rev{The process consists of four steps: (1) generating pixel-wise predictions with nnU-Net, (2) creating point-based prompts (simulated clicks) from these predictions, (3) producing pixel-wise predictions with SAM-Med3D using these prompts, and (4) performing human corrections on the generated masks to obtain the final annotation.}}
    \label{sam_prompting}    
\end{figure}

\subsection{Network architecture}
The segmentation is performed using the nnU-Net model \cite{isensee_nnu_net_2021}. It builds upon the successful U-Net architecture \cite{ronneberger_u-net_2015} and offers a self-configuring approach that minimizes the need for manual parameter tuning. nnU-Net has consistently demonstrated high performance across various medical datasets \cite{isensee_nnu_net_2021} and has become the default model for medical image segmentation \cite{russell2024machine, isensee_nnu_net_2024}.
Concerning prompt-based models for segmentation, the SAM-Med3D model \cite{wang_sam_med3d_2024} has been used. This model has been specialized for 3D medical images and adapted to handle click-based prompts.

\subsection{Hyper-parameters}
Concerning nnU-Net~\cite{isensee_nnu_net_2021}, the default parameters were used, with three exceptions. To reduce computational demands and mitigate overfitting, since AL involves significantly fewer annotated examples than standard training, the number of iterations per epoch was limited to 100. Additionally, the number of epochs per AL round was limited to 50. Lastly, only the 3D low-resolution configuration of nnU-Net was used.

Concerning the AL part, prospective comparison of AL methods (i.e., actually asking an expert to annotate the selected images) is problematic, since image selection influences subsequent selections and, consequently, the results. To enable a fair comparison, the AL process was simulated using the fully annotated dataset.
The following parameters were used:
\begin{itemize}
    \item Number of AL rounds: 10
    \item Number of images selected at each AL round: 5
    \item Cold start (round 0): 5\% of the annotated data (20 images) were randomly selected and used to initialize model training
    \item At each AL round, in accordance with the survey of Budd et al.~\cite{budd_survey_2021}, the model was fine-tuned using all available annotated data (previously + newly annotated images), starting from the best checkpoint of the previous round.
\end{itemize}

For the evaluation, 15\% of the dataset (72 images) was randomly sampled to form the test dataset.
For a fair comparison, a nnU-Net model was also trained on the fully annotated dataset for the same number of iterations (50,000) as used in the 10 AL iterations (called “Internal Test” in Table~\ref{tab:AL_comparison}).

Concerning the SAM-Med3D model \cite{wang_sam_med3d_2024}, the default parameters were used. 

Experiments were performed on a system with an NVIDIA A6000 GPU (48 GB VRAM), Intel Xeon Silver 4208 CPU (16 cores), and 128 GB RAM. The code used for the experiments is publicly available at \url{https://github.com/martinicmrim/sam_nnunet}.

\section{Results}

\subsection{Active Learning sampling methods on segmentation performance}

The comparison between Active Learning (AL) sampling methods is depicted in Table~\ref{tab:AL_comparison}. The performance of the AL sampling methods was evaluated using the model weights from the final AL round (i.e., round 10). We also report the performance obtained using the fully annotated dataset (“Internal Test”), as well as the performance of random sampling AL with the 3D full-resolution configuration of nnU-Net.

The AL methods demonstrated performance comparable to training on the full dataset, utilizing less than 20\% of the original training data, with the exception of ``Sinus'' segmentation.
Similar segmentation performance is observed between the random selection method and least confidence selection, although the training time of the latter is about five times longer (27 vs.\ 5 hours).

\begin{table}[htb]
\caption{Mean Dice Score (in \%) at the last AL round (round 10) and training time on grouped ToothFairy2 classes according to Active Learning sampling method}
\label{tab:AL_comparison}
\begin{tabular}{lcccccccc}
\hline
\textbf{} &
  Average &
  Jawbones &
  IAC &
  Sinus &
  Pharynx &
  Teeth &
  \begin{tabular}[c]{@{}c@{}}Training \\ time\end{tabular} &
  \begin{tabular}[c]{@{}c@{}}Data \\ used\end{tabular} \\ \hline
\textbf{Method}                                                                     & DSC   & DSC   & DSC   & DSC   & DSC   & DSC   & Hours & \%  \\ \hline
Full dataset (FD)~\cite{Bolelli_2025_CVPR}                                              & 70.92 & 90.31 & 71.34 & 64.81 & 95.66 & 73.17 & NA    & 100 \\
%\begin{tabular}[c]{@{}l@{}}Random Samp. AL \\ $\;\;\;$(last step)\end{tabular}           & 74.33 & 98.5  & 88.38 & 0     & 96.73 & 88.38 & 5     & 18  \\
\begin{tabular}[c]{@{}l@{}}Random Samp. AL \end{tabular}           & 74.33 & 98.5  & 88.38 & 0     & 96.73 & 88.38 & 5     & 18  \\
%\begin{tabular}[c]{@{}l@{}}Least Conf. Samp. AL \\ $\;\;\;$(last step)\end{tabular} & 73.7  & 98    & 86.02 & 0     & 96.82 & 87.67 & 27    & 18  \\ \hline
\begin{tabular}[c]{@{}l@{}}Least Conf. Samp. AL \end{tabular} & 73.7  & 98    & 86.02 & 0     & 96.82 & 87.67 & 27    & 18  \\ \hline
% Full Data\\ $\;\;\;$(Internal Test)                                                       & 74.33 & 98.15 & 88.38 & 0.0   & 96.74 & 88.37 & 5     & 100 \\
Internal Test FD & 74.33 & 98.15 & 88.38 & 0.0   & 96.74 & 88.37 & 5     & 100 \\
%\begin{tabular}[c]{@{}l@{}}Random Samp. AL \\ - Full resolution \\ $\;\;\;$(last step)\end{tabular} &
\begin{tabular}[c]{@{}l@{}}Random Samp. AL \\ - Full resolution \end{tabular} &
  72.62 &
  97.89 &
  85.42 &
  0.0 &
  95.16 &
  84.60 &
  7 &
  100 \\ \hline
\end{tabular}
\end{table}

% \todo[inline]{add visualisation of annotation over AL rounds}

Figure~\ref{al_qualitative} depicts the qualitative evaluation of segmentation across AL rounds.

\begin{figure}[htb]
    \centering
    \includegraphics[scale=0.35]{AL_qualitative.pdf}
    \caption{Qualitative visualization of predictions for image 58 (ToothFairy dataset) at AL rounds 1, 5, and 10, compared with the annotation mask (axial slice S: 43.8 mm, 3D Slicer).}
    \label{al_qualitative}    
\end{figure}



%\subsection{SAM-Med3D \cite{wang_sam_med3d_2024} performance on ToothFairy2}
\subsection{Evaluation of SAM-Med3D masks with prompts derived from nnU-Net predictions}

% \rev{
% To evaluate the performance of SAM-Med3D~\cite{wang_sam_med3d_2024} to facilitate annotation of 3D dental images, we simulate a new AL step. The objective was to test whether to deploy the solution in 3 step of the AL, what annotation work we would have to do thanks to the combinaition of nnUnet and medsam. The principle of using nnU-net predictions as input for SAM-like models has been evaluated. We evaluated the quality of masks generated by SAM-Med3D~\cite{wang_sam_med3d_2024} from prompts derived from nnU-Net predictions. For this purpose, we simulate a new AL step where 5 images are selected.  To do so, the following step were performed based on the last AL step (random selection):
% \begin{enumerate}
%     \item Randomly select 5 images
%     \item Generate predictions from lastly trained nnU-Net model
%     \item Generate prompts (i.e., simulated clicks) on each predicted classes
%     \item Provide prompts and images as input of 
% \end{enumerate}
% }

\rev{
To evaluate the potential of SAM-Med3D~\cite{wang_sam_med3d_2024} in facilitating the annotation of 3D dental images, we simulated an additional Active Learning (AL) iteration. The objective was to assess, if SAM-Med3D were deployed at step 3 of the AL process, how much annotation effort could be reduced through the combination of nnU-Net and SAM-Med3D. Specifically, the quality of the masks generated by SAM-Med3D from prompts derived from nnU-Net predictions was evaluated. The procedure was as follows, based on the last AL iteration (with random sampling method):
\begin{enumerate}
    \item Randomly select 5 images,
    \item Generate 3D predictions using the most recently trained nnU-Net model,
    \item Generate point-based prompts (i.e., simulated clicks) for each predicted class,
    \item Use SAM-Med3D with the prompts and input images to produce 3D annotation masks,
    \item Evaluate the quality of the generated 3D annotation masks.
\end{enumerate}

The influence of the number of prompts per class (i.e., 1, 5, and 10 clicks per class) on the quality of the masks was also evaluated.
The quality of the generated masks was quantitatively assessed using the Normalized Symmetric Difference (NSD), with Table~\ref{tab:sam_click} reporting the percentage of voxels requiring expert annotation or correction based on the combination of nnU-Net and SAM-Med3D.
}

% \begin{itemize}
%     \item On (internal) test dataset
%     \item Within the AL process, comparing predictions derived directly from nnU-Net and those informed by ground truth: SAM-Med3D relies on the segmentation maps to simulate clicks
%     \item Assessing the impact of varying numbers of simulated clicks (1, 5, and 10)
% \end{itemize}


% \begin{table}[htb]
% \caption{\rev{Evaluation (Normalized Symmetric Difference) of SAM-Med3D performance under AL with varying prompt inputs}}
% \label{tab:sam_click2}
% \centering
% \begin{tabular}{lcccccc}
% \hline
% \textbf{}          & Average & Jawbones & IAC   & Sinus & Pharynx & Teeth \\ \hline
% \textbf{N prompts} & SD      & SD       & SD    & SD    & SD      & SD    \\ \hline
% 1 click            & 62.61   & 88.92    & 37.94 & 98.60 & 98.97   & 0     \\
% 5 clicks           & 50.75   & 76.59    & 37.58 & 98.12 & 97.38   & 0     \\
% 10 clicks          & 51.52   & 69.34    & 35.38 & 97.56 & 97.30   & 0     \\ \hline
% \end{tabular}
% \end{table}

\begin{table}[htb]
\caption{\rev{Evaluation of SAM-Med3D performance (Normalized Symmetric Difference, in \%) with varying numbers of prompts per class.}}
\label{tab:sam_click}
\centering
\begin{tabular}{lcccccc}
\hline
\textbf{Number of Prompts} & \textbf{Average} & \textbf{Jawbones} & \textbf{IAC} & \textbf{Sinus} & \textbf{Pharynx} & \textbf{Teeth} \\ \hline
1 click   & 62.61 & 88.92 & 37.94 & 98.60 & 98.97 & 0 \\
5 clicks  & 50.75 & 76.59 & 37.58 & 98.12 & 97.38 & 0 \\
10 clicks & 51.52 & 69.34 & 35.38 & 97.56 & 97.30 & 0 \\ \hline
\end{tabular}
\end{table}


\section{Discussion}

% \todo[inline]{instance segmentation for teeth}

Concerning AL, estimating informativeness at each AL round is computationally expensive. In this paper, we focus exclusively on the least confidence method to compare to random selection. While other strategies, such as entropy or Monte Carlo (MC) dropout, may improve the performance, they come with significantly higher computational costs. For example, MC dropout requires multiple forward passes per image, substantially increasing the overall runtime. The choice of cold-start images may also influence the outcomes, as noted in~\cite{liu_colossal_2023}.
Moreover, consistent with findings in other medical domains (e.g.,~\cite{martin_combining_2024, ekner_active_2025}), random selection has shown performance comparable to more complex selection strategies such as least confidence.

% Only random selection has been used instead of complex algorithms to estimate the informativeness of the images. Indeed, prior papers showed on medical images (e.g., \cite{martin_combining_2024, ekner_active_2025}) that random selection is a very hard to beat baseline.

The preliminary results on annotation using SAM-Med3D show that using 5 simulated point-based prompts (i.e., simulated clicks) derived from nnU-Net predictions reduces the number of voxels to annotate or verify by up to 50\%. Other SAM models exploiting other types of prompts (e.g., \cite{MedSAMMa}) could be explored to improve this pre-annotation.

Moreover, contrary to 2D image segmentation, where training U-Net-like models can be very fast and require fewer iterations, 3D image training demands significantly more computational time. 
Adding SAM also introduces substantial additional processing time between AL rounds. Asynchronous iterations need to be considered to limit the experts' waiting time during annotation. Moreover, even if random selection is very hard to beat for choosing the images to annotate, other sampling methods could be considered in the future. The trade-off between the gain in selection quality and the computing power and time required to select the images seems to be an essential criterion for developing new methods.

It is important to note that the selected test dataset may not be entirely representative of the underlying distribution of the full dataset. Furthermore, the chosen class grouping strategy appears to have significantly impacted segmentation performance. On one hand, this grouping led to increased performance for classes with a large pixel representation in the images (e.g., Teeth), as the aggregation of pixels likely facilitated model training. On the other hand, it severely degraded performance for the ``Sinus'' class, which became largely undetected. This degradation could be attributed to the increased class imbalance introduced by the grouping, which disproportionately affects minority or less complex classes such as ``Sinus''.

This paper presents preliminary work on the combination of traditional segmentation models (nnU-Net \cite{isensee_nnu_net_2021}) and prompt-based segmentation models (SAM-Med3D~\cite{wang_sam_med3d_2024}) to facilitate data annotation and model training in the 3D dental domain.
In future studies, other dental datasets (e.g., 3DTeethSeg \cite{ben_hamadou_3dteethseg22_2023}) will be considered. Moreover, nnU-Net is a complex model due to its automated configuration capabilities, which accelerate model setup. Other models, such as TransUNet (e.g., \cite{chen_transunet_2024}), could also be considered in future work, especially to evaluate other AL sampling methods.

\begin{credits}
\subsubsection{\ackname} This work has been supported by MIAI@Grenoble Alpes (ANR-19-P3IA-0003). This work benefited from state aid managed by the National Research Agency under France 2030 bearing the reference ANR-23-IACL-0006.

\subsubsection{\discintname}
Philippe Mulhem and Jean-Pierre Chevallet have no competing interests to declare that are relevant to the content of this article. 
Nicolas Martin owns stock in PEEKTORIA.
% It is now necessary to declare any competing interests or to specifically
% state that the authors have no competing interests. Please place the
% statement with a bold run-in heading in small font size beneath the
% (optional) acknowledgments\footnote{If EquinOCS, our proceedings submission
% system, is used, then the disclaimer can be provided directly in the system.},
% for example: The authors have no competing interests to declare that are
% relevant to the content of this article. Or: Author A has received research
% grants from Company W. Author B has received a speaker honorarium from
% Company X and owns stock in Company Y. Author C is a member of committee Z.
\end{credits}

\newpage
%
% ---- Bibliography ----
%
% BibTeX users should specify bibliography style 'splncs04'.
% References will then be sorted and formatted in the correct style.
%
\bibliographystyle{splncs04}
\bibliography{biblio}
% \printbibliography

% other method of biblio
%
% \begin{thebibliography}{8}
% \bibitem{ref_article1}
% Author, F.: Article title. Journal \textbf{2}(5), 99--110 (2016)

% \bibitem{ref_lncs1}
% Author, F., Author, S.: Title of a proceedings paper. In: Editor,
% F., Editor, S. (eds.) CONFERENCE 2016, LNCS, vol. 9999, pp. 1--13.
% Springer, Heidelberg (2016). \doi{10.10007/1234567890}

% \bibitem{ref_book1}
% Author, F., Author, S., Author, T.: Book title. 2nd edn. Publisher,
% Location (1999)

% \bibitem{ref_proc1}
% Author, A.-B.: Contribution title. In: 9th International Proceedings
% on Proceedings, pp. 1--2. Publisher, Location (2010)

% \bibitem{ref_url1}
% LNCS Homepage, \url{http://www.springer.com/lncs}, last accessed 2023/10/25
% \end{thebibliography}

\end{document}
