\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{arydshln}
\jmlrvolume{-- 47}
\jmlryear{2024}
\jmlrworkshop{Full Paper -- MIDL 2024}
\editors{Accepted for publication at MIDL 2024}

\title[Training-free Prompt Placement for SAM]{Training-free Prompt Placement by Propagation for SAM Predictions in 3D Bone CT Scans}

\midlauthor{\Name{Caroline Magg\nametag{$^{1,2}$}} \Email C.MAGG@AMSTERDAMUMC.NL \\ 
\Name{Lukas P.E. Verweij\nametag{$^{2}$}} \\ 
\Name{Maaike A. ter Wee\nametag{$^{2}$}} \\
\Name{George S. Buijs\nametag{$^{2}$}} \\
\Name{Johannes G.G. Dobbe\nametag{$^{2}$}} \\
\Name{Geert J. Streekstra\nametag{$^{2}$}} \\
\Name{Leendert Blankevoort\nametag{$^{2}$}}  \\
\Name{Clara I. S\'anchez\nametag{$^{1,2}$}} \\
\addr $^{1}$ University of Amsterdam, Amsterdam, The Netherlands \\
\addr $^{2}$ Amsterdam UMC location University of Amsterdam, Amsterdam, The Netherlands
\vspace{-2pt}}

\begin{document}

\maketitle

\begin{abstract}
The Segment Anything Model (SAM) is an interactive foundation segmentation model, showing impressive results for 2D natural images using prompts such as points and boxes. Transferring these results to medical image segmentation is challenging due to the 3D nature of medical images and the high demand of manual interaction. As a 2D architecture, SAM is applied slice-by-slice to a 3D medical scan. This hinders the application of SAM for volumetric medical scans since at least one prompt per class for each single slice is needed. In our work, the applicability is improved by reducing the number of necessary user-generated prompts. We introduce and evaluate multiple training-free strategies to automatically place box prompts in bone CT volumes, given only one initial box prompt per class. 
The average performance of our methods ranges from $54.22\%$ Dice to $88.26\%$ Dice. At the same time, the number of annotated pixels is reduced significantly from a few millions to two pixels per class. These promising results underline the potential of foundation models in medical image segmentation, paving the way for annotation-efficient, general approaches. 
The code is available at \href{https://github.com/CarolineMagg/SAM-Prompt-Placement-by-Propagation}{this github URL}.
\end{abstract}

\begin{keywords}
SAM, foundation segmentation model, bone segmentation, CT scans
\end{keywords}

\section{Introduction}
Medical image segmentation (MIS) plays a crucial role in various clinical applications, among others the segmentation of bones as part of the musculoskeletal system for medical interventions and for examining changes in anatomical structures \cite{bonaldi2023msk}. In order to reduce user interaction and time-consuming manual delineations, deep learning (DL) methods have been developed to automatically generate a segmentation mask of diverse bone structures in CT imaging \cite{lindgrenbelal2019, deng2022, wasserthal2023, magg2024}.
However, the best performing methods so far developed are trained in a fully supervised manner requiring pixel-based annotation. These models cannot be transferred to other unseen structures without new annotations and training. Foundation Models (FMs) offer better generalisability as they are general-purpose models that are trained on large data sets and can be used for a wide range of downstream tasks. New advances for segmentation FMs have been made with the prompt-based Segment Anything Model (SAM) by \citet{kirillov2023sam}. This model relies on the user to mark a point in the object or bound the region of interest, which is referred to as placing a prompt.
However, current segmentation FMs are not suitable for MIS because of two main challenges. 
First, SAM has been trained on natural images. Several studies \cite{he2023eval, mazurowski2023, ma2023medsam, cheng2023eval} evaluated SAM for MIS and reported unstable performance across diverse datasets which can be explained by the difference between natural and medical images. To overcome this challenge, adaptation techniques \cite{wu2023medsa, li2023autoprompting} and fine-tuning on medical data \cite{ma2023medsam, cheng2023sammed2d, xie2024fewshot} are introduced. 
Second, the 2D SAM architecture does not account for the 3D nature of volumetric medical scans. As at least one prompt per slice is required, the interactive approach depends on a high manual interaction if applied to volumetric MIS. This reduces the applicability in high-throughput or less supervised medical tasks. 
To alleviate the second challenge, we propose multiple training-free strategies that automatically place box prompts for bone CT scans. In contrast to our training-free method, \citet{lei2023medlsam} and \citet{wang2023sammed3d} addressed this challenge using approaches that required an extra training phase, either to predict bounding box locations or to adapt the SAM architecture to 3D, respectively.
Most similar to our work is EviPrompt by \citet{xu2023eviprompt} which is an evidence-based method for generating point prompts for SAM based on uncertainty estimates. A medical image-annotation pair is used as reference to create a shared embedding in the feature space and extract pixel similarity compared to the sample of interest.\newline
In this work, we propose four different training-free strategies of prompt placement by propagation to adapt SAM for bone segmentation in 3D CT scans. In contrast to EviPrompt, only one initial box prompt is required and the remaining box prompts are inferred from predictions in adjacent slices. We evaluated our methods on three datasets containing different bone structures, and showed the annotation-efficiency of our methods by analyzing performance with respect to the required numbers of annotated pixels.

\section{Data}
Three different datasets are used for our experiments, i.e., two internal datasets, namely Ds and Dk, and one publicly available dataset, namely Dt (see Appendix \ref{sec:dataset_appendix_figure}, \figureref{fig:datasets}).
The first internal dataset (Ds) contains $15$ bilateral CT scans of the shoulder joint with annotations of the left and right humerus and scapula. 
The shoulder CT scans were acquired with a Brilliance 64-channel CT Scanner (Philips Healthcare, Best, The Netherlands) with $250$ mAs, $120$ kV, pixel resolution of $0.83-0.98$mm and slice thickness $0.3-1.0$ mm. The second internal dataset (Dk) consists of $25$ unilateral knee CT scans with knee implants, including $20$ cadaveric and $5$ patient scans. The tibial cortex bone and the tibial component of a knee prosthesis are labeled in the knee CT scans which were acquired with a Brilliance 64-channel CT Scanner (Philips Healthcare, Best, The Netherlands) or a Siemens SOMATOM Force with $160$ mAs, $120$ kV, isotropic voxel spacing of $0.45$mm. The internal datasets were annotated using an in-house annotation software \cite{dobbe2014} and ITK-Snap \cite{itk_snap}.
The external dataset, Dt, is extracted from a publicly available dataset (\url{https://doi.org/10.5281/zenodo.6802613}, \citet{wasserthal2023}). Two different subsets, Dt1 and Dt2, are extracted from the provided test set: Dt1 consists of $33$ CT scans containing the left and right humerus and scapula, analogous to Ds; and Dt2 consists of $53$ CT scans containing the left and right hip and femur.
As the smallest dataset, Ds was used for the method development and preliminary hyperparameter testing. Dk and Dt were only used for additional evaluation purposes.

\section{Methods}
Based on the prompt-based mode of SAM, our method is an automated prompt placement approach which is initialized by defining a box prompt. 
Assuming that the boundary of the object of interest in a 3D CT scan, in our case bone structures, can be modeled by a smooth surface over slices, local changes of prompts between neighboring slices can be considered minimal. Based on this, we present four different strategies that iteratively propagate predictions from adjacent slices to infer a new prompt.

\subsection{Revisiting SAM}
SAM by \citet{kirillov2023sam} is a prompt-based FM developed for interactive 2D segmentation. The architecture consists of three components: an image encoder, a prompt encoder and a mask decoder. First, the input image $I\in \mathbb{R}^{H \times W}$ is embedded by a  Vision Transformer (ViT) pre-trained as Masked Auto-encoder (MAE). Then, the prompt encoder creates a prompt encoding from geometric prompts (namely bounding box or point). Multiple studies illustrate that box prompts achieve better results compared with point prompts \cite{ma2023medsam, mazurowski2023, he2023eval}. Thus, we focus on box prompts $P \in \mathbb{N}^{2\times 2}$ represented by an embedding pair, i.e., the positional encoding of its top-left and bottom-right corner. Finally, a lightweight mask decoder maps the image and prompt embedding to an output mask $M = f_{SAM}(I,P)$ with $M\in \mathbb{R}^{H \times W}$.

\subsection{Prompt Inference by Propagation}
For volumetric data, SAM is currently applied in a slice-by-slice manner per class \cite{mazurowski2023}.
Therefore, given a 3D CT scan $I\in\mathbb{R}^{H\times W\times D}$ with the i-th slice $I_i\in\mathbb{R}^{H\times W}$ and the corresponding mask prediction $M\in\mathbb{R}^{H\times W\times D}$ with $M_i\in\mathbb{R}^{H\times W}$, one prompt $P_i$ for each slice $i$ is required, so that $M_i=f_{SAM}(I_i, P_i)$. 
In our approaches, $P_i$ is defined as a bounding box enclosing the object of interest and described by $P_i=f_{BOX}(M_{i})=\{(x^t_i, y^t_i), (x^b_i, y^b_i)\}$, with the coordinates of the top-left $(x^t_i, y^t_i)$ and bottom-right $(x^b_i, y^b_i)$ pixel.
Given an initial slice $S_j\in I$ and a manually provided prompt $P_j$, we propose four approaches to estimate $P_i$, and thus obtain $M_i$. \figureref{fig:methods} visualizes the idea behind three of the strategies. Without any loss of generality, $j<i$ applies in the following. \newline

\textbf{Baseline ($f_b$):}
In this approach, given the initial slice, the box prompt is propagated uni-directionally through the volume, inferred from the previous prediction as follows:
\begin{equation}\label{eq:baseline}
    M_i = f_b(I_i, M_{i-1}) = f_{SAM}(I_i,P_i) \text{, with } P_i= f_{BOX}(M_{i-1}).
\end{equation}

\textbf{Stochastic Approach ($f_s$):}
For this approach, the box prompt is also propagated in one direction, i.e., uni-directional, but relies on multiple modifications of the previous prompt prediction. Specifically, $K$ prompts are generated by randomly shifting the coordinates of the box enclosing the previous prediction as follows:
\begin{align*}
P_i^{\left(k\right)} = \{(x^t_i, y^t_i)+(\delta^{\left(k\right)}_x, \delta^{\left(k\right)}_y), (x^b_i, y^b_i)+(\delta^{\left(k\right)}_x, \delta^{\left(k\right)}_y)\}, \forall k\in\{1,\ldots,K\} \text{, with }
\delta^{\left(k\right)}_x, \delta^{\left(k\right)}_y \in \mathbb{N}.
\end{align*}
Based on this, $M_i$ is estimated as follows:
\begin{align} \label{eq:stochastic}
    M_i = f_s(I_i, M_{i-1}, K) = \bigcap_{k=1}^K f_{SAM}(I_i, P_i^{\left(k\right)}) 
    %\text{, with } P_i= f_{BOX}(M_{i-1}).
\end{align}

\textbf{Nested Approach ($f_n$):}
The nested approach performs bi-directional box prompt propagation. The information of the previous prediction $M_{i-1}$ and the following slice $I_{i+1}$ are used to perform propagation steps forwards and backwards as follows:
\begin{align}\label{eq:nested}
\begin{split}
    M_i = f_n(I_i, I_{i+1}, M_{i-1}) = f_{SAM}(I_i, P_i) \text{, with } \\
    P_i = f_{BOX}(f_{SAM}(I_{i+1}, f_{BOX}(f_{SAM}(I_i, f_{BOX}(M_{i-1}))))).
\end{split}
\end{align}

\textbf{Combined approach ($f_c$):} The nested and stochastic approaches can be combined as
\begin{align}
    M_i = f_c(I_i, I_{i+1}, M_{i-1}, K) = f_s(I_i, f_s(I_{i+1}, f_s(I_i, M_{i-1}, K), K), K)
\end{align}
by using $f_s$ from Equation (\ref{eq:stochastic}).\newline

The initial slice $S_j$ is chosen randomly around the center of the object, because there will be a higher certainty of the correctness of the manually placed prompt $P_j$ due to the larger object area contained in this region.
% containing a large area of the object of interest due to the higher certainty of the correctness of the manually placed prompt $P_j$.
The obtained masks are postprocessed in two steps. First, the largest component associated with the prediction generated by the initial prompt is retained and all other disconnected components are removed (pp1). Second, predictions are removed if the corresponding box is significantly smaller than the initial box (pp2).

\begin{figure}[t!]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:methods}
  {\caption{Overview of the three strategies for prompt placement in slice $I_i$. The boxes are color-coded based on the strategy: baseline (red), nested (orange) and stochastic (green). For the sake of simplicity, the combined approach is omitted.}\vspace{-10pt}}
  {\includegraphics[width=0.9\linewidth]{methods_overview02.PNG}\vspace{-30pt}}
\end{figure}

\section{Experimental Setup}

For our experiments, we used the pre-trained SAM ViT-B, as described in \cite{kirillov2023sam}. The 2D architecture processes the 3D CT scans in axial slices, since the axial plane has usually a higher resolution in CT scans. 
For each slice, a binary mask per class is generated. In order to combine all binary masks to one multi-class mask, a class processing sequence was defined: For Ds and Dt1, humerus before scapula; for Dk, tibia bone before tibial implant; for Dt2, hip before femur. Following \citet{mazurowski2023}, a maximum of three bounding boxes, each larger than $5\%$ of the 2D mask area, was extracted from a prediction.
The values $\delta^{\left(k\right)}_x$ and $\delta^{\left(k\right)}_y$ are sampled from a uniform distribution between 0 and 5. The size threshold in pp2 is empirically set to $10\%$ of the size of $P_j$.
Other parameters, such as the initial slice position and $K$, were studied in hyperparameter experiments on Ds without postprocessing and after pp1 to determine optimal values.\newline
As comparison to FMs, two 3D full resolution nnUNets \cite{isensee2021} for Ds and Dk were trained. The training details are reported in Appendix \ref{sec:nnunet_details}. For Dt, the trained nnUNet from \citet{wasserthal2023} was deployed. Another 3D full resolution nnUNet was trained on predictions from our best method as reference labels, using the first fold of Ds and the settings mentioned in Appendix \ref{sec:nnunet_details}. This demonstrates the capability of our methods to create pixel-annotations for a fully supervised training, starting with one bounding box per object. As an upper performance estimate for our methods, SAM was evaluated based on the evaluation scheme of \citet{mazurowski2023}, where a maximum of three box prompts were extracted from the ground truth labels. This mimics the placement of the most ideal SAM box prompts when manually entering one prompt per slice.\newline
All obtained masks were compared with the ground truth labels by means of two common segmentation metrics, i.e., the Dice Coefficient (Dice) and the Hausdorff Distance 95\% (HD95). In addition, we analysed the number of annotated pixels in relation to the Dice to investigate the annotation-efficiency.

\section{Results}

The evaluation results of our prompt propagation approaches without postprocessing (no pp) and after applying both postprocessing steps (pp), the traditional SAM, and the nnUNet predictions are shown in \tableref{tab:results_propagation}. The methods are not statistically significantly different (see Appendix \ref{sec:ablation_significance}). Results for applying only one of the two postprocessing steps are shown in Appendix \ref{sec:ablation_postprocessing}, \tableref{tab:results_propagation_postprocessing}. For Ds and Dt1, qualitative examples are shown in \figureref{fig:example01}. 
\figureref{fig:performance_annotations} shows the annotation-efficiency of our approaches compared to fully supervised methods. An ablation study about the number of prompts used for initialization considering performance and annotation time is given in Appendix \ref{sec:ablation_slices}. In Appendix \ref{sec:ablation_performance}, performance across different slice ranges surrounding the initial slice is evaluated to understand performance evolution across volumes. The results after training a nnUNet with the masks obtained by the combined approach with pp1 are $90.19\%$ Dice and $2.77$mm HD95, while for training a nnUNet with the ground truth labels the results are $98.35\%$ Dice and $0.92$mm HD95. 

According to \figureref{fig:hyperparameter} in Appendix \ref{sec:ablation_study}, the best results in the hyperparameter experiments are achieved with the following settings: The initial slice is sampled from $-25\%$ to $+25\%$ around the object's center. For the stochastic-based approach, $K$ is set to $10$ and if at least $5$ predictions include a particular pixel, it is included in the final mask.

\begin{table}[ht]
 % The first argument is the label.
 % The caption goes in the second argument, and the table contents
 % go in the third argument.
\floatconts
  {tab:results_propagation}
  {\caption{Results of our methods, the reference SAM, and nnUNet. For our  methods, the results without postprocessing (no pp) and after postprocessing (pp) are reported. For Dt, the nnUNet results marked with $^*$ are taken from \citet{wasserthal2023}.}\vspace{-15pt}}
  {
  \begin{tabular}{|l|c:c|c:c|c:c|c:c|}
  \hline
  \bfseries Method & \multicolumn{2}{|c|}{\bfseries Dice (\%) $\uparrow$} & \multicolumn{2}{|c|}{\bfseries HD95 (mm) $\downarrow$} & \multicolumn{2}{|c|}{\bfseries Dice (\%) $\uparrow$} & \multicolumn{2}{|c|}{\bfseries HD95 (mm) $\downarrow$}\\
  \hline
  & \multicolumn{4}{|c|}{\textit{Ds}} & \multicolumn{4}{|c|}{Dt1} \\
  \cdashline{2-9}
  nnUNet & \multicolumn{2}{|c|}{98.5} & \multicolumn{2}{|c|}{0.9} & \multicolumn{2}{|c|}{91.5$^*$} & \multicolumn{2}{|c|}{not reported$^*$} \\
  SAM & \multicolumn{2}{|c|}{90.0} & \multicolumn{2}{|c|}{2.5} & \multicolumn{2}{|c|}{84.2} & \multicolumn{2}{|c|}{5.9} \\
  \cdashline{2-9}
  & \textit{no pp} & \textit{pp} & \textit{no pp} & \textit{pp} & \textit{no pp} & \textit{pp} & \textit{no pp} & \textit{pp} \\
  \cdashline{2-9}
  baseline & 84.4 & 87.6 & 22.3 & 4.2 & 77.2 & 78.5 & 31.8  & 15.2\\
  stochastic & 85.5 & 87.6 & 17.0 & 4.0 & 70.3 & 69.6 & 23.9 & 25.5 \\
  nested & 85.1 & 88.1 & 21.6 & 4.1 & 78.0 & 76.8 & 24.5 & 16.8 \\
  combined & 85.8 & 87.9 & 18.6 & 4.5 & 70.3 & 69.1 & 25.3 & 26.3 \\
  \hline
  & \multicolumn{4}{|c|}{Dk} & \multicolumn{4}{|c|}{Dt2} \\
  \cdashline{2-9}
  nnUNet & \multicolumn{2}{|c|}{95.6} & \multicolumn{2}{|c|}{0.6} & \multicolumn{2}{|c|}{95.1$^*$} & \multicolumn{2}{|c|}{not reported$^*$} \\
  SAM & \multicolumn{2}{|c|}{76.3} & \multicolumn{2}{|c|}{3.5} & \multicolumn{2}{|c|}{92.0} & \multicolumn{2}{|c|}{5.4} \\
  \cdashline{2-9}
  & \textit{no pp} & \textit{pp} & \textit{no pp} & \textit{pp} & \textit{no pp} & \textit{pp} & \textit{no pp} & \textit{pp} \\
  \cdashline{2-9}
  baseline & 54.2 & 54.8 & 27.1 & 24.3 & 78.2 & 77.7 & 32.6 & 29.9\\
  stochastic & 58.1 & 58.1 & 20.1 & 19.7 & 80.8 & 80.7 & 18.8 & 19.5 \\
  nested & 55.7 & 56.8 & 31.1 & 23.4 & 79.5 & 78.6 & 30.8 & 28.9 \\
  combined & 58.1 & 58.9 & 21.2 & 19.4 & 80.6 & 80.0 & 18.9 & 18.8   \\
  \hline
  \end{tabular}\vspace{-10pt}
  }
\end{table}

\begin{figure}[h]
    \floatconts
    {fig:example01}% label for whole figure
    {\vspace{-15pt}\caption{Examples of 3D models for Ds (a) and Dt1 (b).}} % caption for whole figure
    {%
    \subfigure[Ds - humerus (red), scapula (green)]{%
    \label{fig:ds_example}% label for this sub-figure
    \includegraphics[width=0.73\linewidth]{ds_example.PNG}
    } 
    \subfigure[Dt1 - humerus (dark blue, green), scapula (bright blue, yellow)]{%
    \label{fig:dt1_example}% label for this sub-figure
    \includegraphics[width=0.73\linewidth]{dt1_example.PNG}
    }
    }
\end{figure}

\begin{figure}[ht!]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:performance_annotations}
  {\caption{Analysing performance (Dice) wrt number of annotated pixels.}\vspace{-15pt}}
  {\includegraphics[width=1.0\linewidth]{performance_vs_annotations02.PNG}\vspace{-15pt}}
\end{figure}

\section{Discussion}
In this work, we have presented multiple training-free strategies to apply SAM with box prompts to 3D bone CT scans starting with only two pixels annotated. We have introduced four methods, following different strategies of using the adjacent information. Our experiments do not reveal a definitive trend favoring one of our methods. However, our methods significantly reduce the number of annotated pixels compared to using SAM in the traditional slice-by-slice manner or a fully supervised segmentation method, while maintaining a certain level of segmentation performance and showing promising results.

Metal artefacts, similar structures in close proximity to each other or interlocking structures, as well as objects that split into multiple components have a negative effect on our training-free strategies. In some cases, they can lead to segmentation overflow to surrounding structures or a lack of component tracking during a split (see \figureref{fig:example02}). Aside from that, a lower contrast between bone and background decreases the performance of our methods, as seen in Dt1, aligning with the limitations of SAM \cite{mazurowski2023}.
However, our proposed strategies show very good results when the segmentation task is easier and there is a good bone-background contrast. For example, for Ds, the Dice Coefficients are only $2-3\%$ off the traditional SAM (see \tableref{tab:results_propagation}). As a very thin structure, the scapula is difficult to manually annotate in a smooth 3D manner, resulting in some holes in the ground truth due to annotation errors (see \figureref{fig:datasets} Ds). Even when the reference standard is not completely correct, our methods produce smooth surfaces (see \figureref{fig:ds_example}). 

Analysing the individual methods reveals that the baseline and nested approaches can show severe over-propagation if an error is propagated (see \figureref{fig:example02}). In contrast, stochastic-based methods are more conservative, requiring agreement between multiple predictions. However, they suffer from vanishing boxes and thus, missing segmentation, when agreement is lacking (see \figureref{fig:dt1_example}, \figureref{fig:dt2_example}). Combining the stochastic and nested strategies does not balance weaknesses and may even amplify them (see \figureref{fig:example02}). 
Our current postprocessing steps target two reoccurring segmentation mistakes (see \figureref{fig:shoulder02}). SAM can predict multiple disconnected components for one box, detecting structures similar to the object of interest. Removing all components that are not connected to the largest component predicted from the initial box (pp1) improves performance. Moreover, SAM lacks spatial context, leading to the box not always vanishing at the top and bottom of the object. Restricting the box size (pp2) corrects for small additional structures.
Using only pp1 yields greater performance gains, as the corrections are generally larger than pp2 corrections made at the object's boundary (see \tableref{tab:results_propagation_postprocessing}, \figureref{fig:shoulder02}). However, since each postprocessing step addresses another error pattern, the best results are achieved applying both (see \tableref{tab:results_propagation}).


As shown in \figureref{fig:performance_annotations}, there is a trade-off between performance and annotated pixels. While nnUNet consistently outperforms other solutions, it requires extensive ground truth labels for each new task and shift in data characteristics. In contrast, traditional SAM reduces annotated pixels by a magnitude of 2, and it seems robust to data and task changes. However, SAM lacks scalability for high-throughput 3D segmentation tasks due to the demand of manual interactions per slice. Our approaches are independent of the number of slices.
\figureref{fig:number_of_prompts} shows that more user-provided prompts enhance performance in our approaches until reaching the fully prompted SAM level, which comes with the cost of additional annotation time.
The annotation-efficient predictions can be used for pseudo-label training or as a starting point for model-assisted labeling. Training nnUNet with SAM predictions (see \figureref{fig:nnunet_example}) achieves $90\%$ Dice, demonstrating the feasibility of creating a training dataset for supervised learning. Aside from annotated datasets, our study provides insights for selecting training samples for efficient fine-tuning. 
For example, analyzing performance evolution throughout the volume reveals that for some slices the performance is closer to nnUNet and better than fully-prompted SAM (see Appendix \ref{sec:ablation_performance}). Thus, our methods have value for some slices, but more challenging ones may require supervised approaches.


A limitation of our method is relying solely on prediction information from adjacent slices, excluding image information. To enhance segmentation performance without increasing annotation effort, incorporating a priori information like shape could be beneficial. Further improvements in prediction quality could be achieved by exploiting the full range of geometric prompts, combining box with positive and negative point prompts to reduce ambiguity.
The performance of our methods depends on the underlying SAM version, which should be investigated further in combination with models fine-tuned on medical data, like Med-SAM \cite{wu2023medsa} or SAM-Med2D \cite{cheng2023sammed2d}.
It is important to note that our goal is not to achieve state-of-the-art results but to offer a training-free strategy for using prompt-based 2D segmentation FMs, like SAM, in 3D medical data with one prompt per class per volume rather than one prompt per slice.
 
\section{Conclusion}

We have presented multiple training-free strategies to apply the box-prompt mode of SAM to 3D bone CT scans with only one initial box prompt. Our method significantly decreases the number of annotated pixels while maintaining a certain level of segmentation performance. This work is another step towards the applicability of FMs for 3D bone segmentation.


% Acknowledgments---Will not appear in anonymized version
%\midlacknowledgments{We thank a bunch of people.}


\bibliography{midl24_47}


\appendix

\newpage


\section{Data Overview} \label{sec:dataset_appendix_figure}

\figureref{fig:datasets} gives an overview of the three datasets with an example of a CT slice and 3D model of the reference masks.

\begin{figure}[ht]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:datasets}
  {\caption{Dataset overview with exemplary CT slices and 3D models of the reference mask.}}
  {\includegraphics[width=0.98\linewidth]{datasets01.PNG}}
\end{figure}

\section{nnUNet Training Details} \label{sec:nnunet_details}

We train two 3D full resolution nnUNets \cite{isensee2021}, one on each of the datasets Ds and Dk. The default training settings have been retained, except for mirroring in data augmentation. For Ds, the mirroring on the vertical axes would confuse the left and right labels in the bilateral scans and is therefore removed. For Dk, the mirroring on the horizontal axes is removed since a horizontally flipped femoral implant component shows some similarity with a tibial implant component. The models are trained and evaluated on a 5-fold and 4-fold patient-based cross-validation split for Ds and Dk, respectively. The training is performed on an NVIDIA Geforce RTX 2080 Ti 12GB and an Intel Core Xeon Gold 6128 3.40GHz CPU.

\newpage

\section{Ablation studies} 

Ablation studies for statistical significance, postprocessing, the number of prompts used for initialization, and hyperparameter settings have been performed.

\subsection{Statistical significance} \label{sec:ablation_significance}

For each dataset, the statistical significance is tested with bootstrapped paired t-tests using 1000 iterations. Due to the high number of tests per dataset, Bonferroni correction is used, i.e., corrected $\alpha = 0.05/6 = 0.0083$. The results for each method combination per dataset are shown in \tableref{tab:results_significance}. There is no significant difference between the methods.

\begin{table}[ht]
 % The first argument is the label.
 % The caption goes in the second argument, and the table contents
 % go in the third argument.
\floatconts
  {tab:results_significance}
  {\caption{P-values for statistical significance testing of difference in methods per dataset. No pairing shows statistical significance, compared to the corrected $\alpha=0.0083$.}\vspace{-10pt}}
  {
  \begin{tabular}{l|l|l|l|l}
  Method combination & \bfseries Ds & \bfseries Dk & \bfseries Dt1 & \bfseries Dt2 \\
  \hline
  baseline, stochastic & 0.918 & 0.068 & 0.052 & 0.372 \\
  baseline, nested & 0.027 & 0.029 & 0.102 & 0.359 \\
  baseline, combined & 0.276 & 0.039 & 0.009 & 0.433 \\
  stochastic, nested & 0.046 & 0.295 & 0.111 & 0.382 \\
  stochastic, combined & 0.44 & 0.292 & 0.760 & 0.625 \\
  nested, combined & 0.445 & 0.093 & 0.058 & 0.544 \\
  \end{tabular}
  }
\end{table}


\subsection{Postprocessing} \label{sec:ablation_postprocessing}


\tableref{tab:results_propagation_postprocessing} shows the results for our methods without postprocessing (no pp) and after applying only one postprocessing step, i.e., either pp1 or pp2.

\begin{table}[ht]
 % The first argument is the label.
 % The caption goes in the second argument, and the table contents
 % go in the third argument.
\floatconts
  {tab:results_propagation_postprocessing}
  {\caption{Results of our methods without postprocessing (no pp), and after one postprocessing step, either only pp1 or only pp2.}\vspace{-10pt}}
  {
  \begin{tabular}{|l|cc|cc|}
  \hline
  \bfseries Method & \bfseries Dice (\%) $\uparrow$ & \bfseries HD95 (mm) $\downarrow$ & \bfseries Dice (\%) $\uparrow$ & \bfseries HD95 (mm) $\downarrow$\\
  \hline
  & \multicolumn{2}{|c|}{\textit{Ds}} & \multicolumn{2}{|c|}{Dt1} \\
  \cdashline{2-5}
  & \textit{no pp, pp1, pp2} & \textit{no pp,  pp1, pp2} & \textit{no pp, pp1,pp2} & \textit{no pp, pp1,pp2} \\
  \cdashline{2-5}
  baseline & 84.4, 86.4, 85.6 & 22.3, 6.6, 13.2 & 77.2, 77.5, 0.8 & 31.8, 30.8, 15.5\\
  stochastic & 85.5, 87.7, 85.6 & 17.0, 4.0, 15.6 & 70.3, 70.2, 70.2 & 23.9, 24.5, 25.0 \\
  nested & 85.1, 88.0, 85.7 & 21.6, 4.3, 14.1 & 78.0, 77.8, 77.1 & 24.5, 21.1, 18.3 \\
  combined & 85.8, 88.3, 85.7 & 18.6, 3.9, 14.6 & 70.3, 70.3, 69.1 & 25.3, 25.9, 25.7 \\
  \hline
  & \multicolumn{2}{|c|}{Dk} & \multicolumn{2}{|c|}{Dt2} \\
  \cdashline{2-5}
  & \textit{no pp,pp1, pp2} & \textit{no pp,pp1,pp2} & \textit{no pp, pp1,pp2} & \textit{no pp, pp1,pp2} \\
  \cdashline{2-5}
  baseline & 54.2, 54.4, 54.7 & 27.1, 26.9, 24.4 & 78.2, 77.3, 78.6 & 32.6, 32.9, 29.4 \\
  stochastic & 58.1, 58.2, 57.9 & 20.1, 19.9, 20.01 & 80.8, 80.8, 80.8 & 18.8, 19.0, 19.3 \\
  nested & 55.7, 56.0, 56.5 & 31.1, 30.2, 24.4 & 79.5, 78.3, 79.8 & 30.8, 30.7, 28.8 \\
  combined & 58.1, 58.4, 58.6 & 21.2, 20.6, 19.9 & 80.6, 80.1, 80.4 & 18.9, 18.9, 19.5 \\
  \hline
  \end{tabular}
  }
\end{table}


\subsection{Number of prompts \& annotation time} \label{sec:ablation_slices}

In order to evaluate how the performance changes if multiple prompts are used for initialization, the number of initialized slices is increased to 3, 5, 7, 10, 20, 30, 40, 50 for Ds. The slice chosen randomly around the center of the object is kept the same, the additional slices are chosen randomly in the volume of the object. The propagation is started from the center slice. When a slice with a given prompt is reached, this initialized prompt is used instead of the prompt generated from the prediction, and the propagation continues anew from it. The Dice coefficient for the different numbers of prompts is shown in \figureref{fig:number_of_prompts} (blue dots).

An exact time analysis of the prompt annotation process was not conducted, since the prompts have been generated automatically from the existing ground truth labels. In a real-world scenario without ground truth labels, the prompts need to be drawn manually. In order to assess the annotation time depending on the number of prompts used for initialization of our approaches, an annotation time estimation is calculated as
\begin{equation}
    (t_a \cdot c  + t_s) \cdot N, \label{eq:annotation_time}
\end{equation}
with the annotation time for one prompt per class $t_a$, the number of classes $c$, the time to scroll and navigate through the 3D volume per slice $t_s$ and the number of slices $N$. For Ds, the number of classes is $4$. Under the assumption that drawing one prompt per slice takes $5$ seconds per class and scrolling through the volume takes $2$ seconds per slice, Equation~\eqref{eq:annotation_time} depends only on $N$, which we set equal to the number of initialized prompts. For Ds, the average number of slices containing all structures is at most $269$, which we use as $N$ for fully-prompted SAM. \figureref{fig:number_of_prompts} (red line and markers) shows the estimated annotation time for the same set of prompt numbers as used for the performance gain analysis.

\begin{figure}[htbp]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:number_of_prompts}
  {\caption{Analysing performance (Dice) and estimated annotation time (min) with respect to different numbers of prompts given as initialization for Ds. The dotted line corresponds to the Dice value of the fully-prompted SAM.}}
  {\includegraphics[width=0.65\linewidth]{prompts_analsysis.png}\vspace{-20pt}}
\end{figure}


\newpage

\subsection{Performance evolution} \label{sec:ablation_performance}

To show the performance evolution across volumes, the performance is evaluated for different subsets of slices surrounding the slice with the initial prompt. \figureref{fig:performance_evolution1} and \figureref{fig:performance_evolution2} show the results for all four datasets and different ranges of slices. The trend is a decrease in performance since errors accumulate and are propagated towards the boundary of the object. Dataset Ds is an exception to this trend due to the discrepancy between full-bone and cortical-bone segmentation. While ground truth labels represent full-bone segmentation, predictions primarily capture cortical bone, particularly in the center of the object (see \figureref{fig:cortex}). Increasing the number of slices towards the boundary, where predictions are filled, improves Dice scores.

\begin{figure}[htbp]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:performance_evolution1}
  {\caption{{Analysing performance for different ranges of slices surrounding the initial slice for internal datasets Ds and Dk.}}}
  {
  \subfigure[Ds]{%
  \label{fig:performance_ds}% label for this sub-figure
  \includegraphics[width=0.98\linewidth]{ds_performance.png}
  } %\qquad % space out the images a bit
  \subfigure[Dk]{%
  \label{fig:performance_dk}% label for this sub-figure
  \includegraphics[width=0.98\linewidth]{dk_performance.png}
  } %\qquad % space out the images a bit
  }
\end{figure}

\begin{figure}[htbp]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:performance_evolution2}
  {\caption{Analysing performance for different ranges of slices surrounding the initial slice for external datasets Dt1 and Dt2.}}
  {
  \subfigure[Dt1]{%
  \label{fig:performance_dt1}% label for this sub-figure
  \includegraphics[width=0.98\linewidth]{dt1_performance.png}
  } %\qquad % space out the images a bit
  \subfigure[Dt2]{%
  \label{fig:performance_dt2}% label for this sub-figure
  \includegraphics[width=0.98\linewidth]{dt2_performance.png}
  } %\qquad % space out the images a bit
  }
\end{figure}


\newpage

\subsection{Hyperparameter settings} \label{sec:ablation_study}

\figureref{fig:hyperparameter} shows the results for hyperparameter testing to determine the default settings for the range to randomly sample the initial slice (\figureref{fig:baseline_center}), the number of iterations (\figureref{fig:stochastic_iterations}) and minimal number of models contributing to the final mask (\figureref{fig:stochastic_majority_vote}) in the stochastic approach.

\begin{figure}[h]
    \floatconts
    {fig:hyperparameter}% label for whole figure
    {\caption{Results for hyperparameter testing on Ds}} % caption for whole figure
    {%
    \subfigure[range to sample initial slices in $f_b$]{%
    \label{fig:baseline_center}% label for this sub-figure
    \includegraphics[width=0.48\linewidth]{baseline_center02.png}
    } %\qquad % space out the images a bit
    \subfigure[number of iterations in $f_s$]{%
    \label{fig:stochastic_iterations}% label for this sub-figure
    \includegraphics[width=0.48\linewidth]{stochastic_iterations02.png}
    }
    \subfigure[minimal number of models contributing to the final mask in $f_s$]{%
    \label{fig:stochastic_majority_vote}% label for this sub-figure
    \includegraphics[width=0.48\linewidth]{stochastic_model_vote.png}
    }
    \vspace{-10pt}
    }
\end{figure}

\newpage

\section{Prediction examples} \label{sec:examples}

The following figures show examples of the predictions in different views after both postprocessing steps, except when stated differently. The methods are denoted with their first letters as abbreviations: ground truth (GT), reference SAM (R), baseline (B), stochastic (S), nested (N) and combined (C). White arrows highlight regions of interest.

\begin{figure}[htbp]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:cortex}
  {\caption{Cortex vs. full bone annotation protocol and predictions in coronal planes. For bone structures with visible cortex, the predictions vary between fully capturing the bone and only the bone cortex. First row: scapula (green) and humerus (red) in Ds; Second row: Tibial cortex (green) and tibial implant (red) in Dk.}}
  {\includegraphics[width=0.95\linewidth]{cortex_example.PNG}}
\end{figure}

\begin{figure}[htbp]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:shoulder}
  {\caption{Effect of postprocessing shown in 3D models. First row: scapula (green) and humerus (red) in Ds; Second row: scapula (yellow) and humerus (green) in Dt1.}}
  {\includegraphics[width=0.98\linewidth]{shoulder_example02.PNG}}
\end{figure}

\begin{figure}[htbp]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:shoulder02}
  {\caption{Effect of individual postprocessing steps shown in 3D models. First row: scapula (green) and humerus (red) in Ds; Second row: scapula (yellow) and humerus (green) in Dt1.}}
  {\includegraphics[width=0.98\linewidth]{shoulder_example03.PNG}}
\end{figure}

\begin{figure}[htbp]
    \floatconts
    {fig:example02}% label for whole figure
    {\caption{Examples of 3D models for Dk (a) and Dt2 (b). Dk shows wrong predictions of the implant plateau and overflow of the tibia to other structures, such as the ankle. Common areas of mistake for Dt2 are the ball-and-socket joint between femur and pelvis, as well as the ischium and pubis.}} % caption for whole figure
    {%
    \subfigure[Dk - tibia (green), implant (red)]{%
    \label{fig:dk_example}% label for this sub-figure
    \includegraphics[width=0.98\linewidth]{dk_example.PNG}
    }
    \subfigure[Dt2 - right hip (white), right femur (turquoise), left hip (green)]{%
    \label{fig:dt2_example}% label for this sub-figure
    \includegraphics[width=0.98\linewidth]{dt2_example.PNG}
    }
    }
\end{figure}

\begin{figure}[htbp]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:nnunet_example}
  {\caption{Predictions from fully supervised models with different sets of training labels. The top row and the zoom-in show the 3D model of scapula (green) and humerus (red), with the zoom-in focusing on the humerus head. The bottom row shows the axial slice.}}
  {\includegraphics[width=0.98\linewidth]{nnunet_example.PNG}}
\end{figure}

\end{document}
