\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{lmodern}

% figures
\usepackage{enumitem}
\usepackage{float}
\usepackage[export]{adjustbox}

% table
\usepackage{multirow}
\usepackage{longtable}
\usepackage{array}
\usepackage{arydshln}
\usepackage{caption}
\usepackage{makecell}
\usepackage{hhline}

% colors
\usepackage{tikz}
\usetikzlibrary{shapes.geometric}
\usepackage{xcolor}
\definecolor{blue}{HTML}{636EFA}
\definecolor{red}{HTML}{EF553B}
\definecolor{green}{HTML}{00CC96}
\definecolor{purple}{HTML}{AB63FA}
\definecolor{oranje}{HTML}{FFA15A}
\definecolor{cyan}{HTML}{19D3F3}
\definecolor{cherry}{HTML}{FF6692}
\definecolor{pomme}{HTML}{B6E880}
\definecolor{pink}{HTML}{FF97FF}
\definecolor{gold}{HTML}{fecb52}
\definecolor{grey}{HTML}{808080}

\usepackage{pifont}% http://ctan.org/pkg/pifont
\newcommand{\cmark}{\ding{51}}%
\newcommand{\xmark}{\ding{55}}%
\newcommand{\tabitem}{~~\llap{\textbullet}~}

\newcommand{\tikdot}[1]{\tikz\draw[#1,fill=#1] (0,0) circle (.5ex);}
\include{symbols}

\usepackage{mwe} % to get dummy images
\jmlryear{2025}
\jmlrworkshop{Full Paper -- MIDL 2025}
\jmlrvolume{-- nnn}
\editors{Accepted for publication at MIDL 2025}

\title[Zero-shot capability of SAM-family models]{Zero-shot capability of 2D SAM-family models for bone segmentation in CT scans}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Caroline Magg\nametag{\orcid{0009-0008-5592-2586}$^{1,2}$}} \Email{c.magg@amsterdamumc.nl} \\
\Name{Clara I. S\'anchez\nametag{\orcid{0000-0001-9787-8319}$^{1,2}$}} \\
\Name{Hoel Kervadec\nametag{\orcid{0000-0002-6786-7042}$^{1,2}$}} \\
\addr $^{1}$ University of Amsterdam, The Netherlands \\
\addr $^{2}$ Amsterdam UMC location University of Amsterdam, The Netherlands
}

\begin{document}

\maketitle

\begin{abstract}
The Segment Anything Model (\textsc{Sam}) and similar models form a family of promptable foundation models (FMs) for image and video segmentation. The object of interest is identified using prompts---user-provided input such as bounding boxes or points---and the models have shown very promising results when it comes to generalization to new tasks.
However, extensive evaluation studies are required for medical applications, to assess their strengths and weaknesses in clinical settings.
As the performance of those models is highly dependent on the quality and quantity of their prompts, it is necessary to thoroughly benchmark the different options. 
Currently, no dedicated evaluation studies exist specifically for bone segmentation in CT scans. 
Leveraging high-quality private and public datasets on four skeletal regions, we test the zero-shot capabilities of \textsc{Sam}-family models for bone CT segmentation, using non-interactive prompting strategies, composed of bounding box, points and combinations of the two. Additionally, we design a guideline for informed decision-making in 2D non-interactive prompting based on our insights on segmentation performance and inference time.
Our results show that \emph{\textsc{Sam}} and \emph{\textsc{Sam2}} currently outperform medically fine-tuned FMs and that, when prompted with a bounding box together with a center point, they achieve the best performance across all tested settings. Our code is available in this \href{https://github.com/CarolineMagg/SAM-family-2D-benchmark}{GitHub repository}.
\end{abstract}

\begin{keywords}
Segment anything model, Medical image segmentation, Foundation models, Bone segmentation
\end{keywords}

\section{Introduction}

The release of the Segment Anything Model (\textit{\textsc{Sam}}) \cite{kirillov2023sam} started a family of promptable foundation models (FMs) for segmentation. Spatial information in the form of bounding boxes and points inside and outside the object is used as prompts to identify the object of interest. FMs are trained on huge datasets (i.e., hundreds of thousands of images and masks) and their design allows them to generalize to unseen tasks and data. As data scarcity and domain shifts are common problems in medical image segmentation, FMs appear as an alternative to fully supervised, specialized models trained on annotated data.

Since \textit{\textsc{Sam}} and \textit{\textsc{Sam2}} are trained on natural image materials, there remains a gap in applicability for medical data due to the modality differences (natural images vs. medical scans) and image size (2D vs. 3D, at much higher resolutions). Efforts to address this gap have focused on fine-tuning and modifying the \textit{\textsc{Sam}}-architecture to improve its suitability for medical imaging, resulting in versions such as \textit{Med-\textsc{Sam}} \cite{jun2024medsam}, \textit{\textsc{Sam}-Med2d} \cite{cheng2023sammed2d}, \textit{\textsc{Sam}-Med3d} \cite{wang2024sammed3d}, \textit{Med-\textsc{Sam2}} \cite{zhu2024medsam2}. Beyond model adaptations, thorough evaluation studies are essential to understand the current performance behavior, to identify potential weaknesses, risks and limitations in clinical settings and to formulate application guidelines for medical use cases. 

Table \ref{tab:related_work} shows evaluation studies closely related to our work, especially those conducted with a variety of medical image datasets. The conclusion of several evaluation studies \cite{MAZUROWSKI2023eval, he2023eval, mattjie2023eval, cheng2023eval, HUANG2024eval, dong2024eval} is that performances are unstable across different datasets and tasks. The models tend to struggle with small, irregular structures with low-contrast or fuzzy boundaries, leading to unsatisfying results. In contrast, they show promising results on larger structures with clear, sharp boundaries. Given that bone appears in CT scans with high-intensity values and well-defined boundaries, we hypothesize that \textsc{Sam}-family models are well-suited to achieve promising results for this task. However, there is no dedicated study focused on CT scans for bone segmentation. In addition, existing studies
% \cite{MAZUROWSKI2023eval, he2023eval, mattjie2023eval, roy2023sammd, HUANG2024eval, shen2024eval, cheng2023eval, HUANG2024eval, dong2024eval, sengupta2024eval, yu2024eval} 
primarily evaluate the performance of only \textsc{Sam} and \textsc{Sam2} with a very limited set of prompting options. As the model prediction is directly driven by the provided prompts, it is essential to evaluate a broader variety of options (e.g., prompt combinations).

\begin{table}[h!]
    \centering
    \footnotesize
    \renewcommand{\arraystretch}{1}
    \setlength{\tabcolsep}{2pt}
    \caption{Overview of evaluation studies similar to our work. Unknown model sizes are denoted as N/A. 
    \textsuperscript{\textdagger} corresponds to prompt application for the largest and multiple disconnected components of one object. The last column indicates whether bone CT segmentation is included in the dataset. \xmark* corresponds to testing on X-Rays of the hip \cite{Gut2021}.}
    \label{tab:related_work}
    \resizebox{0.91\textwidth}{!}{
    \begin{tabular}{llclc}
        Reference & Prompting strategies & Models & Dataset & Bone CT \\
        \hhline{=====}
        \cite{roy2023sammd} &
        \makecell[l]{\tabitem pos. random points \\
        \tabitem perturbed bounding box}
        & \makecell[c]{\textsc{Sam} \\ (size N/A)} & 1 public & \xmark \\
        \hdashline
        \cite{he2023eval} &
        \makecell[l]{\tabitem mass center \\
        \tabitem dilated bounding box}
        & \makecell[c]{\textsc{Sam} \\ (size N/A)} & 12 public & \xmark\\
        \hdashline
        \cite{MAZUROWSKI2023eval} &
        \makecell[l]{\tabitem center\textsuperscript{\textdagger} 
        \tabitem bounding box\textsuperscript{\textdagger} \\
        \tabitem simulated interactive 2D prompting}
        & \makecell[c]{\textsc{Sam} \\ (size N/A)} & 19 public & \xmark* \\
        \hdashline
        \cite{HUANG2024eval} &
        \makecell[l]{\tabitem center of mass/positive point \\
        \tabitem 5 pos./pos. and neg. random points \\
        \tabitem bounding box \\
        \tabitem bounding box + 1 positive point}
        & \makecell[c]{\textsc{Sam} \\ (all sizes)} & 53 public & \cmark \\
        \hdashline
        \cite{cheng2023eval} &
        \makecell[l]{\tabitem bounding box \\
        \tabitem center point of bounding box}
        & \makecell[c]{\textsc{Sam} \\ (size N/A)} & 12 public & \xmark \\
        \hdashline
        \cite{mattjie2023eval} & 
        \makecell[l]{\tabitem center \\
        \tabitem (distributed) pos. random points \\ 
        \tabitem (perturbed) bounding box}
        & \makecell[c]{\textsc{Sam} \\ (all sizes)} & 6 public & \xmark * \\
        \hdashline
        \cite{dong2024eval} &
        \makecell[l]{\tabitem center\textsuperscript{\textdagger} \\
        \tabitem bounding box\textsuperscript{\textdagger}} 
        & \makecell[c]{\textsc{Sam2} \\ (size N/A)} & 18 public & \xmark *\\
        \hdashline
        \cite{shen2024eval} &
        \makecell[l]{\tabitem pos. and neg. points \\
        \tabitem bounding box} & 
        \makecell[c]{\textsc{Sam}, \textsc{Sam2}, \\ Med-\textsc{Sam}, \\ \textsc{Sam}-Med2d \\ (sizes N/A)} & 1 public & \xmark \\
        \hdashline
        \cite{sengupta2024eval} &
        \makecell[l]{\tabitem pos. random points \\
        \tabitem 1 pos. and 2 neg. points}
        & \makecell[c]{\textsc{Sam} \& \textsc{Sam2} \\ (all sizes)} & 11 public & \xmark \\
        \hdashline
        \cite{yu2024eval} & 
        \makecell[l]{\tabitem 1 point (undefined) \\
        \tabitem bounding box} & 
        \makecell[c]{\textsc{Sam} \& \textsc{Sam2} \\ (size N/A)} & 2 public & \xmark \\
        \hline
        Our study & 
        \makecell[l]{\tabitem center\textsuperscript{\textdagger}  \tabitem centroid\textsuperscript{\textdagger} 
        \tabitem bounding box\textsuperscript{\textdagger} \\
        \tabitem pos. random points\textsuperscript{\textdagger} \\
        \tabitem bounding box + center\textsuperscript{\textdagger} \\
        \tabitem bounding box + pos./neg. points\textsuperscript{\textdagger} \\
        \tabitem center + 1,5 neg. points\textsuperscript{\textdagger} \\
        \tabitem 1,5 pos. + neg. points\textsuperscript{\textdagger}} 
        & \makecell[c]{\textsc{Sam}, \textsc{Sam2}, \\ Med-\textsc{Sam}, \\ \textsc{Sam}-Med2d \\ (all sizes)} & \makecell[c]{3 private + \\ 1 public} & \cmark \\
        \hline
    \end{tabular}
    }
\end{table}

The aim of this study is to investigate different non-interactive prompting strategies for \textsc{Sam}-family models on bone segmentation in CT scans under ``ideal'' 2D conditions, i.e., prompts are based on reference masks without manipulation or human error. We test $9$ \textsc{Sam}-family models with up to $32$ prompting strategies on four different skeletal regions containing different bone and metal structures. Based on our analysis of segmentation performance and model inference time, we introduce guidelines for choosing a 2D prompting strategy and model considering prompting preferences and inference time constraints.

\section{Dataset}

Medical SAM versions (e.g., \textit{Med-\textsc{Sam}}, \textit{\textsc{Sam}-Med2D}, or \textit{\textsc{Sam}-Med3d}) are fine-tuned on publicly available datasets containing bone segmentation from CT scans (e.g., TotalSegmentator \cite{wasserthal2023totalsegmentator}, CTPelvic1K \cite{liu2021ctpelvic1k}, VerSe2020 \cite{verse2021}). 
Therefore, private datasets are required for a fair and independent evaluation across all models, while public datasets enable other researchers to reproduce findings. To achieve this balance, we compiled a private dataset from the department of Orthopedic Surgery and Sports Medicine of the Amsterdam UMC of 80 CT scans from three skeletal regions. Additionally, we selected the TotalSegmentator dataset \cite{wasserthal2023totalsegmentator} as the public dataset for comparison (D4), as it includes a pre-defined train-and-test split. Although neither \textit{Med-\textsc{Sam}} nor \textit{\textsc{Sam}-Med2D} specifies its exact dataset splits, all models in our work are evaluated on a subset of the test set\footnote{\url{https://zenodo.org/records/10047292}, dataset v2.01}. Thus, our dataset consists of 80 private and 71 public CT scans of four different skeletal regions, annotated with labels for various bone structures and one metal structure (\figureref{fig:datasets}). Extra dataset details can be found in Appendix \ref{sec:dataset_description_detail}. The public samples are only used for the comparison to the private dataset; the remaining evaluation is performed on the private dataset alone.

\begin{figure}[h!]
   \centering
   \vspace{-10pt}
   \includegraphics[width=0.8\textwidth]{datasets.png}
  \caption{Dataset composition: private dataset containing 80 CT scans from three skeletal regions: shoulder (D1), wrist (D2) and lower leg (two sets of tibia segmentation, cortical (D3a) and full (D3b)); and public dataset containing 71 CT scans from two skeletal regions: shoulder (D4a) and hip (D4b).}
  \label{fig:datasets}
\end{figure}

\section{Methods}

\subsection{SAM-family}

\paragraph{SAM}
The Segment Anything Model (\textit{\textsc{Sam}}) \cite{kirillov2023sam} was introduced as a promptable ``foundation model for image segmentation''. \textit{\textsc{Sam}} supports sparse prompts, i.e., bounding box and points (positive and negative), and dense prompts, i.e., masks. The architecture consists of three parts: First, the image encoder, a Masked Autoencoder (MAE) pre-trained Vision Transformer (ViT), is run once per image to create image embeddings of the 2D image input. Then, the prompt encoder creates prompt embeddings for each prompt type. Finally, the lightweight mask decoder combines both embeddings and an output token and predicts the final segmentation mask.
%Since prompts can be ambiguous, the model produces by default three masks with confidence scores that are based on estimated IoU. We employ the option for a single output mask.
%The model was trained in three stages employing a model-assisted annotation workflow to annotate the final training dataset SA-1B, which consists of 11M images and over 1B masks.
The model is available in three different sizes: base (B), large (L) and huge (H), depending on the ViT encoder.

\paragraph{SAM2} \cite{ravi2024sam2} is an extension of \textit{\textsc{Sam}} with the additional capability of video segmentation. This is realized by changes in the architecture: The ViT encoder is replaced by a MAE pre-trained Hiera image encoder, and a memory mechanism is introduced to fuse frame embeddings with past frame features and predictions. Due to different Hiera sizes, four different versions are available: base plus (B+), tiny (T), small (S) and large (L).

\paragraph{Med-SAM} \cite{jun2024medsam} was introduced as ``a foundation model for promptable medical image segmentation''. Without any adaptations to the \textsc{Sam} architecture, \textit{\textsc{Sam} B} is fine-tuned on a medical image dataset with a focus on cancer types. \textit{Med-\textsc{Sam}} only supports bounding boxes as it was only retrained for this prompt type.

\paragraph{SAM-Med2d} \cite{cheng2023sammed2d} was developed by fine-tuning \textit{\textsc{Sam} B} on SA-Med2D-20M \cite{ye2023sammed2d_dataset} with an adapter technique using learnable adapter layers. The model keeps the functionality of both sparse prompts, i.e., bounding box and point.

\subsection{Prompting Strategies}

We use non-interactive prompts, which are automatically extracted from the reference masks. A prompt consists of at least one primitive % (i.e., \texttt{bounding box}, \texttt{center}, \texttt{centroid}, \texttt{positive},  \texttt{negative} points) 
and one component selection criterion. % (i.e., largest (\texttt{1C}) or up to 5 components (\texttt{5C})).

\paragraph{Primitives}
There are 5 primitives which are the building blocks for a prompt (\figureref{fig:prompts}):
\begin{enumerate}[label=(\alph*)]
    \item \texttt{bounding box}: Tight box enclosing the entire object.
    \item \texttt{(EDT) center}: The point furthest away from the object boundary (with respect to the Euclidean distance transform). In case of ties, a single candidate is chosen at random. For simplicity, we refer to it as \texttt{center} from now on.
    \item \texttt{centroid}: Center of mass with homogeneous density. Note that there is no guarantee that the centroid is inside the object. Despite this shortcoming, we include it for completeness and as other existing work \cite{he2023eval} used it.
    \item \texttt{positive} points: Random points within the region. To avoid random points on the border, the reference mask is eroded by a $3\times3$ kernel before sampling.
    \item \texttt{negative} points: Random points outside the region but close to the border. %To ensure that the points remain near the object and prevent scattering across distant image regions, 
    % A margin through mask dilation is created for point extraction. 
    The mask is dilated in two steps: first with a $5\times5$ kernel and then with a $15\times 15$ kernel. The point(s) is (are) then sampled from the difference between these two dilations.
\end{enumerate}

The prompt primitives are extracted for each component of the reference mask larger than 15 pixels or larger than 5\% of the entire component in a slice. Components below these thresholds cannot realistically be annotated, since their bounding boxes collapse and they contain too few pixels to sample 10 distinct random positive points. The thresholds were chosen empirically after dataset inspection.

\paragraph{Component selection criteria}
As shown in \figureref{fig:prompts}, anatomical structures can consist of multiple disconnected components in a 2D slice. In our datasets, the number of disconnected components does not exceed 6, which only occurs for fewer than 10 slices in our entire dataset. Thus, for a prompt, the primitives of either the largest component (\textit{1C}, denoted as open symbols) or up to 5 components (\textit{5C}, denoted as closed symbols) are used.

\begin{figure}[H]
     \centering
     \subfigure[\texttt{bbox}][b]{\includegraphics[width=0.17\textwidth]{bbox.png}}
     \subfigure[\texttt{center}][b]{\includegraphics[width=0.17\textwidth]{center.png}}
     \subfigure[\texttt{centroid}][b]{\includegraphics[width=0.17\textwidth]{centroid.png}}
     \subfigure[\texttt{positive}][b]{\includegraphics[width=0.17\textwidth]{random.png}}
     \subfigure[\texttt{negative}][b]{\includegraphics[width=0.17\textwidth]{negative.png}}
  \caption{5 prompt primitives form the building blocks. The largest component's prompt is blue (i.e., one component, \textit{1C}), while the others are white, resulting in the strategy with multiple components (\textit{5C}), when all components are used.}
  \label{fig:prompts}
\end{figure}

\paragraph{A 2D Prompting strategy}
 (also referred to as ``prompt'') is defined by one or more prompt primitives and one component selection criterion.
They can be divided into three categories based on the primitive types:
\begin{itemize}
    \item One-type prompts (OT prompts): bounding box (\cwhitesquare{black}), center (\cwhitecircledot{black}), centroid (\cwhitecircle{black}), 1, 3, 5 or 10 positive random points (\cwhitetriangleup{black}, \cwhitetriangleright{black}, \cwhitetriangledown{black}, \cwhitetriangleleft{black}).
    \item bounding box + point combination prompts (BPC prompts): bounding box with center (\cwhitesquaredot{black}), with 1 or 5 positive random points (\cwhitesquarecross{black}, \cwhitediamondcross{black}), with 1 or 5 negative random points (\cwhitesquarex{black}, \cwhitediamondx{black}).
    \item Point based combination prompts (PB prompts): center with 1 or 5 negative points (\cwhitestartriangleupdot{black}, \cwhitestartriangledowndot{black}), 1 or 5 positive and negative random points (\cwhitestartriangleup{black}, \cwhitestartriangledown{black}).
\end{itemize}

The centroid is an unreliable primitive since it may lie outside the object depending on its shape, so it is not used for any combinations. Random point combinations are evaluated with one and five points to compare against the center point and to evaluate the impact of an increased number of points. In total, there are $32$ prompting strategies per model, with the exception of \textit{Med-\textsc{Sam}}, which only supports bounding boxes. Since the evaluation on D4 serves as a secondary objective to compare with D1 and expand the number of skeletal regions, the prompting strategies are restricted to bounding box (\cwhitesquare{black}), center (\cwhitecircledot{black}), and bounding box with center (\cwhitesquaredot{black}). We refer to the combination of a \textsc{Sam}-family model prompted with a specific prompting strategy as a ``setting''.

\subsection{Guidelines}

We derive guidelines based on two key considerations: First, preference of prompts, influenced by existing workflows or software solutions supporting specific annotations. Second, constraints on inference time and resources, influenced by task-specific requirements (e.g., real-time processing). The guidelines are summarized in a flowchart, with the end-leaves showing the best settings (i.e., highest DSC score on the private dataset) for each condition.

\subsection{Evaluation}

Two common segmentation metrics, Dice Similarity Coefficient (DSC) and 95\%-percentile Hausdorff Distance (HD95), were used to compare predictions with reference labels. Moreover, inference time for each model prediction was measured, including the recommended (i.e., no specific bone CT imaging preprocessing with leveling and windowing techniques) image and prompt preprocessing, but excluding image and prompt loading. 
For multiple prediction calls (as each individual class requires a separate prediction call since binary segmentation masks are returned), the image embedding is done once and reused for all class predictions. As a comparison to a fully supervised, dataset-specific model, a 2D and a 3D full resolution nnUNet \cite{isensee2021nnunet} were trained. Implementation details are available in Appendix \ref{sec:implementation}.

\section{Results}

\paragraph{Segmentation performance}
The segmentation performance of all settings averaged over the private dataset is shown in \figureref{fig:scatter_plot_overview}. Considering only segmentation metrics, the bottom right corner of \figureref{fig:scatter_plot_overview} shows an overview of the best performing methods with high DSC and low HD95. Visual examples are shown in \figureref{fig:visual_examples} and Appendix \ref{sec:dataset_specifics}. The best prompting strategy across all models and private datasets is \texttt{bbox+center 5C}, which reaches $90.89\%$ DSC and $1.87$mm HD95 (Appendix \ref{sec:top_10}). In comparison, the average 3D full resolution nnUNet performance is $97.74\%$ DSC and $1.72$mm HD95, showing a performance gap in favor of nnUNet. Comparing different numbers of points evaluated on the private dataset shows that the settings with the highest DSC are \textit{\textsc{Sam} H} \texttt{10 random points 1C} with $89.6$\% DSC for a point-based OT setting and \textit{\textsc{Sam} H} \texttt{5 positive + negative points 5C} with $91.1\%$ DSC for a PB setting (Appendix \ref{sec:different_number}).

\begin{figure}[h!]
    \centering
    \includegraphics[width=0.98\linewidth]{scatter_plot.png}
    \caption{Performance of prompting strategies averaged over private dataset: Scatterplot of (A) BPC prompts, (B) PB prompts, (C) OT prompts, and (D) zoom-in to the lower right corner of subplots (A)-(C). The symbol size in (A)-(C) corresponds to the DSC standard deviation (std), i.e., bigger symbol means higher std.\vspace{-10pt}}
    \label{fig:scatter_plot_overview}
\end{figure}

\begin{figure}[h!]
    \centering
    \includegraphics[width=0.95\linewidth]{visual_examples.png}
    \caption{Selected visual examples for \texttt{bounding box 5C} predictions for \textit{Med-\textsc{Sam}} \cblacksquare[0.3]{blue} and \textit{\textsc{Sam} B }\cblacksquare[0.3]{green} with low ($\downarrow$), medium (-) and high ($\uparrow$) DSC (\%).\vspace{-15pt}}
\label{fig:visual_examples}
\end{figure}

Analyzing the optimal prompting strategies for each dataset reveals variations across datasets (Appendix \ref{sec:dataset_specifics}). These differences become clear when comparing shoulder CT samples from the public and private data subsets, where, despite similar best DSC scores, the private dataset consistently achieves better DSC across many settings (Appendix \ref{sec:private_vs_public}). Additional insights come from the lower leg dataset (D3), where different labeling protocols, i.e., cortex versus full tibia bone segmentation, show that the full bone protocol yields superior metrics across all prompting strategies, as cortical bone is significantly over-segmented by the models (Appendix \ref{sec:different_labeling}).

\paragraph{Inference time} As inference time per slice (sec.) might be related to the number of model parameters, image size and prompting strategy, all of these are reported in Table \ref{tab:results_inference_time}. \textit{\textsc{Sam}-Med2D} has the fastest prediction time, with $0.054$ sec per slice.

\begin{figure}[h!]   
\captionof{table}{Average prediction time per slice (sec.): The table on the left sorts the inference time averaged over all prompting strategies in ascending order. The line plot on the right shows the time per slice (sec.) for the different prompting strategies for each model.}
\begin{minipage}[c]{.55\textwidth}
    \centering
    \setlength{\tabcolsep}{1.5pt}
    \begin{tabular}{|l|ccc|}
    \hline
    Model & \makecell{Avg. time per \\ slice (s)} & \makecell{\# Model \\ Parameter} & Image Size \\
    \hline
    SAM-Med2d &                 0.054 &       271 &    256x256 \\
       SAM2 T &                 0.068 &        38 &  1024x1024 \\
       SAM2 S &                 0.080 &        46 &  1024x1024 \\
      SAM2 B+ &                 0.113 &        80 &  1024x1024 \\
        SAM B &                 0.166 &        93 &  1024x1024 \\
       SAM2 L &                 0.240 &       224 &  1024x1024 \\
        SAM L &                 0.375 &       312 &  1024x1024 \\
        SAM H &                 0.657 &       641 &  1024x1024 \\
      Med-SAM &                 1.866 &        93 &  1024x1024 \\
      \hline
    \end{tabular}
\end{minipage}
\begin{minipage}[c]{.44\textwidth}
    \centering
    \includegraphics[width=1.0\linewidth]{inference_time.png}
\end{minipage}
\label{tab:results_inference_time}
\end{figure}

\vspace{-5pt}
\paragraph{Guidelines}
Based on the insights from segmentation performance and inference time, \figureref{fig:guidelines} shows our proposed guidelines for non-interactive 2D prompting. Depending on the prompt choice (i.e., no preference, bounding box, combination, one or multiple points) and time restrictions (i.e., low, medium, high), at least one optimal setting is provided.

\begin{figure}[h!]
    \centering
    \includegraphics[width=0.98\linewidth]{guidelines_color.png}
    \caption{Guidelines for non-interactive 2D prompting for bone segmentation in CT scans based on evaluating $258$ settings on 3 private data subsets.\vspace{-15pt}} 
    \label{fig:guidelines}
\end{figure}

\section{Discussion}

Based on \figureref{fig:scatter_plot_overview}, three trends emerge in segmentation performance of \textit{\textsc{Sam}}-family models. First, performance strongly depends on the prompting strategy. For \textit{\textsc{Sam}} and \textit{\textsc{Sam2}}, their symbols form an arc from optimal to suboptimal metrics, with increasing symbol size (indicating greater DSC standard deviation). Second, \textit{Med-\textsc{Sam}} and \textit{\textsc{Sam}-Med2d}, fine-tuned on medical datasets, are generally outperformed by \textit{\textsc{Sam}} and \textit{\textsc{Sam2}} with most \texttt{bbox}-based prompts. Third, zooming into the lower right corner of \figureref{fig:scatter_plot_overview} (D) (high DSC, low HD95) reveals three groups of strategies: \texttt{bbox}-only prompts, bounding box + point combinations, and point-based combinations (upper left quadrant). Overall, \texttt{bbox+center 5C} achieves the best performance across models and datasets (\figureref{fig:scatter_plot_best}).
The evaluation was performed on a shared server, where varying utilization may affect exact inference times. Despite this limitation, clear trends are observed: inference time is primarily influenced by image and model size, not by the prompting strategies (\tableref{tab:results_inference_time}). 
\textit{Med-\textsc{Sam}} is the slowest model, potentially due to an inefficient implementation.
Based on our results, we propose guidelines for non-interactive 2D prompting that consider both prompt preferences and time constraints (\figureref{fig:guidelines}). This gives practitioners a much simpler and clearer set of options to pick from when using FMs on new applications.

A limitation of our study is that we did not investigate why medically fine-tuned models are outperformed. We suspect this may be due to catastrophic forgetting and loss of general representations, but this requires future testing on the original task (i.e., natural image segmentation).
Currently, the evaluation and guidelines are limited to ``theoretical'' conditions, without taking into account human interaction and potential errors.
Although an observer study with human-generated prompts is outside the scope of this paper, based on our current insights, we hypothesize that model performance is robust to small variations in prompt position (e.g., center vs. 1 random point) and influenced by false negative prompts (e.g., 1C vs 5C). In future work we will perform an observer study with multiple observers to confirm these hypotheses.
Another direction for future work (intertwined with the observer study) is 3D prompting and 3D models like \textsc{Sam2}, \textsc{Sam}-Med3d, and Med-\textsc{Sam2}. These models offer broader possibilities but also introduce a higher prompting complexity, e.g., slice selection. Insights from our 2D analysis will guide the design of future studies for human-generated prompts and 3D \textsc{Sam}-family models, as they can help limit the prompt primitives to the ones showing better performance in the current study, considerably reducing the number of experiments and the observer time required for the analysis.

\section{Conclusion}
We tested $9$ different 2D \textsc{Sam}-family models with $32$ different non-interactive prompting strategies containing one-type and combination prompts, for bone segmentation in CT scans. 
Most notably, we found that ``vanilla'' \textsc{Sam} models consistently outperformed their medically fine-tuned versions.
From our results, we derived guidelines for non-interactive 2D prompting to guide practitioners when coming to new applications.

\clearpage  
\midlacknowledgments{We thank in alphabetical order Leendert Blankevoort, George S. Buijs, Johannes G.G. Dobbe, Arthur J. Kievit, Matthias U. Schafroth, Geert J. Streekstra, Stela Topalova, Lukas P.E. Verweij, and Annemiek ter Wee for their work, support and guidance in data acquisition and curation.}


\bibliography{midl25_91}

\clearpage

\appendix

\section{Dataset details} \label{sec:dataset_description_detail}

Our dataset consists of $80$ private and $71$ public CT scans of four different skeletal regions with various labels for bone structures and one metal structure:
\begin{itemize}
    \item D1 Shoulder: 15 private bilateral scans with 4 labels for left and right scapula and humerus.
    \item D2 Wrist: 40 private unilateral scans with 6 labels for capitate, lunate, radius, scaphoid, triquetrum, and ulna.
    \item D3 Lower Leg: 25 private unilateral scans with 2 labels for tibia bone and tibia implant. There are two different labeling protocols for the tibia bone: cortical bone (D3a) and full bone (D3b).
    \item D4a Shoulder: 35 public scans with same labels as D1.
    \item D4b Hip: 53 public scans with 5 labels for sacrum, right and left hip and femur.
\end{itemize}

All scans of the private dataset were acquired with a Brilliance 64-channel CT Scanner (Philips Healthcare, Best, The Netherlands) or a Siemens SOMATOM Force (Siemens Healthineers, Forchheim, Germany) with $160$ mAs, $120$ kV. The isotropic voxel spacing is $0.93$ mm, $0.32$ mm, and $0.48$ mm, for D1, D2, and D3, respectively. The annotations were generated using an in-house annotation software \cite{dobbe2014} and/or ITK-Snap \cite{itk_snap}.\newline

The TotalSegmentator test set contains $89$ scans, of which $18$ have been excluded because neither shoulder nor hip labels are present. Of the remaining $71$ scans, $16$ scans are included in both subsets (D4a and D4b).

\section{Ablation studies}

\subsection{Top 10 prompting strategies}  \label{sec:top_10}

The $10$ best performing prompting strategies across different models and datasets are shown in \figureref{fig:scatter_plot_best}. They are determined by ranking the prompting strategies from 1 to 32 (1 being the best) for each model based on their average DSC over the private dataset. The best method is \texttt{bbox+center 5C}, which reaches $90.89\%$ DSC and $1.87$mm HD95 on the private dataset (D1-D3), and $91.24\%$ DSC and $2.98$mm HD95 on the public dataset (D4).  To compare FMs with fully supervised and task-specific models, 2D and 3D full resolution nnUNets \cite{isensee2021nnunet} were trained for each private data subset. The training details are reported in Appendix \ref{sec:nnunet_details}. 

\begin{figure}[h!]
    \centering
    \sbox0{\begin{minipage}[c]{.75\textwidth}
    \centering
    \setlength{\tabcolsep}{2pt}
    \begin{tabular}{|c|ccc|}
    \hline
    Prompt &  avg Ranking & \makecell{DSC (\%) \\ avg (std)} & \makecell{HD95 (mm) \\ avg (std)}\\
    \hline
    \cblacksquaredot[0.3]{grey}    &  2.38 &  90.89 (10.0) &    1.87 (2.7) \\
    \cwhitesquaredot[0.3]{grey}    &  2.88 &  90.80 (10.0) &    1.79 (2.1) \\
    \cwhitesquarex[0.3]{grey}      &  4.38 &  90.49 (10.2) &    2.50 (2.8) \\
    \cblacksquarecross[0.3]{grey}  &  4.50 &   90.44 (9.6) &    2.50 (2.6) \\
    \cblacksquare[0.3]{grey}       &  5.33 &  88.68 (10.4) &    2.25 (2.2) \\
    \cblackdiamondcross[0.3]{grey} &  6.25 &   90.15 (9.0) &    2.26 (2.3) \\
    \cwhitesquarecross[0.3]{grey}  &  7.38 &   90.22 (9.7) &    2.49 (2.4) \\
    \cwhitesquare[0.3]{grey}       &  9.67 &  88.03 (10.7) &    2.39 (2.2) \\
    \cblackdiamondx[0.3]{grey}     & 10.38 &  87.91 (10.2) &   7.32 (16.1) \\
    \cblacksquarex[0.3]{grey}      & 10.75 &  87.67 (14.1) &  12.20 (28.1) \\
    \hline
    \end{tabular}
    \end{minipage}}
    \hspace{1em}
    \sbox1{\begin{minipage}[c]{.2\textwidth}
    \centering
    \adjincludegraphics[trim={{0.8\width} {0.08\height} 0 {0.01\height}}, clip, scale=0.5]{scatter_plot_marginal.PNG}
    \end{minipage}}

    \sbox2{\adjincludegraphics[trim={0 0 {0.2\width} 0}, clip, width=1.0\linewidth]{scatter_plot_marginal.PNG}}
    
    \begin{tikzpicture}
        \node[inner sep=0pt] at (-1.7,9) {\usebox0};
        \node[inner sep=0pt] at (0,0) {\usebox2};% second subfigure at bottom
        \node[inner sep=0pt] at (6,7.65) {\usebox1};% first subfigure at top with 10mm overlap
    \end{tikzpicture}
    \caption{$10$ best performing 2D prompting strategies across models. The prompt ranking is determined per model by means of the average DSC over all private data subsets (i.e., highest DSC corresponds to rank 1) and then averaged over all models. The visualization shows the scatter plot with the performance distribution per model across different prompting strategies. Note that the 10 best performing prompts are a subset of the prompts visualized in \figureref{fig:scatter_plot_overview}(D).}
    \label{fig:scatter_plot_best}
\end{figure}

\subsection{Different number of points} \label{sec:different_number}

In \figureref{fig:performance_points_all}, prompting strategies with different numbers of points evaluated on the private dataset are compared. For point-based OT prompts, the best DSC is achieved by \textit{\textsc{Sam} H} \texttt{10 random points 1C} with $89.6$\%, followed by \textit{\textsc{Sam} L} \texttt{10 random points 1C} with $87.4$\% and \textit{\textsc{Sam2} B+} \texttt{5 random points 1C} with $87.2$\%. For PB prompts \texttt{5 positive + negative points} performs best for \textit{\textsc{Sam} H} (\texttt{5C}, $91.1\%$), \textit{\textsc{Sam} B} (\texttt{1C}, $91.2\%$) and \textit{\textsc{Sam} L} (\texttt{1C}, $90\%$).

\begin{figure}[h!]
\centering
    \subfigure[Center and positive random points]{\includegraphics[width=0.8\linewidth]{barplot_all_points_all.png}}
    \subfigure[Point combinations]{\includegraphics[width=0.8\linewidth]{barplot_point_combination_all.png}}
    \caption{DSC (\%) performance for different number of points per model on the private dataset: (a) center point and 1,3,5,10 random positive points; (b) point combinations of center, 1 or 5 positive and negative random points.}
    \label{fig:performance_points_all}
\end{figure}

\clearpage

\subsection{Dataset-specific analysis} \label{sec:dataset_specifics}

As shown in \figureref{fig:datasetspecific_best_results}, different datasets have different ``best'' settings, i.e., the settings achieving the highest DSC (\%). Despite the dataset-specific differences, settings including the bounding box prompt primitive perform the best, with only one exception (SAM performance for D1). A comparison between the best SAM-family setting and a dataset-specific, fully supervised model, such as nnUNet, shows a performance gap in favor of nnUNet.

\begin{figure}[h!]
    \centering
    \begin{minipage}[c]{.98\textwidth}
        \includegraphics[width=0.98\linewidth]{radar_plot_datasets.png}
    \end{minipage} \hfill
    \begin{minipage}[c]{.98\textwidth}
        \centering
        \setlength{\tabcolsep}{1pt}
        \scalebox{0.8}{
        \begin{tabular}{|l|cccc|cc|}
        \hline
        Dataset &   Med-SAM &   SAM-Med2d &    SAM &   SAM2 & nnUNet 2D & nnUNet 3D full res\\
        \hline
         D1 & 52.74 \scriptsize{(19.8)} \cblacksquare[0.3]{blue} & 75.48 \scriptsize{(11.5)} \cblacksquaredot[0.3]{red} & \textbf{92.79 \scriptsize{(4.5)}} \cblackstartriangledowndot[0.3]{oranje} &     92.27 \scriptsize{(3.4)} \cblacksquare[0.3]{pomme} & 98.04 \scriptsize{(1.1)} & 98.52 \scriptsize{(0.8)} \\
        D2 &  83.78 \scriptsize{(4.8)} \cblacksquare[0.3]{blue} &  89.72 \scriptsize{(2.4)} \cwhitesquaredot[0.3]{red} &            96.91 \scriptsize{(0.7)} \cblacksquaredot[0.3]{green} &     \textbf{97.22 \scriptsize{(0.5)}} \cwhitesquarex[0.3]{pink} & 98.68 \scriptsize{(0.7)} & 98.84 \scriptsize{(0.4)} \\
        D3a & 54.86 \scriptsize{(29.3)} \cblacksquare[0.3]{blue} &    70.63 \scriptsize{(14.4)} \cblacksquare[0.3]{red} &            \textbf{77.29 \scriptsize{(13.5)}} \cwhitediamondx[0.3]{green} & 76.73 \scriptsize{(13.3)} \cwhitediamondx[0.3]{cherry} & 94.95 \scriptsize{(3.0)} & 93.50 \scriptsize{(4.5)} \\
        D3b &  80.62 \scriptsize{(6.7)} \cblacksquare[0.3]{blue} &  83.66 \scriptsize{(3.0)} \cwhitesquaredot[0.3]{red} &            \textbf{92.16 \scriptsize{(3.1)}} \cwhitesquaredot[0.3]{green} &   90.87 \scriptsize{(3.1)} \cblacksquaredot[0.3]{pink} & 96.85 \scriptsize{(3.0)} & 95.82 \scriptsize{(4.9)} \\
        D4a &  62.63 \scriptsize{(0.2)} \cblacksquare[0.3]{blue} &  54.74 \scriptsize{(0.2)} \cblacksquaredot[0.3]{red} &           85.93 \scriptsize{(0.1)} \cblacksquaredot[0.3]{oranje} &   \textbf{89.96 \scriptsize{(0.1)}} \cblacksquaredot[0.3]{pink} &   --\footnotemark[5]{} &   91.48 \scriptsize{(5.9)}\footnotemark[5]{} \\
         D4b &  69.67 \scriptsize{(0.1)} \cblacksquare[0.3]{blue} &  64.09 \scriptsize{(0.1)} \cblacksquaredot[0.3]{red} &              88.78 \scriptsize{(0.1)} \cblacksquare[0.3]{purple} & \textbf{ 89.92 \scriptsize{(0.1)}} \cblacksquaredot[0.3]{pomme} &   --\footnotemark[5]{} &   95.11 \scriptsize{(2.1)}\footnotemark[5]{} \\
        \hline
        \end{tabular}
        }
    \end{minipage}
    \caption{Best DSC (\%) results per data subset: Radar plot with best setting across subsets (i.e., bounding box for Med-SAM and bounding box + center for remaining models) and with best setting per subset (indicated in table). The table reports the best setting per dataset encoded in the setting symbols and the corresponding DSC (\%) scores. The highest scores by a SAM-family model are highlighted in bold for each subset.}
    \label{fig:datasetspecific_best_results}
\end{figure}

\footnotetext[5]{Results from TotalSegmentator v1 for selected labels: \url{https://github.com/wasserth/TotalSegmentator/blob/master/resources/results_all_classes_v1.json}, commit 9bd3ca1}

\subsubsection{Public vs. Private} \label{sec:private_vs_public}

As we have access to shoulder CT samples with the same label classes from our private dataset (D1) and the TotalSegmentator dataset (D4a), \figureref{fig:private_vs_public} shows the difference in DSC (\%) between the two data subsets for a selected subset of settings. For \textit{Med-\textsc{Sam}} and some \textit{\textsc{Sam2}} settings, the DSC on public data is higher, whereas, for \textit{\textsc{Sam}-Med2D} and almost all \textit{\textsc{Sam}} settings, the DSC on private data is higher. As the public dataset was part of the fine-tuning dataset of \textit{\textsc{Sam}-Med2D}, the poor results are surprising. However, looking at visual examples (\figureref{fig:ts_shoulder_examples}) shows that the shoulder joint and humerus are not always fully visible on the CT scans and that, especially, center-based prompting settings under-perform on the scapula class, a thin structure with lower contrast.

\begin{figure}[h!]
    \centering
    \includegraphics[trim={0 15 0 50}, clip, width=0.8\linewidth]{public_vs_private.png}
    \caption{Comparison of private (D1) and public shoulder dataset (D4a) with respect to DSC (\%) for selected settings. Red indicates that the private dataset performs better, blue that the public dataset performs better.}
    \label{fig:private_vs_public}
\end{figure}

\begin{figure}[h!]
    \centering
    \includegraphics[trim={0 180 0 0}, clip, frame, width=0.98\linewidth]{shoulder_examples.png}
    \caption{3D model examples of public shoulder dataset (D4a) for selected predictions (i.e., \textit{Med-\textsc{Sam}} (\cblacksquare{blue}), \textit{\textsc{Sam}-Med2D} (\cblacksquaredot{red}), \textit{\textsc{Sam} B} (\cblacksquaredot{green}), \textit{\textsc{Sam2} B+} (\cblacksquaredot{cyan}, \cblackcircledot{cyan}))  and reference labels in the lightgrey box in the lower left corner of each cell with low ($\downarrow$), medium (-) and high ($\uparrow$) DSC (\%). The labels are color-encoded: blue - scapula, brown - humerus.}
    \label{fig:ts_shoulder_examples}
\end{figure}

\clearpage

\subsubsection{Cortex vs. Full bone segmentation} \label{sec:different_labeling}

As shown in \figureref{fig:datasetspecific_best_results}, a noticeable difference can be seen between the two subsets of the lower leg (D3). The highest dataset-specific DSC scores in both cases are achieved with \textit{\textsc{Sam} B}. However, for D3a, the optimal setting reaches only $77.29\%$ DSC and $5.8$mm HD95, while for D3b, the best setting yields $92.16\%$ DSC and $1.9$mm HD95. \figureref{fig:performance_knee_comparison} presents the performance of both subsets for \textit{\textsc{Sam} B} across all 2D prompting strategies. The full bone labeling protocol outperforms the cortical protocol, achieving higher DSC and lower HD95 for each prompting strategy. \figureref{fig:examples_knee} showcases examples of different \textit{\textsc{Sam} B} prompting strategies, which illustrate the over-segmentation in the cortical protocol. A more task-tailored prompting strategy, such as placing negative points inside the bone (i.e., error regions), might achieve better performance by incorporating dataset-specific knowledge.

\begin{figure}[h!]
    \centering
    \includegraphics[width=.7\textwidth]{bar_plot_comparison_knee.png}
    \caption{\textit{\textsc{Sam} B} performance (top - DSC (\%); bottom - HD95 (mm)) for different labeling protocols in the lower leg dataset (D3), i.e., cortical tibia bone (D3a) and full tibia bone (D3b).}
    \label{fig:performance_knee_comparison}
\end{figure}

\begin{figure}[H]
\begin{tabular}{c|c|cccccc}
    \hline
    Reference & \cblackstar[0.5]{yellow} & \cblacksquarecross[0.4]{green} & \cblacksquare[0.4]{green} & \cblackdiamondcross[0.4]{green} & \cblacksquaredot[0.4]{green} & \cblackstartriangledowndot[0.4]{green} & \cblackcircledot[0.4]{green} \\ \hline
    \includegraphics[width=0.1\textwidth, trim=330 0 460 0, clip]{knee_ref_0.png} & \includegraphics[width=0.1\textwidth, trim=330 0 460 0, clip]{knee_nnunet2d_0.png} & \includegraphics[width=0.1\textwidth, trim=330 0 460 0, clip]{knee_bbox_random_one_multiple_0.png} & \includegraphics[width=0.1\textwidth, trim=330 0 460 0, clip]{knee_bbox_multiple_0.png} & \includegraphics[width=0.1\textwidth, trim=330 0 460 0, clip]{knee_bbox_negative_one_multiple_0.png} & \includegraphics[width=0.1\textwidth, trim=330 0 460 0, clip]{knee_bbox_center_multiple_0.png} & \includegraphics[width=0.1\textwidth, trim=330 0 460 0, clip]{knee_positive_negative_five_multiple_0.png} & \includegraphics[width=0.1\textwidth, trim=330 0 460 0, clip]{knee_center_multiple_0.png} \\ \hdashline
    
    \includegraphics[width=0.1\textwidth, trim=330 0 460 0, clip]{knee_ref_10.png} & \includegraphics[width=0.1\textwidth, trim=330 0 460 0, clip]{knee_nnunet2d_10.png} & \includegraphics[width=0.1\textwidth, trim=330 0 460 0, clip]{knee_bbox_random_one_multiple_10.png} & \includegraphics[width=0.1\textwidth, trim=330 0 460 0, clip]{knee_bbox_multiple_10.png} & \includegraphics[width=0.1\textwidth, trim=330 0 460 0, clip]{knee_bbox_negative_one_multiple_10.png} & \includegraphics[width=0.1\textwidth, trim=330 0 460 0, clip]{knee_bbox_center_multiple_10.png} & \includegraphics[width=0.1\textwidth, trim=330 0 460 0, clip]{knee_positive_negative_five_multiple_10.png} & \includegraphics[width=0.1\textwidth, trim=330 0 460 0, clip]{knee_center_multiple_10.png} \\ \hdashline
    
    \includegraphics[width=0.1\textwidth, trim=330 0 460 0, clip]{knee_ref_24.png} & \includegraphics[width=0.1\textwidth, trim=330 0 460 0, clip]{knee_nnunet2d_24.png} & \includegraphics[width=0.1\textwidth, trim=330 0 460 0, clip]{knee_bbox_random_one_multiple_24.png} & \includegraphics[width=0.1\textwidth, trim=330 0 460 0, clip]{knee_bbox_multiple_24.png} & \includegraphics[width=0.1\textwidth, trim=330 0 460 0, clip]{knee_bbox_negative_one_multiple_24.png} & \includegraphics[width=0.1\textwidth, trim=330 0 460 0, clip]{knee_bbox_center_multiple_24.png} & \includegraphics[width=0.1\textwidth, trim=330 0 460 0, clip]{knee_positive_negative_five_multiple_24.png} & \includegraphics[width=0.1\textwidth, trim=330 0 460 0, clip]{knee_center_multiple_24.png} \\ \hline
\end{tabular}
\caption{Examples of knee dataset in axial view for reference labels, 2D nnUNet and \textit{\textsc{Sam} B} with different prompt strategies. In each row, the same axial slice is displayed with cortical (top) and full (bottom) tibia bone segmentation. The labels/prompts are color-encoded: yellow - tibia; green - tibia implant; brown - prompts for  \textit{\textsc{Sam} B} inference.}
\label{fig:examples_knee}
\end{figure}

\section{Implementation details}
\label{sec:implementation}

The official GitHub repositories (i.e., \textit{\textsc{Sam}}\footnote{\url{https://github.com/facebookresearch/segment-anything}, commit \texttt{6fdee8f}}, \textit{\textsc{Sam2}}\footnote{\url{https://github.com/facebookresearch/sam2}, commits \texttt{0e78a11} \& \texttt{29267c8}, weights from July 29, 2024}, \textit{Med-\textsc{Sam}}\footnote{\url{https://github.com/bowang-lab/MedSAM}, commit \texttt{2b7c64c}}, \textit{\textsc{Sam}-Med2d}\footnote{\url{https://github.com/OpenGVLab/SAM-Med2D}, commit \texttt{bfd2b93}}) are used for all models. Data preprocessing and weight download are performed as instructed.
The evaluation is performed on GPUs NVIDIA Geforce RTX 2080 Ti 12GB and an Intel Core Xeon Gold 6128 3.40GHz CPU, which are embedded in a server accessible to multiple users. Evaluation code is adapted from \citet{isensee2021nnunet} and \citet{jia2024seg}. Visualizations are created with 3D Slicer (\url{https://www.slicer.org/}) and plotly (\url{https://plotly.com/}).

\subsection{nnUNet training details} \label{sec:nnunet_details}
A 2D and a 3D full resolution nnUNet \cite{isensee2021nnunet} were trained on each of the datasets individually. The default training settings have been retained, except for the data augmentation for D1 and D3 and the division into training and validation folds. For D1, the mirroring on the vertical axes is removed since bilateral scans contain right and left labels. For D3, the mirroring on the horizontal axes is removed since a horizontally flipped femoral bone and implant show some similarity with the tibial counterparts. The models for D1 and D2 are trained and evaluated on a 5-fold, for D3 on a 4-fold patient-based cross-validation split. The results are denoted as \cwhitestar[0.4]{black}.

\end{document}
