\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{multirow}
\usepackage{threeparttable}
\usepackage{caption}
\usepackage{color}
% \usepackage[pagebackref,breaklinks,colorlinks,citecolor=cvprblue]{hyperref}
% \jmlrvolume{-- Under Review}
% \jmlryear{2024}
% \jmlrworkshop{Full Paper -- MIDL 2024 submission}
% \editors{Under Review for MIDL 2024}

\jmlryear{2024}
\jmlrworkshop{Full Paper -- MIDL 2024}
\jmlrvolume{-- nnn}
\editors{Accepted for publication at MIDL 2024}

\newcommand{\tabincell}[2]{\begin{tabular}{@{}#1@{}}#2\end{tabular}}
\newcommand{\bheading}[1]{{\noindent{\textbf{#1}}}}

\title[Slide-SAM: Medical SAM Meets Sliding Window]{Slide-SAM: Medical SAM Meets Sliding Window}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Quan Quan\midljointauthortext{Contributed equally}\nametag{$^{1,2}$}} \Email{quan.quan@miracle.ict.ac.cn}\\
\addr $^{1}$ Institute  of  Computing  Technology, Chinese Academy  of  Sciences \\
\addr $^{2}$ University of Chinese Academy of  Sciences \AND 
\Name{Fenghe Tang\midlotherjointauthor\nametag{$^{3,4}$}} \Email{fhtan9@mail.ustc.edu.cn}\\
\Name{Zikang Xu\nametag{$^{3,4}$}} \Email{zikangxu@mail.ustc.edu.cn}\\
\Name{Heqin Zhu\nametag{$^{3,4}$}} \Email{zhuheqin@mail.ustc.edu.cn}\\
% \addr $^{3}$ Address 3 \AND
\Name{S. Kevin Zhou\nametag{$^{1,2,3,4}$}} \Email{skevinzhou@ustc.edu.cn}\\
\addr $^{3}$ School of Biomedical Engineering, Division of Life Sciences and Medicine, University of Science and Technology of China \\
\addr $^{4}$ Suzhou Institute for Advanced Research, University of Science and Technology of China
}

\begin{document}

\maketitle

\begin{abstract}
The Segment Anything Model (SAM) has achieved notable success in two-dimensional image segmentation in natural images. However, the substantial gap between medical and natural images hinders its direct application to medical image segmentation tasks. Particularly in 3D medical images, SAM struggles to learn contextual relationships between slices, limiting its practical applicability. Moreover, applying 2D SAM to 3D images requires prompting the entire volume, which is time- and label-consuming. To address these problems, we propose Slide-SAM, which treats a stack of three adjacent slices as a prediction window. It first takes three slices from a 3D volume and point or bounding box prompts on the central slice as inputs to predict segmentation masks for all three slices. Subsequently, the masks of the top and bottom slices are used to generate new prompts for adjacent slices. Finally, step-wise prediction can be achieved by sliding the prediction window forward or backward through the entire volume. Our model is trained on multiple public and private medical datasets and demonstrates its effectiveness through extensive 3D segmentation experiments, with the help of minimal prompts. Code is available at \url{https://github.com/Curli-quan/Slide-SAM}.
\end{abstract}

\begin{keywords}
Segmentation, Interactive segmentation.
\end{keywords}

\section{Introduction}

Nowadays, 3D medical image segmentation plays a crucial role in medical image analysis for clinical analysis and diagnosis. However, annotating 3D medical images requires a significant amount of human labor and time resources. Recently, the Segment Anything Model (SAM)~\cite{sam} demonstrates impressive zero-shot segmentation capabilities in large-scale computer vision tasks~\cite{zero1, zero2, zero3}. In the field of medical imaging, SAM introduces new possibilities for accelerating data annotation by using non-fixed points, bounding boxes, and rough mask prompts to define segmentation region categories. However, the huge distributional gap between natural images and medical images makes SAM inapplicable directly to medical images~\cite{gap1, sammed2d}. One straightforward solution to bridge the gap is finetuning SAM using medical images~\cite{sammed2d, medsam, msa}. 
Another feasible approach is to adjust the numeric range of medical images, making them visually more akin to natural images, which greatly improves the segmentation ability of SAM in the medical domain. 
% We will show the feasibility in our experiments. 
Given these straightforward solutions, we believe that this issue may not be the central challenge in medical segmentation tasks.


% \begin{figure}
%     \centering
%     \includegraphics[width=0.5\linewidth]{figs/scheme.pdf}
%     \caption{An overview of the inference of the proposed method. Given a point/box prompt on a slice, Slide-SAM auto-segment via sliding window \textbf{without additional prompts}.}
%     \label{fig:1}
% \end{figure}

The fundamental challenge that SAM faces with medical images, in our view, lies in its inability to efficiently segment 3D images. Most recent variants can only employ a slice-by-slice approach for processing volumetric images~\cite{sammed2d, medsam}. However, such methods require substantial manual assistance and overlook the contextual information between slices, resulting in evident discontinuities in the segmentation results on each layer. 
Some other methods utilize adapters to introduce information between slices~\cite{sam3d, medlsam, msa}, but they still require a substantial cost in terms of prompt annotations. Additionally, there exists another category of methods that directly extend SAM into a 3D model~\cite{sam3dadapter}. Nevertheless, this method relinquishes the assistance provided by SAM's pre-trained weights due to the difference between tasks, necessitating a more resource-intensive model training process. 
Indeed, a core question prominently arises: \textit{How can SAM be enabled to predict 3D data effectively with only one prompt while fully harnessing its pre-trained weights?}
% 
% In addition to the aforementioned issues, annotations are also of paramount importance in medical images. We observe that many publicly available medical datasets have only partial annotations (e.g. AbdomenCT-1K~\cite{Abdomenct1k}). Therefore, we pose the following question: \textit{How can we acquire more effective annotations for datasets with limited labeling?}
% 
To address it, we propose a network called Slide-SAM. Slide-SAM only requires a prompt from the central slice to simultaneously infer multiple adjacent slices, and the resulting predictions can be used to generate prompts for the next group of adjacent slices. This is achieved through a sliding window approach, ultimately enabling one prompt to segment an entire volume. Furthermore, Slide-SAM's architecture and task are similar to the original SAM (from an RGB image to 3 grayscale images), making it easier to leverage the pre-trained weights of SAM. 
% we find that by experimenting with various threshold ranges and generating prompts through superpixel methods, followed by SAM's automatic segmentation, we can obtain rich 2D pseudo labels. 
Regarding data, we utilize both 3D ground truth labels and 2D pseudo-labels generated by SAM and we introduce a hybrid loss function that controls which slices to calculate the loss on, allowing the incorporation of single-slice labels and multi-slice labels. 
% Additionally, we propose an ensemble strategy that combines Slide-SAM with 2D SAM, achieving better segmentation results through their complementary strengths.
Extensive experiments prove that our Slide-SAM can gain superior inference performance on 3D images with minimal prompt cost.

% In summary, our contributions can be summarized as follows:
% \begin{itemize}
%     \item We propose Slide-SAM, a network designed for 3D medical volume segmentation, which efficiently utilizes pre-trained weights and enables multi-slice inference with a sliding window approach.
%     \item We introduce a data enrichment strategy involving the exploration of different threshold ranges, superpixel-based prompt generation, and the use of SAM for automatic 2D segmentation, resulting in rich 2D segmentation results. We introduce a hybrid loss function that uses both 3D labels and 2D pseudo-labels, allowing for more effective training and performance improvement.
%     \item Our ensemble strategy combines Slide-SAM with 2D SAM, leveraging their complementary strengths to achieve improved single-prompt segmentation results.
%     \item We conduct extensive experiments on large-scale datasets, and the results prove that our Slide-SAM can gain superior inference performance on 3D images with minimal prompt cost.
% \end{itemize}
% We conduct extensive experiments on large-scale datasets, and the results prove that our Slide-SAM can gain superior inference performance on 3D images with minimal prompt cost.

\begin{figure*}
\centering
    \includegraphics[width=0.95\linewidth]{figs/training.pdf}
    \caption{The training pipeline of Slide-SAM. First, three adjacent slices are used as input and fed into the backbone network. Then, the Prompt encoder is employed to encode points or boxes. The Mask decoder receives the generated features from the previous step and generates masks for each slice using different heads. The hybrid loss is only computed for layers with labels.}
    \label{fig:train}
\end{figure*}
% 首先3个相邻的slices作为输入，送进主干网络。然后使用Prompt encoder对点或框进行编码。Mask decoder接收前面产生的特征后通过不同的头生成每个slice的mask。使用混合损失时，仅对有标签的层求损失。

% \underline{Annotated datasets:} We collect multiple datasets including over 4000 CT and MRI volumes and over 30,000 3D masks. We segment all 3D volumes and labels into sets of three consecutive slices, and resized them to (1024, 1024). The images are stored in JPG format with compression, while the labels are stored as sparse matrices.


% In this section, we first briefly introduce the structure of original SAM, and then introduce our improvements.

% \begin{figure*}
%     \centering
%     \includegraphics[width=0.8\linewidth]{figs/inference.pdf}
%     \caption{The inference process of Slide-SAM.}
%     \label{fig:infer}
% \end{figure*}
\section{Method}

\subsection{Structure of SAM}
SAM consists of three main components: image encoder, prompt encoder and light-weight mask decoder. The image encoder is based on the Vision Transformer (ViT)~\cite{vits} pretrained by MAE~\cite{mae} to extract
representations. Prompt encoders can handle both sparse (points, boxes, text) and dense (masks) prompts. In this paper, we mainly focus on sparse encoders, which represent points and boxes as positional encodings that are then summed with the learned embeddings for each prompt type. The mask decoder is a Transformer decoder block modified to include dynamic mask prediction headers. SAM uses bidirectional cross-attention in each block, one for prompt-to-image embeddings and the other for image-to-prompt embeddings, to learn the interaction between prompt and image embeddings. After fusing two embeddings, SAM upsamples the image embedding, and then a multilayer perceptron maps the output labels to a dynamic linear classifier that predicts the target mask given the image.

\subsection{Structure of Slide-SAM}
\bheading{Feature Encoder:}
% For feature encoder, which is a transformer encoder, we introduce LoRA~\cite{lora}, which allows SAM to update only a small part of parameters in medical image training. 
% which not only saves computational overhead, but also reduces the deployment and storage difficulty of fine-tuning models while ensuring segmentation performance.
% The ViT-B is used as the backbone and freezed during training, and the embedding layer is set to be trainable.
Concerning the feature encoder, which is a transformer encoder, we incorporate LoRA~\cite{lora}, enabling SAM to update only a small subset of parameters during medical image training. The ViT-B (used in SAM-B) serves as the backbone and remains frozen throughout training, while the embedding layer is configured to be trainable.

% LoRA introduces bypasses which first compress the Transformer features into a low-rank space and then reproject the compressed features to align with the output feature channels of the frozen Transformer block. 
% Specifically, for each attention block in feature encoder, given an encoded token sequence $F \in \mathbb{R}^{B \times N \times C_{in}}$ and represented by a projection layer $W \in \mathbb{R}^{C_{out} \times C_{in}}$ The output token sequence of the operation $\hat{F} \in \mathbb{R}^{B \times N \times C_{out}}$, LoRA assumes that the update of $W$ should be asymptotic and stable, it is therefore proposed to apply low-rank approximation to describe this progressive update. According to this strategy, Slide-SAM first freezes the Transformer layer to keep $W$ fixed, and then adds a bypass to complete the low-rank approximation. This bypass contains two linear layers $A \in \mathbb{R}^{r \times C_{in}}$ and $B \in \mathbb{R}^{C_{out} \times r}$, Among them $r \ll \min\{C_{in}, C_{out}\}$. Therefore, the process of updating layer $\hat{W}$ can be described as follows:

% \begin{equation}
% \begin{aligned}
%     \hat{F} &= \hat{W} F \\
%     \hat{W} &= W + \Delta W = W + BA
% \end{aligned}
% \end{equation}


\bheading{Mask Decoder:}
Given an image with 3 slices $X \in \mathbb{R}^{H \times W \times 3}$, the mask decoder efficiently maps the image embedding $F_{im} \in \mathbb{R}^{c \times h \times w}$, prompt embeddings $P \in \mathbb{R}^{5 \times c}$, and an output token into feature maps $F_o \in \mathbb{R}^{c \times h \times w}$ representing the images, and three prediction heads $H$ representing the prompt-based tasks. In order to segment multiple slices at the same time, the feature maps are expanded from one to three, but are followed by the same heads. 

Specifically, for the feature map $F_o$, Slide-SAM has three different MLP blocks to convert the feature map $F_o$ into three feature maps $F_1 \in \mathbb{R}^{c \times h \times w}$, $F_2 \in \mathbb{R}^{c \times h \times w}$, and $F_3 \in \mathbb{R}^{c \times h \times w}$. Each feature map represents a slice. The heads $H$ are divided into three heads $H_{s1}\in \mathbb{R}^{c}$, $H_{s2}\in \mathbb{R}^{c}$, and $H_{s3}\in \mathbb{R}^{c}$ for segmenting different semantic regions, and $H_{u}\in \mathbb{R}^c$ for IoU prediction. Similar to SAM, we can obtain mask predictions $M \in \mathbb{R}^{H \times W \times 3 \times 3 }$ and IoU predictions $U \in \mathbb{R}^3$:
\begin{equation}
\begin{aligned}
    M_{ij} & = F_i \odot H_{sj}, & i, j \in \{1,2,3\} \\
    U_{j} & = \mathrm{MLP}(H_{u}) , &     
\end{aligned}
\end{equation}
where $\odot$ is point-wise multiplication. 
% Moreover, 
Additionally, to fully leverage the prior knowledge of SAM weights, we load all weights from the Transformer decoder and the weights of the heads. We duplicate the weights of the MLP block receiving the feature map $F_o$ from one to three to load into the three branches of Slide-SAM.

\bheading{Prompt Encoder:}
We consider two sets of prompts: sparse (points, boxes) and dense (masks). 
The distinction between our method and SAM lies in the fact that our input images consist of three slices, and we opt to select the middle slice as a reference to provide prompts. 
% In other words, the point or box prompts represent the points or outer bounding boxes of the middle slice among the three slices.
Regarding the mask prompt, we extend the input channel count of the convolutional block associated with the mask prompt to three, in order to facilitate the input of masks from three layers of slices.
Points and boxes are encoded by positional encodings summed with learned embeddings for each prompt type. Mask prompts are encoded and then summed element-wise with the image embedding.


\subsection{Training strategy}
\bheading{Hybrid loss:}
Given the prediction $M \in \mathbb{R}^{H \times W  \times 3 \times 3}$ and the ground-truth $\hat{M} \in \mathbb{R}^{H \times W \times 3}$, Slide-SAM adopts cross-entropy and Dice loss to supervise the fine-tuning process. The loss function can be described as follows:

\begin{equation}
\begin{aligned}
&\mathcal{L}^j_{seg}(M_j, \hat{M}) = \lambda_1\mathcal{L}_{ce}(M_j, \hat{M})+\lambda_2\mathcal{L}_{dice}(M_j, \hat{M}), \\
&\mathcal{L}^j_{iou}(U_j, M_j, \hat{M}) = \mathcal{L}_{mse}(U_j, IoU(M_j, \hat{M}))    
\end{aligned}
\label{eq:loss1}
\end{equation}

\begin{equation}
    \mathcal{L} = \mathcal{L}^k_{seg} + \mathcal{L}^k_{iou}, \quad k = \arg\min_{j} \mathcal{L}^j,
\label{eq:loss2}
\end{equation}
where $\mathcal{L}_{ce}$ and $\mathcal{L}_{dice}$ represent the cross-entropy loss and the Dice loss, respectively. $M_j \in \mathbb{R}^{H \times W \times 3}$ and $\hat{M} \in \mathbb{R}^{H \times W \times 3}$ represent the $j$-th candidate prediction and the ground truth, respectively. $\lambda_1$ and $\lambda_2$ represent loss weights, which are used to balance the impact between these two loss terms. $\lambda_1$ and $\lambda_2$ are set to 20 and 1 in practice, respectively.

Next, to facilitate concurrent training on 2D and 3D data, we introduce an indicator $\mathcal{I} \in \{0, 1\}^{3}$ to mark the slices for which the loss is computed. As a result, the segmentation loss in Eq.~(\ref{eq:loss1}) can be transformed as follows:
\begin{equation}
\begin{aligned}
\hat{\mathcal{L}}^j_{seg} = \mathcal{I}\,\mathcal{L}^j_{seg}(M_j, \hat{M}).
\end{aligned}
\label{eq:loss3}
\end{equation}

For instance, when using 3D labels, all three slices possess masks, resulting in each value of the indicator being 1. Conversely, when using 2D labels, only one slice contains a mask, leading to the indicator values being set to 1 for the slice with the mask and 0 for the others. 
In the case of IoU prediction loss, since it is not feasible to accurately predict all masks when using 2D labels, the exact IoU values remain unknown. Therefore, in such situations, we set the IoU prediction loss to be 0. 

\begin{figure}
    \centering
    \includegraphics[width=0.8\linewidth]{figs/inference.pdf}
    \caption{The inference process of Slide-SAM.}
    \label{fig:infer}
\end{figure}

\subsection{Inference}
Our inference process, as shown in Figure~\ref{fig:infer}, begins by selecting a specific layer as the starting point. We provide prompts for points or bounding boxes on this layer. Subsequently, we input this layer and its adjacent slices (3 slices in total) into the model to obtain segmentation masks. 
The masks at both ends are then denoised by morphological opening and used for computing bounding boxes for the next 3 consecutive slices.
% In this segmentation step, the central slice is the slice associated with the cue.
We iterate through the inference process in a \textbf{sliding window} fashion, extending towards both ends, until one of the masks at the ends is empty. 
% Finally, we obtain the complete 3D segmentation mask. 

% The steps of \textbf{sliding window} are as follows:
% % \bheading{3D post-processing (sliding window):} 
% First, we predict the results for the central slice. Then, the iterative inference process splits into two directions: \textit{forward} and \textit{backward}. For instance, in the \textit{forward} direction: 
% (a) We utilize the masks on the first slice of each predicted slice result. We apply morphological opening to denoise each mask and compute bounding boxes. These bounding boxes serve as prompts for another round of segmentation using the model. In this segmentation step, the central slice is the one associated with the prompt.
% (b) For areas on this slice that lack coverage from existing masks, we evenly sample points as prompts for segmentation, following the same segmentation procedure as described earlier. 
% (c) All obtained masks are then subjected to the filtering method described earlier. Subsequently, the process continues with shifting and predicting masks in the specified direction. The operations in the backward direction are similar to those outlined above.

\begin{table}[th]
\centering
\footnotesize
    \resizebox{1\linewidth}{!}{
        \begin{tabular}{l|ccccc|ccccc}
        \hline
        \multirow{2}{*}{\textbf{Method}} & \multicolumn{5}{c|}{\textbf{CHAOS}} & \multicolumn{5}{c}{\textbf{BTCV}} \\
        \cline{2-11}
        & \textbf{Liver} & \textbf{Kid(R)}& \textbf{Kid(L)} &\textbf{Spleen} &\textbf{Avg.}  & \textbf{Liver} & \textbf{Stomach} & \textbf{Kid(L)} & \textbf{Spleen} &\textbf{Avg.} \\
        \hline
        % \multicolumn{6}{l}{\textit{{\color{Gray} SUPERVISED METHOD}}} \\
         nnUNet~\cite{nnunet} & 87.95 & 93.91 & 93.67 & 87.78 & 90.82 & 96.17 & 76.79   & 94.81 & 93.84 & 90.40 \\
         % UNet~\cite{unet} & 79.80 & 76.80 & 73.30 & 88.10 & \\ss
        % \hline
        SSL-ALP~\cite{SSLALP} (1 volume) &63.82  & 56.52  & 63.68  &  73.40 &64.35 & 69.40 & 34.35   & 44.89 & 65.00 &53.40\\        
        % Self-ref~\cite{wang2022few} (1 volume) & 65.95  & 61.45   & 65.06    & 74.62   & \\
        \hline
        % \multicolumn{6}{l}{\textit{{\color{Gray} SAM VARIANTS}}} \\
         SAM (N points) &29.79&	36.93&	63.77&	45.55 & 44.01 &	39.60 & 31.74   & 68.57 & 23.46  & 40.84\\
         SAM-Med2d (N points) & 58.48 & 89.16 & 87.53 & 72.51 & 76.92 & 84.89 & 79.02   & 92.97 & 92.03 & 87.22 \\
         % \rowcolor{mygray}
         SAMMed-3D (N points) & 82.80 & 80.25 & 78.49 & 79.37 & 80.22 & 64.31 & 54.73 & 85.98 & 75.38 & 70.10 \\
         Ours (1 point) & 88.39 & 91.86&	90.74&	90.53 &90.38 & 92.38 & 89.03   & 92.35 & 75.62 &87.34 \\
         \hline
         SAM (N boxes) & 78.48&	93.81&	92.40&	91.84 & 89.13 &	72.18 & 83.98   & 93.67 & 88.51 &84.58 \\
         SAM-Med2d (N boxes) & 90.09 & \textbf{94.39} & \textbf{94.03} & 92.73 &\textbf{92.81} & 93.58 & 82.88   & 93.49 & \textbf{94.07} & 91.00\\
         Ours (1 box) & 88.42&	91.89&	90.98&	90.76 & 90.51 & 95.44 & 74.31   & 92.42 & 91.16 & 88.33\\
         % Ours (3 box) & 89.03&	92.00&	91.15&	91.79 & \\
         Ours (5 boxes) & 91.70&	92.18&	91.37&	92.03 & 91.82 & 95.62 & 90.88   & 93.49 & 91.88 & 92.96 \\
         Ours (N boxes) & \textbf{92.94}& 92.40&	91.60&	\textbf{92.75} & 92.42 & \textbf{95.75} & \textbf{93.24} & \textbf{93.52} & 93.44 & \textbf{93.98} \\
        \hline
        \end{tabular}
    }
    \caption{Evaluation on the CHAOS and BTCV test sets (\textbf{Dice (\%)}).}
    \label{tab:chaos}
\end{table}

\begin{table}[h]
\centering
\resizebox{1\linewidth}{!}{
\begin{threeparttable}
\begin{tabular}{clcccccc}
\hline
\textbf{Type} & \textbf{Methods}  & \multicolumn{3}{c}{\textbf{Pancreas Tumor}} & \multicolumn{3}{c}{\textbf{Colon Tumor}}  \\
\hline
\multirow{3}{*}{\tabincell{c}{Supervised \\ models}} &nnUNet~\cite{nnunet}$^1$       &&41.65    &&&43.91  \\
&nnFormer~\cite{zhou2023nnformer}$^1$      &&36.53    &&&24.28  \\
&Swin-UNETR~\cite{hatamizadeh2021swin}$^1$    &&40.57    &&&35.21  \\
\hline
&Num of Prompts & 1 & 3 & 10 & 1 & 3 & 10 \\
\cline{2-8}
\multirow{3}{*}{\tabincell{c}{SAM \\ variants}}&SAM-B~\cite{sam}$^1$         & 24.01 & 29.80 & 30.55   & 28.83 & 35.26 & 39.14\\
&SAM3d-Adapter~\cite{sam3dadapter}$^1$ & 54.09 & 54.92 & 57.47  & 48.35 & 49.43 & 49.99\\
&Ours          & \textbf{60.28} & \textbf{70.01} & \textbf{80.09} & \textbf{61.89} & \textbf{69.80} & \textbf{71.55} \\
              \hline
\end{tabular}
\begin{tablenotes}
    \footnotesize
    \item[1] Copied from \cite{sam3dadapter}.
\end{tablenotes}
\end{threeparttable}
}
\caption{Comparison with classical medical image segmentation methods on MSD Pancreas and MSD Colon datasets (\textbf{Dice (\%)}). }
\label{tab:tumor}
\end{table}

\begin{table*}[h]
\centering
\footnotesize
\resizebox{1\linewidth}{!}{
\begin{threeparttable}
\begin{tabular}{llrrrrrrrr}
\hline
\textbf{Method}   & \textbf{Prompt} & Liver    & Spleen    & Kid(L) & Kid(R)     & Stomach   & Gallbladder     &  Esophagus  & Pancreas \\
\hline
SAM-B     & $\sim$40 boxes  & 74.98 & 89.74 & \textbf{93.57} & \textbf{93.56} & 78.86 & \underline{82.21} & \textbf{72.09} & 62.31 \\
SAM-Med2d & $\sim$40 boxes  & 93.51 & 92.00 & 91.66 & 92.01 & 87.48 & 70.68 & 52.07 & 64.81 \\
SAMMed-3D & 1 point & 90.38 & 82.83 & 84.10 & 87.24 & 55.28 & 60.56 & 28.02 & 43.49 \\
\hline
         % \rowcolor{mygray} 
Ours      & 1 box    & 94.50  & 87.36 & 92.86 & 92.45 & 76.40 & 73.75 & 37.42 & 61.65 \\
         % \rowcolor{mygray} 
Ours      & 5 boxes  & \textbf{95.55} & \underline{92.03} & 92.32 & \underline{92.73} & \underline{91.35} & \textbf{83.01} & 48.97 & 65.54 \\
Ours (ViT-H) & 1 box & 94.36 & 90.01 & 93.04 & 91.39 & 76.46 & 69.03 & 60.75 & \underline{71.84} \\
Ours (ViT-H) & 5 boxes & \underline{95.25} & \textbf{92.46} & \underline{93.40}  & 92.60  & \textbf{92.08} & 78.57 & \underline{69.52} & \textbf{82.38} \\

\hline
\textbf{Method}   & Duodenum &Colon& Intestine& Adrenal& Rectum& Bladder & Femur(L)  & Femur(R)  & \textbf{Avg.}  \\
  \hline
SAM-B      & 59.03  & 34.99  & 54.98 & 17.21 & 84.05 & 86.23 & 89.53 & 89.08 &72.65 \\
SAM-Med2d & 58.10  & 54.54  & 73.32 & 15.45 & 85.37 & 91.71 & 89.40  & 89.24 & 75.14 \\
SAMMed-3D & 31.41 &24.45 &30.47 &0.86 &60.38 &86.84 &83.89 &73.65 &57.74\\
\hline
         % \rowcolor{mygray} 
Ours     & 44.75  & 31.69  & 50.92 & 24.80 & 73.92 & 90.85 & 89.12 & 90.33 &69.54 \\
         % \rowcolor{mygray} 
Ours    & \underline{62.89} & \underline{64.33} & \underline{76.07} & 32.54 & 76.94 & \textbf{92.88} & 92.04 & 92.60 & 78.23 \\
Ours (ViT-H)  & 50.00   & 50.15 & 56.78 & \underline{33.98} & \underline{77.86} & 84.11 & \underline{93.47} & \underline{93.72} & 74.18\\
Ours (ViT-H)  & \textbf{76.70} & \textbf{79.43} & \textbf{81.77} & \textbf{49.97} & \textbf{86.68} & \underline{91.76} & \textbf{93.89} & \textbf{93.99} & 84.38\\
\hline
\end{tabular}
% \begin{tablenotes}
%     \footnotesize
%     \item[] The segmentation targets from 1 to 16 are \textit{Liver, Spleen, Kidney(L), Kidney(R), Stomach, Gallbladder, Esophagus, Pancreas, Duodenum, Colon, Intestine, Adrenal, Rectum, Bladder, Head of femur (L), Head of femur (R)}.
% \end{tablenotes}
\end{threeparttable}
}
\caption{Evaluation on the WORD test set with \textit{box prompts} (\textbf{Dice (\%)}). The \textbf{best} and \underline{second best} results are highlighted.}
\label{tab:word}
\end{table*}
% \textcolor{red}{TODO ?}


\section{Experiment}
\subsection{Dataset preparation}
\bheading{Training data:}
The training data we used is divided into two parts. (1) \underline{Annotated datasets}. The public datasets include AbdomenCT-1K~\cite{Abdomenct1k}, TotalSegmentor~\cite{wasserthal2023totalsegmentator}, CTPelvic1K~\cite{CTPelvic1K}, WORD~\cite{word}, etc. and some private data;
(2) \underline{Pseudo-labels generated by SAM.} We use SAM to generate labels for unlabeled or partially labeled data. These labels are typically in 2D format and need to be used in conjunction with the hybrid loss function we propose.

% \textcolor{red}{
\bheading{Evaluation settings:} We choose the ISBI 2019 Combined Healthy Abdominal Organ Segmentation Challenge (CHAOS)~\cite{chaos}, the MICCAI 2015 Multi-Atlas Abdominal Labeling challenge (BTCV)~\cite{landman2015miccai} and WORD~\cite{word} as the validation datasets. CHAOS and BTCV are split into training and test sets in a 4:1 ratio. Twenty cases are taken from WORD as the test set. MSD Pancreas and MSD Colon~\cite{antonelli2022medical} are split into training, validation and test sets in a 7:1:2 ratio, following~\cite{sam3dadapter}. We choose \textbf{Dice} as the evaluation metric.
% }

\bheading{Preprocessing:}
% In order to effectively apply SAM to medical image segmentation, we preprocess the datasets from multiple perspectives. 
% In the context of the annotated 3D dataset, 
We initially apply value clipping to confine the range of CT data within $[-200, 400]$ and MRI data within $[0, 600]$. Subsequently, we standardize the intensity values of each volume to the range $[0, 255]$. We proceed to extract all slice images along the x, y, and z axes, along with their corresponding masks. These slices are then organized and saved in groups of three adjacent slices.
During the extraction process, we discard groups for which the percentage of the central slice's mask area is less than 0.14\%.
% For the unannotated portions of 3D data or those with partial annotations, we employ the SAM method to automatically predict annotations on the 3D data in a layer-wise manner, as referenced earlier. Subsequently, we save the slices and their corresponding pseudo-annotations. 
% The image data is saved in JPG compression format, while the mask data is stored in a sparse matrix compression format.


\bheading{Settings:} To quantitatively assess the performance of our model, we conduct comparative experiments involving fully-supervised networks, including nnUNet~\cite{nnunet}, a one-shot network SSL-ALP~\cite{SSLALP}, as well as SAM~\cite{sam}, SAM-Med2d~\cite{sammed2d}, and SAM3d-Adapter~\cite{sam3dadapter}. The fully supervised networks employ all training data and corresponding labels. SAM and SAM-Med2d are initialized with public weights. The one-shot segmentation model utilizes training data but abstains from using labels, and requires only the labels of one volume to make predictions.
% SAM (ft.) is finetuned using data from AbdomenCT-1K and labels for four specific organs, and the pseudo-labels we generated. 
% Our network is trained using a combination of publicly available datasets, a small quantity of private datasets, and the generated pseudo-labels.


% After performing these steps, we obtained approximately 4.6 million images and 19.7 million masks. Based on the image index, we randomly split the data into 80\% for training and 20\% for testing. The resulting training set contains approximately 3.67 million images and 15.8 million masks, while the test set contains 0.92 million images and 3.9 million masks. We also introduced 9 MICCAI2023 data sets (including approximately 520,000 images and 1.31 million masks), only to verify the generalization ability of the model. We believe that with a more comprehensive and diverse training data set, SAM will better adapt to the complexity and subtleties of the medical imaging field, providing more accurate and reliable application support. This will also bring new opportunities and challenges to the research and development in the field of medical image segmentation.

\bheading{Implementation details:} We use AdamW as the optimizer, and set the learning rate to 0.0002. $\beta_1$, $\beta_2$ and weight decay are set to 0.9, 0.999 and 0.1, respectively. We end training at 20 epochs. Our models are trained on 4 Nvidia RTX GPUs with 24\,GB of GPU memory.


\subsection{Main results}
% For 2D SAMs, as they can only process individual slices, a distinct prompt must be provided for each slice, which imposes a significant annotation cost burden on medical practitioners. 
In Table~\ref{tab:chaos}, we find that our method performs much better than SAM and SAM-Med2d when using point prompts, even though they require a prompt on every slice whereas we use only one point prompt.
When using 1 box prompt, we achieve similar performance to SAM-Med2d on multiple anatomies. In addition, when we also use box prompts on each slice like SAM, we can achieve the best performance on multiple anatomies.
While initial performance gains are significant, further improvements plateau as the number of prompts increases. 
This is because our method automatically generates reliable prompts for other slices, minimizing the marginal benefit of additional prompts.
% In Table~\ref{tab:chaos}, with only 1 or 5 prompts, we observe that our method achieves performance (90.51\% and 91.82\%) comparable to the fully supervised method nnUNet (90.83\%) and SAM-Med2d fine-tuned on extensive medical data (92.81\%). The latter requires an average of approximately 17 prompts for each annotation. When employing points prompts, not only do we utilize fewer prompts, but our performance also significantly surpasses SAM-Med2d by \textbf{11.65\%}. This is because we use points as the initial prompt,  if we generate box prompts for other layers, it can get better results.
% 
% In Table~\ref{tab:btcv}, our model also outperforms other methods in multiple anatomies (\textit{e.g.} kidney and liver). However, there is a gap in performance compared to SAM-Med2d in others (\textit{e.g.} gallbladder, esophagus). However, considering that their method requires an average of 34 prompts while ours only needs a single prompt, our method remains more advantageous. Especially when using points as prompts, since the initial predictions may have large errors, the errors will be propagated to other layers through the generated prompts, thus making the predictions of the entire volume worse.

For tumor detection, we compare our method with SAM3d-Adapter~\cite{sam3dadapter} and some popular segmentation methods, nnU-Net~\cite{nnunet}, nnFormer~\cite{zhou2023nnformer}, Swin-UNETR~\cite{hatamizadeh2021swin}. We finetune Slide-SAM with tumor datasets based on weights well-trained on the aforementioned large-scale medical datasets. As shown in Table~\ref{tab:tumor}, our model significantly outperforms existing supervised methods and SAM models, while requiring only a small number of prompts. This can greatly improve annotation efficiency.

For the WORD testset, as illustrated in Table~\ref{tab:word}, we achieve competitive results with only 5 prompts per anatomical structure, in stark contrast to SAM and SAM-Med2d, which necessitate approximately 40 prompts on average for each structure. Moreover, we leverage a larger backbone network (ViT-H) and incorporate additional CT data to train Slide-SAM, observing a more robust performance.

\begin{figure}[t]
    \centering
    \includegraphics[width=0.8\linewidth]{figs/prompt_efficiency.pdf}
    \caption{Labeling efficiency: The number of images that can be annotated using 1000 prompts for WORD testset.}
    % \includegraphics[width=0.8\linewidth]{figs/num_prompts.pdf}
    % \caption{The number of prompts required for dice to reach 90\% on WORD testset.}
    \label{fig:num_prompts}
\end{figure}

\begin{figure*}[h]
    \centering
    \includegraphics[width=0.8\linewidth]{figs/vis2.pdf}
    \caption{Visual comparison on the CHAOS dataset. } % The volumes are rendered using ITK-SNAP.
    \label{fig:vis3d}
\end{figure*}

\begin{figure}
    \centering
    \includegraphics[width=0.8\linewidth]{figs/vis2d.pdf}
    \caption{Predictions of BTCV testset with different \textit{noisy} prompts. We display 3 slices and their masks in RGB format.}
    \label{fig:vis2d}
\end{figure}

\subsection{Other analysis}
% \label{sec:pseudo}
\bheading{Analysis of prompt efficiency: }
% 假设我们将预测与真实标签的dice大于0.9设定为标注成功，我们统计了我们的方法与其他方法在同样使用1000个prompt的情况下能够成功标注图像的数量。如图所示，我们方法的标注效率远远高于其他方法。
Assuming we set the condition for successful annotation as having a Dice coefficient greater than 0.9 between predictions and ground-truth labels, we counted the number of images successfully annotated by our method and other methods (SAM, SAM-Med2D) when using the same 1000 prompts. As demonstrated in Figure~\ref{fig:num_prompts}, the annotation efficiency of our method is significantly higher than that of other methods.
% We count the number of prompts required to segment dice to 90\% for some anatomies as illustrated in Figure~\ref{fig:num_prompts}, and we find that the number of prompts we needed is much smaller than the 2D SAM methods. Figure~\ref{fig:num_prompts} proves that our model has higher annotation efficiency in actual annotation scenarios.

\bheading{Visual comparison:}
As depicted in Figure~\ref{fig:vis3d}, the segmentation results from the original SAM exhibit noticeable discontinuities between upper and lower layers, leading to incoherence between adjacent layers. Additionally, when using point prompts, a significant number of segmentation errors are prevalent, causing considerable challenges for experts. In contrast, our method produces remarkably smooth results, whether utilizing point prompts or box prompts as initial prompts. It provides excellent 3D segmentation results and facilitates subsequent annotation optimization, making it a more user-friendly option for experts.

\bheading{Noisy prompts:}
As shown in Figure~\ref{fig:vis2d}, we attempt to simulate a realistic annotation environment by using points or bounding box prompts with noise. We find that our method exhibits a certain level of robustness to noise. For box prompts, stable prediction results are obtained regardless of translation or scaling. The stability of point prompts is relatively lower.
Poor predictions may occur with points at edges.
% For some anatomies, poorer predictions may occur when the points are located in extreme edges.


\section{Conclusion}
One main challenge SAM encounters when applied to medical images stems from its limitations in effectively segmenting 3D image data. To address this, we introduce Slide-SAM, which leverages pretrained weights and facilitates multi-slice inference through a sliding window technique. Incorporating our data enrichment strategies and a hybrid loss function that encompasses both 3D labels and 2D pseudo-labels, our method enhances the training process and results in performance advancements. Extensive experiments prove that our Slide-SAM can gain superior inference performance on 3D images with a minimal prompt cost.

\section{Acknowledgements}
Supported by Natural Science Foundation of China under Grant 62271465, Suzhou Basic Research Program under Grant SYG202338, and Open Fund Project of Guangdong Academy of Medical Sciences, China (No. YKY-KF202206).

% ------------------------------------------------------------------------
% ------------------------------------------------------------------------
% ------------------------------------------------------------------------
% This is where the content of your paper goes.  Some random
% notes\footnote{Random footnote are discouraged}:
% \begin{itemize}
% \item You should use \LaTeX \cite{Lamport:Book:1989}.
% \item JMLR/PMLR uses natbib for references. For simplicity, here, \verb|\cite|  defaults to
%   parenthetical citations, i.e. \verb|\citep|. You can of course also
%   use \verb|\citet| for textual citations.
% \item Eprints such as arXiv papers can of course be cited \cite{Hinton:arXiv:2015:Distilling}. We recomend using a \verb|@misc| bibtex entry for these as shown in the sample bibliography.
% \item You should follow the guidelines provided by the conference.
% \item Read through the JMLR template documentation for specific \LaTeX
%   usage questions.
% \item Note that the JMLR template provides many handy functionalities
% such as \verb|\figureref| to refer to a figure,
% e.g. \figureref{fig:example},  \verb|\tableref| to refer to a table,
% e.g. \tableref{tab:example} and \verb|\equationref| to refer to an equation,
% e.g. \equationref{eq:example}.
% \end{itemize}


% \begin{figure}[htbp]
%  % Caption and label go in the first argument and the figure contents
%  % go in the second argument
% \floatconts
%   {fig:example}
%   {\caption{Example Image}}
%   {\includegraphics[width=0.5\linewidth]{example-image}}
% \end{figure}

% \begin{algorithm2e}
% \caption{Computing Net Activation}
% \label{alg:net}
%  % older versions of algorithm2e have \dontprintsemicolon instead
%  % of the following:
%  %\DontPrintSemicolon
%  % older versions of algorithm2e have \linesnumbered instead of the
%  % following:
%  %\LinesNumbered
% \KwIn{$x_1, \ldots, x_n, w_1, \ldots, w_n$}
% \KwOut{$y$, the net activation}
% $y\leftarrow 0$\;
% \For{$i\leftarrow 1$ \KwTo $n$}{
%   $y \leftarrow y + w_i*x_i$\;
% }
% \end{algorithm2e}

% % Acknowledgments---Will not appear in anonymized version
% \midlacknowledgments{We thank a bunch of people.}

\bibliography{midl23_059}
\newpage

\appendix
\section{Usage for clinicians}
% The practicality of integrating such an advanced tool into routine medical practice may raise concerns. These include the time and expertise clinicians need to provide initial prompts and to merge the tool into existing diagnostic workflows. The authors might consider adding a discussion to mitigate the learning curve for researchers and clinicians.
To assist clinicians in using our tool more easily, we provide a detailed overview of the workflow for our method. We plan to integrate our tool into existing annotation software. Assuming the annotation tool with our method is presented as follows, the process begins by dragging the data needing annotation into the tool. Then, a suitable slice is selected, and an enclosing box is drawn around the desired area for annotation. For further refinement of the annotation results, additional point prompts can be added, or a more suitable bounding box can be drawn on slices where other segmentation results are unsatisfactory. 
We recommend ensuring that the box prompt wraps around the target as much as possible to avoid missing cases.
As our method has certain performance limitations, if the desired results are not achieved, users can modify the labels manually using the annotation tool.

\begin{figure}[h]
    \centering
    \includegraphics[width=0.8\linewidth]{figs/tools.png}
    \caption{Example of labeling an anatomy. Drawing a bounding box of the target on a slice.}
    \label{fig:draw}
\end{figure}

\section{Dataset preparation}
The training data we used is divided into two parts. 
\begin{itemize}
    \item \underline{Annotated public datasets and private datasets}. The public datasets include AbdomenCT-1K~\cite{Abdomenct1k}, Total Segmentor~\cite{wasserthal2023totalsegmentator}, CTPelvic1K~\cite{CTPelvic1K}, WORD~\cite{word}, etc. and some private data.
    \item \underline{Pseudo-labels generated by SAM.} As shown in Figure~\ref{fig:pseudo}, we use SAM to generate labels for unlabeled or partially labeled data. These labels are typically in 2D format and need to be used in conjunction with the mixed loss function we propose.
\end{itemize}

\underline{Annotated datasets:} We collect multiple datasets including over 4000 CT and MRI volumes and over 30,000 3D masks. We split all 3D volumes and labels into sets of three consecutive slices, and resize them to (1024, 1024). The images are stored in JPG format with compression, while the labels are stored as sparse matrices.

\underline{Pseudo-labels:} 
Since some datasets have only partial annotations, we employ a straightforward method to generate a large number of pseudo-labels and apply them after training. Moreover, we observe that these data indeed result in a significant performance improvement.
The generation of pseudo-labels is as follows: We find that by adjusting the window width of CT or MRI images (i.e., adopting different truncation methods, such as constraining data within the range of [-200, 400] for CT), SAM can produce different results for the same data. We believe that this adjustment can make certain regions, originally with small color differences, more distinguishable, allowing SAM to segment these areas. Therefore, we use multiple truncation thresholds, $[\mu \pm 3\delta]$, $[\mu\pm 2\delta]$, $[\mu\pm\delta]$, and $[\mu\pm 0.5\delta]$, where $\mu$ and $\delta$ refer to the mean and standard deviation, respectively. In addition, we use superpixels to generate point/box prompts for SAM. We exclude superpixels with an average value below a certain threshold, as we consider them potentially representing background. Figure~\ref{fig:pseudo} illustrates the pseudo-labels we generate.

\begin{figure}[h]
    \centering
\includegraphics[width=0.8\linewidth]{figs/vmasks3.png}
    \caption{Pseudo labels on an image from AbdomenCT-1K dataset. (left) GT + Pseudo labels with different value ranges; (right) + superpixel-prompted pseudo labels. }
    \label{fig:pseudo}
\end{figure}

\section{Implementation details}

\bheading{Prompt generation:}
% \textcolor{red}{New}
Following SAM and other interactive segmentation models, we simulate an interactive segmentation setup during training. First, with equal probability, either a foreground point or bounding box is selected randomly for the target mask. Points are sampled uniformly from the ground truth mask. Boxes are taken as the ground truth mask’s bounding box, with random noise added in each coordinate with a standard deviation equal to 10\% of the box sidelength, to a maximum of 20 pixels. This noise profile is a reasonable compromise between applications like instance segmentation, which produces a tight box around the target object, and interactive segmentation, where a user may draw a loose box.

\subsection{Inference}
\bheading{2D post-processing:} 
\begin{itemize}
    \item (a) Filter out areas with IoU predictions less than 0.4;
    \item (b) Filter out areas with stability scores less than 0.6. The stability score is calculated as follows: given a certain stable interval such as [-0.1, 0.1], add the corresponding offset to the original logits and check the changes in the prediction area. Stability score = smallest prediction area/largest prediction area. Areas with a stability score less than a certain value will be filtered.
    \item (c) Calculate the circumscribed rectangle (bounding box) of each mask, and use non-maximum suppression (NMS) to remove overlapping masks, using all boxes and their corresponding prediction confidence values as input.
\end{itemize}

\bheading{3D post-processing (sliding window):} 
First, we predict the results for the central slice. Then, the iterative inference process splits into two directions: forward and backward. For instance, in the forward direction, 
\begin{itemize}
    \item (a) We utilize the masks on the first slice of each predicted slice result. We apply morphological opening to denoise each mask and compute bounding boxes. These bounding boxes serve as prompts for another round of segmentation using the model. In this segmentation step, the central slice is the one associated with the prompt.
    \item (b) For areas on this slice that lack coverage from existing masks, we evenly sample points as prompts for segmentation, following the same segmentation procedure as described earlier. 
    \item (c) All obtained masks are then subjected to the filtering method described earlier. Subsequently, the process continues with shifting and predicting masks in the specified direction. The operations in the backward direction are similar to those outlined above.
\end{itemize}


\bheading{Parallel strategy:}

We adopt a parallel strategy during inference, that is, divide all adjacent slices into multiple batches, and increase the batch size as large as possible to ensure that as much GPU memory is utilized as possible. Inference is performed on Nvidia Titan RTX 24G and the batchsize is set to 4. For Slide-SAM, the running speed is 3.25 sec/volume, and the GPU memory usage is 14G. For Slide-SAM-H, the running speed is about 10 sec/volume, and the GPU memory usage is 17G. In the future, we will also try more mobile-friendly technologies to make the model more lightweight and accelerate inference, such as using EfficientViT as the backbone or the SAMI pretraining strategy in EfficientSAM, using model compression and distillation technology or other operator acceleration technologies, etc.


\section{Other Analysis}
\subsection{Noisy Prompts}
\begin{figure}
    \centering
    \includegraphics[width=\linewidth]{figs/more_noise.png}
    \caption{Noisy prompts. The \textcolor{green}{green} box refers to the box prompt. White refers to the ground-truth label. \textcolor{red}{Red} refers to the overlapping of the prediction and the GT label. \textcolor{yellow}{Yellow} refers to the incorrect prediction.}
    \label{fig:more_noise}
\end{figure}

We test more noisy prompts as shown in Figure~\ref{fig:more_noise}. Here, we mainly test box prompts because the results obtained from box prompts are more robust. Box prompts are our primary annotation method. We find that areas not covered by box prompts are easily missed, and overly large box prompts may lead to incorrect predictions. However, slightly larger box prompts than the standard bounding boxes result in minimal impact. Therefore, we recommend making the box prompts slightly larger to avoid missing annotations as much as possible.


\subsection{Z-spacing}
Our method works fine under common spacing settings (such as (1,1,3), (1,1,5)). Currently, the performance of our Slide-SAM on consecutive slices with larger differences may be relatively poor. As shown in Table~\ref{tab:zspacing}, we resampled the volumes of the WORD testset along the z-axis to change the z-spacing. We find that slightly increasing the z-spacing does not have much impact on performance, but performance notably decreases when the z-spacing becomes too large. We plan to address this problem in future work, for example by adding an adaptive module to Slide-SAM to make the model more robust to slices that are further apart.

\begin{table*}[h]
\centering
\footnotesize

\begin{tabular}{l|cccccc}
\hline
\textbf{Method}  & \textbf{Z-spacing} & \textbf{Resampling ratio} & Liver    & Spleen  & Pancreas  & Gallbladder   \\
\hline
Slide-SAM-H & $2.50\,mm\sim3.00\,mm$ & 1.0 & 94.36  & 90.01   & 71.84 & 69.03 \\
 & $2.75\,mm\sim3.30\,mm$ & 1.1 & 94.99  & 89.10   & 71.18 & 69.31 \\
 & $3.00\,mm\sim3.60\,mm$ & 1.2 & 95.03  & 88.34   & 69.59 & 70.79 \\
 & $3.75\,mm\sim4.50\,mm$ & 1.5 & 94.51  & 89.17   & 68.36 & 68.10 \\
 & $5.00\,mm\sim6.00\,mm$ & 2.0 & 94.09  & 88.20   & 66.19 &  -   \\
 & $6.25\,mm\sim7.50\,mm$ & 2.5 & 93.29  & 89.15   & 60.66 &  -   \\
 & $10.00\,mm\sim12.00\,mm$ & 4.0 & 90.25  &  82.58 & 48.41  &  -   \\
 & $15.00\,mm\sim18.00\,mm$ & 6.0 & 85.84  &  75.20 & 42.38 &  -   \\
 & $17.50\,mm\sim21.00\,mm$ & 7.0 & 83.60  &  73.54 &  -  & - \\
 & $20.00\,mm\sim24.00\,mm$ & 8.0 & 81.71  &  - &  -  & - \\
\hline
\end{tabular}

\caption{Evaluation on WORD testset (\textbf{Dice (\%)}).}
\label{tab:zspacing}
\end{table*}


\subsection{Heterogeneous datasets}
We conduct additional testing on an in-house dataset sourced from the Guangdong Provincial People’s Hospital in China. This dataset comprises 120 MRI volumes, partitioned into subsets: 100 volumes for training, 10 volumes for validation, and 10 volumes for testing purposes. The scans focus on the lower abdomen, with the segmentation targets being the rectum and rectal tumors.
One challenge of this dataset is its divergence from our pre-training data, which primarily consists of CT scans. Specifically, the CHAOS dataset is our sole MRI dataset, involving upper abdominal scans.
Consequently, our model has not been exposed to lower abdominal MRI data previously. Furthermore, the spacing between slices in this dataset measures approximately (0.36, 0.36, 5), with a pronounced inter-slice gap. This structural peculiarity poses a significant challenge for our model's adaptation.
To address these challenges, we employ a fine-tuning strategy using the available training data and subsequently evaluate the performance of our model. The results are shown in Table~\ref{tab:rectum}.

\begin{table}[ht]
    \centering
    \begin{tabular}{llll}
    \hline
        \textbf{Method} & \textbf{Prompt} & \textbf{Rectum} & \textbf{Tumor} \\ 
        \hline
        nnUNet-2d &  & 59.62 & 47.56 \\ 
        nnUNet-3d &  & 68.48 & 61.68 \\ 
        \hline
        \multirow{4}{*}{Slide-SAM-finetune} & 1 box & 65.10 & 52.89 \\ 
         & 3 boxes & 75.92 & 61.85 \\ 
         & 5 boxes & 81.11 & 64.48 \\ 
         & 10 boxes & \textbf{83.96} & \textbf{65.12} \\ 
        \hline
    \end{tabular}
    \caption{Evaluation on Rectal in-house dataset. (Dice (\%))}
    \label{tab:rectum}
\end{table}

We find that when only one box prompt is used, the performance is not as good as nnUNet, but when we use 3 or more prompts, the performance can exceed nnUNet. Additionally, the segmentation accuracy can be further improved as the number of prompts increases.

\subsection{Pseudo labels}
To prove the efficacy of pseudo-labels, we conduct a comparative analysis by assessing model performance with and without their utilization. As shown in Table~\ref{tab:abla1}, we employ all slices from the 3D volumes of AbdomenCT-1K as the training images and finetune SAM with the incorporation of the LoRA module. Subsequently, validation is carried out on the test set of AbdomenCT-1K. Our findings indicate that, when using only the original data and their associated labels, the finetuned model's performance is even inferior to that of the original SAM. However, a notable enhancement in performance is observed when we incorporate the additional pseudo-labels, thereby affirming the constructive impact of the pseudo-labels used in our model training.

\begin{table}[h]
    \centering
    \small
    \begin{tabular}{cccc}
    \hline
         \textbf{mIoU}   &\textbf{ Dataset for ft.} & \textbf{Point} & \textbf{Box} \\
    \hline
        SAM &  & 56.16 & 72.36 \\
        SAM (ft.) & Abd-1K & 45.68&	56.06 \\
        SAM (ft.) & + pseudo masks& 66.82 & 74.87 \\
    \hline
    \end{tabular}
    \caption{Comparison between the utilization of generated pseudo-labels and their absence on AbdomenCT-1K testset.}
    \label{tab:abla1}
\end{table}


\end{document}
