% This is samplepaper.tex, a sample chapter demonstrating the
% LLNCS macro package for Springer Computer Science proceedings;
% Version 2.21 of 2022/01/12
%
\documentclass[runningheads]{llncs}
%
\usepackage[T1]{fontenc}
% T1 fonts will be used to generate the final print and online PDFs,
% so please use T1 fonts in your manuscript whenever possible.
% Other font encodings may result in incorrect characters.
%
\usepackage{multirow}
\usepackage{amsmath}
\usepackage{graphicx}
% Used for displaying a sample figure. If possible, figure files should
% be included in EPS format.
%
% If you use the hyperref package, please uncomment the following two lines
% to display URLs in blue roman font according to Springer's eBook style:
%\usepackage{color}
%\renewcommand\UrlFont{\color{blue}\rmfamily}
\usepackage[pagebackref=true,breaklinks=true,colorlinks,bookmarks=false]{hyperref}
\usepackage[marginal]{footmisc}
% \renewcommand{\thefootnote}{}
%
\begin{document}
%
\title{Advancing Multi-Organ and Pan-Cancer Segmentation in Abdominal CT Scans through Scale-Aware and Self-Attentive Modulation}
%
\titlerunning{Advancing Multi-Organ and Pan-Cancer Segmentation}
% If the paper title is too long for the running head, you can set
% an abbreviated paper title here
%
% \author{Pengju Lyu \inst{1,2}authornote{*}\and
% Junchen Xiong \inst{2}\authornote{*}\and
% Wei Fang\inst{2}\and
% Weifeng Zhang\inst{2}\and
% Cheng Wang\inst{2\dag} \and
% Jianjun Zhu\inst{2,3\dag} 
% }
\author{Pengju Lyu \inst{1,2}\textsuperscript{*}\orcidID{0009-0004-0863-8110} \and
Junchen Xiong\inst{2}\textsuperscript{*}\orcidID{0009-0000-1988-1184} \and
Wei Fang\inst{2}\orcidID{0009-0004-2291-6068}\and
Weifeng Zhang\inst{2}\orcidID{0000-0001-6408-6489} \and
Cheng Wang\inst{2}\textsuperscript{\dag}\orcidID{0000-0003-1138-337X} \and
Jianjun Zhu\inst{2,3}\textsuperscript{\dag} \orcidID{0000-0001-5895-7663}
}

%
\authorrunning{P. Lyu et al.}
% First names are abbreviated in the running head.
% If there are more than two authors, 'et al.' is used.
%
\institute{City University of Macau, Macau, China \and Hanglok-Tech Co., Ltd., Hengqin 519000, China \and Zhongda Hospital, Medical School, Southeast University, Nanjing 210009, China
% ABC Institute, Rupert-Karls-University Heidelberg, Heidelberg, Germany
\\
\email{\{pj.lv, cheng.wang, jj.zhu\}@hanglok-tech.cn}}
%

\maketitle              % typeset the header of the contribution
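% Use symbol marks only for the two author-related footnotes; keeping the
% redefinition local prevents later \footnote calls from reusing * and \dag.
\begingroup
\renewcommand{\thefootnote}{\fnsymbol{footnote}}

\footnotetext[1]{Equal contribution.}
\footnotetext[2]{Corresponding authors.}
\endgroup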

% \footnotetext{* Equal contribution.\\
% \dag \ Corresponding authors.}
%
\begin{abstract}
Accurately segmenting abdominal organs and tumors within computed tomography (CT) scans holds paramount significance for facilitating computer-aided diagnosis and devising treatment plans. However, inherent challenges such as lesion heterogeneity and the scarcity of adequately annotated data hamper model development. In this study, we present a two-phase cascaded framework to address the complexities of multi-organ and pan-cancer segmentation. A lightweight CNN first generates candidate regions of interest (ROIs) followed by a hybrid CNN-Transformer model culminating in refined segmentation by synergizing scale-aware modulation for local features and self-attention for global context.
Our proposed method secured the 5th position in the MICCAI FLARE23 final test set, showcasing its competitive edge in achieving precise target segmentation with mean Dice Similarity Coefficients of 90.51\% for multi-organ and 53.04\% for pan-cancer respectively.
Additionally, efficient inference is exhibited with an average runtime of 18 seconds per $512 \times 512 \times 215$ 3D volume with less than 2\,GB of GPU memory consumption. Our code is available at \url{https://github.com/lyupengju/Flare23}.

\keywords{Multi-organ and pan-cancer segmentation  \and Hybrid CNN-Transformer model \and Scale-aware and self-attention modulation.}
\end{abstract}

\section{Introduction}
Medical image segmentation plays a crucial role in clinical diagnosis. Accurate organ and cancer segmentation in abdominal computed tomography (CT), one of the most commonly used modalities for abdominal diagnosis, can assist clinicians in identifying distinct anatomical regions as well as assessing the structure of lesions, which assumes critical significance in computer-aided diagnosis, treatment planning, and image-guided interventions. For instance, the efficacy of radiotherapy treatment planning (RTP), to a great extent, hinges upon the precise demarcation of both the organ at risk (OAR) and the target tumor~\cite{zhu2023embedding}. Moreover, segmentation on pan-cancer enables the identification of common features and patterns across different cancer types, facilitating the development of targeted therapies and personalized medicine approaches, e.g., the identification of unique gene expression signatures associated with different cancers, which are valuable as diagnostic biomarkers and therapeutic targets~\cite{li2017comprehensive}.

In the deep learning era, the application of convolutional neural network (CNN) or Transformer-based U-Net represents a seminal milestone in the field of medical image segmentation. By virtue of its expansive encoder-decoder structure, U-Net~\cite{U-Net} effectively captures both local and global contextual information, enabling the precise delineation of anatomical structures. Its hierarchical approach, coupled with skip connections, facilitates the fusion of multi-scale features, empowering U-Net to discern fine-grained details and accurately segment complex structures in medical images. CNN-based U-Net variants~\cite{V-net,Unet3} leverage the power of convolutional layers to extract spatial features, enabling the network to discern intricate patterns and variations in tumor morphology with remarkable precision. On the other hand, Transformer-based U-Net models~\cite{Unetr,tang2022self,zhou2021nnformer} exploit self-attention mechanisms to capture long-range dependencies and contextual relationships, facilitating a comprehensive understanding of anatomical structures. The hybridization of CNN and Transformer~\cite{gao2021utnet,xie2021cotr} represents the pursuit of synthesizing the best of both paradigms, aiming to forge a sophisticated framework that pushes the boundaries of segmentation accuracy and efficiency.

Abdominal multi-organ and pan-cancer segmentation, however, continues to pose several challenges due to the inherent complexity and variability of cancer lesions, e.g., inter- and intra-tumor heterogeneity coupled with the presence of surrounding anatomical structures that can confound accurate segmentation~\cite{AbdomenCT-1K}. On top of that, the scarcity of cancer annotated datasets, especially for rare cancer types, poses a significant hurdle in training accurate and generalizable models. MICCAI FLARE23\footnote{\url{https://codalab.lisn.upsaclay.fr/competitions/12239}} (Fast, Low-resource, and Accurate oRgan and Pan-cancer sEgmentation in Abdomen CT) makes a significant contribution with the availability of an extensive partially labeled dataset, enabling comprehensive research and analysis in the field. To mitigate the requirement for fully labeled data, which aligns with the FLARE23 challenge's objectives, self-training with pseudo labeling and semi-supervised learning emerge as valuable strategies~\cite{liu2023towards}. Self-training entails the generation of surrogate labels through models trained on partially labelled datasets, thereby offering a bridge towards the realm of fully supervised methodologies. Lian et al.~\cite{lian2023learning} introduce a novel approach that employs partially labelled single-organ datasets to generate pseudo labels for multi-organ segmentation, utilizing partial and mutual priors to enhance organ segmentation performance. Through iterative pseudo labeling with one resource-intensive nnU-Net and selecting reliable ones, Huang et al.~\cite{huang2022revisiting}, under this knowledge distillation framework, ultimately attain a lightweight model achieving an accuracy and efficiency tradeoff in FLARE22~\cite{FLARE22}.
Semi-supervised learning leverages unlabeled samples to improve generalization~\cite{li2023multi} where consistency regularization is a popular approach enforcing invariant predictions under input perturbations~\cite{tarvainen2017mean}~\cite{lai2022dlunet}. 
Other than that, Pan et al.~\cite{pan2022unlabeled} adopt adversarial training~\cite{zhang2017deep} that focuses on training a generator against a discriminator that tries to differentiate segmented outputs derived from labeled versus unlabeled data to promote outputs distribution convergence.
On the other hand, the majority of extant deep learning architectures for medical image segmentation, such as APAU-Net~\cite{wang2019organ} and TransBTS~\cite{li2201transbtsv2}, albeit achieving impressive precision when optimized in high-compute laboratory settings with GPUs, typically manifest immense computational demands and parametric complexity. In bedside settings with on-device processing under limited computational resources and memory capacity, e.g., point-of-care imaging~\cite{valanarasu2203unext} or interventional surgeries demanding immediate decision-making~\cite{zhu2023embedding}, developing lightweight yet competent and scalable models for robust and reliable segmentation becomes paramount.

In this work, we aim to develop a fast, low-resource, and accurate organ and pan-cancer segmentation framework. Our approach is based on the classic two-phase (location-segmentation) cascaded processing stream, wherein a lightweight CNN employing partial convolution in phase one and a novel hybrid CNN-Transformer model with a synergistic amalgamation of scale-aware modulator and self-attention in phase two are proposed. We trained both models with the aforementioned simple self-training with pseudo labeling technique. The obtained results on the validation set not only demonstrate superior performance on Dice Similarity Coefficient (DSC) and Normalized Surface Dice (NSD) but also showcase favorable inference speeds, underscoring the efficacy and practicality of our proposed method.

\begin{figure}[htbp]

\centering
\includegraphics[scale=0.5]{imgs/two_phase.png}
\caption{An overview of the two-phase cascade network. 
}
\label{fig:two_phase}
\end{figure}

\section{Method}
We adopted a localization-and-segmentation strategy to instantiate the multi-phase cascade methodology, which has proven useful in past FLARE challenges~\cite{wang2019organ,wangcascade}; the overall framework is shown in Figure~\ref{fig:two_phase}. The first phase of the network bestows invaluable location information, furnishing a candidate frame that subsequently facilitates the precise cropping of the image's region of interest (ROI). This localized region (i.e., hard attention~\cite{jiang2022apaunet,zhang2021weakly}), thus extracted, serves as the input for the second-stage network, wherein the process of fine segmentation ensues. This sequential strategy imparts the profound advantage of confining the segmentation focus solely to the target organ, effectively excluding any perturbations arising from unrelated organs or background noise.


\begin{figure}[htbp]
\centering
\includegraphics[width=1.0\textwidth]{imgs/models.png}
% \includegraphics[scale=0.35]{imgs/111.png}
\caption{Schematic illustration of the proposed models.
(a) The shared two-phase backbone structure: the Phase\_1 model uses a residual inverted bottleneck block in which partial convolution (PConv) efficiently conducts spatial token mixing, while the Phase\_2 model utilizes the scale-aware modulator (SAM) or Multi-head Self-Attention (MSA) in a MetaFormer structure; (b) Phase\_1 decoder adapted from~\cite{xie2021segformer};
(c) Phase\_2 decoder adapted from~\cite{Unetr}.}

\label{fig:encoder}
\end{figure}


%###########################
\subsection{Preprocessing}
% Full description of any pre-processing strategy. Please details on the following aspects 

The preprocessing workflow commences with a percentile-based intensity rescaling (percentile values: 5th and 95th), which constrains the intensity range to the region containing salient features while suppressing outliers. It is followed by respacing to (1.5\,mm, 1.5\,mm, 2\,mm), which rectifies inter-slice spacing disparities, imparting uniformity to the image domain. Image intensities are further Z-normalized to ameliorate convergence dynamics and numerical stability during model training. For phase one, we resize the image to $(128, 128, 128)$. For phase two, patch-wise training is found to be optimal in identifying tumors; thus, four cubes of size $(96, 96, 96)$ are randomly cropped with a foreground-to-background ratio of $3:1$.
This process culminates in data augmentation where each patch is subjected to random operations, including flipping, rotation, affine, intensity shifting (offset: 0.1), and scaling (scaling factor: 0.1).


\subsection{Proposed Method}
The selection of a lightweight and computationally efficient model is of paramount importance in this framework. The careful choice of the model strikes a delicate balance between computational resource utilization and precision.
\subsubsection{Hierarchical Encoder}
We build our model for each phase upon the macro design of the U-Net~\cite{U-Net} architecture, which incorporates multiple levels of hierarchy to capture and process features at different scales, as shown in Figure~\ref{fig:encoder}. The encoder structure is shared across phases with one minor difference: the stem (patch embedding) block in phase one contains a convolution with kernel size and stride of 4, both of which are halved in phase two. With input size $H\times W\times D$ representing height, width, and depth, the stem module downscales the feature size to the corresponding $h\times w\times d$. The base channel number is set to 32 and 60 for phase one and phase two, respectively, at the initial stage and progressively doubles, while the feature map size $\frac{h}{2^{i-1}}\times \frac{w}{2^{i-1}}\times \frac{d}{2^{i-1}}$, $i \in \{1, 2, 3, 4\}$, shrinks down the four encoder stages. Between two consecutive stages, downsampling is carried out for resolution reduction and channel expansion by a $2\times 2\times 2$ convolution with stride 2 followed by layer normalization.




\subsubsection{Phase\_1 Model Components}
The localization network is represented by a binary segmentation U-Net, which is designed to treat all labeled organs as the foreground label. To obtain a coarse ROI, we resort to partial convolution (PConv)~\cite{chen2023run} as the choice of spatial token mixing. PConv improves the efficiency by applying filters on only a subset of input channels (first quarter in our case) while preserving the remaining ones. This reduces computational redundancy and the number of memory accesses, resulting in lower FLOPs (floating-point operations) than regular convolution and higher FLOPS (floating-point operations per second) than depthwise convolution. With the completion of shortcut connection and two successive pointwise convolutions, the Phase\_1 encoder presents itself as a stack of residual inverted bottleneck blocks in which the channel expansion ratio is 2 and the number of such blocks is set to 2 per stage.

For the decoding of Phase\_1, we employ the streamlined MLP decoder from SegFormer~\cite{xie2021segformer} for efficient information aggregation. Specifically, the multi-level features derived from the encoder blocks undergo channel-wise compression to the base channel number via MLP layers before being upsampled to the size of $h\times w\times d$, and a second MLP layer condenses the concatenated feature channels to the number of output classes. Trilinear interpolation is ultimately applied to recover the full image size.


\begin{figure}[htbp]
\centering
\includegraphics[width=0.9\textwidth]{imgs/module.png}

% \includegraphics[scale=0.4]{imgs/module.png}
\caption{(a) Spatial modulation comparison between the self-attentive and scale-aware operators. Self-attention first generates the key K, query Q, and value V using MLP layers, and the weights that modulate the V representations are the attention weights computed by measuring the similarity between Q and K. SAM instead
directly obtains the weights with a Multi-Head Mixed Convolution (MHMC) block and a Scale-Aware Aggregation (SAA) block. (b) Evolved from~\cite{lin2023scale}, the schematic illustration of SAM, which integrates multi-scale contexts via an MHMC and adapts token representations through an SAA.}
\label{fig:module}
\end{figure}


\subsubsection{Phase\_2 Model Components}
For fine segmentation, we take advantage of the strengths of both CNNs and Transformers in the MetaFormer style~\cite{yu2022metaformer}, in which each block contains a spatial token mixing layer and a feed-forward network (FFN)~\cite{ren2022shunted}. We adopt the Scale-Aware Modulator (SAM)~\cite{lin2023scale} to reweight the value representations for lower-level local feature extraction in early stages, while Multi-head Self-Attention (MSA)~\cite{Unetr} is dedicated to global information in later stages; see Figure~\ref{fig:module} for details. SAM consists of a Multi-Head Mixed Convolution (MHMC) and a Scale-Aware Aggregation (SAA) module to enable the integration of multi-scale contexts and adaptive modulation of tokens. Together, SAM and MSA provide complementary modeling of multi-scale local features and long-range global contexts. Their combination enables extracting both localized fine details and overall spatial relationships.

The MHMC introduces multiple depth-wise convolutions with different kernel sizes, enabling it to capture various spatial features across multiple scales. Figure~\ref{fig:module} illustrates the structure of MHMC, wherein the input channels are divided into multiple groups (heads), each subjected to depth-wise separable convolutions with diverse kernel sizes respectively, which are able to discern a diverse spectrum of granularity features in an adaptive fashion.

The SAA module engages in a practice of cross-group information aggregation across all features to harmonize diverse insights from distinct groups. Specifically, three mixed groups are curated with each selecting one channel from previously partitioned group, and the inverse bottleneck structure (expansion ratio = 2) with point-wise convolutions are subsequently leveraged fostering a holistic synergy of knowledge propagation and enriched representation, which, by means of the Hadamard product operation, eventually serves as weight modulator of the value V in contrast to yielding attention matrices via a matrix multiplication between the query and key in self-attention. The whole process of SAM can be summarized in the following steps:

\begin{scriptsize} 
\begin{equation}
    \begin{aligned}
        &\boldsymbol{Input}:\,\,     \boldsymbol{z}\in \mathbb{R} ^{C\times H^{\prime}\times W^{\prime}\times D^{\prime}}
        \\&\boldsymbol{MHMC}:\,\,H_{j}^{i}=DWConv_{k_j\times k_j\times k_j}\left( \boldsymbol{z}_{j}^{i} \right) , j\in \left\{ 1,2,\cdots ,M \right\} , i\in \left\{ 1,2,\cdots ,C/M \right\} \\&\boldsymbol{SAA}:\,\,     G_i=Relu\left( IN\left( Conv_{1\times 1\times 1}\left( \left[ H_{1}^{i},H_{2}^{i},\cdots ,H_{M}^{i} \right] \right) \right) \right) \\&\qquad \quad\ \  W=Conv_{1\times 1\times 1}\left( \left[ G_1,G_2,\cdots ,G_{C/M} \right] \right) \\&\boldsymbol{Output}:\,\,   \hat{\boldsymbol{z}}\,\,=\,\,W\odot \left( Conv_{1\times 1\times 1}\left( \boldsymbol{z} \right) \right) 
    \end{aligned}
\end{equation}
\end{scriptsize}

Let $\boldsymbol{z}\in \mathbb{R} ^{C\times H^{\prime}\times W^{\prime}\times D^{\prime}}$ denote the input tensor to the SAM module with $C$ channels and spatial dimensions $H^{\prime}\times W^{\prime}\times D^{\prime}$ for the current layer. We divide the channels into $M=3$ heads, indexed by $j\in \left\{ 1,2,3 \right\} $, with $C/M$ channels in each head. The output is denoted as $\hat{\boldsymbol{z}}$ with the same dimensions as $\boldsymbol{z}$. Within each head $j$, we have single-channel feature maps $\boldsymbol{z}_{j}^{i}\in \mathbb{R} ^{1\times H^{\prime}\times W^{\prime}\times D^{\prime}}$ for $i\in \left\{ 1,2,\cdots ,C/M \right\} $. These are convolved with learned depth-wise kernels $DWConv$ of size $k_j$, where we set $k_j\in \left\{ 3,5,7 \right\} $ for the 3 heads respectively. $\odot $ denotes the Hadamard (element-wise) product.

The initial two stages consist solely of SAM blocks. During the penultimate stage, three SAM blocks and three Multi-Head Self-Attention (MSA) blocks are alternately stacked, effectively capturing the transition from local to global dependencies. In the ultimate stage, exclusively MSA blocks are employed, thereby ensuring proficient capture of long-range dependencies. The number of blocks in the four stages amounts to $2, 4, 6, 2$, respectively.

We adopted phase\_2 decoder similar to that from UNETR~\cite{Unetr}. A residual block, composed of two consecutive sequences of Conv + InstanceNorm + LeakyRelu, is applied to skip connections as well as subsequent concatenated features. Upsampling is realized with transpose convolution. 



% \textbf{Please introduce your strategies to use the unlabeled images.} If you don't use them, please explicitly say "Unlabeled images were not used."
% Please also clarify whether you used the pseudo labels generated by the FLARE21 winning algorithm~\cite{FLARE22-1st-Huang} and the best-accuracy-algorithm~\cite{FLARE22-bestDSC-Wang}.

% \textbf{Please introduce your strategies to improve inference speed and reduce resource consumption} 


\subsection{Post-processing}
After phase one, we remove objects of size smaller than $(20\times20\times20)$, which might be outliers hindering precise ROI cropping for phase two. The phase-two results are refined by preserving solely the largest components of the organs. We further observed that the predicted tumor mask could appear detached from the abdominal organs even though it lies within the ROI defined by the bounding box, which contradicts the well-established fact that tumors originate on organs. We therefore process the tumor mask with basic morphological operations of dilation and subtraction to identify any organs in its proximity, thereby filtering out isolated components, as shown in Figure~\ref{fig:post_processing}. The
 resultant mask is finally mapped back to the size of the input image.


\begin{figure}[htbp]
\centering
\includegraphics[width=0.65\textwidth,height=0.3\textwidth]{imgs/post_processing.png}

% \includegraphics[scale=0.4]{imgs/post_processing.png}
\caption{Feasibility analysis of post-processing operations. It is evident that the proposed post-processing applied to the predictive mask effectively eliminates isolated tumors.
}
\label{fig:post_processing}
\end{figure}


\section{Experiments}
\subsection{Dataset}
% The FLARE 2023 challenge is an extension of the FLARE 2021-2022~\cite{MedIA-FLARE21}\cite{FLARE22}, aiming to aim to promote the development of foundation models in abdominal disease analysis. The segmentation targets cover 13 organs and various abdominal lesions. The training dataset is curated from more than 30 medical centers under the license permission, including TCIA~\cite{TCIA}, LiTS~\cite{LiTS}, MSD~\cite{simpson2019MSD}, KiTS~\cite{KiTS,KiTSDataset}, and AbdomenCT-1K~\cite{AbdomenCT-1K}. The training set includes 4000 abdomen CT scans where 2200 CT scans with partial labels and 1800 CT scans without labels. The validation and testing sets include 100 and 400 CT scans, respectively, which cover various abdominal cancer types, such as liver cancer, kidney cancer, pancreas cancer, colon cancer, gastric cancer, and so on. The organ annotation process used ITK-SNAP~\cite{ITKSNAP}, nnU-Net~\cite{nnUNet}, and MedSAM~\cite{MedSAM}.


% The evaluation metrics encompass two accuracy measures—Dice Similarity Coefficient (DSC) and Normalized Surface Dice (NSD)—alongside two efficiency measures—running time and area under the GPU memory-time curve. These metrics collectively contribute to the ranking computation. Furthermore, the running time and GPU memory consumption are considered within tolerances of 15 seconds and 4 GB, respectively.
The FLARE23 challenge constitutes an extension of its precursor, the FLARE 2021-2022 initiative~\cite{MedIA-FLARE21}~\cite{FLARE22}. Its primary objective is to foster the advancement of foundational models in the realm of abdominal disease analysis. The delineation objectives encompass a spectrum of 13 distinct organs including liver, spleen, pancreas, right kidney, left kidney, stomach, gallbladder, esophagus, aorta, inferior vena cava, right adrenal gland, left adrenal gland, and duodenum in addition to diverse abdominal lesions, namely pan-cancer.
The training dataset is curated from more than 30 medical centers under the license permission, including TCIA~\cite{TCIA}, LiTS~\cite{LiTS}, MSD~\cite{simpson2019MSD}, KiTS~\cite{KiTS,KiTSDataset}, autoPET~\cite{autoPET-Data,autoPET-MICCAI22}, TotalSegmentator~\cite{TotalSegmentator}, and AbdomenCT-1K~\cite{AbdomenCT-1K}.
The training dataset consists of a total of 4000 abdominal CT scans, of which 2200 scans have partial annotations and 1800 scans are devoid of annotations. Two sets of 4000 pseudo labels of multiple organs, generated by two top-performing teams during FLARE22~\cite{huang2022revisiting,wangcascade}, were appended afterwards. The validation and testing sets include 100 and 400 CT scans, respectively, which cover various abdominal cancer types, such as liver cancer, kidney cancer, pancreas cancer, colon cancer, gastric cancer, and so on. The organ annotation process used ITK-SNAP~\cite{ITKSNAP}, nnU-Net~\cite{nnUNet}, and MedSAM~\cite{MedSAM}.



\subsection{Implementation details}
\begin{table}[htbp]
\caption{Development environments and requirements.}
\label{table:environments}

\centering
\begin{tabular}{c|c}
\hline
System                  & Ubuntu 20.04.5 LTS                           \\ \hline
CPU                     & Intel(R) Xeon(R) Platinum 8358 CPU @ 2.60GHz \\ \hline
RAM                     & 1.0 TiB; 3200 MT/s                             \\ \hline
GPU (number and type)    & Two NVIDIA A800 80G                          \\ \hline
CUDA version            & 11.8                                         \\ \hline
Programming language    & Python 3.8.16                                \\ \hline
Deep learning framework & torch 2.0.1, torchvision 0.15.2              \\ \hline
Specific dependencies   & monai 1.2.0                                  \\ \hline
Code                    & \url{https://github.com/lyupengju/Flare23}                               \\ \hline
\end{tabular}
\end{table}
% NOTE(review): the text below says "Adam", but the "Training protocols" table lists AdamW -- confirm which optimizer was used and make both agree.
Throughout the entire experimental process, we implemented our code based on the PyTorch library\footnote{\url{http://pytorch.org/}} and the MONAI framework\footnote{\url{https://monai.io/}}. All models were trained on two NVIDIA A800 GPUs. To accelerate model training, the CacheDataset method in MONAI was utilized for data pre-loading. During the training phase, the Adam optimizer was adopted with a weight decay of $1\times 10^{-5}$ to minimize the most widely used joint loss function, i.e., Dice and cross entropy~\cite{Unetr}. The initial learning rate was set to $3\times 10^{-4}$ and scheduled by a cosine annealing strategy. The number of training epochs was up to 300 with a batch size of 4. See Tables~\ref{table:environments} and~\ref{table:Training_protocols} for more training and environment settings.



\begin{table}[htbp]
\caption{Training protocols.}
\label{table:Training_protocols}
\centering
\begin{tabular}{c|c}
\hline
Network initialization                        & Random                           \\ \hline
Batch size                                    & 4                                \\ \hline
Patch size (Phase\_2 model)                    & $96\times 96\times 96$           \\ \hline
Resized size (Phase\_1 model)                   & $128\times 128\times 128$        \\ \hline
Total epochs                                  & 300                              \\ \hline
Optimizer                                     & AdamW                            \\ \hline
Initial learning rate (lr)                    & $3\times 10^{-4}$                \\ \hline
Lr decay schedule                             & Cosine annealing                 \\ \hline
Training time for each model                  &  36 hours                         \\ \hline
Loss function                                 & Dice loss and Cross entropy loss \\ \hline
Number of model parameters (Phase\_1 / Phase\_2) & 1.38 M / 35.84 M                     \\ \hline
Number of FLOPs (Phase\_1 / Phase\_2)            & 1.56 G / 374.77 G                    \\ \hline
\end{tabular}
\end{table}



\subsection{Training protocols}

Leveraging the entire dataset comprising 4000 cases and one set of their corresponding organ pseudo labels from the FLARE22 winning algorithm~\cite{huang2022revisiting}, we are able to obtain our Phase\_1 model by means of a label filtering technique, along with a pre-trained Phase\_2 model; the specific process is depicted in Figure~\ref{fig:data_train}.
Similar to~\cite{huang2022revisiting}, we adopted a self-training with pseudo labeling strategy to obtain the final Phase\_2 model. Specifically, we reassigned pseudo annotations in conjunction with the 2200
partial ground truths for the whole dataset to update the segmentation model. This process facilitated the creation of a comprehensive dataset, complete with fully annotated organs and tumors.
The process of pseudo labeling was executed iteratively 3 times, thereby enabling the iterative enhancement of the quality of pseudo annotations, which is pivotal in advancing the model's performance. In practice, we first split the renewed dataset into two folds; the updated pseudo labels were then formed by ensembling the two branch networks through soft voting and were later utilized to train our final Phase\_2 model. We empirically selected the model that tends to produce oversegmented results on pan-cancer, which generally yields a better Dice score on the online validation leaderboard.


\begin{figure}[htbp]
\centering
\includegraphics[scale=0.1]{imgs/data_train.png}
\caption{Training pipeline.
}
\label{fig:data_train}
\end{figure}


% Please describe at least the following aspects:

% 1. processing of the unlabeled images and partial labels

% 2. Data augmentation (Based on the winning solutions in FLARE 2021~\cite{MedIA-FLARE21}, we recommend using extensive data augmentation)

% 3. patch sampling strategy

% 4. optimal model selection criteria





\section{Results and discussion}
% Note: Please describe at least the following aspects in this section


% 1. The effect of using unlabelled cases;


% 2. In what kind of cases the proposed method works well?

% 3. What are the possible reasons for the failed cases or organs?


% 4. Segmentation efficiency analysis
We conducted a comprehensive quantitative evaluation of our proposed model using standard segmentation and efficiency metrics. Regarding accuracy, we report the Dice similarity coefficient (DSC) and normalized surface Dice (NSD) between predicted and ground truth organ and lesion masks, with DSC elucidating overall overlap and NSD focusing on boundary alignment precision~\cite{tang2022self}. Efficiency-wise, running time and GPU memory consumption are integral for assessing the algorithm's practicality and real-world applicability. The running time and GPU memory consumption are considered within tolerances of 15 seconds and 4~GB, respectively.
\begin{table}[htbp]
\caption{Quantitative evaluation results in terms of DSC and NSD for organs and tumor respectively.
}
\label{table:results}
\centering
\begin{tabular}{l|cc|cc|cc}
\hline
\multirow{2}{*}{Target} & \multicolumn{2}{c|}{Public Validation} & \multicolumn{2}{c|}{Online Validation} & \multicolumn{2}{c}{Testing} \\ \cline{2-7} 
                        & DSC (\%)            & NSD (\%)           & DSC (\%)                 & NSD (\%)           & DSC (\%)      & NSD (\%)     \\ \hline
Liver                   & 97.68 $\pm$ $0.54$ & 99.28 $\pm$ 0.77  & 97.64                   & 99.24          &   97.04           &     97.92         \\
Right Kidney            & 95.90 $\pm$ $2.94$ & 96.95 $\pm$ 4.27  & 94.86                   & 95.95                  & 95.54             &  95.42            \\
Spleen                  & 96.89 $\pm$ $1.57$ & 98.46 $\pm$ 4.13  & 96.19                   & 98.06                  & 96.54             &  98.47            \\
Pancreas                & 85.96 $\pm$ $7.20$ & 96.67 $\pm$ 6.09  & 84.63                   & 95.79                  &89.14              &  97.32            \\
Aorta                   & 94.17 $\pm$ $4.39$ & 97.34 $\pm$ 5.82  & 94.72                   & 98.08                  & 95.35             &  99.31            \\
Inferior vena cava      & 90.16 $\pm$ $5.63$ & 92.93 $\pm$ 5.77  & 89.46                   & 91.88                  & 90.76             &  93.76            \\
Right adrenal gland     & 83.66 $\pm$ $1.25$ & 95.93 $\pm$ 1.39  & 83.97                   & 96.58                  &    83.67          &96.24              \\
Left adrenal gland      & 84.67 $\pm$ $5.47$ & 96.73 $\pm$ 4.13  & 83.98                   & 95.90                  & 84.50             & 96.16             \\
Gallbladder             & 88.28 $\pm$ $19.06$ & 90.81 $\pm$ 20.10  & 88.92                   & 91.09                  & 84.84             & 88.15             \\
Esophagus               & 80.78 $\pm$ $17.86$& 91.23 $\pm$ 17.34 & 82.04                   & 92.84                  &   87.86           & 97.25             \\
Stomach                 & 94.46 $\pm$ $3.09$ & 97.75 $\pm$ 3.42  & 94.50                   & 97.70                  &  94.71            & 97.48             \\
Duodenum                & 83.07 $\pm$ $8.72$ & 94.74 $\pm$ 6.59  & 83.41                  & 94.70                  & 86.48             &  95.51             \\
Left kidney             & 93.06 $\pm$ $14.38$ & 94.05 $\pm$ 15.32  & 93.60                   & 94.80                  & 93.41             &  94.40           \\
Organ Average           & 89.90              & 95.61             & 89.84                   & 95.56                  &    90.51          &    95.88          \\ \hline
Tumor                   & 54.25 $\pm$ $36.10$& 49.65 $\pm$ 33.51 & 50.26                   & 45.31                  & 53.04             &  44.47            \\ \hline
\end{tabular}
\end{table}
\subsection{Quantitative results}
To validate the efficacy of the model, we present in Table~\ref{table:results}  the details of 50 cases from the validation dataset, the online validation and the final testing outcomes.
Our model demonstrates strong performance on both organ and pan-cancer segmentation from abdominal CTs. For the 13 organs on online validation, we achieve competitive accuracy scores, with DSC ranging from 82.04\% (esophagus) to 97.64\% (liver) and NSD all over 90\%, which highlights our model's ability to capture fine anatomical details.
Specifically, our model in Phase\_2 with only 35.84M parameters achieves considerable gains on average Dice over prior arts spanning CNN-based V-Net (67.70M)~\cite{V-net}, nnUNet (30.74M)~\cite{nnUNet}, and Transformer-based Swin UNETR (69.94M)~\cite{tang2022self}, nnFormer (158.9M)~\cite{zhou2021nnformer} as well as their hybrid CoTr (41.93M)~\cite{xie2021cotr}, as presented in Figure~\ref{fig:model_compare}. This again validates the benefits of synergistically combining SAM and MSA from both paradigms.


With regard to pan-cancer segmentation, our approach attains a relatively high average DSC of 50.26\% across all lesion types on online validation. However, because the best model was selected based on its performance on the 50 public cases, the divergence in tumor metrics between this subset and the full validation set, coupled with a high standard deviation (36.10\%), indicates the model's weak capacity for learning generalizable representations of pan-cancer. Our methodology distinguished itself by securing a commendable 5th position on the final test set, quantified by elevated mean Dice on both multi-organ (90.51\%) and tumor (53.04\%) segmentation.

\begin{figure}[htbp]
\centering
% \includegraphics[width=0.5\textwidth,height=0.3\textwidth]{imgs/post_processing.png}

\includegraphics[scale=0.6]{imgs/models_compare.png}
\caption{Phase\_2 models comparison with prior arts on online validation. The diameter of the circular data points is proportional to the total number of parameters in each respective model.}
\label{fig:model_compare}
\end{figure}

% \begin{table}[htbp]

% \caption{Comparison among prior arts of V-Net, SwinUNETR and our SAM-MSA hybrid model on online validation.}
% \label{table:ablation}
% \centering
% \begin{tabular}{cc|c|c|c}
% \hline
% \multicolumn{2}{c|}{Phase\_2 model}                 & V-Net (CNN) & SwinUNETR (Transformer) & Ours (Hybrid) \\ \hline
% \multicolumn{1}{c|}{\multirow{2}{*}{DSC (\%)}} & Organs & 87.36     & 89.03                 & 89.84       \\ \cline{2-5} 
% \multicolumn{1}{c|}{}                      & Tumor & 46.64     & 47.73                 & 50.26       \\ \hline
% \multicolumn{2}{c|}{Parameter (M)}                   & 67.70      & 69.94                  & 35.84        \\ \hline
% \end{tabular}
% \end{table}

To analyze the impact of training set size, an ablation study was conducted comparing validation performance between models trained on the full 4000-case dataset versus the 2200 partially labeled cases alone. Despite nearly doubling the training data through pseudo-labeling, the models seem not to learn novel anatomical representations but rather to fine-tune existing feature spaces, as exemplified by both DSC and NSD metrics in either scenario revealing negligible differences for organs (0.1\%) and tumor (0.5\%), as shown in Figure~\ref{fig:data_compare}.
This indicates that the model's learned features might not be universally applicable, resulting in limited generalization to different cases, which in turn impacts the overall effectiveness of pseudo labeling.

\begin{figure}[htbp]
\centering
% \includegraphics[width=0.8\textwidth,height=0.7\textwidth]{imgs/data_compare.png}

\includegraphics[scale=0.5]{imgs/data_compare.png}
\caption{Performance comparison on online validation using 2200 partially labeled examples and 4000 fully labeled examples for training.}
\label{fig:data_compare}
\end{figure}



% \begin{table}[htbp]
% \caption{Performance comparison on online validation using 2200 partially labeled examples and 4000 fully labeled examples for training.}
% \label{table:train_data}
% \centering
% \begin{tabular}{cc|c|c}
% \hline
% \multicolumn{2}{c|}{Utilized Dataset}               & Partial labeled 2200 cases & All 4000 cases \\ \hline
% \multicolumn{1}{c|}{\multirow{2}{*}{DSC (\%)}} & Organs & 89.77                     & 89.84       \\ \cline{2-4} 
% \multicolumn{1}{c|}{}                      & Tumor  & 49.37                  &  50.26       \\ \hline
% \multicolumn{1}{c|}{\multirow{2}{*}{NSD (\%)}}  & Organs & 95.44                     &  95.56      \\ \cline{2-4} 
% \multicolumn{1}{c|}{}                      & Tumor  & 44.68                     & 45.30        \\ \hline
% \end{tabular}
% \end{table}


\subsection{Qualitative results on validation set}
We supplement our quantitative results with qualitative analysis to gain further insights, as shown in Figure~\ref{fig:results}.
Notably, the segmentation performance exhibits variability across organs. In contrast to the near-perfect result (Case \#27), our model generates fragmentary or inaccurate contours for smaller structures like the esophagus and duodenum (Case \#69), echoed by their relatively lower Dice scores on the validation set. For pan-cancer, while some tumor instances (Case \#35) are effectively segmented, showcasing a robust alignment with ground truth annotations, others exhibit severe segmentation inconsistencies (Case \#99). This variance in tumor segmentation proficiency is indicative of the complexity inherent in cancer lesions, often characterized by diverse morphological traits and inter-tumor heterogeneity. Column (c) represents the segmentation result of the model trained only with the 2200 partially labeled cases, demonstrating similar performance to that of column (d), which uses all 4000 cases.
\begin{figure}[htbp]
\centering
\includegraphics[width=1\textwidth]{imgs/vis.jpg}
\caption{Qualitative evaluation on four cases from validation set.
}
\label{fig:results}
\end{figure}



\subsection{Segmentation efficiency results on online validation}

Our two-phase cascaded network provides major speed and memory benefits. Table~\ref{tabel:efficiency} provides the efficiency for certain examples from the validation dataset. For the majority of test cases, our proposed method completes the inference process in a few more seconds (8 on average) than the prescribed time budget of 15 seconds, while maintaining GPU memory consumption well under the allotted 4\,GB limit. Moreover, running time appears to exhibit a positive correlation with input image size owing to the serial scanning nature of the sliding window, which traverses spatially across the input and consequently inflicts a computational burden that scales directly with image size, as evidenced by the nearly 31-second run time for the largest scan (case 0029), which also results in greater cumulative GPU utilization.

\begin{table}[htbp]
\caption{
Quantitative evaluation of segmentation efficiency in terms of the running
time and GPU memory consumption. Total GPU denotes the area under the GPU
Memory-Time curve. Evaluation GPU platform: NVIDIA QUADRO RTX5000 (16GB).
}
\label{tabel:efficiency}
\centering
\begin{tabular}{ccccc}
\hline
Case ID & Image Size      & Running Time (s) & Max GPU (MB) & Total GPU (MB) \\ \hline
0001    & (512, 512, 55)  & 15.17            & 2020         & 17200            \\
0051    & (512, 512, 100) & 16.73            & 2020         & 22141          \\
0017    & (512, 512, 150) & 17.84            & 2020         & 23832          \\
0019    & (512, 512, 215) & 18.01            & 2020         & 23875          \\
0099    & (512, 512, 334) & 20.61            & 2020         & 27427          \\
0063    & (512, 512, 448) & 24.40            & 2020         & 32647          \\
0048    & (512, 512, 499) & 25.36            & 2020         & 33937          \\
0029    & (512, 512, 554) & 30.87            & 2020         & 43991          \\ \hline
\end{tabular}
\end{table}



% \subsection{Results on final testing set}

\subsection{Limitation and future work}

While our method shows promise for multi-organ and tumor segmentation, enabling clinical utilization through efficient computation and memory usage, the validation results highlight areas for continued future refinement, especially enhancing the delineation of tiny organs and handling greater tumor heterogeneity. To that end, tumor synthesis techniques~\cite{hu2023label} could be employed to artificially generate additional lesion examples. This data augmentation approach may facilitate greater robustness in the segmentation model, allowing it to generalize more effectively to the heterogeneity inherent in pathological anatomy. Since our pseudo labeling approach is mostly off-line, making real-time updating impossible, we should further explore online semi-supervised methods as well as mechanisms to enhance the fidelity and reliability of generated pseudo labels, such as applying confidence thresholding and detecting out-of-distribution pseudo labels~\cite{liu2023towards}.

\section{Conclusion}
In the pursuit of advancing the state of the art in multi-organ and pan-cancer image segmentation, we have made significant strides in this realm by our participation in the MICCAI FLARE23 challenge through the development and application of a two-phase cascade framework.  Phase\_1 model built upon partial convolution enjoys computational efficiency while yielding credible segmented ROI. The harmonious fusion of scale-aware and self-attentive modulation forms the foundation of our Phase\_2 model backbone, enabling enhanced segmentation accuracy. Through meticulous model selection, tuning, and optimization, our algorithm has shown promising overall results with reference to precision and efficiency metrics on the online validation and test datasets, substantiating its efficacy in target segmentation. We believe our approach holds the promise of enhancing clinical practices and contributing to the broader scientific understanding of complex medical image analysis in abdominal oncology.


\subsubsection{Acknowledgements} The authors of this paper declare that the segmentation method they implemented for participation in the FLARE 2023 challenge has not used any pre-trained models nor additional datasets other than those provided by the organizers. The proposed solution is fully automatic without any manual intervention. We thank all the data owners for making the CT scans publicly available and CodaLab~\cite{codalab} for hosting the challenge platform. 

The study was supported by National Natural Science Foundation of China (81827805, 82130060, 61821002, 92148205), National Key Research and Development Program (2018YFA0704100, 2018YFA0704104). The project was funded by China Postdoctoral Science Foundation (2021M700772), Zhuhai Industry-University-Research Collaboration Program (ZH22017002210011PWC), Jiangsu Provincial Medical Innovation Center (CXZX202219), Collaborative Innovation Center of Radiation Medicine of Jiangsu Higher Education Institutions, and Nanjing Life Health Science and Technology Project (202205045). The funding sources had no role in the writing of the report, or decision to submit the paper for publication.



%
% ---- Bibliography ----
%
% BibTeX users should specify bibliography style 'splncs04'.
% References will then be sorted and formatted in the correct style.
%
\bibliographystyle{splncs04}
\bibliography{ref}

\newpage

% Please add the following required packages to your document preamble:
% \usepackage[normalem]{ulem}
% \useunder{\uline}{\ul}{}







% \begin{figure}[htbp]
% \centering
% \includegraphics[width=0.8\textwidth]{imgs/decoder1.png}
% % \includegraphics[scale=0.6]{imgs/decoder1.png}
% \caption{Illustration of Phase\_1 decoder adapted from~\cite{xie2021segformer}.}
% \label{fig.Phase1 decoder}
% \end{figure}

% \begin{figure}[htbp]
% \centering
% \includegraphics[width=0.8\textwidth]{imgs/phase2_decoder.png}
% % \includegraphics[scale=0.35]{imgs}
% \caption{Illustration of Phase\_2 decoder adapted from~\cite{Unetr}.}
% \label{fig.Phase2 decoder}
% \end{figure}








% \begin{table}[!htbp]
% \caption{Checklist Table. Please fill out this checklist table in the answer column.}
% \centering
% \begin{tabular}{ll}
% \hline
% Requirements                                                                                                                    & Answer        \\ \hline
% A meaningful title                                                                                                              & Yes        \\ \hline
% The number of authors ($\leq$6)                                                                                                             & 6       \\ \hline
% Author affiliations, Email, and ORCID                                                                                           & Yes       \\ \hline
% Corresponding author is marked                                                                                                  & Yes        \\ \hline
% Validation scores are presented in the abstract                                                                                 & Yes        \\ \hline
% \begin{tabular}[c]{@{}l@{}}Introduction includes at least three parts: \\ background, related work, and motivation\end{tabular} & Yes        \\ \hline
% A pipeline/network figure is provided                                                                                           & 3 \\ \hline
% Pre-processing                                                                                                                  & 5   \\ \hline
% Strategies to use the partial label                                                                                             & 9   \\ \hline
% Strategies to use the unlabeled images.                                                                                         & 10  \\ \hline
% Strategies to improve model inference                                                                                           & 5   \\ \hline
% Post-processing                                                                                                                 & 7   \\ \hline
% Dataset and evaluation metric section is presented                                                                              & 8   \\ \hline
% Environment setting table is provided                                                                                           & 8  \\ \hline
% Training protocol table is provided                                                                                             & 9  \\ \hline
% Ablation study                                                                                                                  & 12   \\ \hline
% Efficiency evaluation results are provided                                                                                     & 13 \\ \hline
% Visualized segmentation example is provided                                                                                     & 13 \\ \hline
% Limitation and future work are presented                                                                                        & 12        \\ \hline
% Reference format is consistent.  & Yes        \\ \hline

% \end{tabular}
% \end{table}

\end{document}



