% This is samplepaper.tex, a sample chapter demonstrating the
% LLNCS macro package for Springer Computer Science proceedings;
% Version 2.21 of 2022/01/12
%
\documentclass[runningheads]{llncs}
%
\usepackage[T1]{fontenc}
% T1 fonts will be used to generate the final print and online PDFs,
% so please use T1 fonts in your manuscript whenever possible.
% Other font encondings may result in incorrect characters.
%
\usepackage{multirow}
\usepackage{graphicx}
% Used for displaying a sample figure. If possible, figure files should
% be included in EPS format.
%
% If you use the hyperref package, please uncomment the following two lines
% to display URLs in blue roman font according to Springer's eBook style:
%\usepackage{color}
%\renewcommand\UrlFont{\color{blue}\rmfamily}
\usepackage[pagebackref=true,breaklinks=true,colorlinks,bookmarks=false]{hyperref}
%
\begin{document}
%
\title{Abdominal Organs and Pan-Cancer Segmentation based on Self-supervised Pre-training and Self-training}
%
\titlerunning{Segmentation based on Self-supervised and Semi-supervised Learning}
% If the paper title is too long for the running head, you can set
% an abbreviated paper title here
%
\author{He Li\inst{1}\and
Meng Han\inst{1} \and
Guotai Wang\inst{1,2}
} 
%
\authorrunning{H. Li et al.}
% First names are abbreviated in the running head.
% If there are more than two authors, 'et al.' is used.
%
\institute{School of Mechanical and Electrical Engineering, University of Electronic Science and Technology of China, Chengdu, China \and Shanghai Artificial Intelligence Laboratory, Shanghai, China \\
\email{\{guotai.wang\}@uestc.edu.cn}}
%
\maketitle              % typeset the header of the contribution
%
\begin{abstract}
Despite the effective progress in automatic abdominal multi-organ segmentation methods based on deep learning, there are still few studies on general models for abdominal organ and pan-cancer segmentation. Additionally, the manual annotation of organs and tumors from CT scans is a time-consuming and labor-intensive process. To deal with these problems, an efficient two-stage framework combining self-supervised pre-training and self-training is proposed. Specifically, in the first stage, we adopt the Model Genesis method for image reconstruction to encourage the model to learn effective anatomical representation information, thereby improving the model's perception of anatomical structures in downstream segmentation tasks and generating high-quality tumor pseudo-labels. Afterward, we fuse the partial organ ground-truth annotations of the labeled data with the pseudo-labels to improve the organ labeling quality. In the second stage, we overlay the generated tumor pseudo-labels onto the corresponding regions of the organ pseudo-labels, and the final pseudo-label images are used to train the nnU-Net model for efficient inference. The proposed method has been evaluated on the FLARE2023 validation cases and achieves good segmentation performance. The average DSC and NSD for organs are 91.51\% and 95.52\%, respectively. For tumors, the average DSC is 43.47\%, and the average NSD is 33.81\%. In addition, the average running time and area under the GPU memory-time curve are 85.4 s and 246157.2 MB, respectively. On the test set, we achieved average organ and tumor DSC of 92.17\% and 54.99\%, respectively, and an average inference time of 95.83 s. Our code is publicly available at \url{https://github.com/lihe-CV/HiLab_FLARE23}.

\keywords{Semi-supervised learning  \and Self-supervised learning \and Pseudo labels.}
\end{abstract}

%###########################
\section{Introduction}
Abdominal organ and tumor segmentation is a critically important task in abdominal disease diagnosis, cancer treatment, and radiation therapy planning~\cite{luo2022word}. The abdomen is a common site for the occurrence of cancer, and accurate segmentation results can provide valuable information for clinical diagnosis and surgical planning, such as the size and location of organs and tumors and the spatial relationship of multiple organs. In recent years, deep learning-based methods have been widely used for automatic segmentation of organs and tumors~\cite{ma2021abdomenct}. However, these methods heavily rely on a large amount of annotated data for training purposes. In past clinical practice, the segmentation of organs and tumors was usually performed manually by radiologists, which is time-consuming and labor-intensive. Thus, it is often challenging to obtain a large number of labeled cases. In light of this situation, semi-supervised semantic segmentation aims to utilize limited labeled data and abundant unlabeled data for model training. It addresses the issue of label scarcity by exploring valuable information from the unlabeled data.

FLARE~\cite{MedIA-FLARE21} is an international challenge focusing on abdominal scene segmentation. Compared with FLARE22, the challenge for FLARE23 adds the pan-cancer segmentation task and provides only partial organ segmentation labels for the labeled data in semi-supervised segmentation. The organizer of FLARE23 provided the largest abdomen CT dataset, including 4000 3D CT scans from 30+ medical centers, of which 2200 cases have partial labels and 1800 cases are unlabeled. For the task scenario combining semi-supervised and partial-label segmentation, the main solutions can be divided into two types: (1) consistency-regularization-based methods~\cite{chen2021semi} and (2) pseudo-label-based methods~\cite{lee2013pseudo,sime2023uncertainty}. Since the organizer invited the FLARE22 champion team to generate pseudo-labels for the FLARE23 data, we choose the pseudo-label-based approach and integrate it with the nnU-Net framework~\cite{nnUNet} to train the organ segmentation model. However, due to the uncertainty in tumor shape, size, and location, as well as the scarcity of tumor labels, we attempt to incorporate self-supervised strategies to learn effective representation information from images, thereby enhancing the model's perception of the tumor category.

In this work, we propose a two-stage training framework that combines self-supervised and semi-supervised learning to generate high-quality pseudo-labels and improve the segmentation performance of the model, respectively. Specifically, in the first stage, we employ the Model Genesis method~\cite{zhou2021models} for image reconstruction to learn effective anatomical representation information. From the 2200 labeled images, 735 tumor-containing images and their corresponding labels were further selected, and the pre-trained model was transferred to the tumor segmentation task to generate high-quality tumor pseudo-labels for the remaining 3265 images without tumor annotations~\cite{chen2021forecasting}. For the pseudo-label generation of organs, we simply fused the pseudo-labels provided by the organizer with the partial organ segmentation annotations, and achieved good segmentation results. In the second stage, we overlay the generated tumor pseudo-labels onto the corresponding regions of the organ pseudo-labels, and the final pseudo-labeled images are used to train the nnU-Net model~\cite{nnUNet} for inference.

In summary, we make the following three contributions:

\begin{itemize} 
 \item We design a two-stage training framework based on nnU-Net to generate high-quality pseudo-labels and improve the segmentation performance of the model.
 \item We adopt self-supervised learning strategy to learn anatomical representation information, enabling the model to generate high-quality pseudo-labels.
 \item We optimize the organ segmentation task by fusing pseudo-labels and partial organ segmentation annotations. Models trained with our fused labels perform better.
\end{itemize}

\begin{figure}[htbp]
\centering
\includegraphics[width=1\textwidth]{imgs/network.pdf}
\caption{Overview of our proposed framework. }
\label{fig:Network}
\end{figure}

%###########################
\section{Method}
To deal with a training dataset with partial labels on a small part of images,  we propose a two-stage training framework that combines self-supervised and semi-supervised learning, as shown in Figure~\ref{fig:Network}. We adopt self-supervised learning and image fusion strategies to generate high-quality pseudo-labels. The self-training~\cite{lee2013pseudo} is adopted for semi-supervised semantic segmentation. The detailed description of this framework is as follows.

%###########################
\subsection{Preprocessing}

\begin{table}[!htbp]
\setlength{\tabcolsep}{4pt}
\caption{Comparison of different segmentation models. The order of axes of input patch size and spacing is (z,y,x).}\label{table:comp_property}
\centering
\begin{tabular}{c|c|c|c}
\hline
Settings                     & Default                  & Tumor                     & Organ\&Pan-cancer \\ \hline
convolution kernel sizes     & (1, 3, 3)                & (3, 3, 3)                 & (1, 3, 3)      \\
step size for sliding window & 0.5                      & 0.5                       & 1 \\ 
input patch size             &(64$\times$160$\times$192)&(112$\times$160$\times$160)&(56$\times$160$\times$192) \\
input spacing                &(2.0, 0.8, 0.8)           &(1.8, 1.8, 1.8)            &(2.5, 0.8, 0.8) \\\hline
\end{tabular}
\end{table}

The preprocessing strategies for labeled data and pseudo-labeled data in the two-stage segmentation framework are as follows:
\begin{itemize} 
 \item Image cropping: Crop the bounding box of the image to the non-zero area, thereby reducing the image size and improving computational efficiency.
 \item We adopt image resampling to ensure that the actual physical space of each voxel is consistent across different image data.
 \item We applied z-score normalization based on the mean and standard deviation of foreground intensity values across the training set.
 \item The detailed configurations and the comparison with default nnU-Net are listed in Table~\ref{table:comp_property}.
\end{itemize}

%###########################
\subsection{Generate High-quality Pseudo-labels}
We employ pseudo-label generation as a simple and effective method to utilize unlabeled data for model training. Specifically, we make full use of the pseudo-labels of abdominal organs provided by the organizer. To improve the labeling quality, we fuse the partial organ ground-truth annotations of the labeled data with the pseudo-labels. However, the tumor category is difficult to segment due to the uncertainty of tumor shape, size and location, as well as the scarcity of tumor labels. We utilize a self-supervised learning strategy to help the tumor segmentation model $S_{tum}$ understand local and global features, thereby boosting the model's awareness of the tumor category and generating high-quality tumor pseudo-labels.

%###########################
\subsubsection{Self-supervised Pre-training.} 
Model Genesis~\cite{zhou2021models} learns from scratch on unlabeled images with the goal of learning a universal visual representation that can be generalized and transferred across diseases, organs, and modalities. In order to improve the model's transfer and perception capabilities for tumor category, we use similar self-supervised training strategies as the Model Genesis~\cite{zhou2021models} to pre-train $S_{tum}$ with the provided FLARE23 dataset. Throughout the pre-training process, $S_{tum}$ reconstructs the original patches according to the augmented variants, thereby learning anatomical representation information of 3D abdominal CT images. The generation process of augmented variants is shown in Figure~\ref{fig:Transfor}.

\begin{figure}[htbp]
\centering
\includegraphics[width=1\textwidth]{imgs/model_gensis.pdf}
\caption{The transformations applied to the original patch during pre-training. I: non-linear transformation, II: local pixel shuffling, III: in-painting, IV: out-painting. (RC: random combination.)
}
\label{fig:Transfor}
\end{figure}

Specifically, four transformations are randomly combined and applied to the original patch to generate augmented variants. The transformations include: 1) Non-linear transformation. A Bézier curve~\cite{mortenson1999mathematics} is used to assign a uniquely determined value to each pixel, which encourages the self-supervised model to focus on image appearance, shape and intensity distribution. 2) Local pixel shuffling. A window smaller than the model's receptive field is sampled from the patch and its internal pixels are rearranged, which encourages the model to learn the local texture and boundaries of the image. 3) Out-painting and in-painting. Windows of different sizes are blended to create a complex shape. Out-painting sets the outer pixels of the shape to random values, while the inner pixels retain their original intensities. In-painting is the opposite.

Then, the pre-training model $S_{tum}$ will learn the anatomical representation information by reconstructing the original patch. The mean squared error (MSE) loss is used for training $S_{tum}$ by minimizing a reconstruction error
$\mathcal{L}_{rec}$:
\begin{equation}
\small
\label{eq1:loss_rec}
    \mathcal{L}_{rec} = \frac{1}{N}\sum_{i=1}^{N}\big(X_i-\hat{X}_i\big)^{2}
\end{equation}
where $i$ is the voxel index, $N$ is the number of voxels, $X_i$ is the intensity of the original patch at voxel $i$, and $\hat{X}_i$ is the corresponding prediction of the model. Finally, we selected 735 tumor-containing images and their corresponding labels from the 2200 labeled cases, and transferred the pre-trained model $S_{tum}$ to the tumor segmentation task. We adopt an average of cross-entropy loss and Dice loss to supervise the tumor segmentation model:
\begin{equation}
\small
\label{eq2:loss_seg_tum}
    \mathcal{L}_{seg} = \frac{1}{2N^{t}}\sum_{i=1}^{N^{t}}\big( \mathcal{L}_{Dice}(p_i,y_i) + \mathcal{L}_{ce}(p_i,y_i)\big)
\end{equation}
where $y_i$ is the tumor label, $N^{t}$ is the number of training images and $p_i$ is the prediction of the model $S_{tum}$.

%###########################
\subsubsection{Label Fusion.}
Because the organizer invited the FLARE22 champion team to use its Docker image to generate pseudo-labels for the FLARE23 data, we adopt a simple but effective label fusion strategy. Specifically, we achieve high-quality fusion by replacing the corresponding organ regions in the pseudo-labels with accurately annotated organ parts from the labeled data.
\begin{equation}
\small
\label{eq3:fusion}
    \hat{y} = y_{p}\oplus y_{q}
\end{equation}
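where $y_p$ is the partial ground-truth annotation, $y_{q}$ is the pseudo-label, and $\oplus$ denotes overwriting the corresponding organ regions of $y_{q}$ with the annotated regions of $y_p$. At the same time, the unlabeled data retains the corresponding pseudo-labels as supervision signals.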

%###########################
\subsection{Model Training and Inference}
We apply a similar label fusion strategy to the high-quality organ and tumor pseudo-labels obtained in the first stage, generating a dataset $\mathcal{D}=\{x_i,y_i\}_{i=1}^{N}$ for training the organ and pan-cancer segmentation model $S$. In addition, to improve the inference efficiency of the model $S$, we use a small patch size, as listed in Table~\ref{table:comp_property}, to increase the training and inference speed of each patch and reduce GPU memory usage. Finally, the segmentation model $S$ learns from organ and pan-cancer data by minimizing a supervised loss function:
\begin{equation}
\small
\label{eq4:loss_seg}
    \mathcal{L}_{seg} = \frac{1}{2N^{d}}\sum_{j=1}^{N^{d}}\big( \mathcal{L}_{Dice}(p_j,y_j) + \mathcal{L}_{ce}(p_j,y_j)\big)
\end{equation}
where $y_j$ is the organ and pan-cancer label, $N^{d}$ is the number of training images and $p_j$ is the prediction of the model $S$.

Due to the high resolution of 3D medical images, nnU-Net~\cite{nnUNet} adopts a sliding-window strategy for inference. However, this strategy is costly in terms of both time and memory. Therefore, we set the step size to 1 during inference to effectively improve inference speed and reduce resource consumption while ensuring accuracy.

%###########################
\subsection{Post-processing}
A connected component analysis of segmentation mask is applied on the outputs to remove small connected areas. And then the results are resampled back to original spacing for the convenience of the following evaluation.

%###########################
\section{Experiments}

%###########################
\subsection{Dataset and evaluation measures}
The FLARE 2023 challenge is an extension of the FLARE 2021--2022 challenges~\cite{MedIA-FLARE21,FLARE22}, aiming to promote the development of foundation models in abdominal disease analysis. The segmentation targets cover 13 organs and various abdominal lesions. The training dataset is curated from more than 30 medical centers under the license permission, including TCIA~\cite{TCIA}, LiTS~\cite{LiTS}, MSD~\cite{simpson2019MSD}, KiTS~\cite{KiTS,KiTSDataset}, autoPET~\cite{autoPET-Data,autoPET-MICCAI22}, TotalSegmentator~\cite{TotalSegmentator}, and AbdomenCT-1K~\cite{AbdomenCT-1K}. The training set includes 4000 abdomen CT scans, of which 2200 have partial labels and 1800 are unlabeled. The validation and testing sets include 100 and 400 CT scans, respectively, which cover various abdominal cancer types, such as liver cancer, kidney cancer, pancreas cancer, colon cancer, gastric cancer, and so on. The organ annotation process used ITK-SNAP~\cite{ITKSNAP}, nnU-Net~\cite{nnUNet}, and MedSAM~\cite{MedSAM}.

The evaluation metrics encompass two accuracy measures—Dice Similarity Coefficient (DSC) and Normalized Surface Dice (NSD)—alongside two efficiency measures—running time and area under the GPU memory-time curve. These metrics collectively contribute to the ranking computation. Furthermore, the running time and GPU memory consumption are considered within tolerances of 15 seconds and 4 GB, respectively.

%###########################
\subsection{Implementation details}

%###########################
\subsubsection{Environment settings}
The development environments and requirements are presented in Table~\ref{table:env}. 

\begin{table}[!htbp]
\caption{Development environments and requirements.}\label{table:env}
\centering
\begin{tabular}{ll}
\hline
System version          & Ubuntu 18.04.5 LTS\\
\hline
CPU                     & Intel(R) Xeon(R) Gold 6248 CPU@2.50GHz \\
\hline
RAM                     & 16$\times $4GB; 2.67MT$/$s\\
\hline
GPU (number and type)   & One NVIDIA V100 32G\\
\hline
CUDA version            & 11.0\\                          
\hline
Programming language    & Python 3.10.8\\ 
\hline
Deep learning framework & torch 2.0.0, torchvision 0.15.1\\
\hline
Specific dependencies   & nnU-Net 2.1.1\\                                                  \hline
Code                    & \url{https://github.com/lihe-CV/HiLab_FLARE23}  \\
\hline
\end{tabular}
\end{table}

%###########################
\subsubsection{Training protocols}
The training protocols of $S_{tum}$ and $S$ are shown in Tables~\ref{table:training} and~\ref{table:training2nd}, respectively. During the training process, we dynamically adopt elastic deformation, rotation, random cropping, Gaussian noise transformation, Gamma transformation, contrast transformation, morphological transformation and other data augmentation strategies. In addition, we applied mirror test-time data augmentation during inference.

\begin{table*}[!htbp]
\caption{Training protocols for tumor segmentation model $S_{tum}$.}
\label{table:training}
\begin{center}
% \resizebox{0.47\textwidth}{!}{
\begin{tabular}{ll} 
\hline
Network initialization        & ``He'' normal initialization \\
\hline
Batch size                    & 2 \\
\hline 
Patch size                    & 112$\times$160$\times$160 \\
\hline
Total epochs                  & 2000 \\
\hline
Step size                     & 1 \\
\hline
Optimizer                     & SGD with nesterov momentum ($\mu = 0.99$) \\ 
\hline
Initial learning rate (lr)    & 0.01 \\ 
\hline
Lr decay schedule             & Poly learning rate policy: $(1-epoch/2000)^{0.9}$ \\
\hline
Training time                 & 132.5 hours \\
\hline 
Loss function                 & Dice loss and cross entropy loss\\
\hline
Number of model parameters    & 88.21M\\
\hline
Number of flops               & 913.4G\\ 
\hline
CO$_2$eq                      & 41.05 Kg\\  
\hline
\end{tabular}
%}
\end{center}
\end{table*}

\begin{table*}[!htbp]
\caption{Training protocols for organ and pan-cancer segmentation model $S$.}
\label{table:training2nd}
\begin{center}
% \resizebox{0.47\textwidth}{!}{
\begin{tabular}{ll} 
\hline
Network initialization        & ``He'' normal initialization \\
\hline
Batch size                    & 2 \\
\hline 
Patch size                    & 56$\times$160$\times$192 \\
\hline
Total epochs                  & 1000 \\
\hline
Optimizer                     & SGD with nesterov momentum ($\mu = 0.99$) \\ 
\hline
Initial learning rate (lr)    & 0.01 \\
\hline
Lr decay schedule             & Poly learning rate policy: $(1-epoch/1000)^{0.9}$ \\
\hline
Training time                 & 41.5 hours \\
\hline
Loss function                 & Dice loss and cross entropy loss\\
\hline
Number of model parameters    & 71.02M \\ 
\hline
Number of flops               & 727.76G \\
\hline
CO$_2$eq                      & 35.02 Kg\\
\hline
\end{tabular}
\end{center}
\end{table*}

%###########################
\section{Results and discussion}

\begin{table}[htbp]
\caption{Quantitative results on the validation and testing sets in terms of DSC and NSD. (Public Validation: the performance on the 50 validation cases with ground truth. Online Validation: the leaderboard results. Testing: the performance on the testing cases.)}\label{table:Quan_results}
\centering
\resizebox{1\textwidth}{!}{
\begin{tabular}{l|cc|cc|cc}
\hline
\multirow{2}{*}{Target} & \multicolumn{2}{c|}{Public Validation} & \multicolumn{2}{c|}{Online Validation} & \multicolumn{2}{c}{Testing} \\ \cline{2-7} 
                        & DSC(\%)            & NSD(\%)           & DSC(\%)            & NSD(\%)           & DSC(\%)      & NSD (\%)     \\ \hline
Liver                   & 98.43 $\pm$ 0.0102  & 98.91 $\pm$ 0.0231   & 98.30    & 98.77                         & 96.50    & 96.79    \\
Right Kidney            & 93.25 $\pm$ 12.12   & 94.13 $\pm$ 11.83   & 93.47     & 93.47                         & 93.55    & 93.14    \\
Spleen                  & 96.52 $\pm$ 11.39   & 96.74 $\pm$ 13.23    & 95.82    & 96.53                         & 96.42    & 96.65    \\
Pancreas                & 85.43 $\pm$ 10.89   & 95.35 $\pm$ 10.06    & 86.84    & 96.54                         & 91.36    & 97.30    \\
Aorta                   & 97.45 $\pm$ 1.82    & 99.41 $\pm$ 2.55     & 97.68    & 99.29                         & 97.58    & 98.91    \\
Inferior vena cava      & 93.02 $\pm$ 7.29    & 92.66 $\pm$ 6.91     & 93.21    & 93.87                         & 93.60    & 95.05    \\
Right adrenal gland     & 88.94 $\pm$ 9.21    & 97.88 $\pm$ 10.23    & 89.09    & 97.89                         & 87.95    & 95.99    \\
Left adrenal gland      & 87.89 $\pm$ 9.63    & 96.17 $\pm$ 8.31     & 87.58    & 95.61                         & 89.58    & 96.68    \\
Gallbladder             & 88.95 $\pm$ 20.48   & 90.79 $\pm$ 21.38    & 89.79    & 90.54                         & 85.40    & 87.20    \\
Esophagus               & 84.22 $\pm$ 10.06   & 93.76 $\pm$ 10.18    & 84.19    & 93.30                         & 89.24    & 96.28    \\
Stomach                 & 95.01 $\pm$ 3.71    & 97.90 $\pm$ 6.65     & 94.66    & 97.18                         & 94.84    & 96.65    \\
Duodenum                & 83.74 $\pm$ 10.01   & 94.98 $\pm$ 8.24     & 84.53    & 95.21                         & 88.82    & 96.32    \\
Left kidney             & 94.81 $\pm$ 12.69   & 93.31 $\pm$ 12.80    & 94.44    & 93.60                         & 92.89    & 92.48    \\
Tumor                   & 43.86 $\pm$ 25.98   & 33.27 $\pm$ 23.56    & 43.47    & 33.81                         & 54.99    & 42.45    \\ \hline
Average(Organ)          & 91.62 $\pm$ 9.73    & 95.38 $\pm$ 14.70    & 91.51    & 95.52                         & 92.17    & 95.44    \\ \hline
\end{tabular}
}
\end{table}

%###########################
\subsection{Quantitative results on validation set}

Quantitative results are presented in Table~\ref{table:Quan_results}. It can be observed that the two-stage framework achieves very promising segmentation results for large organs, such as the liver, spleen, kidneys, and stomach. However, the segmentation of small and structurally complex organs such as the duodenum, esophagus, and adrenal glands remains challenging in comparison. Moreover, the strong uncertainty in tumor shapes, sizes, and locations in the pan-cancer segmentation task added to the FLARE23 challenge makes the segmentation task extremely challenging. Indeed, there are missed detections in the segmentation results, particularly for small tumors, where the segmentation model fails to predict their presence.

Tables~\ref{table:Abla_styDice} and~\ref{table:Abla_styNSD} show the Dice and NSD metrics calculated on the validation set. Evidently, compared with models trained using labeled data with only partial organ segmentation annotations, training the model using the Label Fusion strategy can significantly improve segmentation performance. Moreover, the introduction of the self-supervised pre-training strategy has significantly improved the performance of the two-stage framework on the tumor class, as evidenced by the achieved Dice Similarity Coefficient (DSC) of 43.47\%.

\begin{table}[htbp]
\caption{Ablation study of Dice(\%) metrics on validation set. (Baseline: Training nnU-Net with labeled images only. LF: Label Fusion. SP: Self-supervised Pre-training.)}\label{table:Abla_styDice}
\centering
\begin{tabular}{lcccccccc}
\hline
Methods       & Liver          & RK             & Spleen         & Pancreas       & Aorta          & IVC            & RAG            & LAG            \\ \hline
Baseline       & 97.58          & 92.71          & 94.96          & 85.94          & 97.01          & 91.29          & 82.32          & 83.69          \\
Baseline+LF    & \textbf{98.46} & \textbf{95.98} & \textbf{97.10} & 86.72          & 97.51          & \textbf{93.34} & 88.46          & \textbf{88.77} \\
Baseline+LF+SP & 98.30          & 93.47          & 95.82          & \textbf{86.84} & \textbf{97.68} & 93.21          & \textbf{89.09} & 87.58          \\ \hline
Methods       & GBD       & EPG      & Stomach        & Duodenum       & LK             & \multicolumn{1}{c|}{Average}        & \multicolumn{2}{c}{Tumor}          \\ \hline
Baseline       & 85.11          & \textbf{85.69} & 91.24          & 80.65          & 93.29          & \multicolumn{1}{c|}{89.38}          & \multicolumn{2}{c}{33.06}          \\
Baseline+LF    & 88.34          & 84.35          & 94.35          & \textbf{84.64} & \textbf{95.22} & \multicolumn{1}{c|}{\textbf{91.68}} & \multicolumn{2}{c}{37.52}          \\
Baseline+LF+SP & \textbf{89.79} & 84.19          & \textbf{94.66} & 84.53          & 94.44          & \multicolumn{1}{c|}{91.51}          & \multicolumn{2}{c}{\textbf{43.47}} \\ \hline
\end{tabular}
\end{table}

\begin{table}[htbp]
\caption{Ablation study of NSD(\%) metrics on validation set. (Baseline: Training nnU-Net with labeled images only. LF: Label Fusion. SP: Self-supervised Pre-training.)}\label{table:Abla_styNSD}
\centering
\begin{tabular}{lcccccccc}
\hline
Methods       & Liver          & RK             & Spleen         & Pancreas       & Aorta          & IVC            & RAG            & LAG            \\ \hline
Baseline       & 97.83          & 92.12          & 96.12          & 95.24          & 98.35          & 91.67          & 92.29          & 92.41          \\
Baseline+LF    & \textbf{99.05} & \textbf{96.16} & \textbf{98.25} & 96.46          & 99.25          & \textbf{94.01} & 97.69          & \textbf{96.85} \\
Baseline+LF+SP & 98.77          & 93.47          & 96.53          & \textbf{96.54} & \textbf{99.29} & 93.87          & \textbf{97.89} & 95.61          \\ \hline
Methods       & GBD       & EPG      & Stomach        & Duodenum       & LK             & \multicolumn{1}{c|}{Average}        & \multicolumn{2}{c}{Tumor}          \\ \hline
Baseline       & 85.74          & 92.98          & 94.62          & 91.33          & 92.01          & \multicolumn{1}{c|}{93.91}          & \multicolumn{2}{c}{22.07}          \\
Baseline+LF    & 89.84          & \textbf{93.55} & 96.92          & \textbf{95.33} & \textbf{95.08} & \multicolumn{1}{c|}{\textbf{96.07}} & \multicolumn{2}{c}{28.52}          \\
Baseline+LF+SP & \textbf{90.54} & 93.30          & \textbf{97.18} & 95.21          & 93.60          & \multicolumn{1}{c|}{95.52}          & \multicolumn{2}{c}{\textbf{33.81}} \\ \hline
\end{tabular}
\end{table}

Finally, we quantitatively evaluated the segmentation efficiency of the model, as shown in Table~\ref{table:Seg_eff}. It can be found that the three evaluation metrics show an increasing trend as the input instances grow larger. Although the inference time is mostly within 60 seconds, the proportion of inference times below 15 seconds is relatively low. Therefore, further optimization is needed in terms of model inference efficiency to strive for achieving clinical usability standards.

\begin{table}[htbp]
\caption{Quantitative evaluation of segmentation efficiency in terms of the running time and GPU memory consumption. (Total GPU: the area under GPU Memory-Time curve. Evaluation GPU platform: NVIDIA QUADRO RTX5000 (16G).)}\label{table:Seg_eff}
\centering
\begin{tabular}{ccccc}
\hline
Case ID & Image Size      & Running Time (s) & Max GPU (MB) & Total GPU (MB) \\ \hline
0001    & (512, 512, 55)  & 8.47             & 4266         & 17433          \\
0051    & (512, 512, 100) & 11.12            & 5290         & 34759          \\
0017    & (512, 512, 150) & 20.29            & 5526         & 70528          \\
0019    & (512, 512, 215) & 23.62            & 4722         & 64923          \\
0099    & (512, 512, 334) & 30.20            & 5282         & 96292          \\
0063    & (512, 512, 448) & 42.10            & 5506         & 144037         \\
0048    & (512, 512, 499) & 49.15            & 5420         & 150701         \\
0029    & (512, 512, 554) & 97.21            & 6142         & 247289         \\ \hline
\end{tabular}
\end{table}

%###########################
\subsection{Qualitative results on validation set}

Figure~\ref{fig:seg} displays the qualitative results on the validation set. The first and second rows depict relatively easy segmentation cases, while the third and fourth rows showcase challenging segmentation cases. It can be observed that in the first and second rows, the organ boundaries are clear, there is good contrast, and there are no complex tumor lesions within the organs. Compared with well-segmented instances, challenging instances often have complex tumor lesions (row 3) and noise (row 4), which bring difficulties to accurate segmentation of organs and pan-cancer.

\begin{figure}[!htbp]
\centering
\includegraphics[width=1\textwidth]{imgs/result.pdf}
\caption{Qualitative evaluation of model performance on validation set. Row 1 and 2: Well-segmented examples. Row 3 and 4: challenging examples.}
\label{fig:seg}
\end{figure}

%###########################
\subsection{Segmentation efficiency results on validation set}
We combine efficient inference schemes to build nnU-Net~\cite{nnUNet} as the final submitted Docker image. The average running time per instance during the inference phase is 85.4 seconds, and average used GPU memory is 2352 MB. The area under the GPU memory-time curve is 246157.2 MB, and the area under CPU utilization-time curve is 2973.

%###########################
\subsection{Results on final testing set}
Table~\ref{table:Quan_results} shows the detailed evaluation metrics of our method on the final testing set. It can be observed that the two-stage framework achieved average DSC scores of 92.17\% for organs and 54.99\% for lesions, along with NSD scores averaging 95.44\% for organs and 42.45\% for lesions. Additionally, the average running time was 95.83 seconds, and the area under the GPU memory-time curve was 227770 MB.

%###########################
\subsection{Limitation and future work}
While ensuring accuracy, we can explore the use of the following advanced processing strategies to speed up inference and reduce resource consumption:
\begin{itemize} 
 \item Model Pruning. Identify and remove redundant or less important model parameters, reducing the model size and improving inference speed without significant loss in accuracy.
 \item Model Quantization. Convert the model from floating-point precision to lower-precision fixed-point representation, reducing memory usage and improving inference speed.
 \item Filter Data Augmentation. Select specific data augmentation strategies based on organ and tumor characteristics to prevent redundancy.
\end{itemize}

%###########################
\section{Conclusion}
In this work, we propose a two-stage training framework that combines self-supervised and semi-supervised learning to efficiently perform training and inference on organ and pan-cancer segmentation tasks. Experiments show that our method achieves good segmentation performance. In the future, we hope to optimize the model framework to further improve the segmentation accuracy of difficult tumor samples, improve inference speed and reduce resource consumption.

%###########################
\subsubsection{Acknowledgements} The authors of this paper declare that the segmentation method they implemented for participation in the FLARE 2023 challenge has not used any pre-trained models nor additional datasets other than those provided by the organizers. The proposed solution is fully automatic without any manual intervention. We thank all the data owners for making the CT scans publicly available and CodaLab~\cite{codalab} for hosting the challenge platform. 


%
% ---- Bibliography ----
%
% BibTeX users should specify bibliography style 'splncs04'.
% References will then be sorted and formatted in the correct style.
%
\bibliographystyle{splncs04}
\bibliography{ref}

\newpage
% Please add the following required packages to your document preamble:
% \usepackage[normalem]{ulem}
% \useunder{\uline}{\ul}{}
\begin{table}[!htbp]
\caption{Checklist Table. Please fill out this checklist table in the answer column.}
\centering
\begin{tabular}{ll}
\hline
Requirements                                                                                                                    & Answer        \\ \hline
A meaningful title                                                                                                              & Yes        \\ \hline
The number of authors ($\leq$6)                                                                                                             & 3        \\ \hline
Author affiliations and ORCID                                                                                           & Yes        \\ \hline
Corresponding author email is presented                                                                                                  & Yes        \\ \hline
Validation scores are presented in the abstract                                                                                 & Yes        \\ \hline
\begin{tabular}[c]{@{}l@{}}Introduction includes at least three parts: \\ background, related work, and motivation\end{tabular} & Yes        \\ \hline
A pipeline/network figure is provided                                                                                           & 1 \\ \hline
Pre-processing                                                                                                                  & 3   \\ \hline
Strategies to use the partial label                                                                                             & 5   \\ \hline
Strategies to use the unlabeled images.                                                                                         & 4   \\ \hline
Strategies to improve model inference                                                                                           & 5   \\ \hline
Post-processing                                                                                                                 & 6   \\ \hline
Dataset and evaluation metric section is presented                                                                              & 6   \\ \hline
Environment setting table is provided                                                                                           & 7  \\ \hline
Training protocol table is provided                                                                                             & 7  \\ \hline
Ablation study                                                                                                                  & 9   \\ \hline
Efficiency evaluation results are provided                                                                                     & 10 \\ \hline
Visualized segmentation example is provided                                                                                     & 11 \\ \hline
Limitation and future work are presented                                                                                        & Yes       \\ \hline
Reference format is consistent.  & Yes        \\ \hline
\end{tabular}
\end{table}

\end{document}
