% This is samplepaper.tex, a sample chapter demonstrating the
% LLNCS macro package for Springer Computer Science proceedings;
% Version 2.21 of 2022/01/12
%
\documentclass[runningheads]{llncs}
%
\usepackage[T1]{fontenc}
% T1 fonts will be used to generate the final print and online PDFs,
% so please use T1 fonts in your manuscript whenever possible.
% Other font encodings may result in incorrect characters.
%
\usepackage{multirow}
\usepackage{graphicx}
\usepackage[figuresright]{rotating}
% Used for displaying a sample figure. If possible, figure files should
% be included in EPS format.
%
% If you use the hyperref package, please uncomment the following two lines
% to display URLs in blue roman font according to Springer's eBook style:
%\usepackage{color}
%\renewcommand\UrlFont{\color{blue}\rmfamily}
\usepackage[pagebackref=true,breaklinks=true,colorlinks,bookmarks=false]{hyperref}
%
\begin{document}
%
\title{Multi-task Learning with Iterative Training in Hybrid Labeling Dataset for Semi-supervised Abdominal Multi-organ and Tumor Segmentation}
%
\titlerunning{Multi-task Learning with Iterative Training in Hybrid Labeling Dataset}
% If the paper title is too long for the running head, you can set
% an abbreviated paper title here
%
\author{Zhiqiang Zhong\inst{1,\dag}\orcidID{0000-0002-0220-159X} \and
Rongxuan He\inst{1,2,\dag}\orcidID{0009-0005-0407-6963} \and
Deming Zhu\inst{1}\orcidID{0000-0002-6747-0766} \and
Mengqiu Tian\inst{1}\orcidID{0009-0006-1355-6691} \and
Songfeng Li\inst{1}\orcidID{0000-0002-5228-9630}\thanks{Corresponding author}
}
%
\authorrunning{Z. Zhong et al.}
% First names are abbreviated in the running head.
% If there are more than two authors, 'et al.' is used.
%
\institute{Percept Vis Med Technol Co LTD, Guangzhou 510275, People’s Republic of China.\\
\email{lisongfeng@pvmedtech.com}\and
Johns Hopkins University, Baltimore MD 21218, USA}
%
\maketitle              % typeset the header of the contribution
%
\begin{abstract}
Simultaneous segmentation of organs and tumors from abdominal CT images is challenging, and the task has many critical clinical applications such as disease diagnosis, lesion and organ measurements, and surgical planning.
Based on nnU-Net, we develop a method for abdominal organ and whole-body pan-tumor segmentation for both abdominal and whole-body CT images. First, in a fully supervised setting, we train the base models of organs and tumors to generate initial pseudo-labels. Then, in a semi-supervised setting, a mixed-labeled dataset is used to iteratively train a higher-performance segmentation model to create higher-quality pseudo-labels. Due to the correlation between organs and tumors in the abdominal region, we leverage the idea of multi-task learning to train a single model to segment both organs and tumors to improve the performance of a single task. Finally, to trade off segmentation efficiency and accuracy, we design a sliding window strategy based on the body prior and a simplified version of test-time augmentation (TTA4).
Our final model achieved 88.93\% mean organ DSC and 45.76\% tumor DSC on the FLARE23 online validation set. In addition, the average running time and area under GPU memory-time curve were 26.7s and 49352.9MB, respectively.
On the test set, we achieved mean organ and tumor DSC of 89.68\% and 62.89\%, respectively, NSD of 95.89\% and 51.69\%, respectively, and average inference time of 18.53s.
Our code is publicly available at \url{https://github.com/LeoZhong997/FLARE23}.

\keywords{Segmentation  \and Multi-task learning \and Semi-supervised learning.}
\end{abstract}


\section{Introduction}

Simultaneous segmentation of organs and tumors from abdominal CT images is a formidable challenge that holds immense clinical significance. It plays a pivotal role in various critical clinical applications, such as disease diagnosis, precise lesion and organ measurements, and the development of surgical plans.
Nevertheless, manually labeling organs and lesion locations is a time-consuming task that demands a great deal of expertise from physicians. 
FLARE23 is a challenge aimed at fostering the development of fully automatic solutions for this task. Expanding upon the 13 abdominal organs segmentation task of FLARE22~\cite{FLARE22}, FLARE23 requires participants to simultaneously segment tumors, a more practical study given that the majority of real clinical data may contain lesions.
Furthermore, the challenge restricts the inference time and GPU memory usage to mimic actual clinical conditions, implying that we cannot complete the task solely by increasing the model size or using more computational resources.

Semi-supervised learning is a crucial strategy employed in medical image segmentation tasks, due to the limited availability of medical data and the time-consuming annotation process. 
One of the most common approaches to semi-supervised segmentation is to use pseudo-labels~\cite{lee2013pseudo} generated by a model trained on the labeled data. When training a model with a large amount of unlabeled data, the accuracy of the pseudo-labels becomes critical. Consequently, eliminating uncertain pseudo-labels is a vital step in the training procedure.
The standard method for filtering out uncertain pseudo-labels involves applying a confidence threshold to determine whether the pseudo-labels are reliable. Furthermore, recent studies have demonstrated that these unreliable pseudo-labels can also be leveraged in the self-training process~\cite{wang2022semi}.

In this paper, we propose an iterative training framework based on nnU-Net to perform organ and tumor segmentation tasks. 
We start from a single-task setting, where we iteratively train the organ segmentation model. Semi-supervised learning is employed to generate pseudo labels for the partially labeled data and unlabeled data.
Subsequently, we transition to a multi-task setting, training a model to perform both organ and tumor segmentation tasks using the pseudo labels generated in the prior stage. Additionally, we incorporate unlabeled data into the training set.
Furthermore, to enhance inference speed, we introduce a sliding window strategy and we utilize a simplified version of test-time augmentation (TTA4) to improve segmentation accuracy.


\section{Method}

\subsection{Preprocessing}
\label{Preprocessing}
The preprocessing strategies we use are as follows:

\begin{itemize}
 \item Data cleaning or statistical analysis:\\
We perform label analysis to check label completeness. Out of 2200 labeled data, 222 cases include complete organ labels without tumors, and 1497 cases have tumor labels. 
These two subsets are utilized for training our single-task models.
 \item Reorientation:\\
 As we want the network to predict images regardless of orientation, we reorient the images to the standard RAS orientation during the training phase. 
 In addition, we apply mirroring operations in the later stages of data augmentation to enhance the network's orientation robustness.
 \item Resampling method for anisotropic data:\\
 In order to leverage the physical information within the CT data, all images are resampled to the same resolution of 4.0mm\ $\times$\ 1.2mm\ $\times$\ 1.2mm. 
 \item Intensity normalization method:\\
Initially, we compute the 0.5 and 99.5 percentiles, as well as the mean and standard deviation of the data intensity. Subsequently, the data is clipped to the 0.5 and 99.5 percentiles, and z-score normalization is applied using the global mean and standard deviation.
\end{itemize}

\subsection{Proposed Method}

We introduce an iterative training framework for the task of multi-organ and tumor segmentation. 
Our networks are derived from the 3D nnU-Net~\cite{nnUNet}. However, we depart from nnU-Net's auto-configuration and introduce two fixed network architectures, the medium and the large nnU-Net, whose parameters are detailed in the experiments section.
Fig.~\ref{fig:workflow} illustrates the workflow of our proposed approach. Our approach comprises two stages: single-task training and multi-task training.

\subsubsection{Single-task Training}

During the single-task stage, we train the nnU-Net separately for organ and tumor segmentation. To address the multi-organ segmentation task, we utilize the 222 labeled data that include complete organ labels.

\label{UsePartialLabel}
Following the development of the organ segmentation model, we employ it to generate pseudo labels for the remaining 1978 labeled data lacking complete organ labels. Nevertheless, these 1978 cases still carry partial ground truth labels, which we propose to combine with the pseudo labels. 
Since this model only performs organ segmentation, we select from the true labels only those organs that do not contain tumors. Determining the organ to which a tumor belongs is accomplished through morphological analysis: we apply a morphological dilation operation to the tumors, and if the dilated tumor overlaps an organ, the tumor is attributed to that organ.
Subsequently, we replace the corresponding pseudo labels with the ground truth labels for organs without tumors, resulting in a hybrid labeled dataset.

The hybrid labeled dataset is employed for training the organ segmentation model, and we utilize the model to generate pseudo labels for the entire 2200 training set. Iterative training is then conducted to enhance the accuracy of our pseudo labels of organs.

In the context of the tumor segmentation task, we utilize the 1497 labeled data containing tumor labels. However, due to suboptimal Dice Similarity Coefficient (DSC) and Normalized Surface Dice (NSD) performance, we do not employ this model in our subsequent training procedures.

\subsubsection{Multi-task Training}

To reduce inference time costs and maximize the utilization of the correlation between organs and tumors, we suggest training a single model capable of accomplishing both organ and tumor segmentation tasks. 
The organ model trained in the previous stage is utilized to generate the pseudo labels for the 1497 labeled data. These pseudo labels are then combined with the ground truth, following the same procedure described earlier.  
Following the utilization of the hybrid labeled subset for training the multi-task model, we employ the model to generate the pseudo labels of the 2200 training set and retrain the model.

\label{UseUnLabeledImages}
Once the multi-task model is trained using the 2200 labeled data, we employ the model to generate the pseudo labels of the 1800 unlabeled data. Subsequently, we straightforwardly add these data to the training set and conduct iterative training twice to obtain the final model.

\begin{figure}[htbp]
    \centering
    \includegraphics[scale=0.37]{imgs/WorkflowNew.png}
    \caption{Workflow of our proposed approach. The workflow comprises two stages: single-task training and multi-task training.}
    \label{fig:workflow}
    \end{figure}


\subsubsection{Loss Function} 

We use the sum of a weighted Dice loss and a cross-entropy loss because compound loss functions have been proven to be robust in various medical image segmentation tasks~\cite{LossOdyssey}.
% The weighted Dice loss is weighted for ground truth and pseudo labels. Before averaging the Dice loss for each organ, we multiply the Dice loss of each organ by a weight $\gamma$. If the organ label is from the ground truth, $\gamma$ is set to 1. Otherwise, $\gamma$ is set to a value that is less than 1. This weighted Dice loss is used to balance the contribution of the ground truth and pseudo labels. 
Moreover, deep supervision is used to fully utilize the feature information of the intermediate encoding and decoding layers.  


\subsubsection{Sliding Window Strategy}
\label{ImproveInference}
In order to improve inference speed and reduce resource consumption, we adopt the sliding window strategy to fuse the predictions of overlapping patches.
We adapt the fast sliding window strategy initially proposed by the FLARE22 winning team~\cite{FLARE22-1st-Huang} to align it with the requirements of the tumor segmentation task. 
Given that tumors can appear in various regions in the abdominal area, the absence of a label in the central patch does not necessarily imply the absence of tumors in the surrounding patches. 
Consequently, for every slice along the z-axis, after the acquisition of the central patch, we also retrieve all the surrounding patches to generate the final prediction.


\subsection{Post-processing}
\label{PostProcessing}
To improve the quality of the pseudo-labels, we employ connected component analysis on organs, retaining the largest 3D connected component. If an organ's DSC increases following connected component analysis, we opt to conduct the analysis for that specific organ. During the validation and testing phases, connected component analysis is deactivated to reduce time overhead. 

Additionally, we introduce a streamlined test-time augmentation approach (TTA4). Instead of applying augmentation in all 8 directions, we restrict it to 4 directions: the original orientation and the flipped orientations along the x, y, and z axes, respectively. 

 
\section{Experiments}
\subsection{Dataset and evaluation measures}
\label{DatasetandEvaluationMetric}
The FLARE 2023 challenge is an extension of the FLARE 2021--2022 challenges~\cite{MedIA-FLARE21,FLARE22}, aiming to promote the development of foundation models in abdominal disease analysis. The segmentation targets cover 13 organs and various abdominal lesions. The training dataset is curated from more than 30 medical centers under the license permission, including TCIA~\cite{TCIA}, LiTS~\cite{LiTS}, MSD~\cite{simpson2019MSD}, KiTS~\cite{KiTS,KiTSDataset}, autoPET~\cite{autoPET-Data,autoPET-MICCAI22}, TotalSegmentator~\cite{TotalSegmentator}, and AbdomenCT-1K~\cite{AbdomenCT-1K}. The training set includes 4000 abdomen CT scans, of which 2200 CT scans have partial labels and 1800 CT scans have no labels. The validation and testing sets include 100 and 400 CT scans, respectively, which cover various abdominal cancer types, such as liver cancer, kidney cancer, pancreas cancer, colon cancer, gastric cancer, and so on. The organ annotation process used ITK-SNAP~\cite{ITKSNAP}, nnU-Net~\cite{nnUNet}, and MedSAM~\cite{MedSAM}.


The evaluation metrics encompass two accuracy measures—Dice Similarity Coefficient (DSC) and Normalized Surface Dice (NSD)—alongside two efficiency measures—running time and area under the GPU memory-time curve. These metrics collectively contribute to the ranking computation. Furthermore, the running time and GPU memory consumption are considered within tolerances of 15 seconds and 4 GB, respectively.


\subsection{Implementation details}
\subsubsection{Environment settings}
The development environments and requirements are presented in Table~\ref{table:env}. The training protocols of medium nnU-Net and large nnU-Net are listed in Table~\ref{table:medium_model} and Table~\ref{table:large_model}, respectively. We adopt data augmentation of rotation, scaling, Gaussian noise and blur, brightness, contrast, gamma, elastic deformation, and mirror on the fly during training. Notably, we reduced the number of test-time augmentation (TTA) flips to balance segmentation accuracy and inference time.


\begin{table}[!htbp]
\caption{Development environments and requirements.}\label{table:env}
\centering
\begin{tabular}{ll}
\hline
System      & Ubuntu 18.04.5 LTS\\
\hline
CPU         & Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz \\
\hline
RAM         & 8$\times $64GB; 3200MT$/$s\\
\hline
GPU (number and type)         & Four NVIDIA A4000 16G\\
\hline
CUDA version                  & 11.6\\                          
\hline
Programming language          & Python 3.8.13\\ 
\hline
Deep learning framework & torch 1.13, torchvision 0.14.0\\
\hline
Specific dependencies         & nnU-Net 1.7.0\\
\hline
Code                          & \url{https://github.com/LeoZhong997/FLARE23}\\
\hline
\end{tabular}
\end{table}

\begin{table*}[!htbp]
\caption{Training protocols for medium nnU-Net}
\label{table:medium_model}
\begin{center}
% \resizebox{0.47\textwidth}{!}{
\begin{tabular}{ll} 
\hline
Network initialization & ``He'' normal initialization\\
\hline
Batch size & 2\\
\hline
Stage number & 5\\
\hline
Convolution number per stage & 2\\
\hline 
Patch size & 32$\times$128$\times$192  \\ 
\hline
Total epochs & 1500 \\
\hline
Optimizer & SGD with Nesterov momentum ($\mu = 0.99$)\\ 
\hline
Initial learning rate (lr) & 0.01\\ 
\hline
Lr decay schedule &  Poly learning rate policy: $(1 - \mathit{epoch} / 1000)^{0.9}$\\
\hline
Training time & 25 hours \\  
\hline 
Loss function & Dice loss and cross-entropy loss\\     
\hline
Number of model parameters & 22M \\ \hline
Number of flops & 253.90G \\ \hline
CO$_2$eq & 8.14 Kg \\  \hline
\end{tabular}
%}
\end{center}
\end{table*}

\begin{table*}[!htbp]
\caption{Training protocols for large nnU-Net}
\label{table:large_model}
\begin{center}
% \resizebox{0.47\textwidth}{!}{
\begin{tabular}{ll} 
\hline
Network initialization & ``He'' normal initialization\\
\hline
Batch size & 2\\
\hline
Stage number & 6\\
\hline
Convolution number per stage & 3\\
\hline 
Patch size & 32$\times$128$\times$192  \\ 
\hline
Total epochs & 1500 \\
\hline
Optimizer & SGD with Nesterov momentum ($\mu = 0.99$)\\ 
\hline
Initial learning rate (lr) & 0.01\\ 
\hline
Lr decay schedule &  Poly learning rate policy: $(1 - \mathit{epoch} / 1000)^{0.9}$\\
\hline
Training time & 33 hours \\  
\hline 
Loss function & Dice loss and cross-entropy loss\\     
\hline
Number of model parameters & 85M \\ \hline
Number of flops & 375.14G \\ \hline
CO$_2$eq & 9.83 Kg \\  \hline
\end{tabular}
%}
\end{center}
\end{table*}


\section{Results and discussion}

\subsection{Quantitative results on validation set}
\label{AblationStudy}
First, we train the single-task models $\rm{M}_{\rm{O}}$ and $\rm{M}_{\rm{T}}$ on the fully-labeled datasets of 222 organ cases and 1497 tumor cases, respectively. To obtain complete organ labels, $\rm{M}_{\rm{O}}$ first generates pseudo-labels on the 2200 partially-labeled cases, which are combined with the ground truth labels to produce a mixed-labeled organ dataset for training $\rm{M}_{\rm{O1}}$. This process is then iterated to generate a new dataset for training $\rm{M}_{\rm{O2}}$, which produces high-quality organ pseudo labels.

To validate the effectiveness of multi-task segmentation, we combine the mixed labels of organs with the ground truth tumor labels on 1497 cases to train the model $\rm{M}_{\rm{OT}}$, which is able to segment all organs and tumors at once and achieves better segmentation performance than single-task segmentation.

Further, $\rm{M}_{\rm{OT}}$ was utilized to generate new organ and tumor pseudo-labels on 2200 images, which were combined with the ground truth labels to form a hybrid-label dataset. Due to the low accuracy of tumor segmentation, we utilized organs to constrain the tumor pseudo-labels during label merging and disregarded tumor segmentation results outside of organs. Using this dataset, we trained the model $\rm{M}_{\rm{OT1}}$.

In order to verify the effectiveness of unlabeled data in improving segmentation performance, we use $\rm{M}_{\rm{OT1}}$ to generate segmentation results on all 4000 cases. For the 2200 partially-labeled cases, a mixed-labeled dataset is regenerated, while the results on the remaining 1800 unlabeled cases are used directly as pseudo-labels. We train the model $\rm{M}_{\rm{OT2}}$ on this 4000-case dataset.

Finally, we utilize $\rm{M}_{\rm{OT2}}$ to iterate on the 4000 cases to generate a new dataset and upgrade the medium model to the large one to extract more feature information, then train the final model $\rm{M}_{\rm{OT3}}$. In order to balance the inference speed and segmentation accuracy, we adopt the TTA4 strategy (by reducing the number of flips of TTA, i.e., flipping the input image over x, y, and z, respectively) to complete the final inference process. 

We report the final results of DSC and NSD of organ and tumor on the validation set in Table~\ref{table:final_results}. The results of ablation studies to analyze the effect of multi-task segmentation and unlabeled data can be obtained from Table~\ref{table:studies_results}.


\begin{table}[htbp]
\caption{Quantitative evaluation results. The public validation denotes the performance on the 50 validation cases with ground truth, reported as mean score $\pm$ standard deviation. The online validation denotes the leaderboard results. The testing denotes the performance on the final testing set.
}\label{table:final_results}
\centering
\begin{tabular}{l|cc|cc|cc}
\hline
\multirow{2}{*}{Target} & \multicolumn{2}{c|}{Public Validation} & \multicolumn{2}{c|}{Online Validation} & \multicolumn{2}{c}{Testing} \\ \cline{2-7} 
                        & DSC(\%)            & NSD(\%)           & DSC(\%)            & NSD(\%)           & DSC(\%)      & NSD (\%)     \\ \hline
Liver                   & 97.46±0.48   & 99.26±1.32  & 97.44          & 99.15         & 96.72        & 98.33        \\
Right kidney            & 94.35±8.27   & 97.4±7.01   & 93.56          & 95.61         & 94.05        & 95.16        \\
Spleen                  & 96.51±0.69   & 99.76±0.58  & 96.72          & 99.23         & 96.06        & 98.43        \\
Pancreas                & 86.14±5.42   & 98.38±2.79  & 85.49          & 96.99         & 90.05        & 98.4         \\
Aorta                   & 94.68±1.67   & 98.09±2.37  & 95.38          & 98.94         & 95.58        & 99.43        \\
Inferior vena cava      & 92.94±1.66   & 96.69±2.69  & 93.94          & 97.27         & 94.51        & 98.37        \\
Right adrenal gland     & 80.21±12.36  & 96.66±14.01 & 79.74          & 93.8          & 79.15        & 93.58        \\
Left adrenal gland      & 80.48±5.9    & 97.54±2.89  & 79.62          & 93.67         & 79.16        & 93.66        \\
Gallbladder             & 81.95±24.98  & 88.32±27.02 & 80.87          & 81.42         & 79.89        & 82.11        \\
Esophagus               & 81.09±14.86  & 94.51±14.65 & 82.41          & 94.39         & 87.46        & 98.45        \\
Stomach                 & 92.69±3      & 98.37±3.27  & 93.37          & 98.45         & 93.07        & 98.41        \\
Duodenum                & 83.46±6.07   & 96.49±4.48  & 84.38          & 96.33         & 88.09        & 98.10        \\
Left kidney             & 93.99±6.21   & 96.33±8.33  & 93.22          & 95.24         & 92.90        & 94.62        \\
Tumor                   & 52.23±35.08  & 51.8±34.38  & 45.76          & 38.5          & 62.89        & 51.69        \\ \hline
Average             & 86.30        & 93.54       & 85.85          & 91.36         & 87.83        & 92.77 \\ \hline
\end{tabular}
\end{table}

\begin{sidewaystable}[htbp]
\caption{DSC(\%) and NSD(\%) of organs and tumors on online validation set.
}\label{table:studies_results}
\centering
\begin{tabular}{lccccccccccccccccc}
\hline
Model                 & \begin{tabular}[c]{@{}c@{}}Training \\ images\end{tabular} & Metrics & Liver & RK    & Spleen & \begin{tabular}[c]{@{}c@{}}Panc-\\ reas\end{tabular} & Aorta & IVC   & RAG   & LAG   & \begin{tabular}[c]{@{}c@{}}Gallb-\\ ladder\end{tabular} & \begin{tabular}[c]{@{}c@{}}Esop-\\ hagus\end{tabular} & Stomach & \begin{tabular}[c]{@{}c@{}}Duod-\\ enum\end{tabular} & LK    & Tumor & \begin{tabular}[c]{@{}c@{}}organ\\ Mean\end{tabular}  \\ \hline
\multirow{2}{*}{$\rm{M}_{\rm{O}}$}   & \multirow{2}{*}{222}                                       & DSC     & 95.72 & 90.75 & 94.14  & 82.41                                                & 94.79 & 92.65 & 78.73 & 77.09 & 78.00                                                   & 80.14                                                 & 91.00   & 81.55                                                & 89.88 & -     & 86.68 \\
                      &                                                            & NSD     & 97.48 & 92.72 & 95.09  & 95.71                                                & 97.96 & 95.55 & 93.53 & 92.03 & 76.50                                                   & 92.72                                                 & 95.19   & 94.34                                                & 91.95 & -     & 93.14 \\
\multirow{2}{*}{$\rm{M}_{\rm{O1}}$}  & \multirow{2}{*}{2200}                                      & DSC     & 96.65 & 91.80 & 95.97  & 83.49                                                & 95.19 & 93.57 & 79.50 & 78.08 & 82.24                                                   & 81.85                                                 & 92.30   & 82.74                                                & 89.01 & -     & 87.88 \\
                      &                                                            & NSD     & 98.14 & 93.91 & 98.03  & 96.13                                                & 98.73 & 96.75 & 93.67 & 92.34 & 81.92                                                   & 93.97                                                 & 96.91   & 95.20                                                & 91.23 & -     & 94.38 \\
\multirow{2}{*}{$\rm{M}_{\rm{O2}}$}  & \multirow{2}{*}{2200}                                      & DSC     & 97.08 & 92.03 & 96.63  & 84.90                                                & 95.43 & 94.06 & 79.97 & 79.53 & 81.43                                                   & 82.57                                                 & 92.79   & 84.12                                                & 88.41 & -     & 88.38 \\
                      &                                                            & NSD     & 98.67 & 94.61 & 99.04  & 96.83                                                & 99.02 & 97.44 & 94.13 & 93.80 & 81.56                                                   & 94.73                                                 & 97.76   & 96.13                                                & 91.85 & -     & 95.04 \\
\multirow{2}{*}{$\rm{M}_{\rm{T}}$}   & \multirow{2}{*}{1497}                                      & DSC     & -     & -     & -      & -                                                    & -     & -     & -     & -     & -                                                       & -                                                     & -       & -                                                    & -     & 34.34 & -     \\
                      &                                                            & NSD     & -     & -     & -      & -                                                    & -     & -     & -     & -     & -                                                       & -                                                     & -       & -                                                    & -     & 24.02 & -     \\
\multirow{2}{*}{$\rm{M}_{\rm{OT}}$}  & \multirow{2}{*}{1497}                                      & DSC     & 97.32 & 92.62 & 96.46  & 84.56                                                & 95.19 & 93.61 & 79.84 & 79.46 & 81.18                                                   & 81.88                                                 & 93.12   & 83.50                                                & 92.88 & 43.76 & 88.59 \\
                      &                                                            & NSD     & 99.00 & 94.68 & 98.85  & 96.62                                                & 98.64 & 96.72 & 93.89 & 93.62 & 81.38                                                   & 93.94                                                 & 98.13   & 95.86                                                & 94.86 & 36.22 & 95.09 \\
\multirow{2}{*}{$\rm{M}_{\rm{OT1}}$} & \multirow{2}{*}{2200}                                      & DSC     & 97.42 & 93.44 & 96.64  & 85.23                                                & 95.29 & 93.85 & 79.63 & 79.34 & 81.45                                                   & 82.26                                                 & 93.05   & 83.98                                                & 93.47 & 44.31 & 88.85 \\
                      &                                                            & NSD     & 99.17 & 95.48 & 99.09  & 96.88                                                & 98.80 & 97.21 & 93.95 & 93.63 & 81.82                                                   & 94.52                                                 & 98.03   & 96.13                                                & 95.41 & 37.49 & 95.39 \\
\multirow{2}{*}{$\rm{M}_{\rm{OT2}}$} & \multirow{2}{*}{4000}                                      & DSC     & 97.43 & 93.67 & 96.63  & 84.99                                                & 95.28 & 93.88 & 80.77 & 79.60 & 80.61                                                   & 82.12                                                 & 93.08   & 83.83                                                & 93.32 & 44.73 & 88.86 \\
                      &                                                            & NSD     & 99.22 & 95.84 & 99.02  & 96.77                                                & 98.79 & 97.24 & 94.79 & 93.64 & 80.92                                                   & 94.18                                                 & 98.09   & 96.03                                                & 95.27 & 37.41 & 95.37 \\
\multirow{2}{*}{$\rm{M}_{\rm{OT3}}$} & \multirow{2}{*}{4000}                                      & DSC     & 97.44 & 93.56 & 96.72  & 85.49                                                & 95.38 & 93.94 & 79.74 & 79.62 & 80.87                                                   & 82.41                                                 & 93.37   & 84.38                                                & 93.22 & 45.76 & 88.93 \\
                      &                                                            & NSD     & 99.15 & 95.61 & 99.23  & 96.99                                                & 98.94 & 97.27 & 93.80 & 93.67 & 81.42                                                   & 94.39                                                 & 98.45   & 96.33                                                & 95.24 & 38.50 & 95.42 \\ \hline
\end{tabular}
\end{sidewaystable}


\subsection{Qualitative results on validation set}

Fig.~\ref{fig:seg} shows four representative segmentation results of the final model $\rm{M}_{\rm{OT3}}$ on the validation dataset. For Case \#FLARETs\_0083 and Case \#FLARETs\_0027, the model successfully identified all organs and accurately segmented the tumor boundaries. For Case \#FLARETs\_0051, although the model identified all the organs correctly, it failed to segment the tumor, resulting in lower metrics for both the tumor and the organs. In Case \#FLARETs\_0091, the model even failed to determine the location of the prostate tumor. We believe that, on the one hand, there is no annotation information for the prostate in the dataset, so no connection between organ and tumor can be established; on the other hand, prostate tumors account for only a small percentage of the dataset, and the model lacks sufficient data to learn to segment this target.

\begin{figure}[!htbp]
\centering
\includegraphics[scale=0.12]{imgs/flare23_final_results.png}
\caption{Qualitative results of our final model on two easy cases and two hard cases.
}
\label{fig:seg}
\end{figure}


\subsection{Segmentation efficiency results on validation set}

We applied a sliding window strategy with body prior and a simplified TTA4 method on the final model $\rm{M}_{\rm{OT3}}$ to build the final submitted docker image. In Table~\ref{table:cases_efficiency} and Table~\ref{table:mean_efficiency}, we report the efficiency evaluation results from the official platform.

\begin{table}[htbp]
\caption{Quantitative evaluation of segmentation efficiency in terms of the running time and GPU memory consumption. Total GPU denotes the area under GPU Memory-Time curve. Evaluation GPU platform: NVIDIA QUADRO RTX5000 (16G).
}
\label{table:cases_efficiency}
\centering
\begin{tabular}{ccccc}
\hline
Case ID & Image Size      & Running Time (s) & Max GPU (MB) & Total GPU (MB) \\ \hline
0001    & (512, 512, 55)  & 19.28            & 2468         & 31332          \\
0051    & (512, 512, 100) & 26.05            & 2468         & 49968          \\
0017    & (512, 512, 150) & 38.61            & 2468         & 54208          \\
0019    & (512, 512, 215) & 23.93            & 2468         & 43972          \\
0099    & (512, 512, 334) & 27.27            & 2468         & 51457          \\
0063    & (512, 512, 448) & 31.75            & 2468         & 59780          \\
0048    & (512, 512, 499) & 34.23            & 2468         & 65627          \\
0029    & (512, 512, 554) & 38.02            & 2468         & 73601          \\\hline
\end{tabular}
\end{table}

\begin{table}[htbp]
\caption{Efficiency evaluation results of our submitted docker. All metrics reported are the average values on 20 validation cases.
}
\label{table:mean_efficiency}
\centering
\begin{tabular}{ccccccc}
\hline
Time & \begin{tabular}[c]{@{}c@{}}GPU\\ Memory\end{tabular} & \begin{tabular}[c]{@{}c@{}}AUC GPU\\ Time\end{tabular} & \begin{tabular}[c]{@{}c@{}}CPU \\ Utilization\end{tabular} & \begin{tabular}[c]{@{}c@{}}AUC CPU\\ Time\end{tabular} & RAM     & \begin{tabular}[c]{@{}c@{}}AUC RAM\\ Time\end{tabular} \\ \hline
26.7 & 2504.6                                               & 49352.9                                                & 66.67                                                      & 916.63                                                 & 6283.97 & 126713.2                                               \\ \hline
\end{tabular}
\end{table}

\subsection{Results on final testing set}
Our method achieved seventh place out of 37 submissions on the final testing set. Tables~\ref{table:final_results} and~\ref{table:test_report} show the detailed evaluation metrics of our method on the final testing set.

\begin{table}[htbp]
\caption{Testing results of our proposed method. All metrics reported are the average values on 400 testing cases.}
\label{table:test_report}
\centering
\begin{tabular}{cccccc}
\hline
\begin{tabular}[c]{@{}c@{}}Organ\\ DSC\end{tabular} & \begin{tabular}[c]{@{}c@{}}Organ\\ NSD\end{tabular} & \begin{tabular}[c]{@{}c@{}}Tumor\\ DSC\end{tabular} & \begin{tabular}[c]{@{}c@{}}Tumor\\ NSD\end{tabular} & \begin{tabular}[c]{@{}c@{}}AUC GPU\\ Time\end{tabular} & Time  \\ \hline
0.8968    & 0.9589    & 0.6289    & 0.5169    & 33804    & 18.53  \\ \hline
\end{tabular}
\end{table}


\subsection{Limitation and future work}
We used a simple but effective iterative training strategy to gradually improve the quality of the generated pseudo-labels. However, the pseudo-labels may still contain noise, which can limit or even degrade the segmentation performance of the model. Therefore, in future work we will investigate recent pseudo-label selection strategies to form a positive feedback loop in iterative training.


\section{Conclusion}
In this paper, we iteratively train a model capable of segmenting both abdominal organs and whole-body pan-tumors on a mixed-labeled dataset based on the nnU-Net framework, which combines fully supervised, semi-supervised, and multi-task learning. In addition, this paper designs a sliding window strategy based on the body prior and a simplified test-time augmentation to trade off efficiency and accuracy during inference. The results on the public validation set of FLARE2023 show that the method has good segmentation performance and computational efficiency.


\subsubsection{Acknowledgements} The authors of this paper declare that the segmentation method they implemented for participation in the FLARE 2023 challenge has not used any pre-trained models nor additional datasets other than those provided by the organizers. The proposed solution is fully automatic without any manual intervention. We thank all the data owners for making the CT scans publicly available and CodaLab~\cite{codalab} for hosting the challenge platform. 


%
% ---- Bibliography ----
%
% BibTeX users should specify bibliography style 'splncs04'.
% References will then be sorted and formatted in the correct style.
%
\bibliographystyle{splncs04}
\bibliography{ref}

\newpage
% Please add the following required packages to your document preamble:
% \usepackage[normalem]{ulem}
% \useunder{\uline}{\ul}{}
\begin{table}[!htbp]
\caption{Checklist Table. Please fill out this checklist table in the answer column.}
\centering
\begin{tabular}{ll}
\hline
Requirements                                                                                                                    & Answer        \\ \hline
A meaningful title                                                                                                              & Yes        \\ \hline
The number of authors ($\leq$6)                                                                                                 & 5        \\ \hline
Author affiliations, Email, and ORCID                                                                                           & Yes        \\ \hline
Corresponding author is marked                                                                                                  & Yes        \\ \hline
Validation scores are presented in the abstract                                                                                 & Yes        \\ \hline
\begin{tabular}[c]{@{}l@{}}Introduction includes at least three parts: \\ background, related work, and motivation\end{tabular} & Yes        \\ \hline
A pipeline/network figure is provided                                                                                           & Fig.~\ref{fig:workflow} \\ \hline
Pre-processing                                                                                                                  & Page~\pageref{Preprocessing}   \\ \hline
Strategies to use the partial label                                                                                             & Page~\pageref{UsePartialLabel}   \\ \hline
Strategies to use the unlabeled images.                                                                                         & Page~\pageref{UseUnLabeledImages}   \\ \hline
Strategies to improve model inference                                                                                           & Page~\pageref{ImproveInference}   \\ \hline
Post-processing                                                                                                                 & Page~\pageref{PostProcessing}   \\ \hline
Dataset and evaluation metric section is presented                                                                              & Page~\pageref{DatasetandEvaluationMetric}   \\ \hline
Environment setting table is provided                                                                                           & Table~\ref{table:env}  \\ \hline
Training protocol table is provided                                                                                             & Table~\ref{table:medium_model} and Table~\ref{table:large_model}  \\ \hline
Ablation study                                                                                                                  & Page~\pageref{AblationStudy}   \\ \hline
Efficiency evaluation results are provided                                                                                     & Table~\ref{table:cases_efficiency} and Table~\ref{table:mean_efficiency} \\ \hline
Visualized segmentation example is provided                                                                                     & Fig.~\ref{fig:seg} \\ \hline
Limitation and future work are presented                                                                                        & Yes        \\ \hline
Reference format is consistent.  & Yes        \\ \hline

\end{tabular}
\end{table}

\end{document}
