% This is samplepaper.tex, a sample chapter demonstrating the
% LLNCS macro package for Springer Computer Science proceedings;
% Version 2.21 of 2022/01/12
%
\documentclass[runningheads]{llncs}
%
\usepackage[T1]{fontenc}
% T1 fonts will be used to generate the final print and online PDFs,
% so please use T1 fonts in your manuscript whenever possible.
% Other font encodings may result in incorrect characters.
%
\usepackage{multirow}
\usepackage{graphicx}
% Used for displaying a sample figure. If possible, figure files should
% be included in EPS format.
%
% If you use the hyperref package, please uncomment the following two lines
% to display URLs in blue roman font according to Springer's eBook style:
%\usepackage{color}
%\renewcommand\UrlFont{\color{blue}\rmfamily}
\usepackage[pagebackref=true,breaklinks=true,colorlinks,bookmarks=false]{hyperref}
%
\begin{document}
%
\title{Pseudo Label-Based Semi-Supervised Learning for Abdominal Organ and Cancer Segmentation in CT Image With Partial Labeled Data}
%
\titlerunning{Pseudo Label-Based Semi-Supervised Learning}
% If the paper title is too long for the running head, you can set
% an abbreviated paper title here
%
\author{Panlong Xu\inst{1}\orcidID{0000-0001-7200-1238} \and
Zhijian Li\inst{1}\orcidID{0009-0008-6050-8216}\and Weiping Liu\inst{1}} 
%
\authorrunning{Panlong Xu et al.}
% First names are abbreviated in the running head.
% If there are more than two authors, 'et al.' is used.
%
\institute{Imaging and Navigation Research Center, Shanghai Microport Medbot (Group) Co., Ltd., Shanghai, China \\
\email{\{xupl\}@microport.com}}
%
\maketitle              % typeset the header of the contribution
%
\begin{abstract}
Abdominal multi-organ and pan-tumor segmentation in CT images plays a critically important role in preoperative planning, intraoperative navigation, and postoperative assessment for surgical procedures. In this study, we propose a semi-supervised learning approach using nnU-Net on the FLARE2023 competition dataset. Our methodology involves training an initial model on fully annotated data, followed by inference on partially annotated data to generate pseudo-labels, and subsequently training a final model using these pseudo-labeled data. To optimize computational efficiency, we adopt a parameter-efficient model with a reduced number of parameters. By leveraging the availability of both labeled and unlabeled data, our approach aims to enhance the performance of the nnU-Net model while maintaining a reasonable computational cost. Ultimately, our trained small nnU-Net achieved significant results on a validation set of 100 samples, with a Dice coefficient of 0.8854 for multi-organ segmentation and 0.4186 for tumor segmentation. Moreover, the average inference time of the model was only 18 seconds.

\keywords{Semi-supervised learning  \and Image segmentation \and Pseudo label.}
\end{abstract}



\section{Introduction}
The FLARE2023 challenge, an extension of the FLARE2021 and FLARE2022 challenges, aims to promote the development of universal organ and tumor segmentation in abdominal CT scans. Participants are required to develop a segmentation algorithm that can segment 13 organs (liver, spleen, pancreas, right kidney, left kidney, stomach, gallbladder, esophagus, aorta, inferior vena cava, right adrenal gland, left adrenal gland, and duodenum) and one tumor class covering various cancer types (such as liver cancer, kidney cancer, stomach cancer, pancreas cancer, colon cancer) in abdominal CT scans.

Abdominal organ and tumor segmentation holds significant clinical importance in several aspects. Firstly, accurate segmentation of abdominal organs allows for precise identification and analysis of specific structures, aiding in surgical planning by providing detailed information about the spatial relationships between organs. This assists surgeons in determining the optimal surgical approach and reducing the risk of complications during the procedure. Furthermore, tumor segmentation plays a crucial role in the diagnosis, treatment planning, and evaluation of cancer patients. By accurately delineating tumor boundaries, clinicians can assess tumor size, location, and response to therapy. This information guides treatment decisions, such as determining the extent of surgical resection and predicting prognosis.

However, multi-organ segmentation and pan-tumor segmentation face several challenges. Firstly, variations in organ shape, size, and appearance across different individuals and disease states make accurate segmentation challenging. Secondly, the presence of overlapping structures and ambiguous boundaries between organs or tumors adds difficulty to the segmentation task. Finally, image artifacts, noise, and limited image resolution can affect the quality of segmentation results.

The nnU-Net~\cite{nnUNet} segmentation framework has proven effective in addressing the challenges mentioned due to its ability to analyze the fingerprint features of training data. By understanding the unique characteristics of the data, nnU-Net can adapt the network structure complexity and preprocessing strategy accordingly. This adaptability enables the framework to handle variations in organ shapes, sizes, and appearances, as well as cope with ambiguous boundaries and image artifacts. As a result, nnU-Net can provide accurate and robust segmentation results for multi-organ and pan-tumor segmentation tasks. 

In this study, we propose a semi-supervised learning approach based on nnU-Net to solve the abdominal multi-organ and pan-tumor segmentation problem in CT images. Our methodology involves training an initial model on fully annotated data, followed by inference on partially annotated data to generate pseudo-labels, and subsequently training a final model using these pseudo-labeled data. To optimize computational efficiency, we adopt a parameter-efficient model with a reduced number of parameters. By leveraging the availability of both labeled and unlabeled data, our approach aims to enhance the performance of the nnU-Net model while maintaining a reasonable computational cost. 






\section{Method}
The FLARE2023 challenge provides the largest abdomen CT dataset. The training set includes 4000 3D CT scans from 30+ medical centers. 2200 cases have partial labels and 1800 cases are unlabeled. Despite the availability of a large training dataset, a statistical analysis revealed severe class imbalance in the annotated dataset. Notably, among the 2,200 annotated examples, none included annotations for all 14 classes. Figure~\ref{fig:Statis} illustrates the distribution of annotations for each class in the incomplete dataset, indicating significantly fewer annotations for classes 5--12 compared to others. To address this issue, we employed a semi-supervised learning approach based on pseudo-labeling to iteratively train the segmentation model. The training process involved five stages:

1. In the first stage, we trained a segmentation model using 222 examples annotated for classes 1--13.

2. In the second stage, we selected 597 examples annotated for classes 1--4, 13, and 14. Using the segmentation model from the first stage, we inferred the unannotated classes and trained a 14-class segmentation model.

3. The third stage involved inferring tumor pseudo-labels using the model from the second stage on the training data from the first stage. The combined dataset of 819 annotated examples was then used to train the segmentation model.

4. In the fourth stage, we utilized the model from the third stage to infer the unannotated class labels for the remaining 1,381 examples, and mixed the entire dataset of 2,200 examples for training.

5. Finally, in the fifth stage, we inferred the labels for the remaining 1,800 unannotated examples using the model obtained from the fourth stage. The model was then further trained through the hybrid training process to obtain the final segmentation model. 

\begin{figure}[htbp]
\centering
\includegraphics[scale=0.65]{imgs/fig1_training_data_statistics.png}
\caption{Statistics on the annotated data for each class in the training set. The horizontal axis represents the IDs of the 14 different classes, while the vertical axis represents the number of annotated samples. 
}
\label{fig:Statis}
\end{figure}

%###########################
\subsection{Preprocessing}
All data preprocessing follows the original nnU-Net framework. Firstly, the raw images are cropped to remove contiguous regions with pixel values of 0, although such cases do not exist in real CT images. Secondly, the images are resampled according to the predetermined spacing, as shown in Table~\ref{table:net1} and Table~\ref{table:net2}. The input spacing for the larger model is smaller than that for the smaller model. Finally, the data is normalized, with two threshold values of 0.05 and 0.95 obtained from pixel value statistics used for truncation.

\subsection{Proposed Method}
As shown in Figure~\ref{fig:Network}, our proposed method contains five training stages. Meanwhile, two 3D nnU-Nets of different sizes were used to train different models.

\begin{figure}[htbp]
\centering
\includegraphics[scale=0.4]{imgs/pipeline.png}
\caption{Pipeline of our proposed training strategy. }
\label{fig:Network}
\end{figure}

The big nnU-Net model is characterized by a wider and deeper network structure and a higher input resolution. On the contrary, the small nnU-Net model has a narrower and shallower network structure, along with a lower input resolution. The differences in their network structures can be observed from Table~\ref{table:net1} and Table~\ref{table:net2}. This training strategy is inspired by the approach described in~\cite{FLARE22-1st-Huang}. During the initial training phase with partially labeled data, we aim to obtain more reliable pseudo-labels. As for the final network training, we balance the inference speed, memory consumption, and segmentation accuracy by reducing the complexity of the segmentation model.


\begin{table}[!htbp]
\caption{Big nnU-Net network structure.}\label{table:net1}
\centering
\begin{tabular}{ll}
\hline
Settings       & Value \\
\hline
channels in the first stage   & 32 \\
\hline
convolution number per stage                         & 3\\
\hline
downsampling times                         & 5\\
\hline
input spacing                  & (2.5, 0.8, 0.8)\\                          \hline
test time augmentation                 & yes\\ 
\hline
\end{tabular}
\end{table}


\begin{table}[!htbp]
\caption{Small nnU-Net network structure.}\label{table:net2}
\centering
\begin{tabular}{ll}
\hline
Settings       & Value \\
\hline
channels in the first stage   & 16 \\
\hline
convolution number per stage                         & 2\\
\hline
downsampling times                         & 4\\
\hline
input spacing                  & (4.0, 1.2, 1.2)\\                          \hline
test time augmentation                 & no\\ 
\hline
\end{tabular}
\end{table}


Loss function: we use the summation between Dice loss and cross-entropy loss because compound loss functions have been proven to be robust in various medical image segmentation tasks~\cite{LossOdyssey}. 

Regarding the training process, as mentioned before, the entire dataset suffers from severe class imbalance. Through statistical analysis of the annotated data, we propose a hierarchical training strategy, progressing from easy to difficult and gradually increasing the number of classes. This strategy is illustrated in Figure~\ref{fig:Network}. Firstly, we select dataset1, which consists of 222 examples annotated with class labels ranging from 1 to 13. With this set of data, we train a large nnU-Net, which we name big nnU-Net 1. In the second step, we further select dataset2, comprising 597 examples annotated with classes 1 to 4, 13, and 14 (a total of 6 classes). To supplement the missing 8 classes, we utilize the inference results from big nnU-Net 1 and then train another large nnU-Net model, called big nnU-Net 2. The third step involves using big nnU-Net 2 to infer the missing tumor annotations in dataset1, which is then combined with dataset2 to train big nnU-Net 3. For the fourth step, the remaining 1381 partially annotated data samples form dataset3. We use big nnU-Net 3 to infer the missing labels, obtain pseudo-labels, and mix them with the rest of the data to train big nnU-Net 4.


To fully utilize the remaining unlabeled data and strike a balance between inference speed and memory consumption, in the final stage of training, we employ the 1800 unlabeled examples, namely dataset4, to train the small nnU-Net. We did not use the pseudo labels generated by the FLARE22 winning algorithm~\cite{FLARE22-1st-Huang} or the best-accuracy algorithm~\cite{FLARE22-bestDSC-Wang}.

In order to improve inference speed and reduce resource consumption, on one hand, we have reduced the model complexity and the size of input patches. On the other hand, we have adopted the same sliding window strategy as described in~\cite{FLARE22-1st-Huang}.


\subsection{Post-processing}
During the post-processing stage, we experimented with connected component operations but found that they hardly improved the final results. As a result, we ultimately decided not to employ any post-processing operations.


\section{Experiments}
\subsection{Dataset and evaluation measures}
The FLARE 2023 challenge is an extension of the FLARE 2021-2022~\cite{MedIA-FLARE21,FLARE22}, aiming to promote the development of foundation models in abdominal disease analysis. The segmentation targets cover 13 organs and various abdominal lesions. The training dataset is curated from more than 30 medical centers under the license permission, including TCIA~\cite{TCIA}, LiTS~\cite{LiTS}, MSD~\cite{simpson2019MSD}, KiTS~\cite{KiTS,KiTSDataset}, and AbdomenCT-1K~\cite{AbdomenCT-1K}. The training set includes 4000 abdomen CT scans, of which 2200 have partial labels and 1800 have no labels. The validation and testing sets include 100 and 400 CT scans, respectively, which cover various abdominal cancer types, such as liver cancer, kidney cancer, pancreas cancer, colon cancer, gastric cancer, and so on. The organ annotation process used ITK-SNAP~\cite{ITKSNAP}, nnU-Net~\cite{nnUNet}, and MedSAM~\cite{MedSAM}.


The evaluation metrics encompass two accuracy measures—Dice Similarity Coefficient (DSC) and Normalized Surface Dice (NSD)—alongside two efficiency measures—running time and area under the GPU memory-time curve. These metrics collectively contribute to the ranking computation. Furthermore, the running time and GPU memory consumption are considered within tolerances of 15 seconds and 4 GB, respectively.


\subsection{Implementation details}
\subsubsection{Environment settings}
The development environments and requirements are presented in Table~\ref{table:env}. The training protocols of big nnU-Net and small nnU-Net are listed in Tables~\ref{table:training} and~\ref{table:training2nd}, respectively. We adopt data augmentation of additive brightness, gamma, rotation, scaling, and elastic deformation on the fly during training.


\begin{table}[!htbp]
\caption{Development environments and requirements.}\label{table:env}
\centering
\begin{tabular}{ll}
\hline
System       & Ubuntu 20.04.3 LTS \\
\hline
CPU   & AMD EPYC 7643 48-Core Processor@1.50GHz \\
\hline
RAM                         &504GB\\
\hline
GPU (number and type)                         & One NVIDIA A100 40G\\
\hline
CUDA version                  & 11.6\\                          \hline
Programming language                 & Python 3.8\\ 
\hline
Deep learning framework & torch 1.12 \\
\hline
\end{tabular}
\end{table}




\begin{table*}[!htbp]
\caption{Big nnU-Net training protocols.}
\label{table:training}
\begin{center}
% \resizebox{0.47\textwidth}{!}{
\begin{tabular}{ll} 
\hline
Network initialization         & ``He'' normal initialization \\
\hline
Batch size                    & 2 \\
\hline 
Patch size & 48$\times$224$\times$224  \\ 
\hline
Total epochs & 1000 \\
\hline
Optimizer          & SGD with Nesterov momentum ($\mu=0.99$)      \\ \hline
Initial learning rate (lr)  & 0.01 \\ \hline
Lr decay schedule & Poly learning rate policy \\
\hline
Training time                                           & 24 hours \\  \hline 
Loss function & Dice loss and cross entropy loss\\     \hline
Number of model parameters    & 82M\footnote{\url{https://github.com/sksq96/pytorch-summary}} \\ \hline
Number of flops & 776G\footnote{\url{https://github.com/facebookresearch/fvcore}} \\ \hline
CO$_2$eq & 34 Kg\footnote{\url{https://github.com/lfwa/carbontracker/}} \\  \hline
\end{tabular}
%}
\end{center}
\end{table*}


\begin{table*}[!htbp]
\caption{Training protocols for the small nnU-Net.}
\label{table:training2nd}
\begin{center}
% \resizebox{0.47\textwidth}{!}{
\begin{tabular}{ll} 
\hline
Network initialization         & ``He'' normal initialization\\
\hline
Batch size                    & 2 \\
\hline 
Patch size & 32$\times$128$\times$192  \\ 
\hline
Total epochs & 1000 \\
\hline
Optimizer          & SGD with Nesterov momentum ($\mu=0.99$)          \\ \hline
Initial learning rate (lr)  & 0.01 \\ \hline
Lr decay schedule & Poly learning rate policy \\
\hline
Training time                                           & 12 hours \\  \hline 
Number of model parameters    & 5.4M\footnote{\url{https://github.com/sksq96/pytorch-summary}} \\ \hline
Number of flops & 136G\footnote{\url{https://github.com/facebookresearch/fvcore}} \\ \hline
CO$_2$eq & 11 Kg\footnote{\url{https://github.com/lfwa/carbontracker/}} \\  \hline
\end{tabular}
\end{center}
\end{table*}


\section{Results and discussion}


\begin{table}[htbp]
\caption{Quantitative evaluation results on the public 50 validation cases and 100 online validation cases.
}
\centering
\label{table:evaluation}
\begin{tabular}{l|cc|cc|cc}
\hline
\multirow{2}{*}{Target} & \multicolumn{2}{c|}{Public Validation} & \multicolumn{2}{c|}{Online Validation} & \multicolumn{2}{c}{Testing} \\ \cline{2-7} 
                        & DSC(\%)            & NSD(\%)           & DSC(\%)            & NSD(\%)           & DSC(\%)      & NSD (\%)     \\ \hline
Liver      & 97.27$\pm$$0.48$    &   98.88$\pm$1.44   &   97.23   &     98.91   &            &              \\
Right Kidney  &   94.58$\pm$$7.79$    &  96.10$\pm$$7.34$  &   93.83   &    95.47    &              &              \\
Spleen  &     96.38$\pm$$1.07$     &    98.35$\pm$$2.35$ &     96.61     &        98.80    &              &              \\
Pancreas                & 85.50$\pm$$5.44$           &        97.00$\pm$$3.56$           &      84.37              &        96.22           &              &              \\
Aorta                   & 95.69$\pm$$1.25$                    & 98.88$\pm$$1.83$                   &95.80                    &98.85                   &              &              \\
Inferior vena cava      &93.97$\pm$$1.85$                    &96.70$\pm$$2.64$                   &93.77                    &96.16                   &              &              \\
Right adrenal gland     &81.89$\pm$$5.08$                    &95.04$\pm$$2.73$                   &80.84                    &94.46                   &              &              \\
Left adrenal gland      &78.82$\pm$$6.00$                    &93.18$\pm$$4.88$                   &78.35                    &92.43                   &              &              \\
Gallbladder             &79.34$\pm$$24.71$                    &78.00$\pm$$25.19$                   &79.79                    &78.12                   &              &              \\
Esophagus               &81.43$\pm$$14.71$                    &92.91$\pm$$14.67$                   &82.06                    &93.90                   &              &              \\
Stomach                 &91.93$\pm$$3.14$                    &96.97$\pm$$4.42$                   &92.46                    &97.48                   &              &              \\
Duodenum                &81.55$\pm$$7.19$                    &94.83$\pm$$5.32$                   &82.47                    &95.38                   &              &              \\
Left kidney             &93.37$\pm$$9.80$                    &94.18$\pm$$12.15$                   &93.41                    &94.93                   &              &              \\
Tumor                   &48.40$\pm$$34.35$                    &39.18$\pm$$30.50$                   &41.86                    &33.81                   &              &              \\ \hline
Average                   &85.72$\pm$$17.82$                    &90.73$\pm$$19.46$                   & 85.17                   &90.35                   &              &              \\ \hline
\end{tabular}
\end{table}


\subsection{Quantitative results on validation set}
After multiple rounds of iterative training and hierarchical learning, the big nnU-Net 4 model has achieved good performance in segmentation. The average Dice coefficient for organ segmentation is 0.895, and for tumor segmentation, it is 0.447. The tumor segmentation metric ranked ninth on the validation leaderboard. Figure~\ref{fig:ablation} presents a comparative analysis of the average Dice coefficient achieved by three models during the training process on the online validation dataset.

\begin{figure}[!htbp]
\centering
\includegraphics[scale=0.7]{imgs/ablation_study.png}
\caption{Comparison of different models on validation mean Dice metric.
}
\label{fig:ablation}
\end{figure}

As for the final small nnU-Net model, the average Dice coefficient for organ segmentation on the validation set is 0.885, and for tumor segmentation, it is 0.419. Although there is a slight decrease in segmentation performance, the inference time cost and GPU memory consumption have been significantly reduced. The quantitative evaluation results of the small nnU-Net on the validation set are shown in Table~\ref{table:evaluation}.


\subsection{Qualitative results on validation set}

Despite the reduced complexity of the small nnU-Net model, the inclusion of a large amount of unlabeled data in the training process allows the model to maintain good segmentation performance on various organs. Figure~\ref{fig:easy_seg} illustrates two well-segmented cases, demonstrating the model's ability to capture organ edges and details accurately. However, due to the decrease in resolution, the segmentation performance of the model is more noticeably affected on smaller anatomical structures, particularly tumors. Figure~\ref{fig:hard_seg} displays two cases where the segmentation results are less satisfactory.

\begin{figure}[!htbp]
\centering
\includegraphics[scale=0.5]{imgs/flare23-easy_cases.png}
\caption{Qualitative results of the small nnU-Net on two easy cases.
}
\label{fig:easy_seg}
\end{figure}

\begin{figure}[!htbp]
\centering
\includegraphics[scale=0.5]{imgs/flare23-hard_cases.png}
\caption{Qualitative results of the small nnU-Net on two hard cases.
}
\label{fig:hard_seg}
\end{figure}



\subsection{Segmentation efficiency results on validation set}

We build our small nnU-Net with an efficient inference strategy as a docker image for final submission. In Table~\ref{table:efficiency}, we report the efficiency evaluation results on the FLARE2023 organizer's computer server with GPU NVIDIA QUADRO RTX5000.

\begin{table*}[!htbp]
\caption{Efficiency evaluation results of our submitted docker. All metrics reported are the average values on 20 validation cases.}
\label{table:efficiency}
\begin{center}
\begin{tabular}{c | c | c | c | c } 
\hline
\textbf{Time} & \textbf{GPU Memory} & \textbf{AUC GPU Time} &\textbf{ CPU Utilization} & \textbf{AUC CPU Time}\\ 
\hline
18.5s & 2532MiB & 18466 & 63.2$\%$ & 357 \\
\hline
\end{tabular}
\end{center}
\end{table*}


\subsection{Results on final testing set}


\subsection{Limitation and future work}

Although our final model has shown promising performance in terms of inference speed and GPU memory consumption, there is still considerable room for improvement in its segmentation performance. In the future, we will explore more semi-supervised learning techniques, particularly deep learning methods based on auto-encoders. By extracting high-level semantic features from a large amount of data and transferring the learned feature descriptors to downstream segmentation tasks, we aim to enable the segmentation model to converge faster and achieve higher accuracy.



\section{Conclusion}
In this paper, we propose a semi-supervised training strategy based on nnU-Net. Specifically, we adopt a hierarchical learning approach to leverage both partially labeled and unlabeled data. We progressively train the model from easy to difficult samples. Additionally, to accelerate the model's inference speed, we reduce its complexity. We believe that our approach can provide valuable insights and inspiration for other researchers in this field.


\subsubsection{Acknowledgements} The authors of this paper declare that the segmentation method they implemented for participation in the FLARE 2023 challenge has not used any pre-trained models nor additional datasets other than those provided by the organizers. The proposed solution is fully automatic without any manual intervention. We thank all the data owners for making the CT scans publicly available and CodaLab~\cite{codalab} for hosting the challenge platform. 


%
% ---- Bibliography ----
%
% BibTeX users should specify bibliography style 'splncs04'.
% References will then be sorted and formatted in the correct style.
%
\bibliographystyle{splncs04}
\bibliography{ref}

\newpage
% Please add the following required packages to your document preamble:
% \usepackage[normalem]{ulem}
% \useunder{\uline}{\ul}{}
\begin{table}[!htbp]
\caption{Checklist Table. Please fill out this checklist table in the answer column.}
\centering
\begin{tabular}{ll}
\hline
Requirements                                                                                                                    & Answer        \\ \hline
A meaningful title                                                                                                              & Yes        \\ \hline
The number of authors ($\leq$6)                                                                                                             & 3        \\ \hline
Author affiliations, Email, and ORCID                                                                                           & Yes        \\ \hline
Corresponding author is marked                                                                                                  & Yes        \\ \hline
Validation scores are presented in the abstract                                                                                 & Yes        \\ \hline
\begin{tabular}[c]{@{}l@{}}Introduction includes at least three parts: \\ background, related work, and motivation\end{tabular} & Yes        \\ \hline
A pipeline/network figure is provided                                                                                           & 2 \\ \hline
Pre-processing                                                                                                                  & 3   \\ \hline
Strategies to use the partial label                                                                                             & 5   \\ \hline
Strategies to use the unlabeled images.                                                                                         & 5   \\ \hline
Strategies to improve model inference                                                                                           & 5   \\ \hline
Post-processing                                                                                                                 & 5   \\ \hline
Dataset and evaluation metric section is presented                                                                              & 6   \\ \hline
Environment setting table is provided                                                                                           & 2  \\ \hline
Training protocol table is provided                                                                                             & 4,5  \\ \hline
Ablation study                                                                                                                  & 8   \\ \hline
Visualized segmentation example is provided                                                                                     & 9,10 \\ \hline
Limitation and future work are presented                                                                                        & Yes        \\ \hline
Reference format is consistent.  & Yes        \\ \hline

\end{tabular}
\end{table}

\end{document}
