% This is samplepaper.tex, a sample chapter demonstrating the
% LLNCS macro package for Springer Computer Science proceedings;
% Version 2.21 of 2022/01/12
%
\documentclass[runningheads]{llncs}
%
\usepackage[T1]{fontenc}
% T1 fonts will be used to generate the final print and online PDFs,
% so please use T1 fonts in your manuscript whenever possible.
% Other font encodings may result in incorrect characters.
%

\usepackage{multirow}
\usepackage{graphicx}
% Used for displaying a sample figure. If possible, figure files should
% be included in EPS format.
%
% If you use the hyperref package, please uncomment the following two lines
% to display URLs in blue roman font according to Springer's eBook style:
%\usepackage{color}
%\renewcommand\UrlFont{\color{blue}\rmfamily}
\usepackage[pagebackref=true,breaklinks=true,colorlinks,bookmarks=false]{hyperref}
%
\begin{document}
%
\title{A Two-Step Deep Learning Approach for Abdominal Organ Segmentation}
%
%\titlerunning{Abbreviated paper title}
% If the paper title is too long for the running head, you can set
% an abbreviated paper title here
%
\author{Jianwei Gao\orcidID{0000-0002-6358-4117} \and
Juan Xu \and Honggao Fei}
%
\authorrunning{Gao et al.}
% First names are abbreviated in the running head.
% If there are more than two authors, 'et al.' is used.
%
\institute{Digital Health China Technologies Co., LTD, Beijing, China\\
\email{\{gaojw,xujuan,feihg\}@dchealth.com}}
%
\maketitle              % typeset the header of the contribution
%
\begin{abstract}
This paper presents a solution to the MICCAI FLARE23 challenge, which addresses the problem of segmenting 13 organs and tumors from CT scans, given 2200 CT scans with partial labels and 1800 CT scans without labels, while balancing model performance and resource consumption. To address these challenges, we propose a two-step segmentation approach that combines organ segmentation and tumor segmentation, achieving an average organ DSC of 0.8765 and an average organ NSD of 0.9299 on the online validation set.

\keywords{Abdominal organ segmentation \and Supervised learning \and nnU-Net.}
\end{abstract}

\section{Introduction}

Medical image segmentation plays a pivotal role in various clinical applications, enabling the accurate delineation and analysis of anatomical structures within medical images. However, achieving precise and efficient segmentation of abdominal organs poses significant challenges. Training an accurate model typically requires a large amount of labeled data, while manually annotating organs from CT scans is a time-consuming and labor-intensive process. Furthermore, abdominal organs may have complex morphological structures and heterogeneous lesions, which makes segmentation a more difficult task.

In recent years, deep learning has become the mainstream method for medical image analysis, demonstrating remarkable capabilities in automated organ segmentation tasks~\cite{Kart2022}. Specifically, the nnU-Net model~\cite{isensee2021nnu} has emerged as a powerful framework for achieving state-of-the-art results in medical image segmentation. nnU-Net combines the strengths of the U-Net architecture with advancements in neural network design and training strategies, allowing for improved accuracy and robustness.

Because the training data contain no cases in which all 14 classes are labeled, but do contain cases in which all 13 organ classes are labeled, in this paper we break the problem down into two tasks: organ segmentation and tumor segmentation. Therefore, we propose an approach that trains two nnU-Net models with labeled data, which are used to segment organs and tumors, respectively. A post-processing step merges the results of the two models at inference time.

\section{Method}

%###########################
\subsection{Preprocessing}
We use several pre-processing strategies as follows.
\begin{itemize} 
 \item Data selection and splitting

 We select the training cases that have labels for all 13 organs and the cases that have tumor labels, which yields 222 cases for organ segmentation and 735 cases for tumor segmentation. We then split each set into training and validation subsets with an 8:2 ratio.
 \item Cropping strategy
 
 We use the CT scans as the data source to generate the bounding box of foreground, and then crop only the foreground object of the images.
 \item Resampling method for anisotropic data
 
 We resample the original data to unify the voxel spacing into $[1.0, 1.0, 1.0]$.
  
 \item Intensity normalization method

We collect intensity values from the foreground classes (all but the background and ignore) from all training cases, and compute the mean, the standard deviation, and the 0.5 and 99.5 percentiles of the values. The intensities are then clipped to these percentiles, followed by subtraction of the mean and division by the standard deviation. The normalization that is applied is the same for each training case (for this input channel).

\end{itemize}

\subsection{Deep Network}
Figure~\ref{fig:Network} illustrates the applied 3D nnU-Net~\cite{isensee2021nnu}, where a 3D U-Net architecture is adopted. We use the leaky ReLU function with a negative slope of 0.01 as the activation function. Our first 3D nnU-Net has 14 output channels, corresponding to the background and 13 organs, while our second 3D nnU-Net has 2 output channels, corresponding to the background and the tumor. Only cases with all 13 organ labels and cases with tumor labels are used for training; the remaining partially labeled cases are discarded. Unlabeled images are not used.

\begin{figure}[htbp]
\centering
\includegraphics[scale=0.3]{imgs/unet.png}
\caption{Our 3D U-Net architecture}
\label{fig:Network}
\end{figure}


We use the sum of Dice loss (after applying a softmax function) and Cross Entropy Loss as the loss function.

When predicting a single image with the trained segmentation model, we first resample it to a voxel spacing of $[1.0, 1.0, 1.0]$, as we did during training, and then run inference.


\subsection{Post-processing}
During model prediction, for each voxel we select the organ label (from 0 to 13) corresponding to the largest of the 14 outputs of the organ model. After that, we combine the predictions of the two models to get the final result. When a voxel is predicted as both tumor and organ, it is considered to be tumor.

\section{Experiments}
\subsection{Dataset and evaluation measures}
The FLARE 2023 challenge is an extension of the FLARE 2021--2022 challenges~\cite{MedIA-FLARE21,FLARE22}, aiming to promote the development of foundation models in abdominal disease analysis. The segmentation targets cover 13 organs and various abdominal lesions. The training dataset is curated from more than 30 medical centers under the license permission, including TCIA~\cite{TCIA}, LiTS~\cite{LiTS}, MSD~\cite{simpson2019MSD}, KiTS~\cite{KiTS,KiTSDataset}, autoPET~\cite{autoPET-Data,autoPET-MICCAI22}, TotalSegmentator~\cite{TotalSegmentator}, and AbdomenCT-1K~\cite{AbdomenCT-1K}. The training set includes 4000 abdomen CT scans, of which 2200 have partial labels and 1800 have no labels. The validation and testing sets include 100 and 400 CT scans, respectively, which cover various abdominal cancer types, such as liver cancer, kidney cancer, pancreas cancer, colon cancer, gastric cancer, and so on. The organ annotation process used ITK-SNAP~\cite{ITKSNAP}, nnU-Net~\cite{nnUNet}, and MedSAM~\cite{MedSAM}.

The evaluation metrics encompass two accuracy measures—Dice Similarity Coefficient (DSC) and Normalized Surface Dice (NSD)—alongside two efficiency measures—running time and area under the GPU memory-time curve. These metrics collectively contribute to the ranking computation. Furthermore, the running time and GPU memory consumption are considered within tolerances of 15 seconds and 4 GB, respectively.

\subsection{Implementation details}
\subsubsection{Environment settings}
The development environments and requirements are presented in Table~\ref{table:env}.

\begin{table}[!htbp]
\caption{Development environments and requirements.}\label{table:env}
\centering
\begin{tabular}{ll}
\hline
Windows/Ubuntu version       & Ubuntu 20.04.4 LTS\\
\hline
CPU   & Intel(R) Xeon(R) Gold 5218R CPU @ 2.10GHz \\
\hline
RAM                         &128G\\
\hline
GPU (number and type)                         & NVIDIA Tesla T4 (16G)\\
\hline
CUDA version                  & 11.6\\                          \hline
Programming language                 & Python 3.9\\ 
\hline
Deep learning framework & Pytorch (Torch 1.13.1, torchvision 0.14.1) \\
\hline
Specific dependencies         & numpy 1.25.2, SimpleITK 2.2.1, nnunetv2 2.1, nibabel 5.1.0 \\  
\hline
\end{tabular}
\end{table}


\subsubsection{Training protocols}
The data augmentation, patch sampling, and model selection strategies used during training are described below.

Random flipping strategy (only for initial training stage): each image has a 20$\%$ probability of flipping along the x-axis and a 20$\%$ probability of flipping along the y-axis.

Random Gaussian smooth (only for initial training stage): each image has a 10$\%$ probability of being Gaussian smoothed with sigma in (0.5, 1.15) for every spatial dimension.

Random Gaussian noise (only for initial training stage): each image has a 20$\%$ probability of being added with Gaussian noise with mean in (0, 0.5) and standard deviation in (0, 1).

Random intensity change (only for initial training stage): each image has a 10$\%$ probability of changing intensity with gamma in (0.5, 2.5).

Random intensity shift (only for initial training stage): each image has a 10$\%$ probability of shifting intensity with offsets in (0, 0.3).

Patch sampling strategy: 2 patches of size $[128, 128, 128]$ are randomly cropped from each image. The center of each patch has 50$\%$ probability in the foreground and 50$\%$ probability in the background.

Optimal model selection criteria: we tried several different training protocols and selected the model with the highest DSC on the validation set.

Some details of the initial training stage and the fine-tuning stage are shown in Table~\ref{table:initial} and Table~\ref{table:fine-tuning} respectively.

\begin{table*}[!htbp]
\caption{Training protocols (initial training stage).}
\label{table:initial}
\begin{center}
% \resizebox{0.47\textwidth}{!}{
\begin{tabular}{ll} 
\hline
Network initialization         & ``He'' normal initialization\\
\hline
Batch size                    & 2 \\
\hline 
Patch size & 128$\times$128$\times$128  \\ 
\hline
Total epochs & 1000 \\
\hline
Optimizer          & Adam          \\ \hline
Initial learning rate (lr)  & 0.0001 \\ \hline
Lr decay schedule & initial learning rate$\times(1-\mathit{epoch}/500)^{0.9}$ \\
\hline
Training time                                           & 20 hours \\  \hline 
Loss function & the sum of dice loss and cross entropy loss \\  \hline 
Number of model parameters    & 31.42M \\ \hline
\end{tabular}
%}
\end{center}
\end{table*}

\begin{table*}[!htbp]
\caption{Training protocols (fine-tuning stage).}
\label{table:fine-tuning}
\begin{center}
% \resizebox{0.47\textwidth}{!}{
\begin{tabular}{ll} 
\hline
Network initialization         & model after initial training\\
\hline
Batch size                    & 2 \\
\hline 
Patch size & 128$\times$128$\times$128  \\ 
\hline
Total epochs & 40 \\
\hline
Optimizer          & Adam          \\ \hline
Initial learning rate (lr)  & 0.00005\\ \hline
Lr decay schedule & initial learning rate$\times(1-\mathit{epoch}/500)^{0.9}$ \\\hline
Training time                                           & 39 hours \\  \hline 
Loss function & the sum of dice loss and cross entropy loss \\  \hline 
Number of model parameters    & 31.42M \\ \hline
\end{tabular}
\end{center}
\end{table*}


\section{Results and discussion}
\subsection{Quantitative results on validation set}

DSC and NSD results on the validation set are shown in Table~\ref{table:validation_set}. It can be seen from the table that the aorta and liver have the best performance, while the tumor has the worst performance. A possible reason is that the aorta and liver are larger organs with less morphological variation. On the contrary, tumors are relatively small and have more complex morphological structures, because all kinds of cancer types are included.
\begin{table}[htbp]
\caption{Results on validation set.}
\label{table:validation_set}
\begin{center}
% \resizebox{0.47\textwidth}{!}{
\begin{tabular}{l|cc|cc|cc}
\hline
\multirow{2}{*}{Target} & \multicolumn{2}{c|}{Public Validation} & \multicolumn{2}{c|}{Online Validation} & \multicolumn{2}{c}{Testing} \\ 
\cline{2-7} 
                        & DSC            & NSD           & DSC            & NSD           & DSC      & NSD     \\ 
\hline
\hline
Liver                  & 0.9625&0.9775& 0.9624&0.9761&  \\ 
\hline
RK                  & 0.9225&0.9216 &  0.9192 &0.9271&\\ 
\hline
Spleen                   & 0.9497&0.9524 &  0.9434&0.9510& \\ 
\hline
Pancreas                  & 0.8448&0.9615 &  0.8326&0.9544& \\ 
\hline
Aorta                   & 0.9538& 0.9774&  0.9577&0.9816& \\ 
\hline
IVC                  & 0.9425& 0.9613&  0.9397&0.9566& \\ 
\hline
RAG                   & 0.8230&0.9518 &  0.8160&0.9467& \\ 
\hline
LAG                  & 0.8046&0.9430 & 0.8042 &0.9327& \\ 
\hline
Gallbladder                   & 0.7735&0.7510 &  0.7785&0.7553& \\ 
\hline
Esophagus                & 0.8031&0.9075 &  0.8186&0.9287& \\ 
\hline
Stomach                  & 0.8956&0.9364 &  0.9107&0.9461& \\ 
\hline
Duodenum                  & 0.8134&0.9402 &  0.8194&0.9435& \\ 
\hline
LK                 & 0.8688&0.8673 &  0.8920&0.8892& \\ 
\hline
Tumor                 & 0.1173&0.0736 &  0.1530&0.0865& \\ 
\hline
\end{tabular}
\end{center}
\end{table}

\begin{table}[htbp]
\caption{Quantitative evaluation of segmentation efficiency in terms of the running time and GPU memory consumption. Total GPU denotes the area under GPU Memory-Time curve.}
\centering
\begin{tabular}{ccccc}
\hline
Case ID & Image Size      & Running Time (s) & Max GPU (MB) & Total GPU (MB) \\ \hline
0001    & (512, 512, 55)  &  141.13                & 3746   & 30428 MB   \\
0051    & (512, 512, 100) &  143.65                &  3806            & 14327 MB               \\
0017    & (512, 512, 150) &  121.93                &  3590            & 24825 MB               \\
0019    & (512, 512, 215) & 100.79            &        3323      &       14863 MB         \\
0099    & (512, 512, 334) &      73.7            &      3374        &        11619 MB        \\
0063    & (512, 512, 448) &      72.57            &       3370       &         10082 MB       \\
0048    & (512, 512, 499) &    70.87              &       3316       &       10466 MB         \\
0029    & (512, 512, 554) &       74.38           &         3382     &      11035 MB          \\ \hline
\end{tabular}
\end{table}


\subsection{Qualitative results on validation set}

Two examples of good segmentation are shown in Figure~\ref{fig:goodcases} and two examples of bad segmentation are shown in Figure~\ref{fig:badcases}. Visualization is achieved with ITK-SNAP~\cite{py06nimg} version 3.6.0.

From the perspective of images, some potential reasons for the bad-segmentation cases are listed below. 

(1) The case is very large, so we have to reduce its size by cutting off the top and bottom slices in order to process it within 60 seconds.

(2) The case is not clear, distorted, or skewed.

(3) There are rare structures in the case that are not in the training set.

\begin{figure}[htbp]
\centering
\includegraphics[scale=0.3]{imgs/good cases.png}
\caption{Good segmentation examples}
\label{fig:goodcases}
\end{figure}

\begin{figure}[htbp]
\centering
\includegraphics[scale=0.3776]{imgs/bad cases.png}
\caption{Bad segmentation examples}
\label{fig:badcases}
\end{figure}

\subsection{Segmentation efficiency results on validation set}

\subsection{Results on final testing set}


\subsection{Limitation and future work}

In terms of model accuracy, first, we do not use pseudo-labels for the unlabeled images at present. In the future, we are going to use pseudo-labels for the unlabeled images. Second, we will consider using some post-processing methods, such as largest connected component extraction, hole filling, and morphological opening and closing, which are not used at present.

In terms of efficiency, we aim to reduce the GPU memory consumption. In addition, we consider using some optimization methods to improve the running speed of the model in the future.

\section{Conclusion}
In this paper, we have explored the application of the nnU-Net model for FLARE23 abdominal organ segmentation. Leveraging the power of deep learning and the architectural advancements of nnU-Net, we have demonstrated its effectiveness in accurately segmenting organs and tumors in diverse abdominal CT scans.


\subsubsection{Acknowledgements} The authors of this paper declare that the segmentation method they implemented for participation in the FLARE 2023 challenge has not used any pre-trained models nor additional datasets other than those provided by the organizers. The proposed solution is fully automatic without any manual intervention.

%
% ---- Bibliography ----
%
% BibTeX users should specify bibliography style 'splncs04'.
% References will then be sorted and formatted in the correct style.
%
\bibliographystyle{splncs04}
\bibliography{ref}
\begin{table}[!htbp]
\caption{Checklist Table. Please fill out this checklist table in the answer column.}
\centering
\begin{tabular}{ll}
\hline
Requirements                                                                                                                    & Answer        \\ \hline
A meaningful title                                                                                                              & Yes       \\ \hline
The number of authors ($\leq$6)                                                                                                             & 3        \\ \hline
Author affiliations and ORCID                                                                                           & Yes        \\ \hline
Corresponding author email is presented                                                                                                  & Yes        \\ \hline
Validation scores are presented in the abstract                                                                                 & Yes        \\ \hline
\begin{tabular}[c]{@{}l@{}}Introduction includes at least three parts: \\ background, related work, and motivation\end{tabular} & Yes        \\ \hline
A pipeline/network figure is provided                                                                                           & Figure 1 \\ \hline
Pre-processing                                                                                                                  & Page 2   \\ \hline
Strategies to use the partial label                                                                                             & Page 2   \\ \hline
Strategies to use the unlabeled images.                                                                                         & Page 2   \\ \hline
Strategies to improve model inference                                                                                           & Page 2   \\ \hline
Post-processing                                                                                                                 & Page 3   \\ \hline
Dataset and evaluation metric section is presented                                                                              & Page 3   \\ \hline
Environment setting table is provided                                                                                           & Table 1  \\ \hline
Training protocol table is provided                                                                                             & Table 2  \\ \hline
Ablation study                                                                                                                  & Page 6   \\ \hline
Efficiency evaluation results are provided                                                                                     & Table 5 \\ \hline
Visualized segmentation example is provided                                                                                     & Figure 2/3 \\ \hline
Limitation and future work are presented                                                                                        & Yes        \\ \hline
Reference format is consistent.  & Yes    \\ \hline

\end{tabular}
\end{table}
\end{document}
