% This is samplepaper.tex, a sample chapter demonstrating the
% LLNCS macro package for Springer Computer Science proceedings;
% Version 2.21 of 2022/01/12
%
\documentclass[runningheads]{llncs}
%
\usepackage[T1]{fontenc}
% T1 fonts will be used to generate the final print and online PDFs,
% so please use T1 fonts in your manuscript whenever possible.
% Other font encodings may result in incorrect characters.
%
\usepackage{multirow}
\usepackage{graphicx}
% Used for displaying a sample figure. If possible, figure files should
% be included in EPS format.
%
% If you use the hyperref package, please uncomment the following two lines
% to display URLs in blue roman font according to Springer's eBook style:
%\usepackage{color}
%\renewcommand\UrlFont{\color{blue}\rmfamily}
\usepackage[pagebackref=true,breaklinks=true,colorlinks,bookmarks=false]{hyperref}
%
\usepackage{orcidlink}
\usepackage{booktabs}
\newcommand{\orcid}[1]{\href{https://orcid.org/#1}{\textcolor[HTML]{A6CE39}{\aiOrcid}}}
\begin{document}
%
% Efficient cascade network for partially supervised Abdominal pan-cancer segmentation
\title{Two-stage training for abdominal pan-cancer segmentation in weak label}
%
%\titlerunning{Abbreviated paper title}
% If the paper title is too long for the running head, you can set
% an abbreviated paper title here
%
\author{Hanwen Zhang\inst{1,2,\orcidlink{0000-0002-2440-1023}} \and
Yongzhi Huang\inst{1,2,3,\orcidlink{0000-0002-5545-5307}} \and
Bingding Huang\inst{1,*,\orcidlink{0000-0002-4748-2882}}
} 
%
\authorrunning{Zhang et al.}
% First names are abbreviated in the running head.
% If there are more than two authors, 'et al.' is used.
%
\institute{
College of Big Data and Internet, Shenzhen Technology University, Shenzhen, 518188, China\\
\and 
College of Applied Sciences, Shenzhen University, Shenzhen, 518060, China\\
\and
School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, 100876, China \\
\email{}}
%
\maketitle              % typeset the header of the contribution
%
\begin{abstract}
Constructing comprehensive labeled datasets for medical image segmentation tasks is time-consuming, requiring intensive masks annotated carefully by experienced radiologists. Existing benchmark datasets provide the necessary masks to train the supervised-based segmentation models, including single-organ datasets and multiple-organ datasets. However, it is still challenging when deploying large-scale models with a union of multiple datasets due to annotation conflicts. For example, some organ or tumor annotations are missing in most cases (weak label) in the FLARE23 challenge dataset.
%Constructing a well-labeled dataset is difficult.
To overcome the limitation of segmentation models in this situation, we propose a two-stage training method to train an efficient segmentation model with weak labels. In the first stage, only strong labels (complete organ labels) are used to train an nnU-Net model, while the missing annotations in the weak labels (incomplete organ labels) are filled in with pseudo labels generated by this nnU-Net. Then the lightweight coarse-to-fine network is trained using the supplemented data in the second stage.
Experiments on the FLARE23 challenge (\href{https://codalab.lisn.upsaclay.fr/competitions/12239}{MICCAI FLARE23}) demonstrate that coarse-to-fine networks can reduce computational complexity and resource consumption during the inference stage while maintaining high performance, in the case of pseudo labeled supplementary data. With a speed of 12.6 seconds per case, our proposed method achieves an average DSC of 0.8920 and an average NSD of 0.9482 on the FLARE23 validation set. 

\keywords{Weak label  \and Pseudo label \and Two-stage training.}
\end{abstract}

\footnotetext[1]{*Corresponding author: Bingding Huang (huangbingding@sztu.edu.cn).}

\section{Introduction}

% 1. Background & Representative works
Abdominal organ segmentation is a crucial step in the clinical diagnosis of abdominal diseases. Deep learning-based segmentation methods have demonstrated the ability to efficiently and accurately identify organ boundaries, sizes, and locations, aiding doctors in rapidly identifying potential lesions and disease areas~\cite{shen2017deep}. The family of U-Net \cite{U-Net} architectures is the most mainstream in deep supervised learning methods for medical image segmentation tasks. Subsequently, various CNN-based segmentation networks based on the U-Net architecture emerged, such as ResU-Net \cite{xiao2018weighted} and U-Net++ \cite{zhou2018unet++}. Meanwhile, the transformer-based models are also naturally compatible with U-Net architecture, and excellent networks such as Trans U-Net \cite{chen2021transunet}, Swin U-Net \cite{cao2022swin}, and so on have emerged for medical image segmentation. Additionally, some works focus on improving segmentation performance by using multi-view, multi-task, and multi-scale techniques, trying complex data augmentation methods, or other tricks like multi-level feature fusion and deep supervision. The most representative framework is nnU-Net \cite{isensee2021nnu}, which is a milestone work that achieves SOTA performance using U-Net architecture with a series of heuristic rules that can deploy and train segmentation models on any dataset automatically, demonstrating the high adaptability and robustness of its framework.

% 2. Supervised-based learning has limitations. -> Partially labeled learning
However, even such a comprehensive framework, nnU-Net, cannot be used directly for annotations with different labels in multiple datasets, which is caused by the problem of annotation conflicts. Fig.~\ref{fig:part} gives a specific example to illustrate this problem. Specifically, weak label case (2) contains only tumor, case (3) contains tumor and some organs, and case (4) includes all organs without tumor. Therefore, some organs are incorrectly annotated as background, and the annotations of the same structure conflict across cases. Although partly labeled data has additional annotation information and also inherits semantic information like unlabeled data, due to annotation conflicts, the performance of models trained by multiple datasets will probably not improve and may even degrade compared with models using a single dataset.

% The labeling cost of medical image organ segmentation is very high, resulting in the current benchmark generally containing only a single target organ like MSD, or plus tumor information like LiTS, KiTS, and NIH Pancreas. Despite containing label information of multi-organ (BTCV, FLARE22, and WORD), there are only a few dozen scans.

% Training one general model over multiple datasets will make the features more robust and accurate.  However, in most cases, different datasets have different annotations. 


% 3. Details for Others' Attempts.

To address this issue, many attempts have been made to explore multiple weak label datasets in a more efficient manner. Fang {et al.} proposed a new network named Pyramid Input Pyramid Output Feature Abstraction Network (PIPO-FAN) using multi-scale features to exploit weak label proportion information \cite{fang2020multiorgan}. Enlightened by multi-branch networks and dynamic filter learning, Zhang {et al.} considered multiple datasets as independent tasks and designed a single shared model, a dynamic on-demand network (DoDNet), receiving task-specific signals to avoid label conflicts \cite{zhang2021dodnet}. A similar approach is conditional nnU-Net proposed by Zhang {et al.} \cite{zhang2021multiorgan}, which also used special signals to control segmentation models dynamically. Different from the design of segmentation architectures, some works tried to reconsider the point of loss functions to solve label conflicts. For instance, Shi {et al.} proposed marginal loss and exclusion loss for weak label supervised multi-organ segmentation \cite{shi2021marginal}. Furthermore, Liu {et al.} merged weak labeled datasets using incremental learning methods, introducing a light memory module mechanism based on marginal loss and exclusion loss to further improve and stabilize the model performance with continuously incremental datasets \cite{liu2022learning}. These methods fully used weak label datasets, enabling the deployment of a comprehensive segmentation model trained by multiple datasets simultaneously.

% 4. FLARE23 background & Our contributions

In the FLARE23 challenge, the dataset consists of labeled, weakly labeled, and unlabeled CT image data. As shown in Fig.~\ref{fig:data}, only 222 images have complete annotations for all organs, and the remaining 1978 cases only have annotations for specific organs. To achieve higher segmentation performance than baseline supervised learning methods, fully utilizing unlabeled data and resolving annotation conflicts caused by weak label data is a key breakthrough in this competition. To this end, we attempt to merge weak labeled data with completely labeled data and propose an efficient strategy that breaks down the barriers between weak label datasets, even when their annotations conflict or overlap, and further alleviates the problem of developing vanilla segmentation methods that combine several different benchmark datasets. We also follow the trend of the FLARE competition series, and pay attention to optimizing the resource consumption and speed in the inference phase. Based on the experience of the FLARE22 challenge \cite{ma2023fast} (\href{https://flare22.grand-challenge.org}{2022-MICCAI-FLARE}), using either the nnU-Net \cite{isensee2021nnu} adaptive framework or the EfficientSeg \cite{zhang2021efficient} coarse-to-fine framework combined with a semi-supervised algorithm can effectively handle unlabeled data. We will use the two networks mentioned above to design a training framework that can use weak labels to address the abovementioned challenges.

% figure
\begin{figure}[!htb]
    \centering
    \includegraphics[scale=0.27]{imgs/part.png}
    \caption{Image (1) shows a CT without any annotation. Images (2), (3), and (4) show the weak label, where (2) has only tumor labeling, (3) contains tumor labeling and labeling of some organs, and (4) includes labeling all organs without tumors.}
    \label{fig:part}
\end{figure}

\section{Method}

% 1. Framework
To address the challenges posed by weak labels and imbalanced data in abdominal organ segmentation, we propose a novel training framework that utilizes statistical analysis to divide the data into different categories.

The main objective of our approach is to select relatively well-annotated strong labels from weak labels for the first round of training. We then use the model obtained from the first round of training to supplement the weak label data according to specific rules, enabling iterative training to obtain the final model.

In Section~\ref{Proposed-method}, we provide further details on our proposed approach, including the specific rules used to supplement the weak label data and the iterative training process. Our approach leverages the strengths of both the nnU-Net adaptive framework and the two-stage EfficientSeg framework, combined with semi-supervised learning algorithms, to improve the accuracy and efficiency of abdominal organ segmentation.
\subsection{Preprocessing} \label{pre-process}
% \subsubsection{Training}
Our proposed approach leverages the strengths of two networks, nnU-Net and EfficientSeg, each with its own preprocessing techniques.

% overwritten by yhuang
nnU-Net provides a self-configuration pre-training pipeline depending on statistics information in specific datasets. To ensure the high performance of nnU-Net, we utilized this automatic preprocessing method for the FLARE23 dataset, including adjusting the target spacing and then resampling, voxel intensity normalization, and data augmentation techniques.
% For nnU-Net, the network utilizes adaptive statistics and preprocessing to adjust the spacing, crop size, image intensity, and data augmentation techniques for each CT image based on its statistical data. This enables the network to effectively handle variations in image quality and reduce the impact of noise and artifacts.

% overwritten by yhuang
% check!! normalized (z-scoring / to a range of [-1, 1])?
% Which one is correct?
As for EfficientSeg, the network is a two-stage segmentation network that accepts an interpolated overall image as input, eliminating the need to adjust the image spacing. During the coarse segmentation stage, the image is interpolated and scaled to a size of [160, 160, 160]. During the fine segmentation stage, images are cropped so that only foreground regions remain and then padded to a size of [192, 192, 192] before being interpolated and scaled. The foreground information in the training process is provided by ground truths, while the one in the inference process is from masks generated from the coarse segmentation stage. The image intensity is clipped to a range of [-325, 325]. Additionally, a series of data augmentations are used in the fine segmentation stage, shown in Table \ref{table:aug}.

% For EfficientSeg, the network is a two-stage segmentation network that accepts an interpolated overall image as input, eliminating the need to adjust the image spacing. During the coarse segmentation stage, the image is interpolated and scaled to a size of [160, 160, 160]. During the fine segmentation stage, the training and inference processes differ. During training, the image is cropped according to the label's foreground and then padded to a size of [192, 192, 192] before being interpolated and scaled. During inference, the image is cropped based on the coarse segmentation results, rather than the foreground label. The image intensity is cropped to a range of [-325, 325] and then normalized, enabling the network to effectively capture the complex features of abdominal organs and cancerous regions.

% In the fine segmentation stage, a large number of data augmentation techniques were used to improve the model's generalization performance. The specific details of the data augmentation are shown in Table \ref{table:aug}.
% \vspace{-0.5cm}
\begin{table}[!htbp]
\setlength{\abovecaptionskip}{0.2cm}
\setlength{\belowcaptionskip}{0.2cm}
% \vspace{-0.3cm}
\caption{Data augmentation details in the fine segmentation stage.}
\label{table:aug}
\centering
\begin{tabular}{ll}
\hline
RandFlipd-x       & prob=0.5\\
\hline
RandFlipd-y   & prob=0.5 \\
\hline
RandFlipd-z         & prob=0.5\\
\hline
RandZoomd                         &  min-zoom=0.9, max-zoom=1.2, prob=0.15\\
\hline
RandGaussianNoised                  & std=0.01, prob=0.15\\                          \hline
RandGaussianSmoothd                 & sigma=(0.5, 1.15), prob=0.15\\ 
\hline
RandScaleIntensityd & factors=0.3, prob=0.15 \\
\hline
RandAdjustContrastd & prob=0.15 \\
\hline
\end{tabular}
% \vspace{-1cm}
\end{table}

\subsection{Proposed method}\label{Proposed-method}
As shown in Fig.~\ref{fig:data}, statistical analysis is conducted on 2200 annotated data samples in this dataset, revealing a ubiquitous lack or omission of organ or tumor segmentation. To address the challenges of weak labels and imbalanced data, we further analyze the distribution of annotations and propose a framework that can effectively train segmentation models with weak labels. It is worth mentioning that none of the unlabeled images are used in our proposed method.
 
After checking category information in annotations, we found that annotations with a single category (excluding background) were mainly for the pan-cancer region segmentation. In contrast, annotations with thirteen categories mainly include regions of abdomen organs. Therefore, we split the dataset into two categories: cases with complete organ annotations (strong label) and cases with partial organ annotations (weak label). 
 
Based on the condition of the FLARE23 dataset, our motivation is to distill knowledge from cases with strong labels, then use it to guide models to segment organs annotated wrongly as background in the weak label, and finally re-train the segmentation model with the whole annotated data. Specifically, our proposed framework consists of three stages: strong label training, weak label supplement, and retraining, as shown in Fig.~\ref{fig:frame}. Each stage's network architecture is configured separately based on specific objectives and requirements. First, the strong label training stage automatically applies the self-configured framework nnU-Net to learn from the well-annotated strong label data. Second, the weak label supplement stage uses the nnU-Net model trained on the strong label data to generate pseudo labels that fill in the missing organ annotations of the weak label data. Third, the retraining stage trains the coarse-to-fine EfficientSeg network on all the supplemented label data to obtain the final inference model.

% The analysis revealed that all 2200 CT scans were annotated with incomplete weak labels due to the high cost of annotation and the difficulty of obtaining complete annotations for the pan-cancer region.
% By extracting strong organ labels from the weak label annotations, we could design a segmentation framework for weak label training.

% figure
\begin{figure}[!htb]
    \centering
    \includegraphics[scale=0.3]{imgs/data.png}
    \caption{Distribution of label counts: one important finding from our statistical analysis of the 2200 annotated data samples was that 888 of them contained only one label, which was mainly for pan-cancer region segmentation. On the other hand, the 222 samples that contained thirteen labels were primarily used for abdominal multi-organ segmentation.}
    \label{fig:data}
\end{figure}




% figure
\begin{figure}[!htb]
    \centering
    \includegraphics[scale=0.45]{imgs/frame_2.png}
    \caption{This framework consists of three parts. Strong label training: Strong labels are selected from weak labels to be trained individually using nnU-Net. Weak label supplement: The remaining weak label is complemented using nnU-Net-generated labels. Retraining: The coarse-to-fine EfficientSegNet is trained using all the supplement labels to obtain the inference model.}
    \label{fig:frame}
\end{figure}



\subsubsection{Strong label training}
%overwritten by yhuang
In this stage, all annotated training data is split into two parts: weak label data and strong label data. Weak label data are not used due to annotation conflicts caused by missing organ annotations, which result in degraded performance or even failure to converge during training. To solve this problem, the strong label data are selected to train nnU-Net as a teacher model that can generate credible pseudo labels for complementing annotations on missing organs. In detail, we consider the 222 cases of strong label data as an independent training set and train a segmentation model with the default nnU-Net 3D configuration.

% During the strong label training stage of our proposed framework, we selected CT scans with labels containing the segmentation of thirteen abdominal organs as the training set. However, the weak label data in the training set suffered from significant incompleteness due to the large number of missing organ annotations. Many labels only included annotations for large-volume organs such as the liver and kidneys. We proposed providing pseudo-labels for the missing annotations during the weak label supplement stage to address this issue.

% To generate high-quality pseudo-labels, we trained a pseudo-label generation network on CT scans with complete organ annotations. We used a common network that predicts gradually by sliding windows, as this prediction method typically provides higher performance and greatly influences the generation of pseudo-labels. Due to its high performance and high robustness, nnU-Net is used for pseudo-label generation networks.


\subsubsection{Weak label supplement} \label{2.2.2}

In the weak label supplement stage, we aim to utilize the nnU-Net model trained on strong label data to complement the missing annotations for organ regions. First, all cases with weak labels are inferred by nnU-Net to generate pseudo labels. We take a redundancy inference mode to obtain accurate pseudo labels, including the Test-Time Augmentation (TTA) method and connected component analysis. It is worth noting that the tumor category is not involved in the above step of pseudo-label generation. Due to poor performance and significant uncertainty in the tumor region, only 13 organ categories are predicted to complement weak labels. 

% In the weak label supplement stage of our proposed framework, we utilize the trained pseudo-label generation network to predict all weak label data, except for the cancer region. This is because cancer region segmentation prediction is challenging, and the final model Dice score is typically low, indicating significant uncertainty in the model's predictions for the cancer region. Additionally, whether CT without labeled pan-cancer areas contains tumors is uncertain. Generating pseudo-labels for the tumors is an inefficient and unreliable approach.

% Therefore, the pseudo-label generation network generates only the thirteen organ labels to supplement the weak labels. We use the Test-Time Augmentation (TTA) method and nnU-Net's adaptive maximum connected region processing method during the pseudo-label generation process to improve the accuracy of the generated labels. Specifically, we apply TTA by randomly rotating and flipping the input CT scans to generate multiple predictions, and then combine them to obtain a more robust and accurate final prediction.

Second, we replace the foreground region wrongly annotated as background in each weak label following a criterion: retaining original foreground annotations in weak labels. This motivation is based on a belief that original foreground annotations have higher accuracy than predicted pseudo labels. In detail, we process each foreground category separately. The specific rules are as follows: For each foreground category in the pseudo label, if this category appears in the weak label, then the pseudo label for this category will be discarded; if the category never appears, the corresponding background region in the weak label will be replaced with this category.

% We only fill in missing organ annotations for each CT scan using the generated organ pseudo-labels. Since the generated organ pseudo-labels have high Dice scores (usually around 0.88), we directly use them as labels for label supplement. We discard pseudo-labels that conflict with existing annotations to ensure the final labels are consistent and accurate.


\subsubsection{Retraining} 
At this stage, the two-stage EfficientSeg is used for retraining. All annotations used in this stage come from the 2200 supplemented cases, i.e., the strong label data combined with the supplemented weak label data.

The coarse segmentation stage roughly locates the foreground region in the original image, which guides the foreground cropping for the fine segmentation stage. During the coarse segmentation training, 2200 supplement label data were used in training. Then, the fine segmentation stage further refined segmentation masks cropped from the coarse stage. During the fine segmentation training stage, we utilized supplement labels to locate the foreground as input. By utilizing the supplement labels for fine segmentation training, we achieved significantly improved segmentation accuracy and robustness in EfficientSeg.

\subsubsection{Inference speed and resources consumption trade-offs} 
We use a coarse-to-fine segmentation network in the inference stage to optimize the inference speed and resource usage and to avoid using a time-consuming sliding window technique. Any size image can be segmented through two inference stages by using the coarse-to-fine network. Therefore, the inference speed is improved significantly compared with one-stage segmentation models with the sliding window technique. Following the EfficientSeg implementation, anisotropic convolution, anisotropic pooling, and FP16 are also used to reduce GPU memory usage, which is discussed in detail in \cite{zhang2021efficient}.
% We use a coarse-to-fine segmentation network in the inference phase to optimize the inference speed and resource usage instead of using a sliding window to predict the whole image. The coarse-to-fine inference network only needs to perform inference twice, significantly saving time. Also, anisotropic convolution, anisotropic pooling, and FP16 reduce GPU memory usage. The details are the same as the EfficientSeg implementation.

\subsection{Post-processing}
We employed TTA to improve the final segmentation results during the strong label training stage. Additionally, final segmentation will adaptively keep the largest connected region to reduce false positives. Meanwhile, the coarse and fine segmentation results are also refined by the connected region analysis.

\section{Experiments}
\subsection{Dataset and evaluation measures}
The FLARE 2023 challenge is an extension of the FLARE 2021--2022~\cite{MedIA-FLARE21,FLARE22}, aiming to promote the development of foundation models in abdominal disease analysis. The segmentation targets cover 13 organs and various abdominal lesions. The training dataset is curated from more than 30 medical centers under the license permission, including TCIA~\cite{TCIA}, LiTS~\cite{LiTS}, MSD~\cite{simpson2019MSD}, KiTS~\cite{KiTS,KiTSDataset}, and AbdomenCT-1K~\cite{AbdomenCT-1K}. The training set includes 4000 abdomen CT scans, of which 2200 have weak labels and 1800 are unlabeled. The validation and testing sets include 100 and 400 CT scans, respectively, which cover various abdominal cancer types, such as liver cancer, kidney cancer, pancreas cancer, colon cancer, gastric cancer, and so on. The organ annotation process used ITK-SNAP~\cite{ITKSNAP}, nnU-Net~\cite{nnUNet}, and MedSAM~\cite{MedSAM}.


The evaluation metrics encompass two accuracy measures—Dice Similarity Coefficient (DSC) and Normalized Surface Dice (NSD)—alongside two efficiency measures—running time and area under the GPU memory-time curve. These metrics collectively contribute to the ranking computation. Furthermore, the running time and GPU memory consumption are considered within tolerances of 15 seconds and 4 GB, respectively.


\subsection{Implementation details}
\subsubsection{Environment settings}
The development environments and requirements are presented in Table~\ref{table:env}.
\subsubsection{Dataset split}
No multi-fold cross-validation is used for the training of nnU-Net and EfficientSegNet. For nnU-Net, 20\% of the 222 cases were randomly selected as the validation set. For EfficientSegNet, 100 of the 2200 cases were randomly selected as the validation set.

\begin{table}[!htbp]
\caption{Development environments and requirements.}\label{table:env}
\centering
\begin{tabular}{ll}
\hline
System       & Ubuntu 20.04.1 LTS\\
\hline
CPU   & AMD EPYC 7742 64-Core Processor \\
\hline
RAM                         &1.8TB\\
\hline
GPU                          & 8 NVIDIA A100 (40G)\\
\hline
CUDA version                  & 11.7\\                          \hline
Programming language                 & Python 3.10\\ 
\hline
Deep learning framework & torch 1.10, monai 1.0 \\                                                                 
\hline
Code     &    \url{https://github.com/XIANYUNYEHE-DEL/two-stage-retraining-seg} \\
\hline
\end{tabular}
\end{table}


\subsubsection{Training protocols}
In both the strong label training stage and retraining process of our proposed framework, we utilized three different models with different configurations to improve segmentation accuracy. The protocols of these models are shown in Table \ref{table:protocals1}.
In the training stage of nnU-Net, relevant hyperparameters are automatically generated according to its adaptive rules. The patch size is fixed as 32 $\times$ 128 $\times$ 192 (D $\times$ W $\times$ H), and the network is trained using SGD with a learning rate of 0.01 for 1000 epochs.
% TODO(review): the nnU-Net patch size stated here (32 x 128 x 192) disagrees with the protocol table (48 x 192 x 192); confirm which value is correct.
As for EfficientSegNet, training is divided into coarse model training and fine model training. In the coarse model training stage, the batch size is set to 2 and the patch size is fixed as 160 $\times$ 160 $\times$ 160 (W $\times$ H $\times$ D). The AdamW optimizer is used with a learning rate of 0.01 and a weight decay of 0.00001. The first 50 epochs are used as warm-up, and 500 epochs are used for training with the cosine annealing strategy. The loss function is a combination of Dice and cross-entropy losses. In the fine model training stage, most of the settings are unchanged. The patch size is fixed as 192 $\times$ 192 $\times$ 192 (W $\times$ H $\times$ D), and the number of training epochs is reduced to 300 to save training time.

% A detailed description of image processing and data augmentation can be found in section \ref{pre-process}


% yhuang: table3~5的三个阶段放在同一个表里吧，不然三个小表占一页，太零散了。
\begin{table}[!htbp]
\setlength{\abovecaptionskip}{0.2cm}
\setlength{\belowcaptionskip}{0.2cm}

\caption{Training and Inference protocols.}
\label{table:protocals1}
\centering
\resizebox{\textwidth}{!}{
\begin{tabular}{llll}
\hline
\textbf{Stage}  & Pseudo labeling & Coarse model & Fine model \\ 
\hline
\textbf{Mode}  & nnU-Net 3D & 3D U-Net & EfficientSegNet\\
\hline
\textbf{Network initialization}   & ``he'' normal initialization & ``he'' normal initialization & ``he'' normal initialization\\ \hline
\textbf{Batch size}                     & 2 & 2 & 2 \\ \hline 
\textbf{Patch size}   & 48$\times$192$\times$192 & 160$\times$160$\times$160 & 192$\times$192$\times$192\\ \hline
\textbf{Total epochs}  & 1000 & 500 & 300\\ \hline
\textbf{Optimizer}             & SGD  & AdamW & AdamW    \\ \hline
\textbf{Weight decay}             & 3e-5  & 1e-5 & 1e-5   \\ \hline
\textbf{Initial learning rate (lr)}   & 0.01 & 0.01 & 0.01\\ \hline
\textbf{Lr scheduler} & ReduceLROnPlateau & Warmup and Cosine Annealing & Warmup and Cosine Annealing \\ \hline
\textbf{Training time}     & 72 hours & 24 hours & 36 hours\\  \hline 
\textbf{Loss function}  & Dice and Cross-Entropy & Dice and Cross-Entropy & Dice and Cross-Entropy \\ \hline

\end{tabular}
}
\end{table}



% \begin{table}[!htbp]
% \setlength{\abovecaptionskip}{0.2cm}
% \setlength{\belowcaptionskip}{0.2cm}

% \caption{Training and Inference protocols for nnU-Net.}
% \label{table:protocals1}
% \centering
% % \resizebox{0.47\textwidth}{!}{
% \begin{tabular}{lll} 
% \hline
% Mode  & nnU-Net 3D \\
% \hline
% Network initialization         & ``he" normal initialization \\
% \hline
% Batch size                     & 2  \\
% \hline 
% Patch size   & 48$\times$192$\times$192\\ 
% \hline
% Total epochs  & 1000 \\
% \hline
% Optimizer             & SGD      \\ \hline
% Weight decay             & 3e-5      \\ \hline
% Initial learning rate (lr)   & 0.01\\ \hline
% Lr scheduler & ReduceLROnPlateau\\
% \hline
% Training time     & 72 hours\\  \hline 
% Loss function  & Dice and Cross-Entropy\\ \hline

% \end{tabular}
% %}
% \end{table}


% \begin{table}[!htbp]
% \setlength{\abovecaptionskip}{0.2cm}
% \setlength{\belowcaptionskip}{0.2cm}
% \caption{Training and Inference protocols for the coarse model.}
% \label{table:protocals2}
% \centering
% % \resizebox{0.47\textwidth}{!}{
% \begin{tabular}{lll} 
% \hline
% Mode  & 3D U-Net \\
% \hline
% Network initialization         & ``he" normal initialization \\
% \hline
% Batch size                     & 2  \\
% \hline 
% Patch size   & 160$\times$160$\times$160\\ 
% \hline
% Total epochs  & 500 \\
% \hline
% Optimizer             & AdamW      \\ \hline
% Weight decay             & 1e-5      \\ \hline
% Initial learning rate (lr)   & 0.01\\ \hline
% Lr scheduler & Warm Up and CosineAnnealing\\
% \hline
% Training time     & 24 hours\\  \hline 
% Loss function  & Dice and Cross-Entropy\\ \hline

% \end{tabular}
% %}
% \end{table}

% \begin{table}[!htbp]
% \setlength{\abovecaptionskip}{0.2cm}
% \setlength{\belowcaptionskip}{0.2cm}
% \caption{Training and Inference protocols for the fine model.}
% \label{table:protocals3}
% \centering
% % \resizebox{0.47\textwidth}{!}{
% \begin{tabular}{lll} 
% \hline
% Mode  & EfficientSegNet \\
% \hline
% Network initialization         & ``he" normal initialization \\
% \hline
% Batch size                     & 2  \\
% \hline 
% Patch size   & 192$\times$192$\times$192\\ 
% \hline
% Total epochs  & 300 \\
% \hline
% Optimizer             & AdamW      \\ \hline
% Weight decay             & 1e-5      \\ \hline
% Initial learning rate (lr)   & 0.01\\ \hline
% Lr scheduler & Warm Up and CosineAnnealing\\
% \hline
% Training time     & 36 hours\\  \hline 
% Loss function  & Dice and Cross-Entropy\\ \hline

% \end{tabular}
% %}
% \end{table}
% % \vspace{-0.7cm}


\section{Results and discussion}

\subsection{Quantitative results on validation set}
As the baseline, we used EfficientSegNet trained directly on the 2200 cases of labeled data. We compared this baseline on the public validation set with nnU-Net, which was trained using the 222 cases (strong label) containing all organ segmentations, and with EfficientSegNet, which was trained using our weak label training framework. The quantitative results are shown in Table~\ref{table:nn}.

\begin{table}[htbp]
\caption{Quantitative evaluation results for ablation study on online validation.
}\label{table:nn}
\centering
\setlength{\tabcolsep}{2.5mm}{
\begin{tabular}{l|cc|cc|cc}
\hline
\multirow{2}{*}{Target} & \multicolumn{2}{c|}{baseline} & \multicolumn{2}{c|}{nnU-Net (222)} & \multicolumn{2}{c}{EfficientSegNet} \\ \cline{2-7} 
                        & DSC(\%)            & NSD(\%)           & DSC(\%)            & NSD(\%)    & DSC(\%)            & NSD(\%)        \\ \hline
Liver & 94.72  & 93.03 & 96.58 & 98.34 & 97.45 & 98.63\\

Right Kidney & 87.81  & 85.23 & 93.12 & 94.29 & 93.36 & 94.11\\

Spleen & 91.55  & 91.51 & 96.04 & 97.35 & 96.71 & 98.29\\

Pancreas & 02.08  & 02.37 & 84.69 & 96.29 & 84.67 & 95.19\\

Aorta & 85.36  & 84.95 & 96.28 & 98.62 & 95.85 & 98.68\\

Inferior vena cava & 71.92  & 63.29 & 94.47 & 96.24 & 93.78 & 96.17\\

Right adrenal gland & 02.00  & 02.00 & 83.08 & 95.39 & 81.18 & 94.64\\

Left adrenal gland & 01.00  & 01.00 & 80.59 & 93.18 & 78.55 & 92.03\\

Gallbladder & 10.00  & 10.00 & 82.93 & 82.01 & 85.38 & 85.87\\

Esophagus & 00.00  & 00.00 & 83.17 & 93.47 & 82.79 & 93.53\\

Stomach & 04.73  & 03.55 & 92.71 & 96.65 & 92.67 & 96.66\\

Duodenum & 31.19  & 55.92 & 84.84 & 95.97 & 83.76 & 95.27\\

Left kidney & 87.84  & 91.34  & 84.95 & 92.29 & 93.13 & 93.55\\

Tumor & 05.48  & 01.68  & 00.00 & 00.00 & 29.98 & 20.49\\ \hline

Average & 43.83  & 44.47 & 89.22 & 94.62 & 89.20 & 94.82\\ \hline

\end{tabular}  }
\end{table}

% \begin{table}[htbp]
% \caption{Quantitative evaluation results for EfficientSegNet on online validation.
% }\label{table:ef}
% \centering
% \setlength{\tabcolsep}{2.5mm}{
% \begin{tabular}{l|cc|cc}
% \hline
% \multirow{2}{*}{Target} & \multicolumn{2}{c|}{baseline} & \multicolumn{2}{c}{EfficientSegNet}  \\ \cline{2-5} 
%                         & DSC(\%)            & NSD(\%)           & DSC(\%)            & NSD(\%)           \\ \hline
% Liver & 0.9472  & 0.9303 & 0.9745 & 0.9863 \\

% Right Kidney & 0.8781  & 0.8523 & 0.9336 & 0.9411 \\

% Spleen & 0.9155  & 0.9151 & 0.9671 & 0.9829 \\

% Pancreas & 0.0208  & 0.0237 & 0.8467 & 0.9519 \\

% Aorta & 0.8536  & 0.8495 & 0.9585 & 0.9868 \\

% Inferior vena cava & 0.7192  & 0.6329 & 0.9378 & 0.9617 \\

% Right adrenal gland & 0.0200  & 0.0200 & 0.8118 & 0.9464 \\

% Left adrenal gland & 0.0100  & 0.0100 & 0.7855 & 0.9203 \\

% Gallbladder & 0.1000  & 0.1000 & 0.8538 & 0.8587 \\

% Esophagus & 0.0000  & 0.0000 & 0.8279 & 0.9353 \\

% Stomach & 0.0473  & 0.0355 & 0.9267 & 0.9666 \\

% Duodenum & 0.3119  & 0.5592 & 0.8376 & 0.9527 \\

% Left kidney & 0.8784  & 0.9134  & 0.9313 & 0.9355 \\

% Tumor & 0.0548  & 0.0168  & 0.2998 & 0.2049 \\ \hline

% Average & 0.4383  & 0.4447 & 0.8920 & 0.9482 \\ \hline

% \end{tabular}  }
% \end{table}

We observed that using weak labels for direct training often resulted in poor labeling quality, which can negatively impact the training process and lead to eventual failure. We decomposed the task into three stages to address this issue: strong label training, weak label supplement, and retraining.

For strong label training, we utilized nnU-Net, a well-established segmentation model, trained on the 222 cases that contain all organ segmentations (strong labels). Our experiments showed that nnU-Net achieved an average Dice similarity coefficient (DSC) of 0.892, indicating that it is effective in organ segmentation. We therefore used nnU-Net as the pseudo-label generation network and took its organ segmentation results as organ pseudo-labels to supplement the weak labels.

For retraining, we trained EfficientSegNet on the 2200 cases supplemented with pseudo-labels. Our experiments showed that EfficientSegNet achieved an average DSC of 0.892 for all organs and a tumor DSC of 0.300. 
% The final results of our method on the test set are shown in Table \ref{tab:final-results}.

\begin{table}[htbp]
\caption{Quantitative evaluation results.}
\label{tab:final-results}
\centering
\begin{tabular}{l|cc|cc|cc}
\hline
\multirow{2}{*}{Target} & \multicolumn{2}{c|}{Public Validation} & \multicolumn{2}{c|}{Online Validation} & \multicolumn{2}{c}{Testing} \\ \cline{2-7} 
                        & DSC(\%)            & NSD(\%)           & DSC(\%)            & NSD(\%)           & DSC(\%)      & NSD (\%)     \\ \hline
Liver                   & 97.46 $\pm$ $1.03$                  &   94.61 $\pm$ 5.13                  &   97.45                 &   98.63                &  96.38          &  96.99            \\
Right Kidney            & 90.46 $\pm$ $19.95$                    & 87.09 $\pm$ 20.45                  &  93.36                  &   94.11                &  93.89            & 93.49             \\
Spleen                  & 95.83 $\pm$ $8.02$                   & 96.87 $\pm$ 7.34                  &  96.71                  &  98.29                 &   96.02           &  97.41            \\
Pancreas                & 86.09 $\pm$ $7.24$                   & 83.09 $\pm$ 11.89                  &  84.67                  &  95.19                 & 88.87             &  96.62            \\
Aorta                   & 95.03 $\pm$ $2.98$                   & 95.42 $\pm$ 5.30                  & 95.85                   &  98.68                 & 96.16             &  99.15            \\
Inferior vena cava      & 92.70 $\pm$ $3.84$                   & 89.18 $\pm$ 6.59                  &  93.78                  &   96.17                & 94.32             &  97.06            \\
Right adrenal gland     & 77.31 $\pm$ $20.09$                    & 89.80 $\pm$ 19.19                  & 81.18                   &  94.64                 & 80.57             &  94.22            \\
Left adrenal gland      & 77.35 $\pm$ $16.86$                    & 88.95 $\pm$ 19.38                  &  78.55                   &  92.03                 & 79.63             & 93.15             \\
Gallbladder             & 79.84 $\pm$ $28.03$                    & 79.85 $\pm$ 29.78                  &  85.38                  &  85.87                 & 81.92             & 84.31             \\
Esophagus               & 81.58 $\pm$ $16.65$                    & 83.45 $\pm$ 17.22                  &  82.79                  &  93.53                 & 88.26             & 97.75             \\
Stomach                 & 93.21 $\pm$ $3.89$                    & 90.62 $\pm$ 10.57                 &  92.67                  &  96.66                 &  92.18            &  96.26            \\
Duodenum                & 83.49 $\pm$ $6.38$                    & 79.70 $\pm$ 9.00                  &  83.76                   & 95.27                   &   86.20           & 96.24             \\
Left kidney             & 91.43 $\pm$ $14.96$                    & 87.72 $\pm$ 17.00                  &  93.13                  &  93.55                 &  92.96            & 93.34             \\
Tumor                   & 35.39 $\pm$ $34.80$                    & 24.97 $\pm$ 28.54                 &  29.98                  &  20.49                 &   39.64           &   26.51           \\ \hline
Organ average           & 84.08 $\pm$ $22.19$                    & 83.67 $\pm$ 23.83                  &  89.20                  &  94.82                 &  89.69            &  95.02            \\ \hline


\end{tabular}
\end{table}




\subsection{Qualitative results on validation set}
Figure~\ref{fig:seg} shows the segmentation results for the baseline and our method. In case\#0047 and case\#0070, our method accurately segments the organs and identifies the tumor regions, and it makes precise judgments even at the boundaries of some small organs. However, in case\#0029 and case\#0035, our method produces some false-negative determinations of the tumor region. The locations marked by the red boxes in the figure show these false-negative situations; the blue area in each box is the pan-cancer label. It can be observed that our method always wrongly classified these tumor regions as normal organs. The reason may be that there are tumor regions inside the supplemented organ label area, but our method has no suitable strategy to correct them.


% Float-placement settings that fix loosely packed figure layout.
\renewcommand\floatpagefraction{.9}
\renewcommand\topfraction{.9}
\renewcommand\bottomfraction{.9}
\renewcommand\textfraction{.1}
\setcounter{totalnumber}{50}
\setcounter{topnumber}{50}
\setcounter{bottomnumber}{50}

\begin{figure}[!htbp]
\centering
\includegraphics[scale=0.34]{imgs/16img.jpg}
\caption{Qualitative results on good (\#0047 and \#0070) and bad (\#0029 and \#0035) cases. The first column is the image, the second is the ground truth, the third is the baseline result, and the fourth is the result predicted by our method.
}
\label{fig:seg}
\end{figure}
% \vspace{-0.6cm}

\subsection{Segmentation efficiency results on validation set}
The efficiency test results are shown in Table~\ref{table:efficiency-results}. Using less than 4\,GB of GPU memory, our method can also infer larger images in less than 20 seconds.

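%yhuang: Is there any rule for the ordering of Table 7? Could it be sorted by case ID in descending order? (Q: it is the official default; speed is tested according to data size)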
\begin{table}[htbp]

\caption{Quantitative evaluation of segmentation efficiency in terms of the running time and GPU memory consumption. Total GPU denotes the area under the GPU Memory-Time curve. Evaluation GPU platform: NVIDIA QUADRO RTX5000 (16G).
}
\label{table:efficiency-results}
\centering
\begin{tabular}{ccccc}
\hline
Case ID & Image Size      & Running Time (s) & Max GPU (MB) & Total GPU (MB) \\ \hline
0001    & (512, 512, 55)  & 12.6       & 3032   & 10455    \\
0051    & (512, 512, 100) & 9.04                 &  2956            &  9044              \\
0017    & (512, 512, 150) & 9.64       & 2970             &  10245             \\
0019    & (512, 512, 215) & 11.39                 & 2994             &   13420            \\
0099    & (512, 512, 334) & 11.33                 & 3172             & 13395               \\
0063    & (512, 512, 448) &  14.58                &  3400            &  20043              \\
0048    & (512, 512, 499) &  14.6                &  3254            & 19642               \\
0029    & (512, 512, 554) &  16.87                & 3938             &  24931              \\ \hline
\end{tabular}
\end{table}

\subsection{Results on final testing set}
The test results are shown in Table~\ref{tab:final-results}. On the test dataset, we achieved an average DSC of 0.8969 and an average NSD of 0.9502 for all organs, which indicates that our method is reliable for organ segmentation. At the same time, the average inference time of our method is less than 10\,s with low resource consumption. However, the tumor segmentation performance is still considerably limited.

% \begin{table}[htbp]
% \caption{Testing Result}
% \centering
% \begin{tabular}{cccccc}
% \label{table:test-results}
% Organ DSC & Organ NSD   & Lesion DSC  & Lesion NSD & Time(s) & GPU Memory \\ \hline
% 0.8969    & 0.9502  & 0.3964       & 0.2651   & 9.9   & 10833 \\ \hline
% \end{tabular}
% \end{table}


\subsection{Limitation and future work}
The two-stage coarse-to-fine model used in our proposed framework maintains high inference speed while achieving a high level of segmentation performance. However, we found that the performance of tumor segmentation was worse than that of abdominal organs. After an elaborate analysis of bad cases, we found that tumors are annotated as irregular regions with non-smooth edges. In contrast, the predicted tumors are probably smoothed into sphere-like regions by the resizing operation, resulting in a non-negligible error at the edges. In future work, we will further investigate segmentation models with high performance on tumors. One solution is to replace the traditional resizing operation with learning-based methods such as correlation interpolation.

% After an elaborate analysis of bad cases, we found that the labeling of tumor regions was more detailed layer by layer, resulting in non-smooth edges. For the coarse-to-fine model, the predicted tumor segmentation is often rounded due to resizing the input data, leading to a low Dice. Correlation interpolation methods using deep learning can adaptively solve this problem instead of traditional ones.



\section{Conclusion}
This paper proposed a two-stage training approach to overcome the problem that weak label data cannot be used directly for training general segmentation models. In the first training stage, a pseudo-label generating network is trained using the cases with strong labels. After supplementing all weak label data with pseudo labels, the coarse-to-fine network is retrained for the inference stage. Under the limitation of computing resources, experimental results show that our method makes full use of weak label data and performs well in both segmentation accuracy and inference speed. 


\subsubsection{Acknowledgements} We would like to thank the School-Enterprise Graduate Student Cooperation Fund of Shenzhen Technology University. The authors of this paper declare that the segmentation method they implemented for participation in the FLARE 2023 challenge has not used any pre-trained models nor additional datasets other than those provided by the organizers. The proposed solution is fully automatic without any manual intervention. We thank all the data owners for making the CT scans publicly available and CodaLab~\cite{codalab} for hosting the challenge platform. 


%
% ---- Bibliography ----
%
% BibTeX users should specify bibliography style 'splncs04'.
% References will then be sorted and formatted in the correct style.
%
% \bibliographystyle{splncs04}
\bibliographystyle{unsrt}
\bibliography{ref}

\newpage
% Please add the following required packages to your document preamble:
% \usepackage[normalem]{ulem}
% \useunder{\uline}{\ul}{}
\begin{table}[!htbp]
\caption{Checklist Table. Please fill out this checklist table in the answer column.}
\centering
\begin{tabular}{ll}
\hline
Requirements                                                                                                                    & Answer        \\ \hline
A meaningful title                                                                                                              & Yes        \\ \hline
The number of authors ($\leq$6)                                                                                                             & 3        \\ \hline
Author affiliations, Email, and ORCID                                                                                           & Yes       \\ \hline
Corresponding author is marked                                                                                                  & Yes        \\ \hline
Validation scores are presented in the abstract                                                                                 & Yes       \\ \hline
\begin{tabular}[c]{@{}l@{}}Introduction includes at least three parts: \\ background, related work, and motivation\end{tabular} & Yes        \\ \hline
A pipeline/network figure is provided                                                                                           & 4 \\ \hline
Pre-processing                                                                                                                  & 4   \\ \hline
Strategies to use the partial label                                                                                             & 6   \\ \hline
Strategies to use the unlabeled images.                                                                                         & None   \\ \hline
Strategies to improve model inference                                                                                           & 8   \\ \hline
Post-processing                                                                                                                 & 8   \\ \hline
Dataset and evaluation metric section is presented                                                                              & 9   \\ \hline
Environment setting table is provided                                                                                           & 9 \\ \hline
Training protocol table is provided                                                                                             & 10  \\ \hline
Ablation study                                                                                                                  & 9 - 11   \\ \hline
Efficiency evaluation results are provided                                                                                     & 12 \\ \hline
Visualized segmentation example is provided                                                                                     & 13 \\ \hline
Limitation and future work are presented                                                                                        & Yes       \\ \hline
Reference format is consistent.  & Yes       \\ \hline

\end{tabular}
\end{table}

\end{document}
