% This is samplepaper.tex, a sample chapter demonstrating the
% LLNCS macro package for Springer Computer Science proceedings;
% Version 2.21 of 2022/01/12
%
\documentclass[runningheads]{llncs}
%
\usepackage[T1]{fontenc}
% T1 fonts will be used to generate the final print and online PDFs,
% so please use T1 fonts in your manuscript whenever possible.
% Other font encodings may result in incorrect characters.
%
\usepackage{multirow}
\usepackage{graphicx}
% Used for displaying a sample figure. If possible, figure files should
% be included in EPS format.
%
% If you use the hyperref package, please uncomment the following two lines
% to display URLs in blue roman font according to Springer's eBook style:
\usepackage{color}

\usepackage[pagebackref=true,breaklinks=true,colorlinks,bookmarks=false]{hyperref}
\renewcommand\UrlFont{\color{blue}\rmfamily}
%
\usepackage{amsmath}
% \usepackage{amsfonts}
% \usepackage{amssymb}
% \usepackage{amsthm}

\usepackage{bbding}

%==================================================
\begin{document}
%
\title{Selected Partially Labeled Learning for Abdominal Organ and Pan-cancer Segmentation}
%speed inference based on nnU-Net
%
\titlerunning{Selected Partially Labeled Learning for Segmentation}
% If the paper title is too long for the running head, you can set
% an abbreviated paper title here
%
% \author{First Author\inst{1}\orcidID{0000-1111-2222-3333} \and
% Second Author\inst{2,3}\orcidID{1111-2222-3333-4444} \and
% Third Author\inst{3}\orcidID{2222--3333-4444-5555} (please list your team members here. Each team can have at most 6 members.)
% } 
% %
\author{Yuntao Zhu\inst{1(}\Envelope\inst{)}\orcidID{0000-0003-2816-2709}
\and
Liwen Zou\inst{1}\orcidID{0000-0003-4085-4003}
\and
Linyao Li\inst{1}\orcidID{0009-0001-2630-1683}
\and
Pengxu Wen\inst{1}\orcidID{0009-0000-5211-4876}
}
%
\authorrunning{Y. Zhu et al.}
% First names are abbreviated in the running head.
% If there are more than two authors, 'et al.' is used.
%
\institute{ \inst{1} Department of Mathematics, Nanjing University, Nanjing, China\\
\email{YuntaoZhu7@smail.nju.edu.cn}}
%
\maketitle              % typeset the header of the contribution
%
\begin{abstract}
% The abstract should briefly summarize the main contribution of the paper and the validation performance.
% Our method achieved an average DSC score of 90.34\% and 60.23\% for the organs and lesions on the validation set and the average running time and area under GPU memory-time cure are 10s and 10000MB, respectively. 
% (150--250 words)
% \\
% The total length of the manuscript should be at least 8 pages (don't include references). There is no limitation for the maximum number of pages. 

Obtaining labeled data from medical images is very expensive and labor intensive. At the same time, the large number of existing publicly available medical image datasets are usually labeled with only some of the organs as target regions, while other organs in the image are ignored. It is a challenge to train a neural network to segment all labeled categories using only partially labeled data.
We design a compound loss, the selected partially cross entropy and dice loss, that allows the neural network to learn specific categories from partially labeled data.
In addition, we improve the inference and training process of nnU-Net to reduce computational resources and accelerate inference. 
Experiments demonstrate that our method achieves average Dice Similarity Coefficients of 0.8514 and 0.1514 on the 13 abdominal organ segmentation tasks and the tumor segmentation task, respectively, and enables the network to efficiently segment specific categories from partially labeled data.
Moreover, it significantly improves the inference speed, with an average running time of 21.8 seconds, and uses only an average of 2531 MB of maximum GPU memory.

\keywords{Partially labeled learning  \and Accelerate inference \and Lightweight network.}
\end{abstract}



\section{Introduction}
% The introduction should have at least three parts. For each part, you can write multiple paragraphs to clarify your motivations and ideas. 

% P1. Introduce the background and difficulty of this challenge


% P2. Related work/state-of-the-art methods on semi-supervised/partial-label segmentation


% P3. Your motivation and solution/contribution. 
Medical image segmentation aims to extract and quantify regions of interest in biological tissue or organ images. 
The results of target organ segmentation have many important clinical applications, such as organ quantification, surgical planning, and disease diagnosis.
In recent years, deep learning-based methods have been widely used to automatically segment abdominal organs.
Among these methods, nnU-Net~\cite{nnUNet} is a popular and robust framework that has won a number of organ segmentation challenges. 
Although it is convenient for fully supervised organ segmentation tasks and provides a solid baseline result by automatically setting network hyperparameters, this approach does not support weakly supervised segmentation and the inference process is computationally expensive and time consuming.
%
Numerous studies have shown that the methodological performance of deep neural networks often relies heavily on the availability of large, high-quality labeled datasets for organ segmentation tasks.
In order to learn robust data representations for robust and efficient medical image segmentation, we need large datasets with thousands of labeled or unlabeled data for supervised, weakly supervised, and self-supervised learning.
But,
the annotation of 3D medical images is a difficult and laborious task. Thus, depending on the task, only a bare minimum of images and target structures is usually annotated. This results in a situation where a zoo of partially labeled datasets is available to the community.
In this context, the organizers of FLARE 2023 built a large-scale and diverse abdomen CT dataset, including 4000 CT scans from several medical datasets.
There are 2200 partially labeled scans and 1800 unlabeled scans available.
Compared with FLARE 2021--2022~\cite{MedIA-FLARE21,FLARE22}, the challenge for FLARE 2023 is how to leverage the large amount of partial labels and unlabeled data to improve the segmentation performance while taking into account efficient inference.


% ===2
% The success of deep neural networks heavily relies on the availability of large and diverse annotated datasets across a range of computer vision tasks. To learn a strong data representation for robust and performant medical image segmentation, huge datasets with either many thousands of annotated data structures or less specific self-supervised pretraining objectives with unlabeled data are needed \cite{ZHOU2021101840,swinunetr}. 
% The annotation of 3D medical images is a difficult and laborious task. Thus, depending on the task, only a bare minimum of images and target structures is usually annotated. This results in a situation where a zoo of partially labeled datasets is available to the community. 
% Recent efforts have resulted in a large dataset FLARE 2023~\cite{MedIA-FLARE21} of 4000 CT images with 14 annotated classes each, thus providing more than 10.000 manual annotations. Focusing on such a dataset prevents leveraging the potentially precious additional information of the above mentioned other datasets that are only partially annotated. 

In recent years, there has been a rapid evolution of semi-supervised and self-supervised learning methods \cite{ZHOU2021101840,swinunetr}. These techniques typically learn better representations by utilizing unlabeled data, ultimately improving segmentation performance.
On the one hand,
one frequently employed approach in semi-supervised learning is pseudo-labeling. This method pairs the segmentation results of the network on unlabeled data as pseudo-labels, adds them to the training set, and repeats the process over several iterations.
On the other hand,
integrating potentially valuable additional information from different datasets, which are partially labeled, can provide more information about different anatomical target structures or related details, as well as different types of pathology.
Therefore, recent advances in weak supervision explore how partially annotated datasets can train a model to segment all annotated categories \cite{partially_survey}.
Early methods considered unlabeled organs as background \cite{roulet_joint_2019,fang_multi-organ_2020} and imposed penalties for overlapping predictions based on mutual exclusivity of organs \cite{shi_marginal_2020,fidon_label-set_2021}.
The method of~\cite{wangBowelNetJointSemanticGeometric2023} transforms the cross-entropy loss and dice loss by assigning unlabeled data from partially labeled data to the background class.
The methods of~\cite{dmitriev_learning_2019,zhang_dodnet_2020,clip_medicalseg} predict just one structure of concern per forward pass through the integration of category information at various network stages.
The method of~\cite{luoWORDLargeScale2022} uses partial cross-entropy and intra-class gray regularization terms to allow segmentation under weak supervision.
The method of~\cite{ulrichMultiTalentMultiDatasetApproach2023} ignores the channels where unlabeled categories are located, designs a loss function that mixes binary cross-entropy and dice loss, and can handle the task of category overlap in partial labeling learning.
However, there is a lack of methods that utilize both pseudo-labeled data and partially labeled learning techniques to handle organ and tumor segmentation tasks like FLARE23 that contain partially labeled and unlabeled data.

    
% So far, all previous methods do not convincingly leverage cross-dataset synergies. 
% As Liu et al. pointed out, one common caveat is that many methods force the resulting model to average between distinct annotation protocol characteristics \cite{clip_medicalseg} by combining labels from different datasets for the same target structure %(visualized in Figure 1 b))
% . Hence, they all fail to reach segmentation performance on par with cutting-edge single dataset segmentation methods. 

% ====3
% To this end, we introduce MultiTalent (MULTI daTAset LEarNing and pre-Training), a new, flexible, multi-dataset training method: 1) MultiTalent can handle classes that are absent in one dataset but annotated in another during training. 2) It retains different annotation protocol characteristics for the same target
% structure and 3) allows for overlapping target structures with
% different level of detail such as liver, liver vessel and liver tumor. Overall, MultiTalent can include all kinds of new datasets irrespective of their annotated target structures. \\ MultiTalent can be used in two scenarios: First, in a combined multi-dataset (MD) training to generate one foundation segmentation model that is able to predict all classes that are present in any of the utilized datasets, and second, for pre-training to leverage the learned representation of this foundation model for a new task. In experiments with a large collection of abdominal CT datasets, the proposed model outperformed state-of-the-art segmentation networks that were trained on each dataset individually as well as all previous methods that incorporated multiple datasets for training. Interestingly, the benefits of MultiTalent are particularly notable for more difficult classes and pathologies. In comparison to an ensemble of single dataset solutions, MultiTalent comes with shorter training and inference times.\\
% Additionally, at the example of three challenging datasets, we demonstrate that fine-tuning MultiTalent yields higher segmentation performance than training from scratch or initializing the model parameters using unsupervised pre-training strategies \cite{swinunetr,ZHOU2021101840}. It also surpasses supervised pretrained and fine-tuned state-of-the art models on most tasks, despite requiring orders of magnitude less annotations during pre-training.


In this paper, we present a framework that utilizes both pseudo-labeled and partially labeled learning by designing a selected partially loss.
We also improve nnU-Net to make inference more efficient and to reduce its computational resource requirements.
Specifically, we choose to merge 13 organ classes of pseudo-labels and partial labels, while leaving the remaining classes unchanged, resulting in a partial labeling of the tumor.
The selected partially loss, which is a combination of cross-entropy loss and dice loss, introduces a selected class mask that determines whether the loss of a class is computed and its gradient is back-propagated.
In addition, we find that the resampling process during inference is time-consuming.
To address this issue, we have rewritten the implementation of the resampling method and utilized a smaller network and lower resolution to minimize the computational requirements during inference.

Our main contributions are summarized as follows:
\begin{itemize}
    \item We present a new approach, selected partially loss, which enables the use of both pseudo label and partial label data, thereby expanding the potential applications of current segmentation models.
    \item We optimize the time-consuming components of the resampling code in nnU-Net.
    \item The experiment shows that our method improves the detectability of the segmentation network for the selected class. This outperforms the baseline by 5 percentage points for the Dice Similarity Coefficient (DSC).
\end{itemize}

\section{Method}
% A detailed description of the method used and a figure should be included to show your pipeline.
\begin{figure}[htbp]
\centering
\includegraphics[scale=0.18]{imgs/pipelinev2.pdf}
\caption{Overview of our framework.
Our framework consists of three parts. Firstly, we construct a training set by combining pseudo-labels. Secondly, we reduce computation costs by using a small nnU-Net. 
Lastly, we train nnU-Net by a selected-partially-loss so that it can learn from both unlabeled and partially-labeled data.
}
\label{fig:Network}
\end{figure}
%###########################
\subsection{Preprocessing}
\label{sec:preprocess}
% Full description of any pre-processing strategy. Please details on the following aspects
% \begin{itemize} 
%  \item Data clean or statistical analysis (e.g., annotation distribution of different segmentation targets, volume distribution...)
%  \item Resampling method for anisotropic data
%  \item Intensity normalization method
%  \item Others
% \end{itemize}

For image preprocessing, all of our settings follow the default nnU-NetV2.
\begin{itemize} 
 \item Statistical analysis is conducted on data
 pertaining to volume spacing and foreground intensity. 
\item CT images are clipped
 at the 0.5 and 99.5 percentiles of foreground voxels. 
\item All images are normalized through the subtraction
 of the mean and division by the standard deviation. 
\item The volume is then resampled to a target spacing of (2.42,1.95,1.95).
\end{itemize}

\subsection{Proposed Method}
% \textbf{Please provide figures to show your pipeline or network architecture.} 
% Figure~\ref{fig:Network} shows a typical example of 3D nnU-Net~\cite{nnUNet}.
% (note: the name is \textbf{nnU-Net} rather than nnunet, nnUNet. Similarly, please use \textbf{U-Net} rather than unet, u-net).



% Explain network architecture details.

% Loss function: we use the summation between Dice loss and cross-entropy loss because compound loss functions have been proven to be robust in various medical image segmentation tasks~\cite{LossOdyssey}. 

% \textbf{Please introduce your strategies to deal with the partial labels.}


% \textbf{Please introduce your strategies to use the 1800 unlabeled images.} If you don't use them, please explicitly say "Unlabeled images were not used."
% Please also clarify whether you used the pseudo labels generated by the FLARE22 winning algorithm~\cite{FLARE22-1st-Huang} and the best-accuracy-algorithm~\cite{FLARE22-bestDSC-Wang}.


Figure~\ref{fig:Network} presents an overview of our framework, which consists of three components.
We filter the data using the pseudo-labels and select 300 cases as the training set.
We then train a small nnU-Net with a compound partially loss at a lower resolution.
Our compound partially loss mainly follows~\cite{ulrichMultiTalentMultiDatasetApproach2023,wangBowelNetJointSemanticGeometric2023,luoWORDLargeScale2022}.

\subsubsection{Fusion of pseudo-labels and partial-labels}
\label{sec:unlabel}
% Strategies to use the unlabelled cases
We use two pseudo-labels generated by~\cite{FLARE22-1st-Huang,FLARE22-bestDSC-Wang}, consisting of 13 organ categories for all 4000 cases.
First, we calculate the DSC of the two pseudo-labels, evaluate their differences, and filter out the samples with DSC greater than 0.85.
Then, we sort them by their ID numbers.
Subsequently, we select the first 200 cases from the partially labeled CT volumes and the first 100 cases from the unlabeled CT volumes to construct the training set.
Then, the pseudo-labels are merged with the selected cases that do not contain the ground truth annotation of the class.
Therefore, for the 300 cases, there are 13 organ labels (ground truth or pseudo) and tumor is partially labeled.
All of our results use the pseudo-labels generated by the two FLARE 2022 methods.
% In addition, some cases only have a partial organ region in CT scans, resulting in a lack of several class labels.

\subsubsection{Problem definition}
We begin with a dataset $D$ with $N$ image and label pairs, $D = \{(x,y)_1, \dots, (x,y)_{N}\}$. In the dataset, every image voxel $x_i$, $i \in [1,I]$, is assigned to one class $c \in C$, where $C$ is the label set associated with dataset $D$.
Tumors are commonly located inside organs, but the pseudo-labels do not annotate them.
This implies that the network must predict multiple classes for one voxel to account for the inconsistent class definitions. 
To resolve the issue of label inconsistency, we separate the segmentation results for each class by applying a sigmoid activation function to replace the softmax activation function on the dataset.

% Motivation and description of the method details. \textbf{Pre-trained models are not allowed to use in this challenge.}
\subsubsection{Partially loss for selected categories}
\label{sec:partloss}
We employ the binary cross-entropy (BCE) loss and the dice loss for each class over all $B, b\in [1,B]$, images in a batch: 

\begin{equation}
     L_c = \frac{1}{B \times I}\sum_{b,i} \mathrm{BCE}(\hat{y}_{i,b,c}, \; y_{i,b,c}) - \frac{2\sum_{b,i}\hat{y}_{i,b,c} \; y_{i,b,c}}{\sum_{b,i}\hat{y}_{i,b,c} + \sum_{b,i} y_{i,b,c}}
\end{equation}
% While the regular dice loss is calculated for each image within a batch, we calculate the dice loss jointly for all images of the input batch. This regularizes the loss if only a few voxels of one class are present in one image and a larger area is present in another image of the same batch. Thus, an inaccurate prediction of a few pixels in the first image has a limited effect on the loss. In the following, we unite the sum over the image voxels $i$ and the batch $b$ to $\sum_{z}$.
We modify the loss function to be calculated only for classes that are annotated in the corresponding partially labeled dataset \cite{roulet_joint_2019,fang_multi-organ_2020}.
% in the following indicated by $\mathbb{1}_c$, where $\mathbb{1}_c = 1$ if $c \in S$ and 0 otherwise.
% Instead of averaging, we add up the loss over the classes.
% Hence, the loss signal for each class prediction does not depend on the number of other classes within the batch.
This partially loss is formalized as follows:
\begin{equation}
    L = \frac{1}{\sum_{b,c} \mathbf{1}_{b,c}^{(h)}} \sum_{b,c} \Big( \frac{\mathbf{1}_{b,c}^{(h)}}{I} \sum_{i} \mathrm{BCE}(\hat{y}_{i,b,c}, \; y_{i,b,c}) -
    \frac{2\sum_{i}\mathbf{1}_{b,c}^{(h)} \; \hat{y}_{i,b,c} \; y_{i,b,c}}{\sum_{i}\mathbf{1}_{b,c}^{(h)} \; \hat{y}_{i,b,c} + \sum_{i} \mathbf{1}_{b,c}^{(h)} \; y_{i,b,c}} \Big),
\end{equation}
with the indicator
\begin{equation*}
    \mathbf{1}_{b,c}^{(h)} =
    \begin{cases}
    0, & \text{if } c \in S \text{ and } h = \text{false}, \\
    1, & \text{otherwise},
    \end{cases}
\end{equation*}
where $S$ is the selected class set (we set $S = \{\text{tumor}\}$) and
$h$ is false if the ground truth data does not include the class $c$, otherwise it is true.
The loss uses the sum of the dice loss and the binary cross-entropy loss because compound loss functions have been proven to be robust in various medical image segmentation tasks~\cite{LossOdyssey}.

\subsubsection{Speeding inference}
\label{sec:speed}
% \textbf{Please introduce your strategies to improve inference speed and reduce resource consumption} 

\begin{table*}[!htbp]
\caption{Network architecture and inference process.}
\label{table:speed}
\centering
\begin{tabular}{l|l} 
\hline
Channels in the first stage         & 16\\
\hline
Convolution number per stage         & 2 \\
\hline 
Patch size & 128$\times$128$\times$128  \\ 
\hline
Downsampling times & 4 \\
\hline
inference process          & (Sigmoid, Threshold, Resample)     \\ \hline
Deep supervision          & True     \\
\hline
\end{tabular}

\end{table*}
In order to improve inference speed and reduce resource consumption,
 we use a small network structure following~\cite{FLARE22-1st-Huang}.
We also change the default resampling function and order, which effectively speeds up the inference.
The network architecture and inference process are presented in Table~\ref{table:speed}.
A comparison of the different strategy settings is given in Table~\ref{table:net-setting}. The default is the full-resolution setting of nnU-Net, and the small is our modified low-resolution setting. The tiny is the first stage of the cascade network, which we design to have an even lower resolution.
However, we do not use the cascade network as the final docker submission because it does not improve the accuracy and speed of the segmentation results.

\begin{table*}[!htbp]
\caption{Comparison of different strategy settings. The order of axes of input patch size and spacing is (z,y,x).}
\label{table:net-setting}
\centering
\begin{tabular}{l|l|l|l} 
\hline
Settings & Default & Small & Tiny \\
\hline
Channels in the first stage      & 32   & 16 & 8 \\
\hline
Convolution number per stage     & 2    & 2 & 2 \\
\hline 
Patch size & 56$\times$192$\times$160 & 128$\times$128$\times$128 & 80$\times$96$\times$96  \\ 
\hline
Downsampling times & 5 & 4 & 4 \\
\hline
Input spacing & (2.5, 0.8, 0.8) & (2.42, 1.95, 1.95) & (5, 3.9, 3.9)  \\ 
\hline
\end{tabular}

\end{table*}
\subsection{Post-processing}
\label{sec:postprocess}
% \textbf{Description of post-processing of the model outputs to get the final output in the training stage.}

We do not perform any post-processing, such as connected component analysis or test-time augmentation, during our inference.

\section{Experiments}
\subsection{Dataset and evaluation measures}
\label{sec:measures}
The FLARE 2023 challenge is an extension of the FLARE 2021--2022 challenges~\cite{MedIA-FLARE21,FLARE22}, aiming to promote the development of foundation models in abdominal disease analysis. The segmentation targets cover 13 organs and various abdominal lesions. The training dataset is curated from more than 30 medical centers under the license permission, including TCIA~\cite{TCIA}, LiTS~\cite{LiTS}, MSD~\cite{simpson2019MSD}, KiTS~\cite{KiTS,KiTSDataset}, autoPET~\cite{autoPET-Data,autoPET-MICCAI22}, TotalSegmentator~\cite{TotalSegmentator}, and AbdomenCT-1K~\cite{AbdomenCT-1K}. The training set includes 4000 abdomen CT scans, of which 2200 have partial labels and 1800 have no labels. The validation and testing sets include 100 and 400 CT scans, respectively, which cover various abdominal cancer types, such as liver cancer, kidney cancer, pancreas cancer, colon cancer, gastric cancer, and so on. The organ annotation process used ITK-SNAP~\cite{ITKSNAP}, nnU-Net~\cite{nnUNet}, and MedSAM~\cite{MedSAM}.


The evaluation metrics encompass two accuracy measures—Dice Similarity Coefficient (DSC) and Normalized Surface Dice (NSD)—alongside two efficiency measures—running time and area under the GPU memory-time curve. These metrics collectively contribute to the ranking computation. Furthermore, the running time and GPU memory consumption are considered within tolerances of 15 seconds and 4 GB, respectively.


\subsection{Implementation details}
\subsubsection{Environment settings}
The development environments and requirements are presented in Table~\ref{table:env}.
\begin{table}[!htbp]
\caption{Development environments and requirements.}\label{table:env}
\centering
\begin{tabular}{l|l}
\hline
System       &  Ubuntu 20.04.5 LTS\\
\hline
CPU   &  Intel(R) Xeon(R) Gold 6354 CPU @ 3.00GHz \\
\hline
RAM                         & 16$\times $4GB;  1600MT$/$s\\
\hline
GPU (number and type)                         & 1 $\times$ NVIDIA A100 40G\\
\hline
CUDA version                  &  11.7\\                          \hline
Programming language                 &  Python 3.10.11\\ 
\hline
Deep learning framework & Pytorch 2.0.0, torchvision 0.2.2 \\
\hline
Specific dependencies         &        nnU-Net 2.0                \\                                                                      
\hline
Code     &      \url{https://github.com/orangeqqq/FLARE23}                            \\
\hline
\end{tabular}
\end{table}


\subsubsection{Training protocols}
% Please describe at least the following aspects:

% 1. processing of the unlabeled images and partial labels

% 2. Data augmentation (Based on the winning solutions in FLARE 2021~\cite{MedIA-FLARE21}, we recommend using extensive data augmentation)

% 3. patch sampling strategy

% 4. optimal model selection criteria
\begin{table*}[!htbp]
\caption{Training protocols.}
\label{table:training}
\centering
\begin{tabular}{ll} 
\hline
Network initialization         & “He” normal initialization\\
\hline
Batch size                    & 4 \\
\hline 
Patch size & 128$\times$128$\times$128  \\ 
\hline
Total epochs & 1000 \\
\hline
Optimizer          &    SGD with nesterov momentum ($\mu$ =0.99)    \\ \hline
Initial learning rate (lr)  & 0.01 \\ \hline
Lr decay schedule & Poly learning rate policy: $(1 - epoch/1000)^{0.9}$ \\
\hline
Training time                                           & 10 hours \\  \hline 
Loss function & Cross entropy loss and dice loss\\     \hline
Number of model parameters    & 5.22M\footnote{\url{https://github.com/sksq96/pytorch-summary}} \\ \hline
Number of flops & 121G\footnote{\url{https://github.com/facebookresearch/fvcore}} \\ \hline
CO$_2$eq & 11.2 kg\footnote{\url{https://github.com/lfwa/carbontracker/}} \\  \hline
\end{tabular}
\end{table*}
% Narrative description of the training protocol
The training protocols of the small nnU-Net are listed in Table \ref{table:training}.
For the unlabeled images, we select 100 cases with the pseudo label to train the network.
For partial labels, we use the partial cross-entropy and dice loss in the training stage.
The pseudo-labels are generated by the FLARE22 winning algorithm~\cite{FLARE22-1st-Huang} and the best-accuracy algorithm~\cite{FLARE22-bestDSC-Wang}.
 We employ the same data augmentation as the default setting of nnU-Net, which includes additive brightness, gamma, rotation, scaling, and elastic deformation on the fly during training. 
During inference, the model does not perform test time augmentation (TTA) of flipping.
The patch sampling strategy is foreground over-sampling. 
Finally, we choose the model that is fast and obtains the best accuracy on the online validation.




% \begin{table*}[!htbp]
% \caption{Training protocols for the refine model (if using two-stage framework).}
% \label{table:training2nd}
% \begin{center}
% % \resizebox{0.47\textwidth}{!}{
% \begin{tabular}{ll} 
% \hline
% Network initialization         & \\
% \hline
% Batch size                    & 2 \\
% \hline 
% Patch size & 80$\times$192$\times$160  \\ 
% \hline
% Total epochs & 1000 \\
% \hline
% Optimizer          & SGD with nesterov momentum ($\mu=0.99$)          \\ \hline
% Initial learning rate (lr)  & 0.01 \\ \hline
% Lr decay schedule & halved by 200 epochs \\
% \hline
% Training time                                           & 72.5 hours \\  \hline 
% Number of model parameters    & 41.22M\footnote{https://github.com/sksq96/pytorch-summary} \\ \hline
% Number of flops & 59.32G\footnote{https://github.com/facebookresearch/fvcore} \\ \hline
% CO$_2$eq & 1 Kg\footnote{https://github.com/lfwa/carbontracker/} \\  \hline
% \end{tabular}
% \end{center}
% \end{table*}


\section{Results and discussion}
% Note: Please describe at least the following aspects in this section


% 1. The effect of using unlabelled cases;


% 2. In what kind of cases the proposed method works well?

% 3. What are the possible reasons for the failed cases or organs?


% 4. Segmentation efficiency analysis

% % Variable settings (ablation axes)
% resolution, lowlow,low,full
% Network size: smallsmall, small, normal
% pseudo label
% unlabel data
% loss: CE+dice; selected partial CE+dice;
% inference: probability upsample + softmax; 
% softmax + upsample nearest neighborhood


\begin{table}[htbp]
\caption{Quantitative evaluation results in terms of DSC(\%) and NSD(\%). 
% \textbf{The results should correspond to your final docker submission. The public validation denotes the performance on the 50 validation cases with ground truth. Please present both the mean score and standard deviation. The online validation denotes the leaderboard results. The Testing results will be released during MICCAI. Please leave them blank at present.} You can use a similar 
% Table format to present the ablation study results of the public and online validation. A useful online tool to create latex table \url{https://www.tablesgenerator.com/latex_tables.}
}\label{tab:final-results}
\centering
\begin{tabular}{l|ll|cc|cc}
\hline
\multirow{2}{*}{Target} & \multicolumn{2}{c|}{Public Validation} & \multicolumn{2}{c|}{Online Validation} & \multicolumn{2}{c}{Testing} \\ \cline{2-7} 
    % & DSC(\%)            & NSD(\%)           & DSC(\%)            & NSD(\%)           & DSC(\%)      & NSD (\%)     \\ \hline
    & DSC           & NSD         & DSC            & NSD           & DSC      & NSD     \\ \hline
% Liver                   & 83.45 $\pm$ $5.32$                  &   83.45 $\pm$ 5.32                  &   0.9734                 &     0.9746               &            &              \\
        Liver & 95.54 $\pm$ 2.53 & 96.86 $\pm$ 5.34 & 95.62 & 97.12 & 94 & 95.75  \\ 
        Right Kidney & 87.89 $\pm$ 19.54 & 88.35 $\pm$ 20.41 & 89.35 & 90.01 & 90.64  & 90.44  \\ 
        Spleen & 93.06 $\pm$ 3.77 & 93.55 $\pm$ 8.12 & 93.18 & 93.86 & 95.23  & 93.1 \\ 
        Pancreas & 82.05 $\pm$ 5.93 & 95.41 $\pm$ 4.86 & 80.72 & 94.5 & 82.34  & 95.29 \\ 
        Aorta & 93.05 $\pm$ 2.06 & 97.64 $\pm$ 3.19 & 93.35 & 97.98 & 93.03  & 98.25 \\ 
        Inferior vena cava & 88.05 $\pm$ 5.56 & 90.98 $\pm$ 6.52 & 88.06 & 90.7 & 88.84  & 92.31 \\ 
        Right adrenal gland & 74.67 $\pm$ 12.86 & 91.33 $\pm$ 13.74 & 75.24 & 92.12 & 72.30  & 90.79 \\ 
        Left adrenal gland & 71.41 $\pm$ 13.29 & 88.43 $\pm$ 14.0 & 72.83 & 89.22 & 70.88  & 88.32 \\ 
        Gallbladder & 82.06 $\pm$ 19.92 & 81.27 $\pm$ 21.06 & 82.52 & 81.86 & 74.83  & 74.85 \\ 
        Esophagus & 78.46 $\pm$ 14.01 & 91.15 $\pm$ 14.41 & 79.12 & 92.15 & 81.91  & 94.91 \\ 
        Stomach & 90.23 $\pm$ 6.08 & 95.25 $\pm$ 6.71 & 90.6 & 95.07 & 89.59  & 94.53 \\ 
        Duodenum & 78.06 $\pm$ 8.28 & 93.96 $\pm$ 5.57 & 78.25 & 93.53 & 79.30  & 94.42 \\ 
        Left kidney & 86.96 $\pm$ 16.61 & 87.77 $\pm$ 17.72 & 87.96 & 88.78 & 89.12  & 89.19 \\ 
        Tumor & 18.21 $\pm$ 23.28 & 10.27 $\pm$ 15.24 & 15.14 & 8.72 & 17.61  & 8.32  \\ 
        \hline
        Average & 79.98 $\pm$ 10.98 & 85.87 $\pm$ 11.21 & 80.14 & 86.12 & 79.97  & 85.75 \\ 
\hline
\end{tabular}
\end{table}

% Note to Table~\ref{tab:final-results}: if you have multiple solutions, such as a faster model with lower DSC or a slower model with higher DSC, you can use a similar Table format to report the performance on the public/online validation set.


\subsection{Quantitative results on validation set}
% Please report the Dice and NSD scores of organs and tumors on the validation set

% Please do ablation studies to analyze the effect of unlabelled data.

In Table~\ref{tab:final-results}, we report the DSC and NSD of the final docker commit results. The average scores on the 50 public validation cases and the 100 online validation cases are nearly identical, both achieving a DSC of about 0.80 and an NSD of about 0.86.
In general, large organs like the liver, spleen, kidney, and stomach have high accuracy. 
However, accurate identification of small and complex objects, such as tumors, adrenal glands, and the duodenum, presents significant challenges. It requires more attention, especially when dealing with extremely small and indistinct boundaries.

\begin{figure}[!htbp]
\centering
\includegraphics[scale=0.35]{imgs/flare_show.pdf}
\caption{Qualitative results on two easy cases (Case \#FLARE23Ts\_0038 with DSC of 0.89 and Case \#FLARE23Ts\_0043 with DSC of 0.84) and two hard cases (Case \#FLARE23Ts\_0057 with DSC of 0.66 and Case \#FLARE23Ts\_0067 with DSC of 0.74).
}
\label{fig:seg}
\end{figure}

In Table~\ref{tab:Ablation-results}, we report the online validation results of the ablation settings: the model trained without unlabelled data, the model with the normal inference process, and the cascade networks.
Using unlabelled data increased the average DSC from 0.7925 to 0.8014. Specifically, in tumor regions, it increased the DSC by 0.045.
Additionally, normal inference alone increased the DSC by approximately 0.04.
However, the cascade networks, P-Cascade and N-Cascade, which add a network trained in a lower-resolution setup with twice the spacing of
the original, did not achieve higher DSC and NSD results.
P-Cascade denotes the results with the partially compound loss and N-Cascade denotes the results with the normal compound loss.
Comparing the two, we find that the model trained with the partially compound loss achieves better results for tumor segmentation, with an improvement in DSC of about 0.05.

\begin{figure}[!htbp]
\centering
\includegraphics[scale=0.35]{imgs/flare_show3.pdf}
\caption{3D visualization on two easy cases (Case \#FLARE23Ts\_0038 with DSC of 0.89 and Case \#FLARE23Ts\_0043 with DSC of 0.84) and two hard cases (Case \#FLARE23Ts\_0057 with DSC of 0.66 and Case \#FLARE23Ts\_0067 with DSC of 0.74).
}
\label{fig:seg3d}
\end{figure}






\subsection{Qualitative results on validation set}
% please show at least two examples with good segmentation results and two examples with bad segmentation results in the validation set 

% the ground truth of 50 validation cases has been released 
% \url{https://drive.google.com/drive/folders/16Jz7nC907UOhr7hhWLUS4X9XwgkV0qBJ?usp=drive_link}

% \begin{figure}[!htbp]
% \centering
% \includegraphics[scale=0.25]{imgs/flare23-results.png}
% \caption{please show two examples with good segmentation results and two examples with bad segmentation results in the validation set. this is a demo figure. (credit to Zhang, F. et al. FLARE22.)
% }
% \label{fig:seg}
% \end{figure}


% How to draw this kind of figure?
% 1. Install ITKSNAP \url{http://www.itksnap.org/pmwiki/pmwiki.php}

% 2. Open the image and adjust the window level and width to 40 and 400 respectively

% 3. Load segmentation results and adjust the label opacity to 50

% 4. Task snapshots (ITKSNAP has this function: click on the camera button )

% 5. Put all these snapshots in ppt
Figure~\ref{fig:seg} presents easy and difficult validation set examples for segmentation, along with a 3D visualization in Figure~\ref{fig:seg3d}.
Promising results were observed for Case \#FLARE23Ts\_0038 and Case \#FLARE23Ts\_0043, but the segmentation of Case \#FLARE23Ts\_0057 and Case \#FLARE23Ts\_0067 was poor because a large tumor caused the network to make classification errors.


\subsection{Segmentation efficiency results on validation set}


In Table~\ref{tab:efficiency}, we list a set of cases whose image size increases from (512, 512, 55) to (512, 512, 554).
The efficiency evaluation results are reported from official tests.
The average max GPU memory is 2531MB, and the running time of the largest case \#0029 is only about twice that of the smallest case \#0001.
This demonstrates the effectiveness of our inference strategy.


\begin{table}[h!]
\caption{Quantitative evaluation of segmentation efficiency in terms of the running time and GPU memory consumption. Total GPU denotes the area under GPU Memory-Time curve. Evaluation GPU platform: NVIDIA QUADRO RTX5000 (16G).
% Note: if you didn't make validation docker submissions during the challenge, you can obtain these metrics on your local GPU. Our evaluation code has been released at \url{https://github.com/JunMa11/FLARE/tree/main/FLARE23}. You could use a similar table format for ablation studies. Please don't change the case IDs. 
}\label{tab:efficiency}
\centering
\begin{tabular}{ccccc}
\hline
Case ID & Image Size      & Running Time (s) & Max GPU (MB) & Total GPU (MB) \\ \hline
% 0001    & (512, 512, 55)  & e.g., 8.23       & e.g., 2453   & e.g., 10453    \\
0001 & (512, 512, 55) & 19.61  & 2426  & 10028  \\ 
0051 & (512, 512, 100) & 17.83  & 2590  & 12296  \\ 
0017 & (512, 512, 150) & 30.86  & 2634  & 15949  \\ 
0019 & (512, 512, 215) & 22.72  & 2486  & 12401  \\ 
0099 & (512, 512, 334) & 27.94  & 2586  & 15394  \\ 
0063 & (512, 512, 448) & 33.50  & 2630  & 17508  \\ 
0048 & (512, 512, 499) & 35.22  & 2614  & 18610  \\ 
0029 & (512, 512, 554) & 42.53  & 2744  & 22299 \\ 
\hline
\end{tabular}
\end{table}
\subsection{Results on final testing set}
% This is a placeholder. We will send you the testing results during MICCAI (2023.10.8).
In Table~\ref{tab:final-results}, we report the DSC and NSD of the final testing set.
The average values are comparable to those of the 50 public validation cases and the 100 online validation cases, with both achieving a DSC of about 0.80 and an NSD of about 0.86.
In general, the segmentation accuracy is low for small and complex-shaped objects such as tumors, adrenal glands, and the duodenum.
Their accurate segmentation still faces great challenges and needs more attention, especially when dealing with extremely small and unclear boundaries.

% Ablation study
\begin{table}[htbp]
\caption{Ablation studies of online validation quantitative evaluation results in terms of DSC(\%) and NSD(\%). P-Cascade denotes the results with the partially compound loss and N-Cascade denotes the results with the normal compound loss.
}\label{tab:Ablation-results}
\centering
\begin{tabular}{l|cc|cc|cc|cc}
\hline
\multirow{2}{*}{Target} & \multicolumn{2}{c|}{w/o unlabeled data} & \multicolumn{2}{c|}{Normal inference} & \multicolumn{2}{c|}{N-Cascade} & \multicolumn{2}{c}{P-Cascade} \\ \cline{2-9} 
        % & DSC(\%)            & NSD(\%)           & DSC(\%)            & NSD(\%)           & DSC(\%)      & NSD (\%)     \\ \hline
        & DSC           & NSD         & DSC            & NSD           & DSC      & NSD   & DSC      & NSD   \\ \hline
Liver               & 95.77 & 97.09 & 97.34 & 97.46 & 95.63 & 97.63 & 95.9     & 97.5     \\
Right Kidney        & 89.93 & 90.49 & 92.18 & 91.46 & 90.27 & 91.27 & 89.9     & 91.28    \\
Spleen              & 93.57 & 94.46 & 97    & 97.58 & 91.34 & 92.01 & 92.68    & 93.44    \\
Pancreas            & 79.66 & 93.5  & 84.22 & 94.82 & 79.74 & 93.78 & 79.79    & 93.6     \\
Aorta               & 92.29 & 96.86 & 96.57 & 99.03 & 92.59 & 97.42 & 93.23    & 97.79    \\
Inferior vena cava  & 87.24 & 89.84 & 91.06 & 91.43 & 86.38 & 88.25 & 87.06    & 89.12    \\
Right adrenal gland & 74.24 & 91.78 & 85.51 & 95.48 & 72.75 & 90.19 & 73.35    & 90.59    \\
Left adrenal gland  & 71.19 & 87.59 & 83.27 & 93.27 & 72.47 & 89.09 & 72.33    & 88.76    \\
Gallbladder         & 80.34 & 79.38 & 86.09 & 86.55 & 77.9  & 77.05 & 80.54    & 79.83    \\
Esophagus           & 78.13 & 90.88 & 83.09 & 93.4  & 78.57 & 91.89 & 79.05    & 92.26    \\
Stomach             & 90.52 & 94.52 & 93.12 & 95.51 & 89.95 & 94.58 & 90.37    & 94.91    \\
Duodenum            & 77.31 & 93.19 & 81.45 & 93.43 & 78.42 & 94.19 & 78.25    & 93.97    \\
Left kidney         & 88.69 & 88.97 & 91.06 & 90.65 & 88.23 & 89.43 & 87.67    & 86.93    \\
Tumor               & 10.64 & 5.92  & 15.17 & 8.42  & 10.25 & 6.99  & 15.88    & 10.43    \\


\hline
Average             & 79.25 & 85.32 & 84.08 & 87.75 & 78.89 & 85.27 & 79.71 & 85.74 \\ 
\hline
\end{tabular}
\end{table}

% \begin{table}[htbp]
% \caption{Ablation studies of online validation quantitative evaluation results. 
% }\label{tab:Ablation-results}
% \centering
% \begin{tabular}{l|cc|cc|cc}
% \hline
% \multirow{2}{*}{Target} & \multicolumn{2}{c|}{w/o unlabeled data} & \multicolumn{2}{c|}{Normal inference} & \multicolumn{2}{c}{Cascade} \\ \cline{2-7} 
%         % & DSC(\%)            & NSD(\%)           & DSC(\%)            & NSD(\%)           & DSC(\%)      & NSD (\%)     \\ \hline
%         & DSC           & NSD         & DSC            & NSD           & DSC      & NSD     \\ \hline
%         Liver & 95.77 & 97.09 & 97.34 & 97.46 & 95.63 & 97.63 \\ 
%         Right Kidney & 89.93 & 90.49 & 92.18 & 91.46 & 90.27 & 91.27 \\ 
%         Spleen & 93.57 & 94.46 & 97 & 97.58 & 91.34 & 92.01 \\ 
%         Pancreas & 79.66 & 93.5 & 84.22 & 94.82 & 79.74 & 93.78 \\ 
%         Aorta & 92.29 & 96.86 & 96.57 & 99.03 & 92.59 & 97.42 \\ 
%         Inferior vena cava & 87.24 & 89.84 & 91.06 & 91.43 & 86.38 & 88.25 \\ 
%         Right adrenal gland & 74.24 & 91.78 & 85.51 & 95.48 & 72.75 & 90.19 \\ 
%         Left adrenal gland & 71.19 & 87.59 & 83.27 & 93.27 & 72.47 & 89.09 \\ 
%         Gallbladder & 80.34 & 79.38 & 86.09 & 86.55 & 77.9 & 77.05 \\ 
%         Esophagus & 78.13 & 90.88 & 83.09 & 93.4 & 78.57 & 91.89 \\ 
%         Stomach & 90.52 & 94.52 & 93.12 & 95.51 & 89.95 & 94.58 \\ 
%         Duodenum & 77.31 & 93.19 & 81.45 & 93.43 & 78.42 & 94.19 \\ 
%         Left kidney & 88.69 & 88.97 & 91.06 & 90.65 & 88.23 & 89.43 \\ 
%         Tumor & 10.64 & 5.92 & 15.17 & 8.42 & 10.25 & 6.99 \\ 
% \hline
% Average & 79.25 & 85.32 & 84.08 & 87.75 & 78.89 & 85.27 \\ 
% \hline
% \end{tabular}
% \end{table}
\subsection{Limitation and future work}
There are many ways to improve the network inference process, such as a more efficient sliding window.
The challenge provided 4,000 CT cases, but we only utilized 300 cases and did not adequately utilize the data.
For the challenging task of tumor segmentation, pseudo-labeling is a simple and effective way to improve model performance, and we will continue to explore methods that utilize both pseudo-labeling and partial labeling learning in the future.



\section{Conclusion}
In this paper, we present a framework that combines partial labeling learning and pseudo-labeling, which is effective and flexible for a variety of situations.
In addition, we use a small nnU-Net and improve the inference process, effectively reducing its required computational resources and inference time.
Because the amount of data used in training is small, performance on the full data will be explored in the future.
The approach in this paper can serve as a good baseline for exploring partial labeling learning and pseudo-labeling.



\subsubsection{Acknowledgements} The authors of this paper declare that the segmentation method they implemented for participation in the FLARE 2023 challenge has not used any pre-trained models nor additional datasets other than those provided by the organizers. The proposed solution is fully automatic without any manual intervention. We thank all the data owners for making the CT scans publicly available and CodaLab~\cite{codalab} for hosting the challenge platform. 


%
% ---- Bibliography ----
%
% BibTeX users should specify bibliography style 'splncs04'.
% References will then be sorted and formatted in the correct style.
%
\bibliographystyle{splncs04}
\bibliography{ref}

\newpage
% Please add the following required packages to your document preamble:
% \usepackage[normalem]{ulem}
% \useunder{\uline}{\ul}{}
\begin{table}[!htbp]
\caption{Checklist Table. Please fill out this checklist table in the answer column.}
\centering
\begin{tabular}{ll}
\hline
Requirements                                                                                                                    & Answer        \\ \hline
A meaningful title                                                                                                              & Yes        \\ \hline
The number of authors ($\leq$6)                                                                                                             & 4       \\ \hline
Author affiliations and ORCID                                                                                           & Yes        \\ \hline
Corresponding author email is presented                                                                                                  & Yes        \\ \hline
Validation scores are presented in the abstract                                                                                 & Yes       \\ \hline
\begin{tabular}[c]{@{}l@{}}Introduction includes at least three parts: \\ background, related work, and motivation\end{tabular} & Yes        \\ \hline
A pipeline/network figure is provided                                                                                           & \ref{fig:Network} \\ \hline
Pre-processing                                                                                                                  & 3~\ref{sec:preprocess}  \\ \hline
Strategies to use the partial label                                                                                             & 3~\ref{sec:partloss}  \\ \hline
Strategies to use the unlabeled images.                                                                                         & 3~\ref{sec:unlabel}   \\ \hline
Strategies to improve model inference                                                                                           & 4~\ref{sec:speed}   \\ \hline
Post-processing                                                                                                                 & 4~\ref{sec:postprocess}   \\ \hline
Dataset and evaluation metric section is presented                                                                              & 5~\ref{sec:measures}   \\ \hline
Environment setting table is provided                                                                                           & \ref{table:env}  \\ \hline
Training protocol table is provided                                                                                             & \ref{table:training}  \\ \hline
Ablation study                                                                                                                  & \ref{tab:Ablation-results}   \\ \hline
Efficiency evaluation results are provided                                                                                     & \ref{tab:efficiency} \\ \hline
Visualized segmentation example is provided                                                                                     & \ref{fig:seg} \\ \hline
Limitation and future work are presented                                                                                        & Yes        \\ \hline
Reference format is consistent.  & Yes        \\ \hline

\end{tabular}
\end{table}

\end{document}
