% This is samplepaper.tex, a sample chapter demonstrating the
% LLNCS macro package for Springer Computer Science proceedings;
% Version 2.21 of 2022/01/12
%
\documentclass[runningheads]{llncs}
%
\usepackage[T1]{fontenc}
% T1 fonts will be used to generate the final print and online PDFs,
% so please use T1 fonts in your manuscript whenever possible.
% Other font encodings may result in incorrect characters.
%
\usepackage{multirow}
\usepackage{graphicx}

% Used for displaying a sample figure. If possible, figure files should
% be included in EPS format.
%
% If you use the hyperref package, please uncomment the following two lines
% to display URLs in blue roman font according to Springer's eBook style:
%\usepackage{color}
%\renewcommand\UrlFont{\color{blue}\rmfamily}
\usepackage[pagebackref=true,breaklinks=true,colorlinks,bookmarks=false]{hyperref}
\usepackage{amssymb, amsmath, bm, latexsym, comment}
\usepackage{xcolor}
\usepackage{subfigure}
\usepackage{indentfirst}
\usepackage{setspace}
\usepackage{verbatim}
\usepackage{array}
\usepackage{cite}
\usepackage{booktabs}
\usepackage[capitalize]{cleveref}
\usepackage{arydshln}

\usepackage[ruled,vlined]{algorithm2e}

%
\begin{document}
%
\title{Partial annotation-based organs and tumor segmentation with progressive weakly supervised learning}
%
\titlerunning{PWS-Seg}
% If the paper title is too long for the running head, you can set
% an abbreviated paper title here
%
\author{Dengqiang Jia\inst{1}\orcidID{0000-0002-0902-1882} \and
Zilong Wang\inst{2} } 
%
\authorrunning{Dengqiang Jia et al.}
% First names are abbreviated in the running head.
% If there are more than two authors, 'et al.' is used.
%
\institute{Hong Kong Centre for Cerebro-cardiovascular Health Engineering (COCHE), Hong Kong, China\\
\and
School of Electronic Information and Electrical Engineering,
Shanghai Jiao Tong University, Shanghai, China\\
\email{\{dqjia\}@hkcoche.org}}
%
\maketitle              % typeset the header of the contribution
%
\begin{abstract}
In medical image analysis, obtaining labeled data is expensive and time-consuming.
Numerous unlabeled data can be used for efficient abdominal organ segmentation.
Besides, partially annotated data is easier to collect and can be used to develop label-efficient algorithms, reducing the annotation cost for considerable performance.
We propose a progressive weakly supervised learning framework, i.e., PWS-Seg, for abdominal organ and tumor segmentation.
PWS-Seg can learn from organs to tumors via a progressive framework based on partially annotated images.
Moreover, we apply a class-wise label fusion strategy to obtain a new set of reliable pseudo labels.
On the FLARE2023 online validation cases, with the help of unlabeled data, our method obtained an average Dice similarity coefficient (DSC) of 82.68\% and an average normalized surface Dice (NSD) of 86.00\%, which is better than the method using only partially annotated data.
The average running time is 100.17s per case in the inference phase, and the maximum used GPU
memory is 4128 MB.
%\\
%The total length of the manuscript should be at least 8 pages (don't include references). There is no limitation for the maximum number of pages. 

\keywords{Partial label  \and Self-training \and Organ segmentation.}
\end{abstract}



\section{Introduction}

Supervised segmentation in medical image analysis depends on the quantity and quality of manual voxel-level labels, whose acquisition is time-consuming and expensive because it requires professional domain knowledge.

In more common scenarios where there is a small amount of labeled data and a large amount of unlabeled data \cite{everingham2015pascal, cordts2016cityscapes}, semi-supervised segmentation is effective \cite{blum1998combining,chen2020naive, mittal2019semi}.
Larger medical image datasets are provided for researchers, e.g., FLARE22 Challenge\cite{ma2023fast}, to develop the semi-supervised segmentation.

Besides semi-supervised segmentation, label-efficient image segmentation has attracted attention because it relaxes the need for dense labels to weak or partial labels, leading to weakly supervised segmentation (WSS)~\cite{wang2022bowelnet,lv2023robustness,lei2023one}.
The bounding boxes, scribbles, and partial labels are the most commonly used supervision types for WSS\cite{wang2022bowelnet}. 
Among all these types, partial annotations, i.e., only one or a few classes are labeled in images, can provide the flexibility to allocate the workload and have the potential to reduce the annotation cost.

However, it is difficult to directly learn from partially labeled datasets via traditional fully supervised learning frameworks.
To segment multiple organs, existing methods made efforts to learn from partially annotated datasets.
 
When training with partially-labeled images, Zhou et al.\ tried to incorporate anatomical priors from a fully-labeled dataset~\cite{zhou2019prior}.
Fang et al.\ proposed the target adaptive loss to learn a unified multi-organ segmentation model from partially- and fully-labeled datasets~\cite{fang2020multi}.
Huang et al.\ trained multiple binary segmentation models using partially-labeled images and then learned a multi-organ network using pseudo labels from the binary models~\cite{huang2020multi}.
Zhang et al.\ used a dynamic segmentation head and a task-specific controller to address the partial annotation issues~\cite{zhang2021dodnet}.


\begin{figure}[!htp]
\centering
\includegraphics[scale=0.4]{imgs/lab_vol_statis.pdf}
\caption{Volume statistics of organs and tumors for partially labeled data of FLARE23. The labels in the horizontal coordinate are the abbreviations of the names of 13 organs (from Liver(LI) to Left Kidney(LK)) and tumor. 
}
\label{fig:lab_vol}
\end{figure}

FLARE23 is different from FLARE22 in two ways; the first is that FLARE23 is partially labeled, and the second is that FLARE23 requires segmenting the tumor. 
One of the reasons why tumor segmentation is more difficult than organ segmentation is the uncertain location and state of tumors.
For example, as shown in Figure~\ref{fig:lab_vol}, the volume distribution of the tumor (TU) has a larger variance than that of the other 13 organs.

In this paper, we propose a self-training strategy, i.e., a progressive weakly supervised learning framework (PWS-Seg), to segment the abdomen organs and tumors based on partial label annotation.
The framework consists of two stages, i.e., the organ segmentation (OS) stage and the organ-tumor segmentation (OTS) stage.
The OS stage extracts the features of multiple organs from the limited labeled organs and then propagates the knowledge to unlabeled regions.
The OTS stage uses the consistency of organ prediction during the progressive self-training process to correct the pseudo labels of organs and tumors.






\section{Method}
Figure~\ref{fig:method_diagram} shows the diagram of our two-stage approach.

We propose a progressive weakly-supervised segmentation (PWS-Seg) framework for multi-organ and tumor segmentation tasks, which can leverage partially annotated data and a large amount of unlabeled data.
Since we assume that the abdominal tumors were highly spatially related to the abdominal organs, we introduce a two-stage segmentation to complete the organ-tumor segmentation task.
The two networks have similar UNet-based architectures.

Algorithm~\ref{alg:pws} presents the scheme for PWS-Seg.
In the first stage, we train the organ segmentation network (OS CNN) twice (Section~\ref{sec:organ seg}). 
We can use each of the trained OS CNNs to predict the pseudo labels of organs of unlabeled data and partially annotated data.
OS CNN was trained using our previous work, i.e., a cross-supervision method\cite{jia2022semi}.

In the second stage, we also train the organ and tumor segmentation network (OTS CNN) twice.
We can use each of the trained CNNs to predict the pseudo labels of organs and tumors of unlabeled data and partially annotated data.
To generate more reliable pseudo labels, we propose a fusion strategy for the pseudo labels and partially annotated data (Section~\ref{sec:label fusion}).








%###########################
\subsection{Preprocessing}
The annotated images were cropped as patches using their corresponding labels, which avoided using numerous patches without any labels.
For the unlabeled images, we used the results of the OS stage to crop the abdominal organ regions.
All the images were resampled to a fixed spacing, i.e., $2.5\,\mathrm{mm}\times 0.8\,\mathrm{mm}\times 0.8\,\mathrm{mm}$.



\begin{figure}[htbp]
\centering
\includegraphics[scale=0.75]{imgs/figures_method.pdf}
\caption{The diagram of the two-stage organ and tumor segmentation framework.
Stage 1 (OS stage): A fully annotated organ dataset and an unlabeled dataset are used to segment the organ using the cross-supervision method. 
The trained OS CNN ($p_{\theta_1}$) is used to predict pseudo labels of organs of the unlabeled images.
The pseudo labels of organs, which can also be seen as a set of partially annotated labels of organs and tumors, are combined with the given partially annotated labels to generate a set of more reliable pseudo labels.
Stage 2 (OTS stage): Based on the more reliable dataset, we can segment the organs and tumors, simultaneously.
}
\label{fig:method_diagram}
\end{figure}

\begin{algorithm}[t]
	\caption{Framework of PWS-Seg.}
	\label{alg:pws}
	\KwIn{Partially annotated data: $\mathcal{P}=\{\mathcal{P}_O,\mathcal{P}_{OT}\}$; Unlabeled data:$\mathcal{U}$; Organ segmentation network:OS CNN($\theta$),Organ-tumor segmentation network:OTS CNN($\varphi$); }
	\KwOut{$\varphi$.}  
	\BlankLine
	Initialize $\theta$ and $\varphi$ according to Tables~\ref{table:training-OS} and \ref{table:training-OTS};
	
	\ForEach {epoch in max-num-epoch}{
        Sample batch from $\mathcal{P}_O$, $\mathcal{P}_{OT}$ and $\mathcal{U}$; $\mathcal{P}_{OT}$ in this step were considered as unlabeled data;
        
        Train parameter $\theta$ of OS CNN;
	}
	Predict pseudo labels of organs for $\mathcal{U}$ and $\mathcal{P}_{OT}$, divide the tumor label into different organs;
 
 \ForEach {epoch in max-num-epoch}{
        Sample batch from $\mathcal{P}_O$, and sample batch from $\mathcal{P}_{OT}$ and $\mathcal{U}$ with their pseudo labels of organs;
        
        Train parameter $\theta$ of OS CNN;
	}
 
 Predict pseudo labels of organs ($\hat{\mathcal{Y}}$) for $\mathcal{P}_{OT}$ using trained OS CNN; 
 
 For $\mathcal{P}_{OT}$, discard the data whose partial labels do not contain the tumor class and fuse the pseudo labels of organs with the partial annotations, resulting in a set of data with more reliable pseudo labels $\tilde{\mathcal{Y}}$;

 \ForEach {epoch in max-num-epoch}{
        Sample batch from $\tilde{\mathcal{Y}}$;

        Train parameter $\varphi$ of OTS CNN;
	}
  Predict pseudo labels of organs and tumors ($\hat{\mathcal{Y}}$)  for $\mathcal{P}_{O}$, $\mathcal{P}_{OT}$ and $\mathcal{U}$; 
  
 Fuse the partial labels and pseudo labels into $\tilde{\mathcal{Y}}$;
 
 \ForEach {epoch in max-num-epoch}{
        Sample batch from  $\tilde{\mathcal{Y}}$ ;
        
        Train parameter $\varphi$ of OTS CNN;
	}


\end{algorithm}



\subsection{Notation of partial label segmentation }\label{note ow pl}

Let $\mathcal{L}=\left\{(\boldsymbol{x}^{(1)},\boldsymbol{y}^{(1)}),(\boldsymbol{x}^{(2)},\boldsymbol{y}^{(2)}),...,(\boldsymbol{x}^{(N)},\boldsymbol{y}^{(N)})\right\}$ and $\mathcal{U}=\left\{\boldsymbol{x}^{(N+1)},...,\boldsymbol{x}^{(M)}\right\}$ denote the labeled data and unlabeled data.
Here, we denote $\boldsymbol{x}^{(i)}$ as the intensity image, and denote $\boldsymbol{y}^{(i)}$ as the annotation (label) image.

We can define the partial label data $\mathcal{P}$ based on $\mathcal{L}$.
For a given element $(\boldsymbol{x}^{(i)}, \boldsymbol{y}^{(i)})$ in $\mathcal{L}$ and class $k$, we can define the partial annotated label image:
\begin{equation}
\boldsymbol{y}^{(i)}=[y_1^{(i)}, y_2^{(i)}, \dots, y_K^{(i)}] \in \{0,1\}^K.
\end{equation}
Here, $y_k^{(i)}=1$ and $y_k^{(i)}=0$ mean that the annotation of class $k$ is present and absent for $\boldsymbol{x}^{(i)}$, respectively.
Based on the definitions of $\mathcal{P}$ and $\mathcal{L}$, the partially labeled data $\mathcal{P}$ is also the labeled data related to the target class $K$.
In this paper, we assume that the tumors were unknown, such that those images where all the organs are fully labeled (i.e., $\mathcal{P}_O$) can be seen as partially annotated images.
The images where organs and tumors are partially labeled are denoted as $\mathcal{P}_{OT}$.

The aim of the partial label segmentation is to obtain a segmentation plan that can leverage all the obtained data, i.e., $\mathcal{P}$ and $\mathcal{U}$.
We can use the segmentation plan $p_{\varphi}$ to predict a probability map $\boldsymbol{P}$ for $\boldsymbol{x}$ as:
\begin{equation}
    \boldsymbol{P}=p_{\varphi}(\boldsymbol{x}).
\end{equation}

\subsection{Organ segmentation using cross supervision}\label{cross-supervision}
\label{sec:organ seg}

In the organ and tumor segmentation task, fully annotated organ labels can also be seen as partial labels when we assume that the tumor class is absent.
In this stage, to segment abdominal organs with limited organ-labeled and unlabeled data $\mathcal{U}$, we use the cross-supervision method, which is presented in our previous work\cite{jia2022semi}.
Two sub-networks ($p_{\theta_1}$ and $p_{\theta_2}$) are introduced.
They have the same structures and the same number of parameters but are initialized differently at the beginning.

During the training procedure, to leverage the unlabeled data, the cross-supervised (CS) losses for organ-labeled and unlabeled data, i.e., $\mathcal{C}_{u}^{l}$ and $\mathcal{C}_{u}^{u}$, are introduced, respectively.

Together with the supervised loss $\mathcal{C}_{s}$ on the organ-labeled data, the training loss function for organ segmentation can be formulated as:
\begin{equation}
    \mathcal{C}=\mathcal{C}_{s}+\mathcal{C}_{u}^{u}+\mathcal{C}^{l}_{u}.
\end{equation}

In the first stage, we train the OS CNN twice.
During the first training procedure, we input fully organ-labeled images ($\mathcal{P}_O$), partially labeled images($\mathcal{P}_{OT}$), and unlabelled images ($\mathcal{U}$).
The partially labeled images are treated as unlabelled images in this stage.
We use the first trained OS CNN to predict pseudo-labels for both partially labeled and unlabelled images.
Since there are a certain number of correct organ labels in the partially labeled images, we retain the labeled organ labels and classify the tumor regions into organ categories when generating pseudo-labeled images for the partially labeled images.
In other words, we only use data with organ labels in the second training of OS CNN.
At the end of the OS stage, we use the second trained OS CNN to predict $\hat{\boldsymbol{y}}$ as shown in Figure~\ref{fig:method_diagram}.
More details about this stage are provided in Algorithm~\ref{alg:pws}.


\subsection{Class-wise label fusion strategy for partially annotated labels}\label{sec:label fusion}
To obtain more reliable pseudo labels, we propose a class-wise label fusion strategy to use partially labeled data.
Although the cross-supervision strategy can leverage the unlabeled data $\mathcal{U}$, partially labeled data cannot be used directly.
We can generate more reliable pseudo labels of partially annotated images.

Given the partially labeled data $(\boldsymbol{x}^{(j)},\boldsymbol{y}^{(j)})$ in $\mathcal{P}$, its corresponding pseudo label is:
\begin{equation}
\hat{\boldsymbol{y}}^{(j)}=[\hat{y}_1^{(j)}, \hat{y}_2^{(j)}, \dots, \hat{y}_K^{(j)}] \in \{1\}^K.
\end{equation}
Here, $\hat{\boldsymbol{y}}^{(j)}\in \hat{\mathcal{Y}}$ can be generated by any teacher model.
Based on partial label and pseudo label information, we can generate more reliable pseudo labels for partially labeled images,
$\tilde{\boldsymbol{y}}^{(j)}=[\tilde{y}_1^{(j)}, \tilde{y}_2^{(j)}, \dots, \tilde{y}_K^{(j)}]\in \tilde{\mathcal{Y}}$:
\begin{equation}\label{eq:label fusion}
\tilde{y}_k^{(j)}=\begin{cases}
y_k^{(j)}, & \text{if } y_k^{(j)}=1; \\
\hat{y}_k^{(j)}, & \text{otherwise}.
\end{cases}
\end{equation}

In the second stage, we also train the OTS CNN twice.
During the first training procedure, we use the label fusion strategy (Equation~\ref{eq:label fusion}) to generate a set of more reliable pseudo labels for the partially annotated images.
It should be noted that the partially annotated images whose labels did not contain the tumor class were discarded, resulting in a set of data with more reliable pseudo labels $\tilde{\mathcal{Y}}$.
We use the first trained OTS CNN to predict pseudo-labels for both partially labeled and unlabelled images.
Label fusion strategy and label selection strategy are also used to generate the second set of pseudo labels, which are used to train a second OTS CNN.
More details about this stage are provided in Algorithm~\ref{alg:pws}.





\section{Experiments}
\subsection{Dataset and evaluation measures}
The FLARE 2023 challenge is an extension of the FLARE 2021--2022 challenges~\cite{MedIA-FLARE21,FLARE22}, aiming to promote the development of foundation models in abdominal disease analysis. The segmentation targets cover 13 organs and various abdominal lesions. The training dataset is curated from more than 30 medical centers under the license permission, including TCIA~\cite{TCIA}, LiTS~\cite{LiTS}, MSD~\cite{simpson2019MSD}, KiTS~\cite{KiTS,KiTSDataset}, autoPET~\cite{autoPET-Data,autoPET-MICCAI22}, TotalSegmentator~\cite{TotalSegmentator}, and AbdomenCT-1K~\cite{AbdomenCT-1K}. The training set includes 4000 abdomen CT scans, of which 2200 have partial labels and 1800 have no labels. The validation and testing sets include 100 and 400 CT scans, respectively, which cover various abdominal cancer types, such as liver cancer, kidney cancer, pancreas cancer, colon cancer, gastric cancer, and so on. The organ annotation process used ITK-SNAP~\cite{ITKSNAP}, nnU-Net~\cite{nnUNet}, and MedSAM~\cite{MedSAM}.


The evaluation metrics encompass two accuracy measures—Dice Similarity Coefficient (DSC) and Normalized Surface Dice (NSD)—alongside two efficiency measures—running time and area under the GPU memory-time curve. These metrics collectively contribute to the ranking computation. Furthermore, the running time and GPU memory consumption are considered within tolerances of 15 seconds and 4 GB, respectively.


\subsection{Implementation details}
\subsubsection{Environment settings}
The development environments and requirements are presented in Table~\ref{table:env}.


\begin{table}[!htbp]
\caption{Development environments and requirements.}\label{table:env}
\centering
\begin{tabular}{ll}
\hline
System       &  Ubuntu 20.04.4 LTS and Windows 11\\
\hline
CPU   & e.g., Intel(R) Core(TM) i9-13900K CPU@3.00GHz \\
\hline
RAM                         &16$\times $4GB; 2.67MT$/$s\\
\hline
GPU (number and type)                         & One NVIDIA 3090 24G\\
\hline
CUDA version                  & 11.0\\                          \hline
Programming language                 & Python 3.8\\ 
\hline
Deep learning framework & torch 1.11.0 \\                                              
\hline
\end{tabular}
\end{table}


\subsubsection{Training protocols}
We implemented the proposed framework using EfficientSegNet~\cite{zhang2021efficient} and nnU-Net~\cite{nnUNet}, which were used in the FLARE21~\cite{MedIA-FLARE21} and FLARE22~\cite{ma2023fast} challenges.


In the OS stage of our proposed method, we used 2250 partially labeled images (50 fully organ-annotated images from FLARE22 and 2200 partially labeled images from FLARE23) and 2800 (1000 from FLARE22\cite{ma2023fast} and 1800 from FLARE23) unlabeled images.
In this stage, we also employed a cascade strategy, which aimed to segment the abdomen organs via a coarse-to-fine procedure.
Since the abdominal region is large, we cannot efficiently segment all organs in a single stage.
Therefore, we segmented the organs from down-sampled images in the first place, which can be seen as a coarse segmentation.
With the help of coarse segmentation results, we segmented the organs of the original images.


In the OTS stage, we used 50 partially annotated images (fully organ-annotated images), 1496 partially annotated images (containing the tumor label), and 2800 (1000 from FLARE22 and 1800 from FLARE23) unlabeled images to train the network.

We used the same processing strategy and data augmentation method for all images as in our previous work\cite{jia2022semi}.
Cropping, random rotation, random translation, and random elastic deformation were used for data augmentation.
We randomly resampled the data with the size and spacing described in Table~\ref{table:training-OS} and Table~\ref{table:training-OTS}.
%Please describe at least the following aspects:
%1. processing of the unlabeled images and partial labels



%2. Data augmentation (Based on the winning solutions in FLARE 2021~\cite{MedIA-%FLARE21}, we recommend using extensive data augmentation)

%3. patch sampling strategy

%4. optimal model selection criteria


\begin{table*}[h]
\caption{Training protocols for OS stage.}
\label{table:training-OS}
\begin{center}
% \resizebox{0.47\textwidth}{!}{
\begin{tabular}{ll} 
\hline
Network initialization         & Kaiming normal initialization\\
\hline
Batch size                    & 8(coarse), 1(fine) \\
\hline 
Input size (coarse) & 160$\times$160$\times$160 \\ 
\hline
Input size (fine) & 192$\times$192$\times$192 \\ 
\hline
Total epochs & 500(coarse), 1000(fine) \\
\hline
Optimizer         &Adam with betas (0.9, 0.99),
L2 penalty: 0.00001      \\ 
\hline
Loss &Dice loss and focal loss
(alpha = 0.5, gamma = 2)\\
\hline
Initial learning rate (lr)  & 0.01 \\ \hline
Training time
& 6 (coarse), 300 (fine) hours\\ \hline
\end{tabular}
%}
\end{center}
\end{table*}


\begin{table*}[!htbp]
\caption{Training protocols for OTS stage.}
\label{table:training-OTS}
\begin{center}
% \resizebox{0.47\textwidth}{!}{
\begin{tabular}{ll} 
\hline
Network initialization         &Kaiming normal initialization \\
\hline
Batch size                    & 2 \\
\hline 
Patch size & 48$\times$192$\times$192  \\ 
\hline
Total epochs &  1000 \\
\hline
Optimizer          & Adam       \\ \hline
Initial learning rate (lr)  &0.01  \\ \hline
Training time                                           & 672 hours \\  \hline 
Loss function & Cross-entropy and Dice loss \\     \hline
Number of model parameters    & 48.84M\footnote{\url{https://github.com/sksq96/pytorch-summary}} \\ \hline
Number of flops & 995.46G\footnote{\url{https://github.com/facebookresearch/fvcore}} \\ \hline
\end{tabular}
%}
\end{center}
\end{table*}




\section{Results}



The results show that the method using unlabeled data improves the dice score of the method with partially labeled images.

Table~\ref{tab:final-results} shows the results of the proposed methods on the validation dataset.
The results of our submitted solution (docker container), which was evaluated by the organizers of FLARE2023, are reported in Table~\ref{tab:final-results}.
The public validation dataset contains 50 cases, while the online validation dataset contains 100 cases.






% Note: Please describe at least the following aspects in this section


% 1. The effect of using unlabelled cases;


% 2. In what kind of cases the proposed method works well?

% 3. What are the possible reasons for the failed cases or organs?


% 4. Segmentation efficiency analysis


\begin{table}[htbp]
\caption{Quantitative results of PWS-Seg($\mathcal{P}$+$\mathcal{U}$) in terms of DSC and NSD on the validation dataset.
There are 50 validation cases in the public validation and 100 cases in the online validation. We report the mean and standard deviation with $\pm$.
}\label{tab:final-results}
\centering
\begin{tabular}{l|cc|cc|cc}
\hline
\multirow{2}{*}{Target} & \multicolumn{2}{c|}{Public Validation} & \multicolumn{2}{c|}{Online Validation} & \multicolumn{2}{c}{Testing} \\ \cline{2-7} 
                        & DSC(\%)            & NSD(\%)           & DSC(\%)            & NSD(\%)           & DSC(\%)      & NSD (\%)     \\ \hline
Liver                   & 97.21 $\pm$ $1.36$                  &   96.36 $\pm$ 4.55                  &   97.24                 &     96.66                &            &              \\
Right Kidney            & 94.47 $\pm$ $6.31$                   &  94.30 $\pm$ 8.02                  &  94.07                  & 94.12                  &              &              \\
Spleen                  & 96.28 $\pm$ $3.03$                  & 97.14 $\pm$ 5.46                   & 96.47                  &  97.35                 &              &              \\
Pancreas                & 81.74 $\pm$ $5.26$                   & 93.38 $\pm$ 3.60                  &  79.65                  & 91.54                  &             &              \\
Aorta                   & 95.69 $\pm$ $3.68$                   & 97.86 $\pm$ 4.70                  & 95.97                   & 98.09                  &              &              \\
Inferior vena cava      & 92.44 $\pm$ $4.10$                   & 94.23 $\pm$ 4.93                 & 92.21                   & 93.75                  &              &              \\
Right adrenal gland     & 80.35 $\pm$ $6.17$                   & 91.07 $\pm$ 5.29                 & 79.73                   & 91.10                  &              &              \\
Left adrenal gland      & 75.79 $\pm$ $10.49$                   & 84.46 $\pm$ 10.58                  & 75.29                   & 83.67                  &              &              \\
Gallbladder             & 77.61 $\pm$ $25.28$                   & 74.85 $\pm$ 25.43                  & 78.05                   & 74.59                  &              &              \\
Esophagus               & 79.41 $\pm$ $15.57$                   & 90.19 $\pm$ 15.06                  & 80.21                   & 91.24                  &              &             \\
Stomach                 & 90.48 $\pm$ $6.98$                   & 92.52 $\pm$ 9.59                 & 90.02                   & 91.75                  &              &              \\
Duodenum                & 77.32 $\pm$ $9.20$                   & 91.67 $\pm$ 6.10                  & 77.97                   & 91.88                  &              &              \\
Left kidney             & 93.01 $\pm$ $10.26$                   & 91.85 $\pm$ 13.09                  & 92.55                  & 91.83                  &              &              \\
Tumor                   & 34.42 $\pm$ $33.26$                   & 27.40 $\pm$ 27.47                  & 28.05                   & 16.40                  &             &              \\ \hline
Average(Organ)                   & 87.06 $\pm$ $8.02$                 & 91.53 $\pm$ 5.85                  & 86.88                   & 91.35                &             &              \\ \hline
Average                   & 83.30 $\pm$ $15.61$                 & 86.95 $\pm$ 17.45                  & 82.68                   & 86.00                &             &              \\ \hline
\end{tabular}
\end{table}



\begin{table}[t]
% NOTE(review): 50 + 1496 = 1546, but the caption and header rows say 1540 -- confirm which count is correct.
\caption{Quantitative results of PWS-Seg in terms of DSC and NSD on the validation dataset. The symbol 1540 ($\mathcal{P}$) denotes the setting using 50 fully annotated organ images $\mathcal{P}_O$ and 1496 partially annotated organ-tumor images $\mathcal{P}_{OT}$. $\mathrm{PWS}$-$\mathrm{Seg}^{*}$($\mathcal{P}$) denotes the method without the label fusion strategy. $\mathrm{PWS}$-$\mathrm{Seg}^{\dag}$($\mathcal{P}$+$\mathcal{U}$) denotes the method with a larger network. We report the mean and standard deviation with $\pm$.}
\label{tab:results}
\centering
\begin{tabular}{l c c c c c c }
\hline
\multirow{3}{*}{Organ/Tumor} & \multicolumn{2}{c}{ $\mathrm{PWS}$-$\mathrm{Seg}^{*}$($\mathcal{P}$)} & \multicolumn{2}{c}{$\mathrm{PWS}$-$\mathrm{Seg}$($\mathcal{P}$)} & \multicolumn{2}{c}{$\mathrm{PWS}$-$\mathrm{Seg}^{\dag}$($\mathcal{P}$+$\mathcal{U}$)}  \\ 
{} & \multicolumn{2}{c}{ 1540($\mathcal{P}$)=50+1496} & \multicolumn{2}{c}{1540($\mathcal{P}$)=50+1496} & \multicolumn{2}{c}{1540($\mathcal{P}$)+2800($\mathcal{U}$)} \\ 
{} & \multicolumn{2}{c}{DSC(\%), NSD(\%)} & \multicolumn{2}{c}{DSC(\%), NSD(\%)} & \multicolumn{2}{c}{DSC(\%), NSD(\%)}\\ 
\hline
Liver&\multicolumn{2}{c}{96.52$\pm$2.21,95.50$\pm$6.04}& \multicolumn{2}{c}{96.86$\pm$3.29,96.49$\pm$4.20} & \multicolumn{2}{c}{97.71$\pm$0.62,97.81$\pm$1.94}\\
RK&\multicolumn{2}{c}{92.02$\pm$9.34,90.55$\pm$10.99}& \multicolumn{2}{c}{91.04$\pm$15.59,90.15$\pm$16.42 } & \multicolumn{2}{c}{94.94$\pm$7.18,94.93$\pm$7.75}\\
Spleen&\multicolumn{2}{c}{94.79$\pm$7.42,94.32$\pm$8.00}& \multicolumn{2}{c}{95.77$\pm$4.54,96.30$\pm$6.63} & \multicolumn{2}{c}{96.36$\pm$3.45,97.11$\pm$5.44}\\
Pancreas&\multicolumn{2}{c}{78.28$\pm$6.73,89.72$\pm$5.67}& \multicolumn{2}{c}{82.08$\pm$6.11,93.71$\pm$4.87 } & \multicolumn{2}{c}{83.20$\pm$4.91,94.68$\pm$3.22}\\
Aorta&\multicolumn{2}{c}{96.14$\pm$3.06,98.43$\pm$4.40}& \multicolumn{2}{c}{94.84$\pm$4.32,97.49$\pm$5.28} & \multicolumn{2}{c}{95.73$\pm$3.96,97.97$\pm$4.71}\\
IVC&\multicolumn{2}{c}{92.69$\pm$4.03,94.97$\pm$5.13}& \multicolumn{2}{c}{92.74$\pm$4.38,94.54$\pm$5.19} & \multicolumn{2}{c}{92.75$\pm$3.90,94.42$\pm$4.87}\\
RAG&\multicolumn{2}{c}{81.72$\pm$6.04,91.61$\pm$4.87}& \multicolumn{2}{c}{74.41$\pm$10.12,86.38$\pm$7.71 } & \multicolumn{2}{c}{83.42$\pm$5.60,92.84$\pm$4.40}\\
LAG&\multicolumn{2}{c}{74.97$\pm$13.58,84.93$\pm$15.20}& \multicolumn{2}{c}{74.91$\pm$10.47,83.57$\pm$9.88} & \multicolumn{2}{c}{78.49$\pm$9.76,87.31$\pm$9.17}\\
Gallbladder&\multicolumn{2}{c}{77.33$\pm$25.78,76.26$\pm$25.64}& \multicolumn{2}{c}{78.19$\pm$24.62,77.54$\pm$26.19 } & \multicolumn{2}{c}{83.27$\pm$20.08,81.23$\pm$21.11}\\
Esophagus&\multicolumn{2}{c}{76.85$\pm$15.38,89.65$\pm$13.74}& \multicolumn{2}{c}{77.32$\pm$16.26,87.40$\pm$15.92} & \multicolumn{2}{c}{80.04$\pm$15.07,90.75$\pm$14.23}\\
Stomach&\multicolumn{2}{c}{87.97$\pm$9.18,89.38$\pm$11.12}& \multicolumn{2}{c}{89.83$\pm$7.10,92.76$\pm$8.95} & \multicolumn{2}{c}{91.25$\pm$6.25,94.30$\pm$7.40}\\
Duodenum&\multicolumn{2}{c}{77.89$\pm$7.79,92.11$\pm$5.98}& \multicolumn{2}{c}{78.95$\pm$8.63,92.87$\pm$6.13 } & \multicolumn{2}{c}{79.58$\pm$8.71,93.18$\pm$6.02}\\
LK&\multicolumn{2}{c}{87.41$\pm$17.53,83.35$\pm$18.39}& \multicolumn{2}{c}{91.57$\pm$12.15,91.44$\pm$13.83} & \multicolumn{2}{c}{93.33$\pm$12.19,93.08$\pm$14.20 }\\
Tumor&\multicolumn{2}{c}{31.10$\pm$33.89,25.40$\pm$27.84}& \multicolumn{2}{c}{37.72$\pm$33.76,35.18$\pm$29.26} & \multicolumn{2}{c}{44.19$\pm$34.26,40.06$\pm$30.85 }\\
 \hline
Avg.&\multicolumn{2}{c}{81.83$\pm$15.97,85.44$\pm$17.51}& \multicolumn{2}{c}{82.59$\pm$14.72,86.84$\pm$15.29} & \multicolumn{2}{c}{85.30$\pm$13.21,89.26$\pm$14.29}\\
 \hline
\end{tabular}
\end{table}



\subsection{Quantitative results on validation set}

In Table~\ref{tab:final-results}, the average DSC of organ segmentation on public and online validation datasets are separately 87.06\% and 86.88\%, which demonstrates that our method shows robust performance on organ segmentation.
However, the DSC of tumor segmentation on public and online validation datasets are separately 34.42\% and 28.05\%, which shows that tumor segmentation remains challenging.


Compared to the method using only partially annotated data ($\mathrm{PWS}$-$\mathrm{Seg}$($\mathcal{P}$) in Table~\ref{tab:results}), PWS-Seg with unlabeled data improves the average DSC from 82.59\% to 83.30\% and the average NSD from 86.84\% to 86.95\%, which is consistent with the conclusion of the FLARE22 challenge~\cite{ma2023fast}.

Table~\ref{tab:final-results} shows that the Tumor, LAG, Duodenum, and Gallbladder are the four most difficult regions, while the Liver, Spleen, and Aorta are the three easiest organs for abdominal organ and tumor segmentation.
The difficulties may be due to unclear boundaries, class imbalance, and large variations of shapes.
Besides, Table~\ref{tab:final-results} and Table~\ref{tab:results} show that the standard deviations of the Tumor and Gallbladder segmentation are relatively large, which indicates that the method has limited robustness for Tumor and Gallbladder.

To validate the effect of the label fusion strategy, we perform PWS-Seg with simple pseudo labels without any fusion, i.e., $\mathrm{PWS}$-$\mathrm{Seg}^{*}$($\mathcal{P}$) in Table~\ref{tab:results}.
The average DSC decreases from 82.59\% to 81.83\%, which shows the efficacy of the label fusion strategy.

To validate the effect of the larger size of the network on the results, we increase the number of features to 32 and the patch size to $48\times224\times224$, denoted as $\mathrm{PWS}$-$\mathrm{Seg}^{\dag}$($\mathcal{P}$+$\mathcal{U}$).
The results in Table~\ref{tab:results} show that a larger network can improve the segmentation accuracy, which is consistent with the conclusion of the FLARE22 work~\cite{huang2022revisiting}.


As shown in Figure~\ref{fig:seg}, in Case $\#0093$, the segmentation results of our method have large boundary variations of Duodenum. 
Moreover, both in Case $\#0093$ and Case $\#0033$, our method fails to recognize the tumor of RK even if the tumor boundaries are clear.



\begin{table}[htbp]
\caption{Quantitative evaluation of segmentation efficiency in terms of the running time and GPU memory consumption. Total GPU denotes the area under the GPU Memory-Time curve. Evaluation GPU platform: NVIDIA QUADRO RTX5000 (16G).
}
\centering
\begin{tabular}{ccccc}
\hline
Case ID & Image Size      & Running Time (s) & Max GPU (MB) & Total GPU (MB) \\ \hline
0001    & (512, 512, 55)  & 65.13                 & 3442    & 156868         \\
0051    & (512, 512, 100) & 101.57                 & 4402             & 358826               \\
0017    & (512, 512, 150) & 117.42                 & 4618             & 432704               \\
0019    & (512, 512, 215) & 87.14                 & 3874             & 262002               \\
0099    & (512, 512, 334) & 122.24                 & 4388             & 396838               \\
0063    & (512, 512, 448) & 146.61                 & 4602             & 489157               \\
0048    & (512, 512, 499) & 150.43                 & 4524             & 509211               \\
0029    & (512, 512, 554) & 180.96                 & 5196             & 692933               \\ 
\hline
Avg.    & \multirow{2}{*}{-}& \multirow{2}{*}{100.17}                 & \multirow{2}{*}{4128}            & \multirow{2}{*}{339753}  \\
(20 cases)
\\
\hline
\end{tabular}
\label{efficient_results}
\end{table}


\subsection{Qualitative results on validation set}

Figure~\ref{fig:seg} shows two examples with good segmentation results ($\#0038\_\#\mathrm{slice} 172$ and $\#0053\_\#\mathrm{slice}72$) and two examples with bad segmentation results ($\#0093\_\#\mathrm{slice} 58$ and $\#0033\_\#\mathrm{slice} 74$) in the validation set.

\begin{figure}[!htp]
\centering
\includegraphics[scale=0.9]{imgs/figures_example.pdf}
\caption{Qualitative results on good (Case $\#$0038 and Case $\#$0053) and bad (Case $\#$0093 and Case $\#$0033) examples. The first column is the intensity image, \textcolor{black}{the second column is the ground truth, and the third and the fourth columns are the results achieved by our proposed method}. 
PWS-Seg($\mathcal{P}$) denotes the proposed method using only the partially annotated dataset. PWS-Seg($\mathcal{P}$+$\mathcal{U}$) denotes our proposed solution. The DSC of each case is presented in the top-left corner.
}
\label{fig:seg}
\end{figure}

\subsection{\textcolor{black}{Segmentation efficiency results on validation set}}

 		
Table~\ref{efficient_results} presents the segmentation efficiency results of 8 cases, listed in order of increasing image size.
The runtime of the case with the smallest image size, i.e.,  Case $\#$0001, is 65.13 s.
By contrast, the runtime of the case with the largest image size, i.e.,  Case $\#$0029, is 180.96 s.

The mean runtime is 100.17 s per case in the prediction step, the average maximum GPU memory usage is 4128 MB, and the average area under the GPU memory-time curve is 339753 MB.
\subsection{Results on final testing set}




\section{Discussion and conclusion}

% The main finding and results
By using unlabeled data, the proposed progressive weakly supervised method achieved better results than the method using only the partially annotated data.
Whichever method is used, the segmentation of some organs and tumors is still challenging.
Tumors are highly variable in shape and appearance due to uncertainty of location and status. 
As shown in Figure~\ref{fig:lab_vol}, the volumes of tumors have larger variations than organs.
Thus, tumor segmentation obtained disappointing performance because of uncertainties of locations, regularity of unremarkable shapes, unclear boundaries, number of individuals, etc.
The existence of tumors in organs, such as the Liver and Kidneys, is a critical factor in poor organ segmentation performance.
Besides, further research is needed to identify and use the remarkable image properties and shape patterns.

The proposed PWS-Seg model used over 1000 unlabelled images, but its performance is limited by the time-consuming training of the model on a large number of images of the same type.
Future work may need to address how representative training samples can be selected from thousands of images.


\subsection{Limitation and future work}

We summarize the limitations and future work as follows:

\begin{itemize}
	\item Efficiently extract the features of tumors and organs with large shape and appearance variations.
	\item Robust network trained with partially annotated labels.
	\item High-quality datasets which have enough diversity and common features.

\end{itemize}



\subsubsection{Acknowledgements} The authors of this paper declare that the segmentation method they implemented for participation in the FLARE 2023 challenge has not used any pre-trained models nor additional datasets other than those provided by the organizers. The proposed solution is fully automatic without any manual intervention. We thank all the data owners for making the CT scans publicly available and CodaLab~\cite{codalab} for hosting the challenge platform. 

This work was fully supported by the InnoHK Project at the Hong Kong Centre for Cerebro-cardiovascular Health Engineering (COCHE).
%
% ---- Bibliography ----
%
% BibTeX users should specify bibliography style 'splncs04'.
% References will then be sorted and formatted in the correct style.
%
\bibliographystyle{splncs04}
\bibliography{ref}

\newpage
% Please add the following required packages to your document preamble:
% \usepackage[normalem]{ulem}
% \useunder{\uline}{\ul}{}
\begin{table}[!htbp]
\caption{Checklist Table. Please fill out this checklist table in the answer column.}
\centering
\begin{tabular}{ll}
\hline
Requirements                                                                                                                    & Answer        \\ \hline
A meaningful title                                                                                                              & Yes/No        \\ \hline
The number of authors ($\leq$6)                                                                                                             & Number        \\ \hline
Author affiliations and ORCID                                                                                           & Yes/No        \\ \hline
Corresponding author email is presented                                                                                                  & Yes/No        \\ \hline
Validation scores are presented in the abstract                                                                                 & Yes/No        \\ \hline
\begin{tabular}[c]{@{}l@{}}Introduction includes at least three parts: \\ background, related work, and motivation\end{tabular} & Yes/No        \\ \hline
A pipeline/network figure is provided                                                                                           & Figure number \\ \hline
Pre-processing                                                                                                                  & Page number   \\ \hline
Strategies to use the partial label                                                                                             & Page number   \\ \hline
Strategies to use the unlabeled images.                                                                                         & Page number   \\ \hline
Strategies to improve model inference                                                                                           & Page number   \\ \hline
Post-processing                                                                                                                 & Page number   \\ \hline
Dataset and evaluation metric section is presented                                                                              & Page number   \\ \hline
Environment setting table is provided                                                                                           & Table number  \\ \hline
Training protocol table is provided                                                                                             & Table number  \\ \hline
Ablation study                                                                                                                  & Page number   \\ \hline
Efficiency evaluation results are provided                                                                                     & Table number \\ \hline
Visualized segmentation example is provided                                                                                     & Figure number \\ \hline
Limitation and future work are presented                                                                                        & Yes/No        \\ \hline
Reference format is consistent.  & Yes/No        \\ \hline

\end{tabular}
\end{table}

\end{document}
