% This is samplepaper.tex, a sample chapter demonstrating the
% LLNCS macro package for Springer Computer Science proceedings;
% Version 2.21 of 2022/01/12
%
\documentclass[runningheads]{llncs}
%
\usepackage[T1]{fontenc}
% T1 fonts will be used to generate the final print and online PDFs,
% so please use T1 fonts in your manuscript whenever possible.
% Other font encodings may result in incorrect characters.
%
\usepackage{multirow}
\usepackage{graphicx}
% Used for displaying a sample figure. If possible, figure files should
% be included in EPS format.
%
% If you use the hyperref package, please uncomment the following two lines
% to display URLs in blue roman font according to Springer's eBook style:
%\usepackage{color}
%\renewcommand\UrlFont{\color{blue}\rmfamily}
\usepackage[pagebackref=true,breaklinks=true,colorlinks,bookmarks=false]{hyperref}
%
\begin{document}
%
\title{Exploiting Pseudo-Labeling and nnU-Netv2 Inference Acceleration for Abdominal Multi-Organ and Pan-Cancer Segmentation}
%
\titlerunning{Pseudo-Labeling and nnU-Netv2 Acceleration}
% If the paper title is too long for the running head, you can set
% an abbreviated paper title here
%
\author{Ziyan Huang\inst{1,2}\orcidID{0000-0002-1533-5239} \and
Jin Ye\inst{2}\orcidID{0000-0003-0667-9889} \and
Haoyu Wang\inst{2}\orcidID{0000-0002-1753-7336} \and
Zhongying Deng\inst{2}\orcidID{0000-0003-0887-7408} \and
Tianbin Li\inst{2}\orcidID{0009-0001-3617-8324} \and
Junjun He\inst{2}\orcidID{0000-0002-1813-1784} 
} 
%
\authorrunning{Ziyan Huang et al.}
% First names are abbreviated in the running head.
% If there are more than two authors, 'et al.' is used.
%
\institute{Shanghai Jiao Tong University, Shanghai, China \\ \email{ziyanhuang@sjtu.edu.cn} \and
Shanghai AI Laboratory, Shanghai, China
\\
\email{hejunjun@pjlab.org.cn}}
%
\maketitle              % typeset the header of the contribution
%
\begin{abstract}
Deep-learning based models offer powerful tools for the automatic segmentation of abdominal organs and tumors in CT scans, yet they face challenges such as limited datasets and high computational costs. The FLARE23 challenge addresses these by providing a large-scale dataset featuring both partially and fully annotated data, and by prioritizing both segmentation accuracy and computational efficiency. In this study, we adapt the winning FLARE22 strategy to FLARE23 by utilizing a two-step pseudo-labeling approach. Initially, a large model trained on datasets with complete organ annotations generates pseudo-labels for datasets that originally contain only tumor annotations. These labels are then integrated to create a comprehensive training dataset. A smaller, more efficient model is subsequently trained on this enriched dataset for deployment, targeting both tumors and organs. Our approach, utilizing the FLARE23 dataset, has achieved notable results. On the online validation leaderboard, it reached an average DSC of 89.63\% for organs and 46.07\% for lesions, with an average processing time of 16.1 seconds for 20 selected validation cases. In the final testing set, our model demonstrated improved performance, achieving an organ DSC of 89.98\% and lesion DSC of 62.61\%, while reducing the average processing time to 12.02 seconds. The code and model are publicly available at \href{https://github.com/Ziyan-Huang/FLARE23}{https://github.com/Ziyan-Huang/FLARE23}.
\\

\keywords{Medical Image Segmentation  \and Computational Efficiency \and Abdominal Tumors}
\end{abstract}



\section{Introduction}
The abdomen is a prevalent site for tumor growth. 
Accurate annotation of tumors and relevant abdominal organs in CT scans is essential for the diagnosis and treatment of abdominal tumors.
While deep-learning-based methods ease the task of manual annotation for radiologists, several challenges hinder their effectiveness.
First, there is a lack of comprehensive datasets that include annotations for both tumors and various abdominal organs. Many existing datasets focus on either organ-specific or tumor-specific annotations. Therefore, learning accurate segmentations from these partially labeled and unlabeled datasets remains a challenge. 
Second, while state-of-the-art solutions like nnU-Net offer robust performance, they are often computationally intensive, thereby limiting their clinical utility. 
Recognizing these challenges, the FLARE23 challenge has been established. It offers a large-scale dataset that includes both partially annotated and unlabeled data, and it focuses on both segmentation accuracy and efficiency as evaluation metrics.

Given the challenge of insufficiently fully annotated datasets, semi-supervised and partial-label methods have increasingly garnered attention in the field of medical image segmentation.  
DoDNet~\cite{zhang2021dodnet} employs a dynamic on-demand network with a shared encoder-decoder architecture and a unique segmentation head, efficiently segmenting multiple organs and tumors from partially labeled datasets.
In a similar vein, the Universal Model~\cite{liu2023universal_model} employs Contrastive Language–Image Pretraining (CLIP)~\cite{radford2021clip} to extract semantic relationships between abdominal structures, achieving high performance across multiple datasets. MultiTalent~\cite{ulrich2023multitalent} adopts a multi-dataset learning approach, incorporating a class and dataset adaptive loss function to handle varying dataset characteristics and overlapping classes.
As for using unlabeled data, the FLARE22 championship solution~\cite{FLARE22-1st-Huang} demonstrates significant performance gains through pseudo-labeling and label-filtering techniques on unlabeled data. It also introduces a highly efficient, optimized version of nnU-Net~\cite{nnUNet}. However, the advent of nnU-Net v2, which excels in code usability, calls for new acceleration techniques tailored to this updated framework.

In this study, we extend the winning strategy of FLARE22 for application in the FLARE23 challenge by leveraging pseudo-labeling techniques. We employ partially-annotated and unannotated data to create datasets with comprehensive pseudo-labels. For efficiency, two different model sizes are utilized: a larger model for generating pseudo-labels and a smaller, deployable model for the final application. Specifically, we categorize the partially-labeled data into two main groups: one with comprehensive annotations for 13 types of abdominal organs, and another focused on tumor annotations. The pseudo-labeling process is executed in two stages. Initially, a larger model is trained on data with complete organ annotations to specialize in segmenting the 13 abdominal organs. This model then pseudo-labels organ annotations for datasets initially containing only tumor annotations. Subsequently, a full-annotation dataset is created by combining the new organ annotations with existing tumor annotations. A smaller, more efficient model is then trained on this comprehensive dataset for the final deployment. In this manner, we successfully generate organ and tumor labels for all 4,000 images in the training set, while optimizing the inference speed of the latest nnU-Netv2 framework.

% In summary, our contributions can be highlighted as follows:
% \begin{itemize}
%     \item We generate a complete pseudo-labeled dataset for FLARE23 with 4000 images covering 13 abdominal organs and tumors by  a two-step pseudo-labeling approach.
%     \item We optimize the inference speed of the nnU-Net v2 framework, making our solution both accurate and computationally efficient.
% \end{itemize}

\section{Method}

%###########################
\subsection{Preprocessing}
We employ the nnU-Net framework's default preprocessing. For anisotropic data resampling, trilinear interpolation is used in the axial plane and linear interpolation in the sagittal direction. Intensity normalization is performed by clipping values to the 0.5\% ($-970.0$) and 99.5\% ($279.0$) Hounsfield Unit levels, followed by z-normalization using a mean of 80.3 and a standard deviation of 141.4.

\begin{figure}[htbp]
\centering
\includegraphics[scale=0.5]{imgs/pipline.png}
\caption{Pipeline of our two-stage pseudo-labeling method. In the first stage, a large model trained for segmenting 13 organs assigns pseudo-labels to 1,497 tumor-annotated images. These images then receive combined organ and tumor labels. In the second stage, another large model trained on these 1,497 images assigns pseudo-labels for the remaining dataset. Finally, a small model is trained using the complete 4,000-image dataset.}
\label{fig:pipline}
\end{figure}

\subsection{Proposed Method}

Inspired by the winning solution of FLARE 2022 from Huang et al.~\cite{FLARE22-1st-Huang}, we implement a two-stage approach for generating pseudo-labels and eventual model deployment. We employ varying sizes of STU-Net architectures~\cite{huang2023stu} for these stages. For a comprehensive overview of our method, please refer to Figure~\ref{fig:pipline}. 


\subsubsection{STU-Net with different scales}

Figure~\ref{fig:Network} illustrates the architecture of our STU-Net, which serves as an extendable and transferable version of the nnU-Net. We achieve this by fixing certain configurations within the nnU-Net framework, adding residual connections to the basic blocks, and modifying the up-sampling and down-sampling techniques. In our experiments, we employed STU-Net-L for the generation of pseudo-labels and utilized STU-Net-B for the final inference deployment. These specific configurations are elaborated in Table~\ref{table:model}.

\begin{figure}[htbp]
\centering
\includegraphics[scale=0.5]{imgs/fig_model.png}
\caption{Illustration of our STU-Net architecture which is built upon the nnU-Net architecture with several modifications to enhance its scalability and transferability.  
(a) An overview of the STU-Net architecture. The blue arrows denote downsampling while the yellow ones represent upsampling. (b) Residual blocks to achieve a large-scale model. (c) Downsampling in the first residual block of each encoder stage. (d-e) Stem and segmentation head for channel conversion of input and output. (f) Weight-free interpolation for upsampling, which effectively addresses the issue of weight mismatch across different tasks.
}
\label{fig:Network}
\end{figure}

\begin{table}[tb]
\caption{Configurations of STU-Net-L and STU-Net-B models. Depth indicates the number of residual blocks at each resolution stage, and width denotes the channel count at each stage.}
\label{table:model}
\centering
\resizebox{\columnwidth}{!}{
{\begin{tabular}{ccccc}
\hline
 Model &  depth &  width  &  Params (M) & FLOPs (T)  \\ \hline
STU-Net-B & (1,1,1,1,1,1) & (32,64,128,256,512,512) & 58.26 & 0.51 \\
STU-Net-L & (2,2,2,2,2,2) & (64,128,256,512,1024,1024) & 440.30 & 3.81 \\
\hline
\end{tabular}}
}
\end{table}





Loss function: we use the sum of Dice loss and cross-entropy loss because compound loss functions have been proven to be robust in various medical image segmentation tasks~\cite{LossOdyssey}. 

\subsubsection{Handling Partially-Labeled and Unlabeled Data}

We divide the 2,200 partially-labeled FLARE23 images into three main categories, as summarized in Table~\ref{table:partial_labels}. We particularly focus on the subsets containing 250 and 1,497 images. Initially, a large STU-Net model (STU-Net-L) is trained on the 250 images annotated for 13 abdominal organs. This model is then applied to the set of 1,497 images, augmenting the organ annotations while preserving existing tumor labels.

\begin{table}[tb]
\caption{Categorization of Partially-labeled Data in FLARE23 Dataset: 2,200 images grouped into three categories}
\label{table:partial_labels}
\centering
\begin{tabular}{lc}
\hline
Category & Number of Cases \\ \hline
13-organs, no tumor & 250 \\
Tumor, some 5-organs & 1,497 \\
Only 5-organs & 453 \\
\hline
\end{tabular}
\end{table}

For consistency, all pseudo-labels are generated by a large STU-Net model (STU-Net-L). Using the augmented 1,497-image set from the first stage, we train another STU-Net-L model to generate pseudo-labels for the remaining dataset. In the event of annotation conflicts, the original labels are preserved. Ultimately, we employ the fully augmented 4,000-image dataset to train a smaller STU-Net model (STU-Net-B) for efficient deployment and inference.


\subsubsection{Inference Acceleration Based on nnU-Netv2}
We build our efficient inference code upon the popular nnU-Net framework, particularly its latest version, v2. Several optimizations are made to accelerate the inference process. These include using larger target spacing, eliminating the cropping stage, and replacing the resampling function in skimage with torch.nn.functional.interpolate to reduce computational load. Given that the FLARE 2023 competition performs inference on a per-image basis, we transition from multi-threading to single-threaded inference to better align with the competition's structure. Additionally, we adopt last year's championship-winning efficient inference strategy, which involves skipping certain patches during patch-based inference.



\subsection{Post-processing}
During the pseudo-label generation phase, we employed Test-Time Augmentation (TTA) along the anatomical axes: sagittal, coronal, and axial, to enhance the quality of the generated labels. 

However, in the final submission, we skipped post-processing for computational efficiency. The model's raw outputs serve as the final segmentation results without further modification.




\section{Experiments}
\subsection{Dataset and evaluation measures}
The FLARE 2023 challenge is an extension of the FLARE 2021--2022 challenges~\cite{MedIA-FLARE21,FLARE22}, aiming to promote the development of foundation models in abdominal disease analysis. The segmentation targets cover 13 organs and various abdominal lesions. The training dataset is curated from more than 30 medical centers under the license permission, including TCIA~\cite{TCIA}, LiTS~\cite{LiTS}, MSD~\cite{simpson2019MSD}, KiTS~\cite{KiTS,KiTSDataset}, autoPET~\cite{autoPET-Data,autoPET-MICCAI22}, TotalSegmentator~\cite{TotalSegmentator}, and AbdomenCT-1K~\cite{AbdomenCT-1K}. The training set includes 4000 abdomen CT scans, of which 2200 have partial labels and 1800 have no labels. The validation and testing sets include 100 and 400 CT scans, respectively, which cover various abdominal cancer types, such as liver cancer, kidney cancer, pancreas cancer, colon cancer, gastric cancer, and so on. The organ annotation process used ITK-SNAP~\cite{ITKSNAP}, nnU-Net~\cite{nnUNet}, and MedSAM~\cite{MedSAM}.


The evaluation metrics encompass two accuracy measures—Dice Similarity Coefficient (DSC) and Normalized Surface Dice (NSD)—alongside two efficiency measures—running time and area under the GPU memory-time curve. These metrics collectively contribute to the ranking computation. Furthermore, the running time and GPU memory consumption are considered within tolerances of 15 seconds and 4 GB, respectively.


\subsection{Implementation details}
\subsubsection{Environment settings}

The development environments and requirements are presented in Table~\ref{table:env}.

\subsubsection{Training protocols}
To handle partially labeled and unlabeled data, we utilize the preprocessing and pseudo-labeling scheme discussed earlier. Alongside, we adopt extensive data augmentation techniques, including rotations, elastic deformations, and random cropping, to enhance our models' generalization capabilities. For training, a patch-based approach is employed. We use a balanced sampling mechanism in our patch sampling strategy to ensure equal representation of each class in each batch, effectively countering class imbalance issues. We do not conduct model selection. The training protocols for the STU-Net-L and STU-Net-B models are summarized in Table~\ref{table:training} and Table~\ref{table:training2nd}, respectively.


\begin{table}[!htbp]
\caption{Development environments and requirements.}\label{table:env}
\centering
\begin{tabular}{ll}
\hline
System       & CentOS 7 \\
\hline
CPU   & Intel(R) Xeon(R) Platinum 8369B CPU @ 2.90GHz \\
\hline
RAM                         &32$\times $4GB; 2.67MT$/$s\\
\hline
GPU (number and type)                         & one NVIDIA A100 80G\\
\hline
CUDA version                  & 11.7\\                          \hline
Programming language                 & Python 3.9\\ 
\hline
Deep learning framework & torch 2.0 \\
\hline
Specific dependencies         &     nnU-Net 2.1                   \\                                                                      
\hline
Code     &     \href{https://github.com/Ziyan-Huang/FLARE23}{https://github.com/Ziyan-Huang/FLARE23} \\
\hline
\end{tabular}
\end{table}




\begin{table*}[!htbp]
\caption{Training protocols for the STU-Net-L model.}
\label{table:training}
\begin{center}
% \resizebox{0.47\textwidth}{!}{
\begin{tabular}{ll} 
\hline
Network initialization         & He \\
\hline
Batch size                    & 2 \\
\hline 
Patch size & 48$\times$192$\times$192  \\ 
\hline
Total epochs & 2000 \\
\hline
Optimizer          &    SGD with Nesterov momentum ($\mu=0.99$)    \\ \hline
Initial learning rate (lr)  & 0.01 \\ \hline
Lr decay schedule &  poly decay \\ 
\hline
Training time                                           & 48 hours \\  \hline 
Loss function & Dice Loss + Cross Entropy  \\    \hline
Number of model parameters    & 440M\footnote{https://github.com/sksq96/pytorch-summary} \\ \hline
Number of flops & 3.81T\footnote{https://github.com/facebookresearch/fvcore} \\ \hline
CO$_2$eq & 114.02 kg\footnote{https://github.com/lfwa/carbontracker/} \\  \hline
\end{tabular}
%}
\end{center}
\end{table*}


\begin{table*}[!htbp]
\caption{Training protocols for the STU-Net-B model.}
\label{table:training2nd}
\begin{center}
% \resizebox{0.47\textwidth}{!}{
\begin{tabular}{ll} 
\hline
Network initialization         & He \\
\hline
Batch size                    & 2 \\
\hline 
Patch size & 48$\times$128$\times$160  \\ 
\hline
Total epochs & 2000 \\
\hline
Optimizer          & SGD with Nesterov momentum ($\mu=0.99$)          \\ \hline
Initial learning rate (lr)  & 0.01 \\ \hline
Lr decay schedule & poly decay \\
\hline
Training time                                           & 24 hours \\  \hline 
Loss function & Dice Loss + Cross Entropy  \\    \hline
Number of model parameters    & 58M\footnote{https://github.com/sksq96/pytorch-summary} \\ \hline
Number of flops & 510G\footnote{https://github.com/facebookresearch/fvcore} \\ \hline
CO$_2$eq & 17.08 kg\footnote{https://github.com/lfwa/carbontracker/} \\  \hline
\end{tabular}
\end{center}
\end{table*}

\section{Results and discussion}

\begin{table}[htbp]
\caption{Quantitative evaluation results.
}\label{tab:final-results}
\centering
\begin{tabular}{l|cc|cc|cc}
\hline
\multirow{2}{*}{Target} & \multicolumn{2}{c|}{Public Validation} & \multicolumn{2}{c|}{Online Validation} & \multicolumn{2}{c}{Testing} \\ \cline{2-7} 
                        & DSC(\%)            & NSD(\%)           & DSC(\%)            & NSD(\%)           & DSC(\%)      & NSD (\%)     \\ \hline
Liver                   & 97.70 $\pm$ 0.51   & 99.37 $\pm$ 0.48  & 97.61              & 99.29             & 96.57        & 98.20        \\
Right Kidney            & 94.96 $\pm$ 5.19   & 96.84 $\pm$ 6.50  & 93.78              & 95.96             & 93.91        & 95.22        \\
Spleen                  & 96.64 $\pm$ 0.85   & 99.20 $\pm$ 1.32  & 96.67              & 99.41             & 96.09        & 98.47        \\
Pancreas                & 87.07 $\pm$ 4.85   & 97.71 $\pm$ 2.95  & 85.82              & 96.97             & 90.37        & 98.20        \\
Aorta                   & 94.17 $\pm$ 2.18   & 98.64 $\pm$ 2.67  & 94.33              & 98.74             & 94.62        & 99.49        \\
Inferior vena cava      & 92.84 $\pm$ 2.34   & 97.47 $\pm$ 2.32  & 92.81              & 97.34             & 93.34        & 98.40        \\
Right adrenal gland     & 79.18 $\pm$ 12.52  & 94.93 $\pm$ 13.80 & 79.99              & 95.80             & 79.17        & 95.33        \\
Left adrenal gland      & 80.41 $\pm$ 6.70   & 95.70 $\pm$ 4.18  & 79.94              & 94.97             & 80.00        & 95.16        \\
Gallbladder             & 85.91 $\pm$ 19.62  & 88.06 $\pm$ 20.92 & 88.27              & 89.93             & 84.12        & 87.67        \\
Esophagus               & 82.04 $\pm$ 15.17  & 93.95 $\pm$ 14.49 & 82.81              & 94.93             & 88.21        & 98.95        \\
Stomach                 & 93.92 $\pm$ 2.91   & 98.24 $\pm$ 3.25  & 94.19              & 98.34             & 93.53        & 98.09        \\
Duodenum                & 84.65 $\pm$ 6.22   & 96.21 $\pm$ 4.65  & 85.47              & 96.75             & 88.37        & 98.01        \\
Left kidney             & 94.00 $\pm$ 6.88   & 95.41 $\pm$ 9.33  & 93.46              & 95.59             & 92.96        & 94.62        \\
Tumor                   & 53.35 $\pm$ 34.22  & 45.24 $\pm$ 30.74 & 46.07              & 39.17             & 62.61        & 52.15        \\ \hline
Average                 & 86.92 $\pm$ 8.58   & 92.64 $\pm$ 8.40  & 86.52              & 92.37             & 88.13        & 93.43        \\ \hline
\end{tabular}

\end{table}

\begin{table}[htbp]
\caption{Performance Comparison: Partially Labeled vs. Total Data}
\label{table:unlabeled_effect}
\centering
\begin{tabular}{c|c|c|c|c}
\hline
Training Data & Organ DSC & Organ NSD & Tumor DSC & Tumor NSD \\
\hline
2200 Partial Label & 89.45 & 96.20 & 45.91 & 40.04 \\
4000 Total & 89.63 & 96.46 & 46.07 & 39.17 \\
\hline
\end{tabular}
\end{table}

\subsection{Quantitative results on validation set}

Our final model's performance metrics are summarized in Table~\ref{tab:final-results}. Due to limitations in the online submission system, we present the average results obtained solely on a publicly labeled validation set of 50 cases.

Additionally, we conducted an ablation study to assess the impact of utilizing unlabeled data. Specifically, we compared the performance of STU-Net-L models trained on two different datasets: one with 2,200 partially labeled images and another with a total of 4,000 images. The results from the online leaderboard for both training scenarios are detailed in Table~\ref{table:unlabeled_effect}. As indicated by the data in Table~\ref{table:unlabeled_effect}, the inclusion of an extra 1,800 unlabeled images led to only minimal changes in performance metrics.



\begin{table}[htbp]
\caption{Quantitative evaluation of segmentation efficiency in terms of the running time and GPU memory consumption. Total GPU denotes the area under GPU Memory-Time curve. Evaluation GPU platform: NVIDIA QUADRO RTX5000 (16G). 
}
\label{table:efficiency}
\centering
\begin{tabular}{ccccc}
\hline
Case ID & Image Size      & Running Time (s) & Max GPU (MB) & Total GPU (MB) \\ \hline
0001    & (512, 512, 55)  & 22.03       & 2836   & 14975    \\
0051    & (512, 512, 100) & 13.02       & 3144   & 16366    \\
0017    & (512, 512, 150) & 28.82       & 3212   & 23825     \\
0019    & (512, 512, 215) & 20.33       & 2974   & 16467     \\
0099    & (512, 512, 334) & 14.13       & 3140   & 16904     \\
0063    & (512, 512, 448) & 16.51       & 3210   & 19762     \\
0048    & (512, 512, 499) & 16.17       & 3180   & 17090     \\
0029    & (512, 512, 554) & 19.85       & 3394   & 23710     \\ \hline
\end{tabular}
\end{table}


\subsection{Qualitative results on validation set}
Qualitative results of two examples with good segmentation results and two examples with bad segmentation results in the validation set are shown in Figure~\ref{fig:seg}. As can be seen from the figure, our model performs well in segmenting larger tumors that are situated on organs. However, for smaller tumors that are not located on organs, the model tends to miss the segmentation.
Further investigation reveals that the model's limitations on smaller, isolated tumors could be attributed to the initial training set, which mainly consists of larger, organ-associated tumors. 

\begin{figure}[!htbp]
\centering
\includegraphics[scale=0.8]{imgs/results.png}
\caption{Qualitative results of two examples with good segmentation results and two examples with bad segmentation results in the validation set.
}
\label{fig:seg}
\end{figure}

\subsection{Segmentation Efficiency Results on Validation Set}
Efficiency results for multiple validation cases are presented in Table~\ref{table:efficiency}. As observed, our algorithm completes the segmentation in less than 30 seconds for all cases, with the majority finishing within 20 seconds. Additionally, the GPU memory consumption stays below 4\,GB. 
These results demonstrate that our model not only performs well in terms of accuracy but also excels in computational efficiency. 

\begin{figure}[!htbp]
\centering
\includegraphics[scale=0.7]{imgs/efficiency.png}
\caption{Comparison of time consumption for various segmentation phases before and after optimization. The case analyzed is FLARE23Ts\_0063, a typically time-consuming case.}
\label{fig:efficiency}
\end{figure}


\subsection{Segmentation Efficiency Ablation}
We conduct our experiments on a consistent setup featuring an Intel Core i9-13900K CPU and an NVIDIA RTX 4090 GPU. We analyze the time efficiency for Case FLARE23Ts\_0063, a typically time-consuming case, with dimensions \(448 \times 512 \times 512\) and spacing \(1.5 \times 0.875 \times 0.875\).

Figure~\ref{fig:efficiency} illustrates the time consumption for various segmentation phases both before and after optimization. Before optimization, the process was most time-consuming in ``Resample Logits,'' taking up to 54 seconds. After applying our optimization techniques, the time spent on this phase dramatically dropped to just 0.06 seconds. Similarly, ``Sliding Window Inference'' was reduced from 13.4 to 2 seconds.

Overall, the total time was reduced from approximately 92 seconds to about 11 seconds, demonstrating an 8-fold efficiency improvement in the segmentation process.



\subsection{Results on final testing set}

We present our results on the final testing set in Table~\ref{tab:test_results}.

\begin{table}[htbp]
    \centering
    \begin{tabular}{cccccc}
    \hline
           Organ DSC&  Organ NSD&  Lesion DSC&  Lesion NSD&  Time& GPU Memory\\ \hline
           89.98&  96.53&  62.61&  52.15&  12.02& 12033\\ \hline
    \end{tabular}
    \caption{Results on final testing set}
    \label{tab:test_results}
\end{table}


\subsection{Limitation and Future Work}
One of the limitations of our approach lies in the segmentation of tumors, where a notable number of false negatives and missed detections have been observed. This issue is partly attributed to our data processing methodology, where cases marked with tumors were not comprehensively annotated. We operated under the assumption that all tumors were identified in such cases, which was a misstep. A more meticulous approach to tumor annotation is essential to overcome this challenge. Additionally, in our pursuit of accelerating the process, we opted to resize the segmentation results instead of the logits. This decision led to a significant decline in accuracy. Future work will focus on augmenting the training data to include more varied tumor types and sizes for improved generalization, alongside refining our data processing and segmentation methods to enhance precision and reliability.



\section{Conclusion}
The primary focus of our study has been to address the issue of partially labeled data in abdominal multi-organ and tumor segmentation. We explored a pseudo-labeling strategy to efficiently handle this challenge, breaking it down into a two-step process focused on separate organ and tumor annotations. Additionally, to reconcile the trade-off between accuracy and computational efficiency, we optimized the nnU-Netv2 segmentation framework. As a result, we have developed a methodology that is both accurate and efficient. 



\subsubsection{Acknowledgements} The authors of this paper declare that the segmentation method they implemented for participation in the FLARE 2023 challenge has not used any pre-trained models nor additional datasets other than those provided by the organizers. The proposed solution is fully automatic without any manual intervention. We thank all the data owners for making the CT scans publicly available and CodaLab~\cite{codalab} for hosting the challenge platform. 


%
% ---- Bibliography ----
%
% BibTeX users should specify bibliography style 'splncs04'.
% References will then be sorted and formatted in the correct style.
%
\bibliographystyle{splncs04}
\bibliography{ref}

\newpage
% Please add the following required packages to your document preamble:
% \usepackage[normalem]{ulem}
% \useunder{\uline}{\ul}{}
\begin{table}[!htbp]
\caption{Checklist Table. Please fill out this checklist table in the answer column.}
\centering
\begin{tabular}{ll}
\hline
Requirements                                                                                                                    & Answer        \\ \hline
A meaningful title                                                                                                              & Yes        \\ \hline
The number of authors ($\leq$6)                                                                                                             & 6        \\ \hline
Author affiliations and ORCID                                                                                           & Yes        \\ \hline
Corresponding author email is presented                                                                                                  & Yes        \\ \hline
Validation scores are presented in the abstract                                                                                 & Yes        \\ \hline
\begin{tabular}[c]{@{}l@{}}Introduction includes at least three parts: \\ background, related work, and motivation\end{tabular} & Yes        \\ \hline
A pipeline/network figure is provided                                                                                           & 3 \\ \hline
Pre-processing                                                                                                                  & 3   \\ \hline
Strategies to use the partial label                                                                                             & 5   \\ \hline
Strategies to use the unlabeled images.                                                                                         & 5   \\ \hline
Strategies to improve model inference                                                                                           & 5   \\ \hline
Post-processing                                                                                                                 & 5   \\ \hline
Dataset and evaluation metric section is presented                                                                              & 6   \\ \hline
Environment setting table is provided                                                                                           & 7  \\ \hline
Training protocol table is provided                                                                                             & 7  \\ \hline
Ablation study                                                                                                                  & 8,11   \\ \hline
Efficiency evaluation results are provided                                                                                     & 9 \\ \hline
Visualized segmentation example is provided                                                                                     & 9 \\ \hline
Limitation and future work are presented                                                                                        & Yes        \\ \hline
Reference format is consistent.  & Yes        \\ \hline

\end{tabular}
\end{table}

\end{document}
