% This is samplepaper.tex, a sample chapter demonstrating the
% LLNCS macro package for Springer Computer Science proceedings;
% Version 2.21 of 2022/01/12
%
\documentclass[runningheads]{llncs}
%
\usepackage{amsmath,amssymb}
\usepackage{adjustbox}
\usepackage{mathrsfs}
\usepackage[T1]{fontenc}
% T1 fonts will be used to generate the final print and online PDFs,
% so please use T1 fonts in your manuscript whenever possible.
% Other font encodings may result in incorrect characters.
%
\usepackage{graphicx}
% Used for displaying a sample figure. If possible, figure files should
% be included in EPS format.
%
% If you use the hyperref package, please uncomment the following two lines
% to display URLs in blue roman font according to Springer's eBook style:
%\usepackage{color}
%\renewcommand\UrlFont{\color{blue}\rmfamily}
\usepackage[pagebackref=true,breaklinks=true,colorlinks,bookmarks=false]{hyperref}
%
\begin{document}
%
\title{Semi-Supervised Detection, Identification and Segmentation for Abdomen Organs}
%
%\titlerunning{Abbreviated paper title}
% If the paper title is too long for the running head, you can set
% an abbreviated paper title here
%
\author{Mingze Sun \and
Yankai Jiang \and
Heng Guo} 
%
\authorrunning{Mingze Sun et al.}
% First names are abbreviated in the running head.
% If there are more than two authors, 'et al.' is used.
%
\institute{Alibaba DAMO Academy}
% \email{lncs@springer.com}\\
% \url{http://www.springer.com/gp/computer-science/lncs} \and
% ABC Institute, Rupert-Karls-University Heidelberg, Heidelberg, Germany\\
% \email{baifeng.smz@alibaba-inc.com}}
%
\maketitle              % typeset the header of the contribution
%
\begin{abstract}
Abdomen organ segmentation is an important prerequisite in many medical image analysis applications. 
Methods based on U-Net have demonstrated their scalability and achieved great success in different organ segmentation tasks. 
%But most of them fail to generalize well when organs show large variations in shape and texture.
However, the limited amount of data and labels hinders the training process of these methods. Moreover, traditional U-Net models based on convolutional neural networks suffer from limited receptive fields. 
Lacking the ability to model long-term dependencies from a global perspective, these methods are prone to 
produce false positive predictions.
%when abdominal organs show large variation in shape and texture.
In this paper, we propose a new semi-supervised learning algorithm based on vision transformer to overcome these challenges.
The overall architecture of our method consists of three stages. In the first stage, we tackle the abdomen region location problem via a lightweight segmentation network.
In the second stage, we adopt a vision transformer model equipped with semi-supervised learning strategy to detect different abdominal organs.
In the final stage, we attach multiple organ-specific segmentation networks to automatically segment organs from their bounding boxes. 
%The models in the three stages are trained end-to-end.  
We evaluate our method on the MICCAI FLARE 2022 challenge dataset. Experimental results demonstrate the effectiveness of each network part in our method.
%We train and ensemble 12 nnU-Net models with different settings through the five-fold cross-validation method. 
Our segmentation results currently achieve 0.897 mean DSC on the leaderboard of FLARE 2022 validation set.

\keywords{Semi-supervised \and Organ detection \and Organ segmentation.}
\end{abstract}



\section{Introduction}
Learning feature representations from few labeled data is a fundamental problem in medical image analysis. It has attracted the interest of both academia and industry because acquiring enough annotated medical images is tedious, time-consuming and expensive. Compared to supervised methods, semi-supervised methods mainly focus on using labeled data together with large amounts of unlabeled data efficiently and properly~\cite{chen2021semi,french2019semi,wua2021mutual}. Nowadays, 
semi-supervised methods are becoming the standard choice in label-scarce regimes.
%These methods can use large amounts of unlabeled data efficiently. 

Deep learning has been very popular in the field of medical image analysis. 
Modern deep learning based strong baselines for medical image analysis are mostly trained on a large amount of manually labeled data and tailored for specific tasks.
Abdomen organ segmentation is one of the most common tasks in this subject, which has many important clinical applications, such as organ quantification, surgical planning, and disease diagnosis. 
However, the shortage of labeled data hinders the development of deep learning models
in this scenario, since segmentation tasks often require sufficient dense annotations, which demand the concentrated effort of domain experts and are hard to obtain. In addition, the diversity of data sources also challenges the robustness of existing state-of-the-art (SOTA) methods.
As a potential alternative, semi-supervised learning can explore useful information from unlabeled cases.
Therefore, exploiting unlabeled medical data in a semi-supervised learning scheme has become extremely important to improve the performance of medical image segmentation models and has attracted increasing research attention.

%Manually annotating medical images (e.g., volumetric CT or MRI scans) at the pixel/voxel level requires expertise and is labor-intensive.
%manually annotating organs from CT scans is time-consuming and labor-intensive. Thus, we usually cannot obtain a huge number of labeled cases. 
In this paper, we propose a new semi-supervised learning algorithm based on vision transformer to overcome the aforementioned challenges. The architecture of our method consists of three stages. 
In the first stage, we build a lightweight segmentation network to locate the abdomen region.
Then, in the second stage, we adopt a vision transformer model equipped with semi-supervised learning strategy to detect different abdominal organs.
In the third stage, we attach multiple organ-specific segmentation networks to automatically segment organs from their bounding boxes. 
%The models in the three stages are trained end-to-end.  
We evaluate our method on the MICCAI FLARE 2022 challenge dataset. Experimental results demonstrate the effectiveness of each network component in our method.
%We train and ensemble 12 nnU-Net models with different settings through the five-fold cross-validation method. 
The contributions of our method are threefold. (1) We propose a semi-supervised learning scheme which adopts multiple models' consistent predictions to produce high quality pseudo labels to train the student network. (2) We propose a vision transformer based detection model to detect different organs which have large variations in shape and texture. (3) Combining a semi-supervised training strategy and a vision transformer architecture with several segmentation heads, we build a strong segmentation model which currently achieves 0.897 mean DSC on the leaderboard of the FLARE 2022 validation set.

%which is crucial for 
%considering the scarcity of target disease, the protection of patient’s privacy and the limited medical resources.
%It is common practice that training deep neural networks often requires a large amount of manually labeled data.
%This is because acquiring enough densely annotated medical data is extremely expensive.
%Introduce the background and difficulty of the segmentation task and your ideas.

\section{Method}
In order to leverage the unlabeled data, we first train a teacher model using labeled data
and then predict segmentation results for unlabeled data with the trained teacher model.
Considering that many new network structures may not have good generalization ability in the unseen dataset, we choose a strong and general baseline, nnU-Net~\cite{isensee2018nnu}, as the standard choice for teacher model.
In previous deep learning works, network structure and parameters often need to be adjusted according to practical application~\cite{li2018h,oktay2018attention}. It relies on users' experience and usually needs many experiments. If the whole training process can be properly designed, U-Net can achieve good results in most cases~\cite{ronneberger2015u}.
So it seems that the most straightforward way to build the student model is to initialize another nnU-Net model with different initial parameters.
However, since training nnU-Net costs a lot of time and its inference efficiency may not meet practical demands, we do not use it as our final choice. Despite this, from a well-trained nnU-Net model, we can get strong pseudo labels for the unlabeled data. 
We use these pseudo labels and unlabeled data as a new training set to pretrain a new student
model built with vision transformer. 
In this section, we first introduce nnU-Net briefly and then bring out our new student model.

%\begin{itemize} 
% \item Preprocessing
% \item Training Procedure
% \item Inference
% \item Postprocessing
%\end{itemize}
%###########################
\subsection{nnU-Net}
Isensee et al.\ proposed nnU-Net~\cite{isensee2018nnu}, which can adapt to many datasets in a supervised training process. nnU-Net adjusts the network structure according to the characteristics of the training set. It can process images with various shapes and textures, so as to achieve SOTA results in multiple medical segmentation tasks~\cite{Antonelli2021TheMS}. Specifically, for different datasets, nnU-Net defines adaptive adjustment strategies from four perspectives: preprocessing, training procedure, inference and postprocessing.
\subsubsection{Network structure}
nnU-Net consists of 2D U-Net, 3D U-Net and U-Net Cascade. In these architectures, ReLU is replaced by Leaky ReLU and batch normalization is replaced by instance normalization, while the network structure remains almost the same as the default U-Net and does not adopt additional modules such as attention mechanisms.

3D U-Net is usually used for training on 3D medical images, including CT and MRI. 
However, it occupies a large amount of GPU memory. In order to improve training speed and reduce resource consumption, the patch based 3D U-Net can be adopted to reduce the cost of network computing. 3D U-Net is mainly used to address the poor performance of 2D U-Net on anisotropic data. On the other hand, the patch based 3D U-Net may perform poorly on large image sizes due to a limited global view. 3D U-Net Cascade is used to solve this problem. It is first trained on the down-sampled image and then the results are up-sampled to the original resolution. %The two-stage 3D U-Net is introduced and trained by full-resolution images.

The network topologies adjust adaptively according to the image size. It considers the image geometry and balances the GPU memory occupation which corresponds to the adjustment of the network capacity and batch size. The initial network configuration is as follows:

2D U-Net: The input patch size is set to $256 \times 256$, the batch size to 42, and the number of feature maps of the highest layer to 30 (the number of feature maps is doubled with each downsampling). The network parameters are automatically adjusted to the median plane size of each dataset, so that the network can effectively train on the whole slice. 
%We configure the network to be pooled along each axis until the feature map for that axis is less than eight.

3D U-Net: The input patch size is set to $128 \times 128 \times 128$, the batch size to 2, and the number of feature maps at the highest level to 30. Due to the GPU memory limitation, the input patch size is not increased beyond $128^3$ voxels, but is matched to the median voxel size of the input images. If the median shape of the dataset is smaller than $128^3$, we use the median shape as the input image size and increase the batch size. 

\subsubsection{Preprocessing}
Image preprocessing is a very important part before training. For nnU-Net teacher models, this process is divided into three steps:
(1) Cropping: Crop all data to the non-zero area.
(2) Resampling: In order to enable the network to learn spatial semantics, images are resampled to the median voxel spacing of the dataset, and third order spline interpolation and nearest neighbor interpolation methods are used for data and segmentation mask respectively.
(3) Normalization: For CT images, pixel values within the segmentation mask are collected, and all data is truncated to [0.5, 99.5] percentiles of these pixel values, followed by a z-score normalization. If the average size is decreased by more than 1/4, normalization is only applied to non-zero elements of the mask, and values outside the mask are set to 0.

\subsection{Semi-supervised Cascaded Organ Detection, Identification and Segmentation}
The overall architecture of our method is shown in Figure~\ref{fig:Network}.
It consists of three stages. First, we adopt a lightweight U-Net to obtain the abdomen regions. Then we locate each organ with a new detection network built upon vision transformer. Finally, we segment organs according to the detection bounding boxes. 
\subsubsection{Pseudo Label preparation}
The quality of pseudo labels is the key to determining whether the use of unlabeled data in semi-supervised training is effective. Poor quality pseudo labels may mislead the student model into learning wrong semantic information. In order to acquire high quality pseudo labels, we adopt a consistency voting strategy which measures the consistency between pseudo labels generated by different teachers for the same case. The insight of our strategy is straightforward. For example, simple 
cases should be easy for most teacher models while hard cases may cause most models to fail. 
If a case causes different models to produce very inconsistent prediction outputs, we think that the distribution of this example is likely to be outside the distribution of most examples. We therefore reject examples with inconsistent pseudo labels, as they are likely to mislead the student network.

We choose nnU-Net as the teacher model. In order to enhance consistency between different teacher models, we build multiple nnU-Net models with different initialization parameters. The shared architecture of these teacher models ensures a better consistency. Then we train these teacher models using the 50 labeled cases, and the models obtained are not used for the final testing stage, but only for generating pseudo labels. The mean DSC of the results on the validation set exceeds 0.89. We believe that the nnU-Net models are able to generate pseudo labels of good quality. We take the 2000 unlabeled cases as input and use the trained nnU-Net models to generate the corresponding pseudo labels.
Finally, we measure the consistency between these pseudo labels and screen high quality pseudo labels. After a segmentation results ensemble, we obtain the final pseudo labels as input labels for our framework. 

\subsubsection{Abdomen RoI Extraction}
We train a simplified U-Net model to identify organ regions, then the proper RoI can be inferred by calculating the weighted average coordinates and distribution scope of the predicted organ voxels. This step helps us filter irrelevant background regions.

\subsubsection{Organ Detection and Identification}
We propose a new detection framework based on DETR \cite{carion2020end} to detect each organ. DETR handles object detection as a direct set prediction problem through the conjunction of the bipartite matching loss and transformer with parallel decoding of queries. In abdomen CTs, the number and relative position of organs are stable. We intend to estimate a 9-DoF bounding box for each organ to obtain an accurate and compact RoI.
To this end, we compute 9-DoF bounding boxes via principal component analysis (PCA), based on the comprehensively annotated instance-level organ segmentation mask (ground-truth label and pseudo label), as the supervision signal. For queries matched to the background class, only the classification loss is taken into account. % Note that we merely use L1 loss for geometrical 9-DoF box regression. 

\subsubsection{Organ Segmentation}
To get high accuracy instance segmentation results, we adopt multiple stand-alone U-Net~\cite{ronneberger2015u} models to segment each organ independently with a finer spatial resolution but in a locally cropped patch based on the detected bounding boxes from the second stage. The segmentation heads perform a binary segmentation for all 3D patches. After this, all predicted binary masks are merged back with their corresponding labels and spatial locations to form the final instance segmentation results of organs.

\begin{figure}[htbp]
\centering
%\addtolength{\leftskip} {-1.0cm} 
\includegraphics[width = \textwidth]{imgs/Model2.png}
\caption{Network architecture.}
\label{fig:Network}
\end{figure}

\subsubsection{Preprocessing}
Before training the student model, we conduct preprocessing similar to that used for the teacher model. We perform cropping, resampling and normalization.
% Image preprocessing is a very important part before training. For nnU-Net teacher models, this process is divided into three steps:
% (1) Cropping: Crop all data to the non-zero area.
% (2) Resampling: In order to enable the network to learn spatial semantics, images are resampled to the median voxel spacing of the dataset, and third order spline interpolation and nearest neighbor interpolation methods are used for data and segmentation mask respectively.
% (3) Normalization: For CT images, pixel values within the segmentation mask are collected, and all data is truncated to [0.5, 99.5] percentiles of these pixel values, followed by a z-score normalization. If the average size is decreased by more than 1/4, normalization is only applied to non-zero elements of the mask, and values outside the mask are set to 0.

\subsubsection{Training Procedure}
The model is trained from scratch and evaluated by five-fold cross-validation on the training set. Total loss for segmentation is Dice loss~\cite{drozdzal2016importance} combined with cross entropy. 

\begin{equation}
\mathscr{L}_{\mathrm{total}} = \mathscr{L}_{\mathrm{dice}} + \mathscr{L}_{\mathrm{CE}}
\end{equation}

For the teacher model, Adam is selected as the optimizer in the training process, with an initial learning rate of $3 \times 10^{-4}$ and 250 batches per epoch. A learning rate adjustment strategy is used which calculates the exponential moving average loss of the training set and validation set. If the training set loss decreases by less than $5 \times 10^{-3}$ within 30 epochs, then the learning rate is reduced by a factor of 5.
When the learning rate is larger than $10^{-6}$ and the exponential moving average loss of the validation set decreases by less than $5\times10^{-3}$ within 60 epochs, the training is terminated. Random rotations, random scaling, random elastic deformations, gamma correction augmentation and mirroring are adopted as data augmentation. If the maximum side length of the image patch size of 3D U-Net is more than twice the minimum side length, then 2D data augmentation methods are used. For the student model, readers are referred to Table~\ref{table:training2nd} for stage-specific training details.

\subsubsection{Inference}
All inferences are performed by the student model. In our implementation, we dynamically clear the memory footprint to release the redundant memory occupancy in time and
reduce resource consumption. The inference speed of our method is very fast thanks to the cascaded detection-then-segmentation strategy, which significantly reduces the computation cost of redundant regions.

\subsubsection{Postprocessing}
For the teacher model, we adopt commonly used postprocessing methods such as Dense CRF.
It is generally considered that a certain class is within a simply connected domain, which means that there is only one such domain within a case. So only the largest connected domain is retained, and the other small connected domains are removed. For the student model, we omit the postprocessing step for the sake of inference efficiency.

\section{Experiments}
\subsection{Dataset and evaluation measures}
The FLARE 2022 dataset is curated from more than 20 medical groups under the license permission, including KiTS~\cite{KiTS} and AbdomenCT-1K~\cite{AbdomenCT-1K}. The training set includes 50 labelled CT scans with pancreas disease and 2000 unlabelled CT scans with liver, kidney, spleen, or pancreas diseases. The validation set includes 50 CT scans with liver, kidney, spleen, or pancreas diseases.
The testing set includes 200 CT scans where 100 cases have liver, kidney, spleen, or pancreas diseases and the other 100 cases have uterine corpus endometrial, urothelial bladder, stomach, sarcomas, or ovarian diseases. All the CT scans only have image information and the center information is not available.

The evaluation measures consist of two accuracy measures: Dice Similarity Coefficient (DSC) and Normalized Surface Dice (NSD), and three running efficiency measures: running time, area under GPU memory-time curve, and area under CPU utilization-time curve. All measures will be used to compute the ranking. Moreover, the GPU memory consumption has a 2 GB tolerance.


\subsection{Implementation details}
\subsubsection{Environment settings}
We develop our cascaded model based on PyTorch. 
All models are trained from scratch. We train the segmentation networks with a combination of dice and cross entropy loss.
We use the AdamW optimizer in detection part and the Adam optimizer in RoI extractor and segmentation part. An initial learning rate of $1 \times 10^{-4}$ is used in RoI extractor, $4 \times 10^{-4}$ and $1 \times 10^{-3}$ are used respectively in detection and segmentation. Training batches are set as 8, 8 and 4 respectively. The development environments and requirements are presented in Table~\ref{table:env}.


\begin{table}[!htbp]
\caption{Development environments and requirements.}\label{table:env}
\centering
\begin{tabular}{lc}
\hline
Windows/Linux version       & AliOS 7\\
\hline
CPU   & Intel(R) Xeon(R) Platinum 8163 CPU @ 2.50GHz \\
\hline
RAM                         &724GB\\
\hline
GPU (number and type)                         & Eight Tesla V100 32G\\
\hline
CUDA version                  & 11.4\\                          \hline
Programming language                 & Python 3.7.3\\ 
\hline
Deep learning framework & PyTorch (torch 1.7.0, torchvision 0.8.1) \\ \hline

\end{tabular}
\end{table}


\subsubsection{Training protocols}
All images are automatically normalized based on statistics of the entire respective dataset.
During training, in order to help networks properly learn spatial semantics, all scans are resampled to the median
voxel spacing of their respective dataset, where third order spline interpolation
is used for image data and nearest neighbor interpolation for the corresponding
segmentation mask. The detailed training protocols are shown in Table~\ref{table:training2nd}.

\begin{table*}[!htbp]
\caption{Training protocols. ``roi'' means the ROI extraction in stage 1. ``det'' means the organ detection network in stage 2. ``seg'' means the segmentation head in stage 3.}
\label{table:training2nd}
\begin{center}
% \resizebox{0.47\textwidth}{!}{
\begin{tabular}{ll} 
\hline
Network initialization         & Kaiming normal initialization\\
\hline
Batch size                    & roi: 8 | det: 8 | seg: 4 \\
\hline 
Patch size & seg only: organ-specific patch size \\ 
\hline
Total epochs & roi: 1000 | det: 1000 | seg: 500 \\
\hline
Optimizer          & roi: Adam | det: AdamW | seg: Adam  \\ \hline
Initial learning rate (lr)  & roi: 0.0001 | det: 0.0004 | seg: 0.001 \\ \hline
Lr decay schedule & warmup 200 epochs and $\times$0.1 at 800th epoch \\ \hline
Training time  &  roi: 52h | det: 20h | seg: organ-specific \\  \hline 
Number of model parameters  & roi: 4.8M | det: 9.6M | seg: 4.8M \\ \hline
\end{tabular}
\end{center}
\end{table*}

% Following nnU-Net, we adopt augmentation techniques including 
% random rotations, random scaling, random elastic deformations, gamma correction augmentation and mirroring on the fly during training.
% We find that the extensive data augmentation strategies are essential to better performance.
% This helps prevent overfitting when training from limited data.
% As for the patch sampling strategy, we enforce that more than a third of the samples in a batch contain at least one randomly chosen foreground class
% to increase the stability of our network training.
% For the optimal model selection criteria, the model that achieves the highest mean
% foreground dice score on the training set cross-validation is automatically chosen.

%In test phase, we adopt two strategies to reduce computation complexity. 
%(1) Before running the nnU-Net preprocessing program, we change the spacing of all data to $(1, 1, 3)$, so that no interpolatation is adopted in the preprocessing stage. 
%After the prediction is complete, we change the spacing back to the origin.
%(2) Before training the model, we modify the number of convolutional layers, which will %significantly reduce the number of parameters of the model.

\section{Results and discussion}

\subsection{Quantitative results on validation set}
We compare our method with two state-of-the-art segmentation models including CNN based methods and vision transformer based methods. As shown in Table~\ref{table:results2}, our results currently obtain 0.897 mean DSC on the leaderboard of the FLARE 2022 validation set.
% We adopt consistency learning method to utilize the unlabelled cases.
% Specifically, we first random initialize two different models. Then we train a teacher model based on the 50 labeled training cases.
% Then we uses these two models to obtain the segmentation results of the same case.
% Then we minimize the MSE loss between the two segmentation results of the same case. 
% In this way, the student model learns the knowledge of the teacher model and gradually 
% achieve better results than the teacher model. This procedure helps improve about $1\%$ DSC.
Compared with nnU-Net and Swin-UNETR~\cite{tang2022self}, which are also trained from scratch, our method exceeds these two methods in terms of DSC on most abdomen organs. Moreover, 
our model is even better than the Swin-UNETR model with pre-training on the FLARE unlabelled part. This emphasizes the significance of our semi-supervised method. Last but not least, our method even outperforms the ensembled nnU-Net, which ensembles the segmentation results of 13 differently initialized nnU-Net models, and has much less training and inference time than the nnU-Net with ensemble. The segmentation results of these methods are shown in Figure~\ref{fig:qual}. Our method can obtain better segmentation results than all the other methods.

\begin{figure}[htbp]
\centering
\includegraphics[width = \textwidth]{imgs/1.png}
\caption{Comparison between segmentation results of different methods.}
\label{fig:qual}
\end{figure}

\begin{table}[ht]
	\centering
	%\addtolength{\leftskip} {-1cm} 
	\caption{DSC values on different organs.}
	\label{table:results2}
	\resizebox{\textwidth}{!}{
	\begin{tabular}{lcccccccccccccc}
		\hline
		Methods & Liv. & RK & Spl. & Pan. & Aor. & IVC & RAG & LAG & Gall. & Eso. & Sto. & Duo. & LK & mDSC\\\hline
		Swin-UNETR & 0.965 & 0.912 & 0.942 & 0.846 & 0.930 & 0.865 & 0.758 & 0.742 & 0.771 & 0.790 & 0.886 & 0.765 & 0.887 & 0.850\\\hline
		Swin-UNETR pre. & 0.964 & 0.921 & 0.952 & 0.881 & 0.937 & 0.862 & 0.794 & 0.791 & 0.792 & 0.818 & 0.895 & 0.790 & 0.879 & 0.867\\\hline
		nnU-Net & 0.977 & 0.941 & 0.958 & 0.872 & 0.968 & 0.878 & 0.830 & 0.801 & 0.765 & 0.892 & 0.899 & 0.771 & 0.911 & 0.882\\\hline
		nnU-Net ens. & 0.979 & 0.948 & 0.960 & 0.886 & 0.969 & 0.897 & 0.838 & 0.819 & 0.787 & 0.901 & 0.907 & 0.792 & 0.920 & 0.892\\\hline
		Ours & 0.980 & 0.945 & 0.972 & 0.890 & 0.966 & 0.903 & 0.824 & 0.806 & 0.861 & 0.874 & 0.915 & 0.787 & 0.937 & 0.897\\\hline
	\end{tabular}}
\end{table}

\subsection{DSC comparisons between with and without unlabelled images}
Due to the long training and inference time of nnU-Net, we only use it to generate pseudo labels of 2000 unlabelled images. Then we use these unlabelled images and their pseudo labels to train our model. In order to validate the effectiveness of the unlabelled images and the pseudo labels, we conduct an ablation study on our organ detection module, which is relatively more sensitive to the amount of data due to its task attribute and transformer component. As shown in Table~\ref{table:abl}, the effect of using unlabelled cases is significant. If we remove the training of unlabelled images with pseudo labels, we observe a significant performance drop in our final results. Since there are few labelled images, the distribution of labelled images is very different from the real data distribution. So if we do not use the unlabelled images, the model will have no chance to learn unseen cases in the target data distribution. This makes it much harder to regress 9-DoF boxes and segment accurate boundaries for organs, especially for relatively small organs such as gallbladder and adrenal glands.

\begin{table}[ht]
	\centering
	%\addtolength{\leftskip} {-1cm} 
	\caption{DSC comparisons between with and without using unlabelled images. \emph{wo.} means without using unlabelled images and \emph{w.} means using unlabelled images.}
	\label{table:abl}
	\resizebox{\textwidth}{!}{
	\begin{tabular}{lcccccccccccccc}
		\hline
		Methods & Liv. & RK & Spl. & Pan. & Aor. & IVC & RAG & LAG & Gall. & Eso. & Sto. & Duo. & LK & mDSC\\\hline
		Ours \emph{wo.} & 0.975 & 0.885 & 0.879 & 0.876 & 0.952 & 0.898 & 0.809 & 0.753 & 0.594 & 0.855 & 0.862 & 0.770 & 0.881 & 0.845\\\hline
		Ours \emph{w.}  & 0.980 & 0.945 & 0.972 & 0.890 & 0.966 & 0.903 & 0.824 & 0.806 & 0.861 & 0.874 & 0.915 & 0.787 & 0.937 & 0.897\\\hline
	\end{tabular}}
\end{table}


\subsection{Visualized examples of successful and failed cases}
Fig.~\ref{fig:results} shows the segmentation results of our method. It clearly reveals that our method can obtain excellent segmentation results on most organs. However, we find that the model sometimes fails, especially when some organs have larger size and shape variation due to the appearance of tumors. For example, the trained models cannot generalize well when the patient has a kidney tumor, which makes the size of the kidney much larger than usual. One possible solution is adding more supervised cases which have a similar distribution to those hard cases.

\subsection{Segmentation efficiency analysis}
The segmentation efficiency analysis is shown in Table~\ref{table:eff}. Our method is significantly faster than other methods in terms of inference time. %Moreover, our method is almost 180 times faster than nnU-Net while consuming less GPU memory resource. 

\begin{table}[ht]
	\centering
	%\addtolength{\leftskip} {-1cm} 
	\caption{Efficiency analysis of different methods.}
	\label{table:eff}
	\begin{tabular}{l|c|c}
		\hline
		Methods & inference time (s)  & GPU memory footprint (MB) \\\hline
		Swin-UNETR & 18.00    & 22284  \\\hline
		nnU-Net    & 126.40  & 4639    \\\hline
		Ours       & 3.10    & 3208   \\\hline
	\end{tabular}
\end{table}

\begin{figure}[htbp]
\centering
\includegraphics[scale=0.45]{imgs/results_new.png}
\caption{(a) to (b): Visualization of good results; (c) to (d): visualization of bad results.}
\label{fig:results}
\end{figure}

\subsection{Limitations and future work}
The proposed method works well on most cases. However, there are still some misclassification failures on some organs. Perhaps adding organ shape related prior knowledge will help solve the limitations, which is left for future work.

\section{Conclusion}
In this paper, we propose a novel three-stage instance segmentation network for the abdomen organ segmentation task. We develop and test the whole framework on the FLARE 2022 challenge dataset. 
The network consists of a vision transformer based detection model and several lightweight segmentation heads. We adopt semi-supervised learning strategy to leverage the large amount of unlabelled data. We use nnU-Net as the teacher model and design a consistency measuring strategy to generate high quality pseudo labels. The whole framework of our method acquires 0.897 mean DSC on the FLARE 2022 challenge validation dataset.

\subsubsection{Acknowledgements} The authors of this paper declare that the segmentation method they implemented for participation in the FLARE 2022 challenge has not used any pre-trained models nor additional datasets other than those provided by the organizers. The proposed solution is fully automatic without any manual intervention.
%
% ---- Bibliography ----
%
% BibTeX users should specify bibliography style 'splncs04'.
% References will then be sorted and formatted in the correct style.
%
\bibliographystyle{splncs04}
\bibliography{ref}

\end{document}
