% This is samplepaper.tex, a sample chapter demonstrating the
% LLNCS macro package for Springer Computer Science proceedings;
% Version 2.21 of 2022/01/12
%
\documentclass[runningheads]{llncs}
%\setcounter{secnumdepth}{5}
%
\usepackage[T1]{fontenc}
% T1 fonts will be used to generate the final print and online PDFs,
% so please use T1 fonts in your manuscript whenever possible.
% Other font encodings may result in incorrect characters.
%
\usepackage{graphicx}
% Used for displaying a sample figure. If possible, figure files should
% be included in EPS format.
%
% If you use the hyperref package, please uncomment the following two lines
% to display URLs in blue roman font according to Springer's eBook style:
%\usepackage{color}
%\renewcommand\UrlFont{\color{blue}\rmfamily}
\usepackage[pagebackref=true,breaklinks=true,colorlinks,bookmarks=false]{hyperref}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{tablefootnote}
\usepackage{orcidlink}
\usepackage{arydshln}
%
\begin{document}
%
\title{Knowledge Distillation from Cross Teaching Teachers for Efficient Semi-Supervised Abdominal Organ Segmentation in CT}
%
\titlerunning{Knowledge Distillation from Cross Teaching Teachers}
% If the paper title is too long for the running head, you can set
% an abbreviated paper title here
%
\author{Jae Won Choi\inst{1,2}\orcidlink{0000-0002-5937-7238}}
%
\authorrunning{J. Choi}
% First names are abbreviated in the running head.
% If there are more than two authors, 'et al.' is used.
%
\institute{Department of Radiology, Armed Forces Yangju Hospital, South Korea \and
College of Medicine, Seoul National University, South Korea
\\ \email{jhoci@snu.ac.kr}}
%
\maketitle              % typeset the header of the contribution
% restart footnote number
\setcounter{footnote}{0}
%
\begin{abstract}
For more clinical applications of deep learning models for medical image segmentation, high demands on labeled data and computational resources must be addressed. This study proposes a coarse-to-fine framework with two teacher models and a student model that combines knowledge distillation and cross teaching, a consistency regularization based on pseudo-labels, for efficient semi-supervised learning. The proposed method is demonstrated on the abdominal multi-organ segmentation task in CT images under the MICCAI FLARE 2022 challenge, with mean Dice scores of 0.8429 and 0.8520 in the validation and test sets, respectively. The code is available at \url{https://github.com/jwc-rad/MISLight}.
\keywords{Knowledge distillation \and  semi-supervised learning \and medical image segmentation.}
\end{abstract}

\section{Introduction}

Organ segmentation has been one of the most popular applications of artificial intelligence in abdominal radiology~\cite{soffer2019convolutional}. As more high-quality imaging data are becoming available and advanced deep learning methods are being developed, many recent studies on automated abdominal organ segmentation have achieved promising results~\cite{kavur2021chaos,bilic2019lits,heller2021state}. However, these methods are based on supervised learning that depends on large-scale, carefully labeled data.  Also, current segmentation methods often require high computation costs. Therefore, for practical application in the clinical workflow, demands on labeled data and computational resources must be reduced.

Acquiring labeled data for medical image segmentation is especially expensive as it requires expert-level voxel-wise labeling and clinical data is innately heterogeneous. In this context, to utilize unlabeled data, various semi-supervised learning (SSL) methods in medical imaging have been studied, including Uncertainty-aware Mean Teacher~\cite{yu2019uncertainty}, Uncertainty Rectified Pyramid Consistency~\cite{luo2021efficient}, and Dual-task Consistency~\cite{luo2021dtc}. Among them, we adopt cross teaching, a simple consistency regularization based on pseudo-labels, which recently showed promising results in semi-supervised medical image segmentation on cardiac MR data~\cite{luo2021semi}. Also, we use models with slightly different decoders to boost the consistency regularization, following Mutual Consistency Training~\cite{wu2021semi}.

The main strategies to address the high computational cost of deep learning methods include (1) efficient building blocks and (2) model compression and acceleration techniques~\cite{gou2021knowledge}. The latter has not gained as much interest as the former, especially in medical image segmentation~\cite{qin2021efficient}, while there are many studies on lightweight networks~\cite{zhang2021efficient,alalwan2021efficient}. Among model compression and acceleration techniques, knowledge distillation (KD), which refers to knowledge transfer from a larger teacher model to a smaller student model~\cite{hinton2015distilling}, has been applied increasingly in recent research~\cite{gou2021knowledge,wang2021knowledge}. The target knowledge to transfer can be the response of the last output layer, outputs of intermediate feature layers, or relationships between different feature maps~\cite{gou2021knowledge}. Here, we apply the response-based KD because it is simple and can be implemented regardless of network architectures.

The current study proposes a coarse-to-fine framework (\autoref{fig:CoarseFine}) with two teacher models and a student model that combines KD and cross teaching, a consistency regularization based on pseudo-labels, for efficient semi-supervised medical image segmentation. Labeled data are used in all three models to train supervised segmentation. Pseudo-labels from unlabeled data are used to perform cross teaching between the two teachers and pseudo-supervision of the student. Meanwhile, outputs of the teachers on both labeled and unlabeled data are used to guide the student model through KD. Only the student model is used for efficient inference. The proposed method is developed and evaluated on the abdominal multi-organ segmentation task in CT images under the MICCAI FLARE 2022 challenge\footnote{\url{https://flare22.grand-challenge.org/}}.

\begin{figure}[htbp]
\centering
\includegraphics[width=1.0\columnwidth]{imgs/CoarseFine.pdf}
\caption{An overview of the coarse-to-fine segmentation framework. For the coarse segmentation, the whole-volume input is resampled to $96\times96\times96$. For the fine segmentation, cropping with 10\% padding around the coarse mask is first performed, and the cropped volume is resampled to $96\times64\times96$. The resultant fine segmentation mask is resized and padded back to the original input size.}
\label{fig:CoarseFine}
\end{figure}

\section{Method}

%###########################
\subsection{Preprocessing}
The following preprocessing steps are performed in all experiments:
\begin{itemize} 
 \item Reorienting images to the right-anterior-inferior (RAI) view.
 \item For coarse segmentation, whole-volume resampling to fixed size $96\times96\times96$ with trilinear interpolation. For fine segmentation, cropping with 10\% padding around the coarse mask (ground truth, if present), then resampling to fixed size $96\times64\times96$ with trilinear interpolation.
 \item Clipping based on the Hounsfield units to [-300, 300].
 \item Patch-wise intensity normalization with z-score normalization based on the mean and standard deviation of the voxel values.
\end{itemize}

\subsection{Proposed Method}
The proposed method is a coarse-to-fine framework, where coarse segmentation is first yielded from whole-volume input and then refined by fine segmentation (\autoref{fig:CoarseFine}). Such a two-stage framework lowers computation costs, especially in terms of memory use and running time, compared to the sliding window approach, which is a more common solution in medical image segmentation~\cite{zhang2021efficient,thaler2021efficient}. Empirically, a single-stage segmentation led to poor segmentation results and long inference time in large field-of-view or whole-body CT images. Each stage of the proposed framework consists of two teacher models $T_1$ and $T_2$ and a smaller student model $S$ which are trained simultaneously (\autoref{fig:Pipeline}). At inference, only the student model is used.

\begin{figure}[bhtp]
\centering
\includegraphics[width=1.0\columnwidth]{imgs/Pipeline.pdf}
\caption{An overview of KD from Cross Teaching Teachers. Each stage of the coarse-to-fine framework consists of two teacher models and a smaller student model. While labeled data are used in all three models to train supervised segmentation, the unlabeled data are used for cross teaching between the two teachers and pseudo-supervision of the student. All data are used for KD from the teacher models to the student model.}
\label{fig:Pipeline}
\end{figure}

\subsubsection{Supervised Segmentation}
Labeled data are used to train supervised segmentation for all models. Recently, compound losses have been suggested as the most robust losses for medical image segmentation tasks~\cite{LossOdyssey}. For model prediction $P$ and label $Y$, we apply the sum of Dice loss~\cite{milletari2016v} and focal loss~\cite{lin2017focal} as the supervised segmentation loss:
\begin{align*}
L_{seg}& = Dice(P, Y) + Focal(P, Y)
\end{align*}

\subsubsection{Cross Teaching and Pseudo-supervision}
For SSL of the teacher models $T_1$ and $T_2$, we use the cross teaching strategy adopted from Cross Teaching between CNN and Transformer~\cite{luo2021semi} and inspired by Cross Pseudo-supervision~\cite{chen2021semi} and Mutual Consistency Training~\cite{wu2021semi}. These methods all train two models with network-level perturbations that supervise each other with pseudo-labels to encourage consistent outputs on the same input. They differ in the perturbation targets (initialization~\cite{chen2021semi}, upsampling method for decoder~\cite{wu2021semi}, and learning paradigm~\cite{luo2021semi}). Here, to distinguish using pseudo-labels for training between teacher models from using them to train the student model, we refer to the former as cross teaching and the latter as pseudo-supervision. With predictions of the student model $P_S$, teacher models $P_{T_1}$ and $P_{T_2}$, and teachers' mean $P_{\bar{T}}$, the cross teaching and pseudo-supervision losses for the unlabeled data are defined as:
\begin{align*}
L_{ctl}& = Dice(P_{T_1}, argmax(P_{T_2})) + Dice(P_{T_2}, argmax(P_{T_1}))\\
L_{psv}& = Dice(P_S, argmax(P_{\bar{T}}))
\end{align*}

\subsubsection{Knowledge Distillation}
The main idea of response-based KD is training the student model to directly mimic the final prediction of the teacher model. Following Hinton et al.~\cite{hinton2015distilling}, we apply the Kullback-Leibler (KL) divergence loss between $P_S$ and $P_{\bar{T}}$ on both labeled and unlabeled data. A weight factor $\lambda_{dis}$ is applied to balance distillation loss with the supervised segmentation loss for labeled data and the cross teaching and pseudo-supervision losses for unlabeled data:
\begin{align*}
L_{labeled}& = L_{seg} + \lambda_{dis} KL(P_S, P_{\bar{T}})\\
L_{unlabeled}& = L_{ctl} + L_{psv} + \lambda_{dis} KL(P_S, P_{\bar{T}})
\end{align*}
Moreover, the proposed method is an online distillation where both the teacher and student models are updated simultaneously~\cite{gou2021knowledge}.

\subsubsection{Overall Objective}
The overall training objective of the proposed method is the weighted sum of $L_{labeled}$ and $L_{unlabeled}$ with a weight factor $\lambda_{ssl}$ defined as:
\begin{align*}
Loss = L_{labeled} + \lambda_{ssl}L_{unlabeled}
\end{align*}

\subsubsection{Network Architecture}

\begin{figure}[htbp]
\centering
\includegraphics[width=0.95\columnwidth]{imgs/Network.pdf}
\caption{Residual-USE-Net architecture. Mobile-Residual-USE-Net uses depthwise separable convolutions for residual and plain convolution blocks.}
\label{fig:Network}
\end{figure}

\begin{figure}[htbp]
\centering
\includegraphics[width=0.8\columnwidth]{imgs/Blocks.pdf}
\caption{(a) Residual convolution block. If stride $n=1$ and the number of input and output channels are the same, the residual connection uses an identity layer instead of $1\times1\times1$ convolution. (b) Plain convolution block. (c) Residual SE block. \textsuperscript{*}These layers are replaced with depthwise separable convolutions for Mobile-Residual-USE-Net.}
\label{fig:Blocks}
\end{figure}

An overview of the network architecture is shown in \autoref{fig:Network}.

Inspired by the residual variant of the nnU-Net framework~\cite{isensee2021nnu,isensee2019attempt} and USE-Net~\cite{rundo2019use}, we employ Residual-USE-Net, a 3D U-Net~\cite{cciccek20163d} with an encoder with residual convolution blocks and a decoder with plain convolution blocks incorporated with residual squeeze-and-excitation (SE) blocks~\cite{hu2018squeeze}. A convolution block is implemented as two sets of convolution, normalization, and nonlinear activation layers, and for the residual block, the residual summation takes place before the last activation. We set $r = 8$ for the reduction ratio of SE blocks~\cite{hu2018squeeze,rundo2019use} (\autoref{fig:Blocks}).

The teacher models $T_1$ and $T_2$ are Residual-USE-Nets with 32 base features and 4 skip connections. Following mutual consistency training~\cite{wu2021semi}, while $T_1$ and $T_2$ share the same encoder structure, their decoders use different upsampling methods where $T_1$ uses transposed convolutions and $T_2$ uses trilinear interpolation followed by regular convolutions.

The student and teacher models share the same overall network structure, but we apply depthwise separable convolutions as in MobileNets~\cite{howard2017mobilenets} to build a lighter neural network for the student model. The student model $S$ is Mobile-Residual-USE-Net, a Residual-USE-Net with depthwise separable convolutions instead of regular convolutions except for the initial convolution layer, with 32 base features, 4 skip connections, and transposed convolutions for the decoder.

\subsection{Post-processing}
The largest connected component of the segmentation mask is extracted for each class for both coarse and fine outputs. The connected component analysis is performed using Python connected-components-3d\footnote{\url{https://github.com/seung-lab/connected-components-3d}} and fastremap\footnote{\url{https://github.com/seung-lab/fastremap}} packages~\cite{zhang2021efficient}.

\section{Experiments}
\subsection{Dataset and evaluation measures}
The MICCAI FLARE 2022 is an extension of the FLARE 2021~\cite{MedIA-FLARE21} with more segmentation targets and more diverse images. The dataset is curated from more than 20 medical groups under the license permission, including MSD~\cite{simpson2019MSD}, KiTS~\cite{KiTS,KiTSDataset}, AbdomenCT-1K~\cite{AbdomenCT-1K}, and TCIA~\cite{clark2013TCIA}. The training set includes 50 labeled CT scans with pancreas disease and 2000 unlabeled CT scans with liver, kidney, spleen, or pancreas diseases. The validation set includes 50 CT scans with liver, kidney, spleen, or pancreas diseases.
The testing set includes 200 CT scans where 100 cases have liver, kidney, spleen, or pancreas diseases and the other 100 cases have uterine corpus endometrial, urothelial bladder, stomach, sarcomas, or ovarian diseases. All the CT scans only have image information and the center information is not available.

The evaluation measures consist of two accuracy measures: Dice Similarity Coefficient (DSC) and Normalized Surface Dice (NSD), and three running efficiency measures: running time, area under GPU memory-time curve, and area under CPU utilization-time curve. All measures will be used to compute the ranking. Moreover, the GPU memory consumption has a 2 GB tolerance.

\subsection{Implementation details}
\subsubsection{Environment settings}
The environments and requirements are presented in \autoref{table:env}.

\begin{table}[!htbp]
\caption{Environments and requirements.}\label{table:env}
\centering
\begin{tabular}{ll}
\hline
Windows/Ubuntu version       & Ubuntu 20.04 \\
\hline
CPU   & AMD Ryzen Threadripper PRO 3975WX \\
\hline
RAM                         &251G\\
\hline
GPU (number and type)                         & NVIDIA GeForce RTX 3090 (24G, $\times$1)\\
\hline
CUDA version                  & 11.4\\                          \hline
Programming language                 & Python 3.9\\ 
\hline
Deep learning framework & PyTorch (torch 1.10.0, torchvision 0.11.1) \\
\hline
Code available at & \url{https://github.com/jwc-rad/MISLight} \\
\hline
\end{tabular}
\end{table}

\subsubsection{Training protocols}
The training protocols are shown in \autoref{table:protocol}. Except for the preprocessing, coarse and fine segmentation training are performed with the same protocols. During training, the labeled and unlabeled data are randomly sampled alternately at a ratio of 1:1. An epoch is defined as an iteration over all the labeled data. Therefore, each epoch includes a random subset of the unlabeled data.

The weight factors $\lambda_{dis}$ and $\lambda_{ssl}$ are time-dependent Gaussian warming-up functions~\cite{yu2019uncertainty} $\lambda(t)=\lambda_0 \cdot e^{-5(1-t/t_{max})^2}$ where $t$ denotes the current training epoch and $t_{max}$ is the total epoch number. We use $\lambda_0=10$ for $\lambda_{dis}$~\cite{liu2019structured} and $\lambda_0=0.1$ for $\lambda_{ssl}$~\cite{yu2019uncertainty}.

The coarse segmentation is first trained using the whole-volume inputs. Then, the trained student model is applied to all the unlabeled data to acquire coarse masks. For the fine segmentation, cropping is performed around the coarse masks and the ground truth masks for the unlabeled and labeled data, respectively. Using the cropped volumes as inputs, the fine segmentation training is performed.

\begin{table}[!htbp]
\caption{Training protocols.}
\label{table:protocol}
\begin{center}
% \resizebox{0.47\textwidth}{!}{
\begin{tabular}{ll} 
\hline
Data augmentation               & Elastic deformation, scaling, rotation, \\
& crop, Gaussian noise, brightness\\
\hline
Network initialization         & Xavier normal initialization\\
\hline
Batch size                    & 1 \\
\hline 
Patch size & $96\times64\times96$  \\ 
\hline
Total epochs & 1000 \\
\hline
Optimizer          & SGD with Nesterov momentum\\
& ($\mu=0.99$, $decay=3e-5$)          \\ \hline
Loss               & Dice + Focal ($\alpha=0.5$, $\gamma=2$)          \\ \hline
Initial learning rate  & 0.01 \\ \hline
Learning rate decay schedule & $(1-epoch/epoch_{max})^{0.9}$~\cite{chen2017deeplab} \\
\hline
Training time               & 7.5 hours \\  \hline 
Number of model parameters    & 189.3M (5.2M in test) \tablefootnote{\url{https://github.com/PyTorchLightning/pytorch-lightning}}\\ \hline
Number of flops & 443.1G (21.7G in test) \tablefootnote{\url{https://github.com/sovrasov/flops-counter.pytorch}} \\ \hline
%CO$_2$eq & XXX Kg\tablefootnote{\url{https://github.com/lfwa/carbontracker}} \\ \hline

\end{tabular}
%}
\end{center}
\end{table}

\subsubsection{Testing protocols}
Only the student is used at inference, with the number of model parameters 5.2M and the number of flops 21.7G.

The same preprocessing as the training protocols except for data augmentation is applied for the testing. For coarse segmentation, inference is performed with a sliding window approach with overlap by half of the size of a patch where the resulting prediction is a weighted sum of sliding windows. To reduce the influence of predictions close to boundaries, a Gaussian importance weighting is applied for each predicted patch~\cite{isensee2021nnu}. For fine segmentation, since the image is cropped with 10\% padding around the coarse mask and resampled to the same size as the model input, inference is only performed once without the sliding window approach.

\subsection{Ablation study}
In the ablation study, as the baseline, fully supervised learning (FSL) is performed to train both coarse and fine segmentation models using only the labeled data. In other experiments, the coarse segmentation is fixed to the proposed method, and different training pipelines are used for the fine segmentation. First, FSL is applied to the fine segmentation using only the student model. Also, we conduct experiments with a single teacher and a student framework: FSL with KD, SSL with KD, SSL with pseudo-supervision, and SSL with KD and pseudo-supervision. We investigate the isolated effect of cross-teaching by training two cross-teaching students. Moreover, the proposed method's variants with no KD, no pseudo-supervision, and teachers sharing the same architecture, respectively, are performed. In all experiments, the network architectures of teacher and student models and training protocols are the same as in the proposed method. For experiments with two models of the same size for inference, we choose the one with transposed convolutions. Otherwise, the student model is used for inference.

\section{Results and discussion}
All DSC results for the experiments are obtained via the validation leaderboard of the MICCAI FLARE 2022 challenge. Also, detailed results, including efficiency analysis, are processed privately and provided by the challenge organizers based on submissions using Docker containers.

\subsection{Ablation study}
\autoref{table:ablation} shows the results of the ablation study. The baseline FSL shows a mean DSC of 0.7712, which slightly increases to 0.7812 when the proposed method is performed for the coarse segmentation. Applying KD to the basic FSL model yields an improved mean DSC of 0.8261 from 0.7812. This is better than the experiments on SSL with a single teacher and a student, which implies that ineffective use of unlabeled data only hinders the training of the student model. When unlabeled data is effectively exploited by the cross-teaching strategy, it shows better results than the FSL with KD even without the teacher model. Although there is little performance gain with pseudo-supervision from cross teaching teachers only, KD and combined use of KD and pseudo-supervision improve results. Moreover, teachers with slightly different decoders achieve better results than those with the same decoders, which is consistent with the results in Mutual Consistency Training~\cite{wu2021semi}.

\begin{table}[!htbp]
\centering\setlength{\tabcolsep}{2.5pt}
\caption{Ablation study results on the MICCAI FLARE 2022 validation set. The baseline uses only the labeled data to train both coarse and fine segmentations, whereas, in the rest of the experiments, the proposed method is used for training the coarse segmentation and each row shows the training settings for the fine segmentation. CTS and CTT mean cross teaching between two students and two teachers, respectively. CTT\textsubscript{SD} uses teachers with decoders with the same architecture. \textsuperscript{*}The one with transposed convolutions out of two models is used for inference.}
\label{table:ablation}
\begin{tabular}{cc|cccc|c}
\hline
\# of T & \# of S & SSL & KD & PSV & Cross Teaching & Mean DSC  \\ \hline
0   & 1 &               &           &           &                       & 0.7712±0.1193 (baseline)\\
\hdashline
0   & 1 &               &           &           &                       & 0.7812±0.1121         \\
1   & 1 &               &\checkmark &           &                       & 0.8261±0.1107         \\
1   & 1 & \checkmark    &\checkmark &           &                       & 0.8227±0.1122 \\
1   & 1 & \checkmark    &           &\checkmark &                       & 0.8234±0.1101 \\
1   & 1 & \checkmark    &\checkmark &\checkmark &                       & 0.8173±0.1149 \\
0   & 2 & \checkmark    &           &           &CTS                    & 0.8296±0.1092\textsuperscript{*} \\
2   & 1 & \checkmark    &           &\checkmark &CTT                    & 0.8297±0.1111 \\
2   & 1 & \checkmark    &\checkmark &           &CTT                    & 0.8407±0.1075 \\
2   & 1 & \checkmark    &\checkmark &\checkmark &CTT\textsubscript{SD}  & 0.8394±0.1086 \\
2   & 1 & \checkmark    &\checkmark &\checkmark &CTT                    & \textbf{0.8429±0.1043 (proposed)} \\ \hline
%2   & 1 & \checkmark    &\checkmark &\checkmark &CTT                    &  ($T_1$)\textsuperscript{*} \\
\end{tabular}

% \begin{tabular}{l|cc|c}
% Method                  & \# of T   & \# of S   & Mean DSC  \\ \hline
% FSL                     & & & \\
% \hspace{1ex}Baseline    & 0         & 1         & 0.7812±0.1121         \\
% \hspace{1ex}KD          & 1         & 1         & 0.8261±0.1107         \\ \hline
% SSL                     & & & \\
% \hspace{1ex}KD          & 1         & 1         & 0.8227±0.1122 \\
% \hspace{1ex}PSV         & 1         & 1         & \\
% \hspace{1ex}PSV + KD    & 1         & 1         & 0.8173±0.1149 \\
% \hspace{1ex}CTS\textsuperscript{*}  & 0     & 2 & 0.8296±0.1092 \\
% \hspace{1ex}CTT + PSV   & 2         & 1         & 0.8297±0.1111 \\
% \hspace{1ex}CTT + KD    & 2         & 1         & 0.8407±0.1075 \\
% \hspace{1ex}CTT\textsubscript{SD} + PSV + KD & 2 & 1 & 0.8394±0.1086 \\
% \hspace{1ex}\textbf{CTT + PSV + KD (Proposed)} & 2 & 1 & 0.8429±0.1043 \\ \hdashline
% \hspace{1ex}CTT + PSV + KD ($T_1$)\textsuperscript{*} & 2 & 1 &
% \end{tabular}

\end{table}

\subsection{Quantitative results on validation set}
The proposed method shows a mean DSC of 0.8429±0.1043 and a mean NSD of 0.8990±0.0755 in the MICCAI FLARE 2022 validation set (\autoref{table:results-valid}). While large organs such as the liver or spleen are well segmented with DSC higher than 0.9, the proposed method works relatively poorly for adrenal glands and gallbladder. This may be attributed to the known weakness of overlap-based metrics, including DSC, when applied to small objects, since the proposed method depends on the Dice loss~\cite{taha2015metrics}.

\begin{table}[!htbp]
\centering\setlength{\tabcolsep}{2.5pt}
\caption{Segmentation results on the MICCAI FLARE 2022 validation set.}
\label{table:results-valid}
\begin{tabular}{l|cc}
\hline
Organ       & DSC           & NSD           \\
\hline
Liver       & 0.9711±0.0214 & 0.9762±0.0406 \\
RK          & 0.9095±0.2092 & 0.9177±0.2221 \\
LK          & 0.8975±0.2163 & 0.9053±0.2240 \\
Spleen      & 0.9593±0.0417 & 0.9704±0.0663 \\
Pancreas    & 0.8575±0.0529 & 0.9468±0.0526 \\
Aorta       & 0.9383±0.0249 & 0.9744±0.0544 \\
IVC         & 0.8781±0.0963 & 0.8855±0.1254 \\
RAG         & 0.6907±0.1533 & 0.8383±0.1684 \\
LAG         & 0.6578±0.2009 & 0.7876±0.2228 \\
Gallbladder & 0.7165±0.3546 & 0.7225±0.3632 \\
Esophagus   & 0.8189±0.1200 & 0.9199±0.1187 \\
Stomach     & 0.8959±0.1647 & 0.9288±0.1563 \\
Duodenum    & 0.7672±0.1281 & 0.9139±0.0872 \\
\hdashline
Mean        & 0.8429±0.1043 & 0.8990±0.0755 \\
\hline
\end{tabular}
\end{table}

\subsection{Qualitative results on validation set}
\autoref{fig:samples} illustrates the example segmentation results of the baseline FSL model and the proposed method from the MICCAI FLARE 2022 validation set. Whereas both the baseline and the proposed method yield satisfactory results for routine contrast-enhanced CT images and healthy organs, the proposed method shows better results for CT with noise and non-portal contrast phases and lesion-affected organs. However, the proposed method fails in some cases with large lesions or out-of-distribution diseases such as hiatal hernia or large amounts of ascites.

\begin{figure}[htbp]
\centering
\includegraphics[width=0.8\columnwidth]{imgs/Samples.pdf}
\caption{Example cases from the MICCAI FLARE 2022 validation set. The first column is the CT image, the last column is the ground truth, and the second and third columns show the segmentation by the baseline fully supervised model and the proposed method, respectively. Descriptions for each row are as follows: (a) noisy image, (b) arterial phase contrast CT, (c) gallstones, (d) left kidney tumor, (e) large right kidney tumor, (f) hiatal hernia, and (g) large amounts of ascites.}
\label{fig:samples}
\end{figure}

\subsection{Segmentation efficiency results on validation set}
The segmentation efficiency results are acquired in the private testing environment of the MICCAI FLARE 2022 challenge (\autoref{table:testenv}). In the validation set, the mean running time of the proposed method is 28.89 s with a range of 24.77–48.43 s. The maximum GPU memory usage is 2025 MB for all cases. The areas under the GPU memory-time and CPU utilization-time curves show a mean of 27167 MB$\cdot$s and 596.95 \%$\cdot$s, respectively, and a range of 25890–37132 MB$\cdot$s and 466.76–990.31 \%$\cdot$s, respectively.

\begin{table}[!htbp]
\caption{Testing environments in MICCAI FLARE 2022 challenge.}\label{table:testenv}
\centering
\begin{tabular}{ll}
\hline
Windows/Ubuntu version       & Ubuntu 20.04 \\
\hline
CPU   & Intel\textregistered{} Xeon\textregistered{} W-2133 CPU @ 3.60\,GHz $\times$ 12 \\
\hline
RAM                         &32G (Available memory 28G)\\
\hline
GPU             & NVIDIA QUADRO RTX5000 (16G)\\
\hline
\end{tabular}
\end{table}


\subsection{Results on test set}
The proposed method ranked 10th in the MICCAI FLARE 2022 test phase. The segmentation results showed a mean DSC of 0.8520±0.0987 and a mean NSD of 0.9137±0.0666 (\autoref{table:results-test}). The mean running time was 28.16 s. The areas under the GPU memory-time and CPU utilization-time curves showed a mean of 23092 MB$\cdot$s and 575 \%$\cdot$s, respectively.

\begin{table}[!htbp]
\centering\setlength{\tabcolsep}{2.5pt}
\caption{Segmentation results in the MICCAI FLARE 2022 test phase.}
\label{table:results-test}
\begin{tabular}{l|cc}
\hline
Organ       & DSC           & NSD           \\
\hline
Liver       & 0.9763±0.0154 & 0.9859±0.0244 \\
RK          & 0.9332±0.1672 & 0.9471±0.1742 \\
LK          & 0.9420±0.1179 & 0.9542±0.1306 \\
Spleen      & 0.9471±0.1386 & 0.9634±0.1445 \\
Pancreas    & 0.8204±0.1002 & 0.9281±0.0961 \\
Aorta       & 0.9375±0.0469 & 0.9748±0.0634 \\
IVC         & 0.8850±0.0858 & 0.9050±0.0991 \\
RAG         & 0.7338±0.1200 & 0.8808±0.1429 \\
LAG         & 0.7135±0.1361 & 0.8559±0.1488 \\
Gallbladder & 0.7312±0.3493 & 0.7370±0.3567 \\
Esophagus   & 0.7721±0.1398 & 0.8777±0.1554 \\
Stomach     & 0.9254±0.0947 & 0.9569±0.0982 \\
Duodenum    & 0.7587±0.1177 & 0.9109±0.1021 \\
\hdashline
Mean        & 0.8520±0.0987 & 0.9137±0.0666 \\
\hline
\end{tabular}
\end{table}

\subsection{Limitations and future work}
Although the idea of KD from SSL-based teachers can be applied to any kind of SSL design, this study only uses the cross teaching method; other state-of-the-art SSL methods, including uncertainty-aware strategies~\cite{yu2019uncertainty,luo2021efficient}, were not evaluated. Also, for KD, other losses than the KL divergence loss and other distillation methods such as feature-based or relation-based KD can be utilized~\cite{gou2021knowledge}. Moreover, we only use depthwise separable convolutions to build a student model, but other efficient building blocks such as the spatial pyramid module in ESPNet~\cite{mehta2018espnet} may be a better choice. Comparisons of different SSL designs, KD methods, and efficient network architectures should be addressed in future work.

\section{Conclusion}
This study combines several methods for efficient semi-supervised abdominal organ segmentation in CT. A whole-volume-based coarse-to-fine framework and depthwise separable convolutions contribute to efficiency. Cross teaching and pseudo-supervision are applied to utilize unlabeled data. Also, models with slightly different decoders further enhance the effect of cross teaching. Finally, knowledge distillation enables the joint use of model compression and semi-supervised learning. The proposed method showed mean Dice scores of 0.8429 and 0.8520 in the MICCAI FLARE 2022 validation and test sets, respectively.

\subsubsection{Acknowledgements} The author of this paper declares that the segmentation method implemented for participation in the FLARE 2022 challenge has not used any pre-trained models or additional datasets other than those provided by the organizers. Also, the proposed solution is fully automatic without any manual intervention.


%
% ---- Bibliography ----
%
% BibTeX users should specify bibliography style 'splncs04'.
% References will then be sorted and formatted in the correct style.
%
\bibliographystyle{splncs04}
\bibliography{ref}

\end{document}
