\documentclass{midl} % Include author names
%\documentclass[]{midl} % Anonymized submission

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{multirow}

\jmlrvolume{-- nnn}
\jmlryear{2024}
\jmlrworkshop{Full Paper -- MIDL 2024}
\editors{Accepted for publication at MIDL 2024}

\title[Efficient Anatomy Segmentation with KD]{Efficient Anatomy Segmentation in Laparoscopic Surgery using Multi-Teacher Knowledge Distillation}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Lennart Maack \nametag{$^{1}$}} \Email{lennart.maack@tuhh.de}\\
\Name{Finn Behrendt \nametag{$^{1}$}} \Email{Finn.Behrendt@tuhh.de}\\
\Name{Debayan Bhattacharya \nametag{$^{1}$}} \Email{Debayan.Bhattacharya@tuhh.de}\\
\Name{Sarah Latus \nametag{$^{1}$}} \Email{s.latus@tuhh.de}\\
\Name{Alexander Schlaefer \nametag{$^{1}$}} \Email{schlaefer@tuhh.de}\\
\addr $^{1}$ Institute of Medical Technology and Intelligent Systems, Hamburg University of Technology, Hamburg, Germany \\
}

\begin{document}

\maketitle

\begin{abstract}
Automatic segmentation of anatomical structures in laparoscopic images or videos is an important prerequisite for visual assistance tools which are designed to increase efficiency and safety during an intervention.
In order to be used in a realistic clinical scenario, both high accuracy and real-time capability are required.
Current deep learning networks for anatomy segmentation show high accuracy, but are not suitable for real-time clinical application due to their large size.
As smaller, real-time capable deep learning networks show lower segmentation performance, we propose a multi-teacher knowledge distillation approach applicable to partially labeled datasets.
We leverage the knowledge of multiple anatomy-specific, high-accuracy teacher networks to improve the segmentation performance of a single and efficient student network capable of segmenting multiple anatomies simultaneously.
To do so, we minimize the Kullback-Leibler divergence between the normalized anatomy-specific teacher logits and the respective normalized logits of the student.
We conduct experiments on the Dresden Surgical Anatomy Dataset, which provides multiple subsets of binary segmented anatomical structures.
Results show that our approach can increase the overall Dice score for different real-time capable network architectures for anatomy segmentation.

\end{abstract}

\begin{keywords}
Anatomy Segmentation, Real-Time, Surgical Computer Vision, Knowledge Distillation
\end{keywords}

\section{Introduction}

Postoperative complications remain a major problem for both the healthcare system and the individual patient and are associated with higher healthcare costs and poorer outcomes \cite{Dencker2021}. An important factor in decreasing postoperative complications is the reduction of technical errors, which are defined as adverse events directly related to manual errors of the surgeon \cite{Fecso2017, Suliburk2019}. \\
The increasing adoption of minimally invasive procedures, which rely on the visualization by endoscopic cameras, leads to an increasing amount of available surgical video data. 
This enables the data-driven analysis of surgical video data using computer vision techniques, such as deep learning, to visually assist surgeons \cite{Mascagni2022}.
An essential prerequisite for computer vision-based surgical assistance applications is the accurate and real-time perception of the intraoperative environment, e.g. by segmenting the surgical instruments and anatomical structures. To evaluate the accuracy of computer vision-based methods for anatomy segmentation, various data sets with pixel-wise annotations of anatomical structures were introduced \cite{Bamba2021, Madani2020, https://doi.org/10.48550/arxiv.2001.11190}.
As these data sets have little diversity due to their size or are based solely on simpler porcine tissue data, the applicability of deep learning models trained with such data sets in a real clinical setting is limited. 
Therefore, Carstens et al. \cite{Carstens2023} published the largest public data set of laparoscopic images to date, namely the Dresden Surgical Anatomy Dataset (DSAD). 
It is divided into partially annotated sub-datasets, containing overall $13{,}195$ laparoscopic images with pixel-wise annotations of eleven anatomical structures.
As only binary annotations for one anatomy are provided in a sub-dataset, although several other anatomical structures without annotation are visible, complete information about the background remains unknown.
% This prevents the simple usage of multi-channel segmentation networks.
To tackle the problem of training networks with partially labeled datasets, the usage of
annotation adaptive loss functions has been proposed \cite{vu2021data, ulrich2023multitalent}.
In their recent work, Kolbinger et al. \cite{kolbinger2023anatomy} trained a combined network with a shared encoder and multiple decoders for each of the sub-datasets in the DSAD. In addition, they made use of mutual-exclusivity by incorporating the information of a positive annotation of one class as a negative annotation for all other classes. 
Despite achieving segmentation accuracy comparable to human experts, there are two limitations.
First, the binary segmentation accuracy of the combined network is inferior to that of anatomy-specific single-encoder, single-decoder networks \cite{kolbinger2023anatomy}. Second, due to the large size of the encoder and decoder networks, achieving a sufficiently high frame rate for delay-free segmentation of anatomical structures on dedicated hardware in the operating room cannot be guaranteed. 
Methods for real-time capable segmentation in surgical videos include the development of lightweight convolution-based architectures for faster segmentation of surgical instruments in laparoscopy \cite{tomasini2022efficient, https://doi.org/10.48550/arxiv.2007.04449, jha2021nanonet}.
While networks with a low amount of parameters can provide good segmentation performance for simpler tasks such as surgical instrument segmentation, they are ineffective at learning complex features required for accurate segmentation of anatomies in surgical videos.
Knowledge Distillation has drawn attention to overcome the dilemma of decreasing performance when using smaller models capable of faster inference speed.   
Xie et al. \cite{Xie2018ImprovingFS} transfer the zero- and first-order knowledge from a strong teacher network to guide the fast student network. Qin et al. \cite{qin2021efficient} improve the segmentation performance on public CT datasets by proposing a novel module that encodes regional knowledge for a student network.
To overcome the dependency of student networks on a single teacher network, Amirkhani et al. \cite{amirkhani2021robust} include multiple teacher networks trained with the same input but different style transfers and data augmentations.\\
In this work, we propose a multi-teacher knowledge distillation (MT-KD) approach that leverages the knowledge of multiple anatomy-specific, high-accuracy teacher networks to tackle the problem of training a single network with partially labeled datasets.
Specifically, we use MT-KD to improve the segmentation performance of a real-time capable student network with a small number of parameters.
In a first step, multiple teacher networks are trained to obtain high anatomy-specific accuracy. 
In a second step, the Kullback-Leibler (KL) divergence is minimized between the normalized output logits of the individual anatomy-specific teacher models and the normalized output logits of the corresponding anatomy-specific decoder of the student model.
This way, the teacher networks guide the student to pay more attention to the most salient regions in order to accurately segment the anatomies. By improving the segmentation accuracy of small student networks, capable of segmenting multiple anatomies simultaneously, we aim to increase the applicability of computer vision methods in realistic surgical scenarios.
The comprehensive evaluation of our MT-KD approach shows increased segmentation performance across various network architectures with a small number of parameters, as shown in Figure \ref{fig:Figure_1}.

\begin{figure} [ht]
   \begin{center}
   \includegraphics[width=0.75\textwidth]{Fig1.pdf}
   \end{center}
   \caption{Performance of different segmentation networks presented as the mean Dice score over all eleven anatomies in the DSAD. The red stars indicate the performance of the network trained with Multi-Teacher Knowledge Distillation (MT-KD). The red arrows indicate the respective improvement. $\bigstar$ refers to models using a common encoder and eleven anatomy-specific decoders. $\blacksquare$ indicates segmentation models that are trained on each anatomy separately.}
    \label{fig:Figure_1}
\end{figure} 
\section{Method}

Our multi-teacher knowledge distillation approach with its two stages is schematically illustrated in Figure \ref{fig:Figure_2} and explained in more detail in the following. \\
\newline
\textbf{Stage 1:} \\
During the first stage, $D=\{A_{1}, A_{2}, \dots, A_{N}\}$ corresponds to the overall data set, where $A_i$ denotes the anatomy-specific data set of the $i^{th}$ anatomy of $N$ anatomies.
In the following, only one of the anatomy-specific data sets is considered. \\
Let $A$ be denoted as $A = \{( x_j , y_j )\}^{M}_{j=1}$, where $x_{j} \in \mathbb{R}^{3 \times H \times W}$ and $y_{j} \in \mathbb{R}^{1 \times H \times W}$ correspond to the input image and binary segmentation mask, respectively. $M$ denotes the overall number of images in $A$.
Further, a teacher segmentation model $T$, consisting of one encoder $F^{enc}$ and one decoder $F^{dec}$, is trained using the standard pixel-wise binary cross entropy loss function formulated as:
\begin{equation}
    \mathcal{L}_{CE} = - \sum_{j=1}^{M} \sum_{k=1}^{H \times W} [ y_{j,k} \log(T(x_{j,k})) + (1-y_{j,k}) \log(1-T(x_{j,k}))  ]
    \label{eq:equation_1}
\end{equation}
\newline
\textbf{Stage 2:} \\
In this stage, the same data set $D$ is utilized as in stage 1. This time, a student segmentation model $S$ that consists of one encoder $F^{enc}$ and $N$ anatomy-specific decoders $ \{F^{dec}_{1}, F^{dec}_{2}, \dots, F^{dec}_{N} \}$ is optimized using two different objective functions. \\
For the first objective function, we follow the description from Kolbinger et al. \cite{kolbinger2023anatomy}.
There, the pixel-wise binary cross entropy loss according to Equation \ref{eq:equation_1} is calculated separately using the output probabilities of each of the $N$ anatomy-specific decoders.
In detail, the loss is calculated for each pixel, only if the annotated anatomy $i$ in the input image corresponds to the respective anatomy-specific decoder $F_{i}^{dec}$ of $S$.
For all other decoders $F^{dec}_{ \neq i}$, only the pixels in $x_{j}$ that belong to the anatomy $i$ are considered for the loss calculation as the false positive class, as shown in Figure \ref{fig:Figure_2}.
The remaining pixels are not considered for the loss calculation: as only binary segmentation masks are provided in the DSAD and several anatomies can appear per image, it cannot be ruled out that these pixels contain the anatomy of the respective decoder. \\
The second objective function utilizes the anatomy-specific knowledge of the various teacher models with frozen parameters from stage 1.
Similar to the work by Shu et al. \cite{shu2021channel}, where the normalized activations of corresponding channels between the teacher and student network are aligned using the KL divergence, we utilize the normalized output logits of each of the anatomy-specific teacher models and minimize the discrepancy to the normalized output logits of the corresponding anatomy-specific decoder of the student model.
As the logits of a well-trained, anatomy-specific teacher model generally show salient anatomical regions, the student model, capable of segmenting multiple anatomies simultaneously, can be guided. This results in overall higher segmentation performance of the student.
Let $z^T_{i, C}$ and $z^S_{i, C}$ be the output logits of the anatomy-specific teacher model $T_i$ and the student decoder $F^{dec}_{i}$ of anatomy $i$, with $C$ being either the anatomy or false positive/background class.
First, the output logits $z^T_{i, C}$ and $z^S_{i, C}$ are divided by a temperature value $\mathcal{T}$ and then normalized using the softmax function $\sigma(z)=\frac{e^z}{\sum(e^z)}$. The temperature value $\mathcal{T}$ is used to control the softness of the probability distribution.
Second, to evaluate the discrepancy between the two probability distributions $p^{T}_{i,C} =  \sigma(\frac{z^T_{i, C}}{\mathcal{T}})$ and $p^{S}_{i,C} = \sigma(\frac{z^S_{i, C}}{\mathcal{T}})$, we utilize the KL divergence \cite{kullback1951information}. \\ 
The second objective function during stage 2 can therefore be denoted as:
\begin{equation}
    \mathcal{L}_{KL_{i}}(p^{S}_{i}, p^{T}_{i}) = \sum_{C} p^{T}_{i,C} \cdot \log\left(\frac{p^{T}_{i,C}}{p^{S}_{i,C}}\right)
    \label{eq:equation_2}
\end{equation}
With $\lambda$ as a weighting parameter, the overall objective function of stage 2 can be formulated as:
\begin{equation}
    \mathcal{L} = \sum^N_{i=1} \mathcal{L}_{i} = \sum^N_{i=1} \left( \mathcal{L}_{CE_{i}} + \lambda \mathcal{L}_{KL_{i}} \right)
    \label{eq:equation_3}
\end{equation}


\begin{figure} [ht]
   \begin{center}
   \includegraphics[width=1\textwidth]{Figure_2.pdf}
   \end{center}
   \caption{Schematic of the proposed MT-KD approach. In stage 1, multiple teacher networks are trained to obtain high anatomy-specific accuracy. In stage 2, the normalized output logits of the individual anatomy-specific teacher models are used to guide the training of the student models using the KL-divergence.}
    \label{fig:Figure_2}
\end{figure} 

\section{Experimental Setup}

\subsection{Data}

To evaluate our MT-KD approach, we use the DSAD that consists of $13{,}195$ high-quality laparoscopic images with pixel-wise annotations of eleven intra-abdominal anatomical structures, i.e.,  abdominal wall, colon, inferior mesenteric artery, intestinal veins, liver, pancreas, small intestine, spleen, stomach, ureter and vesicular glands \cite{Carstens2023}. 
We follow the training, validation and test splits as well as the pre-processing steps and augmentations as in the original work from \cite{kolbinger2023anatomy}.


\subsection{Implementation details}

For all experiments with the proposed approach, we utilize the Adam optimizer \cite{kingma2014adam}, using a learning rate of 5e-4 during stage 1 and a learning rate of 1e-3 for stage 2. Additionally, an exponential learning rate scheduler is used. We train our models for $100$ epochs and $60$ epochs in stage 1 and 2 and end up with a final learning rate of 1.5e-6 and 2.5e-5, respectively.
During the teacher-student knowledge distillation, we follow the implementation details from \cite{shu2021channel} and use the temperature value $\mathcal{T} = 4.0$ for the calculation of the KL divergence and the weighting parameter $\lambda=3$. During all training experiments, a batch size of 8 and an input image size of $640 \times 512$ are utilized.
The utilized convolutional encoder architectures are pretrained on either the COCO or Cityscapes data set \cite{lin2014microsoft, cordts2016cityscapes}. For the transformer-based segmentation networks, i.e., SegFormer, we use pretrained weights from the Cityscapes data set.  
For the evaluation on the test data set, the model that obtained the best results in terms of the Dice score on the validation set is used. We implement all models in PyTorch\footnote{Code available at: \\ 
\url{https://github.com/lennart-maack/Efficient-Anatomy-Segmentation-w-Multi-Teacher-KD}}.

\section{Results}
To evaluate the proposed approach in this work, the mean Dice score among all eleven anatomies in the DSAD is used. Furthermore, the number of parameters of each segmentation model as well as the inference time in form of time for computing the segmentation mask of one input image of size $1280 \times 720$ (High Definition) is determined. \\

\begin{table}[ht]
\caption{Segmentation and inference speed results. $\blacksquare$ indicates segmentation models that are trained on each anatomy separately, consisting of one encoder and one decoder. $\bigstar$ indicates models using a common encoder and eleven anatomy-specific decoders. Dice score ($\%$) is calculated as the mean over all eleven anatomies. Time corresponds to the inference time in ms for one image ($1280 \times 720$) on a NVIDIA RTX3090. SegFormer-B3 $\blacksquare$ is used as a teacher network for our Multi-Teacher Knowledge Distillation (MT-KD) approach.}
\begin{center}
\resizebox{\columnwidth}{!}{%
\begin{tabular}{cccccc}
\hline
    Architecture & Encoder &  Params(M) & Dice ($\uparrow$) & Time ($\downarrow$) & FPS ($\uparrow$) \\
    \hline
    \multirow{1}{*}{DeepLabv3 $\blacksquare$}& ResNet18 & 174.8 & 53.8 & 160 & 6 \\
    \cite{chen2017rethinking} & ResNet50 &  435.9 & 65.3 & 367 & 3\\
    \cline{2-6}
    \multirow{1}{*}{}& EfficientNetb0 & 80.3 & 62.8 & 262 & 4 \\
    & EfficientNetb3 & 159.1 & 66.3 & 479 & 2 \\
    \cline{2-6}
    \multirow{1}{*}{SegFormer $\blacksquare$}& SegFormerB0 & 40.8 & 64.0 & 180 & 6\\
    \cite{xie2021segformer} & SegFormerB3 & 519.5 & \textbf{69.7} & 548 & 2 \\
    \cline{2-6}
    \multirow{1}{*} {MiniNetv2 $\blacksquare$}  & MiniNetv2 & 5.5 &  43.2 & 35 & 28 \\
    \cite{tomasini2022efficient} &  &  &  &  & \\
    \hline
    \multirow{1}{*}{}& ResNet18 & 63.1 & 58.7 & 42 & 23 \\
    DeepLabv3 $\bigstar$ & ResNet50 & 200.0 & 59.8 & 112 & 9 \\
    \cline{2-6}
    \multirow{1}{*}{\cite{chen2017rethinking}}& EfficientNetb0 & 40.0 & 60.3 & 45 & 22 \\
    & EfficientNetb3 &  52.2 & 64.2 & 67 & 15 \\
    \cline{2-6}
    \multirow{1}{*}{SegFormer $\bigstar$}& SegFormerB0 & 7.7 & 60.5 & 39 & 25 \\
    \cite{xie2021segformer} & SegFormerB3 & 78.7 & 66.9 & 153 & 6 \\
    \cline{2-6}
    \multirow{1}{*}{MiniNetv2 $\bigstar$}& MiniNetv2 & 1.1 & 36.1 & \textbf{26} & \textbf{38}\\
    \cite{tomasini2022efficient} &  &  & &  & \\
    
    \hline
    DeepLabv3 $\bigstar$& EfficientNetb0 & 40.0 & 64.5 & 45 & 22 \\
    (with our MT-KD approach) &ResNet18 & 63.1  & 64.4  & 42 & 23  \\
    \hline
    SegFormer $\bigstar$& SegFormerB0 & 7.7 & 64.9 & 39 & 25 \\
    (with our MT-KD approach) & &  &  &  \\
    \hline
    MiniNetv2 $\bigstar$&MiniNetv2 & 1.1 & 60.5 & \textbf{26} & \textbf{38} \\
    (with our MT-KD approach) & &  &  &  \\
    \hline
\end{tabular}
}
\end{center}
\label{tab:Table_1}
\end{table}
\begin{figure} [ht]
   \begin{center}
   \includegraphics[width=0.85\textwidth]{qual_results.pdf}
   \end{center}
   \caption{Qualitative segmentation results for different anatomies.}
    \label{fig:qual_results}
\end{figure} 
In order to assess the improvements of our approach, we evaluate previous deep learning models applied to the DSAD regarding segmentation accuracy and inference time, as well as another state-of-the-art network for efficient surgical segmentation, as shown in Table \ref{tab:Table_1}.
Both anatomy-specific models with one encoder and one decoder ($\blacksquare$), as well as models with one common encoder and multiple anatomy-specific decoders ($\bigstar$) are evaluated. From the results in Table \ref{tab:Table_1}, we observe that, with the exception of DeepLabv3 with a ResNet18 encoder, segmentation models trained on each anatomy separately outperform networks with the same architecture but using a common encoder and eleven anatomy-specific decoders in terms of Dice score.
However, the number of parameters increases significantly when using multiple anatomy-specific models which leads to low inference speed.
Furthermore, the results show superior performance in terms of Dice score when DeepLabv3 is used with EfficientNet as an encoder compared to ResNet encoders.
Small network architectures such as MiniNetv2 enable real-time capabilities, but show significantly lower segmentation performance.
The transformer-based architecture SegFormer achieves higher segmentation accuracy in comparison to convolutional-based DeepLabv3 networks.
Our proposed MT-KD approach increases the segmentation performance of both convolutional-based segmentation networks and transformer-based segmentation networks. 
The most significant increase due to our MT-KD approach can be shown for small models, i.e. MiniNetv2. In this case, the Dice score increases from $36.1\%$ to $60.5\%$.
Qualitative results in Figure \ref{fig:qual_results} show more accurate segmentation of anatomies for MiniNetv2 when trained with our MT-KD.
Especially for smaller details, the segmentation accuracy can be increased by using large and accurate teacher models. 
For MiniNetv2, an increase in segmentation performance can be observed across all eleven anatomies.
The specific segmentation results in terms of Dice score for individual anatomies can be found in the Appendix.



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


\section{Discussion and Conclusion}

Current deep learning networks for anatomy segmentation suffer from two problems. Either they show good segmentation performance, similar to human experts, but are too large for real-time applications or they are efficient enough for real-time applications but do not show sufficient performance for high-accuracy segmentation.
In this work, we propose a multi-teacher knowledge distillation (MT-KD) approach that leverages the knowledge of multiple anatomy-specific, high-accuracy teacher networks to tackle the problem of training a single and efficient network with partially labeled datasets.
By minimizing the discrepancy between the normalized logits of anatomy-specific, high-accuracy teacher networks and a single and efficient student network, the segmentation accuracy of various small, real-time capable network architectures is improved while retaining high inference speed.\\
Our results demonstrate highest segmentation performance with $66.3 \%$ and $69.7 \%$ Dice score for anatomy-specific, high capacity teacher networks such as DeepLabv3/Eff.Netb3 $\blacksquare$ and SegFormerB3 $\blacksquare$. We assume higher capacity to learn complex, anatomy-specific features.
The results further demonstrate that anatomy-specific, low capacity networks such as MiniNetv2 $\blacksquare$ only achieve an overall Dice Score of $43.2 \% $, failing to learn valuable features in the data.
For application in realistic surgical scenarios, anatomy-specific models need to be operated sequentially in order to segment several anatomies. A combined architecture with one common encoder and anatomy-specific decoders enables simultaneous anatomy segmentation.
% To enable simultaneous anatomy segmentation, the architecture is combined by using one common encoder and several anatomy-specific decoders.
Although the segmentation performance of the combined architecture decreases only by $2.1 \%$ and $2.8 \%$, for DeepLabv3/EfficientNetb3 $\bigstar$ and SegFormerB3 $\bigstar$, respectively, models with large encoder and decoder architectures still do not achieve a sufficiently high frame rate (15 FPS and 6 FPS).
With our approach, the segmentation performance of models with smaller and thus faster architectures increases by up to $24.4 \%$. Especially, the accurate segmentation of smaller and more complex anatomies can be improved for smaller segmentation networks when guided by accurate teacher models with our MT-KD approach.
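Overall, the segmentation accuracy of the best high-capacity, anatomy-specific network (SegFormerB3 $\blacksquare$) remains $4.8 \%$ higher than that of the best student network trained with our MT-KD approach (SegFormerB0 $\bigstar$); however, the student requires almost two orders of magnitude fewer parameters. 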
We evaluated our approach on the recently published DSAD.
In order to evaluate whether the generalization ability of smaller student models changes similarly to that of larger teacher models when applied to other laparoscopic data sets, it is of interest to perform further comprehensive studies on different laparoscopic data sets.

%%%

% predictions were, overall, better for large and similar-appearing organs such as the abdominal wall, the liver, the stomach, and the spleen as compared to smaller and more diverse-appearing organs such as the pancreas, the ureter, or vesicular glands.

% \midlacknowledgments{We thank a bunch of people.}


\bibliography{midl24_320}

\newpage

\appendix

\section{Anatomy-specific results}

\begin{table}[h]
\begin{center}
\resizebox{\columnwidth}{!}{%
\begin{tabular}{cc|c|c|c|c|c|c|c|c|c|c|c}
\hline
    \multirow{1}{*}{Architecture}& Encoder & Abdominal & Colon & Inferior & Intestinal & Liver & Pancreas & Small & Spleen & Stomach & Ureter & Vesicular \\
     &  & wall &  & mesenteric & veins &  &  & intestine &  &  &  & glands\\
     &  &  &  & artery &  &  &  &  &  &  &  & \\
    \hline
    \multirow{1}{*}{DeepLabv3 $\blacksquare$}& ResNet18 & 83 & 69 & 44 & 39 & 65 & 28 & 79 & 56 & 59 & 33 & 37 \\
    \cite{chen2017rethinking} & ResNet50 & 90 & 79 & 54 & 54 & 80 & 37 & 87 & 79 & 71 & 47 & 40\\
    \cline{2-13}
    \multirow{1}{*}{}& Eff.Netb0 & 88 & 77 & 51 & 49 & 71 & 42 & 86 & 74 & 66 & 40 & 47 \\
    & Eff.Netb3 & 90 & 79 & 54 & 56 & 76 & 43 & 88 & 78 & 66 & 48 & 52 \\
    \cline{2-13}
    \multirow{1}{*}{SegFormer $\blacksquare$}& SegFormerB0 & 89 & 76 & 51 & 51 & 78 & 45 & 85 & 73 & 64 & 44 & 48\\
    \cite{xie2021segformer} & SegFormerB3 & 91 & 79 & 58 & 58 & 83 & 46 & 89 & 81 & 75 & 52 & 55 \\
    \cline{2-13}
    \multirow{1}{*} {MiniNetv2 $\blacksquare$}  & MiniNetv2 & 80 & 55 & 34 & 24 & 61 & 25 & 70 & 38 & 44 & 20 & 25 \\
    \cite{tomasini2022efficient} &  &  &  &  & \\
    \hline
    \multirow{1}{*}{}& ResNet18 & 83 & 70 & 44 & 47 & 76 & 37 & 77 & 73 & 61 & 32 & 46 \\
    DeepLabv3 $\bigstar$ & ResNet50 & 83 & 72 & 49 & 47 & 68 & 35 & 79 & 73 & 63 & 35 & 47 \\
    \cline{2-13}
    \multirow{1}{*}{\cite{chen2017rethinking}}& Eff.Netb0 & 83 & 72 & 48 & 47 & 71 & 37 & 80 & 75 & 64 & 39 & 48 \\
    & Eff.Netb3 & 84 & 75 & 53 & 57 & 72 & 43 & 81 & 76 & 70 & 47 & 51 \\
    \cline{2-13}
    \multirow{1}{*}{SegFormer $\bigstar$}& SegFormerB0 & 83 & 73 & 42 & 53 & 75 & 42 & 79 & 69 & 62 & 42 & 46 \\
    \cite{xie2021segformer} & SegFormerB3 & 89 & 77 & 55 & 53 & 78 & 45 & 84 & 81 & 71 & 49 & 54 \\
    \cline{2-13}
    \multirow{1}{*}{MiniNetv2 $\bigstar$}& MiniNetv2 & 61 & 46 & 18 & 36 & 52 & 29 & 51 & 36 & 34 & 13 & 21\\
    \cite{tomasini2022efficient} &  &  & &  & \\
    \hline
    % DeepLabv3 $\bigstar$& ResNet18 & xy & 63.13 & 42 & 23 \\
    % (w/ our KD-approach) & &  &  &  \\
    % \hline
    DeepLabv3 $\bigstar$& Eff.Netb0 & 86 & 74 & 55 & 54 & 69 & 41 & 84 & 81 & 68 & 48 & 49 \\
    (w/ our KD-approach) & ResNet18 & 87 & 76 & 55 & 54 & 77 & 37 & 81 & 78 & 70 & 43 & 51  \\
    \hline
    SegFormer $\bigstar$& SegFormerB0 & 84 & 75 & 51 & 53 & 76 & 45 & 81 & 81 & 74 & 45 & 49 \\
    (w/ our KD-approach) & &  &  &  \\
    \hline
    MiniNetv2 $\bigstar$& MiniNetv2 & 83 & 74 & 46 & 49 & 72 & 40 & 80 & 74 & 64 & 41 & 42 \\
    (w/ our KD-approach) & &  &  &  \\
    \hline
\end{tabular}%
}
\end{center}
\caption{Segmentation results for all individual anatomies in terms of Dice score ($\%$).}
\end{table}



\end{document}
