\documentclass{midl} % Include author names
%\documentclass[anon]{midl} % Anonymized submission
\newcommand{\red}[1]{\textcolor{red}{#1}}
\usepackage{multirow}
\usepackage{soul}
\usepackage{url}
% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

% \usepackage{mwe} % to get dummy images

\jmlrvolume{-- TBA}
\jmlryear{2020}
\jmlrworkshop{Full Paper -- MIDL 2020}
% \editors{Accept for MIDL 2020}

\title[CNNs with Distance Transform Maps]{How Distance Transform Maps Boost Segmentation CNNs: An Empirical Study}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

\midlauthor{\Name{Jun Ma\nametag{$^{1}$}} \Email{junma@njust.edu.cn}\\
\addr $^{1}$ Department of Mathematics, Nanjing University of Science and Technology \\
\Name{Zhan Wei\nametag{$^{2}$}} \Email{weizhan@hdu.edu.cn}\\
\addr $^{2}$ HangZhou DianZi University \\
\Name{Yiwen Zhang\nametag{$^{3}$}} \Email{whisney@i.smu.edu.cn}\\
\addr $^{3}$ School of Biomedical Engineering, Southern Medical University \\
\Name{Yixin Wang\nametag{$^{4}$}} \Email{wangyixin19@mails.ucas.ac.cn}\\
\addr $^{4}$ Institute of Computing Technology, Chinese Academy of Sciences; University of Chinese Academy of Sciences \\
\Name{Rongfei Lv\midlotherjointauthor\nametag{$^{5}$}} \Email{lrf@cqu.edu.cn}\\
\addr $^{5}$ College of Optoelectronic Engineering, Chongqing University \\
\Name{Cheng Zhu\nametag{$^{6}$}} \Email{zhuc@hcsd-med.com}\\
\addr $^{6}$ Shenzhen Haichuang Medical Co. Ltd \\
\Name{Gaoxiang Chen\midljointauthortext{Contributed equally}\nametag{$^{7}$}} \Email{gaoxiangchen@wmu.edu.cn}\\
\addr $^{7}$ The First Affiliated Hospital of Wenzhou Medical University \AND
\Name{Jianan Liu\midlotherjointauthor\nametag{$^{8}$}} \Email{ljnnzb@mail.nankai.edu.cn}\\
\addr $^{8}$ College of Artificial Intelligence, Nankai University \AND
\Name{Chao Peng\midlotherjointauthor\nametag{$^{9}$}} \Email{cqupengchao@cqu.edu.cn}\\
\addr $^{9}$ College of Optoelectronic Engineering, Chongqing University \AND
\Name{Lei Wang\midlotherjointauthor\nametag{$^{10}$}} \Email{wanglei\_nuist@126.com}\\
\addr $^{10}$ School of Automation, Nanjing University of Information Science and Technology\\
\Name{Yunpeng Wang\nametag{$^{11}$}} \Email{18111510027@fudan.edu.cn}\\
\addr $^{11}$ Institutes of Biomedical Sciences, Fudan University \\
\Name{Jianan Chen\nametag{$^{12}$}} \Email{chenjn2010@gmail.com}\\
\addr $^{12}$ Department of Medical Biophysics, University of Toronto
}


\begin{document}

\maketitle

\begin{abstract}
Incorporating distance transform maps of ground truth into segmentation CNNs has been an interesting new trend in the last year. Despite many great works leading to improvements on a variety of segmentation tasks, the comparison among these methods has not been well studied.
In this paper, our \emph{first contribution} is to summarize the latest developments of these methods in the 3D medical segmentation field.
The \emph{second contribution} is that we systematically evaluated five benchmark methods on two representative public datasets.
These experiments highlight that all the five benchmark methods can bring performance gains to baseline V-Net. However, the implementation details have a noticeable impact on the performance, and not all the methods hold the benefits on different datasets.
Finally, we suggest the best practices and indicate unsolved problems for incorporating distance transform maps into CNNs, which we hope would be useful for the community. The codes and trained models are publicly available at \url{https://github.com/JunMa11/SegWithDistMap}.
\end{abstract}

\begin{keywords}
Distance transform maps, medical image segmentation, convolutional neural networks, signed distance function
\end{keywords}

\section{Introduction}
Convolutional neural networks (CNNs)\footnote{In this paper, CNNs refers specifically to the networks for medical image segmentation.} have been widely used on a variety of medical image segmentation tasks, and achieved great success, such as liver segmentation \cite{bilic2019LiTS}, heart segmentation \cite{bernard2018ACDC}, brain segmentation \cite{wang2019iseg} and so on. Recently, a new segmentation methodology is emerging where the distance transform maps are incorporated into existing CNNs \cite{MIDL2019-BDLoss, TMI2019HDLoss, AAAI2020-SDFLoss, MLMI2019-MH-Fernando, MP2019-RecBranch-Shusil} to obtain further improvements.

Most existing CNNs use binary or multi-label mask as ground truth. Distance transform maps (DTM) offer an alternative to classical ground truth. For example, a binary mask can be transformed to a graylevel image, termed as distance transform map, where the intensities of pixels in the foreground are changed according to the distance to the closest boundary. One can also compute the signed distance function (SDF) of the ground truth, which embeds object contours in a higher dimensional space. In general, the signed distance function takes negative values inside the object and positive values outside the object. The absolute value is defined by the distance between the point of interest and the closest boundary point. In a word, distance transform map or signed distance function is an implicit representation of ground truth, and there exists a rigorous mapping between them.


In the past year, incorporating the distance transform maps of image segmentation labels into CNN pipelines has received significant attention. These methods can be classified into two classes (Figure~\ref{fig:overview}) in terms of the usage of distance transform maps: (1) new loss functions \cite{MIDL2019-BDLoss, TMI2019HDLoss, AAAI2020-SDFLoss}: using distance transform maps to design new loss functions; and (2) adding auxiliary tasks \cite{MLMI2019-MH-Fernando, MP2019-RecBranch-Shusil}: generating the segmentation probabilistic map and regressing the distance transform maps at the same time.

All these methods argue that using distance transform maps can boost existing baseline CNNs, such as U-Net and V-Net. However, these methods are tested on different datasets, and there is no shared experimental protocol followed by all. Thus, we do not know which method should be chosen to improve performance in practice.

This paper aims to experimentally answer the question:


\begin{center}
\emph{How can distance transform maps boost segmentation CNNs?}
\end{center}


Our contributions are summarized as follows:
\begin{itemize}
  \item summarizing the latest developments about incorporating distance transform maps into CNN-based 3D medical image segmentation.
  \item benchmarking five methods on two representative datasets by extensive experiments.
\end{itemize}

\begin{figure}[!ht]
  \centering
  % Requires \usepackage{graphicx}
  \includegraphics[scale=0.65]{overview.PNG}
  \caption{Overview of the two categories of recent distance transform maps-related CNNs in medical image segmentation.}\label{fig:overview}
\end{figure}

The rest of the paper is organized as follows. A brief review of the recent\footnote{Here, ``recent'' means published in 2019 or later.} DTM-related CNNs in 3D medical image segmentation is given in Section~\ref{s:review}. We present the experimental settings in Section~\ref{s:exp} and the corresponding results in Section~\ref{s:res-dis}. Finally, we conclude this paper in Section~\ref{s:con}.

\section{CNNs with Distance Transform Maps}
\label{s:review}
In this section, we present an overview of five benchmark methods that are selected based on two criteria: (1) the method is general and can be applied to many 3D segmentation tasks; (2) the method was published in 2019--2020. Several related methods also use distance transform maps, but they are designed for specific tasks such as tubular segmentation \cite{wangMICCAI2019tubular} and lesion detection \cite{vanMICCAI2019lesion_dec}.
Evaluating these tailored methods is beyond the scope of this paper.

\subsection{Basic Notation}
Let $\Omega$ denote the grid on which the image $I$ is defined, and let $G$ and $S$ denote the corresponding ground truth and segmentation, respectively. $S_\theta$ denotes the softmax outputs of the CNN, where $\theta$ denotes the network parameters. Formally, we define the distance transform map (DTM) of ground truth $G$ by
\begin{equation}\label{Eq:DTM}
  G_{DTM} =
\begin{cases}
    \inf\limits_{y\in \partial G}||x-y||_2, & x \in G_{in}  \\
    0, & \text{otherwise}
\end{cases}
\end{equation}
where $||x-y||_2$ is the Euclidean distance between voxels $x$ and $y$, and $G_{in}$ denotes the inside of the object.
The signed distance function (SDF) of ground truth $G$ is defined by
\begin{equation}\label{Eq:LSF}
  G_{SDF} =
\begin{cases}
    -\inf\limits_{y\in \partial G}||x-y||_2, & x \in G_{in} \\
    0, & x \in \partial G \\
    \inf\limits_{y\in \partial G}||x-y||_2, & x \in G_{out}
\end{cases}
\end{equation}
where $G_{out}$ and $\partial G$ denote the outside and boundary of the object, respectively.
The main difference between distance transform map $G_{DTM}$ and signed distance function $G_{SDF}$ is that the $G_{SDF}$ considers the distance transformation information of both foreground and background, while $G_{DTM}$ only computes the distance transformation of the foreground.

In the following two subsections, we give a brief review of five methods that will be evaluated in Section \ref{s:exp}.
As shown in Figure \ref{fig:overview}, we divided the five methods into two categories, new loss functions and adding auxiliary tasks, based on their main contributions (the usage of distance transform maps).

\subsection{New Loss Functions}
Kervadec et al. \cite{MIDL2019-BDLoss} proposed boundary loss (BD) to mitigate unbalanced segmentation problems. The key idea is to use an integral approach for computing boundary variations between segmentation and ground truth, which avoids complex local differential computations. Specifically, the loss is defined by:
\begin{equation}
  L_{BD} = \frac{1}{|\Omega|} \sum_{\Omega} G_{SDF} \circ S_\theta
\end{equation}
where $G_{SDF}$ denotes the signed distance function of ground truth $G$, and $\circ$ is  the Hadamard (i.e. voxel-wise) product.


To reduce the Hausdorff distance (HD) during training CNNs, Karimi et al. \cite{TMI2019HDLoss} proposed Hausdorff distance loss for direct minimization of HD. The loss function is defined by
\begin{equation}
  L_{HD} = \frac{1}{|\Omega|}\sum_\Omega [(S_\theta-G)^2 \circ (G_{DTM}^2 + S_{DTM}^2)]
\end{equation}
where $G_{DTM}$ and $S_{DTM}$ denote the distance transform maps of ground truth $G$ and predicted segmentation $S$, respectively.

Recently, Yuan et al. \cite{AAAI2020-SDFLoss} proposed using CNNs to directly regress the signed distance function (SDF) of ground truth rather than to generate softmax outputs, because there is a rigorous mapping between the ground truth and the SDF. The signed distance function regression loss is defined by
\begin{equation}
  L_{SDF} = - \sum_\Omega \frac{G_{SDF}\circ S_{SDF}}{G_{SDF}^2 + S_{SDF}^2 + G_{SDF}\circ S_{SDF}}
\end{equation}
where $G_{SDF}$ and $S_{SDF}$ denote the ground truth and the predicted signed distance functions, respectively.
The SDF loss aims to penalize the output SDF with wrong sign.


In summary, the distance transform map (DTM) of ground truth was incorporated in all the three loss functions. Boundary loss \cite{MIDL2019-BDLoss} assigned weights to the softmax probability outputs based on the ground truth SDF, while Hausdorff distance loss \cite{TMI2019HDLoss} introduced not only the ground truth DTM but also the predicted segmentation DTM to weight the softmax probability outputs. SDF loss \cite{AAAI2020-SDFLoss} employed the product of predicted SDF and ground truth SDF to guide the SDF regression network during training.

In practice, it should be noted that the three loss functions should be coupled with Dice loss so as to stabilize training process, especially at the beginning of training, otherwise training may not converge. More details about the usage of the loss functions are presented in Section \ref{ss:exp_design}.


\subsection{Auxiliary Tasks}
\label{ss:aux}
Distance  transform  maps  can  also  be  used to  augment  CNNs  by  adding  auxiliary  tasks.   Usually,  the  auxiliary  task  is  a regression  task,  and  we  found  two  different  ways  to  regress  the  DTM  from  recent publications. First, a  new  head  sharing the same backbone network can be added to the  end  of  the CNNs (Figure \ref{fig:overview}, top right), for the purpose of learning shape information of chest organs \cite{MLMI2019-MH-Fernando} or tubular structure reconstruction \cite{wang2019vessel}.  The other way is to add a reconstruction branch for learning robust global features by regressing pixel-wise distance map (Figure \ref{fig:overview}, bottom left) \cite{MP2019-RecBranch-Shusil}.

In summary, both multi-heads and reconstruction-branch CNNs aim at regressing the DTM of ground truth. The main difference is that the multi-heads CNN shares the backbone network while the reconstruction-branch CNN only shares the encoder network. In addition, we observed that these methods only consider the DTM of foreground, but not the SDF of ground truth which consists of the DTMs of both foreground  and  background.  To the best of our knowledge, regressing the SDF of ground truth has not been explored in existing studies.

% In addition to designing new loss functions, distance transform maps can also be used to augment existing CNNs by adding auxiliary tasks.
% Usually, the auxiliary task is a regression task, and we found two different ways to regress the DTM based on recent related publications.

% One added a new head in the end of the CNNs, and the two heads shared the same backbone network as shown in Figure \ref{fig:overview} (top right). For example, Fernando et al. \cite{MLMI2019-MH-Fernando} added a new head as a complementary task in the end of U-Net \cite{ronneberger20152DUNet} for ground truth DTM regression, which aims at enforcing the network to learn shape information of chest organs.
% Recently, Wang et al. \cite{wang2019vessel} also used this pipeline to regress the foreground distance map which can be further used for tubular structure reconstruction.

% The other augmented the CNNs by adding a reconstruction branch in the end of the encoder network, and the two branches only shared the encoder network and had their own decoder networks as shown in Figure \ref{fig:overview} (bottom left). For example, Shusil et al. \cite{MP2019-RecBranch-Shusil} added a reconstruction branch in the end of the encoder for pixel-wise distance map regression so as to facilitate the network to learn robust global features in the cardiac cine MRI segmentation task.

% In summary, both multi-heads and reconstruction-branch CNNs aim at regressing the DTM of ground truth. The main difference is that the multi-heads CNN shares the backbone network while the reconstruction-branch CNN only shares the encoder network.
% In addition, we observed that these methods only consider the DTM of foreground, but the SDF of ground truth consists of the DTMs of both foreground and background. To the best of our knowledge, regressing the SDF of ground truth has not been explored in the existing studies.


\section{Experiments}
\label{s:exp}
In this section we describe the datasets, the backbone CNN, quantitative segmentation metrics and experimental design.

\subsection{Dataset, network backbone, and metrics}
We use two representative datasets to evaluate the above five benchmark methods. One dataset is the left atrial (LA) MRI, which is an organ segmentation task\footnote{MICCAI 2018 left atrial segmentation: \url{http://atriaseg2018.cardiacatlas.org/}.}. The other dataset is the liver tumor CT (LiTS), which is a popular tumor segmentation task\footnote{MICCAI 2017 liver tumor segmentation: \url{https://competitions.codalab.org/competitions/17094}.}.
LA includes 100 3D gadolinium-enhanced MR training cases. We randomly selected 16 cases for training and 20 cases for testing to create a typical small sample learning setting.
% Using 80 cases for training may lead to a trivial segmentation task, because our baseline V-Net achieves a high Dice with 0.9102 when using 80 cases for training, which is comparable with the state-of-the-art method in challenges\cite{chen2018LASOTA}.
% Firstly, to validate the backbone design, we train the baseline V-Net with all 80 labeled cases and test on the 16 testing cases. The baseline V-Net achieve a Dice with 0.9102 which is comparable with the state-of-the-art method in challenges\cite{chen2018LASOTA}. Thus, we can regard the V-Net as a standard baseline model.
% Thus, we only use 16 cases for training to increase difficulty, which is a typical small samples learning task.
LiTS includes 118 CT training cases. We split them into 90 for training and 28 for testing. All the cases were cropped around the heart or liver region for better comparison of the segmentation performance of different methods, and normalized by subtracting the mean and dividing by the standard deviation.
We chose these two datasets because we want to involve typical modalities (CT and MR), tasks (organ and tumor) and challenges (small sample learning and small objects segmentation) in 3D medical image segmentation tasks.

We employ V-Net \cite{milletari2016VNet} as the network backbone. It has five stages of convolutional blocks at different resolutions. The base convolution block (1\textsuperscript{st} stage) has 16 feature maps, and the number of feature maps is doubled at every subsequent stage.
During training, we used the Adam optimizer for all experiments and searched for the best learning rate in the set $\{0.01, 0.001, 0.0001\}$. To make the experiments reproducible, we set the random seed to 2019. We also added two dropout layers after the L-5\textsuperscript{th} and R-1\textsuperscript{st} stage layers\footnote{L and R denote the left encode path and right decode path in V-Net.} with dropout rate 0.5.
For left atrial MRI dataset, dropout was turned on during training, but turned off during inference. Using dropout could bring performance gains on left atrial MRI dataset. However, we found that using dropout hurts the performance on liver tumor CT dataset based on our experiments.
Hence, we turned off dropout in all experiments for liver CT tumor segmentation.
All the networks and loss functions are implemented in PyTorch, and run in Linux.


Four complementary segmentation metrics are introduced to quantitatively evaluate the segmentation results. Dice and Jaccard, two region-based metrics, are used to measure the region mismatch. Average surface distance (ASD) and 95\% Hausdorff Distance (95HD), two boundary-based metrics, are used to evaluate the boundary errors between the segmentation results and the ground truth.


\subsection{Experimental design}
\label{ss:exp_design}
We evaluated the five benchmark methods on the two representative datasets with the above training protocol.
For boundary loss and Hausdorff distance loss, the final loss function is defined by
\begin{equation}
  L = \alpha L_{Dice} + (1-\alpha) L(\cdot)
\end{equation}
where $\alpha \in [0,1]$ is the weight parameter, and $L(\cdot)$ denotes either boundary loss or Hausdorff distance loss. In practice, $\alpha$ is set to 1 at the start of the training and decreased by 0.001 after each epoch until it reaches 0.01, as suggested in \cite{MIDL2019-BDLoss, TMI2019HDLoss}.
For signed distance function loss, the final loss is defined by
\[
  L = L_{Dice} + 10(L1 + L_{SDF}),
\]
as suggested in \cite{AAAI2020-SDFLoss}.
In addition, for multi-heads and reconstruction-branch CNNs, directly regressing the signed distance function is still unexplored, as we mentioned in Section~\ref{ss:aux}.
Thus, we also evaluated several combinations among different network architectures (multi-heads versus reconstruction-branch CNNs), different regression tasks (DTM versus SDF), and different loss functions ($L1$, $L2$ or $L1+L2$).

\section{Results and Discussion}
\label{s:res-dis}
In this section, we present the quantitative results of the five benchmark methods on the two datasets.

\begin{table}[!ht]
 % The first argument is the label.
 % The caption goes in the second argument, and the table contents
 % go in the third argument.
 \floatconts
  {tab:la-results}%
  {\caption{Quantitative results with average (standard deviation) on left atrial MRI segmentation. FG DTM and SDF denote the foreground distance transform map and the signed distance function, respectively. Rec-Branch denotes the network with reconstruction branch, and L1/L2 denote the loss functions used for the regression task. The arrows indicate which direction is better.}}%
  {\begin{tabular}{lcccc} \hline
  \bfseries Methods & \bfseries Dice (\%) $\uparrow$ & \bfseries Jaccard (\%) $\uparrow$ & \bfseries 95HD $\downarrow$ & \bfseries ASD $\downarrow$ \\ \hline
  V-Net baseline & 84.4 (5.70) & 73.6 (7.00) & 20.1 (13.8) & 5.29 (3.43) \\ \hline
  Boundary loss & 85.0 (5.64) & 74.2 (7.87) & 20.8 (15.0) & 5.43 (3.43) \\ % \hline
  Hausdorff distance loss & \textbf{85.5 (4.96)} & \textbf{75.0 (7.30)} & 15.9 (13.3) & 4.46 (3.68) \\ %\hline
  Signed distance function loss & 84.2 (8.48) & 73.5 (11.0) & \textbf{13.5 (11.2)} & \textbf{3.24 (3.10)} \\ \hline
  Multi-heads: FG DTM-L1 & 83.7 (6.33) & 72.5 (8.97) & 24.7 (12.8) & 6.62 (3.32) \\
  Multi-heads: FG DTM-L2 & 82.6 (6.87) & 71.0 (9.65) & 15.5 (11.5) & 4.10 (3.12) \\
  Multi-heads: FG DTM-L1+L2 & 83.3 (10.7) & 72.6 (12.6) & 17.5 (12.1) & 4.87 (3.12) \\
  Multi-heads: SDF-L1 & 85.5 (7.82) & 75.3 (10.2) & \textbf{11.8 (8.86)} & \textbf{2.65 (2.11)} \\
  Multi-heads: SDF-L2 & \textbf{87.0 (3.49)} & \textbf{77.2 (5.49)} & 16.1 (13.5) & 3.97 (3.14) \\
  Multi-heads: SDF-L1+L2 & 84.5 (4.38) & 73.5 (6.49) & 24.7 (15.0) & 6.09 (3.71) \\ \hline
  Rec-Branch: FG DTM-L1 & 83.5 (5.91) & 72.2 (8.30) & 23.6 (14.8) & 5.45 (3.57) \\
  Rec-Branch: FG DTM-L2 & 81.5 (8.40) & 69.5 (10.9) & 19.5 (16.9) & 4.49 (4.76) \\
  Rec-Branch: FG DTM-L1+L2 & 83.8 (4.57) & 72.3 (6.78) & 28.5 (14.1) & 7.47 (3.40) \\
  Rec-Branch: SDF-L1 & 82.5 (9.05) & 73.6 (10.9) & 12.0 (4.61) & 2.73 (1.38) \\
  Rec-Branch: SDF-L2 & \textbf{86.9 (4.43)} & \textbf{77.1 (7.92)} & \textbf{10.2 (6.03)} & \textbf{2.71 (1.68)} \\
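  Rec-Branch: SDF-L1+L2 & 85.1 (6.75) & 74.6 (9.24) & 16.7 (13.1) & 4.00 (3.19) \\ % NOTE(review): Dice std read 67.5, impossible for a percentage with mean 85.1; presumably 6.75 -- confirm against raw results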
  \hline
  \end{tabular}}
\end{table}

\subsection{Dataset 1: Left atrial MRI}
\label{ss:LAResults}
\tableref{tab:la-results} presents the quantitative results for left atrial MRI segmentation.
Compared with the naive V-Net baseline, the two types of methods (new loss functions and adding auxiliary tasks) can obtain performance gains. Specifically, Hausdorff distance loss, multi-heads CNN and Rec-Branch CNN improved the baseline by 1.1\%, 2.6\%, and 2.5\% in terms of Dice, respectively. SDF loss improved 95HD by 6.6; multi-heads CNN and Rec-Branch CNN also improved 95HD by 8.3 and 9.9, respectively.
Multi-heads CNN achieved the best Dice, Jaccard and ASD, and Rec-Branch CNN achieved the best 95HD with an approximately 50\% reduction. A paired t-test shows that the improvements are statistically significant at $p<0.01$.
We also found the regression branch and loss functions have significant impact on the performance. In particular, adding SDF regression task can provide better performance compared with adding the foreground DTM regression. It can be found that adding foreground distance map regression as an auxiliary task even degrades the performance compared with baseline in both multi-heads and Rec-Branch CNNs.
Moreover, using $L2$ loss is better than using $L1$ loss or their sum.


\subsection{Dataset 2: Liver tumor CT}
\label{ss:lits}

\tableref{tab:lits-results} shows the quantitative results on liver tumor CT dataset of the ``winner'' methods\footnote{``Winner'' methods: the methods that achieve performance improvements.} in left atrial segmentation.
Boundary loss and Hausdorff distance loss achieved minor improvements that are statistically significant at  $p<0.05$.
It can be found that SDF loss, multi-heads and Rec-Branch CNNs did not improve network performance.
The potential reason may be that liver tumor segmentation is much more challenging than left atrial segmentation. For example, tumors vary widely in location, shape and size, while these characteristics are relatively fixed for left atrial segmentation. It is non-trivial to regress the DTM or SDF of liver tumor.


\begin{table}[!htbp]
 % The first argument is the label.
 % The caption goes in the second argument, and the table contents
 % go in the third argument.
\floatconts
  {tab:lits-results}%
  {\caption{Quantitative results with average (standard deviation) on liver tumor CT dataset. The arrows indicate which direction is better.}}%
  {\begin{tabular}{lllll} \hline
  \bfseries Methods & \bfseries Dice $\uparrow$ & \bfseries Jaccard $\uparrow$ & \bfseries 95HD $\downarrow$ & \bfseries ASD $\downarrow$ \\ \hline
  V-Net baseline & 51.0 (28.8) & 39.8 (21.6) & 43.6 (45.2) & 14.9 (22.3) \\ \hline
  Boundary loss & \textbf{52.5 (24.1)} & \textbf{41.0 (21.1)} & \textbf{26.3 (33.7)} & 7.70 (21.9) \\
  Hausdorff distance loss & 52.0 (25.4) & 40.9 (22.6) & 28.8 (34.3) & \textbf{7.56 (19.4)} \\
  Signed distance function loss & 47.6 (29.8) & 37.5 (26.9) & 31.1 (48.7) & 11.2 (23.8) \\ \hline
  Multi-heads: SDF-L1 & 48.1 (27.6) & 38.2 (24.4) & 31.5 (40.6) & 8.11 (15.4) \\
  Multi-heads: SDF-L2 & 47.1 (28.0) & 37.0 (25.3) & 25.5 (34.1) & 8.82 (22.3)\\ \hline
  Rec-Branch: SDF-L1 & 48.4 (27.7) & 37.9 (25.3) & 32.2 (48.6) & 11.8 (31.1) \\
  Rec-Branch: SDF-L2 & 48.6 (27.3) & 38.5 (25.0) & 31.0 (48.0) & 7.52 (21.8) \\
  \hline
  \end{tabular}}
\end{table}



\section{Conclusion}
\label{s:con}
For the question \emph{``how can distance transform maps boost segmentation CNNs''}, our answer is that all the benchmark methods have the potential to improve the performance of baseline CNNs based on the experimental results. However, the performance gains are not consistent in different datasets. In particular, implementation details have remarkable effects on the final performance, for example learning rates, regression tasks, loss functions and so on.
In practice, we would recommend multi-heads and Rec-Branch CNNs for the first try in organ segmentation tasks. On the other hand, boundary loss and Hausdorff distance loss would be suggested for the first try in tumor segmentation tasks.
Importantly, how we should use the distance transform maps to boost existing CNNs and obtain \textbf{robust} performance gains is still an open question.

We cannot claim we have completely reproduced the five benchmark methods, because most of them are not open-source except boundary loss\footnote{\url{https://github.com/LIVIAETS/surface-loss}}. However, we tried our best to tune each method to achieve the best performance. For example, we tried different learning rates for each experiment. We also tried different $\alpha$ decay rates for boundary loss and Hausdorff distance loss.
% The total number of experiments we have run is more than 70.
More than 70 experiments were run to ensure a fair comparison of these methods, as shown in the Appendix. Another limitation is that we used V-Net as backbone without justification. In future work, we will evaluate these methods with recent network architectures on more segmentation datasets, for example the Medical Segmentation Decathlon \cite{decathlon}. Furthermore, exploring the combination of the two different kinds of methods is also a promising extension.
Our codes and trained models are publicly available at \url{https://github.com/JunMa11/SegWithDistMap}, which we hope would be useful for the community.

% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{This project is supported by the National Natural Science Foundation of China (No. 91630311, No. 11971229). The authors would also like to thank the organization teams of the MICCAI 2017 liver tumor segmentation challenge and the MICCAI 2018 left atrial segmentation challenge for the publicly available datasets. We also thank the reviewers for their valuable comments and suggestions. Last but not least, we thank Lequan Yu for his great PyTorch implementation of \href{https://github.com/yulequan/UA-MT}{V-Net} \cite{lequanMICCAI19} and Fabian Isensee for his great PyTorch implementation of \href{https://github.com/MIC-DKFZ/nnUNet}{U-Net} \cite{nnunet2020}.}


\bibliography{ma20}

\newpage
\appendix
%
\section{Hyper-parameters trials for boundary loss}
Table~\ref{BD-Hyper-LA} and Table~\ref{BD-Hyper-LITS} present the hyper-parameter experiments of boundary loss.

\begin{table}[!hb]
\caption{Boundary loss (BD) with different learning rates (LR), $\alpha$ decay rates and signed distance functions (non-normalized or normalized to $[-1,1]$) on left atrial MRI dataset. Failed means the training does not converge.}\label{BD-Hyper-LA}
\centering
\begin{tabular}{lllllll}
\hline
\textbf{Methods}      & \textbf{LR} & \textbf{$\alpha$ decay} & \multicolumn{1}{l}{\textbf{Dice}} & \multicolumn{1}{l}{\textbf{Jaccard}} & \multicolumn{1}{l}{\textbf{95HD}} & \multicolumn{1}{l}{\textbf{ASD}}  \\
\hline
BD           & 0.001         & 0.01        & 0.766                    & 0.643                       & 15.758                  & 3.884                     \\
BD           & 0.001         & 0.001       & 0.801                    & 0.677                       & 30.219                  & 8.980                     \\
BD           & 0.0001        & 0.01        & 0.246                    & 0.166                       & 35.736                  & 7.086                     \\
BD           & 0.0001        & 0.001       & 0.625                    & 0.509                       & 25.370                  & 5.026                     \\
\hline
BD Norm. SDF & 0.001         & 0.01        & 0.659                    & 0.515                       & 30.551                  & 9.804                     \\
BD Norm. SDF & 0.001         & 0.001       & 0.777                    & 0.647                       & 28.666                  & 8.756                     \\
BD Norm. SDF & 0.0001        & 0.01        & \multicolumn{4}{c}{Failed}                                                                                   \\
BD Norm. SDF & 0.0001        & 0.001       & 0.460                    & 0.338                       & 31.945                  & 9.350                     \\
BD Norm. SDF & 0.01          & 0.001       & \textbf{0.850 }          & \textbf{0.742 }             & \textbf{20.823 }        & \textbf{5.435 }           \\
\hline
\end{tabular}
\end{table}



\begin{table}[!ht]
\centering
\caption{Boundary loss (BD) with different learning rates (LR), $\alpha$ decay rates and signed distance functions (non-normalized or normalized to $[-1,1]$) on liver tumor CT dataset. Failed means the training does not converge.}\label{BD-Hyper-LITS}
\begin{tabular}{lllllll}
\hline
\textbf{Methods} & \textbf{LR} & \textbf{$\alpha$ decay} & \textbf{Dice}                       & \textbf{Jaccard}                    & \textbf{95HD}                       & \textbf{ASD}                        \\
\hline
BD               & 0.001                  & 0.01                 & 0.365                               & 0.271                               & 30.770                              & 3.860                               \\
BD               & 0.001                  & 0.001                & 0.511                               & 0.398                               & 29.351                              & 8.316                               \\
BD               & 0.0001                 & 0.01                 & \multicolumn{4}{c}{\multirow{2}{*}{Failed}}                                                                                                           \\
BD               & 0.0001                 & 0.001                & \multicolumn{4}{c}{}                                                                                                                                  \\
\hline
BD Norm. SDF     & 0.001                  & 0.01                 & 0.4877                              & 0.3784                              & \multicolumn{1}{r}{38.460}          & 16.036                              \\
BD Norm. SDF     & 0.001                  & 0.001                & \multicolumn{1}{r}{\textbf{0.525}} & \multicolumn{1}{r}{\textbf{0.410 }} & \multicolumn{1}{r}{\textbf{26.317}} & \multicolumn{1}{r}{\textbf{7.698}}  \\
BD Norm. SDF     & 0.0001                 & 0.01                 & \multicolumn{4}{c}{\multirow{2}{*}{Failed}}                                                                                                           \\
BD Norm. SDF     & 0.0001                 & 0.001                & \multicolumn{4}{c}{}                                                                                                                                  \\
\hline
\end{tabular}
\end{table}


\section{Hyper-parameters trials for Hausdorff distance loss}
Tables~\ref{HD-Hyper-LA} and~\ref{HD-Hyper-LITS} present the hyper-parameter experiments for the Hausdorff distance loss.

\begin{table}[!h]
\centering
\caption{Hausdorff distance loss (HD) with different learning rates (LR), $\alpha$ decay rates and distance transform maps (non-normalized DTM or DTM normalized to $[0,1]$) on the left atrial MRI dataset.}\label{HD-Hyper-LA}
\begin{tabular}{lllllll}
\hline
\textbf{Methods} & \textbf{LR} & \textbf{$\alpha$ decay} & \textbf{Dice}   & \textbf{Jaccard} & \textbf{ASD}     & \textbf{95HD}    \\
\hline
HD               & 0.001                  & 0.01                 & 0.656           & 0.503            & 40.770           & 14.810           \\
HD               & 0.001                  & 0.001                & 0.757           & 0.623            & 27.640           & 7.625            \\
HD               & 0.0001                 & 0.01                 & 0.723           & 0.578            & 37.630           & 12.600           \\
HD               & 0.0001                 & 0.001                & 0.640           & 0.485            & 40.050           & 14.360           \\
\hline
HD Norm. DTM     & 0.001                  & 0.01                 & 0.474           & 0.335            & 40.940           & 14.080           \\
HD Norm. DTM     & 0.001                  & 0.001                & 0.773           & 0.641            & 31.020           & 9.765            \\
HD Norm. DTM     & 0.0001                 & 0.01                 & 0.252           & 0.157            & 47.670           & 19.940           \\
HD Norm. DTM     & 0.0001                 & 0.001                & 0.400           & 0.276            & 38.860           & 14.140           \\
HD Norm. DTM     & 0.01                   & 0.001                & \textbf{0.855 } & \textbf{0.750 }  & \textbf{15.921 } & \textbf{4.461 }  \\
\hline
\end{tabular}
\end{table}


\begin{table}[!ht]
\centering
\caption{Hausdorff distance loss (HD) with different learning rates (LR), $\alpha$ decay rates and distance transform maps (non-normalized DTM or DTM normalized to $[0,1]$) on the liver tumor CT dataset. ``Failed'' means that the training did not converge.}\label{HD-Hyper-LITS}
\begin{tabular}{lllllll}
\hline
\textbf{Methods} & \textbf{LR} & \textbf{$\alpha$ decay} & \textbf{Dice}   & \textbf{Jaccard} & \textbf{ASD}     & \textbf{95HD}   \\
\hline
HD           & 0.001  & 0.01        & 0.292                    & 0.196                       & 76.793                  & 39.510                    \\
HD           & 0.001  & 0.001       & 0.519                    & 0.405                       & 34.884                  & 11.152                    \\
HD           & 0.0001 & 0.01        & \multicolumn{4}{c}{Failed}                                                                                   \\
HD           & 0.0001 & 0.001       & 0.294                    & 0.211                       & 53.763                  & 25.509                    \\
\hline
HD Norm. DTM & 0.001  & 0.01        & 0.478                    & 0.370                       & 43.546                  & 19.233                    \\
HD Norm. DTM & 0.001  & 0.001       & 0.520                    & 0.409                       & 28.820                  & 7.562                     \\
HD Norm. DTM & 0.0001 & 0.01        & \multicolumn{4}{c}{\multirow{2}{*}{Failed}}                                                                  \\
HD Norm. DTM & 0.0001 & 0.001       & \multicolumn{4}{c}{}                                                                                         \\
\hline
\end{tabular}
\end{table}


\section{Hyper-parameters trials for signed distance function loss}
Table~\ref{SDF-LA} presents the hyper-parameter experimental results for the signed distance function loss.

\begin{table}[!h]
\centering
\caption{Signed distance function (SDF) loss ablation study results with different learning rates (LR) on left atrial dataset.}\label{SDF-LA}
\begin{tabular}{llllll}
\hline
\textbf{Methods}      & \textbf{LR}    & \multicolumn{1}{l}{\textbf{Dice}} & \multicolumn{1}{l}{\textbf{Jaccard}} & \multicolumn{1}{l}{\textbf{ASD}} & \multicolumn{1}{l}{\textbf{95HD}}   \\
\hline
Dice loss+L1          & 0.01  & 0.847                    & 0.739                       & 23.260                  & 6.572                     \\
Dice loss+L1          & 0.001 & 0.771                    & 0.658                       & 19.750                  & 5.490                     \\
\hline
Dice loss+L1+SDF loss & 0.01  & 0.813                    & 0.704                       & 16.090                  & 4.044                     \\
Dice loss+L1+SDF loss & \textbf{0.001} & \textbf{0.842 }          & \textbf{0.735 }             & \textbf{13.540 }        & \textbf{3.243 }                    \\
\hline
\end{tabular}
\end{table}


\section{Hyper-parameters trials for multi-heads V-Net}
Tables~\ref{Multi-LA} and~\ref{Multi-LITS} present the hyper-parameter experimental results for the multi-heads V-Net.

\begin{table}[!htbp]
\centering
\caption{Multi-heads V-Net with different regression tasks, loss functions and learning rates on left atrial dataset.}\label{Multi-LA}
\begin{tabular}{llllll}
\hline
\textbf{Multi-heads}   & \textbf{LR} & \textbf{Dice}   & \textbf{Jaccard} & \textbf{ASD}     & \textbf{95HD}    \\
\hline
FG DTM regression-L1    & 0.01        & 0.837           & 0.725            & 24.676           & 6.622            \\
FG DTM regression-L1    & 0.001       & 0.837           & 0.725            & 23.712           & 6.226            \\
FG DTM regression-L2    & 0.01        & 0.798           & 0.671            & 14.504           & 3.076            \\
FG DTM regression-L2    & 0.001       & 0.826           & 0.709            & 15.564           & 4.101            \\
FG DTM regression-L1+L2 & 0.01        & 0.814           & 0.695            & 19.087           & 4.919            \\
FG DTM regression-L1+L2 & 0.001       & 0.833           & 0.726            & 17.452           & 4.867            \\
\hline
SDF regression-L1       & 0.01        & 0.855           & 0.753            & \textbf{11.823 } & \textbf{2.646 }  \\
SDF regression-L1       & 0.001       & 0.817           & 0.703            & 17.632           & 4.044            \\
SDF regression-L2       & 0.01        & \textbf{0.870 } & \textbf{0.772 }  & 16.119           & 3.970            \\
SDF regression-L2       & 0.001       & 0.772           & 0.657            & 28.987           & 6.609            \\
SDF regression-L1+L2    & 0.01        & 0.845           & 0.734            & 24.713           & 6.093            \\
SDF regression-L1+L2    & 0.001       & 0.796           & 0.691            & 17.217           & 4.315            \\
\hline
\end{tabular}
\end{table}

\begin{table}[!htbp]
\centering
\caption{Multi-heads V-Net (signed distance function regression) with different loss functions and learning rates (LR) on liver tumor CT dataset.}\label{Multi-LITS}
\begin{tabular}{llllll}
\hline
\textbf{Multi-heads } & \textbf{LR} & \textbf{Dice} & \textbf{Jaccard} & \textbf{ASD} & \textbf{95HD}  \\
\hline
SDF regression-L1     & 0.01        & 0.4841        & 0.3819           & 31.5352      & 8.1127         \\
SDF regression-L1     & 0.001       & 0.4705        & 0.3718           & 31.6681      & 8.4459         \\
\hline
SDF regression-L2     & 0.01        & 0.4672        & 0.3649           & 30.7485      & 9.8766         \\
SDF regression-L2     & 0.001       & 0.471         & 0.3704           & 25.4891      & 8.8161         \\
\hline
\end{tabular}
\end{table}


\section{Hyper-parameters trials for reconstruction-branch  V-Net}
Tables~\ref{Rec-LA} and~\ref{Rec-LITS} present the hyper-parameter experimental results for the reconstruction-branch V-Net.

\begin{table}[!htbp]
\centering
\caption{Reconstruction-branch V-Net with different regression tasks, loss functions and learning rates on left atrial dataset.}\label{Rec-LA}
\begin{tabular}{llllll}
\hline
\textbf{Rec-Branch}       & \textbf{LR} & \textbf{Dice}   & \textbf{Jaccard} & \textbf{ASD}     & \textbf{95HD}    \\
\hline
FG DTM regression-L1      & 0.01        & 0.835           & 0.722            & 23.552           & 5.450            \\
FG DTM regression-L1      & 0.001       & 0.830           & 0.715            & 26.234           & 6.997            \\
FG DTM regression-L2      & 0.01        & 0.798           & 0.672            & 24.431           & 6.932            \\
FG DTM regression-L2      & 0.001       & 0.815           & 0.695            & 19.484           & 4.488            \\
FG DTM regression-L1 + L2 & 0.01        & 0.774           & 0.638            & 23.541           & 6.531            \\
FG DTM regression-L1 + L2 & 0.001       & 0.838           & 0.723            & 28.466           & 7.466            \\
\hline
SDF regression-L1         & 0.01        & 0.843           & 0.737            & 12.007           & 2.734            \\
SDF regression-L1         & 0.001       & 0.811           & 0.694            & 18.274           & 4.508            \\
SDF regression-L2         & 0.01        & \textbf{0.869 } & \textbf{0.771 }  & \textbf{10.234 } & \textbf{2.714 }  \\
SDF regression-L2         & 0.001       & 0.800           & 0.686            & 19.129           & 4.830            \\
SDF regression-L1 + L2    & 0.01        & 0.851           & 0.746            & 16.672           & 4.003            \\
SDF regression-L1 + L2    & 0.001       & 0.820           & 0.704            & 15.254           & 3.284            \\
\hline
\end{tabular}
\end{table}




\begin{table}[!htbp]
\centering
\caption{Reconstruction-branch V-Net (signed distance function regression) with different loss functions and learning rates (LR) on liver tumor CT dataset.}\label{Rec-LITS}
\begin{tabular}{llllll}
\hline
\textbf{Rec-Branch}  & \textbf{LR} & \textbf{Dice} & \textbf{Jaccard} & \textbf{ASD} & \textbf{95HD}  \\
\hline
SDF regression-L1    & 0.01        & 0.484         & 0.379            & 32.249       & 11.786         \\
SDF regression-L1    & 0.001       & 0.467         & 0.366            & 32.844       & 6.687          \\
\hline
SDF regression-L2    & 0.01        & 0.447         & 0.343            & 42.535       & 15.428         \\
SDF regression-L2    & 0.001       & 0.486         & 0.385            & 30.996       & 7.550          \\
\hline
SDF regression-L1+L2 & 0.01        & 0.429         & 0.333            & 42.095       & 14.791         \\
SDF regression-L1+L2 & 0.001       & 0.456         & 0.353            & 34.837       & 8.522          \\
\hline
\end{tabular}
\end{table}




\end{document}