\documentclass{midl} % Include author names
%\documentclass[anon]{midl} % Anonymized submission

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{threeparttable}
% \jmlrvolume{-- Under Review}
\jmlryear{2020}
\jmlrworkshop{Full Paper -- MIDL 2020}
% \editors{Under Review for MIDL 2020}

\title[Siamese Content Loss Networks for Medical Image Segmentation]{Siamese Content Loss Networks for Highly Imbalanced Medical Image Segmentation}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship

\midlauthor{\Name{Brandon Mac\nametag{$^{1}$}} \Email{bmac@ryerson.ca}\\
\Name{Alan R. Moody\nametag{$^{2,3}$}} \Email{alan.moody@sunnybrook.ca}\\
\Name{April Khademi\nametag{$^{1,4}$}} \Email{akhademi@ryerson.ca}\\
\addr $^{1}$ Image Analysis in Medicine Lab (IAMLAB), Ryerson University\\
\addr $^{2}$ Department of Medical Imaging, University of Toronto\\
\addr $^{3}$ Department of Medical Imaging, Sunnybrook Health Sciences Centre\\
\addr $^{4}$ Keenan Research Centre for Biomedical Science, St. Michael's Hospital\\
}


\begin{document}

\maketitle

\begin{abstract}
Automatic segmentation of white matter hyperintensities (WMHs) in magnetic resonance imaging (MRI) remains highly sought after due to the potential to streamline and alleviate clinical workflows. WMHs are small relative to the whole acquired volume, which leads to class imbalance issues and instability during the training process of many deep learning based solutions. To address this, we propose a method which is robust to the effects of class imbalance, through incorporating multi-scale information in the training process. Our method consists of training an encoder-decoder neural network utilizing a Siamese network as an auxiliary loss function. The Siamese network takes in pairs of masked images, namely input images masked with ground truth labels and input images masked with predictions, computes multi-resolution feature vector representations, and provides gradient feedback in the form of an L2 norm. We leverage transfer learning in our Siamese network, and present positive results without the need for further training. It was found that these methods are more robust for training segmentation neural networks and provide greater generalizability. Our method was cross-validated on multi-center data, yielding significant overall agreement with manual annotations.

\end{abstract}

\begin{keywords}
Semantic Segmentation, White Matter Hyperintensities, Siamese Networks, Medical Imaging, Magnetic Resonance Imaging, Label Imbalance, Transfer Learning.
\end{keywords}

\section{Introduction}

White matter hyperintensities (WMH) in magnetic resonance (MR) images of a presumed vascular origin are understood to manifest due to a combination of local macroscopic tissue structure erosion and increased water content due to inflammation \cite{bakshi_imaging_2005}. Quantitative analysis of WMH in MR imaging is typically conducted for diagnosis and to evaluate the effectiveness of treatments. Typically, analysis is conducted manually utilizing specific criteria \cite{polman_diagnostic_2011}, visual scales \cite{pantoni_visual_2002}, or manual delineations \cite{egger_mri_2017}. The most informative analyses are manual delineations, as they provide volumetric information of lesion load and spatial distribution. However, acquisition of manual delineations is laborious and subject to high inter- and intra-rater variability. Voxel-wise agreement (F1 score) between radiologists has been reported to range from a low of 0.66 \cite{egger_mri_2017} to a high of 0.83 \cite{steenwijk_accurate_2013}.

Recent advances in semantic segmentation in medical imaging have been due in large part to the advent of deep learning methodologies. Most notably, U-Net style encoder-decoder fully convolutional networks (FCNs) have seen significant adoption in the research community \cite{ronneberger_u-net_2015}. This is exemplified by the top 11 teams of the MICCAI 2017 Grand Challenge for automatic segmentation of WMH all using U-Net inspired architectures. Typically, these architectures are trained utilizing a loss function which explicitly compares predictions to ground truths. However, WMHs are inherently class imbalanced due to their small size relative to the acquired image. This skewed distribution affects training, as predictions tend towards the majority class. A systematic study on the impact of class imbalance on convolutional neural networks concluded that performance degrades, and that this is not merely a function of the number of training samples \cite{buda_systematic_2018}. Milletari et al. address class imbalance in their implementation of V-Net, which extends the U-Net from the 2D to the 3D domain, by utilizing a modification of the Dice similarity coefficient as a loss function \cite{milletari_v-net_2016}. Investigations by Fidon et al. highlight potential limitations, namely that the loss function does not take advantage of multi-scale information \cite{fidon_generalised_2017}. Sudre et al. present a rebalancing strategy to allow for a more robust Dice loss function \cite{sudre_generalised_2017}. Li et al. utilize a post-processing method in which predictions in the first and last 10\% of the brain volume along the axial plane were discarded \cite{li_fully_2018}.

\subsection{Contribution}

\begin{figure}[htbp]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:overview}
  {\caption{Overview of proposed training setup.}}
  {\includegraphics[width=0.8\linewidth]{overview.png}}
\end{figure}

To overcome these challenges, this work proposes to utilize an auxiliary Siamese network to train an FCN segmentation model. Through this, multi-scale information is accounted for, which is not present in loss functions that explicitly compare predicted masks to ground truths. A U-Net style FCN with dense block convolutions was trained using only the gradients defined by an auxiliary Siamese network. Inspiration was drawn from the task of person re-identification (Re-ID), in which explicit comparison between images is not viable, and Siamese networks are typically used to encode information and measure similarity \cite{geng_deep_2016}. Early layers of the VGG19 network \cite{simonyan_very_2014} were used in our Siamese network, and feature mappings were sampled at different resolutions. For the loss function, we draw inspiration from style transfer implementations which utilize content loss, in which feature mappings are flattened into column vectors and compared by their squared Euclidean distance \cite{gatys_neural_2015}. The Siamese network measures the content similarity between pairs of masked input images: the input image multiplied by the ground truth mask and the input image multiplied by the predicted mask. We developed this method on the dataset provided for the MICCAI 2017 WMH Grand Challenge to have a standardized comparison to submitted entries. To verify generalization, we validate our results on manually segmented white matter lesions from the Canadian Atherosclerosis Imaging Network (CAIN). We compare our method against the same FCN segmentation network trained using several loss functions designed for imbalanced segmentation tasks.

\section{Method}
\begin{figure}[htbp]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:generator}
  {\caption{Generator Diagram.}}
  {\includegraphics[width=0.75\linewidth]{generator_diagram.png}}
\end{figure}
\subsection{Generator}

The design of the segmentation model is focused on a fully convolutional encoder-decoder structure inspired by U-Net \cite{ronneberger_u-net_2015}. Each convolution block consists of a sequence of (2D) convolution, followed by batch normalization \cite{simonyan_very_2014}, and leaky ReLU activation ($\alpha = 0.1$) \cite{xu_empirical_2015}. Leaky ReLU was chosen over basic ReLU to avoid the ``dying ReLU'' problem, in which some neurons become inactive and only output zero \cite{lu_dying_2019}. All downsampling operations were conducted utilizing (2D) max pooling operations. Upsampling operations utilize (2D) transposed convolution operations, as shown in the original U-Net \cite{ronneberger_u-net_2015}.

2D convolutions were utilized due to empirical results observed during the MICCAI 2017 WMH Grand Challenge, in which submissions that utilized dilated or 3D convolutions placed near mid to low rankings \cite{kuijf_standardized_2019}. The choice of kernel size did not appear to affect performance; as such, $3 \times 3$ kernels were utilized to reduce the number of parameters to optimize. Dense blocks similar to the ones in DenseNet were utilized, due to their ability to alleviate the vanishing-gradient problem, strengthen feature propagation, and encourage feature reuse \cite{huang_densely_2017}. The initial dense block consisted of 32 filters, with the number of filters doubling with each max pooling operation. The number of dense block operations in the encoder portion was found by grid search and by monitoring performance on a hold-out validation set, with a mirrored number of dense blocks plus one bottleneck dense block for the decoder, as shown in \figureref{fig:generator}.
\begin{figure}[htbp]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:discriminator}
  {\caption{Discriminator Diagram.}}
  {\includegraphics[width=0.65\linewidth]{discriminator_diagram.png}}
\end{figure}

\subsection{Discriminator}

The discriminator consists of a two-path Siamese network. For every prediction mask generated by the segmentation network, the corresponding input image and ground truth mask are input into the discriminator. The masks are then multiplied with the input image, as shown in the two paths in \figureref{fig:discriminator}. The intention of multiplying the masks and input image together is to highlight the regions of interest and to allow the network to jointly evaluate information from both images. We utilize weights from the VGG19 network trained on ImageNet in our network \cite{simonyan_very_2014}. VGG19 was chosen because the linear structure of the network progressively decreases the resolution of the convolution layers, thereby increasing the receptive field. We sample the feature mappings at the max pooling layers of the VGG19 network. We iteratively tested the number of feature mappings used, and it was found that the first two max pooling layers yielded the best results, in which the feature mapping dimensions were $112 \times 112 \times 64$ and $56 \times 56 \times 128$. Features in deeper layers are more specific to the domain of natural images in ImageNet, and as such were unsuitable for this task. As well, the receptive fields in deeper layers were likely to be too large for this task, given the small size of WMHs. In \equationref{eq:mse}, the function $f_c(\cdot)$ is the transformation function representing the encoding of the masked images through the network and flattening to become feature vector representations. The feature vectors are then compared to each other by their squared Euclidean distance, given as follows:

\begin{equation}\label{eq:mse}
l_{\mathrm{mse}}(f_c(x \cdot y), f_c(x \cdot y')) = \frac{1}{L} \sum_{i=1}^{L}{\left (f_c(x \cdot y)_i - f_c(x \cdot y')_i\right )}^2
\end{equation}

Here, $x$ is the input image, $y$ is the ground truth mask, $y'$ is the predicted mask, and $L$ is the length of the feature vector.
\section{Experiment}
\subsection{Dataset}

The training data used was provided by MICCAI as part of their 2017 grand challenge for WMH segmentation \cite{kuijf_standardized_2019}. The training set that was released publicly for development consisted of 60 MR image volumes from three institutes. Both T1 and fluid-attenuated inversion recovery (FLAIR) MR images were available, but in this implementation, only FLAIR images were used, to correspond to the modality acquired in the cross-validation set. Of the 60 volumes, 48 volumes were randomly selected to be used to train, 6 volumes were used for validation of the aforementioned hyperparameters, and 6 volumes were used for cross-validation. We ensure even sampling between all centers when splitting data.

To cross-validate, 50 FLAIR volumes from Canadian Atherosclerosis Imaging Network (CAIN) were manually segmented by experienced raters using ITK-SNAP segmentation software (version 3.6.0) \cite{yushkevich_itk-snap_2017}. All white matter lesions were manually delineated in a superior-to-inferior slice progression in the axial dimension. Once all white matter lesions were outlined on the axial plane, sagittal and coronal reconstructions were used to verify the segmentation and margins of the lesions. 

\subsection{Implementation Details}
\subsubsection{Preprocessing}
Each whole volume was normalized to an intensity between 0 and 1 by dividing by its maximum intensity. The axial slices were then taken and resampled to $224 \times 224$ using the default parameters of the resize function in the Python scikit-image library \cite{van_der_walt_scikit-image_2014}. Each image was then concatenated with itself twice in order to convert the single-channel intensity image into a 3-channel RGB image. In order to keep uniformity across all models, the preprocessing steps outlined for the VGG19 model were used \cite{simonyan_very_2014}. During training, random rotation augmentations up to a range of 30 degrees were applied to the image, utilizing generator functions in Keras.
\subsubsection{Training}
For all models, 100 epochs were used during training with a batch size of 8 images and 100 batches per epoch. The ground truth was soft-binarized, meaning 0 values were set to 0.1 and 1 values were set to 0.9. This was chosen to avoid exploding gradients when utilizing a sigmoid as the final output. Our initial learning rate was set to 0.001, and we utilize the Adam optimizer with parameters set to $\beta_1 = 0.9$ and $\beta_2 = 0.999$ \cite{kingma_adam_2014}. The models were developed on Keras-Tensorflow 2.0, and were trained on a single NVIDIA GTX 1080 Ti GPU.

\subsubsection{Performance Metrics}
Five metrics outlined by MICCAI 2017 WMH Grand Challenge for evaluation were used in order to have a standardized comparison to submitted entries \cite{kuijf_standardized_2019}. To measure performance in terms of class imbalance, we also define a normalized ratio which is the number of positive pixels in the ground truth, compared to the total image volume called the positive class density (PCD):

\[\mathrm{PCD} = \frac{\text{Sum}(\text{\# of positive pixels})}{\text{Product}(\text{volume dimensions})} \times 100\]

This is to address the variation of the volume dimensions of acquired images. We utilize this metric as a normalized means to compare volume agreement between predicted masks and ground truths.

\subsubsection{Benchmarks}
We compare our proposed method to several loss functions commonly used in imbalanced semantic segmentation tasks. Dice loss, as proposed by Milletari et al., modifies the Dice score coefficient by introducing $\epsilon$ to allow for stability during training \cite{milletari_v-net_2016}. $\epsilon$ was set to 1 for benchmark comparison. Salehi et al. make modifications to this by proposing Tversky loss, in which $\alpha$ and $\beta$ terms are introduced as additional weightings for false positives and false negatives respectively \cite{salehi2017tversky}. In this investigation, $\alpha$ and $\beta$ were set to 0.3 and 0.7. To account for the bias of the Dice metric for larger volumes, Sudre et al. propose generalized Dice loss, in which the Dice loss is re-balanced by the squared volume of the ground truth \cite{sudre_generalised_2017}. For each of the aforementioned loss functions, we train the generator model with the same settings as mentioned previously in the training section.
\section{Results}

\begin{figure}[htbp]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:sample}
  {\caption{Prediction samples}}
  {\includegraphics[width=\linewidth]{samples_rev.png}}
\end{figure}

In this section, we compare the results of our proposed model to models trained with the aforementioned benchmark loss functions. Visual inspection of segmentation masks, as shown in \figureref{fig:sample}, shows an overall greater sensitivity for lesion detection for the proposed model. Models trained with Dice loss and Tversky loss appear to be under-segmenting, as highlighted by the large number of false negatives present in the first row of \figureref{fig:sample}. When validated on images derived from the same distribution as the training set, the generalized Dice model and our proposed model appear to have similar performances on the MICCAI holdout. However, when evaluated with images outside the distribution of the training set, as shown in the last row of \figureref{fig:sample}, each of the benchmark models segments only a few of the lesions, missing the significant lesions on the bottom left, whereas our proposed model segments most of the lesions present. We observe this trend reflected in the average performances described in \tableref{tab:results_table}, namely that while the benchmark models have comparable performances on the dataset derived from the same distribution, when validated on a dataset outside the distribution, our proposed model shows significant improvement.

\begin{table}[htbp]
 % The first argument is the label.
 % The caption goes in the second argument, and the table contents
 % go in the third argument.
\floatconts
  {tab:results_table}%
  {\caption{Summary of Performance Metrics}}%
  {\begin{tabular}{l|l|l|l|l|l}
  \hline
  \bfseries Model & \bfseries Dice $\uparrow$ & \bfseries HD $^2$ $\downarrow$ & \bfseries AVD $^3$ $\downarrow$ & \bfseries L-Recall $\uparrow$ & \bfseries L-F1 $\uparrow$ \\
  \hline\hline
  \multicolumn{6}{c}{Dataset: MICCAI Holdout Validation} \\
  \hline\hline
  Dice Loss &0.73($\pm$0.07)&9.33($\pm$3.17) &17.54($\pm$10.77)&0.46($\pm$0.18) &0.55($\pm$0.18)\\
  \hline
  Tversky Loss &0.78($\pm$0.06) &5.39($\pm$2.28) &10.79($\pm$9.33) &0.53($\pm$0.13) &0.63($\pm$0.14)\\
  \hline
  Gen. Dice &0.80($\pm$0.06)&4.43($\pm$1.61) &5.57($\pm$5.33)&0.61($\pm$0.18) &0.68($\pm$0.17)\\
  \hline
  \bfseries Siamese Loss &0.79($\pm$0.06) &9.14($\pm$9.96) &11.82($\pm$7.52) &0.79($\pm$0.14) &0.62($\pm$0.17)\\
  \hline
  \textit{Leaderboard}$^1$ &\textit{0.81}&\textit{5.63}&\textit{18.58}&\textit{0.82}&\textit{0.79}\\
  \hline\hline
  \multicolumn{6}{c}{Dataset: CAIN Independent Validation} \\
  \hline\hline
  Dice Loss &0.45($\pm$0.22) &26.76($\pm$16.67) &55.69($\pm$38.24) &0.48($\pm$0.20) &0.47($\pm$0.16)\\
  \hline
  Tversky Loss &0.43($\pm$0.24) &28.8($\pm$20.54) &52.52($\pm$29.36) &0.47($\pm$0.25) &0.48($\pm$0.21)\\
  \hline
  Gen. Dice &0.44($\pm$0.27) &21.40($\pm$17.50) &56.33($\pm$28.79) &0.50($\pm$0.28) &0.51($\pm$0.25)\\
  \hline
  \bfseries Siamese Loss &0.52($\pm$0.18) &24.26($\pm$16.40) &39.76($\pm$27.77) &0.75($\pm$0.15) &0.54($\pm$0.13)\\
  \hline
  \end{tabular}}
  {\begin{tablenotes}
   \small
   \item $^1$ MICCAI 2017 WMH Grand Challenge Leaderboard. Available at \url{https://wmh.isi.uu.nl/results/}
   \item $^2$ HD refers to modified Hausdorff distance (95th percentile) (mm)
   \item $^3$ AVD refers to average volume difference (\%)
   \end{tablenotes}}
\end{table}

To observe the effects of class imbalance on the models, we assign bins to each volume in the CAIN independent validation set according to their positive class density and plot these against the average volume difference between the predicted volume and the ground truth. \figureref{fig:dsc_vs_density} depicts relatively low volume difference for our proposed model for most of the bins. It is noted that two outliers exist in the 0.0--0.03 bin, in which the average volume difference is greater than 100\%, indicating over-segmentation. For the benchmark models, there appear to be some volumes where the average volume difference is 100\%, indicating no lesion was detected. Our proposed model, on the other hand, achieves at least some agreement with the ground truth.

\begin{figure}[htbp]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:dsc_vs_density}
  {\caption{Average Volume Difference versus Positive Class Density}}
  {\includegraphics[width=0.8\linewidth]{bined_AVD_v_ll_CAIN.png}}
\end{figure}

\section{Discussion}

In the previous section, we observe our proposed model generalizing well to data outside the distribution of the training set. We attribute this to the discriminator allowing for more contextual optimization of the generator weights. By evaluating the masked input images, texture features derived from the Siamese network allow for better comparison of information inherent in the pathology. The information taken at multiple depths of the pre-trained network represents the activation at multiple receptive fields, allowing for the latent vector to have a multi-scale representation. In contrast, overlap-based optimization functions compare only masks, so no additional information significant to the pathology is considered.

For this implementation, the core of the Siamese network is the VGG19 network pre-trained on ImageNet \cite{simonyan_very_2014}. Through tuning hyperparameters, as mentioned above, we found that the optimal layers to sample from were the earlier layers, whose features can be visually understood as edges and blobs \cite{zeiler2014visualizing}. Fundamentally, ImageNet is a different domain from the medical images used in this implementation; namely, ImageNet consists of natural colored images. We explore fine-tuning the VGG19 network to shift the domain more towards the present task, as summarized in \tableref{tab:results_trained}. The generator and discriminator were trained in a min-max fashion inspired by GAN-type architectures \cite{goodfellow2014generative}. The pre-trained weights of the Siamese network were unfrozen, and a small learning rate of 0.00001 was used for fine-tuning. The objective of the generator was to minimize the L2 loss, while the discriminator sought to maximize it.

\begin{table}[htbp]
 % The first argument is the label.
 % The caption goes in the second argument, and the table contents
 % go in the third argument.
\floatconts
  {tab:results_trained}%
  {\caption{Summary of Trained and Transfer Learned Siamese Networks}}%
  {\begin{tabular}{l|l|l|l|l|l}
  \hline
  \bfseries Model & \bfseries Dice $\uparrow$ & \bfseries HD $^1$ $\downarrow$ & \bfseries AVD $^2$ $\downarrow$ & \bfseries L-Recall $\uparrow$ & \bfseries L-F1 $\uparrow$ \\
  \hline\hline
  \multicolumn{6}{c}{Dataset: MICCAI Holdout Validation} \\
  \hline\hline
  Trained SL &0.51($\pm$0.14) &24.49($\pm$14.07) &32.32($\pm$18.65) &0.23($\pm$0.17) &0.24($\pm$0.16)\\
  \hline
  \bfseries Siamese Loss &0.79($\pm$0.06) &9.14($\pm$9.96) &11.82($\pm$7.52) &0.79($\pm$0.14) &0.62($\pm$0.17)\\
  \hline\hline
  \multicolumn{6}{c}{Dataset: CAIN Independent Validation} \\
  \hline\hline
  Trained SL &0.26($\pm$0.21) &61.12($\pm$43.00) &137.55($\pm$384.34) &0.16($\pm$0.13) &0.15($\pm$0.13)\\
  \hline
  \bfseries Siamese Loss &0.52($\pm$0.18) &24.26($\pm$16.40) &39.76($\pm$27.77) &0.75($\pm$0.15) &0.54($\pm$0.13)\\
  \hline
  \end{tabular}}
  {\begin{tablenotes}
   \small
   \item $^1$ HD refers to modified Hausdorff distance (95th percentile)
   \item $^2$ AVD refers to average volume difference (Percentage)
   \end{tablenotes}}
\end{table}

Fine-tuning the weights yielded less than satisfactory results. One primary reason could be that, because the VGG19 network is highly parameterized, there is a lack of sufficient data samples to optimize the weights, despite the use of a low learning rate for fine-tuning. Other reasons could be attributed to the GAN-like nature of the model. Since a min-max setup was used to optimize the weights, vanishing gradients could have affected the optimization of the model. A variety of other factors, such as game setup and loss function selection, could have contributed; however, analysis of these design paradigms is beyond the scope of this paper and will be investigated in future work.

\section{Conclusion}
We present a training method which utilizes an auxiliary Siamese network to train an FCN segmentation model. Through this, we found greater generalizability when compared to the same FCN model trained on loss functions which evaluate only segmentation masks. By utilizing a Siamese network to evaluate the content loss between the masked images at each training step, we feed back multi-scale information in the training process. We found that this method allows for greater sensitivity, allowing for more robust evaluation across datasets and raters.

In this work, we leverage transfer learning for our task; however, we acknowledge its limitations, namely the domain specificity of the network and the over-parametrization contributing to computational overhead. We attempted to fine-tune the Siamese network through a min-max GAN setup, but found the results unsatisfactory. Future work intends to explore more of the design paradigms to allow for more efficient use of the latent vector space and further refinement of features.

% Acknowledgments---Will not appear in anonymized version

\midlacknowledgments{We thank NSERC Discovery Grant Program for funding this research.}


\bibliography{mac20}



\end{document}
