
\documentclass{midl} % Include author names
%\documentclass[anon]{midl} % Anonymized submission

%\usepackage{todonotes}
%\usepackage{comment}
\usepackage{booktabs}
\usepackage{verbatim}
\usepackage{multirow}
\usepackage{amsmath,upgreek,bm}
\usepackage{enumitem}

\newcommand{\tr}[1]{{#1}^\top}

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\newcommand{\mr}[1]{\mathrm{#1}}
\DeclareMathOperator*{\argmax}{arg\,max}

% Include other packages here, before hyperref.
\DeclareMathOperator{\E}{\mathbb{E}}
\newcommand{\Y}{\widetilde{Y}}
\newcommand{\ppm}{$\,\pm\,$}
\newcommand{\MI}{I} 
\newcommand{\Data}{\mathcal{D}}
\newcommand{\img}{\mathbf{x}}
\newcommand{\lbl}{\mathbf{y}}
\newcommand{\Int}{\mathbb{Z}}
\newcommand{\Real}{\mathbb{R}}
\newcommand{\params}{\bm{\uptheta}}
\newcommand{\loss}{\mathcal{L}}
\newcommand{\lossSup}{\loss_{\mathrm{sup}}}
\newcommand{\lossMI}{\loss_{\mathrm{MI}}}
\newcommand{\lossReg}{\loss_{\mathrm{reg}}}
\newcommand{\Tcur}{T_{\mathrm{cur}}}
\newcommand{\Tmax}{T_{\mathrm{max}}}
\newcommand{\neigh}{\mathcal{N}}
\newcommand{\Perturb}{\mathcal{T}}
\newcommand{\ff}{\mathbf{f}}
\newcommand{\patch}{\mathbf{p}}

\newcommand{\acdc}[1]{
\includegraphics[ width=0.17\linewidth]{figures/images/acdc/GT#1.png} &
\includegraphics[ width=0.17\linewidth]{figures/images/acdc/Baseline#1.png} &
\includegraphics[ width=0.17\linewidth]{figures/images/acdc/Mean_Teacher#1.png} &
\includegraphics[ width=0.17\linewidth]{figures/images/acdc/Consistency#1.png} &
\includegraphics[ width=0.17\linewidth]{figures/images/acdc/UDA_IIC#1.png}
}

\newcommand{\prostate}[1]{
\includegraphics[  width=0.17\linewidth]{figures/images/prostate/GT#1.png} &
\includegraphics[  width=0.17\linewidth]{figures/images/prostate/Baseline#1.png} &
\includegraphics[  width=0.17\linewidth]{figures/images/prostate/Mean_Teacher#1.png} &
\includegraphics[  width=0.17\linewidth]{figures/images/prostate/Consistency#1.png} &
\includegraphics[  width=0.17\linewidth]{figures/images/prostate/UDA_IIC#1.png}
}

\newcommand{\spleen}[1]{
\includegraphics[  width=0.17\linewidth]{figures/images/spleen/GT#1.png} &
\includegraphics[  width=0.17\linewidth]{figures/images/spleen/Baseline#1.png} &
\includegraphics[  width=0.17\linewidth]{figures/images/spleen/Mean_Teacher#1.png} &
\includegraphics[  width=0.17\linewidth]{figures/images/spleen/Consistency#1.png} &
\includegraphics[  width=0.17\linewidth]{figures/images/spleen/UDA_IIC#1.png}
}


%\renewcommand{\baselinestretch}{.95}

\usepackage{mwe} % to get dummy images
% \jmlrvolume{-- Under Review}
\jmlryear{2020}
\jmlrworkshop{Full Paper -- MIDL 2020}
\editors{}

\title%[Short Title]
{Mutual information deep regularization for \\ semi-supervised segmentation}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 \midlauthor{\Name{Jizong Peng} \Email{jizong.peng.1@ens.etsmtl.ca}\\
 \Name{Marco Pedersoli} \Email{marco.pedersoli@etsmtl.ca}\\
 \Name{Christian Desrosiers} \Email{christian.desrosiers@etsmtl.ca}\\
 \addr Ecole de technologie superieure\\ 
 1100 Notre-Dame W., Montreal, Canada (H1C 3K3)}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
%\midlauthor{\Name{Jizong Peng\midljointauthortext{Contributed equally}\nametag{$^{1,2}$}} \Email{abc@sample.edu}\\
%\addr $^{1}$ Address 1 \\
%\addr $^{2}$ Address 2 \AND
%\Name{Author Name2\midlotherjointauthor\nametag{$^{1}$}} \Email{xyz@sample.edu}\\
%\Name{Author Name3\nametag{$^{2}$}} \Email{alphabeta@example.edu}\\
%\Name{Author Name4\midljointauthortext{Contributed equally}\nametag{$^{3}$}} \Email{uvw@foo.ac.uk}\\
%\addr $^{3}$ Address 3 \AND
%\Name{Author Name5\midlotherjointauthor\nametag{$^{4}$}} \Email{fgh@bar.com}\\
%\addr $^{4}$ Address 4
%}

\begin{document}

\maketitle

\begin{abstract}
The scarcity of labeled data often limits the application of deep learning to medical image segmentation. Semi-supervised learning helps overcome this limitation by leveraging unlabeled images to guide the learning process. In this paper, we propose using a clustering loss based on mutual information that explicitly enforces prediction consistency between nearby pixels in unlabeled images, and for random perturbations of these images, while requiring the network to predict the correct labels for annotated images. Since mutual information does not require a strict ordering of clusters in two different cluster assignments, we propose to incorporate another consistency regularization loss which forces the alignment of class probabilities at each pixel of perturbed unlabeled images. We evaluate the method on three challenging publicly-available medical datasets for image segmentation. Experimental results show our method to outperform recently-proposed approaches for semi-supervised segmentation and to yield a performance comparable to fully-supervised training.
\end{abstract}

\begin{keywords}
Semantic segmentation, Semi-supervised learning, Deep clustering, Mutual information, Convolutional neural network
\end{keywords}

\section{Introduction}

%Semantic segmentation, where the goal is to assign the correct class label to each pixel of an image, is an essential step in image analysis pipelines for the assessment and treatment of various medical conditions \cite{Litjens2017}.

While supervised learning approaches based on deep convolutional neural networks (CNNs) \cite{long2015fully} have achieved outstanding performances in a wide range of segmentation tasks, such approaches typically require a large number of labeled images for training. In medical imaging applications, obtaining this labeled data is often expensive since annotations must be made by trained clinicians, typically in 3D volumes, and regions to segment can have very low contrast. Semi-supervised learning is a paradigm which reduces the need for fully-annotated data by exploiting the abundance of unlabeled data, i.e., data without expert-annotated ground truth. In contrast to standard approaches that learn exclusively from labeled data, semi-supervised methods also leverage intrinsic properties of unlabeled data (or \emph{priors}) to guide the learning process. Although initially proposed for classification \cite{Oliver2018Realistic}, various semi-supervised methods have also been developed for semantic segmentation, including approaches based on self-training \cite{bai2017semi}, distillation \cite{radosavovic2018data}, attention learning \cite{min2018robust}, adversarial learning \cite{souly2017semi,zhang2017deep}, entropy minimization \cite{vu2019advent}, co-training \cite{peng2019deep,zhou2019semi}, temporal ensembling \cite{perone2018deep}, manifold learning \cite{baur2017semi}, and data augmentation \cite{chaitanya2019semi,zhao2019data}. A simple yet powerful strategy employed in several semi-supervised segmentation methods is transformation consistency \cite{bortsova2019semi}. In this semi-supervised strategy, a point-wise loss like Kullback--Leibler (KL) divergence is used to impose similar network outputs for different transformations of the same unlabeled image. Even though this helps make the network robust to such transformations, it does not directly enforce spatial consistency within the image.

Recently, important efforts have been invested toward learning representations from unlabeled data that can be employed as features in a supervised learning task such as classification \cite{hjelm2018learning}. A powerful way to obtain such representation is deep clustering \cite{ji2018iic,caron2018deep,ghasedi2017deep}. However, because clustering is an ill-posed problem, techniques for this task often lead to poor or degenerate solutions \cite{caron2018deep}, for instance where all examples are assigned to a single cluster (i.e., mode collapse). To avoid this problem, recent work has proposed using the principle of mutual information (MI)
\cite{hu2018imsat,zhao2019region,ji2018iic}. The mutual information $\MI(X;Y)$ between two random variables $X$ and $Y$ is an information-theoretic criterion that measures the dependency between these variables. It is defined as the KL divergence between the joint distribution $p(X,Y)$ of the variables and the product of their marginals: 
\begin{equation}
 \MI(X;Y) \ = \ D_{\mr{KL}}\big(p(X,Y)\, || \, p(X)\,p(Y)\big).
\end{equation}
Two significant advantages of MI for clustering, compared to traditional techniques like k-means or Gaussian mixtures, are that it does not make any assumptions about the data distribution and that it alleviates the problem of mode collapse by favoring balanced clusters. The second advantage can be seen by an equivalent definition of MI,
\begin{align}
 \MI(X;Y) \ = \ & H(Y) \, - \, H(Y|X) \\
  \ = \ & \E_{X,Y}[\,\log p(Y|X)\,] \, - \, \E_Y\big[\log \E_X[\,p(Y|X)\,]\,\big],
\end{align}
where $H(Y)$ is the entropy of $Y$ and $H(Y|X)$ is the conditional entropy of $Y$ given $X$. If we suppose that $X$ is an image and $Y|X$ is the cluster to which $X$ is assigned then maximizing $\MI(X;Y)$ can be achieved by increasing the entropy of cluster marginals $H(Y)$, which corresponds to more balanced clusters. 

\begin{comment}
In recent years, two deep clustering approaches based on MI have achieve outstanding results. The first one, Information Maximizing Self-Augmented Training (IMSAT) \cite{hu2018imsat}, maximizes the MI between input data $X$ and the cluster assignment $Y$. The output is regularized through the use of virtual adversarial samples \cite{miyato2019virtual}, imposing that the original sample and the adversarial one should have a similar cluster assignment probability distribution (by minimizing their KL divergence). The second approach, Invariant Information Clustering (IIC) \cite{ji2018iic}, maximizes the MI of cluster assignments of a sample and its transformed version obtained with a transformation preserving semantics (i.e., geometric transformation). 
\end{comment}

So far, very few works have investigated the usefulness of MI-based deep clustering as a regularization prior for semantic segmentation. In \cite{zhao2019region}, the authors propose a region loss for semantic segmentation which represents a pixel by a patch surrounding this pixel and then maximizes the MI between the distribution of predicted outputs and ground truth labels for this patch. The advantage of this approach over standard segmentation losses like cross-entropy is that it explicitly considers the dependencies between nearby pixels within the loss, thereby enabling spatial regularization. While it achieved better performance than traditional spatial consistency techniques like CRFs \cite{krahenbuhl2011efficient}, this approach only considers fully-supervised segmentation settings. The Invariant Information Clustering (IIC) method proposed for segmentation in \cite{ji2018iic} also considers patches centered on each pixel; however, it instead maximizes the MI between the distribution of predicted outputs for a patch and the output distribution for a transformed version of this patch. Two strategies are presented for applying this in a semi-supervised setting: fine-tuning and overclustering. In fine-tuning, the network is pre-trained on a clustering task using unlabeled images and then fine-tuned on a segmentation task with labeled ones. The second strategy employs unlabeled images to learn a fine-grained clustering and, in a post-processing step, learns a many-to-one mapping from the clusters to segmentation labels based on labeled examples. This mapping uses an algorithm external to gradient descent optimization and labeled images do not participate in the computation of gradients. 

%The post-processing task of finding the optimal mapping can be seen as a weaker form of supervised adaptation, compared to the fine-tuning approach, where modifications to the network are limited to the final cluster-to-label assignments. Unlike these two strategies, our model leverages both labeled and unlabeled images *jointly* during training.


%This unsupervised approach is used to pre-train a segmentation network which is then fined-tuned on labeled images. 

%None of these existing methods employs a loss incorporating both labeled and unlabeled images, and which is optimized end-to-end.

\paragraph{Contributions} In this paper, we propose a semi-supervised segmentation method which leverages both MI-based regularization and transformation consistency in a single model. The major contributions of our work are the following:
%
\begin{itemize}[itemsep=1pt,topsep=2pt]
\item We present a first application of MI for regularization in semi-supervised segmentation, where both labeled and unlabeled images are used simultaneously in an end-to-end manner. The proposed loss function incorporates both fully-supervised guidance from labeled data and an unsupervised regularization term based on MI, which enforces spatial consistency on unlabeled images;
%
\item We extend MI regularization by further encouraging KL-based consistency between the segmentation output for unlabeled images and their transformed version. We show that this additional unsupervised regularization term stabilizes training and leads to higher accuracy;
%
\item We perform an extensive set of experiments on three challenging segmentation benchmarks, comparing our proposed method against recently-proposed approaches for this task. Results show our method to yield significantly higher performance, close to that of fully-supervised training.
\end{itemize}
   
%The rest of this paper is organized as follows. In Section \ref{sec:method}, we present our semi-supervised segmentation method based on MI and describe how to improve its accuracy using unsupervised consistency training. Section \ref{sec:experiments} then presents the datasets and metrics used in experiments, and provides implementation details of our method. In Section  \ref{sec:results}, we give experimental results highlighting the main advantages of our method. Last, we conclude the paper with a summary of contributions and results.
In the next sections, we present our semi-supervised segmentation method and perform experiments demonstrating its advantages over existing approaches.

\section{Proposed method}\label{sec:method}

\begin{figure}[ht!]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:transformation}
  {\caption{\textbf{Training pipeline of our semi-supervised segmentation method}. Given an unlabeled image $\img_{u}$, a regularization loss is imposed on two related predictions. The first is the prediction of $\img_{u}$, i.e., $f(\img_{u}; \params)$, and the second is the prediction of the given image under an invertible transformation $T$, after reversing this transform to return to the original image coordinates, i.e., $T^{-1}\big(f(T(\img_u); \params)\big)$. To highlight the foreground region, images shown here have been center-cropped.}}
  {\includegraphics[width=0.92\linewidth]{figures/diagram-crop.pdf}
  %\vspace{-5mm}
  }
\end{figure}

Given a labeled set $\Data_l$ of image-label pairs $(\img, \lbl)$, with image $\img \in \Real^\Omega$, $\Omega=\{1,\ldots,W\} \times \{1,\ldots,H\}$, and ground-truth labels $\lbl \in \{1,\ldots,C\}^\Omega$ where $C$ is the number of classes, and a large unlabeled dataset $\Data_{u}$ comprised of images without their labels ($|\Data_u| \gg |\Data_l|$), we want to learn a neural network $f$ parameterized by $\params$ to predict the label probability distribution of each pixel in an input image. As shown in Figure \ref{fig:transformation}, the proposed model exploits both labeled and unlabeled images during training. Labeled data $\Data_l$ is used as in standard supervised methods with a loss $\lossSup$ that imposes the pixel-wise prediction of the network for an annotated image to be similar to the ground truth labels. While other segmentation losses such as Dice loss \cite{milletari2016v} could have been considered, in this work we employed the well-known cross-entropy loss defined as
\begin{equation}
 \lossSup(\params; \Data_l) \ = \ - \frac{1}{|\Data_l| \,|\Omega|}\!\sum_{(\img,\lbl) \in \Data_l} \sum_{(i,j)\in\Omega} y_{ij}  \log f_{ij}(\img; \params).
\end{equation}

In semi-supervised methods, unlabeled data is typically used within a regularization loss to guide the parameter optimization process toward suitable solutions. A popular regularization strategy, called consistency-based regularization, enforces the network to output similar predictions for perturbed inputs corresponding to unlabeled data. This strategy is exploited in temporal ensembling techniques like Mean Teacher \cite{perone2018deep}, where the output of a Student network at different training iterations should be similar (e.g., in terms of $L_2$ norm or KL divergence) to that of a Teacher network whose parameters are a weighted temporal average of the Student's. A common limitation of such methods is that they regard the prediction for separate pixels as independent in the loss. 

\paragraph{MI-based regularization} To better exploit the structured nature of segmentation, we add a loss term based on MI, denoted as $\lossMI$, which is similar to the one used in \cite{ji2018iic} for deep clustering. In this loss, we represent each pixel $(i,j)$ of an unlabeled image as a patch $\patch_{ij} = [\img]_{ij}$ centered on this pixel, where $[\cdot]$ denotes a patch extraction operator. The network's output patch $\ff_{ij}$ at each position $(i,j)$ can be computed in a single convolution pass using the following relation: $\ff_{ij} = f(\patch_{ij}; \params) = [f(\img; \params)]_{ij}$. Considering each output patch $\ff_{ij}$ as a distribution, we seek to maximize the MI between this distribution and the one corresponding to adjacent patches. Moreover, we want this spatial consistency to hold for different invertible transformations $T \in \Perturb$ applied to the unlabeled image. We note that loss terms imposing strict equivalence between adjacent patches (e.g., $L_2$ or KL divergence) are not suitable since these patches can be different. In contrast, the MI loss makes a more relaxed assumption that information content does not vary much between adjacent patches, if these patches have a sufficient overlap.

Let $\neigh$ be a predefined set of pixel displacements $(p,q)$ which defines the neighbors of a pixel $(i,j)$, i.e., $\{(i+p, j+q) \ | \ (p,q) \in \neigh\}$. Using the same square patch for all pixels, $|\neigh|$ then corresponds to the patch size. We define our MI loss as 
\begin{align}
 \lossMI(\params; \Data_u) & \ = \ - \frac{1}{|\neigh|} \sum_{(p,q) \in \neigh} \MI(\mathbf{P}_{pq}), 
 \label{equ:main}
\end{align}
where $I(\mathbf{P}_{pq})$ is the MI given by the joint distribution $\mathbf{P}_{pq}$. Denote as $\ff^T_{ij} = T^{-1}\big(f(T(\patch_{ij}); \params)\big) = \big[T^{-1}\big(f(T(\img); \params)\big)\big]_{ij}$ the output for a patch transformed by $T \in \Perturb$, after reversing this transform to return to the original patch coordinates. Joint distribution $\mathbf{P}_{pq}$ is a $C \times C$ matrix computed as
\begin{equation}
\mathbf{P}_{pq} \ = \ \frac{1}{|\Data_u|\, |\Perturb|\, |\Omega|}\sum_{\img \in \Data_u} \sum_{T \in \Perturb}\sum_{(i,j)\in \Omega} \ff_{ij} \cdot \tr{(\ff_{i+p,j+q}^{T})}.
\label{equ:joint}
\end{equation}
Note that the sum in this equation can be computed efficiently using a 2D convolution operation. Finally, given the joint distribution $\mathbf{P}_{pq}$, the MI in Eq. (\ref{equ:main}) is obtained as
\begin{equation}
 I(\mathbf{P}) \ = \ \sum_{k=1}^{C} \sum_{k'=1}^{C}\mathbf{P}(k,k')\cdot \log\,\frac{\mathbf{P}(k,k')}{\big(\sum_{k'}\mathbf{P}(k,k')\big) \cdot \big(\sum_k\mathbf{P}(k,k')\big)}.
\end{equation}

\paragraph{Transformation consistency} As we will show in experiments, employing $\lossMI$ as the only regularization may however be insufficient to guide the learning towards good solutions. This can be attributed to the clustering nature of the proposed loss. Given two distributions conditionally independent given the same input image, MI is maximized if there is a deterministic mapping between clusters (classes) in each distribution such that they are equivalent. For instance, modifying the cluster indexes in one distribution (e.g., by permutation) does not change the MI. To ensure that learned clusters align across different patch outputs, we add a second regularization term, $\lossReg$, which minimizes the pixel-wise KL divergence between the network output for an image and its transformed version:
\begin{equation}
 \lossReg(\params; \Data_u) \ = \ \frac{1}{|\Data_u| \, |\Perturb| \, |\Omega|} \sum_{\img \in \Data_u} \sum_{T \in \Perturb} \sum_{(i,j) \in \Omega} D_{\mr{KL}}\Big(f_{ij}(\img; \params)\, \big|\big| \, T^{-1}_{ij}\big(f(T(\img); \params)\big)\Big).
\end{equation}
Our final loss combines the supervised term and the two unsupervised terms based on MI and consistency-based regularization:
\begin{equation}\label{eq:totalLoss}
\loss(\params; \Data_l, \Data_u) \ = \ \lossSup(\params; \Data_l) \ + \ \lambda\big(\lossMI(\params; \Data_u) \, + \, \lossReg(\params; \Data_u) \big),
\end{equation}
where $\lambda \geq 0$ is a hyper-parameter controlling the relative importance of labeled and unlabeled data.

\section{Experimental setup}\label{sec:experiments}

\subsection{Dataset and metrics}

Our experiments are performed on three clinically-relevant benchmark datasets for medical image segmentation: the Automated Cardiac Diagnosis Challenge (ACDC) dataset \cite{bernard2018deep}, the Prostate MR Image Segmentation (PROMISE) 2012 Challenge dataset \cite{litjens2014evaluation}, and the Spleen sub-task dataset of the Medical Segmentation Decathlon Challenge \cite{simpson2019large}. The three datasets consist of different image modalities and have various acquisition resolutions.

\paragraph{\textbf{ACDC dataset}} The publicly-available ACDC dataset consists of 200 short-axis cine-MRI scans from 100 patients, evenly distributed in 5 subgroups: normal, myocardial infarction, dilated cardiomyopathy, hypertrophic cardiomyopathy, and abnormal right ventricles. Scans correspond to end-diastolic (ED) and end-systolic (ES) phases, and were acquired on 1.5T and 3T systems with resolutions ranging from 0.70\,$\times$\,0.70 mm to 1.92\,$\times$\,1.92 mm in-plane and 5 mm to 10 mm through-plane. Segmentation masks delineate 4 regions of interest: left ventricle endocardium (LV), left ventricle myocardium (Myo), right ventricle endocardium (RV), and background. Short-axis slices within 3D-MRI scans were considered as 2D images, which were re-sized to 256\,$\times$\,256. For our experiments, we used a random split of 8 fully-annotated and 167 unlabeled scans for training, and the remaining 25 scans for validation. We employed conventional data augmentation for both labeled and unlabeled images, including random crop and random rotation within a range of $[-20, 20]$ degrees.

\paragraph{\textbf{Prostate dataset}} This dataset is composed of multi-centric transversal T2-weighted MR images from 50 subjects acquired with multiple MRI vendors and different scanning protocols, which are representative of typical MR images acquired in a clinical setting. Image resolution ranges from 15\,$\times$\,256\,$\times$\,256 to 54\,$\times$\,512\,$\times$\,512 voxels with a spacing ranging from 2\,$\times$\,0.27\,$\times$\,0.27 to 4\,$\times$\,0.75\,$\times$\,0.75 mm$^3$. We randomly selected 7 patients as labeled data, 33 as unlabeled data, and 10 for validation during the experiments.

\paragraph{\textbf{Spleen dataset}} This public dataset consists of patients undergoing chemotherapy treatment for liver metastases. A total of 61 portal venous phase CT scans (only 41 were given with ground truth) were included in the dataset with acquisition and reconstruction parameters described in \cite{simpson2019large}. The ground truth segmentation was generated by a semi-automatic segmentation software and then refined by an expert abdominal radiologist. For our experiments, 2D images are obtained by slicing the high-resolution CT volumes along the axial plane, followed by a min-max normalization with a range between 0 and 1. Each slice is then resized to a resolution of 512\,$\times$\,512. To evaluate algorithms in a semi-supervised setting, we randomly split the dataset into labeled, unlabeled and validation image subsets, comprising CT scans of 4, 32, and 5 patients respectively.

We use the commonly-adopted Dice similarity coefficient (DSC) metric to evaluate segmentation quality. DSC measures the overlap between the predicted labels ($S$) and the corresponding ground truth labels ($G$):
\begin{equation}
 \mathrm{DSC}(S,G) = \frac{2|S\cap G|}{|S|+|G|}.
\end{equation}
DSC values range between 0 and 1, a higher value corresponding to a better segmentation. In all results, we report the 3D DSC metric for the validation set.

\subsection{Implementation details}\label{sec:implementation}

\paragraph{Network and parameters} For all three datasets, we use the same network architecture of U-Net with 15 layers including batch normalization, dropout and ReLU activation. We adopted this architecture as it has been shown to work well for different medical image segmentation tasks. Networks were trained using stochastic gradient descent (SGD) with an Adam optimizer having an initial learning rate of $1\times10^{-3}$ which is decreased during training. To control the relative importance of labeled and unlabeled data in Eq. (\ref{eq:totalLoss}) we used a fixed $\lambda$ of 0.1 for all datasets and experiments.
%We programmed a learning rate warm-up before imposing a cosine decrease scheduler: for the first 10 training epochs, the learning rate increases linearly from $5\times10^{-5}$ to $1\times10^{-3}$ and then decreases following a cosine function of %$1\times10^{-3}\times\frac{1}{2}\times \big(1+\cos(\frac{\Tcur-10}{\Tmax}\,\pi)\big)$,  
%$0.0005 \cdot \big(1+\cos(\frac{\Tcur-10}{\Tmax}\,\pi)\big)$,
%where $\Tcur$ is the current epoch ($\Tcur \geq 10$), and $\Tmax$ is the max epoch which is set to 90. Similar to the observation reported in \cite{}, a learning-rate warm up in the first few epochs can effectively improve the generalibility of the network and lead to a better segmentation performance. 
%
The same strategy was employed to generate transformed images for both $\lossMI$ and $\lossReg$ terms. Given an unlabeled image, we randomly draw a transformation from a pool of invertible transformations, including cascaded transformation of random rotation, shearing and scaling \cite{xie2019unsupervised}. The size of patches in the MI loss is also an important hyper-parameter. Patches must be large enough so that information content remains similar between adjacent ones, but small enough to capture local context. In experiments, we used 3$\times$3 pixels which corresponds to regions of 3--5 mm in images depending on the resolution. We also tested our method with 5$\times$5 and 7$\times$7 patches; however, this increased computational cost without significantly improving accuracy.

\paragraph{Comparison baselines} We compared our method against several baselines and recently-proposed approaches for semi-supervised segmentation. First, to get an upper bound on performance, we trained the network described above using the supervised loss $\lossSup$ on \emph{all} training images. We call this baseline Full supervision. Likewise, a lower bound on performance is obtained by optimizing $\lossSup$ only on labeled images, ignoring unlabeled ones. This second baseline is referred to as Partial supervision in our results. Next, we tested two well-known approaches for semi-supervised learning: Entropy minimization \cite{vu2019advent} and Mean Teacher \cite{perone2018deep}. The first approach minimizes the pixel-wise entropy of predictions made for unlabeled images. This forces the network to become more confident about its predictions, and can be seen as a soft version of the pseudo-label algorithm \cite{vu2019advent}. For Mean Teacher, we use the same formulation as in \cite{perone2018deep}, where the Student model is trained using labeled data and the Teacher model is updated using an exponential moving average (EMA) with a decay rate of 0.999. The same strategy as \cite{perone2018deep} is employed to generate transformations for unlabeled data and to impose consistency between Teacher's and Student's predictions for unlabeled images. We report the accuracy of the Teacher, which usually performs better than the Student. 

\paragraph{Ablation study} To assess the impact of our two unsupervised loss terms $\lossMI$ and $\lossReg$ on performance, we performed an ablation study where we disable one of them while keeping the other. Using only $\lossMI$ with the supervised loss $\lossSup$, which we call Mutual information in the results, is similar to the IIC method \cite{ji2018iic} except that in our case MI-based regularization is used \emph{jointly} with the supervised loss in a semi-supervised setting, instead of for pre-training the network on a clustering task before adapting it to segmentation using labeled images. Likewise, using only $\lossReg$ as unsupervised loss with $\lossSup$, which is referred to as Consistency regularization, is similar to the semi-supervised segmentation method recently presented in \cite{bortsova2019semi}. Last, following recent work enforcing consistency with $L_2$ distance \cite{tarvainen2017mean}, we tested a mean-squared error (MSE) loss instead of KL for $\lossReg$. 

\section{Results}\label{sec:results}

\begin{comment}
\begin{table}[t]
\begin{center}
\caption{Mean Dice Score Coefficient (DSC) of tested methods on the ACDC, Prostate (PROMISE) and Spleen datasets. RV, Myo and LV refer to the right ventricle, myocardium and right ventricle classes, respectively. Mutual information corresponds to our method (KL) without loss term $\lossReg$ and Consistency regularization to our method (KL) without $\lossMI$.}\label{table:result}
\begin{small}
\begin{tabular}{lccccccc}
\toprule
& \multicolumn{4}{c}{\textbf{ACDC}} & &  \\
\cmidrule(l{6pt}r{6pt}){2-5}
 & \bfseries RV  & \bfseries Myo & \bfseries LV & \bfseries Mean & ~\textbf{Prostate}~ & \textbf{Spleen}\\
 \midrule%\midrule
Full supervision & 88.85 & 85.72 & 92.92 & 89.16 & 87.67 & 93.76 \\
\midrule
Partial supervision & 73.09 & 75.71 & 86.75 & 78.52 & 84.11 & 87.10 \\
Entropy minimization & 73.10 & 75.16 & 86.04 & 78.10 & 82.96 & 90.01 \\
Mean Teacher & 83.22 & 80.37 & 89.33 & 84.30 & 86.03 & 93.20 \\
Mutual information & 81.93 & 75.45 & 88.07 & 81.82 & 83.77 & 90.26 \\
Consistency regularization & 82.36 & 79.21 & 88.10 & 83.22 & 85.00 & 91.75 \\
\bfseries Our method & \bfseries 85.14 & \bfseries 80.94 & \bfseries 90.88 & \bfseries 85.66 & \bfseries 86.41 & \bfseries 93.30 \\
\bottomrule
\end{tabular}
\end{small}
\end{center}
\end{table}
\end{comment} 

\begin{table}[t]
\begin{center}
\caption{Mean 3D DSC of tested methods on the ACDC, Prostate and Spleen datasets. RV, Myo and LV refer to the right ventricle, myocardium and left ventricle classes, respectively. We test our method using KL and MSE for $\lossReg$. Mutual information corresponds to our method without loss term $\lossReg$ and Consistency regularization to our KL-based method without $\lossMI$. Reported values are averages (standard deviation in parentheses) for 3 runs with different random seeds.}\label{table:result}
\begin{footnotesize}
\setlength{\tabcolsep}{5pt}
\begin{tabular}{lccccccc}
\toprule
& \multicolumn{4}{c}{\textbf{ACDC}} & &  \\
\cmidrule(l{6pt}r{6pt}){2-5}
 & \bfseries RV  & \bfseries Myo & \bfseries LV & \bfseries Mean & ~\textbf{Prostate}~ & \textbf{Spleen}\\
 \midrule%\midrule
Full supervision & 88.98 (0.09) & 84.95 (0.15) & 92.44 (0.33) & 88.79 (0.13) & 87.33 (0.40) & 93.52 (0.48) \\
\midrule
Partial supervision & 73.25 (0.36) & 75.54 (1.27) & 86.89 (0.26) & 78.56 (0.42) & 84.20 (0.73) & 87.38 (1.05) \\
Entropy min.  & 73.85 (1.29) & 74.92 (0.85) & 86.12 (0.53) & 78.30 (0.87) & 83.04 (0.51) & 90.21 (0.31) \\
Mean Teacher   & 82.99 (0.49) & 80.43 (1.02) & 89.33 (0.33) & 84.25 (0.56) & 86.15 (0.19) & 93.22 (0.34) \\
Mutual information  & 81.98 (0.62) & 75.75 (0.47) & 87.89 (0.11) & 81.87 (0.32) & 83.75 (1.21) & 90.35 (0.36) \\
Consistency reg.  & 82.30 (0.60) & 79.43 (0.81) & 88.55 (0.37) & 83.42 (0.48) & 84.88 (0.54) & 91.50 (0.61) \\
Ours (MSE)  & 82.82 (0.35) & 79.91 (0.72) & 88.84 (0.77) & 83.85 (0.39) & 85.77 (0.46) & 93.12 (0.19) \\
Ours (KL)  & 85.08 (0.10) &  81.08 (0.42) & 90.72 (0.44) & 85.63 (0.20) & 86.63 (0.07) & 93.37 (0.13) \\
\bottomrule
\end{tabular}
\end{footnotesize}
\end{center}
\end{table}

Table~\ref{table:result} reports the mean 3D DSC on the validation set of the ACDC, Prostate (PROMISE) and Spleen datasets. Overall, the proposed method with KL-based loss achieves the highest accuracy for all three datasets. Using a one-sided paired t-test, the improvement of our method over all other approaches is found to be significant ($p < 0.05$) for the RV, LV and Mean scores of ACDC and for the Prostate segmentation task. Note that, for the Spleen task, there is no significant difference between our method and the fully-supervised baseline. Improvements are particularly notable for the more challenging tasks of right ventricle and myocardium segmentation in the ACDC dataset. Furthermore, despite training with a very small fraction of labeled images (i.e., 4.5\% of the training set as labeled data for ACDC, 17.5\% for Prostate and 3\% for Spleen dataset), our method achieves a performance close to full supervision, with a DSC difference of less than 4\% in all cases. 

Our ablation study shows that combining both $\lossMI$ and $\lossReg$ regularization losses gives better results than using these losses individually, with statistically significant improvements of 1.65\,--\,2.78\% compared to using only $\lossReg$ and of 2.83\,--\,5.33\% with respect to using only $\lossMI$. As expected, employing $\lossMI$ alone yields poor results since $\lossReg$ is also required to align the cluster assignments across different image patches. Comparing KL-based with MSE-based consistency for $\lossReg$, we find the former to give a higher accuracy in all cases. This observation is in line with \cite{perone2018deep} and recent work on consistency-based unsupervised data augmentation \cite{xie2019unsupervised}, showing KL to work well with a wide variety of regularization terms.

The performance of our method can be appreciated visually in Fig. \ref{fig:visual_inspect}, which shows examples of segmentation results for tested methods. One can see that our method better predicts the contour of target regions despite the low contrast in images. On the other hand, using only consistency regularization leads to non-smooth contours of segmented regions. 

% \begin{figure}[ht!]
%  % Caption and label go in the first argument and the figure contents
%  % go in the second argument
% \floatconts
%   {fig:visual_inspection}
%   {\caption{Visual inspection}
%   {\includegraphics[width=0.98\linewidth]{figures/visual_inspection.jpg}}
% \end{figure}

\begin{comment}
\begin{figure}
 \centering
 \includegraphics[width=0.91\linewidth]{figures/visual_inspection-crop.pdf}
 \caption{Visual comparison of tested methods on validation images. Top two rows: ACDC dataset. Middle two rows: Prostate (PROMISE) dataset. Bottom two rows: Spleen dataset.}
 \label{fig:visual_inspect}
\end{figure}
\end{comment}

\begin{figure}
 \centering
 \setlength{\tabcolsep}{1pt}
 \renewcommand{\arraystretch}{.8}
 \begin{footnotesize}
 \begin{tabular}{ccccc}
 \acdc{3} \\ 
 \acdc{4} \\
 \prostate{2} \\
 \prostate{1} \\
 \spleen{3} \\
 \spleen{2} \\
 Ground truth & Partial supervision & Mean Teacher & Consistency reg. & Our method
 \end{tabular}
 \end{footnotesize}
 \caption{Visual comparison of tested methods on validation images. Top two rows: ACDC dataset. Middle two rows: Prostate (PROMISE) dataset. Bottom two rows: Spleen dataset.}
 \label{fig:visual_inspect}
\end{figure}


\section{Discussion and conclusion}
% \todo{do I need to say something on clustering prior???}

We presented a novel semi-supervised method for segmenting medical images which regularizes a segmentation CNN by maximizing the MI between output distributions for both adjacent patch pairs and image pairs undergoing invertible transformations. Our loss explicitly forces the network to capture the high-order dependencies between spatially-related pixels, and to preserve structure under perturbations on its input. By incorporating the MI within a consistency term, the network can be effectively trained with abundant unlabeled data. We applied the proposed method to three challenging medical segmentation tasks with few images having labeled annotations (4.5\% of the training set for ACDC, 17.5\% for Prostate and 3\% for Spleen). Experimental results showed our method to outperform recently-proposed semi-supervised approaches such as Mean Teacher and Entropy minimization, and to offer an accuracy close to full supervision. 

While standard loss functions for segmentation consider the prediction for different pixels independently, an important advantage of our MI regularization loss is that it takes into consideration the structured nature of segmentation, where adjacent pixels often have similar class probability distributions. The merit of this loss is demonstrated by the higher DSC and the more plausible segmentation contours obtained by our method. However, the benefit of MI clustering in semi-supervised segmentation should be further evaluated by providing a deeper theoretical analysis, and validating on large-scale segmentation datasets such as Cityscapes \cite{cordts2016cityscapes}. Moreover, due to limited computational resources, we fixed the labeled-unlabeled trade-off hyper-parameter $\lambda$ in Eq.~(\ref{eq:totalLoss}) to 0.1 for all three datasets. Likewise, the importance of the two unsupervised losses $\lossMI$ and $\lossReg$ was kept the same in all experiments. However, giving more importance to $\lossMI$ could help the network better explore its solution space, as it increases uncertainty in hard-to-segment regions like boundaries. Emphasizing $\lossMI$ could thus potentially alleviate the problem of sub-optimal solutions. Future work could also involve the online optimization of hyper-parameters, for instance based on the concept of hyper-gradient \cite{baydin2017online}, and testing other types of invertible transformations, such as diffeomorphic nonlinear transformations \cite{narayanan2005diffeomorphic}. 

% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{
We acknowledge the support of the Natural Sciences and Engineering Research Council of Canada (NSERC), and thank NVIDIA corporation for supporting this work through their GPU grant program.
}

\bibliography{Peng20}

\end{document}
