% This is samplepaper.tex, a sample chapter demonstrating the
% LLNCS macro package for Springer Computer Science proceedings;
% Version 2.21 of 2022/01/12
%
\documentclass[runningheads]{llncs}
%
\usepackage[T1]{fontenc}
% T1 fonts will be used to generate the final print and online PDFs,
% so please use T1 fonts in your manuscript whenever possible.
% Other font encodings may result in incorrect characters.
%
\usepackage{graphicx}
% Used for displaying a sample figure. If possible, figure files should
% be included in EPS format.
%
% If you use the hyperref package, please uncomment the following two lines
% to display URLs in blue roman font according to Springer's eBook style:
%\usepackage{color}
%\renewcommand\UrlFont{\color{blue}\rmfamily}
\usepackage[pagebackref=true,breaklinks=true,colorlinks,bookmarks=false]{hyperref}

\usepackage{booktabs}
\usepackage[table]{xcolor}
\usepackage{amsmath}

\newcommand{\norm}[1]{\lVert #1 \rVert}
%
\begin{document}
%
\title{CLEF: Contrastive Learning of Equivariant Features in CT Images}
%
%\titlerunning{Abbreviated paper title}
% If the paper title is too long for the running head, you can set
% an abbreviated paper title here
%
\author{Ilya Kuleshov\inst{1}
\and
Mikhail Goncharov\inst{2} \and
Vera Soboleva\inst{2}}
%
\authorrunning{I. Kuleshov et al.}
% First names are abbreviated in the running head.
% If there are more than two authors, 'et al.' is used.
%
\institute{Moscow Institute of Physics and Technology, Russia \and
Skoltech Institute, Russia}
%
\maketitle              % typeset the header of the contribution
%
\begin{abstract}
This work focuses on developing a self-supervised method of pretraining on biomedical images. The pretrained models are then fine-tuned on a small labelled dataset. We show that, by combining contrastive learning with an equivariance loss and a loss that we designed to maximise the information carried by the features, we improve quality in comparison to a fully-supervised baseline. Our method of pretraining achieves an average Dice score of 0.86, reducing the baseline error by 20\%.

\keywords{Self-supervised learning  \and Biomedical image segmentation \and Contrastive learning.}
\end{abstract}



\section{Introduction}

Biomedical datasets with labelled images are very limited due to the high complexity of the labelling process. This gives way to the idea of self-supervised pretraining on unlabelled images. We propose to force the learned features, corresponding to a voxel in the human body, to reflect its anatomical location. This would guarantee high quality on the task of organ segmentation, which is the task at hand. To achieve said quality, we propose a compound, three-part loss, which would force the features to behave similarly to such a general anatomical system of coordinates.

\section{Method}

%###########################
\subsection{Preprocessing}
In this section we will describe our preprocessing strategy. The transformations are as follows.
\begin{enumerate}
    \item A mask of all voxels with intensities greater than $-500$\,HU is generated.
    \item The image is cropped to the smallest box containing the mask generated in the previous step.
    \item The image is resized (via interpolation) to shape $(192, 192, 192)$.
    \item If necessary, the axes are flipped, so that the resulting image has canonical orientation.
    \item The intensities are clipped to the window $(-200, 300)$, the HU interval in which most soft tissues reside.
    \item Finally, the intensities are scaled to the range $(-1, 1)$.
\end{enumerate} 


\subsection{Proposed Method}
Our key contribution lies in the self-supervised pretraining on the large unlabeled part of our dataset. The pretraining method utilises three losses, each accomplishing a different objective. We use the notation $f_{enc}$ to depict the encoding part of our network, and $X$ for the preprocessed input image.

\subsubsection{Pretraining: Decoding Loss}
Firstly, we want to ensure that the encoded features contain the information about the intensity of the original voxel. For that purpose we use the decoding head: two $1\times 1$ convolutions, applied to the output features. We minimise the mean squared error between the output of our decoding head $f_{dec}$ and our original image $X$:
\[
L_{dec} = \mathrm{MSE}(f_{dec}(f_{enc}(X)), X) = \frac{1}{N} \sum_{i,j,k} (f_{dec}(f_{enc}(X))[i,j,k] - X[i,j,k])^2,
\]
where $N$ is the number of voxels in the image.
\subsubsection{Pretraining: Discriminativeness loss}
We also aim to be able to guess the location of a voxel by its features. Thus, we add a loss which forces the model to predict distinct features, such that voxels located far apart from each other have different representations. After getting the features of each voxel, we randomly sample a small subset of anchor voxels $I_A$ and a large subset of leaf voxels $I_L$ and compute the pairwise distance in feature space (negative inner product) between the anchors and the leaves $D$:
\[
D[i_A, i_L] = -\langle f_{enc}(X)[i_A] \cdot f_{enc}(X)[i_L] \rangle, \quad i_A \in I_A, \; i_L \in I_L.
\]
Next, for each anchor $i_A$, we compute the indices of the voxels which are far enough from it, $F(i_A)$. Typically, we considered voxels to be far enough from each other if the Euclidean distance between them was at least $10$\,mm, so $F(i_A)$ can be computed as follows (we use $\norm{\cdot}_{mm}$ to denote the physical distance in millimeters between voxels):
\[
F(i_A) = \{i \in I_L \colon \norm{i - i_A}_{mm} \ge 10\}.
\]
Finally, we apply the ReLU activation to the margin-shifted distances and sum the results to obtain the discriminativeness loss:
\begin{equation}
    L_d = \sum_{i_A \in I_A} \; \sum_{i_L \in F(i_A)} \operatorname{relu}(M - D[i_A, i_L])
\end{equation}
Here, the margin $M$ is a hyperparameter; we set it to $-0.9$ in our experiments.
Such a loss forces the model to cluster features of close voxels together. The higher $M$ is, the fewer feature vectors can be placed at a pairwise distance of at least $M$ from each other. For example, if our feature space is a sphere in 3D space and $M=0$, then there are at most $6$ vectors $\{v_i\}_{i=1}^6$ which all satisfy the inequality $-(v_i, v_j) \ge M$ for $i \neq j$ (all such vectors are either perpendicular to each other, or facing in opposite directions from each other).

\subsubsection{Pretraining: Equivariance loss}
Finally, we also demand the equivariance property from our model. Although this accomplishes more or less the same task as our decoder, we decided to include this loss, as it is easily learned by the model. One of the many strengths of convolutional networks is that an object's representation is independent of its location in the image, which implies equivariance, at least in relation to shifts. In addition to shifts, we also train our neural network to be equivariant in relation to zooming and rotation. The loss is the negative inner product between a randomly rotated, scaled and shifted representation of the original image, and the representation of the transformed image. In other words, if we denote $T$ as our random affine transformation, $X$ as our image, and $f$ as our neural network, the loss is as follows:
\begin{equation}
    L_e = -\langle T(f(X)) \cdot f(T(X)) \rangle
\end{equation}

\subsubsection{Finetuning}
We fine-tune with a compound loss function, which is the sum of the Dice loss and the cross-entropy loss. This kind of loss function has proven to be effective in biomedical image segmentation~\cite{LossOdyssey}.

\subsubsection{Architecture}
We use a two-part architecture, consisting of a large, feature-extracting backbone and a small head. The backbone is a simple 3D U-Net. The head consists of two $1\times 1$ convolutions, which we apply to the output features of the backbone. Figure~\ref{fig:Network} illustrates the applied 3D U-Net~\cite{ronneberger2015u}. The number of channels the head outputs determines the dimensionality of our feature space in the case of pretraining, whereas in the case of fine-tuning it must be equal to the number of classes. Thus, when transferring weights from the pretraining model to the fine-tuning model, we transfer only the weights of the backbone U-Net, while the head is re-initialized with fresh random weights.

\begin{figure}[htbp]
\centering
\includegraphics[width=\textwidth]{imgs/UNet.pdf}
\caption{Network architecture}
\label{fig:Network}
\end{figure}


\subsection{Post-processing}
The fine-tuning predictions are passed through a sigmoid function, thus scaling them to the $(0, 1)$ range. Then, for each voxel: if all of the resulting values are less than $0.5$, we deduce that voxel to be outside of any organs of interest in this task. Otherwise, that voxel is labelled with the index of the largest value in its predicted vector.

\section{Experiments}
\subsection{Dataset and evaluation measures}
The FLARE2022 dataset is curated from more than 20 medical groups under the license permission, including MSD~\cite{simpson2019MSD}, KiTS~\cite{KiTS,KiTSDataset}, AbdomenCT-1K~\cite{AbdomenCT-1K}, and TCIA~\cite{clark2013TCIA}. The training set includes 50 labelled CT scans with pancreas disease and 2000 unlabelled CT scans with liver, kidney, spleen, or pancreas diseases. The validation set includes 50 CT scans with liver, kidney, spleen, or pancreas diseases.
The testing set includes 200 CT scans, where 100 cases have liver, kidney, spleen, or pancreas diseases and the other 100 cases have uterine corpus endometrial, urothelial bladder, stomach, sarcomas, or ovarian diseases. All the CT scans only have image information and the center information is not available.

The evaluation measures consist of two accuracy measures: Dice Similarity Coefficient (DSC) and Normalized Surface Dice (NSD), and three running efficiency measures: running time, area under GPU memory-time curve, and area under CPU utilization-time curve. All measures will be used to compute the ranking. Moreover, the GPU memory consumption has a 2 GB tolerance.


\subsection{Implementation details}
\subsubsection{Environment settings}
The development environments and requirements are presented in Table~\ref{table:env}.


\begin{table}[!htbp]
\caption{Development environments and requirements.}\label{table:env}
\centering
\begin{tabular}{ll}
\hline
Windows/Ubuntu version       & Ubuntu 18.04.5 LTS\\
\hline
CPU   & Intel(R) Core(TM) i9-7900X CPU@3.30GHz \\
\hline
RAM                         &16$\times $4GB; 2.67MT$/$s\\
\hline
GPU (number and type)                         & Four NVIDIA V100 16G\\
\hline
CUDA version                  & 11.0\\                          \hline
Programming language                 & Python 3.9\\ 
\hline
Deep learning framework & PyTorch (Torch 1.10, torchvision 0.2.2) \\
\hline
Specific dependencies         &                        \\                                                                      
\hline
(Optional) Link to code     &                                                                \\
\hline
\end{tabular}
\end{table}


\subsubsection{Training protocols}
We used no data augmentation, passing the whole $192\times 192 \times 192$ image to the model without random cropping. The equivariance loss requires a random affine transform. We randomly sampled two of three axes, and applied a random transform in the corresponding plane. For the transform, we used a combination of a random scale in the range $(1, 1.5)$, a random shift in the range $(-0.1, 0.1)$ on each of the selected axes, and a random rotation in the range $(-30^\circ, 30^\circ)$ in the selected plane.


\begin{table*}[!htbp]
\caption{Training protocols.}
\label{table:training}
\begin{center}
% \resizebox{0.47\textwidth}{!}{
\begin{tabular}{ll} 
\hline
Network initialization         & default PyTorch initialization\\
\hline
Batch size                    & 1 \\
\hline
Total epochs & 50 \\
\hline
Optimizer          & Adam \\ 
\hline
Initial learning rate (lr)  & $0.0004$ \\
\hline
Training time                                           & 15.5 hours \\  \hline 
Number of model parameters    & 33.0M\footnotemark \\ \hline
\end{tabular}
%}
\end{center}
\end{table*}
\footnotetext{\url{https://github.com/sksq96/pytorch-summary}}

When fine-tuning, we randomly divide the 50 labeled samples into 5 folds. One of the five resulting subsets is used for validation, and the others are used for training. We use early stopping to stop our model from overfitting on the small dataset of labelled samples: when the loss on the validation set does not decrease for three epochs, the training stops. This results in the training being cut short after 18--22 epochs, depending on the chosen fold.

\begin{table*}[!htbp]
\caption{Training protocols for the fine-tuning model (if using two-stage framework).}
\label{table:training2nd}
\begin{center}
% \resizebox{0.47\textwidth}{!}{
\begin{tabular}{ll} 
\hline
Network initialization         & default PyTorch initialization\\
\hline
Batch size                    & 1 \\
\hline
Total epochs & 50/Early stopping \\
\hline
Optimizer          & Adam \\ \hline
Initial learning rate (lr)  & $0.001$ \\
\hline
Training time                                           & 1--2 hours \\  \hline 
Number of model parameters    & 33.0M\footnotemark \\ \hline
\end{tabular}
\end{center}
\end{table*}
\footnotetext{\url{https://github.com/sksq96/pytorch-summary}}


\section{Results and discussion}

\subsection{Evaluation on the validation set}
The self-supervised pretraining on unlabelled cases provided an improvement in comparison to a model trained in a fully supervised fashion. The model works very well on clearly visible organs, which can be visually separated from their surroundings. Such organs include the liver, kidneys, the aorta and some others. Less visible organs, such as the duodenum (see \autoref{fig:duodenum}), the pancreas (see \autoref{fig:pancreas}), and the left adrenal gland, proved to be more complicated for our method, which is to be expected. However, according to \autoref{tab:seg_res}, there are some isolated cases in which highly visible organs have lower quality. This is due to anomalies, as can be seen in \autoref{fig:doublekidney}. The average Dice score after pretraining on the validation set is 0.87, which is a rather big improvement over the baseline average Dice score (0.84).

\input{tables/segmentation_results}

\begin{figure}
    \centering
    \includegraphics[width=\textwidth]{imgs/duodenum.png}
    \caption{A slice and the corresponding duodenum mask.}
    \label{fig:duodenum}
\end{figure}

\begin{figure}
    \centering
    \includegraphics[width=\textwidth]{imgs/pancreas.png}
    \caption{A slice and the corresponding pancreas mask}
    \label{fig:pancreas}
\end{figure}



\begin{figure}
    \centering
    \includegraphics[width=\textwidth]{imgs/double_kidney.png}
    \caption{An anomaly in the left kidney, image FLARE22\_Tr\_0045}
    \label{fig:doublekidney}
\end{figure}

\subsection{Validation results}
Here are some segmentation maps on the validation set: \autoref{fig:ex_0001}, \autoref{fig:ex_0010}, \autoref{fig:ex_0016}, \autoref{fig:ex_0030}, \autoref{fig:ex_0050}. As instructed, all segmentation maps are presented in a window centered at 40\,HU, with a width of 400\,HU.

\begin{figure}[h!]
    \centering
    \includegraphics[width=\textwidth]{imgs/case_0001.png}
    \caption{Segmentation results on case 1}
    \label{fig:ex_0001}
\end{figure}

\begin{figure}[h!]
    \centering
    \includegraphics[width=\textwidth]{imgs/case_0010.png}
    \caption{Segmentation results on case 10}
    \label{fig:ex_0010}
\end{figure}

\begin{figure}[h!]
    \centering
    \includegraphics[width=\textwidth]{imgs/case_0016.png}
    \caption{Segmentation results on case 16}
    \label{fig:ex_0016}
\end{figure}

\begin{figure}[h!]
    \centering
    \includegraphics[width=\textwidth]{imgs/case_0030.png}
    \caption{Segmentation results on case 30}
    \label{fig:ex_0030}
\end{figure}

\begin{figure}[h!]
    \centering
    \includegraphics[width=\textwidth]{imgs/case_0050.png}
    \caption{Segmentation results on case 50}
    \label{fig:ex_0050}
\end{figure}

\subsection{Results on final testing set}
The results on the testing set can be seen in \autoref{tab:test_res}; they are worse than the results on the validation set, probably due to a slightly skewed distribution.

\begin{table}[ht]
    \centering
    \caption{Testing results}
    \label{tab:test_res}
    \input{tables/test_res}
\end{table}

\subsection{Limitation and future work}
Our method shows great promise. Nevertheless, it can be modified in many ways. These include, but are not limited to the following.
\begin{itemize}
    \item Adding preprocessing. This could help with artifacts that are visible on some predicted masks (see \autoref{fig:pred_artifacts}).
    
    \begin{figure}
        \centering
        \includegraphics[width=\textwidth]{imgs/photo_2022-07-16_17-01-57.jpg}
        \caption{Artifacts on predictions}
        \label{fig:pred_artifacts}
    \end{figure}
    
    \item This method could be improved by predicting several neighbouring image voxels by the MLP head, instead of a single voxel. This should help store information on the surrounding voxels in pretrained features, forcing our model to learn better features.
\end{itemize}

\section{Conclusion}
Contrastive self-supervised pretraining helps improve the quality of the resulting fine-tuned network. This means that our pretraining method forces the backbone to learn informative features, which, at least in part, carry information on human organs.

\subsubsection{Acknowledgements} The authors of this paper declare that the segmentation method they implemented for participation in the FLARE 2022 challenge has not used any pre-trained models nor additional datasets other than those provided by the organizers. The proposed solution is fully automatic without any manual intervention.


%
% ---- Bibliography ----
%
% BibTeX users should specify bibliography style 'splncs04'.
% References will then be sorted and formatted in the correct style.
%
\bibliographystyle{splncs04}
\bibliography{ref}

\end{document}
