\documentclass{midl} % Include author names
%\documentclass[anon]{midl} % Anonymized submission

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{comment}
%\jmlrvolume{-- Under Review}
\jmlryear{2021}
\jmlrworkshop{Full Paper -- MIDL 2021}
%\editors{Under Review for MIDL 2021}
\title[Self-supervised OOD Detection for Cardiac CMR Segmentation]{Self-supervised Out-of-distribution Detection for Cardiac CMR Segmentation}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
\midlauthor{\Name{Camila Gonzalez} \Email{camila.gonzalez@gris.informatik.tu-darmstadt.de} \and \\
\Name{Anirban Mukhopadhyay} \Email{anirban.mukhopadhyay@gris.informatik.tu-darmstadt.de}\\
 \addr Technical University of Darmstadt, Karolinenpl. 5, 64289 Darmstadt, Germany}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
% \midlauthor{\Name{Author Name1\midljointauthortext{Contributed equally}\nametag{$^{1,2}$}} \Email{abc@sample.edu}\\
% \addr $^{1}$ Address 1 \\
% \addr $^{2}$ Address 2 \AND
% \Name{Author Name2\midlotherjointauthor\nametag{$^{1}$}} \Email{xyz@sample.edu}\\
% \Name{Author Name3\nametag{$^{2}$}} \Email{alphabeta@example.edu}\\
% \Name{Author Name4\midljointauthortext{Contributed equally}\nametag{$^{3}$}} \Email{uvw@foo.ac.uk}\\
% \addr $^{3}$ Address 3 \AND
% \Name{Author Name5\midlotherjointauthor\nametag{$^{4}$}} \Email{fgh@bar.com}\\
% \addr $^{4}$ Address 4
% }

\usepackage{xcolor}
\usepackage{amssymb}
\usepackage{multirow,graphicx,float}
\newcommand*{\com}{\textcolor{green}}
\newcommand*{\todo}{\textcolor{red}}
\newcommand*{\reb}{\textcolor{teal}}

\begin{document}

\maketitle

\begin{abstract}
The segmentation of cardiac structures in Cine Magnetic Resonance imaging (CMR) plays an important role in monitoring ventricular function, and many deep learning solutions have been introduced that successfully automate this task. Yet due to variabilities in the CMR acquisition process, images from different centers or acquisition protocols differ considerably. This causes deep learning models to \emph{fail silently}. It is therefore crucial to identify out-of-distribution (OOD) samples for which the trained model is unsuitable. For models with a self-supervised proxy task, we propose a simple method to identify OOD samples that does not require adapting the model architecture or access to a separate OOD dataset during training. As the performance of self-supervised tasks can be assessed without ground truth information, it indicates during test time when a sample differs from the training distribution. The proposed method combines a voxel-wise uncertainty estimate with the self-supervision information. Our approach is validated across three CMR datasets and two different proxy tasks. We find that it is more effective at detecting OOD samples than state-of-the-art post-hoc OOD detection and uncertainty estimation approaches.
\end{abstract}

\begin{keywords}
out-of-distribution detection, self-supervision, distribution shift
\end{keywords}


\section{Introduction}

Despite significant advances in diagnostic deep learning research, the adoption of learning-based systems in clinical practice is very limited. One reason for this is the inability of models to generalize to out-of-distribution (OOD) samples in real clinical settings, coupled with their tendency to produce overconfident predictions. Most deep learning systems are evaluated on test data similar in distribution to that used for training. When testing takes place on data gathered from different pieces of equipment or with a different protocol, there is a noticeable drop in performance \citep{glocker2019machine}.

Cardiac Cine Magnetic Resonance imaging (CMR), the gold standard for non-invasive volumetric quantification, is particularly prone to shifts in image properties. The acquisition process requires breath-holding, which is difficult for patients with arrhythmias. As a consequence, variations in image quality are magnified \citep{oksuz2019automatic, ruijsink2020fully}. Automatic cardiac segmentation that generalizes well to unseen manufacturers is still an open challenge \citep{bevandic2019simultaneous, yan2020mri}. Clinical deployment of deep neural networks (DNNs) would comprise a two-step process where the plausibility of a model output being correct is considered alongside the prediction. Observing softmax outputs is not sufficient, as DNNs produce overconfident predictions for OOD data \citep{hein2019relu}. Fig.~\ref{fig:qualitative_comparison} shows how the segmentation performance of a U-Net deteriorates silently on OOD data. As OOD detection is a secondary goal, an ideal detector would integrate into any existing model and require no modifications in the architecture or training procedure.

In this work, we explore how self-supervision can help uncover OOD samples for the task of left ventricular blood pool segmentation, which is often utilized clinically to calculate parameters such as Ejection Fraction. DNNs only produce meaningful outputs for in-distribution (ID) data \citep{su2020does}. This manifests in a drop in performance for OOD samples and, accordingly, a higher loss between the predicted and target values. While the loss cannot be calculated during inference for supervised tasks, it \emph{can} be for self-supervised tasks that derive target values from the input images. For self-supervised models, this opens the possibility to leverage the test-time performance as a signal for the identification of OOD samples without needing any manual annotations or OOD training data. 

Our proposed method uses the value of the self-supervision loss in combination with post-hoc uncertainty estimation. While other works have used the self-supervision loss to detect OOD samples in classification tasks, we adopt this idea for medical image segmentation. Unlike current state-of-the-art, the proposed approach does not require a specific proxy task, or training the model with the explicit goal of OOD detection, and is therefore applicable to a wide array of self-supervised architectures. The proposed method outperforms state-of-the-art post-hoc approaches for OOD detection and uncertainty estimation across three CMR datasets and for two different proxy tasks: edge detection and contrastive learning. Our main contributions are: (A) the introduction of self-supervision as a lightweight OOD detector for cardiac CMR segmentation and (B) a thorough evaluation of OOD detection methods on CMR imaging for three datasets and two different self-supervised architectures.

\begin{figure}[htbp]
\floatconts
  {fig:qualitative_comparison}
  {\caption{Distribution shift causes a deterioration on the left ventricular blood pool segmentation for a  subject from the \emph{Multi-Centre, Multi-Vendor and Multi-Disease (M\&M) Vendor A} dataset, but traditional confidence quantification fails silently.}
  }
  {\includegraphics[width=\linewidth]{images/qualitative_comparison.pdf}}
\end{figure}

\section{Related Work}

In this section, we review relevant related work for self-supervision and OOD detection.

\textbf{Self-supervision} methods combine the training for the regular \textit{target task} with a \textit{proxy task}. Whereas the target task is usually supervised, the proxy task does not require manual annotations, i.e. the target value can be derived from the input. For the sake of brevity we refer to \citet{asano2019critical} and \citet{zhang2019survey} for a detailed description of self-supervision in image segmentation.

In the field of \textbf{out-of-distribution detection}, several methods look at network outputs to detect novel samples. \citet{hendrycks2016baseline} introduce the baseline of using the distribution of softmax values as an indicator for novelty. \citet{guo2017calibration} find temperature scaling to be an effective DNN calibration method. \citet{liang2018enhancing} introduce the \textit{ODIN} method, which extends temperature scaling by adding small adversarial-like perturbations to the inputs during inference which increase the separation between ID and OOD softmax values. \citet{lee2018simple} use the class-conditional distribution of neural activations to detect OOD samples. Other methods -- that do not work on a post-hoc basis -- use OOD data during training to explicitly train an outlier detector \citep{hendrycks2018deep, lee2018training, mohseni2020self, vyas2018out, bevandic2019simultaneous}. Related to the task of OOD detection is \textbf{uncertainty estimation}. Popular methods include Monte Carlo (MC) Dropout \citep{gal2016dropout} and Deep Ensembles \citep{lakshminarayanan2017simple}. Several publications look at their effectiveness in the field of medical image segmentation, and find that ensembles are most reliable, though MC Dropout is also effective \citep{jungo2019assessing, jungo2020analyzing, mehrtash2020confidence}. Other methods have shown better performance in some cases, but require special training considerations \citep{blundell2015weight, kohl2018probabilistic, NEURIPS2020_95f8d990}.

Some research delves into \textbf{OOD detection in self-supervised models}. \citet{pidhorskyi2018generative} use the reconstruction error of an autoencoder to assess novelty. \citet{winkens2020contrastive} and \citet{wu2020simple} augment classification networks with a contrastive learning term and estimate the density on different feature spaces. Similar to us, \citet{golan2018deep} train a multi-head model, where one head performs image classification and the second learns to detect image transformations, and calculate the novelty through the softmax outputs. \citet{hendrycks2019using} improve OOD detection by training a classifier with a proxy rotation estimation loss. For image segmentation, \citet{xia2020synthesize} calculate the reconstruction error between the original image and a synthesized version.

Unlike other approaches, our proposed method does not require the use of a particular proxy task, and works entirely in a post-hoc manner. This ensures the applicability to a variety of deployed learning systems that include a self-supervised component. In terms of application we focus on semantic segmentation, and evaluate our method on datasets which solve the same semantic task (left ventricular blood pool segmentation) but differ in terms of acquisition vendor and center. Our research is, to our knowledge, the first to utilize self-supervision losses for OOD detection in medical image segmentation.

\section{Methods}

Consider a model $\mathcal{F}$ trained with $n$ samples $\left\{ x_i\right\}^{n}_{i=1}$. The goal of OOD detection is to identify -- during deployment -- new samples that deviate significantly from the training distribution. For this, a continuous \emph{novelty} function $\mathcal{N}:\mathcal{X} \to \mathbb{R}$ and a threshold $\psi$ are defined so that $x_i$ is classified as out-of-distribution if $\mathcal{N}(x_i) \ge \psi$. The expectation is that real-world OOD samples are flagged for which the model produces unreliable predictions. In this section, we describe our proposed method to detect OOD samples in a post-hoc manner for models trained with a self-supervised proxy task. We start by introducing the two architectures we explore in this work, and then explain the process of OOD detection.

\subsection{Self-supervised Learning}

A task is said to be \emph{self-supervised} if the target information is generated by the learning system. Increasingly, DNNs for semantic segmentation are being augmented with self-supervision \citep{wang2020self, pan2020unsupervised} in order to leverage non-annotated data or shape the feature space. In this work, we explore \textbf{edge detection} and \textbf{contrastive learning}. These proxy tasks are well-suited to the segmentation of cardiac structures as they encourage learning geometrically-aware features that disregard image quality information \citep{chu2020pay, winkens2020contrastive, sahu2020endo}. However, the novelty metric we introduce in Sec.~\ref{sec:nov} can be calculated for models trained with any self-supervised task.

\textbf{Contrastive learning} teaches the model to distinguish between different data points in the training set, while at the same time learning a semantically meaningful feature space that disregards certain transformations. Inspired by \citet{winkens2020contrastive}, we transform an original image $x_i$ into $\mathcal{T}(x_i)=\overline{x_i}$. During training, we maximize the cosine similarity between $x_i$ and $\overline{x_i}$ in the feature space and minimize the similarity between $x_i$ and a second image $x_j$. For function $\mathcal{T}$, we use implementations from the \emph{TorchIO} library (version 0.17.46) \citep{perez_garcia_torchio_2020}. We randomly apply \textit{RescaleIntensity}, \textit{RandomGamma}, \textit{RandomMotion}, \textit{RandomBiasField}, \textit{RandomNoise} and \textit{RandomBlur} operations, each with a probability of $p=0.5$. Features $z_i$ are extracted from the output of the encoder $\mathcal{E}$. Eq.~\ref{eq:contrastive} defines the contrastive loss $\mathcal{L}_{ss}^{C}$, and the architecture is displayed in Fig.~\ref{fig:architectures} (left).

\begin{equation}
    \label{eq:contrastive}
    \mathcal{L}_{ss}^{C}(x_i, x_j)= \mathcal{L}_{sim}(\mathcal{E}(x_i), \mathcal{E}(x_j)) - \mathcal{L}_{sim}(\mathcal{E}(x_i), \mathcal{E}(\mathcal{T}(x_i))),\quad    \mathcal{L}_{sim}(z_i, z_j)= \frac{z_i \cdot z_j}{ \left\| z_i \right\|_2 \cdot \left\| z_j \right\|_2 }
\end{equation}

The goal of \textbf{edge detection} is to extract a mask of edges $\widehat{h}_i$ from image $x_i$. We train a standard two-headed architecture consisting of a shared encoder $\mathcal{E}$ and two decoders, $\mathcal{G}$ for the segmentation task and $\mathcal{H}$ for edge detection. Fig.~\ref{fig:architectures} (right) outlines the proposed architecture. We train both heads with a combined loss of Dice ($\mathcal{L}_{Dice}$) and binary cross entropy ($\mathcal{L}_{BCE}$) weighted equally. To produce target masks $h_i$ in a deterministic manner, we use the \emph{Canny Edge} detector \citep{canny1986computational} of the \textit{Scikit Learn} \citep{scikit_learn} library (version 0.24.1) with lower and upper bounds of, respectively, 150 and 200. During inference, we treat the edge detection loss $\mathcal{L}_{ss}^{E}$ (Eq.~\ref{eq:edge_det}) as a component of our novelty metric.

\begin{equation}
    \label{eq:edge_det}
    \mathcal{L}_{ss}^{E}(x_i, h_i)=\mathcal{L}_{Dice}(\mathcal{H}(x_i), h_i) + \mathcal{L}_{BCE}(\mathcal{H}(x_i), h_i)
\end{equation}

\begin{figure}[htbp]
\floatconts
  {fig:architectures}
  {\caption{Two self-supervision architectures are explored in this work. Left: features are extracted for $x_i$, $\mathcal{T}(x_i)=\overline{x_i}$ and $x_j$ to calculate a contrastive loss term. Right: network with an additional decoder head for the task of edge detection.}}
  {\includegraphics[width=0.9\linewidth]{images/architectures.pdf}}
\end{figure}

\subsection{Novelty Estimation} \label{sec:nov}

For detecting OOD samples during inference we combine uncertainty estimates with the loss of the self-supervised proxy task. Uncertainty estimation produces good calibrations in ID data, but often fails in the presence of dataset shift \citep{NEURIPS2019_8558cb40}. We expect dataset shift to manifest in an unusually large self-supervision loss \citep{su2020does} that compensates for the decreased ability to detect uncertain cases of uncertainty estimation methods. By combining these two factors, we obtain a reliable detection signal.

As we aim to find a flexible post-hoc method applicable to most learning-based systems, we explore two different types of uncertainty estimation. \textbf{MC Dropout} \citep{gal2016dropout} involves performing several forward passes with dropout during test time. The method can be applied to any model that uses dropout layers, which includes most modern architectures. \textbf{Deep Ensembles} -- the practice of training several networks and averaging their predictions -- have consistently shown the best performance in uncertainty estimation \citep{jungo2020analyzing, mehrtash2020confidence}. They are also a straightforward way to improve prediction performance and therefore often used in practice. In the event that several trained models are present, we propose using this method as an uncertainty estimate.

During inference, the novelty of a test subject is assessed by combining the self-supervised loss $\mathcal{L}_{ss}$ with uncertainty estimation. The $\mathcal{L}_{ss}$ loss is calculated in the same way as during training. For the experiments performed in this work, either $\mathcal{L}_{ss}^{C}(x_i, x_j)$ or $\mathcal{L}_{ss}^{E}(x_i, h_i)$ is calculated depending on the model architecture. In the first case, we use a different subject from the same dataset as $x_j$. For 2D models, the loss for a test subject is the average across slices, as is also the case during training. As the uncertainty estimation component we take the voxel-wise standard deviation between model predictions, which is averaged over all voxels to produce a subject-level score. Different predictions are obtained by performing MC Dropout or, if ensembles are available, by making a prediction with each model. We define the proposed novelty function $\mathcal{N}$ in Eq.~\ref{eq:total}, where $\lambda$ weights the self-supervision loss, $K$ is the number of trained models or dropout forward passes, $N$ is the number of voxels $x_{i,j}$ in an image $x_i$, and $x_{i,j}^k$ denotes the $k$-th prediction for voxel $x_{i,j}$.

\begin{equation}
    \label{eq:total}
\mathcal{N}(x_i) = \lambda \mathcal{L}_{ss}(\cdot) + \frac{1}{N} \sum_{j=1}^{N} \sqrt{\frac{1}{K} \sum_{k=1}^{K}\left(x_{i,j}^k - \mu_{i,j} \right)^{2}},\quad    \mu_{i,j}= \frac{1}{K} \sum_{k=1}^{K} x_{i,j}^k
\end{equation}

\section{Experimental Setup and Results}

We use three CMR datasets. The first two are part of the \textit{Multi-Centre, Multi-Vendor and Multi-Disease Cardiac Segmentation} (M\&M) dataset 
\citep{campello2020multi} and contain healthy subjects as well as subjects with hypertrophic and dilated cardiomyopathies. We use the data for vendors \textit{A} and \textit{B}, for which ground truth segmentations are available. The images were acquired with \textit{Siemens Avanto} and \textit{Philips Achieva} scanners, respectively, at different centers.
Each dataset contains 75 subjects. Lastly, we use the \textit{Sunnybrook Cardiac Data} \citep{radau2009evaluation}, acquired at a different center with a \textit{General Electric Signa} scanner. The data consists of 45 scans from healthy as well as diseased subjects suffering from hypertrophy and heart failure. All images were acquired with 1.5T field strength. We extract from each subject the segmented diastolic and systolic phase volumes.

We train a slice-by-slice U-Net with five encoding blocks based on the implementation by \citet{perez_garcia_fernando_unet}. Images are center-cropped to $256 \times 256$. Each model is trained for $200$ epochs with the \textit{PyTorch Adam} optimizer. For the edge detection task, the encoder is shared and the decoder is replicated from the point with minimum spatial resolution. Refer to Appendix~\ref{sec:annex_seg} for an overview of segmentation performance in ID and OOD data. Note that the results on the target task change slightly due to the incorporation of self-supervision.

We compare the proposed method against taking the inverse maximum softmax value \citep{hendrycks2016baseline} (reported as \textbf{Max. Softmax}), temperature scaling (\textbf{Temp. Scaling}) \citep{guo2017calibration} and the \textbf{ODIN} method \citep{liang2018enhancing}; as well as against the corresponding uncertainty estimation (\textbf{MC Dropout} and \textbf{Ensemble}) and using only the self-supervised loss as a novelty estimate (\textbf{SS Loss}). When necessary, we average voxel-wise estimates to produce a volume-wise novelty score. We refer to our method variations using and not using ensembles as \textbf{Ours E} and \textbf{Ours}, respectively. We further specify in parentheses whether the model learned a contrastive (C) or edge detection (E) task.

In turn, we consider each of the three datasets as ID and the other two as OOD. We divide the ID cases into three folds to perform cross-validation. For each cross-validation run, we train a model with the \emph{ID train data} made out of two folds and evaluate it with the third fold, which is the \emph{ID test data}. For OOD detection, we use one OOD dataset and the ID train data to select the best hyperparameters and evaluate the detection performance on the second OOD dataset and the ID test samples. We average the results of using each of the two OOD datasets for the evaluation, and report the mean and standard deviation of the three-fold cross-validation. Refer to Appendix~\ref{sec:eval_setup} for a graphical illustration of our evaluation strategy.
The following hyperparameters are tested: $T \in \left \{ 10^{1}, 10^{2}, 10^{3} \right \}$ for temperature, $\varepsilon  \in \left \{ 10^{-1}, 10^{-2}, 10^{-3} \right \}$ for perturbation magnitude (ODIN), $p \in \left \{ 0.3, 0.5, 0.7 \right \}$ for dropout probabilities and $\lambda \in \left \{ 10^{0}, 10^{2}, 10^{4} \right \}$ for weighting magnitudes.

\setlength\tabcolsep{5pt}
\begin{table}[htbp]
\floatconts
{tab:results_contrastive}%
{\caption{OOD Detection Error and FPR at $95\%$ TPR for models trained with a contrastive learning loss term (lower is better). The mean and standard deviation are reported of testing with each OOD dataset and performing three-fold cross validation.
}}%
{\begin{tabular}{l|ll|ll|ll}
 &\multicolumn{2}{|c|}{\textbf{M\&M Vendor A}}  &\multicolumn{2}{c}{\textbf{M\&M Vendor B}} &  \multicolumn{2}{|c}{\textbf{Sunnybrook}}\\
\textbf{\textbf{Method}} & \textbf{Error} & \textbf{FPR} & \textbf{Error} & \textbf{FPR} & \textbf{Error} & \textbf{FPR}\\
\hline
\textbf{Max. Softmax} & .48 \textpm .00 & .93 \textpm .01 & .51 \textpm .02 & .90 \textpm .02 & .53 \textpm .00 & .91 \textpm .09\\
\textbf{Temp. Scaling} & .51 \textpm .01 & .93 \textpm .01 & .51 \textpm .02 & .93 \textpm .01 & .47 \textpm .01 & .90 \textpm .02\\
\textbf{ODIN} & .43 \textpm .02 & .84 \textpm .03 & .49 \textpm .00 & .87 \textpm .01 & .51 \textpm .01 & .87 \textpm .02\\
\textbf{SS Loss (C)} & \textbf{.33 \textpm .03} & .61 \textpm .04 & .36 \textpm .11 & .60 \textpm .17 & .50 \textpm .04 & .91 \textpm .02\\
\textbf{MC Dropout} & .45 \textpm .01 & .85 \textpm .05 & .38 \textpm .10 & .72 \textpm .20 & .21 \textpm .02 & .23 \textpm .09\\
\textbf{Ours (C)} & \textbf{.33 \textpm .03} & \textbf{.60 \textpm .05} & \textbf{.33 \textpm .12} & \textbf{.58 \textpm .18} & \textbf{.19 \textpm .02} & \textbf{.19 \textpm .09}\\
\hline
\textbf{Ensemble} & .46 \textpm .02 & .86 \textpm .01 & .44 \textpm .03 & .37 \textpm .08 & \textbf{.26 \textpm .01} & .06 \textpm .02\\
\textbf{Ours E (C)} & \textbf{.32 \textpm .05} & \textbf{.49 \textpm .13} & \textbf{.26 \textpm .05} & \textbf{.17 \textpm .04} & .28 \textpm .01 & \textbf{.05 \textpm .00}
\end{tabular}}
\end{table}

We train ensembles with $K=3$ models and perform $K=30$ MC Dropout passes. We select the threshold $\psi$ that achieves a $95\%$ True Positive Rate (TPR) in the in-distribution train data, and flag samples as OOD when $\mathcal{N}(x) \ge \psi$. Reported are the Detection Error as defined by \citet{liang2018enhancing} and the False Positive Rate (FPR) at $95\%$ TPR.

\subsection{Results for Contrastive Learning Models}

We start by analyzing the results of OOD detection methods for the models trained with a contrastive learning loss component. Table~\ref{tab:results_contrastive} summarizes our findings. We see that for all datasets, the popular temperature scaling and ODIN methods perform poorly. This may be due to the fact that both methods are developed for the classification task and not segmentation, where different voxels may be more or less significant for determining whether a sample is in-distribution. Our proposed method results in a lower detection error and FPR than all baselines both in cases where ensembles are available and when they are not. Only in dataset \textit{Sunnybrook} does the ensemble alone achieve a lower detection error than the proposed method. As expected, considering the deviation between ensembles as an uncertainty estimation component leads to better results than applying MC Dropout. However, this method variation is only applicable if multiple models have been trained.

\begin{figure}[htbp]
\floatconts
  {fig:boxplot_contrastive}
  {\caption{Distribution of novelty scores for contrastive learning models (less overlap is better). The scores for ID and OOD data are aggregated for all experiments and normalized to $\left [ 0, 1 \right ]$ by taking the range of the ID training set.
  }}
  {\includegraphics[width=0.9\linewidth]{images/violinplot_contrastive.png}}
\end{figure}

Fig.~\ref{fig:boxplot_contrastive} illustrates the ranges that different novelty scores occupy, normalized by taking the minimum and maximum novelty for ID train data, so that different methods are comparable. Ideally, novelty scores would cluster close to one (upper plot segment) for OOD data, and there would be a minimal overlap between ID and OOD scores. By observing the boxes ranging from the first to the third quartiles we notice that the proposed method achieves the best separation between ID and OOD novelty scores in its two variations.

\subsection{Results for Architectures with Edge Detection}

Table~\ref{tab:results_edge_detection} compiles the results for models trained with an edge detection proxy task. Despite this being a very different task and self-supervision loss, the proposed method still performs best in all but one case. However, the method shows its limitations for models trained with data from \textit{M}\&\textit{M Vendor B}. This indicates that although our method is suited to any self-supervised task, some tasks may be more helpful than others.

\begin{table}[htbp]
\floatconts
{tab:results_edge_detection}%
{\caption{OOD Detection Error and FPR at $95\%$ TPR (\textpm \ standard deviation) for models trained with an edge-detection proxy task (lower is better).}}%
{\begin{tabular}{l|ll|ll|ll}

&\multicolumn{2}{|c|}{\textbf{M\&M Vendor A}}  &\multicolumn{2}{c}{\textbf{M\&M Vendor B}} &  \multicolumn{2}{|c}{\textbf{Sunnybrook}}\\
 
\textbf{\textbf{Method}} & \textbf{Error} & \textbf{FPR} & \textbf{Error} & \textbf{FPR} & \textbf{Error} & \textbf{FPR}\\
\hline
\textbf{Max. Softmax} & .49 \textpm .00 & .97 \textpm .00 & .49 \textpm .01 & .95 \textpm .04 & .50 \textpm .00 & .96 \textpm .01\\
\textbf{Temp. Scaling} & .51 \textpm .01 & .87 \textpm .02 & .51 \textpm .03 & .91 \textpm .01 & .48 \textpm .02 & .92 \textpm .04\\
\textbf{ODIN} & .47 \textpm .02 & .90 \textpm .01 & .48 \textpm .03 & .89 \textpm .02 & .48 \textpm .01 & .92 \textpm .00\\
\textbf{SS Loss (E)} & .33 \textpm .01 & .66 \textpm .01 & .55 \textpm .03 & .99 \textpm .01 & .29 \textpm .01 & .53 \textpm .04\\
\textbf{MC Dropout} & .43 \textpm .04 & .81 \textpm .04 & \textbf{.44 \textpm .06} & \textbf{.33 \textpm .33} & \textbf{.28 \textpm .03} & .28 \textpm .16\\
\textbf{Ours (E)} & \textbf{.32 \textpm .01}  & \textbf{.63 \textpm .01} & \textbf{.44 \textpm .05} & .81 \textpm .16 & \textbf{.28 \textpm .02} & \textbf{.25 \textpm .14}\\
\hline
\textbf{Ensemble} & .39 \textpm .04 & .68 \textpm .14 &  \textbf{.45 \textpm .03} & .45 \textpm .02 & .37 \textpm .13 & .51 \textpm .49\\
\textbf{Ours E (E)} & \textbf{\textbf{.32} \textpm .01} & \textbf{.55 \textpm .08} & \textbf{.45 \textpm .03} & \textbf{.44 \textpm .02} & \textbf{.25 \textpm .01} & \textbf{.23 \textpm .22}
   \end{tabular}}
\end{table}

\section{Conclusion}

Automatic segmentation of cardiac structures in CMR data could significantly alleviate the burden on clinicians. Competitive performance has been achieved by DNNs, but as long as these are susceptible to domain shift their applicability is limited. One way to approach this is by identifying OOD samples during deployment. For self-supervised models, combining the test-time value of the proxy loss with uncertainty estimation forms a reliable and lightweight novelty score. This finding is significant when considering the surge in popularity of self-supervision and introduces a further benefit of including a proxy term in DNN training. The proposed method can augment a wide array of learning-based systems, although for fully-supervised models incorporating a proxy task can have unintended effects on the target task. Future work should contemplate whether our results extend to other proxy tasks and anatomies. As it requires minimal overhead, we hope that monitoring the proxy loss during deployment becomes a widespread method for quality assurance.

\midlacknowledgments{This work was supported by the Bundesministerium für Gesundheit (BMG) with grant [ZMVI1-2520DAT03A].}

\newpage
\appendix

\section{Segmentation Performance of Trained Models} \label{sec:annex_seg}

Table~\ref{tab:dice_contrastive} showcases the Dice coefficient for left ventricular blood pool segmentation for models trained with two proxy tasks (contrastive and edge detection), as well as without any proxy task. The diagonal displays the results of testing each model with ID data.


\begin{table}[htbp]
\floatconts
{tab:dice_contrastive}%
{\caption{Mean Dice for models trained with a contrastive learning loss component (first row), edge detection (second row) and no self-supervised loss (third row). Reported are the mean and standard deviation of three cross-validation runs. 
}}%
{\begin{tabular}{l|l|l|l|l}

& \textbf{\textbf{}} & \textbf{$\mathcal{F}$ trained with} & \textbf{$\mathcal{F}$ trained with} & \textbf{$\mathcal{F}$ trained with}\\
 & \textbf{\textbf{\ \ \ Data}} & \textbf{M\&M Vendor A} & \textbf{M\&M Vendor B} & \textbf{Sunnybrook}\\
\hline
\parbox[t]{11mm}{\multirow{3}{*}{$\mathcal{L}_{ss}^{C}$}} & \textbf{M\&M Vendor A} & \textbf{.85 \textpm .02} & .37 \textpm .05 & .57 \textpm .02\\
& \textbf{M\&M Vendor B} & .71 \textpm .01 & \textbf{.87 \textpm .02} & .44 \textpm .10\\
& \textbf{Sunnybrook} & .57 \textpm .03 & .14 \textpm .04 & \textbf{.83 \textpm .02} \\
\hline
\parbox[t]{11mm}{\multirow{3}{*}{$\mathcal{L}_{ss}^{E}$}} & \textbf{M\&M Vendor A} & \textbf{.83 \textpm .04} & .36 \textpm .05 & .50 \textpm .05\\
& \textbf{M\&M Vendor B} & .65 \textpm .02 & \textbf{.86 \textpm .02} & .35 \textpm .15\\
& \textbf{Sunnybrook} & .60 \textpm .03 & .09 \textpm .03 & \textbf{.82 \textpm .01} \\
\hline
\parbox[t]{11mm}{\multirow{3}{*}{No $\mathcal{L}_{ss}$}} & \textbf{M\&M Vendor A} & \textbf{.86 \textpm .02} & .42 \textpm .05 & .60 \textpm .06\\
& \textbf{M\&M Vendor B} & .71 \textpm .07 & \textbf{.87 \textpm .06} & .36 \textpm .08\\
& \textbf{Sunnybrook} & .53 \textpm .02 & .16 \textpm .08 & \textbf{.80 \textpm .06} \\
   \end{tabular}}
\end{table}

\section{Novelty Distribution for Edge Detection Models}

Fig.~\ref{fig:boxplot_edges} displays the distribution of novelty scores for models with an edge detection proxy task. We see that the amount of overlap between ID and OOD data is more pronounced than for contrastive learning models (see Fig.~\ref{fig:boxplot_contrastive}). The variant of our method that uses ensembles (\textit{Ours E (E)}) is the only approach that achieves a good separation.

\begin{figure}[htbp]
\floatconts
  {fig:boxplot_edges}
  {\caption{Distribution of novelty scores for models with an edge detection proxy task (less overlap is better). The novelty scores for ID and OOD data are aggregated for all experiments. The scores were normalized to $\left [ 0, 1 \right ]$.
  }}
  {\includegraphics[width=1\linewidth]{images/violinplot_edges.png}}
\end{figure}

\section{Generation of Target Data for Proxy Tasks} \label{sec:annex_proxy}

Fig.~\ref{fig:selfsup_gt} displays exemplary data generated to train the proxy tasks explored in this work. The first column showcases slices from the \textit{M}\&\textit{M Vendor B} dataset with overlaid ventricle blood pool segmentation (in red). The second column shows the same slices but with overlaid edge masks. Finally, the third column illustrates possible results of applying the transformation $\mathcal{T}(x_i) = \overline{x_i}$.

\begin{figure}[htbp]
\floatconts
  {fig:selfsup_gt}
  {\caption{From left to right: image $x_i$ with overlaid left ventricle blood pool segmentation ($y_i$), $x_i$ with overlaid edges $h_i$ and transformed image $\overline{x_i}$ with overlaid $y_i$.}}
  {\includegraphics[width=0.9\linewidth]{images/selfsup_gt.pdf}}
\end{figure}

\section{Evaluation Strategy} \label{sec:eval_setup}

Fig.~\ref{fig:eval} graphically illustrates our evaluation setup with three datasets for one cross-validation run. In turn, each dataset is considered ID and is divided into \textit{ID train} and \textit{ID test} data. The ID train data is used to train the model, as well as to set hyperparameters alongside one OOD dataset. The detection performance is reported in the ID test data and the second OOD dataset. The results of using each OOD dataset for each purpose are averaged.

\begin{figure}[htbp]
\floatconts
  {fig:eval}
  {\caption{Graphical illustration of the evaluation strategy.}}
  {\includegraphics[width=0.85\linewidth]{images/eval_setup.pdf}}
\end{figure}


\bibliography{gonzalez21}


\end{document}
