\section{Experiments}
\label{sec:experiments}
% ====================================================================================================

\begin{figure}[ht]
  \centering
  \includegraphics[width=.85\textwidth]{figures/brain_samples.png}%
  \caption{Examples of the pixel-wise anomaly detection pipeline using various OOD datasets. The model has been trained on \textit{CamCAN T2}. % TODO(review): the original caption contained the unfinished sentence ``Columns 1-2 show''; state what these columns show.
  For \textit{CamCAN T2 artificial}, the lesions have been generated artificially using 2D Gaussian blobs, where the ground-truth segmentation refers to the $1\sigma$ intensity drop.}
  \label{fig:brain_samples}
\end{figure}

\paragraph{Datasets.}
Brain scan MRI training samples from 653 healthy patients originate from the CamCAN dataset~\cite{TAYLOR2017262}, and only T2-weighted MR images are used for training.
For detection, we use T2-weighted (and T1-weighted) images from the BraTS 2017 dataset~\cite{BraTS17}, which contains high- and low-grade glioma samples and provides pixel-level ground-truth lesion segmentations. BraTS is considered to be OOD w.r.t.\ domain shift due to different scanning parameters and models. Since, to the best of our knowledge, there exist no public MR brain datasets holding both healthy and lesional samples, we artificially crafted lesional samples on top of the in-distribution validation set as described in Sec.~\ref{sec:appendix_pre_post}.
We also test our algorithms on \ac{ood} datasets of non-medical images, which in theory reside far away from the data distribution of group (i), e.g., MNIST and images of Gaussian noise.
Finally, to assess whether the widely used practice of histogram matching mitigates the domain shift, we created matched and un-matched versions of the BraTS datasets. For all tables, \textit{HM} stands for matched histograms, where the histogram of each subject in CamCAN and BraTS is matched to that of a reference subject randomly selected from CamCAN. As histogram matching can be seen as a naive way to reduce the domain shift, we investigate whether it leads to an accuracy gain when detecting anomalous slices from different domains.


\subsection{OOD Detection and Sample-wise Anomaly Detection}
\label{sec:results:ood_detection}

\begin{comment} 
\begin{figure}[htbp]
  \centering
  \subfloat[Training statistics CamCAN T2]
  {\includegraphics[height=5cm]{figures/camcan_train_stats.png}
  \label{fig:dose_train_stat}}
  \hfill
  \subfloat[DoSE KDE scores CamCAN T2]
  {\includegraphics[height=5cm]{figures/dose_kde_camcan.png}
  \label{fig:dose_kde_camcan}}
  \centering
  \vskip\baselineskip
  \subfloat[DoSE KDE scores BraTS T2]
  {\includegraphics[height=4.5cm]{figures/dose_kde_brats_t2.png}
  \label{fig:brats_dose_kde_t2}}
  \hfill
  \subfloat[DoSE KDE scores BraTS T1]
  {\includegraphics[height=4.5cm]{figures/dose_kde_brats_t1.png}
  \label{fig:brats_dose_kde_t1}}
  \caption{Sub-figures (c) and (d) show the potential of the entropy $H_{\ell_{1}}$ of the residual image to act as a disentangled sample-wise anomaly detection score due to its discriminative power across various OOD datasets.}
    \label{fig:dose_kde}
\end{figure}
\end{comment}

Tab.~\ref{tab:ood_results} reveals that the $WAIC$ score performs best at distinguishing between in-distribution and \ac{ood} slices regardless of whether the \ac{ood} samples contain lesions. It outperformed the $DoSE$ approach (using $\ell_{1}$, $D_{KL}$ and $\mathcal{L}$ statistics), which is seemingly affected by the performance degradation of the $D_{KL}$ term. Thus, the $WAIC$ score may serve as a strong candidate to perform initial filtering for OOD samples before employing a lesion-agnostic score to distinguish healthy from lesional slices. Tab.~\ref{tab:slice_wise_anomaly_detection}, on the other hand, shows results for classifying slices as healthy or lesional. The positive class is chosen to hold lesional slices from the test set while the negative class consists of healthy slices from the in-distribution training data as well as healthy slices from the test set. While we have found the $D_{KL}$-term to be a strong candidate metric for anomaly detection in some settings (similar to \cite{Zimmerer2020}), we have found it to be heavily influenced by architectural choices and training dynamics, which lets us conclude that it is not a robust choice of metric for the task at hand. The proposed normalized entropy score shows no more discriminative power than the established metrics (such as $\ell_1$), which demands more investigation into ways to disentangle the underlying mechanisms for a sample to be OOD or not.
% TODO(review): the text above says the negative class uses in-distribution *training* data, whereas the caption of Tab. tab:slice_wise_anomaly_detection says in-distribution *validation* set -- confirm which is correct and align the two.

\begin{table}[t]
\centering
\caption{OOD detection performance. The negative class holds samples from CamCAN T2 while the positive class consists of samples from the respective \ac{ood} dataset. OOD samples are subdivided s.t. either only healthy, lesional or all of them are considered. $\ell_{1}$ is the mean reconstruction error per pixel for a slice, $D_{KL}$ the KL divergence between the prior and approximate posterior and $\mathcal{L}$ the ELBO. We further introduce recent metrics from \ac{ood} detection in the form of $WAIC$ and $DoSE$ score using $\ell_{1}$, $D_{KL}$ and $\mathcal{L}$ training statistics. We observe that $WAIC$ performs best at detecting domain-shifted samples across all datasets.}
\resizebox{\textwidth}{!}{%
\begin{tabular}{llcccccccccc}
\hline
\multicolumn{2}{l}{\textbf{OOD Metric}} & \multicolumn{2}{c}{\textbf{$\ell_{1}$}} & \multicolumn{2}{c}{\textbf{$D_{KL}$}} & \multicolumn{2}{c}{\textbf{$\mathcal{L}$}} & \multicolumn{2}{c}{\textbf{$WAIC$}} & \multicolumn{2}{c}{\textbf{$DoSE_{(\ell_{1}, D_{KL}, \mathcal{L})}$}} \\ \hline
\textbf{OOD Dataset} & \multicolumn{1}{c}{} & $AU_{ROC}$ & $AU_{PRC}$ & $AU_{ROC}$ & $AU_{PRC}$ & $AU_{ROC}$ & $AU_{PRC}$ & $AU_{ROC}$ & $AU_{PRC}$ & $AU_{ROC}$ & $AU_{PRC}$ \\
\multirow{3}{*}{\textbf{\begin{tabular}[c]{@{}l@{}}BraTS \\ T2 \\ HM\end{tabular}}} & all & 0.89 & 0.86 & 0.68 & 0.61 & 0.85 & 0.81 & \underline{0.93} & \textbf{0.95} & 0.78 & 0.70 \\
 & healthy & 0.85 & 0.84 & 0.73 & 0.71 & 0.79 & 0.79 & \underline{0.87} & \textbf{0.91} & 0.78 & 0.76 \\
 & lesional & 0.92 & 0.89 & 0.63 & 0.55 & 0.90 & 0.87 & \underline{0.99} & \textbf{0.99} & 0.78 & 0.70 \\ \hline
\multirow{3}{*}{\textbf{\begin{tabular}[c]{@{}l@{}}BraTS \\ T2\end{tabular}}} & all & 0.87 & 0.85 & 0.69 & 0.62 & 0.83 & 0.81 & \underline{0.91} & \textbf{0.92} & 0.79 & 0.72 \\
 & healthy & \underline{0.84} & 0.83 & 0.74 & 0.72 & 0.77 & 0.78 & \underline{0.84} & \textbf{0.85} & 0.80 & 0.77 \\
 & lesional & 0.92 & 0.90 & 0.64 & 0.56 & 0.89 & 0.86 & \underline{0.98} & \textbf{0.98} & 0.78 & 0.70 \\ \hline
\multirow{3}{*}{\textbf{\begin{tabular}[c]{@{}l@{}}BraTS \\ T1 \\ HM\end{tabular}}} & all & \underline{0.92} & 0.91 & 0.73 & 0.64 & 0.88 & 0.88 & \underline{0.92} & \textbf{0.95} & 0.82 & 0.76 \\
 & healthy & \underline{0.88} & 0.88 & 0.76 & 0.71 & 0.83 & 0.85 & 0.86 & \textbf{0.91} & 0.82 & 0.79 \\
 & lesional & 0.96 & 0.95 & 0.70 & 0.60 & 0.95 & 0.93 & \underline{0.99} & \textbf{0.99} & 0.83 & 0.76 \\ \hline
\multirow{3}{*}{\textbf{\begin{tabular}[c]{@{}l@{}}BraTS \\ T1\end{tabular}}} & all & 0.95 & 0.94 & 0.77 & 0.74 & 0.91 & 0.90 & \underline{0.97} & \textbf{0.98} & 0.86 & 0.82 \\
 & healthy & \underline{0.94} & 0.94 & 0.82 & 0.82 & 0.88 & 0.88 & \underline{0.94} & \textbf{0.96} & 0.88 & 0.86 \\
 & lesional & 0.97 & 0.96 & 0.73 & 0.66 & 0.96 & 0.96 & \underline{1.00} & \textbf{1.00} & 0.84 & 0.78 \\ \hline
\multirow{3}{*}{\textbf{\begin{tabular}[c]{@{}l@{}}CamCAN \\ T2 \\ artificial\end{tabular}}} & all & 0.69 & 0.69 & 0.55 & 0.58 & 0.66 & 0.68 & \underline{0.74} & \textbf{0.79} & 0.63 & 0.61 \\
 & healthy & 0.51 & 0.51 & 0.51 & 0.50 & 0.52 & 0.52 & \underline{0.68} & \textbf{0.73} & 0.52 & 0.50 \\
 & lesional & \underline{0.87} & \textbf{0.85} & 0.60 & 0.64 & 0.84 & 0.82 & 0.81 & 0.84 & 0.74 & 0.70 \\ \hline
\textbf{MNIST} & \multicolumn{1}{c}{\textbf{}} & 1.00 & 1.00 & 0.00 & 0.31 & 1.00 & 1.00 & 1.00 & 1.00 & 1.00 & 1.00 \\ \hline
\textbf{Gaussian Noise} & \multicolumn{1}{c}{\textbf{}} & 1.00 & 1.00 & 0.00 & 0.30 & 1.00 & 1.00 & 1.00 & 1.00 & 1.00 & 1.00 \\ \hline
\end{tabular}
}
\label{tab:ood_results}
\end{table}

\begin{table}[t]
\centering
\caption{Slice-wise anomaly detection on various out-of-distribution datasets. The positive class holds lesional slices from the respective test set while the negative class holds healthy slices from the test and in-distribution validation set (CamCAN T2). So far, neither the entropy score nor classical \ac{ood} detection metrics are able to consistently outperform the classical reconstruction error metric.}
\resizebox{\textwidth}{!}{%
\begin{tabular}{lcccccccccccc}
\hline
\textbf{Metric} & \multicolumn{2}{c}{$\boldsymbol \ell_{1}$} & \multicolumn{2}{c}{$D_{KL}$} & \multicolumn{2}{c}{$\mathcal{L}$} & \multicolumn{2}{c}{$H_{\ell_{1}}$} & \multicolumn{2}{c}{WAIC} & \multicolumn{2}{c}{DoSE({$\ell_{1}$, $D_{KL}$, $\mathcal{L}, H_{\ell_{1}}$})} \\ 
\textbf{Test Set} & $AU_{ROC}$ & $AU_{PRC}$ & $AU_{ROC}$ & $AU_{PRC}$ & $AU_{ROC}$ & $AU_{PRC}$ & $AU_{ROC}$ & $AU_{PRC}$ & $AU_{ROC}$ & $AU_{PRC}$ & $AU_{ROC}$ & $AU_{PRC}$ \\\hline
\textbf{BraTS T2 HM} & \underline{0.74} & \textbf{0.44} & 0.50 & 0.31 & \underline{0.74} & \textbf{0.44} & 0.66 & 0.37 & 0.64 & 0.41 & 0.55 & 0.30 \\ \hline
\textbf{BraTS T2} & 0.72 & 0.41 & 0.47 & 0.29 & \underline{0.73} & \textbf{0.42} & 0.65 & 0.35 & 0.62 & 0.39 & 0.53 & 0.29 \\ \hline
\textbf{BraTS T2 HM H-flip} & \underline{0.73} & \textbf{0.41} & 0.50 & 0.30 & \underline{0.73} & \textbf{0.41} & 0.66 & 0.36 & 0.62 & 0.38 & 0.54 & 0.29 \\ \hline
\textbf{BraTS T2 HM V-flip} & 0.59 & 0.31 & 0.38 & 0.22 & 0.59 & 0.31 & 0.53 & 0.27 & \underline{0.62} & \textbf{0.36} & 0.46 & 0.25 \\ \hline
\textbf{CamCAN T2 artificial} & \underline{0.83} & \textbf{0.82} & 0.53 & 0.61 & 0.82 & \textbf{0.82} & 0.71 & 0.75 & 0.77 & 0.78 & 0.67 & 0.66 \\
\hline
\end{tabular}
}
\label{tab:slice_wise_anomaly_detection}
\end{table}


