\documentclass{midl} % Include author names
% \documentclass[anon]{midl} % Anonymized submission

% Delete - only used for strikethrough
% \usepackage{ulem}
% \usepackage{color,soul}

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\jmlrvolume{-- nnn}
\jmlryear{2024}
\jmlrworkshop{Full Paper -- MIDL 2024}
\editors{Accepted for publication at MIDL 2024}

\title[3D Patch-based Student-Teacher Pyramid Matching in MRI]{A Patch-based Student-Teacher Pyramid Matching Approach to Anomaly Detection in 3D Magnetic Resonance Imaging}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Johannes Schwarz \nametag{$^{1, 4}$}} \Email{johannes.schwarz@ruhr-uni-bochum.de}
\AND
\Name{Lena Will \nametag{$^{3, 4}$}} \Email{lena.will@kk-bochum.de}
\AND
\Name{J{\"o}rg Wellmer \nametag{$^{4}$}} \Email{joerg.wellmer@kk-bochum.de}
\AND
\Name{Axel Mosig \nametag{$^{1, 2}$}} \Email{axel.mosig@ruhr-uni-bochum.de} \\
\addr $^{1}$ Center for Protein Diagnostics, Ruhr University Bochum, Germany \\
\addr $^{2}$ Faculty of Biology and Biotechnology, Bioinformatics Group, Ruhr University Bochum, Germany \\
\addr $^{3}$ Department of Diagnostic and Interventional Radiology, Neuroradiology and Nuclear Medicine, University Hospital Knappschaftskrankenhaus Bochum, Ruhr-University Bochum, Germany \\
\addr $^{4}$ Ruhr-Epileptology, Department of Neurology, University Hospital Knappschaftskrankenhaus Bochum, Ruhr-University Bochum, Germany
}

\begin{document}

\maketitle

\begin{abstract}
Anomaly detection on 3D magnetic resonance images (MRI) is of high medical relevance in the context of detecting lesions associated with different diseases. Yet, reliable anomaly detection in MRI images involves major challenges, specifically taking into account information in 3D, and the need to localize relatively small and subtle abnormalities within the context of whole organ MRIs. In this paper, a top-down approach, which uses student-teacher feature pyramid matching (STFPM) for detecting anomalies at image and voxel level, is applied to 3D brain MRI inputs. The combination of a 3D patch based self-supervised pre-training and axial-coronal-sagittal (ACS) convolutions pushes the performance above that of f-AnoGAN (bottom-up). The evaluation is based on a tumor dataset. Our code is available on \href{https://github.com/johannesSX/3D-STFPM-3DSSPL-ACS}{GitHub (3D-STFPM-3DSSPL-ACS)}.
\end{abstract}

\begin{keywords}
magnetic resonance imaging, anomaly detection, semi-supervised learning, student-teacher feature pyramid matching, voxel and image-level detection
\end{keywords}

\section{Introduction}
\label{sec:intro}
In medical image analysis, it is often an attractive and promising approach to view the identification and localization of disease as an anomaly detection problem, i.e., to regard the recognition of disease patterns as the identification of deviations from the norm. While unsupervised or semi-supervised anomaly detection approaches limit the need for extensive annotation, their application in specific medical settings is hampered by several factors. Most current anomaly detection approaches were devised for 2D images and thus do not accommodate the 3D nature of magnetic resonance imaging (MRI). Furthermore, the predominant bottom-up or generative approaches only work well on smaller scaled volumes~\cite{Simarro2020}.

We here investigate a novel approach to anomaly detection based on the Student-Teacher model \cite{Bergmann2019,Wang2021}. This approach is based on coupling the training process of two convolutional networks: A teacher network, which is trained on normal images as well as images containing anomalies, and a student network, which is trained on normal, anomaly-free images only. The basic idea of a student-teacher network is that when presented with an image containing anomalies, the image representation of the teacher will deviate significantly from the image representation of the student model. An anomaly map is formed by the difference between the layers of the teacher and the student.
The conceptual advantage over generative models for detecting anomalies is that the student-teacher is a purely top-down approach and thus allows an explicit definition of a classification loss. As a novel contribution to student-teacher networks, our approach presented here employs triplet margin loss \cite{BMVC2016_119} for 3D self-supervised patch learning.

A further limiting factor for utilizing the student-teacher approach in medical applications is the reliance on ImageNet-pretrained networks, so that the teacher network cannot be trained with domain-specific knowledge. To address this, our 3D student-teacher feature pyramid matching (STFPM) network uses the top-down structure of the ResNet to connect it to self-supervised patch learning. Here, this technique is applied to the teacher network in the form of patch learning to be able to provide the teacher with a wide range of input data~\cite{Danon2018}. 

Although not required in our approach, pre-training is still useful to deal with notoriously limited training data in medical applications. To facilitate pre-training in our 3D MRI setting, we employ Axial-Coronal-Sagittal (ACS) convolutions \cite{Yang2019}, which allow the use of 2D ImageNet weights for 3D convolutions. 

We evaluate our approach using a data set combining the BraTS tumor data set \cite{Baid2021, Menze2015, Bakas2017} with healthy MRI data from the IXI data set \cite{ixi}. We show that the use of ImageNet weights and self-supervised patch learning has a major impact on the performance of the 3D STFPM.

This combination of the student-teacher approach with patch-based learning and ACS convolutions creates a network that can detect anomalies both at the image level and at the pixel level. 

To summarize, the main contributions of the paper are: \textbf{1.} the first approach that extends STFPM to 3D MRI scans; \textbf{2.} the use of self-supervised patch learning (SSPL) on 3D MRI scans; and \textbf{3.} the combination of ACS-Convolution and ImageNet pre-trained weights on a medical domain.


\section{Related Work}
Anomaly detection is often performed via generative networks. This includes GAN-based approaches~\cite{Schlegl2017, pmid30831356, Akcay2018, Donahue2016} as well as autoencoders~\cite{An2015VariationalAB}. In medical imaging on 3D MRI scans, a low resolution is usually selected for GAN-based networks~\cite{pmid30831356, Siddiquee2019, Luo2023} and for autoencoders~\cite{Behrendt2022, Pinaya2021, BAUR2021101952}. Often, slices are also extracted along the axial, sagittal or coronal plane, and anomaly detection is performed only at slice level~\cite{Pinaya2021, Han2020}.

The student-teacher approach was applied to the MVTec data set and has proven its effectiveness here~\cite{Bergmann2019, Wang2021}. Self-supervised patch learning comes from the field of metric learning and describes how local image descriptors can be learned. In connection with the student-teacher approach, self-supervised patch learning was already mentioned in \citet{Bergmann2019}. Here, however, it was not applied to medical images and only to 2D data.


\section{Method}

\begin{figure*}[t!]
\begin{center}
    \includegraphics[width=\textwidth]{./images/arch_final.png} % \fbox{\rule{0pt}{2in} \rule{.9\linewidth}{0pt}}
\end{center}
   \caption{Schematic structure of the 3D student teacher pyramid matching framework, with 3D self-supervised patch learning ($ \mathbf{p} $, $ \mathbf{p^+} $, $ \mathbf{p^-} $). In contrast to the teacher, the student only knows how healthy tissue is represented in its interlayers. By comparing the intermediate layer outputs from the teacher $ F_t $ and student $ F_s $, a 3D anomaly map $ \mathbf{A}_{map} $ is generated.}
\label{fig:arch}
\end{figure*}
In this work, the Student Teacher Model, introduced by \citet{Wang2021}, is used and extended to implicitly learn the feature distribution of healthy and diseased data. Both networks, student and teacher, are based on the same architecture to minimize knowledge loss \cite{Hinton2015}. 

The teacher network is pre-trained on healthy and diseased MRIs using self-supervised patch learning. The student, on the other hand, only sees data from healthy patients and is not pre-trained.
Pre-training the teacher is done through the following steps: Use Axial-Coronal-Sagittal (ACS) convolution \cite{Yang2019} to make use of ImageNet weights in 3D convolution and self-supervised image patch learning \cite{Danon2018}. Then, patch learning allows the teacher to recreate healthy and non-healthy tissue in the feature maps.
After the pre-training is complete, the teacher's weights are frozen and only the student's weights are trained. Both the teacher and the student receive an input image, $ \mathbf{I} \in \mathbb{R}^{C \times X \times Y \times Z}$, where $ C $ is the channel dimension, and $ X $, $ Y $ and $ Z $ represent the 3D image resolution. Then feature embeddings of the student and the teacher are computed after each ResNet block $ R_{block}^i $ (with $ i \in \{ 1, 2, 3 \}$). In this way, the student is meant to learn how normal samples are distributed.
During testing and evaluation, the differences between the feature maps $ F_t $ and $ F_s $ are calculated, scaled up, and multiplied with each other. The result is used as the anomaly map $ \mathbf{A}_{map} $, where a higher difference indicates a higher probability of an anomaly.


\subsection{Pre-training the Teacher}
A ResNet is used as the basis of the teacher. All convolutional layers are replaced by ACS convolutional layers, while the pre-trained ImageNet weights are retained.
\paragraph{ACS convolution} 
In ACS convolution \cite{Yang2019}, 2D convolutions are performed in three directions  (axial ($ a $), coronal ($ c $), sagittal ($ s $)) of the 3D volume. For this, the 2D kernel is split into three 3D kernels:
\begin{align}
    \mathbf{W}_{a} \in \mathbb{R}^{C_{in} \times C_{out}^{(a)} \times K \times K \times 1}, \mathbf{W}_{c} \in \mathbb{R}^{C_{in} \times C_{out}^{(c)} \times K \times 1 \times K}, \mathbf{W}_{s} \in \mathbb{R}^{C_{in} \times C_{out}^{(s)} \times 1 \times K \times K} 
\end{align}
From a 3D input feature $ \mathbf{I}_{in} \in \mathbb{R}^{C_{in} \times X \times Y \times Z} $, a 3D output $ \mathbf{I}_{out} \in \mathbb{R}^{C_{out} \times X \times Y \times Z} $ is created with the ACS convolution, which uses the 2D convolutional kernel.
$ C_{in} $ and $ C_{out} $ are the input and output channels, and $ K $ denotes the kernel size. 

\paragraph{Self-Supervised Image Patch Learning} 
After the ImageNet weights have been initialized, the teacher is trained using self-supervised patch learning which, following~\citet{Bergmann2019} and~\citet{Danon2018}, yields local image descriptors as a result. In this work, we extended the approach from~\citet{Danon2018} to 3D input images.

For this purpose, anchor boxes $ \mathbf{p} $ of spatial size $ (32 \times 32 \times 32) $ are randomly cut out for each image $ \mathbf{I} $ in the teacher training. Then, following a grid, another box $ \mathbf{p}^+ $ is cut out in the immediate vicinity (positive box). Negative patches are cut out from another MRI image at a random position. Each of the boxes has a spatial size of $ (32 \times 32 \times 32) $. Anchor boxes that only contain background voxels are discarded.

As \citet{Bergmann2019} suggests, in-triplet hard negative mining with anchor swap \cite{BMVC2016_119} is used as a loss function that implements an embedding sensitive to the $ \ell_2 $ metric:
\begin{align}
    \mathcal{L}_{teacher} = \max{\{0, \delta + \delta^{+} - \delta^{-} \} }
\end{align}
where $ \delta > 0 $ denotes the margin parameter and in-triplet distances $ \delta^+ $ and $ \delta^- $ are defined as:
\begin{align}
    \delta^+ &= \lVert \hat{T}(\mathbf{p}) - \hat{T}(\mathbf{p}^+) \rVert^2 \\
    \delta^- &= \min\{\lVert \hat{T}(\mathbf{p}) - \hat{T}(\mathbf{p}^-) \rVert^2, \lVert \hat{T}(\mathbf{p}^+) - \hat{T}(\mathbf{p}^-) \rVert^2\}
\end{align}
$ \hat{T} $ is the output of the Teacher ResNet.

\subsection{Training the Student}
The student's training uses the same architecture as the teacher's to achieve optimal knowledge distillation. This means that all convolution layers are replaced by an ACS convolution. Only, there is no pre-training and the weights are initialized randomly.

\paragraph{Training} After the teacher has been pre-trained, its weights are frozen and only the weights of the student, which is initialized randomly, are trained. In each student training step, a batch $ \mathcal{B} = \{\mathbf{I}_1, \mathbf{I}_2, \dots, \mathbf{I}_n\} $ of images is fed into the teacher and into the student. Given an input image $ \mathbf{I} \in \mathbb{R}^{C \times X \times Y \times Z} $, the features $ F_t^l( \mathbf{I} ) $ and $ F_s^l( \mathbf{I} ) $ are calculated after each ResNet block $ l \in \{1, 2, 3\} $.
To calculate the loss at all positions $ (x, y, z) $ in the feature maps, a $ \ell_2 $ distance between $ \ell_2 $ normalized feature vectors is defined and thus the loss over the whole image is calculated via the average at each image position:
\begin{align}
    \mathcal{L}_{student}^l(\mathbf{I}) = \frac{1}{X_l Y_l Z_l} \sum_{x=1}^{X_l} \sum_{y=1}^{Y_l} \sum_{z=1}^{Z_l} \left( \frac{1}{2} \left| \left| \hat{F}_t^l(\mathbf{I})_{xyz} - \hat{F}_s^l(\mathbf{I})_{xyz} \right| \right|_{\ell_2}^2 \right)
\end{align}
Here $X_l$, $Y_l$ and $Z_l$ denote the spatial resolution of the feature map after ResNet block $l$. As in~\citet{Wang2021}, the feature vectors $F_t^l(\mathbf{I})_{xyz}$ and $F_s^l(\mathbf{I})_{xyz}$ at each position are $\ell_2$-normalized to form $ \hat{F}_t^l(\mathbf{I})_{xyz} = F_t^l(\mathbf{I})_{xyz} / \left\lVert F_t^l(\mathbf{I})_{xyz} \right\rVert_{\ell_2} $ and $ \hat{F}_s^l(\mathbf{I})_{xyz} = F_s^l(\mathbf{I})_{xyz} / \left\lVert F_s^l(\mathbf{I})_{xyz} \right\rVert_{\ell_2} $.

\paragraph{Evaluation} 
For an image $ \mathbf{I} $ that is to be evaluated, the features of the teacher $ F_t^l( \mathbf{I} ) $ and the student $ F_s^l( \mathbf{I} ) $ are calculated and then scaled up by trilinear interpolation to the size of the input image $ \mathbf{I} $. The upscaled images are each multiplied with one another, resulting in an anomaly map $ \mathbf{A}_{map} $. To obtain a detection score $ s_{detect} $ for an image $ \mathbf{I} $, the maximum value of $ \mathbf{A}_{map} $ is used: $ s_{detect} = \max{(\mathbf{A}_{map})} $. The entire anomaly map $ \mathbf{A}_{map} $ is used for pixel level detection.


\section{Experiments}
All experiments and their evaluation are performed on the BraTS 2021~\cite{Baid2021, Menze2015, Bakas2017} and IXI~\cite{ixi} data set. To find the optimal model and show that the combination of ACS convolution, patch learning and pre-trained ImageNet weights delivers state-of-the-art performance, the following experiments were performed:
\textbf{Experiment 1}: The teacher is initialized with ImageNet with no further teacher training, constituting a fully unsupervised learning setting.
\textbf{Experiment 2}: The teacher is not initialized with ImageNet weights, but the self-supervised patch learning is applied.
\textbf{Experiment 3}: The teacher is initialized with ImageNet weights, and the self-supervised patch learning is used in addition.
\textbf{Experiment 4}: Same as Experiment 3, but k-means clustering is applied in the evaluation of the $\mathbf{A}_{map} $ anomaly map, as suggested by~\citet{Siddiquee2019}. Assuming that there is always one contiguous lesion, two clusters can be formed: one cluster for the healthy tissue and one cluster for the diseased tissue. Since only lesional images are used for this, no detection performance is given here (see Table~\ref{tbl:results}).

For all experiments, ACS convolution is used for the teacher and the student. The experiments aiming to use tumors on the BraTS dataset each use the T2 sequence, since the IXI~\cite{ixi} dataset only provides a T2 sequence and no FLAIR sequence. 
As a reference method for anomaly detection, we use the f-AnoGAN from \citet{pmid30831356}. The \mbox{f-AnoGAN} is an unsupervised method that detects anomalies both at the image level and at the pixel level. Originally, the f-AnoGAN can only process 2D images. For comparison, all 2D convolutional layers have been replaced by 3D convolutional layers, similar to~\citet{Simarro2020}.

\subsection{Preprocessing}
For comparability, all images are registered to the MNI-152 template~\cite{pmid32669554} and skull-stripped with a prefabricated mask to avoid hyperintensities. In addition, histogram standardization~\cite{Nyul2000} and Z-normalization are performed since the MRIs come from different sources (healthy from IXI dataset, diseased from BraTS dataset). The MRI scans are cropped to $ (156 \times 156 \times 156 ) $ after registration, and then scaled to $ (224 \times 224 \times 224) $. This is done to eliminate the large black borders around the MRIs that appear just after skull stripping.

\subsection{Dataset}
BraTS and IXI data sets are each split into training (70\%), validation (15\%), and test (15\%) data sets. For the training of the teacher, the same number of MRI scans are taken from the pool of training data of the BraTS and the IXI data set. All IXI images from the training pool are used for the training of the student. All BraTS images from the validation or test pool are used to evaluate the segmentation performance. Equal amounts of IXI and BraTS data from the validation and test pools are used to evaluate the classification performance. The segmentation map of the BraTS data set consists of several regions (no lesion - label 0; non-enhancing tumor core - label 1;  the peritumoral edema - label 2;  GD-enhancing tumor - label 4). To generate a binary segmentation map, all values greater than 0 are considered a lesion~\cite{Baid2021}.

\subsection{Implementation Details}
\label{sec:impldetails}
To keep the number of parameters as small as possible, a ResNet-18 was selected for the experiments. Further experiments with a ResNet-50 can be found in the appendix (see chapter~\ref{sec:appendix}). The teacher and the student are each trained for $ 64 $ epochs. In the validation, those weights of the student and teacher network were used where the AUROC metric is highest. Such thresholds were then applied to the independent test set. Stochastic Gradient Descent (SGD) with a learning rate of $0.1$ is used for the teacher. For the student, the learning rate is $0.5$. The batch size is 2 for teachers and students. 
The training parameters for f-AnoGAN are the same as in \citet{Simarro2020}.


\section{Results}
\begin{table}[t]%
    \begin{center}
    {
    \footnotesize
\begin{tabular}{|c|cccc|cc|ccc|}
\hline
     & \multicolumn{4}{c|}{\textbf{Configuration}}                                                                                                                                                                & \multicolumn{2}{c|}{\textbf{Detection}}                                                                                               & \multicolumn{3}{c|}{\textbf{Segmentation}}                                                                                                                                                                            \\ \hline
Exp. & \multicolumn{1}{c|}{Arch.}    & \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}Patch \\ Learning\end{tabular}} & \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}ImageNet\\ Weights\end{tabular}} & k-means & \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}AUROC\\ {[}\%{]}\end{tabular}} & \begin{tabular}[c]{@{}c@{}}AP\\ {[}\%{]}\end{tabular} & \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}AUROC\\ {[}\%{]}\end{tabular}} & \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}IOU\\ {[}\%{]}\end{tabular}} & \begin{tabular}[c]{@{}c@{}}DICE\\ {[}\%{]}\end{tabular} \\ \hline
     & \multicolumn{1}{c|}{f-AnoGAN}  & \multicolumn{1}{c|}{}                                                          & \multicolumn{1}{c|}{}                                                           &         & \multicolumn{1}{c|}{86.89}                                                    & 86.38                                                 & \multicolumn{1}{c|}{82.81}                                                    & \multicolumn{1}{c|}{8.29}                                                   & 14.98                                                   \\ \hline
1    & \multicolumn{1}{c|}{3D STFPM} & \multicolumn{1}{c|}{}                                                          & \multicolumn{1}{c|}{x}                                                          &         & \multicolumn{1}{c|}{62.81}                                                    & 60.37                                                 & \multicolumn{1}{c|}{45.83}                                                    & \multicolumn{1}{c|}{9.15}                                                   & 0.0                                                     \\ \hline
2    & \multicolumn{1}{c|}{3D STFPM} & \multicolumn{1}{c|}{x}                                                         & \multicolumn{1}{c|}{}                                                           &         & \multicolumn{1}{c|}{61.42}                                                    & 68.71                                                 & \multicolumn{1}{c|}{71.43}                                                    & \multicolumn{1}{c|}{6.52}                                                   & 12.14                                                   \\ \hline
3    & \multicolumn{1}{c|}{3D STFPM} & \multicolumn{1}{c|}{x}                                                         & \multicolumn{1}{c|}{x}                                                          &         & \multicolumn{1}{c|}{\textbf{94.13}}                                           & \textbf{94.38}                                        & \multicolumn{1}{c|}{\textbf{89.22}}                                           & \multicolumn{1}{c|}{\textbf{12.09}}                                         & \textbf{20.98}                                          \\ \hline
4    & \multicolumn{1}{c|}{3D STFPM} & \multicolumn{1}{c|}{x}                                                         & \multicolumn{1}{c|}{x}                                                          & x       & \multicolumn{1}{c|}{-}                                                        & -                                                     & \multicolumn{1}{c|}{59.72}                                                    & \multicolumn{1}{c|}{\textbf{17.69}}                                         & \textbf{28.17}                                          \\ \hline
\end{tabular}}
\end{center}
\caption{Results of the experiments on the BraTS data set. 3D STFPM with a ResNet-18 as a backbone and with pre-trained ImageNet weights, using ACS convolution, and 3D self-supervised patch learning beats f-AnoGAN.}
\label{tbl:results}
\end{table}

To evaluate performance, we followed common standards and used the area under the receiver operating characteristic curve (AUROC) and Average Precision (AP). In addition, intersection over union and the Dice coefficient were calculated to assess segments detected at pixel level.

\paragraph{Metrics} Table~\ref{tbl:results} shows the results compared to the f-AnoGAN. It should be noted that the method with ACS convolution, ImageNet initialization and 3D self-supervised patch learning achieves the best result, both on the image level and on the pixel level. In addition, one can see that with each additional piece of information that is entered into the training (ImageNet weights, patches for the teacher, and k-means clustering), the performance on the independent test data set increases.

In the original paper from \citet{Wang2021}, the teacher is only initialized with \mbox{ImageNet} weights. One can clearly see that 3D patch learning provides domain-specific knowledge. In addition, the ImageNet-weight information improves the result again. However, it should also be noted that the evaluation of the intersection over union delivers a poor result. This is because many false positives are produced and no supervised method is used for correction.

\paragraph{Parameters} The f-AnoGAN with its 3D convolutional layers requires a few more parameters during training. Generator, discriminator and encoder come to 55,029,542 parameters. Therefore, the network can only process $ ( 64 \times 64 \times 64 ) $ voxel images.
In contrast, the Student Teacher approach requires only 22,346,752 parameters (11,173,376 each). 

\paragraph{Training-Convergence}
Through early stopping, the optimal AUROC value for detection was achieved after 12 epochs.

\subsection{3D Self-Supervised Patch Learning}
\begin{figure} % pure_emb_1_160 von version_11195
    \centering
    \subfigure[\small Input Image $ \mathbf{I} $]{
        %\label{fig:a}
        \includegraphics[width=0.175\textwidth]{./images/patch_1.png}
    }
    \hfill
    \subfigure[\small Pseudo-RGB]{
        %\label{fig:a}
        \includegraphics[width=0.175\textwidth]{./images/patch_2.png}
    }
    \hfill
    \subfigure[\small Emb. Large]{
        %\label{fig:a}
        \includegraphics[width=0.175\textwidth]{./images/patch_3.png}
    }
    \hfill
    \subfigure[\small Emb. Medium]{
        %\label{fig:a}
        \includegraphics[width=0.175\textwidth]{./images/patch_4.png}
    }
    \hfill
    \subfigure[\small Emb. Small]{
        %\label{fig:a}
        \includegraphics[width=0.175\textwidth]{./images/patch_5.png}
    }
    \caption{Euclidean distances are preserved both in the pseudo-RGB image and in the feature embedding layers after each ResNet block. These are marked here with Large, Medium and Small for the 1st, 2nd and 3rd ResNet block, respectively.}
    \label{fig:pseudo}
\end{figure}
Through self-supervised patch learning, the teacher learns to replicate the input images exactly in its intermediate layers. The network can thus represent an input image spatially well in the feature space. Pseudo-RGB images can be created by dimension reduction over the multiplied feature embedding layers, to visualize that the Euclidean distances between the individual patches are maintained in the embedding space~\cite{Danon2018}. Looking at the teacher's summed output layers in Figure~\ref{fig:pseudo} after self-supervised patch learning, one can also see that the network is quite good at tracking healthy and diseased tissue. The pseudo-RGB image, which uses dimension reduction via PCA, also suggests that Euclidean distances are preserved~\cite{Danon2018}.


\subsection{Anomaly Maps}
\begin{figure*} % pure_emb_0_170 von version_11198
    \centering
    \subfigure{
        \includegraphics[width=0.11\textwidth]{./images/ano_1.png}
    }
    \hfill
    \subfigure{
        \includegraphics[width=0.11\textwidth]{./images/ano_2.png}
    }
    \hfill
    \subfigure{
        \includegraphics[width=0.11\textwidth]{./images/ano_3.png}
    }
    \hfill
    \subfigure{
        \includegraphics[width=0.11\textwidth]{./images/ano_4.png}
    }
    \hfill
    \subfigure{
        \includegraphics[width=0.11\textwidth]{./images/ano_5.png}
    }
    \hfill
    \subfigure{
        \includegraphics[width=0.11\textwidth]{./images/ano_6.png}
    }
    \hfill
    \subfigure{
        \includegraphics[width=0.11\textwidth]{./images/ano_7.png}
    }
    \caption{From left to right: Input image $ \mathbf{I} $ as 2D slice from sagittal orientation of a BraTS T2 scan, anomaly map after ResNet block 1, anomaly map after ResNet block 2, anomaly map after ResNet block 3, generated anomaly map $ \mathbf{A}_{map} $, anomaly map $ \mathbf{A}_{map} $ placed on input image $ \mathbf{I} $, ground truth of the BraTS sequence.}
    \label{fig:ano}
\end{figure*}
After the first three ResNet blocks, an anomaly map can be created by multiplying the teacher and the student. Figure~\ref{fig:ano} shows these color-encoded maps belonging to the input image on the left, as well as the intermediate anomaly maps on the different scales. The color-encoding is used to indicate the relative level of anomaly, where blue areas encode low differences and yellow areas encode the highest differences found.
The BraTS dataset contains ground truth labels, which encode where abnormal tissue is in the image. When comparing the resulting anomaly map from the network to the ground truth map, one finds that these are similar, suggesting that the network can detect pathological brain formations.

\section{Conclusion}
We presented a framework for anomaly detection of 3D MRI scans that uses axial-coronal-sagittal convolution to use ImageNet pretrained networks and can simultaneously process a 3D input.
With the newly introduced 3D self-supervised patch learning for the teacher, a broad knowledge of healthy and diseased tissue is taught. Together with the student's 3D training, state-of-the-art performance is achieved.
Large to medium-sized lesions can be well identified thanks to the top-down approach that can process high-resolution MRI scans. In addition, the framework presented scores with short training times and can therefore be used flexibly.

\newpage

\midlacknowledgments{This research was funded in part by the humAIne project, funded by the German Ministry of Science and Education (FKZ 02L19C203). We would like to thank Lena Will, radiologist at the University Hospital Knappschaftskrankenhaus Bochum, for the enriching discussion on the subject of 3D MRI data.

\noindent We thank Sven Kreienbrock and Tobias Erm for technical assistance.}

\newpage

\bibliography{midl24_205}

\newpage 


\appendix

\section{Supplementary Material}
\label{sec:appendix}
\paragraph{3D images}:
Anomaly image after applying our 3D patch-based student-teacher framework (see figure \ref{fig:applano}).
\begin{figure*}[h] % pure_emb_0_170 von version_11198
    \centering
    \subfigure{
        \includegraphics[width=0.11\textwidth]{./images/ano_s_1.png}
    }
    \hfill
    \subfigure{
        \includegraphics[width=0.11\textwidth]{./images/ano_s_2.png}
    }
    \hfill
    \subfigure{
        \includegraphics[width=0.11\textwidth]{./images/ano_s_3.png}
    }
    \hfill
    \subfigure{
        \includegraphics[width=0.11\textwidth]{./images/ano_s_4.png}
    }
    \hfill
    \subfigure{
        \includegraphics[width=0.11\textwidth]{./images/ano_s_5.png}
    }
    \hfill
    \subfigure{
        \includegraphics[width=0.11\textwidth]{./images/ano_s_6.png}
    }
    \hfill
    \subfigure{
        \includegraphics[width=0.11\textwidth]{./images/ano_s_7.png}
    }
    \\
    \subfigure{
        \includegraphics[width=0.11\textwidth]{./images/ano_c_1.png}
    }
    \hfill
    \subfigure{
        \includegraphics[width=0.11\textwidth]{./images/ano_c_2.png}
    }
    \hfill
    \subfigure{
        \includegraphics[width=0.11\textwidth]{./images/ano_c_3.png}
    }
    \hfill
    \subfigure{
        \includegraphics[width=0.11\textwidth]{./images/ano_c_4.png}
    }
    \hfill
    \subfigure{
        \includegraphics[width=0.11\textwidth]{./images/ano_c_5.png}
    }
    \hfill
    \subfigure{
        \includegraphics[width=0.11\textwidth]{./images/ano_c_6.png}
    }
    \hfill
    \subfigure{
        \includegraphics[width=0.11\textwidth]{./images/ano_c_7.png}
    }
    \\
    \subfigure{
        \includegraphics[width=0.11\textwidth]{./images/ano_a_1.png}
    }
    \hfill
    \subfigure{
        \includegraphics[width=0.11\textwidth]{./images/ano_a_2.png}
    }
    \hfill
    \subfigure{
        \includegraphics[width=0.11\textwidth]{./images/ano_a_3.png}
    }
    \hfill
    \subfigure{
        \includegraphics[width=0.11\textwidth]{./images/ano_a_4.png}
    }
    \hfill
    \subfigure{
        \includegraphics[width=0.11\textwidth]{./images/ano_a_5.png}
    }
    \hfill
    \subfigure{
        \includegraphics[width=0.11\textwidth]{./images/ano_a_6.png}
    }
    \hfill
    \subfigure{
        \includegraphics[width=0.11\textwidth]{./images/ano_a_7.png}
    }
    \caption{From top to bottom: sagittal, coronal, and axial orientation. In each row, from left to right: input image $ \mathbf{I} $ as a 2D slice of a BraTS T2 scan, anomaly map after ResNet block 1, anomaly map after ResNet block 2, anomaly map after ResNet block 3, generated anomaly map $ \mathbf{A}_{map} $, anomaly map $ \mathbf{A}_{map} $ overlaid on the input image $ \mathbf{I} $, and ground truth of the BraTS sequence.}
    \label{fig:applano}
\end{figure*}

\paragraph{ResNet versions}:
We tested different ResNet versions using ImageNet weights and 3D SSPL without k-means clustering. The results are listed in Table~\ref{tbl:resnet}. In each case, the same learning rates were set for the teacher and for the student (see Section~\ref{sec:impldetails}).
\begin{table}[h]
    \begin{center}
    {
    \footnotesize
\begin{tabular}{|c|cc|ccc|}
\hline
\textbf{Configuration}                                   & \multicolumn{2}{c|}{\textbf{Detection}}                                                                                                    & \multicolumn{3}{c|}{\textbf{Segmentation}}                                                                                                                                                                            \\ \hline
\begin{tabular}[c]{@{}c@{}}ResNet\\ Version\end{tabular} & \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}AUROC\\ {[}\%{]}\end{tabular}} & \begin{tabular}[c]{@{}c@{}}AVGPREC\\ {[}\%{]}\end{tabular} & \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}AUROC\\ {[}\%{]}\end{tabular}} & \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}IOU\\ {[}\%{]}\end{tabular}} & \begin{tabular}[c]{@{}c@{}}DICE\\ {[}\%{]}\end{tabular} \\ \hline
50                                                       & \multicolumn{1}{c|}{66.90}                                                    & 61.08                                                      & \multicolumn{1}{c|}{81.95}                                                    & \multicolumn{1}{c|}{7.73}                                                   & 13.95                                                   \\ \hline
34                                                       & \multicolumn{1}{c|}{63.61}                                                    & 67.03                                                      & \multicolumn{1}{c|}{\textbf{91.92}}                                           & \multicolumn{1}{c|}{\textbf{16.37}}                                         & \textbf{26.92}                                          \\ \hline
18                                                       & \multicolumn{1}{c|}{\textbf{94.24}}                                           & \textbf{94.68}                                             & \multicolumn{1}{c|}{\textbf{90.08}}                                           & \multicolumn{1}{c|}{\textbf{12.22}}                                         & \textbf{21.13}                                          \\ \hline
\end{tabular}}
\end{center}
\caption{Experiments on the BraTS dataset with ImageNet pre-trained weights and 3D self-supervised patch learning. ResNet-18 and ResNet-34 perform best, but ResNet-18 can be trained faster.}
\label{tbl:resnet}
\end{table}

\paragraph{System and runtime}: 
All training was conducted on a computer with an NVIDIA RTX 4090 graphics card (24\,GB), an Intel Core i7-6800K CPU, and 64\,GB of memory.

Student-Teacher runtime: 7 minutes per epoch are required to train the teacher. The student is trained in 3 minutes per epoch. For both networks, training is performed over 64 epochs.

f-AnoGAN runtime: On the same system, with the same data loader, the f-AnoGAN requires 3 minutes per epoch. The encoder is also trained in 3 minutes per epoch. As suggested by~\cite{Simarro2020}, training was run for more than 200 epochs.

\end{document}
