\documentclass{midl} % Include author names
% \documentclass[anon]{midl} % Anonymized submission

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{booktabs}
\usepackage{arydshln}
\usepackage{multirow}
\usepackage{graphicx}

\makeatletter
% \adl@drawiv: alternative dash-drawing routine that \cdashlinelr (below)
% temporarily installs in place of \adl@draw. It repeats a box built with #3
% via \xleaders and puts .5\tabcolsep of horizontal space before and after
% the repeated pattern.
%   #1  command invoked as #1{1} inside the repeated box
%   #2  skip command applied to .5\@tempdimb on both sides of #1 and to the
%       stretchable/shrinkable leader glue (\z@ plus1fil minus1fil)
%   #3  box command handed to \xleaders
% NOTE(review): the argument order presumably mirrors arydshln's internal
% \adl@draw (rule, skip, box) and the .5\tabcolsep padding is presumably meant
% to inset the dashes like a trimmed \cmidrule(lr) -- confirm against the
% arydshln source.
\def\adl@drawiv#1#2#3{%
        \hskip.5\tabcolsep
        \xleaders#3{#2.5\@tempdimb #1{1}#2.5\@tempdimb}%
                #2\z@ plus1fil minus1fil\relax
        \hskip.5\tabcolsep}
% \cdashlinelr{<column range>}: wrapper around \cdashline{<column range>} that
% adds \aboverulesep of vertical space above and \belowrulesep below the
% dashed line. Before the line is drawn, \adl@draw is globally redirected to
% \adl@drawiv; the previous definition is saved in \@dashdrawstore and
% globally restored afterwards.
\newcommand{\cdashlinelr}[1]{%
  \noalign{\vskip\aboverulesep
           \global\let\@dashdrawstore\adl@draw
           \global\let\adl@draw\adl@drawiv}
  \cdashline{#1}
  \noalign{\global\let\adl@draw\@dashdrawstore
           \vskip\belowrulesep}}
\makeatother


\jmlryear{2024}\jmlrworkshop{Full Paper -- MIDL 2024}\jmlrvolume{-- nnn}\editors{Accepted for publication at MIDL 2024}
% \editors{Accepted at MIDL 2024}

\title[UAD with Supervision]{Combining Reconstruction-based Unsupervised Anomaly Detection with Supervised Segmentation for Brain MRIs}
%{Improving Generalization of Self-Supervised Anomaly Detection using Diffusion Models}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicated cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Finn Behrendt\nametag{$^{1}$}} \Email{finn.behrendt@tuhh.de}\\
\Name{Debayan Bhattacharya\nametag{$^{1}$}} \Email{debayan.bhattacharya@tuhh.de}\\
\Name{Lennart Maack\nametag{$^{1}$}} \Email{lennart.maack@tuhh.de}\\
\Name{Julia Krüger\nametag{$^{2}$}} \Email{julia.krueger@jung-diagnostics.de}\\
\Name{Roland Opfer\nametag{$^{2}$}} \Email{roland.opfer@jung-diagnostics.de}\\
\Name{Alexander Schlaefer\nametag{$^{1}$}} \Email{schlaefer@tuhh.de}\\
\addr $^{1}$ Institute of Medical Technology and Intelligent Systems, Hamburg University of Technology,  Hamburg, Germany \AND
\addr $^{2}$ Jung Diagnostics GmbH, Hamburg, Germany
}


\begin{document}

\maketitle

\begin{abstract}
In contrast to supervised deep learning approaches, unsupervised anomaly detection (UAD) methods can be trained with healthy data only and do not require pixel-level annotations, enabling the identification of unseen pathologies. While this is promising for clinical screening tasks, reconstruction-based UAD methods fall short in segmentation accuracy compared to supervised models. Therefore, self-supervised UAD approaches have been proposed to improve segmentation accuracy. Typically, synthetic anomalies are used to train a segmentation network in a supervised fashion. However, this approach does not effectively generalize to real pathologies.
We propose a framework combining reconstruction-based and self-supervised UAD methods to improve both segmentation performance for known anomalies and generalization to unknown pathologies. The framework includes an unsupervised diffusion model trained on healthy data to produce pseudo-healthy reconstructions and a supervised Unet trained to delineate anomalies from deviations between input-reconstruction pairs. 
Besides the effective use of synthetic training data, this framework allows for weakly-supervised training with small annotated data sets, generalizing to unseen pathologies. Our results show that with our approach, utilizing annotated data sets during training can substantially improve the segmentation performance for in-domain data while maintaining the generalizability of reconstruction-based approaches to pathologies unseen during training.
\end{abstract}

\begin{keywords}
Unsupervised Anomaly Detection, Diffusion Models, Brain MRI, Self Supervision, Weak Supervision
\end{keywords}

\section{Introduction}
Deep learning (DL) methods have advanced in their ability to detect and segment brain pathologies in MRI images \cite{Lundervold.2019}. However, acquiring annotated data for each pathology is a challenge, especially when considering screening tasks, where the objective is to detect any potential anomaly.\\ 
% Reco-based UAD
Unsupervised anomaly detection (UAD) provides a potential solution by modeling the distribution of healthy brain MRI scans to identify anomalies as outliers. A common technique in UAD is reconstruction-based anomaly detection, where generative models (GM) are trained to reconstruct healthy brain images. At test time, the GMs fail to replicate pathologies, thereby revealing anomalies through discrepancies between input and reconstruction. This method only necessitates healthy data and enables the identification of pathologies not encountered during training, which poses a challenge for supervised models.
However, the performance of reconstruction-based UAD methods is often surpassed by supervised models when sufficient task-specific data is available \cite{Chen.2020,Baur.2021}. Unlike supervised methods, UAD methods that rely on reconstructions do not directly learn the relationship between abnormal patterns and their corresponding annotations. Instead, the segmentation map is a byproduct of measuring the discrepancy between input and reconstruction. This results in a noisy anomaly map with potential false positives caused by the GM’s imperfect reconstructions. Consequently, distinguishing actual anomalies from normal reconstruction errors can be challenging. 
An alternative approach is self-supervised UAD, where synthetic anomalies are introduced to the healthy brain images to train a segmentation network in a supervised manner. Unlike reconstruction-based UAD, this strategy produces distinct anomaly maps with high specificity, simplifying the discrimination of abnormal structures similar to the synthesized anomalies. However, the segmentation performance depends on the nature of the generated anomalies and tends to have limited generalization to real pathologies \cite{Lagogiannis.2023,Cai.2023}. \\
In this study, we aim to combine the strong generalization capabilities and high sensitivity of reconstruction-based methods with the high specificity of self-supervised methods. We develop a framework that employs a denoising diffusion probabilistic model (DDPM; DM) to generate pseudo-healthy reconstructions of potentially abnormal input images (reconstruction branch). Furthermore, a Unet is trained to segment anomalies based on the residual of the input and the pseudo-healthy reconstruction (segmentation branch). 
We consider different settings to obtain the annotations for the supervised training of the Unet. First, in the self-supervised setting, we introduce synthetically generated anomalies to healthy brain MRIs. Second, in the weakly-supervised setting, we utilize a small amount of annotated data containing real pathologies. 
At test time, the unsupervised anomaly maps from the reconstruction branch and the supervised predictions from the segmentation branch are fused to a final anomaly score. \\
The results demonstrate that, in contrast to self-supervised methods, our approach allows integrating supervision while maintaining the generalizability of the underlying reconstruction branch. Specifically, we can improve the Dice score of reconstruction-based UAD methods from 58.55 \% to 69.68 \% for tumors when using the same pathologies for training, while the Dice score for stroke lesions unseen during training increases from 24.74 \% to 26.77 \%.

\section{Related Work}
For reconstruction-based UAD, different architectures have been proposed as GM. While the majority focuses on autoencoders (AE) \cite{Baur.2021b} or variational autoencoders (VAE) \cite{Zimmerer.2019,Chen.2020,Bercea.2023MIDL,Bercea.2023MICa}, vector-quantized VAEs \cite{Pinaya.2022} and GANs \cite{Nguyen.2021} have also been employed. Moreover, it has been shown that utilizing denoising tasks for regularization with Unet-like AEs can improve the UAD performance \cite{Kascenas.2022b,Kascenas.2023}. Consequently, DDPMs have emerged as a GM for reconstruction-based UAD \cite{Wyatt.2022,Behrendt.2023b,Behrendt.2023c,Bercea.2023IMHL}. 
In self-supervised UAD, typically, synthetic anomalies are incorporated into normal brain images. Subsequently, Unets are trained to segment these synthetic anomalies \cite{Tan.2021,Tan.2022,Cho.2022,Meissen.2022}. We note that while AE-based reconstruction methods may also fall under the category of self-supervised techniques, within this work, the term ``self-supervised'' refers to the aforementioned approach of training segmentation models using synthetic anomalies.
Expanding on this strategy, DRAEM \cite{Zavrtanik.2021} employs a dual-network architecture comprising a generator and a segmentation network. The generator is trained to eliminate synthetic anomalies, thereby providing a pseudo-healthy reconstruction. The segmentation network is then used to segment the generated anomalies, given the concatenation of abnormal input and pseudo-healthy reconstruction. Note that for the generator network in DRAEM, inpainting of synthetic anomalies is enforced by calculating the reconstruction loss between reconstruction and the anomaly-free input.
In contrast, in our approach, the reconstruction model is trained on healthy data in an unsupervised fashion to remove any abnormal structure that is not part of the healthy training distribution. Hence, we expect this approach to generalize more readily to real pathologies.
The authors of \cite{Liu.2022} take a similar approach, aiming to improve supervised segmentation performance by augmenting a dual-branch Unet with pseudo-healthy reconstructions. These reconstructions are generated by a Soft-Intro VAE trained on healthy data.
In contrast, our proposed framework does not solely depend on supervised predictions. Instead, these predictions are combined with the unsupervised anomaly scores derived from reconstructions of a DM. We hypothesize that this combination enables general anomaly detection, particularly for pathologies unseen during training.


\begin{figure}[h]
    \centering
    \includegraphics[width=.7\columnwidth]{MIDL24_ga.pdf}
    \caption{Schematic drawing of SADM. In Stage I, $F_{\theta}^{DM}$ is trained to reconstruct healthy brain images. In Stage II, the parameters $\theta$ are fixed, and the segmentation network $F_{\phi}^{seg}$ is trained, either on synthetic anomalies (self-supervised) or real pathologies (weakly-supervised). At test time, the supervised prediction $\hat{\dot{y}}$ and the unsupervised anomaly map $\tilde{x}$ are combined into the final anomaly score (AS).}
    \label{fig:overview}
\end{figure}

\section{Method}
In this section, we introduce our framework for supervised anomaly detection with DMs (SADM), detailed schematically in Figure \ref{fig:overview}.
\subsection{Supervised Anomaly Detection with Diffusion Models (SADM)}
SADM integrates two primary branches: a DM for generating pseudo-healthy reconstructions (reconstruction branch) and a supervised Unet for segmentation (segmentation branch). We train SADM in two sequential stages.
\subsubsection*{Stage I: Unsupervised Reconstruction}
In the first stage, our objective is to train the DM to reconstruct healthy brain scans $\bm{\hat{x}} = F^{DM}_{\theta}(\bm{x})$ where $\bm{x} \in \mathbb{R}^{H\times W}$. The training of the DM focuses on optimizing parameters $\theta$ to minimize the $\ell_1$ reconstruction loss:
\begin{equation}
\mathcal{L}_{Rec}=\lVert\bm{x}-\bm{\hat{x}}\rVert_{1}.
\end{equation}
\subsubsection*{Stage II: Supervised Segmentation}
In the second stage, the pseudo-healthy reconstruction generated by the DM trained in Stage I is utilized to support anomaly segmentation. Given an input scan with a real or synthetic anomaly $\bm{\dot{x}} \in \mathbb{R}^{H\times W}$ and its corresponding ground truth annotation $\bm{\dot{y}} \in \mathbb{R}^{H\times W}$, we utilize the DM trained in Stage I to generate the pseudo-healthy reconstruction $\bm{\hat{\dot{x}}} = F^{DM}_{\theta}(\bm{\dot{x}})$. Next, we feed the residual $(\bm{\dot{x}} - \bm{\hat{\dot{x}}})$ into a Unet that predicts the segmentation map $\bm{\hat{\dot{y}}} = F^{seg}_{\phi}(\bm{\dot{x}}-\bm{\hat{\dot{x}}})$. The parameters $\phi$ are optimized to minimize the cross-entropy (CE) segmentation loss 
\begin{equation}
\mathcal{L}_{Seg}=CE(\bm{\hat{\dot{y}}},\bm{\dot{y}})
\end{equation}
while the parameters $\theta$ are frozen during stage II.
\subsubsection*{Anomaly Detection}
The anomaly detection process leverages both components of our framework for anomaly segmentation.
Given a potentially abnormal input $\bm{\dot{x}}$, we generate a reconstruction $\bm{\hat{\dot{x}}} = F^{DM}_{\theta}(\bm{\dot{x}})$ by the DM. Next, we utilize $F^{seg}_{\phi}$ to derive the supervised anomaly prediction $\bm{\hat{\dot{y}}} = F^{seg}_{\phi}(\bm{\dot{x}}-\bm{\hat{\dot{x}}})$. In addition, we utilize the pixel-wise structural similarity (SSIM \cite{Wang.2004}) between input and reconstruction $\bm{\tilde{x}}=1 - \mathrm{SSIM}(\bm{\dot{x}},\bm{\hat{\dot{x}}})$ for unsupervised anomaly scoring. The anomaly score (AS) is then derived as a combination of the unsupervised anomaly map and supervised anomaly prediction 
\begin{equation}
\text{Anomaly Score (AS)} = \bm{\tilde{x}} + \bm{\hat{\dot{y}}}. 
\end{equation}
For pathologies similar to the anomalies seen during training, the supervised anomaly prediction will feature higher probabilities in abnormal regions, refining the unsupervised anomaly map. For unseen pathologies, the predicted probabilities are low such that  $\bm{\tilde{x}}$ is unaltered. We hypothesize that this combination allows for comprehensive anomaly detection, leveraging the unsupervised anomaly map for general anomaly identification and the supervised prediction for precise segmentation of known abnormal patterns.

\section{Experimental Setup}
\subsection{Data}
We use T1-weighted MRIs from the IXI data set to train the DM in Stage I. We separate a healthy test set consisting of 160 samples. The remaining data is partitioned into five training sets (N=358) and validation sets (N=44) for cross-validation.  
In Stage II, we utilize the strategy applied in \cite{Zavrtanik.2021} to generate pairs of synthetic anomalies and ground truth annotation based on the IXI data set (DRAEM). Additional information about the generation process and exemplary anomalies are provided in Appendix \ref{app:synano}.
Additionally, for the weakly supervised setting, we utilize small subsets containing approximately 10\% of the BraTS21 (BRATS, N=1251) \cite{Baid.2021,Bakas.2017,Menze.2014}, and ATLAS-v2.0 (ATLAS, N=655) \cite{Liew.2022} data sets.
For evaluation, we utilize the remaining 1151 and 589 samples of the BRATS and ATLAS data sets, respectively. Furthermore, we utilize the augmented IXI test set (DRAEM) to assess the segmentation performance concerning synthetic anomalies.  \\
\textbf{Pre- and post-processing: }We resample all T1 MRI scans to a resolution of  $[1 \times 1 \times 1]$ mm and register them to the SRI24-Atlas \cite{Rohlfing.2010}. Subsequently, we perform skull-stripping using HD-BET \cite{Isensee.2019} leading to volumes of size $[192 \times 192 \times 160]$ voxels. Finally, we apply bias-field corrections, reduce the resolution by a factor of two and crop 15 top and bottom slices in the transverse plane. For post-processing, we apply median filtering with a kernel size of 5 to the unsupervised anomaly maps.
\subsection{Implementation Details}
We utilize DMs as GM within our proposed framework to generate pseudo-healthy reconstructions\footnote{Code available at\\ \url{https://github.com/FinnBehrendt/Supervised-Anomaly-Detection-with-Diffusion-Models}}. Specifically, we use conditioned DDPMs (cDDPM) following the implementation of \cite{Behrendt.2023c}.  
For the supervised segmentation of the residual image, we utilize a Unet \cite{Ronneberger.2015} like architecture, adapted from \cite{Kascenas.2022b}.
The volumes are processed in a slice-wise fashion, sampling slices uniformly during training. At test time, we reconstruct the full volume by iterating over all slices. 
We compare our framework against different established baselines. We compare reconstruction-based AEs and VAEs \cite{Baur.2021b}, FAEs \cite{Meissen.2022d}, DDPMs \cite{Wyatt.2022},  pDDPMs \cite{Behrendt.2023b} and cDDPMs \cite{Behrendt.2023c}. Furthermore, we compare the feature-based reverse distillation method (RD) \cite{Deng.2022}, the self-supervised Poisson image interpolation (PII) \cite{Tan.2021} and DRAEM-Net \cite{Zavrtanik.2021} approaches. Note that for PII we perform the anomaly generation based on the IXI data set. For all reconstruction-based methods, we utilize SSIM for anomaly scoring with a Gaussian kernel with standard deviation of $\sigma_{ssim}=1$, leading to a window size of $k_{ssim}=9$. 
Implementation details of our proposed framework and compared baselines are provided in Appendix \ref{app:impl}.  
\section{Experiments}
For all our experiments, we evaluate on the BRATS and ATLAS data sets containing real pathologies and the IXI data set augmented with synthetic anomalies (DRAEM). We report the mean $\pm$ standard deviation across the different folds for the best possible Dice score ($\lceil$DICE$\rceil$) as well as the Area Under the Precision-Recall Curve (AUPRC) to assess the segmentation performance. We evaluate different variants of SADM. In SADM$_{res}$, the residual of input and reconstruction is fed to the Unet, whereas in SADM, only the (abnormal) input is used. Furthermore, we consider Unet and Unet$_{res}$, where, in contrast to SADM, only the prediction of the supervised segmentation branch is used, ignoring the anomaly map of the unsupervised reconstruction branch. In Appendix \ref{app:beta}, we provide an ablation study on the weighted combination of the segmentation and reconstruction branch.

\begin{table}[t]
\centering
     \resizebox{\textwidth}{!}{
    \begin{tabular}{llcccccccc}
        \toprule
         && \multicolumn{2}{c}{\textbf{Training Data}}& \multicolumn{6}{c}{\textbf{Test Data}}\\
         \cmidrule(l){3-4} \cmidrule(l){5-10}
         &\multirow{2}{*}{\textbf{Model}}&\multirow{2}{*}{$\mathcal{D}_{healthy}$}&\multirow{2}{*}{$\mathcal{D}_{unhealthy}$} & \multicolumn{2}{c}{\textbf{BRATS} (real)}  & \multicolumn{2}{c}{\textbf{ATLAS} (real)} & \multicolumn{2}{c}{\textbf{DRAEM} (synthetic)} \\
         \cmidrule(l){5-6} \cmidrule(l){7-8} \cmidrule(l){9-10}  
        & & &  & $\lceil$\textbf{DICE}$\rceil$ &  \textbf{AUPRC} & $\lceil$\textbf{DICE}$\rceil$ & \textbf{AUPRC} & $\lceil$\textbf{DICE}$\rceil$ & \textbf{AUPRC} \\
        \midrule

        \multirow{8}{*}{\rotatebox{90}{\footnotesize{I. Unsupervised}}}
        & AE & IXI & None &     39.16 $\pm$ 0.64 &  35.95 $\pm$ 0.70 &    14.14 $\pm$ 0.28 & 11.84 $\pm$ 0.37 &          9.91 $\pm$ 0.04 &       5.27 $\pm$ 0.04 \\
          &VAE & IXI & None &   39.25 $\pm$ 0.50 & 36.07 $\pm$ 0.56 &    14.52 $\pm$ 0.37 & 12.18 $\pm$ 0.39 &          9.83 $\pm$ 0.14 &       5.28 $\pm$ 0.08 \\
           &DAE & IXI & None & 55.93 $\pm$ 0.66 & 56.42 $\pm$ 0.84 &    19.95 $\pm$ 0.96 & 18.18 $\pm$ 0.98 &          12.50 $\pm$ 0.31 &        7.50 $\pm$ 0.22 \\
          &FAE & IXI & None & 43.04 $\pm$ 0.49 & 42.04 $\pm$ 0.41 &    17.59 $\pm$ 0.15 &  13.91 $\pm$ 0.10 &          \textbf{19.60 $\pm$ 0.49} &      \textbf{13.68 $\pm$ 0.25} \\
          & RD & IXI & None & 32.90 $\pm$ 0.65 & 28.31 $\pm$ 0.86 &    19.45 $\pm$ 0.25 &  15.51 $\pm$ 0.20 &          19.55 $\pm$ 0.60 &      13.17 $\pm$ 0.61 \\
         &DDPM & IXI & None &  48.65 $\pm$ 0.90 & 46.93 $\pm$ 1.02 &    17.86 $\pm$ 0.87 &   14.70 $\pm$ 0.70 &         10.37 $\pm$ 0.23 &       6.04 $\pm$ 0.27 \\
         &pDDPM & IXI & None &55.93 $\pm$ 0.28 & 55.44 $\pm$ 0.36 &     21.79 $\pm$ 0.40 & 19.12 $\pm$ 0.43 &         14.59 $\pm$ 0.47 &       9.27 $\pm$ 0.31 \\
        &cDDPM & IXI & None &\textbf{58.55 $\pm$ 0.78} &\textbf{ 59.09 $\pm$ 0.91} &    \textbf{24.74 $\pm$ 1.15} & \textbf{21.76 $\pm$ 0.98 }&         11.94 $\pm$ 0.52 &       7.31 $\pm$ 0.43 \\

        \midrule
        \multirow{6}{*}{\rotatebox{90}{\footnotesize{II. Self-Supervised}}}
        & PII  & None & PII &   30.38 $\pm$ 2.46 & 24.66 $\pm$ 2.54 &     9.81 $\pm$ 1.93 &  7.31 $\pm$ 1.64 &         23.44 $\pm$ 1.61 &      15.09 $\pm$ 0.97 \\
        &DRAEM-Net & None & DRAEM &   24.78 $\pm$ 4.21 & 18.49 $\pm$ 4.05 &     12.65 $\pm$ 1.90 &  9.51 $\pm$ 1.75 &        \textbf{ 79.77 $\pm$ 2.37} &      \textbf{83.39 $\pm$ 2.34} \\
        &Unet & None & DRAEM & 40.75 $\pm$ 3.30 & 37.64 $\pm$ 3.92 &    16.91 $\pm$ 0.38 & 15.25 $\pm$ 0.26 &         76.03 $\pm$ 1.21 &       80.30 $\pm$ 1.32 \\
        &Unet$_{res}$ & IXI & DRAEM &45.80 $\pm$ 3.22 & 44.05 $\pm$ 4.09 &    18.44 $\pm$ 0.47 & 16.81 $\pm$ 0.44 &         77.43 $\pm$ 1.16 &      81.93 $\pm$ 1.23 \\
        &SADM & IXI & DRAEM & 50.81 $\pm$ 0.57 & 49.81 $\pm$ 0.81 &    23.82 $\pm$ 0.32 & 20.71 $\pm$ 0.35 &          73.77 $\pm$ 2.50 &      71.85 $\pm$ 3.02 \\
        &SADM$_{res}$ & IXI & DRAEM & \textbf{60.53 $\pm$ 0.54} & \textbf{60.27 $\pm$ 1.02} &    \textbf{27.78 $\pm$ 0.14} & \textbf{24.57 $\pm$ 0.13} &          76.72 $\pm$ 1.30 &      75.45 $\pm$ 1.96 \\

        \midrule
        \multirow{8}{*}{\rotatebox{90}{\footnotesize{III. Weakly-Supervised}}}
        &Unet & None & BRATS & 64.81 $\pm$ 0.21 & 69.24 $\pm$ 0.33 &     11.82 $\pm$ 0.60 & 10.32 $\pm$ 0.61 &         \textbf{ 24.83 $\pm$ 1.10} &      \textbf{20.96 $\pm$ 1.46} \\
        &Unet$_{res}$ & IXI & BRATS & 67.01 $\pm$ 0.70 &  71.80 $\pm$ 0.87 &    17.33 $\pm$ 1.31 &  15.55 $\pm$ 1.50 &          19.93 $\pm$ 2.40 &      16.41 $\pm$ 2.64 \\
        &SADM & IXI & BRATS   & 69.01 $\pm$ 0.21 & 72.62 $\pm$ 0.46 &    25.25 $\pm$ 0.58 &  21.03 $\pm$ 0.50 &         14.93 $\pm$ 0.51 &      11.65 $\pm$ 0.66 \\
        &SADM$_{res}$ & IXI & BRATS &\textbf{ 69.68 $\pm$ 0.48} & \textbf{73.34 $\pm$ 0.85} &   \textbf{ 26.77 $\pm$ 0.65} & \textbf{23.22 $\pm$ 0.86} &         17.11 $\pm$ 1.78 &      14.47 $\pm$ 1.91 \\ 

        \cdashlinelr{2-10}
        &Unet & None & ATLAS &   35.13 $\pm$ 2.97 & 32.87 $\pm$ 3.07 &     46.30 $\pm$ 0.72 & 46.37 $\pm$ 0.73 &  \textbf{ 29.11 $\pm$ 1.02} &      \textbf{24.55 $\pm$ 1.91} \\
        &Unet$_{res}$ & IXI & ATLAS & 36.82 $\pm$ 4.18 & 34.91 $\pm$ 4.92 &     47.36 $\pm$ 0.80 & \textbf{47.61 $\pm$ 0.88}&          22.07 $\pm$ 2.20 &      17.94 $\pm$ 2.39 \\
        &SADM  & IXI & ATLAS &   58.52 $\pm$ 0.60 &  57.17 $\pm$ 1.60 &     46.40 $\pm$ 0.17 & 44.71 $\pm$ 0.15 &           16.10 $\pm$ 1.10 &      12.81 $\pm$ 1.09 \\
        &SADM$_{res}$ & IXI & ATLAS &  \textbf{58.85 $\pm$ 0.44} & \textbf{57.68 $\pm$ 1.23}&     \textbf{47.64 $\pm$ 1.40} & 46.13 $\pm$ 1.36   &         17.77 $\pm$ 1.82 &      14.49 $\pm$ 1.73 \\

        \bottomrule
    \end{tabular} 
    }
    \caption{Segmentation performance regarding DICE and AUPRC. \textbf{Block I:} Unsupervised approaches, trained with healthy data.  \textbf{Block II:} Self-supervised approaches, trained with synthetic anomalies.  \textbf{Block III:} Weakly-supervised approaches, trained with real pathologies.
    $\mathcal{D}_{healthy}$ and $\mathcal{D}_{unhealthy}$ represent the type of data used during training. }
    \label{tab:synpath}
\end{table}

\begin{figure}[ht]
    \centering
    % \includegraphics[width=\linewidth]{BraTS2021_01589_slice_40_Grid.png}
    % \includegraphics[width=\linewidth]{sub-r040s072_slice_30_Grid.png}
    \includegraphics[width=.8\linewidth]{plot_sadm_updated.pdf}
    \caption{Exemplary test cases for SADM$_{res}$, trained and evaluated in the weakly-supervised setting with the BRATS and ATLAS data sets, respectively. For visualization purposes, we provide exemplary binary segmentation maps for the unsupervised anomaly score, the supervised prediction and the final AS, respectively. We derive the binarization threshold by optimizing for the best possible Dice score.}
    \label{fig:ex_wsad}
\end{figure}
\subsection{Training with Synthetic Anomalies}
We evaluate our approach in different settings. First, we assume the typical UAD case where only data with healthy labels is available. We use synthetic anomalies to obtain a supervised signal for the segmentation branch in SADM. We utilize the generation process proposed in DRAEM \cite{Zavrtanik.2021} to generate the anomalies. In this setting, we compare our framework to various UAD baselines. Results are reported in block I and block II of Table \ref{tab:synpath}. 
Across the compared UAD baselines in block I, cDDPMs show the highest segmentation performance for real pathologies. Hence, we consider them as a reconstruction model for the SADM framework. 
For real pathologies, SADM$_{res}$ outperforms cDDPMs with relative performance improvements of 3.4 \% and 12.3 \% for the BRATS and ATLAS data sets, respectively. Considering the synthetic anomalies in the DRAEM data set, a substantially higher DICE of 76.72 \% is reported for SADM$_{res}$ compared to the DICE of 11.94 \% achieved by cDDPMs.
Notably, while the DRAEM-Net shows relative performance improvements of 10.5 \% over SADM$_{res}$ for synthetic anomalies, it fails to generalize to the real pathologies in the BRATS and ATLAS data sets. 
Even the Unet, trained with the same synthetic anomalies as in DRAEM-Net, outperforms DRAEM-Net considering real pathologies. \\
Comparing SADM and SADM$_{res}$, we observe that utilizing the residual of abnormal input and pseudo-healthy reconstruction instead of the abnormal input substantially improves the segmentation performance across all data sets.
\subsection{Training with Real Pathologies}
In this section we investigate using our framework in a weakly-supervised setting. Instead of generating synthetic anomalies, we assume a small amount of annotated data is available and consider a subset of the BRATS and ATLAS data sets for training, respectively. We only train with one data set at a time to evaluate the generalization to unseen pathologies. The results for this weakly-supervised setting are reported in block III of Table \ref{tab:synpath}. 
Using a small subset of annotated data substantially improves the segmentation of all models when evaluating the same (in-domain) data set. However, the segmentation performance of Unet and Unet$_{res}$ is poor for data sets containing pathologies unseen during training. In contrast, both SADM and SADM$_{res}$ enhance the segmentation performance on in-domain data while maintaining or even improving the performance of unsupervised cDDPMs for unseen pathologies. A visualization of the anomaly maps coming from different branches of the SADM framework is provided in Figure \ref{fig:ex_wsad}.

\section{Discussion and Conclusion}
A significant challenge of supervised methods that UAD addresses is the need for annotated training data. This is especially crucial when considering screening tasks where the type and shape of potential lesions are unknown. Therefore, it is highly desirable to achieve generalization to different kinds of lesions while minimizing false positive predictions.
In this work, we aim for a framework that benefits from the robust generalization of reconstruction-based UAD methods and the high discriminative power of supervised strategies. \\
Comparing the unsupervised and self-supervised approaches in Table \ref{tab:synpath}, the additional shape information typically improves the segmentation performance with the magnitude of improvement dependent on the lesion type. However, considering purely self-supervised models, it is evident that supervised training based on synthetic data can result in overfitting. In contrast, our proposed framework improves the segmentation performance for anomalies of known shape and appearance while maintaining or even improving the generalization of reconstruction-based UAD for pathologies unseen during training. 
This indicates that the framework effectively utilizes the complementary information of the reconstruction and segmentation branches, as highlighted in Figure \ref{fig:ex_wsad}. On the one hand, the supervised segmentation branch enhances the specificity for pathologies similar to the anomalies seen during training. On the other hand, the reconstruction branch maintains the high sensitivity of reconstruction-based UAD for any abnormal pattern unseen during the training of the DM. 
Furthermore, feeding the residual of input and reconstruction to the Unet (SADM$_{res}$) instead of the abnormal input only (SADM) can enhance the segmentation performance, particularly in the self-supervised setting. This indicates that the additional information in the residual may contribute to learning the deviation from a normal representation, potentially reducing the risk of overfitting to specific anomaly shapes. While the DRAEM-Net shares some similarities with our approach, there are significant differences. First, DRAEM-Net uses a generator network trained to remove synthesized anomalies. In contrast, our reconstruction branch employs a DM trained to reconstruct healthy data without explicitly enforcing the removal of specific anomalies. Second, instead of solely relying on the segmentation branch, we combine the supervised prediction with the unsupervised anomaly map derived from the reconstruction branch. As demonstrated in our experiments, these adaptations lead to improved segmentation performance and generalization, enabling the effective use of SADM in a weakly-supervised setting. Therefore, our framework adds a significant feature to UAD approaches, especially considering that some annotated data is typically available.\\
In summary, our approach shows encouraging results, paving the way for a practical solution for UAD in brain MRI. Limitations are seen in the potential reconstruction of unhealthy structures by the reconstruction branch and in the investigated synthetic anomalies intended initially for industrial defect detection. Despite the demonstrated improvement in performance, we anticipate further enhancements when integrating more realistic synthetic anomalies. Additionally, we intend to include data sets featuring subtler anomalies or different imaging modalities to broaden the evaluation of our approach.
\midlacknowledgments{This work was partially funded by grant numbers KK5208102HV3 and ZF4026303TS9 (Zentrales Innovationsprogramm Mittelstand) and by the Free and Hanseatic City of Hamburg (Interdisciplinary Graduate School).}
\bibliography{midl24_225}

\newpage
\appendix
\section{Qualitative Comparison} \label{app:qual}
\begin{figure}[ht]
    \centering
    \includegraphics[width=\linewidth]{recos_appendix.pdf}
    \caption{Comparison of baseline models for pathologies from the BRATS (left two columns) and ATLAS (right two columns) data sets.}
    \label{fig:enter-label}
\end{figure}
\section{Implementation Details} \label{app:impl}
All models are implemented in PyTorch (v0.10). For data handling and augmentation, TorchIO \cite{PEREZGARCIA2021106236} is utilized. We choose the best-performing model checkpoint, measured by the validation set performance. We utilize Adam as an optimizer with a batch size of 32. For data augmentation, we utilize random blur, random bias field, random gamma and random ghosting transformations. All baselines are implemented following the official GitHub repositories. We train our models on NVIDIA RTX 3090 and V100 GPUs.
\subsection{SADM}
Our SADM framework consists of a reconstruction branch and a segmentation branch. In the reconstruction branch, we utilize cDDPMs \cite{Behrendt.2023c} as a generative model. We follow the official implementation\footnote{\url{https://github.com/FinnBehrendt/Conditioned-Diffusion-Models-UAD}} and utilize a 3-layer Unet with channel dimensions [128, 128, 256] as a denoising network with a pre-trained resnet50 encoder for conditioning. During training, we uniformly sample noise levels $t \in [0,T]$. At test time, we derive the final reconstruction as an average from reconstructions of different noise levels $t_{test} \in [250,500,750]$.
For the segmentation branch, we adapt the Unet architecture employed in \cite{Kascenas.2022b}, leading to a Unet architecture consisting of 3 layers with channel dimensions [64, 128, 256], group normalization and SiLU activation functions. Additionally, we add a sigmoid layer after the final convolution for segmentation. In stage I and II, we train for 1600 and 600 epochs, with learning rates of 1e-4 and 5e-5, respectively.
\subsection{Baselines}
We implement various baseline methods based on the official code with individual adaptations of hyper-parameters that have been shown to improve training stability or performance regarding the validation data. Unless stated otherwise, all models are trained for 1600 epochs, choosing the best checkpoint based on the validation set performance, using Adam as an optimizer. For AEs and VAEs, we use a latent dimension of 128 and set the learning rate to 1e-4. For VAEs, we set $\beta_{KLD} = 0.001$. For RD and DRAEM, we set the learning rate to 1e-4. The DDPM, pDDPM and cDDPM baselines are trained with simplex noise as proposed in \cite{Wyatt.2022} and a learning rate of 1e-5. Note that for all DDPM-based baselines, we utilize the averaged reconstruction from three different noise levels $t_{test} \in [250,500,750]$.




\section{Synthetic Anomalies}\label{app:synano}
We generate the synthetic anomalies by following the procedure of \cite{Zavrtanik.2021}. First, a noise image is generated using Perlin noise \cite{Perlin.1985}, capturing a wide variety of shapes. Subsequently, the noise image is binarized by a uniformly sampled threshold, resulting in an anomaly map $M_a$, which is used as ground truth annotation. The binary map is further processed by three random augmentation functions, sampled from the set of \{posterize, sharpness, solarize, equalize, brightness change, color change, auto-contrast\}, leading to $I_{aug}$. Finally, $I_{aug}$ is masked by $M_a$ and blended with the original image $I$, leading to
$I_{syn} = (1-M_a) \odot I + (1-\gamma)(M_a \odot I) + \gamma(M_a \odot I_{aug})$. The operator $\odot$ denotes element-wise multiplication and $\gamma$ denotes the opacity parameter that is uniformly sampled from $\gamma \in [0.2,1.0]$. Figure \ref{fig:app_syn} showcases exemplary synthetic images with the corresponding annotation mask.  

\begin{figure}[ht]
    \centering
    \includegraphics[width=\linewidth]{DRAEM_examples.pdf}
    \caption{Exemplary synthetic anomalies generated by the DRAEM procedure. Top: Images from the IXI data set, augmented with synthetic anomalies. Bottom: Annotation corresponding to the introduced anomalies.}
    \label{fig:app_syn}
\end{figure}

\section{Analysis of the Anomaly Score Weighting}\label{app:beta}
In this section, we analyze the different weightings of the anomaly scores from the supervised and reconstruction branches. 
We derive the AS by weighting the individual scores as follows:
\begin{equation}
    \text{Anomaly Score (AS)} = \beta \cdot \tilde{x} + (1 - \beta) \cdot \dot{\hat{y}}.
\end{equation}
We vary the weighting parameter $\beta$ from zero to one. $\beta = 0$ corresponds to solely relying on the supervised branch (Unet$_{res}$). $\beta = 1$ corresponds to solely using the reconstruction branch (cDDPM).
\begin{figure}[ht]
    \centering
    \begin{tabular}{cc}
        \multicolumn{2}{c}{Trained on ATLAS} \\
         Evaluated on ATLAS & Evaluated on BRATS \\
        &\\
         \includegraphics[width=.4\linewidth]{BestDice_ATLAS_v2_trained_onATLAS.png} & 
        \includegraphics[width=.4\linewidth]{BestDice_Brats21_trained_onATLAS.png} \\
      \midrule
        \multicolumn{2}{c}{Trained on BRATS} \\
        Evaluated on ATLAS & Evaluated on BRATS \\
         \includegraphics[width=.4\linewidth]{BestDice_ATLAS_v2_trained_onBraTS.png} & 
         \includegraphics[width=.4\linewidth]{BestDice_Brats21_trained_onBraTS.png}
    \end{tabular}
    
    
   
    
    \caption{Analysis of the anomaly score weighting given \\AS = $\beta \cdot \tilde{x} + (1 - \beta) \cdot \dot{\hat{y}}$, where $\tilde{x}$ represents the anomaly map coming from the unsupervised reconstruction branch and $\dot{\hat{y}}$ represents the anomaly map coming from the supervised segmentation branch. The $\lceil$DICE$\rceil$ is plotted against different values of $\beta$. }
    \label{fig:app_beta}
\end{figure}

\end{document}
