\documentclass{midl} % Include author names
% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution
\usepackage[skip=0.5pt]{caption}
\usepackage{mwe} % to get dummy images
\jmlrvolume{-- 314}
\jmlryear{2024}
\jmlrworkshop{Full Paper -- MIDL 2024}
\editors{Accepted for publication at MIDL 2024}

\usepackage{soul}
%\usepackage{multirow}


\title[Ano-swinMAE]{Ano-swinMAE: Unsupervised Anomaly Detection in Brain MRI using Swin Transformer-based Masked Auto Encoder}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship


\begin{document}
\midlauthor{\Name{Kumari Rashmi\midljointauthortext{Contributed equally}\nametag{$^{1}$}} \Email{ee20s051@smail.iitm.ac.in}\\
\Name{Ayantika Das\midlotherjointauthor\nametag{$^{1}$}} \Email{ee19d422@smail.iitm.ac.in}\\
\Name{Matcha Naga Gayathri\nametag{$^{1}$}}
\Email{ee21s048@smail.iitm.ac.in}\\
% \Name{xxxxxx\nametag{$^{2}$}} \Email{xxxxxxxxxx}\\
\Name{Keerthi Ram\nametag{$^{2}$}} \Email{keerthi@htic.iitm.ac.in}\\
\Name{Mohanasankar Sivaprakasam \nametag{$^{1,2}$}} \Email{mohan@ee.iitm.ac.in} \\
\addr $^{1}$ Indian Institute of Technology, Madras \\
\addr $^{2}$ HealthCare Technology Innovation Center \\
}
\maketitle
\begin{abstract}
% \vspace{-1cm}
%%%%%%%%%%%%%%%%%%%%% Ayantika %%%%%%%%%%%%%%%%%%%%%%%
The advanced deep learning-based Autoencoding techniques have enabled the introduction of efficient Unsupervised Anomaly Detection (UAD) approaches. Several autoencoder-based approaches have been used to solve UAD tasks. However, most of these approaches do not have any constraints to ensure the removal of pathological features while restoring the healthy regions in the pseudo-healthy image reconstruction. To minimize the occurrence of pathological features, we propose to utilize an Autoencoder which deploys a masking strategy to reconstruct images. Additionally, the masked regions need to be meaningfully inpainted to enforce global and local consistency in the generated images which makes transformer-based masked autoencoder a potential approach. Although the transformer models can incorporate global contextual information, they are often computationally expensive and dependent on a large amount of data for training. Hence we propose to employ a Swin transformer-based Masked Autoencoder (MAE) for anomaly detection (Ano-swinMAE) in brain MRI. Our proposed method Ano-swinMAE is trained on a healthy cohort by masking a certain percentage of information from the input images. While inferring, a pathological image is given to the model, and different segments of the brain MRI slice are sequentially masked, and their corresponding generation is accumulated to create a map indicating potential locations of pathologies. We have quantitatively and qualitatively validated the performance increment of our method on the following publicly available datasets: BraTS (Glioma), MSLUB (Multiple Sclerosis), and White Matter Hyperintensities (WMH). We have also empirically evaluated the generalization capability of the method in a cross-modality data setup.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Promising results have been demonstrated by generative models in the realm of unsupervised anomaly detection. However, We have yet to explore one of the core module of generative models i.e. transformers in its full potential in unsupervised anomaly detection tasks.
% Recent works have displayed the use of a transformer for unsupervised pathology detection as well as image segmentation tasks.
% The use of transformers was limited in the field of unsupervised anomaly detection due to its data-hungry nature.  After the masked autoencoder was introduced which was backboned with a vision transformer, it opened the opportunity to explore it for anomaly detection. In this paper, we are introducing Ano-swinMAE, a Swin transformer-based masked autoencoder (MAE) which can be used for anomaly detection in brain MRI. We have initialised our model with ImageNet weights before training. During training, we are using the healthy brain MRI slices, while showcasing a certain percentage of the image to the model. Removing a segment of the image and reconstructing the removed part strengthens the model with improved Generalization capability as well. While inferring, we give unhealthy slices to the model to detect and localize pathology. In our algorithm, different segments of the brain in the MRI slice are systematically masked which represents areas with potential pathology. The model endeavors to reconstruct each masked region individually. This iterative masking procedure across various brain regions aids in pinpointing the localization of the pathology.
% %Ano-swinMAE excels in unsupervised anomaly detection in brain MRI when tested with BraTS21, MSLUB and WMH dataset, outperforming state-of-the-art methods by achieving a 2\% accuracy improvement while reducing inference time compared to diffusion methods.%
% %Removing a segment of the image and reconstructing the removed part strengthen the model with improved Generalization capability as well, which was reflected with reconstruction quality of brain slices while evaluating it with MSLUB, BraTS21 and WMH datasets since our model was trained with completely different dataset i.e. IXI datset.%
% Our method has demonstrated a significant accuracy improvement when compared with various baseline methods on data of tumors, MS lesions and white matter hyperintensities while reducing the inference time when compared to diffusion models.
\end{abstract}

\begin{keywords}
Unsupervised Anomaly Detection (UAD), Ano-swinMAE, MRI, Masked Autoencoder (MAE), Pathology Detection.
\end{keywords}

\section{Introduction}
    The identification and delineation of pathology in brain MRI play a crucial role in disease diagnosis and prognosis. Unsupervised Anomaly Detection (UAD) methods alleviate the burden of annotating pathologies at the pixel level. The state-of-the-art deep learning-based UAD methods are Autoencoding models that learn to encode the healthy data distribution. The appearance of pathological features in brain MRI scans is typically localized in certain anatomical regions. The Autoencoding networks must ensure changes are introduced in those localized pathological regions while restoring the healthy features. The changes in pathological regions arise because Autoencoders are trained on healthy data, and there is a performance degradation in these pathological regions. However, there is no constraint in the Autoencoding models that ensures pathologies will be absent in the pseudo-healthy reconstructed images. To mitigate this, a masking-based Autoencoding strategy is an intuitive approach that can be incorporated. While masking strategies have been adopted for unsupervised anomaly detection \cite{nguyen2021unsupervised} \cite{iqbal2023unsupervised}, most of the approaches do not consider position-aware global context dependencies. These properties are provided by Vision Transformers (ViT) \cite{dosovitskiy2020image}, which ensure that there is an association among distant imaging features of the brain MRI scans \cite{WANG2023100004}, essential for in-painting the masked regions with meaningful healthy features while removing the pathological ones. However, ViT-based models lack inductive biases in modeling local visual structure \cite{xu2021vitae} \cite{bietti2019inductive}, which cannot be traded for global context, since inductive biases for locality information are crucial, as can be seen in the case of CNNs.
Additionally, this limitation results in a heavy reliance on large datasets and pre-trained models, which is problematic in data-scarce settings such as medical imaging. To mitigate these challenges, the Swin Transformer \cite{liu2021swin} introduces a shifted windows-based Multi-head self-attention (MSA) for modeling global feature relations. This not only enhances performance but also reduces model complexity. Hence, (i) we propose to utilize a Swin Transformer-based Masked Auto Encoder (Ano-swinMAE) for detecting and localizing pathologies in an unsupervised manner. We have shown the efficacy of our method using publicly available pathology datasets. Additionally, (ii) we have shown how masking helps our models map the pathological distribution closer to a healthy distribution in the latent representational space. (iii) We also validate that our method generalizes across different datasets and that its computational efficiency is relatively better.
\vspace{-0.2cm}
% \cite{behrendt2023patched}
% In recent years, there has been a significant focus on Unsupervised Anomaly detection (UAD) methods in medical imaging, which leverage the potency of extensive sets of normal data to detect anomalies. Unlike supervised methods, these approaches don't need anomaly information during training. As we acknowledge, obtaining medical imaging data with accurate labels is a challenging task. It additionally motivates researchers to concentrate on Unsupervised Anomaly Detection (UAD).
% The identification and delineation of pathology in brain MRI contribute to various clinical objectives such as diagnosis, prognosis, and treatment selection, which help patients in various ways. In recent years, there has been a significant focus on Unsupervised Anomaly detection (UAD) methods in medical imaging, which leverage the potency of extensive sets of normal data to detect anomalies. Unlike supervised methods, these approaches don't need anomaly information during training. As we acknowledge, obtaining medical imaging data with accurate labels is a challenging task. It additionally motivates researchers to concentrate on Unsupervised Anomaly Detection (UAD).

% For an extended period, CNNs \cite{shen2017deep} have been the predominant methods in medical image analysis, such as pathology detection, segmentation, and others. Supervised methods utilizing CNNs face constraints, including the requirement for extensive expert-annotated training data and the difficulty of learning from noisy or imbalanced datasets.
% Apart from this, CNNs struggle to model non-local relationships between features, often requiring a more complex and deeper network architecture.

% In medical image analysis, transformer-based methods succeed in classification, segmentation, and detection tasks, demonstrating promising performance and strong Generalization capabilities.
% Vision transformers \cite{dosovitskiy2020image}, though more computationally demanding than CNNs and RNNs, triumph over bottlenecks by utilizing non-local receptive fields. Unlike traditional methods, vision transformers consider all local image patches to establish comprehensive global feature relations. This enhances the modeling of pathology and background correlations, significantly improving pathology detection and brain tumor segmentation. Moreover, the traditional ViT, swinT \cite{liu2021swin}, introduces a shifted windows-based MSA for modeling global feature relations. This not only enhances performance but also reduces model complexity. Several research studies use SwinT \cite{liang2023btswin} as the fundamental transformer backbone for the segmentation of brain tumors.

% Transformer-based models are data-hungry models, and we have less training data available in medical images. Masked auto encoder (MAE) develops highly capable models that exhibit strong Generalization. Through MAE pre-training, we enhance the Generalization performance of data-hungry models, such as ViT-Large \cite{dosovitskiy2020image}, on ImageNet-1K. In recent studies, \cite{zhou2023self} \cite{georgescu2023masked} MAE was used for unsupervised pathology detection task in various medical images.


% Lately, state-of-the-art methods for unsupervised anomaly detection in brain MRI, such as Patched DDPM\cite{behrendt2023patched} and Masked, Stitch, Resample\cite{bercea2023mask}, have demonstrated promising results. In addition to these approaches, AnoGAN, several Variational Autoencoder (VAE)\cite{baur2021autoencoders} based methods have also demonstrated promising results in this domain. The diffusion-based approach faces one challenge, as DDPMs preserve spatial information in
% their hidden representation of the input which is important for the image generation process.There is huge dependency upon re-sampling and the inherent uncertainty is there.


\begin{figure}[t]
\floatconts
  {fig:base1}
  {\caption{The main architecture of Ano-swinMAE, consisting of the training and inference stages. During inference, a mask slides across various parts of the anomalous brain MRI slice, and a pseudo-healthy MRI slice is reconstructed. Then, with the help of the original anomalous MRI slice, we obtain a combined heatmap and a segmentation image using Atropos. Using the combined heatmap and the Atropos segmentation image, the anomaly is localized.}}
  % {\includegraphics[width=1.0\linewidth]{sample.drawio.pdf}}
  {\includegraphics[width=0.8\linewidth]{Images/architecture_midl.png}}
  \vspace{-0.5cm}
\end{figure}
\vspace{-0.3cm}
\section{Related work}
\vspace{-0.2cm}
Several deep-learning approaches have been investigated in recent research on Unsupervised Anomaly Detection (UAD). Among these, methods incorporating Autoencoders (AE) and Variational Autoencoders (VAE) have proven to be effective during both training and inference \cite{zhou2021vae}, \cite{baur2021ae}. Nevertheless, a common limitation observed in these approaches is the reconstructed image quality, which tends to be blurry. This blurriness poses a challenge, rendering these methods less effective for UAD tasks, as described in \cite{baur2021autoencoders}.
To overcome this limitation, researchers have worked towards utilizing the image context by adding a spatial latent dimension \cite{baur2019deep}, erasing spatial context \cite{zimmerer2018context}, and making use of 3D information \cite{bengs2021three} \cite{behrendt2022capturing}.

As an alternative to AE, Generative Adversarial networks have been applied to the problems of UAD task \cite{schlegl2019f}. However, the unstable training nature of GANs poses challenges, leading to issues such as mode collapse and a lack of anatomical coherence \cite{baur2021autoencoders}. Meanwhile, the recent work in UAD has utilized DDPM (Denoising Diffusion Probabilistic Model) because it exhibits scalable and stable training properties while producing high-quality, sharp images \cite{wolleb2022diffusion} \cite{wyatt2022anoddpm} \cite{sanchez2022healthy} \cite{pinaya2022fast}. In the DDPM-based approach, there is a tradeoff between preserving crucial information about healthy tissues and efficiently eliminating anomalies due to the inherent noise addition strategy. Recent DDPM works that are used for UAD tasks and deal with this tradeoff are \cite{behrendt2023patched} and \cite{bercea2023mask}. This tradeoff often limits the applicability of the models among diverse pathological data. Additionally, \cite{iqbal2023unsupervised} uses a DDPM-based model which incorporates a mechanism to simulate pathology during the training phase. This strategy involves an approximation of the pathological distribution in the training phase that may not ensure capturing the true distribution.

% Addressing paper [A] in this work a diffusion model is used, and during the training phase, a mechanism to simulate pathology is introduced. The simulation is done by masking the original image and adding a frequency-altered version of the original image in the masked images. Their notion of introducing masks is to simulate pathologies unlike ours where we intend to mask the pathological image during inference.
%;

% In addition to managing noise to achieve a balanced trade-off as mentioned above, pDDPMs(patched DDPMs) \cite{behrendt2023patched} enhance brain MRI reconstruction by integrating detailed context information about individual brain structures and appearances while estimating patches.
% \vspace{-0.3cm}
However, the intricate structure of the brain can be captured by learning the relationship between individual brain structures, which can be modeled using transformer models. UAD using transformers was done by \cite{pinaya2022unsupervised}, \cite{ghorbel2022transformer}. The transformer-based approaches incorporate masking strategies \cite{zhou2023self} that ensure the mapping of the pathological brain into a healthy distribution without adding any additional constraints. Approaches employing transformer-based Masked Autoencoders (MAE) \cite{hu2023features}, \cite{yu2022image}, \cite{georgescu2023masked} generate a pseudo-normal counterpart of the input image. Additionally, \cite{georgescu2023masked} integrates a pseudo-abnormal module to simulate pathology in order to train a classifier to discriminate between healthy and unhealthy images. Their dependency on the secondary classifier clearly highlights that MAE on its own is not suitable for pathology detection.


%} Another trail of work has utilised transformer for UAD\cite{pinaya2022unsupervised} \cite{ghorbel2022transformer} task. Transformer possesses almost similar modelling capability as of Diffusion network in terms of capturing detailed context information about individual brain structures and appearances.Due to it's attention mechanism, it can capture global relationship between individual brain structure. Masked Auto Encoder \cite{he2022masked} showed promising result in the field of generic image reconstruction as well as swinMAE \cite{dai2023swin}. Since transfomer contains positional embedding for individual tokens, this gave us scientific curiosity to utilize it in our UAD task for brain MRI due to its deterministic modelling capability. Again Masking comes naturally due to patch wise structure of transformer,it further encourages us to utilise this capability.

\vspace{-0.5cm}
\section{Methodology}
\vspace{-0.1cm}
\subsection{Problem Formulation}
% Image Representation
Let $x \in \mathbb{R}^{H \times W \times C}$ be a healthy or pathological brain MRI scan with dimensions $H \times W$ and $C$ channels. The task of unsupervised detection and localization of pathologies in $x$ can be formulated as the reconstruction of the pseudo-healthy counterpart ($\hat{x}\in \mathbb{R}^{H \times W \times C}$) of $x$ and extracting the residual information by subtracting $\hat{x}$ from $x$. The reconstruction task is achieved by an Autoencoding network, which is trained to map $x$ to $\hat{x}$, where $x$ belongs to a healthy cohort and $\hat{x}$ is its reconstruction. At inference time, $x$ belonging to a pathological distribution is given as input to the model, which is expected to generate $\hat{x}$ belonging to the healthy distribution.

\vspace{-0.3cm}
\subsection{Proposed Framework (Ano-swinMAE)}

% The appearance of pathological features in brain MRI scans is typically localized in certain anatomical regions. The Autoencoding networks must ensure changes are introduced in those localized pathological regions while restoring the healthy features. The changes in pathological regions consequently arise due to the fact that Autoencoders are trained on healthy data, and there is a performance degradation in these pathological regions. However, there is no constraint that ensures that pathologies will be absent in the pseudo-healthy reconstructed images. To mitigate this, a masking-based Autoencoding strategy is an intuitive approach that can be incorporated. While masking strategies have been adopted for unsupervised anomaly detection \cite{behrendt2023patched}\cite{nguyen2021unsupervised}, most of the approaches do not consider position-aware global context dependencies. These properties will ensure that there is an association among distant imaging features of the brain MRI scans \cite{WANG2023100004}, which is quite essential for in-painting the masked regions with meaningful healthy features while removing the pathological ones.

 
% Apart from the conventional CNNs and RNNs that can capture local dependencies, long-range dependencies and the global context of the image can be captured using transformer-based models resulting in better representation of global features. \cite{WANG2023100004}
% For structures like brain MRI where the global context plays a significant role, we thought of utilizing a transformer as our backbone.
% \in \mathbb{R}^{7 \times 7 \times 728} 
% \vspace{-0.3cm}
The overview of our proposed Swin Transformer-based MAE (Ano-swinMAE) for anomaly detection is given in Figure \ref{fig:base1}. In the training phase, an input to the model ($x$) is randomly masked at regions within the brain area, which are passed through the encoder to extract latent representations ($ z $ ). These representations are further processed through the decoder to generate the reconstructed image, learning to incorporate meaningful information within the masked regions. The masking positions used during training are random but the masking ratio and size of the masks are given by us. The learning objective is a mean square error between the masked region of the original and reconstructed image. 

The significance of the model lies in the integration of three key components within the autoencoding model. \textit{First}, a Swin Transformer block \cite{liu2021swin} is introduced within the model along with a patch merging layer which reduces the number of tokens (positional information). This is essential for tasks which face data scarcity and require lower model complexity. \textit{Second}, a windowing strategy is incorporated which limits the capability of every patch to be masked. This strategy allows patches within a certain window size to be masked. This masking strategy resembles the pathological occurrence since frequent masking with a small mask size will fail to reconstruct in realistic larger pathological cases. \textit{Third}, the token of a masked patch is allowed to pass through the model such that the model is aware of the absolute positions of all the patches. This strategy helps in obtaining reconstructed images with better fidelity. We have incorporated all three strategies and adopted the architectural details of \cite{dai2023swin} to build the encoder-decoder structure of Ano-swinMAE.


% The encoder and decoder structure incorporates a sequence of Swin Transformer Blocks \cite{liu2021swin} along with a specialised "window masking". The windowed masking strategy does not allow masking of content within any of the patches but rather allows one patch to be masked within a single window. This masking strategy resembles the pathological occurrence since frequent masking with a small mask size will fail to reconstruct in realistic larger pathological cases. Additionally, we have considered the token (or the positional information) of the masked regions to be processed within the encoder since our goal is to achieve anomaly detection through the reconstruction of the pseudo-healthy images. The processing of the mask tokens can increase the fidelity of the images, which can reduce the occurrence of false positives.   

In the inference phase, we generate a non-overlapping sliding mask sequentially moving across regions having brain pixels as indicated in the Inference block of Figure \ref{fig:base1}. Each masked reconstruction undergoes L1 norm-based computation to derive a residual map compared to its original counterpart. These individual residual maps are then aggregated to form a coarse anomaly map. 

% The basic working of MAE(mask auto encoder) \cite{he2022masked} is to train the network with masking the random patches of the input image and reconstruct the missing pixels of the masked input. We can choose the masking percentage during training of the MAE, it can be anything between 0 to 100 percent, but optimal result will be gained by values between 35\% to 75\% . The backbone of the MAE is ViT i.e. Vision Transformer. 

% We get the idea that during masking of the unhealthy MRI slice, the pathology can get masked which results into pseudo healthy pixel generation in that area during image reconstruction. This advantage of the network motivated us to use it as our Anomaly detection task.
% There are two reasons which motivated us to use \cite{dai2023swin} swinMAE for our Anomaly detection task: (1) Swin Transformer \cite{liu2021swin} uses a shifted windows based MSA to model the global feature relations, which improves the performance and gives better modelling power. Hence Swin Transformer\cite{WANG2023100004} as backbone of masked auto encoder will be more suitable for our task.

% (2). reconstructed image quality by MAE was a bit blurred and swinMAE was giving better reconstruction quality. So, we decided to use swinMAE.

% In our framework, the model is trained with healthy MRI slices, and for the detection of pathology, we are evaluating the model by giving unhealthy slices which is an unknown distribution for the model. The reconstruction loss is $ |x - \hat{x}| $ and the maximum reconstruction loss will localize the pathology in the unhealthy slice.

% \vspace{-0.27cm}
% \subsection{Training and Inference}
% The overall training and inference process is illustrated in Algorithm \ref{alg:main1} and \ref{alg:main2} respectively.

% \begin{algorithm2e}
% % \scriptsize
% \caption{ training procedure:}
% \label{alg:main1}

%  \begin{itemize}
%   \item Step 1: Take the pretrained weights on ImageNet dataset of swinT.
%   \item Step 2: Train the swinT model using the transfer learning with healthy brain MRI data by applying random masking strategy. 
% \end{itemize} 
% \end{algorithm2e}
% \vspace{-0.5cm}

% \begin{algorithm2e}[htbp]
% % \scriptsize
% \caption{Inference procedure:}
% \label{alg:main2}

%   \begin{itemize}
%   \item Step 1: Now do the inference of model on completely different unhealthy brain MRI datasets. Here we will take a window of W X W and slide it by W pixels through the image. It will cover the pathology completely and the network will be unable to reconstruct the unhealthy pathology and it will try to generate the healthy brain part only.
%   \item Step 2: Now, calculate the loss while sliding the window and create heatmap with the accumulated losses. Its called combined heatmap
%   \item Step 3: Now using superpixel segmentation over the combined heatmap with the help of original image slice, we will localize the pathology 
% \end{itemize} 
% \end{algorithm2e}

\vspace{-0.2cm}
\section{Experiments and Results}
\vspace{-0.2cm}
 \subsection{Dataset Description, Pre and Post Processing, Evaluation Metrics and Implementation Details}

 \textbf{Dataset Description}: We have used four publicly available datasets: IXI \cite{IXI}, BraTS21 \cite{baid2021rsna}, MSLUB \cite{lesjak2018novel}, and WMH \cite{AECRSD_2022}. The IXI dataset (T1, T2), consisting of MRI volumes of 580 subjects, is used as a reference for training our healthy distribution. The BraTS21 dataset (T2), consisting of MRI volumes of 1251 subjects, and the MSLUB dataset (T2), consisting of MRI volumes of 30 subjects, are used for evaluating the performance of our model against different baselines. The WMH dataset (FLAIR), consisting of MRI volumes of 100 subjects, and the BraTS21 dataset (T1, T2) are used for evaluating the generalizability of the model. %(1) \textbf{IXI Dataset}: We employ the publicly accessible IXI dataset as a reference for training our healthy distribution. The IXI dataset comprises 580 subjects of T1 and T2-weighted brain MRI scans of 256x256x120 dimensions.
%     (2) \textbf{BraTS21 Dataset}: Specifically the Multimodal Brain Tumor Segmentation Challenge 2021 (BraTS21) dataset used for inference purposes..The BraTS21 dataset comprises 1251 brain MRI scans of 240X240X155 dimensions with four different weightings (T1, T1-CE, T2, FLAIR). (3) \textbf{MSLUB Dataset}: The multiple sclerosis data set from the University Hospital of Ljubljana (MSLUB). The MSLUB dataset includes brain MRI scans from 30 patients with multiple sclerosis (MS), featuring T1, T2, and FLAIR-weighted scans for each patient. We have taken a co-registered volume having dimensions 192x512x512 
%     (4) \textbf{WMH Dataset}:The White Matter Hyperintensities dataset consists of 100 volumes of the test dataset containing T1, T2, and Flair modalities, having dimensions 240x240x48.

% All the evaluation datasets provide expert annotations in the form of pixel-wise segmentation maps. Throughout our experiments, we specifically utilize T2-weighted images.

\textbf{Pre and Post Processing}: In order to standardize the images over geometric variations, we perform skull-stripping using HD-Bet \cite{isensee2019automated} and rigid body registration with the SRI24 \cite{rohlfing2010sri24} atlas, resulting in volumes of $240 \times 240 \times 155$ dimensions. Additionally, to mitigate photometric variations, we perform bias field correction and normalization to the $[0,1]$ intensity range.
% During the inference process, a residual map is derived from each reconstructed image and its corresponding original image using the L1 norm. This procedure is applied to each masked original input image fed into the model. Subsequently, these individual residual maps are aggregated to generate a comprehensive coarse anomaly map. 
For post-processing, morphological filters are applied to the coarse anomaly map given by the aggregated residual output of our model to eliminate smaller objects, followed by a connected component-based analysis. This analysis isolates the significant residual components. A Gaussian mixture model-based approach is employed on the original image through Atropos\footnote{\url{https://antspyx.readthedocs.io/en/latest/segmentation.html}} to generate a segmentation mask. Integrating this Atropos segmentation information refines the anomaly map, creating precise segmentation boundaries of the pathologies.

\textbf{Evaluation Metrics}: For quantifying the segmentation performance of different models, we have considered the standard definitions of the Dice coefficient and the Area Under the Precision-Recall Curve (AUPRC). For analyzing the latent representations of our model, we have projected the high-dimensional vectors into a 2D space by using UMAP projections \cite{mcinnes2018umap}. In order to further analyze the spatial relationship of the data, we have performed $k$-means clustering of the 2D points and extracted the eigenvectors and eigenvalues of the covariance matrix of each cluster. These eigen-components are used to draw an ellipse to represent the spread and orientation of each cluster.

% \vspace{-0.75cm}
 
\textbf{Implementation Details and Baselines:} 
Models are implemented in PyTorch 2.0.1 with CUDA 12.1 on an 80\,GB NVIDIA A100 GPU. For every step mentioned in the training algorithm, models are trained for 400 epochs using the Adam optimizer with a learning rate of $1 \times 10^{-3}$. We have compared our model performance with several existing baselines, such as VAE \cite{baur2021autoencoders}, f-AnoGAN \cite{schlegl2019f}, MAE \cite{he2022masked} and autoDDPM \cite{bercea2023mask}. We have evaluated all the baseline methods by adopting their existing implementations. The details of the parameters used in post-processing are mentioned in Appendix~\ref{Appendix_A1}. The code for our proposed method will be available at the Ano-swinMAE repository\footnote{\url{https://github.com/rashmi05pathak/Ano-swinMAE}}.

\subsection{Results and discussion}
\subsubsection{Quantitative And Qualitative Analysis}
% \vspace{-0.3cm}
% The quantitative evaluations of our method compared to baselines are detailed in Table \ref{tab:results1}. Our model Ano-swinMAE has better performance compared to other baselines. The performance of the two different mask sizes in our model is dataset dependent, for mask size 32 the model performs better for MSLUB and for 64 it performs better in BraTS21. The gain in performance of our method compared to autoDDPM is because of the noising strategy used by autoDDPM to map the pathological distribution to a healthy distribution. This can not always ensure that pathological regions will not be reconstructed like in hyper-intense large high-grade glioma present in the BraTS21 dataset. The MAE, which shares the closest resemblance to our method, could not gain increment in performance since the model convergence is dependent on large number of data and obtains images with relatively lower fidelity as compared to our model. The GAN-based approaches has often fails to generate one-to-one mapping in healthy regions of the brain due to modelling failures like mode collapse. Similar to GAN, VAE also suffers from posterior collapse which reduslts in low fidelity images resulting large false positives. In Table \ref{tab:results1} we also observe that model our model has relatively better inference time given the performance gain we have as compared to methods which has lower inference time than our model.
% \vsapce{-0.3cm}
The quantitative evaluations of our method against baselines are summarized in Table \ref{tab:results1}. Our model, Ano-swinMAE, shows improved performance compared to the other baselines. The effectiveness of different mask sizes in our model varies depending on the dataset: a mask size of $32 \times 32$ performs better for MSLUB, whereas a mask size of $64 \times 64$ yields superior results in BraTS21. The performance drop of autoDDPM compared to our method could be due to the noising strategy employed by autoDDPM to map pathological distributions to healthy ones. This strategy might not consistently prevent the reconstruction of pathological regions, such as hyper-intense large high-grade gliomas found in the BraTS21 dataset. MAE could not achieve performance increments because of its reliance on a large amount of data, leading to lower image fidelity compared to Ano-swinMAE.
% We observed that MAE failed to demonstrate performance improvements because of its reliance on a large amount of data, leading to lesser image fidelity compared to Ano-swinMAE
GAN-based approaches often lack one-to-one mappings in healthy brain regions due to modeling challenges like mode collapse. Similarly, VAEs suffer from posterior collapse, resulting in low-fidelity images and an increased occurrence of false positives. Furthermore, Table \ref{tab:results1} highlights that our model exhibits better inference time when compared with baselines that have relatively good Dice and AUPRC.

% Please add the following required packages to your document preamble:
% \usepackage{graphicx}

% Please add the following required packages to your document preamble:
% \usepackage{graphicx}
% Please add the following required packages to your document preamble:
% \usepackage{graphicx}

% Please add the following required packages to your document preamble:
% \usepackage{graphicx}
\begin{table}[h]
\caption{Comparison of Ano-swinMAE with other baseline models which are used for unsupervised pathology detection in brain MRI. Ano-swinMAE(32x32) uses a 32x32 mask size in brain MRI and shifts by 32 pixels. Ano-swinMAE(64x64) uses a 64x64 mask size in brain MRI and shifts by 64 pixels.}
\label{tab:results1}
\resizebox{\columnwidth}{!}{%
\begin{tabular}{ccccccc}
                                        & \multicolumn{2}{c}{BraTS21}        & \multicolumn{2}{c}{MSLUB}          &                                    &                   \\ \hline
\multicolumn{1}{c|}{Model}              & Dice[\%]  & \multicolumn{1}{c|}{AUPRC[\%]} & Dice[\%]  & \multicolumn{1}{c|}{AUPRC[\%]} & \multicolumn{1}{c|}{Parameters(M)} & \begin{tabular}[c]{@{}l@{}}Inference Time(s) \\  (per MRI slice)\end{tabular} \\ \hline
\multicolumn{1}{c|}{VAE\cite{baur2021autoencoders}}                & 31.11 & \multicolumn{1}{c|}{28.80} & 6.89  & \multicolumn{1}{c|}{5.00}  & \multicolumn{1}{c|}{4.96}          & 0.02              \\
\multicolumn{1}{c|}{f-AnoGAN\cite{schlegl2019f}}           & 24.16 & \multicolumn{1}{c|}{22.05} & 4.18  & \multicolumn{1}{c|}{4.01}  & \multicolumn{1}{c|}{5.56}          & 0.04              \\
\multicolumn{1}{c|}{MAE\cite{he2022masked}}                & 31.97 & \multicolumn{1}{c|}{25.44} & 15.46 & \multicolumn{1}{c|}{10.56} & \multicolumn{1}{c|}{329} & 90                   \\
\multicolumn{1}{c|}{autoDDPM\cite{bercea2023mask}}           & 35.80 & \multicolumn{1}{c|}{29.07} & 19.35 & \multicolumn{1}{c|}{11.79} & \multicolumn{1}{c|}{18.50}         & 16.32             \\
\multicolumn{1}{c|}{Ano-swinMAE(32x32)} & 42.55 & \multicolumn{1}{c|}{30.44} & 19.78 & \multicolumn{1}{c|}{12.78} & \multicolumn{1}{c|}{26.10}         &  52                 \\
\multicolumn{1}{c|}{Ano-swinMAE(64x64)} & 42.92 & \multicolumn{1}{c|}{30.24} & 18.52 & \multicolumn{1}{c|}{12.50} & \multicolumn{1}{c|}{26.10}         & 14.38                
\end{tabular}%
}
\end{table}



The pathology segmentation masks produced by autoDDPM, MAE, and our method for BraTS21 T2 data are illustrated in Figure \ref{fig: Resultsimg}. We present results across three distinct intensities in pathological regions relative to those in the ventricles and sulci (hyper, medium, and low). Our method and MAE exhibit a consistent trend in performance across these intensity levels, with metrics decreasing in the order of hyper, medium, and low-intensity cases. In contrast, autoDDPM shows a reverse performance trend. Our model demonstrates improved anomaly capture compared to MAE, due to the blurrier reconstruction of MAE that generates more false positives. The autoDDPM model tends to %reconstruct or
reconstruct traces of pathologies in hyper-intense regions, leading to more false negatives. %Notably, the integration of superpixel information enhances the precision of anomaly boundaries. 
Additionally, the integration of Atropos-based segmentation information enhances the precision of anomaly boundaries.
\begin{figure}[h]
\small
% \resizebox{\columnwidth}{!}
\floatconts
  {fig: Resultsimg}
  {\caption{(a) represents original BraTS21 (T2) brain MRI, (b) represents the pathology segmentation from the autoDDPM method, (c) represents pathology segmentation from the MAE method, and (d) represents the pathology segmentation from our Ano-swinMAE method}}
  {\includegraphics[width=1.0\linewidth]{Images/Qualitative.pdf}}
  % \vspace{-0.8cm}

\end{figure} 
\begin{table}[]
\small
\scriptsize
\centering
\caption{Generalization study of the Ano-swinMAE (64x64) and autoDDPM models. We have utilized WMH Flair brain MRI slices for this study along with BraTS21. Both Ano-swinMAE (64x64) and autoDDPM are trained on the IXI dataset.}
\label{tab:Gen_results}
\resizebox{0.8\columnwidth}{!}{%
% \center
\begin{tabular}{c|c|cc|cc}
\hline
Model                        & \begin{tabular}[c]{@{}c@{}}Training\\ on IXI Data\end{tabular} & \multicolumn{2}{c|}{Inference Data} & Dice{[}\%{]} & AUPRC{[}\%{]} \\ \hline
\multirow{3}{*}{AutoDDPM}    & T1                                                             & BraTS21           & T2              & 30.87        & 25.67         \\
                             & T2                                                             & BraTS21           & T1              & 29.77        & 25.88         \\
                             & T1                                                             & WMH               & Flair           & 35.97        & 32.91         \\ \hline
\multirow{3}{*}{Ano-swinMAE} & T1                                                             & BraTS21           & T2              & 38.99        & 27.19         \\
                             & T2                                                             & BraTS21           & T1              & 28.62        & 25.11         \\
                             & T1                                                             & WMH               & Flair           & 35.92        & 29.95        
\end{tabular}%
}
\end{table}



\subsubsection{Analysing The Effect Of Masking in The Latent Space}
In Figure \ref{fig:latent_space}, we compare the latent space of masked and unmasked images. From Figure \ref{fig:latent_space} (a), it is evident that the 2d UMAP projections of the latent representations of Ano-swinMAE tend to separate into two clusters when healthy data slices from IXI and pathological ones from BraTS21 are sent to the model without masking. Each of the ellipses formed from the covariance matrix of each cluster %mostly 
approximately contains projected points from one of the datasets (IXI or BraTS21). Figure \ref{fig:latent_space} (b) indicates that, when the pathological regions are masked, the projections of both datasets tend to collapse into a single cluster. This indicates that, although the encoder of the model does not explicitly enforce any constraint to map pathological data into the learnt healthy distribution, the masking strategy enables this mapping. Consequently, this allows the latent representational space to encode diverse semantics, unlike modeling approaches such as GAN and VAE, which constrain the latent space to follow a prior standard Gaussian distribution.
% Consequently, this allows the latent representational space to encode diverse semantics, unlike modeling capabilities like GAN and VAE, which restrict the latent space.
\begin{figure}[hbt!]
\floatconts
  {fig:latent_space}
  {\caption{Two-dimensional UMAP-based projection of the latent space vectors of the Ano-swinMAE model. From left to right: (a) displays the UMAP projection of latent vectors of both IXI and BraTS slices without masking, and (b) displays the projection of BraTS slices, with masking applied to pathological locations alongside unmasked IXI slices.}}
  {\includegraphics[width=0.8\linewidth]{Images/latent_space.png}}
  \vspace{-0.5cm}
 % K-means-based clustering was performed in both cases, revealing that after masking, the representation of BraTS slices has shifted closer to the representations of IXI slices, contrasting with the representations of unmasked BraTS and IXI slices.
\end{figure} 
\subsubsection{Generalization Across Modalities}
From Table \ref{tab:Gen_results}, it is evident that our model Ano-swinMAE shows improved or comparable performance when evaluated on a cross-data setup. When the model is trained on IXI T1 data and evaluated on BraTS21 T2 data, it performs better than the baseline under a similar setup. In contrast, when IXI T2 is used for training and BraTS21 T1 data for inference, the quantitative metrics of our method are slightly lower than those of the baseline. In the case of WMH Flair data, our model has %comparative 
slightly inferior results to the baseline. Figure \ref{fig:latent_space} (a) also supports that our model has better generalization capability, since the representational space encodes meaningful semantics, tending to form %differentiable
separable clusters for anomalous and healthy data. Ano-swinMAE exhibits better performance when trained on T1 and evaluated on T2, since T2 enhances the appearance of pathologies in the images, making them easier to discriminate. In the case of autoDDPM, in contrast, if the pathological information is evident, it tends to remain present after the noising process. Further details are discussed in Appendix \ref{generalization}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Please add the following required packages to your document preamble:
% \usepackage{graphicx}

% \begin{table}[]
% \caption{Comparison of SFT and SFT-KD-Recon for AT distillation method on MRBrainS dataset and 4x acceleration. For SFT, the classification task loss terms are replaced with MSE loss for reconstruction. Our framework which trains in the image domain ensures better reconstruction fidelity than feature domain learning.}
% \label{tab:sft_sftn_psnr}
% \resizebox{\columnwidth}{!}{%
% \begin{tabular}{|l|ccc|ccc|}
% \hline
% \multirow{3}{*}{Dataset} &
%   \multicolumn{3}{c|}{\textbf{SFT training setup}} &
%   \multicolumn{3}{c|}{\textbf{SFT-KD-Recon training setup}} \\ \cline{2-7} 
%  &
%   \multicolumn{3}{c|}{Training done with MSE loss in place of CE/KL loss} &
%   \multicolumn{3}{c|}{Training done in image domain with MSE loss} \\ \cline{2-7} 
%  &
%   \multicolumn{1}{c|}{\textbf{Model}} &
%   \multicolumn{1}{c|}{\textbf{PSNR}} &
%   \textbf{SSIM} &
%   \multicolumn{1}{c|}{\textbf{Model}} &
%   \multicolumn{1}{c|}{\textbf{PSNR}} &
%   \textbf{SSIM} \\ \hline
% \multirow{5}{*}{MRBrainS, 4x} &
%   \multicolumn{1}{c|}{Teacher} &
%   \multicolumn{1}{c|}{34.06   0.7114} &
%   0.9291   0.007151 &
%   \multicolumn{1}{c|}{Teacher} &
%   \multicolumn{1}{c|}{40.05   2.023} &
%   0.9785   0.00655 \\ \cline{2-7} 
%  &
%   \multicolumn{1}{c|}{SF Teacher} &
%   \multicolumn{1}{c|}{34.06   0.6263} &
%   0.9296   0.005212 &
%   \multicolumn{1}{l|}{SFT-KD-Recon Teacher} &
%   \multicolumn{1}{c|}{40.23   2.081} &
%   0.9799   0.00636 \\ \cline{2-7} 
%  &
%   \multicolumn{1}{c|}{Student} &
%   \multicolumn{1}{c|}{33.48   0.4628} &
%   0.9199   0.0051 &
%   \multicolumn{1}{c|}{Student} &
%   \multicolumn{1}{c|}{39.49   1.778} &
%   0.9759   0.005949 \\ \cline{2-7} 
%  &
%   \multicolumn{1}{c|}{KD} &
%   \multicolumn{1}{c|}{32.50   0.4323} &
%   0.9156   0.006479 &
%   \multicolumn{1}{c|}{KD} &
%   \multicolumn{1}{c|}{39.76   1.899} &
%   0.9769   0.006172 \\ \cline{2-7} 
%  &
%   \multicolumn{1}{c|}{SFT} &
%   \multicolumn{1}{c|}{\textbf{33.50   0.4232}} &
%   \textbf{0.9205   0.005292} &
%   \multicolumn{1}{c|}{SFT-KD-Recon} &
%   \multicolumn{1}{c|}{\textbf{40.07   1.983}} &
%   \textbf{0.9789   0.0062} \\ \hline
% \end{tabular}%
% }
% \vspace{-0.4cm}
% \end{table}
% \begin{figure}[ht]
% % \resizebox{\columnwidth}{!}
% \floatconts
%   {fig:boxplot}
%   % \squeezeup
%   {\caption{(a) SSIM Box plots of KD, SFT-KD-Recon  with respect to teacher and student across the brain and cardiac datasets for 4x and 5x acceleration. (b) Reconstruction loss of teacher, student, SFT-Teacher, KD, SFT-KD-Recon on the validation set for the cardiac dataset, 4x acceleration. KD and SFT-KD-Recon use AT as the distillation method.}}
%   {\includegraphics[width=1.0\linewidth]{Images/ssim_valid.pdf}}
%   \vspace{-0.5cm}
% \end{figure}

\vspace{-0.4cm}
\section{Conclusion}
In this work, we have proposed Ano-swinMAE, a transformer-based system for unsupervised anomaly detection, which further extends the scope of transformer usage in the field of medical imaging. Our method has outperformed the baselines and has shown promising generalization capability. Through latent space analysis, we have observed that masking is quite effective for pseudo-healthy reconstruction of brain MRI.



% \begin{abstract}
% This is a great paper and it has a concise abstract.
% \end{abstract}

% \begin{keywords}
% List of keywords, comma separated.
% \end{keywords}

% \section{Introduction}

% This is where the content of your paper goes.  Some random
% notes\footnote{Random footnote are discouraged}:
% \begin{itemize}
% \item You should use \LaTeX \cite{Lamport:Book:1989}.
% \item JMLR/PMLR uses natbib for references. For simplicity, here, \verb|\cite|  defaults to
%   parenthetical citations, i.e. \verb|\citep|. You can of course also
%   use \verb|\citet| for textual citations.
% \item Eprints such as arXiv papers can of course be cited \cite{Hinton:arXiv:2015:Distilling}. We recomend using a \verb|@misc| bibtex entry for these as shown in the sample bibliography.
% \item You should follow the guidelines provided by the conference.
% \item Read through the JMLR template documentation for specific \LaTeX
%   usage questions.
% \item Note that the JMLR template provides many handy functionalities
% such as \verb|\figureref| to refer to a figure,
% e.g. \figureref{fig:example},  \verb|\tableref| to refer to a table,
% e.g. \tableref{tab:example} and \verb|\equationref| to refer to an equation,
% e.g. \equationref{eq:example}.
% \end{itemize}

% \begin{table}[htbp]
%  % The first argument is the label.
%  % The caption goes in the second argument, and the table contents
%  % go in the third argument.
% \floatconts
%   {tab:example}%
%   {\caption{An Example Table}}%
%   {\begin{tabular}{ll}
%   \bfseries Dataset & \bfseries Result\\
%   Data1 & 0.12345\\
%   Data2 & 0.67890\\
%   Data3 & 0.54321\\
%   Data4 & 0.09876
%   \end{tabular}}
% \end{table}

% \begin{figure}[htbp]
%  % Caption and label go in the first argument and the figure contents
%  % go in the second argument
% \floatconts
%   {fig:example}
%   {\caption{Example Image}}
%   {\includegraphics[width=0.5\linewidth]{example-image}}
% \end{figure}

% \begin{algorithm2e}
% \caption{Computing Net Activation}
% \label{alg:net}
%  % older versions of algorithm2e have \dontprintsemicolon instead
%  % of the following:
%  %\DontPrintSemicolon
%  % older versions of algorithm2e have \linesnumbered instead of the
%  % following:
%  %\LinesNumbered
% \KwIn{$x_1, \ldots, x_n, w_1, \ldots, w_n$}
% \KwOut{$y$, the net activation}
% $y\leftarrow 0$\;
% \For{$i\leftarrow 1$ \KwTo $n$}{
%   $y \leftarrow y + w_i*x_i$\;
% }
% \end{algorithm2e}

% Acknowledgments---Will not appear in anonymized version


% \midlacknowledgments{We thank a bunch of people.}


\bibliography{midl24_314}

% \appendix

% \section{Proof of Theorem 1}

% This is a boring technical proof of
% \begin{equation}\label{eq:example}
% \cos^2\theta + \sin^2\theta \equiv 1.
% \end{equation}

% \section{Proof of Theorem 2}

% This is a complete version of a proof sketched in the main text.
% \vspace{0.5cm}
\appendix
\section{}
\subsection{ Inference and Post-Processing Details } \label{Appendix_A1}

During inference on pathological data, the non-overlapping sliding mask ($64 \times 64$) moves across 6 unique positions to cover all brain regions within the image. We have used L1 normalization on the residual images, obtained by subtracting the reconstructed images from the unmasked original image for each of the 6 positions. All the residual images are normalized (0 to 1 normalization) and added to form a combined coarse anomaly map. The results of the post-processing steps are given in Figure \ref{fig: post_processing}. The combined map is eroded with kernel size ($k \times k$) to remove very small false positives and then it is thresholded to retain pixel values greater than 0.5. For BraTS21 $k = 3$ and for MSLUB \& WMH $k=1$. To extract the unique objects in the binary map, we perform connected component analysis. Each of the unique objects is assigned a mean intensity value acquired from the combined map before thresholding. The objects are ranked by the mean intensity over their entire area, and a fixed number of objects with the highest mean intensity are retained. The number of objects that have given the best result for BraTS21 is $2$, and for MSLUB \& WMH, it is $7$, since BraTS21 primarily has a few larger appearing pathologies and the other two sets have multiple smaller pathologies. The obtained map is dilated with kernel size ($3 \times 3$) to obtain a filtered combined map.
\begin{figure}[]
\small
% \resizebox{\columnwidth}{!}
\floatconts
  {fig: post_processing}
  {\caption{Post-processing steps. Rows (i) and (iii): column-wise a. Original Image b. Atropos Segmentation c. Ground Truth d. Combined map after Morphological operations. Rows (ii) and (iv): column-wise a. Combined map after Connected Component Analysis b. Filtered Combined map c. Filtered Combined map with Atropos d. Overlay of Ground Truth and Prediction.}} 
  {\includegraphics[width=0.9\linewidth]{Images/post_processing2.png}}
  \vspace{-0.75cm}

\end{figure} 

We have performed Atropos-based multi-class segmentation (the number of classes is 5) on the original image. We also obtain the unique objects from these segmentation masks and look for objects that share maximal area overlap with the filtered combined map. This gives us the final segmentation map, maintaining the same number of objects as the filtered combined map while refining the precise object boundaries obtained from the Atropos segmentation.

\subsection{Visualisation of Attention maps}
% To calculate the attention map across the encoder layers of the Ano-swinMAE, we have taken the feature vector outputs from each encoder layer. After that, we calculated the mean across channels for every feature vector output. For example, the dimension of feature vector output from encoder layer1 is 56x56x96, after taking the mean across 96 channels, we will have a 56x56 attention map.

To compute the attention map across the encoder layers of Ano-swinMAE, we extracted the feature vector outputs from each encoder layer and then computed the mean across channels for each feature vector output. For instance, the feature vector output dimension from encoder layer 1 is 56x56x96; after averaging across 96 channels, a 56x56 attention map is obtained.

% From Figure \ref{fig: attention_32}, Row(ii) and Row(iv), we observe the attention(yellow region i.e. more intensity region) is there in the masked region and pathological region of the input image as evident from (ii)a and (iv)b. In further encoder layers the attention was distributed. The same pattern is observed when the attention map is analyzed in a 64x64 masking setup which can be seen in Figure \ref{fig: attention_64}. We may conclude that this type of model behavior is happening because the model is trained on healthy MRI slices while looking at the unhealthy slices, the model is trying to reconstruct its healthy counterpart. When we have masked the pathological region the model is even paying more attention.

% To compute the attention map across the encoder layers of Ano-swinMAE, we extracted the feature vector outputs from each encoder layer and then computed the mean across channels for each feature vector output. For instance, the feature vector output dimension from encoder layer 1 is 56x56x96; after averaging across 96 channels, a 56x56 attention map is obtained.

In Figure \ref{fig: attention_32}, Rows (ii) and (iv) reveal that the attention (yellow regions, i.e., regions of higher intensity) is concentrated within the masked and pathological regions of the input image, as indicated by (ii)a and (iv)a. As we progress through the encoder layers, attention becomes more dispersed. A similar pattern is seen when analyzing the attention map in a 64x64 masking setup, as depicted in Figure \ref{fig: attention_64}. This model behavior can be attributed to the training data consisting of healthy MRI slices; when presented with unhealthy slices, the model attempts to reconstruct their healthy counterparts. Furthermore, the model demonstrates increased attention when the pathological region is masked.
\begin{figure}[]
\small
% \resizebox{\columnwidth}{!}
\floatconts
  {fig: attention_32}
  {\caption{Visual results of the attention maps for the 32x32 mask setup. Rows (i) and (iii), column-wise: a. Original Image b. Masked/Unmasked Input Image c. Reconstructed Image d. Residual Image. Rows (ii) and (iv) are the layer-wise Attention Maps for Masked and Unmasked Input Images respectively.
  }}
  {\includegraphics[width=1.0\linewidth]{Images/attention32.png}}
  \vspace{-0.75cm}

\end{figure} 

\subsection{Generalization Across Modalities} \label{generalization}
T2-weighted MRI highlights pathologies like tumors with high water content or those surrounding edema. Hence, pathologies mostly become more visually discriminable in the T2 modality. Visual results of the cross-modality setup are shown in Figure \ref{fig: T1_T2}. When assessing the metrics for T2 while being trained on T1, we observe that the performance of Ano-swinMAE is better since the model is able to easily differentiate between healthy and pathological content. When the modalities are reversed (inferring on T1 while trained on T2), the performance of the model drops due to the less discriminative appearance of the pathology. There is a correspondence between the appearance of the pathology and the performance of the model, since it attempts to fill the masked regions with contextual neighborhood information. In the case of autoDDPM, in contrast, the trend is reversed: with a more evident appearance of pathology (as in T2), the chance that the pathological information passes through the noising process is higher. This degrades the performance scores.
% Figure \ref{fig: attention_32} (i) and (iii)represents the image reconstruction and the anomaly with a 32x32 mask and without using mask respectively. The attention maps in (ii) are layer wise attention maps of the encoder. We analysed 4 attention maps of the encoder. When the input image is passed into the model we get 56x56x96 feature vector and the image in (ii)a. is the attention map 56x56 obtained by taking mean over 96 channels of the layer1 output. Similarly (ii)b. represents the output after the 2nd layer of an encoder. (ii represents output from 3rd layer of the encoder. (d)represents output from 4th layer of encoder. 


  % {\caption{In this figure:
  % (i) represents the image reconstruction and anomaly with a 32x32 mask. 
  % (ii)represents an attention map while the input image is processed through the encoders, here we are analyzing up to 4 layers. As we can see (a) The output after 1st layer of an encoder is of dimension 56X56X96, we have taken the mean across 3rd channel and then displayed the attention map. Similarly (b) represents the output after the 2nd layer of an encoder. (c) represents output from 3rd layer of the encoder. (d)represents output from 4th layer of encoder.
  % In (ii) The yellow region shows the highest intensity region of the image, which depicts the model attention in case of pathology region masked as well as unmasked.
  % From left to right, we can see how attention is shifting. Layerwise attention in encoder of Ano-swinMAE with masking Anomaly}}
% \begin{figure}[]
% \small
% % \resizebox{\columnwidth}{!}
% \floatconts
%   {fig: Resultsimg3}
%   {\caption{ (i) represents the image reconstruction and anomaly without a  mask.(ii) Layerwise attention in encoder of Ano-swinMAE without masking Anomaly.The yellow region shows the highest intensity region of the image, which depicts the model attention in case of the pathology region masked as well as unmasked. While looking at (ii)a of Figure 4 and Figure 5, we can observe the difference in the yellow region.}}
%   {\includegraphics[width=0.9\linewidth]{Images/attention_withoutmask.png}}
%   \vspace{-0.75cm}

% \end{figure} 


\begin{figure}[]
\small
% \resizebox{\columnwidth}{!}
\floatconts
  {fig: attention_64}
  {\caption{Visual results of the attention maps for the 64x64 mask setup. Rows (i) and (iii), column-wise: a. Original Image b. Masked/Unmasked Input Image c. Reconstructed Image d. Residual Image. Rows (ii) and (iv) are the layer-wise Attention Maps for Masked and Unmasked Input Images respectively.}}
  {\includegraphics[width=1.0\linewidth]{Images/attention64.png}}
  \vspace{-0.75cm}

\end{figure} 

\begin{figure}[]
\small
% \resizebox{\columnwidth}{!}
\floatconts
  {fig: T1_T2}
  {\caption{Visual Results of cross-modality setup. Row-wise (i) and (ii) have the result of T2 and T1 images, while the models were trained on T1 and T2 images respectively. Column-wise a. Original Images b. Masked Images c. Reconstructed Images d. Residual Images}}
  {\includegraphics[width=1.0\linewidth]{Images/T1_T2.png}}
  \vspace{-0.75cm}

\end{figure} 

% \begin{figure}[]
% \small
% % \resizebox{\columnwidth}{!}{
% \floatconts
%   {fig: Resultsimg5}
%   {\caption{This figure represents the image reconstruction without masking and corresponding attention map. Layerwise attention in encoder of Ano-swinMAE without masking Anomaly}}
%   {\includegraphics[width=0.9\linewidth]{Images/attentionwithoutmask64.png}}
%   \vspace{-0.75cm}

% \end{figure} 
\end{document}
