\documentclass{midl} % Include author names

\usepackage{mwe} % to get dummy images
\usepackage{multirow}
\usepackage{makecell}

\jmlrvolume{-- 228}
\jmlryear{2024}
\jmlrworkshop{Full Paper -- MIDL 2024 submission}
\editors{Accepted for publication at MIDL 2024}


 \title[Age-Related White Matter Hyperintensity Detection in Clinical Routine]{Detecting Brain Anomalies in Clinical Routine with the $\beta$-VAE: Feasibility Study on Age-Related White Matter Hyperintensities}

\midlauthor{\Name{Sophie Loizillon\nametag{$^{1}$}}
\Email{sophie.loizillon@icm-institute.org}
\AND
\Name{Yannick Jacob\nametag{$^{3}$}} 
\Email{yannick.jacob@aphp.fr}
\AND
\Name{Aurelien Maire\nametag{$^{3}$}} 
\Email{aurelien.maire@aphp.fr}
\AND
\Name{Didier Dormont\nametag{$^{1,2}$}} 
\Email{didier.dormont@icm-institute.org}
\AND
\Name{Olivier Colliot\nametag{$^{1}$}} 
\Email{olivier.colliot@cnrs.fr}
\AND
\Name{Ninon Burgos\nametag{$^{1}$}} 
\Email{ninon.burgos@cnrs.fr}
\AND
\Name{{APPRIMAGE Study Group}\nametag{$^{4}$}} \\
%
\addr $^{1}$ Sorbonne Université, Institut du Cerveau - Paris Brain Institute - ICM, CNRS, Inria, Inserm, AP-HP, Hôpital de la Pitié Salpêtrière, Paris, France. \\ % ARAMIS
\addr $^{2}$ AP-HP, Pitié Salpêtrière, DMU DIAMENT, Dep. of Neuroradiology, Paris, France. \\ % Hospital-Neuroradiology
\addr $^{3}$ AP-HP,  Innovation \& Donn\'{e}es -- D\'{e}partement des Services Num\'{e}riques, Paris, France \\ % Hospital-Neuroradiology
\addr $^{4}$ Members of the APPRIMAGE study group can be found at \href{https://www.aramislab.fr/apprimage} {www.aramislab.fr/apprimage}
}

\begin{document}

\maketitle

\begin{abstract}
This experimental study assesses the ability of variational autoencoders (VAEs) to perform anomaly detection in clinical routine, in particular the detection of age-related white matter lesions in brain MRIs acquired at different hospitals and gathered in a clinical data warehouse (CDW). We pre-trained a state-of-the-art $\beta$-VAE on a healthy cohort of over 10,000 FLAIR MR images from the UK Biobank to learn the distribution of healthy brains. The model was then fine-tuned on a cohort of nearly 700 healthy FLAIR images coming from a CDW. We first ensured the good performance of our pre-trained model compared with the state-of-the-art using a widely used public dataset (MSSEG). We then validated it on our target task, age-related WMH detection, on ADNI3 and on a curated clinical dataset from a single-site neuroradiology department, for which we had manually delineated lesion masks. Next, we applied the fine-tuned $\beta$-VAE for anomaly detection in a CDW characterised by an exceptional heterogeneity in terms of hospitals, scanners and image quality. We found a correlation between the Fazekas scores extracted from the radiology reports and the volumes of the lesions detected by our model, providing a first insight into the performance of VAEs in a clinical setting. We also observed that our model was robust to image quality, which strongly varies in the CDW. However, despite these encouraging results, such an approach is not ready for an application in clinical routine yet due to occasional failures in detecting certain lesions, primarily attributed to the poor quality of the images reconstructed by the VAE.
\end{abstract}

\begin{keywords}
Anomaly Detection, White Matter Hyperintensities, Clinical Data Warehouse, MRI
\end{keywords}

\section{Introduction}
% WMH in FLAIR
Age-related white matter hyperintensities (WMHs) are lesions in the white matter that appear hyperintense on FLAIR MRI and are a very common finding in elderly patients \cite{Debettec3666,prins2015white,habes2016white,wardlaw2015white}. Detection of these lesions is a clinically relevant task to assess their severity. In clinical routine, such WMHs are visually evaluated using the Fazekas score, which is a 4-grade scale ranging from 0 to 3, where 0 corresponds to no lesion and 3 to large, extensive and confluent lesions \cite{fazekas1987mr}. One of the main problems with visual rating scales is that they suffer from intra- and inter-rater variability, leading to inconsistencies between studies \cite{caligiuri2015automatic}. This is why the development of an automatic tool capable of performing WMH detection, robust to MRIs acquired with different machines, manufacturers or acquisition parameters at different sites and of various image quality, is desirable in clinical routine.

%Anomaly Detection
In recent years, unsupervised machine learning algorithms for medical imaging have emerged, with the advantage of not requiring costly and time-consuming manual annotation \cite{chen2022unsupervised}. These algorithms are capable of tackling complex tasks such as anomaly detection \cite{fernando2021deep}. Unsupervised anomaly detection (UAD) can be based on generative models such as variational autoencoders (VAEs) \cite{baur2021autoencoders}, generative adversarial networks (GANs) \cite{schlegl2019f}, or diffusion models \cite{wolleb2022diffusion}. Trained on healthy brain MRIs, the model learns the distribution of healthy brain tissue, ensuring that when confronted with an image presenting anomalies, the abnormal tissues are not well reconstructed. Comparing the reconstructed image with the real one then allows the detection of anomalies. 

%VAE
Among the deep generative models that have been used for UAD, VAEs have demonstrated their ability to detect lesions in datasets of brain MRIs with diverse pathology including multiple sclerosis \cite{commowick2018objective}, glioblastoma \cite{menze2014multimodal} and cerebral small vessel disease \cite{kuijf2019standardized}.
However, the effectiveness of VAEs on routine clinical datasets that reflect the reality of current practice remains to be demonstrated. Clinical data warehouses (CDWs), which gather medical images from thousands to millions of patients, represent an exceptional opportunity to perform this kind of validation on images acquired on different machines, with non-standardised acquisition parameters and variable image quality \cite{bottani2023evaluation,loizillon2024automatic}. 

%%Annonce de l'article
We propose an experimental study to assess the potential of VAEs, using the state-of-the-art $\beta$-VAE~\cite{higgins2016beta}, for anomaly detection, and more specifically targeting age-related WMHs, on clinical datasets. After pre-training our VAE on a non-lesional cohort of FLAIR MRIs from the UK Biobank, we will fine-tune it on a unique FLAIR dataset extracted from the Parisian CDW gathering images from up to 39 different hospitals. After validating the effectiveness of our anomaly detection model on research datasets and on a curated clinical dataset, which has been acquired at a single neuroradiology department, we will evaluate the feasibility of its application on routine clinical MRIs from the Parisian CDW known for its exceptional heterogeneity.    


\section{Materials}

% \subsection{Datasets}

% To perform anomaly detection using a VAE, we first used healthy cohorts to train our model and validate this model on abnormal cohorts. 

\subsection{Construction of the Non-Lesional Cohorts}

% \subsubsection*{Research Dataset}
{\scshape Research Dataset}
The UK Biobank (UKB) is a prospective cohort study involving 500,000 participants aged between 40 and 69 years at time of recruitment (2006--2010) \cite{sudlow2015uk}. Neuroimages, all acquired on the same type of scanner (Siemens Skyra 3~T) using the same acquisition parameters at different sites, are available for some of the participants \cite{Alfaro-Almagro2018ImageProcessing}. In our study, we were only interested in 3D FLAIR MRIs of healthy appearance. After linking the FLAIR images to all diagnostic codes (ICD-10) associated with the patients at each visit, we excluded images in which patients were diagnosed with dementia or lesions. In this way, we extracted a supposedly healthy cohort of 11,990 FLAIR images.

{\scshape Clinical Data Warehouse}
We built a healthy cohort with clinical routine data coming from a large CDW containing all the FLAIR brain MRIs of adult patients scanned in hospitals of the Greater Paris area (Assistance Publique-H\^{o}pitaux de Paris [AP-HP]). Within the CDW we had access to 13,703 FLAIR MRIs and developed the following approach to build a non-lesional cohort. We first associated each MRI with its radiological report and any ICD-10 diagnostic codes associated with the patient. This allowed us to perform an initial filter, eliminating all patients with an ICD-10 code related to dementia or the presence of brain lesions. We then analysed the radiological reports using the \texttt{EDS-NLP} tool \cite{edsnlp}, which allowed us to extract the \textit{``Conclusion''} section of the document. A filtering process was applied to these conclusions, looking for references to the absence of abnormalities:  \textit{``pas d'anomalie'', ``normal'', ``absence d'anomalie'', ``pas d'argument'', ``sans anomalie'', ``pas de signe'', ``pas d'accident'', ``sans particularité'', ``Pas de lésion cérébrale''} and \textit{``Pas de lésion encéphalique''} (Figure~\ref{supfig:translations}). The dataset was further refined by a manual filtering process on the \textit{Conclusion} associated with a visual inspection of the images to ensure that the resulting MRIs correspond to 3D non-lesional FLAIRs (i.e., no straight reject, see below). Thus, we built a new healthy cohort of 674 FLAIR images out of the 13,703 which were acquired on 12 different machines from three different manufacturers. This dataset is characterised by its great heterogeneity with images acquired over 17 different hospitals with no homogenisation in the acquisition parameters. Thus, this cohort well represents 3D FLAIR brain MRIs that may be acquired in other hospitals every day. 

\subsection{Datasets with Images Presenting Lesions}
{\scshape Research Datasets}
%MSSEG : à reformuler
The MSSEG MICCAI challenge, which includes 53 patients affected by multiple sclerosis across four different sites, aims to perform the segmentation of WMHs \cite{commowick2018objective}. Four different scanners were used: GE Discovery 3~T, Philips Ingenia 3~T, Siemens Aera 1.5~T and Siemens Verio 3~T. Each patient underwent four MRI sequences: 3D FLAIR, 3D T1w, 3D contrast-enhanced T1w and 2D T2w. In our study, we only considered the 3D FLAIR.
%ADNI3
The Alzheimer's Disease Neuroimaging Initiative (ADNI) is a multi-site study of elderly individuals with normal cognition, mild cognitive impairment, or Alzheimer's disease \cite{Weiner2017AlzheimerDisease}. 
The ADNI-3 phase includes 3D FLAIR MRIs acquired exclusively on 3~T scanners from different manufacturers (GE, Siemens, and Philips). We used 20 of the FLAIR images that had previously been manually segmented by a trained radiology resident as described in  \cite{vanderbecq2020comparison}.

{\scshape Curated Single-Site Clinical Dataset}
This routine clinical dataset consists of 60 patients diagnosed with cognitive impairment at the neuroradiology department of the Pitié-Salpêtrière hospital. We reused the dataset from a previous study \cite{vanderbecq2020comparison}, in which all patients had 3D T1-w and FLAIR sequences (except for two patients who had a 2D FLAIR). All data were collected during a routine clinical workup and were retrospectively extracted for the purpose of this study. Therefore, according to French legislation, explicit consent was waived. The images were acquired on four different MRI scanners and curated for image quality. Manual segmentation of WMHs was performed by a trained radiology resident \cite{vanderbecq2020comparison}. This dataset will be referred to as PITIE in the remainder of the article. 

%\subsubsection*{Clinical data Warehouse}
{\scshape Clinical Data Warehouse}
We constructed a cohort of patients presenting brain lesions using routine clinical data from the AP-HP CDW. We developed the following approach to construct a new cohort out of the 13,703 FLAIR MRIs available. We first linked each FLAIR with its radiological report. From the radiological report, we extracted the Fazekas scale \cite{fazekas1987mr} when it was mentioned in the \textit{``Conclusion''} using \texttt{EDS-NLP}. 
We found this information for 204 FLAIR images. In contrast to MRIs from the PITIE dataset, these images come from nine different types of machines in 14 hospitals and have a wide range of image quality, reflecting the diversity of MRIs seen in clinical routine. We automatically assessed the image quality using our quality control model, which classifies images into good, medium, low quality and straight reject (e.g., truncated images) \cite{loizillon2023semi}. The quality distributions across Fazekas scores are displayed in the appendix (Table~\ref{suptab:fazekas_quality}).

Participant demographics for the various cohorts are summarised in Table~\ref{tab:demo}.

\begin{table}[h]
\centering
\caption{Age (average [range]) and sex (\% females) of the participants from the non-lesional (UKB \& CDW) and lesional (MSSEG, ADNI3, PITIE and CDW) cohorts. Note that for the CDW cohort with images presenting lesions, we had access to demographic data for only 171 out of the 204 images.}
\begin{tabular}{ccccc}
\hline
                                 & Dataset      & N images & Age           & Sex (\%F) \\ \hline
\multirow{2}{*}{Non-lesional cohorts}  & UKB          & 11990    & 56.63 [44--83]  &     62.73         \\
                                 & CDW          & 674      & 45.57 [18--87]
         & 55.49     \\ \hline
\multirow{4}{*}{\makecell{Cohort with images \\ presenting lesions}} & MSSEG        & 53       & 45.42 [24--66] & 71.70 \\
                                 & ADNI3        & 20       & 71.07 [58--83] & 50     \\
                                 & PITIE        & 60       & 78.20 [52--101] & 50        \\
                                 & CDW          & 204         &   77.34 [43--96]            &   52.05        \\ \hline
\end{tabular}
\label{tab:demo}
\end{table}

\section{Proposed Approach}

\subsection{Image Pre-processing}
FLAIR MRIs were pre-processed using the \texttt{flair-linear} pipeline from Clinica \citep{routier2021clinica}. First, a bias field correction was applied using the N4ITK method \citep{tustison2010n4itk}. An affine registration to the MNI space was then performed \citep{avants2008symmetric}. Registered images were normalised by clipping the intensity values to the [2,98] percentiles and cropped to remove background, resulting in images of size $169 \times 208 \times 179$, with 1~mm isotropic voxels. 

\subsection{Unsupervised Anomaly Detection with a $\beta$-VAE}

%VAE general
%A VAE was used to model the distribution of healthy brain MRIs encountered during training. As the model has only been exposed to healthy MRIs during training, it faces challenges when confronted with abnormal structures. As a result, reconstruction errors in these regions are expected to be higher than in normal regions. By differentiating the input image and its reconstruction, we can detect and localise abnormalities in a so called residual image \cite{kingma2013auto}.

% More technical details
%The main objective of VAE is to approximate the true distribution of the healthy training FLAIR MR images with a simple parameterised distribution.  
% The encoder maps the input image \textbf{x} to a distribution over latent space \textbf{z} that is regularised to approximate a prior distribution defined as a normal distribution. The latent vector \textbf{z} is then sampled from this normal distribution, and the decoder is trained to reconstruct $\hat{\textbf{x}}$ from \textbf{z}. Thus, the loss function that is minimised during training is composed of a reconstruction and a regularisation terms. The reconstruction loss forces the model to reconstruct the original image \textbf{x} from the compressed representation \textbf{z} of the input image. The regularisation loss ensures that the latent space \textbf{z} are distributed according to a standard normal distribution.

Many VAE variants have been proposed to perform UAD \cite{chadebec2022pythae} and several have recently been compared in a benchmark focused on detecting dementia-related lesions in 3D positron emission tomography images \cite{hassanaly2023unsupervised,hassanaly2024benchmark}.
%Beta VAE : définir les notations de l'équation
Based on the results of this benchmark \cite{hassanaly2024benchmark}, we decided to train a $\beta$-VAE \cite{higgins2016beta}, which encourages the disentanglement of features in the latent space by adding a weight $\beta$ in front of the Kullback-Leibler divergence regularisation term to adjust the balance with the reconstruction loss. 

% The objective function is: 
% \begin{equation*}
%     \mathcal{L}_{\beta\text{-VAE}} = \mathbb{E}_{z \sim q_\phi(z|x)}\left[\log p_\theta(x | z)\right] - \beta \mathcal{D}_\text{KL}\left[q_\phi(z | x) || p_z(z)\right],
% \end{equation*}

We used a 3D $\beta$-VAE model with an encoder of five blocks and a symmetric decoder \cite{hassanaly2024benchmark}, see Figure~\ref{fig:vae_archi}. Each encoder block is composed of a convolutional layer, a batch normalisation and a swish activation function. These blocks are followed by a flatten and a fully connected layer. The latent space size was 256 and $\beta$ equal to 10. The model was trained over 30 epochs, with a learning rate of $10^{-5}$ and a batch size of 4 using ClinicaDL \cite{thibeau2022clinicadl}.

%Modifier figure : modifier l'image saine reconstruite
\begin{figure}[ht!]
    \centering
    \caption{Variational autoencoder (VAE) architecture for brain anomaly detection}
    \includegraphics[width=0.9\linewidth]{vae_archi_end.pdf}
    \label{fig:vae_archi}
\end{figure}

\subsection{Post-processing}
After multiplying each residual image, i.e., the difference between the input and output of the $\beta$-VAE, with an eroded brain mask to eliminate false positives near the brain contour, a 3D median filter with a kernel size of 5 was used to obtain a smoother map that was thresholded to obtain a binary segmentation mask. As in \cite{baur2021autoencoders}, the threshold was model specific and determined as the 98th percentile of the model reconstruction errors on the training dataset. Finally, we performed a 3D connected component analysis and excluded any segmented region with a volume of less than 10 voxels.

\subsection{Experimental Setup}
% Data Split
For pre-training our $\beta$-VAE model on the UKB dataset, the 11,990 images were split into a training and a validation set containing 8304 and 3686 MRIs. In the following fine-tuning step on the CDW, 574 images were used for training and 100 were left for validation. The separation between training and validation sets was done at the subject level to avoid data leakage and stratified by sex and age. We evaluate the pre-trained model on three independent test sets presenting brain anomalies: MSSEG, ADNI3 and PITIE, and apply the fine-tuned model on the CDW.

\subsection{Evaluation Metrics}

To evaluate the anomaly detection results, the two best suited metrics according to \cite{maier2022metrics} were the Dice Similarity Coefficient (DSC) and the Normalised Surface Dice (NSD). We also computed three other relevant metrics: absolute volume error rate (AVR), voxel-level false positive ratio (FPR), voxel-level false negative ratio (FNR).

\begin{equation*}
     \mathrm{AVR} = \frac{|V_R - V_A|}{V_R} \enspace , \quad
     \mathrm{FPR} = \frac{\mathrm{FP}}{\mathrm{FP} + \mathrm{TN}} \enspace , \quad
     \mathrm{FNR} = \frac{\mathrm{FN}}{\mathrm{FN} + \mathrm{TP}} \enspace ,
\end{equation*}
where \(V_R\) is the reference volume, \(V_A\) is the automatic volume, and \(\mathrm{FP}\), \(\mathrm{TN}\), \(\mathrm{FN}\) and \(\mathrm{TP}\) are the numbers of false positive, true negative, false negative and true positive voxels, respectively.
For each metric, we report the mean and the 95\% confidence interval computed using bootstrapping on the corresponding independent test set (9999 resamples).

\section{Results}

\subsection{Validation of the $\beta$-VAE for Anomaly Detection in Research Datasets}

We first tested our pre-trained model using the MSSEG dataset, which has been used extensively in the literature to detect multiple sclerosis lesions. Although these lesions differ from age-related WMHs, our target, this step ensures that our VAE produces results consistent with those reported in the literature on a publicly available dataset. We then validated our model on our target task -- the anomaly detection of age-related WMHs -- on ADNI3 and the curated clinical dataset PITIE. Results are presented in Table~\ref{tab:res_pretrained}. 

\begin{table}[ht!]
    \centering
    \caption{Validation %of the WMH detection 
    on research datasets (MSSEG and ADNI3) and a curated clinical dataset (PITIE). Each metric is presented as average [95\% confidence interval].%, the table displays the average and the 95\% confidence interval within brackets.
    } % Utlisation des accronymes dans le tablesau
    \begin{tabular}{ccccccc}
        \hline
         Dataset & FLAIR & DSC (\%) & NSD (\%) & AVR & FPR & FNR \\ \hline
         MSSEG & 53 & \makecell{30.81\\ {[}25.66,36.07{]} }  & \makecell{32.05\\ {[}26.63,37.34{]} }& \makecell{ 0.80 \\ {[}0.64,0.98{]}}  & \makecell{0.63 \\ {[}0.55,0.71{]}} & \makecell{0.62 \\ {[}0.58,0.66{]}}  \\ \hline
         ADNI3 & 20 & \makecell{27.06 \\ {[}19.73,34.52{]} }& \makecell{30.68\\ {[}22.98,38.4{]} } & \makecell{ 0.71 \\ {[}0.61,0.8{]}}  & \makecell{0.77 \\ {[}0.68,0.85{]}} & \makecell{0.48 \\ {[}0.4,0.56{]}} \\ \hline
         PITIE & 60 & \makecell{35.02 \\ {[}29.89,40.2{]} }& \makecell{36.63\\ {[}31.29,41.81{]} }  & \makecell{ 0.64\\ {[}0.56,0.71{]}}  & \makecell{0.71 \\ {[}0.65,0.77{]}} & \makecell{0.4 \\ {[}0.35,0.44{]}} \\ \hline
    \end{tabular}
    \label{tab:res_pretrained}
\end{table}

% Conclusion intermédiaire disant que les résultats sont alignés avec la littérature
The DSC of 30.81\% observed on MSSEG is in agreement with existing works using VAEs for WMH detection on this dataset, such as that of~\cite{baur2021autoencoders}, which obtained a DSC of 25.70\%. We obtained similar results on ADNI3 and PITIE, showing that we were able to detect age-related WMHs. FPR was higher on the ADNI3 and PITIE datasets, which may be attributed to the difficulty the model has in reconstructing areas with atrophy. In contrast, we observe a higher FNR on the MSSEG dataset, suggesting that our model fails to detect many lesions of multiple sclerosis patients, which present different shapes and contrast distributions compared to age-related WMHs. A sample from the PITIE dataset is presented in Figure~\ref{fig:residualmaps}. Further examples are given in the appendix (Figure~\ref{supfig:research_results}). 

\begin{figure}[ht!]
    \caption{Left to right: input MRI, reconstructed MRI, residual map, post-processed lesion map, ground truth. This example corresponds to a favourable outcome in terms of DSC in regard to the overall results.}
    \centering
    \includegraphics[width=0.85\linewidth]{resi_short_2.png}
    \label{fig:residualmaps}
\end{figure}


\subsection{Application of the $\beta$-VAE on Clinical Routine Images}
As observed in \cite{bottani2023evaluation, loizillon2024automatic}, there is an important drop of performance when applying a model trained on research data to heterogeneous clinical data. This is why we fine-tuned our pre-trained model on 674 healthy FLAIRs from the CDW before applying it on 204 patients of the CDW for which we knew the Fazekas score. Figure~\ref{fig:seg-cdw} depicts the lesion maps obtained for Fazekas scores 1, 2, and 3. More examples are shown in the appendix (Figure~\ref{supfig:results_cdw}).

\begin{figure}[ht!]
    \caption{Automatic lesion maps for patients with Fazekas score 1 (A), 2 (B), and 3 (C).
    %B is a tier 1 MRIs, A a tier 2 and C a tier 3 image. 
    A is of medium image quality, B of good image quality and C of low image quality.}
    \centering
    \includegraphics[width=0.9\linewidth]{eds_red.png}
    \label{fig:seg-cdw}
\end{figure}

We examined the relationship between the Fazekas score and the volume of lesions automatically obtained by our $\beta$-VAE (Figure~\ref{fig:boxplot-cdw}). Specifically, differences in volumes between the three patient groups with different Fazekas scores (Fazekas 1, Fazekas 2, Fazekas 3) were assessed using one-way ANOVA. The results were statistically significant ($F=32.38$, $p<10^{-10}$). Tukey post hoc tests were performed to assess differences between pairs of groups. All pair-wise differences were significant (Fazekas 1 vs 3: $p=0.0036$, Fazekas 1 vs 2: $p<0.001$, Fazekas 2 vs 3: $p<0.001$).

\begin{figure}[ht!]
    \caption{Boxplot depicting the distribution of lesion volumes per Fazekas score (left) and per Fazekas score categorised by image quality (right). Every box is bounded by the lower and upper quartiles, with the centre line representing the median. Whiskers extend from the box to the furthest data point within 1.5$\times$ the interquartile range of the box. $^{**}$: statistically significant for Tukey post-hoc test.}
    \centering
    \includegraphics[width=0.9\linewidth]{subset_rebuttal.png}
    \label{fig:boxplot-cdw}
\end{figure}

As images from the CDW present a wide range of image quality, we assessed whether our anomaly detection model was robust to quality. In Figure~\ref{fig:boxplot-cdw}, we plot for every quality level the distribution of WMH volumes by Fazekas score. We did not perform statistical testing due to the very small sample size in some subclasses. Nevertheless, the graph qualitatively shows the robustness of our model to image quality since the order in volumes across Fazekas classes is preserved for all quality levels.


\section{Conclusion}
In this feasibility study, we assessed the ability of VAEs to detect age-related WMHs on clinical routine MRIs acquired at different hospitals and gathered in a CDW. Despite the promising results found -- correlation between the Fazekas scores and the volumes of the lesions detected by our model; robustness to image quality -- such models are not ready for a clinical routine application yet. This may be due to a systematic failure to reconstruct details of the cortical and subcortical brain structures, resulting in difficulties in identifying some WMHs and leading to failure cases where no WMH is detected at all in Fazekas 2 images (Figure~\ref{fig:boxplot-cdw}). A limitation of our study is the use of a $\beta$-VAE whose architecture and hyperparameters were optimised on a different imaging modality \cite{hassanaly2024benchmark}. It may therefore be of interest to optimise the architecture and hyperparameters for our specific use case. In addition, it would be interesting to evaluate the ability of this model to detect other types of WMHs, such as multiple sclerosis lesions. This is left for future work.

\midlacknowledgments{
The research leading to these results has received funding from the French government under management of Agence Nationale de la Recherche as part of the ``Investissements d'avenir'' program, reference ANR-19-P3IA-0001 (PRAIRIE 3IA Institute) and reference ANR-10-IAIHU-06 (Agence Nationale de la Recherche-10-IA Institut Hospitalo-Universitaire-6).

The research was done using the Clinical Data Warehouse of the Greater Paris Hospitals. The authors are grateful to the members of the AP-HP DSN and URC teams, and in particular St\'{e}phane Br\'{e}ant, Florence Tubach, Jacques Ropers, Pierre Rufat, Antoine Roz\`{e}s, Camille Nevoret, Christel Daniel, Martin Hilka, Julien Dubiel, Cyrina Saussol and Rafael Gozlan. They would also like to thank the ``Coll\'{e}giale de Radiologie of AP-HP'' as well as, more generally, all the radiology departments from AP-HP hospitals.

Data used in preparation of this article were obtained from the Alzheimer's Disease Neuroimaging Initiative (ADNI) database (\url{adni.loni.usc.edu}). As such, the investigators within the ADNI contributed to the design and implementation of ADNI and/or provided data but did not participate in analysis or writing of this report. A complete listing of ADNI investigators can be found at: \url{http://adni.loni.usc.edu/wp-content/uploads/how_to_apply/ADNI_Acknowledgement_List.pdf}.

% Ajouter remerciement Maelys et Ravi pour l'aide sur les VAE
The authors would also like to thank Ravi Hassanaly and Maëlys Solal for their help implementing VAEs and their feedback.



}


\bibliography{midl24_228}

\clearpage

\renewcommand{\theHsection}{A\Alph{section}}

\renewcommand\theHfigure{A{\arabic{figure}}} 
\renewcommand\thefigure{A{\arabic{figure}}} 
\setcounter{figure}{0} 
\renewcommand\theHtable{A\arabic{table}} 
\renewcommand\thetable{A\arabic{table}} 
\setcounter{table}{0}   

\appendix
\section{Construction of Non-Lesional Cohorts}
\begin{figure}[h]
    \centering
    \caption{Translation of the terms used to construct the non-lesional  cohort in the CDW.}
    \begin{tabular}{|l|l|}
        \hline
        \textbf{French} & \textbf{English} \\
        \hline
        Pas d'anomalie & No anomaly \\
        Normal & Normal \\
        Absence d'anomalie & Absence of anomaly \\
        Pas d'argument & No argument \\
        Sans anomalie & No anomaly \\
        Pas de signe & No sign \\
        Pas d'accident & No accidents \\
        Sans particularité & No particularities \\
        Pas de lésion cérébrale & No cerebral lesion \\
        Pas de lésion encéphalique & No encephalic lesion \\
        \hline
    \end{tabular}
    \label{supfig:translations}
\end{figure}

\section{Results of the pre-trained $\beta$-VAE }

\begin{figure}[ht!]
    \centering
    \caption{Automatic WMH maps for two patients of MSSEG (left), PITIE (middle) and ADNI3 (right). Green voxels correspond to true positives, red to false positives and blue to false negatives.}
    \includegraphics[width=1\linewidth]{res_research.png}
    \label{supfig:research_results}
\end{figure}

It is important to note that because of the exclusion criteria applied by ADNI, we found a lower vascular burden in this dataset compared to others (cf.\ Figure~\ref{supfig:research_results}).


\newpage
 \begin{figure}[ht!]
    \centering
    \caption{Automatic lesion maps for cognitively normal patients of ADNI dataset.}
    \includegraphics[width=0.7\linewidth]{rebuttal_healthy_adni3.png}
    \label{supfig:research_results_healthy_ADNI}
\end{figure}

We evaluated our $\beta$-VAE on 25 ADNI subjects labelled as cognitively normal (CN). For each MRI, we computed the amount of lesional volume and obtained a mean of 9.30~cm$^3$ of lesional tissue for an average age of 66.87 years. Even though these images are labelled as CN, as they are normal relative to the age of the patients, they still present some lesions (cf.\ Figure~\ref{supfig:research_results_healthy_ADNI}). The validation of our model on a young healthy cohort such as the UKB is left for future work.

In Figure~\ref{supfig:rebuttal_age}, we plotted the lesion burden volume as a function of age for each Fazekas group in order to observe the robustness of our model to age. We visually notice that for the Fazekas 1 group, the volume of lesion remains almost always below 10~cm$^3$ even for patients over 80 years. The same trends were obtained for Fazekas 2 and 3. Thus, we believe our model to be robust to age.

\begin{figure}[h]
    \centering
    \caption{Plot illustrating the variation of lesion volume with age across different Fazekas groups.}
    \includegraphics[width=0.8\linewidth]{rebuttal_age.png}
    \label{supfig:rebuttal_age}
\end{figure}

\section{Quality Levels Across Fazekas Scores}

\begin{table}[ht!] % Ajouter age / sex par fazekas ?
    \centering
    \caption{Number of images across quality levels and Fazekas scores. Straight reject images are not proper 3D FLAIRs (e.g., images of segmented tissues or truncated images).}
    
    \begin{tabular}{ccccc}
     \hline
         & Good quality & Medium quality & Low quality & Straight reject \\ \hline
        Fazekas 1 & 18 & 15 & 1 & 0 \\  \hline
        Fazekas 2 & 29 & 36 & 6 & 0\\  \hline
        Fazekas 3 & 12 & 37 & 14 & 8 \\  \hline
    \end{tabular}
    \label{suptab:fazekas_quality}
\end{table}


\begin{table}[ht!]
    \centering
    \caption{Age (average $\pm$ standard deviation) and sex (\% females) across Fazekas scores.}
    \begin{tabular}{ccc}
    \hline
         & Age  & Sex (\% F) \\ \hline
        Fazekas 1 & 71.17 $\pm$ 11.06 & 57.14  \\ \hline
        Fazekas 2 & 75.99 $\pm$ 11.81 &  39.44 \\ \hline
        Fazekas 3 & 82.2 $\pm$ 7.78  &  60.87 \\\hline
    \end{tabular}
    \label{suptab:fazekas_demographics}
\end{table}

\begin{figure}[h]
    \centering
    \caption{Automatic WMH maps for two patients with Fazekas score 1 (left), 2 (middle), and 3 (right). A, C and D are good quality MRIs, B and E medium quality images and F a low quality image.}
    \includegraphics[width=1\linewidth]{eds.png}
    \label{supfig:results_cdw}
\end{figure}

\end{document}