\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\jmlryear{2024}
\jmlrworkshop{Full Paper -- MIDL 2024}
\jmlrvolume{-- 141}
\editors{Accepted for publication at MIDL 2024}

 % Added package by HY
% \usepackage[nohyperlinks, printonlyused, withpage, smaller]{acronym}
 % Added package by HY to include acronyms
% \usepackage{acro}
\usepackage{wrapfig}
\usepackage{booktabs}
\usepackage{graphicx}
\usepackage{adjustbox}

% Import the acronym package
\usepackage[nolist]{acronym}

% Added package by HY to include acronyms from the external file
% \input{acros}
\input{acronyms}


\title[Anatomical Consistency of Synthetic vs. Real MRI]{Evaluating Age-Related Anatomical Consistency in Synthetic Brain MRI against Real-World Alzheimer's Disease Data.}


 % Three or more authors with different addresses:
 \midlauthor{\Name{Hadya Yassin\nametag{$^{1,2}$}} \Email{hadya.yassin@hpi.de}\\
  \Name{Jana Fehr\nametag{$^{1,2,3}$}} \Email{jana.fehr@bih-charite.de}\\
  \Name{Wei-Cheng Lai\nametag{$^{1,2}$}} \Email{wei-cheng.lai@hpi.de}\\
  \Name{Alina Krichevsky\nametag{$^{1,2}$}} \Email{alina.krichevsky@student.hpi.de}\\
  \Name{Alexander Rakowski\nametag{$^{1,2}$}} \Email{alexander.rakowski@hpi.de}\\
  \Name{Christoph Lippert\nametag{$^{1,2,4}$}} \Email{Christoph.Lippert@hpi.de}\\
  \addr  $^{1}$ Digital Engineering Faculty, University of Potsdam, Germany \\
  \addr $^{2}$ Digital Health and Machine Learning, Hasso-Plattner-Institute, Potsdam, Germany \\
  \addr $^{3}$ QUEST Center for Responsible Research, Berlin Institute of Health (BIH), Charit\'e Universit\"atsmedizin Berlin, Germany \\
  \addr $^{4}$ Hasso Plattner Institute for Digital Health at Mount Sinai, Icahn School of Medicine at Mount Sinai, New York, NY, United States of America} 


\begin{document}

\raggedbottom

\maketitle

\begin{abstract}
This study examines the realism of medical images created with deep generative models, specifically their replication of aging and \ac{AD} related anatomical changes. Previous research focused on developing generative methods with limited attention to image fidelity. We aim to assess the resemblance of brain MRI generated by a StyleGAN3 model with causal controls to neurodegenerative changes. For a benchmark, we conducted a \ac{VTT} to see if radiologists could distinguish between synthetic and real images. Then, we employed a U-Net-based model to segment hallmarks relevant to normal aging and \ac{AD}. Finally, we conducted statistical tests for our hypothesis that no significant differences existed between real and synthetic images. \ac{VTT} results showed radiologists struggled to differentiate between image types, highlighting \ac{VTT}'s limitations due to subjectivity and time constraints. We found slight hippocampus distribution differences ($\textit{P}$ = 5.7e-2) and significant lateral ventricle discrepancies ($\textit{P}$s $<$ 5.0e-2), indicating higher hippocampus realism and ventricle size inconsistencies. The model more effectively simulated changes in the hippocampus than in the lateral ventricles, where difficulties were encountered with certain subgroups. We conclude that the \ac{VTT} alone is inadequate for a comprehensive quality evaluation, promoting a more objective approach. Future research could adapt our approach to evaluate other generated medical images intended for different downstream tasks. For reproducibility, we provide detailed code implementation\footnote{\href{https://github.com/hsyassin/BioPlausibleSynthImgEval.git}{github.com/hsyassin/BioPlausibleSynthImgEval.git}}.
\end{abstract}

\begin{keywords}
Evaluating Generative Models, MRI, Alzheimer's, Anatomical Consistency.
\end{keywords}

\section{Introduction}
\label{sec:intro}
Deep learning algorithms, essential for automating medical image analysis, rely heavily on access to abundant, high-quality datasets for training \citep{sarker2021deep}. Acquiring such data is hindered by high costs associated with equipment, expert annotation, subject availability, and privacy concerns \citep{diaz2021data, kaissis2020secure}. Consequently, these challenges can lead to data biases that significantly impair the algorithms' performance in real-world applications, as they may not accurately reflect the diversity of real patient populations or the complexity of medical conditions encountered in clinical settings \citep{nittas2023beyond}. 

Despite significant progress in synthetic medical image generation through \acp{GAN} \citep{dash2023review} and Diffusion models \citep{yang2023diffusion}, traditional evaluation metrics like the \ac{FID} and \ac{SSIM} primarily assess overall image quality, overlooking anatomical accuracy critical for medical applications. Thus, accurately mirroring human physiology and conditions like \ac{AD} is crucial for synthetic images to bridge dataset gaps. Therefore, we need ``Biological Plausibility'' metrics, introduced by \citet{treder2022quality} and designed to provide anatomical analysis specific to clinical needs. Our evaluation method focuses on crucial neurodegenerative features for normal aging, which are more prominent in \ac{AD}, including ventricular enlargement and hippocampal shrinkage \citep{frisoni2010jack, katabathula2021predict}.

\Ac{VTT} is a standard for assessing synthetic images' biological plausibility in the generative field, as utilized by \citet{khader2022medical} to evaluate diffusion model performance. However, \citet{treder2022quality} highlights \ac{VTT}'s limitations: subjectivity, high costs, and difficulty detecting subtle anatomical changes. These issues suggest that \ac{VTT} is insufficient in ensuring synthetic images' clinical applicability. Our study benchmarks against \ac{VTT}, promoting more objective, robust, and clinically relevant evaluation methods to improve synthetic images' clinical utility.

Longitudinal \ac{GAN} studies on \ac{AD} explore biological plausibility. \citet{xia2021learning} analyzes \ac{RC} in the \ac{ROI} volumes between baseline and follow-up images, real and synthetic. \citet{peng2021longitudinal} examines the \ac{ROI} \ac{AVD} between synthetic follow-ups and real manual segmentation. \citet{fu2023fast} calculate \ac{MAE} for \ac{ROI} volumes in longitudinal data, differing from \citet{ravi2022degenerative} that calculate \ac{MAE} for randomly matching samples by age, sex, and \ac{CDR}. The pairwise approach in longitudinal studies, which tracks changes in the same individual, offers more accuracy than random matching. \citet{ribeiro2023high} evaluates \ac{MAE} using model-predicted counterfactual volumes rather than actual volumes segmented from the counterfactual images, which may misrepresent true volumes. These methods aim to compare anatomical consistencies between real and synthetic data but reduce the evaluation to a single average value (\ac{RC}, \ac{AVD}, \ac{MAE}), favoring benchmark comparisons of generative models over anatomical consistency and clinical relevance. \citet{wilms2022invertible} investigates age-related brain volume changes across generative models without real data comparison, omitting key evaluations of synthetic-to-real anatomical consistency.

Additionally, the mentioned methods segment \acp{ROI} using statistical and algorithmic strategies, such as FSL (FMRIB Software Library) and FreeSurfer \citep{jenkinson2012woolrich,fischl2012freesurfer}, or multi-atlas methods \citep{wang2014multi,doshi2013multi}. In contrast, our method utilizes the advanced capabilities of deep learning. \citet{litjens2017survey,shen2017deep} demonstrate that deep learning surpasses traditional methods in accuracy, efficiency, and detection of complex patterns.

Deep learning segmentation enables accurate anatomical consistency comparisons between synthetic \ac{ROI} sizes and real counterparts, capturing essential variability in aging and \ac{AD}. Such analysis is crucial for clinical applications requiring high accuracy and transparency. Our objective is to assess the null hypothesis (H0) that there is no significant distributional difference in brain \ac{ROI} areas between real \ac{ADNI} images and their synthetic counterparts. We utilize statistical tests to examine differences in magnitude, certainty, direction, and \ac{dist} shapes. Finally, \citet{jung2023conditional,xia2021learning} conducted statistical analyses to identify \ac{dist} differences and focused specifically on anatomical consistencies across \ac{CDR} groups using multi-hypothesis testing. However, our research expands to include age and sex covariates and employs an alternative approach that avoids multi-hypothesis testing.

\section{Methodology}

\begin{wrapfigure}{r}{0.4\textwidth}
    \floatconts
  {fig:QA}
  {\caption{Overview of the analysis}}
  {\includegraphics[width=0.91\linewidth]{images/Figure1.png}}
\end{wrapfigure}

\figureref{fig:QA} gives an overview of our biological plausibility analysis. The inputs are \ac{AD}-focused synthetic images generated by a conditional \ac{GAN} with causal control and a subset of the \ac{ADNI} dataset serving as the real-world reference (unseen in training and validation of both the generative and segmentation models to reduce bias). \ac{VTT} was selected as a benchmark for its established role in assessing synthetic image realism. Our method quantifies \ac{ROI} area via a segmentation model and performs statistical tests to evaluate anatomical consistency between image types across different covariates. Understanding our analysis requires familiarity with real-world and synthetic data characteristics, synthetic image generation, and the generative model's capabilities.

\subsection{Use Case and Datasets}

\bigskip 

% \subsubsection{Real Data}
\noindent  \textbf{Real Data.} We sourced real \ac{MRI} brain scans from the \ac{UKB} \citep{sudlow2015uk} and \ac{ADNI} \citep{ADNI_data_2021} datasets. The \ac{UKB} provided 42,427 high-resolution T1-weighted 3T \ac{MRI} brain scans of individuals aged 40--69 (mean age: 55). Unlike the \ac{UKB}, which serves a general cohort, the \ac{ADNI} provides a focused dataset on specific diseases. The \ac{ADNI} database (\href{https://adni.loni.usc.edu}{adni.loni.usc.edu}), established in 2003, explores the use of imaging, biological markers, and clinical assessments to track the progression of \ac{MCI} and early \ac{AD}. \ac{ADNI} consists of high-resolution T1-weighted 3T \ac{MRI} brain scans from 9,183 participants aged 55--90 (mean age: 74), including 3,273 \ac{CN} cases (CDR=0), 4,943 \ac{MCI} cases (CDR=0.5), and 967 \ac{AD} cases (CDR$\geq$1).

\bigskip

\noindent \textbf{Synthetic Data.} We generated 600,000 2D mid-slice images using a conditional \ac{StyleGAN3} model \citep{karras2021alias} with causal control \citep{pawlowski2020deep}. While \citet{kocaoglu2017causalgan} introduced causal inference to \acp{GAN}, our model combines a conditional \ac{GAN} structure \citep{mirza2014conditional,miyato2018cgans} with a custom causal model. To achieve a high-quality synthesis, we initially trained the model on the \ac{UKB} dataset for its extensive data volume, followed by fine-tuning it on the \ac{ADNI} dataset. For both datasets, our causal model consistently incorporated covariates like \textbf{age}, \textbf{sex}, and \textbf{left and right lateral ventricle volumes}. Given the disease-specific nature of the \ac{ADNI}, we added \ac{AD}-associated conditional labels like \textbf{\ac{CDR}} and \textbf{hippocampus volumes} \citep{frisoni2010jack, katabathula2021predict} during its fine-tuning.

During the inference step, the images are synthesized by the causal and \ac{StyleGAN3} models. The causal model is used to provide conditional labels as vectors, \textit{i.e.}, \textbf{age}, \textbf{sex}, \textbf{\ac{CDR}}, \textbf{lateral ventricle volumes}, \textbf{cerebral cortex volumes}, and \textbf{hippocampus volumes}. Taking advantage of the causal model allows us to control labels by changing ages, resulting in different volumes since our assumption is that age causally affects brain volumes. These conditional labels are fed into the \ac{StyleGAN3} model to synthesize images with specific attributes related to age, sex, and either healthy or \ac{AD} diagnoses, as indicated by \ac{CDR} values. For more details on the conditional causal model, see Appendix~\ref{apd:synthetic_data}.

 
\subsection{Benchmark: Visual Turing Test (VTT)}
To assess synthetic image realism, we conducted a \ac{VTT} with two radiologists reviewing 100 image pairs: 50 from \ac{UKB} and 50 from \ac{ADNI} with their synthetic counterparts. Each pair had one real and one synthetic image, matched by \textbf{age}, \textbf{sex}, and \textbf{\ac{CDR}} for \ac{ADNI} images only. The radiologists identified the synthetic image, rated their confidence, and explained their choices, enabling a comprehensive realism evaluation. The test, done with ImFusion Labels software \citep{ImFusionImFusionLabels}, was untimed to allow breaks and resumptions.


\subsection{Biological Plausibility Analysis}
\noindent \textbf{\ac{ROI} Quantification via Segmentation.} Our generative model produces 2D mid-slice brain \ac{MRI}, while established deep-learning segmentation models like SynthSeg \citep{billot_synthseg_2023} are designed for 3D volumes. To bridge this gap, we developed a 2.5D segmentation model suitable for 2D and 3D data. Using SynthSeg, we generated ground truth masks for \ac{UKB} and \ac{ADNI} datasets. To prevent bias in our segmentation towards \ac{ADNI} (real-world reference), we trained and validated \ac{SOTA} architectures solely on \ac{UKB}.

During inference, the best-performing model (\ac{A-UNet}) was selected for its highest median \ac{DSC} from external \ac{ADNI} testing. This model, an adaptation of the original UNet \citep{ronneberger2015u}, was optimized by increasing initial features from 32 to 64 and adding a layer of depth. Details on model implementation and comparisons with other architectures are in Appendix~\ref{apd:SegTE}. We quantified \ac{ROI} areas by counting pixels in segmented masks, then normalized these areas against the intracranial area to account for brain size variations. Min-max scaling was applied to these normalized values to ensure consistency with observed area ranges.

\bigskip
% \subsubsection{Statistical Analysis of Biological Plausibility:}
\noindent \textbf{Statistical Analysis.} In the \ac{VTT}, a \textbf{two-tailed binomial test} evaluated radiologists' accuracy against random guessing, and \textbf{Cohen's kappa} quantified inter-rater agreement. To evaluate anatomical consistency between image types, we used statistical tests to explore the hypothesis that no significant differences exist in the \ac{ROI} \acp{dist}. between real and synthetic images, setting the significance level at 5.0e-2. The \textbf{\ac{permute}}, chosen for its non-parametric nature that requires no \ac{dist} assumptions, assesses the statistical significance of mean differences through repeated recalculations, minimizing false significance risks. Subsequently, \textbf{Cohen's d} was used to measure \textbf{\ac{ES}} and direction. Additionally, we employed a 95\% \textbf{\ac{CI}}, based on recommendations from \citet{lee2016alternatives}, to assess \ac{ES} estimation precision and indicate the statistical significance of our findings. Finally, the \textbf{\ac{KS}} further compared \ac{dist} shapes, providing added insights into discrepancies between real and synthetic image \acp{dist}.

\section{Experiments and Results}
\label{sec:res}

\textbf{Experiment Setup.} Our experiments evaluated the impact of aligning covariate \acp{dist}. (age, \ac{CDR}, and sex) on image analysis, maintaining a 5:1 synthetic-to-real \ac{ROI} area ratio throughout the study. We conducted experiments on matched and mismatched covariate \acp{dist}, where we would expect a higher deviation between real and synthetic \ac{ROI} \acp{dist}. when deviating from matching \acp{dist}. The experiments are: a) matched covariate \acp{dist}, b) age mismatches, introduced by shifting the median age of the synthetic samples' \ac{dist} by $\pm$3 or $\pm$5 years, c) \ac{CDR} mismatches, and d) sex mismatches in the synthetic \ac{dist}; c) and d) are detailed in Appendix~\ref{apd:ST}.

\bigskip

\noindent  \textbf{\ac{VTT} Results.} \Ac{R1} achieved a 51\% accuracy rate, similar to random guessing (\textit{P} = 9.2e-1), while \ac{R2} scored 15\%, a significant difference (\textit{P} = \textbf{4.8e-13}). Cohen's kappa of 0.167 indicates low agreement between the two, suggesting variability in image perception and image fidelity conclusions. \ac{R1} often indicated ``moderate certainty'' and cited ``contrast problems,'' while \ac{R2} often reported ``very low certainty'' and highlighted ``anatomical inaccuracies'' as their certainty level and reasons behind most of their decisions. Interestingly, neither of them considered an ``inaccurate representation of pathology.'' A follow-up revealed \ac{R2}'s bias stemming from \ac{MRI} characteristics familiarity, leading to misclassifications by associating smoothing effects with real images despite their presence in both image types. \figureref{unus-sml} a) and \figureref{VTT_Ex} in Appendix~\ref{apd:VTT} display a misclassification by both \ac{R2} and \ac{R1} for the same pair of images, but for different reasons. \ac{R2} stated ``anatomical inaccuracies'', while \ac{R1} stated ``noise patterns'', despite the image being real. The sole instance where both radiologists strongly agreed and accurately identified a synthetic image, citing inaccurate anatomy due to unusually small ventricles, is shown in \figureref{unus-sml} b). Further \ac{VTT} results are in Appendix~\ref{apd:VTT}.

\bigskip

\noindent \textbf{Biological Plausibility Results.} Regression analysis in \figureref{AreaVvsH-Age} reveals that \ac{LV} areas increase and \ac{HC} areas decrease with age, aligning with neurodegeneration patterns in both real and synthetic images. Furthermore, \tableref{Age_Mismatches} demonstrates significant differences for \ac{LV} in \textbf{matched \acp{dist}.} (\textit{P} = \textbf{2.0e-3}, \textbf{negative $CI_l$ and $CI_u$}), indicating synthetic images overestimate \ac{LV}'s enlargement with age compared to real images (Negative \ac{ES}, see \figureref{AreaVvsH-Age}). The \ac{LV} KS test results show significant shape differences in \acp{dist}. across \textbf{both matched and unmatched conditions} (\figureref{fig:diff-dist-Age}). In contrast, hippocampus results show no significant difference in \ac{permute} tests. However, a near-significant \textit{P} value (5.7e-2) and positive \ac{ES} (7.0e-2) suggest minor overestimation in synthetic image \ac{ROI} size reduction with age at matched \acp{dist}. Analysis in \figureref{AreaVvsH-Age} shows minimal differences between image types in \ac{LV} for the 75-80 and 80-85 age groups, with nearly identical box plots. For \ac{HC}, the 60-65 and 85-90 age groups have lower medians of synthetic areas, while the 80-85 group has a higher median.

\begin{wraptable}{L}{0.6\textwidth}
    \floatconts
  {Age_Mismatches}
  {\caption{Statistical Comparison between Real vs. Synthetic \ac{ROI} Areas Across Distributions. \textbf{Match:} same covariate \ac{dist} for Image Types (Median age of 74); \textbf{Mismatch:} Synthetic \ac{dist} age median shifts by \textbf{$\pm$3 or $\pm$5 years}. $CI_l$, $CI_u$: lower and upper \ac{CI} bounds. \textbf{Significant differences} are highlighted.}}
  {\begin{adjustbox}{max width=0.9\linewidth}
  {\begin{tabular}{lcccccc}
    \toprule
    ROI & Cond. & Permute & ES & $CI_l$ & $CI_u$ & KS \\
    \midrule
    LV & -5 & 6.0e-2 & 7.1e-2 & -5.8e-3 & 1.5e-1 & \textbf{1.3e-2} \\
    LV & -3 & 8.2e-1 & 9.5e-3 & -6.7e-2 & 8.6e-2 & \textbf{1.4e-2}   \\
    LV & Match &  \textbf{2.0e-3} & -1.3e-1 & \textbf{-2.0e-1}  & \textbf{-5.0e-2} & \textbf{2.0e-8} \\
    LV & +3 & \textbf{0.0e-0} & -2.3e-1	& \textbf{-3.1e-1} & \textbf{-1.6e-1} &          \textbf{2.1e-14} \\
    LV & +5 & \textbf{5.0e-2} & -3.1e-1 & \textbf{-3.9e-1} & \textbf{-2.3e-1} & \textbf{4.4e-16} \\
    
    \ac{HC} & -5 & \textbf{4.0e-3} & -1.1e-1 & \textbf{-1.9e-1} & \textbf{-3.6e-2} & \textbf{1.1e-2} \\
    \ac{HC} & -3 & 3.2e-1 & -4.0e-2 & -1.2e-1 & 3.7e-2 & 4.5e-1 \\
    \ac{HC} & Match & 5.7e-2 & 7.0e-2	& -6.7e-3 &	1.5e-1 & 8.7e-2 \\
    \ac{HC} & +3 & \textbf{4.0e-3} & 1.1e-1 & \textbf{3.8e-2} & \textbf{1.9e-1} & \textbf{2.4e-3}\\
    \ac{HC} & +5 & \textbf{5.0e-2} & 2.1e-1	& \textbf{1.4e-1} & \textbf{2.9e-1} & \textbf{9.4e-8}  \\
    
   \bottomrule
\end{tabular}}
\end{adjustbox}}
\end{wraptable}

\textbf{Mismatched Conditions} with age adjustments (+3, +5) showed significant differences in \ac{LV} size, with larger negative \ac{ES} than the matched \ac{dist}. Conversely, deviations of (-3, -5) lacked significant findings and had smaller \ac{ES}, particularly at -3, confirming the generative model's overestimation of \ac{LV} sizes in matched \acp{dist}. (As seen in \figureref{AreaVvsH-Age}). For \ac{HC}, the -3 mismatch, like matched \ac{dist}, showed no significant differences but had a lower \ac{ES}, suggesting minor overestimation in matched conditions due to its higher \ac{ES} and a closer \textit{P} value to significance. Finally, all other conditions showed significance.

Outliers and the whisker ranges of box plots for both \acp{ROI} in \figureref{AreaVvsH-Age} show variability within and between image types, with synthetic images exhibiting a broader area size range than real ones. Closer analysis shows more outliers in real images for lateral ventricles and in synthetic images for the hippocampus. Notably, 0.82\% of synthetic ventricles' areas were below the minimum observed areas in real data. These sizes are clinically significant, indicating implausibly small lateral ventricles for adults (Agreed on by \ac{R1} \& \ac{R2}). \figureref{unus-sml} b) illustrates such a case, clearly showing the differences in the unusual ventricle sizes of synthetic vs. real images for the same covariates, highlighting the need for rigorous evaluation of synthetic image generation for clinical use. Despite the segmentation model's training on real data, the model accurately segmented the unusually small ventricles (unseen in training), demonstrating robustness.


\begin{figure}[h!]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
 \floatconts
  {AreaVvsH-Age}
  {\caption{Age-wise comparison of observed \ac{LV} and \ac{HC} areas in real versus synthetic brain \ac{MRI} across matched \acp{dist}. shows the aging effect on brain \ac{ROI} sizes.}}
  {\includegraphics[width=0.87\linewidth]{images/Figure2.png}}
\end{figure}

\begin{wrapfigure}{L}{0.50\textwidth}
% \begin{figure}[h!]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {unus-sml}
  {\caption{a) \ac{R2}'s misclassification and incorrectly noting ``anatomical inaccuracies'' (Red brush) in a real image \citep{Krichevsky2023}, b) Unusually small \ac{LV} sizes vs. real counterparts with identical covariates. Colored \acp{LV} represent our segmentation model's masks.}}
  {\includegraphics[width=1\linewidth]{images/Figure3.pdf}}

% \end{figure}
\end{wrapfigure}

\section{Discussion}
\label{sec:Dis}

The \ac{VTT} reveals the challenge radiologists face in distinguishing synthetic from real brain \ac{MRI}, attributed to the synthetic images' compelling realism and resulting in varied accuracy values. \ac{R2}'s frequent misclassifications, driven by cognitive biases, hint that performance could potentially improve through retraining with varied image types. Nonetheless, the smoothness effect is not only present in synthetic images but also in real \ac{ADNI} and \ac{UKB} images. Additionally, \ac{R2} consistently reported low confidence, often citing ``anatomical inaccuracies'' in real images identified as synthetic, emphasizing the complexity of identifying subtle changes in \acp{ROI} linked to normal aging or \ac{AD} progression. Although both radiologists agreed and correctly identified the unusually small ventricles, manually inspecting 600,000 synthetic images to find similar conditions is infeasible. In contrast, our method allows for efficient identification of such cases. Based on these challenges, \ac{VTT} alone is insufficient as a definitive measure of image realism, necessitating a more objective approach.

Our study advances this objective by rigorously evaluating the generative model's ability to replicate age-related anatomical changes, revealing its strengths and pinpointing limitations. Unlike the \ac{VTT}, our method provides precise insights into anatomical accuracy, especially highlighting how the model overestimates ventricular enlargement while performing better in simulating hippocampal changes. This nuanced understanding highlights the importance of employing focused methods to evaluate the anatomical consistency of synthetic medical imagery. Finally, while the wide range of area sizes observed in synthetic images indicates no mode collapse, outliers, particularly implausibly small ventricles, illustrate the model's current limitations in capturing the full spectrum of individual variability in real-world data.

The presence of biases in synthetic data, especially those that do not accurately reflect real-world conditions, poses a significant challenge in research on aging and \ac{AD}. When synthetic images inaccurately represent brain ventricles as larger than expected, this introduces an unrealistic bias that can mislead studies, potentially leading to incorrect conclusions about ventricular changes. This issue is compounded in areas like \ac{AD} research, where patient data may be scarce, yet it is vital that the data used reflect genuine conditions to maintain the integrity of findings. The risk extends to diagnostic accuracy, where reliance on biased synthetic images could lead to misdiagnoses or the formulation of ineffective treatment plans. Similarly, training AI systems with inaccurate images could lead to errors in clinical applications, particularly those dependent on precise identification of age-related changes. To mitigate these challenges, it is crucial to refine generative models to more faithfully represent the complexities of aging. By tackling these biases head-on, we can enhance the realism and reliability of synthetic images, thereby supporting more accurate clinical research, diagnostics, and the effective use of AI in healthcare settings.

These discrepancies may stem from training with the large \ac{UKB} dataset and fine-tuning on the smaller \ac{ADNI} dataset. A potential direction is integrating \ac{UKB}'s demographic diversity with \ac{ADNI}'s AD-specific features, potentially improving the synthetic representation of underrepresented conditions. Additionally, advanced techniques could balance the influence of each data source, ensuring both cohorts' unique characteristics are captured in the generated images. Moreover, while causal models account for \ac{ROI} volumes, the generative model's 2D training might restrict performance because of reduced contextual and spatial information. This limitation highlights the necessity for additional research as we evolve our 2D proof of concept to a more clinically relevant 3D model in future work.

\section{Conclusion and Future Work}
\label{sec:Con}
In conclusion, our study highlights the difficulties in differentiating synthetic from real brain \ac{MRI}, emphasizing the limitations of the \ac{VTT} and advocating for more objective evaluation methods. Our analysis reveals the generative model's strength in replicating age-related anatomical changes and simulating neurodegenerative features, alongside a tendency to introduce unrealistic biases, such as overestimated ventricular enlargement or implausibly small ventricles. These insights underline the necessity for robust evaluation methods for synthetic medical images, aiming to enhance image generation and facilitate their successful application in clinical settings. 

While our evaluation primarily focuses on neurodegeneration, its relevance extends across various medical domains. For example, adapting our segmentation model to tumor analysis in different organs necessitates retraining for tumor segmentation. Our statistical pipeline would then be used to compare tumor shapes and sizes with those of actual tumors at various stages. In brain \ac{MRI} studies, examining mid-line shifts and the progression effects of tumors on adjacent \acp{ROI} in both real and synthetic images offers a unique opportunity to discern consistencies and discrepancies between image types. Developing these evaluation methods, though challenging, is essential for the effective integration of synthetic images into clinical practice and enhancing the transparency of deep learning in such a critical field.

\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{The authors thank ImFusion GmbH for providing the software used to conduct the visual Turing test free of charge. We also acknowledge the UK Biobank Resource under Application ID Number 77717. Additionally, we thank the Alzheimer’s Disease Neuroimaging Initiative (ADNI) for their resources.

Data used in preparation of this article were obtained from the Alzheimer's Disease Neuroimaging Initiative (ADNI) database (adni.loni.usc.edu). As such, the investigators within the ADNI contributed to the design and implementation of ADNI and/or provided data but did not participate in analysis or writing of this report. A complete listing of ADNI investigators can be found at: \href{http://adni.loni.usc.edu/wp-content/uploads/how_to_apply/ADNI_Acknowledgement_List.pdf}{ADNI Acknowledgement List.} Data collection and sharing for the Alzheimer's Disease Neuroimaging Initiative (ADNI) is funded by the National Institute on Aging (National Institutes of Health Grant U19 AG024904). The grantee organization is the Northern California Institute for Research and Education. In the past, ADNI has also received funding from various organizations listed in: \href{https://adni.loni.usc.edu/wp-content/uploads/how_to_apply/ADNI_Manuscript_Citations.pdf}{ADNI Acknowledgement Section.}}

\authorcontribution{HY and JF conceptualized the study. HY implemented the segmentation-based method in Python, conducted the analyses, and drafted the initial manuscript. JF assisted HY in interpreting the results and editing the script. WL developed the generative model and synthesized the brain \ac{MRI}. AK prepared and conducted the Turing test and analyzed the results from that section with the help and supervision of JF and HY. Katrin Schanack and Beate Endemann are the radiologists who performed the Turing test. AR managed the preliminary processing of the datasets and obtained the segmentation ground truths. CL supervised the study. All authors contributed to the manuscript and approved its final version.}

\funding{This research was funded by the German Federal Ministry of Education and Research (BMBF) within the project ‘Syreal’ (Grant No. 01/S21069A). The funders had no role in the study design, data analysis, interpretation, or report writing.}

\availdata{The \ac{ADNI} dataset analyzed during the current study is available at the Alzheimer’s Disease Neuroimaging Initiative (ADNI) database (\href{https://adni.loni.usc.edu/data-samples/access-data/}{Access Data}) upon consenting to the data sharing agreement. Access to the \ac{UKB} dataset used during the training of the segmentation models requires an application and is subject to approval (\href{https://www.ukbiobank.ac.uk/enable-your-research/register}{Register}). The synthetic data used in the analysis are publicly available at \href{https://figshare.com/ndownloader/files/41911386}{figshare}.}

\bibliography{midl24_141}

\newpage
\appendix

\section{Synthetic Data}
\label{apd:synthetic_data}
Here, we provide more details on the conditional causal model, which generates the latent variables for the generative model. 
In this work, the synthetic images are generated from a conditional StyleGAN3 model with a handcrafted causal model, which controls the latent variables, \textit{e.g.}, age, sex, and lateral ventricle volumes. 
\figureref{fig:causal_dag} depicts the causal modeling of the conditional variables. Age and sex are the confounders of the CDR and volumes.
Furthermore, we use Maximum Likelihood Estimation to fit the training data into a handcrafted parametric model for all latent variables.
Age is modeled by a beta \ac{dist}, sex by a Bernoulli \ac{dist}, while CDR is sampled from a multi-class softmax regression conditioned on age and sex. There are three classes in CDR: ``0'' denotes a healthy subject, ``0.5'' a cognitively impaired subject, and ``1'' a dementia patient.
Lastly, we use a Gaussian mixture regression model, conditioned on age, sex, and CDR, to model the volumes.

Therefore, with the trained causal model, we can randomly sample latent variables from the \ac{dist} of the training set \citep{pawlowski2020deep}.
Furthermore, we can also sample specific variables, \textit{e.g.}, CDR and volumes, by conditioning on a specific age. After sampling the latent variables, we feed them into the conditional StyleGAN3 to generate images with specific characteristics.

\begin{figure}[h!]
\floatconts
  {fig:causal_dag}
  {\caption{Directed acyclic graph (DAG) of the causal model. A: Age, S: Sex, C: CDR, V: Volumes (lateral ventricle volumes, cerebral cortex volumes, and hippocampus volumes).}}
  {\includegraphics[width=.48\linewidth]{images/Figure4.pdf}}
\end{figure}


\newpage
\section{Visual Turing Test (VTT)}\label{apd:VTT}
% and certainty levels

\figureref{fig:VTT_accuracy} shows the accuracy of two radiologists during the \ac{VTT}, alongside the reasons cited for identifying synthetic images. Upon further analysis of the results, insights were gained into the certainty levels associated with the radiologists' decisions. \Ac{R1} often indicated `moderate certainty' in their choices, whereas \ac{R2} was more inclined towards `very low certainty.' This variation in certainty directly correlates with their accuracy rates. Nonetheless, the task's complexity, stemming from evaluating a single mid-brain slice ($256\times256$) instead of a full 3D volume, which is atypical in clinical practice, posed significant challenges. In a subsequent meeting, the radiologists concurred that repeating the test, even after practicing, would be unlikely to improve their accuracy, highlighting the fundamental challenge of the task.

\begin{figure}[h!]
\floatconts
  {fig:VTT_accuracy}
  {\caption{Bar plots of Radiologists' Accuracy values in \ac{VTT} and Reasons for Correctly Identifying Synthetic Images.}}
  {\includegraphics[width=1\linewidth]{images/Figure5.png}}
\end{figure}

\begin{figure}[h!]
\floatconts
  {VTT_Ex}
  {\caption{\ac{R1} misclassified a real image on the left as synthetic (actually on the right), stating "Noise Patterns" under "Other reasons" (green label brush) with no additional comments. The covariates assigned to both images indicate a male subject, 66 years old, and classified as CN \citep{Krichevsky2023}.} }
  {\includegraphics[width=0.7\linewidth]{images/Figure6.pdf}}
\end{figure}

Additionally, we provide a breakdown of the reasons each radiologist cited for their choices, particularly when they correctly identified a synthetic image (see \figureref{fig:VTT_accuracy}b). \Ac{R1} frequently attributed their decisions to `contrast problems,' whereas \ac{R2}, biased towards associating smooth images with real ones despite this characteristic being present in both image types, leaned toward `anatomical inaccuracies' as their rationale. Interestingly, neither radiologist selected ``Inaccurate representation of pathology'' as the reason for their choice. Lastly, \figureref{unus-sml,VTT_Ex} demonstrate an example where both \ac{R1} and \ac{R2} misclassified a real image as synthetic for two different reasons (``anatomical inaccuracies'' and ``Noise Patterns,'' respectively).

\newpage
\section{Segmentation Training and Evaluation}
\label{apd:SegTE}

This appendix complements the main paper with detailed implementation, data-related specifics, and performance comparisons, which are crucial for reproducibility yet placed here to keep the main text focused.

\subsection{Model Architecture}

We aimed to precisely segment \ac{ROI} from brain \ac{MRI} using state-of-the-art deep learning models, laying a solid foundation for evaluating age-related biological changes. We employed two principal architectures and their variations: \textbf{UNet} \citep{ronneberger2015u} (both original and adapted versions) and \textbf{\ac{FCN}} \citep{long2015fully} (incorporating ResNet-50 and ResNet-101 variants). For the \ac{A-UNet}, we increased the initial feature count to 64 from 32 and added an extra layer of depth, significantly boosting its segmentation efficiency for our dataset.


\subsection{Model Implementations}


For consistency, we used standard optimizers and loss functions in \ac{A-UNet} (lr = 0.0001, Adam optimizer with betas (0.9, 0.999), epsilon of 1e-8, dice loss) and \ac{FCN} models (lr = 0.01, SGD with momentum of 0.9, weight decay of 1e-6, cross-entropy and Dice loss with auxiliary loss). We applied max normalization and augmentations (random rotations, flips). We customized models for the high-resolution \ac{UKB} dataset and chose a batch size of 4 with gradient accumulation. Validation was conducted using a 5-fold cross-validation, dividing the \ac{UKB} dataset into 60\% training, 15\% validation, and 25\% testing. The training was performed exclusively on \ac{UKB} to avoid bias towards real \ac{ADNI} images selected as a reference for our evaluation approach. 

We implemented our segmentation models on PyTorch Lightning and trained them on 1 A100 GPU until the convergence of the \ac{DSC} score on a
validation set. For consistency with synthetic images, we adopted the 2D StyleGAN3 pre-processing: registering \ac{MRI} scans and ground truth masks to the MNI152 atlas space, resizing images to $256\times256$ using the pytorch-complex package \citep{chatterjee2022complex}, and extracting the central coronal 2D slice for training and evaluation.

\subsection{Models' Performance and Architectural Comparison}

The \ac{DSC} served as the primary metric to evaluate segmentation accuracy \citep{zou2004statistical}, reflecting the overlap between model predictions and SynthSeg ground truth masks. Initial optimization attempts started with models showing a \ac{DSC} around 0.6, indicative of underfitting. Through iterative enhancements, the \ac{A-UNet} model emerged as the top performer in internal testing on \ac{UKB} and external testing on \ac{ADNI}. The best-performing models were selected based on external testing, where \ac{A-UNet} achieved the highest median \ac{DSC} scores of 96.53\% $\pm$ 0.16 for ventricles and 91.68\% $\pm$ 0.28 for the hippocampus across 3 out of 5 folds (\tableref{tab:3-5fold}).

Comprehensive comparisons in \tableref{tab:3-5fold,tab:bestfold,tab:firstfold} detail \ac{DSC} scores across models and validation folds, illustrating \ac{A-UNet}'s superior performance and its selection for subsequent analyses.

\begin{table}[h!]
\setlength{\tabcolsep}{2.5pt}
\floatconts
  {tab:3-5fold}
  {\caption{Comparison of model performance using \textbf{median \ac{DSC} $\pm$ \ac{IQR} of lateral ventricles and hippocampus \acp{ROI}} over 3 out of 5 folds cross-validation for the two best performing models, highlighting the highest obtained \ac{DSC} values between both}}
  {\begin{tabular}{lcccc}
    \toprule
    Model & \multicolumn{2}{c}{Lateral Ventricles (DSC)} & \multicolumn{2}{c}{Hippocampus (DSC)}\\
    \cmidrule(lr){2-3} \cmidrule(lr){4-5}
     & Internal & External & Internal & External\\
    \midrule
    \textbf{\ac{A-UNet}} & \textbf{97.48\% $\pm$ 0.04} & \textbf{96.53\% $\pm$ 0.16}  & \textbf{94.37\% $\pm$ 0.04} & \textbf{91.68\% $\pm$ 0.28}\\
    FCN-Res101 & 94.20\% $\pm$ 0.01 & 93.34\% $\pm$ 0.42  & 91.62\% $\pm$ 0.05 & 88.33\% $\pm$ 0.27\\
    \bottomrule
    \end{tabular}}
    \end{table}

\begin{table}[h!]
\setlength{\tabcolsep}{2.5 pt}
\floatconts
  {tab:bestfold}
  {\caption{Comparison of model performance using \textbf{mean DSC of lateral ventricles and hippocampus \acp{ROI}} over the first fold in cross-validation for all models}}
  {\begin{tabular}{lcccccc}
    \toprule
    Model & \multicolumn{2}{c}{Lateral Ventricles} & \multicolumn{2}{c}{Hippocampus}\\
    \cmidrule(lr){2-3} \cmidrule(lr){4-5}
    & Internal & External & Internal & External\\
    \midrule
    \textbf{\ac{A-UNet}} & \textbf{97.52\%} & \textbf{96.68\%} & \textbf{94.37\%} & \textbf{91.61\%}\\
    UNet & 96.16\% & 94.65\% & 91.60\% & 88.09\%\\
    FCN-Res101 & 94.21\% & 93.71\% & 91.45\% & 86.96\%\\
    FCN-Res50 & 94.20\% & 93.55\% & 91.13\% & 84.79\% \\
    \bottomrule
    \end{tabular}}
\end{table}

\begin{table}[h!]
\setlength{\tabcolsep}{2.5 pt}
\floatconts
  {tab:firstfold}
  {\caption{Comparison of model performance using \textbf{median DSC of the intracranial area \ac{ROI}} over the first fold in cross-validation for models trained on the intracranial \ac{ROI}}}
  {\begin{tabular}{lcccccc}
    \toprule
    Model & \multicolumn{2}{c}{Intracranial Area (DSC)} \\
    \cmidrule(lr){2-3} 
     & Internal & External \\
    \midrule
    \textbf{\ac{A-UNet}} & \textbf{99.37}\% & \textbf{98.57}\% \\
    FCN-Res50 & 99.10\% & 98.08\%  \\
    \bottomrule
    \end{tabular}}
\end{table}

\subsection{Outlier Analysis}

In our final analysis phase, we investigated extreme outliers within the lateral ventricles and hippocampus regions to detect potential segmentation inaccuracies. Notably, errors were confined to a handful of cases in both real and synthetic images, mainly resulting from the lack of typical T1-weighted contrast inherent to each image type. Furthermore, we encountered two empty synthetic images. Nonetheless, excluding these outliers from our dataset did not influence the overall significance of our results. It appears that the observed irregular contrast patterns can be traced back to anomalies in the real dataset, which were then mirrored in some synthetic images, potentially arising from errors in scanning protocols. These discrepancies introduce segmentation challenges by creating contrast variations distinct from those in the training set. For visual examples of these anomalies and the segmentation issues they caused, see \figureref{fig:sigfail}.




\begin{figure}[htbp]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:sigfail}
  {\caption{The first row displays the successful segmentation of three \ac{ROI} masks in normal T1-weighted contrast used for training. In contrast, the second and third rows highlight segmentation challenges in contrasts different from training. The first column shows the image, followed by segmentation masks for lateral ventricles, hippocampus, and intracranial area. The first and second rows feature real images, while the third row shows a synthetic image.}}
  {\includegraphics[width=1\linewidth]{images/Figure7.pdf}}
\end{figure}


\newpage
\section{Supplementary Analyses on Biological Plausibility}
\label{apd:ST}

\subsection{Results}

\begin{figure}[h!]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
 \floatconts
  {fig:diff-dist-Age}
  {\caption{Comparison of \ac{ROI} areas in real vs. synthetic brain \ac{MRI} across different age \acp{dist}: Matched \acp{dist}. are centered, showcasing a median age of 74. On the sides, mismatched \acp{dist}. where synthetic images have a higher or lower median age ($\pm$ 3 and 5 years).}}
   {\includegraphics[width=1\linewidth]{images/Figure8.png}}
  
\end{figure}


\begin{figure}[h!]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
 \floatconts
  {fig:diff-dist-CDR}
  {\caption{Comparison of \ac{ROI} areas in real vs. synthetic brain \ac{MRI} across different CDR \acp{dist}: Matched \acp{dist}. (MCI predominant) are centered. On the sides, mismatched \acp{dist}. where synthetic images are CN or AD predominant.}}
   {\includegraphics[width=1\linewidth]{images/Figure9.png}}
  
\end{figure}


\begin{table}[h!]
    \floatconts
  {tab:CDR_Mismatches}
  {\caption{\textbf{Statistical Comparison of ROI Areas in Real vs. Synthetic Images Across different CDR \acp{dist}:} Analysis examines ROI areas under \textbf{Matched Conditions} (covariates align, MCI predominant) and Shifted Conditions (Synthetic images' \ac{dist} is changed to either CN or AD dominance). \textbf{Significant differences} are highlighted.}}
 {\begin{tabular}{lcccccc}
\toprule
ROI & Cond. & Permute & ES & $CI_l$ & $CI_u$ & KS \\
\midrule

LV & CN & 2.00E-01 & -4.80E-02 & -1.30E-01 & 2.90E-02 & \textbf{3.8e-05} \\
LV & Match & \textbf{2.0e-03} & -1.30E-01 & \textbf{-2.00E-01} & \textbf{-5.00E-02} & \textbf{2.0e-08} \\
LV & AD & \textbf{0.0e+00} & -1.90E-01 & \textbf{-2.70E-01} & \textbf{-1.10E-01} & \textbf{2.0e-12} \\
\ac{HC} & CN & 7.00E-01 & 1.50E-02 & -6.20E-02 & 9.20E-02 & 2.20E-01 \\
\ac{HC} & Match & 5.70E-02 & 7.00E-02 & -6.70E-03 & 1.50E-01 & 8.70E-02 \\
\ac{HC} & AD & \textbf{0.0e+00} & 2.10E-01 & \textbf{1.30E-01} & \textbf{2.80E-01} & \textbf{7.5e-08} \\

\bottomrule
\end{tabular}}
\end{table}

\begin{figure}[h!]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
 \floatconts
  {fig:diff-dist-Sex}
  {\caption{Comparison of \ac{ROI} areas in real vs. synthetic brain \ac{MRI} across different sex \acp{dist}: Matched \acp{dist}. (approx. balanced female-to-male ratio) are centered. On the sides, mismatched \acp{dist}. where synthetic images are female or male predominant.}}
   {\includegraphics[width=1\linewidth]{images/Figure10.png}}
  
\end{figure}


\begin{table}[h!]
    \floatconts
  {tab:Sex_Mismatches}
  {\caption{Statistical Comparison of \ac{ROI} Areas in Real vs. Synthetic Images across Different Sex \acp{dist}: Analysis examines ROI areas under \textbf{Matched Conditions} (Approx. balanced Female to male ratio) and Shifted Conditions (Synthetic images either female or male predominant at 40\% and 60\%). \textbf{Significant differences} are highlighted.}}
  {\begin{tabular}{lcccccc}
\toprule
ROI & Cond. & Permute & ES & $CI_l$ & $CI_u$ & KS \\
\midrule

LV & Females & 2.10E-01 & -5.00E-02 & -1.30E-01 & 2.70E-02 & \textbf{1.7e-04} \\
LV & Match & \textbf{2.0e-03} & -1.30E-01 & \textbf{-2.00E-01} & \textbf{-5.00E-02} & \textbf{2.0e-08} \\
LV & Males & \textbf{1.2e-02} & -9.70E-02 & \textbf{-1.70E-01} & \textbf{-2.00E-02} & \textbf{1.2e-07} \\
\ac{HC} & Females & 3.70E-01 & -3.60E-02 & -1.10E-01 & 4.10E-02 & 2.70E-01 \\
\ac{HC} & Match & 5.70E-02 & 7.00E-02 & -6.70E-03 & 1.50E-01 & 8.70E-02 \\
\ac{HC} & Males & 2.10E-01 & 4.70E-02 & -3.00E-02 & 1.20E-01 & 1.30E-01 \\

\bottomrule
\end{tabular}}
\end{table}


The insights derived from \tableref{tab:CDR_Mismatches} and \tableref{tab:Sex_Mismatches} are pivotal in comparing synthetic and real data \acp{dist}. We see a similar pattern to what was observed in the hippocampus in age mismatches, where \acp{dist}. predominantly featuring Cognitively Normal (CN) subjects or females have higher (i.e., less significant) \ac{permute} \textit{P} values than matched \acp{dist}, which have \textit{P} values close to significance. Furthermore, in the lateral ventricles, the \ac{permute} \textit{P} values of CN- or female-predominated \acp{dist}. are non-significant, in contrast to the matched case; these \acp{dist}. hence more closely resemble real data than matched \acp{dist}. (MCI predominant, balanced sex ratio). In \figureref{fig:diff-dist-CDR} and \figureref{fig:diff-dist-Sex}, violin plots are more similar in the hippocampus than in the lateral ventricles, reflecting the non-significant KS \textit{P} values for the hippocampus in contrast with the lateral ventricles. The only significant KS and shape differences in the hippocampus occur in the \ac{AD}-predominant \ac{dist} mismatch. These results align with the age mismatch findings and suggest that certain demographic characteristics influence the model's ability to generate synthetic images that accurately reflect the diversity of real anatomical structures.

\subsection{Discussion}

The observed results highlight the significance of incorporating covariates when training generative models and generating synthetic medical images. Our approach, utilizing a causal model that accounts for these covariates, aims to address this need. The increased resemblance of CN-predominated \acp{dist}. to real data suggests that our synthetic models may be more adept at capturing the anatomical nuances of cognitively normal subjects, possibly due to initial training on the extensive \ac{UKB} cohort, which is assumed to be healthy (CN), or because of the more varied anatomical characteristics associated with neurodegenerative conditions like MCI and AD. The closer alignment of female-predominated \acp{dist}. with real data further prompts a review of the model's sensitivity to sex-specific anatomical differences.

\subsection{Future Work}
Future research should explore the underlying mechanisms contributing to these observed discrepancies in synthetic image generation. It is essential to investigate the model's training data and algorithms for potential biases or limitations in capturing the full spectrum of human anatomical diversity. Additionally, expanding the model's training dataset to include a more diverse representation of ages, cognitive states, and sexes may help generate synthetic images that more accurately mirror the variability found in real-world data.


\end{document}
