\documentclass{midl}

\usepackage{booktabs}
\usepackage{multirow}
\usepackage{siunitx}
\usepackage{array}
\usepackage{dsfont}
\usepackage{indentfirst}
\newcommand{\fm}[2]{\newline{\scriptsize(#1/#2)}}
\newcommand{\meanpm}[2]{#1\,{\tiny$\pm$\,#2}}
\newcommand{\dirup}{\ensuremath{\uparrow}}
\newcommand{\dirdown}{\ensuremath{\downarrow}}
\sisetup{
  reset-text-series = false, text-series-to-math = true,
  reset-text-family = false, text-family-to-math = true
}


\newcommand{\cell}[2]{\begin{tabular}{@{}c@{}}#1 \\ \scriptsize{#2}\end{tabular}}

\jmlrvolume{-- 34}
\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\editors{Accepted for publication at MIDL 2026}

\title[On the Stability and Robustness of ViT for Neurodegenerative Classification]{On the Stability and Robustness of Vision Transformers for Neurodegenerative Disease Classification}

\midlauthor{\Name{Eloi Navet\nametag{$^{1}$}}\orcid{0009-0009-0705-3678} \Email{eloi.navet@u-bordeaux.fr}\\
\Name{R{\'e}mi Giraud\nametag{$^{2}$}}\orcid{0000-0003-3965-1719} \Email{remi.giraud@u-bordeaux.fr}\\
\Name{Boris Mansencal\nametag{$^{1}$}}\orcid{0000-0002-9190-4819} \Email{boris.mansencal@u-bordeaux.fr}\\
\Name{Pierrick Coup{\'e}\nametag{$^{1}$}}\orcid{0000-0003-2709-3350} \Email{pierrick.coupe@u-bordeaux.fr}\\
\addr $^{1}$ Universit{\'e} de Bordeaux, CNRS, Bordeaux INP, LaBRI, UMR 5800, F-33400 Talence, France \\
\addr $^{2}$ Universit{\'e} de Bordeaux, CNRS, Bordeaux INP, IMS, UMR 5218, F-33400 Talence, France
}

\begin{document}
\maketitle
\begin{abstract}
Vision Transformers (ViTs) have recently been explored for structural MRI classification, motivated by their ability to capture non-local image structure.
However, in limited and heterogeneous clinical cohorts, their weak inductive biases and sensitivity to training conditions often lead to high-variance behaviour. While binary settings such as cognitively normal vs.\ dementia are widely reported and typically exhibit moderate variability, we show that this stability does not extend to differential diagnosis. When increasing task complexity (e.g., controls vs.\ Alzheimer's Disease vs.\ Frontotemporal Dementia), performance becomes sensitive to class imbalance and phenotype overlap, with greater variability driven by fewer samples per class, noisier labels, and increased inter-site heterogeneity.

In this study, we investigate a stabilization protocol combining data augmentation, architectural constraints, and optimization strategies on multi-site MRI datasets. We assess how model variance evolves with task complexity using patient-level paired bootstrapping, calibration analysis, paired significance tests, and estimates of the probability of false outperformance to obtain uncertainty-aware comparisons across models.

Our results highlight conditions under which Transformer-based classifiers can be consistently trained with limited neuroimaging data and illustrate that several performance gains disappear once stochastic variability is reported. 
These results emphasize that reliable differential diagnosis with ViTs requires both robust stabilization protocols to mitigate optimization noise and standardized uncertainty quantification beyond simple point-estimates.
\end{abstract}

\begin{keywords}
Vision Transformers, Neurodegenerative Disease, Differential Diagnosis, Stability, Reproducibility, Robustness, Uncertainty Quantification.
\end{keywords}


\section{Introduction}
\label{sec:intro}

Transformer-based deep learning architectures such as ViT~\citep{Dosovitskiy2021ViT} and Swin~\citep{Liu2021Swin} are increasingly applied to neurodegeneration classification from structural MRI alone, due to their ability to capture distributed atrophy patterns that extend beyond local receptive fields~\citep{shamshad2023transformers,alamir2024detection}. 
Yet, unlike Convolutional Neural Networks (CNNs), these models lack the spatial inductive biases inherent to visual data (e.g., locality and translation invariance) and are typically trained on large, homogeneous datasets, which are conditions rarely met in clinical neuroimaging~\citep{matsoukasItTimeReplace2021}.
Medical data specificities, including inherent noise, acquisition heterogeneity, and severe class imbalance, exacerbate the instability of unregularized transformers.
Combined with the lack of intrinsic spatial constraints, these factors make optimization brittle, with sharp loss landscapes and heightened sensitivity to initialization, data ordering, and hyperparameters~\citep{chen2022visiontransformersoutperformresnets, park2022visiontransformerswork}. 

Such instability is a critical bottleneck for the field. While CNNs remain robust baselines, they lack the unified token representation required to seamlessly integrate imaging with non-spatial modalities, such as genomics or tabular data, in a shared embedding space~\citep{wangMultimodalTokenFusion2022,biMultimodalVisionTransformer2024a}.
Reported gains in medical imaging often lie within stochastic variability once uncertainty is quantified~\citep{Bouthillier2021Accounting}, and recent audits show that formal significance testing is uncommon in leading venues \citep{Christodoulou2025FalsePromises}.
These issues are particularly acute in classification tasks compared to segmentation.
While voxel-wise segmentation metrics benefit from high information density per image, allowing for precise performance estimates even with moderate sample sizes, image-level classification relies on sparse supervisory signals (one label per volume)~\citep{eljurdiConfidenceIntervalsPerformance2025a}. Consequently, classification tasks require substantially larger cohorts to achieve comparable statistical precision~\citep{varoquauxCrossvalidationFailureSmall2018}, making them susceptible to the stochastic variability inherent in clinical datasets where sample sizes are typically constrained.

In neurodegenerative disease classification, while Cognitively Normal (CN)~vs.~Alzheimer's Disease (AD) settings mask this difficulty~\citep{basaiaAutomatedClassificationAlzheimers2019,wen2020convolutional}, multiclass tasks involving behavioral variant Frontotemporal Dementia (bvFTD), semantic variant Primary Progressive Aphasia (svPPA), and nonfluent variant Primary Progressive Aphasia (nfvPPA) frequently reveal ranking inversions between architectures and unstable decision boundaries.
This instability is exacerbated by the reliance on purely imaging-based diagnosis; for example, differentiating bvFTD from AD is particularly challenging solely from MRI due to overlapping atrophy patterns in the anterior cingulate and frontoinsula~\citep{Perry2017Clinicopathological}. This creates a ``grey zone'' where data-hungry models struggle to identify decision boundaries without the guidance of clinical or neuropsychological scores.

This work examines the stability and robustness of Transformers under the specific challenges of differential diagnosis on MRI. We identify techniques preserving calibration and ranking consistency across random seeds and distribution shifts. To this end, we propose:
\begin{enumerate}
\setlength\itemsep{0em}
\setlength\parskip{0em}
    \item \textbf{A stability assessment across task complexities:} By contrasting a standard 3-class setup (detailed in Appendix~\ref{app:instabilities_325}) with a granular 5-class differential diagnosis (CN/AD/bvFTD/svPPA/nfvPPA), we quantify how increased phenotypic overlap and class imbalance amplify instability in Transformers compared to CNNs.
    
    \item \textbf{A benchmarking of stabilization strategies:} Using a representative hierarchical vision transformer as a reference, we conduct an extensive ablation study reviewing the impact of data-level (augmentation, sampling), architectural (initialization, regularization), and optimization strategies. We identify a specific protocol that makes it possible to close the generalization gap in limited-data regimes without relying on pretraining.
    
    \item \textbf{An uncertainty-aware evaluation protocol:} Moving beyond point-estimates, we employ patient-level paired bootstrapping, calibration analysis, and the probability of false outperformance~\citep{Christodoulou2025FalsePromises}. This framework allows us to differentiate genuine signal from optimization noise, \emph{i.e.}, to identify gains reported under standard evaluation that fail to reach statistical significance once uncertainty is properly quantified.
\end{enumerate}

Our findings delineate the conditions under which Transformers can be deployed reliably for neurodegenerative disease classification. As architectures like MedViT show that Transformer-based models can surpass strong CNN baselines, our stabilization principles offer a pathway to unlock this potential across medical ViTs, clarifying which performance differences persist once stochastic variability and distribution shifts are accounted for. The code and stabilization protocols are publicly available.\footnote{\url{https://github.com/EloiNavet/ViT-Stability-Neurodegeneration/}}

\section{Related Work}
\label{sec:related}
Transformers have emerged as interesting alternatives to CNNs in medical imaging, offering the ability to capture long-range dependencies and global image structure~\citep{Dosovitskiy2021ViT, Liu2021Swin}.
While hierarchical variants like Swin~\citep{Liu2021Swin} and domain-specific adaptations~\citep[e.g.,][]{hatamizadeh2022swin, Wald2025Primus} attempt to mitigate the quadratic cost of attention and introduce inductive biases, training remains notoriously fragile.
Unlike CNNs, Transformers lack spatial priors, making them prone to overfitting and unstable optimization on the small, heterogeneous cohorts typical of clinical neuroimaging~\citep{he2023transformers, movit2024}.

Dementia classification has progressed from hand-crafted features with SVMs~\citep{kloppel2008automatic} to deep 3D CNNs~\citep{wen2020convolutional, nguyenDeepGradingMRIbased2023b} and, recently, Transformers~\citep{Nguyen2024}.
Current literature reports high performance for binary AD detection~\citep{alamir2024detection}.
However, differential diagnosis involving FTD subtypes remains challenging due to overlapping phenotypes~\citep{wu2025boostering}, a setting where the lack of rigorous uncertainty quantification is critical.
Recent audits suggest that many reported performance gains in medical imaging may be attributable to stochastic variability rather than genuine architectural improvements~\citep{Christodoulou2025FalsePromises}.
This issue is exacerbated in Transformers due to their sensitivity to initialization and hyperparameters~\citep{chen2022visiontransformersoutperformresnets}, requiring multi-seed evaluation~\citep{Bouthillier2021Accounting,delpup2024toward}.

Addressing this instability requires a holistic approach to regularization and evaluation.
Standard stabilization strategies include heavy data augmentation~\citep{Zhang2018mixup,Cardoso2022MONAI}, architectural constraints (e.g., LayerScale, stable initialization)~\citep{Touvron2021CaiT, kedia2024transformersstableendtoendsignal}, and optimization techniques such as sharpness-aware minimization (SAM) or label smoothing~\citep{Foret2021SAM, Muller2019LabelSmoothing}.
Furthermore, relying solely on accuracy metrics is insufficient for clinical reliability.
Robust assessment requires analyzing calibration~\citep{Guo2017Calibration} and validating statistical significance through paired tests and bootstrapping~\citep{McNemar1947, Efron1979Bootstrap}. Complementarily, we employ the Brier score~\citep{Brier1950} to specifically quantify prediction over-confidence, offering an assessment of probabilistic reliability distinct from discrimination ranking.
Our work aims to unify these disparate components, benchmarking stabilization strategies specifically for the low-data, high-imbalance framework of dementia differential diagnosis.
 

\section{Method}\label{sec:data}

\subsection{Datasets}
\label{sec:datasets}

We construct an in-domain (ID) pool combining data from the Alzheimer's Disease Neuroimaging Initiative (ADNI)~\citep{Mueller2005adni} and the multi-site ALLFTD consortium~\citep{Boeve2020ALLFTD}, two longitudinal studies monitoring Alzheimer's Disease (AD) and Frontotemporal Lobar Degeneration (FTLD) respectively. We focus on the clinical spectrum of FTD subtypes, categorized into behavioral variant Frontotemporal Dementia (bvFTD), semantic variant Primary Progressive Aphasia (svPPA), and nonfluent variant Primary Progressive Aphasia (nfvPPA). To assess robustness to domain shifts, we employ an Out-Of-Domain (OOD) pool aggregating NIFD~\citep{FTLDNI_NIFD} and NACC~\citep{Beekly2007NACC}. \tableref{tab:dataset_counts} summarizes the class composition across these cohorts, while full cohort-level criteria and demographics are provided in Appendix~\ref{app:datasets}. Although this aggregated dataset is substantial by medical standards, it remains orders of magnitude smaller than the massive datasets typically required to train vision transformers in general computer vision.

\paragraph{Distribution shifts and class imbalance.}
While the ID cohort (ADNI+ALLFTD) already displays inherent class imbalance, it retains actionable representation for all FTD subtypes (see \tableref{tab:dataset_counts}). In contrast, the OOD cohort (NIFD+NACC) shows an even more pronounced disparity, with rare FTD subtypes representing $<1.5\%$ of the test sample. We preserve this natural prevalence to assess robustness under realistic epidemiological and covariate shifts. Evaluation is performed on unmodified OOD data; only ID training may use balanced sampling (Section~\ref{sec:stab_training}). Due to scarcity, minority-class OOD metrics show wider confidence intervals and are interpreted primarily for global stability and calibration.

\begin{table}[htbp]
\floatconts
  {tab:dataset_counts}
  {%
    \caption{\small\textbf{Subject distribution across cohorts.} 
The ID set combines ADNI and ALLFTD, and the OOD set aggregates NIFD and NACC.
Note the scarcity of FTD subtypes in the OOD set reflecting clinical prevalence. Counts displayed as \textit{Dataset1 / Dataset2} for each group.}%
  }%
  {%
    \centering
    \scriptsize
    \renewcommand{\arraystretch}{1.01}
    \begin{tabular}{l|l|ccccc|c}
    \toprule
    \textbf{Group} & \textbf{Datasets} & \textbf{CN} & \textbf{AD} & \textbf{bvFTD} & \textbf{nfvPPA} & \textbf{svPPA} & \textbf{Total} \\
    \midrule
    \textbf{ID} & ADNI / ALLFTD & 1090 / 322 & 649 / 5 & -- / 229 & -- / 66 & -- / 76 & 1739 / 698 \\
    \textbf{OOD} & NIFD / NACC & 136 / 2115 & -- / 485 & 74 / 26 & 37 / 6 & 39 / 4 & 286 / 2636 \\
    \bottomrule
    \end{tabular}
  }
\end{table}

\paragraph{Cross-validation and evaluation.}
ID experiments use patient-level stratified 10-fold cross-validation across dataset, diagnosis, sex, and age bins, with a 7/2/1 train/val/test split per fold. OOD evaluation uses a fixed NIFD+NACC test set without overlap.

\paragraph{Image preprocessing.}
All T1-weighted MRIs undergo a unified pipeline: N4 bias correction~\citep{Tustison2010N4}, skull stripping, affine and diffeomorphic MNI registration~\citep{Avants2011ANTS,Fonov2011ICBM}, $1\,\mathrm{mm}$ resampling, $Z$-scoring, and fixed FOV cropping. To avoid leakage from longitudinal data, a single baseline scan is selected per subject; CN participants are required to maintain longitudinal diagnostic stability. Genetic FTD cases in ALLFTD are excluded.

\subsection{Models}\label{sec:models}

\paragraph{Backbones.}
We benchmark a set of volumetric transformers and CNN baselines, selecting for each the model variant whose parameter count is close to the median across backbones:
\begin{enumerate}
\setlength\itemsep{0em}
\setlength\parskip{0em}
\item \textbf{ViT-3D}: a non-hierarchical ViT extended to 3D via volumetric patch embedding \citep{Dosovitskiy2021ViT,Yan2022_3DFrom2DViT};
\item \textbf{Swin-3D (baseline)}: a hierarchical Swin with shifted 3D windows~\citep{Liu2022VideoSwin};
\item \textbf{Swin-3D (deformable)}: Swin-3D equipped with deformable patch locations \citep{Nguyen2024};
\item \textbf{MedViT-3D}: A robust hybrid CNN-Transformer model~\citep{manzariMedViTRobustVision2023}, which we extended to 3D, combining local convolutions and global attention mechanisms;
\item \textbf{Segmentation-based CNNs + SVM}: A neuroanatomically driven pipeline using an ensemble of 125 3D U-Nets (AssemblyNet)~\citep{Coupe2020AssemblyNet} for regional feature extraction, followed by an SVM classifier. This baseline serves as a high-capacity reference for anatomy-driven performance rather than a resource-equivalent competitor;
\item \textbf{3D CNN}: A 3D ResNet-18~\citep{Hara2018ResNet3D}. We selected the 18-layer variant to match the parameter count of the transformer architectures ($\approx 30$M). 
While deeper variants theoretically offer higher capacity, 3D convolutions induce a rapid growth in parameter count, so the 18-layer variant offers a more favorable trade-off between model complexity and the available dataset scale.
\end{enumerate}

\paragraph{Rationale for stabilization testbed.}
While hybrid architectures like MedViT-3D demonstrate superior baseline performance (as shown in Section~\ref{sec:results}), we explicitly selected Swin-3D with deformable patch location (hereafter Swin-DPL) as the primary testbed for our stabilization ablation study (Section~\ref{sec:stab_training}). Unlike MedViT, which relies on convolutional stems for stability, Swin-DPL allows us to isolate the optimization challenges intrinsic to hierarchical self-attention mechanisms. Our goal is to identify training protocols that allow standard Transformers to close the gap with hybrids and CNNs.

\paragraph{Training framework.}
Training uses AdamW \citep{Loshchilov2019AdamW}, cross-entropy loss, global batch size $128$ (with gradient accumulation), cosine decay with warmup~\citep{Loshchilov2017sgdr}, mixed precision, and early stopping on validation loss.
We adopted this specific scheduler configuration as it represents the established standard for Vision Transformers~\citep{Dosovitskiy2021ViT,Liu2021Swin}. While other optimization heuristics exist (e.g., gradient clipping, alternative schedules), we fixed these hyperparameters to focus our analysis on regularization and landscape smoothing techniques rather than exhaustive optimizer tuning.
Regularization via Stochastic Depth (DropPath; \citealp{huangDeepNetworksStochastic2016}) and weight decay follows the specific configurations recommended by the respective model authors.
All hyperparameters were fixed prior to experiments to isolate seed-dependent variation.
While deterministic flags were enabled, strictly reproducible training remains elusive in 3D deep learning due to hardware-level implementation details.\footnote{Despite deterministic seeds, atomic operations in specific 3D CUDA kernels (e.g., \texttt{avg\_pool3d\_backward\_cuda}, \texttt{grid\_sampler\_3d\_backward\_cuda}) introduce irreducible bit-wise noise. This implementation-induced variability necessitates the multi-seed protocol described in Section~\ref{sec:instability_quantification}.}

To rigorously quantify this instability, we employed a multi-seed evaluation protocol, repeating training runs with distinct random seeds for initialization and sampling. While this strategy is computationally expensive, multiplying the training budget by the number of seeds, it is strictly necessary to dissociate genuine architectural improvements from stochastic optimization noise when working with small datasets such as ours. Unless stated otherwise, we ran 5 trainings per architecture for this analysis.

We evaluate stabilization components both individually and in combination. All models use the same preprocessed volumes, identical splits and compute budgets, except for SVM that requires training 125 U-Nets ($\approx$2.17M parameters each)~\citep{Coupe2020AssemblyNet}.

For each architecture, 10 models are trained (one per fold). ID performance is computed by concatenating the fold-specific test predictions. For OOD evaluation, we average the softmax outputs of the 10 fold-specific models.

\subsection{Stabilization strategies}
\label{sec:stabilization}

To mitigate the instability inherent to training Transformers on limited, heterogeneous MRI cohorts, we investigate a composite stabilization protocol. We categorize these strategies into data-centric, architectural, and optimization-based components. Mathematical formulations and implementation details are provided in Appendix~\ref{app:stabilization_details}.

\subsubsection{Training and optimization}

\paragraph{Data regularization.}
Given the high dimensionality of volumetric inputs relative to the cohort size, we employ extensive \textbf{data augmentation}. We use the \texttt{MONAI} framework~\citep{Cardoso2022MONAI} to apply domain-specific 3D transformations, including affine and elastic deformations, sagittal flips, bias-field simulation, and $k$-space artifact injection (see Appendix~\ref{app:augmentation} for parameter ranges).
To counteract decision boundary collapse in high-dimensional space, we employ \textbf{MixUp}~\citep{Zhang2018mixup}, which trains the network on convex combinations of sample pairs and their labels. This encourages the model to behave linearly in-between training examples.
Furthermore, to address the severe class imbalance (see \tableref{tab:dataset_counts}), we use \textbf{balanced sampling} during ID training, ensuring that minibatches contain a uniform distribution of classes.

\paragraph{Optimization landscape smoothing.}
Standard Stochastic Gradient Descent (SGD) often converges to sharp minima in Transformers, which generalize poorly. To mitigate this, we use \textbf{Sharpness-Aware Minimization (SAM)}~\citep{Foret2021SAM}, which simultaneously minimizes the loss value and the loss curvature, biasing the solution toward flatter regions of the loss landscape.
We further smooth the optimization trajectory using \textbf{exponential moving average (EMA)} of model weights. By averaging parameters over the training trajectory, EMA provides a robust estimate of the ``center'' of the optimization basin, often yielding better generalization than the final checkpoint. Finally, we apply \textbf{label smoothing} to prevent the network from becoming over-confident on noisy labels, a critical factor given the phenotypic overlap in neurodegenerative diseases.

\paragraph{Architectural constraints.}
We evaluate signal propagation stabilization techniques:
\textbf{LayerScale}~\citep{Touvron2021CaiT}, which introduces learnable diagonal matrices to scale residual updates, and a \textbf{Stable Initialization} scheme~\citep{kedia2024transformersstableendtoendsignal} to preserve activation variance throughout the network depth. We also benchmark \textbf{ShakeDrop}~\citep{Yamada2018ShakeDrop}, a stochastic regularization method within the residual block.

\subsubsection{Inference and Evaluation}

\paragraph{Uncertainty-aware inference.}
Comparison based on single point-estimates is unreliable when using small datasets. We therefore employ \textbf{checkpoint ensembling}, averaging the softmax predictions of the top-$K$ validation checkpoints. This acts as a simplified Snapshot Ensemble~\citep{Huang2017Snapshot}, marginalizing out local optimization noise.
To address calibration, we apply post-hoc \textbf{temperature scaling}~\citep{Guo2017Calibration}, optimizing a single scalar parameter on the validation set to align confidence scores with empirical accuracy.
Finally, we explore \textbf{Test-Time Augmentation (TTA)}, aggregating predictions across multiple transformed views (flips, crops) of the test volume via inverse-entropy weighting, prioritizing views where the model is most confident.

\subsection{Evaluation protocol}
\label{sec:protocol}

\subsubsection{Instability quantification}\label{sec:instability_quantification}
To distinguish genuine architectural improvements from stochastic noise, we adopt a rigorous multi-seed evaluation protocol. Each fold is trained with a distinct seed, capturing variability arising from weight initialization, data ordering, and hardware nondeterminism.

We move beyond point-estimates by quantifying uncertainty via patient-level paired bootstrapping ($B=10^4$ replicates)~\citep{Efron1979Bootstrap}. While parametric assumptions can hold for dense segmentation tasks~\citep{eljurdiConfidenceIntervalsPerformance2025a}, classification metrics on imbalanced cohorts often exhibit skewed, non-Gaussian distributions. We therefore opt for non-parametric bootstrapping to avoid distributional assumptions.
We report the normalized Coefficient of Variation (nCV) to compare stability across datasets of varying sizes. This metric measures the variation between different trainings with the same configuration. The coefficient of variation (CV) is defined as the standard deviation divided by the mean of the metric, integrating both variability and performance into a single quantity. It is then normalized by the square root of the sample size to account for differences in sample size between classes and between the in-domain and out-of-domain sets, yielding the normalized CV (see Appendix~\ref{app:metrics} for derivation).
Statistical significance is assessed using a dual strategy: \textbf{paired Wilcoxon signed-rank tests} evaluate architectural stability across the 10 folds (in-domain), while \textbf{McNemar's test}~\citep{McNemar1947} assesses the diagnostic agreement of the final ensembled models (out-of-domain), with Bonferroni correction applied. Following \citet{Christodoulou2025FalsePromises}, we also report the Probability of False Outperformance (PFO) to estimate the likelihood that a reported gain is not significant.

\subsubsection{Metric Selection}

\paragraph{Discrimination metrics.}
To address class imbalance (see Table~\ref{tab:dataset_counts}), we report four complementary metrics: Accuracy (ACC), Matthews Correlation Coefficient (MCC), Macro-F1, and Precision-Recall AUC (PR-AUC).
While \textbf{ACC} provides a standard overview, it is biased toward majority classes.
Since Balanced Accuracy (BACC) can be inflated by saturated majority performance, we prioritize \textbf{MCC} as our primary ranking metric, as it leverages the full confusion matrix and provides a robust correlation estimate regardless of class ratios.
Complementarily, \textbf{Macro-F1} ensures equal contribution from all phenotypes, penalizing collapse on rare subtypes, while weighted one-vs-rest \textbf{PR-AUC} assesses discriminatory power across decision thresholds, accounting for the varying support of each class.
Finally, granular \textbf{per-class F1 scores} are detailed in Appendix~\ref{app:instabilities_325}.

\paragraph{Calibration.}
Clinical deployment requires models to be trustworthy, not just accurate~\citep{begoliNeedUncertaintyQuantification2019}. Standard discrimination metrics do not distinguish between a model that is cautiously wrong and one that is confidently wrong.
We therefore evaluate reliability using the \textbf{Expected Calibration Error (ECE)}~\citep{Guo2017Calibration}, which measures the alignment between predicted confidence and empirical accuracy (\textit{i.e.}, ``does a 90\% confidence prediction imply a 90\% probability of correctness?''). Complementarily, we employ the \textbf{Brier Score}~\citep{Brier1950}, a proper scoring rule that penalizes over-confident false predictions. This metric helps identify over-confident models that assign high probability to incorrect classes, a critical failure mode in differential diagnosis that accuracy can mask.

\section{Results}
\label{sec:results}

\subsection{Instability in differential diagnosis}
\label{sec:results_5c}

While preliminary experiments on a standard 3-class setup (CN/AD/FTD) confirmed the viability of Swin-DPL (see Appendix~\ref{app:instabilities_325}), the transition to the 5-class differential diagnosis (CN/AD/bvFTD/svPPA/nfvPPA) reveals critical stability bottlenecks (\tableref{tab:base_5c}). The granular classification induces a performance drop, highlighting the challenge of distinguishing phenotypically similar FTD subtypes. 

In ID, pure Transformers struggle due to data scarcity: ViT-3D exhibits optimization collapse, failing to disentangle minority classes. This confirms that global attention mechanisms require more data or priors to find stable decision boundaries. Conversely, SVM and ResNet-18 maintain robust discrimination. The introduction of the Deformable Patch Location (DPL) allows Swin-3D to recover significant performance, aligning its ID convergence with CNN baselines.

OOD evaluation reveals a hierarchy shift. The hybrid MedViT-3D achieves the highest generalization in global metrics (ACC, MCC, PR-AUC), suggesting strong robustness to acquisition shifts. However, its lower Macro-F1 compared to SVM and ResNet indicates that this global performance comes at the cost of minority classes. While hybrids generalize well on average, strong inductive biases (SVM/CNN) seem better equipped to preserve recall on rare phenotypes (nfvPPA/svPPA) under distribution shifts. Finally, ViT-3D's low ECE reflects under-confidence rather than true calibration, a known limitation of the metric where uniform predictions yield low error~\citep{Nixon_2019_CVPR_Workshops}.

\begin{table}[htbp]
\floatconts{tab:base_5c}
{\caption{\small \textbf{Baseline performance (5-class: CN/AD/bvFTD/nfvPPA/svPPA differential diagnosis).}
Performance metrics for Convolutional (ResNet, SVM) and Transformer (ViT, Swin) architectures on the CN/AD/bvFTD/nfvPPA/svPPA task.
Results are reported for in-domain (ID, 10-fold CV) and out-of-domain (OOD, 10 models average) settings.
\textbf{Bold} indicates the best performance per column.\\
\scriptsize Values: Mean $\pm$ 95\% CI ($B=10^4$), all metrics in \%.
$\uparrow$=higher-is-better, $\downarrow$=lower-is-better.}}
{\scriptsize
\setlength{\tabcolsep}{2pt}
\begin{tabular}{c @{\hspace{1em}} l c c c c c c c}
\toprule
& \bfseries Configuration & \bfseries Params & \bfseries ACC $\uparrow$ & \bfseries MCC $\uparrow$ & \bfseries PR-AUC $\uparrow$ & \bfseries Macro-F1 $\uparrow$ & \bfseries ECE $\downarrow$ & \bfseries Brier $\downarrow$ \\
\midrule

\multirow{6}{*}{\rotatebox[origin=c]{90}{\textbf{ID}}}
& CNNs + SVM & $\approx$270M\textsuperscript{*} & \bfseries \meanpm{82.80}{1.52} & \bfseries \meanpm{69.27}{2.53} & \bfseries \meanpm{67.72}{3.67} & \bfseries \meanpm{62.61}{3.68} & 47.84 & 11.43 \\
& ResNet-18 3D & 33.16M & \meanpm{79.86}{1.56} & \meanpm{64.04}{2.60} & \meanpm{60.73}{3.61} & \meanpm{57.47}{3.52} & 42.89 & \bfseries 11.20 \\
& MedViT 3D & 34.99M & \meanpm{78.83}{1.62} & \meanpm{62.39}{2.63} & \meanpm{58.37}{3.41} & \meanpm{54.14}{3.12} & 44.04 & 11.65 \\
& ViT-3D & 23.18M & \meanpm{69.39}{1.82} & \meanpm{44.21}{2.84} & \meanpm{39.41}{2.05} & \meanpm{33.56}{2.29} & \bfseries 36.65\textsuperscript{\textdagger} & 12.68 \\
& Swin-3D & 29.27M & \meanpm{73.33}{1.74} & \meanpm{51.53}{2.90} & \meanpm{52.15}{3.06} & \meanpm{46.96}{2.80} & 39.07 & 12.12 \\
& Swin-3D DPL & 41.02M & \meanpm{78.75}{1.65} & \meanpm{61.60}{2.74} & \meanpm{57.46}{3.06} & \meanpm{49.57}{2.86} & 43.76 & 11.67 \\

\midrule

\multirow{6}{*}{\rotatebox[origin=c]{90}{\textbf{OOD}}}
& CNNs + SVM & $\approx$270M\textsuperscript{*} & \meanpm{88.38}{1.17} & \meanpm{69.67}{2.88} & \meanpm{72.97}{4.51} & \bfseries \meanpm{67.97}{4.62} & 53.28 & 11.05 \\
& ResNet-18 3D & 33.16M & \meanpm{88.90}{1.15} & \meanpm{71.03}{2.86} & \meanpm{73.58}{4.51} & \meanpm{65.07}{3.72} & 52.54 & \bfseries 10.71 \\
& MedViT 3D & 34.99M & \bfseries \meanpm{89.31}{1.12} & \bfseries \meanpm{72.16}{2.77} & \bfseries \meanpm{75.67}{4.31} & \meanpm{61.99}{3.62} & 55.08 & 11.26 \\
& ViT-3D & 23.18M & \meanpm{80.43}{1.44} & \meanpm{49.74}{3.14} & \meanpm{53.14}{3.69} & \meanpm{29.75}{0.77} & \bfseries 47.60\textsuperscript{\textdagger} & 11.96 \\
& Swin-3D & 29.27M & \meanpm{83.04}{1.35} & \meanpm{57.56}{3.16} & \meanpm{63.72}{4.31} & \meanpm{50.76}{3.90} & 49.07 & 11.62 \\
& Swin-3D DPL & 41.02M & \meanpm{85.74}{1.23} & \meanpm{63.86}{2.95} & \meanpm{70.64}{4.84} & \meanpm{56.18}{3.57} & 50.64 & 11.22 \\
\bottomrule
\end{tabular}\\
\scriptsize{\textsuperscript{*}Includes the segmentation backbone. \textsuperscript{\textdagger}Low ECE here reflects under-confidence, not calibration.}
}
\end{table}

Transitioning from the 3-class to the 5-class setting, we observe that the widths of the bootstrap confidence intervals do not differ significantly. This highlights a limitation of bootstrapping, which is inherently constrained by the fixed sample size $N$ rather than the intrinsic difficulty of the optimization landscape. Similarly, while global nCV values remain comparable, the granular per-class analysis (see Appendix~\ref{app:instabilities_325}) reveals highly non-uniform stability: variance spikes for minority phenotypes (nfvPPA, svPPA), where nCV exceeds 0.1 (vs.\ $<$0.01 for controls), yet this volatility is masked in global metrics by the robustness of majority classes. These results show that 1) global metrics alone cannot fully characterize model reliability, and 2) greater task complexity introduces subtype-specific stochastic variability, motivating the proposed stabilization study.

\subsection{Effect of stabilization}

\subsubsection{Training}\label{sec:stab_training}
\tableref{tab:stab_train} reports ablations of training-time stabilization components on Swin-3D DPL for the 5-class task.
Domain-specific 3D MRI augmentation is the only isolated component that consistently improves OOD discrimination.
In contrast, EMA weights and label smoothing, when used alone, induce small changes in accuracy and MCC but systematically degrade ECE and Brier, indicating that they sharpen the decision function at the expense of probability reliability.
The cumulative protocol (\texttt{DA+E+LS+BS+M}) narrows the generalization gap with the stabilized ResNet-18 baseline without relying on architectural changes.
While this composite strategy increases the computational budget, we provide a detailed cost-benefit analysis in Appendix~\ref{app:cost}, demonstrating that the induced overhead is justified by gains in OOD reliability (e.g., $+15\%$ Macro-F1 for both ID and OOD).
Class-balanced sampling and MixUp have limited effect when applied in isolation but are required in the cumulative setting to maintain non-zero F1 for minority FTD subtypes and to prevent decision-boundary collapse on nfvPPA and svPPA.

\begin{table}[htbp]
\floatconts{tab:stab_train}
{\caption{\small \textbf{Training stabilization ablation (5-class).}
Quantitative comparison of training strategies applied to the Swin-3D DPL backbone.
The table reports discrimination (ACC, MCC, PR-AUC, Macro-F1) and calibration (ECE, Brier) metrics for in-domain (ID, 10-fold CV) and out-of-domain (OOD, 10 models averaged predictions) settings.
Within each domain, components are added incrementally to the baseline (configuration codes in parentheses); the last row corresponds to the full cumulative stabilization protocol.
\\
\scriptsize Values: Mean $\pm$ 95\% CI ($B=10^4$), all metrics in \%. $\uparrow$=higher-is-better, $\downarrow$=lower-is-better.}}
{\scriptsize
\setlength{\tabcolsep}{3pt}
\begin{tabular}{@{\hspace{0em}}c @{\hspace{1em}} l c c c c c c@{\hspace{0em}}}
\toprule
& \bfseries Configuration & \bfseries ACC $\uparrow$ & \bfseries MCC $\uparrow$ & \bfseries PR-AUC $\uparrow$ & \bfseries Macro-F1 $\uparrow$ & \bfseries ECE $\downarrow$ & \bfseries Brier $\downarrow$ \\
\midrule

\multirow{6}{*}{\rotatebox[origin=c]{90}{\textbf{ID}}}
& Baseline (\texttt{BL})& \meanpm{78.75}{1.65} & \meanpm{61.60}{2.74} & \meanpm{57.46}{3.06} & \meanpm{49.57}{2.86} & \bfseries 43.76 & 11.67 \\
& + 3D Augments (\texttt{DA}) & \bfseries \meanpm{82.02}{1.52} & \bfseries \meanpm{68.12}{2.57} & \meanpm{62.82}{3.78} & \meanpm{59.26}{3.68} & 46.32 & \bfseries 11.31 \\
& + EMA (\texttt{DA+E}) & \meanpm{81.79}{1.52} & \meanpm{67.42}{2.56} & \meanpm{62.97}{3.83} & \meanpm{61.15}{3.72} & 45.97 & 11.32 \\
& + Label smoothing  (\texttt{DA+E+LS})& \meanpm{81.11}{1.54} & \meanpm{66.15}{2.60} & \meanpm{65.14}{3.75} & \meanpm{58.88}{3.83} & 48.29 & 12.05 \\
& + Balanced sampling (\texttt{DA+E+LS+BS}) & \meanpm{81.33}{1.54} & \meanpm{66.89}{2.62} & \meanpm{65.29}{3.71} & \meanpm{61.42}{3.62} & 48.33 & 11.98 \\
& + MixUp (\texttt{DA+E+LS+BS+M}) & \meanpm{81.62}{1.54} & \meanpm{67.28}{2.59} & \bfseries \meanpm{66.43}{3.89} & \bfseries \meanpm{63.02}{3.71} & 48.94 & 12.04 \\

\midrule

\multirow{6}{*}{\rotatebox[origin=c]{90}{\textbf{OOD}}}
& Baseline (\texttt{BL})& \meanpm{85.74}{1.23} & \meanpm{63.86}{2.95} & \meanpm{70.64}{4.84} & \meanpm{56.18}{3.57} & \bfseries 50.64 & 11.22 \\
& + 3D Augments (\texttt{DA}) & \meanpm{87.39}{1.22} & \meanpm{68.28}{2.88} & \meanpm{72.72}{4.91} & \meanpm{66.21}{4.63} & 51.61 & 10.94 \\
& + EMA  (\texttt{DA+E})& \meanpm{88.24}{1.20} & \meanpm{69.41}{2.89} & \meanpm{74.64}{4.52} & \meanpm{65.66}{4.71} & 52.22 & \bfseries 10.83 \\
& + Label smoothing (\texttt{DA+E+LS})& \meanpm{88.34}{1.16} & \meanpm{69.98}{2.89} & \meanpm{74.10}{4.81} & \meanpm{66.44}{4.50} & 55.25 & 11.65 \\
& + Balanced sampling (\texttt{DA+E+LS+BS})& \meanpm{88.14}{1.19} & \meanpm{70.20}{2.83} & \bfseries \meanpm{75.30}{4.99} & \meanpm{71.44}{4.48} & 55.06 & 11.65 \\
& + MixUp (\texttt{DA+E+LS+BS+M})& \bfseries \meanpm{88.41}{1.19} & \bfseries \meanpm{70.38}{2.87} & \meanpm{74.91}{4.90} & \bfseries \meanpm{71.54}{4.52} & 55.56 & 11.71 \\
\bottomrule
\end{tabular}}
\end{table}

\figureref{fig:cv_analysis_train} analyzes inter-seed stability \textit{via} the normalized coefficient of variation across the training configurations of \tableref{tab:stab_train}.
Data augmentation and EMA consistently improve stability (lower nCV) across all metrics and domains, suggesting they effectively smooth the optimization landscape.
Conversely, MixUp increases ID variability, particularly for ECE, while preserving OOD stability.
This suggests that local variability prevents the model from settling into sharp minima, effectively trading training precision for better generalization.

\begin{figure}[htbp]
\centering
    \includegraphics[width=0.95\textwidth]{fig/swinv1dpl_train_cv.pdf}
    \caption{\small \textbf{Normalized coefficient of variation profiles of training strategies.}
    Bar plots displaying the inter-seed stability for 5 different seeds and multiple training stabilization components described in \tableref{tab:stab_train}, covering both individual strategies and the cumulative protocol.
    Results are separated into in-domain (left) and out-of-domain (right) evaluations.
    We report accuracy, Matthews Correlation Coefficient, Precision-Recall AUC, Macro-F1, Expected Calibration Error, and Brier score for each individual component as well as for their combination.}
    \label{fig:cv_analysis_train}
\end{figure}

Figures~\ref{subfig:wilcoxon} and~\ref{subfig:mcnemar} summarize the statistical validation among the six training protocols of \tableref{tab:stab_train}. We employ Wilcoxon to confirm that performance gains are consistent across training folds (ID, Figure~\ref{subfig:wilcoxon}), and McNemar to validate the diagnostic superiority of the final ensembled system (OOD, Figure~\ref{subfig:mcnemar}).
These matrices show that several numerical differences, including the improvement of the fully stabilized protocol over the baseline, reach standard significance thresholds, whereas intermediate variants such as \texttt{DA+E} often remain statistically indistinguishable despite visible shifts in mean performance.
Beyond these training ablations, McNemar tests on OOD predictions also indicate that Swin-3D DPL significantly outperforms the standard Swin-3D backbone ($p<0.05$), supporting the contribution of the deformable patch inductive bias.
Complementarily, Figure~\ref{subfig:pfo} reports the PFO to assess the risk of illusory gains. It highlights that while the fully stabilized protocol yields a negligible PFO, intermediate strategies often exhibit high risk, \emph{i.e.}, their apparent gains may stem from stochastic variability rather than from genuine signal.

\begin{figure*}[t]
    \footnotesize
    \floatconts{fig:stabilization_summary_grid}
    {\caption{\small\textbf{Statistical assessment of stabilization and robustness.}
    Statistical validation matrices: (a) Wilcoxon signed-rank test $p$-values (in-domain), (b) McNemar test $p$-values (out-of-domain), and (c) Probability of False Outperformance (PFO) heatmap.}}
    {
    \subfigure[\textbf{In-domain Wilcoxon.} Pairwise significance tests on MCC ($N=10$ folds).]{
        \label{subfig:wilcoxon}
        \includegraphics[width=0.29\textwidth]{fig/wilcoxon_significance_matrix.pdf}
    }
    \hfill
    \subfigure[\textbf{OOD McNemar.} Pairwise significance tests on aggregated predictions.]{
        \label{subfig:mcnemar}
        \includegraphics[width=0.29\textwidth]{fig/mcnemar_significance_matrix.pdf}
    }
    \hfill
    \subfigure[\textbf{Reliability (PFO).} Probability of False Outperformance for OOD PR-AUC.]{
        \label{subfig:pfo}
        \includegraphics[width=0.29\textwidth]{fig/pfo_heatmap_pr_auc.pdf}
    }
    }
\end{figure*}

\subsubsection{Models}
\label{sec:stab_model}

\tableref{tab:stab_model} benchmarks architectural regularization techniques commonly employed in large-scale vision transformers. Unlike the data-centric strategies identified in Section~\ref{sec:stab_training}, architectural modifications proved ineffective or detrimental on this limited dataset.

Transitioning from Pre-LN to Post-LN caused a collapse in discrimination, likely due to gradient vanishing in early stages, an issue typically mitigated by massive-batch pretraining not feasible with clinical MRI. Similarly, Sharpness-Aware Minimization (SAM), hypothesized to improve generalization, paradoxically degraded performance. This suggests that SAM's adversarial perturbations disrupt convergence when gradients are already noisy due to the restricted batch sizes of 3D training.

Regarding signal propagation, LayerScale~\citep{Touvron2021CaiT} proved hypersensitive: small initialization ($\gamma_0 = 10^{-5}$) prevented convergence, while $\gamma_0 = 0.1$ merely restored baseline parity. Passive regularization (ShakeDrop, Stable Initialization~\citep{kedia2024transformersstableendtoendsignal}) offered no statistically significant improvement.

Crucially, the cumulative training protocol (last row of \tableref{tab:stab_train}) yields the only significant leap. This confirms that for medical ViTs trained from scratch, the bottleneck lies in the optimization landscape (smoothing \textit{via} augmentation and averaging) rather than in the intrinsic architectural definition.

\begin{table}[htbp]
\floatconts{tab:stab_model}
{\caption{\small \textbf{Model stabilization ablation (5-class).}
\textbf{Architectural and optimization constraints} evaluated in the in-domain (ID, 10-fold CV) and out-of-domain (OOD, 10 models averaged predictions) settings. Standard vision strategies prove ineffective compared to the optimization protocol defined in \tableref{tab:stab_train}.\\
\scriptsize Values: Mean $\pm$ 95\% CI ($B=10^4$), all metrics in \%.
$\uparrow$=higher-is-better, $\downarrow$=lower-is-better.}}
{\scriptsize\setlength{\tabcolsep}{3pt}
\setlength{\tabcolsep}{2pt}
\begin{tabular}{c @{\hspace{1em}} l c c c c c c}
\toprule
& \bfseries Configuration & \bfseries ACC $\uparrow$ & \bfseries MCC $\uparrow$ & \bfseries PR-AUC $\uparrow$ & \bfseries Macro-F1 $\uparrow$ & \bfseries ECE $\downarrow$ & \bfseries Brier $\downarrow$ \\
\midrule

\multirow{6}{*}{\rotatebox[origin=c]{90}{\textbf{ID}}}
& Baseline & \meanpm{78.75}{1.65} & \meanpm{61.60}{2.74} & \meanpm{57.46}{3.06} & \meanpm{49.57}{2.86} & 43.76 & \bfseries 11.67 \\
& Post-LN & \meanpm{62.25}{1.93} & \meanpm{25.94}{2.71} & \meanpm{48.58}{3.01} & \meanpm{29.92}{3.34} & \bfseries 26.51 & 12.64\textsuperscript{\textdagger} \\
& ShakeDrop & \bfseries \meanpm{78.83}{1.62} & \bfseries\meanpm{62.28}{2.65} & \meanpm{58.27}{3.37} & \bfseries\meanpm{53.75}{3.38} & 44.68 & 11.86 \\
& Stable Init & \meanpm{77.88}{1.64} & \meanpm{59.87}{2.71} & \bfseries\meanpm{58.31}{3.32} & \meanpm{48.83}{2.88} & 43.32 & 11.78 \\
& LayerScale ($\gamma_0{=}0.1$) & \meanpm{78.17}{1.62} & \meanpm{60.30}{2.72} & \meanpm{57.61}{3.33} & \meanpm{48.24}{2.90} & 43.70 & 11.79 \\
& SAM ($\rho{=}0.05$) & \meanpm{66.68}{1.87} & \meanpm{37.04}{3.09} & \meanpm{39.76}{2.48} & \meanpm{31.68}{2.22} & 34.62 & 12.99 \\

\midrule

\multirow{6}{*}{\rotatebox[origin=c]{90}{\textbf{OOD}}}
& Baseline & \bfseries\meanpm{85.74}{1.23} & \bfseries\meanpm{63.86}{2.95} & \bfseries\meanpm{70.64}{4.84} & \meanpm{56.18}{3.57} & 50.64 & 11.22 \\
& Post-LN & \meanpm{79.23}{1.47} & \meanpm{28.48}{3.36} & \meanpm{61.41}{4.34} & \meanpm{36.60}{4.62} & \bfseries 43.36 & \bfseries 11.13\textsuperscript{\textdagger} \\
& ShakeDrop & \meanpm{84.61}{1.30} & \meanpm{62.67}{2.97} & \meanpm{70.13}{4.90} & \bfseries\meanpm{58.35}{3.80} & 50.48 & 11.53 \\
& Stable Init & \meanpm{85.71}{1.25} & \meanpm{63.66}{2.99} & \meanpm{70.20}{4.75} & \meanpm{56.82}{3.49} & 50.94 & 11.29 \\
& LayerScale ($\gamma_0{=}0.1$) & \meanpm{85.50}{1.27} & \meanpm{62.71}{3.08} & \meanpm{68.71}{4.79} & \meanpm{53.24}{3.85} & 50.78 & 11.30 \\
& SAM ($\rho{=}0.05$) & \meanpm{77.97}{1.51} & \meanpm{39.90}{3.49} & \meanpm{47.18}{3.64} & \meanpm{30.65}{1.98} & 46.14 & 12.33 \\

\bottomrule
\end{tabular}
  \scriptsize{\\\textsuperscript{\textdagger}Low ECE and Brier here reflect under-confidence, not calibration.}}
\end{table}

\subsubsection{Evaluation}\label{sec:evalutation}
To assess stability across imbalanced classes, we employ nCV to decouple intrinsic stability from sample size bias ($\sigma \propto 1/\sqrt{N}$) (see Appendix~\ref{app:stability_metrics}). As shown in \figureref{fig:multi_seed_cv}, while raw CV decreases mechanically with $N$, nCV remains invariant once $N > 1000$, enabling fair comparison between majority classes and rare FTD subtypes.

\begin{figure}[t]
\includegraphics[width=0.95\textwidth]{fig/metric_cv_vs_data_availability_tta_comparison.pdf}
\caption{\small\textbf{Impact of sample size and Test-Time Augmentation (TTA) on stability.} Evolution of the normalized coefficient of variation across 5 random seeds as a function of dataset size. We compare standard inference (No-TTA, dashed lines) against TTA (solid lines) for both In-Domain (ID) and Out-of-Domain (OOD) settings. Note that while TTA acts as an effective stabilizer in the ID regime (lowering nCV, particularly for Macro-F1), this benefit does not consistently transfer to the OOD setting.}
\label{fig:multi_seed_cv}
\end{figure}

Regarding inference-time strategies quantified in \tableref{tab:stab_eval}, snapshot ensembling emerges as the most effective method for variance reduction.
By averaging predictions across the top-$K$ validation checkpoints, ensembling effectively marginalizes the local optimization noise inherent to the loss landscape of Transformers.
As shown in Figure~\ref{fig:cv_analysis_eval} (Appendix~\ref{app:instabilities_325}), this strategy yields the lowest nCV across all metrics in both ID and OOD regimes.
Furthermore, the stability analysis in \figureref{fig:ensemble_models_cv} indicates that these benefits saturate beyond $K=12$ models, suggesting a diminishing return that balances computational cost with reliability.
Complementarily, post-hoc temperature scaling significantly improves calibration, lowering the OOD Brier score (from 11.71 to 10.99) while leaving ranking metrics (PR-AUC) essentially unchanged, confirming that calibration errors can be addressed orthogonally to discrimination stability.

\begin{table}[htbp]
\floatconts{tab:stab_eval}
{\caption{\small \textbf{Inference-time strategies performance.}
Quantitative comparison of Test-Time Augmentation (TTA), Temperature Scaling, and Snapshot Ensembling ($K=12$) applied to the fully stabilized Swin-3D DPL model.
We report discrimination (ACC, MCC, PR-AUC, Macro-F1) and reliability (ECE, Brier) metrics for the in-domain and out-of-domain settings.\\
\scriptsize Values: Mean $\pm$ 95\% CI ($B=10^4$), all metrics in \%. $\uparrow$=higher-is-better, $\downarrow$=lower-is-better.}}
{\scriptsize
\setlength{\tabcolsep}{2pt}
\begin{tabular}{c @{\hspace{1em}} l c c c c c c}
\toprule
& \bfseries Variant & \bfseries ACC $\uparrow$ & \bfseries MCC $\uparrow$ & \bfseries PR-AUC $\uparrow$ & \bfseries Macro-F1 $\uparrow$ & \bfseries ECE $\downarrow$ & \bfseries Brier $\downarrow$ \\
\midrule

\multirow{5}{*}{\rotatebox[origin=c]{90}{\textbf{ID}}}
& Baseline & \meanpm{81.62}{1.54} & \meanpm{67.28}{2.59} & \meanpm{66.43}{3.89} & \meanpm{63.02}{3.71} & 48.94 & 12.04 \\
& TTA & \bfseries \meanpm{82.07}{1.54} & \bfseries \meanpm{68.82}{2.51} & \bfseries \meanpm{67.75}{3.83} & \meanpm{66.73}{3.38} & 51.44 & 12.62 \\
& Temp. Scale & \meanpm{81.54}{1.52} & \meanpm{67.08}{2.58} & \meanpm{65.61}{3.98} & \meanpm{63.02}{3.71} & \bfseries 46.24 & \bfseries 11.44 \\
& TTA + Temp. & \bfseries \meanpm{82.07}{1.54} & \bfseries \meanpm{68.82}{2.51} & \meanpm{67.17}{3.97} & \bfseries \meanpm{67.17}{3.31} & 46.97 & 11.50 \\
& Ensemble & \meanpm{82.02}{1.54} & \meanpm{68.09}{2.56} & \meanpm{65.31}{3.79} & \meanpm{62.88}{3.63} & 49.78 & 12.15 \\

\midrule

\multirow{5}{*}{\rotatebox[origin=c]{90}{\textbf{OOD}}}
& Baseline & \meanpm{88.41}{1.19} & \meanpm{70.38}{2.87} & \meanpm{74.91}{4.90} & \meanpm{71.54}{4.52} & 55.56 & 11.71 \\
& TTA & \meanpm{86.43}{1.23} & \meanpm{67.40}{2.83} & \meanpm{74.66}{5.00} & \meanpm{70.23}{4.03} & 55.85 & 12.44 \\
& Temp. Scale & \bfseries \meanpm{88.86}{1.16} & \bfseries \meanpm{71.46}{2.82} & \meanpm{74.34}{5.08} & \meanpm{71.54}{4.50} & 53.41 & \bfseries 10.99 \\
& TTA + Temp. & \meanpm{86.46}{1.25} & \meanpm{67.63}{2.83} & \meanpm{74.08}{5.14} & \meanpm{70.67}{3.99} & \bfseries 51.51 & 11.22 \\
& Ensemble & \meanpm{88.31}{1.18} & \meanpm{70.56}{2.82} & \bfseries \meanpm{75.90}{4.62} & \bfseries \meanpm{72.15}{4.40} & 55.85 & 11.83 \\

\bottomrule
\end{tabular}
}
\end{table}

Conversely, our results expose a key limitation of test-time augmentation (TTA) in neuroimaging. Although TTA is typically used to reduce variance, it proved detrimental in this differential-diagnosis setting, degrading OOD discrimination. As shown in \figureref{fig:multi_seed_cv}, TTA does not yield the consistent stability gains provided by ensembling. This suggests that the applied augmentations, though aligned with the training transforms, are either not representative of the OOD distribution or not well suited to our architecture.

\section{Discussion}
\label{sec:discussion}

\paragraph{Optimization landscape and the calibration-discrimination trade-off.}
While ViTs theoretically offer global receptive fields capable of modeling distributed atrophy patterns \citep{Dosovitskiy2021ViT}, their application to medical imaging is constrained by weak inductive biases and limited cohort sizes.
Our findings suggest that the performance gap often observed between pure ViTs and CNNs stems largely from optimization: in small, heterogeneous clinical cohorts, sharp minima and sensitivity to initialization dominate model behaviour.
Medical data specificities, such as acquisition noise, inter-site heterogeneity, and pronounced class imbalance, exacerbate this instability in unregularized Transformers.

Crucially, our analysis of the Brier score reveals a recurrent trade-off: models achieving the highest discrimination (Accuracy/MCC) often exhibit degradation in probabilistic reliability.
Without regularization, the minimization of cross-entropy drives the network toward over-confident predictions (sharp decision boundaries); consequently, when these ``superior'' models are wrong, they are wrong with high confidence, which penalizes the Brier score.
We show that a targeted training scheme combining domain-specific 3D augmentation and regularization mitigates this behavior, enabling Swin-based models to match the robustness of strong baselines while preserving calibration in both in-domain and out-of-domain settings.

\paragraph{The potential of Transformers and the hybrid bridge.}
Contrary to the narrative that Transformers are inherently unsuitable for small-scale medical datasets, our results with MedViT-3D demonstrate that attention-based models can indeed outperform robust CNN baselines (ResNet-18) and segmentation ensembles. This validates the potential of token-based architectures to capture subtle, distributed markers of neurodegeneration that may elude purely local convolutional filters.

However, this performance must be considered carefully.
The stability of MedViT stems largely from its hybrid design, where convolutional stems inject inductive biases that smooth the optimization. In contrast, pure or hierarchical Transformers, which lack this explicit structural guidance, exhibit high variance and optimization brittleness (as seen in Figure~\ref{fig:archs_5c_cv_analysis}).
While hybrids offer a performance gain, investigating the stabilization of standard Transformers remains essential. Unlike CNNs, whose performance tends to saturate, Transformers exhibit favorable scaling laws~\citep{Dosovitskiy2021ViT} and offer the unified architecture required for future multimodal integration (e.g., fusing MRI, PET, and tabular neuropsychological scores).
Establishing robust stabilization schemes for these backbones is therefore a prerequisite for deploying scalable, multimodal architectures in clinical settings without relying on convolutional backbones.
While the integration of complementary modalities (e.g., MRI+PET+Tabular) is expected to reduce uncertainty by resolving phenotypic ambiguities, especially for FTD subtypes~\citep{metzFrontotemporalDementiaSubtyping2025}, it simultaneously increases input complexity.
Just as we observed a counter-intuitive trade-off where improved discrimination came at the cost of calibration (increased Brier/ECE), we caution against the assumption that multimodal integration naturally guarantees stability.
While new modalities introduce inductive biases, they also expand the token space, which may paradoxically exacerbate optimization instability.
Therefore, rather than relying on intuition, we argue that the proposed stabilization principles remain important to prevent overfitting in these high-dimensional regimes.

\paragraph{Disentangling data variance from optimization instability.}
Our scaling analysis (Figure~\ref{fig:scaling}) aligns with theoretical expectations (\(1/\sqrt{N}\)): confidence interval widths decrease predictably as the number of subjects grows.
As shown by \citet{eljurdiConfidenceIntervalsPerformance2025a}, bootstrapping provides a computationally efficient way to estimate confidence intervals without distributional assumptions. However, it captures only data-driven variance (aleatoric uncertainty), not the intrinsic architecture instability (epistemic/optimization uncertainty). Thus, two models may exhibit identical bootstrap CIs yet differ greatly in their sensitivity to initialization. Ideally, architectural stability would be assessed by averaging predictions over many random seeds, but the cost of 3D training makes this impractical for routine development.
Consequently, the stabilization techniques used here serve as a practical proxy: by flattening the loss landscape and enforcing consistency, they reduce the seed-induced error of single-seed training, which remains standard in clinical deep-learning deployment.
It is important to distinguish between the computational cost of validating stability and the cost of deploying a stabilized model. While we employed extensive multi-seed evaluation to rigorously quantify optimization noise, this computationally intensive process serves as a research instrument rather than a requirement for deployment. The primary objective of the proposed stabilization protocol is to quantify and reduce inter-seed variability. By effectively reducing the nCV across architectures, the proposed protocol allows practitioners to rely on a single robust training cycle. This eliminates the need for prohibitive multi-seed ensembling in clinical practice, offering a favorable trade-off between training duration and diagnostic reliability (see Cost Analysis in Appendix~\ref{app:cost}).

\paragraph{Limitations.}
The OOD cohort exhibits significant class imbalance, particularly for nfvPPA and svPPA. Despite reporting macro-averaged metrics and employing balanced sampling, evaluation variance for these minority classes remains elevated, as confirmed by per-class stability analysis.
Furthermore, label noise presents a specific challenge in the differential diagnosis of PPA. Clinical ground truth relies heavily on neuropsychological assessments and language tests, forming a multimodal composite reference standard.
Training MRI-only models on these targets introduces inherent ambiguity, as structural signatures may lag behind or only partially reflect clinical phenotypes defined by non-imaging tests.

We acknowledge that this label noise is a significant factor. For example, \citet{selvackaduncoComparisonClinicalNeuropathological2019} have shown that more than a third of clinical FTD diagnoses are discordant with the final neuropathological diagnosis in their Brains for Dementia Research (BDR) cohort~\citep{Thomas2017}. However, we argue that this constitutes an irreducible error (random uncertainty) inherent to large-scale clinical datasets~\citep{karimiDeepLearningNoisy2020}. Our results (\figureref{fig:cv_f1_archs_3c,fig:cv_f1_archs_5c}) demonstrate that while CNN baselines maintain low variance on these noisy classes, unregularized Transformers still exhibit volatility.
This divergence indicates that the instability we address, and which our protocol aims to mitigate, is effectively optimization-driven, caused by the model overfitting to ambiguous labels, and is distinct from the data-inherent noise.

Finally, regarding initialization strategies, we acknowledge the growing importance of self-supervised learning (SSL) on large-scale cohorts.
However, our study explicitly targets the \textit{from-scratch} training regime, which remains a standard scenario in clinical settings where massive unlabeled datasets or domain-specific foundation models are not always available due to privacy or data concerns.
While SSL can inject strong inductive biases~\citep{waldOpenMind3DMedical2025}, understanding how to stabilize architectures on limited data remains a prerequisite for their wider adoption.
Future work should investigate the potential synergy between our optimization protocol and SSL initialization to further reduce variance in rare subtypes.

To summarize our findings into an actionable format, we provide a practitioner's guide in Table~\ref{tab:guidelines}. Note that we do not provide any guidelines for architectural modifications (Section~\ref{sec:stab_model}): even if these architectural changes do not yield good results in our case, they are subject to initialization conditions and the chosen architecture, which is why we cannot make a definitive conclusion.

\begin{table}[htbp]
\floatconts{tab:guidelines}
{\caption{\small \textbf{Our guide to ViT stabilization in medical imaging.}
Summary of best practices derived from our ablation study on differential diagnosis.
We synthesize recommendations for Architecture, Training, Inference, and Evaluation to mitigate stochastic variability.}}
{\scriptsize
\setlength{\tabcolsep}{3pt}
\begin{tabular}{l p{0.32\linewidth} p{0.45\linewidth}}
\toprule
\bfseries Component & \bfseries Recommendation & \bfseries Rationale (Based on our findings) \\
\midrule
\bfseries Architecture &
Prioritize Hierarchical (e.g., Swin) over Vanilla ViT &
Vanilla ViTs exhibit optimization collapse on small 3D datasets; hierarchical biases stabilize convergence, see~\tableref{tab:all_archs_stabilized}. \\
\addlinespace
\bfseries Training &
Combine: 3D Aug. + EMA + MixUp + Balanced Sampling &
Individual components are insufficient. The composite protocol closes the gap with CNNs and reduces variance, see~\tableref{tab:stab_train} and~\figureref{fig:cv_analysis_train}. \\
\addlinespace
\bfseries Inference &
Snapshot Ensembling ($K \approx 12$); Investigate TTA applicability &
Ensembling is the strongest variance reducer, see~\figureref{fig:ensemble_models_cv}. TTA may degrade OOD performance on asymmetric pathologies (e.g., PPA) as shown in~\tableref{tab:stab_eval}. However, this behavior may be dataset-specific and highly dependent on the task and chosen augmentations; thus, this advice should be interpreted with caution. \\
\addlinespace
\bfseries Evaluation &
Report nCV \& Brier Score instead of pure Accuracy/ECE &
Accuracy masks instability in minority classes, see~\figureref{fig:cv_f1_archs_5c}. ECE can be deceptively low due to under-confidence. \\
\bottomrule
\end{tabular}
}
\end{table}

\section{Conclusion}
\label{sec:conclusion}

This work examined the stability and reproducibility of Vision Transformers for neurodegenerative disease classification from structural MRI.
All deep models exhibited non-negligible variability across seeds, with ViTs showing the highest sensitivity in the low-data, multiclass differential diagnosis setting involving FTD variants.
Through systematic ablation, we showed that a tailored optimization protocol, combining domain-specific 3D MRI augmentation, optimization smoothing, and balanced sampling, substantially reduces variance and enables Swin-based models to approach the robustness of strong CNN baselines in both in-domain and out-of-domain evaluations, without modifying the backbone architecture.

Our uncertainty-aware evaluation framework, based on patient-level bootstrapping, calibration analysis, and the probability of false outperformance, revealed that differences that appear meaningful at the level of mean accuracy often fall within stochastic variability once uncertainty is quantified.
Reproducible medical deep learning therefore requires going beyond single-seed point estimates to routinely report calibration, confidence intervals, and ranking stability across seeds and distribution shifts.
Future work should investigate whether self-supervised pretraining on large unlabeled cohorts and the integration of non-imaging clinical covariates can provide additional inductive biases to stabilize Transformer-based classifiers while reducing the need for heavy regularization.

\clearpage

\midlacknowledgments{
This work benefited from the support of the project HoliBrain of the French National Research Agency (ANR-23-CE45-0020-01). 
This project is supported by the Precision and Global Vascular Brain Health Institute funded by the France 2030 investment plan as part of the IHU3 initiative (ANR-23-IAHU-0001). 
This study received financial support from the French government in the framework of the University of Bordeaux's France 2030 program / RRI ``IMPACT'' and the PEPR StratifyAging and PEPR Prodrom-ND. 
This work benefited from the support of the project ChatvolBrain from CNRS. This work was granted access to the HPC resources of IDRIS under the allocation AD011013926R3 made by GENCI.

The ADNI data used in the preparation of this manuscript were obtained from the Alzheimer's Disease Neuroimaging Initiative (ADNI) (National Institutes of Health Grant U01 AG024904). The ADNI is funded by the National Institute on Aging and the National Institute of Biomedical Imaging and Bioengineering and through generous contributions from the following: Abbott, AstraZeneca AB, Bayer Schering Pharma AG, Bristol-Myers Squibb, Eisai Global Clinical Development, Elan Corporation, Genentech, GE Healthcare, GlaxoSmithKline, Innogenetics NV, Johnson and Johnson, Eli Lilly and Co., Medpace Inc., Merck and Co. Inc., Novartis AG, Pfizer Inc., F. Hoffmann-La Roche, Schering-Plough, Synarc Inc., as well as nonprofit partners, the Alzheimer's Association and Alzheimer's Drug Discovery Foundation, with participation from the U.S. Food and Drug Administration. Private sector contributions to the ADNI are facilitated by the Foundation for the National Institutes of Health (\url{www.fnih.org}). The grantee organization is the Northern California Institute for Research and Education, and the study was coordinated by the Alzheimer's Disease Cooperative Study at the University of California, San Diego. ADNI data are disseminated by the Laboratory for NeuroImaging at the University of California, Los Angeles. This research was also supported by NIH grants P30 AG010129, K01 AG030514, and the Dana Foundation.

The Frontotemporal Lobar Degeneration Neuroimaging Initiative (FTLDNI) was funded through the National Institute on Aging and started in 2010. The primary goals of FTLDNI were to identify neuroimaging modalities and methods of analysis for tracking frontotemporal lobar degeneration (FTLD) and to assess the value of imaging versus other biomarkers in diagnostic roles.
The Principal Investigator of NIFD was Dr.~Howard Rosen, MD, at the University of California, San Francisco. The data are the result of collaborative efforts at three sites in North America. For up-to-date information on participation and protocol, please visit \url{http://memory.ucsf.edu/research/studies/nifd}. Data collection and sharing for this project were funded by the Frontotemporal Lobar Degeneration Neuroimaging Initiative (National Institutes of Health). The study is coordinated through the University of California, San Francisco, Memory and Aging Center. FTLDNI data are disseminated by the Laboratory for Neuro Imaging at the University of Southern California.

The NACC database was funded by NIA/NIH Grants listed at \url{https://naccdata.org/publishproject/authors-checklist\#acknowledgment}.

Data collection and dissemination of the data presented in this manuscript was supported by the ALLFTD Consortium (U19 AG063911, funded by the National Institute on Aging and the National Institute of Neurological Diseases and Stroke) and the former ARTFL and LEFFTDS Consortia (ARTFL: U54 NS092089, funded by the National Institute of Neurological Diseases and Stroke and National Center for Advancing Translational Sciences; LEFFTDS: U01 AG045390, funded by the National Institute on Aging and the National Institute of Neurological Diseases and Stroke). The authors acknowledge the invaluable contributions of the study participants and families as well as the assistance of the support staff at each of the participating sites.
}

\clearpage

\small\bibliography{midl26_34}

\clearpage

\appendix

\section{Extended dataset description}
\label{app:datasets}

This appendix provides detailed inclusion criteria, cohort characteristics, and full demographic tables referenced in Section~\ref{sec:datasets}.

\subsection{Cohort overview}
The In-Domain (ID) pool combines ADNI and ALLFTD, two harmonized longitudinal initiatives capturing AD and FTD clinical spectra. The Out-of-Domain (OOD) pool merges NIFD and NACC, which differ substantially in acquisition protocols, diagnostic granularity, recruitment strategies, and site diversity, thereby inducing both covariate and label distribution shifts.

\subsection{Subject selection}
All analyses rely strictly on cross-sectional sampling. For each participant, we retain a single baseline T1w MRI to prevent leakage from repeated sessions, longitudinal progression, or variable scan quality. CN subjects must exhibit consistent diagnosis throughout follow-up. Participants with mixed or inconsistent diagnoses, atypical comorbidities, or missing metadata are excluded.

Because ALLFTD is genetically enriched, we remove carriers of known pathogenic mutations (e.g., \emph{MAPT}, \emph{C9orf72}, \emph{GRN}) to align it with the predominantly sporadic composition of NIFD. For FTD subtyping, diagnostic labels follow each consortium's clinical adjudication protocols; ambiguous or overlapping classifications are excluded.

\subsection{Imaging pipeline}
All T1w volumes undergo a unified preprocessing workflow to reduce site-specific variance:
\begin{enumerate}
    \item N4 bias-field correction~\citep{Tustison2010N4};
    \item brain extraction;
    \item affine + diffeomorphic ANTs registration to MNI~\citep{Avants2011ANTS,Fonov2011ICBM};
    \item resampling to a $1\,\mathrm{mm}$ isotropic grid;
    \item per-subject $Z$-score intensity normalization;
    \item center-cropping to a fixed 3D field of view.
\end{enumerate}
The same pipeline is applied across all cohorts. Scans failing QC after registration or skull stripping are discarded.

\subsection{Distribution shifts}
The NIFD+NACC OOD pool exhibits pronounced long-tail distributions. FTD subtypes are rare, especially svPPA and nfvPPA, which each account for only $43$ subjects. In contrast, the CN group exceeds two thousand samples and the AD group several hundred in NACC alone. This mismatch reflects real clinical prevalence rather than sampling artifacts.

These shifts imply:
\begin{itemize}
    \item limited statistical power for subtype-resolved OOD metrics;
    \item sensitivity of bootstrap confidence intervals to minority-class scarcity;
    \item an increased role of calibration measures over raw accuracy;
    \item a stringent test of robustness to both covariate (scanner/site) and label (class ratio) shifts.
\end{itemize}

\subsection{Cross-validation strategy}
ID experiments use 10-fold patient-level CV. Stratification jointly accounts for dataset membership, diagnostic category, biological sex, and discretized age (5-bin scheme). Within each fold, data are split into 70\%/20\%/10\% train/validation/test. Each subject appears exactly once in an ID test set across folds.

OOD evaluation is performed on a fixed held-out NIFD+NACC set with no overlap with ID subjects.

\subsection{Detailed demographics and statistical assessment}
\label{app:demographics}

\begin{table}[htbp]
\floatconts
{tab:dataset_counts_extended}
{\caption{
\small\textbf{Subject distribution across cohorts.} 
The in-domain (ID) set combines ADNI and ALLFTD, while the out-of-domain (OOD) set aggregates NIFD and NACC. Cells report the number of subjects with sex distribution \textit{(Female/Male)} in the first line, and age \textit{mean [min-max]} range in the second line. Dataset names include magnetic field strength and the count of unique scanner models.
}}
{
\centering
\scriptsize
\setlength{\tabcolsep}{3pt}
\renewcommand{\arraystretch}{1.2}
\begin{tabular}{
>{\centering\arraybackslash}m{0.8cm}|
>{\centering\arraybackslash}m{1.6cm}|
*{5}{>{\centering\arraybackslash}c}|
>{\centering\arraybackslash}c}
\toprule
{\multirow{2}{*}{\rotatebox[origin=c]{90}{\textbf{Group}}}} & {\multirow{2}{*}{\textbf{Dataset}}} & 
\multicolumn{5}{c|}{\textbf{Diagnosis}} & 
    \multirow{2}{*}{\textbf{Total}} \\
\cmidrule(lr){3-7}
     &  & CN & AD & bvFTD & nfvPPA & svPPA &  \\
\midrule

\multirow{6}{*}{\rotatebox[origin=c]{90}{\textbf{ID}}} 
 & ADNI \newline \textit{\tiny (1.5T/3T)} \newline \textit{\tiny 30 scanners}
 & \cell{1090 (688/402)}{69.7 [50-90]} & \cell{649 (281/368)}{75.1 [55-94]} & -- & -- & -- & \cell{1739 (969/770)}{71.7 [50-94]} \\
 
 & ALLFTD \newline \textit{\tiny (3T)} \newline \textit{\tiny 19 scanners}
 & \cell{322 (203/119)}{46.3 [18-79]} & \cell{5 (4/1)}{66.8 [60-71]} & \cell{229 (76/153)}{64.5 [40-85]} & \cell{66 (36/30)}{68.9 [48-83]} & \cell{76 (39/37)}{66.0 [50-86]} & \cell{698 (358/340)}{56.7 [18-86]} \\
\cmidrule(lr){2-8}

 & \textbf{Total} 
 & \cell{\textbf{1412} (891/521)}{64.4 [18-90]} 
 & \cell{\textbf{654} (285/369)}{75.0 [55-94]} 
 & \cell{\textbf{229} (76/153)}{64.5 [40-85]} 
 & \cell{\textbf{66} (36/30)}{68.9 [48-83]} 
 & \cell{\textbf{76} (39/37)}{66.0 [50-86]} 
 & \cell{\textbf{2437} (1327/1110)}{67.4 [18-94]} \\
\midrule\midrule

\multirow{6}{*}{\rotatebox[origin=c]{90}{\textbf{OOD}}} 
 & NIFD \newline \textit{\tiny (3T)} \newline \textit{\tiny 3 scanners}
 & \cell{136 (77/59)}{63.5 [39-81]} & -- & \cell{74 (23/51)}{61.8 [45-74]} & \cell{37 (20/17)}{68.8 [54-81]} & \cell{39 (15/24)}{63.4 [50-73]} 
 & \cell{286 (135/151)}{63.7 [39-81]} \\

 & NACC \newline \textit{\tiny (3T)} \newline \textit{\tiny 16 scanners}
 & \cell{2115 (1437/678)}{68.1 [19-100]} & \cell{485 (266/219)}{72.3 [38-96]} & \cell{26 (10/16)}{64.4 [54-73]} & \cell{6 (4/2)}{68.0 [57-77]} & \cell{4 (3/1)}{64.9 [57-81]} 
 & \cell{2636 (1720/916)}{68.8 [19-100]} \\
\cmidrule(lr){2-8}

 & \textbf{Total} 
 & \cell{\textbf{2251} (1514/737)}{67.8 [19-100]} 
 & \cell{\textbf{485} (266/219)}{72.3 [38-96]} 
 & \cell{\textbf{100} (33/67)}{62.5 [45-74]} 
 & \cell{\textbf{43} (24/19)}{68.7 [54-81]} 
 & \cell{\textbf{43} (18/25)}{63.5 [50-81]} 
 & \cell{\textbf{2922} (1855/1067)}{68.3 [19-100]} \\
\bottomrule
\end{tabular}
}
\end{table}

\tableref{tab:dataset_counts_extended} details the distribution of diagnoses across datasets, whereas \tableref{tab:dataset_statistics} shows the demographic characteristics of the cohorts.
We performed statistical testing to assess group differences. 
For age, a Kruskal-Wallis test revealed significant differences across diagnostic groups in both ID ($H=421.28$, $p<0.001$) and OOD ($H=147.10$, $p<0.001$) settings. 
For sex distribution, a Chi-square test also indicated significant variations in ID ($\chi^2=115.83$, $p<0.001$) and OOD ($\chi^2=79.30$, $p<0.001$).

We acknowledge that the cohorts are not explicitly matched. However, these differences largely reflect the epidemiological reality of the diseases (e.g., AD generally occurs later than subtypes of FTD). Although resampling matched subcohorts would eliminate these confounding factors, it would significantly reduce the sample size of already rare classes (e.g., nfvPPA/svPPA), making it even more difficult to train Vision Transformers, which require large amounts of data. Therefore, we prioritized maximizing sample size and diversity in order to study model stability under realistic clinical conditions. Our stratified cross-validation strategy ensures that these demographic distributions are closely matched between the training and validation sets within each fold to avoid bias during evaluation.

\begin{table}[htbp]
\floatconts
{tab:dataset_statistics}
{\caption{
\small\textbf{Detailed demographics and statistical assessment.} 
Distribution of age (Mean $\pm$ Std) and sex (Female/Male) across diagnostic groups.
Statistical comparisons were performed using the Kruskal-Wallis test for age and Pearson's $\chi^2$ test for sex comparisons. 
Both tests reveal highly significant differences ($p < 0.001$) in In-Domain (ID) and Out-of-Domain (OOD) settings, statistically confirming that the groups are not demographically matched. 
This heterogeneity reflects the distinct epidemiological profiles of the diseases (e.g., AD typically has a later onset than FTD subtypes).
}}
{
\centering
\scriptsize
\setlength{\tabcolsep}{4pt}
\renewcommand{\arraystretch}{1.2}
\begin{tabular}{l|c|ccccc|c}
\toprule
\textbf{Metric} & \textbf{Set} & \textbf{CN} & \textbf{AD} & \textbf{bvFTD} & \textbf{nfvPPA} & \textbf{svPPA} & \textbf{Stats (p-value)} \\
\midrule

\multirow{4}{*}{\textbf{Age}} 
 & \multirow{2}{*}{ID} 
 & $64.4 \pm 13.2$ & $75.0 \pm 8.1$ & $64.5 \pm 7.8$ & $68.9 \pm 8.1$ & $66.0 \pm 7.2$ & \multirow{2}{*}{\cell{$H=421.3$}{$(<0.001)$}} \\
 & & ($n=1412$) & ($n=654$) & ($n=229$) & ($n=66$) & ($n=76$) & \\
 \cmidrule{2-8}
 & \multirow{2}{*}{OOD} 
 & $67.8 \pm 10.8$ & $72.3 \pm 9.6$ & $62.5 \pm 6.2$ & $68.7 \pm 7.2$ & $63.5 \pm 6.6$ & \multirow{2}{*}{\cell{$H=147.1$}{$(<0.001)$}} \\
 & & ($n=2251$) & ($n=485$) & ($n=100$) & ($n=43$) & ($n=43$) & \\
\midrule

\multirow{4}{*}{\textbf{Sex} \tiny{(F/M)}} 
 & \multirow{2}{*}{ID} 
 & 891 / 521 & 285 / 369 & 76 / 153 & 36 / 30 & 39 / 37 & \multirow{2}{*}{\cell{$\chi^2=115.8$}{$(<0.001)$}} \\
 & & (63\% F) & (44\% F) & (33\% F) & (55\% F) & (51\% F) & \\
 \cmidrule{2-8}
 & \multirow{2}{*}{OOD} 
 & 1514 / 737 & 266 / 219 & 33 / 67 & 24 / 19 & 18 / 25 & \multirow{2}{*}{\cell{$\chi^2=79.3$}{$(<0.001)$}} \\
 & & (67\% F) & (55\% F) & (33\% F) & (56\% F) & (42\% F) & \\

\bottomrule
\end{tabular}
}
\end{table}

\section{Augmentation protocols}
\label{app:augmentation}

\tableref{tab:aug_params} details the hyperparameter configurations for the MRI-specific data augmentation pipeline implemented via \texttt{MONAI}. All transformations are applied stochastically during training with the specified probabilities.

\begin{table}[htbp]
    \centering
    \caption{\small Detailed hyperparameters for 3D MRI augmentations. Probabilities ($p$) indicate the likelihood of applying the transform per sample. Note that sagittal flipping is included to enforce anatomical invariance during training.}
    \label{tab:aug_params}
    \small
    \setlength{\tabcolsep}{2pt}
    \renewcommand{\arraystretch}{0.95}
    \begin{tabular}{l l p{6cm}}
    \toprule
    \textbf{Category} & \textbf{Transform (MONAI)} & \textbf{Parameters} \\
    \midrule
    \multirow{3}{*}{\textbf{Spatial}} 
    & \texttt{RandAffine} & $p=0.5$, Rot $\pm 30^{\circ}$, Scale $\pm 0.3$, Trans $\pm 10$ vox, Padding: border \\
    & \texttt{Rand3DElastic} & $p=0.2$, $\sigma \in [5, 8]$, Magnitude $\in [100, 200]$ \\
    & \texttt{RandFlip} & $p=0.5$, Axis 0 (Sagittal) \\
    \midrule
    \multirow{6}{*}{\textbf{Intensity}} 
    & \texttt{RandBiasField} & $p=0.3$, Coeff range $\in [0.0, 0.3]$ (Order 3) \\
    & \texttt{RandAdjustContrast} & $p=0.3$, $\gamma \in [0.7, 1.5]$ \\
    & \texttt{RandScaleIntensity} & $p=0.3$, Factor $\in [-0.5, 1.0]$ \\
    & \texttt{RandHistogramShift} & $p=0.2$, Control points $\in [5, 15]$ \\
    & \texttt{AdaptiveGaussianNoise} & $p=0.2$, Factor$=0.1$ (injection relative to std) \\
    & \texttt{AdaptiveRicianNoise} & $p=0.2$, Standard Rician injection \\
    \midrule
    \multirow{2}{*}{\textbf{Artifacts}} 
    & \texttt{RandGibbsNoise} & $p=0.2$, $\alpha \in [0.5, 1.0]$ \\
    & \texttt{RandKSpaceSpikeNoise} & $p=0.1$, Intensity $\in [13, 15]$ (k-space scale) \\
    \bottomrule
    \end{tabular}
\end{table}

\section{Detailed stabilization protocols}
\label{app:stabilization_details}

This section provides the mathematical formulation and theoretical justification for the stabilization strategies employed in Section~\ref{sec:stabilization}. We detail the specific hyperparameters used to ensure reproducibility.

\subsection{Data-Level regularization}

\paragraph{MixUp.}
Standard empirical risk minimization often leads to memorization when $N$ is small. To counteract this, MixUp~\citep{Zhang2018mixup} encourages the model to behave linearly between training examples. It generates synthetic samples $(x', y')$ by interpolating between random pairs of inputs $(x_i, y_i)$ and $(x_j, y_j)$:
\begin{equation}
    x' = \lambda x_i + (1-\lambda)x_j, \qquad
    y' = \lambda y_i + (1-\lambda)y_j,
\end{equation}
\noindent where the interpolation coefficient $\lambda$ is drawn from a Beta distribution $\lambda \sim \mathrm{Beta}(\alpha, \alpha)$ with $\alpha=0.3$. MixUp is applied after spatial and intensity transforms and preferentially pairs samples from \textbf{distinct classes} to enforce decision boundary regularization~\citep{Tokozume2018BCL}. In high-dimensional MRI space, this regularization prevents over-confident predictions in regions free of training data, effectively smoothing the decision boundary between phenotypically similar classes, such as FTD subtypes. 

Note that we also investigated CutMix~\citep{Yun2019CutMix} as an alternative mixing strategy. However, preliminary experiments yielded inferior performance compared to baseline or MixUp, likely because the rectangular region replacement destroys global anatomical context essential for analyzing distributed atrophy patterns. Consequently, we report only MixUp results.

\paragraph{Class-aware balanced sampling.}
Given the severe imbalance in the OOD cohort (see~\tableref{tab:dataset_counts}), standard uniform sampling would bias the gradient updates toward majority classes (CN, AD). We adjust the sampling probability $p_i$ for an image $x_i$ with label $y_i=c$ as:
\begin{equation}
    p_i = \frac{1}{C \cdot n_c},
\end{equation}
\noindent where $C$ is the total number of classes and $n_c$ is the number of available samples for class $c$. This ensures that the expected number of samples per class in each mini-batch is uniform ($B/C$), preventing the minority FTD subtypes from being treated as outliers during optimization.

\subsection{Optimization dynamics}

\paragraph{Sharpness-Aware Minimization (SAM).}
Vision Transformers trained on small datasets tend to converge to sharp local minima, which generalize poorly under distribution shifts. SAM~\citep{Foret2021SAM} explicitly seeks parameters $w$ that lie in a ``flat'' neighborhood by solving a minimax game:
\begin{equation}
    \min_{w} \mathcal{L}^{\text{SAM}}(w) \quad \text{where} \quad \mathcal{L}^{\text{SAM}}(w) = \max_{\|\epsilon\|_2 \le \rho} \mathcal{L}(w + \epsilon).
\end{equation}
Here, $\rho=0.05$ is the radius of the perturbation neighborhood. In theory, by minimizing the loss under the worst-case weight perturbation $\epsilon$, SAM finds solutions robust to parameter noise. This flatness is a proxy for generalization capability, essential when transferring models from ID to OOD domains.

\paragraph{Label smoothing.}
Medical diagnostic labels inevitably contain aleatoric uncertainty due to inter-rater variability. Training with ``hard'' one-hot targets $y_k \in \{0, 1\}$ forces the model to be over-confident, often leading to overfitting. We relax the targets into soft probabilities $\tilde{y}_k$:
\begin{equation}
    \tilde{y}_k = (1-\varepsilon)y_k + \frac{\varepsilon}{K},
\end{equation}
\noindent where $K=5$ is the number of classes and $\varepsilon=0.1$ is the smoothing factor. This prevents the network from seeking infinite logit gaps for challenging samples (e.g., ambiguous early-stage dementia), resulting in better calibrated probabilities (lower ECE) as shown in Figure~3.

\paragraph{Exponential Moving Average (EMA).}
Stochastic Gradient Descent introduces noise into the optimization trajectory, particularly with small batch sizes. EMA maintains a ``shadow'' model with weights $\tilde{\theta}$ that are updated at each step $t$ using the current online weights $\theta_t$:
\begin{equation}
    \tilde{\theta}_t=\frac{\sum_{k=0}^{K-1}\beta^{k}\theta_{t-k}}{\sum_{k=0}^{K-1}\beta^{k}}.
\end{equation}
We use a slow decay rate $\beta=0.999$ and $K=3$ to average the last 3 model weights. This acts as an averaging filter, effectively smoothing out the high-frequency oscillations of the optimization path. It provides a more stable estimate of the central tendency of the loss basin.

\subsection{Architectural Constraints}

\paragraph{LayerScale.}
Deep Transformers (like Swin) often suffer from signal degradation in deeper layers. LayerScale~\citep{Touvron2021CaiT} facilitates signal propagation by introducing a learnable diagonal matrix $\Lambda_l$ to scale the output of the residual block $\mathcal{F}$:
\begin{equation}
    x_{l+1} = x_l + \Lambda_l \cdot \mathcal{F}(\mathrm{LN}(x_l)), \quad \Lambda_l = \mathrm{diag}(\lambda_{l,1}, \dots, \lambda_{l,d}).
\end{equation} 
Initializing $\lambda$ to a small value (e.g., $10^{-5}$) allows the network to behave closer to an identity function at the start of training, easing the optimization of deep architectures on small datasets where gradients might otherwise vanish or explode.

\subsection{Inference-time aggregation}

\paragraph{Test-Time Augmentation (TTA) with entropy weighting.}
We apply $M=8$ transformations $T_m$ (flips, crops) to each test volume $x$. Simple averaging can be detrimental if certain views (e.g., occluded crops) yield noisy predictions. We therefore use inverse-entropy weighting to prioritize confident predictions. The final probability $\bar{y}$ is:
\begin{equation}
    \bar{y} = \sum_{m=1}^{M} w_m p_m(T_m(x)), \quad w_m \propto \frac{1}{H(p_m) + \xi},
\end{equation}
where $H(p_m)$ is the entropy of the prediction and $\xi$ is a stability constant. This explicitly marginalizes over geometric nuisance variables, enforcing invariance to acquisition variations that the model may not have fully learned during training.

\paragraph{Ensemble of snapshots.}
To determine the optimal ensemble size $K$, we analyzed the evolution of inter-seed stability as a function of the number of aggregated checkpoints. \figureref{fig:ensemble_models_cv} illustrates the nCV for discrimination metrics across varying $K$. We observe a characteristic convex profile in the In-Domain setting: while increasing $K$ initially reduces variance by marginalizing out local optimization noise, the stability benefits saturate and even degrade beyond $K=12$. This inflection point likely indicates that expanding the ensemble further necessitates the inclusion of suboptimal checkpoints (ranked $13^{\text{th}}$ and below on validation data), which dilutes the consensus and re-introduces variance. Consequently, we selected $K=12$ as the operating point for all reported experiments, providing the optimal trade-off between variance reduction and computational inference cost.

\begin{figure}[htbp]
    \floatconts{fig:ensemble_models_cv}
    {\caption{\small \textbf{Effect of ensemble size on stability.} Evolution of the normalized CV for Accuracy, MCC, and PR-AUC as a function of the number of models in the snapshot ensemble ($K$).}}
    {\includegraphics[width=\textwidth]{fig/metric_cv_vs_ensemble_size.pdf}}
\end{figure}

\section{Evaluation metrics and definitions}
\label{app:metrics}

This appendix provides the formal definitions of the metrics used in the evaluation protocol.

\subsection{Stability metrics}\label{app:stability_metrics}

\paragraph{Normalized Coefficient of Variation (nCV).}
\figureref{fig:scaling} validates uncertainty estimates by tracking 95\% CI width against test set size. The widths follow a theoretical $C/\sqrt{N}$ decay ($R^2 > 0.9$ for Accuracy/MCC), confirming that reported instability is intrinsic to the models rather than a sampling artifact.

\begin{figure}[htbp]
    \floatconts{fig:scaling}
    {\caption{\small \textbf{Confidence Interval Scaling Analysis.} Evolution of the 95\% CI half-width as a function of the test set size $N$ (subsampled from the OOD cohort).
The solid lines represent the theoretical fit $y = C/\sqrt{N}$.
The high $R^2$ values indicate that the estimated uncertainty strictly follows expected statistical laws, validating the reliability of the reported variance.}}
    {\includegraphics[width=0.8\linewidth]{fig/stability_scaling_analysis.pdf}}
\end{figure}

Standard deviation naturally decreases as sample size $N$ increases ($\sigma \propto 1/\sqrt{N}$). To compare the intrinsic stability of models across classes with vastly different sizes (e.g., 2000 CN vs.\ 40 nfvPPA), we use the nCV to decouple stability from sampling density:
\begin{equation}
    \mathrm{nCV} = \sqrt{N} \frac{\sigma}{\mu},
\end{equation}
where $\mu$ and $\sigma$ are the mean and standard deviation of the metric across seeds. This normalization allows for a fair comparison of variance between majority and minority classes.

\paragraph{Probability of False Outperformance (PFO).}
The PFO estimates the probability that a baseline model $A$ is actually superior to a proposed model $B$, despite the observed mean difference $\bar{\delta} > 0$. It is computed directly from the bootstrap replicates:
\begin{equation}
    \widehat{\Pr}(\Delta \le 0) = \frac{1}{B} \sum_{b=1}^{B} \mathds{1}(\delta_b \le 0),
\end{equation}
where $\delta_b$ is the performance difference in the $b$-th bootstrap sample.

\subsection{Classification Metrics}
To ensure consistency and readability throughout the paper, all reported metrics (discrimination and reliability) are scaled by a factor of 100 and expressed as percentages.

To formally define the metrics, let $K$ be the $C \times C$ confusion matrix where $K_{ij}$ represents the number of samples of class $j$ predicted as class $i$. We define the total samples $s = \sum_{ij} K_{ij}$, the total correct predictions $c = \sum_k K_{kk}$, the total predictions for class $k$ as $p_k = \sum_j K_{kj}$, and the total true labels for class $k$ as $t_k = \sum_i K_{ik}$.

\paragraph{Accuracy.}
Standard accuracy measures the overall proportion of correct predictions. While intuitive, it can be misleading in imbalanced settings where majority classes dominate the score.
\begin{equation}
    \mathrm{ACC} = \frac{\sum_k K_{kk}}{s} = \frac{c}{s}.
\end{equation}

\paragraph{Matthews Correlation Coefficient (MCC).}
We employ the multiclass generalization of the MCC. Unlike F1 or Accuracy, MCC involves all four quadrants of the confusion matrix (True Positives, False Positives, True Negatives, False Negatives), making it the most robust single-value metric for imbalanced datasets.
\begin{equation}
    \mathrm{MCC} = \frac{c \cdot s - \sum_k p_k t_k}{\sqrt{(s^2 - \sum_k p_k^2)(s^2 - \sum_k t_k^2)}}.
\end{equation}

\paragraph{Precision-Recall AUC.}
The PR-AUC assesses the trade-off between Precision ($P$) and Recall ($R$) across different decision thresholds $\tau \in [0, 1]$. For a multi-class problem, we compute the Area Under the Curve (AUC) for each class $k$ in a one-vs-rest manner and report the macro-average (unweighted mean) to ensure equal contribution from all phenotypes regardless of prevalence:
\begin{equation}
    \mathrm{PR\text{-}AUC} = \frac{1}{C} \sum_{k=1}^{C} \int_{0}^{1} P_k(R) \, dR,
\end{equation}
where $P_k = \frac{K_{kk}}{p_k}$ and $R_k = \frac{K_{kk}}{t_k}$ are the precision and recall for class $k$ computed at varying operating points.

\paragraph{Per-class F1-Score.}
The F1-score for a specific class $k$ is the harmonic mean of its precision and recall. It effectively penalizes the model if it fails to retrieve instances of class $k$ (low recall) or hallucinates them (low precision).
\begin{equation}
    \mathrm{F1}_k = 2 \cdot \frac{P_k \cdot R_k}{P_k + R_k} = \frac{2 K_{kk}}{p_k + t_k}.
\end{equation}

\paragraph{Macro F1-Score.}
To obtain a global performance metric that treats all classes equally regardless of their support size (prevalence), we compute the unweighted mean of the per-class F1 scores.
\begin{equation}
    \mathrm{Macro\text{-}F1} = \frac{1}{C} \sum_{k=1}^{C} \mathrm{F1}_k.
\end{equation}

\subsection{Reliability Metrics}

\paragraph{Expected Calibration Error (ECE).}
We calculate the ECE \citep{Guo2017Calibration}, which approximates the expected difference between the model's confidence and its actual accuracy. Following standard practice, we employ a fixed discretization scheme with $M=15$ equidistant bins based on the maximum softmax probability:
\begin{equation}
    \mathrm{ECE} = \sum_{m=1}^{M} \frac{|B_m|}{N} \big| \mathrm{acc}(B_m) - \mathrm{conf}(B_m) \big|,
\end{equation}
where $B_m$ is the set of samples in bin $m$, $\mathrm{acc}(B_m)$ is the accuracy within the bin, and $\mathrm{conf}(B_m)$ is the average confidence. Lower ECE indicates better calibration. In a clinical setting, a low ECE is critical as it implies that a prediction made with 90\% confidence indeed corresponds to a 90\% probability of correctness, fostering trust in the decision support system.

\paragraph{Brier Score.}
The Brier score \citep{Brier1950} is computed as the Mean Squared Error between the predicted probability distribution and the one-hot encoded ground truth. To align its scale with accuracy, we report it as a percentage:
\begin{equation}
    \mathrm{Brier} = \frac{1}{N \cdot C} \sum_{i=1}^N \sum_{k=1}^C \big( P(y_i=k \mid x_i) - \mathds{1}[y_i=k] \big)^2.
\end{equation}
Here, the score is normalized by the number of classes $C$, preventing mechanical inflation due to task dimensionality. Unlike accuracy, which relies solely on the ranking, the Brier score heavily penalizes over-confident false predictions. It thus provides a holistic assessment of probabilistic reliability, favoring models that remain uncertain when evidence is ambiguous.

\section{Instabilities arise going from 3 to 5 classes}
\label{app:instabilities_325}

\tableref{tab:base_3c} summarizes the baseline performance on the standard 3-class task. As detailed in the Introduction, differentiating bvFTD from AD presents a significant challenge due to overlapping atrophy patterns in the anterior cingulate and frontoinsula~\citep{Perry2017Clinicopathological}. Consequently, unlike binary classification, this task exposes the limitations of models lacking strong inductive biases.

In In-Domain (ID), while the hybrid MedViT-3D sets the upper bound, the anatomy-driven SVM establishes a high baseline among standard methods, leveraging precise segmentation priors to disentangle overlapping phenotypes. The 3D ResNet-18 follows closely, confirming that convolutional inductive biases are data-efficient even without explicit segmentation. Conversely, the lack of priors in standard ViT-3D leads to poor convergence. However, the hierarchical Swin-3D DPL variant significantly mitigates this issue, reaching an MCC of $64.84\%$, thereby narrowing the performance gap with CNNs compared to the vanilla ViT.

The OOD evaluation reveals a shift in the performance hierarchy. While the SVM leads the non-hybrid methods in ID, the 3D ResNet-18 achieves robust generalization performance among non-hybrid architectures, marginally outperforming the SVM. This indicates that end-to-end convolutional features generalize slightly better to site-specific variations than fixed segmentation priors in this 3-class regime. Among Transformers, Swin-DPL remains the most viable option, significantly outperforming the standard Swin-3D and ViT-3D. As observed in the 5-class task, ViT-3D's seemingly favorable ECE is deceptive, resulting from low-confidence predictions rather than accurate calibration.

\begin{table}[htbp]
\floatconts{tab:base_3c}
{\caption{\small \textbf{Baseline performance comparisons for 3-class classification (CN/AD/FTD).}
Performance metrics for Convolutional (ResNet, SVM) and Transformer (ViT, Swin) architectures.
Results are reported for in-domain (10-fold CV) and out-of-domain (10 models average) settings.
\textbf{Bold} indicates the best performance per column.\\
\scriptsize Values: Mean $\pm$ 95\% CI ($B=10{,}000$). $\uparrow$=higher-is-better, $\downarrow$=lower-is-better.}
}%
{\footnotesize
  \setlength{\tabcolsep}{3pt}
  \begin{tabular}{lccccccc}
  \toprule
  \bfseries Configuration & \bfseries \# Params & \bfseries ACC $\uparrow$ & \bfseries MCC $\uparrow$ & \bfseries PR-AUC $\uparrow$ & \bfseries Macro-F1 $\uparrow$ & \bfseries ECE $\downarrow$ & \bfseries Brier $\downarrow$ \\
  \midrule
  \multicolumn{8}{c}{\bfseries In-domain (10-fold CV)} \\
  \midrule
  CNNs + SVM &  $\approx$270M\textsuperscript{*} & \meanpm{84.25}{1.46} & \meanpm{71.62}{2.56} & \meanpm{85.02}{1.89} & \meanpm{78.77}{2.01} & 33.04 & 13.90 \\
  ResNet-18 3D & 33.16M & \meanpm{82.40}{1.50} & \meanpm{68.47}{2.65} & \meanpm{83.43}{1.97} & \meanpm{77.25}{1.98} & 28.88 & 13.32 \\
  MedViT 3D & 34.99M & \bfseries \meanpm{85.92}{1.38} & \bfseries \meanpm{74.68}{2.41} & \bfseries \meanpm{86.51}{1.79} & \bfseries \meanpm{80.77}{1.92} & 32.89 & \bfseries 13.13 \\
  ViT-3D & 23.18M & \meanpm{68.97}{1.83} & \meanpm{43.88}{2.91} & \meanpm{61.81}{2.19} & \meanpm{55.08}{2.26} & \bfseries 21.43\textsuperscript{\textdagger} & 16.83 \\
  Swin-3D & 29.27M & \meanpm{75.50}{1.68} & \meanpm{55.53}{2.89} & \meanpm{72.30}{2.31} & \meanpm{65.36}{2.31} & 26.02 & 15.65 \\
  Swin-3D DPL & 41.02M & \meanpm{80.54}{1.52} & \meanpm{64.84}{2.64} & \meanpm{79.83}{2.15} & \meanpm{73.18}{2.15} & 30.10 & 14.63 \\
  \midrule
  \multicolumn{8}{c}{\bfseries Out-of-domain (10 models averaged predictions)} \\
  \midrule
  CNNs + SVM &  $\approx$270M\textsuperscript{*} & \meanpm{88.52}{1.15} & \meanpm{70.18}{2.87} & \meanpm{83.84}{2.48} & \meanpm{77.31}{2.35} & 37.44 & 13.26 \\
  ResNet-18 3D & 33.16M & \meanpm{88.73}{1.15} & \meanpm{71.09}{2.87} & \meanpm{85.88}{2.23} & \meanpm{80.63}{2.17} & 36.21 & 12.68 \\
  MedViT 3D & 34.99M & \bfseries \meanpm{91.23}{1.05} & \bfseries \meanpm{76.95}{2.65} & \bfseries \meanpm{88.93}{2.07} & \bfseries \meanpm{84.98}{1.98} & 38.97 & \bfseries 12.53 \\
  ViT-3D & 23.18M & \meanpm{78.17}{1.49} & \meanpm{45.77}{3.19} & \meanpm{68.49}{2.84} & \meanpm{56.57}{2.85} & \bfseries 30.51\textsuperscript{\textdagger} & 15.51 \\
  Swin-3D & 29.27M & \meanpm{82.63}{1.38} & \meanpm{56.76}{3.22} & \meanpm{75.01}{2.80} & \meanpm{70.42}{2.69} & 33.90 & 14.82 \\
  Swin-3D DPL & 41.02M & \meanpm{85.23}{1.27} & \meanpm{63.35}{3.03} & \meanpm{81.05}{2.51} & \meanpm{74.46}{2.47} & 34.88 & 13.97 \\
  \bottomrule
  \end{tabular}\\
  \scriptsize{\textsuperscript{*}Includes the parameters of the underlying segmentation backbone, AssemblyNet \citep{Coupe2020AssemblyNet}, composed of 125 U-Nets ($\approx$2.17M params each). \textsuperscript{\textdagger}Low ECE for ViT-3D reflects under-confidence due to poor discrimination, not effective calibration.}
}
\end{table}

\figureref{fig:archs_3c_cv_analysis} profiles the stochastic stability via the nCV. The SVM (purple bars) serves as a lower bound on variability ($\mathrm{nCV} < 0.1$), confirming that observed instability in deep models arises from weight optimization rather than aleatoric uncertainty. Standard ViT-3D exhibits substantial volatility (orange bars), with variance spikes indicating sensitivity to initialization. Crucially, the introduction of deformable patches in Swin-3D DPL (green bars) acts as a stabilizer, reducing variance to levels approaching the ResNet baseline (blue bars), suggesting that constraining the attention mechanism effectively smooths the optimization landscape.

Moving to the 5-class setting, \figureref{fig:archs_5c_cv_analysis} illustrates how increased task complexity disproportionately affects Transformer stability. ViT-3D nCV spikes in the OOD setting, indicating inconsistent decision boundaries for minority FTD subtypes. The SVM baseline retains a low variance profile, suggesting that predefined U-Net features provide a representation space less susceptible to optimization noise driven by class imbalance. Although Swin-3D DPL improves mean performance over standard Swin, it retains discernible seed-to-seed variability compared to CNN baselines.

\begin{figure}[htbp]
    \floatconts{fig:archs_3c_cv_analysis}
    {\caption{\small \textbf{Inter-seed stability profiling (3-class).}
    Normalized Coefficient of Variation across 5 random seeds for the 3-class task.
    Comparison between In-domain (left) and Out-of-domain (right) regimes.
    Lower nCV values indicate higher stability against initialization noise.}}
    {\includegraphics[width=\textwidth]{fig/architectures_3c_cv.pdf}}
\end{figure}

\begin{figure}[htbp]
    \floatconts{fig:archs_5c_cv_analysis}
    {\caption{\small \textbf{Inter-seed stability profiling (5-class).}
    Normalized Coefficient of Variation across 5 random seeds for the differential diagnosis task. Comparison between In-domain (left) and Out-of-domain (right) regimes. Lower nCV values indicate higher stability against initialization noise.}}
    {\includegraphics[width=\textwidth]{fig/architectures_5c_cv.pdf}}
\end{figure}

\clearpage
\section{Extended stability and granular analysis}
\label{app:granular_stability}
While the main text reports macro-averaged stability metrics, Figures~\ref{fig:cv_f1_archs_3c} through~\ref{fig:cv_f1_swinv1dpl} present the nCV for the F1-score of each specific class.
This stratified analysis indicates that the stability observed in aggregated metrics is predominantly driven by the majority classes (CN, AD).
The minority phenotypes, specifically nfvPPA and svPPA, exhibit notably higher variance across seeds ($\mathrm{nCV} > 0.1$, reaching up to $1.2$).

\begin{figure}[htbp]
    \floatconts{fig:cv_f1_archs_3c}
    {\caption{\small \textbf{Per-class F1-score normalized coefficient of variation for the 3-class task (\tableref{tab:base_3c})} computed across 5 random seeds. The stability profile shows lower variance for majority classes (CN, AD), while the FTD class exhibits higher variability across architectures, particularly for the standard ViT-3D.}}
    {\includegraphics[width=\textwidth]{fig/architectures_3c_cv_f1.pdf}}
\end{figure}

\begin{figure}[htbp]
    \floatconts{fig:cv_f1_archs_5c}
    {\caption{\small \textbf{Per-class F1-score normalized coefficient of variation for the 5-class task (\tableref{tab:base_5c})} computed across 5 random seeds. The decomposition of the FTD class into subtypes reveals substantial instability for the minority classes (nfvPPA, svPPA), which is masked in macro-averaged metrics.}}
    {\includegraphics[width=\textwidth]{fig/architectures_5c_cv_f1.pdf}}
\end{figure}

This breakdown identifies specific limitations of stabilization strategies that are not apparent in global averages.
As illustrated in \figureref{fig:cv_f1_swinv1dpl}, while label smoothing (LS) generally improves calibration metrics, it appears to induce higher variance for the nfvPPA class in the out-of-domain setting (right panel).
This suggests that enforcing soft targets on rare, distinct phenotypes may interfere with robust feature learning under distribution shifts.
Conversely, the combined protocol (\texttt{DA+E+LS+BS+M}) effectively suppresses variance for the ID minority classes (left panel), supporting the synergistic effect of the proposed framework.

\begin{figure}[htbp]
    \floatconts{fig:cv_f1_swinv1dpl}
    {\caption{\small \textbf{Per-class F1-score normalized coefficient of variation for training strategies (\tableref{tab:stab_train})} computed across 5 random seeds. 
    Comparison of stability across diagnosis subtypes for ID and OOD settings.
    Note the scale difference in variance for minority classes compared to majority classes.
    While the Baseline (BL) and MixUp (M) show high volatility in-domain for nfvPPA, Label Smoothing (LS) exhibits increased sensitivity for this class in OOD settings.}}
    {\includegraphics[width=\textwidth]{fig/swinv1dpl_train_cv_f1.pdf}}
\end{figure}

\figureref{fig:cv_analysis_eval} details the stability profile of the evaluated inference-time strategies. While Test-Time Augmentation (TTA) reduces variance in the in-domain setting, it fails to generalize this benefit to the out-of-domain cohort. Specifically, TTA exhibits higher volatility than the baseline for calibration metrics (ECE, Brier) in OOD.
This finding supports the hypothesis that applying geometric transformations (particularly sagittal flipping) to lateralized phenotypes such as nfvPPA and svPPA creates anatomically inconsistent samples that degrade model reliability. Conversely, snapshot ensembling consistently achieves the lowest nCV across all discrimination and reliability metrics, confirming that averaging predictions across validation checkpoints is the most robust strategy to mitigate stochastic optimization variability.

\begin{figure}[htbp]
    \includegraphics[width=\textwidth]{fig/swinv1dpl_eval_cv.pdf}
     \caption{\small \textbf{Normalized coefficient of variation for evaluation strategies.} Comparison of inter-seed stability ($N=3$ runs) for Baseline (BL), Test-Time Augmentation (TTA), Calibrated models, and Ensemble methods.
    Ensembling consistently yields the lowest nCV across metrics, indicating that averaging predictions effectively mitigates the variance induced by optimization noise.}
    \label{fig:cv_analysis_eval}
\end{figure}

\section{Generalizability of the stabilization protocol across architectures}

To assess whether the proposed stabilization protocol (\texttt{DA+E+LS+BS+M}) benefits other Transformer-based architectures beyond Swin-3D DPL, we performed an additional study on the standard ViT-3D and Swin-3D backbones. \tableref{tab:all_archs_stabilized} reports the performance of these architectures when trained with the stabilization protocol presented in~\tableref{tab:stab_train}.

The results demonstrate that while our protocol significantly improves ViT-3D performance compared to the baseline reported in~\tableref{tab:base_5c}, the non-hierarchical ViT still lags behind Swin-based variants. This confirms that while the proposed optimization strategies effectively reduce variance, hierarchical inductive biases (as found in Swin) remain essential for achieving optimal performance in this setup.

\begin{table}[htbp]
\floatconts{tab:all_archs_stabilized}
{\caption{\small \textbf{Comparison of fully stabilized architectures (ViT vs.\ Swins).}
Performance of distinct backbones trained with the complete optimization protocol (\texttt{DA+E+LS+BS+M}).
Results are reported for in-domain (ID, 10-fold CV) and out-of-domain (OOD, 10 models average) settings. The stabilization protocol benefits all architectures, but hierarchical models (Swin) retain a significant advantage over vanilla ViT.
\textbf{Bold} indicates the best performance per column.
\\
\scriptsize Values: Mean $\pm$ 95\% CI ($B=10^4$). $\uparrow$=higher-is-better, $\downarrow$=lower-is-better.}}
{\scriptsize
\setlength{\tabcolsep}{2pt}
\begin{tabular}{c @{\hspace{1em}} l c c c c c c c}
\toprule
& \bfseries Architecture & \bfseries Params & \bfseries ACC $\uparrow$ & \bfseries MCC $\uparrow$ & \bfseries PR-AUC $\uparrow$ & \bfseries Macro-F1 $\uparrow$ & \bfseries ECE $\downarrow$ & \bfseries Brier $\downarrow$ \\
\midrule

\multirow{3}{*}{\rotatebox[origin=c]{90}{\textbf{ID}}}
& ViT-3D & 23.18M & \meanpm{69.65}{1.83} & \meanpm{44.23}{2.97} & \meanpm{43.04}{2.78} & \meanpm{43.55}{3.15} & \bfseries 40.53\textsuperscript{\textdagger} & 13.53 \\
& Swin-3D & 29.27M & \meanpm{81.12}{1.54} & \meanpm{66.66}{2.56} & \meanpm{65.85}{3.64} & \meanpm{62.60}{3.46} & 48.18 & 12.05 \\
& Swin-3D DPL & 41.02M & \bfseries\meanpm{81.62}{1.54} & \bfseries\meanpm{67.28}{2.59} & \bfseries\meanpm{66.43}{3.89} & \bfseries\meanpm{63.02}{3.71} & 48.94 & \bfseries 12.04 \\

\midrule

\multirow{3}{*}{\rotatebox[origin=c]{90}{\textbf{OOD}}}
& ViT-3D & 23.18M & \meanpm{84.65}{1.30} & \meanpm{61.38}{4.31} & \meanpm{64.61}{4.56} & \meanpm{63.47}{4.43} & 56.48 & 13.22 \\
& Swin-3D & 29.27M &\bfseries\meanpm{88.48}{1.15} & \meanpm{70.27}{2.87} & \bfseries\meanpm{75.45}{4.48} & \meanpm{70.57}{4.41} & \bfseries 55.51 & \bfseries 11.68 \\
& Swin-3D DPL & 41.02M & \meanpm{88.41}{1.19} & \bfseries\meanpm{70.38}{2.87} & \meanpm{74.91}{4.90} & \bfseries\meanpm{71.54}{4.52} & 55.56 & 11.71 \\
\bottomrule
\end{tabular}\\
\scriptsize{\textsuperscript{\textdagger}Low ECE here reflects under-confidence due to poor discrimination, rather than effective calibration.}
}
\end{table}

\section{Computational cost analysis}
\label{app:cost}

We quantify the computational overhead of the proposed stabilization protocol in \tableref{tab:training_times_5c}.
The analysis reveals that the fully stabilized protocol (\texttt{DA+E+LS+BS+M}) requires approximately $4.8\times$ the training time of the baseline ($4.51$h vs.\ $0.93$h per fold on $2{\times}$V100 GPUs).
This overhead is primarily driven by the on-the-fly 3D data augmentation and the balanced sampling routine.
However, this additional cost must be weighed against the substantial improvements in generalization and robustness: the stabilized model achieves a gain of $+15.4$ points in OOD Macro-F1 and $+6.5$ points in OOD MCC compared to the baseline (see \tableref{tab:stab_train}).
In the context of medical differential diagnosis, where model failure can have severe consequences, we argue that this trade-off is favorable, as the absolute training duration remains compatible with clinical research workflows ($<5$ hours per fold).

\begin{table}[htbp]
\floatconts{tab:training_times_5c}
{%
  \caption{\small \textbf{Computational Analysis (Swin 3D DPL).} 
  Evaluation of training cost and environmental impact across configurations.
  All models were trained on Jean Zay ($2{\times}$V100) with 5 repetitions of 10-fold cross-validation (50 runs total). Time is reported per fold (mean $\pm$ std). Total CO$_2$ represents the cumulative footprint of the 50 runs\textsuperscript{a}. \\ \scriptsize Abbreviations: \texttt{BS}: Balanced Sampling, \texttt{M}: MixUp, \texttt{EMA}: Exp. Moving Avg, \texttt{LS}: Label Smoothing, \texttt{DA}: 3D Augmentation.}%
}
{%
\centering\scriptsize\setlength{\tabcolsep}{5pt}
\begin{tabular}{l c c c}
\toprule 
\bfseries Configuration & \bfseries Time / Fold (h) & \bfseries Steps / Fold & \bfseries Total Eq.\ CO$_2$ (kg) \\
\midrule
\multicolumn{4}{l}{\textit{Single Component Analysis}} \\
Baseline & $0.93 \pm 0.07$ & $434 \pm 16$ & $2.40 \pm 0.18$ \\
\texttt{DA} & $2.78 \pm 0.52$ & $1012 \pm 181$ & $7.17 \pm 1.34$ \\
\texttt{EMA} & $0.93 \pm 0.04$ & $436 \pm 14$ & $2.40 \pm 0.10$ \\
\texttt{LS} & $0.95 \pm 0.09$ & $456 \pm 40$ & $2.45 \pm 0.23$ \\
\texttt{BS} & $0.83 \pm 0.04$ & $409 \pm 19$ & $2.14 \pm 0.10$ \\
\texttt{M} & $0.89 \pm 0.05$ & $435 \pm 22$ & $2.30 \pm 0.13$ \\
\midrule
\multicolumn{4}{l}{\textit{Cumulative Complexity}} \\
+ \texttt{DA} & $2.78 \pm 0.52$ & $1012 \pm 181$ & $7.17 \pm 1.34$ \\
+ \texttt{DA + EMA} & $2.70 \pm 0.46$ & $974 \pm 165$ & $6.97 \pm 1.19$ \\
+ \texttt{DA + EMA + LS} & $3.05 \pm 0.79$ & $1110 \pm 292$ & $7.87 \pm 2.04$ \\
+ \texttt{DA + EMA + LS + BS} & $4.59 \pm 1.72$ & $1624 \pm 509$ & $11.84 \pm 4.44$ \\
+ \texttt{DA + EMA + LS + BS + M} & $4.51 \pm 1.37$ & $1648 \pm 496$ & $11.64 \pm 3.53$ \\
\bottomrule
\end{tabular}

\scriptsize\textsuperscript{a}Based on an established emission factor of $\approx 25.8$ gCO$_2$e per GPU-hour for Jean Zay V100 using \url{https://labos1point5.org/les-rapports/estimation-empreinte-calcul}.
}
\end{table}

\end{document}