\documentclass{midl} % Include author names
% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{tikz}
\usepackage{multirow}
\usepackage{xcolor}
\usepackage{booktabs}

\usetikzlibrary{positioning}
%\jmlrvolume{-- Under Review}
%\jmlryear{2026}
%\jmlrworkshop{Full Paper -- MIDL 2026 submission}
%\editors{Under Review for MIDL 2026}

\jmlryear{2026}\jmlrworkshop{Full Paper -- MIDL 2026}\jmlrvolume{-- nnn}\editors{Accepted for publication at MIDL 2026}

\title[Benchmarking graph learning in rs-fMRI]{A comprehensive benchmark of graph neural networks, graph kernels, and classical machine learning approaches on rs-fMRI brain graphs}% across multiple cohorts}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Razan Mhanna\nametag{$^{1,2}$}} \orcid{1111-2222-3333-4444} \Email{razan.mhanna@inria.fr}
 \AND
\Name{Sophie Achard\nametag{$^{1}$}} \orcid{0000-0001-9448-1199} \Email{sophie.achard@inria.fr}
 \AND
\Name{Alexander Petersen\nametag{$^{3}$}} \Email{petersen@stat.byu.edu}
 \AND
\Name{Jonas Richiardi\nametag{$^{4}$}} \orcid{0000-0002-6975-5634} \Email{Jonas.Richiardi@chuv.ch}\\
\addr $^{1}$ Univ. Grenoble Alpes, CNRS, Inria, Grenoble INP, LJK, Grenoble, France \\
\addr $^{2}$ Univ. Grenoble Alpes, Inserm U1216, CHU Grenoble Alpes, Institut des Neurosciences, France\\
\addr $^{3}$ Department of Statistics, Brigham Young University, Provo, UT, USA\\
\addr $^{4}$ Lausanne University Hospital and University of Lausanne, Switzerland
}

\begin{document}

\maketitle

\begin{abstract}
Resting-state functional MRI (rs-fMRI) provides a powerful lens through which large-scale brain organization can be examined by modeling functional connectivity as a graph. These functional brain graphs now form the basis of machine-learning applications in neuroscience, ranging from relatively straightforward classification problems to more challenging behavioral and cognitive prediction tasks. While graph neural networks (GNNs) have gained increasing attention in neuroimaging, the absence of a unified, reproducible benchmark comparing GNNs with classical machine-learning models and graph kernel methods, across heterogeneous datasets and tasks, has made it difficult to assess their relative strengths.
 In this work, we introduce a comprehensive benchmarking framework spanning four heterogeneous cohorts ($N = 1513$) and multiple classification tasks, including clinical diagnosis and phenotypic prediction. We systematically evaluate classical models, graph kernels, and representative GNN architectures under a rigorous repeated nested cross-validation design and assess pairwise differences using the corrected repeated k-fold test with false-discovery-rate control.
 Our results show that, for this class of relatively small graphs with fixed vertex ordering, well-tuned classical ML approaches and graph kernels are competitive with GNNs, while requiring substantially fewer computational resources. For instance, the Shortest-Path graph kernel achieves 0.98 accuracy on the COMA dataset, logistic regression reaches 0.81 accuracy and 0.63 MCC on HCP sex prediction, and all model families cluster closely on multi-site datasets such as ABIDE and ADHD, where no statistically significant differences emerge. 
All code, seeds, cross-validation folds, fold-specific hyperparameters, full prediction logs and computational-cost measurements are publicly  released at
\url{https://gitlab.inria.fr/rmhanna/benchmark-study} to ensure full transparency and reproducibility. This benchmark provides practical guidance for model selection in rs-fMRI connectome analysis.
%This benchmark provides practical guidance for model selection in rs-fMRI connectome analysis and establishes a reproducible foundation for future methodological developments in graph-based neuroimaging. 
%To ensure full transparency and reproducibility, all code, seeds, and results will be made publicly available https://gitlab.inria.fr/rmhanna/benchmark-study.
\end{abstract}

\begin{keywords}
Resting-state fMRI, brain networks, Graph kernels, Graph neural networks, benchmarking, reproducibility, computational efficiency.
\end{keywords}

\section{Introduction} 
A popular approach used to investigate brain function is resting-state functional magnetic resonance imaging (rs-fMRI), a noninvasive neuroimaging technique that measures fluctuations in the blood-oxygenation-level-dependent (BOLD) signal as a correlate of brain activity. fMRI data inherently possess a complex spatio-temporal structure: each voxel (volumetric pixel) provides a time series of BOLD measurements, resulting in high-dimensional, noisy observations that inherently reside on non-Euclidean domains. To reduce the spatial complexity, voxels can be collated into user-specified regions of interest (ROI), after which aggregation is performed across voxels within the same ROI by averaging. %For each ROI, the BOLD signal is averaged over all the voxels within that region to produce a single time series that represents the overall activity of the region.
Based on these regional signals, a functional connectome is commonly modeled as a graph, where nodes correspond to ROIs and edges represent estimated pairwise functional connectivity. %Such graph-based representations have proven valuable for studying large-scale brain organization and for characterizing differences between healthy individuals and clinical populations.
Such graph-based representations have been successfully applied across a broad spectrum of network neuroscience studies, including investigations of neurodegenerative conditions such as Alzheimer's disease and mild cognitive impairment, autism spectrum disorder, disorders of consciousness, and the prediction of psychometric, cognitive, and behavioral phenotypes in healthy individuals  \cite{dadi2019benchmarking,di2014autism,li2021braingnn,cui2022braingb}.
 However, despite substantial progress in neuroimaging analysis and machine learning, significant challenges persist due to the intrinsic complexity of functional connectivity data—including high dimensionality, sensitivity to parcellation schemes, variability across acquisition sites, and limited sample sizes. As a result, developing reliable computational methods that are robust, generalizable across cohorts, and reproducible remains an open and pressing problem in functional connectivity modeling.

 
%Prior benchmarking efforts typically rely on ....
%Prior benchmarking efforts have focused primarily on message-passing GNNs applied to conventional correlation- or tractography-derived connectomes. BrainGB, for example, provides a unified evaluation framework across three functional datasets (HIV, PNC, ABCD) and one structural dataset (PPMI), covering disease classification as well as gender prediction. Its design decomposes GNNs into modular components—node feature initialization, message-passing functions, attention mechanisms, and pooling strategies—and systematically enumerates their combinations to explore a broad architectural space. While BrainGB includes a limited set of shallow baselines (e.g., M2E, MPCA, multiple-kernel SVM), it does not offer a comprehensive comparison with classical machine-learning approaches, graph-kernel methods, or CNN-based pipelines under a single, consistent experimental protocol.
Prior benchmarking efforts in brain functional connectivity have mainly focused on either GNNs or classical ML pipelines. BrainGB \cite{cui2022braingb} evaluates message-passing GNN architectures across three functional datasets (HIV disease classification, PNC and ABCD gender prediction) and one structural dataset (PPMI Parkinson's disease classification), using a modular design to compare a large family of GNN operators. In parallel, the benchmark of  \citet{dadi2019benchmarking} systematically assessed classical ML models over 6 datasets, 8 atlases, and 3 connectivity profiles, highlighting the strong performance of tangent-space parametrization and $\ell_2$-penalized classifiers. 
 We focus on these two benchmarks because they represent the most comprehensive and methodologically rigorous evaluations currently available: BrainGB provides the most extensive assessment of GNN architectures, whereas \citet{dadi2019benchmarking} remains the reference point for classical ML pipelines. Other studies typically investigate a single dataset, a single prediction task, or a narrowly defined model family, making broad methodological comparisons difficult. Nonetheless, these benchmarks remain limited to either GNNs or shallow ML, and do not provide a unified comparison spanning ML, graph kernels, and GNNs across heterogeneous datasets and tasks.
 
In this work, we develop a systematic benchmark that integrates multiple cohorts with distinct demographic and clinical profiles, including ADHD, HCP, ABIDE, and a clinical COMA dataset. This multi-cohort design allows us to assess model performance across a broad range of prediction tasks: clinical case--control classification, phenotypic sex prediction, ASD diagnosis, and ADHD classification, mirroring the diversity of applications commonly encountered in network neuroscience. To the best of our knowledge, no prior study has provided a unified comparison of classical machine-learning models, graph-kernel methods, and graph neural networks across such heterogeneous datasets and task paradigms. 
 Throughout this study, we focus exclusively on inductive GNNs for graph-level prediction, where each subject is represented by an independent brain graph; transductive population-level GNNs for node classification are naturally excluded.

Our contributions are fivefold:
\begin{itemize}
\item Multi-dataset, multi-task evaluation: We evaluate models across four cohorts covering both healthy and clinical populations, enabling assessment on diverse classification tasks (clinical diagnosis, ASD/ADHD classification, sex prediction).
\item Comprehensive comparison of modeling families: We benchmark classical ML methods (logistic regression, Random Forest, XGBoost), graph-kernel approaches (Shortest-Path, Weisfeiler--Lehman), and GNN architectures (GCN, GraphSAGE, GAT, TransformerConv) under a unified and fully reproducible experimental pipeline.
\item Robust evaluation strategy: We employ a repeated nested cross-validation framework (5 folds $\times$ 10 repetitions): in each of the 50 resulting outer splits, one fold is reserved exclusively for testing, whereas the remaining folds are further partitioned into training and validation subsets. This procedure has been demonstrated to provide more stable model rankings \cite{eve2025importance}.
\item Statistical significance analysis: We assess whether performance differences between models are statistically significant using the corrected repeated k-fold test of \citet{bouckaert2004evaluating}, combined with the \citet{benjamini1995controlling} false-discovery-rate (FDR) correction across pairwise comparisons.
\item Practical efficiency assessment: We report computational aspects, including training time and CO$_2$ emissions, for CPU-operated classical machine-learning and graph-kernel models. This provides a clear picture of the computational footprint of lightweight methods within our benchmark.
\end{itemize}
Taken together, this benchmark offers a transparent and systematic comparison spanning classical machine-learning models, graph-kernel methods (which remain underexplored in the neuroimaging community) and more recent GNN architectures, evaluated across multiple datasets and prediction tasks. Our goal is to provide a clearer picture of the relative strengths of each modeling family in terms of accuracy, robustness, and computational efficiency.
\section{Materials and Methods}
\subsection{Datasets}
\label{datasets}

In this study, we selected publicly available resting-state fMRI datasets that offer comparable preprocessing pipelines and compatible brain parcellations, enabling methodological coherence across datasets. This choice allows us to assess the generalizability of state-of-the-art (SOTA) models under varying sample sizes, scanner characteristics, and population demographics. We first use data from the publicly available, fully de-identified Neuro Bureau ADHD-200 dataset \cite{bellec2017neuro}, which originally includes  participants recruited across eight sites: Peking University, Bradley Hospital (Brown University), Kennedy Krieger Institute, the Donders Institute, New York University Child Study Center, Oregon Health and Science University, the University of Pittsburgh, and Washington University in St. Louis. All cohorts received approval from their respective institutional review boards, and written informed consent was obtained from all participants or their legal guardians. Individuals had no history of psychiatric, neurological, or medical conditions other than ADHD.

We also include the ABIDE dataset, released through the Autism Brain Imaging Data Exchange initiative \cite{di2014autism}, which aggregates rs-fMRI acquisitions from multiple sites to study Autism Spectrum Disorder. We rely on rs-fMRI time series provided by the Preprocessed Connectome Project (PCP) \cite{craddock2013neuro}, as used in \citet{dadi2019benchmarking}, and use them to perform ASD vs. control classification.
Then, we incorporate data from the Human Connectome Project (HCP), which offers high-quality imaging and behavioral assessments for healthy young adults \cite{van2013wu}. We use the preprocessed rs-fMRI time series from the HCP900 release, also recovered from the preprocessing distributed in \citet{dadi2019benchmarking}. This dataset enables experiments on sex classification, providing a complementary setting with substantially longer acquisitions.
Finally, we use a clinical dataset of 44 subjects acquired at Grenoble Alpes University Hospital \cite{oujamaa2023functional}. This cohort comprises 24 patients who had sustained acute severe traumatic brain injury; at the time of scanning, 15 had recovered consciousness, whereas 9 remained in a minimally conscious state (MCS). A control group of 20 age-matched healthy volunteers was collected under comparable acquisition conditions \cite{job2020functional}. All rs-fMRI scans were obtained on the same MRI system using identical acquisition parameters and were parcellated using a 105-region modified AAL3 atlas. % Each resting-state run lasted 13 minutes and 20 seconds.
%In addition, we use a full simulated coma dataset (description in the appendix), We emphasize that the simulated COMA dataset is used as a methodological sanity check and is not treated as an independent benchmark dataset. Our main empirical conclusions are based on real rs-fMRI cohorts described before, while the simulations are used to corroborate the behaviour of connectivity estimators and models in a controlled setting.

A summary of all datasets, associated prediction tasks, and sample sizes is provided in Table~\ref{tab:datasets}.
\begin{table}[t]
\centering
\resizebox{\linewidth}{!}{
\begin{tabular}{l l c c l c}
\hline
Dataset & Source & Atlas & \#ROIs & Classification task & \# Subjects \\
\hline
HCP & \citet{dadi2019benchmarking} & AAL & 116 &
 Sex classification & 443 \\
%HCP & \citet{dadi2019benchmarking} & AAL & 116 FI prediction  & 443 \\
ADHD& \citet{bellec2017neuro} & AAL & 116 & ADHD prediction & 160 \\
ABIDE & \citet{dadi2019benchmarking} & AAL & 116 & ASD prediction & 866 \\
COMA & \citet{oujamaa2023functional} & AAL3 (modified) & 105 & DoC prediction & 44 \\
%Simulated COMA & Ours & AAL & 105 & DoC prediction & 800 \\
\hline
\end{tabular}
}
\caption{Summary of dataset statistics. ADHD = Attention Deficit Hyperactivity Disorder; ASD = Autism Spectrum Disorder; DoC = Disorders of Consciousness.}
\label{tab:datasets}
\end{table}

\subsection{Graph Construction}
For each subject, regional BOLD time courses were collected from preprocessed datasets reported in Section \ref{datasets}. Based on these signals, pairwise Pearson correlation coefficients were computed using the ConnectivityMeasure function from the Nilearn library \cite{abraham2014machine}, resulting in one weighted connectivity matrix per subject. 
Depending on the model requirements, different graph representations were derived from these matrices. For graph-kernel methods described in Section~\ref{graph-kernels}, we constructed thresholded binary graphs by first extracting the minimum spanning tree (MST) to ensure graph connectivity, and subsequently retaining the top 10\% of the strongest remaining correlations. This threshold was selected based on empirical performance evaluation on the COMA dataset. In contrast, classical machine-learning models described in Section~\ref{classical-ml} were applied directly to the vectorized upper triangular part of the full, weighted connectivity matrices. For GNN-based models, we followed the BrainGB \cite{cui2022braingb} recommendations by using connection profiles as node features, which have been shown to be among the most effective node feature choices for rs-fMRI brain connectome analysis. Under this formulation, explicit edge thresholding plays a less critical role, as connectivity information is primarily encoded in the node features rather than through a sparsified graph topology.
\subsection{Benchmark Models}
\subsubsection{Classical ML}
\label{classical-ml}
As baseline classifiers, we employ three widely used machine-learning algorithms: Logistic Regression, Random Forests, and XGBoost. Contrary to many prior neuroimaging studies, including the benchmark of \citet{dadi2019benchmarking}, we perform systematic hyperparameter optimization independently for each dataset. The full search space and optimization details are reported in Section \ref{implementation_details}. These classical models have consistently demonstrated strong performance across multiple domains, including neuroimaging studies. 
\subsubsection{Graph kernels}
\label{graph-kernels}
Kernel methods provide a powerful mathematical framework for measuring similarities between structured objects, such as graphs, by implicitly mapping them from the original feature space to a (possibly infinite-dimensional) Hilbert space, where the kernel value corresponds to an inner product between the transformed samples.
Their main advantage lies in their computational efficiency: many kernels admit closed-form expressions for these inner products, removing the need to compute the explicit transformation. As established by Mercer's theorem, a function qualifies as a valid kernel if it satisfies the conditions of positive semi-definiteness, ensuring it represents a legitimate inner product in some feature space.
%A kernel is a function that measures the similarity between a pair of objects and corresponds mathematically to an inner product in a reproducing kernel Hilbert space (RKHS). 
Formally, given a non-empty set $\chi$, a function $k : \chi \times \chi \rightarrow \mathbb{R}$ is a kernel if there exist a Hilbert space $\mathcal{H}_k$ and a feature map $\phi : \chi \rightarrow \mathcal{H}_k$ such that
\[
k(x,y) = \langle \phi(x), \phi(y) \rangle_{\mathcal{H}_k}, \quad x,y \in \chi.
\]
Once the kernel function is defined, kernel-based algorithms such as the Support Vector Machine (SVM) can be applied directly for classification or regression tasks \cite{hofmann2008kernel}.

In this study, we evaluate two graph kernels implemented in the GraKeL library \cite{siglidis2020grakel}—the Shortest-Path and Weisfeiler--Lehman kernels.
%—alongside one additional approach specifically designed for brain networks, the Sub-network Kernel proposed by Jie et al. \cite{jie2018sub}.
\begin{itemize}
    \item The Shortest-Path kernel measures graph similarity by comparing the lengths and endpoint labels of all shortest paths between node pairs. For each graph, the shortest-path distances are computed, and two graphs are considered similar when their node pairs are connected through paths of comparable lengths. This kernel captures the overall topological layout of a network and is particularly effective when global path structure differentiates the graphs. It is later referred to as GK-SP throughout the paper.
    \item The Weisfeiler--Lehman kernel relies on the iterative node-label refinement procedure proposed in the WL test of graph isomorphism. At each iteration, a node's label is updated by combining its current label with those of its neighbors, generating progressively enriched node representations. By comparing graphs across multiple refinement steps, the kernel captures hierarchical structural information and neighborhood similarity. Its efficiency and scalability make it a strong baseline for graph classification tasks. It is referred to as GK-WL in the rest of the paper.
  
\end{itemize}


\subsubsection{Graph Neural Networks}
\label{GNN}
Graph Neural Networks (GNNs) have gained significant attention in the field of network neuroscience \cite{cui2022braingb,li2021braingnn,comparini2026lateearlyfusion,xu2023data,xu2024contrastive} due to their ability to effectively model and analyze complex graph structures. In this work, all GNN models are considered within an inductive, graph-level learning framework, where each subject is represented by an independent brain graph and the prediction target is defined at the graph level (e.g., clinical diagnosis or phenotypic classification). Consequently, transductive GNN approaches commonly used for node classification on population graphs are not considered, as they are not well suited to the multi-subject graph classification setting addressed here.

Most modern architectures can be expressed under the message passing neural network (MPNN) framework, in which node 
representations are iteratively updated by aggregating information from their neighborhoods. A 
generic MPNN layer can be written as:
\[
\mathbf{h}_i^{(l+1)} = 
\phi\!\left( 
\mathbf{h}_i^{(l)},\; 
\square_{j \in \mathcal{N}(i)} 
\psi\!\left(\mathbf{h}_i^{(l)},\, \mathbf{h}_j^{(l)}\right) 
\right),
\]
where $\mathbf{h}_i^{(l)}$ is the feature vector of node $i$ at layer $l$, $\psi$ and $\phi$ are 
learnable functions, and $\square$ denotes a permutation-invariant aggregation operator.

In this study, we focus on four representative GNN architectures widely used in many applications including brain graph analysis:
\begin{itemize}
    \item Graph Convolutional Network (GCN): 
    A baseline architecture that updates node representations by combining features from adjacent nodes through a predefined, normalized aggregation strategy \cite{jiang2019semi}.
    \item Graph Attention Network (GAT): Incorporates an attention mechanism that adaptively weights neighboring nodes by learning how much each one should contribute during the feature aggregation process, allowing the model to emphasize the most informative interactions \cite{velivckovic2017graph}.
    \item GraphSAGE (SAmple and aggreGatE): A sampling-based architecture that learns node embeddings by aggregating information from a fixed number of sampled neighbors, enabling inductive generalization to unseen nodes and graphs \cite{hamilton2017inductive}.
    \item Graph Transformer (TransformerConv): An attention-based message-passing architecture that learns adaptive aggregation weights over neighboring nodes defined by the input graph topology, providing a flexible alternative to convolution-based GNNs \cite{ijcai2021p214}.
\end{itemize}
These architectures provide complementary mechanisms for learning from functional brain 
connectivity graphs and form suitable candidates for comparative evaluation in this study.
\subsection{Evaluation metrics}
The performance of our classification models is assessed using accuracy (ACC), balanced accuracy (BACC), and the Matthews correlation coefficient (MCC). While ACC quantifies the overall proportion of correctly classified samples, BACC accounts for class imbalance by averaging sensitivity and specificity. MCC provides a more comprehensive and balanced summary of performance by incorporating all four entries of the confusion matrix. It is defined as:
\[
\mathrm{MCC} = 
\frac{ TP \times TN - FP \times FN }
{ \sqrt{ (TP+FP)(TP+FN)(TN+FP)(TN+FN) } },
\]
where $TP$, $TN$, $FP$, and $FN$ denote true positives, true negatives, false positives, and false negatives, respectively. 
MCC values range from $-1$ to $1$, with higher values indicating stronger agreement between predicted and true labels and offering a reliable assessment of classifier performance under class imbalance.
%The AUC evaluates the discriminative ability of the classifier across all decision thresholds.

%For regression experiments, performance is quantified using the coefficient of determination ($R^2$) computed on held-out data. 
%$R^2$ measures the proportion of variance in the target variable that is explained by the model and is defined as:
%\[R^2 = 1 -\frac{\sum_{i=1}^{n} (y_i - \hat{y}_i)^2}{\sum_{i=1}^{n} (y_i - \bar{y})^2},\]where $y_i$ denotes the true values, $\hat{y}_i$ the predicted values, and $\bar{y}$ the mean of the true values. Higher $R^2$ indicates a better fit of the model to the data.

%MAE quantifies the average magnitude of prediction errors in the same units as the target variable. 
%It is defined as:
%\[\mathrm{MAE} = \frac{1{n}\sum_{i=1}^{n} | y_i\hat{y}_i|,\]with lower MAE indicating smaller average deviations between predictions and ground truth.
\subsection{Implementation details and hyperparameter optimization}\label{implementation_details}

\begin{figure}[t]
\centering
\resizebox{\columnwidth}{!}{
\begin{tikzpicture}[scale=0.95, every node/.style={font=\small}]

% Outer 5 folds bar
\draw[black, thick] (0,0) rectangle (10,1);
\node at (5,1.3) {Repeated Stratified 5-Fold CV (10 repetitions)};

% Five outer blocks
\foreach \x/\label in {1/Test, 3/TrainVal, 5/TrainVal, 7/TrainVal, 9/TrainVal} {
    \draw[fill=gray!20] (\x-1,0) rectangle (\x+1,1);
    \node at (\x,0.5) {\label};
}

% Inner 4-way split zoom
\draw[thick,->] (7,0) -- (12,0.5);

\draw[black, thick] (12,0) rectangle (18,1);
\node at (15,1.3) {Internal split of TrainVal};

\foreach \x/\lab/\col in {12.75/Train/red!20, 14.25/Train/red!20, 15.75/Train/red!20, 17.25/Val/blue!20} {
    \draw[fill=\col] (\x-0.75,0) rectangle (\x+0.75,1);
    \node at (\x,0.5) {\lab};
}

\end{tikzpicture}}
\caption{Repeated 5-fold nested cross-validation: Across 50 outer folds (5 folds repeated 10 times), one outer fold is held out as the test set, while the remaining four folds are internally split into three 
training folds and one validation fold for hyperparameter tuning.}
\label{fig:cv}
\end{figure}
To minimize the risk of data leakage and obtain stable performance estimates on relatively small neuroimaging datasets, all models were trained and evaluated within a repeated stratified k-fold cross-validation framework (10 repetitions, 5 folds). This choice is supported by recent empirical analyses demonstrating that repeated k-fold cross-validation yields more reliable model rankings than single-split or non-repeated procedures, especially when sample sizes are limited within each outer fold \cite{eve2025importance}. Hyperparameters were tuned independently using an internal stratified split of the training data, as illustrated in Figure~\ref{fig:cv}, ensuring that the test fold remains completely unseen during both training and hyperparameter selection. The implementations were developed using the PyTorch, NetworkX, PyTorch Geometric, GraKeL and CodeCarbon libraries \cite{fey2019fast,siglidis2020grakel,courty2024mlco2}. All experiments were conducted on a Dell workstation running Ubuntu 22.04, equipped with an Intel Core i7 processor and an NVIDIA RTX A500 GPU.

A systematic hyperparameter search was conducted to identify the optimal configuration for the classical machine-learning models and graph kernel classifiers. The search spaces were defined as follows: for Logistic Regression, the regularization parameter was 
$C \in [10^{-4}, 10^{4}]$ (log-uniform prior); for the Random Forest classifier, 
$n_{\text{estimators}} \in [100, 500]$, 
$\text{max\_depth} \in [2, 20]$, 
$\text{min\_samples\_leaf} \in [1, 10]$, 
and $\text{max\_features} \in \{\text{"sqrt"}, \text{"log2"}\}$. 
For XGBoost, we optimized 
$\text{max\_depth} \in [2, 6]$ 
and $\text{min\_child\_weight} \in [0.1, 10]$ (log-uniform prior).
 
Since the SP kernel produces a precomputed Gram matrix in GraKeL, only the SVM regularization parameter $C$ is tuned.  
For the Weisfeiler--Lehman (WL) kernel, we optimized both 
$C \in [10^{-3}, 10^{3}]$ 
and the WL height $h \in [2, 4]$.
Hyperparameters were optimized separately for each dataset using Bayesian optimization on an inner validation split, with balanced accuracy as the criterion to maximize. Each search was limited to 20 Bayesian optimization calls. All seeds, fold-specific hyperparameters, and full logs are provided in our public GitLab repository to ensure transparency and reproducibility.

Due to the substantially higher computational cost of training GNNs compared to classical graph kernels, we adopt fixed hyperparameter settings that follow common practice in prior neuroimaging benchmarks, including BrainGB \cite{cui2022braingb} and the work of \citet{comparini2026lateearlyfusion}. This approach is standard in multi-cohort evaluations, where exhaustive hyperparameter tuning for deep models is prohibitively expensive and provides limited benefit for small- to medium-sized neuroimaging datasets. All GNN models were trained with the Adam optimizer and a maximum of 200 epochs, with early stopping triggered after 30 consecutive epochs without improvement in the validation loss. For the GCN architecture, we used a learning rate of 0.001 with 32 hidden channels and three graph convolutional layers. The GAT model was configured with 16 hidden channels, four attention heads, and two layers, while GraphSAGE employed 32 hidden channels and three layers. All GNNs shared the same learning rate, number of epochs, optimizer, and early-stopping criterion.



\section{Results}
\subsection{Classification results}
\begin{table*}
\centering
\caption{Classification performance across datasets. 
LR = Logistic Regression, RF = Random Forest, 
Acc = Accuracy, MCC = Matthews correlation coefficient.}
\label{tab:results}
\renewcommand{\arraystretch}{1.2}
\setlength{\tabcolsep}{6pt}
\resizebox{\textwidth}{!}{
\begin{tabular}{|p{3cm}|p{3cm}|cc|cc|cc|cc|}
\hline
\textbf{Model's family} & \textbf{Model}
& \multicolumn{2}{c|}{\textbf{COMA}}
& \multicolumn{2}{c|}{\textbf{HCP}}
& \multicolumn{2}{c|}{\textbf{ADHD}}
& \multicolumn{2}{c|}{\textbf{ABIDE}} \\

& &
Acc & MCC & Acc & MCC & Acc & MCC & Acc & MCC \\ \hline
\multirow{3}{*}{Classical ML}
& LR        & \underline{0.84$\pm$0.1} &  \underline{0.71 $\pm$0.19}& \textbf{0.81$\pm$0.03} &\textbf{0.63$\pm$0.07}  & 0.62$\pm$0.07  &  0.11$\pm$0.17& \textbf{0.63$\pm$0.03} &\textbf{0.26$\pm$0.06}  \\ 
& RF        & 0.8 $\pm$ 0.16&0.63$\pm$0.31  & 0.7$\pm$0.04 & 0.4$\pm$0.09 & \underline{0.67$\pm$0.04}  &0.02$\pm$0.15  & 0.58$\pm$0.03 & 0.16$\pm$0.07 \\ 
& XGBoost   & 0.71 $\pm$ 0.14 & 0.42 $\pm$ 0.35 & \underline{0.73$\pm$0.03} &\underline{0.46$\pm$0.07}  & 0.62$\pm$0.06 & 0.01$\pm$0.17 & \underline{0.6$\pm$0.03} & \underline{0.19$\pm$ 0.06}  \\ \hline

\multirow{2}{*}{Graph kernels}
& GK-SP     & \textbf{0.98$\pm$0.04} & \textbf{0.97$\pm$0.07} & 0.71$\pm$0.04 &0.41$\pm$0.09  & \textbf{0.67$\pm$0.03} &0.04$\pm$0.16  & \underline{0.6$\pm$0.04} &\underline{0.19$\pm$0.08}  \\ 
& GK-WL     &0.63$\pm$0.11  & 0.25$\pm$0.25 &0.6$\pm$0.04  &0.17$\pm$0.1  & 0.64$\pm$0.1 &0.01$\pm$0.06  & 0.53$\pm$0.03 & 0.05$\pm$0.06 \\ 
%& GK-XXX    &  &  &  &  &  &  &  &  \\ 
\hline

\multirow{4}{*}{GNN}
& GCN       & 0.74 $\pm$ 0.15&0.49$\pm$0.32  &0.68$\pm$0.05  &0.35$\pm$0.1  & 0.61$\pm$0.1 &0.1$\pm$0.2  &0.53$\pm$0.04  & 0.06$\pm$0.07 \\ 
& GAT       & 0.71$\pm$0.15 &0.45$\pm$0.31  &  0.71$\pm$0.04& 0.42$\pm$0.09 & 0.6$\pm$0.08 &0.11$\pm$0.16  & 0.56 $\pm$ 0.03 & 0.13$\pm$ 0.06\\ 
& GraphSAGE & 0.75$\pm$0.13 & 0.52$\pm$0.27 &0.73$\pm$0.04  &0.45$\pm$0.09  & 0.64$\pm$0.07 &\textbf{0.16$\pm$0.17}  & 0.57$\pm$0.04 &0.13$\pm$0.08  \\
& TransformerConv & 0.79$\pm$0.12 & 0.6$\pm$0.24 & 0.73$\pm$0.05 & 0.45 $\pm$0.09 & 0.62 $\pm$0.07 & \underline{0.14$\pm$0.17} & 0.57 $\pm$ 0.04 & 0.13 $\pm$ 0.07 \\
\hline
\end{tabular}}
\end{table*}
Table~\ref{tab:results} summarizes the classification performance across all datasets in terms of accuracy (Acc) and Matthews correlation coefficient (MCC), while Figure~\ref{fig:bacc_plot} reports the corresponding balanced accuracy averaged over repeated cross-validation splits, with error bars indicating the standard deviation. Overall, we observe that classical machine learning methods, graph kernels, and graph neural networks achieve broadly comparable performance across cohorts. Importantly, graph kernels—particularly the Shortest-Path kernel (GK-SP)—achieve some of the highest performance levels, most notably on the COMA dataset, where GK-SP attains an accuracy of approximately 0.98 and an MCC of 0.97 (Table~\ref{tab:results}). On the ADHD dataset, GK-SP also yields competitive accuracy ($\approx$0.67) but a low MCC ($\approx$0.04), reflecting the difficulty of the task, likely due to the heterogeneity introduced by multi-site data acquisition. In addition, logistic regression achieves strong performance on both the HCP (Acc $\approx$ 0.81, MCC $\approx$ 0.63) and ABIDE (Acc $\approx$ 0.63, MCC $\approx$ 0.26) datasets, highlighting the effectiveness of simple linear models in these settings. In contrast, graph neural network models, including GCN, GAT, GraphSAGE, and TransformerConv, generally exhibit similar or slightly lower mean performance across datasets and cross-validation splits. This behaviour is also reflected in the balanced accuracy results shown in Figure~\ref{fig:bacc_plot}, where GNNs typically match or underperform classical machine learning and graph kernel approaches.

The limited performance separation observed across the evaluated GNN architectures can be attributed to their shared reliance on message-passing and permutation-invariant aggregation mechanisms. Although the models differ architecturally, they all iteratively aggregate neighborhood information and propagate it across the graph. In the present setting, where graphs are relatively small and node identities are consistent across subjects, a small number of message-passing layers is sufficient to integrate near-global information. Under such conditions, increased architectural complexity or message-passing depth does not necessarily yield additional discriminative capacity and may instead lead to representation homogenization. 
%These observations are consistent with recent analyses highlighting intrinsic limitations of message-passing GNNs, including over-smoothing and diminishing returns with increasing depth, particularly in regimes where informative signal is not strictly local.
Consequently, the clustered performance across GNN variants observed in this benchmark reflects not a deficiency of the models, but rather the characteristics of the data and task, under which the inductive biases introduced by message passing and aggregation have a limited differentiating effect.

The COMA dataset exhibits the clearest separation between model families, with graph kernels achieving the highest accuracy and MCC values, highlighting their ability to capture global alterations in functional brain connectivity associated with disorders of consciousness. In contrast, graph neural networks yield lower average performance in this setting, suggesting limitations in learning stable and discriminative representations from very small clinical datasets. As shown in Figure \ref{fig:bacc_plot}, balanced accuracy values are high overall but are accompanied by increased variability across cross-validation folds. This variability is likely related to the inter-patient heterogeneity within the COMA cohort, where patterns of functional alteration can differ substantially across individuals, leading to sensitivity to the specific composition of training and test folds.
\begin{figure}[!tbh]
    \centering
    \includegraphics[width=1\linewidth]{image2.png}
    \caption{Repeated cross-validation balanced accuracy. The height of each bar corresponds to the mean balanced accuracy across all folds and repetitions, with error bars representing the standard deviation.}
    \label{fig:bacc_plot}
\end{figure}
The HCP dataset exhibits the most homogeneous performance across models, with several approaches achieving similar accuracy values. Notably, logistic regression attains strong performance, comparable to or exceeding that of more complex graph-based models. This suggests that the high quality, consistency, and relatively low noise of the HCP data allow even simple linear models to capture discriminative information effectively.
For comparison with prior work, the NeuroGraph benchmark \cite{said2023neurograph} reports an accuracy of 69.9\% for sex classification on the HCP Young Adult S1200 dataset (1,078 subjects) using a Random Forest classifier, which is broadly consistent with the performance of classical machine-learning baselines observed in our study. In contrast, graph neural networks in NeuroGraph achieve higher accuracy, with reported values of 75.46\% for GCN, 77.69\% for GraphSAGE, and 76.20\% for GAT. However, these results are obtained under different experimental conditions, namely using static functional connectivity graphs constructed with the Schaefer atlas at 1000 ROIs and a single stratified 70/10/20 train--validation--test split, which limits direct comparability with our repeated cross-validation setting.

For the ADHD dataset, all methods exhibit relatively modest performance, with substantial overlap in balanced accuracy across model families, as shown in Figure~\ref{fig:bacc_plot}. Graph kernels, particularly the Shortest-Path kernel (GK-SP), achieve some of the highest accuracy values ($\approx$ 0.67 in Table~\ref{tab:results}). In terms of MCC, however, the best values are attained by GNNs (GraphSAGE, $\approx$ 0.16, and TransformerConv, $\approx$ 0.14), although MCC remains low overall across all models and exhibits large standard deviations, reflecting the limited class separability in this dataset. This pattern reflects the subtle and heterogeneous nature of ADHD-related functional connectivity alterations, as well as the variability introduced by multi-site data collection. In this context, increased model complexity does not provide a clear advantage, and performance differences between approaches remain limited and often not statistically significant. A similar performance pattern is observed on the ABIDE dataset, where the absence of clear gains from graph neural networks suggests that the high inter-site heterogeneity of ABIDE, combined with limited sample sizes per site, may hinder the learning of robust graph-level representations. In contrast, simpler models appear less sensitive to this variability, resulting in comparable or even superior performance. While careful hyperparameter tuning can modestly improve results, it does not fundamentally alter the observed patterns.
In comparison with prior work, the ContrastPool study \cite{xu2024contrastive} reports, on the ABIDE dataset (989 subjects) evaluated using 10-fold cross-validation, an accuracy of $65.82 \pm 3.51$\% for logistic regression and $61.18 \pm 5.01$\% for random forest, values that are consistent with the performance of classical pipelines observed in our experiments. General-purpose GNNs achieve comparable or slightly lower accuracy, including GCN ($60.97 \pm 2.84$\%), GAT ($60.87 \pm 5.02$\%), and GraphSAGE ($63.09 \pm 3.11$\%). Importantly, the GNNs in the study of \citet{xu2024contrastive} benefit from explicit hyperparameter tuning via grid search.

%\subsection{Regression results}
%See Table 2 of Dadi's work, a nice way to present the results across 50 repetitions/folds.

%\subsection{Leave-one-site-out cross-validation for ABIDE}
\subsection{Statistical comparison of models}
\begin{figure}[!tbh]
    \centering
    \includegraphics[width=0.9\columnwidth]{image.png}
    \caption{Pairwise statistical comparison of models across all datasets (ABIDE, ADHD, COMA, and HCP) based on accuracy. Each cell reports the p-value obtained from the corrected repeated k-fold cross-validation test \cite{bouckaert2004evaluating}. Raw p-values are displayed in the upper triangular part of each matrix. The lower triangular part displays p-values after Benjamini--Hochberg false discovery rate (FDR) correction applied across all pairwise model comparisons within each dataset. Diagonal entries correspond to self-comparisons. Cells marked with a star indicate comparisons that remain statistically significant after FDR correction at level $\alpha = 0.05$.
}
    \label{fig:stat}
\end{figure}

To assess whether observed performance differences between models were statistically significant, we followed the recommendations of \citet{bouckaert2004evaluating}, who showed that standard significance tests applied to cross-validation results can be overly optimistic due to dependencies between folds. We therefore employed the corrected repeated k-fold cross-validation test. For each dataset and each pair of models, predictions collected on identical test splits across all folds and repetitions were compared using a loss function derived from the evaluation metric. Statistical significance was then assessed using a two-sided Student's t-test on the mean loss difference. Furthermore, to account for multiple pairwise comparisons, Benjamini--Hochberg false discovery rate (FDR) correction was applied within each dataset \cite{benjamini1995controlling}. Raw p-values from the corrected repeated k-fold test are shown in the upper triangular part of each matrix, while FDR-adjusted p-values are reported in the lower triangular part of Figure \ref{fig:stat}. Starred cells indicate comparisons that remain statistically significant after FDR correction.

Across datasets, the statistical analysis largely supports the conclusion that most models achieve comparable performance, with limited evidence of robust pairwise differences. In particular, the ADHD dataset shows no statistically significant differences between models, as reflected by uniformly large p-values in both the raw and FDR-adjusted matrices. This lack of statistical separation is likely due to several factors, including the limited sample size retained from the ADHD-200 initiative (160 subjects in our study) and the multi-site heterogeneity, which results in small effective sample sizes per site. A similar pattern is observed for the ABIDE dataset, where only a small number of isolated pairwise comparisons remain significant after correction. As shown in Table \ref{tab:results} and Figure \ref{fig:bacc_plot}, most performance differences across models in ABIDE are small and statistically fragile, indicating substantial overlap between model families once cross-validation dependence and multiple testing are properly accounted for.

For the HCP dataset, the statistical comparisons reveal more consistent differences between model families than in ABIDE or ADHD. Logistic regression achieves the strongest performance, and several comparisons between LR and GNNs yield very small raw p-values in the upper triangular matrix (e.g., LR vs.\ GAT and LR vs.\ GCN, with raw p-values $< 0.001$), with some remaining significant after FDR correction. In addition, the Weisfeiler--Lehman (WL) graph kernel shows statistically significant differences when compared to most other models; however, these differences correspond to inferior performance, as WL consistently achieves the lowest accuracy and MCC values across models (Table~\ref{tab:results}). This indicates that, while WL captures graph representations that differ significantly from those learned by other approaches, these representations are less aligned with sex-related discriminative patterns in the HCP dataset. Overall, these results suggest that in a high-quality, homogeneous cohort such as HCP, simple linear models are sufficient to capture relevant population-level effects, whereas increased structural expressiveness—as in WL kernels or GNNs—does not necessarily translate into improved predictive performance.

In contrast, for the COMA dataset, the Shortest-Path graph kernel (GK-SP) exhibits the strongest and most statistically consistent differences when compared with other model families (e.g., raw p-values below 0.02 for GK-SP versus GraphSAGE or GAT, with these comparisons remaining significant after FDR correction at the 0.05 level), while differences among the remaining models are less pronounced. These results indicate that shortest-path--based features capture discriminative network patterns that are not effectively exploited by other approaches. This finding is consistent with the neurobiological characteristics of disorders of consciousness: as shown by \citet{achard2012hubs}, comatose patients exhibit a pronounced reorganization of hub structure and path-length--related properties, despite a largely preserved global network topology.
\subsection{Computational and environmental costs}
Classical machine-learning models and graph kernel methods were executed on the CPU of a Dell workstation running Ubuntu 22.04 and equipped with an Intel Core i7 processor, whereas graph neural network (GNN) models were trained using an NVIDIA RTX A500 GPU on the same machine. Because GPU acceleration constitutes a fundamentally different computational regime, raw wall-clock runtimes obtained from CPU-based and GPU-based implementations should not be interpreted as direct algorithmic comparisons. Moreover, GPU-accelerated training typically entails higher power consumption, potentially leading to increased environmental costs. Accordingly, we report in Table \ref{tab:total_costs} the computational runtime and CO\textsubscript{2} emissions exclusively for classical machine-learning and graph kernel methods; CO\textsubscript{2} emissions were estimated using CodeCarbon \cite{benoit_courty_2024_11171501}.
\begin{table*}[t]
\centering
\caption{Total training time (seconds) and CO\textsubscript{2} emissions (kg), aggregated across all cross-validation folds and repetitions.}
\label{tab:total_costs}
\renewcommand{\arraystretch}{1.15}
\setlength{\tabcolsep}{6pt}
\resizebox{\textwidth}{!}{
\begin{tabular}{|l|cc|cc|cc|cc|}
\toprule
%\hline
\textbf{Model} &
\multicolumn{2}{c|}{ABIDE} &
\multicolumn{2}{c|}{ADHD} &
\multicolumn{2}{c|}{COMA} &
\multicolumn{2}{c|}{HCP} \\
 & Time (s) & CO$_2$ (kg)
 & Time (s) & CO$_2$ (kg)
 & Time (s) & CO$_2$ (kg)
 & Time (s) & CO$_2$ (kg) \\
\midrule
%\hline
LR       & 15\,850.97 & 1.26$\times10^{-2}$ & 1\,499.40 & 5.09$\times10^{-4}$ & 369.74 & 1.51$\times10^{-4}$ & 8\,202.85 & 6.76$\times10^{-3}$ \\
RF       & 7\,916.05  & 1.27$\times10^{-3}$ & 1\,395.58 & 2.14$\times10^{-4}$ & 386.46 & 5.95$\times10^{-5}$ & 4\,467.13 & 6.80$\times10^{-4}$ \\
XGBoost  & 450\,497.07 & 2.94$\times10^{-1}$ & 5\,210.19 & 8.32$\times10^{-4}$ & 559.06 & 1.14$\times10^{-5}$ & 47\,518.22 & 5.89$\times10^{-3}$ \\
GK-SP    & 4\,537.77  & 6.97$\times10^{-4}$ & 810.98   & 1.22$\times10^{-4}$ & 228.32 & 3.47$\times10^{-5}$ & 2\,332.29 & 3.56$\times10^{-4}$ \\
GK-WL    & 9\,429.97  & 1.38$\times10^{-3}$ & 1\,012.94 & 1.61$\times10^{-4}$ & 230.97 & 3.49$\times10^{-5}$ & 2\,317.81 & 3.58$\times10^{-4}$ \\

%\hline
\bottomrule
\end{tabular}}
\end{table*}


\section{Conclusion}
In this work, we conducted a comprehensive benchmarking study of classical machine-learning methods, graph kernel approaches, and graph neural networks (GNNs) on brain graphs derived from four resting-state fMRI datasets, spanning multiple classification tasks. By evaluating predictive performance, statistical significance, and computational cost within a unified experimental framework, we provide a principled assessment of the practical trade-offs between model families. Across the evaluated datasets, well-tuned classical machine-learning methods and graph kernels achieve performance comparable to GNN architectures. In particular, for small clinical datasets, classical approaches such as logistic regression, random forests, and kernel-based methods exhibit strong predictive accuracy while incurring substantially lower computational and environmental costs.
Although GNNs are often presented as the dominant paradigm for graph-structured neuroimaging data, our results indicate that their empirical advantages over classical baselines are limited and strongly task-dependent.
Several limitations should be acknowledged.
First, our analysis relies on AAL-based parcellations with fixed node ordering across datasets, which ensures a fair comparison across model families by preserving consistent anatomical correspondence. However, this design choice may influence the relative performance of methods, particularly as classical approaches are not permutation-invariant. As a result, the extent to which our conclusions generalize to alternative parcellation schemes, especially those involving a larger number of regions, remains an open question.
Second, for functional connectivity estimation, we focus in this work on Pearson correlation, as it remains the most widely adopted measure in the literature. Nevertheless, we fully acknowledge that Pearson correlation is not necessarily the optimal estimator of brain connectivity. In parallel work building upon distribution-based connectivity estimators \cite{lbath2024clustering}, we investigate richer representations of inter-regional variability, which have shown promising advantages for brain graph modeling \cite{mhanna2026distribution}.
Third, while we relied on well-established hyperparameter configurations informed by prior large-scale benchmarks \cite{cui2022braingb} and recent studies \cite{comparini2026lateearlyfusion}, we acknowledge that dataset-specific tuning of GNNs could yield modest performance improvements. However, such gains are unlikely to fundamentally alter the relative positioning of GNNs with respect to classical baselines in the fixed-parcellation, static connectome setting considered here, and must be balanced against the associated substantial computational costs.
Overall, our findings question the assumption that increased model complexity necessarily leads to superior performance in brain graph analysis. In line with recent calls for responsible and reproducible machine learning, we emphasize the importance of transparent reporting, fair baselines, and cost-aware evaluation in future applications of graph learning methods to neuroimaging.





%The authors urged the community to emphasize reproducibility, cost–benefit analysis, and transparency in future GNN applications.
\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{This work was supported by the Agence Nationale de la Recherche under the France 2030 programme, reference ANR-23-IACL-0006.}


\bibliography{midl26_32}


\end{document}
