% This is samplepaper.tex, a sample chapter demonstrating the
% LLNCS macro package for Springer Computer Science proceedings;
% Version 2.21 of 2022/01/12
%
\documentclass[runningheads]{llncs}
%
\usepackage[T1]{fontenc}
% T1 fonts will be used to generate the final print and online PDFs,
% so please use T1 fonts in your manuscript whenever possible.
% Other font encodings may result in incorrect characters.
%
\usepackage{graphicx}

% Used for displaying a sample figure. If possible, figure files should
% be included in EPS format.
%
% If you use the hyperref package, please uncomment the following two lines
% to display URLs in blue roman font according to Springer's eBook style:
%\usepackage{color}
%\renewcommand\UrlFont{\color{blue}\rmfamily}
%\urlstyle{rm}
%
\begin{document}
\title{Deep Feature Fusion Framework for Alzheimer’s Disease Staging using Neuroimaging Modalities}
%
%\titlerunning{Abbreviated paper title}
% If the paper title is too long for the running head, you can set
% an abbreviated paper title here
%
\author{Aya Gamal\inst{1}\orcidID{0000-0002-8175-4554}% index{Gamal, Aya}
\and Mustafa Elattar\inst{1,2}\orcidID{0000-0001-7936-3522} % index{Elattar, Mustafa}
\and Sahar Selim\inst{1,2}\orcidID{0000-0002-9886-1364}  }
%index{Selim, Sahar}
\authorrunning{A. Gamal et al.}
% First names are abbreviated in the running head.
% If there are more than two authors, 'et al.' is used.
%
\institute{
Medical Imaging and Image Processing Research Group, Center for Informatics Science, Nile University, Giza, Egypt.\and
School of Information Technology and Computer Science, Nile University, Giza, Egypt.
% \email{lncs@springer.com}\\
% \url{http://www.springer.com/gp/computer-science/lncs} \and
% ABC Institute, Rupert-Karls-University Heidelberg, Heidelberg, Germany\\
% \email{\{abc,lncs\}@uni-heidelberg.de}
}
%
\maketitle              % typeset the header of the contribution
%
\begin{abstract}
% Alzheimer's Disease (AD) is a severe neurodegenerative disease.
% % It is the main contributor to dementia, a more comprehensive term for a group of signs that impair mental processes like memory, thinking, and behavior. 
% Early identification of AD is crucial for effective management and for enhancing the quality of life for both patients and their families. In recent years, advances in medical imaging technology have led to the emergence of neuroimaging- based methods for the early diagnosis of AD. Considering the challenges in early AD diagnosis, relying on a mono-modal dataset in DL-based studies; especially neuroimaging, might not reap its benefits in a precise prediction of AD progression as in integrating data from different imaging modalities. Intuitively, utilizing the information from multi-modal data fusion improves the capturing of the very subtle changes and biomarkers leading to a reliable and more accurate diagnosis. 
% % Multimodal data fusion has been proposed as a promising approach to address this limitation by combining information from different modalities. 
% In our study, we develop an automated multimodal system to integrate MRI and PET images at an intermediate level of fusion, enabling the automatic early diagnosis of Alzheimer's disease. This fusion method doesn't require multistep preprocessing to achieve the
% fusion as in image fusion works. Our proposed methodology surpasses previous studies in distinguishing between individuals with Alzheimer's disease and cognitive normal (CN), achieving AUC score of 97.67\% with an accuracy (ACC) of 95.24\%.
Alzheimer’s Disease (AD) is a significant neurodegenerative disorder. Detecting AD early is essential for effective management and improving the quality of life for both patients and their families. Recent advancements in medical imaging technology have introduced neuroimaging-based methods for early AD diagnosis. However, the challenges in early AD detection suggest that using a single modality dataset in deep learning (DL) studies, particularly neuroimaging, might not yield precise predictions of AD progression compared to integrating data from multiple imaging modalities. Utilizing information from multi-modal data fusion can enhance the detection of subtle changes and biomarkers, leading to more reliable and accurate diagnosis. In our study, we develop an automated multimodal system that integrates MRI and PET images at an intermediate fusion level, facilitating the early diagnosis of Alzheimer’s disease. This fusion approach eliminates the need for extensive preprocessing steps that are typically required in image fusion methods. Our proposed methodology outperforms previous studies in differentiating between individuals with Alzheimer’s disease and cognitively normal (CN) individuals, achieving an AUC score of 97.67\% and an accuracy (ACC) of 95.24\%.

\keywords{Alzheimer’s Disease  \and Neuroimaging Features \and 3D Image Classification.}
\end{abstract}
%
%
%
\section{Introduction}
Alzheimer's Disease (AD) is a severe neurodegenerative disease. 
% It is the main contributor to dementia, a more comprehensive term for a group of signs that impair mental processes like memory, thinking, and behavior. 
Early identification of AD is crucial for effective management and enhancement of the quality of life of both patients and their families. Unfortunately, most existing diagnostic techniques rely on subjective assessments of behavioral and cognitive symptoms, leading to potential unreliability and misdiagnosis. In recent years, advances in medical imaging technology have led to the emergence of neuroimaging-based methods for the early diagnosis of AD. However, these methods often rely on analyzing a single modality, which may fail to capture the full complexity of the disease. Multimodal data fusion has been proposed as a promising approach to address this limitation by combining the information from different modalities.   

In a clinical setting, AD is typically diagnosed by systematically examining various aspects of a patient’s multiple modalities \cite{tu2022alzheimer}. These aspects are commonly derived from the diverse information sources of patients, including neuroimaging data, gene sequence data, profile data, and clinical mental state scale data. In contrast to the classification of AD based solely on single-modal neuroimaging, enhanced performance can be attained through the utilization of multimodal classification, involving the integration of diverse information sources. Investigating the synergies among various multimodal neuroimaging modalities significantly contributes to the identification of pathological processes in neurological disorders. This technique has been applied in image classification \cite{wang2017sparse,zhou2019effective} and image registration \cite{fan2019adversarial}. The motivation for engaging in multimodal fusion stems from two primary advantages: first, the potential for more robust predictions through the observation of the same phenomenon across multiple modalities \cite{baltruvsaitis2018multimodal}; and second, the extraction of complementary information from diverse modalities to enhance the precision of classification results \cite{bailey2015combined}. 

The multimodal framework comprises essential components that are primarily structured at three key levels. The initial level, known as the integration level, involves defining various modalities of data intended for fusion. Thus, at this stage, a determination is made regarding what should be fused. The subsequent level is the fusion methodology, encompassing the approach employed to combine the identified data guided by the chosen fusion strategy. In the literature, fusion strategies are classified into three groups. Early fusion, also known as feature-level fusion, is the process of merging multimodal data by concatenating its features into a vector, which is subsequently inputted into a machine learning model. Intermediate fusion, also referred to as joint fusion, integrates feature representations learned from one modality at the intermediate layers of a neural network with feature representations learned from other modalities. Late fusion involves decision-level fusion, in which a distinct model is trained for each modality and the predictions of all models are subsequently integrated to create a final decision. The final level in the framework is the knowledge level, where the final results of the diagnosis are obtained.

Numerous studies have focused on the fusion of diverse modalities for AD diagnosis. Notably, Dwivedi et al. \cite{dwivedi2022multi}, Dong et al. \cite{Dong_Zhang_Liu_Wei_2022}, Xu \cite{Xu}, Ning \cite{Ning}, Hao \cite{Hao}, and Zhang \cite{Zhang} have introduced methodologies primarily focused on neuroimaging features, particularly utilizing MRI and PET modalities. Similarly, Khvostikov et al. \cite{Khvostikov20183DCC}, Kang \cite{Kang2020}, and Aderghal et al. \cite{Aderghal2018} have directed their attention to the fusion of neuroimaging data, specifically from sMRI and DTI scans.
In addition to these, Zuo et al. \cite{Zuo} integrated sMRI, PET, and fMRI data, while Choi and Jin \cite{DBLP:journals/corr/ChoiJ17} used fluorodeoxyglucose and florbetapir PET. Peng et al. \cite{Peng2019} combined sMRI, PET, and genetic data, and Lee et al. \cite{Lee2019} integrated cognitive performance, demographic information, CSF, and MRI imaging data.




% In addition to these, there are studies that extend beyond the fusion of only two modalities. For instance, Zuo et al. \cite{Zuo} employed a comprehensive approach by integrating sMRI, PET, and Functional MRI (fMRI) data in their system. Choi and Jin \cite{DBLP:journals/corr/ChoiJ17} utilized flurodeoxyglucose and florbetapir positron emission tomography (PET) to devise a novel framework based on a deep convolutional neural network, predicting future cognitive decline in mild cognitive impairment (MCI) patients.
% Expanding beyond neuroimaging modalities, Peng et al. \cite{Peng2019} demonstrated the integration of high-dimensional multi-modality imaging (sMRI & PET) and genetic data for Alzheimer's disease (AD) diagnosis. Furthermore, Lee et al. \cite{Lee2019} proposed a method that integrates cognitive performance, demographic information, cerebrospinal fluid (CSF), and MRI imaging data for predicting the conversion of mild cognitive impairment (MCI).
In reviewing these studies, it is evident that the most frequently fused modalities are MRI and PET, indicating their prominent role in multimodal investigations within this research domain. Various approaches to fusing MRI and PET volumes have been explored in the literature. For example, Song et al. \cite{song2021effective} introduced a framework for AD diagnosis using a feature-fusion approach to extract semantic information from 3D MRI and PET volumes. They also proposed an image fusion method that outperformed their initial approach by reducing the number of model parameters using a single composite image, although it required multistep preprocessing. Castellano et al. \cite{castellano2024automated}
developed a dual branch, multimodal diagnostic model for Alzheimer's Disease using 3D MRI and amyloid PET scans in parallel, demonstrating that these modalities provide complementary insights that enhance predictive accuracy. However, limitations include the selection of only 50 slices from the axial plane, potentially missing comprehensive spatial information, and a loss of temporal resolution in PET scans due to frame averaging. 

Kong et al. \cite{kong2022multi} similarly employed an image fusion technique, while Venugopalan et al. \cite{venugopalan2021multimodal} utilized 3D CNNs to extract features from MRI and PET data, demonstrating improved performance over traditional fusion methods despite being limited by dataset sizes. In contrast to CNN-based methods, transformers leverage the self-attention mechanism to capture long-range dependencies within multimodal features. Zhang et al. \cite{zhang2023transformer} introduced a model comprising three components: dual 3D CNN encoders for MRI and PET modalities, a Multimodal Transformer Encoder, and a classification head. They employed a feature fusion strategy that utilized a transformer-based cross-attention mechanism to fuse features more effectively. Furthermore, Miao et al. \cite{miao2024mmtfn} utilized a transformer-based approach for multimodal multiscale fusion networks for the diagnosis of AD by fusing neuroimaging data.

In this study, we developed an automated multimodal system that integrates MRI and PET images at an intermediate fusion level, facilitating the early diagnosis of AD. This fusion method requires minimal preprocessing compared to traditional image-fusion techniques. Our approach surpasses previous studies in distinguishing between AD and CN individuals.

\section{Methodology}
To preserve modality-specific information for both modalities, we introduced a heuristic intermediate feature fusion framework that can capture complementary information from PET and MRI modalities independently. The components of the proposed feature fusion framework are shown in Figure~\ref{fig1}, which illustrates the intermediate feature fusion approach that preserves modality-specific information while enabling the effective integration of MRI and PET features for classification. The first level of our framework identifies the modalities to be integrated. We then applied preprocessing steps from a streamlined pipeline to MRI and PET scans separately, preparing the data for feature extraction. In the feature extraction step, a 3D pre-trained deep learning model was used as a feature extractor for each modality. Subsequently, we employed an intermediate feature fusion approach by leveraging the feature maps extracted from the previous step and processing them for input into the classification network. Finally, a small and simple 3D CNN network was used as a classification network for the effective classification of the AD stages.
\begin{figure}
\includegraphics[width=\textwidth]{proposed feature fusion framework_merged.drawio.png}
\caption{The proposed intermediate feature fusion framework, highlighting the stages of modality integration, feature extraction, and classification.} \label{fig1}
\end{figure}
\vspace{-15pt} % Adjust the value as needed
\subsection{Dataset}
Our study concentrated on the ADNI dataset (adni.loni.usc.edu), which is widely used to address this problem. We specifically implemented our experiments using structural MRI and 18-fluorodeoxyglucose (FDG)-PET modalities, which  are commonly employed noninvasive methods for capturing the characteristics of brain tissue. 
% Structural MRI is utilized in AD diagnosis due to its high resolution for soft tissue and its capability to reveal detailed anatomical features of the brain. While PET imaging plays a crucial role as a functional technique, allowing clinicians to quickly and precisely observe activities related to the human brain, particularly in early Alzheimer's Disease (AD) detection. PET images obtained through the diffusion of radioactive 18-fluorodeoxyglucose (FDG) have been utilized to obtain sensitive measurements of cerebral metabolic rates of glucose (CMRglc). 
We collected 3D data from subjects who underwent scans using both of these modalities. 

First, we filtered the participants to include only those with data available on both PET and MRI during the same visit and scanning period. In total, 253 subjects participated in this experiment, contributing to a dataset of 822 scans. Considering only the first (baseline) scan of each subject would mitigate the risk of data leakage and yield an equal number of scans and participants. However, to address the challenge of a small dataset size owing to the constraint of scans from the same time period, we included up to three to four visits per subject, acquired in different years or at least six months apart within the same year. To maintain our principle of avoiding data leakage, we carefully split the data into 80\%, 10\%, and 10\% for the training, validation, and test sets, respectively, ensuring that all scans of a given subject appear in only one set.

Table~\ref{tab1} summarizes the subjects and scans in the dataset. It gives an overview of the participants included in the study, highlighting the distribution across AD, MCI, and CN groups, which is crucial for understanding the dataset’s composition.
\begin{table}
\caption{Summary of participant statistics in the ADNI dataset (MRI and PET).}\label{tab1}
\centering
\begin{tabular}{|l|l|l|}
\hline
Class &  Subjects & Scans\\
\hline
AD & 43 & 117 \\ \hline
MCI & 111 & 433 \\\hline
CN & 99 & 272 \\
% Title (centered) &  {\Large\bfseries Lecture Notes} & 14 point, bold\\
% 1st-level heading &  {\large\bfseries 1 Introduction} & 12 point, bold\\
% 2nd-level heading & {\bfseries 2.1 Printing Area} & 10 point, bold\\
% 3rd-level heading & {\bfseries Run-in Heading in Bold.} Text follows & 10 point, bold\\
% 4th-level heading & {\itshape Lowest Level Heading.} Text follows & 10 point, italic\\
\hline
\multicolumn{3}{|c|}{Total Number of scans = 822 }\\ \hline
\end{tabular}
\end{table}
\vspace{-10pt}




\subsection{Data Preprocessing}
Both MRI and FDG-PET images in ADNI underwent various processing stages. Each modality was pre-processed separately. Specifically, the MRI images underwent a series of processing steps, including skull stripping, intensity normalization, uniform resampling to achieve isotropic resolution, 3D cropping to extract only the brain from the black background, resizing all scans to 128$\times$128$\times$128, and the application of histogram equalization to enhance the contrast. The preprocessing pipeline proposed in \cite{Gamal2022} was followed here, with the addition of the histogram equalization step, which was applied to the scans to enhance the quality and discriminatory power of the images.

Regarding PET scans preprocessing, the initial FDG-PET scans underwent the following processing steps to ensure consistency in PET data across various systems. 
% % First, co-registered dynamic processing is applied, where six 5-minute FDG-PET frames are captured within 30$-$60 minutes post-injection. 
% Each frame is co-registered to the initial extracted frame, and independent frames are further co-registered to mitigate the impact of patient motion. Subsequently, an averaging step is implemented, where the six co-registered frames are averaged. The next step involves standardizing the image and voxel size. The averaged image is reoriented into a standardized 160$\times$160$\times$96 voxel image grid, with 1.5 mm cubic voxels, following anterior commissure posterior commissure correction. Intensity normalization is then performed using a subject-specific mask to ensure that the average value of voxels within the mask is precisely one. Finally, uniform resolution is achieved by applying a scanner-specific filter to the normalized image, resulting in an image with a consistent isotropic resolution of 8 mm full width at half maximum. This step aims to smooth the aforementioned images for further analysis.
First, we converted all the PET files into Neuroimaging Informatics Technology Initiative (NIFTI) format files, as all the processed PET image data were in the DICOM format. The dicom2nifti Python package was used to apply the conversion. Similar to MRI, PET images include extensive background regions characterized by zero pixel values beyond the brain tissue. We effectively reduced these non-essential background regions to decrease the volume of the input data via 3D cropping, as in the MRI pipeline. Furthermore, we resized the volume to 128$\times$128$\times$128. Finally, histogram equalization was applied to the PET scans. Figure \ref{fig2} outlines the PET image processing pipeline, which includes essential steps for standardizing PET images and enhancing their quality for the feature extraction process. 

In this study, handling multimodal data posed a significant challenge owing to the limitations of a small sample size. To address this concern, an essential component of the proposed methodology is the augmentation step. We employed various 3D transformations on both MRI and PET data, including 3D random rotation and flipping.
\begin{figure}
\centering
\includegraphics[width=0.7\textwidth]{PET_preprocessing output.drawio.png}
\caption{PET Image Processing Method.} \label{fig2}
\end{figure}

\subsection{Network Architecture}
The effectiveness of the 3D CNN models and transfer learning approach in diagnosing AD led us to choose them as the optimal starting point for designing our multimodal framework. The proposed multimodal model architecture is illustrated in Figure \ref{fig1}. The 3D DenseNet201-based transfer learning model was used as a deep feature extractor for the processed images of both modalities. The feature encoder had four dense blocks, and transition layers were employed between them.  After extracting the feature maps from each modality, a concatenation layer was added to the model to fuse the intermediate features and prepare them for the single final network. The last layers in our network form a small and simple 3D CNN. The layer details of the final classification network are shown in Figure \ref{fig1}.

\section{Experimental Setup and Results}
In this part of our study, our experiments are organized as follows. Initially, two 3D DenseNet201 models were utilized as feature extractors for both the MRI and PET images. Subsequently, we loaded the weights of each modality independently and incorporated them into the fusion phase. Finally, a straightforward 3D CNN network is applied to the fused features for AD diagnosis. We conducted three classification tasks: AD vs CN, AD vs MCI, and MCI vs CN.

One of the challenges highlighted in the literature is the variability in hyperparameter choices across different studies and experiments. To address this issue, we employ an open-source hyperparameter optimization framework called Optuna \cite{optuna_2019}. Optuna is compatible with any machine learning or deep learning framework, offering versatility. 
% Using Optuna's intuitive syntax, we define the hyperparameter search space and objective function within our existing codebase.

We used Optuna’s automated hyperparameter optimization algorithms to efficiently explore and evaluate different configurations, facilitating the discovery of optimal model settings. Specifically, we specified the search space for the hyperparameter batch size, learning rate, and input shape by defining their types as categorical, float, and categorical with possible ranges of [5, 8, 16, 32], [0.000001, 0.0001], and [64, 96, 128], respectively. Following the optimization process, Optuna returns the best set of hyperparameters that leads to optimal performance according to the defined objective function.

The optimal configuration, obtained in trial 9, was a batch size of 16, an input size of 128, and a learning rate of 3.4885205571560794e-05. All experiments were performed using the TensorFlow deep learning framework \cite{tensorflow2015-whitepaper} in Python. In the training phase of the feature extractors, we employed 200 epochs with a batch size of 16, aligned with the recommendations derived from the Optuna optimization process. Nevertheless, when training the final CNN network, we encountered hardware constraints, compelling us to decrease the batch size to five. The Adam optimizer \cite{KingBa15} was employed with the learning rate recommended by Optuna, and a ReduceLROnPlateau strategy was used to reduce the learning rate when the validation loss stopped improving. For the final classification network, all other settings were the same.

In our study, we addressed the challenge of imbalanced classes by implementing oversampling and class-weighting strategies during training of our fusion model. To overcome class imbalance, we applied oversampling to the minority classes using the resample function. This step ensures that each class is adequately represented in the training dataset, thereby preventing the model from being biased towards the majority class. It randomly selects samples with replacement from the provided class indices, effectively duplicating some samples to achieve the desired oversampling. This is performed until the size of the minority class matches the size of the majority class, making the class distribution more balanced in the training data. To further mitigate the impact of class imbalance, we computed class weights using the \texttt{compute\_class\_weight} function from scikit-learn \cite{pedregosa2011scikit}. It is used to assign different weights to different classes during model training. In the experiments, the BinaryFocalCrossentropy loss function was employed, which combines the characteristics of both binary cross-entropy (BCE) and focal loss. Binary Cross-Entropy (BCE) serves as the standard loss function for binary classification problems. On the other hand, focal loss is introduced to address class imbalance in binary classification tasks. This is achieved by modulating the cross-entropy loss and downweighting the contribution of well-classified examples where the predicted probability is high. This adjustment allowed the model to prioritize hard-to-classify examples.

We evaluated the performance of our fusion model across three binary classification tasks to recognize the three AD stages, as shown in Table~\ref{tab2}. The table presents the performance metrics of our proposed feature fusion method, demonstrating the model's ability to differentiate between AD, MCI, and CN with high accuracy and AUC scores. Notably, the best results were achieved for the AD vs. CN task, with an AUC score of 97.67\%, based on a single inference on a hold-out test set. It is important to highlight that the lower performance observed in the AD vs. MCI and MCI vs. CN tasks is expected, as the MCI stage is notoriously challenging to classify due to its overlapping features with both normal aging and early Alzheimer's, which poses a difficulty even for advanced models. Table~\ref{tab3} compares the performance of uni-modal approaches against our proposed feature fusion method, highlighting the substantial improvements in classification accuracy and AUC achieved through multimodal integration.
\begin{table}
\caption{Proposed Feature Fusion Results for 3 classification tasks.}\label{tab2}
\centering
\begin{tabular}{|l|l|l|l|l|}
\hline
Task &  ACC & BA & AUC & F1-score\\
\hline
AD vs CN &95.24&95.71&97.67&93.33\\
MCI vs CN&80&77.81&86.08&72.86\\
AD vs MCI &75.0&74.23&80.54&73.4\\

\hline
\end{tabular}
\end{table}
\begin{table}
\caption{The uni-modal and Proposed Feature Fusion Results for AD vs CN task.}\label{tab3}
\centering
\begin{tabular}{|l|l|l|l|}
\hline
Metric &  MRI & PET & Fusion method \\
\hline
ACC & 68.75 & 87.5 & 95.24\\
AUC & 72.5 & 94.29 & 97.67\\
BA & 68.67 & 87.71 & 95.71\\
F1-score & 67.15 & 87.67 & 93.33\\

\hline
\end{tabular}
\end{table}
\section{Discussion and Conclusion}
In this study, our goal was to utilize the power of neuroimaging multimodal data instead of unimodal data. Table~\ref{tab4} benchmarks the performance of our proposed method against other recent studies, showcasing the superiority of our approach in terms of accuracy across multiple classification tasks. Our proposed method outperforms the other studies on the AD vs. CN task, with an ACC of 95.24\%.

Regarding the subjects used in this paper, instead of utilizing only the baseline scans, we obtained three to four scans for each subject in different years to overcome the small data size as much as possible. In addition, we took into consideration the problem of data leakage that could arise from having multiple scans for each subject; therefore, we split the data very carefully to ensure that the scans of each subject do not appear in different sets.
% Our data sets splits are as follows: 80\% for training, 10\% for validation, and finally 10\% for testing. 
By utilizing oversampling and class weighting in our experiments, we obtained superior model performance, and this effect can be seen clearly in the metrics, especially the F1-scores for each class in the different tasks.
% We followed almost the same preprocessing pipeline that we proposed previously while dealing with MRI only.

Integrating 3D augmentation functions significantly improved our experiments and the model's performance. The process followed these steps: first, we applied oversampling to all training data within each classification task, balancing the minor class with the major class after splitting the data into training, validation, and test sets at the subject level. Next, preprocessing was conducted on the oversampled data. Finally, the transformation was applied exclusively to the training data, with an augmentation factor of 5.

As shown in Table \ref{tab4} there are many studies, some of which follow different approaches for fusing MRI and PET volumes. Song et al. \cite{song2021effective} introduced a framework for AD diagnosis with the feature fusion approach (intermediate fusion) to obtain semantic information from the 3D volumes of MRI and PET. In addition, they proposed another fusion method by applying an image fusion process that outperformed the first method. 
The image fusion approach helps reduce the number of model parameters, as a single composite image is used in the network. However, multistep pre-processing is required to achieve this fusion. Kong et al. \cite{kong2022multi} also presented an image fusion method, which is considered an early fusion approach, in which PET and MRI images are fused and fed into the network. In addition, Venugopalan et al. \cite{venugopalan2021multimodal} suggested that the deep models for integration also showed improved performance over traditional feature-level and decision-level integrations. However, their study suffers from a limited dataset size. Zhang et al. \cite{zhang2023transformer} proposed an adversarial learning approach to enhance the cross-attention mechanism for more effective feature fusion. They focused on subjects with complete T1w and FDG-PET images, utilizing feature fusion with their baseline scans. The effectiveness of their approach was then evaluated on two tasks: AD vs. CN and pMCI vs. sMCI.

% Our methodology distinctly preserves modality-specific features by employing intermediate feature fusion methods, unlike previous studies that relied on image fusion techniques, which require extensive preprocessing steps such as registration and alignment of volumes. This approach not only streamlined the process but also allowed us to achieve significant improvements, especially in the AD vs. CN task. By leveraging the power of a simple CNN for both feature extraction and classification, our method outperformed others, demonstrating the effectiveness of maintaining modality-specific information while minimizing the need for complex preprocessing.
Our methodology preserves modality-specific features using intermediate feature fusion, avoiding the extensive preprocessing typically required by image fusion techniques, such as volume registration and alignment. This streamlined approach led to significant improvements, particularly in the AD vs. CN task. By using a simple CNN for feature extraction and classification, our method outperformed others, highlighting the effectiveness of maintaining modality-specific information while minimizing preprocessing complexity.
% In this study part, subjects were chosen who had both T1-weighted MRI and FDG-PET scans captured at the same period. We also tried to make use of the our previous phases through this study. 

% The features extracted from MRI and FDG-PET scans are clinically relevant as they capture critical structural and metabolic information associated with Alzheimer's disease. These modalities offer complementary insights, with MRI highlighting structural atrophy and FDG-PET revealing metabolic changes in the brain, both of which are key indicators in AD diagnosis. However, a limitation of our current work is the lack of interpretability methods, which are essential for translating these features into actionable clinical insights. Another limitation of this study is the difficulty of applying a multi-class classification task. Handling 3D data from MRI and PET scans is challenging and requires significant computational resources. 
% Future work should focus on integrating interpretability techniques to better understand the decision-making process of the model and enhance its utility in a clinical setting. We also intended to incorporate a more rigorous statistical analysis, including reporting central tendencies for cross-validation runs, as well as conducting appropriate statistical tests to confirm the significance of the observed improvements in model performance.
The features extracted from MRI and FDG-PET scans are clinically relevant, capturing critical structural and metabolic information associated with Alzheimer's disease. MRI highlights structural atrophy, while FDG-PET reveals metabolic changes, offering complementary insights for AD diagnosis. However, our current work lacks interpretability methods, which are essential for translating these features into actionable clinical insights, and faces challenges in multi-class classification due to the complexity of handling 3D data from MRI and PET scans, which requires substantial computational resources. Future work should integrate interpretability techniques to better understand the model's decision-making process and enhance its clinical utility. Additionally, a more rigorous statistical analysis is necessary, including reporting central tendencies for cross-validation runs and conducting statistical tests to confirm the significance of the observed improvements.

In conclusion, our study presented a comprehensive framework for aiding the early diagnosis of Alzheimer's disease based on neuroimaging features. Specifically, we fused neuroimaging features by combining 3D MRI scans with FDG-PET scans through an intermediate feature fusion method. Our proposed fusion framework demonstrated superior results compared to related studies in the literature, particularly for the AD vs.\ CN task.
\vspace{-10pt}
\begin{table}
\caption{Comparative performance of our classifiers and competitors.}\label{tab4}
\centering
\begin{tabular}{|l|l|l|l|}
\hline
& \multicolumn{1}{|l|}{\bfseries AD vs CN} & \multicolumn{1}{|l|}{\bfseries MCI vs CN} & \multicolumn{1}{|l|}{\bfseries AD vs MCI} \\ \cline{2-4}
{\bfseries Study} & ACC (\%) & ACC (\%) & ACC (\%) \\ \hline
Kong et al. (2022) \cite{kong2022multi} &93.21&86.52&85.63
\\ \hline
Song et al. (2021)~\cite{song2021effective} (feature fusion) &93.22&82.37&81.00\\ \hline
Song et al. (2021)~\cite{song2021effective} (image fusion) &94.11&88.48&84.83\\ \hline
Venugopalan et al. (2021)~\cite{venugopalan2021multimodal}& 86&-&- \\ \hline
Castellano et al. (2024)~\cite{castellano2024automated}& 95.00&-&- \\ \hline
Zhang et al. (2023)~\cite{zhang2023transformer}& 92.9&-&- \\ \hline
Proposed feature fusion method& {\bfseries 95.24}&80&75
\\ \hline
\end{tabular}
\end{table}


\begin{credits}


\subsubsection{\discintname}
The authors have no competing interests to declare that are
relevant to the content of this article.
\end{credits}
%
% ---- Bibliography ----
%
% BibTeX users should specify bibliography style 'splncs04'.
% References will then be sorted and formatted in the correct style.
%
\bibliographystyle{splncs04}
\bibliography{Bibliography}
%
% \begin{thebibliography}{8}
% \bibitem{ref_article1}
% Author, F.: Article title. Journal \textbf{2}(5), 99--110 (2016)

% \bibitem{ref_lncs1}
% Author, F., Author, S.: Title of a proceedings paper. In: Editor,
% F., Editor, S. (eds.) CONFERENCE 2016, LNCS, vol. 9999, pp. 1--13.
% Springer, Heidelberg (2016). \doi{10.10007/1234567890}

% \bibitem{ref_book1}
% Author, F., Author, S., Author, T.: Book title. 2nd edn. Publisher,
% Location (1999)

% \bibitem{ref_proc1}
% Author, A.-B.: Contribution title. In: 9th International Proceedings
% on Proceedings, pp. 1--2. Publisher, Location (2010)

% \bibitem{ref_url1}
% LNCS Homepage, \url{http://www.springer.com/lncs}, last accessed 2023/10/25
% \end{thebibliography}
\end{document}
