\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\jmlryear{2025}
\jmlrworkshop{Full Paper -- MIDL 2025}
\jmlrvolume{-- 83}
\editors{Accepted for publication at MIDL 2025}

\title[Anatomy-Guided Surface Diffusion Model]{Anatomy-Guided Surface Diffusion Model for Alzheimer's Disease Normative Modeling}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Jianwei Zhang\textsuperscript{1,2}} \Email{jiazhang@loni.usc.edu}\\
  \Name{Yonggang Shi\textsuperscript{1,2,3}} \Email{yshi@loni.usc.edu}\\
  \addr \textsuperscript{1}Stevens Neuroimaging and Informatics Institute, Keck School of Medicine,
University of Southern California, Los Angeles, CA, USA \\
\textsuperscript{2}Ming Hsieh Department of Electrical and Computer Engineering, Viterbi School of Engineering, University of Southern California, 
Los Angeles, CA, USA\\
\textsuperscript{3}Alfred E. Mann Department of Biomedical Engineering, Viterbi School of Engineering, University of Southern California, Los Angeles, CA, USA}

\begin{document}

\maketitle

\begin{abstract}
Normative modeling has emerged as a pivotal approach for characterizing heterogeneity and individual variance in neurodegenerative diseases, notably Alzheimer's disease (AD). One of the challenges of cortical normative modeling is the anatomical structure mismatch due to folding pattern variability. Traditionally, registration is applied to address this issue and recently deep generative models are employed to generate anatomically aligned samples for analyzing disease progression; however, these models are predominantly applied to volume-based data, which often falls short in capturing intricate morphological changes on the brain cortex. As an alternative, surface-based analysis has been proven to be more sensitive in disease modeling such as AD. Yet, like volume-based data, it also suffers from the mismatch problem. To address these limitations, we propose a novel generative normative modeling framework by transferring the conditional diffusion generative model to the spherical domain. Furthermore, the proposed model  generates normal feature map distributions by explicitly conditioning on individual anatomical segmentation to ensure better geometrical alignment which helps to reduce variance between subjects in normative analysis. We find that our model can generate samples that are better anatomically aligned than registered reference data and through ablation study and normative assessment experiments, the samples are able to better measure individual differences from the normal distribution and increase sensitivity in differentiating cognitively normal (CN), mild cognitive impairment (MCI), and Alzheimer's disease (AD) patients.
\end{abstract}

\begin{keywords}
Alzheimer's Disease, Diffusion Generative Model, Cortical Surface.
\end{keywords}

\section{Introduction}

\label{sec:intro}
Normative modeling has been proven to be an effective approach for modeling neurodegenerative diseases such as Alzheimer's disease \cite{10.7554/eLife.85082}. The core idea of normative modeling is to define a normal distribution such that each subject can be measured against it to characterize deviation from the norm. One of the major challenges of such tasks is individual anatomical variability. Specifically, cortical folding patterns exhibit considerable heterogeneity across individuals, thereby complicating the establishment of meaningful comparisons. Conventionally, statistical analysis techniques are applied to anatomically registered images to attenuate the effect of individual variability. However, due to shape differences, registered images still have significant gyral/sulcal mismatch \cite{10.1007/978-3-031-43904-9_6} and statistical methods are usually limited in their ability to capture complex nonlinear relationships. As an alternative, deep generative models have recently been introduced to address these limitations. The idea is to train a generative model that encodes how the normal distribution behaves.
% conditioned on structural information such that diseased instances might be detected as outliers.  
Variational Autoencoders (VAEs) \cite{ICAM,RAVI2022102257}, flow-based models \cite{9008303}, and Generative Adversarial Networks (GANs) \cite{BAI2022353} were employed to model the normal distribution in the brain MRI volume space and to use the deviation of the original data from generated normal samples as a disease atrophy map for analysis. Although these previous studies achieved good results on volume data, few attempts have been made to adapt these methods to cortical surface-based data, which has been proven to be more sensitive in capturing detailed anatomical changes~\cite{HUTTON2009371,LERCH2005163}.

\par Traditional surface-based analysis is built upon registering brain surfaces across subjects or with a template surface\cite{yeo2009spherical,fischl2004automatically}, but this process  also suffers from cortical structure mismatch. To account for this problem, previous works have attempted personalized analysis where, instead of using entire dataset, only a subset with similar anatomical structures were used for analysis\cite{10.1007/978-3-031-43904-9_6,10.1007/978-3-030-87234-2_67}. However, these approaches suffer from limited data availability and computational complexity as high cortical variability might not be represented by the existing datasets. Therefore, generative model is a promising alternative approach to generate personalized reference sets to alleviate challenges in matching against  real data. 
\par Recently, diffusion models have emerged as an effective framework for stable image generation~\cite{pmlr-v139-nichol21a,Song2021DenoisingDI,ho2020denoising}. To leverage this advancement in generative modeling, in this paper, we adapt the Denoising Diffusion Probabilistic Models (DDPM) framework \cite{ho2020denoising} from the Euclidean image domain to the non-Euclidean spherical domain and propose a conditional surface diffusion model that utilizes gyral/sulcal segmentation masks to generate cortical surface features that are anatomically aligned. The proposed model is applied to the HCP~\cite{HCP} and ADNI~\cite{MUELLER2005869} datasets to conduct an unconditional generative task, an ablation study, and normative modeling on cognitively normal (CN), mild cognitive impairment (MCI) and Alzheimer's disease (AD) subjects. The results show that our model is able to generate faithful and anatomically aligned feature maps and increase the sensitivity of surface-based disease analysis.

\section{Method}
Our proposed method consists of three parts: surface based diffusion model with condition, denoising network in spherical domain, and normative modeling via sampling. The overall diffusion model is shown in Fig. \ref{fig:pipeline}.
\subsection{Denoising Diffusion Probabilistic Models(DDPM)}
DDPM \cite{ho2020denoising,rombach2022highresolution} is an iterative generative model for modeling a data distribution from samples. Given a series of observed samples $\{x_{i}\}$, which are drawn from the data distribution $p(x)$, the model learns to generate new samples from $p(x)$ through a forward and a backward diffusion process. The diffusion process of DDPM is governed by a Markov chain as in equations~\ref{eq:ddpm_forward} and~\ref{eq:ddpm_backward}, which describe the forward and backward processes, respectively:

\begin{equation}
    q(x_{t}|x_{t-1}) = \mathcal{N}(x_{t}; \sqrt{1-\beta_{t}}x_{t-1},\beta_{t}\mathbf{I}), \quad q(x_{1:T}|x_{0}) = \prod_{t=1}^{T} q(x_{t}|x_{t-1})
    \label{eq:ddpm_forward}
\end{equation}
\noindent
\begin{equation}
    p_{\theta}(x_{0:T}) = p(x_{T}) \prod_{t=1}^{T} p_{\theta}(x_{t-1}|x_{t}), \quad
    p_{\theta}(x_{t-1}|x_{t}) = \mathcal{N}(x_{t-1}; \mu_{\theta}(x_t,t),\Sigma_{\theta}(x_t,t))
    \label{eq:ddpm_backward}
\end{equation}

where $\mathcal{N}$ is the Gaussian distribution and $q$ is the transition probability of the forward process. $p_{\theta}$, $\mu_{\theta}$ and $\Sigma_{\theta}$ are parameterized estimates from neural networks. $x_{0}$ is the original data and $x_{t}$ is the noisy data after adding $t$ steps of noise. $T$ denotes the total number of steps. $\beta_{t}$ is taken from a predefined variance schedule $\{\beta_{t} \in (0,1)\}_{t=1}^{T}$. The information within the data is progressively destroyed by adding independent Gaussian noise for a certain number of steps in the forward process. The backward process is then formulated as a sampling process in which a neural network iteratively estimates $\mu_{\theta}(x_t,t)$ and $\Sigma_{\theta}(x_t,t)$ and denoises the noisy data $x_{T}$ to generate new samples.
\begin{equation}
    L = \mathbb{E}_{t \sim [1,T],\, x_{0},\, \epsilon_{t}}\Bigl[ \lVert \epsilon_{t} - \epsilon_{\theta}( \sqrt{\overline{\alpha}_{t}}x_{0} + \sqrt{1-\overline{\alpha}_{t}}\epsilon_{t},t) \rVert \Bigr]
    \label{eq:loss}
\end{equation}

\begin{figure}[htb!]

\begin{minipage}[b]{1.0\linewidth}
  \centering
  \centerline{\includegraphics[width=\linewidth]{figs_eps/pipeline.eps}}
%  \vspace{2.0cm}
\end{minipage}
\caption{\textbf{Overall framework of the proposed surface diffusion model.} (A) The training procedure of the conditional DDPM model, where CT denotes cortical thickness, SI denotes shape index and G/S seg. denotes Gyral/Sulcal segmentation. (B) The sampling process for each test subject to generate abnormal score for analysis (Note: all images are actual data and actual generated samples).}
 \label{fig:pipeline}
\end{figure}

\par The model is trained by optimizing a simplified Evidence Lower Bound loss~\cite{ho2020denoising} in equation~\ref{eq:loss}. $\epsilon_{t}$ is the noise at time $t$ and $\epsilon_{\theta}$ is the neural network. $\overline{\alpha}_{t}$ is $\prod_{i=1}^{t}{(1-\beta_{i})}$. We employ the cosine beta schedule~\cite{nichol2021improved} as the variance schedule and the velocity sample scheme in \cite{salimans2022progressive}, which we empirically find to be more stable. During training, a randomly sampled number of noise steps $t$ is applied to a feature map, and the resulting noisy image is the input to the network along with the time step $t$ in the form of a time embedding vector. The loss is computed between the network output and the original feature map without noise.
%The training iterates until a preset max epoch number.
% The training and sampling procedure is same as \cite{ho2020denoising}. 
After training, the model can be iteratively applied to random noise or noisy input data for a selected number of steps to generate new samples. 

\subsection{Anatomical and Demographic Conditioning}
The original DDPM is for modeling unconditional distributions. 
To generate samples that are better anatomically aligned, we modified the model to take additional conditions.
% In order to better model the normal distribution of cortical features based on anatomical structure, we modified the model to take additional conditioning. 
In our method, two types of conditions are used: demographic and anatomical conditions. The demographic conditions include sex and biological age. Both values are first passed into the network through several multilayer perceptrons and activation layers. The embedding vectors are then added to the time embedding \cite{ho2020denoising} and passed to the network. For the anatomical condition, the gyral/sulcal segmentation mask~\cite{4389763} is concatenated with the input feature map as input to the network. All the conditions are used during training and sampling.


\subsection{Denoising Network in Spherical Domain} 
To align data in a common space, the feature maps and masks are resampled to a standard icosahedron. %Unlike in euclidean domain, there is no native definition of direction in the spherical domain. 
To transfer convolution in image domain, we adapt the convolution method from \cite{Spherical_unet} in the spherical domain, which defines convolution by the 1 ring neighborhood of each vertex.
% The convolution kernel is defined on the 1 ring neighborhood of each vertex on the 6th order icosahedron. For a vertex i on unit icosahedron centered at origin, a direction can be computed as the cross product between the z-axis and the vector pointing from origin to the vertex i, which yields a vector $d$ along the circle of latitude. An ordering of the neighboring vertices can then be defined as the clockwise order of the angles between each vertices and $d$ in the tangent space of the center vertex as shown in Figure \ref{}. 
The network in \cite{Spherical_unet} utilizes neighborhood averaging for pooling and up-pooling, which we empirically find to introduce grid artifacts into the generated samples. Therefore,
%and mimic operations in euclidean domain, 
we employ a different pooling and up-pooling method. Utilizing the natural structure of the icosahedron, the pooling for the $i$-th order is defined as only keeping vertices in the $(i-1)$-th order icosahedron, and up-pooling is the zero padding for vertices added from the $i$-th to the $(i+1)$-th order. Fig.~\ref{fig:network} shows the structure of the network and illustrations of the operations. The network has a standard UNet structure with 2 ResBlocks in each level. Each ResBlock has an additional time embedding input, from a sinusoidal embedding layer + MLP layer, same as in \cite{ho2020denoising}, for denoising at each time step. For memory efficiency, the attention layer is only included in the last two levels.

\begin{figure}[htb!]

\begin{minipage}[b]{1.0\linewidth}
  \centering
  \centerline{\includegraphics[width=\linewidth]{figs_eps/network2.eps}}
%  \vspace{2.0cm}
\end{minipage}
\caption{\textbf{Denoising network structure and operations in spherical domain:} (A) A UNet structure with ResBlock built out of spherical convolution, pooling, up-pooling and attention layers. (B) Illustration of spherical operations.}
 \label{fig:network}
\end{figure}

% \begin{figure}[tb]
%     \centering    
%     \includegraphics[width=\textwidth]{figs_eps/network.eps}

%     \caption{\textbf{Denoising network structure and operations in spherical domain:} (A) A UNet structure with ResBlock built out of spherical convolution, pooling, up-pooling and attention layers. (B) Illustration of spherical operations.}
%     \label{fig:network}
% \end{figure}

\subsection{Sampling for Normative Modeling}
The core idea for our normative modeling is to use sampled feature maps to measure deviation scores as opposed to registered real data. Through the procedures described in previous sections, the model generates $N$ samples per test subject, conditioned on the original cortical feature maps with 500 steps of added noise (a number determined empirically), the individual anatomical segmentation, sex, and age. This step aims to reconstruct diseased feature maps as pseudo-healthy ones while still maintaining the same anatomical structure. For each Region of Interest (ROI) defined by the FreeSurfer output file \textit{aparc.annot}, an abnormal score is computed as in equation~\ref{eq:zscore}.

\begin{equation}
    Z_{i} = \frac{x_{i}-\operatorname{mean}_{j}([x_{(i,1)},\dots,x_{(i,N)}])}{\operatorname{std}_{j}([x_{(i,1)},\dots,x_{(i,N)}])}
    \label{eq:zscore}
\end{equation}
For a test subject, $Z_{i}$ is the abnormal score for the i-th ROI. $x_{i}$ is the mean feature value of the test subject in the i-th ROI. $x_{(i,j)}$ denotes the mean feature value for the i-th ROI of the j-th sample. The abnormal score measures the deviation from normal in each ROI. Additionally, the abnormal scores of 34 ROIs are used as feature in a standard SVM for 10-fold cross validation of CN vs MCI and CN vs AD classification. 

\section{Experiments and Results}

\subsection{Preprocessing and Implementation}
Two public datasets are used in the experiments. 584 subjects are selected from Human Connectome Project (HCP) dataset\cite{HCP}. 9:1 train test split is applied for unconditional task. 646 subjects are selected from the Alzheimer’s Disease Neuroimaging Initiative(ADNI) dataset\cite{MUELLER2005869}, including 482 CN, 82 MCI, and 82 AD patients. 400 CN subjects are used as training set and all others as test set. All the T1 MRI images are processed through FreeSurfer 6.0 \cite{DALE1999179} to extract the cortical surfaces and cortical thickness (CT) map, curvature(curv) and sulcal depth(sulc). Surfaces are registered by FreeSurfer in the spherical domain. The Desikan-Killiany Atlas in FreeSurfer is used for ROI parcellation. The shape index (SI) map and gyral/sulcal segmentation mask are obtained following \cite{4389763}. All feature maps and masks are resampled to a 6th order icosahedron(40962 vertices) using the \textit{mris\_surf2surf} command in FreeSurfer. All feature maps are standardized to 0 mean and std 1. Sex label is set as female:0 and male:1. Age label is scaled to [0,1] range by dividing by 100. For computational costs, the experiments are only conducted on the left hemisphere. 

% \subsection{Implementation Details}
The input to the network is the concatenation of feature maps including CT, SI, Curv, Sulc, and the segmentation mask based on the task. In the unconditional task, we use CT,Curv and Sulc as input. In normative modeling, we use CT,SI,age,sex, and segmentation mask. All feature maps are 40962 length vector. Hidden dimension of each network level is 128,256 and 512. The max timesteps of DDPM is set at 1000. The model is trained with ADAM \cite{kingma2014adam} as optimizer, cosine annealing\cite{loshchilov2016sgdr} as scheduler and a starting learning rate of 1e-5 for a total of 1000 epochs, about 24 hours. The network is implemented using Pytorch and trained on a NVIDIA A5000 GPU.
% Feature maps and segmentation masks are length 40962 vectors. The input to the network is the concatenation of the CT map and SI map with the gyral/sulcal segmentation mask in 1 hot encoding, resulting in a matrix of size 2 $\times$ 4 $\times$ 40962, where 2 is the batch size. Hidden dimension of each network level is 128,256 and 512. The max timesteps of DDPM is set at 1000. The model is trained with ADAM as optimizer, cosine annealing as scheduler and a starting learning rate of 1e-5 for a total of 1000 epochs, about 24 hours. The network is implemented using Pytorch and trained on a NVIDIA A5000 GPU.

\subsection{Generating Ability through FID Score}
To show the superior generative power of our model, we performed the unconditional generative task on the HCP dataset and compared the performance in terms of the FID score~\cite{heusel2017gans} with two other generative models, a Variational Autoencoder (VAE) \cite{kingma2013auto} and a surface transformer based diffusion model~\cite{xie2024corticalsurfacediffusiongenerative}. We use the same structure as our backbone in the VAE by removing the skip connections and adding a fully connected layer between the encoder and decoder. We also included the FID score between test data and training data as a reference. Since FID is designed for 2D images, we embed surface feature maps on fsaverage as snapshots in front and back sagittal views using the jet colormap in Plotly~\cite{plotly}. We use the 192-dimension embedding for FID. Each model generates 200 sets of feature maps, which are compared to the training data by FID score. Examples are shown in Appendix A. Table~\ref{tab:FID} shows that, with the exception of the sulc back view, our model achieved the best FID score among generative models and is closest to the real data. This result demonstrates that our backbone in DDPM is able to generate new feature maps closer to the real data distribution.
\begin{table}[tb]
\makebox[\columnwidth]{
\begin{tabular}{|c|c|c|c|c|c|c|}
\hline
type/FID score & CT(front)$\downarrow$  & CT(back)$\downarrow$& Curv(front)$\downarrow$& Curv(back)$\downarrow$ & Sulc(front)$\downarrow$ & Sulc(back)$\downarrow$\\
% \hline
% Template data & -0.0102 $\pm$ 0.0014 & N/A\\
\hline 
VAE  & 4.2268 & 5.2528 & 1.4641& 2.3283 & 0.5640 & \textbf{0.3503}\\
\hline
SiT Diffusion & 0.4179 & 0.5556& 0.3526& 0.3693 & 0.4169 & 0.9080\\
\hline
Our Model & \textbf{0.3453} & \textbf{0.3103}& \textbf{0.2308}& \textbf{0.2670} & \textbf{0.2351} & 0.4725\\
\hline 
Test data  & 0.0091 & 0.0080 & 0.0077& 0.0064 & 0.0037 & 0.0023\\
\hline
\end{tabular}
}
\caption{\textbf{FID score for each feature map:} The generated feature maps are embedded to 2D images. The FID score is computed between generated sample or test data and train data.}
\label{tab:FID}
\end{table} 

\subsection{Ablation Study for Conditional DDPM}
\begin{table}[tb]
\makebox[\columnwidth]{
\begin{tabular}{|c|c|c|c|c|}
\hline
Model type & SI SSIM$\uparrow$  & SI MSE(mm) $\downarrow$ &  CT SSIM$\uparrow$ & CT MSE(mm) $\downarrow$\\

\hline 
DDPM  & 0.4911 $\pm$ 0.0165 & 0.1461 $\pm$ 0.0064 & 0.3978 $\pm$ 0.0168 & 0.4329 $\pm$ 0.0174\\
\hline
DDPM + mask & \textbf{0.6167 $\pm$ 0.0127} & \textbf{0.1011 $\pm$ 0.0046} & \textbf{0.4903 $\pm$ 0.0224} & \textbf{0.3894 $\pm$ 0.0253}\\
\hline
% Model type & CT SSIM$\uparrow$ & CT MSE(mm) $\downarrow$\\
% % \hline
% % Template data & 0.2311 $\pm$ 0.0136 & N/A\\
% \hline 
% DDPM  & 0.3978 $\pm$ 0.0168 & 0.4329 $\pm$ 0.0174\\
% \hline
% DDPM + mask & \textbf{0.4903 $\pm$ 0.0224} & \textbf{0.3894 $\pm$ 0.0253}\\
% \hline
\end{tabular}
}
\caption{\textbf{Ablation study results:} For both CT and SI, the anatomical condition improves SSIM and MSE.}
\label{tab:ab}
\end{table} 

We performed ablation study for conditional DDPM to demonstrate improvement. 
%The experiment is conducted on the CN subjects' cortical surface. 
Two models are trained on the 400 CN subjects: unconditional DDPM and DDPM with gyral/sulcal segmentation. All test data are first blurred with 500 time steps of noise, then denoised for sampling. Both models include sex and age conditions. From the 82 test CN subjects, the mean Structural Similarity(SSIM) and mean squared error(MSE) between samples and real data are shown in Table \ref{tab:ab}.
% For reference, we also computed the mean SSIM between test subject and 1 randomly selected registered CN subject (Template data) to show that the model can generate better aligned samples as compared to registered real data.
From the ablation study results, we show that the conditioning can indeed improve the sample quality and produce better aligned feature maps. 



\subsection{Normative Assessment on ADNI dataset}

\begin{figure}[htb]

\begin{minipage}[b]{1.0\linewidth}
  \centering
  \centerline{\includegraphics[width=\linewidth]{figs_eps/real_gen_compare.eps}}
%  \vspace{2.0cm}
\end{minipage}
\caption{\textbf{Comparison between real and generated feature maps for CN and AD subjects.} In the figure, CT denotes cortical thickness in units of millimeters (mm). SI denotes shape index. A, B are 2 AD subjects and C, D are 2 CN subjects. The blue circles highlight the temporal region, which highly correlates with AD. A, B demonstrate that our model can infer the normal feature distribution for AD subjects. C, D verify that generated feature maps are similar to real ones for CN subjects, which is expected.}
 \label{fig:real_gen_compare}
\end{figure}

To evaluate our model's ability to reduce heterogeneity arising from anatomical mismatch, we compare normative modeling performance using data generated by our model against that using real data spherically registered with FreeSurfer \cite{DALE1999179}. The conditional DDPM model is trained on the 400 template CN subjects. After training, for each CN, MCI and AD subject in the test set, 10 samples are generated as the DDPM reference set. To ensure a fair comparison, a template reference set was constructed by selecting 10 subjects from the 400 training CN subjects whose ages are closest to that of the test subject. Abnormality scores for each subject per ROI are computed using both reference sets, following equation~\ref{eq:zscore}. The scores were computed using only cortical thickness, which is an accepted biomarker of brain atrophy in AD.



% \begin{figure}
%     \centering    
%     \includegraphics[width=\textwidth]{figs_eps/real_gen_compare.eps}

%     \caption{\textbf{Comparison between real and generated feature maps for CN and AD subjects} In the figure, CT denotes cortical thickness. SI denotes shape index. A, B are 2 AD subjects and C, D are 2 CN subjects. The blue circles highlight the temporal region which highly correlates with AD. A,B demonstrate that our model can infer normal feature distribution for AD subjects. B,C verify that generated feature maps are similar to real ones for CN subject which is expected}
%     \label{fig:real_gen_compare}
% \end{figure}

Qualitative comparison between the real and generated data is illustrated in Fig.~\ref{fig:real_gen_compare}. All feature maps are resampled to the inflated fsaverage surface for visualization. This figure demonstrates that our model, trained on CN subjects, is capable of estimating the normal feature distribution based on AD subjects' individual anatomical structure, particularly in the temporal region, which strongly correlates with AD pathology. Fig.~\ref{fig:norm_test} is a box plot of the mean abnormality scores across cortical ROIs. The statistical differences between CN and MCI, as well as between CN and AD, are quantified using $t$-test $p$-values for both reference sets. Based on the $p$-values, our model exhibits increased power in differentiating CN vs MCI and CN vs AD.
%The comparison between generated AD and generated CN feature map indicates that our model can capture the noraml distribution rather than just shifting the AD distribution \\

\begin{figure}[htb]

\begin{minipage}[b]{1.0\linewidth}
  \centering
  \centerline{\includegraphics[width=\linewidth]{figs_eps/wb_norm_test_wide.eps}}
%  \vspace{2.0cm}
\end{minipage}
\caption{\textbf{Mean abnormal score across whole cortex.} The figure shows the distribution comparison between mean abnormal score of the whole cortex per subject for template and DDPM reference sets.}
 \label{fig:norm_test}
\end{figure}

% \begin{figure}[tb]
%     \centering    
%     \includegraphics[width=\textwidth,height=3cm]{figs_eps/wb_norm_test_wide.eps}

%     \caption{\textbf{Mean abnormal score across whole cortex} The figure shows the distribution comparison between mean abnormal score of the whole cortex per subject for template and DDPM reference sets}
%     \label{fig:norm_test}
% \end{figure}

% \begin{table}[htb!]
% \makebox[\linewidth]{
% \begin{tabular}{|c|c|c|c|}
% \hline

% \multicolumn{4}{|c|}{CN vs AD} \\
% \hline
% Score Type & Accuracy  & Precision & Recall  \\
% \hline
% Template & 0.6882 & 0.6482 & 0.7049 \\
% \hline
% DDPM  & \textbf{0.7128}  & \textbf{0.7182} & \textbf{0.7091}\\
% \hline
% \multicolumn{4}{|c|}{CN vs MCI} \\
% \hline
% Score Type & Accuracy  & Precision & Recall \\
% \hline
% Template & 0.5850 & 0.5850 & 0.5676 \\
% \hline
% DDPM  & \textbf{0.6214}  & \textbf{0.6554} & \textbf{0.6221} \\
% \hline
% \end{tabular}
% }
% \caption{\textbf{Classification of CN vs MCI and CN vs AD.} The table shows the accuracy, precision and recall of 10 fold cross validation using template and DDPM reference sets' abnormal score per ROI as feature for SVM.}
% \label{tab:class}
% \end{table}

\begin{table}[tb]
\makebox[\textwidth]{
\begin{tabular}{|c|c|c|c|c|c|c|c|}
\hline

\multicolumn{4}{|c|}{CN vs AD} & \multicolumn{4}{|c|}{CN vs MCI}\\
\hline
Score Type & Accuracy  & Precision & Recall & Score Type & Accuracy  & Precision & Recall \\
\hline
Template & 0.6882 & 0.6482 & 0.7049 & Template & 0.5850 & 0.5850 & 0.5676\\
\hline
DDPM  & \textbf{0.7128}  & \textbf{0.7182} & \textbf{0.7091} & DDPM  & \textbf{0.6214}  & \textbf{0.6554} & \textbf{0.6221}\\
\hline
\end{tabular}
}
\caption{\textbf{Classification of CN vs MCI and CN vs AD.} The table shows the accuracy, precision and recall of 10 fold cross validation using template and DDPM reference sets' abnormal score per ROI as feature for SVM.}
\label{tab:class}
\end{table}

Additionally, we conduct classification experiments for CN vs MCI and CN vs AD. The abnormal scores are computed for 34 ROIs for all test subjects, which are formatted as length-34 vectors. These vectors are then used as features for classification in a standard SVM classifier. To validate the results, we perform 10-fold cross validation and the accuracy, precision and recall are shown in Table~\ref{tab:class}. In both CN vs AD and CN vs MCI, our abnormal score performs better than that computed from the age-matched template reference set.

\section{Conclusion}
In this paper, we proposed a framework for DDPM model on the spherical domain, conditioned on the anatomical segmentation, sex and age to generate anatomically aligned feature maps. The ablation study and normative tests have shown that our model can generate reliable feature maps on the cortical surface and perform better than registered reference set in AD normative modeling. We will freely distribute our source codes and trained models to the research community and enable researchers to utilize our model for other generative tasks on surfaces beyond normative analyses.

\newpage
\section{Acknowledgments}
\label{sec:acknowledgments}
This work was supported by the National Institute of Health (NIH) under grants RF1AG077578, RF1AG064584, R01EB022744, RF1AG084072, U19AG078109, and P30AG066530.

Data used in preparing this article were obtained from the ADNI database (\url{adni.loni.usc.edu}). As such, many investigators within the ADNI contributed to the design and implementation of ADNI and/or provided data but did not participate in analysis or writing of this report. A complete list of ADNI investigators: \url{http://adni.loni.usc.edu/wp-content/uploads/how_to_apply/ADNI_Acknowledgement_List.pdf}.

Data were provided [in part] by the Human Connectome Project, WU-Minn Consortium (Principal Investigators: David Van Essen and Kamil Ugurbil; 1U54MH091657) funded by the 16 NIH Institutes and Centers that support the NIH Blueprint for Neuroscience Research; and by the McDonnell Center for Systems Neuroscience at Washington University.



\bibliography{midl25_83}
\newpage
\appendix
\section{Example images for FID Score Computation}

\begin{figure}[htb!]

\begin{minipage}[b]{1.0\linewidth}
  \centering
  \centerline{\includegraphics[width=\linewidth]{figs_eps/Appendix_A.eps}}
%  \vspace{2.0cm}
\end{minipage}
\caption{\textbf{Example feature maps embedded into 2D images:} Each row shows the sagittal front and back view of each type of feature, cortical thickness (CT), curvature (curv) and sulcal depth (sulc). Each column denotes the generative model for the feature maps.}
 % \label{fig:network}
\end{figure}

\end{document}
