\documentclass{midl} % Include author names
%\documentclass[anon]{midl} % Anonymized submission

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{multirow}
\usepackage{graphicx}
\graphicspath{ {./images/} }
\jmlryear{2021}
\jmlrworkshop{Full Paper -- MIDL 2021}
\jmlrvolume{-- Under Review}
\editors{Under Review for MIDL 2021}

\title[Diffusion u-net]{Learning the Latent Heat Diffusion Process through Structural Brain Network from Longitudinal Amyloid-$\beta$ Data}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Md Asadullah Turja\nametag{$^{1}$}} \Email{mturja@cs.unc.edu}\\
\Name{Guorong Wu\nametag{$^{2}$}} \Email{guorong\_wu@med.unc.edu}\\
\Name{Defu Yang\nametag{$^{2}$}} \Email{defu@email.unc.edu}\\
\Name{Martin Styner\nametag{$^{1,2}$}} \Email{styner@email.unc.edu}\\
\addr $^{1}$ Department of Computer Science, University of North Carolina at Chapel Hill, USA. \\
\addr $^{2}$ Department of Psychiatry, University of North Carolina at Chapel Hill, USA.
}


\begin{document}

\maketitle

\begin{abstract}
The excessive deposition of misfolded proteins such as amyloid-$\beta$~(A$\beta$) protein is an aging event underlying several neurodegenerative diseases. Mounting evidence shows that the spreading of neuropathological burden has a strong association to the white matter tracts in the brain which can be measured using diffusion-weighted imaging and tractography technologies. Most of the previous studies analyze the dynamic progression of amyloid using cross-sectional data which is not robust to the heterogeneous A$\beta$ dynamics across the population. In this regard, we propose a graph neural network-based learning framework to capture the disease-related dynamics by tracking the spreading of amyloid across brain networks from the subject-specific longitudinal PET images. To learn from limited (2 – 3 timestamps) and noisy longitudinal data, we restrict the space of amyloid propagation patterns to a latent heat diffusion model which is constrained by the anatomical connectivity of the brain. Our experiments show that restricting the dynamics to be a heat diffusion mechanism helps to train a robust deep neural network for predicting future time points and classifying Alzheimer's disease brain.
\end{abstract}

\begin{keywords}
Heat Diffusion, Florbetapir PET, Structural Brain Network, Alzheimer's Disease,  Graph Neural Network, Graph Embedding, Graph u-net, Amyloid Progression Pattern, Amyloid-$\beta$ Pathology.
\end{keywords}

\section{Introduction}
Accumulation of amyloid-$\beta$ (A$\beta$) peptide into extracellular plaques is known to be a central event in the Alzheimer's disease brain \citep{LUE, Palop2010, Jack, fan2018, Canter116244}. An increasing number of studies \citep{Seeley, Jonkmane532} suggest that amyloid deposition expands into areas that receive neuronal projections from other brain regions already exhibiting A$\beta$, spreading between interconnected neurons through large-scale networks. Advancement in A$\beta$-PET imaging enables us to quantify the spatial distribution of amyloid in the brains of living humans, allowing for the study of disease progression before symptoms manifest \citep{Villemagne}. \citet{Raj2012} propose a network diffusion model to characterize this progression pattern as a diffusive mechanism mediated by the brain's connectivity network which finds characteristic sub-modules in the Alzheimer's~(AD) brain. However, their approach doesn't consider misfolded protein~(MP) burdens such as amyloid or tau. In this regard, \citet{Vogel2020} propose an epidemic spreading model based on anatomical connectivity of the brain to explain the observed MP burdens in healthy and diseased~(AD) brains. Both of these approaches show the importance of the large-scale brain network in modeling disease progression based on MP such as A$\beta$. However, these approaches use cross-sectional data which is not sufficient to uncover the true dynamics of amyloid because of the subject-wise heterogeneity across the population. Although the authors of the epidemic spreading model reconstruct the MP burdens in a subject-specific manner, they have to tune their model for each individual during test time which might lead to overfitting and potentially distort the true dynamics.

We propose \textit{Diffusion u-net}: a graph u-net \citep{Gao2019GraphU} based architecture that uses longitudinal data to learn the disease dynamics in a subject-wise manner. However, the complex mechanics of A$\beta$ through brain networks is difficult to learn from a dataset that is noisy and has limited timestamps per subject~($<3$ on average). We tackle this problem by constraining our model to learn a latent heat diffusion process that explains the neurodegenerative progression by the intercellular transfer of A$\beta$. We assume the observed longitudinal amyloid distributions are future instances of the heat diffusion mechanism constrained by the structural brain network. This way our model learns in a subset of possible disease processes which makes it possible to infer patterns from limited temporal data. Moreover, while previous approaches assume a predefined set of epicenter brain regions~(For example, \citet{Vogel2020} fit their model using the left and right entorhinal cortex as the epicenters) from where A$\beta$ spreads to other brain areas, we show our model can find these epicenters from higher-order dynamics of A$\beta$ deposition. Extensive experimental results show that our model is quite effective in learning a robust embedding from only a few time point data and outperforms more complex models in terms of future A$\beta$ prediction, Alzheimer's classification, and disease epicenter estimation.

\begin{figure}[ht]
\centering
\includegraphics[width=\linewidth]{images/main_reg.png}
\caption{(a) Our proposed model \textbf{Diffusion u-net} consists of two neural networks --- graph u-net and $\beta$-net to estimate future A$\beta$ depositions using Equation~\ref{eq:int}, (b) \textbf{Cluster u-net} learns group specific dynamics using an additional classifier to classify sex (Male/Female) based on $\boldsymbol{\rho}_{s, 0}$, and, (c) \textbf{Adaptive u-net} learns separate group specific representation $\boldsymbol{\rho}_{s, 0}^M$ and $\boldsymbol{\rho}_{s, 0}^F$ using a multi-task learning paradigm.}
\label{fig:Model Architecture}
\end{figure}


\section{Method and Materials} \label{methodology}
\subsection{Dataset} \label{exp:dataset}
Our dataset consists of longitudinal amyloid data processed from 1312 subjects from the Alzheimer's Disease Neuroimaging Initiative~(ADNI) from five different diagnosis groups --- Normal Control (CN), Significant Memory Concern~(SMC), Early Mild Cognitive Impairment~(EMCI), Late Mild Cognitive Impairment~(LMCI), and Alzheimer's Disease~(AD) --- with 316, 157, 357, 260, and 222 subjects respectively. Each of the subjects has PET images at 1--6 time points (2.3 on average) with a gap of 6--30 months between consecutive time points. The PET scans are first registered to T1-weighted MR images and then standardized uptake values (SUV) are computed for each voxel. Region-specific SUV values are computed by partitioning the cortical surface into 148 regions by using the Destrieux atlas in FreeSurfer \citep{fs} and computing the average SUV for all the voxels of that brain region. The ratio of the SUV values (SUVR) is computed using the SUV of the cerebellum as the reference. We then normalize the SUVR based on the batch-wise min-max normalization approach. We compute the structural connectivity matrices from dMRI data of 103 subjects using FSL's probtrackx \citep{probtrackx} (two-fiber model, 10000 tracks per seed) to obtain $148 \times 148$ connectivity matrices. We use the average white matter connectivity networks from the CN group in our experiments which is more robust to spurious noises in our data compared to subject-wise networks.
\subsection{Method}
Let $\boldsymbol \Phi_s = [\boldsymbol \Phi_{s, 0}, \cdots, \boldsymbol \Phi_{s, T_s}]$ be a matrix for the subject $s$ where each column $\boldsymbol \Phi_{s, t} = [\phi_{s, t} ^ v]$ represents the observed longitudinal A$\beta$ values for the brain region $v$ at different time-points $t$ and $T_s$ is the number of observations of $s$. We consider $\boldsymbol \Phi_{s, t}$ as a graph signal on the structural brain network $\mathcal{G}(\mathcal{V}, \mathcal{E})$ which is an undirected weighted graph with $v \in \mathcal{V}$ corresponding to the regions of interest (ROIs) from the Destrieux parcellation \citep{fs} and $e_{ij} \in \mathcal{E}$ corresponding to the edge that connects two ROIs $i$ and $j$ with weight $w_{ij}$ proportional to the number of white matter fibers between $i$ and $j$.
Let $\boldsymbol \rho_{s} = \frac{d \boldsymbol \Phi_s}{dt} = [\boldsymbol \rho_{s, 0}, \cdots, \boldsymbol \rho_{s, T_s}]$ be the net amyloid generation rate at each ROI which characterizes the dynamics of the disease progression within the brain. $\boldsymbol \rho_s$ exhibits a non-linear dynamics due to inter-regional transfer of amyloid between two regions connected by white matter pathways. We model this dynamics by assuming that $\boldsymbol \rho_s$ follows the heat diffusion equation in Equation~\ref{eq:headdiffeq} which is in accordance with the prion-like hypothesis in neurodegenerative diseases \citep{Palop2010, Frost2010}.
\begin{equation}\label{eq:headdiffeq}
\frac{d \boldsymbol \rho_{s}}{dt} = - \beta \mathcal{L} \boldsymbol \rho_{s}
\end{equation}
Here, $\beta$ is a scalar which controls the speed of the amyloid progression through the structural network and $\mathcal{L}$ is the graph Laplacian operator with,
\begin{equation}
    \mathcal{L}_{ij} = 
    \begin{cases}
    \sum_{j'} w_{ij'}\textrm{, if } i = j \\
    -w_{ij} \textrm{, if } i \neq j\textrm{ and } i\textrm{ adjacent to } j \\
    0 \textrm{, otherwise}
    \end{cases}
\end{equation}

\textbf{Prediction of future A$\beta$, $\boldsymbol \Phi'_{s}$}: To estimate the future amyloid depositions $\boldsymbol \Phi'_{s,t}$ at time $t$, we first compute $\boldsymbol \rho_{s, t}$ from a learned $\boldsymbol \rho_{s, 0}$ using the heat diffusion equation (Equation \ref{eq:headdiffeq}) and aggregate $\boldsymbol \rho_{s,t}$ over time using Equation~\ref{eq:accu}.
\begin{equation} \label{eq:accu}
    \boldsymbol \Phi'_{s,t} = \int_0^t \boldsymbol \rho_{s, \tau} d\tau
\end{equation}
Equation~\ref{eq:headdiffeq} has a closed form solution which is, $\boldsymbol \rho_{s,t} = e ^ {- \beta \mathcal{L} t} \boldsymbol \rho_{s,0} = \mathcal{U}e^{-\beta \Lambda t} \mathcal{U}^T \boldsymbol \rho_{s,0}$. Here $\mathcal{U} = [u_1, \cdots, u_{|\mathcal{V}|}]$ and $\Lambda = \textrm{diag}(\lambda_1, \cdots, \lambda_{|\mathcal{V}|})$ come from the eigendecomposition $\mathcal{L} = \mathcal{U}\Lambda \mathcal{U}^T$. Substituting $\boldsymbol \rho_{s,t}$ into Equation~\ref{eq:accu} and simplifying, we get our longitudinal estimate:
\begin{equation} \label{eq:int}
    \boldsymbol \Phi'_{s, t} = \sum_{i=1}^{|\mathcal{V}|} \frac{1}{\beta \lambda_i} (1 - e^{-\beta \lambda_it})u_i^T \boldsymbol \rho_{s,0} u_i
\end{equation}
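Since the graph Laplacian $\mathcal{L}$ always has a zero eigenvalue, the coefficient $\frac{1}{\beta \lambda_i} (1 - e^{-\beta \lambda_i t})$ for $\lambda_i = 0$ is understood as its limit $t$. Equation~\ref{eq:int} has two unknown variables --- $\beta$ and $\boldsymbol \rho_{s,0}$. We train \textbf{Diffusion u-net} which consists of the following two neural network models to estimate those variables from baseline A$\beta$ values $\boldsymbol \Phi_{s,0}$.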

\textbf{Graph u-net}: We train a graph u-net model which takes $\boldsymbol \Phi_{s, 0}$ as input and predicts its derivative $\boldsymbol \rho_{s,0}$. The u-net architecture has a depth of 4 with graph convolution layers of 64 units. The pooling layers in u-net select the top 50\% of the ROIs based on learned ranking.

\textbf{$\beta$-net}: The inter-regional amyloid transfer speed $\beta$ can be affected by several risk factors of Alzheimer's disease \citep{Raffaitin2009, Solfrizzi433, Campbell2013} such as (1) age --- amyloid progression slows down over time towards a plateau \citep{Villemagne_amy}, (2) sex --- interaction between tau and amyloid is stronger in women resulting in a higher risk of developing symptoms of the serious brain disease \citep{jamaneurol}, (3) apolipoprotein E (APOE) $\epsilon$4 allele --- A$\beta$ deposition rate is higher in APOE-$\epsilon$4 carriers \citep{Kanekiyo2014}, and (4) education level --- people with a lower level of education are known to be less resistant against Alzheimer's \citep{Ngandu2007}. In this regard, we train a multilayer perceptron with 2 hidden layers of 256 units to predict $\beta$ from the aforementioned risk factors which account for the heterogeneous dynamics of A$\beta$ across the population.

Finally, we propose two more models (Figure~\ref{fig:Model Architecture}(b) and (c)) which account for the heterogeneity between female and male groups in the population by learning sex-specific representations $\boldsymbol{\rho}_{s, 0}^F$ and $\boldsymbol{\rho}_{s, 0}^M$ respectively. The two models are ---

\textbf{Cluster u-net}: Here we aim to cluster the embedding $\boldsymbol\rho_{s,0}$ based on the sex information (Male/Female). We train a multilayer perceptron (3 hidden layers with 256 units each) to classify $\boldsymbol\rho_{s,0}$ based on sex. In this way, the u-net learns an embedding that is informative of the group. The u-net and the classifier are then jointly trained to reconstruct $\boldsymbol \Phi'_s$.

\textbf{Adaptive u-net}: We then took a different approach based on the multi-task learning paradigm which learns an adaptive embedding for each group. Here we consider the male and female groups as separate tasks and learn separate embeddings ($\boldsymbol{\rho}_{s, 0}^M$ and $\boldsymbol{\rho}_{s, 0}^F$ respectively) for them. To do this, we use our graph u-net as a base architecture and fork two different branches, one for each of the groups. Each branch consists of a multilayer perceptron with 1 hidden layer of $64$ units which is trained using the data from the corresponding group.

\textbf{Training}: With $\boldsymbol \rho_{s,0}$ from graph u-net and $\beta$ from $\beta$-net, we estimate the future amyloid depositions, $\boldsymbol \Phi'_{s,0}, \cdots, \boldsymbol \Phi'_{s,T_s}$ using Equation~\ref{eq:int}. We then jointly train both of these neural networks by minimizing the following loss function:
\begin{equation} \label{eq:recon_loss}
    \mathcal{L}_{recon} = \sum_{s \in S} \Bigl(\sum_{v \in \mathcal{V}} \sum_{t=0}^{T_s} (\phi'^v_{s, t} - \phi_{s, t}^v)^2 + (I + \mathcal{L})^{-1} \boldsymbol \rho_{s,0}\Bigr)
\end{equation}

The first term is the mean-squared error between $\boldsymbol \Phi'_s$ and $\boldsymbol \Phi_s$. The second term is a regularization term on $\boldsymbol \rho_{s,0}$ \citep{diff_distance} which is used to stabilize the training during the initial epochs by enforcing the model to learn embedding with lower entropy, i.e., a higher imbalance between neighboring regions.
\subsection{Experimental Setup}
We evaluate our method by doing a series of experiments on three different tasks --- reconstruction, classification, and estimation of A$\beta$ epicenters.
\subsubsection{Reconstruction}
We compare the accuracy of the reconstructed A$\beta$ values from our model with the following models:

\textbf{Stationary model}: We assume the A$\beta$ values are constant, i.e., there is no change in amyloid deposition over time.

\textbf{Linear model}: Here we assume a linear progression of amyloid without any diffusion through the structural brain network. This is equivalent to assuming constant $\boldsymbol{\rho}_{s,t}$ in Equation~\ref{eq:accu}, i.e., $\boldsymbol{\rho}_{s,t}=\boldsymbol{\rho}_{s,0}=\boldsymbol{m}_s$ which results in $\boldsymbol \Phi'_{s, t}=\boldsymbol{m}_s t + \boldsymbol{\Phi}_{s, 0}$. We consider two models in this regard --- \textbf{Linear-shared} where $\boldsymbol m_s$ represents the shared slope for each ROI for all the subjects, and \textbf{Linear-subject} where we learn subject-specific $\boldsymbol{m}_s$ from $\boldsymbol{\Phi}_{s,0}$ as a linear combination, i.e., $\boldsymbol m_s = \boldsymbol W \boldsymbol{\Phi}_{s,0}$. Here $\boldsymbol W$ is a $|\mathcal{V}| \times |\mathcal{V}|$ weight matrix which is learned using stochastic gradient descent.

\textbf{GRU}: Here we learned the non-linear dynamics from a gated recurrent unit (GRU) \citep{cho-etal-2014-learning} based architecture instead of our heat diffusion equation. We use the same graph u-net based architecture as ours to learn a hidden embedding $\boldsymbol \rho_{s,t}$ for each time point $t$ and then feed them to a GRU to predict the future amyloid depositions.
\subsubsection{Classification}
We show the effectiveness of our predicted $\boldsymbol \Phi'_s$ by a series of classification experiments. We train 5 different classifiers which classify between CN and AD group based on $\boldsymbol \Phi'_s$ from Adaptive u-net. We compare the classification result by training all of these models on the ground truth A$\beta$ deposition $\boldsymbol \Phi_s$. The models are:

$\boldsymbol{v_1}$: This is a simple random forest classifier trained using only the baseline A$\beta$ depositions ($\boldsymbol \Phi_{s,0}$ and $\boldsymbol \Phi'_{s, 0}$).

$\boldsymbol{v_2}$: This model is a graph convolution based classifier (\textbf{GCNClassifier}) with 2 graph convolutional blocks with size 64 \citep{Kipf:2016tc} followed by a multilayer perceptron with 3 hidden layers of size 256. This model is trained using the baseline A$\beta$ depositions ($\boldsymbol \Phi_{s,0}$ and $\boldsymbol \Phi'_{s, 0}$) to minimize the cross-entropy loss between predicted and ground-truth diagnosis label.

$\boldsymbol{v_3}$: Unlike $\boldsymbol{v_2}$, in this model we use the longitudinal data ($\boldsymbol \Phi_{s}$ and $\boldsymbol \Phi'_{s}$) to predict the diagnosis label (CN/AD). First, we learn an embedding from the longitudinal data using a recurrent neural network with 2 stacked gated recurrent units (GRU) with hidden unit size 148 and then feed this embedding through our GCNClassifier.

$\boldsymbol{v_4}$: In this version, instead of using a GRU, we utilize the longitudinal data by taking the minimum ($\boldsymbol{\Phi'}_s^{min}$) and maximum ($\boldsymbol{\Phi'}_s^{max}$) A$\beta$ values across all the time points for each brain region and feed them to the GCNClassifier (Min-Max Classifier). The input $\boldsymbol{\Phi'}_s^{mnx}$ to the model is the concatenation of the minimum and maximum values, i.e., $\boldsymbol{\Phi'}_s^{mnx} = [\boldsymbol{\Phi'}_s^{min}, \boldsymbol{\Phi'}_s^{max}]$.

$\boldsymbol{v_5}$: In all the previous models, the input of the classifier $\boldsymbol \Phi'_{s, 0}$  comes from the pretrained Adaptive u-net. In this model, we first pretrain Adaptive u-net with data from all the diagnosis groups and then jointly train this Adaptive u-net and $\boldsymbol{v_4}$ for only CN and AD group. We minimize the sum of reconstruction loss from Equation~\ref{eq:recon_loss} and cross-entropy loss for joint training.
\subsubsection{Epicenter estimation}
A$\beta$ deposition starts at certain epicenters in the Alzheimer's brain from where it spreads to neighbouring brain regions. It is known that A$\beta$ load increases almost in a linear fashion until it plateaus exhibiting non-linear dynamics \citep{Jack2013BrainL, Villemagne_amy}. This means the epicenters should exhibit higher order dynamics ($\boldsymbol{\Phi}^1_s = \frac{d \boldsymbol{\Phi}_{s,0}}{dt}$, $\boldsymbol{\Phi}^2_s = \frac{d \boldsymbol{\Phi}^1_{s}}{dt}$ etc.) compared to other brain regions. In this experiment, we use these higher order dynamics to find the epicenters from where the A$\beta$ spreads to the brain. Our diffusion u-net model learns to estimate a graph signal by reconstructing $\boldsymbol{\Phi}_{s, 0}$ from $\boldsymbol{\rho}_{s, 0}$ using Equation~\ref{eq:accu} which enforces $\boldsymbol{\rho}_{s, 0} = \frac{d \boldsymbol{\Phi}_{s, 0}}{dt}$. This means we can treat our model as a differential operator of a graph signal. We recursively compute higher order dynamics using $\boldsymbol{\Phi}^{l+1}_s = \frac{d \boldsymbol{\Phi}^{l}_s}{dt}$ where $\boldsymbol{\Phi}^{0}_s = \boldsymbol \Phi_{s, 0}$ by feeding $\boldsymbol{\Phi}^l_s$ to our model until $\boldsymbol{\Phi}^{l}_s = \boldsymbol{0}$. We then define a diffusion distance measure $\boldsymbol d_{s} = [d_s^v]$ for each brain region $v$ by simply counting how many higher order derivatives $v$ has and normalizing it using the min-max normalization approach across all subjects. A higher $d_{s}^v$ value indicates a higher likelihood of being an epicenter.
\section{Results}
\subsection{Reconstruction} \label{exp:recon}
Table~\ref{tab:r_2} shows the $r^2$ values between predicted A$\beta$ values $\boldsymbol \Phi'_{s,t}$ and ground truth A$\beta$ values $\boldsymbol \Phi_{s,t}$ across all the ROIs for all our reconstruction models. Our experiments show that the stationary model can explain 79\% of the variance of the longitudinal A$\beta$ data. Linear-shared and Linear-subject models improve the performance by $2\%$ and $7\%$ respectively which shows the effectiveness of learning subject-specific dynamics. Surprisingly, the non-linear GRU model performs worse~($r^2=0.84$) compared to the Linear-subject model~($r^2=0.86$). We hypothesize that it's challenging for complex neural networks to learn the hidden dynamics due to the low signal-to-noise ratio in our dataset. In this regard, our model Diffusion u-net replaces the GRU with a much simpler non-linear process (Equation~\ref{eq:int}) which outperforms the aforementioned models with $r^2=0.88$. Moreover, our results in Table~\ref{tab:r_2} show that the performance of all the models is significantly worse for the female subjects in the Alzheimer's group. Cluster u-net and Adaptive u-net attempt to solve this problem by learning sex-specific representations. While Cluster u-net doesn't show any improvement, Adaptive u-net significantly improves the performance for both males and females across all the diagnosis groups with 2\% improvement ($r^2=0.90$) overall and 4\% improvement ($r^2=0.56$) for the female-AD group compared to our original Diffusion u-net.
\begin{table}[htbp]
 % The first argument is the label.
 % The caption goes in the second argument, and the table contents
 % go in the third argument.
\floatconts
  {tab:r_2}%
  {\caption{$r^2$ values across different diagnosis groups}}%
  {\begin{tabular}{|p{1.7cm}|p{1.5cm}|p{1.1cm}|p{1.1cm}|p{1.1cm}|p{1.1cm}|p{1.1cm}|p{1.4cm}|}
  \hline
  \bfseries Model & \bfseries Sex & \bfseries CN & \bfseries SMC & \bfseries EMCI & \bfseries LMCI & \bfseries AD & \bfseries Overall\\
  \hline
  \multirow{2}{5em}{Stationary} & Female & 0.74 & 0.61 & 0.73 & 0.60 & 0.51 & \multirow{2}{2em}{0.79}\\
  & Male & 0.83 & 0.55 & 0.79 & 0.77 & 0.82 &\\
  \hline
  \multirow{2}{5em}{Linear-shared} & Female & 0.76 & 0.64 & 0.72 & 0.61 & 0.52 & \multirow{2}{2em}{0.81}\\
  & Male & 0.84 & 0.62 & 0.80 & 0.78 & 0.82 &\\
  \hline
  \multirow{2}{5em}{Linear-subject} & Female & 0.82 & 0.71 & 0.85 & 0.72 & 0.52 & \multirow{2}{2em}{0.86}\\
  & Male & 0.86 & 0.70 & 0.84 & 0.82 & 0.86 &\\
  \hline
  \multirow{2}{5em}{GRU} & Female & 0.79 & 0.65 & 0.83 & 0.60 & 0.21 & \multirow{2}{2em}{0.84}\\
  & Male & 0.86 & 0.54 & 0.84 & 0.73 & 0.85 &\\
  \hline
  \multirow{2}{5em}{Diffusion u-net} & Female & 0.84 & 0.73 & 0.88 & 0.74 & 0.52 & \multirow{2}{2em}{0.88}\\
  & Male & 0.88 & 0.78 & 0.88 & 0.82 & 0.88 & \\
  \hline
  \multirow{2}{5em}{Cluster u-net} & Female & 0.84 & 0.74 & 0.88 & 0.75 & 0.53 & \multirow{2}{2em}{0.88}\\
  & Male & 0.87 & 0.79 & 0.87 & 0.81 & 0.89 &\\
  \hline
  \multirow{2}{5em}{Adaptive u-net} & Female & 0.87 & 0.77 & 0.91 & 0.78 & 0.56 & \multirow{2}{2em}{\textbf{0.90}}\\
  & Male & 0.92 & 0.89 & 0.93 & 0.86 & 0.96 &\\
  \hline
  \end{tabular}}
\end{table}
\subsection{Classification} \label{exp:cls}
The results of the classification experiments are listed in Table~\ref{tab:roc}. From the results we find that $\boldsymbol{v_1}$ and $\boldsymbol{v_2}$ don't show any significant difference in performance between $\boldsymbol \Phi_s$ and $\boldsymbol \Phi'_s$. In $\boldsymbol{v_3}$, the performance for both $\boldsymbol{\Phi}_s$ and $\boldsymbol{\Phi'}_s$ is again similar and they outperform our previous models $\boldsymbol{v_1}$ and $\boldsymbol{v_2}$. This shows that our model doesn't lose any temporal information pertinent for diagnosis. Interestingly, in $\boldsymbol{v_4}$ we see a significant difference in performance where $\boldsymbol \Phi'_s$ beats $\boldsymbol \Phi_s$ by nearly 2\% ($0.9241$ vs.\ $0.9061$) in terms of ROC value. Minimum and maximum A$\beta$ values are highly sensitive to spurious noises in data. We maintain that our proposed method reduces such noises by restricting the dynamics in a heat diffusion mechanism.
However, jointly training the Adaptive u-net with the classifier in $\boldsymbol{v_5}$ doesn't show any significant performance boost compared to $\boldsymbol{v_4}$.
\begin{table}[htbp]
 % The first argument is the label.
 % The caption goes in the second argument, and the table contents
 % go in the third argument.
\floatconts
  {tab:roc}%
  {\caption{Micro ROC values for CN/AD classification}}
  {\begin{tabular}{|c|c|c|c|}
  \hline
  \bfseries Model & \bfseries Model Description & \bfseries Input & \bfseries ROC\\
  \hline
  \multirow{2}{1em}{$v_1$} & \multirow{2}{8em}{Random Forest Classifier} & $\boldsymbol \Phi_{s,0}$ & 0.8855\\
   & & $\boldsymbol \Phi'_{s,0}$ & 0.8857\\
  \hline
  \multirow{2}{1em}{$v_2$} & \multirow{2}{8em}{GCNClassifier} & $\boldsymbol \Phi_{s,0}$ & 0.8853\\
   & & $\boldsymbol \Phi'_{s,0}$ & 0.8872\\
  \hline
  \multirow{2}{1em}{$v_3$} & \multirow{2}{8em}{GRU + GCNClassifier} & $\boldsymbol \Phi_s$ & \textbf{0.9304}\\
   & & $\boldsymbol \Phi'_s$ & 0.9288\\
  \hline
  \multirow{2}{1em}{$v_4$} & \multirow{2}{8em}{Min-Max + GCNClassifier} & $\boldsymbol \Phi_s^{mnx}$ & 0.9061\\
   & & $\boldsymbol \Phi'^{mnx}_s$ & 0.9241\\
  \hline
  $v_5$ & Adaptive u-net + $v_4$ & $\boldsymbol \Phi'_s$ & 0.9289\\
  \hline
  \end{tabular}}
\end{table}
\subsection{Epicenter estimation} \label{sec:diff_dist}
The group-wise average of $\boldsymbol d_s$ for different diagnosis groups (CN, AD, etc.) shows that the amyloid starts at the temporal lobe (\textit{Anterior transverse collateral sulcus}, \textit{Temporal pole}) and then spreads through the limbic lobe (\textit{Subcallosal area}) and the frontal lobe (\textit{Medial orbital sulcus, Orbital sulci}), and finally expands into the parietal lobe and the occipital lobe (Figure~\ref{fig:diffusion_distance}). These results align with the current findings of cortical AD progression \citep{smith02, dickerson, Wu2021}.
\begin{figure}[ht]
\centering
\includegraphics[width=\linewidth]{images/diff_dist_final.png}
\caption{(a) The dynamics of diffusion distance ($\boldsymbol d_s$) of the right hemisphere from EMCI to AD group are shown where darker color indicates higher diffusion distances. (b) Average diffusion distances for the right hemisphere of the LMCI group are organized by the lobes~(marked with different color bars) of the brain. The peak distances in the temporal lobe indicate epicenter brain regions from where A$\beta$ spreads to other brain regions.}
\label{fig:diffusion_distance}
\end{figure}
\section{Conclusion} \label{sec:conclusion}
Understanding the progression dynamics of Alzheimer's disease is a major factor in the preclinical stages of the disease. In this paper, we propose a graph neural network-based model to learn these dynamics from longitudinal amyloid data in a subject-wise manner. By constraining these dynamics to a heat diffusion process, our model can learn robust dynamics from a few time point data which shows effectiveness in future amyloid prediction, Alzheimer's disease classification, and, disease epicenter prediction.

\midlacknowledgments{This work is supported by the National Institutes of Health (NIH) (grant numbers AG068399, AG059065, AG049089), and Foundation of Hope.}

\bibliography{Turja21}

\appendix
\section{Implementation Details}
This section provides the details of the neural network architectures in this work.

\subsection{Diffusion u-net}
We have two different neural networks in our model: 1. Graph u-net (Figure~\ref{fig:u-net}), and, 2. $\beta$-net. The graph u-net has a depth of 4. The graph convolution layers inside the u-net have 64 units and the graph pooling layer selects top 50\% of the nodes based on the learned ranking. The $\beta$-net is a multilayer perceptron with two hidden layers each having 256 units. For training we split the dataset into training (70\%), validation (15\%), and, evaluation (15\%) folds and jointly train both of these networks for 700 epochs with a learning rate = 1e-6, dropout rate = 0.01, activation = Exponential linear unit (ELU) and batch size = 32.

\begin{figure}[ht]
\centering
\includegraphics[width=\linewidth]{images/gunet.png}
\caption{Graph u-net architecture}
\label{fig:u-net}
\end{figure}

\subsection{GCNClassifier}
The GCNClassifier (Figure~\ref{fig:gcncls}) has two convolutional blocks from \citet{Kipf:2016tc} each having 64 units. The output of these convolutional blocks is then concatenated and fed into a multilayer perceptron with 3 hidden layers each having 256 units. The classifier is trained for 500 epochs using the hyper-parameters: learning rate = 1e-5, dropout rate = 0.01, activation = Exponential linear unit (ELU) and batch size = 32.
\begin{figure}[ht]
\centering
\includegraphics[width=\linewidth]{images/GCNClassifier.png}
\caption{Graph convolution based classifier}
\label{fig:gcncls}
\end{figure}
\end{document}