\documentclass{midl} % Include \usepackage{multirow}
\usepackage{array}
\newcommand{\rpm}{\raisebox{.2ex}{$\scriptstyle\pm$}}
\usepackage{wrapfig}
\usepackage{multirow}
\usepackage{mwe} % to get dummy images
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator{\Tr}{Tr}
\newcommand{\tgreen}[1]{%
	\textcolor{green}{#1}
}%
\usepackage{adjustbox}
%\jmlrvolume{-- Under Review}
%\editors{Under Review for MIDL 2021}
\jmlryear{2021}
\jmlrworkshop{Full Paper -- MIDL 2021}

\title[M-GCN]{M-GCN: A Multimodal Graph Convolutional Network to Integrate Functional and Structural Connectomics Data to Predict Multidimensional Phenotypic Characterizations}

 
\midlauthor{\Name{Niharika S. D'Souza~\nametag{$^{1}$}}, \Name{Mary Beth Nebel~\nametag{$^{2,3}$}}, \Name{Deana Crocetti~\nametag{$^{2,3}$}}, \Name{Joshua Robinson~\nametag{$^{2}$}},  \Name{Stewart Mostofsky~\nametag{$^{2,3,4}$}},  \Name{Archana Venkataraman~\nametag{$^{1}$}}  \\
\addr $^{1}$ Dept. of Electrical and Computer Engineering, Johns Hopkins University, Baltimore, USA \\
\addr $^{2}$ Kennedy Krieger Institute, Johns Hopkins School of Medicine, Baltimore, USA \\
\addr $^{3}$ Dept. of Neurology, Johns Hopkins School of Medicine, Baltimore, USA\\
\addr $^{4}$ Dept. of Psychiatry, Johns Hopkins School of Medicine, Baltimore, USA}

\begin{document}

\maketitle

\begin{abstract}
We propose a multimodal graph convolutional network (M-GCN) that integrates resting-state fMRI connectivity and diffusion tensor imaging tractography to predict phenotypic measures. Our specialized M-GCN filters act topologically on the functional connectivity matrices, as guided by the subject-wise structural connectomes. The inclusion of structural information also acts as a regularizer and helps extract rich data embeddings that are predictive of clinical outcomes. We validate our framework on 275 healthy individuals from the Human Connectome Project and 57 individuals diagnosed with Autism Spectrum Disorder from an in-house data to predict cognitive measures and behavioral deficits respectively. We demonstrate that the M-GCN outperforms several state-of-the-art baselines in a five-fold cross validated setting and extracts predictive biomarkers from both healthy and autistic populations. Our framework thus provides the representational flexibility to exploit the complementary nature of structure and function and map this information to phenotypic measures in the presence of limited training data. 
\end{abstract}

\begin{keywords}
Graph Convolutional Networks, Functional Connectomics, Structural Connectomics, Multimodal Integration, Phenotypic Prediction, Autism Spectrum Disorder
\end{keywords}

\section{Introduction}

Resting-State functional MRI (rs-fMRI) is a stimulus-free acquisition used to track steady-state changes in co-activation (i.e., connectivity) across the brain \cite{lee2013resting}. Complementary to this functional connectivity, Diffusion Tensor Imaging (DTI) captures the directional diffusion of water molecules in the brain as a proxy for structural connectivity \cite{assaf2008diffusion}. There is mounting evidence in the literature that links the functional signaling and structural pathways in the brain  \cite{skudlarski2008measuring}, with several studies suggesting that this functional connectivity may be mediated by either direct or indirect anatomical connections \cite{fukushima2018structure,atasoy2016human}. Consequently, multimodal integration of connectomics data has become an important topic of study, particularly when characterizing neuropsychiatric disorders, such as autism, ADHD, and schizophrenia \cite{liu2015multimodal}. Traditional multimodal analyses of rs-fMRI and DTI data largely focus on group-wise discrimination. Such methods include statistical tests on edge/node biomarkers \cite{hahn2013selectively} to distinguish subgroups in AD, data-driven representations to discriminate schizophrenia patients vs controls \cite{sui2013combination}, and Bayesian models to extract differential networks \cite{venkataraman2011joint,venkataraman2013connectivity,venkataraman2016bayesian}.
While highly informative at the group level, these methods do not directly address inter-individual variability, for example continuous measures of behavior or cognition.

\par The rise of machine learning has prompted a shift in connectomics towards subject-level predictions. This shift has been accelerated by deep learning, which provides unparalleled representational power. The bulk of deep learning methods focus on diagnostic classification. These approaches range from Multi-Layered Perceptrons \cite{heinsfeld2018identification}, Deep Belief Networks  \cite{aghdam2018combination}, to Convolutional Neural Networks \cite{khosla20183d}. Methods to predict finer-grained characteristics (e.g., demographics or behavior) are sparser and largely focus on a single modality. For example, the authors of \cite{kawahara2017brainnetcnn} introduced a convolutional neural network that mapped DTI connectivity matrices to cognitive and motor measures. The work of \cite{lin2016predicting} proposes an artificial neural network for age prediction from structural connectomes. Finally, the work of \cite{d2019integrating} takes the alternative approach of combining a generative dictionary learning framework with a predictive artificial neural network to simultaneously map multiple clinical measures. While these methods achieve good empirical performance, they ignore the interplay between structure and function in the brain. To address this gap, the authors of \cite{d2019integrating} extend their framework to combine dynamic rs-fMRI correlations with DTI tractography using a structurally-regularized matrix decomposition \cite{d2020deep}. While promising, this method does not provide explicit control over the extent to which multi-hop (indirect) structural connections mediate functional connectivity.

\par Graph neural networks are designed to build representations of nodes and edges within graph structured data, and have found applications in a variety of domains where data naturally assumes a network-like organization \cite{zhou2018graph}. These architectures have shown great promise for modeling multi-stage interactions between brain regions that also reflect the hierarchy of brain organization. Hence, these techniques have become important tools in brain connectivity research. Examples include: modeling dynamic functional connectivity for groupwise discrimination \cite{gadgil2020spatio}, diagnosis of neurodevelopmental disorders \cite{anirudh2019bootstrapping,parisot2018disease} from rs-fMRI correlation inputs, or structural connectivity modeling for disease classification \cite{song2019graph}. However, current approaches do not leverage the complementarity between the structural and functional graphs or examine dimensional measures of behavior beyond diagnostic classification. We propose a multimodal graph convolutional network (M-GCN) to integrate functional and structural connectivity from rs-fMRI and DTI data respectively, and map this information to phenotypic measures. We employ specialized graph convolutional filters based on \cite{kipf2016semi,kawahara2017brainnetcnn} that operate on functional connectivity inputs, as guided by the subject-level structural graph topology. We demonstrate that our framework generalizes to prediction of phenotypic measures on two separate real world datasets and learns to extract predictive brain biomarkers from limited data.

\section{Multimodal Graph Convolutional Network for Connectomics }
\begin{figure}[t!]
   \centering
   \includegraphics[width=\dimexpr \textwidth-20\fboxsep-20\fboxrule\relax]{Figures/M-GCN.PNG}
   \caption{\small{Our M-GCN framework for predicting phenotypic measures. \textbf{Green Box:}~Graph Convolutional Model for Representation Learning from Multimodal Connectomics Data. \textbf{Blue Box:}~Fully Connected Artificial Neural Network to map to phenotypic measures.}}  \label{M-GCN}
\end{figure}

Fig.~\ref{M-GCN} illustrates our graph convolutional framework, which consists of a representation learning module on the connectomics data (Green Box) cascaded with a fully connected ANN for regression (Blue Box). Let $N$ be the number of patients and $P$ be the number of regions in our brain parcellation. For each patient $n$, our framework first extracts the structural connectivity graph $\mathcal{G}_{n} = (\mathcal{V},\mathcal{E}_{n})$ from DTI tractography. The nodes in $\mathcal{V}$ are brain ROIs defined by the parcellation, while the edges in $\mathcal{E}_{n}$ indicate the presence of at least one fiber tract between these regions. Let $\mathbf{A}_{n} \in \mathcal{R}^{P \times P}$ be the adjacency matrix for $\mathcal{G}_{n}$. Correspondingly, we assume that the functional connectivity profile is a signal that rides on the fixed graph montage and is given by rs-fMRI correlation matrices $\mathbf{\Gamma}_{n} \in \mathcal{R}^{P\times P}$.

\par Traditional convolutional layers assume a spatial contiguity of the input features, as in the case of 2-D images. This assumption breaks down in general graphs, as node orderings may be arbitrary. Thus, graph convolutional networks define a layer-wise propagation rule designed to aggregate information efficiently at each node based on the underlying graph topology \cite{bruna2013spectral, kipf2016semi}. For a generic input signal $\mathbf{X}^{l-1} \in \mathcal{R}^{P \times C_{l-1}}$, a graph filtering operation can be formulated as follows:
\begin{equation} \label{eqn:graphfilt}
    \mathbf{X}^{l} = \mathbf{\phi}(\mathbf{L}\mathbf{X}^{l-1}\mathbf{W}) = \mathbf{\phi}(\tilde{\mathbf{D}}^{-\frac{1}{2}}\tilde{\mathbf{A}}\tilde{\mathbf{D}}^{-\frac{1}{2}}\mathbf{X}^{l-1}\mathbf{W} )  \ \ \ \text{where} \ \ \ \tilde{\mathbf{A}} = \mathcal{I}_{P}+\mathbf{A}; \ \tilde{\mathbf{D}}_{ii} = \sum_{j}\tilde{\mathbf{A}}_{ij}\ \ 
\end{equation}
where $\mathbf{W}\in\mathcal{R}^{C_{l-1}\times C_{l}}$ denotes the filter weights, $\mathcal{I}_{P}$ is an identity matrix of dimension $P$, and $\mathbf{L} = \tilde{\mathbf{D}}^{-\frac{1}{2}}\tilde{\mathbf{A}}\tilde{\mathbf{D}}^{-\frac{1}{2}}$ is the graph Laplacian of the reparameterized adjacency matrix $\tilde{\mathbf{A}}$ and degree matrix $\tilde{\mathbf{D}}$. The authors of \cite{kipf2016semi} demonstrate that Eq.~(\ref{eqn:graphfilt}) is a first order approximation to spectral filtering in the graph Fourier domain.

Inspired by Eq.~(\ref{eqn:graphfilt}), we define a graph filtering operation that acts on the input functional connectivity matrix $\mathbf{\Gamma}_{n}$ to generate a connectivity embedding $\mathbf{H}_{n}^{1,m} \in \mathcal{R}^{P \times P}$ as follows:
\begin{equation}
    \mathbf{H}_{n}^{1,m}(i,j) = \mathbf{\phi}\Big(   (\mathbf{w}_{r}^{m})^{T} \mathbf{L}_{n}\mathbf{\Gamma}_{n} (:,j) +  \mathbf{\Gamma}_{n}(i,:) \mathbf{L}_{n}\mathbf{w}_{c}^{m} + \mathbf{b}^{1}\Big)  \ \ \ \ m \in \{1, \dots, M\}
    \label{eqn:Input_layer}
\end{equation}
Here, $M$ is the number of channels, each parametrized by a row and column filter $\mathbf{w}^m_{r},\mathbf{w}^m_{c} \in \mathcal{R}^{P\times 1}$ and a bias term $\mathbf{b}^{1} \in \mathcal{R}^{P\times 1}$, resulting in a total of $(2P+1)$ learnable parameters per channel. Effectively, $\mathbf{H}_{n}^{1,m}(i,j)$ computes a weighted sum of the functional connectivity profile of nodes $i$ and $j$, further regularized by the DTI graph Laplacian $\mathbf{L}_{n}$. Conceptually, Eq.~(\ref{eqn:Input_layer}) is similar to the cross shaped E2E filters in \cite{kawahara2017brainnetcnn}. We also note that, despite the symmetry of the correlation matrices $\mathbf{\Gamma}_{n}$, the embedding $\mathbf{H}_{n}^{1,m}$ can be asymmetric. This allows us to account for any laterality in functional subsystems.

\par Following the connectome embedding in Eq.~(\ref{eqn:Input_layer}), we use two more graph convolutional layers with pooling to first compute a node-wise representation
$\mathbf{H}_{n}^{2} \in \mathcal{R}^{P\times 1}$ and a whole-graph embedding $\mathbf{H}_{n}^{3} \in \mathcal{R}^{D\times 1}$. Mathematically, these operations can be represented as:
\begin{equation}
    \mathbf{H}_{n}^{2} = \mathbf{\phi}\Big( \sum_{m} \mathbf{L}_{n}\mathbf{H}^{1,m}_{n} \mathbf{f}^{m} + \mathbf{b}^{2}\Big) \qquad \qquad \mathbf{H}_{n}^{3} = \mathbf{\phi}\Big( \mathbf{G}  \mathbf{L}_{n}\mathbf{H}^{2}_{n}  + \mathbf{b}^{3}\Big)
    \label{eqn:Sub_layers}
\end{equation}

The filter weights are parameterized by the vectors $\mathbf{f}^{m} \in \mathcal{R}^{P\times 1}$ for each of the $M$ channels, the graph embedding matrix $\mathbf{G} \in \mathcal{R}^{D \times P}$, and the bias terms $\mathbf{b}^{2}$ and $\mathbf{b}^{3}$ respectively. In total, these layers add another $(M+D)P +2$ learnable parameters. Eq.~(\ref{eqn:Sub_layers}) parallels the computation of centrality measures in graph theoretic literature by summarizing node-wise information based on functional similarity, as guided by structure. Finally, our graph embedding $\mathbf{H}^{3}_{n}$ is input to an ANN to map to the phenotypic measures $\mathbf{y}_{n} \in \mathcal{R}^{S\times 1}$ for patient $n$. The ANN is a simple three layered fully connected network of sizes $D\times K_{1}$, $ K_{1} \times K_{2}$ and $K_{2} \times S$.

\paragraph{\textbf{Implementation Details:}}
We train our M-GCN on a combination of $\ell_{2}$ loss and $\ell_{1}$ loss between the predicted $\hat{\mathbf{y}}_{n}$ and true measures ${\mathbf{y}}_{n}$:
\begin{equation}
\mathcal{L} = \frac{1}{NS} \sum^{N}_{n=1}\Big[ {\vert\vert{\mathbf{y}_{n}-\hat{\mathbf{y}}_{n}}\vert\vert}_{2} + {\vert\vert{\mathbf{y}_{n}-\hat{\mathbf{y}}_{n}}\vert\vert}_{1}\Big]
\end{equation}
The $\ell_{1}$ loss function has been shown to be more robust to outliers as compared to the $\ell_{2}$ loss \cite{qi2020mean}, but less stable during training due to the lack of smoothness near the optimal solution \cite{friedman2001elements}. We found that this combined loss empirically provided a good tradeoff between stability and generalization. Layer sizes for the M-GCN were set to $M=32$ channels for the connectome embedding, $D=256$ for the graph embedding and $\{K_{1},K_{2}\} = \{128,30\}$, as we found these choices to be sufficient to map the connectomics data to the phenotypic measures during training. We chose a LeakyReLU ($\phi(x) = \max(0,x) + 0.1\cdot\min(0,x)$) as the activation function with our network layers, which we found empirically robust to saturation and exploding gradients during training. We train our M-GCN via the stochastic gradient descent (SGD) algorithm with momentum ($\delta=0.9$), batch size = $16$, with an initial learning rate of $0.001$ decayed by $0.9$ every 10 epochs. Additionally, we utilize a weight decay of $0.001$ as regularization and train our network for $40$ epochs to avoid overfitting. All parameters were determined based on a validation set of $30$ additional patients from the HCP dataset. We carried forward the same settings to the second KKI dataset.
\subsection{Baselines}
We compare the predictive performance of our network against the following baselines:

\medskip \noindent
\textbf{Multimodal ANN:} We use a four layer ANN that maintains the same number of parameters, activation, and loss function as the M-GCN. It operates on the vectorized $P\times(P-1)/2$ rs-fMRI correlations, each multiplied by the corresponding entry of the DTI Laplacian $\mathbf{L}_{n}$. This baseline evaluates the benefit of maintaining the graph structure of the data.

\medskip \noindent
\textbf{rs-fMRI only GCN:} We use the same architecture as our M-GCN but omit the graph Laplacian in Eqs.~(\ref{eqn:Input_layer}-\ref{eqn:Sub_layers}). This baseline evaluates the benefit of DTI regularization.

\medskip \noindent
\textbf{BrainNetCNN:} We integrate multimodal connectivity data via the BrainNetCNN \cite{kawahara2017brainnetcnn}, originally designed to predict cognitive outcomes from DTI data. We modify this architecture to have two branches, one for the rs-fMRI correlation matrices $\mathbf{\Gamma}_{n}$, and another for the DTI Laplacians $\mathbf{L}_{n}$. The ANN is modified to output $S$ measures of clinical severity. We set the hyperparameters according to \cite{kawahara2017brainnetcnn}.

\medskip \noindent
\textbf{Dictionary Learning + ANN:} The integrated framework in \cite{d2019integrating} uses static rs-fMRI correlation matrices ($\mathbf{\Gamma}_{n}$) to simultaneously predict multiple clinical or behavioral measures. The model combines a dictionary learning generative term with a neural network predictor. The two blocks are optimized jointly in an end-to-end fashion.

\medskip \noindent
\textbf{Dynamic Deep-Generative Hybrid:} The framework in \cite{d2020deep} uses a similar joint optimization strategy but operates on dynamic rs-fMRI correlation matrices $\{\mathbf{\Gamma}^{t}_{n}\}$ and incorporates a DTI regularizer in the dictionary learning term. Overall, these last two baselines evaluate the benefit of GCNs for implicit representational learning over a classical decomposition strategy. We have followed the guidelines provided by the authors to set the hyperparameters and train both of these baselines.
%% Adding plots to appendix

%%\tgreen{Section 2 is missing the hyperparamter tuning. Hopefully, this was done on a separate validation set. Or you have a good argument about why a validation set is not needed.}
%% (Yes, I fixed this now)
\section{Experimental Evaluation and Results}
\subsection{{Datasets and Pre-processing}}
\paragraph{\textbf{HCP Dataset}:} Our first dataset contains $275$ healthy individuals from the Human Connectome Project (HCP) S1200 database \cite{van2013wu}. Rs-fMRI and DTI scans are acquired on a Siemens $3$T scanner (\textbf{rs-fMRI:} EPI, TR/TE = $720/33.1$ms, flip angle = $52^{\circ}$, res = $2$mm$^3$, duration = $1200$ time samples per run; \textbf{DTI}: EPI, SENSE factor = $1$, TR/TE = $5520/89.5$ms, res = $1.25\times 1.25 \times 1.25$mm, b-value = $1000/2000/3000$s/mm$^{2}$ interleaved, with $95/96/96$ gradient directions respectively). To remain commensurate with clinical scanning protocols, we selected a $15$-minute interval from the rs-fMRI scans for our analysis. Rs-fMRI data was pre-processed according to the standard HCP pipeline \cite{smith2013resting}, which accounts for motion and physiological confounds. DTI data was processed using the standard Neurodata MR Graphs package \cite{kiar2016ndmg}, which uses streamline tractography to estimate fiber bundles. Our phenotypic measure was the Cognitive Fluid Intelligence Score (CFIS) \cite{duncan2005frontal, bilker2012development} adjusted for age, which is obtained via a battery of tests measuring cognitive reasoning (dynamic range:~$70$--$150$).

\begin{table*}[b!]
\centering
\small
{
\begin{tabular}{|c|c|c|c|c|c|} 
\hline 
  \textbf{Meas.} &\textbf{Method}  & \textbf{MAE Test}  & \textbf{NMI Test} & \textbf{R Stat.} & \textbf{p}\\ 
\hline 
  \multirow{6}{*}{CFIS} 
 & Mult. ANN & 14.06~\rpm~{10.16}  & 0.61 & 0.23 & 0.065\\
 & rs-fMRI only GCN & 14.16~\rpm~{8.96} &  0.54 & 0.23 & ${0.044}^{*}$\\
 & BrainNetCNN & 17.90~\rpm~{17.55} & 0.58  & 0.25 & ${0.0015}^{*}$\\
 & Dict. Learn.~+~ANN  & 15.26~\rpm~{13.99}  & 0.66 & 0.29 & ${0.024}^{*}$ \\
 & Dyn. Deep-Gen. Hyb. & 16.31~\rpm~{15.43} & 0.67  & 0.30 & ${0.0043}^{*}$ \\
 & \textbf{Our Framework}  & \textbf{12.87~\rpm~{9.65}}  & \textbf{0.73} & \textbf{0.41} & -\\
\hline
\end{tabular}
}
{\caption{\small{\textbf{HCP Dataset:} Evaluation using the \textbf{Median Absolute Error (MAE)}, \textbf{Normalized Mutual Information (NMI)} and \textbf{R Statistic} for the test set. Best performance is highlighted in bold. Near misses are underlined. We report the p value (\textbf{p}) for differences in the distribution of the test MAE of the M-GCN against the baselines via the t test. $*$ denotes $p<0.05$.}}\label{table:HCP}}
\end{table*} 

\paragraph{\textbf{KKI Dataset}:} Our in-house clinical dataset was acquired at the Kennedy Krieger Institute. It consists of $57$ children with high-functioning ASD. Rs-fMRI and DTI scans were acquired on a Philips $3$T Achieva scanner (\textbf{rs-fMRI:} EPI, TR/TE = $2500/30$ms, flip angle = $70^{\circ}$, res = $3.05\times3.15\times3$mm, duration = $128$ or $156$ time samples; \textbf{DTI}: EPI, SENSE factor = $2.5$, TR/TE = $6356/75$ms, res = $0.8\times 0.8 \times 2.2$mm, b-value = $700$s/mm$^{2}$, $32$ gradient directions). Our rs-fMRI preprocessing includes motion correction, normalization to the MNI template, spatial and temporal filtering, and nuisance regression with CompCor \cite{behzadi2007component}. We use the FDT pipeline in FSL to pre-process the DTI scans \cite{jenkinson2012fsl}. Tractography is performed using the BEDPOSTx and PROBTRACKx functions in FSL \cite{behrens2007probabilistic}. We use three separate clinical batteries to characterize various impairments associated with ASD. The Autism Diagnostic Observation Schedule (ADOS) \cite{payakachat2012autism} measures socio-communicative deficits and restricted/repetitive behaviors via a behavioral evaluation (dynamic range: $0$--$30$). The Social Responsiveness Scale (SRS) \cite{payakachat2012autism} quantifies impaired social functioning via a parent/teacher questionnaire (dynamic range: $70$--$200$). Finally, Praxis \cite{dziuk2007dyspraxia,mostofsky2006developmental} measures the ability to perform skilled motor gestures on command and is scored by two research-reliable raters (dynamic~range:~$0$--$100$).

\par For both datasets, we use the Automatic Anatomical Labeling (AAL) atlas \cite{tzourio2002automated} to define $116$ cortical, sub-cortical and cerebellar brain ROIs for both the functional and structural connectivity matrices. We also subtract the first eigenvector from the rs-fMRI correlation matrices, which is a roughly constant bias, and use the residual matrices as the inputs to all models.

\subsection{Performance Characterization.}

\paragraph{\textbf{Predicting CFIS:}}
Table 1 (and Fig.~2 in the appendix) compares the performance of our method and the baselines for predicting CFIS on the HCP dataset in a five-fold cross-validated setting. We quantify the performance via the Median Absolute Error (MAE), the Normalized Mutual Information (NMI) and the Coefficient of Correlation (R Stat.) between the actual and predicted measures. Lower MAE and higher NMI/R Stat. indicate better performance. The training performance is good for all methods. However, the M-GCN clearly outperforms the baselines when generalizing to unseen testing data. As a benchmark, our validation performance (Test MAE: $13.41~\rpm~8.17$, NMI Test: $0.71$, R: $0.42$) also provides similar generalization.

\paragraph{\textbf{Multidimensional Clinical Severity Prediction:}} Table \ref{table:KKI} (and Fig.~2 in the appendix) compares the multi-output prediction performance of ADOS, SRS, and Praxis on the KKI dataset for a five-fold cross validation. Again, we observe that the M-GCN outperforms the baselines for the prediction of all three severity measures in almost every case. Note that, from a clinical standpoint, generalization to prediction of multiple deficits is inherently more challenging than predicting a single phenotypic measure. This also partially accounts for the poor performance of some of the baselines, where they perform reasonably well for the prediction of one of the measures (for example, the rs-fMRI only GCN for ADOS), but at the expense of generalization onto the other two measures. Overall, our experiments on two different real world datasets point to reproducibility and suggest that the M-GCN generalizes effectively even with modest training sample sizes. Moreover, the performance gains over the rs-fMRI only GCN baseline (i.e., the M-GCN without the DTI) indicate the benefit provided by the multimodal integration via our graph convolutional framework.
\begin{table*}[t]
\centering
\small{{
\begin{tabular}{|c |c | c| c| c| c |} 
\hline 
  \textbf{Meas.} &\textbf{Method} & \textbf{MAE Test} & \textbf{NMI Test}  & \textbf{R Stat.} &\textbf{p}\\  
\hline 
  \multirow{6}{*}{ADOS} 
 & Mult. ANN & 2.96~\rpm~{2.30} & 0.30 & 0.04  & $0.041^{*}$\\
 & rs-fMRI only GCN &  3.14~\rpm~{2.25} &  0.41 & 0.16 & $0.002^{*}$\\
 & BrainNetCNN  & 3.50~\rpm~{2.20}  & 0.25 & 0.41 & $0.009^{*}$\\
 & Dict. Learn. + ANN  & \textbf{2.71~\rpm~{2.40}}  & 0.43 & \textbf{0.50} & 0.20 \\
 & Dyn. Deep-Gen. Hyb.  & 2.84~\rpm~{2.79}  & 0.34 & 0.47 & 0.10\\
 & \textbf{Our Framework} & \textbf{2.71~\rpm~{2.15}}  & \textbf{0.45} & \textbf{0.50}  & - \\
[0.2ex]  
\hline
\multirow{6}{*}{SRS} 
 & Mult. ANN  & 18.47~\rpm~{11.04} & 0.60 & 0.03 & $0.033^{*}$ \\
 & rs-fMRI only GCN  & 21.34~\rpm~{8.58} & 0.62 & 0.16 & $0.019^{*}$\\
 & BrainNetCNN  & 18.96~\rpm~{15.65} & 0.75 & 0.13 & $0.039^{*}$\\
 & Dict. Learn. + ANN  & 16.79~\rpm~{13.83} & \textbf{0.89}  & \textbf{0.37}& 0.13 \\
 & Dyn. Deep-Gen. Hyb.  & 17.81~\rpm~{16.09}  & \underline{0.88} & 0.30 & 0.073\\
 & \textbf{Our Framework} & \textbf{16.50~\rpm~{9.44}} & \underline{0.85} & \underline{0.35} & -\\ [0.2ex]
 \hline
\multirow{6}{*}{Praxis} 
 & Mult. ANN  & 17.12~\rpm~{16.66}  & 0.65 & 0.25 & $0.008^{*}$\\
 & rs-fMRI only GCN & 16.71~\rpm~{16.66} & 0.74 & 0.17 & $0.019^{*}$\\
  & BrainNetCNN & 15.15~\rpm~{11.49}  & 0.19 & 0.3 & $0.024^{*}$
 \\
 & Dict. Learn. + ANN  & 13.19~\rpm~{10.75} & 0.82 & 0.37 & 0.15\\
 & Dyn. Deep-Gen. Hyb. & 13.50~\rpm~{11.55} & \underline{0.85} & 0.31 & 0.089\\
 & \textbf{Our Framework} & \textbf{12.82~\rpm~{12.04}} & \textbf{0.86} & \textbf{0.46} &  - \\ [0.2ex]
 \hline
\end{tabular}
}}
\caption{\small{{\textbf{KKI Dataset:} Evaluation using the \textbf{Median Absolute Error (MAE)}, \textbf{Normalized Mutual Information (NMI)} and \textbf{R Statistic} for the test set. Best performance is highlighted in bold. Near misses are underlined. We report the p value (\textbf{p}) for differences in the distribution of the test MAE of the M-GCN against the baselines via the t test. $*$ denotes $p<0.05$.}}\label{table:KKI}
}
\end{table*} 


\paragraph{\textbf{Extracting Clinical Biomarkers:}} The representations learned by the row and column filter pairs $\mathbf{w}_{r}$ and $\mathbf{w}_{c}$ at the input layer of the M-GCN (i.e., Eq.~(\ref{eqn:Input_layer})) may illuminate key biomarkers for each population. We first match the filter pairs across the cross validation folds based on the average correlation coefficient between the row and column filter weights. Fig.~\ref{HCP-MGCN_filts}  illustrates four filter pairs out of 32 that appear most frequently across subsets of the HCP and KKI datasets. In each case, we plot the average row filter (RF) and column filter (CF) weights projected onto the corresponding regions of the AAL atlas. Compared with the filters learned by the rs-fMRI only GCN (Appendix Fig.~3), the DTI regularization in the M-GCN offers sparsity and better spatial selectivity in the patterns captured.
\begin{figure}[t!]
 \centerline{\includegraphics[width=\dimexpr \textwidth-36\fboxsep-36\fboxrule\relax]{Figures/HCP-MGCN.PNG}}
{\caption{\small{Four pairs of row \& column filter weights learned by the M-GCN on the (a) HCP dataset and (b) KKI dataset. The colorbar quantifies the filter weight for each AAL ROI.} }\label{HCP-MGCN_filts} }
\end{figure}
\par For the HCP dataset (Fig.~\ref{HCP-MGCN_filts} (a)), we observe that RF1, RF2, CF1 and CF2 display contributions from regions of the Default Mode Network (DMN), which is known to play a critical role in consolidating working memory \cite{sestieri2011episodic} and is widely inferred within the resting state literature. RF3 and CF3 highlight regions of the Frontoparietal Network (FPN) and the Medial Prefrontal Network (MPN), believed to play a role in working memory, attention and decision making, which are associated with cognitive intelligence \cite{menon2011large}. CF4 highlights regions from the Somatomotor Network (SMN) while RF4 includes subcortical and cerebellar regions. Together, these are believed to be important functional biomarkers of cognitive intelligence in the literature \cite{chen2019resting}. For the KKI dataset (Fig.~\ref{HCP-MGCN_filts} (b)), we observe that RF1, CF1, CF2 and CF4 highlight areas from the DMN and SMN. Altered connectivity within these regions is widely reported in the ASD literature \cite{nebel2016intrinsic}. RF3, RF4 and CF4 also highlight contributions from the higher order visual processing areas and sensorimotor regions, which are in line with findings of reduced visual motor integration in Autism \cite{nebel2016intrinsic}. RF3, RF4 and CF4 also display contributions from subcortical regions along with the prefrontal cortex and DMN, which are believed to be relevant to social-emotional regulation in ASD \cite{pouw2013link}.

\section{Conclusion}
We have introduced a novel multimodal graph convolutional framework to leverage complementary information from functional and structural connectivity. Our M-GCN is designed to effectively utilize the underlying anatomical pathways to learn rich representations from functional connectivity data that are simultaneously informative of multidimensional phenotypic characterizations. We demonstrate that this framework is able to learn effectively from limited training data and generalize well to unseen patients. Finally, our framework makes minimal assumptions, and can potentially be applied to study other neuro-psychiatric disorders (e.g., ADHD, schizophrenia) as a diagnostic tool.


\midlacknowledgments{This work is supported by the National Science Foundation CRCNS award 1822575 and CAREER award 1845430, the National  Institute  of Mental Health (R01 MH085328-09, R01 MH078160-07, K01 MH109766 and R01 MH106564), the National Institute of Neurological Disorders and Stroke (R01NS048527-08), and the Autism Speaks foundation.}


{{\bibliography{Dsouza21}}}


\end{document}
