\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{floatrow}
\usepackage{chngcntr}
\usepackage{booktabs}
\counterwithin{figure}{section}
\usepackage{caption}
\captionsetup{justification=justified, singlelinecheck=false, format=plain, font=small}

\usepackage{microtype}

\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\jmlrvolume{-- 318}
\editors{Accepted for publication at MIDL 2026}
\title[CSVR]{CSVR: Combined Surface and Volume Registration for Neonatal Brain MRI}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicated cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Saga N.B. Masui\nametag{$^{1,2}$}} \orcid{0009-0008-7547-8996} \Email{saga.masui@kcl.ac.uk}\\
\Name{Yourong Guo\nametag{$^{1,2}$}} \Email{yourong.guo@kcl.ac.uk}\\
\Name{Mohamed A. Suliman\nametag{$^{1}$}} \Email{mohamed.suliman@kcl.ac.uk}\\
\Name{Mattias P. Heinrich\nametag{$^{3}$}} \Email{mattias.heinrich@uni-luebeck.de}\\
\Name{Nashira Baena\nametag{$^{2}$}} \Email{nashira.baena@kcl.ac.uk}\\
\Name{Irina Grigorescu\nametag{$^{2}$}} \Email{irina.grigorescu@kcl.ac.uk}\\
\Name{Logan Z. J. Williams\nametag{$^{1,2}$}} \Email{logan.williams@kcl.ac.uk}\\
\Name{Ashleigh Davies\nametag{$^{1,6}$}}
\Email{ashleigh.davies@kcl.ac.uk}\\
\Name{Vanessa Kyriakopoulou\nametag{$^{1,2}$}} \Email{vanessa.kyriakopoulou@kcl.ac.uk}\\
\Name{Gráinne McAlonan\nametag{$^{4}$}} \Email{grainne.mcalonan@kcl.ac.uk}\\
\Name{Jonathan O'Muircheartaigh\nametag{$^{2,4,5}$}} \Email{jonathanom@kcl.ac.uk}\\
\Name{Emma C. Robinson\nametag{$^{1,2,5,6}$}} \Email{emma.robinson@kcl.ac.uk}\\
\addr $^{1}$ Research Department of Biomedical Computing, School of Biomedical Engineering and Imaging Sciences, King's College London, London, SE1 7EH, UK \\
\addr $^{2}$ Centre for the Developing Brain, School of Biomedical Engineering and Imaging Sciences, King's College London, London, SE1 7EH, UK \\
\addr $^{3}$ Institute of Medical Informatics, Universität zu Lübeck, Germany \\
\addr $^{4}$ Forensic and Neurodevelopmental Sciences, King's College London, London, SE5 8AF, UK \\
\addr $^{5}$ MRC Centre for Neurodevelopmental Disorders, King's College London, London, SE1 1UL, UK \\
\addr $^{6}$ Research Department of Early Life Imaging, School of Biomedical Engineering and Imaging Sciences, King's College London, London, SE1 7EH, UK
}
\begin{document}

\maketitle


\begin{abstract}
Nonlinear image registration is a cornerstone of neuroimaging analysis, supporting both qualitative and quantitative comparisons of brain structures across individuals and over time. While traditional volumetric registration methods, driven by voxel intensities, achieve good alignment of subcortical regions, they generally fail to capture correspondences between highly convoluted and variable cortical shapes. Surface-based methods, which instead regularise mappings as geodesics along the cortical sheet, yield improved cortical alignment but ignore the subcortical domain, limiting their utility for whole-brain analyses. A unified registration framework would address these limitations to enable integrated analysis of cortical and subcortical structures and the neuronal fibres that connect them. However, achieving this is challenging, since matching heterogeneous cortical shapes implies large volumetric displacements local to the cortex. To overcome these challenges, we introduce CSVR, the first deep learning-based framework for combined surface–volume registration of neonatal MRI. By integrating hierarchical registration strategies with discrete optimisation, CSVR achieves accurate, smooth, and anatomically plausible alignment of the entire brain. 
\end{abstract}

\begin{keywords}
Image registration, Surface-based cortical registration, Discrete optimisation, Volumetric registration, Neuroimaging
\end{keywords}

\section{Introduction} 
%Image registration is fundamental to medical imaging pipelines, as it supports qualitative and quantitative comparisons of biological structures across populations and/or over time. 
Accurate spatial alignment of medical images is essential for both cross-sectional comparison of anatomical structures and longitudinal tracking of morphological changes, making image registration a foundational component of medical imaging pipelines. Yet for neuroimaging, the heterogeneity of human cortical morphology contradicts the core principles of classical image registration frameworks \citep{glasser2016multi, amunts2007cytoarchitecture}, which assume that all images can be diffeomorphically mapped to a common coordinate space where analogous anatomical structures overlap. In reality, cortical folds vary across individuals in terms of their number, branching and orientations  \citep{guo2025motifs, thompson1996three, ono1990atlas, guillon2024identification} and since this variability is often entirely natural and unrelated to disease or cognition, it translates into experimental noise that confounds downstream analysis.

One way of addressing this problem has been to selectively compare individual brains with others that share similar patterns of cortical folding \citep{meng2016discovering, duan2017exploring}. Taking this logic further, there has also been a move toward hierarchical registration frameworks \citep{guo2025motifs, ahmad2019deep, dong2018efficient} that incrementally align brains of increasingly dissimilar shapes by e.g. clustering individuals into groups that share common anatomies; then using these groups to generate a \textit{family} of templates that represent key modes of shape variation; before gradually registering subjects through this hierarchy until all brains are brought into a single reference space \citep{guo2025motifs}.

Since matching and aligning complex shapes in 3D is a highly ill-posed problem, this clustering approach can be made more tractable through the use of spherical registration \citep{suliman2022deep,suliman2025unsupervised,robinson2014msm, robinson2018multimodal, yeo2009spherical}, which simplifies the problem of cortical alignment to one of matching feature maps defined over smooth 2D spheres \citep{fischl1999high, drury1996computerized}. In this way, spherical image registration methods %\citep{robinson2014msm, robinson2018multimodal, fischl} 
have repeatedly demonstrated their capacity for improving cortical shape and/or cytoarchitectonic correspondence, relative to volumetric approaches \citep{glasser2016multi,coalson2018impact}, to support more precise characterisation of cortical asymmetries \citep{williams2023structural, meyer2014cortical, raznahan2011does}, neurodevelopmental growth \citep{garcia2018dynamic, da2025differential}, and functional organisation \citep{glasser2016multi, yeo2011organization, kong2019spatial}. 


However, while spherical mapping approaches may improve cortical alignment, they come at the cost of losing geometric correspondence to the rest of the brain, meaning that surface and volume analyses are typically run in disconnected template spaces %such as fsaverage and MNI-ICBM152 
that are not aligned and cannot be directly compared \citep{wu2018accurate}. These challenges disproportionately impact structural connectivity studies which seed from the cortex but project through white matter volumes.

A holistic framework for combined surface and volume mapping could address these limitations. Yet, achieving this is non-trivial due to the challenge of diffusing spherical warps---that allow sulci and gyri to overlap---into the volume without inducing extreme distortions. Previous solutions have attempted to make this task tractable by either regularising volumetric warps to preserve spherical correspondences \citep{ahmad2019surface}, or by optimising very slow iterative processes that diffuse the warp into the volume \citep{postelnicu2008combined} with prohibitively long ($\sim$17-hour) run-times. Other methods limit processing to within-subject longitudinal alignment \citep{gibson2009combined} or alignment of twins (that share similar brain shapes) \citep{lepore2010new}. 

One increasingly cited solution for non-convex image matching problems has been to use discrete optimisation, which reframes image matching as a combinatorial optimisation problem, in which the optimal displacement field must be selected from a quantised set of possible options. By constraining deformations in this way, these algorithms are able to reject locally appealing but globally inconsistent configurations, to model large and highly deformable transformations while maintaining anatomically plausible warps \citep{robinson2014msm,robinson2018multimodal,besenczi2024high,glocker2008dense}. Recent advances in deep discrete registration (e.g., DDR \citep{suliman2022deep}; GeoMorph \citep{suliman2025unsupervised}; PDD \citep{heinrich2019closing}) take this one step further to combine the geometric robustness of such discrete formulations with the efficiency of deep feature extractors, making them particularly well-suited to coupled cortical and volumetric alignment. 


\noindent \textbf{Contributions:} This paper presents a holistic framework for cluster-based deep-discrete surface and volumetric registration (CSVR) for precise anatomical alignment of individual human brains. It integrates deep discrete registration (DDR) \citep{suliman2022deep} of cortical surfaces with probabilistic dense displacement (PDD) \citep{heinrich2019closing} of MRI volumes within a single jointly-optimised framework, aligning whole-brain MRI more precisely than either surface or volume registration alone. We validate on neonatal data and show that the resulting warps are smooth and biologically plausible, and generalise robustly to unseen examples. In particular, CSVR performs well on all subjects, even those with more heterogeneous morphologies, where traditional volumetric methods show limited success, and outperforms the classical adult baseline CVS.


\section{Methods}
Let $M = \{\mathbf{x}^M\}$ and $T = \{\mathbf{x}^T\}$ denote affinely aligned moving and target brain volumes, with $\mathbf{x}^M, \mathbf{x}^T \in \mathbb{Z}^3$ representing voxel coordinates. Let corresponding cortical surfaces be represented by mesh pairs $\partial M=(\mathcal{A}_M,\mathcal{S}^2_M)$ and $\partial T=(\mathcal{A}_T,\mathcal{S}^2_T)$ of corresponding anatomical and spherical surfaces that share vertex correspondences. In each case, anatomical surfaces $\mathcal{A}_M=\{\mathbf{v}^M_i \in \mathbb{R}^3\}_{i=1}^{N_v}$ and $\mathcal{A}_T=\{\mathbf{v}^T_i \in \mathbb{R}^3\}_{i=1}^{N_v}$ correspond to the inner cortical, or white matter, boundary defined from their corresponding volumes, from which spheres $\mathcal{S}^2_M=\{\mathbf{s}^M_i \in \mathbb{R}^3\}_{i=1}^{N_v}$ and $\mathcal{S}^2_T=\{\mathbf{s}^T_i \in \mathbb{R}^3\}_{i=1}^{N_v}$ are generated through learning-based spherical projection \cite{ma2025developing}. 

\subsection{Deep Discrete Alignment}
Deep discrete alignment frames image registration as a classification problem in which displacements—for a low-resolution control point (CP) grid—are learnt through offering each CP $\{ c_i \}^{N_c}_{i=1}$ a choice of target locations $\{ l_i \}^{N_l}_{i=1}$ to deform to (\figureref{fig:discrete}). For DDR, the control point grid and label points are defined from vertices derived from regularly tessellated icospheres. For PDD, the control point grid and label points are defined in 3D. Unlike classical discrete optimisation frameworks \citep{glocker2008dense,robinson2014msm,robinson2018multimodal,heinrich2015multi}, the label space is made as wide as computationally possible such that the final deformation field may be classified in one shot. Regularisation is imposed through deep conditional random fields (CRFs) that enforce spatial smoothness by encouraging neighbouring control points to deform to similar target locations \citep{zheng2015conditional}.


\begin{figure}[h!]
\floatconts {fig:discrete} 
{\caption{Control grids and label spaces of discrete registration in the volumetric (left) and spherical (right) domains.}} {\includegraphics[width=1\linewidth]{images/discrete.png}}\vspace{-3ex} 
\end{figure}

\subsection{Proposed Framework}
\vspace{-0.5ex}
Our overall goal is to estimate a mapping such that both internal intensities of MRI volumes and cortical surface features (and geometry) are aligned. CSVR (\figureref{fig:csvr}) achieves this via:

\begin{figure}[b!]
\centering
\includegraphics[width=\linewidth]{images/CSVR.png}
\caption{The CSVR framework: Moving and target sulcal depth maps are projected onto a regular icosphere and input to DDR to produce a deformed input sphere $\mathcal{S}'^2_M$. This spherical warp is used to project the moving anatomical mesh $\mathcal{A}_M$ onto the target anatomy $\mathcal{A}_T$ to return a deformed anatomy $\mathcal{A}_M'$ (with the shape of the target but topology of the input). Subtracting $\mathcal{A}_M$ from $\mathcal{A}_M'$ yields a sparse 3D displacement field $\{\mathbf{d}_i\}_{i=1}^{N_v}$, which is converted into a dense warp $\mathbf{u}_s$ through differentiable Gaussian interpolation, then inverted to pull back intensities from the moving volume onto the target grid (as per convention), generating a deformed and resampled input volume $M'$, which is used as input to PDD. This process is jointly optimised to learn a combined surface-and-volume transformation.}
\label{fig:csvr}
\end{figure}

\vspace{-1.6em}
\subsubsection{Surface-driven alignment (DDR)}
\label{sec:ddr}
Spherical alignment is performed using a pretrained DDR network from \citep{guo2025motifs}, that learns a spatial transformation $\phi_s: \mathcal{S}^2_M \rightarrow \mathcal{S}^2_T$ that optimises overlap of sulcal depth functions ($f^M$, $f^T$) over three stages: (1)  a rotation network $h_R(f^M, f^T)$ first estimates a 3D rotation matrix $\mathbf{R}_S \in \mathbb{R}^{3\times3}$ that corrects the residual misalignment between spheres that remains after volumetric affine initialisation; %$f_R^M$ then corresponds to the transformed and resampled SD map generated from this transform. 
(2) a non-rigid deformation module $h_{NR}(f^M\circ \mathbf{R}_S \mathcal{S}^2_M ,f^T) $ is then trained to classify the target location of each control point $\{ c^{DDR}_i \}^{N_c}_{i=1} \subset \mathcal{S}^2_C$ by choosing from a fixed set of labels $\{ l^{DDR}_i \}^{N_l}_{i=1} \subset \mathcal{S}^2_L$, where the deformation grid $\mathcal{S}^2_C$ and the label grid $\mathcal{S}^2_L$ correspond to regularly tessellated icospheres of different resolutions ($|\mathcal{S}^2_L| > |\mathcal{S}^2_C|$); (3) the resulting deformation is smoothed using CRF regularisation implemented as a recurrent neural network (RNN) \citep{zheng2015conditional}, combined with a diffusion regulariser that operates on the spatial gradients of the deformation field. 
Both rotation and deformation modules employ spherical U-Net architectures \citep{ronneberger2015u} that take sulcal depth features as input (resampled to a sixth-order icosphere) and learn MoNet convolutional filters \citep{monti2017geometric}. The final warp (output from $h_{NR}$) corresponds to a deformed spherical configuration $\mathcal{S}'^2_M$. To transfer this to the anatomical domain, we exploit vertex correspondence between $\mathcal{S}^2_M$ and $\mathcal{A}_M$ (and $\mathcal{S}^2_T$ and $\mathcal{A}_T$), and implement differentiable barycentric interpolation $\mathcal{I}_{B}$. Using the step-by-step framework from \cite{robinson2018multimodal}, this allows moving spherical vertices $\mathbf{s}'^M_i$ to be located relative to target vertices $\mathbf{s}^T_i$, from which barycentric weights may be derived to project moving anatomical mesh vertices $\mathbf{v}_i^M$ onto the target anatomical mesh shape.  
This deformed surface $\mathcal{A}_M'= \{\mathbf{v}'^M_i \in \mathbb{R}^3\}_{i=1}^{N_v}$ % =\mathcal{I}_b(\mathcal{S}'^2_M, \mathcal{S}^2_M, \mathcal{A}_M)$ with the mesh topology of $\mathcal{A}_M$ but the shape of $\mathcal{A}_T$. From this we can 
is then used to extract a sparse displacement field $\{\mathbf{d}_i = \mathbf{v}'^M_i - \mathbf{v}_i^M\}_{i=1}^{N_v}$.

%\frac{1}{N_v} \sum_{i=1}^{N_v} \left( \|f^M_i - f^T_i\|_2^2 - \frac{cov(f^M_i, f^T_i)}{\sigma_{f^M_i} \sigma_{f^T_i}} \right) + \lambda \left( |\nabla \Phi_{\mathbf{x}}| + |\nabla \Phi_{\mathbf{y}}| + |\nabla \Phi_{\mathbf{z}}| \right)

%Spherical alignment is performed using a pretrained DDR network (\figureref{fig:ddr}) that learns a spatial transformation $\phi_s: \mathcal{S}^2_M \rightarrow \mathcal{S}^2_T$  to optimise overlap of sulcal depth feature maps  $f^M$, and $f^T$ over three stages:  (1) a rigid-body rotation $h_R(f^M, f^T): \mathcal{S}^2_M \rightarrow \mathbf{R}_S(\mathcal{S}^2_M)$ that estimates a 3D rotation matrix $\mathbf{R}_S \in \mathbb{R}^{3\times3}$ that corrects any residual misalignment between surface pairs that remains after volumetric affine initialisation to a population-average template; (2) a non-rigid deformation module $h: \mathbf{R}_S(\mathcal{S}^2_M) \rightarrow \mathcal{S}^2_T$ that predicts the target location of each control point $\{ c^{DDR}_i \}^{N_c}_{i=1} \subset \mathcal{S}^2_T$  from a fixed set of target locations $\{ l^{DDR}_i \}^{N_c}_{i=1} \subset \mathcal{S}^2_T$; and (3) a CRF-RNN that enforces spatial smoothness by encouraging neighbouring control points to deform to similar target locations \citep{zheng2015conditional}. Both rotation and deformation modules employ spherical U-Net architectures \citep{} that take sulcal depth features as input and learn MoNet convolutional filters \citep{monti}.  The final warp corresponds to a deformed spherical configuration $\mathcal{S}'^2_M$. To transfer this spherical alignment to the anatomical domain, we exploit vertex correspondence between $\mathcal{S}^2_M$ and $\mathcal{A}_M$ (and $\mathcal{S}^2_T$ and $\mathcal{A}_T$) to map the spherical deformations to anatomical space, using a differentiable implementation of the barycentric interpolation from \cite{robinson2018multimodal}. This produces a deformed surface $\mathcal{A}_M' = h_{total}(\mathcal{S}'^2_M, \mathcal{S}^2_T, \mathcal{A}_T)$ with the mesh topology of $\mathcal{A}_M$ but the shape of $\mathcal{A}_T$. From this we can extract a sparse displacement field $\{\mathbf{d}_i = \mathbf{v}_i' - \mathbf{v}_i^M\}_{i=1}^{N_v}$.


\subsubsection{Surface-to-volume interpolation}  
\label{sec:interp}

From this sparse vertex-wise displacement we require a dense volumetric displacement field $\mathbf{u}_s(\mathbf{x})$, which is derived from a two-stage, differentiable Gaussian interpolation scheme. For each voxel location $\mathbf{x}^M$, we compute a weighted combination of displacements from nearby cortical vertices, such that: % Gaussian soft splatting: 
\vspace{-0.5em}
\begin{equation}
\mathbf{u}_s^{\text{sparse}}(\mathbf{x}^M_j) = \sum_{i\in \mathcal{N}(\mathbf{x}^M_j)} w_{i,j} \mathbf{d}_i, \quad \text{with} \quad w_{i,j} = \exp\left(-\frac{\|\mathbf{v}^M_i - \mathbf{x}^M_j\|^2}{2\sigma_{\text{scatter}}^2}\right)
\vspace{-0.5em}
\end{equation}
Here, $\mathcal{N}(\mathbf{x}^M)$ defines a cubic neighbourhood surrounding $\mathbf{x}^M$. This sparse field is then diffused into subcortical regions via a separable 3D Gaussian convolution with standard deviation $\sigma_{\text{vox}}$:
\begin{equation}
\mathbf{u}_s(\mathbf{x}) = G_{\sigma_{\text{vox}}} * \mathbf{u}_s^{\text{sparse}}(\mathbf{x})
\end{equation}
The final displacement field is normalised to preserve displacement magnitudes.

Finally, because volumetric registration networks typically predict \emph{backward} deformation fields — i.e., mappings $\phi_v : T \rightarrow M$ that sample intensities as $M(\mathbf{x}^T + \mathbf{u}_v(\mathbf{x}^T))$, where voxels $\mathbf{x}^T \in T$ — this surface-derived field is first inverted before being used to transform $M$ and its corresponding segmentation onto the reference configuration, yielding preliminary warped volumes ($M'$) that propagate cortical correspondences between $\mathcal{A}_M$ and $\mathcal{A}_M'$ into the voxel domain.

%This sparse displacement is converted to a dense volumetric warp $\phi_s(\mathbf{x})$ using a fast, differentiable Gaussian interpolation scheme. For each voxel location $\mathbf{x}$ on the integer grid, we compute a weighted combination of displacements from nearby surface vertices using: % Gaussian soft splatting: 
%\begin{equation}
%\phi_s(\mathbf{x}^M_j) = \sum_{i\in N_{\mathbf{x}^M}} w_{i,j} \mathbf{d}_i, \quad \text{with} \quad w_{i,j} = \exp\left(-\frac{\|\mathbf{v}^M_i - \mathbf{x}^M_j\|^2}{2\sigma_{\text{scatter}}^2}\right)
%\end{equation}
%where $N_{\mathbf{x}}$ represents the set of all vertices within a $5 \times5\times5$ cube centered around $\mathbf{x}^M$. This sparse field is then diffused into subcortical regions via a separable 3D Gaussian convolution:
%\begin{equation}
%\phi_s(\mathbf{x}) = G_{\sigma_{\text{vox}}} * \phi_s^{\text{sparse}}(\mathbf{x})
%\end{equation}
%In which $G_{\sigma_{\text{vox}}}$ denotes a 3D Gaussian kernel with standard deviation $\sigma_{\text{vox}}$. The result is normalised to preserve displacement magnitudes.

%Finally, because volumetric registration networks typically predict \emph{backward} deformation fields — i.e., mappings $\phi_v : T \rightarrow M$ that sample intensities as $M(\mathbf{x}^T + \phi_v(\mathbf{x}^T))$, where voxels $\mathbf{x} \in T$ — this surface-derived field is first inverted before being used to deform $M$ and its corresponding segmentation, yielding preliminary warped volumes that propagate the cortical correspondences between $A_M$ and $A_M'$ into the voxel domain.

    
\subsubsection{Volumetric subcortical alignment (PDD)}

$M'$ is then passed to a pretrained probabilistic dense displacement network driven by image intensities $I^M$, $I^T$, and weakly supervised by tissue segmentation maps $Seg^M$ and $Seg^T$ \citep{heinrich2019closing}. 
The model comprises three components: (1) an OBELISK-based feature extractor that produces intensity-invariant descriptors for $M'$ and $T$ \citep{heinrich2019obelisk}, preceded by a 5×5×5 convolution that captures edge-like information; (2) a correlation layer that considers possible displacement vectors and evaluates feature dissimilarity from negated MSE across all candidates; and (3) architecturally embedded regularisation that alternates approximate min-convolutions and mean-field inference to simulate a Markov Random Field \citep{krahenbuhl2011efficient, zheng2015conditional} and enforce smoothness. %Alignment is evaluated using 
Ultimately, PDD uses the regularised MSE feature dissimilarities to predict a probability distribution over displacement labels at each control point via a softmax layer, which drives a non-local (MSE-based) segmentation loss. A continuous displacement field is then obtained through probability-weighted averaging and upsampling and subjected to a deterministic diffusion regularisation penalty. Since PDD was originally developed for abdominal CT registration, we adapt this method to subcortical brain MRI by complementing these terms with deterministic Dice and mutual information losses evaluated on the final warped image:
%\begin{equation}
%    \mathcal{L}_{\text{total}} = \mathcal{L}_{\text{nonlocal}}%(\text{seg}_{\text{prob}}^{\text{LR}}) + \mathcal{L}_{\text{MI}}%(\text{img}_{\text{det}}^{\text{LR}}) + \mathcal{L}_{\text{Dice}}%(\text{seg}_{\text{det}}^{\text{LR}}) + \mathcal{L}_{\text{reg\_diffusion}}(\text{img}_{\text{det}}^{\text{HR}})
%\end{equation}

\begin{equation}
    \mathcal{L}_{\text{total}} = \mathcal{L}_{\text{nonlocal}}(\text{Seg}_{\text{prob}}^{\text{LR}}) + \mathcal{L}_{\text{MI}}(\text{I}_{\text{det}}^{\text{LR}}) + \mathcal{L}_{\text{Dice}}(\text{Seg}_{\text{det}}^{\text{LR}}) + \mathcal{L}_{\text{reg\_diffusion}}(\text{I}_{\text{det}}^{\text{HR}})
\end{equation}
Here, all losses are calculated at low resolution (LR), on either probabilistic or deterministic images and labels, but the diffusion regularisation is applied at high resolution (HR), and the approximated MRF implicitly regularises final deformation via the probabilistic weights of the candidate displacements. 
We also allow non-isotropic, dataset-adaptive control-grid dimensions and introduce a cortical deformation penalty to minimise interference with $\mathbf{u}_s(\mathbf{x})$. This penalty is defined as:
{\setlength{\abovedisplayskip}{6pt}
\setlength{\belowdisplayskip}{6pt}
\begin{equation}
L_{cort} = \lambda_{cort} \sum_{\{\mathbf{x} \,:\, \text{seg}(\mathbf{x}) = 2\}} \|\mathbf{u}_v(\mathbf{x})\|^2
\end{equation}}
Here, $\mathbf{u}_v(\mathbf{x})$ represents the volumetric deformation field, and $\lambda_{cort}$ is a tunable weight. This penalty is enforced only for moving locations indexed by cortical grey matter to limit displacements at locations where surface registration already provides accurate alignment.


\subsubsection{Joint optimisation and warp composition}  
\label{sec:joint}
After pretraining DDR and PDD to separately optimise cortical and volumetric alignment, we connect the two networks through differentiable Gaussian interpolation (Section~\ref{sec:interp}) to form a unified model, CSVR, which jointly optimises both networks by backpropagating the PDD losses, complemented with an additional high-resolution Dice loss, through the full pipeline. 
% This allows the surface warp to adapt beyond cortical feature alignment also to support optimal volumetric registration, while the volumetric warp learns to complement the surface-driven initialisation.



\section{Experimental Methods and Implementation}
\vspace{-0.5em}
\subsection{Data}
\label{sec:dataproc}
This framework was trained on 681 subjects from the developing Human Connectome Project (dHCP) and independently validated on 100 subjects from the Brain Imaging in Babies study (BIBS) \citep{edwards2022developing}. Out of these neonates, 648 were term-born (scan age $41.5 \pm 1.7$ weeks, 297 female) and 133 were preterm-born (scan age: $41.3 \pm 1.7$ weeks, 59 female). Both datasets were acquired at the Evelina Newborn Imaging Centre using a Philips Achieva 3T scanner with a dedicated 32-channel neonatal head coil and positioning device \citep{glasser2013minimal, hughes2017dedicated}. 
 T2-weighted images were acquired in two stacks (sagittal/axial), with a repetition time (TR)/echo time (TE) of 12,000/156ms, flip angle of 90°, in-plane resolution of 0.8 × 0.8mm, slice thickness of 1.6mm and overlap of 0.8mm, and SENSE factor of 2.11 (axial) and 2.60 (sagittal). T1-weighted images were also acquired in sagittal and axial stacks with identical in-plane resolution and slice thickness, overlapped by 0.8mm (sagittal overlap of 0.74mm), and TR/TI/TE of 4,795/1,740/8.7ms, with SENSE factor of 2.27 (axial) and 2.66 (sagittal). These were motion-corrected and super-resolved to generate output volumes with 0.5mm isotropic grid sizes \citep{cordero2016sensitivity,cordero2018three}. All babies were imaged during natural sleep.
 
  % Full details regarding neonatal preparation for scanning have been outlined previously in [Hughes et al., 2017], but, in brief, all imaging was performed in natural sleep without sedatives, and examinations were supervised by an experienced nurse and paediatrician who monitored heart rate, temperature and oxygen saturation.
\subsection{Pre-processing}
T2-weighted images were histogram-normalised and preprocessed using the dHCP Deep Learning–based Neonatal Pipeline \citep{ma2025developing}, which builds on and is trained using outputs from the classical dHCP structural pipeline \citep{schuh2017deformable, makropoulos2018developing}. This affinely aligns volumes to the dHCP 40-week neonatal T2w template \citep{schuh2018unbiased}; then uses learning-based mesh fitting to shrink a template surface to the inner cortical (white) boundary, which is next reinflated to meet the outer cortical (pial) surface. White surfaces are subsequently inflated using a GPU-accelerated reimplementation of FreeSurfer’s inflation algorithm \citep{fischl2012freesurfer}, from which sulcal depth maps are estimated by integrating over the functional. Finally, spheres are generated through spherical mapping, optimised using a spherical U-Net implementation similar to \citep{zhao2021s3reg}. Prior to registration using CSVR, sulcal depth maps and white matter surfaces were resampled to ico-6 resolution, and all volumetric images were histogram normalised and resampled to $H=176, W=224, D=160$. 

\subsection{Baseline (CVS)}
The performance of the proposed CSVR framework was validated through comparison against CVS \citep{postelnicu2008combined}. Since CVS expects FreeSurfer-format outputs, and our attempts to adapt the dHCP surfaces to this format failed, we re-ran surface generation with Infant FreeSurfer (InfantFS) \citep{zollei2020infant}, which instead requires T1w images. For this, we first affinely registered T1w volumes to their corresponding template-aligned T2s, then performed InfantFS' recon-all processing, followed by CVS registration. The resulting CVS warpfields were applied to the T2w images and their DrawEM segmentations, enabling direct comparison with our CSVR method. Validation of CVS was performed using a smaller subset of 30 image pairs from the BIBS testing data (due to constraints of run time). %Each run took $\sim 40-60 mins$ for InfantFS and $4-5$ hours for CVS.}


\subsection{Clustering}
\label{sec:clust}
%ADD YOURONGS DENDROGRAM FIGURE
To simplify the registration problem and minimise extreme distortions, subjects were clustered into groups of similar folding patterns using the method outlined in \citep{guo2025motifs}. In total, 90 clusters were used (30 for each of the frontal, parietal and temporal lobes) due to observations that cortical folding variants do not co-occur, i.e. individuals that share similar folding patterns of the frontal lobes do not necessarily share variants of parietal or temporal lobes. Clusters were generated based on pairwise folding similarity, assessed through overlap of curvature following diffeomorphic alignment using DDR. Train and test datasets thus consisted of pairs of individuals with broadly similar folding patterns (for any of their lobes). Importantly, to avoid data leakage, all samples were separated into training (dHCP) and test (BIBS) subsets before pairing was performed, and each pair appeared twice---with samples used once as moving and once as target. Using the standard convention, the dataloader randomised the order in which pairs were presented during training.

\subsection{Training and Model Implementation}
% Data is input at high resolution - resampled to a sixth-order icosphere (ico-6) with 40,962 vertices. Labels are also defined at ico-6 resolution. Following CRF optimisation, the deformed ico-2 control grid is upsampled to the input data resolution (ico-6) using barycentric interpolation. The network is optimised end-to-end using an unsupervised loss combining mean-squared error and cross-correlation between resampled feature maps, supplemented with a diffusion regularisation penalty on the spatial gradients of the deformation field to balance alignment accuracy against smoothness.

\paragraph{DDR (Surface Registration)}
The original DDR paper \citep{suliman2022deep} trained subject-to-template alignment. Therefore, here, we utilise the modified \emph{pairwise} training scheme of \citet{guo2025motifs}. Data was input at high resolution---resampled to a sixth-order icosphere (ico-6) with 40,962 vertices. Labels were also defined at ico-6 resolution, and set to the 80 nearest neighbours of each control point, defined on an ico-2 grid (162 vertices). Following CRF optimisation, the deformed low-resolution control grid was upsampled to the input data resolution (ico-6) using barycentric interpolation. Optimisation was performed using Adam with a learning rate of $10^{-4}$ and a diffusion regularisation weight of $\lambda = 1$; performance was assessed via five-fold cross-validation on held-out subject pairs.

\paragraph{PDD (Volumetric Registration)}
PDD was trained using: a diffusion regularisation weight $\lambda = 1$, Dice weight of 0.5, MI weight of 1, a cortical penalty weight of $\lambda_{\text{cort}}=0.001$, and non-local label weight of 15. The latter was scaled to account for inherent differences in magnitudes between the losses. We used control grids of size $44 \times 56 \times 40$ (which corresponds to downsampling the input volume by a factor of 4). The network was optimised using the Adam optimiser with a learning rate of $5 \times 10^{-3}$, for 5 epochs over approximately 126k pairs. Model checkpoints were evaluated every 2000 iterations on a small validation subset containing at least one subject pair from each anatomical cluster, and the best-performing checkpoint was retained. Notably, we found that accurate alignment was achieved after only 2\,h 45\,min of training, and more specifically, after the first 30k iterations. % (with an average inference time of 0.33s). 
This aligns well with \citet{heinrich2019closing}'s original finding that PDD only requires a small number of subjects and a short training time. 


\paragraph{CSVR}
After independent pretraining, the combined model, CSVR, was trained on a subset of subjects representative of all anatomical clusters, but with permutations capped at 30 subjects per cluster, motivated by the promising generalisation performance of PDD. Training used the same hyperparameters as the pretrained networks, with the addition of a high-resolution Dice loss term (weight 0.5). $\sigma_{\text{scatter}}$ was set to $0.5$ and $\sigma_{\text{vox}}$ to $12.0$. These values were tuned manually by trading off extreme distortions (Jacobian determinants) against segmentation Dice scores on validation sets. Before producing a final output, the inverted surface warp was composed with the volumetric warp to produce a single deformation field $\mathbf{u}(\mathbf{x}) = \mathbf{u}_v \circ \mathbf{u}_s^{-1}$, 
applied to the original moving image as $M_{\text{registered}}(\mathbf{x}) = M(\mathbf{x} + \mathbf{u}(\mathbf{x}))$, and to the surface as $\mathcal{A}_{M,\text{registered}} = \{\mathbf{v}_i + \mathbf{u}^{-1}(\mathbf{v}_i)\}_{i=1}^{N_v}$. The purpose of this was to minimise interpolation artefacts.

%\subsection{Template creation}
%Generating a group template is a time-consuming and technically non-trivial process. It typically involves affinely coaligning all subjects, averaging them to obtain an initial coarse template, and repeatedly re-registering and re-averaging until a sharp, well-defined population mean emerges. The procedure becomes even more challenging when constructing joint surface–volume templates, where alignment must remain consistent across both domains. Moreover, deriving a biologically meaningful cortical surface from an initially blurred volumetric average is inherently difficult, due to the highly convoluted nature of the cortex. Historically, some pipelines have instead used a single subject as the initial reference rather than an averaged volume, allowing for a sharper, albeit slightly biased, starting point.
%For the purpose of generating a prototype cluster template, we selected cluster F1527, one of the most common folding patterns, and chose a single subject within this cluster with a highly representative anatomy. 


\section{Results and Discussion}

We evaluate CSVR against five baseline methods: affine alignment (using ANTsPy \citep{avants2011reproducible}), FMRIB's Nonlinear Image Registration Tool (FNIRT) \citep{andersson2007non}, and combined volumetric and surface registration (CVS) \citep{postelnicu2008combined}, with ablation performed by also comparing against PDD and DDR (diffused into the volume). To keep comparisons as consistent as possible, FNIRT was configured with a control grid of the same resolution as PDD, and our whole-brain PDD network was trained using the parameters specified above. All learning-based methods were trained on pre-processed dHCP T2w MRI volumes and validated on the BIBS test set; FNIRT and ANTsPy were run on the BIBS test set only, whereas CVS was necessarily run on T1w volumes (as described in Section~\ref{sec:dataproc}). The average inference time of CSVR was only 0.73\,s, compared to 32 min for FNIRT and 5.5 hours for CVS, which also required an additional 46 minutes for preprocessing with InfantFS.

\paragraph{Dice overlap:} Dice overlaps of tissue labels following alignment are summarised in Table~\ref{tab:dice_summary} and \figureref{fig:dice}, with statistical significance assessed using paired two-tailed $t$-tests with false discovery rate (FDR) correction for multiple comparisons. Results show that CSVR achieved statistically significant improvements over all baseline methods across anatomical tissue types (all $p < 0.001$, FDR corrected) except deep grey matter compared to PDD ($p = 0.59$). These results demonstrate that our cortical surface constraints not only benefit cortical alignment but also propagate improvements to subcortical structures.
\begin{figure}[h!] 
\floatconts {fig:dice} 
{\caption{\small Regional Dice Overlap. On average, CSVR performs better than other methods across all tissue types, with the exception of deep grey matter, where alignment is on par with PDD. Notably, CVS performs poorly across all tissue types. For comparison, the average overall Dice across corresponding unregistered brains is 0.3659. }} 
{\includegraphics[width=0.9\linewidth]{images/dice_comparison_combined_new.png}}
\vspace{-3ex}
\end{figure}

\begin{table}[h!]
\centering
\caption{Dice Similarity Coefficient (DSC) across different brain regions (Mean $\pm$ Std).}
\label{tab:dice_summary}
\resizebox{\textwidth}{!}{%
\begin{tabular}{lcccccc}
\hline % Line above header
\textbf{Region} & \textbf{Affine} & \textbf{FNIRT} & \textbf{CVS} & \textbf{PDD} & \textbf{DDR} & \textbf{*CSVR} \\
\hline % Line below header
Cortical GM & $0.528_{\pm 0.041}$ & $0.580_{\pm 0.032}$ & {$0.491_{\pm 0.021}$} & $0.577_{\pm 0.034}$ & $0.584_{\pm 0.022}$ & $\mathbf{0.615}_{\pm 0.020}$ \\
White Matter & $0.616_{\pm 0.052}$ & $0.653_{\pm 0.042}$ & $0.572_{\pm 0.040}$ & $0.654_{\pm 0.031}$ &$0.641_{\pm 0.042}$ & $\mathbf{0.661}_{\pm 0.031}$ \\
Ventricles & $0.529_{\pm 0.081}$ & $0.616_{\pm 0.102}$ & $0.452_{\pm 0.031}$ & $0.714_{\pm 0.063}$ & $0.727_{\pm 0.054}$ & $\mathbf{0.748}_{\pm 0.051}$ \\
Deep GM & $0.881_{\pm 0.031}$ & $0.890_{\pm 0.030}$ & $0.824_{\pm 0.041}$ & $0.923_{\pm 0.022}$ & $0.903_{\pm 0.021}$ & $\mathbf{0.924}_{\pm 0.012}$ \\
Hippocampi & $0.681_{\pm 0.071}$ & $0.753 _{\pm 0.062}$ & $0.591_{\pm 0.052}$ & $0.780_{\pm 0.042}$ &$0.760_{\pm 0.031}$ & $\mathbf{0.803}_{\pm 0.032}$ \\
\hline % Line above final row
\textbf{Overall Mean} & $0.647_{\pm 0.051}$ & $0.698_{\pm 0.051}$ & $0.586_{\pm 0.041}$ & $0.730_{\pm 0.041}$ & $0.723_{\pm 0.031}$ & $\mathbf{0.750}_{\pm 0.031}$ \\
\hline % Line below final row
\vspace{-3ex}
\end{tabular}%
}
\end{table}

\vspace{-1.2em}
\paragraph{Distortions:} Meanwhile, results from Table~\ref{tab:jacobians} demonstrate that CSVR achieves this with less than 0.15\% negative Jacobians, compared to 1.4\% for CVS. To investigate how this impacts alignment, Figures~\ref{fig:quality} and~\ref{fig:segs} show qualitative comparisons of image registration performance, highlighting that CVS' distortions often result in loss of cortical detail and misaligned subcortical structures, while CSVR maintains smooth and anatomically plausible alignments, with better overlap of segmentations and surfaces. We attribute CVS' failures to two factors: (1) the method was designed for adult rather than neonatal data, and (2) adaptation for use with InfantFS may have been suboptimal since this is primarily designed for neonates after birth (with only 5 training subjects aged under 2 months) \citep{zollei2020infant}. Although we selected subjects with the best surface reconstructions, these results highlight that applying CVS to neonatal data is fundamentally limited by surface quality.

\begin{figure}[h!] 
\floatconts {fig:quality} 
{\caption{\small Visual comparison of CVS vs CSVR. (a) Target cortical surfaces overlaid on T2w images across registration stages. (b) Cortical surface reconstructions showing the target surface resampled onto moving topology, with red dots showing improved vertex correspondences after CSVR deformation; note the distortions on the CVS surface. (c) Sagittal views of deformed T2w images demonstrating CVS' anatomical distortions compared to CSVR. }}
{\includegraphics[width=0.70\linewidth]{images/quality.png}}
\vspace{-3ex}
\end{figure}

\begin{figure}[t!] 
\floatconts {fig:segs} 
{\caption{Qualitative comparison of CVS and CSVR registration across multiple views. Target cortical surfaces (white contours) overlaid on moving, CVS-deformed, CSVR-deformed, and target T2w images with segmentations in coronal (a), sagittal (b), and axial (c-d) orientations. White arrows and yellow circles highlight regions of improved alignment from CSVR, with magnified cortical details shown in (d).}} {\includegraphics[width=0.75\linewidth]{images/segs.png}}\vspace{-3ex}
\end{figure}

\paragraph{Improved gains for the most challenging subject pairs:} While subjects were grouped into registration clusters based on locally similar cortical folding patterns (at the lobar level), these do not always translate to similar whole-brain morphology. Consequently, some subject pairs exhibit substantially different overall brain shape, presenting challenging registration scenarios where affine or conventional non-linear methods struggle (\figureref{fig:scatter}). Notably, CSVR maintains robust performance even for these difficult cases, with much improved alignment, in particular relative to affine and FNIRT, with up to a 40\% increase in Dice overlap for some of the most challenging subjects. This pattern suggests that PDD's discrete optimisation framework confers robustness to local minima associated with highly heterogeneous brain morphology, while CSVR's cortical surface constraints provide additional gains primarily for cases where cortical folding patterns are informative for whole-brain alignment. \figureref{fig:difficult} further highlights CSVR's ability to align difficult cases. 

\paragraph{Limitations and Future Work:} The current implementation of CSVR operates on relatively low-resolution control-point and label grids, which, while enabling rapid inference times, may limit the precision of fine-scale anatomical alignment. To achieve higher registration accuracy, future developments will include adopting a multi-stage optimisation strategy where surface and volumetric registrations are iteratively refined across progressively finer resolutions. Such an approach would allow coarse alignment to establish global correspondence before fine-tuning local deformations, further improving Dice scores while maintaining computational efficiency.

\begin{figure}[h!] 
\floatconts {fig:scatter} 
{\caption{Per-Subject Comparison of CSVR vs other methods. Scatter plots comparing the average Dice score of CSVR against Affine, FNIRT, and PDD registration methods for individual subject pairs. The dashed red line represents the identity line ($y=x$), where no performance change occurs. Points above the line indicate superior performance by CSVR. CSVR demonstrates widespread improvement over Affine and FNIRT across the entire range of registration difficulty, suggesting robust performance even in challenging cases. The performance difference is reduced when comparing CSVR to PDD for the most difficult subject pairs, indicating PDD's robustness to challenging whole-brain morphology.}} {\includegraphics[width=0.95\linewidth]{images/csvr_scatter_comparison2.png}}\vspace{-3ex}
\end{figure}
% While simpler than many other popular interpolation frameworks \citep{}, this method provides fast, $C^\infty$ smooth interpolation while preserving surface-defined deformations. Crucially, it enables efficient extrapolation of a large number of control points into the volume at minimal computational cost, allowing us to use high-resolution anatomical surfaces to drive our displacements, yielding significantly improved cortical deformation accuracy.
\vspace{-5ex}
\section{Conclusion}
In conclusion, we introduce CSVR, a deep discrete combined surface-volume registration network for brain MRI. Our method represents a significant step toward addressing the longstanding challenge of combined surface-volumetric template construction. Currently, whole-brain analyses are constrained by the lack of integrated templates, but by jointly optimising cortical surface and volumetric alignment, CSVR provides the foundational method necessary for generating such templates within \citet{guo2025motifs}'s hierarchical framework. While we have shown that CSVR is able to accurately register subjects within clusters of similar cortical folding, we have also shown that the model performs particularly well on more heterogeneous, difficult registration cases, showing great promise for downstream inter-cluster alignment and template creation. 

Critically, CSVR outperforms the current combined registration baseline, CVS, which, designed for adult brains, struggles to align neonatal data. Moreover, CSVR achieves superior registration quality approximately 27,000 times faster than CVS, making it a practical solution for large-scale neonatal brain studies requiring combined surface-volumetric alignment. Finally, while we have validated CSVR on neonatal data, the main components of our method have demonstrated strong generalisability across diverse datasets: DDR and the MSM-HT hierarchical framework both show comparable performance on adult data, and PDD has already successfully generalised from abdominal CT to brain MRI, suggesting that CSVR should extend naturally to adult neuroimaging applications.

% 6) surface reconstruction 

\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgements ---Will not appear in anonymised version
 
\midlacknowledgments{The authors would like to acknowledge the participants of the dHCP and BIBS studies. The dHCP neonatal dataset was provided by the developing Human Connectome Project, KCL-Imperial-Oxford Consortium, funded by the European Research Council (ERC) under the European Union Seventh Framework Programme (FP/2007-2013) / ERC Grant Agreement no. [319456]. The BIBS data were funded by EU-AIMS (European Autism Interventions)/EU AIMS-2-TRIALS, an Innovative Medicines Initiative Joint Undertaking under Grant Agreement No. 777394.
The School of Biomedical Engineering and Imaging Sciences is supported by the Wellcome EPSRC Centre for Medical Engineering at King's College London (WT 203148/Z/16/Z) and the Department of Health via the National Institute for Health Research (NIHR) comprehensive Biomedical Research Centre (BRC) award to Guy's \& St Thomas' NHS Foundation Trust in partnership with King's College London and King's College Hospital NHS Foundation Trust. We acknowledge infrastructure support from the NIHR Mental Health BRC at South London and Maudsley NHS Foundation Trust, King's College London.}

\bibliography{midl26_318}
\clearpage 
\appendix
\section{}





\renewcommand{\thetable}{A.\arabic{table}}
\begin{figure}[h!] 
\floatconts {fig:difficult} 
{\caption{Performance of other models (Affine and FNIRT) vs \% Improvement with CSVR. Harder cases, for which FNIRT and affine alignment are poor, benefit more from CSVR registration.}} {\includegraphics[width=1\linewidth]{images/difficulty_vs_improvement.png}}\vspace{-3ex}
\end{figure}

\vspace{-0.7em}
\renewcommand{\thetable}{A.\arabic{table}}
\begin{figure}[h!] 
\floatconts {fig:boxplots} 
{\caption{Dice scores from \figureref{fig:dice} summarised into cortical vs subcortical box plots. Green and red percentages show mean improvement compared to the affine baseline.}} {\includegraphics[width=0.95\linewidth]{images/option1_enhanced_boxplot_with_stats.png}}\vspace{-3ex}
\end{figure}
\renewcommand{\thetable}{A.\arabic{table}}

\vspace{-0.7em}
\begin{table}[h!]
\centering
\small
\caption{Jacobian Determinant Statistics by Method}
\label{tab:jacobians}
\begin{tabular}{lcccc}
\toprule
\textbf{Method} & \textbf{Neg. Jac. (\%)} & \textbf{Min Jac.} & \textbf{Mean Jac.} & \textbf{Std Jac.} \\
\midrule
Affine          & $0.000$        & $1.000$    & $1.000$ & $0.000$ \\
FNIRT           & $1.854$        & $-4.167$   & $0.996$ & $0.634$ \\
CVS             & $1.401$        & $-102.661$ & $0.473$ & $6.186$ \\
PDD             & $0.191$        & $-1.496$   & $0.994$ & $0.345$ \\
DDR             & $0.001$        & $-0.110$    & $0.983$ & $0.128$ \\
*CSVR           & $0.141$        & $-1.253$   & $0.997$ & $0.346$ \\
\bottomrule
\vspace{-3ex}
\end{tabular}
\end{table}
\end{document}