\documentclass{midl} % Include author names
% \documentclass[anon]{midl} % Anonymized submission

% the following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution
\usepackage{enumitem}   
\usepackage{multirow}
\usepackage{booktabs}
\usepackage{mathtools}
\usepackage{caption}
% \usepackage{subcaption}

\usepackage[normalem]{ulem}
\useunder{\uline}{\ul}{}


%% some notations
\newcommand{\MI}{I(\mathcal{F}, \mathcal{M}\circ\phi)}
\newcommand{\NMI}{I_{\!N\!M\!I}(\mathcal{F}, \mathcal{M}\circ\phi)}

\newcommand{\HF}{H(\mathcal{F})} % fixed entropy
\newcommand{\HM}{H(\mathcal{M}\circ\phi)} % moving entropy
\newcommand{\HJ}{H(\mathcal{F}, \mathcal{M}\circ\phi)} % joint entropy

\newcommand{\pdff}{p(f)} % fixed pdf
\newcommand{\pdfm}{p(m)} % moving pdf 
\newcommand{\pdfj}{p(f,m)} % joint pdf

\newcommand{\IM}{\mathcal{M}}  % moving image
\newcommand{\IF}{\mathcal{F}}  % fixed image
%%% 


\usepackage{mwe} % to get dummy images
\jmlryear{2021}
\jmlrworkshop{Full Paper -- MIDL 2021}


\title{Learning Diffeomorphic and Modality-invariant Registration using B-splines}

\midlauthor{
\Name{Huaqi Qiu\nametag{$^{1}$}} \Email{huaqi.qiu15@imperial.ac.uk} \\
\addr $^{1}$ BioMedIA Group, Imperial College London, London, UK\\
\Name{Chen Qin\nametag{$^{1,2}$}} \Email{chen.qin@ed.ac.uk} \\
\addr $^{2}$ Institute for Digital Communications, University of Edinburgh, Edinburgh, UK\\
\Name{Andreas Schuh\nametag{$^{1}$}} \Email{andreas.schuh@imperial.ac.uk}\\
\Name{Kerstin Hammernik\nametag{$^{1,3}$}} \Email{k.hammernik@imperial.ac.uk}\\
\Name{Daniel Rueckert\nametag{$^{1,3}$}} \Email{d.rueckert@imperial.ac.uk}\\
\addr $^{3}$ Technical University Munich, Munich, Germany
}


\begin{document}

\maketitle

\begin{abstract}
We present a deep learning (DL) registration framework for fast mono-modal and multi-modal image registration using differentiable mutual information and diffeomorphic B-spline free-form deformation (FFD). Deep learning registration has been shown to achieve competitive accuracy and significant speedups over traditional iterative registration methods.
% 
In this paper, we propose to use a B-spline FFD parameterisation of Stationary Velocity Field (SVF) in DL registration in order to achieve smooth diffeomorphic deformation while being computationally efficient.
%
In contrast to most DL registration methods which use intensity similarity metrics that assume linear intensity relationship, we apply a differentiable variant of a classic similarity metric, mutual information, to achieve robust mono-modal and multi-modal registration. 
%
We carefully evaluated our proposed framework on mono- and multi-modal registration using 3D brain MR images and 2D cardiac MR images.
%
\end{abstract}

% \begin{keywords}
% modality-invariant,multi-modal image registration,mutual information,Parzen window,B-splines,diffeomorphic,deep learning
% \end{keywords}



%%%%%%%%%%%%%%%%%%%%%%%%%%% Intro %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Introduction}
% registration problem formulation
Image registration is an essential task in medical image analysis. Given a moving image $\IM$ and a fixed image $\IF$, image registration aims to find the spatial transformation $\phi$ which maps a location $\mathbf{x}$ in $\IF$ to the location with corresponding tissue or structure in $\IM$. In traditional approaches, a regularised transformation is embedded in an optimisation problem that minimises a dis-similarity metric between the fixed image $\IF$ and the transformed moving image $\IM\circ\phi$. This optimisation problem is commonly solved in an iterative way \cite{Sotiras2012-nh}. Despite being effective, many iterations are often required to register one pair of images in such optimisation-based methods which can be very time consuming.  

Recently, researchers have explored data-driven methods and the use of deep learning in image registration \cite{Rueckert2020-rl}. Although often time-consuming in training, DL-based registration networks can perform one-pass registration during inference substantially faster than iterative optimisation methods. 
%
% While early \textit{supervised} approaches trained convolutional neural networks (CNNs) using synthesised \textit{ground-truth} transformations\cite{Sokooti2017-ia,Yang2017-qy}, 
%
Recently proposed so-called \textit{unsupervised} DL registration methods train networks using intensity-based similarity metrics instead of (synthesised) ground truth transformations \cite{Balakrishnan2018-qz,De_Vos2019-ni,DBLP:conf/miccai/QinBSPPNR18,DBLP:conf/miccai/QinWCQBR20,DBLP:conf/miccai/QiuQFHSR19}.
%
Most \textit{unsupervised} methods use Mean Squared Error (MSE) or Cross-correlation (CC) as image matching criterion, and thereby assume either identity or a strong linear relationship between the intensities of the images. However, these metrics are often ineffective when there are complex non-linear intensity relationships, e.g. in contrast-enhanced images or images from different modalities. 

In this paper, we adopt a classic information-theoretic similarity metric, mutual information (MI), in \textit{unsupervised} DL registration to handle non-linear intensity relationships. MI has been widely used in traditional registration algorithms for robust multi-modal registration~\cite{Pluim2003-nu}. However, MI is commonly computed using non-differentiable intensity histogram construction, making it challenging to be directly applied in DL registration. To compute MI in a differentiable way, we adopt the formulations introduced in \citet{Thevenaz2000-hw} and use continuous Parzen windows (PW) to estimate differentiable intensity distributions. 
%
In addition, we propose to use a B-spline FFD based diffeomorphic transformation model in our DL registration framework. Specifically, we use CNNs to learn a B-spline model of stationary velocity fields (SVF) over the entire image domain to obtain diffeomorphic transformations, to take advantage of the parameter-efficiency and intrinsic smoothness of B-splines.


% highlight contributions
The main contributions of our work are as follows: 
%
1) We propose to learn diffeomorphic SVFs parameterised efficiently by B-spline FFD to achieve fast and smooth diffeomorphic registration;
%
2) We use a differentiable formulation of mutual information in a whole-image DL registration framework to register images across modalities;
%
3) We carefully evaluate the introduced components on both mono-modal and multi-modal registration tasks using 3D brain MR images and 2D cardiac MR images.


\section{Related works}
% multi-modal registration recent works
Some recent learning-based registration methods specifically address multi-modal or modality-invariant registration problems. One approach is to use segmentation of the anatomical structures to guide registration, when a large amount of segmentation is available for training \cite{DBLP:journals/mia/HuMGLGBWBMEONBV18}. Another approach is to reduce the problem to mono-modal registration via image-to-image translation \cite{DBLP:conf/cvpr/ArarGDBC20} or disentanglement \cite{DBLP:conf/ipmi/QinSLMRK19}. These methods utilise powerful advances in deep learning generative models but cannot explicitly guarantee that the structures are not changed during the intensity transformation, and often have complicated frameworks that are non-trivial to train. Towards modality-invariant registration, \citet{DBLP:journals/corr/abs-2004-10282} proposed to use contrast-varying synthetic images to train contrast-invariant registration networks. Most related to our work, \citet{De_Vos2020-xg} also adopt mutual information for DL registration. In contrast to their method, our approach uses a fully convolutional network to parameterise registration over the entire image domain instead of patches and employs a diffeomorphic transformation model. We also evaluated our framework on more challenging inter-subject multi-modal registration tasks.


\section{Methods}
\subsection{Mutual information in DL registration}
Mutual information relaxes the linear intensity relationship constraint and measures information that one image contains about another image based on their intensity distributions. \citet{Studholme1999-bu} later introduced Normalised Mutual Information (NMI) which is more invariant to the amount of overlap between the two images.  
%
Here, we introduce the formulation of a differentiable estimate of NMI to enable its use in DL registration. In the context of image registration, NMI can be written as:
%
\begin{equation}
\label{eq:NMI}
\NMI = \frac{\HF + \HM}{\HJ},
\end{equation}
%
where $\HF$ and $\HM$ denote the marginal entropies of the fixed and moved images, and $\HJ$ denotes the joint entropy. The entropy of an image $\mathcal{I}$ can be defined as $H(\mathcal{I}) = - \int_i p(i) \ln p(i) \, \mathrm{d}i$ where $p(i)$ is the intensity distribution of image $\mathcal{I}$. To estimate this intensity distribution in digital images, one can construct histograms as discretised estimation of the distributions. This is usually achieved by counting the number of intensities (or intensity-pairs) that fall into intensity \textit{bins}, which is mathematically equivalent to adding a rectangular window function centred at the intensity value of each point in the images to the histogram (pair of points for joint histogram). However, the rectangular window function makes the constructed histogram non-differentiable. To use MI in deep learning registration, we need a differentiable way to construct the intensity histogram to allow back-propagation and gradient-based training of the networks. To this end, we use a differentiable Parzen window (PW) \cite{Thevenaz2000-hw} instead of the rectangular window, as illustrated in Figure A\ref{fig:histogram_demo} in the Appendix. Formally, the joint histogram is computed as: 
%
\begin{equation}\label{PW_histogram}
    h(f, m) = 
    \sum_{\mathbf{x}\in \Omega^\dagger} 
    w\left(\IF(\mathbf{x})-f\right) 
    w\left((\IM\circ\phi)(\mathbf{x})-m \right),
\end{equation}
%

where $f, m$ denote intensity values on the fixed and moved images, $\Omega^\dagger$ denotes all points in the overlapping image domain, and $w(\cdot)$ is the Parzen window function. Normalising the joint histogram yields the joint distribution: 
%
\begin{equation}
    \pdfj = \frac{h(f, m)}
    {\sum_{f\in L_{\IF}, m\in L_{\IM\circ\phi}}  h(f, m)},
\end{equation}
%
where $L_{\mathcal{F}}$ and $L_{\mathcal{M}\circ\phi}$ denote the \textit{bin} centres where the histogram is evaluated.  We opt to use the Gaussian function as the Parzen window, which fulfils the partition-of-unity constraint and is easy to compute, namely $w(i)=\frac{1}{\sqrt{2\pi}\sigma}\cdot\exp({-\frac{i^2}{2\sigma^2}})$. The $\sigma$ is chosen so that the Full Width at Half Maximum (FWHM) of the function is one \textit{bin}-width. 
The marginal distributions are estimated by marginalising the joint distribution, i.e. $\pdff = \sum_{m \in L_{\mathcal{M}\circ\phi}} \pdfj$ and $\pdfm = \sum_{f \in L_{\mathcal{F}}} \pdfj$. Finally, we can compute the entropies and the NMI using Eq.~\ref{eq:NMI} with the joint and marginal distributions. We compute this efficiently using vectorised operations to combine with DL registration.

% \begin{equation}
% \label{eq:MI}
% \MI = \HF + \HM - \HJ
% \end{equation}
%

% measured using the joint  and marginal intensity distributions, formulated as: 
% \begin{equation}
% % \label{eq:MI  }
% \MI = \sum_{f\in L_{\mathcal{F}}}\sum_{m\in L_{\mathcal{M}\circ\phi}} 
%         \pdfj \ln{\frac{\pdfj}{\pdff\pdfm}},
% \end{equation}

%%%%
\subsection{Registration framework}
Here we introduce our deep learning registration framework, as illustrated in Figure~\ref{fig:framework}. We focus on deformable registration of images after affine alignment. We follow the approach of \citet{DBLP:conf/miccai/DalcaBGS18} and use a convolutional neural network (CNN) with parameters $\theta$ to map the fixed and moving images to the parameters of the transformation. Instead of directly outputting the velocity fields as in \citet{DBLP:conf/miccai/DalcaBGS18}, our network (detailed in Section~\ref{sec:network}) outputs the velocities of the B-spline control points $\vec{v}^\beta$, from which we can compute the transformation $\phi$ via B-spline tensor product and Scaling and Squaring (detailed in Section~\ref{sec:transform}). During training, the moving image $\IM$ is warped using $\phi$ via linear interpolation to acquire the moved image $\IM\circ\phi$ which is then used to compute similarity loss $\mathcal{L}_{Sim}(\IF, \IM\circ\phi)$. We iterate over pairs of fixed-moving images in a training dataset to find the network parameters $\hat{\theta}$ that minimise the similarity loss $\mathcal{L}_{Sim}$ with the constraint of the regularisation loss $\mathcal{L}_{Reg}$. The overall loss can be written as, 
%
\begin{equation}
\label{eq:loss}
\mathcal{L}(\IF, \IM\circ\phi)
= \mathcal{L}_{Sim}(\IF, \IM\circ\phi)
+ \lambda \mathcal{L}_{Reg}
\end{equation}
%
where $\mathcal{L}_{Sim} = - I_{\!N\!M\!I}$ is the negative Normalised Mutual Information and $\mathcal{L}_{Reg}$ is a regularisation on the velocity field $\vec{v}$ to further enforce smoothness and diffeomorphism \cite{Beg2005-jh}:
\begin{equation}
\label{eq:diffusion}
\mathcal{L}_{Reg} = \frac{1}{|\Omega|} \sum_{\mathbf{x}\in \Omega}
\sum_{d \in D}
\left\Vert\frac{\partial \vec{v}(\mathbf{x})}{\partial d}\right\Vert_2^2
\end{equation}

where $\Omega$ denotes all points in the image domain and $d$ denotes the spatial dimension.

% Figure Framework
\begin{figure}[ht]
    \centering
    \includegraphics[width=0.7\textwidth]{Figures/figures/framework.pdf}
    \caption{Our DL registration framework. The CNN predicts time-stationary velocities of the control points, and transformation is obtained by evaluating B-spline functions at the control points and efficient integration via Scaling and Squaring. The similarity loss based on MI and the smoothness regularisation loss is only needed for training.}
    \label{fig:framework}
\end{figure}


\subsubsection{Transformation}\label{sec:transform}
Diffeomorphic transformation is topology preserving and invertible, which are desirable properties for some medical image registration applications. To ensure these properties, we use the flow of diffeomorphisms generated by the group exponential of spatially smooth Stationary Velocity Fields (SVFs). The diffeomorphic transformation $\phi$ is the group exponential of the time-stationary velocity field $\vec{v}$, i.e. $\phi = \exp{(\vec{v})}$, which can be efficiently computed using the Scaling and Squaring (SS) algorithm \cite{Arsigny2006-th}.
To represent SVFs in a parameter efficient way while taking advantage of implicit smoothness of spline functions \cite{Modat2012-xv}, we propose to use the cubic B-spline parameterisation of SVFs (SVFFD) in our deep learning registration framework. The CNN outputs the velocities of a grid of B-spline control points $\vec{v}^{\beta}$ with regular spacing $\vec{\delta}$. And the dense velocity field is obtained using a weighted combination of cubic B-spline basis functions $\beta(\cdot)$ \cite{Rueckert1999-bn}:

\begin{equation}\label{eq:bspline}
\vec{v}(\mathbf{x}) = \sum_{c\in C} \vec{v}^\beta_c \prod_{d \in D} \beta_d (\mathbf{x}_d - \vec{k}_{c, d}),
\end{equation}
%
where $c$ is the index of the control points on the control point grid $C$, $\vec{k}$ denotes the coordinates of the control points in image space. The displacement field is obtained from the SVF via scaling and squaring. Since the B-spline basis function has limited local support, Eq.~\ref{eq:bspline} can be implemented using transposed convolution with pre-computed B-spline basis functions as kernels.


\subsubsection{Networks}\label{sec:network}
% to the network used in \cite{Balakrishnan2018-qz}
To learn the velocities of the B-spline control points $\vec{v}^\beta$, we use a fully convolutional network adapted from a U-net based architecture, as shown in Figure~B\ref{fig:networks} in the Appendix. 
%
% The moving and fixed images, concatenated in the channel dimension, go through the first convolution layer which has no downsampling. This is followed by four \textit{encoder} convolution layers with stride of 2, each reducing the spatial resolution by half while doubling the number of channels. In the \textit{decoder} part, each layer applies convolution with a stride of 1 and upsamples the output feature map by 2 before concatenating with the feature map of the same resolution from the \textit{encoder}. 
%
In order to support different control point spacings, we dynamically adapt the base U-net architecture so the output matches the size of the control point grid. To achieve this, we keep the U-net \textit{decoder} layers that produce the largest feature map smaller than the size of the control point grid. Then we apply a linear interpolation layer to resize the feature map to match the size of the control point grid. This allows the use of arbitrary control point spacing. Finally, we apply three convolution layers to predict the output velocities. All convolution layers use a kernel size of 3 in all spatial dimensions and a LeakyReLU nonlinearity with negative slope of 0.2. No nonlinearity function is applied to the final layer. The same architecture is used for 2D and 3D.

%  Figure~\ref{fig:networks}(a) shows the base U-net and Figure~\ref{fig:networks}(b) shows the modified architecture for control point spacing of 4


%%%%%%%%%%%%%%%%%%%%%%%%%%% End of Method %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%



%%%%%%%%%%%%%%%%%%%%%%%%%%% Experiments %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Experimental Settings}
\subsection{Tasks and Data}
We evaluate the proposed framework on three tasks: inter-subject 3D brain MRI registration of 1) T1w-T1w volumes; 2) T1w-T2w volumes; and 3) cardiac motion estimation via 2D registration between end-diastolic (ED) frame and end-systolic (ES) frame of cardiac MR images. For the brain registration task, we use 3D T1w and T2w images of 310 randomly selected subjects from the Cambridge Centre for Ageing and Neuroscience (CamCAN) project \cite{Shafto2014-sc,Taylor2017-rd}. The images have isotropic spatial resolution with voxel size of $1\,\mathrm{mm}^3$ and are cropped to the size of $176\times 192 \times 176$. All images are spatially normalised to a common MNI space using affine registration, skull-stripped using ROBEX\footnote{\url{https://www.nitrc.org/projects/robex}} and bias-field corrected using the N4 algorithm in SimpleITK\footnote{\url{https://simpleitk.org}}. For evaluation, we also acquired the segmentation of 138 cortical and sub-cortical structures (grouped into 5 groups) automatically using MALPEM \cite{Ledig2015-zv}. For cardiac motion estimation, we use 2D cardiac MR images of 210 subjects from the UK Biobank study\footnote{UK Biobank Imaging Study. \url{http://imaging.ukbiobank.ac.uk}}. The images have in-plane resolution of $1.8\,\mathrm{mm}\times1.8\,\mathrm{mm}$ and are cropped to the size of $192\times192$. The segmentation of left ventricle cavity (LV), myocardium (MYO) and right ventricle (RV) are acquired using a state-of-the-art CNN-based automatic segmentation algorithm \cite{Bai2018-yc}. 


\subsection{Evaluation metrics}
We evaluate both the \textit{accuracy} and the transformation \textit{regularity} of registration. We evaluate the \textit{accuracy} by measuring the overlap between the anatomical segmentation of the fixed image and the segmentation transformed by $\phi$ in the moving image using Dice score. The transformation \textit{regularity} is evaluated based on the determinant of the Jacobian $J=|\nabla \phi|$. We evaluate the amount of points in the image that are ``folded'' due to the transformation by the ratio of points with $J<0$. We also evaluate the spatial smoothness of the transformation by measuring the magnitudes of the gradient of the Jacobian determinant $|\nabla J|$.


\subsection{Baseline comparisons}
The presented deep learning registration framework using mutual information and B-spline SVF (denoted by ``MIDIR'') is firstly compared to a traditional iterative registration method based on the SVFFD transformation model. We also compare to a state-of-the-art DL registration method proposed in \citet{DBLP:conf/miccai/DalcaBGS18} (the deterministic version) combined with our differentiable mutual information, denoted by ``VM$_{\!N\!M\!I}$''. For the mono-modal tasks, we also compare to variants of methods that use localised normalised cross-correlation (LNCC) as similarity. The evaluation metrics at the initial affine registration (``Affine'') are also provided for reference. The B-spline control point spacing $\vec{\delta}$ and regularisation weighting $\lambda$ are hyper-parameters. For all competing models, we carefully tuned these hyper-parameters using a held-out validation dataset while considering the balance between registration accuracy and transformation regularity. The hyper-parameters that yield the best mean Dice score, with under $0.5\%$ of the points with $|\nabla\phi|<0$, were chosen. Hyper-parameter values of all results are shown in Appendix~\ref{sec:app_hparams}.



\subsection{Implementation details} 
The traditional SVFFD method is implemented using Medical Image Registration ToolKit (MIRTK) \cite{Schuh2015-cd}\footnote{\url{https://mirtk.github.io}}. The DL registration frameworks are implemented using PyTorch v1.5.1\footnote{\url{https://pytorch.org}}. To reduce GPU memory usage, we compute NMI on a subset (50\%) of randomly sampled positions in the image space at each iteration. The Adam optimiser was used with an initial learning rate of 1e-4. Learning rate decay of 1/10 per 50 epochs was used for all brain registration models. Running speeds were measured on a workstation with an Intel\textsuperscript{\textregistered} i7-8700 CPU and NVIDIA\textsuperscript{\textregistered} Titan Xp GPUs. Our code is available at \href{https://github.com/qiuhuaqi/midir}{this url}.


\section{Results}

Table~\ref{tab:results} presents the quantitative evaluation of all models on all brain MR and cardiac MR registration tasks.
%
We performed two-sided Wilcoxon signed-rank test to check for statistical significance in differences between methods (significant if p-value is smaller than 0.05). 
%
On T1w-T1w tasks, the traditional SVFFD method outperformed all DL methods on Dice score while achieving good transformation regularity. Methods using LNCC as similarity achieved better Dice accuracy than the ones using NMI. Our MIDIR performed similarly to the baseline VM framework when the same similarity metric is used. 
%
On T1w-T2w registration, similar to T1w-T1w, methods using LNCC as similarity quantitatively achieved better results than those using NMI as similarity, with the traditional SVFFD using LNCC outperforming all other methods. The proposed MIDIR frameworks achieved similar Dice accuracy but significantly better regularity than the VM baseline.
%
On the cardiac motion task, the DL models are on-par with traditional SVFFD when LNCC is used as similarity but significantly more accurate when NMI is used as similarity. Our MIDIR models achieved competitive accuracy and marginally better regularity on this task.
% 
The boxplots in Figure~C\ref{fig:t1t1_dice}-C\ref{fig:cardiac_jac} in the Appendix show the distribution of the results over test subjects. It can be noticed that traditional SVFFD produced more dispersed results with more outliers than DL methods. This could be an advantage of using a DL framework for the same transformation model, since the more complex CNN can learn a prior from a dataset to produce more consistent results. Some visual examples of the registration results are shown in Figure~D\ref{fig:t1t1_images}-D\ref{fig:cardiac_images} in the Appendix.
% %
% Mostly notably, for T1w-T2w registration using LNCC, aggressive deformation of gray-matter can be seen as a result of forcing a strong linear intensity relationship required by LNCC matching.
% %

%% comment on runtime
The runtimes for each method to register one pair of 2D or 3D images are also reported in Table~\ref{tab:results}. Both CPU and GPU inference times are shown for a fair comparison with the CPU-based MIRTK (SVFFD). DL models are substantially faster on CPU and GPU. Our parameter-efficient MIDIR models run faster than the dense VM models especially on 3D tasks. 

% main table
\begin{table}[htbp]
\centering
\caption{Quantitative results on brain and cardiac registration tasks. The Dice scores of different anatomical structures are averaged. $J_{<0}\%$ denotes the percentage of points with negative Jacobian determinant, higher means more ``folding''. $|\nabla_J|$ denotes the gradient magnitude of the Jacobian determinant, lower value means spatially smoother.}
\label{tab:results}

\resizebox{\textwidth}{!}{%
\begin{tabular}{@{}l|ccc|ccc|ccc|ll@{}}
\toprule
                       & \multicolumn{3}{c|}{Brain T1w-T1w}               & \multicolumn{3}{c|}{Brain T1w-T2w}               & \multicolumn{3}{c|}{Cardiac Motion}              & \multicolumn{2}{c}{Runtime (2D/3D)}              \\ \midrule
Methods                & Dice           & $J_{<0}\%$     & $|\nabla_J|$   & Dice           & $J_{<0}\%$     & $|\nabla_J|$   & Dice           & $J_{<0}\%$     & $|\nabla_J|$   & \multicolumn{1}{c}{CPU} & \multicolumn{1}{c}{GPU} \\ \midrule
Affine                 & 0.619          & -              & -              & 0.619          & -              & -              & 0.500          & -              & -              & -                       & -                       \\ \midrule
SVFFD$_{\!L\!N\!C\!C}$ & \textbf{0.836} & 0.107          & 0.024          & \textbf{0.770} & 0.150          & \textbf{0.027} & 0.781          & 0.161          & 0.045          & 43.3s/44min24s          & -                       \\
VM$_{\!L\!N\!C\!C}$    & 0.814          & 0.295          & 0.051          & 0.753          & 0.176          & 0.047          & 0.797          & 0.094          & 0.034          & 115ms/17.7s             & 6.48ms/228ms            \\
MIDIR$_{\!L\!N\!C\!C}$ & 0.816          & 0.238          & 0.044          & 0.743          & 0.090          & 0.039          & \textbf{0.806} & 0.096          & 0.029          & 116ms/11.8s             & 4.78ms/124ms            \\ \midrule
SVFFD$_{\!N\!M\!I}$    & 0.822          & 0.118          & \textbf{0.023} & 0.728          & 0.135          & \textbf{0.027} & 0.701          & \textbf{0.080} & \textbf{0.016} & 1min10s/3min34s         & -                       \\
VM$_{\!N\!M\!I}$       & 0.807          & \textbf{0.106} & 0.038          & 0.733          & 0.197          & 0.047          & 0.797          & 0.151          & 0.036          & 115ms/17.7s             & 6.48ms/228ms            \\
MIDIR$_{\!N\!M\!I}$    & 0.813          & 0.121          & 0.038          & 0.735          & \textbf{0.023} & \textbf{0.028} & 0.803          & 0.151          & 0.033          & 116ms/11.8s             & 4.78ms/124ms            \\ \bottomrule
\end{tabular}%
}
\end{table}


%%%%%%%%%%%%%%%%%%%%%%%%%%% End of Experiments %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Discussion}
%% Discuss performance comparison
% Discussion about DL method performances vs. MIRTK (R2)
The quantitative results show that the traditional SVFFD method outperforms the DL methods in several settings. SVFFD performs slow but detailed optimisation for each pair of images, while the DL methods perform one-pass fast predictions but less accurately in its current form. Our traditional baseline also employs a multi-resolution framework which improves optimisation and registration accuracy, while the DL methods only use one resolution.

% MI vs LNCC results
Our experiments also show that LNCC outperformed NMI on several settings, and perhaps interestingly so on T1w-T2w registration. The intensity relationships between T1w and T2w images in our dataset, when observed locally, can be roughly described by intensity inversion and could be handled by LNCC. However, (L)NCC can not be expected to be applicable to other multi-modal registration with more complex appearance and intensity relationships, such as MR-PET or MR-CT, while (N)MI is more generally applicable~\cite{Sotiras2012-nh}. Experiments with more multi-modal data are therefore required to demonstrate this, which we will investigate in the future. On the other hand, a major drawback of globally evaluated (N)MI is that no spatial information in the image is considered~\cite{DBLP:conf/miip/RueckertCHH00}. We also empirically found that the training of NMI models is more dependent on spatial regularisation.
% NGF Re: Reviwer 3
Other modality-invariant similarity metrics such as Normalised Gradient Fields (NGF) have also been explored in DL registration where a rich amount of edges of the structures of interest can be found in the images \cite{DBLP:conf/miip/HeringH19}, which could also be compared to for multi-modal registration. 

%% Discuss remaining folding
Noticeably, folding is still present in the experimental results for all methods despite using the diffeomorphic SVF transformation. If topology preservation is required for specific applications, it can be achieved by changing the hyperparameters. The velocity field could be enforced to be smoother to achieve diffeomorphism by increasing the smoothness regularisation, often at a cost of substantial drop in Dice accuracy; the number of time steps used in the Scaling-and-Squaring (SS) algorithm to approximate the continuous integral can be increased to reduce folding but with an increased computational cost. In this paper, we carefully tune the hyper-parameters within the constraints of our computational resource so that the presented results are more comparable for a general evaluation. 

\section{Conclusions}
In this work, we present a deep learning framework trained using differentiable mutual information for fast and robust mono- and multi-modal image registration. We also propose to use a parameter-efficient B-spline free form deformation (FFD) via stationary velocity field (SVF) for smooth and diffeomorphic deformation. Evaluation results show that the proposed framework achieves competitive registration accuracy and transformation regularity across modality settings while being computationally more efficient. In future works, we will study the sensitivity of hyper-parameters, evaluate the different approaches on more diverse multi-resolution tasks, and investigate adding a multi-resolution scheme and incorporating multi-step optimisation in DL registration. 



% The B-spline SVFs module is parameter-efficient and can be embedded more easily into any DL methods that requires the learning of deformation. 


% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{We thank EPSRC grants EP/P001009/1 and EP/R005982/1 for supporting this work. Access to the cardiac MR data is granted under UK Biobank application 40119. We also thank Loic Le Folgoc, Jeremy Tan and Jo Schlemper from the BioMedIA group for discussions without which the work would not have been possible.} 


\bibliography{Qiu21.bib}

\newpage
\appendix
\section{Parzen window illustration}

% (Figure 1a & 1b: demonstrating Parzen estimate vs. binning)
\begin{figure}[h!]
    \centering
    \includegraphics[width=0.85\textwidth]{Figures/figures/Fig1_PW.png}
    \caption{1D illustration of histogram estimation using the rectangular window (left), known as \textit{binning}, and Gaussian function as Parzen window (right). The horizontal axis shows the bin number in the intensity range. The red triangles mark the intensity value of one sample point in the image and the red arrows indicate the values that this sample contributes to the histogram at the evaluated bin centres $i_\iota$.} \label{fig:histogram_demo}
\end{figure}



\section{Network architecture}
\begin{figure}[ht]
    \centering
    \includegraphics[width=1.0\textwidth]{Figures/figures/networks.pdf}
    \caption{The architectures of: a) The base U-net architecture for direct prediction of dense fields; and b) Our network for prediction of the control point velocities $\vec{v}^\beta$ for B-spline parameterised SVFs. The example network shown here is configured for control point spacing of 4 (pixels/voxels). The resolution relative to the original images and number of channels are shown below each data block.}
    \label{fig:networks}
\end{figure}


\newpage

\section{Hyper-parameters}\label{sec:app_hparams}
\begin{table}[ht]
\centering
\caption{Hyper-parameters of all methods producing the results shown in the paper. $\delta$ is the spacing of the B-spline control point grid (in image space). $\lambda$ is the regularisation weight introduced in Eq.~\eqref{eq:loss}; for SVFFD, it is the weighting on the Bending Energy regularisation loss.}
\label{tab:hparams}
% \resizebox{\textwidth}{!}{%
\begin{tabular}{@{}l|cc|cc|cc@{}}
\toprule
                       & \multicolumn{2}{c|}{Brain T1w-T1w} & \multicolumn{2}{c|}{Brain T1w-T2w} & \multicolumn{2}{c}{Cardiac Motion} \\ \midrule
Methods                & $\delta$         & $\lambda$       & $\delta$        & $\lambda$        & $\delta$       & $\lambda$          \\ \midrule
SVFFD$_{\!L\!N\!C\!C}$ & 4                & $10^{-3}$       & 6               & $10^{-4}$        & 4              & $10^{-3}$          \\
VM$_{\!L\!N\!C\!C}$    & -                & 0.1             & -               & 0.1              & -              & 0.1                \\
MIDIR$_{\!L\!N\!C\!C}$ & 2                & 0.1             & 2               & 0.1              & 2              & 0.1       \\ \midrule
SVFFD$_{\!N\!M\!I}$    & 8                & $10^{-5}$       & 7               & $10^{-5}$        & 4              & $10^{-6}$          \\
VM$_{\!N\!M\!I}$       & -       & 0.1             & -               & 0.05             & -              & 0.1                \\
MIDIR$_{\!N\!M\!I}$    & 2                & 0.08            & 2               & 0.1              & 2              & 0.1                \\ \bottomrule
\end{tabular}%
% }
\end{table}


Other hyper-parameters:
\begin{itemize}
    \item Window size when computing LNCC: 7
    \item Number of \textit{bins} used when computing NMI: 32 for SVFFD on cardiac registration, 64 for all other experiments
\end{itemize}



%%%%%%% Box plots %%%%%%%

\section{Boxplots of quantitative results}
%% T1-T1 Box plots
% Dice
\begin{figure}[htb]
    \centering
    \includegraphics[width=0.95\textwidth]{Figures/boxplots/t1t1_dice_plot.pdf}
    \caption{Boxplot of Dice results for brain T1w-T1w registration. The red lines in the boxes mark the mean value and the black lines mark the median. Results are shown for different groups of anatomical structures (GM stands for Grey Matter), with the mean Dice over all structures shown on the far right.}
    \label{fig:t1t1_dice}
\end{figure}


% Jacobian
\begin{figure}[htbp]
\floatconts
{fig:t1t1_jac}% label for whole figure
{\caption{Boxplot of regularity results for brain T1w-T1w registration.}}
{%
\subfigure[Folding ($J<0$)][b]{%
  \includegraphics[width=0.48\textwidth]{Figures/boxplots/t1t1_folding.pdf}
}
\subfigure[Smoothness ($|\nabla_J|$)][b]{%
  \centering
  \includegraphics[width=0.48\textwidth]{Figures/boxplots/t1t1_gradJ.pdf}
}
}
\end{figure}


%% T1-T2 Box plots
% Dice
\begin{figure}[htbp]
    \centering
    \includegraphics[width=0.95\textwidth]{Figures/boxplots/t1t2_dice_plot.pdf}
    \caption{Boxplot of Dice results for brain T1w-T2w registration. Similar configuration as Figure \ref{fig:t1t1_dice}.}
    \label{fig:t1t2_dice}
\end{figure}


% Jacobian
\begin{figure}[htbp]
\centering
\floatconts
{fig:t1t2_jac}% label for whole figure
{\caption{Boxplot of regularity results for brain T1w-T2w registration.}}
{%
\subfigure[Folding ($J<0$)][b]{%
  \includegraphics[width=0.48\textwidth]{Figures/boxplots/t1t2_folding.pdf}
}
\subfigure[Smoothness ($|\nabla_J|$)][b]{%
  \centering
  \includegraphics[width=0.48\textwidth]{Figures/boxplots/t1t2_gradJ.pdf}
}
}
\end{figure}


%% cardiac boxplots
% Dice
\begin{figure}[h]
    \centering
    \includegraphics[width=0.95\textwidth]{Figures/boxplots/cardiac_dice_plot.pdf}
    \caption{Boxplot of Dice results for cardiac MR registration. Results are shown for Left Ventricle volume (LV), Myocardium (Myo) and Right Ventricle volume (RV).}
    \label{fig:cardiac_dice}
\end{figure}


% Jacobian
\begin{figure}[htbp]
\centering
\floatconts
{fig:cardiac_jac}% label for whole figure
{\caption{Boxplot of regularity results for cardiac MR registration.}}
{%
\subfigure[Folding ($J<0$)][b]{%
  \includegraphics[width=0.48\textwidth]{Figures/boxplots/cardiac_folding.pdf}
}
\subfigure[Smoothness ($|\nabla_J|$)][b]{%
  \centering
  \includegraphics[width=0.48\textwidth]{Figures/boxplots/cardiac_gradJ.pdf}
}
}
\end{figure}

%%%%%%%%%%%%%%%%%%

\newpage
\section{Visualisation of images and transformations}
%% visual images
% t1t1
\begin{figure}[htbp]
    \centering
    \includegraphics[width=0.95\textwidth]{Figures/images/t1t1_images.png}
    \caption{An example axial slice of brain MR T1w-T1w registration from all competing methods. The rows are the target fixed image, the moving image transformed by registration (moved), the error of the registration (white indicates zero error, red means positive and blue means negative) and the transformation.}
    \label{fig:t1t1_images}
\end{figure}

% t1t2
\begin{figure}[htbp]
    \centering
    \includegraphics[width=0.95\textwidth]{Figures/images/t1t2_images.png}
    \caption{An example axial slice of brain MR T1w-T2w registration results. Same configuration as Figure \ref{fig:t1t1_images}, except the error is between the fixed image (T1w) and the transformed T1w image from the same subject as the moving T2w image, which is initially perfectly aligned with the T2w image.}
    \label{fig:t1t2_images}
\end{figure}

% cardiac
\begin{figure}[htbp]
    \centering
    \includegraphics[width=0.95\textwidth]{Figures/images/cardiac_images.png}
    \caption{Example registration of the ED frame (fixed image) of a mid-ventricle slice with the ES frame (moving image) of the same sequence. Same configuration as Figure~\ref{fig:t1t1_images}.}
    \label{fig:cardiac_images}
\end{figure}



\end{document}
