\documentclass{midl} % Include author names
% \documentclass[anon]{midl} % Anonymized submission

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{bm}
\usepackage{bbm}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{booktabs}
\usepackage{multirow}
\usepackage{amssymb}% http://ctan.org/pkg/amssymb
\usepackage{pifont}% http://ctan.org/pkg/pifont
\newcommand{\cmark}{\ding{51}}%
\newcommand{\xmark}{\ding{55}}%
\usepackage{enumitem}
\usepackage{graphicx}

%Tables
% \usepackage{tabularx}
% MATH
\newcommand{\coord}{\mathbf{x}}
\newcommand{\latent}{\mathbf{z}}
\newcommand{\hidden}{\mathbf{h}}
\newcommand{\gain}{\mathbf{g}}
\newcommand{\act}{\psi}


% \jmlrvolume{-- Under Review}
% \jmlryear{2024}
% \jmlrworkshop{Full Paper -- MIDL 2024 submission}
% \editors{Under Review for MIDL 2024}
\jmlryear{2024}
\jmlrworkshop{Full Paper -- MIDL 2024}
\jmlrvolume{-- 006}
\editors{Accepted for publication at MIDL 2024}

\title[SINR]{SINR: Spline-enhanced implicit neural representation\\ for multi-modal registration}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

\midlauthor{\Name{Vasiliki Sideri-Lampretsa\nametag{$^{1,2}$}} \Email{vasiliki.sideri-lampretsa@tum.de}\\
\Name{Julian McGinnis\nametag{$^{1,2}$}} \Email{julian.mcginnis@tum.de}\\
\Name{Huaqi Qiu\nametag{$^{1}$}} \Email{huaqi.qiu@tum.de}\\
\Name{Magdalini Paschali\nametag{$^{4}$}} \Email{magda.paschali@stanford.edu}\\
\Name{Walter Simson\nametag{$^{4}$}} \Email{wsimson@stanford.edu}\\
\Name{Daniel Rueckert\nametag{$^{1,2,3}$}} \Email{daniel.rueckert@tum.de}\\
\addr $^{1}$ Institute for AI in Medicine, Technical University of Munich, Germany\\ 
\addr $^{2}$ Klinikum rechts der Isar, Munich, Germany \\ 
\addr $^{3}$ Biomedical Image Analysis Group, Department of Computing, Imperial College London\\
\addr $^{4}$ Department of Radiology, School of Medicine, Stanford University, USA\\
}


\begin{document}

\maketitle


\begin{abstract}

Deformable image registration has undergone a transformative shift with the advent of deep learning. While convolutional neural networks (CNNs) allow for accelerated registration, they exhibit reduced accuracy compared to iterative pairwise optimization methods and require extensive training cohorts. Based on the advances in representing signals with neural networks, implicit neural representations (INRs) have emerged in the registration community to model dense displacement fields continuously. Using a pairwise registration setup, INRs mitigate the bias learned over a cohort of patients while leveraging advanced methodology and gradient-based optimization. However, the coordinate sampling scheme makes dense transformation parametrization with an INR prone to generating physiologically implausible configurations resulting in spatial folding. In this paper, we introduce SINR - a method to parameterize the continuous deformable transformation represented by an INR using Free Form Deformations (FFD). SINR allows for multi-modal deformable registration while mitigating folding issues found in current INR-based registration methods. SINR outperforms existing state-of-the-art methods on both 3D mono- and multi-modal brain registration on the CamCAN dataset, demonstrating its capabilities for pairwise mono- and multi-modal image registration. 
\end{abstract}

\begin{keywords}
implicit neural representations, image registration, multi-modal
\end{keywords}

\section{Introduction}

Image registration involves aligning corresponding semantic regions in two or more images acquired with different imaging modalities or at separate points in time~\cite{Sotiras2013DeformableMI}. In medical imaging, registration is vital for the quantitative interpretation of multiple images of the same patient, e.g., multi-modal image fusion, motion correction, and disease progression tracking. Conventional registration methods rely on pairwise instance optimization to learn gridded displacement fields, where a dissimilarity measure over a space of transformations is iteratively minimized \cite{ashburner2007,beg2005,Rueckert1999NonrigidRU}. 

\begin{figure}[t]
\centering
\includegraphics[width=\textwidth]{figures/outline9.png}
\caption{
Given a densely sampled MRI, our approach, SINR, selects a subset of coordinates as control points to train an INR. Using gradient descent, the INR learns to model the continuous displacement field. By incorporating Free Form Deformations (FFD), we implicitly regularize the INR, achieving smoother transformations. } \label{fig:overview}
\end{figure}
%
Data-driven registration-learning methods~\cite{haskins2020}, commonly implemented with CNNs, learn image correspondence over a dataset of image pairs and predict dense displacement fields~\cite{balakrishnan2019}, diffeomorphisms~\cite{Dalca2018UnsupervisedLF,mok2020} or parameters of the transformation model~\cite{Qiu2021LearningDA}. Due to training bias, these approaches are often resolution-dependent and can fail to generalize to other modalities. This registration-learning paradigm can offer fast inference time at the cost of accuracy~\cite{Hansen2021RevisitingIH}. 
% Works by\cite{Mok2023DeformableMI,heinrich2022voxelmorph} aim to overcome this limitation by employing instance optimization on inferred deformation fields.

Recently, coordinate-based implicit neural representations (INRs) have been proposed to encode signals such as images or transformations as a function stored in the weights of a multi-layer perceptron (MLP)~\cite{sitzmann2020implicit,tancik2020fourier,mildenhall2021nerf}. These approaches allow for a \emph{continuous} representation of the underlying signal with potentially lower storage requirements than gridded representations~\cite{dupont2021coin}.
Activation functions, such as ReLUs, sinusoids (SIRENs), and Gaussian activations, have been proposed~\cite{rahaman2019spectral,sitzmann2020implicit}, with benefits to fidelity and training speed. INRs have also been used for registration; \citet{wolterink2022implicit} proposed representing a dense deformable transformation between lung Computed Tomography (CT) images using INRs, while \citet{han2023diffeomorphic} introduced mono-modal diffeomorphic registration with INRs.
Finally, \citet{byra2023exploring} examined the efficacy of INRs in improving the registration of mono-modal brain images in MRI. In these works, INRs are fitted only with the normalized coordinates of a single image pair and predict the dense displacement field, minimizing a conventional intensity-based dissimilarity measure. Additionally, unlike CNNs, INRs do not require a large training dataset.
% Due to their lightweight architecture, tuning the INR for a specific application or image pair is straightforward and indicates the efficacy and versatility of this framework.
% This fact also alleviates out-of-distribution challenges commonly faced in large CNNs, reducing the burden of clinical translation.

INR-based registration methods commonly leverage sinusoidal activations, which can better represent higher frequency signal components~\cite{sitzmann2020implicit}. However, the expressiveness of SIRENs is explicitly controlled by the sinusoids' frequency term $\omega$.
% large $\omega$, i.e., larger frequency, increased spatial variability of the deformation field, but 
Large $\omega$ values can lead to spatial folding and require higher explicit regularization to enforce smoothness~\cite{byra2023exploring}. These explicit regularization terms could negatively influence registration's accuracy.

The coordinate sampling might also affect the convergence and registration performance in the context of INR registration. To mitigate this negative effect due to random sampling, \citet{wolterink2022implicit} suggested using a mask to ensure that regions of interest are more frequently sampled. This issue of sample prioritization becomes more pronounced in multi-modal registration where information-based metrics such as (normalized) mutual information (NMI)~\cite{Studholme1999AnOI, Wells1996MultimodalVR} are employed. NMI is computationally expensive because it uses histograms to approximate the joint intensity distribution and, therefore, requires a large batch of coordinates for a representative distribution of the image content for successful registration. Consequently, the computational resources scale quickly with image size and dimensionality, reducing tractability with scale. Conversely, smaller batch sizes have been shown to lead to higher signal modeling accuracy in INRs and increased training stability~\cite{mcginnis2023single}, thus posing a challenge to the NMI-based multi-modal registration.

To address the limitations of current implicit registration methods, we propose Spline-enhanced INR (SINR), which parameterizes the implicit representation of a deformable transformation using Free Form Deformations (FFD)~\cite{Rueckert1999NonrigidRU}. FFD, originally proposed for the flexible manipulation of 3D shapes, deforms a control lattice, allowing the implicit regularization of SINR to produce smoother transformations without compromising registration accuracy. Further, the FFD model reduces the sensitivity of SINR to the choice of frequency ($\omega$) in the SIREN activation. SINR only parameterizes spatially sparse FFD control points, reducing the computational burden of coordinate sampling. Unlike previous work~\cite{wolterink2022implicit}, this allows SINR to use NMI efficiently with INRs not only for mono-modal but also for multi-modal registration. 

\noindent Our contributions are the following:
%\begin{itemize}[nosep]
%    \item We propose SINR - a method to parameterize an INR for registration with Free Form Deformation, constraining the transformation to be smoother and negating the effect of careful sampling, allowing for efficient optimization using NMI;
%    \item Our method improves registration accuracy and transformation regularity while exploring multi-modal registration using INRs for the first time;
%    \item We compare SINR against conventional and deep learning methods and achieve state-of-the-art mono- and multi-modal 3D brain MR image registration results.
%\end{itemize}

% % new draft bullet points
 \begin{itemize}[noitemsep]
    \item We propose SINR, a registration method that parameterizes deformable transformation by combining implicit neural representation (INR) with free-form deformation (FFD). The efficient spatial sampling and intrinsic smoothness, benefits of the FFD model, lead to improved optimization and state-of-the-art registration performance; 
    \item SINR exploits the FFD control point sparsity to efficiently calculate NMI, which enables multi-modal INR-based registration for the first time;
    \item SINR achieves accurate registration with comparable or improved transformation regularity. We evaluate registration performance on mono-modal and multi-modal brain MRIs and compare it with iterative and learning-based methods.
    
 \end{itemize}

\section{Method}
\subsection{Pairwise Image Registration}

Given two $n$-dimensional images, a fixed image \(F\) and a moving image \(M\) with \(F, M:\Omega\subset\mathbbm{R}^n\rightarrow\mathbbm{R}\) ($n = 3$ for 3D MRIs), image registration aims to find an optimal spatial transformation $\phi:\mathbbm{R}^n\rightarrow\mathbbm{R}^n$ such that the transformed moving image is most similar to the fixed image. Typically, this is formulated as an optimization problem $\phi^*=\arg \min_{\phi} \mathcal{J}(F, M,\phi)$ where the distance between the images is minimized with constraints on the transformation. We denote the objective function $\mathcal{J}$ as:

\begin{equation}
    \mathcal{J}(F,M,\phi) = \mathcal{D}(F,M\circ\phi) + \lambda \mathcal{R}(\phi),
    \label{eq:J}
\end{equation}


\noindent where $\mathcal{D}$ is an intensity dissimilarity measure and $\mathcal{R}$ is the regularization on the transformation field whose effect is controlled by the parameter $\lambda$.


\subsection{Free Form Deformations}

Free-form deformations involve the flexible alteration of images by adjusting control points within a parametric space, allowing non-rigid transformations. B-spline-based FFD models parametrize a deformable transformation between two images by defining a mesh of control points in the spatial domain of the image volume \cite{Rueckert1999NonrigidRU}. Given a uniform spacing $\delta$, the FFD can be formulated as:

\begin{equation}
    \mathsf{u}(x, y, z) = \sum^{3}_{l=0}\sum^{3}_{m=0}\sum^{3}_{n=0}B_l(u)B_m(v)B_n(w)\,c_{i+l,\,j+m,\,k+n},
\end{equation}

\noindent where \((i, j, k)\) are the indices of the control point which is closest to the origin in the control point cube that encloses \((x, y, z)\), $B$ are the B-spline basis functions as presented in (\ref{eq:splines}), and \((u, v, w)\) are the normalized local coordinates of \((x, y, z)\) in its enclosing control point cube.

\begin{equation}
    \begin{aligned}
    B_0(u) &= \frac{(1-u)^3}{6}, &
    B_1(u) &= \frac{3u^3 - 6u^2 + 4}{6}, \\
    B_2(u) &= \frac{-3u^3 + 3u^2 + 3u + 1}{6}, &
    B_3(u) &= \frac{u^3}{6}.
    \end{aligned}
    \label{eq:splines}
\end{equation}

This transformation parametrization has some advantages due to its B-spline formulation. Firstly, B-splines have local support, which makes them a compelling choice for parameterizing deformable transformation. In other words, each control point affects the transformation only in its local neighborhood. Moreover, the spacing of the control point mesh directly controls the transformation smoothness. Larger spacing between control points facilitates the representation of more global, smoother, nonrigid deformations, while smaller spacing allows for the modeling of highly localized nonrigid deformations. Therefore, the control point spacing can implicitly act as a method constraint, promoting smoothness.



\subsection{Proposed Method - SINR}
While conventional iterative methods estimate the transformation using pairwise optimization and CNN-based methods learn the transformation over a cohort, we employ a coordinate-based INR to model the transformation between a pair of images due to its high signal fidelity and fast training speed. Leveraging the compressed representation, we propose to exploit the inherent smoothness of the B-spline FFD we previously described and use an INR \(f_\theta\) with trainable parameters \(\theta\) to approximate the transformation \(\phi(\bm{x_{cp}}) = \bm{x_{cp}} + \mathsf{u}(\bm{x_{cp}})\) between a given pair of images \(F, M\), where \(\bm{x_{cp}}\in\Omega\) are the control point coordinates and \(\mathsf{u}(\bm{x_{cp}}) = f_\theta(\bm{x_{cp}})\) are the displacements on the control point coordinates. The $L$-layer network is modelled as \(f_\theta = f_L \circ f_{L-1} \circ \ldots \circ f_1\), with

\begin{equation}
    \hidden_{l} = f_l(\hidden_{l-1}) = \act(W_l\hidden_{l-1} + b_l), \quad 1 \leq l\leq L,
\end{equation}

\noindent where $W_l$ denote the weights, $b_l$ the bias, \(\hidden_l\) the hidden feature vector for the $l$-th layer, and $\act$ the network's activation function, for which we experiment with both ReLU and SIREN.

% To overcome the low-frequency bias in the modeling of data and signals \cite{rahaman2019spectral}, alternative activation functions \cite{sitzmann2020implicit, saragadam2023wire} and projection methods \cite{tancik2020fourier} have been developed to increase the representation capabilities and fidelity of INRs. In this work, we experiment with ReLUs and sines \cite{sitzmann2020implicit} - SIRENs as activation functions. The sine frequency \(\omega\) regulates the bias towards the signal frequencies. Higher \(\omega\) values tend to generate highly nonrigid deformations but, at the same time more folding.

% and a coordinate feature mapping \(\gamma\) \cite{Tancik2020FourierFL} that maps the input points \(\bm{x}\) to the surface of a higher dimensional hypersphere with a set of sinusoids: \(\gamma(\bm{x}) = [e_1(\bm{x}), e_2(\bm{x}), ..., e_n(\bm{x})], \text{ with }
% e_i(\bm{x}) = [\cos (2 \pi \omega^\intercal_i \bm{x}), \sin (2 \pi \omega^\intercal_i \bm{x})]
% \), where \(\omega_i \in \mathbbm{R}^3\) is randomly sampled i.i.d. from a Gaussian distribution with standard deviation $\sigma$ which regulates the bias towards the higher signal frequencies. 


% To overcome the low-frequency bias in the modelling of data and signals \cite{rahaman2019spectral}, alternative activation functions \cite{sitzmann2020implicit, saragadam2023wire} and projection methods \cite{tancik2020fourier} have been developed, allowing to increase the representation capabilities and fidelity of INRs.




\begin{figure}[t]
\centering
\includegraphics[width=\textwidth]{figures/qual.png}
\caption{Qualitative results on T1w-T2w registration. The proposed SINR with SIREN activations achieves more plausible results (0.51\% folding ratio) compared to IDIR with SIREN~\cite{wolterink2022implicit} activation (0.87\% folding ratio).} \label{fig:qualitative}
\end{figure}

\section{Experimental Setup}

\paragraph{Evaluation metrics.}
Assessing the registration performance can be challenging since the ground truth deformations are unknown. Therefore, registration accuracy and regularity are evaluated with surrogate measures. Accuracy is determined by assessing the overlap between the anatomical segmentation using the Dice score. The regularity of transformation is evaluated based on the Jacobian determinant. The extent of \emph{folding} in the image due to the transformation is measured by the percentage of points with a negative Jacobian determinant, \(\det(\nabla \phi) < 0\).


\paragraph{Datasets.}
We evaluate our work on the inter-subject brain registration using the CamCAN\footnote{\url{https://cam-can.mrc-cbu.cam.ac.uk/dataset/}} dataset~\cite{Shafto2014TheCC,Taylor2017TheCC}. The dataset consists of $310$ T1w and T2w MR 3D volumetric images of size \(192 \times 192 \times 192\) and \(1\,\mathrm{mm}^3\) isotropic spatial resolution, which we split into $80$\% training, $10$\% validation and $10$\% test set. We normalize all images to the MNI space~\cite{Horn2016} using affine registration, ensuring an isotropic spatial resolution with a voxel size of \(1\,\mathrm{mm}^3\). We perform skull-stripping using ROBEX \cite{Iglesias2011Robex} and bias-field correction with SimpleITK \cite{Lowekamp2013TheDO}. For assessment purposes, we obtained automated segmentation of 138 cortical and subcortical structures, categorized into 5 groups, using MALPEM~\cite{Ledig2015RobustWS}.

\paragraph{Baselines.}
We first compare the proposed method to a conventional iterative method Medical Image Registration ToolKit (MIRTK)~\cite{Schuh2014ConstructionOA}, which is based on the FFD model. We also compare against two CNN-based deep learning methods; the widely used Voxelmorph (VMorph)~\cite{Dalca2018UnsupervisedLF}, which outputs a dense displacement field, and Modality-Invariant Diffeomorphic Deep Learning Image Registration (MIDIR) \cite{Qiu2021LearningDA}, which predicts FFD as transformation. Furthermore, we compare against an INR-based method named Implicit Neural Representations for Deformable Image Registration (IDIR) \cite{wolterink2022implicit}, which outputs a dense displacement field instead of a parameterized transformation. We also test this approach with both SIRENs and ReLUs as activation functions.

% We extend this to utilize a Fourier encoding to allow the MLP to model higher frequency components of the transformation\cite{tancik2020fourier}. 

\paragraph{Implementation.}
We trained all the mono-modal experiments with Normalized Cross Correlation and the multi-modal ones using differentiable NMI~\cite{devos2019} as image similarity measure. All baselines and the proposed method incorporate bending energy for regularization as introduced by \citet{Rueckert1999NonrigidRU}. All the INRs were trained using the Adam optimizer with a $10^{-4}$ learning rate for a maximum of $2500$ epochs. We considered sampling the coordinates inside the brain mask only for the baseline INRs and not for SINR. The dense mono-modal experiments used a coordinate batch size of $10$k samples, while the multi-modal experiments used a batch size of $890$k (\(\frac{1}{8}\) of the total points) to ensure convergence with NMI. We tune the selection of hyperparameters, namely the regularization weight \(\lambda\) and \(\omega\), by evaluating every 50 steps and performing an early stopping if the folding ratio becomes larger than \(0.9\%\).
This threshold is chosen empirically by evaluating the registration performance qualitatively.
We refer the reader to Appendix~\ref{app}, Figure~\ref{fig:qual_fold}, where the resulting transformation
demonstrates approximately 0.9\% folding.
We choose the hyperparameters that achieve the highest Dice score for every method while not surpassing this folding ratio threshold.
For the dataset, optimal outcomes are observed when the control points are spaced at \(2\,\mathrm{mm}\) in each dimension.
The code is publicly available\footnote{\url{https://github.com/vasl12/SINR.git}}.

\begin{table}[t]
    \centering
    \caption{Best scores of SINR and its competitors. The mean and std of the Dice score over anatomical structures are reported along with the transformation's folding ratio and whether or not the method utilizes the FFD transformation.}
    \begin{tabular}{l|c|c|c|c|c}
        \toprule
        \multicolumn{1}{l}{} & \multicolumn{3}{c}{T1w-T1w CamCAN} & \multicolumn{2}{c}{T1w-T2w CamCAN}  \\
        \midrule
        Method & FFD & Dice \(\pm\) std $\uparrow$ & Folding \% $\downarrow$ & Dice \(\pm\) std $\uparrow$ & Folding \% $\downarrow$\\
        \midrule
        Affine & n/a & \(0.619\pm0.01\) & - & \(0.619\pm0.01\) & - \\
        MIRTK & \cmark & \(0.833\pm0.02\) & \(0.11\) & \(0.755\pm0.01\) & \(0.14\)   \\
        VMorph [CNN] & \xmark & \(0.812\pm0.06\) & \(0.31\) & \(0.733\pm0.04\) & \(0.19\)   \\
        MIDIR [CNN] & \cmark & \(0.817\pm0.06\) & \(0.23\) & \(0.735\pm0.04\) & \(0.12\)   \\
        \midrule
        IDIR [ReLU-MLP]
        & \xmark & \(0.806\pm0.02\) & \(0.44\) & \(0.683\pm0.03\) & \(0.15\)   \\
        \textbf{SINR} [ReLU-MLP]
        & \cmark & \(0.789\pm0.03\) & \(0.38\) & \(0.721\pm0.06\) & \(0.05\)   \\
        IDIR [SIREN]  & \xmark & \(0.837\pm0.05\) & \(0.84\) & \(0.736\pm0.02\) & \(0.81\)   \\
        \textbf{SINR} [SIREN] & \cmark & \(\mathbf{0.855\pm0.06}\) & \(0.59\) & \(\mathbf{0.784\pm0.04}\) & \(0.27\)   \\
        % PE & \xmark & \(0.810 \pm 0.01\) & \(0.33\) & \xmark & \(0.745 \pm 0.04\) &  \(0.41\) \\
        % PE & \cmark & \(0.831 \pm 0.04\) & \(0.16\) & \cmark & \(0.770 \pm 0.04\) & \(0.27\) \\
        \bottomrule
    \end{tabular} 
    \label{table:best_scores}
\end{table}

\section{Results and Discussion}

% This section examines the quantitative outcomes of mono- and multi-modal registration experiments (refer to Table \ref{table:best_scores}), highlighting how SINR contributes to improved registration accuracy and smoothness.

%Table \ref{table:best_scores} presents the quantitative evaluation of all models on brain T1w-T2w and T1w-T1w MR registration using the CamCAN dataset. 

In our experiments, SINR, using sinusoidal activation functions, achieves the highest registration accuracy in Dice, surpassing the INR-based IDIR by $1.8$\% in the mono-modal and $4.8$\% in the multi-modal case.
SINR performs sparse spatial sampling with its use of FFD control points.
This sparse sampling strategy allows for stable computation of NMI via more efficient spatial sampling over the entire domain compared with random sampling of IDIR.
Compared to IDIR, SINR can achieve higher Dice without requiring masked sampling strategies.
% The sparsity of the FFD control points used by SINR allows for efficient sampling over the whole domain, benefiting the computation of the NMI compared to random sampling inside a mask that IDIR uses.
% Our method exploits the computational efficiency brought by the sparsity of the FFD control points to enable the training of multi-modal INR-based registration for the first time
Similarly, it demonstrates superior performance over the conventional iterative method (MIRTK) by $2.2$\% and $2.9$\%, respectively. Moreover, the CNN-based VMorph and MIDIR baselines underperform compared to the pairwise SINR (by approximately $4$\% and $5$\% for mono- and multi-modal) because they are not optimized individually, but they estimate the transformation based on the prior learned over the whole training set. Additionally, our results highlight the superiority of SIREN-based methods over ReLU methods. SIRENs can represent signals with higher frequency components and, hence, more accurate transformations, while ReLUs tend to produce smoother transformations, which might not be descriptive enough and, as a result, lack performance. Comparing SINR with SIREN vs. with ReLU, SINR with SIREN achieves approximately $6$\% higher Dice, which shows that combining ReLUs with the FFD leads to over-smoothed transformations that lack the desired expressiveness.

Figure~\ref{fig:structure_dice} demonstrates the accuracy of the SINR compared to the baselines for individual classes and the overall mean, confirming the finding of Table~\ref{table:best_scores}. SINR with SIREN demonstrates a superior mean Dice score and outperforms all baselines in almost all the individual classes in both registration tasks. MIRTK achieves a marginally higher Dice score for Noncortical GM in the mono-modal case and a comparable score in the multi-modal case for White Matter. However, the proposed method achieves substantially higher Dice for other structures in both mono- and multi-modal cases (cf.\ Figure~\ref{fig:structure_dice}). We further refer the reader to the Appendix~\ref{app}, Figure~\ref{fig:larger_structures} for an enlarged version of Figure~\ref{fig:structure_dice}.
\begin{figure}[t]
    \centering
    \subfigure[T1w-T1w]{\includegraphics[width=0.49\textwidth]{figures/dice_t1t1.eps}} 
    \subfigure[T1w-T2w]{\includegraphics[width=0.49\textwidth]{figures/dice_t1t2.eps}} 
    \caption{Dice scores for brain registration by structure, indicating the average (mean) across White Matter (WM) and Grey Matter (GM) areas. A larger version can be found in Appendix~\ref{app}, Figure~\ref{fig:larger_structures}. }
    \label{fig:structure_dice}
\end{figure}

Regarding folding ratio, the proposed SINR with SIREN and ReLU activations manages to mitigate the folding ratio effect, which IDIR is prone to, as shown in Table~\ref{table:best_scores}. In the multi-modal setting, SINR equipped with ReLUs demonstrates the lowest folding ratio among all its competitors, making it a suitable candidate for applications in which a smooth transformation is desired, such as inhale-exhale lung registration. For multi-modal registration, our SIREN-based SINR's folding ratio was marginally higher compared to other baselines but demonstrated an improved Dice score over them. Qualitatively, this can also be confirmed by Figure~\ref{fig:qualitative}, where it can be seen that the FFD results in a smoother, more accurate transformation suitable for brain registration in comparison to IDIR.

\paragraph{Hyperparameter robustness:}
We examine the influence of \(\omega\) on registration accuracy while maintaining the folding ratio below \(0.9\%\) using the dense displacement INR method IDIR and the proposed SINR, which uses FFD. The results are presented in Figure~\ref{fig:omegas} as the average and standard deviation of Dice scores over \(\omega\) ranging from $5$ to $70$. SINR displays consistently higher Dice scores over all \(\omega\) values, showing robustness to \(\omega\) hyperparameter selection. Notably, for the T1w-T2w registration setting, both methods achieve the peak Dice score with an \(\omega\) value $30$, as proposed in~\cite{sitzmann2020implicit}. In the mono-modal setting, SINR achieves the best results with an \(\omega\) of $30$, while IDIR requires an \(\omega\) value of $20$ for maximal performance.
Low values of $\omega$ reduce Dice scores for both SINR and IDIR.
The performance degradation over $\omega$ is more pronounced on IDIR and remains constant with larger $\omega$, while SINR displays reduced sensitivity to $\omega$ and stabilized performance for larger $\omega$ values.
% which was expected since lower values lead to smoother, more global transformations that fail to accurately model the local details in the objective transformation. 


\begin{figure}[t]
    \centering
    \subfigure[T1w-T1w]{\includegraphics[width=0.49\textwidth]{figures/dice_omega_t1t1.png}} 
    \subfigure[T1w-T2w]{\includegraphics[width=0.49\textwidth]{figures/dice_omega_t1t2.png}} 
    \caption{Effect of \(\omega\) on the Dice score for a fixed range of folding percentage of $\sim$0.9\%. SINR with SIREN activations outperforms SIREN-IDIR for all values of \(\omega\) for mono-modal and multi-modal registration settings.}
    \label{fig:omegas}
\end{figure}



\section{Conclusion}
In this work, we propose parameterizing the deformable transformation using an INR with Free Form Deformations. Through this combination, we benefit from the lightweight, fast-fitting INR and the inherent smoothness of B-spline FFD parametrization to achieve state-of-the-art performance in mono-modal and multi-modal brain registration. Extensive experimentation demonstrates the versatility of our approach, which not only outperforms conventional approaches, CNN methods, and dense INRs but also mitigates the implausible transformation percentage of the latter. We further perform an ablation study showing that the proposed FFD-enhanced INR is more robust against the activation function's frequency choice. Future work will extend this approach to other data modalities and anatomies, such as abdominal CT scans, and explore further loss functions. Finally, an interesting future direction is to design an architecture that combines the registration with the FFD within the INR.

\section{Acknowledgements}
JM is supported by the Bavarian State Ministry for Science and Art (Collaborative Bilateral Research Program Bavaria – Québec: AI in medicine, grant F.4-V0134.K5.1/86/34).

% \begin{figure}[htbp]
%     \centering
%     \includegraphics[width=0.4\textwidth]{figures/po_enc_t1t1_sc.png}
%     \caption{Your figure caption here}
%     \label{fig:your_figure_label}
% \end{figure}
% \begin{figure}[htbp]
%     \centering
%     \includegraphics[width=0.4\textwidth]{figures/po_enc_t1t2_sc.png}
%     \caption{Your figure caption here}
%     \label{fig:your_figure_label}
% \end{figure}


% This is where the content of your paper goes.  Some random
% notes\footnote{Random footnote are discouraged}:
% \begin{itemize}
% \item You should use \LaTeX \cite{Lamport:Book:1989}.
% \item JMLR/PMLR uses natbib for references. For simplicity, here, \verb|\cite|  defaults to
%   parenthetical citations, i.e. \verb|\citep|. You can of course also
%   use \verb|\citet| for textual citations.
% \item Eprints such as arXiv papers can of course be cited \cite{Hinton:arXiv:2015:Distilling}. We recomend using a \verb|@misc| bibtex entry for these as shown in the sample bibliography.
% \item You should follow the guidelines provided by the conference.
% \item Read through the JMLR template documentation for specific \LaTeX
%   usage questions.
% \item Note that the JMLR template provides many handy functionalities
% such as \verb|\figureref| to refer to a figure,
% e.g. \figureref{fig:example},  \verb|\tableref| to refer to a table,
% e.g. \tableref{tab:example} and \verb|\equationref| to refer to an equation,
% e.g. \equationref{eq:example}.
% \end{itemize}

% \begin{table}[htbp]
%  % The first argument is the label.
%  % The caption goes in the second argument, and the table contents
%  % go in the third argument.
% \floatconts
%   {tab:example}%
%   {\caption{An Example Table}}%
%   {\begin{tabular}{ll}
%   \bfseries Dataset & \bfseries Result\\
%   Data1 & 0.12345\\
%   Data2 & 0.67890\\
%   Data3 & 0.54321\\
%   Data4 & 0.09876
%   \end{tabular}}
% \end{table}

% \begin{figure}[htbp]
%  % Caption and label go in the first argument and the figure contents
%  % go in the second argument
% \floatconts
%   {fig:example}
%   {\caption{Example Image}}
%   {\includegraphics[width=0.5\linewidth]{example-image}}
% \end{figure}

% \begin{algorithm2e}
% \caption{Computing Net Activation}
% \label{alg:net}
%  % older versions of algorithm2e have \dontprintsemicolon instead
%  % of the following:
%  %\DontPrintSemicolon
%  % older versions of algorithm2e have \linesnumbered instead of the
%  % following:
%  %\LinesNumbered
% \KwIn{$x_1, \ldots, x_n, w_1, \ldots, w_n$}
% \KwOut{$y$, the net activation}
% $y\leftarrow 0$\;
% \For{$i\leftarrow 1$ \KwTo $n$}{
%   $y \leftarrow y + w_i*x_i$\;
% }
% \end{algorithm2e}

% Acknowledgments---Will not appear in anonymized version
% \midlacknowledgments{We thank a bunch of people.}


\bibliography{midl24_006}


\newpage
\appendix
\section{Appendix}\label{app}

\begin{figure}[tbh]
\centering
\includegraphics[width=0.9\textwidth]{figures/folding_0.9.png}
\caption{Qualitative results for T1w-T2w registration. The resulting transformation exhibits approximately $0.9\,\%$ folding, which degrades the quality of the registered image, justifying our choice of this value as the early-stopping criterion.} \label{fig:qual_fold}
\end{figure}

\begin{figure}[t]
    \centering
    \subfigure[T1w-T1w]{\includegraphics[width=0.9\textwidth]{figures/dice_t1t1.eps}} 
    \subfigure[T1w-T2w]{\includegraphics[width=0.9\textwidth]{figures/dice_t1t2.eps}} 
    \caption{Boxplots show Dice scores for brain registration by structure, indicating the average (mean) across White Matter (WM) and Grey Matter (GM) areas.}
    \label{fig:larger_structures}
\end{figure}

\begin{table}[h!]
    \centering
    \caption{Runtime for mono- and multi-modal registration for all methods.}
    \begin{tabular}{l|c|c}
        \toprule
        \multicolumn{1}{l}{} & \multicolumn{2}{c}{Runtime$\downarrow$}  \\
        \midrule
        Method & T1w-T1w CamCAN & T1w-T2w CamCAN \\
        \midrule
        MIRTK  & $3$min $28$s & $3$min $41$s \\
        VMorph [CNN]  & \textbf{Train:} $15$h $23$min \textbf{Test:} $219$ms & \textbf{Train:} $15$h $34$min \textbf{Test:} $219$ms \\
        MIDIR [CNN] & \textbf{Train:} $12$h $55$min \textbf{Test:} $113$ms & \textbf{Train:} $12$h $49$min \textbf{Test:} $113$ms \\
        \midrule
        IDIR [ReLU-MLP]
        & \textbf{Fit:} $1$min $43$s \textbf{Test:} $2.9$s & \textbf{Fit:} $2$min $01$s \textbf{Test:} $2.9$s\\
        \textbf{SINR} [ReLU-MLP]
        & \textbf{Fit:} $1$min $54$s \textbf{Test:} $2.9$s & \textbf{Fit:} $2$min $17$s \textbf{Test:} $2.9$s\\
        IDIR [SIREN] & \textbf{Fit:} $45$s \textbf{Test:} $2.9$s & \textbf{Fit:} $1$min $39$s \textbf{Test:} $2.9$s \\
        \textbf{SINR} [SIREN] & \textbf{Fit:} $1$min $32$s \textbf{Test:} $2.9$s & \textbf{Fit:} $2$min $12$s \textbf{Test:} $2.9$s\\
        % PE & \xmark & \(0.810 \pm 0.01\) & \(0.33\) & \xmark & \(0.745 \pm 0.04\) &  \(0.41\) \\
        % PE & \cmark & \(0.831 \pm 0.04\) & \(0.16\) & \cmark & \(0.770 \pm 0.04\) & \(0.27\) \\
        \bottomrule
    \end{tabular} 
    \label{table:runtime}
\end{table}

\end{document}
