\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{graphicx}
\usepackage{bm}
\usepackage{amssymb}
\usepackage{amsmath}
%\usepackage{tabularx}
\usepackage{booktabs}
\usepackage{multirow}
%\newcolumntype{Y}{>{\centering\arraybackslash}X}
%\newcolumntype{C}[1]{>{\centering\arraybackslash}p{#1}}
\usepackage{etoolbox}
%\AtBeginEnvironment{tabularx}{\small}
\usepackage[normalem]{ulem}
\newcommand{\mup}[1]{\textcolor{red}{#1}} 
\newcommand{\tinycite}[1]{\begin{tabular}{p{0.15\columnwidth}}\scriptsize{#1}\end{tabular}}

%\jmlrvolume{-- Under Review}
%\jmlryear{2025}
%\jmlrworkshop{Full Paper -- MIDL 2025 submission}
%\editors{Under Review for MIDL 2025}
\jmlryear{2025}\jmlrworkshop{Full Paper -- MIDL 2025}\jmlrvolume{-- 219}\editors{Accepted for publication at MIDL 2025}

\title[Rephrasing deformable registration through forward splatting]{Fast forward: Rephrasing 3D deformable image registration through density alignment and splatting}


 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Mattias P. Heinrich\nametag{$^{1}$}} \orcid{0000-0002-7489-1972} \Email{mattias.heinrich@uni-luebeck.de}\\
\addr $^{1}$ Institute of Medical Informatics, Universität zu Lübeck, Germany\\
\Name{Alexander Bigalke\nametag{$^{1,2}$}} \orcid{0000-0001-7824-5735}\\
\addr $^{2}$ Drägerwerk AG \& Co. KGaA, Lübeck, Germany\\
\Name{Lasse Hansen\nametag{$^{3}$}} \orcid{0000-0001-7824-5735} \Email{lasse@echoscout.ai}\\
\addr $^{3}$ EchoScout GmbH, Lübeck, Germany
}

\begin{document}

\maketitle
\begin{abstract}
Unsupervised learning- and optimisation-based 3D registration has almost exclusively been approached using backward warping (interpolation) for transforming images. While this has practical advantages in particular the ease of implementation within common libraries it limits the robustness and accuracy in certain challenging scenarios. The alternative solution of forward splatting (extrapolation) is currently limited to very few applications, e.g. mesh or point cloud registration, requiring specific geometric learning architectures that are so far less efficient compared to dense 3D convolutional networks. In this work, we propose to use a straightforward forward splatting technique based on differentiable rasterisation. Contrary to prior work, we rephrase the problem of deformable image registration as a density alignment of rasterised volumes based on intermediate point cloud representations that can be automatically obtained through e.g. geometric vessel filters or surface segmentations. Our experimental validation demonstrates state-of-the-art performance over a wide range of registration tasks including intra- and inter-patient alignment of thorax and abdomen.
\end{abstract}
\begin{keywords}
Deformable registration, Splatting, Warping
\end{keywords}

\section{Introduction}
Deformable image registration is ubiquitously used in various medical imaging applications many of them requiring realtime and fully automatic solutions. Despite significant progress over the last few years, learning-based image registration still faces some challenges that are absent for other tasks, e.g. a larger performance discrepancy of feed forward models with or without instance optimisation or fine-tuning \cite{balakrishnan2019voxelmorph,heinrich2022voxelmorph++,hering2022learn2reg}. The gap before and after adaptation to a particular instance can be as large as 50\% \cite{heinrich2022voxelmorph++,wang2022plosl}. The necessity to use more complex multi-stage \cite{mok2020large} or cascaded \cite{zhao2019recursive} architectures to mimic an iterative alignment is well documented for large deformation tasks beyond the cranial vault. Moreover, recent research even explored to explicitly build upon parts of conventional gradient descent methods to stabilise the learning of registration networks \cite{jia2021learning}. Eventually, some approaches directly focused on speeding up the optimisation of pre-computed geometric or semantic features \cite{siebert2021fast}. 

A challenge for \textbf{unsupervised deformable image registration} is to find a balance between similarity metric and smoothness regularisation. To avoid degenerated solutions - that e.g. let to two points from different origins in the moving image correspond to the same location in the fixed scan - a smooth mapping is essential. Commonly, a combination of several regularising elements is employed: 1) constraints on the gradient of the estimated displacements, which can also be a conditional hyperparameter \cite{mok2021conditional}; 2) employing a low-parametric transformation model such as B-splines \cite{qiu2021learning} or Fourier-basis functions \cite{jia2023fourier}; 3) a symmetric registration framework that achieves inverse consistency by design \cite{greer2023inverse}. The latter two are preferable because they do not introduce any additional continuous hyperparameters and point to an apparent deficiency of oftentimes weak regularisation constraints. All mentioned approaches formulate registration as a backward warping, which interpolates intermediate intensity values from the moving image at displaced locations. 

Contrarily, sparse \textbf{point cloud registration} can be formulated as forward transform, which aims to align two 3D densities e.g. by solving an optimal transport problem \cite{shen2021accurate} or using the Chamfer distance between moved and fixed points. This, however, comes with its own challenges for defining a well-differentiable cost function \cite{bigalke2023denoised,heinrich2023chasing}. Point cloud processing has greatly benefited from the advent of new geometric deep learning architectures \cite{zhao2021point}. Yet its inherent limitations of expensive sparse memory access \cite{liu2019point} have prevented a more widespread adoption. A notable popular point registration networks \cite{wu2020pointpwc} was only cited 150 times to date. In addition, using forward transforms on dense images requires the extrapolation pixel intensities from one grid to another, a so called splatting operation, that may lead to holes that could negatively affect the loss computation.

We hypothesise that revisiting an under-researched paradigm for medical image registration as forward transform model that combines the complementary strengths of dense and sparse model components can alleviate most aforementioned problems. This leads to a substantially different solution, which replaces the spatial alignment of dense images using backward warping with a forward splatting of density estimates of an intermediate point cloud representation. Prior work \cite{heinrich2023chasing} already showed that splatting provides a well-differentiable loss for training graph-/point registration networks. We crucially extend upon this work by combining forward splatting with well-proven dense 3D convolutional architectures for deformation estimation. Our new paradigm for defining the loss and transform  leads to more robust and accurate performance without finding additional hyperparameter for constraining the displacement regularity.

Our method combines dense volumetric registration with intermediate point cloud representations with the following three \textbf{technical contributions}: 
\begin{itemize}
\item Conversion of volumetric scans into point based multichannel density representations using either semantic, geometric or intensity-based features
\item Differentiable rasterisation for an unsupervised loss definition within a multi-step and optionally symmetric, inverse-consistent registration framework 
\item Fully-convolutional dense 3D networks adapted to rasterised point cloud inputs for improved efficiency and accuracy
\end{itemize}
To demonstrate the efficacy of this newly proposed paradigm for deformable registration we perform a detailed analysis on a challenging proof-of-concept dataset followed by comprehensive experimental validation on 3D medical datasets involving large deformation lung registration and cross-patient shape alignment of abdominal organs. The source code is  available at \url{https://github.com/mattiaspaul/fastforward}.
\begin{figure}
\includegraphics[width=\textwidth]{midl25_219_fastforward_concept.pdf}
\caption{Given two sparse point clouds $\mathcal{F}$ and $\mathcal{M}$ we rasterise 3D volumes $\bm{F}$ and $\bm{M}$ by \textbf{forward splatting} $\zeta$ as input to a registration U-Net (including a B-spline transform) which predicts a dense deformation $\phi$ that is sampled sparsely at the points of $\mathcal{F}$ (vector addition). An $L_1$ loss of a newly splatted  $\bm{F}_{\phi}=\zeta(\mathcal{F},\phi)$ and $\bm{M}$ is used to derive a well differentiable loss and \textbf{stable multi-stage model}.} \label{fig:concept}
\end{figure}
\section{Method}
We consider the task of aligning two 3D images - fixed $\bm{F}$ and moving $\bm{M}$ - with intermediate representations as sparse 3D point cloud,  denoted as $\mathcal{F}$ and $\mathcal{M}$ - using a spatial transform $\varphi$. Given unlabelled training data $\mathcal{T} = \{(\bm{F}_i,\bm{M}_i)\}_{i=1}^{|\mathcal{T}|}$ comprising $|\mathcal{T}|$ unaligned pairs we aim to learn a function $f$ represented by the parameters $\theta_f$ of a deep neural network to predict a displacement field $\hat{\varphi} = f(\bm{F}, \bm{M}, \theta_f)$ that minimises a dissimilarity metric $\mathcal{S}(\bm{F},\varphi\circ\bm{M})$. 

\textbf{Forward versus backward transforms: }In the above notation a conventional backward transformation was formulated that \emph{pulls} $\bm{M}$ towards $\bm{F}$. But it is similarly possible to make the model learn a forward transform $\phi$ that minimises $\mathcal{S}^*(\phi\circ\bm{F},\bm{M})$. Let us start with volumetric data that resides on a regular cartesian grid with integer coordinates $\bm{q}=(q_x,q_y,q_z)\in\mathbb{N}^3$ and associated voxel intensities $\bm{x}\in\mathbb{R}^{n_x\times n_y\times n_z}$ (for ease of presentation we limit ourselves to grayscale but extend the concept to multi-channel images later). To apply the spatial transform $\varphi$ to $\bm{M}$ in a differentiable manner, we can follow \cite{jaderberg2015spatial}, which introduced spatial transformer networks. In practice, we are interested in the intensity values for the coordinates $\bm{p}_{\varphi} = \bm{p}_0 + \bm{\Delta p}$, which represent the addition of identity transform and the relative displacements. We follow the notation of \cite{dai2017deformable} to define a trilinear interpolant $G(\bm{q},\bm{p}) = g(q_x,p_x)\cdot g(q_y,p_y)\cdot g(q_z,p_z), $ with $g$ being defined as $g(q_x,p_x)=\operatorname{max}(0,1-|q_x-p_x|)$, which can be used to perform a weighted average over the 8 neighbouring grid points of displaced voxels $\bm{p}_{\varphi}$ using:
\begin{equation}
\bm{y}(\varphi,\bm{x}) = \sum_{\bm{q}}G(\bm{q},\bm{p}_{\varphi})\cdot\bm{x}(\bm{q}).
\label{eq:interpolate}\end{equation}
Intuitively speaking, this backward transform $\bm{M}_{\varphi}=\bm{y}(\varphi,\bm{M})$ gathers information in an irregular manner from the moving image onto a regular grid for the new warped image that should reside in the domain of the fixed image. Each grid point in the warped image receives the same density, yet the voxels from the moving image may contribute differently to the transformed output. A forward transform conversely scatters voxel intensities from a regular grid onto irregularly spaced spatial coordinates in the transformed image, which requires an extrapolation to neighbouring grid locations. 

Hence as detailed in \cite{heinrich2023chasing} for the case of sparse 3D point clouds $\mathcal{F}$ and $\mathcal{M}$, the order of Eq.~\ref{eq:interpolate} has to be reversed to $\bm{x}(\phi,\bm{q}) = \sum_{\bm{p}}G(\bm{p},\bm{q}_{\phi})\cdot\bm{y}(\bm{p})$ resulting in a trilinear \textbf{splatting operation} $\bm{F}_{\phi}=\bm{x}(\phi,\bm{F})=\zeta(\mathcal{F},\phi)$. When used in isolation this may lead to visually undesirable effects, e.g. holes in the output and multiple intensities that are accumulated onto the same location. While most prior work considered this a disadvantage that required careful post-processing (cf. \cite{birkfellner2005fast}), we want to highlight the benefits of such an approach. When incorporating the splatting operation into the dissimilarity loss, the optimisable function $f_{\theta}$ (e.g. a deep neural network) has to predict a $\phi$ that retains a similar density distribution across images to avoid penalties, which helps to prevent unrealistic and implausible deformations. To that end, we focus our empirical evaluation on the gains that may be obtained by replacing backward with forward transformations. 
% Our method expects , where $N_{\mathcal{F}}=N_{\mathcal{M}}\approx 64\cdot 10^3$. 

\textbf{Dense-to-sparse-to-dense model:} To efficiently handle the challenges of splatting large 3D volumes in a differentiable manner we propose to design a model with intermediate sparse point cloud representations. Fig.~\ref{fig:concept} starts from 3D scans have been converted into two unordered 3D point clouds $\mathcal{F}\in\mathbb{R}^{N_{\mathcal{F}}\times 3},\mathcal{M}\in\mathbb{R}^{N_{\mathcal{M}}\times 3}$. In case of multi-object (multi-channel) registration, each point can have a one-hot vector representing its class assigned.  While geometric deep learning has seen use in the medical domain for shape classification and pose recognition \cite{bigalke2023anatomy}, deformable 3D registration has so far been limited to few sparse point cloud based architectures such as PointPWC-Net \cite{wu2020pointpwc}. They additionally pose a particular challenge for unsupervised learning to define suitable loss functions (cf. \cite{shen2021accurate}). We hence generate a rasterised 3D density representation using the efficient splatting function $\zeta_{\sigma}$ proposed in \cite{heinrich2023chasing} to transform $\mathcal{F}$ and $\mathcal{M}$ into $\bm{F}$ and $\bm{M}$, which reside on a regular cartesian 3D grid and where $\sigma$ defines a spatial smoothing kernel. Such an intermediate dense representations was used e.g. in V2V-PoseNet \cite{moon2018v2v} to address slow random memory access in k-nearest neighbour graph networks (cf. \cite{liu2019point}). With 3D voxel volumes at hand we can employ a straightforward 3D U-Net \cite{cciccek20163d} to define our neural network function $f$. For robustness, the output of the U-Net $\varphi=f(\zeta_{\sigma}(\mathcal{F}),\zeta_{\sigma}(\mathcal{M}))$ is fed into a differentiable B-spline transformation (that comprises no trainable parameters) akin to \cite{qiu2021learning}. To extract the sparse motion vectors that align the two point clouds we first compute a trilinear interpolation (see Eq.~\ref{eq:interpolate}) of $\varphi$ at the coordinates of $\mathcal{F}$ to obtain $\phi=\bm{y}(\mathcal{F},\varphi)$. Next, we perform another \textit{deformable} splatting to perform a forward transform that should satisfy the similarity $\zeta_{\sigma}(\mathcal{F}+\phi)\sim\zeta_{\sigma}(\mathcal{M})$. Note, that absolute and relative coordinates are simply added for the transformation. The hyperparameter $\sigma$ together with the resolution of the cartesian grid $\bm{q}$ defines how much spatial overlap neighbouring points have within the 3D density estimation of $\zeta$ and improves differentiability. 

\textbf{Multi-step, regularisation and inverse consistency:} When designing a backward warping baseline, we noted that the implicit regularisation of the B-spline transform is sometimes not sufficient to achieve competitive performance due to the tendency of this model to \emph{pull} points from multiple different locations to the same grid coordinate. The baseline model hence adds a diffusion regulariser $\mathcal{R}=||\lambda\cdot\nabla\varphi||^2$ to penalise large gradients of $\varphi$, which introduces another hyperparameter to be tuned (cf. \cite{mok2021conditional}). To capture large deformations, we consider the multi-step approach of \cite{mok2020large} that cascades \cite{zhao2019recursive} multiple U-Nets to obtain $\varphi^*_{\bm{FM}}=\varphi^1\circ\varphi^2\circ\ldots\circ\varphi^{(n)}.$ Furthermore, we explore the very recent strategy of establishing symmetry and inverse consistency of the registration by construction following \cite{greer2023inverse} and estimate $\varphi^*=\exp(f_{\bm{FM}}-f_{\bm{MF}})$, where $\exp$ represents the exponentiation step of the scaling-and-squaring approach \cite{avants2008symmetric} (and the output of $f_{\bm{FM}}=f(\bm{F},\bm{M})$ is scaled down appropriately). When estimating large deformations a single network is usually unable to capture the complex transformation. Hence, a two-step consistent approach with two networks $f^{(1)}$ and $f^{(2)}$ is used:
\begin{equation}
\varphi^{(2)}=\exp(\frac{f^{(1)}_{\bm{FM}}-f^{(1)}_{\bm{MF}}}{2})\circ\exp(f^{(2)}_{\bm{\hat{F}\hat{M}}}-f^{(2)}_{\bm{\hat{M}\hat{F}}})\circ\exp(\frac{f^{(1)}_{\bm{FM}}-f^{(1)}_{\bm{MF}}}{2}).
\label{eq:twostep}
\end{equation}
Here, two mid-way warps are defined as $\hat{\bm{M}}=\bm{M}\circ\exp(\frac{f^{(1)}_{\bm{MF}}-f^{(1)}_{\bm{FM}}}{2})$, similar to the ANTs SyN approach \cite{avants2008symmetric}. More details on the implementation of each step can be found within our open source code.

\section{Experiments}
We consider two tasks that are currently challenging for learning based registration and often require additional instance optimisation. Our aim is to ease the difficulty by replacing backward warping with forward splatting in two image registration tasks. First, the large deformation 3D CT lung registration between inspiration and expiration using the PVT1010 \cite{shen2021accurate} and DIRlab COPD \cite{castillo2013reference} patient data respectively. And second, finding shapes correspondences across patients of the multi-organ abdominal CT dataset Beyond the Cranial Vault (BVC) with significant differences across anatomies \cite{xu2016evaluation} that was used in Learn2Reg 2020 \cite{hering2022learn2reg}. We perform up to three registration approaches for those datasets: 1) purely optimisation based, 2) asymmetric (multi-stage) deep learning and 3) symmetric two-step inverse-consistent deep learning registration. 

 The DIRlab COPD dataset remains one of the most challenging tasks for deep learning registration yielding target registration errors (TRE) of $>7$~mm for VoxelMorph \cite{balakrishnan2019voxelmorph} and $\approx5$~mm for LapIRN \cite{mok2020large}. The only highly accurate solution we found in the literature \cite{greer2023inverse} that achieved $<2$~mm was trained on additional 1000 pairs of high-resolution intensity scan pairs of the COPDgene study. The related point cloud dataset (PVT1010) has been tackled with the sparse PointPWC \cite{wu2020pointpwc} model for which results of $\approx5$~mm before instance optimisation \cite{shen2021accurate,heinrich2023chasing} were reached when trained with unsupervised losses and $2.3$~mm for supervision with simulated deformations followed by mean teacher domain adaptation on the real unlabelled data.
 
 %even when additionally using image features for the 10 

%TODO explain the baseline U-Net architecture

\textbf{PVT1010 / COPD} comprises point clouds within the paired lungs of 1000 patients that depict $\approx$ 50-120$\times 10^3$ well distributed 3D coordinates at lung vessels, airways or other points of high local image gradient or curvature in both extreme respiratory phases. For 10 scans the publicly available DIRlab cases provide additional image information (which we do not use) and serve as a benchmark with 300 manual landmarks each.

The \textbf{multi-organ abdomen} dataset (AbdomenCT) is made up by 30 CT scans of different patients for pairwise training \cite{xu2016evaluation}, whereas the test set has 20 scans with 9 annotation labels \cite{gibson2018automatic}. To create multilabel point-clouds we employ predictions from a pre-trained TotalSegmentator \cite{wasserthal2023totalsegmentator} for 14 anatomies and sample 61'440 (15x4096) surface points (using both non-maximum suppression and farthest point sampling). We use \texttt{torch.gradient} to obtain spatial gradients of the segmentations represented as one-hot tensors. The FPS implementation followed the code from \cite{zhao2021point}. The whole process requires $\approx$ 200 millisec. per 3D volume. The Gaussian smoothing employed in the differentiable splatting function $\zeta_{\sigma}$ is sped-up using the approximation of \cite{kovesi2010fast} and extended to the 15-channel input.

\begin{figure}
\centering
\includegraphics[width=\textwidth]{midl25_219_fastforward_abdomen.pdf}
\caption{Example distribution and case for the abdomen test data with initial and transformed labels. A substantial improvement can be found using forward splatting.} \label{fig1}
\end{figure}

\textbf{Implementation details}: We employ the basic MONAI U-Net \cite{cardoso2022monai} with 5 levels and up to 64 channels as $f$ with 2 or 30 (multi-organ) input channels of the concatenated rasterised point clouds with fixed dimensions of $128^3$ voxels. The output is fed to a hyperbolic tangent and multiplied by $0.25$ to limit the displacement range and regularised using a cubic B-spline with a control-point grid of $32^3$ (see Appendix for hyperparameter choices). In case of the two-step inverse consistency (IC), we use scaling-and-squaring and symmetry by design (see Eq.~\ref{eq:twostep}) with two trainable networks. For multi-step, we train up to $st=8$ U-Nets, further limit each output to $\frac{0.3}{st}$ and compose the output transformations. The dissimilarity loss $\mathcal{S}$ is chosen as the $L_1$-Norm of the splatted point clouds of fixed and transformed moving scan. All models or respectively the B-spline grid for optimisation-only approaches are trained with Adam with a learning rate of $0.015$ for 10’000 or 150 iterations respectively. For the latter, the backward warping loss led to insufficient performance, which we addressed by weighting the $L_1$-Norm with the fixed point cloud density. Test-time adaptation is optionally used to fine-tune the weights of $f^{(1)}$ and $f^{(2)}$, since our model uses only an unsupervised loss on the automatically generated point clouds (with $N_{\mathcal{F}}=N_{\mathcal{M}}\approx 64\cdot 10^3$). This step requires only $3-5$ secs. per scan pair for $20-50$ iterations. Training requires less than 1 hour per model on one GPU.  

This is achieved by the very efficient implementation of the splatting operation to rasterise input and intermediate (warped) point clouds, which resulted in orders of magnitude faster processing compared to traditional sparse data interpolation and modern GPU-accelerated kernel operators \cite{JMLR:v22:20-275}, see Appendix \ref{sec:raster} for details. 
\begin{figure}
\centering
\includegraphics[width=\textwidth]{midl25_219_fastforward_pvtcopd1.pdf}
\caption{Rendering of inspiration to expiration point clouds (PVT1010/COPD) showing improved alignment (purple) of our method compared to backward warping.} \label{fig:visualcopd}
\end{figure}


\begin{table}
\caption{Quantitative accuracy for \textbf{multi-organ abdomen} averaged across 8 labels \cite{gibson2018automatic} (initial Dice of 25\%). All methods use label-supervision but only during training. Our forward splatting outperforms backward warping in every case, in particular for optimisation and TTA and sets new SOTA.}\label{tab:abdomen}
\begin{tabular}{l|cccc}
method & optimisation & 8x multi-step & two-step IC & net+TTA \\
    \midrule[1pt]
backw. warp. & 59.88$\pm$7.7\% & 64.56$\pm$9.1\% & 70.95$\pm$5.4\% & 61.42$\pm$6.7\% \\
forw. splat. & 71.74$\pm$6.3\% & 69.10$\pm$6.3\% & 71.25$\pm$5.2\% & \textbf{75.29}$\pm$4.0\% \\
    \midrule[1pt]
SOTA img & 69\% & 67\% & 52.3\% & 69.6\% \\
/label & \tinycite{\cite{siebert2021fast}} & \tinycite{\cite{mok2020large}} & \tinycite{\cite{heinrich2022voxelmorph++}} & \tinycite{\cite{heinrich2022voxelmorph++}} \\
& ConvexAdam & LapIRN & VxM+ & VxM+(+IO) \\
\hline
\end{tabular}
\end{table}
\begin{table}
\caption{Manual landmark error (TRE) in mm for \textbf{PTV1010/COPD} \cite{castillo2013reference} with an initial affine TRE of 11~mm. We set new SOTA for point cloud methods and outperform most approaches that use intensity images (values after / denote inst. optimisation)}\label{tab:copd}
\begin{tabular}{l|cccc}
method & optimisation & 8x multi-step & two-step IC & net+TTA \\
    \midrule[1pt]
backw. warp. & 3.21$\pm$4.0 & 4.00$\pm$2.1 & 3.2$\pm$2.4 & 1.82$\pm$0.3 \\
forw. splat.& 2.00$\pm$0.6 & 2.16$\pm$0.7 & 3.1$\pm$2.4 & \textbf{1.76}$\pm$0.4 \\
    \midrule[1pt]
SOTA  & \textbf{0.82} & 4.99 & 2.03/1.62 & 2.2 \\
images& \tinycite{\cite{ruhaak2017estimation}} & \tinycite{\cite{mok2020large}} & \tinycite{\cite{greer2023inverse}} & \tinycite{\cite{heinrich2022voxelmorph++}} \\
SOTA point  & 3.13(CPD) & & 2.31 & 5.96/2.39 \\
clouds& \tinycite{\cite{falta2024lung250m}} & & \tinycite{\cite{bigalke2023denoised}} & \tinycite{\cite{heinrich2023chasing}} \\
\hline
\end{tabular}
\end{table}
%\scriptsize{\cite{bigalke2023denoised}}
%\scriptsize{\cite{}} 

\section{Results and Discussion}
Our empirical results demonstrate clear advantages compared to prior work. We outperform all previous point cloud registration models on the challenging PVT1010 dataset by a large margin (see Tab.~\ref{tab:copd} and Fig.~\ref{fig:visualcopd}), when replacing a sparse graph-convolution based model (Point-PWC) with a dense multi-stage (symmetric) U-Net, while keeping the attractive properties of sparse inputs. In particular, our novel integration of forward splatting within a cascade of dense U-Net registration networks leads to a threefold reduction in TRE compared to DiVRoC \cite{heinrich2023chasing} (5.96$\rightarrow$2.16~mm) before instance optimisation. Unlike prior methods, we dynamically splat moved points after each transformation rather than relying on backward warping, which significantly improves registration accuracy. Point clouds are efficient for data sharing and substantial reduce the risk of privacy leakage (cf. \cite{shen2021accurate}). However, traditional point cloud networks suffer from unstructured memory access and expensive kNN queries, particularly as the number of points increases. In direct comparison to DiVRoC \cite{heinrich2023chasing}, which also employs rasterisation as loss, our method achieves a sixfold speed-up while reducing computational complexity: the PPWC model used processed only a subset of 8192 points per scan and required eight partial runs to predict a forward transform in $\sim$1.5 seconds, using 7.7M parameters and 65.2 GFlops. In contrast, our sparse-to-dense representation enables U-Nets that are independent of the number of points, requiring only $\approx$0.25 seconds for the two-step inverse consistency model, with 1.2M parameters and 47.2 GFlops. This highlights the efficiency gains of our method, making it highly scalable while retaining the expressiveness and privacy advantages of sparse point clouds. By extending the concept to multi-channel surface point clouds, where points carry categorical labels, we outperform all previous supervised image-based methods for AbdomenCT ($+2\%$) and achieve further gains to $+5\%$ points when employing test-time adaptation. Notably, the stabilising effect of forward splatting is more pronounced for multi-step or iterative alignment, while the two-step inverse consistency \cite{greer2023inverse} introduces complementary benefits, enforcing perfect symmetry in all solutions. A supplementary ablation study (Appendix) empirically shows the sensitivity of hyperparameter choice for the warping strategy, whereas our method remains robust across a wider range of values. Further improved alignment can still be obtained when considering the complete dense image information (see superior results for \cite{greer2023inverse} in Tab.~\ref{tab:copd}), which would motivate hybrid models trained on large-scale, multi-centric, privacy-preserving and modality-agnostic point cloud datasets, with local fine-\-tuning on full patient scans at test time.


%\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
% \midlacknowledgments{We thank a bunch of people.}

\bibliography{midl25_219}

\appendix

\section{Synthetic petal dataset}
We additionally perform experiments on a highly deformable synthetic 2D task with the aim to align flower shapes with different numbers of petals. Petals can be flexible generated by sampling 1024 points along the angle $\alpha$ with the equation $r = \frac{\cos(k\cdot\alpha)+d}{d}$, where $k\in\mathbb{N}$ defines the number of petals and $d\in\mathbb{R}^+$ modulates the curvature of the leaves. With different numbers $k_1$ and $k_2$ of petals in fixed and moving image as well as changes in curvatures, huge local deformations have to be estimated (see Fig.~\ref{fig2}). Results show that the warping loss produces unrealistic transforms, which may explain its difficulty to further improve through test-time adaption for the multi-organ data.

%. We repeat the network training for a number of constellations both for optimisation-based and two-step inverse-consistent alignment.

\begin{figure}
\includegraphics[width=\textwidth]{midl25_219_fastforward_petal.pdf}
\caption{Various cases of our petal dataset with topological changes to be aligned (showing the case with the 5$^{th}$ highest error out of 64 samples each). Forward splatting achieves more accurate alignment and lower Haussdorff distances along with more plausible displacement grids.} \label{fig2}
\end{figure}

\section{Hyperparameter search}
Figure \ref{fig:sweep} shows exemplary hyperparameter sweeps for the 10$^{th}$ COPD case. We perform an 8$\times$8$\times$8 sweep for our approach and evaluate an additional 8 values of $\lambda$ each for backward-warp. An optimal valley can be found for both approaches for the resolution of the rasterisation grid versus the employed B-spline smoothing (equivalent Gaussian $\sigma$), whereas the optimal choice for the smoothing of the extrapolation (splatting $\zeta_{\sigma}$) is more stable for forward splatting. The comparison of B-spline smoothing and diffusion $\lambda$ reveals the finicky behaviour of backward warping, which deteriorates sharply when moving slightly away from the optimum. While the population based learning and inverse-consistency can alleviate this choice to some degree at highlights the current limitation of many unsupervised registration algorithms.

\begin{figure}
\includegraphics[width=\textwidth]{midl25_219_fastforward_hyper.pdf}
\caption{Hyperparameter search for the optimisation based alignment of the 10$^{th}$ COPD case comparing our proposed forward splatting with only intrinsic regularisation with conventional backward warping that employs additional diffusion regularisation. Dark green values indicate accurate registrations.} \label{fig:sweep}
\end{figure}
%
%
\section{Efficiency of splatting / rasterisation}\label{sec:raster}

A detailed comparison of runtimes for our splatting operator that rasterises an input or intermediate (warped) point cloud of varying size to a 3D grid is shown in Fig.~\ref{fig:raster}. It demonstrates a huge speed up of our proposed use of the Jacobian of the \texttt{gridsample} operator in pytorch (see also \url{https://github.com/mattiaspaul/fastforward}).  We benchmarked our approach against several competing implementations: \texttt{scipy.interpolate.griddata}, \texttt{scipy.spatial.cKDTree} and the highly efficient sparse Gaussian kernel from Keops \cite{JMLR:v22:20-275}. We demonstrate enormous speed-ups (150-350x fold) and even outperform the most advanced (Keops) GPU implementation on the CPU. Using CUDA our splatting operator rasterises clouds with $>$60'000 points and 15 intensity channels in $<$10 millisec. on a 128x128x128 dense grid. Hence there is virtually no overhead compared to the U-Net models.
\begin{figure}
\includegraphics[width=\textwidth]{midl25_219_fastforward_rasterisation.pdf}
\caption{Efficiency analysis that shows the very low runtime of our proposed rasterisation across all point cloud sizes for a dense grid of 128x128x128 with 15 channels. Even on a CPU our approach outperforms the Keops GPU implementation for clouds larger than 12'000 points. Time given in seconds.} \label{fig:raster}
\end{figure}
% \begin{figure}
% \caption{Additional visual results for the trained networks (two-step inverse consistent) with test-time adaptation in comparison to the state-of-the-art DiVRoC \cite{heinrich2023chasing} method that employs additional instance optimisation. The initial misalignment highlights the challenges for deformable registration, which is accurately solved  by our approach with a smooth transform (low standard deviation of the Jacobian determinants) and no foldings. Note that while our presented experiments use the point clouds provided by \cite{shen2021accurate}, we found that a simple spatial gradient measure combined with k-nearest neighbour-based non-maximum suppression yields identical accuracies. }  \label{fig:supplement}
% \end{figure}

\end{document}
