% This is samplepaper.tex, a sample chapter demonstrating the
% LLNCS macro package for Springer Computer Science proceedings;
% Version 2.21 of 2022/01/12
%
\documentclass[runningheads]{llncs}
%
\usepackage{amssymb}
\usepackage{amsmath}
\usepackage{bm}
\usepackage{booktabs} 
\usepackage[T1]{fontenc}
% T1 fonts will be used to generate the final print and online PDFs,
% so please use T1 fonts in your manuscript whenever possible.
% Other font encodings may result in incorrect characters.
%
\usepackage{graphicx}
\usepackage{adjustbox}
\usepackage{lipsum} % For dummy text
% Used for displaying a sample figure. If possible, figure files should
% be included in EPS format.
%
% If you use the hyperref package, please uncomment the following two lines
% to display URLs in blue roman font according to Springer's eBook style:
\usepackage{color}
\usepackage{multirow}
\renewcommand{\thetable}{\arabic{table}}
\usepackage[colorlinks, linkcolor=blue, urlcolor=blue, anchorcolor=blue, citecolor=blue]{hyperref}
%
\begin{document}
%
\title{Learning-based CBCT–IOS Registration with PointNet++ and SVD}
%
\titlerunning{Learning-based CBCT–IOS Registration with PointNet++ and SVD}
% If the paper title is too long for the running head, you can set
% an abbreviated paper title here
%
\author{Changkai Ji\inst{1} \orcidID{0009-0007-7090-7360} \and 
Yusheng Liu\inst{1} \orcidID{0009-0004-2624-9223}
\and
Yuxian Jiang\inst{1} \orcidID{0009-0002-7689-5333}  \orcidID{0009-0009-3223-0082} \and
Lisheng Wang\inst{1} \orcidID{0000-0003-3234-7511}}
%
\authorrunning{C. Ji et al.}
% First names are abbreviated in the running head.
% If there are more than two authors, 'et al.' is used.
%
\institute{School of Automation and Intelligent Sensing, Shanghai Jiao Tong University, Shanghai 200240, People's Republic of China \\ \email{\{changkaiji, lswang\}@sjtu.edu.cn}}
%
\maketitle              % typeset the header of the contribution
%
\begin{abstract}
Accurate registration of intraoral scans (IOS) and cone-beam computed tomography (CBCT) is a critical prerequisite for precise diagnosis and treatment planning in dentistry. However, large modality discrepancies and dense point clouds make this task challenging in practice. In this work, we propose a learning-based framework for CBCT–IOS registration, developed in the context of the MICCAI STSR Task 2 2025 Challenge. Our method leverages dual PointNet++ encoders to extract modality-specific features, followed by a differentiable SVD head that enforces rigid-body constraints on the predicted transformation. To enhance robustness, we design geometric data augmentation strategies, while point cloud sampling and simplification are employed to accelerate inference. Ablation studies demonstrate that augmentation substantially reduces registration errors, while relaxing CBCT filtering thresholds further improves alignment by preserving richer anatomical cues. Overall, our approach achieves competitive performance, ranking second on the validation leaderboard, and provides a practical balance between accuracy and efficiency.
\keywords{CBCT-IOS registration \and PointNet++ \and Rigid transformation \and Data augmentation \and Inference acceleration}
\end{abstract}

%
%
%
\section{Introduction}
Three-dimensional registration of dental data plays a crucial role in computer-aided diagnosis, treatment planning, and surgical guidance \cite{flugge2017registration,lim2020registration}. In clinical practice, intraoral scans (IOS) provide high-resolution crown geometry, while cone-beam computed tomography (CBCT) offers comprehensive information on both crowns and roots \cite{su2023evaluation}. Accurate alignment of these heterogeneous modalities is essential for integrating complementary anatomical details, thereby enhancing the precision and reliability of dental treatment \cite{olczyk2024retrospective}. To promote the development of robust registration algorithms, the MICCAI STSR 2025 Challenge Task 2 was organized to benchmark algorithms that can effectively handle multi-modal data discrepancies and to encourage practical solutions that may translate into real-world clinical applications.

Despite its importance, CBCT–IOS registration remains a challenging task. The two modalities differ substantially in terms of resolution, field of view, and information content \cite{kim2024best}. IOS captures only the visible crowns with fine detail but lacks root structures, whereas CBCT provides full jaw coverage but contains significant noise and redundant information. These discrepancies introduce difficulties in establishing reliable correspondences and estimating robust transformations. Moreover, limited availability of paired ground-truth annotations further complicates the training of data-driven approaches.

Recent advances in deep learning have achieved remarkable success across imaging tasks \cite{liu2024individual,jiang2025morphology,zhang2023children,bolelli2024segmenting,jiang2024enhanced,liu2024inferior}. Researchers have increasingly applied deep learning methods to multi-modal 3D registration problems \cite{kim2024best}. Such methods alleviate the need for handcrafted descriptors and have achieved promising results in various medical imaging domains. However, deep learning-based approaches often require large annotated datasets \cite{wang2022follow,ji2023mammo}, and their inference pipelines may still suffer from inefficiency due to the high dimensionality of volumetric data and dense point clouds. Therefore, it remains an open question how to design a framework that is both accurate and computationally efficient \cite{aji2024two,ji2024two}.

In this work, we propose a learning-based registration framework specifically designed for CBCT–IOS alignment in the MICCAI STSR 2025 Challenge. Our method employs PointNet++ encoders to extract modality-specific features from IOS and CBCT point clouds \cite{qi2017pointnet++}, followed by a transformation head based on singular value decomposition (SVD) that enforces rigid-body constraints in the predicted matrix \cite{wang2019deep}. To enhance robustness, we incorporate extensive data augmentation during training, enabling the model to generalize well across diverse clinical cases. Additionally, we use point cloud sampling and simplification to accelerate inference, reducing computational overhead and enabling fast inference without compromising accuracy. As a result, we achieve competitive performance on the validation leaderboard. Our contributions can be summarized as follows:
\begin{itemize}
\item[\textbullet] We design data augmentation strategies to improve model robustness and registration accuracy under diverse clinical conditions.
\item[\textbullet] We adopt point cloud sampling and simplification techniques to accelerate inference while maintaining accuracy.
\item[\textbullet] Our method achieves second place on the validation leaderboard of the STSR 2025 Task 2, demonstrating both effectiveness and efficiency.
\end{itemize}

\section{Method}
\subsection{Framework Overview}
Figure~\ref{fig:framework} illustrates the overall architecture of our proposed framework for CBCT–IOS registration. The framework follows a learning-based paradigm that takes as input two point clouds: one sampled from the IOS mesh and the other from the CBCT volume. Both point clouds are independently encoded by two PointNet++ encoders, which are responsible for extracting hierarchical geometric features. The extracted features are subsequently aligned through a feature matching module, followed by an SVD head that estimates the rigid transformation matrix between the two modalities. This transformation is then used to map the IOS points into the CBCT coordinate system. During training, multiple loss terms are employed to jointly supervise the transformation prediction, including point-based losses, Chamfer distance, and penalties on rotation and translation. This design ensures that the model captures both global and local geometric correspondences in a computationally efficient manner.


\begin{figure}[htb]
\includegraphics[width=0.92\textwidth]{2.pdf}
\caption{Overview of our registration model. CBCT scans and intraoral scans are separately processed by PointNet++, followed by feature matching and rigid transformation estimation using SVD. The predicted transformation is applied to align the intraoral scans with CBCT data.} \label{fig:framework}
\end{figure}

\subsection{Data Augmentation}
To enhance the robustness and generalization of the proposed model, we employed a series of data augmentation strategies tailored to 3D point clouds. Specifically, random rigid transformations, including rotations and translations, were applied independently to both the IOS-derived point sets. These augmentation techniques enrich the diversity of the training dataset and mitigate the risk of overfitting, particularly in scenarios where annotated data is limited.

To better illustrate the effectiveness of our augmentation strategies, Figure~\ref{fig:augmentation} provides a visual example. The first column presents the original CBCT and IOS pairs prior to augmentation, while the subsequent three columns demonstrate augmented versions of the same case. These examples highlight how the applied transformations produce diverse yet clinically plausible variations, enabling the model to learn invariances that are essential for accurate and robust registration.

\begin{figure}[htb]
\includegraphics[width=\textwidth]{tta.pdf}
\caption{The visualization of data augmentation strategies. The first column shows the original CBCT and IOS pairs, while the subsequent columns display augmented versions of the same case, demonstrating the diverse and clinically plausible variations produced by the applied transformations. } \label{fig:augmentation}
\end{figure}


\subsection{Model Training}
The training of our framework follows a fully supervised paradigm, where the objective is to learn accurate rigid transformations between CBCT and IOS point clouds. A central component of the architecture is the feature extraction stage, implemented via PointNet++ encoders. PointNet++ extends the original PointNet architecture by introducing hierarchical feature learning, where local neighborhood information is progressively aggregated at multiple scales. This design allows the network to capture both fine-grained geometric details and global structural context, which is essential for modeling complex dental anatomy. In our framework, two independent PointNet++ encoders are employed, one for the CBCT point cloud and the other for the IOS point cloud. These dual encoders extract modality-specific features while preserving their geometric consistency.

The extracted feature representations are then passed to the transformation estimation module, referred to as the SVDHead. This module aligns the latent embeddings of the two modalities by constructing a correspondence matrix and applying a differentiable SVD. The SVDHead directly estimates the optimal rigid transformation matrix, decomposed into a rotation matrix and a translation vector, which maps the IOS point cloud onto the CBCT reference. Compared with regression-based alternatives, the SVD-based formulation offers improved stability and guarantees the orthogonality of the predicted rotation matrix.

Optimization is performed using the Adam optimizer. During training, the network parameters are updated to minimize a composite loss function that jointly enforces geometric alignment and transformation accuracy, which will be detailed in the following subsection. This training strategy enables the network to converge reliably and generalize well to unseen test data.

\subsection{Loss Function}
To achieve robust and accurate registration, we adopt a composite loss function that integrates multiple complementary objectives. Each component of the loss is designed to address a specific aspect of the alignment problem, ensuring both local geometric consistency and global rigid transformation accuracy.

\noindent\textbf{Point Loss.} 
This term enforces point-wise consistency between the transformed source point cloud $\hat{\mathbf{P}}_{src}$ and the ground truth aligned point cloud $\mathbf{P}_{gt}$. It is formulated as a mean squared error (MSE), directly penalizing local misalignments:
\[
\mathcal{L}_{point} = \frac{1}{N} \sum_{i=1}^{N} \left\| \hat{\mathbf{p}}_{src}^{(i)} - \mathbf{p}_{gt}^{(i)} \right\|^2 .
\]

\noindent\textbf{Chamfer Distance.}
To capture global shape similarity, we compute the bidirectional Chamfer distance between the predicted source $\hat{\mathbf{P}}_{src}$ and the target CBCT $\mathbf{P}_{tgt}$:
\[
\mathcal{L}_{chamfer} = \sum_{p \in \hat{\mathbf{P}}_{src}} \min_{q \in \mathbf{P}_{tgt}} \| p-q \|^2 
+ \sum_{q \in \mathbf{P}_{tgt}} \min_{p \in \hat{\mathbf{P}}_{src}} \| q-p \|^2 .
\]
This term encourages the transformed point sets to occupy the same geometric space.

\noindent\textbf{Rotation Loss.}
We explicitly constrain the predicted rotation $\hat{R}$ to be consistent with the ground truth $R_{gt}$. The rotation loss $\mathcal{L}_{rot}$ is defined as:
\[
\mathcal{L}_{rot} = \arccos\left( \frac{\mathrm{Tr}(\hat{R}R_{gt}^\top)-1}{2} \right) .
\]

\noindent\textbf{Translation Loss.}
As translation misalignment often dominates registration error in clinical practice, we emphasize translation accuracy by computing the squared Euclidean distance between the predicted $\hat{t}$ and ground-truth $t_{gt}$ translation vectors:
\[
\mathcal{L}_{trans} = \left\| \hat{t} - t_{gt} \right\|_2^2 .
\]

\noindent\textbf{Matrix Regularization.}
To ensure the predicted transformation matrix remains a valid rigid body transformation, we introduce a regularization term that penalizes deviations of the predicted rotation matrix $\hat{R}$ from orthogonality:
\[
\mathcal{L}_{mat} = \left\| \hat{R}^\top \hat{R} - I \right\|_F^2 .
\]

\noindent\textbf{Overall Loss.}  
The total loss integrates all components in a weighted sum:
\[
\mathcal{L} = \lambda_{p} \cdot \mathcal{L}_{point} 
+ \lambda_{c} \cdot \mathcal{L}_{chamfer} 
+ \lambda_{r} \cdot \mathcal{L}_{rot} 
+ \lambda_{t} \cdot \mathcal{L}_{trans} 
+ \lambda_{m} \cdot \mathcal{L}_{mat} ,
\]
where the weighting coefficients are set to $\lambda_{p}=0.5$, $\lambda_{c}=1.0$, $\lambda_{r}=1.0$, $\lambda_{t}=3.0$, and $\lambda_{m}=0.3$. The relatively higher weight assigned to the translation loss reflects its critical importance for achieving clinically meaningful registration accuracy.

\subsection{Inference Acceleration}
To ensure computational efficiency and enable practical deployment, we implemented a point cloud sampling and simplification strategy. During inference, the original CBCT scans often produce dense point sets. The density of these sets substantially increases computational cost without proportionally improving accuracy. To address this, we uniformly subsampled the CBCT point clouds to a fixed number of points, while IOS meshes were converted to point clouds with a comparable resolution. This design ensures balanced complexity between modalities, reduces GPU memory consumption, and accelerates inference speed. 

Importantly, this balance between efficiency and precision makes the framework more applicable in real-world clinical scenarios, where both accuracy and time efficiency are crucial.

\section{Experiments and Results}

\subsection{Dataset and Assessment Metrics}

The dataset provided by the STSR 2025 challenge comprises paired CBCT volumes and IOS meshes \cite{wang2024sts,wang2024semi}. In the training phase, two subsets are available: a labeled set, where each CBCT-IOS pair is annotated with an affine transformation matrix aligning the upper and lower dentition, and an unlabeled set containing paired CBCT volumes and IOS meshes. In addition, a validation set is released without annotations, serving as the benchmark for leaderboard evaluation. 

For quantitative evaluation, two complementary metrics are used: the mean translation error, which measures the Euclidean distance between predicted and ground-truth translation vectors, and the mean rotation error, computed as the geodesic distance between the predicted and reference rotation matrices. These metrics directly reflect the fidelity of the registration outcome, with lower values indicating higher accuracy. Although computational efficiency, such as inference time and GPU memory usage, is not explicitly scored in the validation phase due to the limitations of the challenge platform, it remains a practical consideration when deploying the methods in real clinical workflows.


\subsection{Implementation Details}
\noindent\textbf{Environments and Requirements.} 
All experiments were conducted on a workstation, and the details of the hardware and software configuration are summarized in Table~\ref{tab:system_config}. The model was trained using the PyTorch framework for a total of 200 epochs.

\begin{table}[h]
\centering
\caption{System Configuration}
\begin{tabular}{|l|l|}
\hline
\textbf{Ubuntu version}       & Ubuntu 24.04 LTS                  \\ \hline
\textbf{CPU}                  & Intel(R) Xeon(R) Platinum 8352S CPU @ 2.20GHz \\ \hline
\textbf{RAM}                  & 503 GB                               \\ \hline
\textbf{GPU}                  & 1 NVIDIA GeForce RTX 4090 (24G)      \\ \hline
\textbf{CUDA version}         & 12.4                                 \\ \hline
\textbf{Programming language} & Python 3.9.19                               \\ \hline
\textbf{Deep learning framework} & PyTorch (torch 1.12.1, torchvision 0.19.1) \\ \hline
\textbf{Codes available at} &  \url{https://github.com/duola-wa/MICCAI-2025-STSR-Task-2} \\ \hline
\end{tabular}

\label{tab:system_config}
\end{table}


\subsection{Results and Analysis}

To evaluate the effectiveness of our method, we present a series of ablation studies focusing on different design choices. As shown in Table~\ref{tab:exp_aug}, applying data augmentation substantially improves registration accuracy. Both translation and rotation errors are reduced, highlighting the importance of introducing geometric variability during training. By exposing the model to diverse transformations, augmentation enhances robustness to unseen cases and prevents overfitting, leading to a more generalizable registration framework.   

\begin{table}[h]
\centering
\caption{Effect of data augmentation on registration accuracy.}
\label{tab:exp_aug}
\begin{tabular}{l|c|c}
\hline
\textbf{Setting} & \textbf{Mean Translation Error (mm)} & \textbf{Mean Rotation Error ($^\circ$)} \\ \hline
w/o Augmentation & 230.80 & 37.54 \\ 
w/ Augmentation  & 165.57 & 24.00 \\ \hline
\end{tabular}
\end{table}

As shown in Table~\ref{tab:exp_icp}, incorporating Iterative Closest Point (ICP) refinement \cite{besl1992method} reduces the mean translation error relative to the baseline prediction (from 165.57 to 157.68\,mm), but it also increases the mean rotation error (from 24.00$^\circ$ to 43.56$^\circ$). Given the limited overall gain and the additional computational cost, ICP was not included in our final pipeline.

\begin{table}[h]
\centering
\caption{Effect of ICP refinement on registration accuracy.}
\label{tab:exp_icp}
\begin{tabular}{l|c|c}
\hline
\textbf{Method} & \textbf{Mean Translation Error (mm)} & \textbf{Mean Rotation Error ($^\circ$)} \\ \hline
w/o ICP & 165.57 & 24.00 \\ 
w/ ICP  & 157.68 & 43.56 \\ \hline
\end{tabular}
\end{table}

Table~\ref{tab:exp_cbct} further compares the performance under different CBCT filtering thresholds. The threshold refers to the intensity cutoff applied to CBCT voxels when extracting point clouds. A higher threshold retains only the densest regions, such as enamel and cortical bone, while a lower threshold preserves a larger portion of anatomical structures, including lower-density bone. Relaxing the criterion from 800 to 600 therefore increases the number of target points available for alignment, which leads to a modest improvement in both translation and rotation accuracy. This suggests that incorporating a richer set of structural cues benefits the registration process.

\begin{table}[h]
\centering
\caption{Effect of CBCT filtering threshold on registration accuracy (w/o ICP).}
\label{tab:exp_cbct}
\begin{tabular}{l|c|c}
\hline
\textbf{Filtering Condition} & \textbf{Mean Translation Error (mm)} & \textbf{Mean Rotation Error ($^\circ$)} \\ \hline
CBCT $>800$ & 165.57 & 24.00 \\ 
CBCT $>600$ & 164.46 & 23.71 \\ \hline
\end{tabular}
\end{table}

\section{Conclusion}
In this paper, we present a learning-based framework for CBCT–IOS registration, tailored to the MICCAI STSR Task 2 2025 Challenge. The framework integrates dual PointNet++ encoders with a differentiable SVD head to estimate rigid transformations under orthogonality constraints. By leveraging tailored data augmentation and efficient point cloud sampling, our approach seeks to balance accuracy and inference speed. Experimental results demonstrate the effectiveness of the proposed augmentation strategies. Ultimately, our method achieved second place on the validation leaderboard. These results highlight the potential of our framework for clinical applications that demand rapid and reliable responses. In future work, we plan to further explore semi-supervised strategies to better leverage unlabeled data and to investigate lightweight architectures that further reduce computational overhead for deployment in clinical settings.

%
% ---- Bibliography ----
%
% BibTeX users should specify bibliography style 'splncs04'.
% References will then be sorted and formatted in the correct style.
%
% \bibliographystyle{splncs04}
% \bibliography{mybibliography}
%
\bibliographystyle{splncs04}
\bibliography{ref}


\end{document}
