% This is samplepaper.tex, a sample chapter demonstrating the
% LLNCS macro package for Springer Computer Science proceedings;
% Version 2.21 of 2022/01/12
%
\documentclass[runningheads]{llncs}
%
\usepackage[T1]{fontenc}
% T1 fonts will be used to generate the final print and online PDFs,
% so please use T1 fonts in your manuscript whenever possible.
% Other font encodings may result in incorrect characters.
%
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{multirow}
\usepackage{makecell}
\usepackage{bbm}
\usepackage{bbding}
\usepackage{threeparttable} % write table foot
\usepackage[misc]{ifsym}

\usepackage{amsmath}
\usepackage{hyperref}
\usepackage{amssymb}
% Used for displaying a sample figure. If possible, figure files should
% be included in EPS format.
%
% If you use the hyperref package, please uncomment the following two lines
% to display URLs in blue roman font according to Springer's eBook style:
%\usepackage{color}
%\renewcommand\UrlFont{\color{blue}\rmfamily}
%\urlstyle{rm}
%

\begin{document}
%
\title{Efficient and Robust CBCT Segmentation of Oral and Maxillofacial Structures}
%
\titlerunning{Efficient and Robust CBCT Segmentation}
% If the paper title is too long for the running head, you can set
% an abbreviated paper title here
%
\author{Fan Xiao\inst{1} \and
Xinrui Huang\inst{2} \and
Anqi Gao\inst{1,3} \and
Dongming He\inst{1} \and
Xiaofan Zhang\inst{2} \and
Xudong Wang\inst{1,3,4,5,6,7,8}$^{(\textrm{\Letter})}$}
%
\authorrunning{F. Xiao et al.}
% First names are abbreviated in the running head.
% If there are more than two authors, 'et al.' is used.
%
\institute{Department of Oral Craniomaxillofacial, Shanghai Ninth People’s Hospital,
 Shanghai Jiao Tong University School of Medicine, Shanghai, China
 \email{ff741333@gmail.com,xudongwang70@hotmail.com}\\
\and
School of Electronic Information and Electrical Engineering, Shanghai Jiao Tong University, Shanghai, China\\
\email{\{huangxr,xiaofan.zhang\}@sjtu.edu.cn}\\
\and 
College of Stomatology, Shanghai Jiao Tong University, Shanghai, China \\
\and
National Center for Stomatology, Shanghai, China \\
\and
National Clinical Medical Research Center for Oral Diseases, Shanghai, China \\
\and
Shanghai Key Laboratory of Stomatology, Shanghai, China\\
\and
Shanghai Research Institute of Stomatology, Shanghai, China\\
\and
Research Unit of Oral and Maxillofacial Regenerative Medicine, Chinese Academy\\
 of Medical Science, Shanghai, China\\
}
%
\maketitle              % typeset the header of the contribution
%
\begin{abstract}
In dental practice, accurate segmentation of oral and maxillofacial structures from cone-beam computed tomography (CBCT) images is essential for diagnostic and treatment planning purposes. However, manual segmentation is time-consuming and labor-intensive. Although numerous deep learning-based methods have been proposed to automate this process, most rely on a single model architecture, which struggles to handle the complex and diverse nature of oral anatomical structures. To address this limitation, we propose a hybrid framework integrating nnUNet and VISTA models for automated and interactive segmentation of oral and maxillofacial structures. Our approach employs a class-wise ensemble strategy to improve inference efficiency and accuracy, and incorporates post-processing techniques such as threshold-based small object removal and disconnected region filtering to enhance robustness. The proposed method achieved third place in Task 1 and second place in Task 2 of the ToothFairy3 Challenge. Code and model weights are available at \url{https://github.com/ff741333/toothfairy3_blcakmyth}.

\keywords{CBCT image  \and Oral and maxillofacial structures segmentation \and Interactive segmentation}
\end{abstract}
%
%
%

\section{Introduction}
In dental practice, obtaining accurate oral and maxillofacial structures is essential for disease diagnosis and treatment. Cone-beam computed tomography (CBCT), as a commonly used imaging modality, is frequently applied in dentistry and related fields due to its advantages of short acquisition time, low radiation dose, and high resolution for hard tissues. The oral and maxillofacial structures that can be obtained from CBCT images are illustrated in Figure \ref{fig:intro}, including teeth (and dental attachments such as bridges, crowns, and implants), jawbone, maxillary sinus, pharynx, inferior alveolar canal (IAC), mandibular incisive canal, lingual canal, and others. These anatomical structures are critical for clinical applications such as surgical planning in implantology \cite{elgarba2023deep} and maxillofacial surgery \cite{huang2025maxillofacial}, as well as tooth alignment in orthodontics.
However, manually segmenting these structures from CBCT images is time-consuming and labor-intensive. 
 
\begin{figure}[ht]
    \centering
    \includegraphics[width=0.95\textwidth]{introduction.png}
    \caption{Visualization of Oral and Maxillofacial Structures in CBCT: (a) Jawbone, Teeth, Pharynx; (b) Inferior Alveolar Canal, Mandibular Incisive Canal, Lingual Canal, Maxillary Sinus, Pulp}
    \label{fig:intro}
\end{figure}

In recent years, numerous studies~\cite{cui2022fully,wang2023root,dot2024dentalsegmentator,bolelli2025segmenting,huang2024iossam} have focused on achieving automatic segmentation of oral and maxillofacial structures. Cui et al.~\cite{cui2022fully} proposed a two-stage deep network leveraging hierarchical tooth morphology for precise tooth segmentation and a filter-enhanced network that enhances intensity contrasts for accurate alveolar bone segmentation. Dot et al.~\cite{dot2024dentalsegmentator} developed an open-source tool for robust segmentation of oral and maxillofacial structures on CBCT and CT images, including the maxilla, mandible, teeth, and mandibular canal. Bolelli et al.~\cite{bolelli2025segmenting} constructed a CBCT dataset with segmentation annotations for 42 different types of maxillofacial structures, and employed various strategies to optimize the performance of existing state-of-the-art segmentation models~\cite{isensee2021nnu,chen2021transunet,zhou2023nnformer,shaker2024unetr++,ma2024u,liu2024vmamba,liu2024swin}.
The existing segmentation models for oral and maxillofacial structures are usually based on a single architecture, which is insufficient for their complex and diverse nature.
Individual teeth are similar in size, yet they vary in morphology and position. Additionally, the jawbone has a relatively large volume and contains numerous neural structures.
Different model architectures possess varying receptive fields and exhibit differences in segmenting diverse oral and maxillofacial structures. Therefore, designing diverse model architectures is highly beneficial for the segmentation of oral and maxillofacial structures.

In this work, we propose an algorithm for segmenting different oral and maxillofacial structures in CBCT images based on the nnUNet~\cite{isensee2021nnu} and VISTA~\cite{he2024vista3d} frameworks, which also supports interactive segmentation of the IAC.
To balance inference efficiency and accuracy, we designed multiple strategies to optimize the algorithm's inference process. 
Additionally, we employed post-processing techniques such as custom threshold-based small label removal and non-connected region filtering to further enhance robustness.
Finally, we validated our algorithm in the ToothFairy3 Challenge, achieving \textbf{3}\textsuperscript{rd} place on Task~1 and \textbf{2}\textsuperscript{nd} place on Task~2.

\section{Method}
\begin{figure}[ht]
    \centering
    % Overview figure of the proposed framework.
    \includegraphics[width=0.95\textwidth]{method.png}
    \caption{The framework of our proposed method. An nnUNet model is trained for oral and maxillofacial segmentation and a VISTA model for interactive IAC segmentation. Inference combines the models with a class-voting ensemble and a two-stage automatic and point-based refinement for interactive segmentation. Outputs of both models are post-processed to improve robustness.}
    \label{fig:method}
\end{figure}
Our proposed method is illustrated in Figure \ref{fig:method}. We first separately train an nnUNet-based model for oral and maxillofacial structures segmentation and a VISTA-based model for interactive IAC segmentation. Considering computational efficiency, we adopt a class-voting ensemble strategy to reduce memory usage and improve inference speed. To take advantage of the differences in various oral and maxillofacial structures, we integrate the two models to enhance segmentation accuracy. For the interactive segmentation task, we adopt a two-stage strategy: an automatic segmentation stage is first applied to obtain an initial result, followed by a point-based prompt refinement stage to further enhance the segmentation accuracy.
Finally, we apply post-processing techniques such as custom threshold-based small label removal and non-connected region filtering to the outputs of both models to further enhance robustness.
\subsection{Model training}

For the nnUNet training phase, we chose nnUNet ResEnc L as the backbone network.
Since all data share the same spacing [0.3, 0.3, 0.3], no resampling was performed on any data. The data were cropped into patches of size $128\times224\times224$ and augmented using techniques including rotation within the range of $-30^\circ$ to $30^\circ$, scaling with a factor of $0.7$--$1.4$, anterior--posterior and superior--inferior mirroring, addition of Gaussian noise (variance $0.1$) and Gaussian blur (sigma $0.5$--$1.0$), and contrast adjustment with a factor of $0.75$--$1.25$.
The training proceeded for 1500 epochs with a batch size of 2. The SGD optimizer was adopted with an initial learning rate of 0.01 and a Poly scheduler \cite{zhang2020kdecay}. The loss function was defined as the sum of Dice Loss and Cross-Entropy Loss.

For the VISTA training phase, we used the pretrained VISTA3D checkpoint.
As with nnUNet, no resampling was performed on any data. The data were cropped into patches of size $96\times160\times160$ and augmented using techniques including scaling with a factor of $0.8$--$1.2$, simulation of low-resolution images with a factor of $0.3$--$1.0$, addition of Gaussian noise (variance $0.2$) and Gaussian blur (sigma $0.5$--$1.0$), and contrast adjustment with a factor of $0.9$--$1.1$.
The training proceeded for 200 epochs with a batch size of 2. The AdamW optimizer was adopted with an initial learning rate of 5e-5, a weight decay of 1e-5 and a cosine scheduler. The loss function was defined as Dice Loss.

We utilized all the data provided in ToothFairy3~\cite{bolelli2024segmenting,bolelli2025segmenting,bolelli2024enhancing} as both the training set and the test set, without performing any data partitioning. However, while the nnUNet model employed all available labels, the VISTA model only used the labels corresponding to the IAC.


\subsection{Ensemble strategy}
\subsubsection{Class-voting}
The sliding window inference process of the nnUNet model is highly memory-intensive. To achieve efficient segmentation, we adopt a class-voting strategy. Specifically, for each patch $S_i$, the predicted logits are converted into one-hot encoded vectors, which are then aggregated through summation. The final predicted label is determined by selecting the class with the maximum accumulated value. The formulation is as follows:
\begin{equation}
\hat{y} = \operatorname*{arg\,max}_{c \in \{1, \dots, C\}} \sum_{i=1}^{N} \mathbb{I}_c\left(\arg\max f(S_i)\right)
\end{equation}
where $\hat{y}$ denotes the final predicted label, $C$ represents the total number of classes, $N$ is the total number of sliding window patches that cover the spatial location, $f(S_i)$ denotes the predicted logits for patch $S_i$, $\mathbb{I}_c(\cdot)$ is the indicator function that outputs 1 if the argument equals class $c$ and 0 otherwise, and $\arg\max f(S_i)$ yields the predicted class label for patch $S_i$. The outer $\arg\max$ operation selects the class with the highest vote count.
\subsubsection{Model integration}
The VISTA model demonstrated superior performance compared to the nnUNet model in segmenting the IAC. Therefore, we integrated the nnUNet model with the VISTA model. Specifically, we first removed the IAC segmentation labels predicted by the nnUNet model and then replaced them with the corresponding labels generated by the VISTA model. This approach allowed us to substitute the relatively inaccurate IAC labels from nnUNet with more precise ones.
\subsection{Interactive segmentation}
To achieve better interactive segmentation results, we employ a method that combines automatic segmentation with point-prompt-based interactive segmentation. Specifically, we first use the automatic segmentation decoder from the VISTA model to obtain the automatic segmentation mask of the IAC, and then apply the point-prompt decoder to generate the point-prompt-based segmentation mask. We then add or remove only the connected component regions that contain the point clicks to avoid unexpected modifications. This refinement of the automatic segmentation results using point-prompt-based segmentation significantly improves the overall performance.
\subsection{Post-processing techniques}
The post-processing techniques employed in our pipeline are designed to enhance segmentation accuracy and robustness by incorporating both morphological operations and prior anatomical knowledge. These techniques include custom threshold-based small label removal, morphological open operations, and non-connected region filtering. Specifically, we first perform label size filtering to remove anatomically implausible labels whose pixel area falls within predefined ranges (e.g., 320--1,819 pixels for certain upper teeth and 970--6,140 pixels for wisdom teeth), as determined from the training set distribution. This step helps to suppress spurious predictions and reduce false positives associated with small, isolated regions.

Next, a morphological open operation is applied to the upper jawbone and pharynx regions to address potential boundary ambiguities and to smooth jagged edges in the predicted segmentation masks. The open operation, which consists of an erosion followed by a dilation, effectively removes small noise while preserving the overall structure of the anatomical regions.

Finally, non-connected region filtering is performed on the lower jawbone, pharynx, and tooth labels. This step leverages prior anatomical knowledge by retaining only the largest connected components for each label, thereby eliminating isolated or incorrectly segmented regions that are inconsistent with realistic anatomical structures. By combining these post-processing steps, our framework not only improves the visual consistency of the segmentations but also enhances quantitative metrics by reducing both false positives and false negatives in critical regions.
\section{Experiment}
\subsection{Implementation details}
The training of the nnUNet model was conducted on two NVIDIA GeForce RTX 4090 GPUs, while all other training and experiments were performed on a single NVIDIA GeForce RTX 4090 GPU. 
The metrics used in the experiment include Dice, HD95, and inference time.

\subsubsection{Comparison with other models}
\begin{table}[tbp]
    \centering
    \begin{tabular}{cccc}\hline
        \multicolumn{1}{c}{Models} & \multicolumn{1}{c}{Patch size} & \multicolumn{1}{c}{Dice}& \multicolumn{1}{c}{Prediction time (s)} \\\hline

         nnWNet L&$96\times160\times160$&0.7886&27.7\\

         nnWNet M&$96\times160\times160$&0.7868&21.3\\

         nnWNet S&$96\times160\times160$&0.7774&13.7\\

         nnUNet ResEnc L&$96\times160\times160$&0.8037&5.0\\

         U-Mamba&$128\times224\times224$&\textbf{0.8535}&7.7\\

         nnUNet ResEnc L&$128\times224\times224$&0.8312&\textbf{4.7}\\ \hline
         
    \end{tabular}
    \caption{Comparison with other models. Prediction time refers to the time taken by the model to predict a single CBCT.}
    \label{tab:comparsion}
\end{table}
In consideration of the balance between efficiency and accuracy of the algorithm, we compared several existing models, such as U-Mamba~\cite{ma2024u} and nnWNet~\cite{Zhou_2025_CVPR}. As shown in Table~\ref{tab:comparsion}, the Dice score of nnUNet ResEnc L ($96\times160\times160$) reaches 0.8037, which is higher than that of all nnWNet variants (ranging from 0.7774 to 0.7886), but lower than that of nnUNet ResEnc L ($128\times224\times224$), which achieves a Dice score of 0.8312. This indicates that enlarging the patch size contributes to performance improvement, as nnUNet ResEnc L trained with larger patches ($128\times224\times224$) outperforms its counterpart trained on smaller patches ($96\times160\times160$). In terms of inference speed, nnUNet ResEnc L demonstrates the best efficiency, requiring only 4.7 seconds per CBCT, which is faster than all other models, including U-Mamba (7.7 seconds), which attains the highest Dice score (0.8535). These results suggest that nnUNet ResEnc L achieves a favorable trade-off, delivering the fastest inference while maintaining competitive segmentation accuracy.

\subsubsection{Ablation study}
\begin{table}[tbp]
    \centering
    \begin{tabular}{cccccc}
        \hline
         \multicolumn{1}{c}{Class-voting} & \multicolumn{1}{c}{Model integration}&\multicolumn{1}{c}{Post-processing} & \multicolumn{1}{c}{Dice}& \multicolumn{1}{c}{HD95}& \multicolumn{1}{c}{Inference time(s)} \\\hline
        $\times$&\checkmark&\checkmark&\textbf{0.9575}&\textbf{18.66}&51.0\\
        \checkmark&$\times$&\checkmark&0.9563&18.67&25.5\\
        \checkmark&\checkmark&$\times$&0.8772&55.88&\textbf{25.2}\\
        \checkmark&\checkmark&\checkmark&0.9573&\textbf{18.66}&35.0\\\hline
    \end{tabular}
    \caption{Ablation study in the debugging phase. Inference time denotes the complete duration required for processing a single CBCT during inference.}
    \label{tab:ablation}
\end{table}
To further investigate the contributions of different components, we conducted an ablation study, as summarized in Table \ref{tab:ablation}. When only model integration and post-processing were employed, the framework achieved the highest Dice score of 0.9575, with an HD95 of 18.66, albeit at the cost of the longest inference time (51.0 s). By contrast, applying class-voting with post-processing but without model integration reduced the Dice score slightly to 0.9563 while improving efficiency (25.5 s). Removing post-processing led to a substantial degradation in accuracy, with the Dice dropping to 0.8772 and HD95 increasing to 55.88, although this configuration achieved the fastest inference (25.2 s). Incorporating all three components (class-voting, model integration, and post-processing) yielded a balanced performance, with a Dice of 0.9573, HD95 of 18.66, and moderate inference time (35.0 s). These results highlight the critical role of post-processing for maintaining segmentation accuracy and demonstrate that combining ensemble strategies can effectively balance accuracy and efficiency.


\begin{table}[tbp]
    \centering
    \begin{tabular}{cccc}
        \hline
         \multicolumn{1}{c}{Task} & \multicolumn{1}{c}{Team}& \multicolumn{1}{c}{Dice}& \multicolumn{1}{c}{HD95}\\\hline
         \multirow{5}{*}{Multi-class Segmentation}         &Black\_Myth&\textbf{0.7981$\pm$0.0640}&\textbf{88.7228$\pm$32.3250}	\\
         &TAIR Lab&	0.7917$\pm$0.0652&93.1873$\pm$30.4327\\
         &sjtu\_eiee\_2-426lab&0.7705$\pm$0.0754 &	104.5936$\pm$37.2139\\
         &ring821&0.7684$\pm$0.0969&	104.4004$\pm$47.9841\\
         &DLaBella29&0.7386$\pm$0.0708&	97.7059$\pm$33.2051\\   
         \hline
         \multirow{5}{*}{IAC Interactive Segmentation}         &Black\_Myth&\textbf{0.8642$\pm$0.0507}&\textbf{2.2675$\pm$1.7112}	\\
         &TAIR Lab&0.8519$\pm$0.0752&7.3863$\pm$20.4354\\
         &DLaBella29&0.7465$\pm$0.0724&4.7094$\pm$3.4713\\
         &sjtu\_eiee\_2-426lab&0.7683$\pm$0.1896&32.2318$\pm$85.8957 \\
         &gagaha&0.7220$\pm$0.2554&76.8298$\pm$159.1321 \\
         \hline
    \end{tabular}
    \caption{Final results on the test-phase leaderboards.}
    \label{tab:leaderboard}
\end{table}

Our final results are presented in Table~\ref{tab:leaderboard}. We achieved the best performance on both the Multi-class Segmentation and the IAC Interactive Segmentation test-phase leaderboards of the MICCAI ToothFairy3 Challenge.
However, due to considerations regarding algorithmic runtime and computational cost, our final official standings were third place on Task~1 and second place on Task~2.
\section{Discussion}
This work presents a segmentation framework that integrates nnUNet and VISTA for accurate delineation of oral and maxillofacial structures in CBCT images. The complementary strengths of the two architectures—nnUNet for large-volume structures and VISTA for the fine-grained IAC—enabled superior performance compared with single-model approaches. The use of class-voting, interactive segmentation, and post-processing further enhanced efficiency and robustness, which was reflected in our third place result in Task 1 and second place result in Task 2 of the ToothFairy3 Challenge.

Nevertheless, the method was trained and validated only on the challenge dataset, and its generalizability to multi-center or clinical data remains to be verified. Future work will explore multi-institutional validation, lightweight deployment strategies, and extension to pathological segmentation for broader clinical applicability.
\begin{credits}
\subsubsection{\ackname} This study was funded by National Key R\&D Program of China (2023YFC2414100), National Natural Science Foundation of China (82370905, 82071096), Shanghai Professional Service Platform of Oral-Cranio-Maxillofacial Digital Technology Research and Application (21DZ2294600), National Clinical Key Specialty (Z155080000004), Shanghai's Top Priority Research Center (2022ZZ01017), and CAMS Innovation Fund for Medical Sciences (CIFMS, 2019-I2M-5-037).

\end{credits}
%
% ---- Bibliography ----
%
% BibTeX users should specify bibliography style 'splncs04'.
% References will then be sorted and formatted in the correct style.
%
\bibliographystyle{splncs04}
\bibliography{mybibliography}
%

\end{document}
