% This is samplepaper.tex, a sample chapter demonstrating the
% LLNCS macro package for Springer Computer Science proceedings;
% Version 2.21 of 2022/01/12
%
\documentclass[runningheads]{llncs}
%
\usepackage{amssymb}
\usepackage{amsmath}
\usepackage{bm}
\usepackage[T1]{fontenc}
% T1 fonts will be used to generate the final print and online PDFs,
% so please use T1 fonts in your manuscript whenever possible.
% Other font encodings may result in incorrect characters.
%
\usepackage{graphicx}
\usepackage{adjustbox}
\usepackage{lipsum} % For dummy text
% Used for displaying a sample figure. If possible, figure files should
% be included in EPS format.
%
% If you use the hyperref package, please uncomment the following two lines
% to display URLs in blue roman font according to Springer's eBook style:
\usepackage{color}
\usepackage{multirow}
\renewcommand{\thetable}{\arabic{table}}
\usepackage[colorlinks, linkcolor=blue, urlcolor=blue, anchorcolor=blue, citecolor=blue]{hyperref}
%
\begin{document}
%
\title{Efficient CBCT Segmentation via nnU-Net with Structure-Aware Post-processing and Interactive Refinement}
%
\titlerunning{Efficient CBCT Segmentation with SAP and Interactive Refinement}
% If the paper title is too long for the running head, you can set
% an abbreviated paper title here
%
\author{Changkai Ji\inst{1} \orcidID{0009-0007-7090-7360} \and 
Yusheng Liu\inst{1} \orcidID{0009-0004-2624-9223}
\and
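% NOTE(review): the author below carries two \orcidID entries; confirm which ORCID belongs to Yuxian Jiang (or whether an author name is missing) before submission.
Yuxian Jiang\inst{1} \orcidID{0009-0002-7689-5333}  \orcidID{0009-0009-3223-0082} \and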
Lisheng Wang\inst{1} \orcidID{0000-0003-3234-7511}}
%
\authorrunning{C. Ji et al.}
% First names are abbreviated in the running head.
% If there are more than two authors, 'et al.' is used.
%
\institute{School of Automation and Intelligent Sensing, Shanghai Jiao Tong University, Shanghai 200240, People's Republic of China \\ \email{\{changkaiji, lswang\}@sjtu.edu.cn}}
%
\maketitle              % typeset the header of the contribution
%
\begin{abstract}
Accurate segmentation of anatomical structures from cone-beam computed tomography (CBCT) is essential for clinical applications in dentistry, maxillofacial surgery, and orthodontics. The ToothFairy3 Challenge has a comprehensive 77-class segmentation task, emphasizing both accuracy and computational efficiency. In this work, we present a method based on the nnU-Net framework, enhanced with a Structure Aware Post-processing (SAP) strategy. nnU-Net serves as a backbone for multi-class segmentation, while SAP refines predictions by introducing individualized thresholds for each anatomical structure, thereby mitigating noise and preserving clinically important fine structures. To further improve efficiency, we disabled mirroring augmentation during training and employed inference acceleration strategies, including the removal of test-time augmentation and optimized interpolation on floating-point tensors. Experimental results validate the effectiveness of our approach in balancing segmentation accuracy with computational efficiency. To further ensure robustness in challenging clinical scenarios, we also utilize an interactive refinement module based on nnInteractive. This strategy allows clinicians to correct local segmentation errors with minimal user guidance, providing a safety net for complex anatomical variations.
\keywords{nnU-Net \and Structure Aware Post-processing \and Computational efficiency}
\end{abstract}
%
%
%
\section{Introduction}
Cone-beam computed tomography (CBCT) has become an indispensable imaging modality in dentistry, maxillofacial surgery, and orthodontics due to its short acquisition time, low radiation dose, and high spatial resolution for hard tissues \cite{patel2015cone,acar2014use}. Accurate delineation of anatomical structures from CBCT is essential for surgical planning, risk assessment, and clinical decision-making. Building on the success of previous ToothFairy challenges, the ToothFairy3 – MICCAI 2025 competition pushes the boundaries of multi-class segmentation with an expanded dataset encompassing 77 anatomical categories, including newly introduced structures such as the pulp cavity, incisive nerve, and lingual foramen. This task emphasizes not only segmentation accuracy but also computational efficiency, reflecting the growing demand for real-time, reliable clinical tools.

Traditionally, the identification and delineation of anatomical structures in CBCT images have relied heavily on manual segmentation by experienced radiologists and dental professionals. The process requires substantial expertise and can take considerable time per case, making it impractical for routine clinical workflows where rapid decision-making is essential \cite{wang2022follow,ji2023mammo}. Moreover, the subjective nature of manual segmentation can lead to inconsistent results across different practitioners, potentially affecting treatment planning reliability.

In recent years, artificial intelligence techniques, particularly deep learning-based approaches using convolutional neural networks (CNNs), have demonstrated remarkable success in medical image segmentation tasks \cite{shen2017deep,liu2024inferior,lin2025multi,yang2025blood}. These automated methods have shown promising results in various dental imaging applications, offering the potential to significantly reduce processing time while maintaining or even improving segmentation accuracy. Deep learning frameworks have proven particularly effective at learning complex patterns and features from medical images, enabling robust identification of anatomical structures across diverse patient populations and imaging conditions \cite{bolelli2024segmenting,liu2022cnn,jiang2024enhanced}.

Despite these advances, significant challenges remain for the ToothFairy3 task. First, the large number of categories (77) introduces class imbalance, as certain anatomical structures are underrepresented compared to larger, more prominent ones such as the mandible. This imbalance risks biasing the model toward dominant classes. Second, fine-scale structures like the incisive nerve or lingual foramen are difficult to segment reliably, requiring high-resolution features without overwhelming memory usage. Third, efficiency must be considered alongside accuracy: prolonged inference times or excessive memory consumption may render otherwise accurate models impractical for real-world clinical use \cite{ji2024two,aji2024two}. Striking a balance between precision and computational efficiency is therefore essential.

To address these challenges, we propose a solution based on nnU-Net~\cite{ronneberger2015u,isensee2018nnu}, enhanced with Structure Aware Post-processing (SAP). nnU-Net provides a strong backbone for multi-class segmentation, automatically adapting to the CBCT dataset's characteristics, while SAP refines predictions by removing spurious regions and ensuring anatomical plausibility. This approach aims to achieve high segmentation accuracy across 77 classes while maintaining computational efficiency, aligning with the dual objectives of the ToothFairy3 challenge. Despite the high performance of automated models, purely automatic segmentation may still falter in cases with severe artifacts or ambiguous boundaries (e.g., discontinuous inferior alveolar canals). To address this, we incorporate an interactive segmentation paradigm as a complementary refinement step. By leveraging user-provided point prompts, this module enables precise correction of difficult targets, ensuring that the system meets the rigorous reliability standards required for surgical planning. The contributions of our work can be summarized as follows:

\begin{itemize}
\item[\textbullet] We employed an automated segmentation framework based on nnU-Net with SAP to address the multi-class segmentation challenge in CBCT images.
\item[\textbullet] The proposed approach optimizes the trade-off between segmentation quality and computational efficiency, ensuring both clinical accuracy and practical feasibility.
\item[\textbullet] Our approach achieved top-three performance in the ToothFairy3 Challenge validation phase, demonstrating its effectiveness for comprehensive dental and maxillofacial structure segmentation.
\end{itemize}

\section{Proposed Method}
\subsection{Framework Overview}
As shown in Fig.~\ref{fig:image1}, we propose a segmentation approach for CBCT images, leveraging nnU-Net as the foundational architecture with disabled mirroring augmentation, combined with a SAP strategy. Disabling mirroring augmentation preserves the inherent left-right anatomical asymmetry of oral structures, enabling the model to learn structure-specific positional features. Structure Aware Thresholds provide adaptive morphological optimization, thereby minimizing false positives across diverse oral tissues.

\begin{figure}[htb]
\includegraphics[width=\textwidth]{1.pdf}
\caption{Overview of the proposed framework. The training procedure utilized nnU-Net with disabled mirroring augmentation to enhance structure-specific learning. During inference, initial segmentation outputs were refined through Structure Aware Post-processing, wherein predetermined Structure Aware Thresholds were applied to target anatomical structures for morphological optimization.} \label{fig:image1}
\end{figure}

\subsection{Data Preprocessing}
We employed nnU-Net's automated preprocessing pipeline to optimize data handling and network configuration for our multi-structure segmentation task. The preprocessing stage involved comprehensive dataset validation to ensure annotation consistency and data integrity across all CBCT volumes. The framework automatically determined optimal patch sizes, spacing parameters, and intensity normalization strategies based on the inherent characteristics of the dataset. This automated approach eliminates manual hyperparameter tuning while ensuring that preprocessing parameters are specifically tailored to the morphological and intensity characteristics of CBCT imaging data.

Additionally, the preprocessing pipeline established network topology and memory allocation strategies optimized for 3D volumetric segmentation of high-resolution CBCT images. The intensity normalization was performed using dataset-specific statistics computed from foreground regions, ensuring consistent intensity distributions across the training cohort.

\subsection{Model Training Strategy}
Model training was conducted using the 3D full-resolution configuration to preserve high spatial resolution critical for accurate delineation of fine anatomical structures. During the training phase, we selectively disabled mirroring-based data augmentation techniques. This approach addresses the inherent positional specificity of oral anatomical structures, where spatial location serves as one of the fundamental identifying characteristics. For instance, the left and right inferior alveolar canals, while morphologically similar, are distinguished primarily by their anatomical position. Similarly, FDI numbering assignment for teeth would be compromised by mirroring augmentation. Applying mirroring augmentation would artificially transpose these position-dependent structures, compromising the model's ability to learn spatial-anatomical relationships essential for accurate structure identification and increasing classification difficulty between bilaterally symmetric yet distinct anatomical entities.

\subsection{Structure Aware Post-processing}
Traditional post-processing approaches for medical image segmentation typically employ fixed filtering parameters across all anatomical structures, such as removing connected components smaller than a predetermined volume threshold or retaining only the largest connected component for each structure. However, this ``one-size-fits-all'' strategy presents significant limitations: overly conservative thresholds may preserve noise and erroneous segmentations, while aggressive thresholds risk eliminating clinically important small structures.

To address these limitations, we propose a Structure Aware Post-processing (SAP) method that computes individualized Structure Aware Thresholds (SAT) for each anatomical structure, rather than applying uniform criteria across all structures. The core insight is that different anatomical structures exhibit distinct morphological characteristics and volume distributions, necessitating structure-specific optimization strategies.

Let $\mathcal{S}=\{S_1,S_2,\dots,S_K\}$ denote the set of $K$ target anatomical structures.  
For each structure $S_i$, we define a SAT $\tau_i$ that specifies the minimum volume required for a connected component to be considered valid.  
The collection of thresholds is represented as $\mathbf{T}=\{\tau_1,\tau_2,\dots,\tau_K\}$.  
Given an initial segmentation prediction $\mathbf{P}$, the structure-aware post-processing procedure consists of four sequential steps.

\paragraph{1. Connected Component Analysis.}  
For each structure $S_i$, we extract all connected components from the corresponding segmentation mask:
\[
\mathcal{C}_i=\{c_{i,1},c_{i,2},\dots,c_{i,n_i}\},
\]
where $n_i$ denotes the number of connected components predicted for structure $S_i$, and each pair of components is disjoint, i.e., $c_{i,a}\cap c_{i,b}=\varnothing$ for $a\neq b$.

\paragraph{2. Volume Computation.}  
For each connected component $c_{i,j}$, we compute its volume $v_{i,j}$.  
In the general case with voxel volume $V_{\text{vox}}(x,y,z)$, the volume is:
\[
v_{i,j}=\sum_{(x,y,z)\in c_{i,j}} V_{\text{vox}}(x,y,z).
\]

\paragraph{3. Threshold-based Filtering.}  
Each connected component is retained only if its volume is at least the corresponding threshold:
\[
c_{i,j}^{\mathrm{filtered}}=
\begin{cases}
c_{i,j}, & \text{if } v_{i,j}\ge \tau_i,\\[6pt]
\varnothing, & \text{otherwise},
\end{cases}
\]
where $\varnothing$ denotes the empty set, i.e., the component is discarded.

\paragraph{4. Final Reconstruction.}  
The refined segmentation for structure $S_i$ is obtained by the union of all retained components:
\[
\hat{S}_i=\bigcup_{j:\;v_{i,j}\ge \tau_i} c_{i,j}.
\]
The complete post-processed segmentation is given by
\[
\hat{\mathbf{P}}=\{\hat{S}_1,\hat{S}_2,\dots,\hat{S}_K\},
\]
which can be further represented as a labeled mask for downstream evaluation or visualization.

This approach enables differentiated treatment of anatomical structures with varying size characteristics. For instance, large structures such as jawbones can utilize higher thresholds to effectively eliminate substantial noise regions, while smaller structures like nerve canals employ lower thresholds to preserve their inherently compact morphology. The structure-aware post-processing thus provides a framework for balancing the trade-off between noise removal and structure preservation in multi-class anatomical segmentation tasks.

To determine the optimal values for the structure-aware thresholds $\tau_i$, we analyze the volumetric distribution of each anatomical class within the training dataset. The filtering strategy is empirically tailored to the scale of the target structures. For the pharynx, we retain only the largest connected component. For other structures, thresholds are stratified by anatomical size: massive bone structures like the lower jawbone and upper jawbone utilize high thresholds ($10{,}000$ and $5{,}000$ voxels, respectively) to filter out major misclassifications. Medium-sized prosthetics employ a threshold of $2{,}000$ voxels. Specific subsets of teeth are assigned a threshold of $1{,}500$ voxels. Fine-grained structures, including the inferior alveolar canals, use a lower threshold of $500$ voxels. The detailed configuration is available in our GitHub repository.


\subsection{Interactive Refinement Module}
While the proposed nnU-Net with SAP achieves efficient automated segmentation, we introduce an interactive refinement module, nnInteractive, to handle corner cases requiring human expertise. This module adopts a ``human-in-the-loop'' workflow where clinicians can iteratively refine segmentation results using point prompts.


\paragraph{Network Architecture.} Unlike methods using separate image and prompt encoders (e.g., SAM), nnInteractive employs an early prompt strategy. User-provided prompts (e.g., foreground/background clicks) are encoded as Gaussian heatmaps and concatenated with the original image and the current segmentation mask along the channel dimension. The network input consists of eight channels: the original image, the previous mask, and six channels representing different interaction types (points, scribbles, bounding boxes).



\paragraph{AutoZoom Mechanism.} To handle small, fine-grained structures like the inferior alveolar canal within large field-of-view (FOV) CBCT scans, the module incorporates an AutoZoom mechanism. This dynamic strategy automatically crops and resamples the Region of Interest (ROI) around the user's interaction points, allowing the model to focus on local details at higher resolution without losing context. This ensures that even subtle anatomical structures can be precisely corrected with minimal user interaction (1--5 clicks).


\section{Experiments and Results}
\subsection{Dataset and Assessment Metrics}
The dataset used in Task 1 of the ToothFairy3 challenge is composed of CBCT scans annotated with 77 anatomical classes, encompassing not only large bony structures such as the mandible and maxilla, but also fine-grained elements such as pulp cavities, incisive canals, and the lingual foramen \cite{bolelli2025segmenting,bolelli2024segmenting,lumetti2024enhancing}. The volumes are provided in NIfTI format with intensity values in Hounsfield units. Across all scans, the maximum spatial dimensions are $(298, 512, 512)$, the minimum are $(170, 272, 345)$, and the median shape is $(168, 362, 371)$.
% NOTE(review): the reported median first dimension (168) is smaller than the reported minimum (170), which cannot both be correct; re-check the dataset statistics.

For evaluation, we adopt two widely used metrics in medical image segmentation: the Dice Similarity Coefficient (DSC) and the 95th percentile Hausdorff Distance (HD95). Both metrics are computed for each class on each test volume, followed by averaging across all volumes. DSC quantifies the voxel-wise overlap between the predicted segmentation and the ground truth, while HD95 assesses the boundary-level agreement by measuring the distance between surfaces. Together, these metrics capture both volumetric and geometric accuracy.

Although our analysis in this work focuses on DSC and HD95, it is worth noting that computational efficiency plays a crucial role in the challenge design. Inference runtime and maximum memory usage are also recorded and will contribute to the final ranking of submitted methods, reflecting their practical applicability in clinical settings.

\subsection{Implementation details}
\noindent\textbf{Environments and Requirements.} The training of our method was conducted for a total of 1000 epochs. The details of the computational environment and dependencies are summarized in Table~\ref{tab:system_config}.


\begin{table}[h]
\centering
\caption{System Configuration}
\begin{tabular}{|l|l|}
\hline
\textbf{Ubuntu version}       & Ubuntu 24.04 LTS                  \\ \hline
\textbf{CPU}                  & Intel(R) Xeon(R) Platinum 8352S CPU @ 2.20GHz \\ \hline
\textbf{RAM}                  & 503 GB                               \\ \hline
\textbf{GPU}                  & 1 NVIDIA GeForce RTX 4090 (24G)      \\ \hline
\textbf{CUDA version}         & 12.4                                 \\ \hline
\textbf{Programming language} & Python 3.9.19                               \\ \hline
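% NOTE(review): torch 1.12.1 and torchvision 0.19.1 are not a matching release pair (torchvision 0.19.1 is released for torch 2.4.1; torch 1.12.1 pairs with torchvision 0.13.1), and CUDA 12.4 is listed above; confirm the installed versions.
\textbf{Deep learning framework} & PyTorch (torch 1.12.1, torchvision 0.19.1) \\ \hline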
\textbf{Code available at} & \url{https://github.com/duola-wa/Toothfairy3} \\ \hline
\end{tabular}

\label{tab:system_config}
\end{table}



\noindent\textbf{Inference Acceleration.} Since runtime was an important factor in the challenge evaluation, we applied several strategies to accelerate inference. First, we disabled test-time augmentation in nnU-Net, which substantially reduced the computational burden while maintaining competitive accuracy. Second, we optimized the handling of multi-class predictions by refining the interpolation step. Instead of relying on conventional integer-based resampling methods that are computationally demanding, we leveraged PyTorch’s \texttt{interpolate} function on floating-point tensors. This choice preserves numerical precision while improving throughput in large-scale volumetric segmentation. Together, these strategies enabled efficient inference across the entire test set.

\subsection{Results and Analysis}

\noindent\textbf{Quantitative Performance.}  
The quantitative results for both the debug and test phases are summarized in Table~\ref{tab:eval_results}. We report the Dice similarity coefficient and the HD95, with the former reflecting overlap accuracy and the latter assessing boundary alignment. Higher Dice and lower HD95 values indicate better performance.

\begin{table}[h]
\centering
\caption{Evaluation results across debug and test phases. Dice similarity coefficient and HD95 are reported.}
\renewcommand{\arraystretch}{1.4}
\begin{tabular}{l|l|c|c}
\hline
\textbf{Metric} & \textbf{Statistic} & \textbf{Debug Phase} & \textbf{Test Phase} \\ \hline

\multirow{7}{*}{\textbf{Dice Average}} & Min & 0.9090 & 0.5671 \\ 

 & 25\%  & 0.9371 & 0.7340 \\ 

 & 50\% & 0.9653 & 0.7821 \\ 

 & 75\%  & 0.9695 & 0.8329 \\ 

 & Max & 0.9737 & 0.8670 \\ 

 & Mean & 0.9493 & 0.7705 \\ 

 & Std & 0.0352 & 0.0754 \\ 
\hline

\multirow{7}{*}{\textbf{HD95 Average}} & Min & 11.13 & 54.58 \\ 

 & 25\%  & 11.16 & 77.29 \\ 

 & 50\% & 11.18 & 93.36 \\ 

 & 75\%  & 28.13 & 122.91 \\ 

 & Max & 45.07 & 206.55 \\ 

 & Mean & 22.46 & 104.59 \\ 

 & Std & 19.58 & 37.21 \\ 
\hline
\end{tabular}
\label{tab:eval_results}
\end{table}

In the debug phase, which included only three cases, our method demonstrated high segmentation accuracy with an average Dice score of 0.9493 and a relatively low HD95 of 22.46. However, the larger-scale test phase presented more challenging scenarios, where the average Dice dropped to 0.7705, and the mean HD95 increased to 104.59. This performance gap highlights the difficulty of generalization from a limited validation set to a more diverse and comprehensive test set. Nevertheless, the results remain competitive and validate the robustness of our approach under varying anatomical and imaging conditions.

\noindent\textbf{Qualitative Results.}
To provide visual insight into the segmentation performance, Fig.~\ref{fig:qualitative_results} presents representative examples from the debug phase. These three cases illustrate the method's ability to accurately delineate anatomical structures across different imaging conditions and patient anatomies. Each row displays the input CBCT image (left), the predicted segmentation result (center), and the corresponding ground truth annotation (right). The visual comparison demonstrates the accuracy of our segmentation results on debug data, with predicted boundaries closely matching the expert annotations across multiple anatomical regions.

\begin{figure}[htb]
\includegraphics[width=\textwidth]{4.pdf}
\caption{Representative segmentation results from debug phase cases. Each row shows (from left to right): input CBCT image, predicted segmentation result, and ground truth. The results demonstrate accurate delineation of anatomical structures across different patient anatomies and imaging conditions on debug data.} \label{fig:qualitative_results}
\end{figure}

As shown in Fig.~\ref{fig:nninteractive}, we provide a visual comparison of the segmentation results for nnInteractive with the introduction of 3 and 5 interaction points, respectively. This visualization highlights the impact of increasing the number of user interactions on the segmentation accuracy. Because the number of submissions permitted in the competition was limited, we did not include metric-based results in this analysis, focusing instead on the visual comparison of the segmentation outputs.

\begin{figure}[htb]
\includegraphics[width=\textwidth]{nninteractive.pdf}
\caption{The nnInteractive method is evaluated with 3 and 5 interaction points, highlighting the effect of prompt refinement on segmentation accuracy.} \label{fig:nninteractive}
\end{figure}

\section{Conclusion}
In this paper, we presented a segmentation framework for multi-class CBCT images, designed for the ToothFairy3 Challenge. Our approach leverages nnU-Net as a backbone and introduces SAP to account for the morphological variability of different anatomical structures. The proposed strategy enables differentiated handling of large and fine-scale structures, thereby reducing false positives while preserving clinically relevant details. Experiments demonstrated that our method achieves consistently high accuracy in the debug phase. Importantly, by optimizing interpolation strategies, we achieved notable improvements in inference efficiency. Furthermore, the integration of the interactive refinement module demonstrates a viable path for clinical deployment. It bridges the gap between fully automated processing and the need for meticulous precision in complex surgical cases, effectively balancing algorithmic efficiency with clinical reliability. 



%
% ---- Bibliography ----
%
% BibTeX users should specify bibliography style 'splncs04'.
% References will then be sorted and formatted in the correct style.
%
% \bibliographystyle{splncs04}
% \bibliography{mybibliography}
%
\bibliographystyle{splncs04}
\bibliography{ref}


\end{document}
