% This is samplepaper.tex, a sample chapter demonstrating the
% LLNCS macro package for Springer Computer Science proceedings;
% Version 2.21 of 2022/01/12
%
\documentclass[runningheads]{llncs}
%
\usepackage[T1]{fontenc}
% T1 fonts will be used to generate the final print and online PDFs,
% so please use T1 fonts in your manuscript whenever possible.
% Other font encodings may result in incorrect characters.
%
\usepackage{float}
\usepackage{multirow}
\usepackage{makecell}
\usepackage{graphicx}
% Used for displaying a sample figure. If possible, figure files should
% be included in EPS format.
%
% If you use the hyperref package, please uncomment the following two lines
% to display URLs in blue roman font according to Springer's eBook style:
%\usepackage{color}
%\renewcommand\UrlFont{\color{blue}\rmfamily}
%\urlstyle{rm}
%
\begin{document}
%
\title{Optimizing the CBCT Segmentation Pipeline with Intuition-Guided Processing}
\titlerunning{Optimizing the CBCT Segmentation Pipeline with IGP}
% If the paper title is too long for the running head, you can set
% an abbreviated paper title here
%
\author{Qingyu Kuang\inst{1,2}\orcidID{0009-0003-3830-0412}}
%
\authorrunning{Q. Kuang}
% First names are abbreviated in the running head.
% If there are more than two authors, 'et al.' is used.
%
\institute{Institute of Automation, Chinese
Academy of Sciences, Beijing, China \and
School of Artificial Intelligence, University of Chinese
Academy of Sciences, Beijing, China}
%
\maketitle              % typeset the header of the contribution
%
\begin{abstract}
In the past, general medical image models attracted considerable research interest. However, since medical imaging modalities vary widely and often fundamentally differ from RGB images, applying a general segmentation framework to specific tasks usually requires further optimization to achieve satisfactory performance. Cone-beam computed tomography (CBCT) is a commonly used medical imaging technique in dentistry. Optimizing the segmentation process for CBCT images can greatly enhance the effectiveness of computer-aided diagnostic systems in dental applications. In this work, we analyzed the ToothFairy3 dataset and proposed improvements to the nnU-Net framework. While preserving the auto-configuration capabilities of nnU-Net, we introduced targeted optimizations across the data preprocessing pipeline, network architecture, inference process, and postprocessing strategies to enhance performance for the CBCT multi-class segmentation task. Furthermore, the trained multi-class segmentation model can be integrated with user click prompts to train an interactive segmentation model. These modifications collectively reduced inference time, improved model effectiveness, and increased practical applicability. Code is available at \url{https://github.com/kaoquanyu-for/formedseg.git}.

\keywords{CBCT Segmentation \and Deep Learning \and Medical Images.}
\end{abstract}
%
\section{Introduction}
Over the past decade, digital dentistry has advanced rapidly, with its key focus being the acquisition and segmentation of complete three-dimensional dental models and related structures. Currently, the mainstream technologies for obtaining 3D dental models mainly include intraoral scanning (IOS) or desktop scanning, and CBCT. Among these, intraoral or desktop scanning can conveniently capture the geometric morphology of the tooth crown surface but is limited to recording the external structure of teeth~\cite{article2,article3}. In contrast, CBCT not only provides tooth surface information but also acquires internal 3D data such as jawbones, dental roots, and surrounding bone structures, offering greater advantages in clinical diagnosis and complex treatment planning~\cite{article4}. As a result, it is widely used in oral and maxillofacial examinations and dental diagnostics~\cite{five5}. However, manual segmentation of CBCT images is time-consuming and demands specialized expertise and experience. Therefore, training deep learning-based models to automatically segment structures such as the maxillofacial bones and teeth in CBCT images can significantly streamline the process of diagnosis, evaluation, and surgical planning for dentists, while also providing critical references for applications such as dental crown design and the fabrication of surgical guides~\cite{six6,seven7}.

However, current segmentation methods still face challenges due to variations in oral cavity opening states caused by different examination purposes, as well as considerable variations in morphological characteristics across different structures, which adversely affect segmentation accuracy and robustness. Examples of images with different oral cavity states from the public dataset ToothFairy3~\cite{nine9,ten10,eleven11} are illustrated in Fig.~\ref{fig1}. Furthermore, the practical application of these models is limited by computational resources and time constraints, underscoring the gap that remains between theoretical research and clinical deployment.
\begin{figure}[h]
\centering
    \includegraphics[width=\textwidth]{diff_state.png}
    \caption{CBCT images and annotations from the ToothFairy3 dataset. The top row displays images under different oral cavity opening states, and the bottom row shows the corresponding annotated label images.}
    \label{fig1}
\end{figure}

\noindent To address these issues, we re-examined the fundamental differences between CBCT images and RGB images. Inspired by human perceptual intuition for segmentation, we developed a novel processing pipeline for CBCT volume segmentation based on the nnU-Net~\cite{article1} framework.

The main contributions of our work can be summarized as follows:
\begin{itemize}
    \item Based on the nnU-Net framework, we introduced several modifications to enhance segmentation accuracy. These include adjusting the type and depth of deep supervision loss computation, and adding category prediction heads to supervise the encoding process, thereby encouraging the extraction of more discriminative features. 
    \item Furthermore, the trained multi-class segmentation model can be integrated with user click prompts to train a single-class segmentation model, enabling interactive prompt-based segmentation.
    \item The rules for constraining flip augmentation were adjusted to mitigate mis-segmentation caused by symmetrically similar structures in the images. In addition, a novel augmentation method termed ``tooth eraser'' was introduced to increase data diversity.
    \item A newly designed post-processing workflow was tailored to the structural features of the segmentation output, balancing the trade-off between accuracy and inference time. We also optimized the inference process to enable large-patch inference.
\end{itemize}


\section{Methods}
\subsection{Overview}
\begin{figure}[h]
\centering
    \includegraphics[width=\textwidth]{method.png}
    \caption{ Overview of the segmentation processing pipeline, illustrating the key stages and components of both the training and inference processes.} 
    \label{fig2}
\end{figure}

\noindent We have revisited the entire training and inference pipeline for medical image segmentation. Building upon the nnU-Net framework, we further improved a multi-class segmentation pipeline tailored for CBCT images, as illustrated in Fig.~\ref{fig2}.

For the segmentation task on the ToothFairy3 dataset, the network utilized the architecture configuration derived from nnU-Net’s automated parameter configuration, including the network depth and feature dimensionality across different layers. Building on this foundation, we introduced modifications to the data preprocessing pipeline, network architecture, postprocessing strategies, and inference procedure to enhance its performance specifically for the ToothFairy3 segmentation task.


\subsection{Dataset}
The ToothFairy3 dataset comprises a large collection of 3D-annotated CBCT scans covering 77 anatomical structures that are highly relevant to orthodontics. In addition, the associated challenge not only focuses on segmentation accuracy but also incorporates inference efficiency as an evaluation metric and introduces an interactive segmentation task for the Inferior Alveolar Canal, addressing both automation and clinical needs. These features make ToothFairy3 particularly suitable for developing and evaluating segmentation models with strong clinical applicability.

The dataset contains 532 images, each with an isotropic resolution of 0.3 along all axes; after discarding some extreme cases, we manually selected 507 samples. The dataset comprises three different sets, and example images from each set are shown in Fig.~\ref{fig1}.
\subsection{Data Preprocessing}
\subsubsection{Tooth Eraser.} 
Based on image characteristics, given that missing teeth are always present in the images, we randomly remove complete lower teeth without crowns in the images to enhance image diversity. Its effects are shown in Fig.~\ref{fig3}.

\begin{figure}[h]
    \includegraphics[width=\textwidth]{lack.png}
    \caption{ The visualization shows the processing effects of manually removing teeth according to image features.} 
    \label{fig3}
\end{figure}
\subsubsection{Logical Consistency Flip Augment.} 
Flipping augmentation of 3D images is commonly employed as a standard processing step to enhance data diversity. However, as noted in the article~\cite{eight8}, when the segmentation targets include symmetrically similar structures, applying flipping augmentation without appropriate restrictions may cause confusion between these symmetrical parts in the images. This issue is particularly prominent in CBCT data, where multiple anatomical structures such as teeth and the inferior alveolar canal (IAC) display inherent symmetry. Moreover, when working with small patches, it becomes difficult to distinguish between upper and lower teeth based on structural features alone.

This issue is seldom encountered when processing images of other body parts, primarily for two reasons. First, most anatomical regions possess sufficient structural features with low morphological similarity between distinct structures. Second, in many cases there is no clinical need to differentiate between symmetric categories.

Intuitively, to mitigate the mis-segmentation caused by symmetrical structures, we diverged from the experimental setup described in~\cite{eight8} by retaining the flipping augmentation operation but imposing a key constraint: the number of flipping operations must always be even. When allowing flips along the x, y, and z axes, this means either performing no flips or flipping across an even number of axes.

The intuition behind this is straightforward: an odd number of flips results in a completely symmetrical version of the image, which can disrupt the perception of anatomical orientation. When the model is sufficiently complex, it may still learn to distinguish such flipped samples. However, both the model and human observers are likely to struggle when dealing with inherently symmetrical anatomical regions. The effects of applying different numbers of flipping operations are illustrated in Fig.~\ref{fig4}.
\begin{figure}[h]
\centering
    \includegraphics[width=0.75\textwidth]{mirror.png}
    \caption{ The left figure shows the results of an even number of flipping operations, while the right figure displays the visualizations generated by an odd number of flipping operations.} 
    \label{fig4}
\end{figure}
\subsubsection{Dimension Expansion.} 
The imaging principle of CBCT differs significantly from that of RGB images, as its pixel values carry specific physical meanings. Since the ToothFairy3 dataset used in this study extends ToothFairy2 with additional annotation categories, we randomly sampled points across each category and recorded their Hounsfield Unit (HU) values in the ToothFairy2 dataset and visualized the statistics in Fig.~\ref{fig5}. Based on this analysis, we divided the intensity values into multiple channels using the following intervals: [-600, 0], [0, 1000], [0, 2000], [1000, 3000], and [3000, maximum]. Different HU values may correspond to different tissue types, and in clinical practice, different intensity ranges are commonly used to capture images of specific tissues. Therefore, we analyzed the data ranges for each anatomical label and established the divisions described above. Given that teeth generally exhibit high HU values, the low-HU regions that are challenging to distinguish were split into multiple channels to provide the model with more detailed information.
To enhance edge information, we computed a boundary channel by applying the Laplacian operator to the image restricted to the intensity range [-600, 3000], and incorporated it as an additional channel. At this stage, each channel is individually normalized to [0,1].
 This multi-channel partitioning strategy constitutes one key component of our data preprocessing pipeline.
\begin{figure}[h]
    \includegraphics[width=\textwidth]{10-95_patch.png}
    \caption{The figure shows the Hounsfield Unit (HU) value ranges for different anatomical structures.} 
    \label{fig5}
\end{figure}

\subsection{Network Architecture}
To enable accurate identification of different segmentation categories, we refined the deep supervision mechanism in the nnU-Net by reducing the number of supervised decoding layers from supervision at each stage to only the final three layers. Additionally, a category prediction head was added to the final encoder layer to perform 77-class prediction for each input image patch in the ToothFairy3 segmentation task. This design enhances discriminative feature learning through explicit category-wise supervision. 

For this task, we employed a composite loss function consisting of cross-entropy loss, Dice loss, and focal cross-entropy loss, where the index $i$ denotes the level of the deep-supervision output. The formulation is as follows:
\begin{equation}
\mathrm{loss} = \left( \sum_{i=0}^{2} \frac{1}{2^i} \left( l_{\mathrm{dice}}^i + l_{\mathrm{fce}}^i \right) \right) + l_{\mathrm{ce}}^{\mathrm{class}}
\end{equation}

\noindent Dice loss and focal cross-entropy loss are computed for the outputs of the decoding stages at different levels, and cross-entropy loss is computed for the output of the category prediction head.

When training a single-class segmentation model with user click prompts, we froze the pretrained multi-class segmentation model and stacked the user click information with the multi-class inference outputs as an additional input channel to train a dedicated single-class segmentation head. When organizing the user click information, we only assign a value of 1 to the positions that the user clicked, and set all other positions to 0. At this stage, the model outputs predictions for only one class.

\subsection{Model Training}
All experiments were conducted on 4 NVIDIA V100 GPUs (32 GB). During
training, 20\% of each set was used for validation. The models were trained for 30 epochs, with each epoch corresponding to a full traversal of all training data rather than random patch sampling. The training patch size was set to [160, 192, 192] and the batch size was set to 1. The model achieving the best performance on the validation set was retained.

For nnU-Net, we adopted the default UNet L configuration and further customized it. In addition to the preprocessing enhancements described above, RandAffine augmentation was applied. The model was optimized using AdamW with an initial learning rate of $1 \times 10^{-4}$ and a ReduceLROnPlateau learning rate scheduler with a reduction factor of 0.8. The learning rate is not reduced below $1 \times 10^{-6}$, and the scheduler monitors the validation metric at every epoch.

\subsection{Inference}
Due to the substantial computational requirements of training 3D data and the constraints on inference time and computing resources imposed by the ToothFairy3 challenge, we intuitively reasoned that increasing the patch size, during training and inference, could serve as an effective way to mitigate mis-segmentation in symmetrically similar structures. Although limited computational resources restricted the training patch size to [160, 192, 192], we re-examined the inference functions in both nnU-Net and MONAI and implemented strict memory management on the GPU during inference. This approach allowed the use of patch sizes consistent with those used in training, reduced the number of patches requiring inference under the same overlap ratio setting, shortened inference time, and ultimately improved the practical usability of the model.
\begin{table}
\centering
\caption{Inference time for different image sizes.}\label{refer}
\begin{tabular}{|c|c|c|c|}
\hline
Image size &  \makecell{Inference \\Repeat (\%)} & Patch Num & \makecell{Inference time (s)\\(without argmax and post-processing)}
\\
\hline
262, 512, 512(F\_001) & 25 & 32 & 41\\
170, 352, 370(P\_001) &  25 & 18 & 19\\
188, 385, 462(S\_0001) & 25 & 18 & 17\\
\hline
\end{tabular}
\end{table}

\noindent
Table~\ref{refer} shows the inference time required on an RTX 4060 GPU (8\,GB). By keeping only one patch and the model in GPU memory at a time, memory consumption is relatively low. This enables the use of a larger patch size and significantly reduces the number of patches needed for inference. As tested, the patch size can be increased to [192, 192, 192].

Post-processing and the argmax operation are configured to run on the CPU. As their execution time is strongly affected by system RAM, these steps are excluded from the reported runtime.


\subsection{Data Post-processing}
Determining the optimal post-processing strategy in nnU-Net requires repeated inference across the entire dataset to evaluate the retention of the largest connected component for each category, which is a highly time-consuming process. Based on the structural characteristics of the data, we employed a hybrid strategy combining projection-based connected region preservation with 3D connected-component labeling.

Compared to nnU-Net’s automated strategy, our method requires configuration based on observational analysis of the dataset, but it reduces inference time and avoids repeated post-processing selection across different combination schemes. The specific procedure is shown in Fig.~\ref{fig6}.

It can be observed that most segmentation errors in the model’s output occur in the maxilla, mandible, and pharyngeal regions. During multi-class segmentation, we first retain the largest 3D connected components of the pharynx and mandible. The labels are then projected along the z-axis, and the two largest connected regions in the projection are preserved. These are back-projected into 3D to remove erroneous segmentation areas outside the main anatomical structures, yielding the final segmentation result after post-processing.

When processing the output of a single-class segmentation, we assign labels to the results based on user prompts, as also illustrated in Fig.~\ref{fig6}.
\begin{figure}[h]
\centering
    \includegraphics[width=\textwidth]{postprocessing.png}
    \caption{The figure illustrates the post-processing pipeline for both multi-class and single-class segmentation. KLCC stands for ``Keeping the Largest Connected Component''.}
    \label{fig6}
\end{figure}
\section{Results}
\subsection{Validate the effectiveness of the flip constraint}
We used a portion of the data as a validation set. For each patch, we performed inference to compute the Dice coefficient, then averaged the results. All models were trained for the same duration, and the best results on the validation set were recorded. The outcomes, presented in Table~\ref{constraint}, demonstrate the effectiveness of the flip constraint.

\begin{table}[h]
\centering
\caption{Results of different constraint rules.}\label{constraint}
\begin{tabular}{|c|c|}
\hline
Model &  Dice 
\\
\hline
default & 0.9288\\
constraint & 0.9409\\
\hline
\end{tabular}
\end{table}

\subsection{Validation Results}
The inference results for the three different sets are presented in Fig.~\ref{fig7}. 
\begin{figure}[t]
\centering
    \includegraphics[width=\textwidth]{result.png}
    \caption{The top row shows the labels, and the bottom row shows the model's inference results, corresponding from left to right to F\_001, P\_001, and S\_0001 in ToothFairy3.} 
    \label{fig7}
\end{figure}

On the current challenge leaderboard, the results from our final submission show that in the multi-instance segmentation task, the maximum Dice score reached 0.820, while the minimum was 0.149.

\noindent Similarly, for the interactive segmentation task, the highest Dice score achieved was 0.92, although several cases completely failed, producing no segmentation output whatsoever.

The best results of both tasks in the test phase are summarized in Table~\ref{result}. The overlap ratio in the inference function was set to 25\% for the multi-instance segmentation task and 50\% for the interactive segmentation task. In addition, our inference speed ranked second in the interactive segmentation task.

\begin{table}
\centering
\caption{Evaluation results of both tasks in test phase. Dice similarity coefficient and HD95 are reported.}
\label{result}
\begin{tabular}{c c cc}
\hline
Metric & Statistic & Multi-class & Interactive \\
\hline
\multirow{7}{*}{Dice Average}
 & Min  & 0.149 & 0.0 \\
 & 25\% & 0.516 & 0.66 \\
 & 50\% & 0.652 & 0.83 \\
 & 75\% & 0.718 & 0.88 \\
 & Max  & 0.820 & 0.92 \\
 & Mean & 0.594 & 0.72 \\
 & Std  & 0.185 & 0.25 \\
\hline
\multirow{7}{*}{HD95 Average}
 & Min  & 78.40 & 1.0 \\
 & 25\% & 107.09 & 1.73 \\
 & 50\% & 142.38 & 2.80 \\
 & 75\% & 185.98 & 45.11 \\
 & Max  & 358.28 & 607.86 \\
 & Mean & 163.31 & 76.82 \\
 & Std  & 69.14 & 159.13 \\
\hline
\end{tabular}
\end{table}

\noindent Inspection of the inference results shows that certain data setup errors remain to be resolved, since the model does not yet reliably identify the implants and the maxillary sinus. It also remains a challenge for the model to accurately distinguish between dental crowns and bridges. In addition, in the interactive segmentation task, the model may fail when processing certain images. This failure is possibly caused by variations in oral cavity opening states across images or by overfitting, which prevents the model from generalizing to different conditions. Nevertheless, these preliminary experiments have already achieved promising progress in both multi-class segmentation and interactive segmentation, providing valuable insights and ideas for further studies.


% \begin{theorem}
% This is a sample theorem. The run-in heading is set in bold, while
% the following text appears in italics. Definitions, lemmas,
% propositions, and corollaries are styled the same way.
% \end{theorem}
% %
% % the environments 'definition', 'lemma', 'proposition', 'corollary',
% % 'remark', and 'example' are defined in the LLNCS documentclass as well.
% %
% \begin{proof}
% Proofs, examples, and remarks have the initial word in italics,
% while the following text appears in normal font.
% \end{proof}


\begin{credits}
\subsubsection{\ackname}We would like to express our sincere gratitude to the organizers of the ToothFairy Challenge for their continuous efforts in creating and updating the publicly available CBCT datasets. Their dedicated work has made this study possible.

\subsubsection{\discintname}
The authors have no competing interests to declare that are relevant to the content of this article. 
\end{credits}
%
% ---- Bibliography ----
%
% BibTeX users should specify bibliography style 'splncs04'.
% References will then be sorted and formatted in the correct style.

\bibliographystyle{splncs04}
\bibliography{mybibliography}


\end{document}
