% This is samplepaper.tex, a sample chapter demonstrating the
% LLNCS macro package for Springer Computer Science proceedings;
% Version 2.21 of 2022/01/12
%
\documentclass[runningheads]{llncs}
%
\usepackage[T1]{fontenc}
% T1 fonts will be used to generate the final print and online PDFs,
% so please use T1 fonts in your manuscript whenever possible.
% Other font encodings may result in incorrect characters.
%
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{array}
\usepackage{booktabs}
\usepackage{hyperref}
\usepackage{color}
\renewcommand\UrlFont{\color{blue}\rmfamily}
\urlstyle{rm}
\hypersetup{
    colorlinks=true,
    linkcolor=blue,
    filecolor=magenta,      
    urlcolor=cyan
    }

% Used for displaying a sample figure. If possible, figure files should
% be included in EPS format.
%
% If you use the hyperref package, please uncomment the following two lines
% to display URLs in blue roman font according to Springer's eBook style:
%\usepackage{color}
%\renewcommand\UrlFont{\color{blue}\rmfamily}
%\urlstyle{rm}
%
\begin{document}
%
\title{Morphology-Driven Deep Watershed Transform for 3D Tooth Segmentation}
%
%\titlerunning{Abbreviated paper title}
% If the paper title is too long for the running head, you can set
% an abbreviated paper title here
%
\author{Tomasz Szczepański\inst{1}\orcidID{0000-0001-6189-478X} \and
Szymon Płotka\inst{2}\orcidID{0000-0001-9411-820X}}
%
% \authorrunning{F. Author et al.}
% First names are abbreviated in the running head.
% If there are more than two authors, 'et al.' is used.
%
\institute{Sano Centre for Computational Medicine, Cracow, Poland \\
\email{t.szczepanski@sanoscience.org} \and
Jagiellonian University, Cracow, Poland
}

%
\maketitle              % typeset the header of the contribution
%
\begin{abstract}
Segmentation of dentomaxillofacial structures in Cone-Beam Computed Tomography (CBCT) remains challenging, particularly for fine details such as root apices and nerve canals, which are crucial for evaluating root resorption in digital dentistry or for making surgical planning more precise. We present an approach that unifies instance detection and multi-class dentomaxillofacial structure segmentation in CBCT scans, in the scope of the ToothFairy3 Challenge. We adapt a Deep Watershed method, modeling each anatomical structure as a continuous 3D energy basin encoding voxel distances to class boundaries. This instance-aware representation ensures accurate segmentation of narrow, complex dentomaxillofacial structures. We train and evaluate our solution on the ToothFairy3 dataset, comprising 532 CBCT scans with voxel-wise annotations. Our method achieved a mean Dice coefficient of 0.742 and an HD95 of 111.13\,mm on the test set. We provide the implementation at \href{https://github.com/tomek1911/TF3}{\texttt{https://github.com/tomek1911/TF3}}.

\keywords{CBCT segmentation \and ToothFairy3 Challenge \and Morphological inductive bias \and Deep Watershed}
\end{abstract}
%
%
%
\section{Introduction}

In this report, we describe our solution for Task~1, ``Multi-class segmentation'', of the ToothFairy3 challenge. Automatic tooth segmentation in dental CBCT volumes is a critical step for various clinical applications, including orthodontic planning, endodontics, and surgical guidance. Building upon previous efforts in the ToothFairy challenges, we present a method adapted to the increased complexity of ToothFairy3. Compared to ToothFairy2, the new dataset contains 52 additional CBCT volumes acquired with a different scanner, and annotations have been substantially expanded to include 35 new labels, covering pulp cavities for all 32 teeth, left and right incisive canals, and the lingual canal. The quality of annotations has also been improved, offering a richer resource for developing robust segmentation algorithms.

The task requires accurate voxel-wise labeling of all tooth structures and internal anatomical features within high-resolution CBCT volumes. It presents several challenges: the small size and variability of pulp cavities, the complex shape of incisive and lingual canals, and the presence of noise and artifacts in CBCT scans. Furthermore, inter-patient anatomical variations and differences in scanner acquisition parameters increase the difficulty of generalizing segmentation models.

Several methods have been proposed for tooth segmentation in previous challenges and in the research literature~\cite{isensee2024scaling,bolelli2024segmenting}. Classical approaches include atlas-based registration, graph-based techniques, and multi-stage pipelines, but what all recent advances have in common is that they leverage deep learning for volumetric segmentation. Notably, approaches such as SGANET~\cite{li2022semantic}, TSG-GCN~\cite{liu2024individual}, ToothSeg~\cite{cui2021hierarchical}, and GEPAR3D~\cite{szczepanski2025gepar3d} have demonstrated the effectiveness of combining volumetric convolutional networks with morphology-aware guidance. Moreover, incorporating geometry-related features has been shown to enhance the model's generalization to external datasets~\cite{szczepanski2024let}.

Our approach extends the methodology proposed in GEPAR3D, incorporating a 3D Deep Watershed Transform guided by a direction map to enable morphology-aware learning of more than 32 teeth classes. This design allows the network to leverage both volumetric context and fine-grained morphological cues, leading to precise delineation of teeth and internal structures such as pulp cavities or nerve canals. To accommodate the high-resolution CBCT volumes within challenge memory constraints, we adapt a sliding window inference strategy, improving upon the MONAI-based sliding window used in the original GEPAR3D implementation. By combining morphology-guided learning with efficient volumetric inference, our solution effectively addresses the increased label complexity, variability, and inherent challenges of ToothFairy3.

\begin{figure}[t!]
    \centering
    \includegraphics[width=\linewidth]{graphics/TF3_solution_vHQ.png}
    \caption{An overview of the proposed solution, which unifies instance detection and multi-class segmentation for dentomaxillofacial structures in CBCT scans. Our model simultaneously performs multi-class segmentation and instance regression (gray). It also handles both multi-class and binary segmentation, incorporating techniques like majority voting and pulp fusion (blue). During training, we capture complex apex geometries via an Energy Direction loss (yellow) and use an instance regression task to generate energy maps for the Deep Watershed Algorithm (red).}
    \label{fig:overview}
\end{figure}

\section{Methods}

An overview of our pipeline is presented in Fig.~\ref{fig:overview}. The proposed solution builds upon the GEPAR3D method~\cite{szczepanski2025gepar3d}, extending it to the multi-class setting required by ToothFairy3. Our model jointly addresses multi-class semantic segmentation and instance-level regression, enabling it to separate individual teeth while also capturing their internal anatomical structures. To support both multi-class and binary segmentation objectives, we integrate strategies such as majority voting across classes and pulp fusion to ensure consistent labeling of internal cavities. During training, we introduce auxiliary objectives to enhance morphological awareness: an Energy Direction loss to model complex apex geometries and elongated nerve canals (see Fig.~\ref{fig:dir_map}), and an instance regression task to generate energy maps that guide the 3D Deep Watershed Transform. These components together encourage the network to learn both local morphological details and global structural consistency.

\begin{figure} [t!]
    \centering
    \includegraphics[width=0.6\linewidth]{graphics/Energy Map Direction TF3.png}
    \caption{We provide slices of the 3D Energy Direction Map (a) overlaid with semi-transparent (opacity 0.5) segmentation labels, enabling visualization of structural boundaries within spatial context. The direction map, derived by applying a 3D Sobel kernel to the distance map, assists the model in segmenting elongated and thin structures. While the distance map (b) approaches zero at the nerve canal–bone boundary, the direction map shows contrasting values, highlighting regions that are difficult to segment. Boundary regions between individual teeth (c, d) are similarly marked by abrupt vector changes, where regression errors are heavily penalized through the angular loss $L_{dir}$, enforcing directional consistency.}
    \label{fig:dir_map}
\end{figure}


\noindent
\textbf{Deep Watershed Instance Regression.}  
To produce the inputs required by the Deep Watershed algorithm, we train the network to solve two complementary volumetric regression tasks: (i) a continuous energy-basin regression that encodes each \textit{pulp-free} tooth instance as a smooth scalar field and (ii) a per-voxel direction (descent) estimate that refines boundary localization, especially in regions with steep gradients such as root apices and elongated nerve canals (see Fig.~\ref{fig:nerve_canal}). We first create a secondary set of instance labels in which all pulp voxels have been removed from tooth instances (this guarantees that tooth instances are disjoint and suitable for watershed processing). Ground-truth energy basins $E_{GT}(\mathbf r)$ are computed on these pulp-free instances using the Euclidean Distance Transform (EDT) to each instance boundary separately (based on the semantic classes of the GT) and then normalized to $[0,1]$ for numerical stability. The network regresses a continuous energy map $\widehat E(\mathbf r)$ (single-channel) using a mean squared error objective:
\[
L_{EDT} \;=\; \frac{1}{N}\sum_{\mathbf r} \big( E_{GT}(\mathbf r) - \widehat E(\mathbf r) \big)^2.
\]

For directional supervision we compute the gradient field of the ground truth energy $\nabla E_{GT}(\mathbf r)$ (implemented via a 3D Sobel-Feldman operator along $x,y,z$) and form unit direction vectors
\[
\mathbf u_{GT}(\mathbf r) \;=\; \frac{\nabla E_{GT}(\mathbf r)}{\max\{\|\nabla E_{GT}(\mathbf r)\|_2,\varepsilon\}},
\]
with a small $\varepsilon$ to avoid division by zero. The model predicts a 3-channel direction vector $\widehat{\mathbf u}(\mathbf r)$ which we normalize voxelwise. We supervise the directions with an angular loss:
% \[
% L_{dir} \;=\; \frac{1}{|P|}\sum_{\mathbf r \in P} \Big(\cos^{-1}\big(\langle \mathbf u_{GT}(\mathbf r), \widehat{\mathbf u}(\mathbf r)\rangle\big)\Big)^2,
% \]
\[
L_{\mathrm{dir}}
= 
\frac{1}{N}
\sum_{i=1}^{N}
\left(
\frac{\cos^{-1}\!\big(\langle \mathbf u_{GT}^{(i)},\, \widehat{\mathbf u}^{(i)} \rangle\big)}{\pi}
\right)^{2},
\]
where $N$ is the number of voxels included in the loss. We clip $\cos^{-1}$ inputs to $[-1,1]$ for stability and divide by $\pi$ to scale the angular error to $[0,1]$. To focus the direction learning on anatomically relevant boundaries, we restrict the sum to a mask (see Fig.~\ref{fig:dir_map}c) that includes voxels belonging to tooth instances and to thin/elongated semantic classes (e.g., nerve canals) but excludes pulp voxels.

\begin{figure}
    \centering
    \includegraphics[width=0.6\linewidth]{graphics/Energy Map Direction TF3_nerve.png}
    \caption{Slices of the 3D Energy Direction Map with the inferior alveolar nerve visualized (a, b) show that the map clearly delineates the boundary between nerve and bone, both in perpendicular cross-sections and along the canal. In (c), the nerve canal and root apices are visible, with rapid angular transitions in the vector field highlighting anatomically complex regions. These transitions are particularly pronounced at the root apices, where fine, tapering structures curve sharply and diverge from surrounding bone.}
    \label{fig:nerve_canal}
\end{figure}

\noindent
\textbf{Deep Watershed Instance Classification via Majority Voting. }  
At inference, we first obtain voxel-wise semantic predictions for all classes (including teeth without pulp, nerve canals, the binary pulp map, and jaw/skull bones) and the predicted continuous energy map $\widehat E$. To isolate the detected instances, we binarize the semantic outputs into an \textit{object mask}. Seed points for the watershed are extracted from the predicted energy-map basins by thresholding the basin depth (empirically $\beta=0.5$). The Watershed Transform is then run on $\widehat E$, constrained to the \textit{object mask} and using the extracted seeds. This yields disjoint 3D object instances $V_j$.

Each resulting instance is assigned a semantic class by majority voting on the multi-class semantic branch:
\[
\mathrm{class}(V_j)\;=\;\arg\max_c \sum_{\mathbf r\in V_j} \mathbf 1\{S(\mathbf r)=c\},
\]
where $S(\mathbf r)$ is the per-voxel semantic prediction and $\mathbf 1\{\cdot\}$ is the indicator function.

\noindent
\textbf{Pulp fusion.}  
During training, pulp voxels are optimized independently of the other classes: pulp segmentation is trained as a separate binary segmentation problem. We optimize a composite loss $L_{pulp}=L_{BCE}^{(w)}+L_{Dice}$, where the weight $w_p=5$ is applied to positive (pulp) voxels in the BCE term to counteract severe class imbalance. Since ToothFairy3 provides pulp annotations for all 32 teeth, but evaluation metrics treat pulp as a single aggregated class, we collapse these labels into one fused pulp mask. The final prediction is obtained by first running instance segmentation via deep watershed and majority voting for tooth and canal classes, followed by assigning pulp voxels on top of the corresponding multi-class predictions. This ensures consistency with the challenge evaluation protocol while still leveraging detailed pulp annotations during learning.

\noindent
\textbf{Overall training objective.}  
The final loss function combines the contributions from semantic segmentation, pulp segmentation, and instance regression. Specifically, we use a weighted sum of four components: (i) multi-class semantic segmentation loss $L_{seg}$, implemented as a combination of cross-entropy and Dice; (ii) binary pulp segmentation loss $L_{pulp}$, formulated as weighted BCE plus Dice to address strong class imbalance; (iii) energy basin regression loss $L_{EDT}$, which drives accurate continuous energy map prediction for watershed separation; and (iv) direction field loss $L_{dir}$, which regularizes geometric consistency by enforcing alignment between predicted and ground-truth descent directions. This design balances voxel-level classification with morphology-aware instance regression, ensuring robust segmentation of both large structures (e.g., jaw bones) and fine-scale anatomy (nerve canals, root apices).

\noindent
\textbf{Memory-efficient sliding-window inference.}  
Large 3D volumes exceed GPU memory limits during dense prediction, so inference is typically performed with a sliding-window approach with overlapping patches. The default MONAI implementation accumulates intermediate patch predictions in lists before merging, which leads to high memory consumption proportional to the number of overlapping patches. To address this, we implemented a memory-efficient variant that directly accumulates predictions into preallocated output tensors, avoiding intermediate storage.  

For each patch, we apply the model to obtain multi-class logits, energy distance maps, and pulp probabilities. Predictions are weighted by an importance map (constant or Gaussian blending) and accumulated on the fly into global tensors: voxel-wise probability sums on the CPU for multi-class segmentation, and GPU-accumulated maps for distance and pulp outputs. A separate weight accumulator ensures correct normalization. This design prevents redundant storage of overlapping patches while retaining smooth blending across patch boundaries. The memory-efficient approach substantially reduces RAM usage during inference while preserving identical prediction quality to the original MONAI sliding window inferer.

\section{Experimental design}

\subsection{Dataset}

We train and evaluate our method on the novel ToothFairy3 dataset~\cite{lumetti2024enhancing,bolelli2024segmenting,bolelli2025segmenting}, which consists of multi-center data from centers A, B, and C, comprising 417, 63, and 52 cases, respectively. We randomly selected 10 cases from each center for validation (30 in total) and used the remaining 502 cases for training.

\subsection{Implementation details}
All scans are resampled to an isotropic resolution of $0.3 \times 0.3 \times 0.3$ mm$^{3}$, with Hounsfield Unit intensities clipped to $[0, 3000]$ and normalized to $[0,1]$. During training, we randomly crop 288 $\times$ 288 $\times$ 160 patches and pad with zeros if necessary. The model is trained for 400 epochs with AdamW, batch size of 2, and a cosine annealing scheduler. The loss function is defined as:

\begin{equation}
    L = \Lambda_{1}L_{EDT} + \Lambda_{2}L_{seg} + \Lambda_{3}L_{dir} + \Lambda_{4}L_{pulp},
\end{equation}
\noindent
with empirically set weights $\Lambda_{1} = 10$, $\Lambda_{2} = 0.1$, $\Lambda_{3} = 1.0$, $\Lambda_{4} = 1.0$ for balance. The initial learning rate and weight decay are set to $10^{-3}$ and $10^{-4}$, respectively.

Our implementation was developed with PyTorch 2.4.0 and MONAI 1.4.0. Training was performed on a single NVIDIA A100 GPU (80 GB) using float32 precision, while inference employed mixed precision (float16) and was executed on an NVIDIA T4 GPU (16 GB).

\subsection{Evaluation metrics}

The segmentation performance was quantitatively evaluated using two metrics: the Dice Similarity Coefficient (DSC, \%) to measure volumetric overlap and the 95th percentile Hausdorff Distance (HD95, mm) to assess boundary accuracy. A third evaluation criterion, segmentation time, will be reported by the organizers following publication of the final ranking board.

\section{Results}

This section presents the quantitative and qualitative results from the official test phase leaderboard for ``Task~1 -- Multi-class Segmentation''.

\noindent
\textbf{Quantitative results.} Our solution participated in the ``Task~1 -- Multi-class Segmentation'' challenge. Table~\ref{tab:official_test} shows the eight best submissions on the official test phase leaderboard. Overall, we achieved an mDSC of 74.22$\pm$8.1\% and an mHD95 of 111.13$\pm$39.40 mm across all 50 test cases. In the final leaderboard, we ranked 6th overall, and 5th in terms of mDSC among the 12 teams.

\begin{table}[t!]
\centering
\caption{Official top 8 leaderboard test phase results for Task 1 - Multi-class segmentation of ToothFairy3 challenge.}
\label{tab:official_test}
\begin{tabular}{cccc}
\toprule
\textbf{Position} & \textbf{Team} & \textbf{mDSC (\%)} & \textbf{mHD95 (mm)}\\
\midrule
1. & Black\_Myth & 79.81$\pm$6.4 & 88.72$\pm$32.33 \\
2. & TAIR Lab & 79.20$\pm$6.5 & 93.18$\pm$30.43 \\
3. & sjtu\_eiee & 77.05$\pm$7.5 & 104.59$\pm$37.21\\
4. & ring821 & 76.84$\pm$9.7 & 104.40$\pm$47.98 \\
5. & DLaBella29 & 73.86$\pm$7.1 & 97.71$\pm$33.20\\
6. & \underline{SMIR (ours)} & 74.22$\pm$8.1 & 111.13$\pm$39.40 \\
7. & LAVIA Lab & 69.70$\pm$9.4 & 144.97$\pm$48.90 \\
8. & gagaha & 55.1$\pm$17.6 & 172.49$\pm$63.60 \\
\bottomrule
\end{tabular}
\end{table}

\noindent
\textbf{Qualitative results.} As shown in Fig.~\ref{fig:qualitative}, our method produces generally accurate segmentations. Some errors remain, primarily undersegmentation of jaw bone structures or omission of the lingual nerve. Nonetheless, the method successfully delineated most of the challenging inferior alveolar nerve canal and correctly classified individual tooth instances.

\begin{figure}[h!]
    \centering
    \includegraphics[width=\linewidth]{graphics/Qualitative_TF3_FIXED.png}
    \caption{Qualitative results of our method, proposed as a solution to the ToothFairy3 challenge. We visualize a sample from the validation set (center A). Ground truth is shown on the right, with both a 3D rendering and representative 2D slices, while the corresponding predictions are shown on the left. Our method yields precise nerve canal segmentation, as shown in the top-row slices and 3D transparent volumes, but shows reduced accuracy in matching the ground truth upper and lower jaw bone.
}
    \label{fig:qualitative}
\end{figure}


\section{Conclusions}
In this work, we presented our solution for the ToothFairy3 challenge, addressing multi-class segmentation of CBCT scans including tooth instances, pulp cavities, nerve canals, and jaw structures. Our method extends the GEPAR3D framework with a 3D deep watershed transform guided by direction maps, enabling morphology-aware learning and robust instance separation adapted to 45 dentomaxillofacial classes. Handling pulp as a separate binary task allowed effective fusion with Deep Watershed-based instances while avoiding label overlap.  

We further introduced a memory-efficient sliding-window inference to process large CBCT volumes and optimized a combined loss comprising multi-class, pulp, and instance regression components to balance geometric precision with fine-structure accuracy. This design improved delineation of challenging anatomical features, such as root apices, narrow nerve canals, and pulp cavities.  

Unfortunately, our method achieved results inferior to those reported in GEPAR3D. Unlike that approach, we did not leverage a geometrical prior to regularize the loss function, as a Statistical Shape Model was not available for the ToothFairy3 dentomaxillofacial labels. Furthermore, after submission we discovered that our Direction Map labels had been discretized, which substantially reduced the information they carried. We plan to address this issue in future iterations.

Future work will integrate pulp directly into the multi-class segmentation branch and refine the direction-map auxiliary task to better capture narrow pulp fragments and fine canal structures, aiming to further enhance segmentation accuracy and anatomical fidelity.

\begin{credits}
\subsubsection{\ackname} Tomasz Szczepański is supported by the EU's Horizon 2020 programme (grant no. 857533, Sano) and the Foundation for Polish Science's International Research Agendas programme (MAB PLUS/2019/13), co-financed by the EU under the European Regional Development Fund and the Polish Ministry of Science and Higher Education (contract no. MEiN/2023/DIR/3796).
\subsubsection{\discintname}
The authors have no competing interests to declare.
\end{credits}

% \section{Conclusions}
%  we presented our solution for the ToothFairy3 challenge, addressing the multi-class segmentation of CBCT scans with an emphasis on tooth instances, pulp cavities, nerve canals, and jaw structures. Our approach extended the GEPAR3D framework by integrating a 3D deep watershed transform with direction-map guidance, allowing for improved morphology-aware learning and robust instance separation. A key contribution of our method was the separation of pulp as an independent binary segmentation task, enabling effective fusion with tooth predictions and alleviating label intersection issues.  

% We further introduced a memory-efficient sliding-window inference scheme, designed to handle large CBCT volumes within hardware constraints without sacrificing accuracy. Our final optimization strategy combined multi-class semantic segmentation, pulp segmentation, and instance regression losses in a balanced formulation that emphasized both geometric precision and small-structure accuracy.  

% 

% In future iterations of our method, we plan to integrate the pulp class directly into the multi-class segmentation branch, rather than handling it as a separate binary task. This joint formulation could allow the network to learn more consistent relationships between teeth and their internal cavities. Additionally, we aim to adapt the direction-map auxiliary task to more precisely capture the geometry of narrow pulp fragments and fine canal structures, improving instance separation and boundary delineation in these challenging regions. Together, these enhancements are expected to further increase segmentation accuracy and anatomical fidelity, particularly for small and elongated structures.


%
% ---- Bibliography ----
%
% BibTeX users should specify bibliography style 'splncs04'.
% References will then be sorted and formatted in the correct style.
%
\bibliographystyle{splncs04}
\bibliography{bibliography}
%
\end{document}
