\section{Extended Methods}
\begin{figure}
  \centering
  \subfigure[\,]{%
    \includegraphics[height=3cm]{figures/teasar/direction_vector_v2.pdf}
    \label{fig:direction_vectors_def}
  }
  \quad
  \subfigure[\,]{%
    \includegraphics[height=3cm]{figures/teasar/rel_dir_vec.pdf}
    \label{fig:angular_difference}
  }
  \caption{\textbf{Direction vector generation and angular difference penalty used in modified TEASAR.}
  (a) Direction vectors (blue) are generated by first identifying the closest point on the ground-truth graph (dark red) and then stepping a fixed distance toward the root along the graph. Since the step size is constant across all voxel locations, direction vectors near the centerline exhibit smaller magnitudes, while those near the vessel boundary have larger magnitudes.
  (b) A penalty is assigned based on the angular difference between the predicted direction vector at the current voxel and the relative direction vector to each of its neighboring voxels (i.e., the ``walk direction''). Lower penalties correspond to stronger alignment between the predicted and actual tracing directions, encouraging paths that follow the learned vessel orientation.}
  \label{fig:direction_vectors}
\end{figure}

\begin{algorithm}[t]
\caption{Direction Vector Generation}
\label{alg:vector_generation}

\KwIn{Vessel segmentation mask $M$ and the corresponding skeleton graph $G=(V,E)$ with each edge $e\in E$ having a radius $r_e > 0$ assigned}

\textbf{Parameters:} \texttt{step\_size}$>0$

\KwOut{$\texttt{vector\_field}$ of direction vectors}

Fit $G$ to $M$ by rescaling

Find leaf nodes in $G$

Select the root of each connected component to be the leaf node with maximal radius

Direct the edges in each connected component tree such that they are directed towards the root.

\ForEach{\texttt{foreground\_voxel} of $M$}{
    Find \texttt{nearest\_edge} $\in E$ to \texttt{foreground\_voxel} in $G$
    
    Find \texttt{closest\_point} to \texttt{foreground\_voxel} on \texttt{nearest\_edge} 
    
    $d\gets$ distance between \texttt{foreground\_voxel} and \texttt{closest\_point}
    
    \If{$d \leq r_{\text{\texttt{nearest\_edge}}}$}{
        \texttt{target\_point} $\gets$ move \texttt{closest\_point} for \texttt{step\_size} along $G$ towards the root
        
        \texttt{direction\_vector} $\gets$ \texttt{target\_point} $-$ \texttt{foreground\_voxel}
        
        Store \texttt{direction\_vector} in \texttt{vector\_field}
    }
}
\Return{$\texttt{vector\_field}$}\;

\end{algorithm}


\subsection{Automated Root Detection}\label{sec:automated_root_detection}

Vesselpose extracts a centerline graph using an adaptation of the TEASAR algorithm, which requires an initial set of vessel roots. In practice, these roots are annotated manually, a procedure that is both slow and labor-intensive. An automated strategy would remove this bottleneck, but detecting roots directly from the binary foreground mask is unreliable.

The voxel-wise direction field predicted by Vesselpose provides a direct workaround: roots correspond to sinks of this 3D flow field. To find them, a virtual particle is initialized at every foreground voxel \(p\) whose distance to the background is at least \(r_{\min}\). The particle position is set to \(x_0 := p\) and updated iteratively by \(x_i := x_{i-1} + \lambda\, v_{i-1}\),
where \(v_{i-1}\) is the interpolated direction vector at \(x_{i-1}\) and \(\lambda>0\) is a step-size parameter. A position \(x_i\) is considered a sink when the displacement magnitude \(\lambda \lVert v_i \rVert\) falls below a tolerance \(\tau>0\). After \(N\) iterations, all detected sinks are collected and used as candidate vessel roots for initializing the adapted TEASAR procedure.

To evaluate the effectiveness of automated root detection, we apply this method to the three validation volumes of the Multi-tree synthetic dataset and report how many manually annotated ground-truth roots are correctly detected. A root is considered to be correctly detected if it lies within a distance of $2$ voxels from a calculated sink. For our experiments, we chose the following parameters: number of steps $N=50$, step size $\lambda=1.0$, tolerance $\tau= 0.1$, and min-radius threshold $r_{\min}=3.0$.

As a result, the automated root detection algorithm is able to detect \emph{all} of the ground-truth roots present in all three volume samples, while producing additional false positive roots.
However, the additional false positive roots do not pose a problem to the TEASAR-based centerline generation since these can be filtered out during multi-root processing as described in Section~\ref{sec:teasar}.

\subsection{Adaptive masking}\label{sec:adaptive masking}
Standard TEASAR excludes processed areas using simple linear thresholding with a fixed scale and constant. However, in Vesselpose these parameters vary with respect to the local radius as described in Section~\ref{sec:teasar}.
Given predefined parameter ranges for $\text{scale} \in [s_{\min}, s_{\max}]$ and $\text{const} \in [c_{\min}, c_{\max}]$, and radius bounds $[r_{\min}, r_{\max}]$, we compute the normalized radius fraction
\begin{equation}
\alpha_r =
\frac{r - r_{\min}}
{r_{\max} - r_{\min}}.
\end{equation}
The scale and constant are then interpolated:
\begin{equation}
\text{scale}(r) = s_{\min} + \alpha_r(s_{\max} - s_{\min}) ~,~
\text{const}(r) = c_{\min} + \alpha_r(c_{\max} - c_{\min})
\end{equation}
The adaptive masking distance is finally defined as
\begin{equation}
d(r) = \text{scale}(r)\cdot r + \text{const}(r)
\end{equation}

% ------------------ Extended Evaluation ------------------------------
\section{Extended Evaluation}

\subsection{Hierarchical Matching}
In~\algorithmref{alg:hierarchical-matching} we outline the steps of our hierarchical matching, which takes each node's semantic class label into consideration rather than just its spatial proximity.

%------------ Hierarchical Matching Algorithm ----------------------------
\begin{algorithm}[t]
\caption{Hierarchical matching between $G$ and $P$}
\label{alg:hierarchical-matching}
\DontPrintSemicolon

\KwIn{Directed acyclic graphs $G$ and $P$;
each node in each graph is labelled as belonging to semantic class ``root'', ``leaf'', ``branching point'', or ``intermediate point''; each node in each graph has an associated 3D position; maximum distance $d_{max}$ for two points to be matched}
\KwOut{\texttt{match\_dict}}

Initialize empty \texttt{match\_dict}

\ForEach {node $v$ in $G$} {Determine the set $W_v$ of closest nodes in $P$ that lie within distance $d_{max}$ to $v$\;

Determine subset $W_{v,sem} \subseteq W_v$ of nodes with same semantic class as $v$\;

Sort each subset by distance to $v$

Append to form sorted list $W_v^{sorted} = [W_{v,sem},\ W_v \backslash W_{v,sem}]$
}

Sort root nodes $r$ of $G$ analogously, i.e.: Primary sorting criterion: non-empty $W_{r,sem}$ before all others; secondary  criterion: distance of closest $w \in W_r$

\ForEach{$r$ in sorted list of root nodes in $G$}{
    \ForEach{class label $c$ in sorted list [``root'', ``branching point'' or ``leaf'', ``intermediate point'']}{
    \ForEach{node $v$ with class label $c$ in depth-first traversal of $G$ from $r$}{
        \If{class label of $v$ is ``root''}{
            remove all elements $w$ in $W_v^{sorted}$ if parent of $w$ is already matched to another tree than $r$\;
        }
        \If{$W_v^{sorted}$ is not empty} {
            \If{class label of $v$ is not ``root''} {
                get mask $m_v$ with true elements if parent of $w \in W_v^{sorted}$ is matched to same tree as $r$\;
                
                \If{$W_v^{sorted}[m_v]$ is not empty}{
                    update $W_v^{sorted}$ with $W_v^{sorted}[m_v]$\;
                }
            }

            pick first $w$ in $W_v^{sorted}$\;
            
            store $(v,w)$ in \texttt{match\_dict}\;
            
            \ForEach{not yet visited node $v'$ in $G$}{remove $w$ from $W_{v'}^{sorted}$}
        }
    }
}}
\Return{\texttt{match\_dict}}\;
\end{algorithm}



\subsection{Related work: Commonly used metrics}\label{sec:metrics_discussion}
A unified approach for comparing graph structures is still lacking~\citep{Lyu2022reta}, and numerous evaluation strategies have emerged across the biomedical literature.
On one hand, there are \textit{detection- and segmentation-based metrics} for comparing centerlines, branching points, or graph edges, which we find unsuitable for graph-based evaluation:
For example, clDice~\cite{Shit2021cldice} does not reflect topology, as it operates at the pixel level and fails to penalize structural changes like loops or disconnections---errors that may drastically alter topology but only minimally impact clDice scores as few pixels are missing or added.
The same limitation applies to precision, recall, and F1 scores when applied at the pixel level.
% %, as used in~\cite{tetteh2019_deepvesselnet}.
If mean average precision (mAP)~\cite{lin2014coco} is used, as in \citet{vesselformer2024,trexplorer2024}, nodes and edges are compared based on the overlap of their bounding boxes.
However, as shown in \citet{Foucart2023}, intersection-over-union (IoU) is not well-suited for small objects, especially in 3D, where meeting high IoU thresholds becomes impractical.

On the other hand, \textit{graph similarity measures} are commonly used in related studies \cite{vesselformer2024,trexplorer2024,Drees2019gerome,Drees2021voreenSkel}.
However, the Street Mover's Distance (SMD) does not preserve connectivity information as it converts graphs to point clouds for distance computation.
Additionally, SMD is sensitive to resampling and hyperparameters, making it less reliable for topological evaluation.
%
In contrast, Betti numbers provide a topological perspective, but, e.g., Betti-0 is too coarse, as it does not capture topological errors within individual trees \cite{lux2025topograph}.
Instead, precision, recall, and F1 scores---when applied at the graph level, particularly for edges, as in \citet{Drees2019gerome,Drees2021voreenSkel}---provide meaningful insights into topological correctness.
However, these metrics do not account for the structural impact of individual errors: missing or spurious edges may vary greatly in severity depending on how drastically they alter the topology of the graph, a nuance not captured by the F1 score. 

Another family of metrics includes \textit{tree edit distance (TED)-based measures} like \citet{li2023ted,matula2015tra}, which quantify the number of operations (e.g., node or edge insertions/deletions) needed to transform the predicted graph into the ground truth.
These metrics offer an intuitive and meaningful notion of similarity, but we have not seen them commonly used in related work.
For example, \citet{li2023ted} introduce a TED variant specifically for vasculature graphs, but it heavily penalizes false merges and splits between two trees: a falsely merged tree is penalized twice, first by counting the removal of every falsely added node and edge, and then by counting the creation of the missing tree from scratch.
Another intuitive measure is the TRA metric \cite{matula2015tra}, originally designed for evaluating cell lineages, but it is not directly applicable here as it relies on IoU-based matching.

Our observations are consistent with \citet{Lyu2022reta}, who conclude that selecting a reliable and unbiased evaluation metric remains an open problem in the community. With this work, we aim to contribute to this discussion by proposing a greedy hierarchical matching strategy and introducing false splits and false merges as topology-aware measures for robust assessment of vessel graph reconstructions. Nevertheless, we believe that a more systematic analysis of how node matching and sampling choices influence evaluation outcomes is still needed to establish common recommendations—an important direction for future work, but beyond the scope of this paper.

% ------------ Table with comparison of different matching strategies ----------------------
%[htbp!]
\begin{table}
    \centering
    %\footnotesize
    \caption{Quantitative comparison of our hierarchical matching with greedy one-to-one matching and optimal Hungarian matching.
    Lower false merge (FM) and false split (FS) values indicate better topology preservation during matching.
    Results are shown for Vesselpose on the validation set of the Multi-Tree Synthetic dataset.
    For this analysis, graphs were resampled to include only roots, branching points, and end nodes.}
    \begin{tblr}{width=\linewidth,rows={abovesep=1pt,belowsep=1pt}}
        \midrule
        \SetCell[r=2]{l}{Matching} & \SetCell[c=3]{c}{Edges} & & & \SetCell[c=2]{c}{FM} & & \SetCell[c=2]{c}{FS} & \\
        \cmidrule[lr]{2-4} \cmidrule[lr]{5-6} \cmidrule[lr]{7-8}
        & F1$\uparrow$ & Prec$\uparrow$ & Rec$\uparrow$ & Rel.$\downarrow$ & Abs.$\downarrow$ & Rel.$\downarrow$ & Abs.$\downarrow$\\
        \midrule
        Greedy & 0.8 & 0.81 & 0.79 & 0.01 & 49 & 0.01 & 48.6 \\
        Hungarian & \textbf{0.81} & \textbf{0.82} & \textbf{0.80} & 0.02 & 62.6 & 0.01 & 62.3\\
        Hierarchical (Ours) & 0.80 & 0.81 & 0.79 & \textbf{0.007} & \textbf{29.7}& \textbf{0.007} & \textbf{29.3}\\
        \midrule
        \label{tab:matching_comparison}
    \end{tblr}
\end{table}

% ------------------ Extended Experiments ------------------------------
\section{Extended Experiments}\label{sec:ext_experiments}
We conducted all experiments on an HPC cluster using a single NVIDIA H100 GPU. Each model is trained for approximately 3 days. We allocate 200 GB RAM per job, although CPU and memory demands are modest since training relies mainly on GPU computation with Zarr-based random crop loading. The models are trained using PyTorch on a CUDA-enabled Linux environment.

\subsection{Single-Tree Synthetic Data}\label{sec:ext_single_tree_synthetic}
The dataset comprises 500 volumes of size $256^3$ voxels and is split into training, validation, and test sets.
The training, validation and test split contain 368, 32 and 100 samples respectively. The network is trained on randomly sampled input patches of size $128^3$ voxels with intensity shift augmentation.

\begin{figure}[t]
  \centering
  \subfigure[Raw]{%
    \includegraphics[
      width=0.31\linewidth
    ]{figures/syn_st_raw.png}
  }
  \hfill
  \subfigure[Ground-truth]{%
    \includegraphics[
      width=0.31\linewidth]{figures/syn_st_GT.png}
  }
  \hfill
  \subfigure[Ours]{%
    \includegraphics[
      width=0.305\linewidth]{figures/syn_st_ours.png}
  }
  \caption{\textbf{Qualitative comparison for single-tree synthetic.}
  A 3D rendering of one of the samples: (a) the raw image; (b) the ground-truth skeleton overlaid on the segmentation mask; (c) our predicted skeleton overlaid on the predicted binary segmentation.
  Our method (c) produces a reconstruction that closely matches the ground-truth (b), capturing fine structures and maintaining topological consistency.
  }
  \label{fig:qualitative_syn_st}
\end{figure}

\subsection{Parse 2022 Challenge}\label{sec:ext_parse}
The data has an in-plane size of $512\times 512$ pixels and its z-stack comprises between 295 and 390 slices.
The training, validation, and test splits contain 72, 8, and 20 samples, respectively.
Following \citet{trexplorer_super2025}, all volumes are resampled to an isotropic resolution of $0.5\,\mathrm{mm}$.
For both training and inference, we use input sizes of $256^3$ voxels and do not apply any data augmentation. 

\begin{figure}[t]
  \centering
  \subfigure[Raw]{%
    \includegraphics[
      trim=140pt 0pt 200pt 100pt,
      clip,
      width=0.31\linewidth
    ]{figures/parse_slice_raw_2.png}
  }
  \hfill
  \subfigure[Ground-truth]{%
    \includegraphics[trim=140pt 0pt 200pt 100pt,
      clip,
      width=0.31\linewidth]{figures/parse_slice_raw_GT_mask_2.png}
  }
  \hfill
  \subfigure[Ours]{%
    \includegraphics[trim=140pt 0pt 200pt 100pt,
      clip,
      width=0.31\linewidth]{figures/parse_slice_raw_pred_mask_2.png}
  }\\
  \subfigure[Raw]{%
    \includegraphics[width=0.31\linewidth]{figures/parse_raw_2.png}
  }
  \hfill
  \subfigure[Ground-truth]{%
    \includegraphics[width=0.31\linewidth]{figures/parse_raw_GTmask_2.png}
  }
  \hfill
  \subfigure[Ours]{%
    \includegraphics[width=0.295\linewidth]{figures/parse_raw_pred_mask_2.png}
  }
  \caption{\textbf{Qualitative comparison of PARSE2022 segmentation}
  First row: A 2D slice from one sample. (a) shows the raw image, while (b) and (c) show the raw image overlaid with the provided ground-truth segmentation and our segmentation, respectively.
  Second row: A 3D crop from the same sample.
  (d) shows the raw image, and (e) and (f) show the raw image overlaid with the ground-truth and our segmentation, respectively.
  Both segmentation masks miss vessel segments that are visible in the raw data and contain disconnected components. Some of the failure cases in our segmentation are highlighted with red arrows in (f).
  }
  \label{fig:qualitative_parse}
\end{figure}

\subsection{Multi-tree Synthetic Data}
This dataset is obtained from \cite{tetteh2019_deepvesselnet}.
Each volume measures $325 \times 304 \times 600$ voxels.
We follow the information provided in \citet{trexplorer2024,vesselformer2024} about how many samples were used in which data split and utilize the first 50 volumes. The training, validation, and test splits contain 37, 3, and 10 samples, respectively.
Training and inference are performed with an input size of $256^3$ voxels, using intensity shifts and masked-out crops as data augmentations. For this analysis, graphs are resampled to include only roots, branching points, and end nodes.

\subsection{Micro-CT Heart data}\label{sec:ext_micro_ct}
% The rat heart micro-CT images are acquired using a SkyScan 1276 scanner (Bruker, Belgium) with the vendor acquisition software (v1.4.0.0) in step-and-shoot mode over 360° rotation. The X-ray tube current is 200 $\mu A$. Reconstruction is performed using 41\% beam-hardening correction and ring-artifact correction level 4.
This dataset contains samples from both preeclamptic and healthy animals.
The individual samples measure approximately $1300 \times 1300 \times 1700$ voxels at a resolution of $12\,\mu\mathrm{m}$.
To obtain foreground masks for fine-tuning the U-Net on the micro-CT heart data, we train a random forest classifier \cite{Breiman2001RF} with 100 trees and a maximum depth of 10.
We include Frangi vesselness features \cite{Frangi2000} as additional input features for the random forest classifier and manually annotate a subset of vessels as ground truth.
We then use the resulting foreground masks to fine-tune the U-Net model, which was pre-trained on the synthetic multi-tree dataset.
We fine-tune the U-Net by freezing all but the final layer to predict the foreground mask.

A key challenge associated with this dataset is its large size and the presence of substantial heart chambers, which occupy much of the volume.
Thus, we downsample the data to half resolution in each dimension (scaling factor 0.5) and apply a heuristic approach to remove the chambers by eliminating the largest connected component in each 2D segmentation slice.
Fine-tuning and inference are performed with an input size of $256^3$ voxels.

\begin{figure}[t]
  \centering
  \subfigure[Ground-Truth]{%
    \includegraphics[
      width=0.32\linewidth
    ]{figures/micro-ct_GT.png}
  }
  \hfill
  \subfigure[Vesselpose]{%
    \includegraphics[
      width=0.31\linewidth]{figures/micro-ct_ours.png}
  }
  \hfill
  \subfigure[U-Net+TEASAR]{%
    \includegraphics[
      width=0.32\linewidth]{figures/micro-ct_baseline.png}
  }
  \caption{\textbf{Qualitative results of the micro-CT data.}
  Illustrated is one 3D annotated crop from the raw micro-CT test data together with varying vessel skeleton graphs in red:
(a) shows the annotated ground-truth skeleton; (b) shows the results of our proposed method; (c) shows the result of the baseline, which consists of a U-Net for foreground segmentation followed by the original TEASAR algorithm.
Overall, our method accurately captures the vessel structures and aligns well with the ground-truth.
In contrast, the baseline method fails to trace many vessel branches, in particular in the highlighted region within the blue box.
  }
  \label{fig:qualitative_micro-ct}
\end{figure}

\subsection{Ablation experiments in modified TEASAR}\label{sec:ablation_study}
To assess the contribution of each modification to the standard TEASAR algorithm, we conducted a systematic ablation study. Starting from the Kimimaro implementation of TEASAR, we incrementally add each proposed component in separate experiments. As shown in \tableref{tab:ablation_study}, false merges and false splits decrease with every addition and the edge-wise F1 score improves overall, with our full model configuration achieving the lowest false merge and false split values.

\begin{table}
    \centering
    \caption{\label{tab:ablation_study}Ablation study illustrating the contribution of individual components of our method and how they incrementally improve performance over a U-Net with standard TEASAR \cite{unet_ronneberger2015,sato2000teasar}.
    We add the following components step by step: support for multiple roots per connected component (multi-root); an additional penalty based on the magnitude of the direction vector (vec mag); an additional penalty based on the angular difference between the tracing direction and the predicted direction vector (vec dir); and adaptive masking to mark processed regions (adapt. mask).
    Results are shown for Vesselpose on the validation set of the Multi-Tree Synthetic dataset.
    }
    \begin{tblr}{width=\linewidth,rows={abovesep=1pt,belowsep=1pt}}
        \midrule
        \SetCell[r=2]{l}{Experiments}
        & \SetCell[r=2]{c}{multi\\root} 
        & \SetCell[r=2]{c}{vec\\mag} 
        & \SetCell[r=2]{c}{vec\\dir} 
        & \SetCell[r=2]{c}{adapt.\\mask}
        & \SetCell[c=3]{c}{Edges} & & &
        \SetCell[c=1]{c}{FM} & \SetCell[c=1]{c}{FS} \\
        \cmidrule[lr]{6-8}
        & & & & & F1$\uparrow$ & Prec$\uparrow$ & Rec$\uparrow$ & Abs.$\downarrow$ & Abs.$\downarrow$\\
        \midrule
        UNet+TEASAR 
        & \xmark & \xmark & \xmark & \xmark
        & 0.46 & 0.63 & 0.36 & 52.4 & 54.1 
        \\
        Ours 
        & \cmark & \xmark & \xmark & \xmark 
        & 0.71 & 0.78 & 0.64 & 38.3 & 38.0 
        \\
         
        & \cmark & \cmark & \xmark & \xmark    
        & 0.70 & 0.78 & 0.64 & 37.3 & 37.6 
        \\
         
        & \cmark & \cmark & \cmark & \xmark    
        & 0.75 & 0.79 & 0.73 & 33.6 & 35.0 
        \\
         
        & \cmark & \cmark  & \cmark & \cmark 
        & 0.80 & 0.81 & 0.79 & 29.7 & 29.3 
        \\
        \midrule
    \end{tblr}
\end{table}

\subsection{Training settings and hyperparameter analysis}\label{sec:training_settings}
\textbf{Training:} We conduct a series of experiments to evaluate the effects of different training settings. Quantitative results for the various datasets are presented in \tableref{tab:training_hyperparameters_mt_syn} and \tableref{tab:training_hyperparameters_Trex-sup}.

\begin{table}
    \centering
    %\footnotesize
    \caption{Quantitative comparison evaluating the effect of fixed tiles (in a sliding-window fashion) versus random crops from training samples in the multi-tree synthetic dataset.
    }
    \begin{tblr}{width=\linewidth,rows={abovesep=1pt,belowsep=1pt}}
        \midrule
        \SetCell[r=2]{l}{parameters} & 
        \SetCell[c=3]{c}{Edges} & & & 
        \SetCell[c=2]{c}{FM} & & 
        \SetCell[c=2]{c}{FS} & \\
        \cmidrule[lr]{2-4} \cmidrule[lr]{5-6} \cmidrule[lr]{7-8}
        & F1$\uparrow$ & Prec$\uparrow$ & Rec$\uparrow$ & Rel.$\downarrow$ & Abs.$\downarrow$ & Rel.$\downarrow$ & Abs.$\downarrow$\\
        \midrule
        sliding window & 0.79 & 0.80 & 0.78 & 0.008 & 33.36 & 0.008 & 36 \\
        random crops (ours) & 0.80 & 0.81 & 0.79 & 0.007 & 30.33 & 0.007 & 29.3 \\
        \midrule
        \label{tab:training_hyperparameters_mt_syn}
    \end{tblr}
\end{table}


\begin{table}
    \centering
    \caption{Quantitative comparison evaluating the effect of different training settings—data augmentation, learning rate, and foreground weighting—on single-tree datasets, using Trexplorer-Super metrics \cite{trexplorer_super2025}, consistent with the corresponding baseline studies in \tableref{tab:point_metrics}.
    }
    \begin{tblr}{width=\linewidth,rows={abovesep=1pt,belowsep=1pt},colspec={l c c c c c}}
        \midrule
        \SetCell[r=2]{l}{parameters} & 
        \SetCell[r=2]{c}{Dataset} & 
        \SetCell[c=1]{c}{Point Level} & 
        \SetCell[c=1]{c}{Branch Level} & 
        \SetCell[c=2]{c}{Graph Level} & \\
        \cmidrule[lr]{1-1}\cmidrule[lr]{3-3} \cmidrule[lr]{4-4} \cmidrule[lr]{5-6}
        & & F1$\uparrow$ & F1$\uparrow$ & Betti-0$\downarrow$ & Betti-1$\downarrow$ \\
        \midrule
        no augmentation  & Synthetic  & 88.56 & 80.98 & 0 & 0 \\
        50\% intensity shift (ours)  & Synthetic  & 92.28 & 81.28 & 0 & 0 \\
        \midrule
        learning rate 0.001  & Parse2022 & 36.13 & 23.10 & 1.55 & 0\\
        learning rate 0.0001 (ours)  & Parse2022  & 49.42 & 28.87 & 2.70 & 0 \\
        \midrule
        w/o foreground weight  & Parse2022 & 49.42 & 28.87 & 2.70 & 0 \\
        w/ foreground weight (ours)  & Parse2022 & 57.89 & 36.75 & 1.20 & 0\\
        \midrule
        \label{tab:training_hyperparameters_Trex-sup}
    \end{tblr}
\end{table}

\textbf{TEASAR:} Similarly, we evaluate the TEASAR parameters—the penalty scale (1,000,000) and penalty exponent (16) (cf. \equationref{eq:pv_flow})—with results reported in \tableref{tab:penalty_scale} and \tableref{tab:penalty_exponent}.
Our experiments indicate that varying penalty scale does not lead to substantial changes in TEASAR performance. We therefore retain the value used in the Kimimaro implementation of TEASAR~\cite{Silversmith_Kimimaro_Skeletonize_densely_2021}. This choice is also consistent with the original TEASAR paper~\cite{sato2000teasar}, where this parameter is described as being selected heuristically based on the skeleton segment.
In contrast, the penalty exponent has a more pronounced effect on the results. As the exponent increases, the edge-wise F1 score generally improves. However, for very large values, TEASAR begins to merge distinct trees, which is reflected in an increased Betti-0 error. Based on this trade-off, we selected the same value as used in both Kimimaro and the original TEASAR method.
\begin{table}
    \centering
    \caption{Quantitative comparison of our modified TEASAR with varying penalty scale term (cf. \equationref{eq:pv_flow}).
    Results are shown for Vesselpose on the validation data of the Multi-Tree Synthetic dataset.
    We see that results are nearly constant across different penalty scales.
    }
    \begin{tblr}{width=\linewidth,rows={abovesep=1pt,belowsep=1pt}}
        \midrule
        \SetCell[r=2]{l}{penalty scale} & \SetCell[c=3]{c}{Edges} & & & \SetCell[c=2]{c}{FM} & & \SetCell[c=2]{c}{FS} & \\
        \cmidrule[lr]{2-4} \cmidrule[lr]{5-6} \cmidrule[lr]{7-8}
        & F1$\uparrow$ & Prec$\uparrow$ & Rec$\uparrow$ & Rel.$\downarrow$ & Abs.$\downarrow$ & Rel.$\downarrow$ & Abs.$\downarrow$\\
        \midrule
        $5 \times 10^{3}$ & 0.81 & 0.82 & 0.79 & 0.007 & 28 & 0.006 & 27 \\
        $5 \times 10^{4}$ & 0.81 & 0.83 & 0.80 & 0.007 & 29 & 0.007 & 29 \\
        $5 \times 10^{5}$ & 0.80 & 0.82 & 0.79 & 0.007 & 28 & 0.006 & 27 \\
        $5 \times 10^{6}$ & 0.81 & 0.82 & 0.79 & 0.007 & 28 & 0.006 & 27 \\
        $1 \times 10^{6}$ (ours) & 0.81 & 0.82 & 0.79 & 0.007 & 28 & 0.006 & 27 \\
        \midrule
        \label{tab:penalty_scale}
    \end{tblr}
\end{table}

\begin{table}
    \centering
    \caption{Quantitative comparison of our modified TEASAR with varying penalty exponent (cf. \equationref{eq:pv_flow}) for Vesselpose on the validation data of the Multi-Tree Synthetic dataset. Edge-wise F1 score, false merges, and false splits improve with increasing exponent; however, at very high values, TEASAR merges distinct trees, as reflected in the Betti-0 value.}
    \begin{tblr}{width=\linewidth,rows={abovesep=1pt,belowsep=1pt},colspec={l c c c c c c c}}
        \midrule
        \SetCell[r=2]{l}{penalty exp.} & 
        \SetCell[c=1]{c}{Edges} & 
        \SetCell[c=2]{c}{FM} & & 
        \SetCell[c=2]{c}{FS} & &
        \SetCell[c=2]{c}{Betti} & & \\
        \cmidrule[lr]{2-2} \cmidrule[lr]{3-4} \cmidrule[lr]{5-6} \cmidrule[lr]{7-8}
        & F1$\uparrow$ & Rel.$\downarrow$ & Abs.$\downarrow$ & Rel.$\downarrow$ & Abs.$\downarrow$ & Betti-0$\downarrow$ 
        & Betti-1$\downarrow$ \\
        \midrule
        2 & 0.64 & 0.01 & 40 & 0.009 & 39 & 1 & 0 \\
        4 & 0.70 & 0.008 & 33 & 0.007 & 32 & 1 & 0 \\
        8 & 0.77 & 0.008 & 34 & 0.007 & 32 & 1 & 0 \\
        16 (ours) & 0.81 & 0.007 & 28 & 0.007 & 27 & 1 & 0 \\
        32 & 0.81 & 0.005 & 22 & 0.004 & 18 & 4 & 0 \\
        \midrule
        \label{tab:penalty_exponent}
    \end{tblr}
\end{table}


\subsection{Sensitivity to vector prediction quality}\label{sec:vector noise sensitivity}
We evaluate robustness to directional noise by perturbing the predicted vector field with an additive error term such that the error magnitude is proportional to the local predicted vector norm.
Specifically, each original predicted direction vector $v$ is perturbed to $v' = v + \varepsilon \lVert v\rVert\cdot u$, where $\varepsilon \geq 0$ controls the noise level (noise-to-signal ratio) and $u$ is a random unit vector.
We sweep $\varepsilon$ from $0$ to $2.0$ in steps of $0.1$ and quantify performance using the edge-wise F1 score. As shown in \figureref{fig:vector_noise sensitivity}(a), our method remains stable over a broad range of perturbation strengths: edge-wise F1 remains nearly constant for small to moderate noise levels and degrades only gradually as $\varepsilon$ increases. A pronounced drop is observed only at very large noise ($\varepsilon > 1.0$), where the direction field becomes strongly corrupted, as seen in \figureref{fig:vector_noise sensitivity}(d), and the reconstruction quality deteriorates more noticeably. Importantly, even at higher noise levels, our approach consistently outperforms the baseline TEASAR ($\mathrm{F1}_{\mathrm{edge}} = 0.46$), indicating higher tolerance to directional uncertainty.

\begin{figure}[t]
  \centering
  \subfigure[Noise sensitivity plot]{%
    \includegraphics[width=0.81\linewidth]{figures/f1_edges_vs_epsilon.png}
  }\\
  \subfigure[Original]{%
    \includegraphics[width=0.31\linewidth]{figures/original.png}
  }
  \hfill
  \subfigure[$\varepsilon=0.5$]{%
    \includegraphics[width=0.31\linewidth]{figures/0.5_noise.png}
  }
  \hfill
  \subfigure[$\varepsilon=2$]{%
    \includegraphics[width=0.31\linewidth]{figures/2_noise.png}
  }
  \caption{\textbf{Sensitivity to vector noise.}
  (a) The proposed method shows strong robustness to vector noise: small to moderate perturbations (noise level $\varepsilon \leq 1.0$) of the predicted vectors have no noticeable impact on the resulting edge-wise F1 score. (b--d) Visualization of the direction vector field under increasing noise levels $\varepsilon \in\{0, 0.5, 2\}$. For clarity, all vectors are normalized. Dark blue indicates low vector magnitude, while green indicates high vector magnitude.
  }
  \label{fig:vector_noise sensitivity}
\end{figure}

\subsection{Sensitivity to test-time Gaussian noise}\label{sec:gaussian noise sensitivity}
We further assess robustness to test-time perturbations by adding voxel-wise Gaussian noise to the normalized raw image. Specifically, the noisy input is generated as
\[
I' = \operatorname{clip}(I_{\mathrm{norm}} + \epsilon, 0, 1), \qquad \epsilon \sim \mathcal{N}(0,\sigma^2)
\]
where
\[
I_{\mathrm{norm}} = \frac{I - I_{\min}}{I_{\max} - I_{\min}}
\]
Here, $\sigma$ denotes the standard deviation of the Gaussian noise and controls the noise level. The effect of increasing noise levels during inference is reported in \tableref{tab:gaussian noise sensitivity}. For small values of $\sigma$, the performance remains largely stable, with no substantial degradation. However, at higher noise levels, we observe the emergence of several small disconnected components, which is reflected in the increase in false splits and Betti-0 error.
\begin{table}
    \centering
    %\footnotesize
    \caption{Test-time sensitivity study of Vesselpose under varying Gaussian noise.
    Performance remains stable at low $\sigma$, but higher noise levels lead to small disconnected components, increasing false splits and Betti-0 error.
    }
    \begin{tblr}{width=\linewidth,rows={abovesep=1pt,belowsep=1pt},colspec={l c c c c c c c}}
        \midrule
        \SetCell[r=2]{l}{$\sigma$} &
        \SetCell[c=1]{c}{Edges} & 
        \SetCell[c=2]{c}{FM} & & 
        \SetCell[c=2]{c}{FS} & &
        \SetCell[c=2]{c}{Betti} & \\
        \cmidrule[lr]{2-2} \cmidrule[lr]{3-4} \cmidrule[lr]{5-6} \cmidrule[lr]{7-8}
        & F1$\uparrow$ & Rel.$\downarrow$ & Abs.$\downarrow$ & Rel.$\downarrow$ & Abs.$\downarrow$ & Betti-0$\downarrow$ 
        & Betti-1$\downarrow$ \\
        \midrule
        0 & 0.81 & 0.007 & 28 & 0.006 & 27 & 1 & 0 \\
        0.03 & 0.81 & 0.007 & 31 & 0.007 & 30 & 1 & 0 \\
        0.06 & 0.81 & 0.008 & 32 & 0.007 & 31 & 1 & 0 \\
        0.09 & 0.80 & 0.008 & 34 & 0.008 & 23 & 1 & 0 \\
        0.12 & 0.80 & 0.01 & 43 & 0.01 & 45 & 2 & 0 \\
        0.15 & 0.78 & 0.009 & 38 & 0.02 & 127 & 94 & 0 \\
        \midrule
        \label{tab:gaussian noise sensitivity}
    \end{tblr}
\end{table}

\subsection{Comparison with nnU-Net}
\label{sec:nnunet}

We evaluate a variant of Vesselpose in which the U-Net backbone is replaced by nnU-Net~\citep{isensee2021nnu}, a self-configuring segmentation framework that automatically determines architecture and training hyperparameters from dataset properties, requiring no manual tuning. We extend the framework to predict voxel-wise direction vectors as additional output channels alongside the foreground mask. The remaining pipeline, including the modified TEASAR algorithm and post-processing, is identical to the main method. Results on the Multi-Tree Synthetic dataset are reported in \tableref{tab:nnUNet comaprison}.

% \begin{table}
%     \centering
%     \caption{\label{tab:nnunet}Quantitative comparison of Vesselpose with a U-Net versus nnU-Net backbone on the Multi-Tree Synthetic dataset.
%     }
%     \begin{tblr}{width=\linewidth,rows={abovesep=1pt,belowsep=1pt}}
%         \midrule
%         & \SetCell[c=3]{c}{Edges} & & & FM & FS \\
%         \cmidrule[lr]{2-4}
%         & F1$\uparrow$ & Prec$\uparrow$ & Rec$\uparrow$ & Abs.$\downarrow$ & Abs.$\downarrow$ \\
%         \midrule
%         Ours (U-Net)   & 0.80 & 0.80 & 0.79 & 29.7 & 28.3 \\
%         Ours (nnU-Net) & 0.81 & 0.81 & 0.81 & 33.6 & 32.8 \\
%         \midrule
%     \end{tblr}
% \end{table}

\begin{table}
    \centering
    %\footnotesize
    \caption{Quantitative comparison of Vesselpose with a U-Net versus nnU-Net backbone on the Multi-Tree Synthetic dataset. We observe that nnU-Net performs slightly better on the edge metrics, but shows slightly worse false merge and false split errors than U-Net.}
    \begin{tblr}{width=\linewidth,rows={abovesep=1pt,belowsep=1pt}}
        \midrule
        \SetCell[r=2]{l}{Model} & \SetCell[c=3]{c}{Edges} & & & \SetCell[c=2]{c}{FM} & & \SetCell[c=2]{c}{FS} & \\
        \cmidrule[lr]{2-4} \cmidrule[lr]{5-6} \cmidrule[lr]{7-8}
        & F1$\uparrow$ & Prec$\uparrow$ & Rec$\uparrow$ & Rel.$\downarrow$ & Abs.$\downarrow$ & Rel.$\downarrow$ & Abs.$\downarrow$\\
        \midrule
        Ours (U-Net) & 0.80 & 0.80 & 0.79 & \textbf{0.007} & \textbf{29.7} & \textbf{0.007} & \textbf{28.3} \\
        Ours (nnU-Net) & \textbf{0.81} & \textbf{0.81} & \textbf{0.81} & 0.008 & 33.6 & 0.008 & 32.8\\
        
        \midrule
        \label{tab:nnUNet comaprison}
    \end{tblr}
\end{table}