\section{Results}
\label{section:results}

% \begin{figure}
% \centering
%     \includegraphics[width=1\textwidth]{5_results/figures/error/boxplot_rotation_error_weighted.png}
%     \includegraphics[width=1\textwidth]{5_results/figures/error/boxplot_translation_error_weighted.png}
% \caption{Landmark detection and pose estimation errors across patients when using all landmarks (blue), uncertainty-based landmark weighting (red, orange, yellow), uncertainty-based landmark filtering where $K=3,5,7$ (white), and landmark detection error using groundtruth 2D landmarks where $K=2$ (green).}
% \label{fig:comparison_box_plot}
% \end{figure}

We evaluated the efficacy of the estimated uncertainty as a criterion for outlier rejection, independent of ground-truth labels. Figure~\ref{fig:top-k-dropout} illustrates the distributions of rotation and translation errors as we progressively exclude the top-$K$ most uncertain landmarks ($K=0,\dots,7$). We observed a sharp, monotonic decay in both error and interquartile range (IQR) as $K$ increases.
% The most significant stability gains occur within the first few filtering steps ($K=1$ to $K=3$), 
This suggests that the uncertainty metric $u_i$ effectively isolates the long-tail outliers that disproportionately destabilize the registration solver. By removing these high-uncertainty points, the system recovers a geometrically consistent subset of landmarks, resulting in precise pose estimation even in the presence of detection noise.

% Figure~\ref{fig:pose_comparison_w_nograd} and Table~\ref{tab:pose_comparison_w_nograd} present a 3D pose estimation performance across seven experimental configurations. Experiment that uses all landmarks for the pose estimation with no weights shows high variance with frequent outliers, resulting in a mean rotation error of $26.14$ degrees and translation error of $51.18$~mm. Introducing uncertainty-based discrete selection (DS) in inference nearly halves both errors ($14.22$ degrees, $24.79$~mm), and fine-tuning with continuous weighting (CW) maintains similar performance with lower translation error ($21.92$~mm). CW applied directly in inference achieves the lowest mean rotation error ($13.94$ degrees), while the combined strategy with fine-tuning yields the best overall translation accuracy ($20.63$~mm). Notably, median rotation errors drop from $12.96$ degrees in the \textit{No Weights} to $2.27$–$2.87$ degrees with uncertainty-based methods, and median translation errors from $32.70$~mm to below $8$~mm across all proposed configurations.

Figure~\ref{fig:pose_comparison_w_nograd} and Table~\ref{tab:pose_comparison_w_nograd} summarize 3D pose estimation performance across seven experimental configurations. The unweighted baseline that uses all landmarks equally during pose estimation shows high variance with frequent outliers, resulting in a mean rotation error of 26.14 degrees and a mean translation error of 51.18~mm. Introducing uncertainty-based discrete selection (DS) at inference nearly halves both errors (14.22 degrees, 24.79~mm), while fine-tuning with DS maintains similar performance with lower translation error (21.92~mm). Continuous weighting (CW) applied directly at inference achieves the lowest mean rotation error (13.94 degrees), while combining CW with fine-tuning yields the best overall translation accuracy (20.63~mm).

% \begin{table}
% \centering
% \begin{tabular}{|c|cc|cc|}
% \hline
% \multirow{2}{*}{Experiment} & \multicolumn{2}{c|}{Rotation Error (degrees)} & \multicolumn{2}{c|}{Translation Error (mm)} \\ \cline{2-5} 
%                          & \multicolumn{1}{c|}{Mean}  & Median & \multicolumn{1}{c|}{Mean}  & Median \\ \hline
% No Weights \cite{grupp2020automatic} &\multicolumn{1}{c|}{26.14} & 12.96  & \multicolumn{1}{c|}{51.18} & 32.70  \\ \hline
% DS &\multicolumn{1}{c|}{14.22} & 2.31   & \multicolumn{1}{c|}{24.79} & 6.38   \\ \hline
% Finetune + NG + DS &\multicolumn{1}{c|}{16.52}     & 2.87      & \multicolumn{1}{c|}{21.98}     & 7.46      \\ \hline
% Finetune + DS &\multicolumn{1}{c|}{16.33}     & 2.84     & \multicolumn{1}{c|}{21.92}     & 7.21      \\ \hline
% CW &\multicolumn{1}{c|}{13.94} & 2.27   & \multicolumn{1}{c|}{24.18} & 6.35   \\ \hline
% Finetune + NG + CW &\multicolumn{1}{c|}{15.65}     & 2.73      & \multicolumn{1}{c|}{22.14}     & 7.44      \\ \hline
% Finetune + CW &\multicolumn{1}{c|}{15.84}     & 2.73      & \multicolumn{1}{c|}{20.63}     & 6.97      \\ \hline

% \end{tabular}
% \caption{Quantitative analysis of pelvic pose estimation comparing No Weights, Discrete Selection (DS, top-3 landmark filtering) and Continuous Weighting (CW). Variants include finetuning without gradient updates on MC dropout model (Finetune NG) and fully finetuned model (Finetune). We report the mean and median Euler angle difference for rotation (degrees) and RMSE for translation (mm).}
% \label{tab:pose_comparison_w_nograd}
% \end{table}


\begin{table*}[t]
\centering
\small
\setlength{\tabcolsep}{6pt}
\renewcommand{\arraystretch}{1.2}

\begin{tabular}{lcccccc}
\hline
\textbf{Experiment} & \textbf{Rotation Error} & \textbf{P=50} & \textbf{P=60} & \textbf{P=70} & \textbf{P=80} & \textbf{P=90} \\
\hline
No Weights          & $26.14 \pm 30.08$ & 12.96 & 17.45 & 29.92 & 41.46 & 75.48 \\
DS                  & $14.22 \pm 29.18$ & 2.31  & 3.39  & 5.42  & 13.68 & 51.83 \\
Finetune + NG + DS  & $16.52 \pm 30.38$ & 2.87  & 4.51  & 9.24  & 21.09 & 57.17 \\
Finetune + DS       & $16.33 \pm 30.78$ & 2.84  & 4.37  & 8.08  & 20.00 & 59.39 \\
CW                  & $13.94 \pm 28.67$ & 2.27  & 3.43  & 5.36  & 12.72 & 46.68 \\
Finetune + NG + CW  & $15.65 \pm 28.57$ & 2.73  & 4.45  & 9.23  & 20.65 & 54.13 \\
Finetune + CW       & $15.84 \pm 30.80$ & 2.73  & 4.78  & 7.41  & 16.97 & 49.39 \\
\hline
\end{tabular}

\vspace{0.75em}

\begin{tabular}{lcccccc}
\hline
\textbf{Experiment} & \textbf{Translation Error} & \textbf{P=50} & \textbf{P=60} & \textbf{P=70} & \textbf{P=80} & \textbf{P=90} \\
\hline
No Weights          & $51.18 \pm 56.58$ & 32.70 & 45.09 & 64.30 & 80.86 & 111.67 \\
DS                  & $24.79 \pm 50.58$ & 6.38  & 9.09  & 13.59 & 33.46 & 70.17 \\
Finetune + NG + DS  & $21.98 \pm 33.38$ & 7.46  & 11.21 & 21.61 & 33.64 & 60.30 \\
Finetune + DS       & $21.92 \pm 36.44$ & 7.21  & 10.53 & 17.19 & 32.08 & 55.74 \\
CW                  & $24.18 \pm 62.23$ & 6.35  & 8.98  & 12.96 & 27.99 & 58.35 \\
Finetune + NG + CW  & $22.14 \pm 34.78$ & 7.44  & 10.78 & 20.02 & 32.67 & 58.41 \\
Finetune + CW       & $20.63 \pm 32.54$ & 6.97  & 10.24 & 18.84 & 30.99 & 55.14 \\
\hline
\end{tabular}
\caption{Quantitative analysis of pelvic pose estimation comparing No Weights, Discrete Selection (DS, top-3 landmark filtering), and Continuous Weighting (CW). Variants include fine-tuning without gradient updates on the MC dropout model (Finetune + NG) and the fully fine-tuned model (Finetune). We report the mean $\pm$ standard deviation together with the 50th, 60th, 70th, 80th, and 90th percentiles of the rotation error (degrees) and translation error (mm).}
\label{tab:pose_comparison_w_nograd}
% \caption{Extended quantitative summary of pose-estimation errors across all experimental configurations. For each method, we report the mean $\pm$ standard deviation together with the 50th, 60th, 70th, 80th, and 90th percentiles of the rotation error (degrees) and translation error (mm). DS denotes Discrete Selection, CW denotes Continuous Weighting, and NG denotes fine-tuning without gradient updates through the MC dropout branch.}
\end{table*}

% Across all patients, landmarks exhibited substantial differences in uncertainty magnitude, as shown in Figure \ref{fig:uncertainty_plot}. The global landmark uncertainty ranking showed that Landmark 0, 7, 8, 10, and 12 were consistently the most uncertain landmarks with median deviations ranging from 108-131 pixels. In contrast, landmarks 2, 3, 5, and 6 showed the lowest overall uncertainty with median deviation of 70-85 pixels. This indicates that uncertainty is not uniformly distributed across the anatomy and that certain landmarks present consistently greater difficulty for the model.

% To understand the qualitative behavior of our uncertainty estimates, we analyzed the per-landmark uncertainty distributions across different test subjects, as shown in Figure~\ref{fig:uncertainty_plot}. The plots reveal significant spatial heterogeneity. Certain anatomical landmarks consistently exhibit low uncertainty (narrow boxplots), while others display high dispersion depending on the patient anatomy and imaging angle. For instance, landmarks with consistently higher median uncertainty (e.g., indices 1 or 7) likely correspond to anatomical regions that are prone to occlusion or lack distinct features in fluoroscopic projection. This patient-specific variability confirms that the model is not simply learning a static prior, but is actively estimating confidence based on the visual evidence provided in each image.

\begin{figure}
    \centering
    \includegraphics[width=1\linewidth]{5_results/figures/DeepFluoro/boxplot_pose_metrics.png}
    \caption{DeepFluoro fluoroscopy image registration performance. Left to right: rotation error, translation error, and mTRE for \textit{No Weights}, discrete selection (DS, $K=3$), and continuous weighting (CW). Boxplots exclude gross failures with rotation error $>20^\circ$, translation error $>70$ mm, or mTRE $>70$ mm for readability.}
    \label{fig:DeepFluoro}
\end{figure}

% In addition, we test uncertainty-aware CW which overall had the best performance on the fluoroscopy images included in DeepFluoro with the comparison with \textit{No Weights}. CW outperformed the mTRE, while maintaining rotation and translation errors comparable. As shown in Figure~\ref{fig:DeepFluoro}, \textit{No Weights} achieved median rotation, translation, and mTRE errors of 3.12 dgrees, 18.69~mm, and 16.09~mm, respectively, whereas CW achieved 3.54 degrees, 18.17~mm, and 15.80~mm. Although the rotation difference between the two methods was small, CW yielded lower translation error and the best overall mTRE, indicating that uncertainty-aware weighting improves the final geometric alignment on fluoroscopy images.

On fluoroscopy images, uncertainty-aware continuous weighting produced the strongest overall registration performance. As shown in Figure~\ref{fig:DeepFluoro}, the filtered boxplots show that CW achieved the lowest translation error and the lowest mTRE among the three methods, while maintaining rotation accuracy comparable to the unweighted baseline. Quantitatively, \textit{No Weights}, DS, and CW achieved median rotation errors of 3.12, 5.99, and 3.54 degrees, median translation errors of 18.69~mm, 32.61~mm, and 18.17~mm, and median mTRE values of 16.09~mm, 29.91~mm, and 15.80~mm, respectively. In contrast to the synthetic setting, DS performed worse on real fluoroscopy, likely because many images contain only a limited number of visible landmarks, so removing three uncertain landmarks can leave an insufficient subset for stable registration.

Figure~\ref{fig:image_dropout} presents error retention curves showing a monotonic reduction in residual error as samples with high uncertainty, defined as the mean spatial deviation across all the landmarks in the image, are progressively excluded. In intra-operative guidance, this facilitates graceful failure by allowing the system to withhold prediction on ambiguous frames rather than outputting misleading guidance. Therefore, clinical workflows can strategically trade off temporal coverage for reliability to ensure that surgical decision-making is informed exclusively by high-confidence pose estimates.

% \begin{table}[]
% \centering
% \begin{tabular}{|c|c|cc|cc|}
% \hline
% \multirow{2}{*}{Experiment} & \multirow{2}{*}{Time (s)} & \multicolumn{2}{c|}{Rotation Error (degrees)} & \multicolumn{2}{c|}{Translation Error (mm)} \\ \cline{3-6} 
%                            & & \multicolumn{1}{c|}{Mean}  & Median & \multicolumn{1}{c|}{Mean}  & Median \\ \hline
% Baseline                    & $<$ 0.002 &\multicolumn{1}{c|}{26.14} & 12.96  & \multicolumn{1}{c|}{51.18} & 32.70  \\ \hline
% Test Time F                 & $<$ 0.002 &\multicolumn{1}{c|}{14.22} & 2.31   & \multicolumn{1}{c|}{24.79} & 6.38   \\ \hline
% Finetune + Test Time F, NG & $<$ 0.002 &\multicolumn{1}{c|}{16.52}     & 2.87      & \multicolumn{1}{c|}{21.98}     & 7.46      \\ \hline
% Finetune + Test Time F & $<$ 0.002 &\multicolumn{1}{c|}{16.33}     & 2.84     & \multicolumn{1}{c|}{21.92}     & 7.21      \\ \hline
% Test Time W                 & $<$ 0.002 &\multicolumn{1}{c|}{13.94} & 2.27   & \multicolumn{1}{c|}{24.18} & 6.35   \\ \hline
% Finetune + Test Time W, NG & $<$ 0.002 &\multicolumn{1}{c|}{15.65}     & 2.73      & \multicolumn{1}{c|}{22.14}     & 7.44      \\ \hline
% Finetune + Test Time W      & $<$ 0.002 &\multicolumn{1}{c|}{15.84}     & 2.73      & \multicolumn{1}{c|}{20.63}     & 6.97      \\ \hline

% Intensity-based      & TBA &\multicolumn{1}{c|}{15.84}     & 2.73      & \multicolumn{1}{c|}{20.63}     & 6.97      \\ \hline

% \end{tabular}
% \caption{Quantitative comparison of 3D pelvic pose estimation performance. We report the mean and median Root Mean Squared Error (RMSE) for rotation (degrees) and translation (mm) across seven experimental configurations. The methods are categorized by their use of uncertainty-based weighting (\textbf{W}) or filtering (\textbf{F}) strategies. \textbf{NG} indicates fine-tuned models finetuned without gradient-based updates for MC dropout model.}
% \label{tab:pose_comparison_w_nograd}
% \end{table}



%%%%%%% Original Version of the Table
% \begin{table}
% \centering
% \begin{tabular}{|c|cc|cc|}
% \hline
% \multirow{2}{*}{Experiment} & \multicolumn{2}{c|}{Rot. Error (degrees)} & \multicolumn{2}{c|}{Translation Error (mm)} \\ \cline{2-5} 
%                          & \multicolumn{1}{c|}{Mean}  & Median & \multicolumn{1}{c|}{Mean}  & Median \\ \hline
% U-Net and PnP \cite{grupp2020automatic}                     &\multicolumn{1}{c|}{26.14} & 12.96  & \multicolumn{1}{c|}{51.18} & 32.70  \\ \hline
% Test Time F                 &\multicolumn{1}{c|}{14.22} & 2.31   & \multicolumn{1}{c|}{24.79} & 6.38   \\ \hline
% Finetune + Test Time F, NG &\multicolumn{1}{c|}{16.52}     & 2.87      & \multicolumn{1}{c|}{21.98}     & 7.46      \\ \hline
% Finetune + Test Time F &\multicolumn{1}{c|}{16.33}     & 2.84     & \multicolumn{1}{c|}{21.92}     & 7.21      \\ \hline
% Test Time W                 &\multicolumn{1}{c|}{13.94} & 2.27   & \multicolumn{1}{c|}{24.18} & 6.35   \\ \hline
% Finetune + Test Time W, NG &\multicolumn{1}{c|}{15.65}     & 2.73      & \multicolumn{1}{c|}{22.14}     & 7.44      \\ \hline
% Finetune + Test Time W      &\multicolumn{1}{c|}{15.84}     & 2.73      & \multicolumn{1}{c|}{20.63}     & 6.97      \\ \hline

% \end{tabular}
% \caption{Quantitative comparison of 3D pelvic pose estimation performance. We report the mean and median Euler angle difference for rotation (degrees) and RMSE for translation (mm). The methods are categorized by their use of uncertainty-based weighting (W) or filtering (F) strategies. NG indicates fine-tuned models finetuned without gradient-based updates for MC dropout model.}
% \label{tab:pose_comparison_w_nograd}
% \end{table}