% \section{Data Collection and Preprocessing}
% \label{appendix:data}

% \subsection{CT Volume and Segmentation Reorientation}

% The DeepFluoro dataset \cite{grupp2020automatic} provides each CT volume with voxel array $V$, voxel spacing $s = (s_x, s_y, s_z)$, direction matrix $R \in \mathbb{R}^{3\times 3}$, and physical origin $o \in \mathbb{R}^3$. The scanner index--to--world affine matrix is
% \begin{align}
%     A = 
% \begin{bmatrix}
% R \, \mathrm{diag}(s_x, s_y, s_z) & o \\
% 0 & 1
% \end{bmatrix}.
% \end{align}
% To match the NIfTI $(x,y,z)$ axis convention, we apply a permutation of voxel indices followed by a reflection.  Let $V(i,j,k)$ denote the original voxel array with axes $(i,j,k)$. First, we permute axes $0$ and $2$ via
% \begin{align}
%     \tilde{V}(i,j,k) = V(k,j,i).
% \end{align}
% Next, we flip the new $x$--axis (dimension $i$) by reflecting it:
% \begin{align}
%     V'(i,j,k) = \tilde{V}(N'_x - 1 - i,\; j,\; k),
% \end{align}
% where $N'_x$ is the size of the permuted first dimension. Together, this yields the reoriented volume $V'$ used for all NIfTI exports. This flip corresponds to the matrix
% \begin{align}
%     F =
% \begin{bmatrix}
% -1 & 0 & 0 & (N_z - 1)s_z \\
% 0 & 1 & 0 & 0 \\
% 0 & 0 & 1 & 0 \\
% 0 & 0 & 0 & 1
% \end{bmatrix},
% \end{align}
% so the final NIfTI affine is
% \begin{align}
%     A' = A F.
% \end{align}
% This guarantees that $(V', A')$ preserves the original scanner physical coordinates.

% \subsection{3D Landmark Conversion}
% Each anatomical landmark is provided in physical scanner coordinates $p = (x, y, z)^\top$.  
% To map it into the voxel space of the reoriented CT, we apply
% \begin{align}
%     v =
% (A')^{-1}
% \begin{bmatrix}
% p \\ 1
% \end{bmatrix},
% \qquad
% v = (i,j,k).
% \end{align}
% The resulting voxel coordinates are rounded to the nearest integer index.  
% % For visualization and robust training supervision, each landmark is expanded into a spherical region
% % \begin{align}
% %     \mathcal{S}(v) = \{ (x,y,z) \mid (x-i)^2 + (y-j)^2 + (z-k)^2 \le r^2 \},
% % \end{align}
% % with radius $r = 3$ voxels.

% \subsection{2D Projection Extraction}
% Projection images are intensity-normalized to $[0,255]$ and saved as PNG files.  
% The provided 2D landmark coordinates $(u, v)$ are written directly unless they lie outside the image bounds, in which case the corresponding landmark is marked as invisible for the ground truth landmarks.


\section{Data Collection and Preprocessing}
\label{appendix:data}

\subsection{CT Volume and Segmentation Reorientation}

The DeepFluoro dataset~\cite{grupp2020automatic} provides each CT volume with voxel array $V$, voxel spacing $s = (s_x, s_y, s_z)$, direction matrix $R \in \mathbb{R}^{3\times 3}$, and physical origin $o \in \mathbb{R}^3$. The scanner index-to-world affine matrix is
\begin{align}
    A = 
\begin{bmatrix}
R \, \mathrm{diag}(s_x, s_y, s_z) & o \\
0 & 1
\end{bmatrix}.
\end{align}
To match the NIfTI $(x,y,z)$ axis convention, we apply a permutation of voxel indices followed by a reflection. Let $V(i,j,k)$ denote the original voxel array with axes $(i,j,k)$. First, we permute axes $0$ and $2$ via
\begin{align}
    \tilde{V}(i,j,k) = V(k,j,i).
\end{align}
Next, we reflect the new $x$-axis (dimension $i$):
\begin{align}
    V'(i,j,k) = \tilde{V}(N'_x - 1 - i,\; j,\; k),
\end{align}
where $N'_x$ is the size of the permuted first dimension. Together, this yields the reoriented volume $V'$ used for all NIfTI exports. Because the flip acts on voxel indices (the voxel spacing is already applied by $A$), it corresponds to the index-space matrix
\begin{align}
    F =
\begin{bmatrix}
-1 & 0 & 0 & N'_x - 1 \\
0 & 1 & 0 & 0 \\
0 & 0 & 1 & 0 \\
0 & 0 & 0 & 1
\end{bmatrix},
\end{align}
so the final NIfTI affine is
\begin{align}
    A' = A F.
\end{align}
This guarantees that $(V', A')$ preserves the original scanner physical coordinates.

\subsection{3D Landmark Conversion}

Each anatomical landmark is provided in physical scanner coordinates $p = (x, y, z)^\top$.  
To map it into the voxel space of the reoriented CT, we apply
\begin{align}
    \begin{bmatrix}
    v \\ 1
    \end{bmatrix}
    =
(A')^{-1}
\begin{bmatrix}
p \\ 1
\end{bmatrix},
\qquad
v = (i,j,k)^\top.
\end{align}
The resulting voxel coordinates are rounded to the nearest integer index.

\subsection{2D Projection Extraction}

Projection images are intensity-normalized to $[0,255]$ and saved as PNG files.  
The provided 2D landmark coordinates $(u, v)$ are written directly unless they lie outside the image bounds, in which case the corresponding landmark is marked as invisible in the ground-truth annotations.

\subsection{Fluoroscopy Image Pose Evaluation in Native DeepFluoro Geometry}

For the fluoroscopy image experiments, pose estimation was performed directly in the native DeepFluoro HDF5 geometry rather than in the standardized synthetic DRR geometry \cite{suh20252d}. For each projection, the dataset provides 3D anatomical landmarks in the pelvis-volume coordinate system, corresponding 2D detector annotations, the pelvis pose \texttt{cam-to-pelvis-vol}, and the global projection calibration matrices \texttt{intrinsic} and \texttt{extrinsic}. Let $\mathbf{X}_i \in \mathbb{R}^3$ denote a 3D landmark in pelvis-volume coordinates and let
\begin{align}
    T_{\mathrm{cam}\rightarrow\mathrm{pelvis}} \in \mathbb{R}^{4\times 4}
\end{align}
denote the stored HDF5 pose. Taking the pelvis-volume coordinate system as the world frame, the corresponding world-to-camera transform used for projection was defined as
\begin{align}
    T_{w2c} = E\,T_{\mathrm{cam}\rightarrow\mathrm{pelvis}}^{-1},
\end{align}
where $E \in \mathbb{R}^{4\times 4}$ is the dataset extrinsic matrix. The projection matrix was then
\begin{align}
    P = K\,[T_{w2c}]_{1:3,:},
\end{align}
with $K \in \mathbb{R}^{3\times 3}$ the HDF5 intrinsic matrix. Writing a landmark in homogeneous form as
\begin{align}
    \tilde{\mathbf{X}}_i =
    \begin{bmatrix}
    \mathbf{X}_i \\ 1
    \end{bmatrix},
\end{align}
its projected detector coordinate is
\begin{align}
    \tilde{\mathbf{x}}_i = P \tilde{\mathbf{X}}_i
    =
    \begin{bmatrix}
    \hat{u}_i \\
    \hat{v}_i \\
    \hat{w}_i
    \end{bmatrix},
    \qquad
    \mathbf{x}_i =
    \begin{bmatrix}
    \hat{u}_i/\hat{w}_i \\
    \hat{v}_i/\hat{w}_i
    \end{bmatrix},
\end{align}
which yields the 2D landmark in the native DeepFluoro detector pixel frame.

The landmark detector predicts coordinates in the resized model image, after which the predictions are mapped back to native detector resolution before registration. If $(u_i^{m}, v_i^{m})$ denotes a predicted point in the model image of size $W_m \times H_m$, and $(W,H)$ denotes the raw detector size, then the corresponding detector-space coordinate is computed as
\begin{align}
    u_i = u_i^{m}\frac{W}{W_m}, \qquad
    v_i = v_i^{m}\frac{H}{H_m}.
\end{align}
These detector-space landmarks are then paired with the HDF5 3D landmarks for pose recovery.

Unlike the original synthetic registration path, this fluoroscopy image evaluation uses a new raw-geometry solver matched to the HDF5 convention. Pose was initialized from the visible 2D--3D correspondences using DLT and then refined by Levenberg--Marquardt nonlinear least squares. The resulting estimate is
\begin{align}
    \hat{T}_{w2c}
    =
    \arg\min_T
    \sum_{i=1}^{L}
    \left\|
    \pi(\mathbf{X}_i;T)-\mathbf{x}_i
    \right\|_2^2,
\end{align}
where $\pi(\mathbf{X}_i;T)$ denotes the projected detector coordinate under transform $T$, $\mathbf{x}_i = (u_i, v_i)^\top$ here denotes the detected landmark mapped to detector space, and $L$ is the number of visible correspondences used for that image.

For the uncertainty-aware continuous-weighting variant, each landmark is assigned a weight $w_i$ derived from MC-dropout variability, and the weighted objective is
\begin{align}
    \hat{T}_{w2c}^{\mathrm{CW}}
    =
    \arg\min_T
    \sum_{i=1}^{L}
    w_i
    \left\|
    \pi(\mathbf{X}_i;T)-\mathbf{x}_i
    \right\|_2^2.
\end{align}
The fluoroscopy image evaluation follows the same landmark-driven registration principle as the synthetic experiments, but is carried out directly in the native DeepFluoro calibration and pose convention.

To evaluate pose accuracy in the native DeepFluoro geometry, we additionally report mean target registration error (mTRE). Let \(T_{w2c}^{\ast}\) denote the ground-truth world-to-camera transform and \(\hat{T}_{w2c}\) the estimated transform. Then mTRE is computed over the same 3D landmark set as
\begin{align}
    \mathrm{mTRE}(T_{w2c}^{\ast}, \hat{T}_{w2c})
    =
    \frac{1}{L}
    \sum_{i=1}^{L}
    \left\|
    T_{w2c}^{\ast}\tilde{\mathbf{X}}_i
    -
    \hat{T}_{w2c}\tilde{\mathbf{X}}_i
    \right\|_2,
\end{align}
where \(L\) is the number of landmarks used for evaluation and \(\tilde{\mathbf{X}}_i\) denotes the homogeneous form of the \(i\)-th 3D landmark. Thus, mTRE measures the mean Euclidean discrepancy between landmark positions under the estimated and ground-truth poses in camera space.