\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{makecell}
\usepackage{float}
\usepackage{comment}
\usepackage{amsmath}   
\usepackage{amssymb}
\usepackage{multirow}
% \usepackage{subfig}
\usepackage{mathtools}  
\usepackage{graphicx}
\usepackage{xcolor}
\usepackage{caption}
\captionsetup{margin=0pt, justification=justified}



% \newcommand{\rev}[1]{\textcolor{red}{#1}}
\newcommand{\rev}[1]{#1}
\newcommand{\transl}{\mathcal{T}}
\newcommand{\rot}{\mathcal{R}}
\newcommand{\NOTE}[1]{\color{red}[#1] \color{black}}


\jmlrvolume{-- 34}
\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\editors{Accepted for publication at MIDL 2026}

\title[RVO-MIS]{RVO-MIS: Robust Visual Odometry for Minimally Invasive Surgery}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicated cases, e.g. with dual affiliations and joint authorship

% \midlauthor{
% \Name{Zhuo Wang\nametag{$^{1}$}} 
% \orcid{0000-0001-8679-5753} 
% \Email{zwang570@arizona.edu}\\
% \addr $^{1}$ Department of Electrical and Computer Engineering, University of Arizona, Tucson, AZ, USA
% \AND
% \Name{Chiang-Heng Chien\nametag{$^{2}$}}
% \orcid{0000-0001-8735-4378}
% \Email{
%   {\hypersetup{urlcolor=black}
%     \href{mailto:Chiang-Heng_Chien@brown.edu}{Chiang-Heng\_Chien@brown.edu}
%   }
% }\\
% \addr $^{2}$ School of Engineering, Brown University, Providence, RI, USA
% \AND
% \Name{Eungjoo Lee\nametag{$^{1,3}$}}
% \orcid{0000-0001-6386-8470} 
% \Email{{eungjoolee@arizona.edu}}\\
% \addr $^{1}$ Department of Electrical and Computer Engineering, University of Arizona, Tucson, AZ, USA \\
% \addr $^{3}$ Department of Ophthalmology and Vision Science, University of Arizona, Tucson, AZ, USA
% }

\midlauthor{%
  \Name{Zhuo Wang\nametag{$^{1}$}} 
  \orcid{0000-0001-8679-5753} 
  \Email{zwang570@arizona.edu} \\
  \Name{Chiang-Heng Chien\nametag{$^{2}$}}
  \orcid{0000-0001-8735-4378}
  \Email{{\hypersetup{urlcolor=black}\href{mailto:Chiang-Heng_Chien@brown.edu}{chiang-heng\_chien@brown.edu}}} \\
  \Name{Eungjoo Lee\nametag{$^{1,3}$}}
  \orcid{0000-0001-6386-8470} 
  \Email{eungjoolee@arizona.edu} \\
  \addr $^{1}$ Department of Electrical and Computer Engineering, University of Arizona, Tucson, AZ, USA \\
  \addr $^{2}$ School of Engineering, Brown University, Providence, RI, USA \\
  \addr $^{3}$ Department of Ophthalmology and Vision Science, University of Arizona, Tucson, AZ, USA
}

\setlength{\parskip}{0pt}


\setlength{\textfloatsep}{10pt plus 0.5pt minus 0.5pt}


\begin{document}


\maketitle

\begin{abstract} % ejl revisit again

Visual odometry (VO) in minimally invasive surgery (MIS) scenarios plays a crucial role in current and future endoscopic surgical intervention assistance systems. However, MIS environments pose severely challenging situations for typical VO algorithms due to \rev{textureless scenes, the presence of surgical instruments, light reflections, flowing blood and organ deformation, {\em etc}. Classic VO methods adopt a smooth motion prior to generate an initial guess for camera pose and then refine it through minimizing reprojection errors. Recent deep learning methods incorporate learned depths and estimate camera poses through minimizing photometric residuals. These approaches, however, lack robustness in estimation due to abrupt motion change and unpredictable illumination changes commonly seen in MIS environments.}
% The motion prior, which assumes smooth and temporal motion consistency, is however violated when the motion change is abrupt, a common situation in MIS.
% Recent advances in this domain have increasingly incorporated deep learning-based depth estimation techniques into photometric tracking frameworks, aiming to address the inherent \rev{MIS} challenges. Yet, photometric tracking remains fragile, particularly in MIS scenes where specular reflections induce rapid and unpredictable illumination changes. 
In this paper, we \rev{present RVO-MIS, a robust VO framework for MIS that first integrates SIFT and LightGlue for reliable feature correspondences, and then solves a sequence of absolute camera poses under an M-estimator sample consensus (MSAC) scheme. By advocating the absolute-pose-first formulation to prioritize geometric consistency and robustness, our approach decouples the camera motion tracking from smooth motion priors, photometric consistency, learned depths, {\em etc}. Through evaluations on the SCARED and EndoSLAM datasets, RVO-MIS demonstrates consistently accurate camera pose estimations. In challenging MIS situations where many methods fail or become inaccurate, RVO-MIS excels in both camera trajectory completion rate and accuracy. Code is publicly available at \url{https://github.com/vsi-lab/RVOMIS.git}.}
% based on feature point matching using M-Estimate Sample Consensus (MSAC) and Perspective-3-Point (P3P) absolute pose estimation to obtain accurate camera poses. To resolve the scale ambiguity, the scale of the absolute pose estimation is fixed by constructing a point cloud in the coordinate system of the first image through triangulating 3D points between keyframes. 
% Evaluated on the SCARED and EndoSLAM datasets, our approach,  demonstrates consistently accurate camera pose estimation, achieving a translation ATE (RMSE) of $0.2970\,\mathrm{cm}$ in the best case. Quantitative results indicate that our method significantly outperforms established baseline methods in both translation and rotation metrics, validating its robustness in challenging MIS environments. \rev{The code is publicly available at \url{https://github.com/vsi-lab/RVOMIS.git}.}
\end{abstract}

\begin{keywords}
Visual Odometry, Minimally Invasive Surgery, Feature-based Tracking
\end{keywords}



\section{Introduction} % ejl revisit again
Accurate camera pose estimation is an important component of navigation and guidance in minimally invasive surgery (MIS). This surgical navigation system, capable of tracking the laparoscope and displaying the spatial relationship between the laparoscope and surrounding anatomical structures, can effectively reduce the risk of critical organ damage caused by excessive contact during surgery while enhancing the surgeon's spatial awareness. Compared to marker-based navigation systems, vision-based approaches demonstrate higher efficiency as they do not interrupt surgical procedures and have the potential to achieve real-time navigation~\cite{yhd, sage}. The objective is to accurately estimate the 6 degrees of freedom (DoF) camera motion from \rev{a sequence of ordered monocular images} in MIS scenarios, as illustrated in Figure~\ref{fig1}. 
% An accurate camera pose estimation is essential for downstream tasks such as three-dimensional (3D) reconstruction, Structure-from-Motion (SfM), Augmented Reality (AR), and Simultaneous Localization and Mapping (SLAM). 

\begin{figure*}[t]
    \centering
    \includegraphics[width=\textwidth]{fig/overview.png}
    % \vspace{4pt}
    \caption{Illustration of the RVO-MIS framework in an MIS environment. 
    (a) A typical laparoscopic surgery setup. Image adapted from~\cite{blausen2014medical}. 
    (b) Visual odometry estimates 6-DoF camera motion, generating a continuous patient-internal trajectory. 
    (c) 3D points arising from keyframes serve as geometric anchors for robust absolute pose estimation in an MSAC scheme\rev{, which is the core of the proposed RVO-MIS.}}
    \label{fig1}
\end{figure*}

Most existing camera pose estimation methods are based on visual odometry (VO) or \rev{simultaneous localization and mapping (SLAM)} frameworks, such as ORB-SLAM2~\cite{mur2017orb}\rev{, and its variations~\cite{lamarca2018camera} designed for non-rigid surgical scenes. These approaches typically start with sparse or semi-sparse feature point correspondences, and recover the camera motion by first suggesting an initial camera pose based on the assumption that the camera motion is temporally constant, and then refining the initial guess through minimizing reprojection errors.}
% There are also methods for monocular camera pose estimation, such as , which are designed for non-rigid surgical scenes involving motions and tissue deformation. 
\rev{Other approaches, specifically the deep learning-based family, have witnessed the deployment of learned depths~\cite{lv, zoe, survey, depthany} from monocular images to the traditional VO or SLAM methods, {\em e.g.}, Endo-Depth-and-Motion~\cite{endo-depth-and-motion}, EndoSLAM~\cite{endoslam}, {\em etc}. In particular, the learned depths complement the need for sparse features, which are hard to detect in textureless scenes, and camera poses are estimated either through minimizing photometric residuals or} through PoseNet architectures~\cite{posenet} trained with depth-guided constraints.
% Building upon these monocular approaches, numerous deep learning-based monocular depth estimation methods have also emerged~\cite{lv, zoe, survey, depthany} in recent years, which estimate depth images by inferring from the input RGB images. 
% Depth estimation can be naturally embedded into traditional SfM and SLAM methods, further addressing the problem that traditional feature-based methods cannot recognize features in textureless regions. 
% For example, Endo-Depth-and-Motion~\cite{endo-depth-and-motion}, EndoSLAM~\cite{endoslam}, and LINGMI-MR~\cite{geometry-aware-deep-network} obtain estimated depth images through unsupervised learning methods. These estimated depth images are then used to compute camera poses via photometric tracking (e.g., based on depth-aware reprojection errors) or PoseNet architectures~\cite{posenet} trained with depth-guided constraints.

Despite these advances, VO methods relying on estimated depth images inevitably suffer from errors introduced by depth estimation\rev{, and the depth errors easily propagate to the estimated camera poses. The assumption of photometric consistency also breaks easily in the MIS environment, where illumination is unpredictable in the sense that light is reflected off organs, liquid, surgical instruments, {\em etc}. On the other hand, methods based on the ORB-SLAM2~\cite{mur2017orb} framework}
% Specifically, these methods involve projection- and reprojection-based photometric tracking using estimated depth images to compute relative pose~\cite{endo-depth-and-motion}. Inaccurate depth estimation directly propagates errors into the camera pose estimation process. 
% Moreover, traditional feature-based methods, while effective in outdoor and indoor environments, 
face significant challenges in complex MIS scenarios. \rev{Specifically, their assumption of a smooth temporal motion prior is violated when the camera moves drastically, a common scenario in the MIS environment. In addition, even though there are ample features recognized from MIS images, constructing correspondences based on feature descriptor similarity is unstable, often leading to insufficient feature correspondences for reliable pose estimation~\cite{chien2024recovering}.}
% In such settings, factors such as multi-angle lighting-induced reflections, surgical smoke, textureless backgrounds, and the dynamic movement of surgical instruments create highly non-static environments, leading to insufficient or mismatched feature points that make such methods difficult to deploy effectively.

% In this work, we propose a VO approach that incorporates deep learning-based methods for feature point extraction and matching with absolute pose estimation to establish metric scale in camera pose estimation. 
In this work, we propose \rev{RVO-MIS, a robust VO method for MIS that combines the sub-pixel accuracy of classical SIFT~\cite{sift} features with the deep learning-based matcher LightGlue~\cite{lightglue} for reliable feature correspondences}, integrated with absolute pose estimation to recover the camera motion. To reconcile the rigid-body assumption of \rev{absolute pose from perspective-3-point (P3P)} with non-rigid MIS deformations, we integrate M-estimator sample consensus (MSAC)~\cite{torr2000mlesac} to robustly filter out \rev{features detected from} deforming tissues, ensuring stable \rev{camera motion} tracking on quasi-rigid background structures. \rev{The core philosophy of the proposed method is to decouple the camera pose estimation process from smooth-motion priors, photometric consistency, and joint optimization over pose, depth, and scene structure, which are the primary building blocks of the existing methods, 
% {\em e.g.}, ORB-SLAM2~\cite{mur2017orb}, SDDef-SLAM~\cite{lamarca2018camera}, BodySLAM~\cite{body}, Endo-2DTAM~\cite{endodtam}, DPVO~\cite{dpvo}, EndoDepth~\cite{endo-depth-and-motion}, {\em etc}, 
but are often violated in challenging MIS environments. We show that, by prioritizing geometric consistency and robustness (supported by a deep learning-based feature matcher) over temporal smoothness, multi-frame optimization, and strong scene modeling constraints, the proposed design offers a complementary alternative to the competing methods. In addition,}
% Recent studies have shown that such deep learning-based feature detection and matching methods demonstrate improved robustness in camera pose estimation performance compared to traditional approaches~\cite{cmpe}, although challenges remain in highly dynamic MIS environments. 
% Our experimental results show that by integrating a \rev{deep learning-based feature matcher}, our algorithm achieves robust performance on challenging MIS sequences. Furthermore, our method outperforms depth estimation-based deep learning VO approaches by avoiding the inherent errors introduced by depth estimation. 
in contrast to the computationally intensive approaches presented in~\cite{endo-depth-and-motion, endoslam, geometry-aware-deep-network, robust-pose-estimator} that require extensive GPU resources for scene-specific training of depth estimation networks, our framework employs deep learning only for feature extraction and matching. By leveraging pre-trained weights with demonstrated generalization capabilities~\cite{mackute2024challenges}, our method achieves robust feature detection and matching performance without requiring additional scene-specific fine-tuning. \rev{
The contributions are outlined as follows:
\begin{itemize}
    \item {\bf Absolute-pose-first VO formulation:} RVO-MIS recovers camera motion by robustly estimating a sequence of absolute poses, rather than through temporally coupled minimization of reprojection errors or photometric residuals. This removes reliance on smooth motion assumption, and leaves refining the pose through minimizing reprojection errors as a lightweight optimization rather than the primary operation.
    \item {\bf MSAC-based pose estimation from robust feature detection and matching as the core estimator:} the proposed framework integrates the sub-pixel accuracy of classical SIFT features with the robust deep learning-based matching (LightGlue). This combination effectively addresses challenging MIS conditions such as textureless regions and specular reflections without requiring scene-specific fine-tuning. 
    % \item We propose a \textbf{hybrid monocular VO framework} that integrates the sub-pixel accuracy of classical SIFT features with the robustness of deep learning-based matching (LightGlue). This combination effectively addresses challenging MIS conditions such as textureless regions and specular reflections without requiring scene-specific fine-tuning.
    % \item We implement a robust \textbf{absolute pose estimation strategy} utilizing P3P with MSAC outlier rejection. This approach effectively filters out dynamic outliers caused by tissue deformation and surgical tools, ensuring stable tracking on quasi-rigid background structures.
    \item {\bf Empirical validation through challenging datasets:} we demonstrate {\em state-of-the-art performance} on the SCARED~\cite{scared} and EndoSLAM~\cite{endoslam} datasets, both in accuracy and trajectory completion rate. An ablation study reveals that the inlier ratio increases significantly when using SIFT in conjunction with LightGlue compared to the similarity-based approach, whereas absolute pose estimation in an MSAC scheme contributes to robust and accurate camera motion recovery.
\end{itemize}
}

% (i) as opposed to local reprojection refinement typically adopted by the existing methods, camera poses are estimated directly with respect to a fixed world coordinate system using 2D–3D correspondences and MSAC-based PnP, making absolute pose recovery the core estimation objective; (ii) robustness to outliers is treated as a first-class component of the estimator: pose hypotheses are selected through MSAC over geometric constraints, with least-squares optimization applied only as a lightweight refinement on inlier supports, rather than as the dominant estimation mechanism; (iii) the proposed framework is explicitly designed for scenarios in which common temporal assumptions, such as smooth motion, may be violated. By avoiding motion priors, sliding-window bundle adjustment, and multi-frame photometric optimization, the proposed method remains applicable under irregular camera motion, abrupt viewpoint changes, etc that are common in the MIS environment.
% To support this design, SIFT feature detection in conjunction with LightGlue feature matching is adopted to provide reliable correspondences for robust pose estimation.

\section{Related Work} % ejl done
\textbf{SLAM Methods:} SLAM enables real-time tracking and mapping, a critical capability for MIS. SAGE~\cite{sage} integrates learned priors with factor graph optimization to ensure robust reconstruction in textureless, illumination-varying environments. Another framework~\cite{sym14020185} combines medical bag-of-words with Poisson reconstruction, generating dense, detailed 3D models from sparse outputs. Addressing visual failure, ArthroSLAM~\cite{arthroslam} utilizes a dynamically weighted extended Kalman filter for continuous multi-sensor localization. Finally, feature-based methods~\cite{studyslam} significantly improve tracking performance by combining K-means with SuperPoint~\cite{superpoints} for enhanced feature extraction. To address the generalization gap in deep learning-based SLAM, BodySLAM~\cite{body} achieves cross-domain generalization without fine-tuning by combining CycleGAN-based pose estimation with zero-shot depth prediction. Recently, Endo-2DTAM~\cite{endodtam} leverages 2D Gaussian Splatting and surface normal-aware tracking to overcome multi-view inconsistencies, enabling high fidelity, geometrically accurate reconstruction.

% Comment: Related work should objectively describe others' work
% This result also provides strong prior evidence for our work to enhance feature point extraction and matching in this scenario. 

\noindent \textbf{VO Methods:} A hybrid approach~\cite{SONG2021115631} integrating deep learning networks and geometric features was implemented in this scenario. Region classification and a two-stage pose refinement procedure are the two main components of this novel approach. It uses a Siamese network architecture~\cite{siamese} and two identical PoseNet models~\cite{posenet} to assess the similarity between the test image and its collected region. Pose is obtained via triangulation using region information. This data-efficient approach outperforms pure deep learning or geometry methods. Furthermore, sensor fusion is highlighted as a solution to the inherent scale drift and ambiguity of conventional monocular VO. In this context, EndoVMFuseNet~\cite{endoVMFuseNet} uses a recurrent CNN to fuse 6DoF visual and 5DoF magnetic data without synchronization. Its energy reduction method integrates dense photometric alignment with sparse optical flow features. While sensor fusion enhances data richness, DPVO~\cite{dpvo} maximizes efficiency by tracking sparse patches instead of dense flow. It combines a recurrent update operator with differentiable bundle adjustment, achieving the robustness of dense methods with significantly reduced computational and memory costs.

% \noindent \textbf{Depth Estimation Methods:} The study~\cite{chen2019slamendoscopyenhancedadversarial} introduces a monocular visual SLAM framework leveraging a conditional Generative Adversarial Network (cGAN)~\cite{cgan} for depth estimation, enhancing robustness in challenging endoscopic environments. The proposed cGAN framework adopts a U-Net-based~\cite{unet} encoder-decoder architecture as its generator, trained on cinematic rendering images to predict depth images. The discriminator network evaluates the authenticity of generated versus real image pairs, optimizing a binary cross-entropy loss function augmented with penalty terms to enforce both pixel-wise fidelity and geometric accuracy. The system subsequently fuses endoscopic RGB video streams with cGAN-derived depth estimates within the ElasticFusion pipeline~\cite{doi:10.1177/0278364916669237}, achieving robust real-time surface reconstruction and demonstrating improved tracking performance in complex endoscopic environments. Recent work~\cite{geometry-aware-deep-network} has introduced a geometry-aware depth estimation framework leveraging synthetic RGB-Depth datasets. The proposed architecture employs an auto-encoder design following MultiDepth~\cite{multidepth}, while incorporating a novel composite loss function to enhance geometric consistency. The loss function combines gradient loss for fine structural preservation, normal loss for surface orientation accuracy, and geometric consistency loss for 3D structural integrity. This integrated approach demonstrates particular effectiveness in reconstructing small anatomical features, sharp edges, and complex surface geometries. Comprehensive evaluation on the EndoSLAM dataset~\cite{endoslam} shows the framework achieves state-of-the-art quantitative performance, with significant improvements in geometric fidelity metrics compared to existing methods.

\noindent \textbf{Depth Estimation Methods:}~\cite{chen2019slamendoscopyenhancedadversarial} propose a cGAN-based framework using a U-Net generator for depth estimation. By enforcing geometric fidelity through adversarial training and fusing estimates within ElasticFusion~\cite{doi:10.1177/0278364916669237}, it achieves robust real-time reconstruction. More recently, \cite{geometry-aware-deep-network} introduces a geometry-aware framework based on MultiDepth~\cite{multidepth}. By employing a composite loss function targeting gradient and normal consistency, this approach significantly enhances geometric fidelity for complex anatomical features, achieving state-of-the-art performance on the EndoSLAM dataset~\cite{endoslam}.

\section{Method}

\begin{figure*}[t]
\centering
\includegraphics[width=1\textwidth]{fig/Flow_Chart.pdf}
% \vspace{-0.5em}
\caption{\rev{The proposed VO framework begins with feature extraction using SIFT and robust feature matching via LightGlue (Section 3.2). During the initialization stage, relative pose estimation and triangulation are performed to reconstruct an initial set of 3D points (Section 3.3). For subsequent frames, 2D features detected in the current image are associated with 3D points from the active keyframe, enabling absolute pose estimation via a P3P solver within an MSAC scheme, followed by a lightweight least-squares refinement (Section 3.4). Keyframes are selectively introduced to incrementally enrich the 3D point cloud, facilitating reliable absolute pose estimation for later frames (Section 3.5). 
% By relying on MSAC-based absolute pose recovery supported by robust feature detection and matching, the proposed method operates without assumptions on motion priors or temporal consistency, and excels even without bundle adjustment.
} } \label{fig2}
\end{figure*}

\subsection{\rev{Overview}}

The core innovation of \rev{RVO-MIS lies in promoting geometric consistency when estimating camera poses, through robust feature correspondence construction and an MSAC-based absolute pose estimation framework (Figure~\ref{fig2}). Specifically, the input is a sequence of ordered monocular images. An initialization stage produces a cloud of 3D points that provides an inherent scale for the resultant trajectory. These 3D points are then associated with the 2D point correspondences to estimate the absolute poses of subsequent frames. When the 2D-3D correspondences become scarce, a frame is elevated to a keyframe, and the process continues until a complete trajectory is returned. Notably, RVO-MIS does not assume smooth camera motion or photometric consistency, and does not rely on scene model-based constraints or learned depths. 
}
% reconstructing 3D point clouds through precise, numerous feature point detection and robust deep learning-based matching. We achieve accurate absolute pose estimation by establishing sufficient 2D-3D correspondences between keyframes and current frames, and then minimizing reprojection error using a robust $\text{P3P}$ optimization framework (see Figure~\ref{fig2}).

\noindent \textbf{Notations:} Let $\Gamma_{w,k}$ be the $k$-th 3D point in the world coordinate system, $K$ the camera calibration matrix, and $\rot_i$ and $\transl_i$ the estimated absolute rotation matrix and translation vector of camera $i$, respectively. A 2D feature point $\gamma_{k}$ with depth $\rho_{k}$ gives rise to the $k$-th 3D point in the camera coordinate system, $\Gamma_{i,k} = \rho_k \gamma_k$, which is related to the 3D point $\Gamma_{w,k}$ in the world coordinate system by
\begin{equation}
    \Gamma_{i,k} = \rot_{i} \Gamma_{w,k} + \transl_i.
    \label{eq:general_rel_pose}
\end{equation}
Denote $\gamma_{im,k}$ as the $k$-th image point in pixels so that $\gamma_{im,k} = K \gamma_k$.



% \begin{figure*}[t]
%     \centering
%     \setlength{\tabcolsep}{1pt}
    
    
%     \footnotesize
%     \begin{tabular}{c c c c}
%         \makebox[0.24\textwidth]{Sequence 1} & 
%         \makebox[0.24\textwidth]{Sequence 2} & 
%         \makebox[0.24\textwidth]{Sequence 3} & 
%         \makebox[0.24\textwidth]{Sequence 4} \\
%     \end{tabular}
    
%     % (61, 62, 63, 64)
%     \raisebox{3.5em}{\rotatebox{90}{\scriptsize \textbf{Trajectory}}}
%     \hspace{2pt}%
%     % 
%     \includegraphics[width=0.23\textwidth]{fig/61.png}\hfill
%     \includegraphics[width=0.23\textwidth]{fig/62.png}\hfill
%     % 
%     \includegraphics[width=0.23\textwidth]{fig/63.png}\hfill
%     \includegraphics[width=0.23\textwidth]{fig/64.png}
    
%     \vspace{4pt}
    
%     \caption{Trajectories comparison for the representative sequences in the SCARED dataset. Sequence 1-4 shows the 3D estimated camera poses for several challenging surgical sequences. Our proposed method (blue line) closely tracks the Ground Truth (black dashed line), outperforming established baselines (colored lines).} 
%     \label{fig3}
% \end{figure*}



\subsection{Feature Extraction and Matching} 
In~\rev{\cite{mackute2024challenges}, a diverse set of features was analyzed for endoluminal navigation. The study revealed that, although modern learning-based features such as SuperPoint~\cite{superpoints} provide denser features compared to conventional features such as SIFT~\cite{sift}, ORB~\cite{orb}, {\em etc}., they remain suboptimal in terms of accuracy and reliability. In contrast, SIFT provides the most accurate and reliable features. These findings align with our empirical evidence, and thus we adopt SIFT in the proposed framework to prioritize accuracy and reliability.}

% We compared four feature extraction methods, {\em i.e.}, SIFT~\cite{sift}, ORB~\cite{orb}, SURF~\cite{surf} and SuperPoint~\cite{superpoints}. The results demonstrate that SIFT extracts a relatively larger number of accurate feature points, providing abundant candidates for subsequent feature matching. 
\rev{To ensure high-quality candidates and computational efficiency, we rank-order the features by their response scores and keep the top 80\% of features ($\rho_{rank}=0.8$).} During the matching phase, we initially employed VL\rev{F}eat~\cite{vlfeat} for \rev{constructing SIFT correspondences based on SIFT descriptor similarity, with the additional filters of Lowe's ratio test and a bidirectional consistency check.}
% . The matching algorithm~\cite{sift} in the VL\rev{F}eat library implements a combination of Lowe's ratio test and bidirectional matching. 
% For each SIFT point in the first image, it calculates the 128-dimensional Euclidean distance to all feature points in the second image, identifying both the nearest and second-nearest neighbors. Matches are retained only when the nearest neighbor distance is smaller than the second-nearest neighbor distance divided by a predefined threshold. 
However, \rev{our preliminary experiments showed that} a significant proportion of these SIFT \rev{correspondences are ambiguous and non-veridical, degrading the subsequent pose estimation.} 
% resulting in unreliable performance of conventional descriptor-based matching methods. 
Alternatively, we adopted LightGlue~\cite{lightglue}, a state-of-the-art learning-based feature matcher. 
% LightGlue is a feature matching framework that builds on SuperGlue~\cite{superglue}. 
It leverages a graph neural network (GNN) to jointly reason about feature correspondences, incorporating both local appearance and geometric consistency. \rev{In this case, SIFT correspondences become ample and reliable, which is supported by our ablation study, Section~\ref{sec:experiments}.} 
% compared to conventional feature-based VO methods that often fail in this challenging MIS scenario, our approach demonstrates significantly improved robustness by incorporating advanced deep learning-based feature matching techniques.
% \begin{figure*}[t]
%     \centering
%     \setlength{\tabcolsep}{1pt}
    
%     % 
%     \footnotesize
%     \begin{tabular}{c c c c}
%         \makebox[0.24\textwidth]{Sequence 1} & 
%         \makebox[0.24\textwidth]{Sequence 2} & 
%         % 
%         \makebox[0.24\textwidth]{Sequence 3} & 
%         \makebox[0.24\textwidth]{Sequence 4} \\
%     \end{tabular}
    
%     % 
%     \raisebox{3.5em}{\rotatebox{90}{\scriptsize \textbf{ATE (Trans.)}}}%
%     \hspace{2pt}%
%     % 
%     \includegraphics[width=0.23\textwidth]{fig/11.png}\hfill
%     \includegraphics[width=0.23\textwidth]{fig/21.png}\hfill
%     % 
%     \includegraphics[width=0.23\textwidth]{fig/41.png}\hfill
%     \includegraphics[width=0.23\textwidth]{fig/51.png}
    
%     \vspace{4pt} % 
    
%     % 
%     \raisebox{3.5em}{\rotatebox{90}{\scriptsize \textbf{ATE (Rot.)}}}%
%     \hspace{2pt}%
%     % 
%     \includegraphics[width=0.23\textwidth]{fig/12.png}\hfill
%     \includegraphics[width=0.23\textwidth]{fig/22.png}\hfill
%     % 
%     \includegraphics[width=0.23\textwidth]{fig/42.png}\hfill
%     \includegraphics[width=0.23\textwidth]{fig/52.png}
    
%     \caption{Visualization of ATE (top row: translation, bottom row: rotation).} 
%     \label{fig4}
% \end{figure*}

% \begin{figure*}[t]
%     \centering
%     \setlength{\tabcolsep}{1pt}
    
    
%     \footnotesize
%     \begin{tabular}{c c c c}
%         \makebox[0.24\textwidth]{Sequence 1} & 
%         \makebox[0.24\textwidth]{Sequence 2} & 
%         \makebox[0.24\textwidth]{Sequence 3} & 
%         \makebox[0.24\textwidth]{Sequence 4} \\
%     \end{tabular}
    
%     % (61, 62, 63, 64)
%     \raisebox{3.5em}{\rotatebox{90}{\scriptsize \textbf{Trajectory}}}
%     \hspace{2pt}%
%     % 
%     \includegraphics[width=0.23\textwidth]{fig/61.png}\hfill
%     \includegraphics[width=0.23\textwidth]{fig/62.png}\hfill
%     % 
%     \includegraphics[width=0.23\textwidth]{fig/63.png}\hfill
%     \includegraphics[width=0.23\textwidth]{fig/64.png}
    
%     \vspace{4pt}
    
%     \caption{Trajectories comparison for the representative sequences in the SCARED dataset. Sequence 1-4 shows the 3D estimated camera poses for several challenging surgical sequences. Our proposed method (blue line) closely tracks the Ground Truth (black dashed line), outperforming established baselines (colored lines).} 
%     \label{fig3}
% \end{figure*}
\subsection{\rev{Initialization}}
% To obtain metrically-scaled continuous camera poses through absolute pose estimation, our method establishes a fixed scale reference by reconstructing 3D points from the first two frames. 
\rev{
% To obtain camera poses via absolute pose estimation, 
% RVO-MIS first . 
After constructing a set of SIFT correspondences from the first two frames, the relative pose is estimated via the 5-point algorithm~\cite{nister2004efficient} under a classic RANSAC~\cite{fischler1981random} scheme. A 2D-2D correspondence is treated as an inlier supporting the relative pose hypothesis when its distance to the corresponding epipolar line is below 2 pixels. The inliers supporting the best relative pose hypothesis are triangulated to obtain their 3D counterparts, forming a cloud of 3D points in the world (first-frame) coordinate system. Thus, the absolute poses of subsequent frames estimated from the constructed 3D points are inherently referenced to the world coordinate system, so that the inherent scale remains consistent throughout the entire trajectory. Once the initialization is complete, the second frame is promoted to an active keyframe, serving as the reference image that gives rise to the 3D points.}
\subsection{\rev{Tracking}} Upon reconstructing 3D points from inlier 2D feature matches, we identify which 2D features in the keyframe have valid 3D correspondences. By matching 2D features between the current frame and the keyframe, \rev{the association between the observed 2D points in the current frame and the reconstructed 3D points (referred to as co-visible 3D points) can be found. These 2D-3D correspondences enable an absolute pose estimation of the current frame by solving a P3P minimal problem. To cope with incorrect associations, MSAC~\cite{torr2000mlesac} is adopted for a robust absolute pose estimation with a maximum of 3000 iterations. A 2D-3D point pair is treated as an inlier supporting an absolute pose hypothesis if its reprojection error is less than 2 pixels. This MSAC-based absolute pose estimation is the main building block for recovering the camera motion in RVO-MIS. To further boost the estimation accuracy, }
% To ensure robustness, we employ the MSAC \rev{with an inlier threshold of 2 pixels and a maximal of 3000 iterations} to obtain the optimal absolute pose estimation. 
the estimated absolute pose of camera $i$ is refined by minimizing an energy function $E(\rot_i,\transl_i)$ defined as a function of the absolute rotation $\rot_i$ and absolute translation $\transl_i$ \rev{in the form of a sum of} reprojection errors, {\em i.e.},
\begin{equation}
    E(\rot_i,\transl_i) = \frac{1}{2}\sum_{k=1}^{N} \left\| \gamma_{im,k} - \frac{K(\rot_i \Gamma_{w,k} + \transl_i)}{e_3^T K(\rot_i \Gamma_{w,k} + \transl_i)} \right\|^2,
\label{eq:energy_function}
\end{equation}
so that the refined absolute pose $(\rot_i^{*},\transl_i^{*})$ is 
\begin{equation}
    \left(\rot_i^{*},\transl_i^{*} \right) = \underset{\rot_i,\transl_i}{\text{argmin}} \; E(\rot_i,\transl_i).
\end{equation}
Minimizing $E(\rot_i,\transl_i)$ with respect to $(\rot_i,\transl_i)$ is done by the Levenberg-Marquardt algorithm. 
Note that the rotation matrix $\rot_i$ is parameterized by the three Euler angles, and thus there are six unknowns in total, {\em i.e.}, three for rotation and three for translation.

% \begin{figure*}[t]
%     \centering
%     \setlength{\tabcolsep}{1pt}
    
%     % 
%     \footnotesize
%     \begin{tabular}{c c c c}
%         \makebox[0.24\textwidth]{Sequence 1} & 
%         \makebox[0.24\textwidth]{Sequence 2} & 
%         % 
%         \makebox[0.24\textwidth]{Sequence 3} & 
%         \makebox[0.24\textwidth]{Sequence 4} \\
%     \end{tabular}
    
%     % 
%     \raisebox{3.5em}{\rotatebox{90}{\scriptsize \textbf{ATE (Trans.)}}}%
%     \hspace{2pt}%
%     % 
%     \includegraphics[width=0.23\textwidth]{fig/11.png}\hfill
%     \includegraphics[width=0.23\textwidth]{fig/21.png}\hfill
%     % 
%     \includegraphics[width=0.23\textwidth]{fig/41.png}\hfill
%     \includegraphics[width=0.23\textwidth]{fig/51.png}
    
%     \vspace{4pt} % 
    
%     % 
%     \raisebox{3.5em}{\rotatebox{90}{\scriptsize \textbf{ATE (Rot.)}}}%
%     \hspace{2pt}%
%     % 
%     \includegraphics[width=0.23\textwidth]{fig/12.png}\hfill
%     \includegraphics[width=0.23\textwidth]{fig/22.png}\hfill
%     % 
%     \includegraphics[width=0.23\textwidth]{fig/42.png}\hfill
%     \includegraphics[width=0.23\textwidth]{fig/52.png}
    
%     \caption{Visualization of ATE (top row: translation, bottom row: rotation).} 
%     \label{fig4}
% \end{figure*}
\subsection{\rev{Mapping and Keyframe Management}}
Our keyframe update strategy triggers under two conditions: {\em (i)} when fewer than 55\% of the current frame's co-visible 3D landmarks originate from the active keyframe, or {\em (ii)} when more than 15 frames have elapsed since the last keyframe insertion.
\rev{When a new keyframe is inserted, we triangulate newly observed point features between the current frame and the active keyframe, {\em i.e.}, the inlier 2D matches whose 3D counterparts are absent.} Triangulating 2D point feature \rev{matches} from two views requires the relative \rev{transformation, which can be easily} obtained from the absolute poses of the current camera and the keyframe camera. Specifically, let $(\rot_c,\transl_c)$ and $(\rot_k,\transl_k)$ be the absolute poses of the current frame and the keyframe, respectively; the goal is to find the relative pose $(\rot_{kc}, \transl_{kc})$ so that a point $\Gamma_c$ in the current camera coordinate system is transformed to a point $\Gamma_k$ in the keyframe camera coordinate system by $\Gamma_k = \rot_{kc} \Gamma_c + \transl_{kc}$. From Equation~\ref{eq:general_rel_pose}, we have
\begin{equation}
\left\{
\begin{aligned}
    & \Gamma_{c} = \rot_{c} \Gamma_w + \transl_{c} \\
    & \Gamma_{k} = \rot_{k} \Gamma_w + \transl_{k},
\end{aligned}
\right.
\label{eq:world_to_local_transf}
\end{equation}
where for simplicity we omit the index for the point. 
Now, $\Gamma_w$ can be isolated in the first vector equation of Equation~\ref{eq:world_to_local_transf} as 
\begin{equation}
    \Gamma_w = \rot_{c}^T \left(\Gamma_{c} - \transl_{c} \right),
\end{equation}
which can be substituted into the second vector equation of Equation~\ref{eq:world_to_local_transf}, giving
\begin{equation}
    \Gamma_{k} = \rot_{k} \rot_{c}^T \left(\Gamma_{c} - \transl_{c} \right) + \transl_{k} = \rot_k \rot_c^T \Gamma_c + \transl_k - \rot_k \rot_c^T \transl_c.
\end{equation}          
Thus, the relative pose $(\rot_{kc}, \transl_{kc})$ is
\begin{equation}
\left\{
\begin{aligned}
& \rot_{kc} = \rot_k \rot_c^T  \\
& \transl_{kc} =  \transl_k - \rot_k \rot_c^T \transl_c.
\end{aligned}
\right.
\label{eq:ap2rp}
\end{equation}
This relative pose provides an epipolar constraint across the two frames, as the essential matrix $E_{kc}=[\transl_{kc}]_{\times} \rot_{kc}$ can be easily found. This constraint is adopted to select, among the 2D-2D \rev{correspondences whose 3D counterparts are absent,} those that satisfy the epipolar geometry by thresholding the Sampson error\rev{~\cite{terekhov2023tangent}. Those that pass the epipolar geometry constraint are} triangulated to form a group of {\em new} 3D points, which are then transformed to the world coordinate system in order to keep the entire cloud of 3D points in the same coordinate system. This ensures continuous map expansion while maintaining \rev{a consistent inherent scale} through geometric verification. \rev{The complete RVO-MIS pipeline can be found in Appendix~\ref{sec:appendix_a}.}

\section{Experiments}
\label{sec:experiments}
\noindent \textbf{Dataset:} \rev{The proposed method is evaluated} on multiple sequences of the stereo correspondence and reconstruction of endoscopic data (SCARED)~\cite{scared} dataset. \rev{Specifically, 
these
sequences were chosen to cover a diverse set of conditions: 
two of them exhibit illumination variations and flowing blood, and another two are challenging due to the presence of surgical instruments and textureless areas (Figure~\ref{fig:example_imgs_of_datasets}(a)). The EndoSLAM dataset~\cite{endoslam} is also used. We recognize that the EndoSLAM dataset has a frame-skipping problem~\cite{deng2023feature}, namely, the sequence of images is not consecutive, often leaving no scene overlap between adjacent images. This often leads to excessively large errors for all methods, undermining the validity of performance metrics on this dataset. Nevertheless, EndoSLAM provides situations where the camera moves drastically, creating small scene overlaps between adjacent frames and resulting in blurry images (Figure~\ref{fig:example_imgs_of_datasets}(b)). This challenging situation, specifically the small intestine sequence, is adopted in this paper to test the failure handling strategies of all methods.}
% The SCARED dataset consists of 9 in vivo porcine subjects, with 4 endoscopic video sequences captured for each subject using a da Vinci Xi surgical robotic system. All sequences feature rigid scenes without respiratory motion, providing stereo video streams synchronized with precise camera kinematic data. 
% For experimental validation, we conducted comprehensive evaluations across multiple sequences.

\begin{figure*}[t]
\centering
(a)\includegraphics[width=0.16\textwidth]{fig/s1.png}
\includegraphics[width=0.16\textwidth]{fig/s4.png}
\includegraphics[width=0.16\textwidth]{fig/s2.png}
\includegraphics[width=0.16\textwidth]{fig/s3.png}
(b)\includegraphics[width=0.17\textwidth]{fig/frame_000291.png}
\vspace{-0.5em}
\caption{\rev{Example images of (a) the chosen four sequences (ordered from left to right) from the SCARED dataset, exhibiting scenes with light reflections, flowing blood, presence of surgical instruments, {\em etc.}; (b) the chosen sequence of the EndoSLAM dataset with abrupt camera motion, resulting in blurry images and skipped frames.}} \label{fig:example_imgs_of_datasets}
\end{figure*}

\noindent \rev{{\bf Baselines:} The proposed RVO-MIS is compared against several methods, ranging from classic geometry-based approaches to recent learning-based ones. In particular, ORB-SLAM2~\cite{mur2017orb} and SD-DefSLAM~\cite{lamarca2018camera} are representatives of classic non-learning-based methods, while BodySLAM~\cite{body}, Endo-2DTAM~\cite{endodtam}, DPVO~\cite{dpvo}, and EndoDepth~\cite{endo-depth-and-motion} are recent learning-based methods. ORB-SLAM2 and DPVO are designed for generic scenes, while the rest target MIS applications. Throughout our experiments, we use default settings for all competing methods, {\em i.e.}, for learning-based approaches their pretrained weights are used. In addition, native image resolutions from the datasets are fed to all methods without any preprocessing such as down-sampling.
}

\noindent \textbf{Evaluation Metrics:} \rev{For quantitative evaluations, standard metrics~\cite{em} are adopted. Specifically, the method of~\cite{ume} is first applied for a global trajectory alignment, where the inherent scale ambiguity of monocular methods is accounted for by a similarity transformation in $\mathrm{Sim}(3)$.
The absolute trajectory error (ATE) of each aligned estimated pose with respect to the corresponding ground-truth pose is used, from which the aggregated error over the entire trajectory is described by means of the root-mean-square error (RMSE). Throughout the paper, ATE($\transl$) and ATE($\rot$) respectively represent the RMSE of ATE for the translation (in centimeters) and the rotation (in degrees) parts. For the experiments on the EndoSLAM dataset, we additionally use the Trajectory Completion Rate~\cite{chien2024recovering}, namely, the ratio of the number of images whose poses a method estimates before failure to the total number of images in the sequence, to demonstrate the performance of the competing methods in situations with a high chance of failure.}
% , {\em i.e.}, a transformation containing a rotation matrix, a translation vector, and a scale so that the geometry of the estimated trajectory $\mathbf{P}_{1:n}$ can be as close to the ground-truth trajectory $\mathbf{Q}_{1:n}$ as possible. This alignment is represented by a $4\times 4$ matrix $\mathbf{S}$. We also enforce the estimated trajectories to align to the origin, so that the first frame is identical. ATE at each timestep $i$ is adopted as the evaluation metric which is defined as
% Equation (4): Frame error definition
% \begin{equation}
%     \mathbf{F}_i \coloneqq \mathbf{Q}_i^{-1}\mathbf{S}\mathbf{P}_i,
%     \label{eq:frame_error}
% \end{equation}
% which is a $4\times 4$ matrix describing how ``far apart'' the estimation at timestamp $i$ is from the ground-truth. To examine the performance on the rotation and the translation estimations, the rotation and translation parts of $\mathbf F_i$ are isolated as ${\mathbf F_{i,\text{rot}}}$ and ${\mathbf F_{i,\text{trans}}}$, respectively, and are represented by a single number as
% \begin{equation}
% \left\{
% \begin{aligned}
% & \text{ATE}(\rot) = 2\left(3 - \text{trace}(\mathbf{F}_{i,\text{rot}})\right) \\
% & \text{ATE}(\transl) = \left\| {\mathbf F}_{i,\text{trans}} \right\|,
% \end{aligned}
% \right.   
% \end{equation}
% where $\text{ATE}(\rot)$ describes the Frobenius norm of the rotation error, {\em i.e.}, $\|\mathbf{Q}_{i,\text{rot}} - \mathbf{P}_{i,\text{rot}}\|_{F}^2 = 2\left(3 - \text{trace}(\mathbf{F}_{i,\text{rot}})\right)$.

% To aggregate errors across the entire trajectory, we define the root-mean-square error (RMSE) of translational components:
% % Equation (5): RMSE calculation
% \begin{equation}
%     \text{RMSE}(\mathbf{F}_{1:n}) \coloneqq \left( \frac{1}{n} \sum_{i=1}^n \|(\mathbf{F}_{i,\text{trans}})\|^2 \right)^{1/2}.
%     \label{eq:rmse}
% \end{equation}


% \begin{figure}[t]
% \centering
% \includegraphics[width=0.7\textwidth]{fig/figure5.png}
% \caption{Absolute Trajectory Error (ATE) comparison. The bar plots indicate the translational and rotation ATE results of our proposed method compared with the state-of-the-art baselines on the SCARED dataset.} \label{fig5}
% \end{figure}

% {\scriptsize
% \begin{table*}[!t]
% \centering
% \setlength\tabcolsep{2pt}

% \resizebox{\textwidth}{!}{
% \begin{tabular}{|c|cc|cc|cc|cc|}
% \hline
% \multicolumn{1}{|c|}{\multirow{2}{*}{\textbf{Method}}} & \multicolumn{2}{c|}{\textbf{Sequence 1}} & \multicolumn{2}{c|}{\textbf{Sequence 2}} & \multicolumn{2}{c|}{\textbf{Sequence 3}} & \multicolumn{2}{c|}{\textbf{Sequence 4}} \\
%  & ATE ($\transl$) & ATE ($\rot$) & ATE ($\transl$) & ATE ($\rot$) & ATE ($\transl$) & ATE ($\rot$) & ATE ($\transl$) & ATE ($\rot$) \\
% \hline
% {\textbf{SLAM}} & & & & & & & & \\
% ORB-SLAM2 (M)~~\cite{mur2017orb} &0.5068 &3.0534 &0.9912 &3.0798 &9.6247 &2.8640 &4.4825 &3.0183 \\
% ORB-SLAM2 (S)~~\cite{mur2017orb} &1.3091 &0.3511 &1.3623 &3.0303 &9.7053 &3.0089 &4.3626 &2.9151 \\
% SD-DefSLAM (M)~~\cite{lamarca2018camera} &1.0036 &0.3247 &1.7788 &2.0058 &6.2579 &\underline{0.3455} &3.5653 &\underline{0.1761} \\
% BodySLAM (M)~~\cite{body} &0.4504 &\underline{0.1991} &0.4447 &0.1851 &- &- & 8.2481&0.7986 \\
% Endo-2DTAM (M)~~\cite{endodtam} &2.3290 &2.2410 &2.3187 &1.1858 &6.3792 &1.9767 &5.1394 &2.3166 \\
% \hline
% {\textbf{VO}} & & & & & & & & \\
% DPVO (M)~~\cite{dpvo} &\underline{0.3326} &0.2466 &\underline{0.3902} &\underline{0.1763} &\textbf{2.8271} &0.4607 &\underline{0.9269} &0.4405 \\
% \hline
% {\textbf{Deep Learning}} & & & & & & & & \\
% EndoDepth (M)~~\cite{endo-depth-and-motion} &1.1748 &0.3688 &1.4863 &1.7176 &9.2599 &0.8804 &4.2948 &0.3484 \\
% \hline
% {\textbf{Proposed}} & & & & & & & & \\
% RVO-MIS (M) &\textbf{0.2970} &\textbf{0.0523} &\textbf{0.3574} &\textbf{0.1302} &\underline{4.0381} &\textbf{0.1261} &\textbf{0.6822} &\textbf{0.0548} \\
% \hline
% \end{tabular}
% }
% \caption{Quantitative comparison of ATE (RMSE) on four representative sequences from the SCARED dataset. \textbf{Translation errors ($\mathcal{T}$) are reported in centimeters (cm)}, and \textbf{rotation errors ($\mathcal{R}$) are reported in degrees ($^\circ$)}. Due to the inherent scale ambiguity of monocular methods, all estimated trajectories are aligned to the ground truth using Sim3 transformation (optimizing scale, rotation and translation). Bold numbers indicate the best results, and underlined numbers indicate the second-best performances. (``M'': Monocular, ``S'': Stereo; ``-": Tracking Failure; \rev{Global Alignment})}
% \label{table1}
% \end{table*}
% }

% \begin{figure*}[t]
%     \centering
%     \setlength{\tabcolsep}{1pt}
    
%     % 
%     \footnotesize
%     \begin{tabular}{c c c c}
%         \makebox[0.24\textwidth]{Sequence 1} & 
%         \makebox[0.24\textwidth]{Sequence 2} & 
%         % 
%         \makebox[0.24\textwidth]{Sequence 3} & 
%         \makebox[0.24\textwidth]{Sequence 4} \\
%     \end{tabular}
    
%     % 
%     \raisebox{3.5em}{\rotatebox{90}{\scriptsize \textbf{ATE (Trans.)}}}%
%     \hspace{2pt}%
%     % 
%     \includegraphics[width=0.23\textwidth]{fig/11.png}\hfill
%     \includegraphics[width=0.23\textwidth]{fig/21.png}\hfill
%     % 
%     \includegraphics[width=0.23\textwidth]{fig/41.png}\hfill
%     \includegraphics[width=0.23\textwidth]{fig/51.png}
    
%     % 
%     \raisebox{3.5em}{\rotatebox{90}{\scriptsize \textbf{ATE (Rot.)}}}%
%     \hspace{2pt}%
%     % 
%     \includegraphics[width=0.23\textwidth]{fig/12.png}\hfill
%     \includegraphics[width=0.23\textwidth]{fig/22.png}\hfill
%     % 
%     \includegraphics[width=0.23\textwidth]{fig/42.png}\hfill
%     \includegraphics[width=0.23\textwidth]{fig/52.png}
    
%     \caption{Visualization of ATE \rev{of RVO-MIS (top row: ATE($\transl$); bottom row: ATE($\rot$))}.} 
%     \label{fig5}
% \end{figure*}

{\scriptsize
\begin{table*}[t]
\centering
\caption{Quantitative comparison of ATE (RMSE) using {\em global alignment} on four representative sequences from the SCARED dataset.
% \textbf{Translation errors ($\mathcal{T}$) are reported in centimeters (cm)}, and \textbf{rotation errors ($\mathcal{R}$) are reported in degrees ($^\circ$)}. 
\rev{\textbf{Bold}: best. \underline{underlined}: second best.} (``M'': Monocular, ``S'': Stereo, ``-'': Tracking Failure)}
\setlength\tabcolsep{2pt}

\resizebox{\textwidth}{!}{


\begin{tabular}{|c|cc|cc|cc|cc|}
\hline
\multicolumn{1}{|c|}{\multirow{2}{*}{\textbf{Method}}} & \multicolumn{2}{c|}{\textbf{Sequence 1}} & \multicolumn{2}{c|}{\textbf{Sequence 2}} & \multicolumn{2}{c|}{\textbf{Sequence 3}} & \multicolumn{2}{c|}{\textbf{Sequence 4}} \\

~ & ATE ($\transl$) & ATE ($\rot$) & ATE ($\transl$) & ATE ($\rot$) & ATE ($\transl$) & ATE ($\rot$) & ATE ($\transl$) & ATE ($\rot$) \\
\hline
% {\textbf{SLAM}} & & & & & & & & \\

ORB-SLAM2 (M)~~\cite{mur2017orb} &\rev{1.6543} &3.0534 &0.9912 &3.0798 &9.6247 &2.8640 &4.4825 &3.0183 \\
ORB-SLAM2 (S)~~\cite{mur2017orb} &1.3091 &0.3511 &1.3623 &3.0303 &9.7053 &3.0089 &4.3626 &2.9151 \\
SD-DefSLAM (M)~~\cite{lamarca2018camera} &1.0036 &0.3247 &1.7788 &2.0058 &6.2579 &\underline{0.3455} &3.5653 &\underline{0.1761} \\
BodySLAM (M)~~\cite{body} &0.4504 &\underline{0.1991} &0.4447 &0.1851 &- &- & 8.2481&0.7986 \\
Endo-2DTAM (M)~~\cite{endodtam} &2.3290 &2.2410 &2.3187 &1.1858 &6.3792 &1.9767 &5.1394 &2.3166 \\
% \hline
% {\textbf{VO}} & & & & & & & & \\
DPVO (M)~~\cite{dpvo} &\underline{0.3326} &0.2466 &\underline{0.3902} &\underline{0.1763} &\textbf{2.8271} &0.4607 &\underline{0.9269} &0.4405 \\
% \hline
% {\textbf{Deep Learning}} & & & & & & & & \\
EndoDepth (M)~~\cite{endo-depth-and-motion} &1.1748 &0.3688 &1.4863 &1.7176 &9.2599 &0.8804 &4.2948 &0.3484 \\
% \hline
% {\textbf{Proposed}} & & & & & & & & \\
{\bf RVO-MIS (M) (Proposed)} &\textbf{0.2970} &\textbf{0.0523} &\textbf{0.3574} &\textbf{0.1302} &\underline{4.0381} &\textbf{0.1261} &\textbf{0.6822} &\textbf{0.0548} \\
\hline
\end{tabular}
}

\label{tab:scared_GA}
\end{table*}
}
{\scriptsize
\begin{table*}[!t]
\centering
\caption{\rev{Quantitative comparison of ATE (RMSE) using {\em origin alignment} on four representative sequences from the SCARED dataset.
% \textbf{Translation errors ($\mathcal{T}$) are reported in centimeters (cm)}, and \textbf{rotation errors ($\mathcal{R}$) are reported in degrees ($^\circ$)}.
\textbf{Bold}: best. \underline{underlined}: second best.  (``M'': Monocular, ``S'': Stereo, ``-'': Tracking Failure)}}
\setlength\tabcolsep{2pt}
\vspace{-0.5em}
\resizebox{\textwidth}{!}{
\begin{tabular}{|c|cc|cc|cc|cc|}
\hline
\multicolumn{1}{|c|}{\multirow{2}{*}{\textbf{Method}}} & \multicolumn{2}{c|}{\textbf{Sequence 1}} & \multicolumn{2}{c|}{\textbf{Sequence 2}} & \multicolumn{2}{c|}{\textbf{Sequence 3}} & \multicolumn{2}{c|}{\textbf{Sequence 4}} \\

~ & ATE ($\transl$) & ATE ($\rot$) & ATE ($\transl$) & ATE ($\rot$) & ATE ($\transl$) & ATE ($\rot$) & ATE ($\transl$) & ATE ($\rot$) \\
\hline
% {\textbf{SLAM}} & & & & & & & & \\

ORB-SLAM2 (M)~~\cite{mur2017orb} &8.3241 &0.1259 &98.5070 &1.4628 & 234.2704 & 17.9475 & 128.9915 & 1.9776 \\
ORB-SLAM2 (S)~~\cite{mur2017orb} &3.4019 &0.0524 &5.9156 &0.1176 &27.8263  &0.5553 & 17.2080 & 0.2732 \\
SD-DefSLAM (M)~~\cite{lamarca2018camera} &3.1928 &\underline{0.0391} &1.3217 & \underline{0.0319} & 8.5289 & \underline{0.1610} & 3.8524 & 0.6624 \\
BodySLAM (M)~~\cite{body} &2.7290 &0.4290 &0.8442 &0.2093 &- &- &11.2834 &0.4392 \\
Endo-2DTAM (M)~~\cite{endodtam} &9.6275 &2.2411 &3.0723 &1.1861 &13.2626 &2.1547 &8.1023 &2.3157 \\
% \hline
% {\textbf{VO}} & & & & & & & & \\
DPVO (M)~~\cite{dpvo} &\underline{1.1042} &0.4124 &\underline{0.5583} &0.1252 &\textbf{3.5819} &0.5303 & \underline{2.3902} &0.5074 \\
% \hline
% {\textbf{Deep Learning}} & & & & & & & & \\
EndoDepth (M)~~\cite{endo-depth-and-motion} &3.0702 &0.0534 &1.8933 &0.0386 &10.4578 &0.2422 &6.4815 & \underline{0.1013} \\
% \hline
% {\textbf{Proposed}} & & & & & & & & \\
{\bf RVO-MIS (M) (Proposed)} &\textbf{0.7963} &\textbf{0.0179} &\textbf{0.5528} &\textbf{0.0236} & \underline{4.7195} &\textbf{0.0830} & {\bf 1.3573} & {\bf 0.0237} \\
\hline
\end{tabular}
}

\label{tab:scared_OA}
\end{table*}
}

\begin{figure*}[!t]
    \centering
    \setlength{\tabcolsep}{1pt}
    
    \footnotesize
    \begin{tabular}{c c c c}
        \makebox[0.24\textwidth]{Sequence 1} & 
        \makebox[0.24\textwidth]{Sequence 2} & 
        \makebox[0.24\textwidth]{Sequence 3} & 
        \makebox[0.24\textwidth]{Sequence 4} \\
    \end{tabular}
    
    \raisebox{3.5em}{\rotatebox{90}{\scriptsize }}
    \hspace{2pt}%
    \includegraphics[width=0.23\textwidth]{fig/81.png}\hfill
    \includegraphics[width=0.23\textwidth]{fig/82.png}\hfill
    \includegraphics[width=0.23\textwidth]{fig/83.png}\hfill
    \includegraphics[width=0.23\textwidth]{fig/84.png}    
    \caption{CDF of the translation ATE across representative sequences. The curves illustrate the cumulative distribution of per-frame errors. In sequences 1, 2, and 4 \rev{of the SCARED dataset, RVO-MIS (blue)} demonstrates superior accuracy. In sequence 3, while DPVO achieves lower overall errors, our method maintains a consistent error distribution for the majority of frames, highlighting the varying challenges posed by different surgical scenes.} 
    \label{fig6}
\end{figure*}

\noindent \rev{{\bf Quantitative Results on SCARED:}
Tables~\ref{tab:scared_GA} and~\ref{tab:scared_OA} respectively summarize the performance comparisons on the SCARED dataset through {\em global alignment}, {\em i.e.}, the geometry of the estimated trajectory is best aligned with the geometry of the ground-truth trajectory, and {\em origin alignment}, {\em i.e.}, the first frame is identical across all methods. RVO-MIS demonstrates superior performance on most of the sequences under both alignments. Although the accuracy reported in the tables is averaged over the entire sequence, the cumulative distribution functions (CDFs) of the ATE in Figure~\ref{fig6} show that the error does not exhibit sudden jumps typically associated with tracking loss. Rather, it increases gradually and in a largely monotonic fashion, reflecting slow drift accumulation rather than catastrophic failure.} This consistent robustness highlights the effectiveness of the RVO-MIS framework for MIS navigation, particularly in dynamic environments where stable tracking is critical. \rev{Additional detailed results can be found in Appendix~\ref{sec:appendix_b}.}

\rev{The inferior performance of the competing methods is analyzed as follows. {\em (i)} The smooth motion prior, relied upon as an initial guess for camera poses in ORB-SLAM2~\cite{mur2017orb} and SD-DefSLAM~\cite{lamarca2018camera}, is often violated in the MIS environment where camera motion is abrupt and erratic. Since minimizing reprojection errors is not a convex optimization process, a poor initial guess can cause convergence to a local optimum, leading to inaccurate estimation. The strong dependency on temporal coherence is also present in Endo-2DTAM~\cite{endodtam}.
% estimate camera poses by first giving an initial guess from a motion-prior, typically based on a constant motion assumption, and then refine the initial guess via minimizing the local reprojection errors. Since minimizing reprojection errors is not a convex optimization process, a poor initial guess would converge to a local optima, leading to inaccurate estimation. This is particularly common in the MIS environment where camera motion is often abrupt and erratic. 
{\em (ii)} The reliance on a strong parametric prior for modeling the deformable shape for estimating the pose in BodySLAM~\cite{body} acts like an over-constrained optimization, which is often biased especially in the unpredictable, changing MIS environment. 
% Camera pose estimation by BodySLAM~\cite{body} is tightly coupled with non-rigid shape optimization, relying on both temporal coherence and model-based constraints to stabilize tracking. The strong parametric prior for modeling the deformable shape acts like an over-constrained optimization that is often biased, especially in the unpredictable, changing MIS environment. The reliance of temporal coherence collapses when there is an abrupt camera motion, which is also the case in Endo-2DTAM~\cite{endodtam}. 
{\em (iii)} The photometric consistency assumption, primarily adopted by DPVO~\cite{dpvo} and EndoDepth~\cite{endo-depth-and-motion}, easily breaks down in the presence of significant illumination changes in MIS scenarios.}

\begin{table*}[t]
\centering
\caption{\rev{Quantitative evaluation on the Small Intestine sequence of the EndoSLAM dataset using global alignment (GA) and origin alignment (OA). \textbf{Bold}: the best. \underline{Underlined}: the second best.}}
\label{tab:endoslam_intestine}
\vspace{-0.5em}
% \renewcommand{\arraystretch}{1.3}
\setlength{\tabcolsep}{3pt}

\resizebox{\textwidth}{!}{

\begin{tabular}{|c|c|c|cc|cc|}
\hline
\multirow{2}{*}{\textbf{Method}} & \multirow{2}{*}{\textbf{Fail?}} & \multirow{2}{*}{\textbf{\makecell{Avg. Traj. \\ Compl. (\%)}}} & \multicolumn{2}{c|}{\textbf{GA-ATE (RMSE)}} & \multicolumn{2}{c|}{\textbf{OA-ATE (RMSE)}} \\
% \cline{4-7}
 & & & \textbf{$\mathcal{T}$} & \textbf{$\mathcal{R}$} & \textbf{$\mathcal{T}$} & \textbf{$\mathcal{R}$} \\
% \cline{4-7}

% --- SLAM Category ---
% \textbf{SLAM} & & & & & & \\
\hline
ORB-SLAM2~\cite{mur2017orb} & Yes & 8.67\% & 8.71 & 11.88 & 14.22 & 9.60 \\
SD-DefSLAM~\cite{lamarca2018camera} & Yes & 16.98\% & 7.45 & 5.54 & 7.27 & 3.53 \\
BodySLAM~\cite{body} & Yes & 28.82\% & 3.01 & 3.74 & 3.99 & 2.52 \\
Endo-2DTAM~\cite{endodtam} & No & 100\% & 2.41 & 2.30 & \underline{3.46} & 2.21 \\
% \hline

% --- VO Category ---
% \textbf{VO} & & & & & & \\
% \hline
DPVO~\cite{dpvo} & No & 100\% & \underline{2.25} & \underline{0.89} & 3.93 & {\bf 0.49} \\
% \hline

% --- Deep Learning Category ---
% \textbf{Deep Learning} & & & & & & \\
% \hline
EndoDepth~~\cite{endo-depth-and-motion} & Yes & 38.24\% & 4.97 & 3.31 & 5.43 & 3.94 \\
% \hline

% --- Proposed Category ---
% \textbf{Proposed} & & & & & & \\
% \hline
\textbf{RVO-MIS (Proposed)} & \textbf{No} & 100\% & \textbf{2.02} & \textbf{0.26} & \textbf{3.32} & \underline{0.73} \\
\hline
\end{tabular}
}
\end{table*}

\noindent \rev{{\bf Quantitative Results on EndoSLAM:}
Table~\ref{tab:endoslam_intestine} shows the performance of RVO-MIS compared to the existing approaches on the challenging EndoSLAM dataset. Notably, ORB-SLAM2, SD-DefSLAM, EndoDepth, and BodySLAM all fail to track the camera motion through to the end of the sequence. This is typically because poor pose estimates in some early frames, which encounter blurry images caused by drastic changes in camera motion, lead to failure in the later frames, as supported by their diminished accuracy. On the other hand, Endo-2DTAM and DPVO are designed to keep estimating camera poses even when the estimation is poor. This ensures the completion of the trajectory but with lower accuracy compared to the proposed approach.}

% Evaluations were conducted under global alignment conditions, where we used the standard evo~\cite{evo} evaluation program to specifically assess accuracy from the trajectory. This origin aligned metric provides a more clinically relevant measure of VO performance in surgical navigation compared to conventional global alignment, as it better reflects the cumulative error characteristics during continuous operation. Furthermore, we also presented the ATE visualization results of RVO-MIS under different sequences (Figure \ref{fig5}), and simultaneously plotted the ground truth trajectories aligned with the origin.

% \rev{Figure \ref{fig5} examines the estimation error of RVO-MIS along the entire trajectory via a visualization of ATE.} We applied global alignment to adhere to standard error assessment protocols. 
% In contrast, for the trajectory comparison against baseline methods (\rev{Figures \ref{fig3} and \ref{fig4}}), we utilized origin alignment. This strategy fixes a common starting pose for all methods, thereby intuitively demonstrating the reduced cumulative drift of RVO-MIS compared to the baselines. 
% Evidently, our algorithm maintains smooth and stable trajectories across all sequences. This consistent robustness highlights the effectiveness of the RVO-MIS framework for MIS navigation, particularly in dynamic environments where stable tracking is critical.

\begin{figure*}[b]
    \centering
    \setlength{\tabcolsep}{1pt}
    
    
    \footnotesize
    \begin{tabular}{c c c c}
        \makebox[0.24\textwidth]{Sequence 1} & 
        \makebox[0.24\textwidth]{Sequence 2} & 
        \makebox[0.24\textwidth]{Sequence 3} & 
        \makebox[0.24\textwidth]{Sequence 4} \\
        \vspace{-1em}
    \end{tabular}
    
    % (61, 62, 63, 64)
    \raisebox{3.5em}{\rotatebox{90}{\scriptsize \textbf{Trajectory}}}
    \hspace{2pt}%
    % 
    \includegraphics[width=0.23\textwidth]{fig/61.png}\hfill
    \includegraphics[width=0.23\textwidth]{fig/62.png}\hfill
    % 
    \includegraphics[width=0.23\textwidth]{fig/63.png}\hfill
    \includegraphics[width=0.23\textwidth]{fig/64.png}    
    \vspace{-0.5em}
    \caption{Trajectory comparison \rev{(origin alignment)} for the representative sequences in the SCARED dataset. Our proposed method (blue) closely tracks the ground truth (black, dashed), outperforming established baselines.}
    \label{fig3}
    \vspace{-0.5em}
\end{figure*}

\noindent \rev{\textbf{Qualitative Results:}} A qualitative comparison of the trajectories against the baselines on the SCARED dataset is presented in \rev{Figure~\ref{fig3}, with a clearer per-dimension view presented in Figure~\ref{fig4}. These trajectories are visualized with origin alignment, {\em i.e.}, the first pose is identical across all methods, which facilitates the observation of motion drift. From Figure~\ref{fig4}, it is clear that the proposed method outperforms the others by a wide margin, except in a few cases where DPVO is comparable to ours.}

\begin{figure*}[t!]
    \centering
    \setlength{\tabcolsep}{1pt}
    
    \footnotesize
    \begin{tabular}{c c c c}
        \makebox[0.24\textwidth]{Sequence 1} & 
        \makebox[0.24\textwidth]{Sequence 2} & 
        \makebox[0.24\textwidth]{Sequence 3} & 
        \makebox[0.24\textwidth]{Sequence 4} \\
    \end{tabular}
    
    
    \raisebox{3.5em}{\rotatebox{90}{\scriptsize \textbf{Projected Views}}}
    \hspace{2pt}
    \includegraphics[width=0.23\textwidth]{fig/91.png}\hfill
    \includegraphics[width=0.23\textwidth]{fig/92.png}\hfill
    \includegraphics[width=0.23\textwidth]{fig/93.png}\hfill
    \includegraphics[width=0.23\textwidth]{fig/94.png}    
    \vspace{-1em}
    % --- Caption ---
    \caption{\rev{Projected trajectory views ($x, y, z$ components) for the corresponding sequences in Figure~\ref{fig3}, clearly visualizing the direction and magnitude of motion drift over time.}}
    \label{fig4}
\end{figure*}

\begin{table}[b]
\centering
\caption{\rev{Ablation study of varying matching and geometric verification combinations. We report the average Inlier Ratio (\%), Average Trajectory Completion Rate (\%), and Average ATE (RMSE) using global alignment averaged across SCARED and EndoSLAM datasets.}}
\label{tab:ablation_summary}
\setlength{\tabcolsep}{3pt} 


\resizebox{\linewidth}{!}{
    \begin{tabular}{l|c|c|c|c}
    \multicolumn{1}{c|}{\textbf{Method}} & \textbf{Avg. Inlier (\%)}~$\uparrow$ & \textbf{Avg. Traj. Compl. (\%)}~$\uparrow$ & \textbf{ATE ($\mathcal{T}$)}~$\downarrow$ & \textbf{ATE ($\mathcal{R}$)}~$\downarrow$ \\
    VLFeat+RANSAC & 38.7722 & 82.6652 & 3.0613 & 0.8025 \\
    VLFeat+MSAC & 37.9260 & 82.8458 & 2.7302 & 0.3540 \\
    LightGlue+RANSAC & 52.1536 & 100.0000 & 5.3533 & 1.4494 \\
    \textbf{LightGlue+MSAC (Ours)} & \textbf{52.6790} & \textbf{100.0000} & \textbf{2.0311} & \textbf{0.3023} \\
    \end{tabular}
}
\end{table}

\noindent \rev{\textbf{Ablation Study:} To examine the effectiveness of the feature matching method and of the geometric verification scheme, an ablation study is conducted on both the SCARED and EndoSLAM datasets (Table~\ref{tab:ablation_summary}), reporting the average inlier ratio, the average trajectory completion rate, and the ATE. Evidently, LightGlue improves the inlier ratio by roughly 14 percentage points over the traditional descriptor-similarity-based method (VLFeat). This is also reflected in the inlier ratio box plot across the selected sequences of the SCARED dataset (Figure~\ref{fig:ablation_study}). Although MSAC changes the inlier ratio only marginally relative to RANSAC, it significantly improves the robustness of absolute pose estimation. Note that although VLFeat+RANSAC has better accuracy than LightGlue+RANSAC, the former is measured over shorter trajectories than the latter, since it does not complete all sequences.}

\begin{figure*}[t]
  \centering
  \includegraphics[width=0.95\linewidth]{fig/figurebox.png}
  \vspace{-0.5em} 
  \caption{\rev{Ablation study on feature matching robustness. The boxplots illustrate the distribution of inlier ratios across four chosen sequences of the SCARED dataset.
  We compare LightGlue + MSAC (red) against
  VLFeat + RANSAC (gray), LightGlue + RANSAC (blue), and VLFeat + MSAC (green).
  The results demonstrate that our proposed combination consistently achieves higher median inlier rates.}}
  \label{fig:ablation_study}
\end{figure*}

% \begin{table}[htbp]
% \centering
% \caption{\rev{Ablation study of varying matching and geometric verification combinations. We report the average Inlier Ratio (\%) and Average Trajectory Completion Rate (\%) averaged across SCARED and EndoSLAM datasets. \textbf{Bold}: the best. \underline{Underlined}: the second best.}}
% \label{tab:ablation_summary}
% \setlength{\tabcolsep}{6pt}
% \begin{tabular}{l|c|c}
% \multicolumn{1}{c|}{\textbf{Method}} & \textbf{Avg. Inlier Rate (\%)}~$\uparrow$ & \textbf{Avg. Traj. Compl. (\%)}~$\uparrow$ \\
% VLFeat+RANSAC & 38.7722 & 82.6652 \\
% LightGlue+RANSAC & \underline{52.1536} & \textbf{100.0000} \\
% VLFeat+MSAC & 37.9260 & \underline{82.8458} \\
% \textbf{LightGlue+MSAC (Ours)} & \textbf{52.6790} & \textbf{100.0000} \\
% \end{tabular}
% \label{tab:ablation_study}
% \end{table}

% For evaluation, we selected several state-of-the-art techniques for comparison. We compare our approach with SLAM methods~\cite{mur2017orb, lamarca2018camera, body, endodtam}, VO methods~\cite{dpvo} and deep learning based methods~\cite{endo-depth-and-motion}. 


% Quantitative results are summarized in Table \ref{tab:scared_GA}, while Figure \ref{fig5} visualizes the ATE. 
% To further analyze the error distribution, Figure~\ref{fig6} presents the Cumulative Distribution Function (CDF) of the translation ATE, demonstrating the superior accuracy of our method across most frames. Table \ref{tab:scared_GA} presents the RMSE ATE of translation and ATE of rotation for our method compared to state-of-the-art approaches across four representative sequences. 

\noindent \textbf{Run Time:} \rev{RVO-MIS} was implemented in Python to avoid cross-language interoperability overhead and leverage parallel computing capabilities. Experiments were performed on a high-performance server with dual AMD EPYC 9354 32-Core Processors (4 CPU cores were used for this study) and a single NVIDIA RTX A6000 GPU (48\,GB VRAM). \rev{In a runtime breakdown experiment on} a total of 197 frames, feature matching accounted for 181.06\,s ($\approx 0.92$\,s per frame), while pose estimation accounted for 73.15\,s ($\approx 0.37$\,s per frame). In contrast, triangulation and pose refinement remained computationally efficient, requiring only 0.54\,s and 0.69\,s in total, respectively. This distribution indicates that while the transition to a pure Python environment with multi-CPU acceleration has streamlined the pipeline, feature matching remains the primary bottleneck, prompting further research into lightweight feature matchers and code optimization to reduce latency without compromising robustness.

\section{Conclusions}
RVO-MIS addresses the challenges of MIS environments by \rev{integrating SIFT with LightGlue for reliable feature correspondences and adopting an absolute-pose-first VO framework within a robust MSAC loop.}
% integrating deep learning-based feature extraction with MSAC and P\rev{3}P pose estimation. 
This robust combination achieves state-of-the-art performance \rev{in terms of accuracy and trajectory completion rate on both the SCARED and EndoSLAM datasets. This indicates that reliance on temporal motion consistency and photometric consistency is not entirely practical in MIS scenarios.} Notably, it requires significantly fewer computational resources than pure deep learning models, offering \rev{an alternative solution} for next-generation surgical navigation.

\section{Future Work}
Since DPVO~\cite{dpvo} achieved the lowest ATE ($\mathcal{T}$) in Sequence 3, we will investigate the underlying causes in future work. To further refine our system, we propose three key improvements: implementing three-view feature matching for scale-aware estimation~\cite{yuan2017dissecting}; integrating advanced matchers such as LoFTR~\cite{loftr} to \rev{further enhance estimation precision; and investigating the role of bundle adjustment~\cite{saha2025basedbundleadjustingsurgicalendoscopic} and its effectiveness in mitigating motion drift.}

\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{The authors gratefully acknowledge the support provided by the University of Arizona Graduate College through the Research and Project (ReaP) Grant.}



%\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version





% \midlacknowledgments{We thank a bunch of people.}




\bibliography{midl26_34}

\clearpage

\appendix

\section{Algorithm Details}
\label{sec:appendix_a}


\begin{algorithm2e}[h]
\caption{RVO-MIS Pipeline}
\DontPrintSemicolon
\LinesNumbered


\SetKwInput{KwHyp}{Hyperparameters} 

\KwIn{Sequence of video frames $I_{0}, \dots, I_{N}$, Camera Intrinsic Matrix $K$}


\KwHyp{\rev{\\
\quad Inlier Threshold $\tau_{inlier} = 2$ px \\
\quad Max Iterations $N_{iter} = 3000$ \\
\quad Feature Ratio $\rho_{rank} = 0.8$ (Top-k selection) \\
\quad Keyframe Gap $\tau_{gap} = 15$ frames \\
\quad Covisibility Ratio $\tau_{cov} = 0.55$}}

\KwOut{Camera Trajectory $\{(\rot_i, \transl_i)\}$, 3D Map $\mathcal{M}$}

\vspace{0.2cm}
\tcp{Initialization Phase}
Extract SIFT features for $I_0$ and $I_1$ \rev{(Select top $\rho_{rank}$)}\;
Match features using LightGlue\;
Estimate relative pose and triangulate initial 3D points to form map $\mathcal{M}$ \rev{(using $\tau_{inlier}$)}\;
Set Keyframe $I_{kf} \leftarrow I_1$\;

\vspace{0.2cm}
\tcp{Tracking Phase}
\For{$i \leftarrow 2$ \KwTo $N$}{
    Extract SIFT features for current frame $I_i$\;
    Match features between $I_i$ and $I_{kf}$ using LightGlue\;
    Identify 2D-3D correspondences (co-visible points) based on $\mathcal{M}$\;
    
    \tcp{Absolute Pose Estimation}
    Estimate initial pose $(\rot_i, \transl_i)$ using MSAC \rev{($N_{iter}$ iterations, $\tau_{inlier}$ threshold)}\;
    Refine $(\rot_i, \transl_i)$ by minimizing the energy function $E(\rot_i,\transl_i)$ (Eq.~\ref{eq:energy_function}) using Levenberg-Marquardt\;
    
    \tcp{Map Management}
    $N_{cov} \leftarrow$ ratio of co-visible 3D landmarks\;
    $N_{gap} \leftarrow$ frame distance from $I_{kf}$\;
    
    \If{$N_{cov} < \rev{\tau_{cov}}$ \textbf{or} $N_{gap} > \rev{\tau_{gap}}$}{
        Compute relative pose $(\rot_{kc}, \transl_{kc})$ from the current and keyframe absolute poses (Eq.~\ref{eq:ap2rp})\;
        Triangulate new 2D matches satisfying epipolar constraints \rev{($< \tau_{inlier}$)}\;
        Transform new points to world coordinates and add to $\mathcal{M}$\;
        Update Keyframe $I_{kf} \leftarrow I_i$\;
    }
}
\end{algorithm2e}

\clearpage




\appendix
\renewcommand{\thesection}{B} 
\section{Detailed Quantitative Results}
\label{sec:appendix_b}

\setcounter{table}{0}
\renewcommand{\thetable}{B\arabic{table}}


\begin{table*}[!htbp] 
    \centering
    \renewcommand{\arraystretch}{1.35} 
    \setlength{\tabcolsep}{3.5pt}

    % ------------------- Table B1 -------------------
    \caption{Detailed comparison of ATE on \textbf{Sequence 1} with {\em Global Alignment}. (``M'' represents monocular method, ``S'' represents stereo method. Best results are \textbf{bold}, second best are \underline{underlined}.)}
    \label{tab:seq1_ate}
    
    \resizebox{\textwidth}{!}{
        \begin{tabular}{|c|cccc|cccc|}
        \hline
        \multicolumn{1}{|c|}{\multirow{2}{*}{\textbf{Method}}} & \multicolumn{4}{c|}{\textbf{ATE} ($\mathcal{T}$)} & \multicolumn{4}{c|}{\textbf{ATE} ($\mathcal{R}$)} \\
         & max & avg & min & RMSE & max & avg & min & RMSE \\
        \hline
        ORB-SLAM2 (M)~\cite{mur2017orb} & 2.7664 & 1.4517 & 0.5062 & 1.6543 & 3.1294 & 3.0534 & 2.9731 & 3.0534 \\
        ORB-SLAM2 (S)~\cite{mur2017orb} & 2.4405 & 1.2036 & 0.1553 & 1.3091 & 0.3855 & 0.3502 & 0.3048 & 0.3511 \\
        SD-DefSLAM (M)~\cite{lamarca2018camera} & 2.5968 & 0.9146 & 0.1828 & 1.0036 & 0.3957 & 0.3736 & 0.3479 & 0.3247 \\
        BodySLAM (M)~\cite{body} & 0.9876 & 0.4107 & 0.0894 & 0.4504 & \underline{0.3605} & \underline{0.1786} & \textbf{0.0201} & \underline{0.1991} \\
        Endo-2DTAM (M)~\cite{endodtam} & 5.1425 & 2.2075 & 1.0403 & 2.3290 & 3.1376 & 2.4347 & 0.3213 & 2.2410 \\
        DPVO (M)~\cite{dpvo} & \underline{0.7202} & \underline{0.3033} & \textbf{0.0185} & \underline{0.3326} & 0.3853 & 0.2242 & 0.0438 & 0.2466 \\
        EndoDepth (M)~\cite{endo-depth-and-motion} & 2.3421 & 1.0514 & 0.3591 & 1.1748 & 0.4253 & 0.3682 & 0.3264 & 0.3688 \\
        RVO-MIS (M) & \textbf{0.5670} & \textbf{0.2813} & \underline{0.0522} & \textbf{0.2970} & \textbf{0.0614} & \textbf{0.0522} & \underline{0.0397} & \textbf{0.0523} \\
        \hline
        \end{tabular}
    }

    \vspace{2em}

    % ------------------- Table B2 -------------------
    \caption{Detailed comparison of ATE on \textbf{Sequence 2} with {\em Global Alignment}. (``M'' represents monocular method, ``S'' represents stereo method. Best results are \textbf{bold}, second best are \underline{underlined}.)}
\label{tab:seq2_ate}

\resizebox{\textwidth}{!}{
    \begin{tabular}{|c|cccc|cccc|}
    \hline
    \multicolumn{1}{|c|}{\multirow{2}{*}{\textbf{Method}}} & \multicolumn{4}{c|}{\textbf{ATE} ($\mathcal{T}$)} & \multicolumn{4}{c|}{\textbf{ATE} ($\mathcal{R}$)} \\
     & max & avg & min & RMSE & max & avg & min & RMSE \\
    \hline
    ORB-SLAM2 (M)~\cite{mur2017orb} & 2.1473 & 0.8472 & 0.4015 & 0.9912 & 3.1304 & 3.0805 & 3.0183 & 3.0798 \\
    ORB-SLAM2 (S)~\cite{mur2017orb} & 2.5160 & 1.2091 & 0.0669 & 1.3623 & 3.1203 & 3.0298 & 2.9297 & 3.0303 \\
    SD-DefSLAM (M)~\cite{lamarca2018camera} & 3.0879 & 1.6691 & 0.4306 & 1.7788 & 2.0754 & 2.0056 & 1.9468 & 2.0058 \\
    BodySLAM (M)~\cite{body} & \textbf{0.9069} & 0.3935 & \textbf{0.0200} & 0.4447 & 0.2878 & 0.1749 & \textbf{0.0796} & 0.1851 \\
    Endo-2DTAM (M)~\cite{endodtam} & 5.5302 & 2.1425 & 0.3685 & 2.3187 & 3.1264 & 1.4555 & 0.7135 & 1.1858 \\
    DPVO (M)~\cite{dpvo} & 1.0712 & \underline{0.3390} & \underline{0.0415} & \underline{0.3902} & \underline{0.2489} & \underline{0.1723} & 0.1058 & \underline{0.1763} \\
    EndoDepth (M)~\cite{endo-depth-and-motion} & 2.8358 & 1.3288 & 0.1519 & 1.4863 & 1.7598 & 1.7174 & 1.6732 & 1.7176 \\
    RVO-MIS (M) & \underline{0.9167} & \textbf{0.3090} & 0.0644 & \textbf{0.3574} & \textbf{0.1636} & \textbf{0.1285} & \underline{0.0865} & \textbf{0.1302} \\
    \hline
    \end{tabular}
}
\end{table*}

\clearpage

 %========================================================================================
% SEQUENCE 3
% =========================================================================================
\begin{table*}[!htbp]
    \centering
    \renewcommand{\arraystretch}{1.35} 
    \setlength{\tabcolsep}{3.5pt}      

    % =========================================================================================
    % SEQUENCE 3
    % =========================================================================================
    \caption{Detailed comparison of ATE on \textbf{Sequence 3} with {\em Global Alignment}. (``M'' represents monocular method, ``S'' represents stereo method. Best results are \textbf{bold}, second best are \underline{underlined}.)}
    \label{tab:seq3_ate}
    
    \resizebox{\textwidth}{!}{
    \begin{tabular}{|c|cccc|cccc|}
    \hline
    \multicolumn{1}{|c|}{\multirow{2}{*}{\textbf{Method}}} & \multicolumn{4}{c|}{\textbf{ATE} ($\mathcal{T}$)} & \multicolumn{4}{c|}{\textbf{ATE} ($\mathcal{R}$)} \\
     & max & avg & min & RMSE & max & avg & min & RMSE \\
     \hline
    ORB-SLAM2 (M)~\cite{mur2017orb} & 21.9279 & 8.5117 & 1.2194 & 9.6247 & 3.1362 & 2.8618 & 2.6213 & 2.8640 \\
    ORB-SLAM2 (S)~\cite{mur2017orb} & 28.0701 & 8.5828 & 4.0676 & 9.7053 & 3.1416 & 3.0071 & 2.6978 & 3.0089 \\
    SD-DefSLAM (M)~\cite{lamarca2018camera} & \underline{15.9008} & 5.5097 & 0.7359 & 6.2579 & \underline{0.5240} & 0.3597 & 0.1703 & \underline{0.3455} \\
    BodySLAM (M)~\cite{body} & - & - & - & - & - & - & - & - \\
    
    Endo-2DTAM (M)~\cite{endodtam} & 30.2704 & 7.1538 & 2.1716 & 6.3792 & 3.1406 & 1.8262 & 0.1627 & 1.9767 \\
    DPVO (M)~\cite{dpvo} & \textbf{7.0309} & \textbf{2.3949} & \underline{0.1415} & \textbf{2.8271} & 1.1005 & \underline{0.3471} & \textbf{0.0477} & 0.4607 \\
    EndoDepth (M)~\cite{endo-depth-and-motion} & 31.2774 & 7.3084 & 0.4616 & 9.2599 & 0.9811 & 0.8782 & 0.6674 & 0.8804 \\
    RVO-MIS (M) & 18.2302 & \underline{2.8213} & \textbf{0.1165} & \underline{4.0381} & \textbf{0.2317} & \textbf{0.1246} & \underline{0.0743} & \textbf{0.1261} \\
    \hline
    \end{tabular}
}

    \vspace{2em} 

    % =========================================================================================
    % SEQUENCE 4
    % =========================================================================================
    \caption{Detailed comparison of ATE on \textbf{Sequence 4} with {\em Global Alignment}. (``M'' represents monocular method, ``S'' represents stereo method. Best results are \textbf{bold}, second best are \underline{underlined}.)}
    \label{tab:seq4_ate}
    
    \resizebox{\textwidth}{!}{
    \begin{tabular}{|c|cccc|cccc|}
    \hline
    \multicolumn{1}{|c|}{\multirow{2}{*}{\textbf{Method}}} & \multicolumn{4}{c|}{\textbf{ATE} ($\mathcal{T}$)} & \multicolumn{4}{c|}{\textbf{ATE} ($\mathcal{R}$)} \\
     & max & avg & min & RMSE & max & avg & min & RMSE \\
    \hline
    ORB-SLAM2 (M)~\cite{mur2017orb} & 10.8781 & 3.6578 & 0.7318 & 4.4825 & 3.1384 & 3.0179 & 2.9487 & 3.0183 \\
    ORB-SLAM2 (S)~\cite{mur2017orb} & 7.8057 & 3.8878 & 0.5753 & 4.3626 & 3.1405 & 2.9113 & 2.6823 & 2.9151 \\
    SD-DefSLAM (M)~\cite{lamarca2018camera} & 8.9583 & 2.9963 & 0.3520 & 3.5653 & \underline{0.3344} & \underline{0.1665} & 0.0593 & \underline{0.1761} \\
    
    BodySLAM (M)~\cite{body} & 15.1282 & 7.8329 & 4.2494 & 8.2481 & 1.1025 & 0.7804 & 0.5897 & 0.7986 \\
    Endo-2DTAM (M)~\cite{endodtam} & 17.0963 & 5.0704 & 1.4731 & 5.1394 & 3.1411 & 2.6069 & 1.0978 & 2.3166 \\
    DPVO (M)~\cite{dpvo} & \underline{1.7126} & \underline{0.8578} & \underline{0.1315} & \underline{0.9269} & 0.8780 & 0.3292 & \textbf{0.0317} & 0.4405 \\
    EndoDepth (M)~\cite{endo-depth-and-motion} & 8.9253 & 3.8852 & 0.6026 & 4.2948 & 0.4404 & 0.3432 & 0.2228 & 0.3484 \\
    RVO-MIS (M) & \textbf{1.4768} & \textbf{0.6184} & \textbf{0.0224} & \textbf{0.6822} & \textbf{0.0778} & \textbf{0.0541} & \underline{0.0382} & \textbf{0.0548} \\
    \hline
    \end{tabular}
}
\end{table*}























\renewcommand{\thetable}{B\arabic{table}}


\begin{table*}[!htbp] 
    \centering
    \renewcommand{\arraystretch}{1.35} 
    \setlength{\tabcolsep}{3.5pt}

    % ------------------- Table B5 -------------------
    \caption{Detailed comparison of ATE on \textbf{Sequence 1} with {\em Origin Alignment}. (``M'' represents monocular method, ``S'' represents stereo method. Best results are \textbf{bold}, second best are \underline{underlined}.)}
    \label{tab:seq1_ateo}
    
    

    \resizebox{\textwidth}{!}{
        \begin{tabular}{|c|cccc|cccc|}
        \hline
        \multicolumn{1}{|c|}{\multirow{2}{*}{\textbf{Method}}} & \multicolumn{4}{c|}{\textbf{ATE} ($\mathcal{T}$)} & \multicolumn{4}{c|}{\textbf{ATE} ($\mathcal{R}$)} \\
         & max & avg & min & RMSE & max & avg & min & RMSE \\
        \hline
        ORB-SLAM2 (M)~\cite{mur2017orb} & 10.5358 & 7.8183 & \textbf{0.0000} & 8.3241 & 0.1714 & 0.1167 & \textbf{0.0000} & 0.1259 \\
        ORB-SLAM2 (S)~\cite{mur2017orb} & 3.7592 & 3.3326 & \textbf{0.0000} & 3.4019 & 0.0692 & 0.0493 & \textbf{0.0000} & 0.0524 \\
        SD-DefSLAM (M)~\cite{lamarca2018camera} & 4.4132 & 3.0186 & \textbf{0.0000} & 3.1928 & \underline{0.0581} & \underline{0.0375} & \underline{0.0004} & \underline{0.0391} \\
        BodySLAM (M)~\cite{body} & 5.2943 & 2.2945 & \textbf{0.0000} & 2.7290 & 0.6214 & 0.3839 & \underline{0.0004} & 0.4290 \\
        Endo-2DTAM (M)~\cite{endodtam} & 15.8324 & 8.8650 & \textbf{0.0000} & 9.6275 & 3.0717 & 2.1193 & \underline{0.0004} & 2.2411 \\
        DPVO (M)~\cite{dpvo} & \underline{2.0840} & \underline{0.9160} & \textbf{0.0000} & \underline{1.1042} & 0.5907 & 0.3704 & \underline{0.0004} & 0.4124 \\
        EndoDepth (M)~\cite{endo-depth-and-motion} & 6.3598 & 2.7282 & \textbf{0.0000} & 3.0702 & 0.1119 & 0.0476 & \underline{0.0004} & 0.0534 \\
        RVO-MIS (M) & \textbf{1.3970} & \textbf{0.7528} & \textbf{0.0000} & \textbf{0.7963} & \textbf{0.0284} & \textbf{0.0172} & \textbf{0.0000} & \textbf{0.0179} \\
        \hline
        \end{tabular}
    }

    \vspace{2em} 

    % ------------------- Table B6 -------------------
    \caption{Detailed comparison of ATE on \textbf{Sequence 2} with {\em Origin Alignment}. (``M'' represents monocular method, ``S'' represents stereo method. Best results are \textbf{bold}, second best are \underline{underlined}.)}
\label{tab:seq2_ateo}

\resizebox{\textwidth}{!}{
    \begin{tabular}{|c|cccc|cccc|}
    \hline
    \multicolumn{1}{|c|}{\multirow{2}{*}{\textbf{Method}}} & \multicolumn{4}{c|}{\textbf{ATE} ($\mathcal{T}$)} & \multicolumn{4}{c|}{\textbf{ATE} ($\mathcal{R}$)} \\
     & max & avg & min & RMSE & max & avg & min & RMSE \\
    \hline
    ORB-SLAM2 (M)~\cite{mur2017orb} & 189.8000 & 79.4784 & \textbf{0.0000} & 98.5070 & 2.6856 & 1.2635 & \textbf{0.0000} & 1.4628 \\
    ORB-SLAM2 (S)~\cite{mur2017orb} & 17.2767 & 4.2766 & \textbf{0.0000} & 5.9156 & 0.3030 & 0.0859 & \textbf{0.0000} & 0.1176 \\
    SD-DefSLAM (M)~\cite{lamarca2018camera} & 10.5063 & 0.7037 & \textbf{0.0000} & 1.3217 & 0.1687 & \textbf{0.0182} & \textbf{0.0000} & \underline{0.0319} \\
    BodySLAM (M)~\cite{body} & 1.8654 & 0.7496 & \textbf{0.0000} & 0.8442 & 0.3314 & 0.1824 & \underline{0.0015} & 0.2093 \\
    Endo-2DTAM (M)~\cite{endodtam} & 5.0698 & 2.8493 & \textbf{0.0000} & 3.0723 & 3.1312 & 0.6630 & \underline{0.0015} & 1.1861 \\
    DPVO (M)~\cite{dpvo} & \underline{1.2917} & \textbf{0.5071} & \textbf{0.0000} & \underline{0.5583} & 0.2033 & 0.1134 & \underline{0.0015} & 0.1252 \\
    EndoDepth (M)~\cite{endo-depth-and-motion} & 3.3008 & 1.7438 & \textbf{0.0000} & 1.8933 & \underline{0.0733} & 0.0345 & \underline{0.0015} & 0.0386 \\
    RVO-MIS (M) & \textbf{1.2284} & \underline{0.5072} & \textbf{0.0000} & \textbf{0.5528} & \textbf{0.0419} & \underline{0.0202} & \textbf{0.0000} & \textbf{0.0236} \\
    \hline
    \end{tabular}
}
\end{table*}

\clearpage

 %========================================================================================
% SEQUENCE 3
% =========================================================================================
\begin{table*}[!htbp]
    \centering
    \renewcommand{\arraystretch}{1.35} 
    \setlength{\tabcolsep}{3.5pt}      

    % =========================================================================================
    % SEQUENCE 3
    % =========================================================================================
    \caption{Detailed comparison of ATE on \textbf{Sequence 3} with {\em Origin Alignment}. (``M'' represents monocular method, ``S'' represents stereo method. Best results are \textbf{bold}, second best are \underline{underlined}.)}
    \label{tab:seq3_ateo}
    


\resizebox{\textwidth}{!}{
    \begin{tabular}{|c|cccc|cccc|}
    \hline
    \multicolumn{1}{|c|}{\multirow{2}{*}{\textbf{Method}}} & \multicolumn{4}{c|}{\textbf{ATE} ($\mathcal{T}$)} & \multicolumn{4}{c|}{\textbf{ATE} ($\mathcal{R}$)} \\
     & max & avg & min & RMSE & max & avg & min & RMSE \\
    \hline
    ORB-SLAM2 (M)~\cite{mur2017orb} & 425.3198 & 198.6521 & \textbf{0.0000} & 234.2704 & 31.5492 & 15.2104 & \underline{0.0017} & 17.9475 \\
    ORB-SLAM2 (S)~\cite{mur2017orb} & 51.5513 & 23.1523 & \textbf{0.0000} & 27.8263 & 1.2640 & 0.4475 & \underline{0.0017} & 0.5553 \\
    SD-DefSLAM (M)~\cite{lamarca2018camera} & \underline{16.5036} & 7.7565 & \textbf{0.0000} & 8.5289 & \textbf{0.3199} & \underline{0.1463} & \underline{0.0017} & \underline{0.1610} \\
    BodySLAM (M)~\cite{body} & - & - & - & - & - & - & - & - \\
    Endo-2DTAM (M)~\cite{endodtam} & 38.1482 & 11.1089 & \textbf{0.0000} & 13.2626 & 3.1412 & 2.0323 & \underline{0.0017} & 2.1547 \\
    DPVO (M)~\cite{dpvo} & \textbf{9.7135} & \underline{2.7100} & \textbf{0.0000} & \textbf{3.5819} & 1.2508 & 0.4057 & \underline{0.0017} & 0.5303 \\
    EndoDepth (M)~\cite{endo-depth-and-motion} & 36.6869 & 7.8943 & \textbf{0.0000} & 10.4578 & 0.7332 & 0.1893 & \underline{0.0017} & 0.2422 \\
    RVO-MIS (M) & 22.1006 & \textbf{2.6989} & \textbf{0.0000} & \underline{4.7195} & \underline{0.3386} & \textbf{0.0522} & \textbf{0.0000} & \textbf{0.0830} \\
    \hline
    \end{tabular}
}

    \vspace{2em} 

    % =========================================================================================
    % SEQUENCE 4
    % =========================================================================================
    \caption{Detailed comparison of ATE on \textbf{Sequence 4} with {\em Origin Alignment}. (``M'' represents monocular method, ``S'' represents stereo method. Best results are \textbf{bold}, second best are \underline{underlined}.)}
    \label{tab:seq4_ateo}
    



\resizebox{\textwidth}{!}{
    \begin{tabular}{|c|cccc|cccc|}
    \hline
    \multicolumn{1}{|c|}{\multirow{2}{*}{\textbf{Method}}} & \multicolumn{4}{c|}{\textbf{ATE} ($\mathcal{T}$)} & \multicolumn{4}{c|}{\textbf{ATE} ($\mathcal{R}$)} \\
     & max & avg & min & RMSE & max & avg & min & RMSE \\
    \hline
    ORB-SLAM2 (M)~\cite{mur2017orb} & 210.3456 & 103.1932 & \textbf{0.0000} & 128.9915 & 3.1415 & 1.5821 & \underline{0.0011} & 1.9776 \\
    ORB-SLAM2 (S)~\cite{mur2017orb} & 28.5123 & 13.7664 & \textbf{0.0000} & 17.2080 & 0.4512 & 0.2185 & \underline{0.0011} & 0.2732 \\
    SD-DefSLAM (M)~\cite{lamarca2018camera} & 6.1234 & 3.0819 & \textbf{0.0000} & 3.8524 & 1.1023 & 0.5299 & \underline{0.0011} & 0.6624 \\
    BodySLAM (M)~\cite{body} & 24.1587 & 9.9346 & \textbf{0.0000} & 11.2834 & 0.7063 & 0.3807 & \underline{0.0011} & 0.4392 \\
    Endo-2DTAM (M)~\cite{endodtam} & 20.9071 & 7.1370 & \textbf{0.0000} & 8.1023 & 3.1410 & 2.1662 & \underline{0.0011} & 2.3157 \\
    DPVO (M)~\cite{dpvo} & \underline{4.1829} & \underline{2.1652} & \textbf{0.0000} & \underline{2.3902} & 0.9751 & 0.3931 & \underline{0.0011} & 0.5074 \\
    EndoDepth (M)~\cite{endo-depth-and-motion} & 10.9638 & 5.8307 & \textbf{0.0000} & 6.4815 & \underline{0.1662} & \underline{0.0930} & \underline{0.0011} & \underline{0.1013} \\
    RVO-MIS (M) & \textbf{2.1564} & \textbf{1.3026} & \textbf{0.0000} & \textbf{1.3573} & \textbf{0.0512} & \textbf{0.0218} & \textbf{0.0000} & \textbf{0.0237} \\
    \hline
    \end{tabular}
}

\end{table*}



% This is a boring technical proof of
% \begin{equation}\label{eq:example}
% \cos^2\theta + \sin^2\theta \equiv 1.
% \end{equation}

% \section{Proof of Theorem 2}

% This is a complete version of a proof sketched in the main text.

\end{document}
