\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{graphicx,verbatim}
\usepackage{graphicx}
\usepackage{color}
\usepackage{url}
\usepackage{booktabs}
% \usepackage{ulem}
\usepackage{amsmath}
\usepackage{multirow}
\usepackage{xcolor,colortbl}
\definecolor{lavender}{gray}{0.9}
\usepackage{soul}
%\usepackage{tabularx}
\usepackage{bbding}
\usepackage{graphicx,verbatim}
\usepackage{amssymb}
\usepackage{bbm}
\usepackage[normalem]{ulem} 
\definecolor{lavender}{gray}{0.9}
\usepackage[dvipsnames]{xcolor}
% \newcommand{\xxx}[1]{\textcolor{magenta}{#1}}
% \newcommand{\xx}[1]{\textcolor{teal}{#1}}


\usepackage{amsmath} % amsmath for advanced math features (duplicate of the load above; harmless)
\jmlrvolume{-- nnn}
\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\editors{Accepted for publication at MIDL 2026}


\title[EndoStreamDepth]{EndoStreamDepth: Temporally Consistent Monocular Depth Estimation for Endoscopic Video Streams}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Hao Li\midljointauthortext{Corresponding author}}\Email{hao.li.1@vanderbilt.edu}\\
\Name{Daiwei Lu} \Email{daiwei.lu@vanderbilt.edu}\\
\Name{Jiacheng Wang} \Email{jiacheng.wang.1@vanderbilt.edu}\\
\Name{Robert J. Webster}\nametag{ III}
\Email{robert.webster@Vanderbilt.edu}\\
\Name{Ipek Oguz} \Email{ipek.oguz@vanderbilt.edu}\\
\addr Vanderbilt University
}

\begin{document}

\maketitle

\begin{abstract}
This work presents \textbf{EndoStreamDepth}, a monocular depth estimation framework for endoscopic video streams. It provides accurate depth maps with sharp anatomical boundaries for each frame, temporally consistent predictions across frames, and real-time throughput. Unlike prior work that uses batched inputs, EndoStreamDepth processes individual frames with a temporal module to propagate inter-frame information. 
% which are essential for reliable perception in autonomous medical robots.
The framework contains three main components: (1) a single-frame depth network with endoscopy-specific transformation to produce accurate depth maps,
 (2) multi-level Mamba temporal modules that leverage inter-frame information to improve accuracy and stabilize predictions, and (3) a hierarchical design with comprehensive multi-scale supervision, where complementary loss terms jointly improve local boundary sharpness and global geometric consistency. 
We conduct comprehensive evaluations on two publicly available colonoscopy depth estimation datasets, with quantitative results reported on phantom and simulated data that provide ground truth depth. Compared to state-of-the-art monocular depth estimation methods, EndoStreamDepth  substantially improves performance, and it produces depth maps with sharp, anatomically aligned boundaries, which are essential to support downstream tasks such as   automation for robotic surgery. The code is publicly available at \url{https://github.com/MedICL-VU/EndoStreamDepth}.
\end{abstract}


\begin{keywords}
Video depth estimation, temporal modeling, state-space models, self-supervised regularization, multi-level supervision
\end{keywords}



\input{texts/1introduction}
\input{texts/2methods}

\input{texts/3experiments}

\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{Research reported in this publication was supported by the Advanced Research Projects Agency for Health
(ARPA-H) under Award Number D24AC00415-00. The ARPA-H award provided 90\% of total costs with an
award total of up to \$11,935,038. The content is solely the responsibility of the authors and does not necessarily
represent the official views of ARPA-H. This work was also supported in part by the National Institutes of Health (R21DK133742) and Vanderbilt Institute for Surgery
and Engineering (VISE) Seed Grant. Daiwei Lu is supported by NIH F31DK143735-01.}


\bibliography{midl26_131}

\clearpage
\appendix

% \setcounter{figure}{0}% Reset figure counter
%   \let\oldthefigure\thefigure% Capture figure numbering scheme
%   \renewcommand{\thefigure}{A\oldthefigure}% Prefix figure number with A

%   \setcounter{table}{0}% Reset figure counter
%   \let\oldthetable\thetable% Capture figure numbering scheme
%   \renewcommand{\thetable}{A\oldthetable}% Prefix figure number with A

\section{EST Illustration}
\label{EST_illustration}

\begin{figure}[t]
\centering
\includegraphics[width=\linewidth]{figures/EST.png}
 \caption{The proposed EST for endoscopic depth estimation.}
\label{EST}
\end{figure}



Fig.~\ref{EST} shows qualitative examples of the individual image transformations used in the proposed EST pipeline. The first panel shows the input frame. The next three panels apply geometric transformations (random $90^\circ$ rotation, horizontal flip, vertical flip), which mimic camera roll and viewpoint changes that commonly occur during endoscopic procedures. The remaining panels show photometric perturbations, including Gaussian blur, auto-contrast, motion blur, median blur, random gamma, defocus, random fog, and random brightness/contrast. These photometric perturbations approximate typical appearance changes in real procedures, such as variations in exposure, illumination falloff, camera defocus, motion-induced blur, and occlusions from smoke or fog. During training, these transformations are sampled stochastically and applied to the RGB frame (and corresponding depth and masks), so that the depth network is exposed to a wide range of endoscopy-specific appearance variations.


We avoid other geometric warps, such as affine transformations, since the required interpolation on depth maps can introduce artifacts and corrupt the ground truth.



\input{tables/split_table_1}

\input{tables/split_table_split2}
\section{C3VD Dataset Splits}
\label{Split}
The dataset splits used in our C3VD experiments are summarized in
Tab.~\ref{split1} and Tab.~\ref{split2}, which report the train/test
configurations for split~1 and split~2, respectively. Split~1 is primarily
used for method development and for validating cross-structure performance,
whereas split~2 follows the benchmarking of prior work \cite{paruchuri2024leveraging} and evaluates the generalizability of the methods.



\clearpage


\section{Evaluation Metrics}
\label{Evaluation metric}
Let $D \in \mathbb{R}^{H \times W}$ and $\hat{D} \in \mathbb{R}^{H \times W}$ denote the ground truth and predicted depth maps, respectively. Let $\Omega$ be the set of valid pixels and $N = |\Omega|$. For a pixel index $i \in \Omega$, we write $D_i$ and $\hat{D}_i$ for the corresponding depth values.

\paragraph{Pixelwise depth errors.}
We use the following standard depth metrics:
\begin{align}
\text{AbsRel} &= \frac{1}{N} \sum_{i \in \Omega} \frac{|D_i - \hat{D}_i|}{D_i}, \\
\text{SqRel}  &= \frac{1}{N} \sum_{i \in \Omega} \frac{(D_i - \hat{D}_i)^2}{D_i}, \\
\text{RMSE}   &= \sqrt{\frac{1}{N} \sum_{i \in \Omega} (D_i - \hat{D}_i)^2}, \\
\text{RMSE}_{\log} &= \sqrt{\frac{1}{N} \sum_{i \in \Omega} \bigl(\log D_i - \log \hat{D}_i\bigr)^2}, \\
\text{L1}     &= \frac{1}{N} \sum_{i \in \Omega} |D_i - \hat{D}_i|.
\end{align}

\paragraph{Threshold accuracy.}
The threshold accuracy $\delta_1$ with factor $1.25$ is defined as
\begin{equation}
\delta_1
= \frac{1}{N} \sum_{i \in \Omega}
\mathbf{1} \!\left[
\max\!\left(\frac{D_i}{\hat{D}_i}, \frac{\hat{D}_i}{D_i}\right) < 1.25
\right],
\end{equation}
which measures the percentage of pixels whose predicted depth is within a factor of $1.25$ of the ground-truth depth.





\paragraph{Boundary F1 score.}
We follow the recent depth estimation work \cite{Bochkovskii2024} to evaluate boundary sharpness. Let
$M_i \in \{0,1\}$ be a validity mask (we set $M_i = 1$ only where $D_i$ is
valid and $D_i > 0$). We consider 4-connected neighbor pairs
\[
\mathcal{N} = \bigl\{(i,j)\;|\; i \text{ and } j \text{ are horizontal or vertical neighbors, } M_i M_j = 1 \bigr\}.
\]

For a ratio threshold $t>1$, a neighbor pair $(i,j)\in\mathcal{N}$ is marked as
a depth boundary in the ground truth if the two depths differ by more than a
factor $t$:
\begin{equation}
B_t(i,j) =
\begin{cases}
1, & \text{if } \displaystyle \max\!\left(\frac{D_i}{D_j}, \frac{D_j}{D_i}\right) > t,\\[4pt]
0, & \text{otherwise,}
\end{cases}
\end{equation}
and we define $\hat{B}_t(i,j)$ analogously using $\hat{D}$.

The precision and recall of predicted boundaries at threshold $t$ are
\begin{equation}
P_t = \frac{\sum_{(i,j)\in\mathcal{N}} B_t(i,j)\,\hat{B}_t(i,j)}
           {\sum_{(i,j)\in\mathcal{N}} \hat{B}_t(i,j)},
\qquad
R_t = \frac{\sum_{(i,j)\in\mathcal{N}} B_t(i,j)\,\hat{B}_t(i,j)}
           {\sum_{(i,j)\in\mathcal{N}} B_t(i,j)},
\end{equation}
and the boundary F1 score is
\begin{equation}
\mathrm{F1}(t) = \frac{2 P_t R_t}{P_t + R_t}.
\end{equation}

We obtain a scale-invariant boundary score by aggregating over multiple
thresholds. Let $\{t_k\}_{k=1}^K$ be $K$ thresholds uniformly spaced in
$[t_{\min}, t_{\max}]$ and let $w_k$ be weights proportional to the threshold:
\begin{equation}
\mathrm{F1}_{\mathrm{SI}}
= \sum_{k=1}^K w_k\, \mathrm{F1}(t_k),
\qquad
w_k = \frac{t_k}{\sum_{\ell=1}^K t_\ell}.
\end{equation}
In our experiments we use $t_{\min} = 1.05$, $t_{\max} = 1.15$, and $K = 10$. Unlike \cite{Bochkovskii2024}, we do not extend the threshold range up to 1.25, because endoscopic scenes are dominated by smooth, low-texture surfaces where depth discontinuities are often relatively small. Larger ratios, such as 1.25, would not produce enough edge information. Fig.~\ref{sharp depth map} shows edge maps with $t=1.01$.


\paragraph{Frame variance.}
For each video sequence, we measure temporal variance of the predicted metric
scale. For each frame, we compute the optimal global scale factor
$s_t \in \mathbb{R}$ that best aligns $\hat{D}_t$ to $D_t$ in the least-squares
sense:
\begin{equation}
s_t
= \arg\min_{s} \sum_{i \in \Omega_t} \bigl(s \,\hat{D}_t(i) - D_t(i)\bigr)^2
= \frac{\sum_{i \in \Omega_t} \hat{D}_t(i)\, D_t(i)}
       {\sum_{i \in \Omega_t} \hat{D}_t(i)^2 + \varepsilon},
\end{equation}
with a small $\varepsilon$ to avoid division by zero.
Given the sequence of scale factors $\{s_t\}_{t=1}^T$, we define the frame
consistency score as the standard deviation of these scales:
\begin{equation}
\sigma
= \sqrt{\frac{1}{T} \sum_{t=1}^{T} \bigl(s_t - \bar{s}\bigr)^2},
\qquad
\bar{s} = \frac{1}{T} \sum_{t=1}^{T} s_t.
\end{equation}
Lower values of $\sigma$ indicate a more temporally stable metric
scale (less frame-to-frame flicker).




\clearpage

\section{Statistical Significance Analysis}
Tab.~\ref{tab:stats} reports the video-wise $\mathrm{mean} \pm \mathrm{std.}$ across the 9 test sequences, and we additionally conduct paired t-tests for statistical significance. The proposed EndoStreamDepth achieves statistically significant improvements over both baselines on 5 of 7 metrics: AbsRel ($p<0.001$), SqRel ($p<0.001$), RMSE ($p<0.05$), RMSE log ($p<0.001$), and L1 ($p<0.05$). In addition, our method shows lower variance on key metrics (AbsRel, RMSE, L1), indicating more robust performance across different test sequences.


\begin{table}[t]
\centering
\caption{Video-wise statistical evaluation on the C3VD split 1 test set. \textbf{Bold} indicates the best mean result. $^{*}$: $p<0.05$, $^{***}$: $p<0.001$ (paired t-test vs.\ both baselines).}
\label{tab:stats}
\small
\begin{tabular}{l|l|l|l}
\toprule
Metrics & Metric DAv2 + EST & FlashDepth + SiLog + EST & EndoStreamDepth \\
\midrule
$\delta_1\uparrow$      & 0.948$\pm$0.052 & \textbf{0.952}$\pm$0.046 & \textbf{0.952}$\pm$0.047 \\

AbsRel$\downarrow$      & 0.109$\pm$0.025 & 0.109$\pm$0.021 & \textbf{0.085}$\pm$0.021$^{***}$ \\

SqRel$\downarrow$       & 0.402$\pm$0.135 & 0.395$\pm$0.100 & \textbf{0.246}$\pm$0.109$^{***}$ \\
RMSE$\downarrow$        & 3.081$\pm$0.528 & 3.023$\pm$0.552 & \textbf{2.739}$\pm$0.442$^{*}$ \\

RMSE log$\downarrow$    & 0.122$\pm$0.025 & 0.122$\pm$0.021 & \textbf{0.107}$\pm$0.023$^{***}$ \\

L1$\downarrow$          & 1.928$\pm$0.362 & 1.872$\pm$0.303 & \textbf{1.780}$\pm$0.300$^{*}$ \\

F1$\uparrow$            & 0.114$\pm$0.045 & 0.134$\pm$0.069 & \textbf{0.143}$\pm$0.059 \\
\bottomrule
\end{tabular}
\end{table}




\section{Qualitative Temporal Stability}

Fig.~\ref{temporal_results} compares two ablations from Tab.~\ref{main_table}: ``+ Edge loss'' (without temporal modules) and ``+ Temporal reg.'' (with the multi-level temporal module and self-supervised temporal regularization). Compared to ``+ Edge loss'', ``+ Temporal reg.'' produces more temporally consistent depth predictions and noticeably reduces frame-to-frame flickering, especially under rapid camera motion reversals (red arrows).

\paragraph{Video-level quantitative.} To complement the qualitative comparison, we report metrics on the same video (2b) shown in Fig.~\ref{temporal_results} using the evaluation protocol in Tab.~\ref{main_table}. Adding temporal regularization improves $\delta_1$ from 0.984 to 0.986 (+0.20\%), reduces AbsRel from 0.661 to 0.653 ($-$1.21\%), and reduces RMSE from 2.65\,mm to 2.59\,mm ($-$2.26\%). The frame variance also decreases from 0.00209 to 0.00127 ($-$39.23\%), further supporting improved temporal stability.


\paragraph{Temporal module choice.} We adopt a Mamba-based streaming temporal module to support real-time, per-frame inference with a persistent state. This design follows FlashDepth, which reports supplementary comparisons among lightweight temporal alternatives and finds that vanilla Mamba is sufficient in most cases. Building on this established streaming backbone, our contributions focus on endoscopy-specific robustness (EST), multi-level temporal integration, and comprehensive supervision to further improve the performance and reduce flickering.



\begin{figure}[t]
\centering
\includegraphics[width=\linewidth]{figures/temporal.pdf}
\caption{
Temporal qualitative comparison between the two ablations in Tab.~\ref{main_table}. ``+ Edge loss'' refers to the ``+ Edge loss'' row (edge loss included) in Tab.~\ref{main_table}, i.e., without the multi-level temporal modules or temporal regularization. Temporally inconsistent predictions are marked by blue arrows. Red arrows indicate the camera motion, moving forward and then backward.
}
\label{temporal_results}
\end{figure}



\clearpage

\section{Sharp Depth Map}
Fig.~\ref{sharp depth map} shows the qualitative edge maps derived from ground truth and our predicted depth
maps.

% \xx{To qualitatively assess boundary sharpness, we visualize edge maps derived from both the ground-truth and predicted depth. Following the boundary F1 metric in Appendix \ref{Evaluation metric}, we extract depth edges by identifying neighboring pixel pairs whose depth ratio exceeds a threshold $t$. For visualization, we set $t=1.01$ to highlight fine-grained depth discontinuities that correspond to subtle anatomical structures in endoscopic scenes.}

% \xx{Fig.~\ref{sharp depth map} shows representative examples from the C3VD test set. The extracted edge maps (green) are overlaid on the corresponding RGB frames for visual comparison. To avoid spurious edges caused by the circular endoscopic field of view, we erode the valid depth mask using a 3-pixel kernel before edge extraction. As shown in Fig.~\ref{sharp depth map}, edges from our predictions closely match those from the ground truth, indicating that EndoStreamDepth preserves sharp anatomical boundaries, including in low-texture regions such as smooth mucosal folds. This property is important for downstream clinical applications where accurate depth discontinuities support reliable 3D reconstruction and instrument--tissue distance estimation. The consistent edge alignment across frames (corresponding to Fig.~\ref{qualitative} and Fig.~\ref{temporal_results}) further supports the effectiveness of our comprehensive supervision in producing geometrically consistent depth predictions.}



\begin{figure}[t]
\centering
\includegraphics[width=0.83\linewidth]{figures/sharp_depth.pdf}
\caption{
Edge maps (green), derived from the ground-truth and our predicted depth maps, overlaid on the corresponding frames. The left side bar indicates the frames in Fig.~\ref{qualitative} and Fig.~\ref{temporal_results}. The edge maps suggest that our method can capture even slight changes in low-texture regions of the scene.
The valid mask was eroded with a 3-pixel kernel before edge extraction to exclude the field-of-view border. The edge maps are derived with $t=1.01$. The details of the edge maps are in Appendix~\ref{Evaluation metric}.}
\label{sharp depth map}
\end{figure}


\clearpage

\section{Runtime Analysis}
\label{runtime}
\begin{table}[t]
\centering
\caption{Runtime and memory usage versus video length. The inference speed remains nearly constant across different video lengths.}
\label{tab:runtime_memory}
\small
\begin{tabular}{c|c|c|c}
\toprule
Video Length (frames) & Per-Frame Latency (ms) & FPS & Peak GPU Memory \\
\midrule
100  & 40.2 & 24.9 & 2.90\,GB \\
500  & 40.0 & 25.0 & 4.50\,GB \\
1000 & 40.3 & 24.8 & 6.50\,GB \\
3000 & 40.7 & 24.6 & 14.50\,GB \\
\bottomrule
\end{tabular}
\end{table}

Tab.~\ref{tab:runtime_memory} reports latency and memory benchmarks at $518\times518$ resolution on a single NVIDIA RTX A6000 GPU. We use bfloat16 mixed precision and \texttt{torch.compile} for efficient PyTorch inference. We skip the first 5 frames for warmup and measure steady-state performance on the remaining frames. We additionally benchmark up to 3000 frames, exceeding the maximum sequence length in our dataset (700 frames, see Tab.~\ref{split1}) to assess scalability.

\paragraph{Constant inference speed.} Per-frame latency remains $\sim$40\,ms ($\sim$25 FPS) across video lengths, meeting real-time requirements ($\geq$20 FPS) and indicating low computational overhead of the Mamba-based streaming temporal module.

\paragraph{Predictable memory scaling.} Peak GPU memory increases approximately linearly with video length. In our setting, it consists of a fixed base cost (model weights and buffers) plus an incremental cost of $\approx$4\,MB per frame for the temporal state, enabling straightforward memory budgeting for long sequences.




\section{Ablation Setting}
\label{blation setting}

\input{tables/ablation_setting}
Tab.~\ref{tab:ablation_settings} lists the settings of the ablation experiments whose results are reported in Tab.~\ref{main_table}.

\end{document}





