\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} 
\usepackage{xcolor}

\usepackage{amsmath}
\usepackage{caption}  % put this in your preamble
\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\jmlrvolume{-- nnn}
\editors{Accepted for publication at MIDL 2026}

\title[Geometry-Aware Depth-Guided Polyp Size Estimation]{Geometry-Aware Depth-Guided Explainable Multimodal Polyp Size Estimation:
A Fusion Model Beyond RGB}


 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{
\Name{Krispian Lawrence\nametag{$^{1}$}} \Email{krispian@mit.edu}\\
\Name{Usha Goparaju\nametag{$^{1}$}} \Email{usha.goparaju@screenwithpeek.com}\\
\Name{Luis Lamb\nametag{$^{2}$}} \Email{luislamb@alum.mit.edu}\\
\addr $^{1}$ Equitable Technologies, Cambridge, MA, USA \\
\addr $^{2}$ Catholic Institute of Technology, Cambridge, MA, USA
}

\begin{document}

\maketitle

\begin{abstract}
Accurately estimating the physical size of colorectal polyps from monocular endoscopy is difficult due to scale ambiguity, viewpoint distortions, and strong inter-patient variability. We introduce MPSE, a geometry-aware, depth-guided multimodal framework that jointly leverages RGB appearance, monocular depth cues, and interpretable geometry descriptors to produce reliable and clinically calibrated size estimates. Central to MPSE is a geometry-as-query fusion block that selectively attends to depth and RGB features, and a Scale Consistency Block (SCB) that models agreement between 2D footprint–derived and 3D depth–derived cues, reducing size bias under severe distribution imbalance. The model is trained with a primary regression objective supported by an auxiliary threshold-based classification loss that stabilizes predictions near clinically important cutoffs. On our clinical dataset, MPSE achieves a mean absolute error of 0.93\,mm and a polyp-level F1 score of 0.87 at the clinically critical 5\,mm threshold, demonstrating accurate and clinically reliable size estimation in endoscopy.

\end{abstract}

\begin{keywords}
polyp size estimation, depth fusion, geometry-awareness, multimodal learning, endoscopy
\end{keywords}


% =========================================================
% PAPER BODY STARTS HERE   SUBHEADS ONLY
% =========================================================
\section{Introduction}
Colonoscopy is central to preventing colorectal cancer (CRC), which remains a major global health burden with nearly two million new cases and high mortality each year~\cite{sung2021global}. The early detection and removal of precancerous polyps substantially reduce CRC incidence and death rates~\cite{nusko2000precancer,intissar2025detecting}. Among polyp characteristics assessed during endoscopy, \emph{polyp size} is one of the strongest predictors of malignant potential. Large-scale investigations, such as that of \cite{nusko2000precancer}, show that polyps $\leq$5\,mm have negligible malignant risk, whereas lesions 26--35\,mm are malignant in 42.4\% of cases and those larger than 35\,mm reach malignancy rates of 75\%. Consequently, contemporary guidelines (e.g., ESGE and USMSTF) rely on size thresholds at 5\,mm and 10\,mm to determine resection technique and surveillance intervals~\cite{esge2024guideline,usmstf2020guideline}. Accurate size estimation is also essential for implementing resect-and-discard and diagnose-and-leave strategies~\cite{jeon2025review}. However, conventional visual estimation during colonoscopy suffers from substantial interobserver variability and lack of standardization, leading to frequent overestimation or underestimation of lesion size~\cite{song2025polyp}. Endoscopic magnification, optical distortion, inconsistent use of reference tools, shrinkage of resected tissue, and piecemeal excision further compound these inaccuracies.

Recent advances in image-based measurement systems such as Virtual Scale Endoscopy (VSE) and laser-based virtual rulers have demonstrated improved reproducibility compared to traditional visual estimation~\cite{jeon2025review}. Yet, these technologies require specialized hardware and are not widely available across clinical settings. Parallel progress in artificial intelligence (AI) has transformed endoscopic detection, classification, and segmentation~\cite{dhali2025artificial,10930199}, but polyp \emph{size estimation} remains comparatively underexplored. Estimating 3D dimensions from a single 2D image is an inherently ill-posed problem: polyps with vastly different true diameters may appear visually similar depending on camera--polyp distance, as shown in recent depth-based studies~\cite{hwang2021unsupervised,liu2025efdepth,seeindepth2025}. Emerging datasets such as Polyp-Size~\cite{song2025polyp} highlight the magnitude of measurement inconsistencies even among experts, with mean relative errors reaching up to 77\% in clinical practice~\cite{AtalaiaMartins2019}. These limitations show the need for robust, reproducible, and clinically aligned AI-driven methods for accurate polyp size estimation using standard endoscopic video. This paper is organized as follows. Section~2 reviews related work. Section~3 presents the proposed methodology. Section~4 describes the dataset and experimental setup. Section~5 reports and discusses the results. Section~6 concludes the study and outlines future directions.


\section{Related Work}\label{Related}

Recent advances in computational endoscopy have significantly advanced tasks such as polyp \textit{detection}, \textit{segmentation}, and \textit{histology prediction}, largely driven by deep learning. However, \textit{polyp size estimation}---despite its critical role in risk stratification and surveillance interval decisions---remains underexplored. Among the relatively few studies that address size estimation, a major unresolved challenge lies in the definition of ``ground truth.'' Reported size labels are typically derived from visually estimated measurements by endoscopists~\cite{wang2024real}, scale referencing using biopsy forceps or virtual tools~\cite{Shimoda2022}, post-resection pathology (which suffers from shrinkage and deformation), or 3D measurements from CT colonography~\cite{yee2001ct}. Each of these sources introduces systematic uncertainty due to scale ambiguity, tissue deformation, or lack of direct metric correspondence, creating inherent noise in both training labels and evaluation baselines.

Traditional approaches to size estimation have relied on tactile references or vision-based overlays. Early systems such as Virtual Scale Endoscopy (VSE) incorporated calibrated forceps or laser rulers to infer polyp size during colonoscopy~\cite{nakatani2007three, yoshioka2021real}, offering improved accuracy over visual estimation but depending heavily on ideal pose, fixed distances, and planar assumptions. Shimoda et al.~\cite{shimoda2022prospective} and Djinbachian et al.~\cite{djinbachian2023randomized} demonstrated that VSE can reduce misclassification at clinical thresholds (e.g., 5 or 10\,mm), but these systems still struggle with morphologically irregular polyps or distorted views. More recent evaluations~\cite{minakata2024prospective} confirm reduced inter-observer variability using such tools, but highlight limitations in image-only processing and lack of structural priors. Critically, none of these works incorporate cross-frame temporal consistency, multi-view geometry, or multimodal integration of tool and scene context---factors essential for robust measurement in dynamic or anatomically complex scenes. These limitations motivate the shift toward learning-based approaches that can encode semantic, geometric, and spatial features jointly.

Deep learning-based methods have recently begun addressing size estimation, both in colorectal and esophageal endoscopy.
Du et al.~\cite{10635184} leveraged monocular metric depth estimation and 3D reconstruction to infer polyp size, demonstrating the value of depth-aware representations but without explicit lesion geometry modeling or cross-cue consistency. Ruano et al.~\cite{10635358} used a shape-from-shading model to estimate polyp size from a single image, but relied on restrictive illumination assumptions and lacked temporal or multimodal integration.
Abdelrahim et al.~\cite{abdelrahim2022computer} combined structure-from-motion with CNNs to classify polyps into binary size categories, achieving 85.2\% accuracy and outperforming expert endoscopists, but relied on small, constrained video datasets. Kwak et al.~\cite{kwak2022ai} introduced a W-Net regression model showing strong agreement with size labels (CCC = 0.961), yet their method lacked depth inference, temporal reasoning, or uncertainty quantification. More clinically oriented systems such as ENDOANGEL-CPS~\cite{wang2024endoangel} demonstrated real-time performance with 89.9\% accuracy and reduced inappropriate surveillance recommendations, but functioned as black-box predictors without leveraging procedural or spatial context. In parallel, esophageal variceal studies~\cite{fang2024vr, mao2024ai} used VR-assisted or AI-guided measurements to reduce evaluation time and improve outcome prediction, but similarly failed to account for shape deformation, pressure-induced size changes, or vessel morphology---factors critical for pathophysiological relevance. Despite these advances, no existing system explicitly fuses RGB visual appearance, segmentation-derived geometry, and depth or spatial priors within a unified, trainable architecture. Our work addresses this gap by \emph{proposing a multimodal fusion framework that integrates temporal visual features, geometric cues, and tool-aware context to deliver anatomically grounded and metrically meaningful polyp size estimation}.



% Recent efforts have explored \textit{machine learning (ML)} and \textit{deep learning (DL)} methods to estimate polyp size directly from imaging data. Some studies treat the task as a \textit{regression problem}, predicting size in millimeters from colonoscopy frames using CNNs~\cite{wang2024real}, while others cast it as a \textit{classification task} e.g., <5\,mm vs $\geq$5\,mm using handcrafted features or deep models~\cite{itoh2018depth,krenzer2025transformer}. These methods typically operate on RGB frames from colonoscopy or capsule endoscopy and leverage texture, shape, or global appearance features. While they offer flexibility over rigid geometric assumptions, they suffer from \textit{noisy supervision}, \textit{small datasets}, and lack of metric cues. Polyp size in monocular endoscopy is inherently entangled with \textit{scope distance}, and RGB-only models cannot resolve this without auxiliary depth or geometric information. Moreover, most published models do not incorporate \textit{segmentation-derived shape priors}, \textit{explicit depth estimation}, or \textit{physics-based scale modeling}. As a result, current ML/DL methods though promising still leave the \textit{core size–distance ambiguity unresolved}. To the best of our knowledge, no existing model jointly fuses \textit{RGB appearance}, \textit{segmentation geometry}, and \textit{depth cues} in a unified framework for polyp size estimation.



% \begin{figure}[t]
%     \centering
%     % trim = left bottom right top   (in this exact order!)
%     % You want: top = 20, bottom = 160
%     \includegraphics[width=\linewidth, trim=0 170 0 20, clip]{figures/bd_2.png}
%     \caption{
%         \caption{Overview of MPSE. The model extracts dense RGB and pseudo-metric depth features from the polyp region and combines them with analytic geometry descriptors computed from the segmentation mask. A geometry-as-query fusion block directs cross-modal attention between tokens, followed by a Scale Consistency Block (SCB) . The resulting geometry-aware representation is used for millimetre-level regression with an auxiliary threshold-classification loss for training-time stabilization.}

%     \label{fig:bd_MISE}
% \end{figure}
\begin{figure}[t]
    \centering
    % trim = left bottom right top   (in this exact order!)
    % You want: top = 20, bottom = 160
    \includegraphics[width=\linewidth, trim=0 170 0 20, clip]{figures/bd_2.png}
    \caption{
        Overview of MPSE. RGB, depth, and analytic geometry descriptors are fused via a geometry-as-query attention block, followed by a Scale Consistency Block (SCB).
    }
    \label{fig:bd_MISE}
\end{figure}



\section{Method: MPSE---Multimodal Polyp Size Estimation}

Estimating the physical size of a colorectal polyp from a single endoscopic frame is intrinsically ill-posed: supervision is weak---a single scalar diameter in millimeters---while the cues that govern size perception are structurally heterogeneous. Accurate estimation simultaneously depends on the lesion’s two-dimensional footprint, its three-dimensional protrusion and local curvature, and the photometric and viewpoint factors that shape its appearance. Crucially, no single modality is sufficient on its own. MPSE coordinates these complementary but inconsistent signals through a unified representation. To this end, MPSE derives analytic morphology from the segmentation mask, extracts dense RGB and monocular-depth evidence over the lesion surface, and fuses them through a \textbf{geometry-driven transformer} in which an explicit geometry token guides cross-modal attention. A subsequent \textbf{Scale Consistency Block (SCB)} reconciles 2D and 3D size cues, correcting the systematic underestimation of large lesions observed in clinical practice and prior learning-based approaches.

%                                       
\subsection{Multimodal Cue Extraction}
%                                       

Each frame provides an RGB image $I$, from which a pretrained segmentation network such as Polyp-PVT~\cite{Dong2023} extracts a binary mask $M$ delineating the polyp. Beyond defining spatial support, $M$ enables computation of a compact set of analytic geometry descriptors $g$ that summarise the lesion’s 2D morphology. We obtain (i) pixel area and equivalent diameter, (ii) axis-aligned aspect ratios capturing elongation, (iii) circularity from contour moments, and (iv) a Laplacian-based boundary-sharpness score. These descriptors arise from deterministic operations (connected components, contour geometry, and moment estimation), yielding invariance to rotation, mild affine distortions, and moderate illumination changes. Crucially, they are not heuristic handcrafted features: they encode the same structural cues endoscopists use when visually estimating size and remain stable under appearance degradations such as blur, specularities, and mucosal texture.

However, as shown in Fig.~\ref{fig:bd_MISE}, two lesions with nearly identical 2D footprints may differ substantially in true physical size due to changes in protrusion height or camera--lesion distance, information not recoverable from $M$ alone. To supply these missing 3D cues, we estimate a pseudo-metric depth map $D$ using \textsc{EndoDAC}~\cite{Cui2024EndoDAC}, an endoscopy-specific monocular depth model whose photogeometric priors better capture relative curvature, protrusion, and coarse viewing distance compared with generic depth networks.

From the RGB frame $I$ and depth map $D$, modality-specific encoders $F_{\mathrm{rgb}}$ and $F_{\mathrm{depth}}$ (both ResNet-18 backbones) extract dense feature maps, which are projected into $d$-dimensional tokens through linear projections $\Pi_{\mathrm{rgb}}$ and $\Pi_{\mathrm{depth}}$. The RGB and depth token sets are thus defined compactly as
\[
T_{\mathrm{rgb}} = \Pi_{\mathrm{rgb}}(\mathrm{flatten}(F_{\mathrm{rgb}}(I))), \qquad
T_{\mathrm{depth}} = \Pi_{\mathrm{depth}}(\mathrm{flatten}(F_{\mathrm{depth}}(D))),
\]
yielding $T_{\mathrm{rgb}},T_{\mathrm{depth}} \in \mathbb{R}^{N \times d}$ with $N = H'W'$, where $H' \times W'$ is the spatial resolution of the encoder feature maps and $d$ is the token embedding dimension.

The analytic geometry vector $g \in \mathbb{R}^{F}$ (area, equivalent diameter, circularity, aspect ratios, boundary sharpness) is mapped into the same $d$-dimensional space using a two-layer MLP,
\begin{equation}
t_{\mathrm{geom}} = W_2\,\sigma(W_1 g + b_1) + b_2,
\label{eq:geom_token}
\end{equation}
producing a single geometry token aligned with the RGB and depth embeddings.

% ---------------------------------------------------------
\subsection{Polyp-Region Tokenization}
% ---------------------------------------------------------

Colonoscopy frames contain extensive background mucosa whose appearance varies widely across patients and illumination conditions but contributes no meaningful information for estimating physical size. Allowing these regions to generate transformer tokens would inflate the attention space, introduce modality-incoherent noise, and weaken optimization dynamics. MPSE therefore tokenizes only the \emph{polyp surface}. The segmentation mask $M$ is downsampled to the spatial resolution of the encoder outputs so that it is aligned with the feature maps $F_{\mathrm{rgb}}(I)$ and $F_{\mathrm{depth}}(D)$. From the foreground coordinates, we uniformly sample $N$ positions that capture the lesion footprint. At each selected location $(u,v)$, RGB and depth descriptors are extracted from the corresponding feature tensors and projected into embedding space, yielding synchronized token sets.

% ---------------------------------------------------------
\subsection{MPSE Fusion Block}
% ---------------------------------------------------------

The central hypothesis behind MPSE is that \emph{global geometry should guide multimodal fusion}. Geometry provides footprint shape, scale priors, and coarse depth statistics, whereas RGB and depth tokens provide dense but highly local evidence. Treating modalities symmetrically forces the model to infer global structure from local cues alone, a setting that empirically produces unstable fusion and biased size estimates. MPSE therefore assigns the geometry token a privileged role: it is the \emph{sole query} in the first attention stage, enabling explicitly directed global-to-local reasoning.

\medskip
\noindent\textbf{(1) Geometry-as-Query Attention.}
The geometry token attends over all RGB and depth tokens to extract the local evidence most consistent with the global morphological prior:
\begin{equation}
\boxed{
\tilde{t}_{\mathrm{geom}}
= \mathrm{MHAttn}
\big(
Q = t_{\mathrm{geom}},\;
K,V = [T_{\mathrm{rgb}}, T_{\mathrm{depth}}]
\big)
}
\label{eq:geom_attn}
\end{equation}
This design substantially stabilizes optimization and reduces fusion ambiguity compared to RGB- or depth-driven query streams.

\medskip
\noindent\textbf{(2) RGB--Depth Cross-Modal Interaction.}
RGB and depth tokens are then refined through a shared multi-head self-attention layer, allowing shading-based appearance cues and curvature-based depth cues to mutually correct and reinforce one another. This bidirectional alignment mitigates cases where one modality is noisy or visually ambiguous.

\medskip
\noindent\textbf{(3) Joint Transformer Encoding.}
Finally, the updated geometry token is prepended to the refined RGB/depth tokens and passed through $L=2$ lightweight Transformer encoder layers. The output token at position $0$, corresponding to the geometry slot, acts as the fused representation $z_{\mathrm{fused}}$, summarizing appearance, morphology, and 3D topology in a geometry-aware manner.

\medskip
This three-stage pipeline yields a stable and interpretable multimodal embedding: geometry determines \emph{where} to attend, RGB and depth reconcile local inconsistencies, and the joint encoder produces a coherent representation suitable for scale-consistent regression.

% ---------------------------------------------------------
\subsection{Scale Consistency Block}
% ---------------------------------------------------------

Even with strong multimodal fusion, 2D footprint cues and 3D depth cues may disagree, particularly for flat or large lesions or under oblique viewing angles. These conflicts drive the network toward mid-range predictions and cause systematic underestimation in the clinically important $>5$\,mm and $>10$\,mm ranges. The Scale Consistency Block (SCB) makes this disagreement explicit by computing two internal size proxies. A \emph{2D footprint estimate} $s_{2D}$ is obtained by applying a small MLP to the geometry token (Eq.~\ref{eq:geom_token}), which encodes area, equivalent diameter, circularity, and bounding-box ratios---i.e., the size implied purely by boundary morphology. A \emph{3D protrusion estimate} $s_{3D}$ is obtained from depth tokens using attention pooling, which aggregates local curvature, protrusion height, and coarse camera--lesion distance cues into a single scalar. Their difference $\Delta s = s_{2D} - s_{3D}$ quantifies the inconsistency between modalities; large values indicate cases where monocular appearance and pseudo-metric depth disagree. Empirically, $\Delta s$ correlates with prediction variance and serves as a surrogate indicator of epistemic uncertainty.

To calibrate the fused representation accordingly, SCB concatenates the multimodal embedding $z_{\mathrm{fused}}$ with $(s_{2D}, s_{3D}, \Delta s)$ and feeds the result through a two-layer residual MLP:
\begin{equation}
z_{\mathrm{cons}}
=
z_{\mathrm{fused}}
+
f_{\mathrm{SCB}}([z_{\mathrm{fused}};\, s_{2D};\, s_{3D};\, \Delta s]).
\label{eq:scb}
\end{equation}
This residual structure allows SCB to preserve $z_{\mathrm{fused}}$ when cues are consistent, while learning corrective adjustments when 2D and 3D signals diverge. The calibrated representation $z_{\mathrm{cons}}$ is finally mapped to a millimetre-level prediction via a regression head.


\subsection{Prediction and Training Objective}
The calibrated representation $z_{\mathrm{cons}}$ is mapped to a millimetre-level estimate through a lightweight regression head,
\[
\hat{y} = f_{\mathrm{reg}}(z_{\mathrm{cons}}).
\]
A parallel auxiliary classifier predicts clinically relevant size bins (e.g., $<5$\,mm, $5$--$9$\,mm, $\geq10$\,mm), providing an ordinal signal that stabilizes learning near these thresholds. Although this multi-task formulation improves boundary sensitivity, the classifier serves purely as a training-time regularizer: its outputs are discarded during inference, and MAE remains the primary evaluation metric. Classification metrics such as F1 are reported only to enable fair comparison with prior threshold-based systems. To obtain a single estimate per polyp, frame-level predictions across the video are aggregated using the 75th-percentile ($p_{75}$) statistic, which emphasizes frames with clearer visibility and more reliable depth cues. This strategy reduces two inherent ambiguities of monocular endoscopy: systematic underestimation of large lesions and prediction instability around the 5\,mm diagnostic threshold.

% \begin{table}[!t]
% \centering
% \caption{\textbf{Performance comparison between prior work, our baseline, and MPSE.}}
% \label{tab:comp_three_methods}
% \small 
% \begin{tabular}{lccccc}
% \toprule
% \textbf{Method} & \textbf{Accuracy} & \textbf{Recall} & \textbf{Precision} & \textbf{F1} & \textbf{AUROC} \\
% \midrule
% Song et al. 
% & $\sim$0.65 & $\sim$0.65 & $\sim$0.67 & $\sim$0.61 & $\sim$0.69 \\
% Our - Baseline 
% & 0.738 & 0.938 & 0.600 & 0.732 & 0.933 \\
% \textbf{MPSE} 
% & \textbf{0.857} & \textbf{0.9375} & \textbf{0.800} & \textbf{0.857} & \textbf{0.945}* \\
% \bottomrule
% \end{tabular}
% \label{naturepaper_comp}
% \end{table}

% Table 1 (Version 2) — exact content
\begin{table}[t]
\centering
\caption{Performance comparison between prior work, our baseline, and MPSE.}
\label{tab:table1}
\label{tab1}% alias of tab:table1, moved out of the tabular where it created an empty row
\begin{tabular}{lccccc}
\hline
Method & Accuracy & Recall & Precision & F1 & AUROC \\
\hline
Song et al.~\cite{song2025polyp} & $\sim$0.65 & $\sim$0.65 & $\sim$0.67 & $\sim$0.61 & $\sim$0.69 \\
Our baseline & 0.738 & 0.938 & 0.600 & 0.732 & 0.933 \\
MPSE & 0.857 & 0.9375 & 0.800 & 0.857 & 0.945* \\
\hline
\end{tabular}
\end{table}



% When the optional classifier is used, a cross-entropy term provides auxiliary supervision around the 5\,mm and 10\,mm decision boundaries. Importantly, MPSE requires no oracle signals at inference: segmentation masks, geometry descriptors, depth maps, and fused tokens are all derived automatically from the input frame. The full pipeline runs in a single forward pass, enabling efficient and deployment-ready size estimation.



% \subsection{MultiModal Data Preparation}
% Each RGB frame $I$ is processed by a pretrained polyp segmentation model to obtain a binary mask $M$, which serves as a reliable spatial prior for isolating lesion boundaries from surrounding mucosa. From $M$, we derive a set of analytic geometry descriptors $g$ that quantify the lesion’s two-dimensional morphology, including area, equivalent diameter, bounding-box ratios, circularity, and blur-based sharpness measures. These descriptors are not merely handcrafted heuristics; rather, they encode clinically meaningful shape cues that are known to correlate with pathological severity and are robust to illumination, texture variation, and camera motion. Importantly, geometry captures aspects of the polyp that cannot be inferred solely from RGB intensity such as how compact, elongated, or optically distorted the lesion appears providing an interpretable structural signal even when visual features are unstable. The descriptor vector is embedded into a compact latent representation via
% \begin{equation}
%     z^{\mathrm{geo}} = \phi(g),
%     \label{eq:geoembed}
% \end{equation}
% where $\phi$ is a lightweight MLP designed to preserve inter-descriptor relationships while reducing dimensionality.

% In parallel, we compute a dense depth map using a monocular depth estimator tailored for endoscopic imagery,
% \begin{equation}
%     D = \mathrm{Depth}(I),
%     \label{eq:depth}
% \end{equation}
% which infers pseudo-metric surface geometry by exploiting shading, reflectance, and learned anatomical priors. Depth complements geometry by capturing information unavailable in the mask alone: protrusion height, wall curvature, and camera–lesion distance. These cues are essential for disambiguating lesions with similar pixel footprints but different true physical sizes. For instance, a flat polyp viewed close to the camera may appear as large as a protruded polyp viewed from farther away; depth provides the missing third dimension that resolves this ambiguity. Thus, the triplet $(I, z^{\mathrm{geo}}, D)$ provides three structurally distinct but synergistic perspectives on the lesion appearance, shape, and 3D topology forming the foundation of our tri-modal reasoning pipeline.


% \subsection{Modality-Specific Encoders and Cross-Modal Fusion}
% RGB and depth frames are passed through modality-specific encoders to obtain multi-scale feature maps that capture complementary aspects of the scene: RGB provides fine-grained texture and boundary information, while depth encodes protrusion, curvature, and camera--lesion distance. However, these modalities do not contribute uniformly across frames. For example, depth may be reliable when the colon wall exhibits strong shading gradients but unstable during rapid camera motion, whereas geometry-derived cues are informative when segmentation is accurate but ambiguous under specular highlights. To reconcile these modality-specific failure modes, MPSE adopts a cross-modal fusion strategy in which RGB embeddings operate as \emph{queries}, and depth features together with the geometry embedding serve as \emph{keys} and \emph{values}. This formulation forces appearance-driven features to attend selectively to the structural cues that best explain them, rather than assuming that all modalities are equally trustworthy at all times. Cross-modal alignment is formally computed as
% \begin{equation}
%     \mathrm{Attn}_\ell = 
%     \mathrm{softmax}\!\left(
%         \frac{Q_\ell K_\ell^\top}{\sqrt{d_k}}
%     \right) V_\ell,
%     \label{eq:attention}
% \end{equation}
% yielding a representation in which ambiguous RGB patterns are grounded by depth-informed geometry priors.

% A learnable gating unit regulates the contribution of these attended features by adaptively down-weighting depth when it becomes unstable and amplifying geometry when it provides strong shape evidence. This mechanism is particularly important in clinically challenging scenarios: a shallow, flat lesion may appear large in RGB due to proximity but exhibit minimal depth variation, whereas a protruded but small polyp may have weak geometric footprint yet strong depth cues. By explicitly modeling such cross-modal asymmetries, MPSE elevates geometry and depth from auxiliary signals to \emph{structural priors} that constrain and refine appearance interpretation. For video sequences, MPSE further incorporates temporal smoothing to reduce flicker, stabilize predictions across frames, and accumulate information over short temporal windows where complementary modalities may fluctuate in reliability. This combination of selective cross-modal attention, gated structural reasoning, and temporal aggregation provides a principled fusion strategy that is robust to noise, illumination changes, segmentation drift, and endoscope motion.

% \begin{figure}[!b]
%     \centering

%     % Top image
%     \includegraphics[width=0.9\linewidth]{figures/spot_02_Video11_f000055.png}\\[6pt]
  
%     \caption{
%       RGB vs Segmentation Mask Vs Depth
%     }
%     \label{fig:two_vertical}
% \end{figure}
% \subsection{Prediction Heads}
% The fused multi-scale representation is decoded into a high-resolution feature map $G_L$. A mask-guided polyp embedding is then computed by averaging features only over the lesion region:
% \begin{equation}
%     z^{\mathrm{poly}} = 
%     \frac{1}{|\Omega|}
%     \sum_{(i,j)\in\Omega} G_L(i,j,:),
%     \qquad 
%     \Omega = \{(i,j)\mid M_{ij}=1\}.
%     \label{eq:polypembed}
% \end{equation}
% The final representation is obtained by concatenating $z^{\mathrm{poly}}$ with $z^{\mathrm{geo}}$ and passed to a regression head for continuous size prediction and a classification head that determines whether the lesion exceeds the 5\,mm clinical threshold. For colonoscopy videos, optional temporal smoothing improves continuity and clinical interpretability. Through this unified fusion of appearance, geometry, and depth anchored by structural reasoning and mask-guided embedding MPSE provides a principled, reliable pipeline for polyp size estimation.

\label{sec:method}

% ============================================================
\section{Results and Discussion}
\label{subsec:dataset_setup}

\noindent\textbf{Dataset.}
We use the Polyp-Size dataset~\cite{song2025polyp}, containing 42 polyps:
26 diminutive ($<5$\,mm), 14 small (5--10\,mm), and only
2 large polyps ($>10$\,mm), with per-polyp size annotations measured using calibrated vernier calipers. Each polyp appears in multiple consecutive frames, and all frames containing the polyp are used in training and evaluation. To prevent data leakage arising from strong temporal correlation across frames, all data splits are defined at the \emph{polyp (video)} level rather than at the frame level. Specifically, frames belonging to the same polyp instance (i.e., extracted from the same endoscopic video segment) are assigned exclusively to either the training or the test split, but never both. This polyp-disjoint protocol ensures that the model is evaluated on entirely unseen lesions at test time, rather than benefiting from near-duplicate frames of the same polyp. Although multiple consecutive frames per polyp are used during training and evaluation, this setup reflects the intended clinical use case where size estimation is performed across short temporal windows of a single lesion.




% ============================================================
% \subsection{Baselines}
% \label{subsec:baselines}

% We compare our method against several baselines designed to isolate the contribution of each modality and the fusion mechanism:

% \begin{itemize}
%     \item \textbf{RGB-only CNN:} ConvNeXt-Tiny trained on RGB frames only, predicting size via a regression head.
%     \item \textbf{RGB+Depth (Concatenation):} RGB and depth concatenated along the channel dimension and passed through a shared ConvNeXt-Tiny backbone.
%     \item \textbf{RGB+Geometry (Early Fusion):} Geometry vector concatenated to global pooled RGB features and fed to an MLP head.
%     \item \textbf{RGB+Depth+Geometry (Early Fusion):} All modalities concatenated into a single feature vector before the prediction head.
%     % \item \textbf{Attention Baselines:} SE-block weighting, channel attention, and self-attention fusion modules applied on concatenated features.
%     % \item \textbf{Ours w/o PISE:} Our full pipeline with modality-specific encoders but without the PISE block (simple concatenation before the head).
%     \item \textbf{Ours:} Full proposed model with RGB, depth, geometry encoders and the PISE cross-attention fusion block.
% \end{itemize}

% ============================================================
% \subsection{Evaluation Metrics}
% \label{subsec:metrics}

% \paragraph{Regression Metrics.}
% We evaluate polyp size regression using:
% mean absolute error (MAE, in mm), root mean squared error (RMSE, in mm),
% mean absolute percentage error (MAPE, \%), Pearson correlation coefficient ($r$),
% and coefficient of determination ($R^{2}$).

% \paragraph{Classification Metrics.}
% We further evaluate clinically relevant thresholds (e.g., $\leq 5$\,mm vs.\ $>5$\,mm) using accuracy, precision, recall, and F1-score at the polyp level.

% \paragraph{Agreement Analysis.}
% Following common practice in medical measurement tasks, we report Bland--Altman analysis and scatter plots of predicted vs.\ true sizes to characterize agreement and systematic biases.

% ============================================================
% \subsection{Implementation Details}
% \label{subsec:impl_details}

% All models are implemented in PyTorch and trained on a single GPU (e.g., NVIDIA T4 / V100).
% We use the AdamW optimizer with initial learning rate $\eta = 3 \times 10^{-4}$,
% weight decay $1 \times 10^{-4}$, cosine annealing with warm restarts, and mixed-precision training.
% Unless otherwise stated, we train for up to $N$ epochs (e.g., 50--100) and apply early stopping based on validation MAE.
% RGB frames are augmented with random horizontal flips, mild color jitter, and random cropping;
% no augmentations are applied to the depth maps or geometry descriptors to preserve their semantic meaning.

% ============================================================



% ============================================================

% ============================================================


\subsection{Overall Performance}
\label{subsec:overall}
\begin{figure}[!b]
    \centering
    \includegraphics[width=\linewidth]{figures/pred_size.png}
   \caption{\textbf{MPSE Quantitative Evaluation.} 
\textbf{(a)} Predicted vs.\ true sizes show high correlation. 
\textbf{(b)} Bland--Altman plot shows near-zero bias (red line), confirming Scale Consistency Block efficacy. 
\textbf{(c)} Absolute error boxplots by size range.}

\label{fig:mpse_results}
    \label{fig:overall}
\end{figure}


Table~\ref{tab:table1} and Figure~\ref{fig:overall} jointly illustrate the performance characteristics of MPSE relative to both the published benchmark and our internal baselines. The RGB and RGB--D models from Song~et~al.\ achieve moderate performance (accuracy $\sim$0.65, F1 $\sim$0.61) because depth is incorporated only through channel concatenation, without modelling geometric structure or scale ambiguity. Our RGB-only baseline improves these metrics (accuracy 0.738, F1 0.732), but Figure~\ref{fig:overall}(a,c) reveals a systematic tendency to \emph{underestimate} medium and large polyps, reflected in low precision (0.600) and inflated errors in the $>10$\,mm group. Adding depth alone increases recall substantially (0.938), yet precision remains low (0.600), and predictions collapse toward the mid-size regime, consistent with the Bland--Altman bias pattern in Figure~\ref{fig:overall}(b). This behaviour is clinically concerning, as management decisions depend on accurate thresholds around 5\,mm and 10\,mm. MPSE, by contrast, integrates depth with explicit geometric priors and a scale-consistency mechanism, yielding balanced improvements across all metrics (accuracy 0.857, F1 0.857, AUROC 0.945*). The regression scatter shows tight alignment, the Bland--Altman plot shows near-zero bias, and the absolute-error boxplots confirm robustness even for large lesions (median $<0.6$\,mm for $>10$\,mm polyps). Together, these results demonstrate that MPSE does not simply perform better numerically---it corrects the \emph{structural failure modes} of prior RGB, RGB--D, and depth-only approaches by enforcing geometrically grounded, scale-aware multimodal reasoning.





\subsection{Ablation Studies}
\label{subsec:ablations}

\subsubsection{Contribution of Geometry Features}
\label{subsubsec:geometry_ablation}

Figure~\ref{fig:geometry_analysis} shows that MPSE’s geometry features are not auxiliary heuristics but encode structural information that is both discriminative and fundamentally incomplete. Panel~A shows that simple footprint-based cues---area, area ratio, and equivalent diameter---exhibit sizeable Cohen’s $d$ between $<5$\,mm and $\ge5$\,mm lesions, indicating that geometry alone can reliably separate coarse size regimes. However, Panel~B reveals substantial ECDF overlap, and Panel~C confirms only weak correlation with true millimetre size ($r\!\approx\!0.28$–$0.39$). Together, these plots expose a key property of monocular endoscopy: the 2D footprint is systematically confounded by camera–tissue distance and viewing angle, making geometry a stable but intrinsically underdetermined signal. MPSE therefore does not treat geometry as a stand-alone predictor but as a \emph{structural prior} that constrains the fusion process. By elevating geometry to a guiding token, the model conditions how RGB and depth tokens are integrated, enabling detection of geometry–depth inconsistencies and preventing the mid-range collapse characteristic of appearance-only or naïve RGB–D approaches.


% \begin{table}[!t]
% \centering
% \caption{Ablation on geometry features: performance comparison without vs.\ with geometry (F1 / Recall).}
% \begin{tabular}{lcc}
% \toprule
% \textbf{Architecture} &
% \textbf{Without Geometry (F1 / R)} &
% \textbf{With Geometry (F1 / R)} \\
% \midrule
% CNN  & 0.625 / 0.9375 & 0.625 / 0.9375 \\
% UNet & 0.625 / 0.625  & 0.842 / 0.9375 \\
% FPN  & 0.722 / 0.8125 & 0.827 / 0.7375 \\
% ViT  & 0.750 / 0.750  & 0.8648 / 1.000 \\
% \bottomrule
% \end{tabular}
% \label{tab:geom_effect}
% \end{table}

\begin{figure}[!t]
    \centering
    \includegraphics[width=\linewidth]{figures/geom_plots.png}
  \caption{\textbf{Geometry feature analysis.} 
\textbf{(A)} Cohen’s $d$ shows strong discriminative power for footprint cues (area, area ratio, equivalent diameter). 
\textbf{(B)} ECDFs of the footprint cues show substantial overlap between the $<5$\,mm and $\ge5$\,mm groups.
\textbf{(C)} Correlation matrix between pixel-based metrics and true size.}

\label{fig:geometry_analysis}
    \label{fig:geom_effect_size}
\end{figure}
% \begin{figure}[t]
%     \centering

%     % Top image
%     \includegraphics[
%     width=0.9\linewidth,
%     trim=0 620 0 0,
%     clip
% ]{figures/overlay.png}

  
%     \caption{
%       Collage Depth and RGB
%     }
%     \label{fig:two_vertical}
% \end{figure}




\subsubsection{Contribution of Depth Features}
\label{subsubsec:depth_ablation}

While geometry captures ``how big the lesion looks'' in the image plane, depth is needed to answer ``how big it really is'' in three-dimensional space. Figure~\ref{fig:rgb_depth_collage} shows this qualitatively: the RGB frames (top row) often present lesions whose apparent size is heavily influenced by camera distance and foreshortening, whereas the corresponding pseudo-depth maps (bottom row) reveal protrusion patterns and surface topography that are not obvious from intensity alone.
 The depth map $D$ is never used in isolation; instead, it is encoded into tokens that are jointly processed alongside RGB and geometry. In the Scale Consistency Block, depth contributes a coarse estimate of protrusion-based size ($s_{3D}$), which is contrasted with the footprint-based estimate ($s_{2D}$) derived from geometry. When these two disagree, MPSE learns to interpret whether the discrepancy indicates a genuinely protruded large lesion (e.g.\ high depth contrast, consistent surface) or unreliable depth (e.g.\ specular artefacts). The strong improvement in precision at matched recall in Table~\ref{tab1} is consistent with this mechanism: depth does not merely add noise, but resolves ambiguous 2D cases where geometry alone cannot disambiguate flat-close versus raised-far configurations.


% \begin{table}[!t]
% \centering

% %           -- Geometry table (left)           --
% \begin{minipage}{0.48\linewidth}
% \centering
% \captionof{table}{Ablation on geometry features.}
% \label{tab:unet_geom_ablation}
% \begin{tabular}{lccc}
% \toprule
% \textbf{Variant} & \textbf{F1} & \textbf{Recall} & \textbf{MAE} \\
% \midrule
% Without Geometry & 0.625  & 0.625  & -- \\ % fill MAE
% With Geometry    & 0.842  & 0.9375 & -- \\ % fill MAE
% \bottomrule
% \end{tabular}
% \end{minipage}
% \hfill
% %           -- Depth table (right)           --
% \begin{minipage}{0.48\linewidth}
% \centering
% \captionof{table}{Ablation on depth cues.}
% \label{tab:unet_depth_ablation}
% \begin{tabular}{lccc}
% \toprule
% \textbf{Variant} & \textbf{F1} & \textbf{Recall} & \textbf{MAE} \\
% \midrule
% Without Depth  & 0.882  & 0.9375 & -- \\ % fill MAE
% With Depth     & 0.857  & 0.9375 & -- \\ % fill MAE
% \bottomrule
% \end{tabular}
% \end{minipage}

% \end{table}


\begin{table}[!t]
\centering
\small
\caption{\textbf{Critical component ablation on the validation set} }
\label{tab:ablation_critical}
\small 
\begin{tabular}{l l c c c c r}
\toprule
ID & Variant & Depth & Geom & MPSE-Fuse & SCB & Best MAE $\downarrow$ \\
\midrule
\textbf{B0} & \textbf{Baseline (MPSE)}   & \checkmark & \checkmark & \checkmark & \checkmark & \textbf{0.927} \\
A1          & No Depth            &            & \checkmark & \checkmark & \checkmark & 1.431 \\
A2          & No Geom             & \checkmark &            & \checkmark & \checkmark & 1.354 \\
A3          & No MPSE-Fuse       & \checkmark & \checkmark &            & \checkmark & 1.888 \\
A4          & No SCB              & \checkmark & \checkmark & \checkmark &            & 1.231 \\
\bottomrule
\end{tabular}
\end{table}



\begin{table}[!b]
\centering
\small
\caption{\textbf{Robustness Evaluation of MPSE.}}
\label{tab:robustness_unified}
\begin{tabular}{l c c c c}
\toprule
\textbf{Condition} & $\alpha$ & $\sigma_D$ & $\sigma_G$ / $p_\text{drop}$ & \textbf{Polyp MAE (mm)} $\downarrow$ \\
\midrule
Baseline (clean)          & 1.00 & 0.00 & 0.00 / 0.00 & 0.93 \\
Depth scale shift         & 0.50 & 0.00 & 0.00 / 0.00 & 1.35 \\
Depth noise               & 1.00 & 0.06 & 0.00 / 0.00 & 1.32 \\
Geometry corruption       & 1.00 & 0.00 & 0.10 / 0.25 & 1.23 \\
Combined moderate failure & 0.75 & 0.03 & 0.05 / 0.10 & 1.39 \\
Combined severe failure   & 0.50 & 0.06 & 0.10 / 0.25 & 1.48 \\
\bottomrule
\end{tabular}
\end{table}




\subsubsection{Effectiveness of the Cross-Modal Fusion Mechanism}
\label{subsubsec:fusion_ablation}

The ablation study in Table~\ref{tab:ablation_critical} shows that each MPSE component contributes a distinct and non-redundant capability. Removing depth or geometry produces predictable failures: without depth, the model loses protrusion and distance cues; without geometry, it loses stable footprint statistics robust to illumination and motion. These behaviours align with the analytic trends in Figure~\ref{fig:geometry_analysis}. Figure~\ref{fig:rgb_depth_collage} highlights the complementary nature of geometry and pseudo-depth: while RGB frames provide the lesion footprint, the depth maps expose protrusion patterns and surface topography that resolve 2D ambiguity. Together, these cues explain why removing either modality (A1–A2) yields a predictable collapse in accuracy. The severe degradation when MPSE-Fuse is removed (A3) demonstrates that performance arises not from stacking modalities, but from the \emph{structured} geometry-as-query interaction that guides RGB and depth to informative regions. The Scale Consistency Block (A4) adds complementary robustness by reconciling disagreements between 2D and 3D cues. Its removal revives familiar biases---underestimation of large lesions and instability near the 5\,mm threshold---highlighting its role in resolving cases where naive fusion fails. Overall, the ablations paint a coherent picture: geometry and depth are individually necessary, directed fusion is indispensable, and SCB corrects the remaining scale ambiguity inherent to monocular endoscopy.


Table~\ref{tab:robustness_unified} evaluates the robustness of MPSE under controlled degradations of depth and geometry cues at inference time, without retraining, to emulate deployment-relevant failure modes in clinical endoscopy such as depth scale ambiguity, sensor noise, and imperfect segmentation. Perturbations are injected directly into the depth maps and geometry feature representations, with geometry corruption applied at the feature level rather than to raw binary masks, since ground-truth segmentation masks are not available for this dataset and segmentation quality in practice varies across cases. This design allows us to assess sensitivity to generalized geometric uncertainty independent of the specific artifact patterns of any single upstream segmentation model. Across all tested conditions, including severe combined perturbations, polyp-level MAE remains bounded and increases gradually relative to the clean baseline, with no evidence of abrupt performance collapse. Notably, worst-case errors are comparable to ablations that remove individual components (Table~\ref{tab:ablation_critical}), indicating that MPSE does not rely critically on any single modality or precise metric calibration. Overall, these results demonstrate controlled and predictable degradation under adverse conditions, supporting the robustness of MPSE in noisy, real-world endoscopic deployment.






% \subsubsection{Temporal Stability}
% Temporal smoothing further reduces prediction variance across video frames, particularly near the 5\,mm threshold where small depth or segmentation fluctuations often lead to inconsistent outputs. Aggregating evidence across short temporal windows yields smoother, clinically plausible size trajectories and reduces false transitions across size classes. This validates the methodological claim that robustness in endoscopic environments requires both cross-modal and temporal reasoning.
\begin{figure}[!t]
    \centering
    \includegraphics[
        width=\linewidth,
        trim=0 0 0 65, % left bottom right top (in points)
        clip
    ]{figures/collage_v2.png}\\[6pt]
    \caption{\textbf{Representative RGB frames and corresponding pseudo-depth maps.} 
The depth maps (bottom row) reveal 3D protrusion patterns and surface topography often missed by RGB appearance alone (top row).}
    \label{fig:rgb_depth_collage}
\end{figure}

\subsubsection{Limitations observed}
\label{sec:failure_scenarios}
Despite its strong performance, MPSE exhibits two predictable failure modes arising from intrinsic limits of monocular endoscopy rather than architectural shortcomings. First, polyps larger than $10\,\mathrm{mm}$ are occasionally underestimated due to scarce supervision in the right tail, the nonlinear scaling of footprint geometry with camera distance, and depth-map saturation on smooth near-field surfaces—precisely the regime where monocular 3D cues become unreliable. Multiple interventions  produced only modest gains, confirming that this bias is structurally rooted rather than an artifact of model capacity or training instability. Accordingly, results in the $>10\,\mathrm{mm}$ regime should be interpreted as indicative of systematic bias correction relative to RGB and naïve RGB--D baselines, rather than statistically representative performance across large lesions. Second, ambiguity persists around the clinically important $5\,\mathrm{mm}$ threshold: lesions in the $4$--$6\,\mathrm{mm}$ range often exhibit nearly identical footprints and shallow depth gradients, making sub-millimetre discrimination inherently difficult under monocular viewing conditions.


% MPSE mitigates both issues through cross-modal attention, mask-guided tokenization, and robust temporal aggregation, though a degree of uncertainty remains unavoidable due to the physics of monocular imaging.


% ============================================================

\subsection{Clinical Implications}

Reliable estimation of polyp size is central to risk stratification, resection planning, and surveillance interval determination. In particular, thresholds at 5\,mm and 10\,mm guide the adoption of resect-and-discard and diagnose-and-leave strategies, yet visual estimation remains highly variable even among expert endoscopists. By integrating depth-derived structural cues with segmentation-based geometric priors, our tri-modal framework reduces the scale ambiguity that commonly leads to underestimation of protruded or irregular lesions. The improved recall observed near the clinically sensitive 5\,mm boundary suggests that MPSE can serve as a stabilizing decision-support tool in real-time workflows, especially in community settings where advanced measurement devices such as VSE systems are unavailable. 

% The interpretability of geometry features and the visual traceability of depth cues further support clinical acceptance by providing semantically meaningful explanations for size predictions.

\section{Conclusion and Future Work}

We presented MPSE, a geometry-aware, depth-guided multimodal fusion framework for reliable polyp size estimation from monocular endoscopy. By jointly leveraging RGB appearance, segmentation-derived geometry descriptors, and pseudo-metric depth cues, the model overcomes key limitations of traditional image-based measurement, including scale ambiguity and inconsistent predictions near the 5\,mm clinical threshold. 
Future work will explore several directions. First, incorporating specialized endoscopy depth models or self-supervised geometric pretraining may further enhance 3D reasoning under specular or low-texture conditions. Second, integrating temporal transformers could provide more robust cross-frame aggregation for long sequences and mitigate transient segmentation noise. Third, expanding evaluation to multi-center datasets and diverse imaging conditions will be essential for assessing generalizability. Finally, coupling size estimation with uncertainty quantification and downstream tasks such as resection recommendation offers a promising route toward clinically comprehensive decision-support systems.
\appendix
\section{Appendix}

\subsection{Implementation Details}
Unless otherwise stated, RGB and pseudo-depth inputs were resized to the training resolution used by the respective backbone encoders. The RGB and depth streams used ResNet-18 backbones. Geometry descriptors included pixel area, equivalent diameter, aspect-ratio features, circularity, and a boundary-sharpness measure derived from the segmentation mask. The fused representation was processed with a lightweight transformer encoder with $L=2$ layers, followed by the Scale Consistency Block and a regression head for millimetre-level prediction.

\subsection{Data Split Protocol}
All experiments were performed using polyp-disjoint splits. Frames from the same polyp instance were assigned exclusively to either training or test partitions to prevent leakage from near-duplicate temporal frames. Frame-level predictions were aggregated at the polyp level using the 75th percentile (p75) of the frame-level predictions.

\subsection{Additional Note on Limitations}
Performance on lesions larger than 10\,mm should be interpreted with caution due to the very limited number of large polyps in the dataset. This limitation is inherent to the available dataset and motivates broader future evaluation on larger multi-center cohorts.
% =========================================================
% ACKNOWLEDGMENTS
% =========================================================

%\midlacknowledgments{We thank a bunch of people.}

\bibliography{midl26_298}

\end{document}
