\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe}
\usepackage{float}
\usepackage{graphicx}
\usepackage{pdflscape}
\usepackage{booktabs,multirow,array}
\newcommand{\dpair}[2]{\mbox{#1\,/\,#2}}
\newcolumntype{Y}{p{0.18\textwidth}}
\usepackage{placeins}
\usepackage{enumitem}
\usepackage{xcolor}
\usepackage{dsfont}

\raggedbottom

% --- legend colours and macro ---
\definecolor{suction}{RGB}{0,0,255}
\definecolor{dural_scissors}{RGB}{0,255,0}
\definecolor{kerrisons}{RGB}{255,0,0}
\definecolor{retractable_knife}{RGB}{0,255,255}
\definecolor{ring_curette}{RGB}{255,0,255}
\definecolor{pituitary_rongeurs}{RGB}{255,235,12}
\definecolor{cup_forceps}{RGB}{255,128,128}

\newcommand{\pitlegend}[2]{%
  \textcolor{#1}{\rule{2mm}{2mm}}\,#2%
}


\newcolumntype{C}{>{\centering\arraybackslash}p{0.19\textwidth}}
\newcolumntype{D}{>{\centering\arraybackslash}p{0.09\textwidth}}
\newcolumntype{E}{>{\centering\arraybackslash}p{0.45\textwidth}}
\newcolumntype{F}{>{\centering\arraybackslash}p{0.24\linewidth}}

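% NOTE: the trailing comment on each line below lists the same triple in reversed (B,G,R) channel order.
\definecolor{LNDcolor}{RGB}{0,0,255}          % (255,0,0)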
\definecolor{MNDcolor}{RGB}{0,255,0}          % (0,255,0)
\definecolor{MicroForcepscolor}{RGB}{255,0,0} % (0,0,255)
\definecolor{Scissorcolor}{RGB}{0,255,255}    % (255,255,0)
\definecolor{Forcepscolor}{RGB}{255,0,255}    % (255,0,255)
\definecolor{Prograspcolor}{RGB}{255,235,12}  % (12,235,255)

\newcommand{\clslegend}[2]{%
  \textcolor{#1}{\rule{2mm}{2mm}}\,#2%
}

\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\jmlrvolume{-- nnn}
\editors{Accepted for publication at MIDL 2026}

\title[MID-POSE]{MID-POSE: Multi-Instrument Detection and Pose Estimation in Endoscopic Surgery}

% More complicated cases, e.g. with dual affiliations and joint authorship
\midlauthor{%
\Name{Wenhua Wei\midljointauthortext{Contributed equally}\nametag{$^{1}$}} \orcid{0009-0009-2930-2209} \Email{wenhua.wei.17@alumni.ucl.ac.uk}\\
\Name{Laurent Mennillo\midlotherjointauthor\nametag{$^{1,2,3}$}} \orcid{0000-0001-5220-6631} \Email{l.mennillo@ucl.ac.uk}\\
\Name{Zhehua Mao\nametag{$^{1,2}$}} \orcid{0000-0001-9225-6318} \Email{z.mao@ucl.ac.uk}\\
\Name{Anjana Wijekoon\nametag{$^{1,2}$}} \orcid{0000-0003-3848-3100} \Email{a.wijekoon@ucl.ac.uk}\\
\Name{Kendall Feeny\nametag{$^{1,2}$}} \orcid{0009-0008-6755-8574} \Email{k.feeny@ucl.ac.uk}\\
\Name{Danyal Zaman Khan\nametag{$^{1,2}$}} \orcid{0000-0001-9213-2550} \Email{d.khan@ucl.ac.uk}\\
\Name{Evangelos B.~Mazomenos\nametag{$^{2,3}$}} \orcid{0000-0003-0357-5996} \Email{e.mazomenos@ucl.ac.uk}\\
\Name{Danail Stoyanov\nametag{$^{1,2}$}} \orcid{0000-0002-0980-3227} \Email{danail.stoyanov@ucl.ac.uk}\\
\Name{Hani J.~Marcus\nametag{$^{2,4}$}} \orcid{0000-0001-8000-392X} \Email{h.marcus@ucl.ac.uk}\\
\Name{Sophia Bano\nametag{$^{1,2}$}} \orcid{0000-0003-1329-4565} \Email{sophia.bano@ucl.ac.uk}\\[1ex]
\addr $^{1}$ Department of Computer Science, University College London, London, United Kingdom\\
\addr $^{2}$ UCL Hawkes Institute, University College London, London, United Kingdom\\
\addr $^{3}$ Department of Medical Physics \& Biomedical Engineering, University College London, London, United Kingdom\\
\addr $^{4}$ Department of Neurosurgery, National Hospital for Neurology and Neurosurgery, London, United Kingdom
}

\begin{document}

\maketitle

\begin{abstract}
Reliable perception of surgical instruments is a key prerequisite for intraoperative guidance, context-aware assistance, and workflow analysis in minimally invasive surgery (MIS). This is particularly challenging in skull base procedures, where narrow anatomical corridors, frequent occlusions, specular highlights, and visually similar instruments make multi-class detection and 2D pose estimation difficult. We address joint instrument detection and keypoint-based pose estimation from monocular endoscopic videos and introduce MID-POSE, a dual-head architecture that couples a high-resolution HRNetV2p encoder with a class-agnostic dense detection-pose head and a Multi-level Instrument Classification (MIC) head which operates on RoI-aligned multi-level features. To support this task, we construct the PitSurg dataset from 26 clinical procedures, providing seven instrument classes with bounding boxes and detailed 2D keypoints. Using YOLOv8x-pose as our strongest baseline, which in our tasks outperforms YOLO11x-pose, MID-POSE improves Det/Pose $AP_{50\text{--}95}$ on PitSurg from $59.4/63.1$ to $77.5/78.5$ and on the robotic SurgPose dataset from $47.9/61.1$ to $62.7/71.4$.
Qualitative analysis shows that high-resolution features sharpen localisation and keypoint placement, while the RoI classifier reduces misclassifications and spurious background detections, indicating that the proposed architecture and dataset provide an effective basis for robust multi-instrument perception in MIS.
\end{abstract}

\begin{keywords}
Minimally Invasive Surgery, Skull Base Surgery, Surgical Instrument Detection, Pose Estimation
\end{keywords}

\section{Introduction}
Minimally invasive surgery (MIS) has become the standard for many procedures, with surgeons operating through narrow anatomical corridors using elongated instruments visualised by an endoscope~\cite{jeganathan2025minimally}. Joint detection and 2D pose estimation of multiple instruments offer a compact scene representation, enabling geometric reasoning and semantic understanding, which are essential for downstream applications such as intraoperative guidance, context-aware assistance, workflow analysis, and skill assessment%
~\cite{das2024automated,das2025pitvis}. However, accurate instrument perception in MIS remains challenging because instruments are frequently occluded, affected by specular highlights, blood contamination, and motion blur, and can appear visually similar within a confined field of view. Additional complications arise from domain shifts between patients, limited dataset sizes, and expensive manual annotations. These factors make multi-class detection and keypoint estimation much more difficult than generic object detection or human pose estimation in natural images~\cite{maji2022yolopose,xu2022vitpose}.\looseness = -1

Deep learning has delivered powerful architectures for object detection and keypoint-based pose estimation~\cite{ren2015faster,newell2016stacked,xiao2018simple,chen2018cascaded,cheng2020higherhrnet,xu2022vitpose,maji2022yolopose}. Two-stage \emph{top-down} pipelines typically combine a generic instance detector such as Faster R-CNN~\cite{ren2015faster} with a single-instance pose network~\cite{newell2016stacked,xiao2018simple,chen2018cascaded,xu2022vitpose}, whereas \emph{bottom-up} methods localise all keypoints jointly and then group them into instances~\cite{cao2017realtime,newell2017associative,papandreou2018personlab,cheng2020higherhrnet}. More recently, one-stage dense predictors such as YOLO-pose~\cite{maji2022yolopose} jointly output boxes, classes, and keypoints in a single head, improving robustness in crowded scenes. Most of these approaches rely on backbones that repeatedly downsample the input and then attempt to recover spatial detail, whereas high-resolution representations are crucial for localisation-sensitive tasks~\cite{sun2019hrnet}.

These generic architectures have increasingly been adapted to surgical instruments. On robotic MIS, Wu et al.~\cite{wu2025surgpose} benchmark YOLOv8x-pose, ViTPose, and DeepLabCut on the SurgPose dataset, showing that human-pose architectures can be transferred to articulated tools. For manual laparoscopy, teams in the PhaKIR challenge~\cite{rueckert2025phakir} extend YOLOv8-based detectors to predict per-instrument keypoints, including strategies for uncertainty estimation and handling a variable number of keypoints per class. Other works target 6D instrument pose from monocular images using one-stage regression~\cite{yoshimura2020single} or two-stage pipelines that combine YOLO-based detection with crop-based pose networks~\cite{spektor2024monocular}. These studies demonstrate that YOLO-style one-stage detector--pose architectures are strong baselines for surgical instruments and that high-resolution encoders further benefit keypoint localisation. However, support for \emph{truly multi-class, multi-instrument 2D pose estimation from monocular endoscopic views} remains limited, particularly in complex skull base surgery.

Dataset availability is a further bottleneck. SurgPose~\cite{wu2025surgpose} provides articulated 2D keypoints for six robotic instrument types in stereo MIS, and PhaKIR~\cite{rueckert2025phakir} offers 2D keypoints for 19 laparoscopic instruments. ROBUST-MIPS~\cite{han2025robustmips} and ART-Net~\cite{hasan2021detection} focus on tip and shaft representations in abdominal or robotic procedures. These resources provide valuable benchmarks for robotic and abdominal laparoscopy, but they do not cover monocular endoscopic pituitary surgery, a setting with distinctive visual conditions affecting instruments, such as frequent occlusions, border truncation, extreme perspectives, blur, and specular highlights. To the best of our knowledge, there is no public dataset for endoscopic pituitary surgery that combines multi-class instrument labels with instrument-specific 2D keypoint annotations for the sellar phase~\cite{marcus2021pituitary}.

This work addresses both methodological and dataset gaps by proposing MID-POSE (Multi-Instrument Detection and Pose Estimation in Endoscopic surgery), a dual-head architecture for multi-class surgical instrument detection and 2D keypoint pose estimation in MIS, and by constructing a new dataset for endoscopic pituitary surgery.
MID-POSE combines a high-resolution HRNet~\cite{sun2019hrnet} encoder with a class-agnostic dense detection-pose head in the style of YOLOv8-pose~\cite{maji2022yolopose} and a proposed Multi-level Instrument Classification (MIC) head which operates on RoI-aligned multi-level features.
We evaluate the approach on both the new PitSurg dataset for manual endoscopic pituitary surgery and on the robotic SurgPose benchmark~\cite{wu2025surgpose}, allowing us to assess performance across manual and robotic MIS scenarios. The main contributions of this work are:
\begin{itemize}[noitemsep,topsep=0pt]
    \item \textbf{PitSurg: a dataset for endoscopic pituitary surgery}, comprising monocular intraoperative images from 26 procedures with seven instrument types annotated by bounding boxes and detailed, class-specific 2D keypoints under frequent occlusion, truncation, and class imbalance.
    \item \textbf{MID-POSE: a dual-head architecture for multi-class instrument detection and 2D keypoint pose estimation}, which builds on HRNetV2p features with a class-agnostic dense detection-pose head and a MIC head, and incorporates a quality-aware instrumentness objective together with an extended keypoint visibility scheme.
    \item \textbf{A benchmark} for joint detection and 2D pose estimation in both manual and robotic endoscopic MIS scenarios, providing reference results with representative YOLO-style and HRNet-style designs as a common reference point under challenging intraoperative conditions.
\end{itemize}
\section{Method: MID-POSE for Instrument Detection and Pose}
We propose MID-POSE, a dual-head architecture for multi-class surgical instrument detection and 2D keypoint pose estimation in minimally invasive surgery. The model combines a high-resolution encoder, a class-agnostic dense detection-pose head, and a MIC head operating on RoI-aligned multi-level features (Fig.~\ref{fig:arch}). 

\subsection{Architecture Overview}

\begin{figure}[t!]
  \centering
  % \includegraphics[width=\linewidth]{architecture.png}
  \includegraphics[width=\linewidth]{images/MID-POSE.pdf}
  \caption{
  Overview of the proposed MID-POSE architecture for instrument detection and 2D keypoint estimation. Input images are encoded using an HRNetV2p encoder to produce a three-level feature pyramid (P3--P5). A class-agnostic dense prediction head attached to the pyramid outputs per-grid bounding boxes, instrumentness scores, and keypoint candidates that are then filtered by instrumentness. Filtered candidates are then used to extract multi-level RoI features from P3--P5 via a PyramidAdapter and RoIAlign, followed by a MIC head to predict per-RoI instrument class probabilities. These are finally combined with the dense predictions during post-processing to obtain the instrument detections and 2D keypoints.\looseness=-1
  }
  \label{fig:arch}
\end{figure}


\paragraph{Encoder and feature pyramid}
We adopt HRNetV2p--W32 as the encoder~\cite{sun2019hrnet} as it maintains a high-resolution stream fused with lower-resolution branches, yielding semantically rich, spatially precise features that have proven effective for localisation-sensitive dense prediction tasks. The encoder outputs a three-level feature pyramid ($P_3, P_4,$ and $P_5$) with strides 8, 16, and 32. These feature maps preserve fine spatial detail while progressively enriching semantics, and are shared between the dense and the MIC heads.

\paragraph{Class-Agnostic Dense Head}
On top of $P_3$--$P_5$, we adopt a YOLOv8-Pose style head~\cite{maji2022yolopose} for joint binary detection and 2D pose estimation in a class-agnostic manner. At every spatial location $a$ on each pyramid level, the head predicts:
(i) an instrumentness score $p_a = \sigma(z_a)$, where $\sigma(\cdot)$ denotes the sigmoid function and $z_a$ is the corresponding raw logit,
(ii) a bounding box $b_a = (x_a^{\min}, y_a^{\min}, x_a^{\max}, y_a^{\max})$, and
(iii) $K$ keypoints $k_{a,i} = (x_{a,i}, y_{a,i}, p_{v,a,i})$, $i = 1,\dots,K$, 
with 2D coordinates $(x_{a,i}, y_{a,i})$ and visibility probabilities $p_{v,a,i} \in [0,1]$.
Predictions with instrumentness below a fixed threshold $\tau$ are discarded. The remaining candidates are used both as final detection/pose outputs and as proposals for the multi-level RoI classification head.

\paragraph{Multi-scale RoI Feature Extractor}
To assign instrument categories, we introduce a multi-level RoI feature extractor operating on $P_3$--$P_5$. The bounding boxes of the filtered candidates are treated as regions of interest (RoIs). A PyramidAdapter first maps each feature map $P_\ell$ to a common channel dimension $C_r$ via a $1\times 1$ convolution:
\[
P_\ell \rightarrow \tilde{P}_\ell \in \mathbb{R}^{C_r \times H_\ell \times W_\ell}, \quad \ell \in \{3,4,5\},
\]
while preserving their original spatial resolutions.
For every RoI, RoIAlign is applied independently to $\tilde{P}_3, \tilde{P}_4, \tilde{P}_5$, yielding tensors of size $C_r \times H_{\mathrm{roi}} \times W_{\mathrm{roi}}$ at each level.
These three tensors are concatenated along the channel dimension to form a multi-scale descriptor of shape $3C_r \times H_{\mathrm{roi}} \times W_{\mathrm{roi}}$ that combines detailed local information from $P_3$ with increasingly contextual features from $P_4$ and $P_5$.

The concatenated tensor is passed through a feature fusion module consisting of a $1\times1$ convolution, group normalization, and a SiLU activation, which reduces the channels back to $C_r$ and learns to mix information across levels. The output is a fused RoI feature map for each candidate bounding box.

\paragraph{MIC head}
The MIC head operates on the fused RoI features
%$F_{\text{mic}}$ 
and predicts a discrete probability distribution over the instrument categories. It consists of two convolutional refinement blocks (Conv), each composed of a $3\times3$ convolution with stride~1 and padding~1, followed by group normalization and a SiLU activation; these blocks refine the local RoI features while preserving the $H_{\mathrm{roi}}\times W_{\mathrm{roi}}$ spatial resolution of the feature map. Global average pooling (GAP) is then applied over the spatial dimensions to obtain a $C_r$-dimensional descriptor for each RoI. This descriptor is passed through a fully connected block (FC) with two layers, where the first is a hidden linear layer with ReLU activation and dropout and the second is an output layer that produces eight logits corresponding to the seven pituitary instrument types and a background class.

\subsection{Loss Functions}

\paragraph{Instrumentness loss}
Let $y_a \in [0,1]$ be a soft target that reflects how well the predicted instrument at location $a$ matches its ground truth, increasing with both the instrumentness confidence and the IoU between the predicted and ground-truth bounding boxes, and let \(p_a\) denote the corresponding instrumentness probability. A quality-aware focal loss inspired by VarifocalNet~\cite{zhang2021varifocalnet} is used to train this branch. The per-location instrumentness loss is defined as:

\begin{equation}
\mathcal{L}_{\text{inst}}(p_a, y_a)
= w(p_a, y_a)\,\bigl[-y_a \log p_a - (1-y_a)\log(1-p_a)\bigr],
\end{equation}
with
\begin{equation}
w(p_a, y_a)
= \mathds{1}[y_a > 0]\,y_a + \mathds{1}[y_a = 0]\,\alpha\,p_a^{\gamma},
\end{equation}
where $\alpha$ and $\gamma$ control the balance between positive and negative locations and the degree of focusing on hard negatives. Because the dense head produces predictions at every spatial location while only a small top-$k$ subset is assigned as positives, this quality-aware focal reweighting prevents the loss from being dominated by easy background locations and encourages the model to focus on well-localized positives and hard negatives.

\paragraph{Bounding box loss}
The bounding box loss $\mathcal{L}_{\text{box}}$ follows the default YOLOv8-Pose formulation~\cite{maji2022yolopose} and is computed only for positive grid locations ($y_a > 0$). 

\paragraph{Keypoint loss}
The keypoint loss \(\mathcal{L}_{\text{kpt}}\) follows the YOLOv8-pose formulation~\cite{maji2022yolopose} and is computed only for positive grid locations ($y_a > 0$). It is decomposed into an Object Keypoint Similarity (OKS) coordinate term and a visibility term:

\begin{equation}
\mathcal{L}_{\text{kpt}} = \lambda_{\text{loc}} \mathcal{L}_{\text{OKS}} + \lambda_{\text{vis}} \mathcal{L}_{\text{vis}}.
\end{equation}

We extend the visibility label $v_j$ for keypoint $j$ to the range $v_j \in \{-1,0,1,2\}$, where $v_j=-1$ denotes an unlabeled keypoint. For the coordinate term, only keypoints with \(v_j > 0\) are treated as present:

\begin{equation}
\mathcal{L}_{\text{OKS}}(j) =
\begin{cases}
0, & v_j \le 0\\[4pt]
1 - \exp\!\left(
-\dfrac{\lVert \hat{\mathbf{k}}_j - \mathbf{k}_j \rVert_2^2}{2\,A\,(2\sigma_j)^2 + \varepsilon}
\right), & v_j > 0
\end{cases},
\end{equation}
where $\hat{\mathbf{k}}_j$ and $\mathbf{k}_j$ are the predicted and ground-truth keypoint locations, $A$ is the ground-truth box area, and $\sigma_j$ is a keypoint-specific tolerance. For the visibility term, a binary target \(t_j = \mathds{1}[v_j > 0]\) is defined for labeled keypoints with \(v_j \ge 0\), and the per-keypoint loss is:

\begin{equation}
\mathcal{L}_{\text{vis}}(j) =
\begin{cases}
0, & v_j = -1\\[4pt]
-\Big[t_j \log \sigma(\hat{v}_j) + (1 - t_j)\log\bigl(1 - \sigma(\hat{v}_j)\bigr)\Big], & v_j \ge 0
\end{cases},
\end{equation}
where $\hat{v}_j$ is the predicted visibility logit for keypoint $j$ and $\sigma(\cdot)$ denotes the sigmoid.
This extension allows instances without keypoint annotations to contribute to detection while excluding their unlabeled keypoints from the pose supervision.

\paragraph{MIC classification loss}
For each RoI $r$, the MIC head outputs logits $s_r \in \mathbb{R}^8$ over seven instrument classes plus background. Let $y_r \in \{0,\ldots,7\}$ denote the ground-truth label for RoI $r$ and $p_{r,c}$ the softmax probability for class $c$. The MIC loss is the standard cross-entropy:

\begin{equation}
\mathcal{L}_{\text{mic}}
= \frac{1}{N_{\text{mic}}}
\sum_{r=1}^{N_{\text{mic}}}
\bigl[-\log p_{r,y_r}\bigr],
\end{equation}
where $N_{\text{mic}}$ is the number of RoIs in the batch.
This penalizes low predicted probability for the ground-truth class and drives the RoI head to discriminate between the seven instrument categories and background.

\paragraph{Total loss}
The network is trained end-to-end by combining the dense detection-pose losses from the class-agnostic head with the categorical loss from the MIC head. For each batch, the total loss is:

\begin{equation}
\mathcal{L}_{\text{total}}
= \lambda_{\text{box}}\,\mathcal{L}_{\text{box}}
+ \lambda_{\text{inst}}\,\mathcal{L}_{\text{inst}}
+ \lambda_{\text{kpt}}\,\mathcal{L}_{\text{kpt}}
+ \lambda_{\text{mic}}\,\mathcal{L}_{\text{mic}},
\label{eq:total_loss}
\end{equation}
where $\lambda_{\text{box}}, \lambda_{\text{inst}}, \lambda_{\text{kpt}}$, and $\lambda_{\text{mic}}$ control the relative contributions of localization, instrumentness, pose, and MIC.

\section{Dataset and Experimental Setup}
We evaluate MID-POSE on two complementary datasets: \emph{PitSurg}, a new dataset of manual endoscopic pituitary surgery, and \emph{SurgPose}, a public benchmark of robotic MIS.% on the da~Vinci platform.

\paragraph{PitSurg} dataset is derived from 26 videos of monocular endoscopic pituitary surgery performed at the National Hospital for Neurology and Neurosurgery, London, UK. Videos were captured using a Hopkins Telescope with an AIDA storage system (Karl Storz Endoscopy, UK) at 720p, 24 FPS, and frames were sampled at 1 FPS for annotation (a summary is provided in Appendix \ref{app:datasheet}, Table \ref{tab:pitsurg-specs}). We used only sellar-phase frames~\cite{marcus2021pituitary} that contain one or two visible instruments, and split the data at the procedure level so that all frames from a given surgery appear in either the training or validation set, avoiding patient-level leakage. Seven instrument types are annotated, as illustrated in Fig.~\ref{fig:pitsurg_examples}, namely Suction, Dural Scissors, Kerrisons, Retractable Knife, Ring Curette, Pituitary Rongeurs, and Cup Forceps. Each instance has a bounding box, class label, and 2D keypoints with class-specific layouts, with suction having 2 keypoints, ring curette 3, retractable knife, dural scissors, pituitary rongeurs, and cup forceps 4, and Kerrisons 5. PitSurg reflects real intraoperative imaging conditions, in which annotated instruments are affected by frequent occlusion, border truncation, motion blur, extreme perspectives, and specular highlights. Consequently, not all keypoints are visible in every frame. The prevalence of these instrument-level challenging conditions is summarised in Appendix \ref{app:datasheet} (Figure \ref{fig:pitsurg-tags-pie}). Only suction can appear together with any other instrument. Because clinically verified cup forceps annotations are scarce in the sellar phase, we augment the training set with 376 additional cup forceps instances annotated only with bounding boxes and class labels, extracted from non-sellar segments of the same procedures, while keeping the validation set restricted to sellar-phase frames. 
The training split contains 1{,}042 suction, 351 dural scissors, 495 Kerrisons, 331 retractable knife, 312 ring curette, 301 pituitary rongeurs, and 487 cup forceps instances, and the validation split contains 201, 80, 114, 95, 91, 51, and 83 instances, respectively. 

\begin{figure}[t!]
  \centering
  \small

  % 6 images in a single row, no margins between them
  \noindent
  \includegraphics[width=0.165\linewidth]{images/PitSurg/gt_suction_dural_crop.png}%
  \includegraphics[width=0.165\linewidth]{images/PitSurg/gt_suction_kerrision_crop.png}%
  \includegraphics[width=0.165\linewidth]{images/PitSurg/gt_suction_retract_crop.png}%
  \includegraphics[width=0.165\linewidth]{images/PitSurg/gt_suction_ring_crop.png}%
  \includegraphics[width=0.165\linewidth]{images/PitSurg/gt_suction_rongeur_crop.png}%
  \includegraphics[width=0.165\linewidth]{images/PitSurg/gt_cup_crop.png}%

  %\vspace{0.6em}
  {\tiny
  \textbf{}\;
  \pitlegend{suction}{suction}\quad
  \pitlegend{dural_scissors}{dural scissors}\quad
  \pitlegend{kerrisons}{kerrisons}\quad
  \pitlegend{retractable_knife}{retractable knife}\quad
  \pitlegend{ring_curette}{ring curette}\quad
  \pitlegend{pituitary_rongeurs}{pituitary rongeurs}\quad
  \pitlegend{cup_forceps}{cup forceps}
  }

  \caption{Examples of PitSurg instrument classes with bounding boxes and 2D keypoints.}
  \label{fig:pitsurg_examples}
\end{figure}

\paragraph{SurgPose} dataset~\cite{wu2025surgpose} contains stereo endoscopic videos acquired with a da~Vinci surgical system. Following the official protocol, we use only the left-view images and adopt the provided split, using trajectories 0--19 for training and 20--33 for validation. Each frame contains two articulated robotic instruments annotated with bounding boxes and five 2D keypoints per tool across six instrument types: Large Needle Driver (LND), Mega Needle Driver (MND), MicroForceps, Curved Scissor, DeBakey Forceps, and Prograsp Forceps. In this dataset, instruments are fully visible in all frames, without occlusion, blur, specular highlights or end-effector border truncation.\looseness=-1

\paragraph{Five model variants} are considered in our experiments, namely YOLOv8x-pose~\cite{maji2022yolopose} and YOLO11x-pose~\cite{ultralytics_yolo11}; a YOLOv8x-pose+MIC, where the YOLOv8x-pose is augmented with the proposed MIC head to form a dual-head architecture; an HRNetV2p-pose model, which uses an HRNetV2p encoder feeding a YOLO-style dense detection-pose head; and the proposed MID-POSE architecture.

\paragraph{Training protocol} All variants are implemented in PyTorch using Ultralytics YOLO~\cite{yolov8_ultralytics}, with custom extensions for MID-POSE, initialised from COCO-pretrained checkpoints and trained on a single NVIDIA GeForce RTX~4090 GPU.
For both datasets, all models use the same augmentations. Images are resized to $640\times640$ and augmented with random rotations (up to $\pm20^\circ$), translations (up to 10\%), isotropic scaling in $[0.5,1.5]$, horizontal flips (probability 0.5), and mild photometric jitter in hue, saturation, and brightness. 

Optimisation uses stochastic gradient descent (SGD) with a learning rate of 0.01, momentum 0.9, and weight decay $5\times10^{-4}$. Models are trained for 80 epochs on PitSurg and 50 epochs on SurgPose, with a batch size of 16 for YOLOv8x-pose, YOLO11x-pose, and YOLOv8x-pose+MIC, and 8 for HRNetV2p-pose and MID-POSE. The batch size for models utilising the HRNetV2p encoder was reduced to accommodate the higher memory footprint required by its high-resolution feature maps. YOLOv8x-pose+MIC and MID-POSE use two-stage training: first the encoder and class-agnostic dense head are trained as a binary detector--pose model, then the full dual-head architecture is fine-tuned while RoIs are constructed online from dense predictions. For each ground-truth instrument we keep the three predictions with the highest $p_a$ as positive RoIs and select as hard negatives the three highest-$p_a$ predictions with zero IoU to all ground-truth boxes, labelling them with the corresponding instrument class or background, respectively. At inference, dual-head models filter dense predictions with an instrumentness threshold $\tau = 0.3$, keeping only predictions with $p_a \ge \tau$ as candidates for MIC.

Following the baseline YOLOv8x-pose implementation by Ultralytics~\cite{yolov8_ultralytics}, the default values of the weighting parameters in Eq.~\eqref{eq:total_loss}, $\lambda_{\text{box}} = 7.5$, $\lambda_{\text{kpt}} = 12.0$, and $\lambda_{\text{inst}} = 1.0$, are used in the loss function for all variants on both datasets.
On PitSurg we use $\lambda_{\text{cls}} = 1.0$ for the single-head variants (YOLOv8x-pose, YOLO11x-pose, HRNetV2p-pose), where $\mathcal{L}_{\text{cls}}$ is the multi-class counterpart of $\mathcal{L}_{\text{inst}}$, and we use $\lambda_{\text{mic}} = 10.0$ for the dual-head variants (YOLOv8x-pose+MIC and MID-POSE). On SurgPose, single-head models use $\lambda_{\text{cls}} = 40.0$, and dual-head models use $\lambda_{\text{mic}} = 17.0$. In all cases, the $\lambda$ values are set according to the relative scales and difficulty of the underlying tasks so that each loss contributes in a balanced way. We also note that globally scaling these weighting parameters is effectively equivalent to adjusting the learning rate for those terms.

\paragraph{Evaluation metrics} We report detection and pose performance using average precision over thresholds from 0.5 to 0.95 for IoU (Det $AP_{50\text{--}95}$) and OKS (Pose $AP_{50\text{--}95}$), given per class and as a class-averaged overall score in percentage. For qualitative examples, we show the mean IoU and OKS per image, where the scores are computed only for true positives and each false negative contributes IoU = 0 and OKS = 0.

\section{Results and Discussion}
We compare dense single-head architectures (YOLOv8x-pose, YOLO11x-pose, HRNetV2p-pose) and dual-head variants (YOLOv8x-pose+MIC and MID-POSE) on the PitSurg and SurgPose datasets. Performance is measured using Det/Pose $AP_{50\text{--}95}$ per class and overall (Table~\ref{tab:results_all}). As YOLO11 does not yield consistent gains over YOLOv8x, we use YOLOv8x-pose as the primary baseline and include YOLO11x-pose for completeness.

\paragraph{Quantitative}
Table~\ref{tab:results_all} reports detection and pose $AP_{50\text{--}95}$ on the PitSurg and SurgPose datasets. On PitSurg, the primary performance driver is the encoder: replacing YOLOv8x with HRNetV2p improves baseline $AP_{50\text{--}95}$ (Det/Pose) from 59.4\% / 63.1\% to 73.3\% / 76.5\%, with particular gains on challenging classes such as retractable knife, ring curette and cup forceps. The MIC head adds a further boost, resulting in the best performance of 77.5\% / 78.5\%. Conversely, on SurgPose, the dual-head design acts as the dominant factor. While the HRNetV2p encoder offers modest gains over the YOLOv8x baseline (47.9\% / 61.1\%), adding the MIC head to YOLOv8x jumps performance to 59.7\% / 68.2\%. The proposed MID-POSE architecture achieves the best overall SurgPose results, with 62.7\% / 71.4\%. To further isolate the contribution of HRNetV2p to spatial localization from the effects of the classification design, we also report class-agnostic performance in Table~\ref{tab:agnostic_det}. This indicates a complementary relationship: high-resolution features (HRNetV2p) drive spatial precision, critical for PitSurg, while the MIC head resolves class confusion among similar instrument tips, which is the primary bottleneck in SurgPose.

\begin{table}[t]
    \centering
    \caption{Detection and pose $AP_{50\text{--}95}$ (in \%) on the (a) PitSurg and (b) SurgPose validation sets for different encoder-head combinations. Results are reported per instrument class and as an overall class-averaged AP for detection (Det) and pose (Pose).}

    \resizebox{\linewidth}{!}{%
    \begin{tabular}{lcccccccc}
    \toprule
    \multirow{2}{*}{Architecture} &
    \multicolumn{8}{c}{(a) PitSurg Dataset - $AP_{50\text{--}95}$ (Det/Pose)} \\
    \cmidrule(lr){2-9}
    & Overall & suction & dural\_scissors & kerrisons &
      retractable\_knife & ring\_curette & pituitary\_rongeurs & cup\_forceps \\
    \midrule
    YOLOv8x-pose~\cite{maji2022yolopose}
      & 59.4 / 63.1
      & 70.6 / 84.5 & 65.1 / 79.5 & 81.6 / 57.5
      & 53.5 / 49.6 & 37.7 / 66.4 & 56.3 / 57.6 & 50.8 / 46.3 \\

    YOLO11x-pose~\cite{ultralytics_yolo11}
      & 54.8 / 62.0
      & 66.5 / 85.2 & 56.7 / 74.2 & 75.9 / 59.1
      & 42.2 / 36.7 & 45.1 / 72.9 & 51.2 / 57.1 & 45.7 / 49.1 \\

    YOLOv8x-pose+MIC
      & 59.6 / 64.9
      & 74.7 / 87.7 & 57.5 / 73.4 & 76.9 / 67.6
      & 57.8 / 54.1 & 48.9 / 76.6 & 50.1 / 44.3 & 51.6 / 51.0 \\

    HRNetV2p-pose
      & 73.3 / 76.5
      & 80.0 / \textbf{92.5} & 72.1 / \textbf{82.3} & 88.2 / 77.4
      & 70.6 / 64.2 & 63.2 / 82.4 & 63.1 / 66.1 & \textbf{75.6} / 70.4 \\

    MID-POSE
      & \textbf{77.5} / \textbf{78.5}
      & \textbf{82.9} / 92.3 & \textbf{76.1} / \textbf{82.3} & \textbf{89.6} / \textbf{81.2}
      & \textbf{78.7} / \textbf{65.6} & \textbf{73.6} / \textbf{86.8} &
        \textbf{68.3} / \textbf{68.2} & 73.3 / \textbf{73.2} \\
    \bottomrule
    \end{tabular}%
    }

    \medskip

    \resizebox{\linewidth}{!}{%
    \begin{tabular}{lccccccc}
    \toprule
    \multirow{2}{*}{Architecture} &
    \multicolumn{7}{c}{(b) SurgPose Dataset - $AP_{50\text{--}95}$ (Det/Pose)} \\
    \cmidrule(lr){2-8}
    & Overall & LND & MND & MicroForceps & Scissor & Forceps & Prograsp \\
    \midrule
    YOLOv8x-pose~\cite{maji2022yolopose}
      & 47.9 / 61.1
      & 74.2 / 82.0 & 31.4 / 30.6 & 26.4 / 36.2
      & 42.4 / 83.9 & 32.7 / 41.4 & \textbf{80.2} / \textbf{92.6} \\

    YOLO11x-pose~\cite{ultralytics_yolo11}
      & 47.2 / 60.9
      & 72.3 / 80.8 & 31.3 / 30.4 & 26.0 / 36.2
      & 41.8 / 83.8 & 32.2 / 41.5 & 79.8 / \textbf{92.6} \\

    YOLOv8x-pose+MIC
      & 59.7 / 68.2
      & 77.6 / 79.3 & 61.9 / 54.5 & 57.3 / 66.0
      & 54.0 / \textbf{95.0} & 39.1 / 43.7 & 68.3 / 70.9 \\

    HRNetV2p-pose
      & 50.3 / 64.9
      & 76.0 / \textbf{84.7} & 41.9 / 45.9 & 47.6 / 55.6
      & 26.9 / 66.0 & 36.5 / 48.8 & 72.9 / 88.3 \\

    MID-POSE
      & \textbf{62.7} / \textbf{71.4}
      & \textbf{79.5} / 81.2 & \textbf{63.4} / \textbf{55.9} & \textbf{59.5} / \textbf{68.5}
      & \textbf{55.2} / \textbf{95.0} & \textbf{48.8} / \textbf{56.0} & 69.6 / 72.1 \\
    \bottomrule
    \end{tabular}%
    }
    \label{tab:results_all}
\end{table}

% \predcell{<image file>}{<first text line>}{<second text line>}
% Typesets one prediction cell: a top-aligned minipage as wide as the
% current \linewidth holding the centred image at full width, followed by
% two centred lines of \tiny text (#2 above #3).
\newcommand{\predcell}[3]{%
  \begin{minipage}[t]{\linewidth}%
    \centering
    \includegraphics[width=\linewidth]{#1}%
    \\[-0.5mm]% pull the text block 0.5mm closer to the image
    \begingroup
      \tiny #2\\#3%
    \endgroup
  \end{minipage}%
}

\begin{figure*}[t]
  \centering
  \scriptsize
  \setlength{\tabcolsep}{1pt}
  \renewcommand{\arraystretch}{0.9}
  \begin{tabular}{@{}*{5}{C}@{}}
    \toprule
    {\tiny \textbf{Ground Truth}} &
    {\tiny \textbf{YOLOv8x-pose}} &
    {\tiny \textbf{YOLOv8x-pose+MIC}} &
    {\tiny \textbf{HRNetV2p-pose}} &
    {\tiny \textbf{MID-POSE}} \\
    \midrule

    % -------- Example 1 --------
    \includegraphics[width=\linewidth]{images/quantitive_PitSurg/gt_1_crop.png} &
    \predcell{images/quantitive_PitSurg/yolo_dense_1_crop.png}{TP = 0, FN = 1, FP = 3}{IoU = 0, OKS = 0} &
    \predcell{images/quantitive_PitSurg/yolo_dual_1_crop.png}{TP = 1, FN = 0, FP = 0}{IoU = 72.6, OKS = 55.5} &
    \predcell{images/quantitive_PitSurg/hrnet_dense_1_crop.png}{TP = 1, FN = 0, FP = 2}{IoU = 84.9, OKS = 84.6} &
    \predcell{images/quantitive_PitSurg/hrnet_dual_1_crop.png}{TP = 1, FN = 0, FP = 0}{IoU = 86.9, OKS = 83.8}
    \\[1mm]

    % -------- Example 2 --------
    \includegraphics[width=\linewidth]{images/quantitive_PitSurg/gt_2_crop.png} &
    \predcell{images/quantitive_PitSurg/yolo_dense_2_crop.png}{TP = 2, FN = 0, FP = 1}{IoU = 49.6, OKS = 76.7} &
    \predcell{images/quantitive_PitSurg/yolo_dual_2_crop.png}{TP = 2, FN = 0, FP = 0}{IoU = 57.9, OKS = 77.3} &
    \predcell{images/quantitive_PitSurg/hrnet_dense_2_crop.png}{TP = 2, FN = 0, FP = 0}{IoU = 96.3, OKS = 84.4} &
    \predcell{images/quantitive_PitSurg/hrnet_dual_2_crop.png}{TP = 2, FN = 0, FP = 0}{IoU = 97.0, OKS = 90.2}
    \\[1mm]

    % -------- Example 3 --------
    \includegraphics[width=\linewidth]{images/quantitive_PitSurg/gt_3_crop.png} &
    \predcell{images/quantitive_PitSurg/yolo_dense_3_crop.png}{TP = 1, FN = 1, FP = 1}{IoU = 44.9, OKS = 49.1} &
    \predcell{images/quantitive_PitSurg/yolo_dual_3_crop.png}{TP = 1, FN = 1, FP = 0}{IoU = 47.9, OKS = 48.7} &
    \predcell{images/quantitive_PitSurg/hrnet_dense_3_crop.png}{TP = 1, FN = 1, FP = 1}{IoU = 45.6, OKS = 49.3} &
    \predcell{images/quantitive_PitSurg/hrnet_dual_3_crop.png}{TP = 1, FN = 0, FP = 0}{IoU = 48.4, OKS = 49.8}
    \\[1mm]

    % -------- Example 4 --------
    \includegraphics[width=\linewidth]{images/quantitive_PitSurg/gt_4_crop.png} &
    \predcell{images/quantitive_PitSurg/yolo_dense_4_crop.png}{TP = 1, FN = 0, FP = 3}{IoU = 88.8, OKS = 81.8} &
    \predcell{images/quantitive_PitSurg/yolo_dual_4_crop.png}{TP = 1, FN = 0, FP = 0}{IoU = 83.7, OKS = 92.5} &
    \predcell{images/quantitive_PitSurg/hrnet_dense_4_crop.png}{TP = 1, FN = 1, FP = 0}{IoU = 90.4, OKS = 95.5} &
    \predcell{images/quantitive_PitSurg/hrnet_dual_4_crop.png}{TP = 1, FN = 0, FP = 0}{IoU = 93.6, OKS = 97.4}
    \\
    \bottomrule

    % ---------- legend row ----------
    \multicolumn{5}{c}{%
      \vspace{1mm}\scriptsize
      \textbf{}\;
      \pitlegend{suction}{suction}\quad
      \pitlegend{dural_scissors}{dural scissors}\quad
      \pitlegend{kerrisons}{kerrisons}\quad
      \pitlegend{retractable_knife}{retractable knife}\quad
      \pitlegend{ring_curette}{ring curette}\quad
      \pitlegend{pituitary_rongeurs}{pituitary rongeurs}\quad
      \pitlegend{cup_forceps}{cup forceps}
    }\\
  \end{tabular}

\caption{Qualitative PitSurg examples with ground truth and predictions. TP/FN/FP counts and mean IoU/OKS are shown below each prediction.}
  \label{fig:pit_qual}
\end{figure*}

\paragraph{Qualitative}
Qualitative examples (Fig.~\ref{fig:pit_qual} and Fig.~\ref{fig:surgpose_qual}) illustrate the distinct roles of the encoder and the MIC head. Across both datasets, the HRNetV2p encoder enhances localization; it produces bounding boxes that tightly follow instrument shafts and captures occluded parts (e.g., ring curette) where the baseline often over-extends into background tissue. In contrast, the MIC head primarily improves semantic consistency. It suppresses background false positives and corrects mislabeled bounding boxes, specifically in SurgPose, where single-head baselines struggle to distinguish instruments with similar end-effectors. While the encoder improves per-example IoU and OKS through richer spatial features, the MIC head ensures that geometrically similar bounding boxes receive coherent class labels and higher confidence scores, driving the AP gains through true positive recovery rather than further spatial refinement.

\begin{figure*}[t]
  \centering
  \scriptsize
  \setlength{\tabcolsep}{1pt}
  \renewcommand{\arraystretch}{0.9}
  \begin{tabular}{@{}*{5}{C}@{}}
    \toprule
    {\tiny \textbf{Ground Truth}} &
    {\tiny \textbf{YOLOv8x-pose}} &
    {\tiny \textbf{YOLOv8x-pose+MIC}} &
    {\tiny \textbf{HRNetV2p-pose}} &
    {\tiny \textbf{MID-POSE}} \\
    \midrule

    % -------- Example 1 --------
    \includegraphics[width=\linewidth]{images/quantitive_SurgPose/gt_1_sp} &
    \predcell{images/quantitive_SurgPose/yolo_dense_1_sp}{TP = 1, FN = 1, FP = 2}{IoU = 47.3, OKS = 47.6} &
    \predcell{images/quantitive_SurgPose/yolo_dual_1_sp}{TP = 2, FN = 0, FP = 0}{IoU = 84.9, OKS = 98.0} &
    \predcell{images/quantitive_SurgPose/hrnet_dense_1_sp}{TP = 2, FN = 0, FP = 0}{IoU = 89.9, OKS = 98.9} &
    \predcell{images/quantitive_SurgPose/hrnet_dual_1_sp}{TP = 2, FN = 0, FP = 0}{IoU = 85.0, OKS = 98.8}
    \\[1mm]

    % -------- Example 2 --------
    \includegraphics[width=\linewidth]{images/quantitive_SurgPose/gt_2_sp} &
    \predcell{images/quantitive_SurgPose/yolo_dense_2_sp}{TP = 1, FN = 1, FP = 2}{IoU = 38.4, OKS = 45.8} &
    \predcell{images/quantitive_SurgPose/yolo_dual_2_sp}{TP = 2, FN = 0, FP = 0}{IoU = 96.9, OKS = 96.7} &
    \predcell{images/quantitive_SurgPose/hrnet_dense_2_sp}{TP = 2, FN = 0, FP = 1}{IoU = 96.5, OKS = 93.9} &
    \predcell{images/quantitive_SurgPose/hrnet_dual_2_sp}{TP = 2, FN = 0, FP = 0}{IoU = 97.0, OKS = 98.2}
    \\[1mm]

    % -------- Example 3 --------
    \includegraphics[width=\linewidth]{images/quantitive_SurgPose/gt_3_sp} &
    \predcell{images/quantitive_SurgPose/yolo_dense_3_sp}{TP = 2, FN = 0, FP = 1}{IoU = 91.3, OKS = 82.7} &
    \predcell{images/quantitive_SurgPose/yolo_dual_3_sp}{TP = 1, FN = 1, FP = 0}{IoU = 94.3, OKS = 92.8} &
    \predcell{images/quantitive_SurgPose/hrnet_dense_3_sp}{TP = 1, FN = 1, FP = 2}{IoU = 48.6, OKS = 49.3} &
    \predcell{images/quantitive_SurgPose/hrnet_dual_3_sp}{TP = 2, FN = 0, FP = 0}{IoU = 94.4, OKS = 92.9}
    \\[1mm]
    \bottomrule

    % ---------- legend row ----------
    \multicolumn{5}{c}{%
      \vspace{1mm}\scriptsize
      \textbf{}\;
      \pitlegend{LNDcolor}{LND}\quad
      \pitlegend{MNDcolor}{MND}\quad
      \pitlegend{MicroForcepscolor}{MicroForceps}\quad
      \pitlegend{Scissorcolor}{Scissor}\quad
      \pitlegend{Forcepscolor}{Forceps}\quad
      \pitlegend{Prograspcolor}{Prograsp}
    }\\
  \end{tabular}
  \caption{Qualitative SurgPose examples with ground truth and predictions. TP/FN/FP counts and mean IoU/OKS per image are reported below each prediction.}
  \label{fig:surgpose_qual}
\end{figure*}

\paragraph{Sensitivity to weighting parameter $\lambda_{\text{mic}}$}
The sensitivity study shown in Table~\ref{tab:influence_weighting} investigates the influence of the weighting factor $\lambda_{\text{mic}}$ on the detection and pose performance of dual-head models on both the PitSurg and SurgPose datasets. One can observe that lower $\lambda_{\text{mic}}$ values result in higher performance on PitSurg, while higher $\lambda_{\text{mic}}$ values favour SurgPose. Importantly, across the tested range $\lambda_{\text{mic}} \in [10.0, 17.0]$, MID-POSE consistently outperforms YOLOv8x-pose+MIC on both datasets, indicating that our qualitative conclusions are not sensitive to a narrowly tuned choice of loss weight.

\begin{table}[ht]
\centering
\caption{Sensitivity to weighting parameter \(\lambda_{\text{mic}}\).}
\label{tab:influence_weighting}
\scriptsize
\setlength{\tabcolsep}{3pt}
\renewcommand{\arraystretch}{1.15}
\resizebox{\linewidth}{!}{%
\begin{tabular}{l*{8}{D}}
\toprule
 &
\multicolumn{4}{c}{PitSurg Dataset - $mAP_{50\text{--}95}$ (Det/Pose)} &
\multicolumn{4}{c}{SurgPose Dataset - $mAP_{50\text{--}95}$ (Det/Pose)} \\
\cmidrule(lr){2-5}\cmidrule(lr){6-9}
Architecture $\downarrow$ / $\lambda_{\text{mic}}$ $\rightarrow$ & 10.0 & 12.5 & 15.0 & 17.0 & 10.0 & 12.5 & 15.0 & 17.0 \\
\midrule
YOLOv8x-pose+MIC & 59.6/64.9 & 59.4/64.8 & 58.8/64.2 & 58.1/63.6 & 56.4/64.8 & 58.7/67.2 & 59.1/67.4 & 59.7/68.2 \\
MID-POSE & 77.5/78.5 & 77.2/78.2 & 76.6/77.4 & 75.9/76.8 & 59.3/68.4 & 60.9/69.8 & 62.1/70.9 & 62.7/71.4 \\
\bottomrule
\end{tabular}%
}
\end{table}

\paragraph{Sensitivity to instrumentness threshold $\tau$}
Table~\ref{tab:influence_threshold} reports the False Positives (FP) and False Negatives (FN) trade-off under different instrumentness thresholds $\tau \in \{0.1, 0.3, 0.5\}$, together with success rates defined as $\Pr(\mathrm{IoU} \ge 0.5)$ for detection and $\Pr(\mathrm{OKS} \ge 0.5)$ for pose. Across both PitSurg and SurgPose, $\tau=0.3$ provides a practical operating point, as it yields a large reduction in false positives (i.e., ``ghost'' detections) while incurring only a marginal increase in false negatives. Additionally, MID-POSE demonstrates reduced threshold sensitivity compared with YOLOv8x-pose. Varying $\tau$ from $0.1$ to $0.5$ in MID-POSE maintains success rates above $90\%$ on PitSurg and above $79\%$ on SurgPose, while YOLOv8x-pose is much more sensitive, with success rates decreasing from above $90\%$ to roughly $60$--$71\%$. This suggests MID-POSE delivers more consistent performance under threshold shifts for deployment in distractor-heavy surgical scenes.

\begin{table}[ht]
\centering
\caption{Sensitivity to instrumentness threshold $\tau$.}
\label{tab:influence_threshold}
\scriptsize
\setlength{\tabcolsep}{5pt}
\renewcommand{\arraystretch}{1.1}

\resizebox{0.9\textwidth}{!}{%
\begin{tabular}{@{} l l cc cc @{}}
\toprule
\multirow{2}{*}{$\tau$} & \multirow{2}{*}{Metric}
& \multicolumn{2}{c}{PitSurg} & \multicolumn{2}{c}{SurgPose} \\
\cmidrule(lr){3-4}\cmidrule(lr){5-6}
& & YOLOv8x-pose & MID-POSE & YOLOv8x-pose & MID-POSE \\
\midrule

\multirow{2}{*}{0.1}
& FN / FP    & 21 / 1519   & 18 / 68     & 13 / 51499  & 17 / 28 \\
& IoU / OKS* & 94.8 / 96.2 & 94.3 / 93.9 & 94.7 / 94.7 & 84.7 / 84.7 \\
\addlinespace

\multirow{2}{*}{0.3}
& FN / FP    & 26 / 1054   & 21 / 27     & 45 / 14171  & 18 / 25 \\
& IoU / OKS* & 85.7 / 87.0 & 92.5 / 92.2 & 79.6 / 79.5 & 83.0 / 83.0 \\
\addlinespace

\multirow{2}{*}{0.5}
& FN / FP    & 81 / 214    & 42 / 22     & 815 / 2906  & 35 / 21 \\
& IoU / OKS* & 60.5 / 71.4 & 90.7 / 90.1 & 69.5 / 69.5 & 79.8 / 79.8 \\
\bottomrule
\end{tabular}%
}

\vspace{1mm}
\footnotesize
\emph{*Detection / pose success rate, defined as Pr(IoU $\geq 0.5$) / Pr(OKS $\geq 0.5$), in \%.}
\end{table}

\paragraph{Discussion}
Overall, PitSurg and SurgPose highlight complementary strengths of the proposed architecture: HRNetV2p mainly improves spatial precision, whereas the MIC head addresses fine-grained semantic ambiguities between visually similar instruments. Despite the higher training memory footprint of HRNetV2p, the MIC module itself is lightweight, containing 2.18M parameters compared with 7.38M in the baseline YOLOv8x-pose classification head. This suggests that the semantic gains are driven by object-centric feature extraction from multi-level RoIs. Our systematic failure analysis (Appendix~\ref{app:failure_analysis}) quantifies these improvements across condition categories in both PitSurg and SurgPose, showing that the observed gains are not driven only by easier instances. On PitSurg (Table~\ref{tab:pitsurg_failure_rates_by_category}), MID-POSE eliminates failures in the Blur, Specular Highlights, and Clean categories, and substantially improves Partial Occlusion (6.75\% $\rightarrow$ 0.78\%). The most difficult categories for YOLOv8x-pose are Heavy Occlusion (40.91\%), Border Truncation (22.50\%), and Extreme Perspectives (19.10\%). MID-POSE reduces these failure rates to 10.00\%, 6.25\%, and 13.48\%, respectively. Figure~\ref{fig:failure_audit_pitsurg} further audits these categories through systematically selected worst-case examples. On SurgPose (Table~\ref{tab:surgpose_failure_rates_by_condition}), YOLOv8x-pose fails in 30.10\%, 43.60\%, and 20.70\% of instances under the Semantic Ambiguity, Extreme Perspectives, and Clean categories, respectively, whereas MID-POSE reduces these to 7.35\%, 10.00\%, and 0.36\%. Figure~\ref{fig:failure_audit_surgpose} provides corresponding worst-case qualitative evidence for representative failure categories in SurgPose.



\section{Conclusion}
We presented MID-POSE, a dual-head architecture for multi-class surgical instrument detection and 2D keypoint pose estimation in minimally invasive surgery, together with the PitSurg dataset of endoscopic pituitary procedures with class-specific 2D keypoint annotations. By combining a high-resolution HRNetV2p encoder, a class-agnostic dense detection-pose head, and a MIC head operating on RoI-aligned features, MID-POSE consistently improves Det/Pose $AP_{50\text{--}95}$ over strong YOLOv8x-pose baselines on both PitSurg and the robotic SurgPose benchmark, with particularly large gains for visually similar instruments and under occlusion. Qualitative results confirm that high-resolution features mainly enhance localisation and pose accuracy, whereas the MIC head resolves fine-grained class ambiguities and suppresses background false positives. 
Future work will explore spatio-temporal modelling of instrument appearance and motion across video frames to improve robustness, weaker forms of supervision to reduce reliance on dense annotations, and integration into real-time surgical assistance systems.

% Acknowledgments---Will not appear in anonymized version

\midlacknowledgments{This work was supported in whole, or in part, by the UCL Hawkes Institute -- formerly WEISS (203145/Z/16/Z); the EPSRC (EP/Y01958X/1, EP/W00805X/1, EP/Z534754/1); UKRI [UKRI145] grants; DZK is supported by the NIHR Academic Clinical Fellowship; HJM is supported by the WEISS [NS/A000050/1] and by the NIHR Biomedical Research Centre at UCL, has shares in and is employed by Panda Surgical Ltd.; DS is supported by the Department of Science, Innovation and Technology (DSIT), and the Royal Academy of Engineering under the Chair in Emerging Technologies programme. For the purpose of open access, the author has applied a Creative Commons Attribution (CC BY) licence to any Author Accepted Manuscript version arising.}

\bibliography{midl26_89}

\appendix

\section{Additional Details on the PitSurg Dataset}
\label{app:datasheet}
PitSurg acquisition details and dataset size are presented in Table~\ref{tab:pitsurg-specs}, while Figure~\ref{fig:pitsurg-tags-pie} summarises the distribution of instrument-level visual condition categories.

\begin{table}[H]
\centering
\caption{PitSurg dataset specifications. Summary of acquisition settings and dataset size.}
\label{tab:pitsurg-specs}
\renewcommand{\arraystretch}{1.15}
\setlength{\tabcolsep}{8pt}
\resizebox{0.8\linewidth}{!}{%
\begin{tabular}{ll}
\hline
\multicolumn{2}{c}{\textbf{Acquisition details}} \\
\hline
Endoscope & Hopkins Telescope with AIDA storage \\
Manufacturer & Karl Storz Endoscopy, UK \\
Native Resolution & 720p \\
Native Frame Rate & 24 FPS \\
Sampling Rate & 1 FPS for annotation \\
\hline
\multicolumn{2}{c}{\textbf{Size of annotated data}} \\
\hline
Procedures & 26 clinical videos \\
Instrument Types & 7 classes \\
Total Instances & 4034 instances (3319 Train / 715 Val) \\
\hline
\end{tabular}%
}
\end{table}

\begin{figure}[H]
  \centering
  \includegraphics[width=\linewidth]{images/pie.pdf}
  \caption{Distribution of instrument-level visual condition categories in PitSurg. Categories include clean instances and degraded regimes, namely partial/heavy occlusion, border truncation, blur, specular highlights, and extreme viewpoints; percentages and counts are shown for each category.}
  \label{fig:pitsurg-tags-pie}
\end{figure}

\section{Systematic Failure Analysis}
\label{app:failure_analysis}
Tables~\ref{tab:pitsurg_failure_rates_by_category} and~\ref{tab:surgpose_failure_rates_by_condition} report instance-level failure rates of YOLOv8x-pose and MID-POSE across different visual condition categories. Table~\ref{tab:pitsurg_failure_rates_by_category} summarises results on PitSurg, while Table~\ref{tab:surgpose_failure_rates_by_condition} reports the analogous breakdown on SurgPose. The GT Count ($n$) column indicates the number of ground-truth instrument instances in each category in the validation set.

\paragraph{Failure rate}
We compute failure rates per category using one-to-one matching between predictions and ground-truth instances at IoU $\geq$ 0.5. A ground-truth instance is counted as a failure if it is unmatched or if it is matched but the predicted keypoints do not meet the pose criterion (OKS $< 0.5$). The failure rate for a category is the fraction of ground-truth instances in that category that fail under this definition.

\paragraph{Observations}
On PitSurg (Table~\ref{tab:pitsurg_failure_rates_by_category}), MID-POSE eliminates failures in the Blur, Specular highlights, and Clean categories (0.00\%), and improves Partial Occlusion (6.75\% $\rightarrow$ 0.78\%). YOLOv8x-pose fails most under Heavy Occlusion (40.91\%), Border Truncation (22.50\%), and Extreme Perspectives (19.10\%). MID-POSE reduces these to 10.00\%, 6.25\%, and 13.48\%, respectively. Remaining failures are dominated by complete overlap, severe low-contrast truncation, and rare-viewpoint confusion (pituitary rongeurs vs.\ cup forceps); worst-case examples are shown in Figure~\ref{fig:failure_audit_pitsurg}.
On SurgPose (Table~\ref{tab:surgpose_failure_rates_by_condition}), which provides a complementary benchmark with different visual characteristics, YOLOv8x-pose fails in $30.10\%$, $43.60\%$, and $20.70\%$ of instances under the Semantic Ambiguity, Extreme Perspectives, and Clean categories, respectively, while MID-POSE reduces these to $7.35\%$, $10.00\%$, and $0.36\%$. This consistent reduction across categories supports the stability of MID-POSE's robustness gains. Remaining errors are dominated by residual class confusion under rare viewpoints, especially between LND and Forceps (Fig.~\ref{fig:failure_audit_surgpose}).


\begin{table}[H]
  \centering
  \caption{Failure rates of YOLOv8x-pose and MID-POSE across instrument-level visual condition categories in the PitSurg validation set.}
  \setlength{\tabcolsep}{6pt}
  \renewcommand{\arraystretch}{1.0}
  \scriptsize
  \resizebox{\textwidth}{!}{%
  \begin{tabular}{l c c c}
    \toprule
    Condition Category & GT Count ($n$) & YOLOv8x-pose Failure Rate & MID-POSE Failure Rate \\
    \midrule
    Partial Occlusion      & 385 & 6.75\%  & 0.78\%  \\
    Heavy Occlusion        & 110 & 40.91\% & 10.00\% \\
    Border Truncation      & 80  & 22.50\% & 6.25\%  \\
    Extreme Perspectives   & 89  & 19.10\% & 13.48\% \\
    Blur                   & 16  & 12.50\% & 0.00\%  \\
    Specular highlights    & 12  & 8.33\%  & 0.00\%  \\
    Clean                  & 23  & 4.35\%  & 0.00\%  \\
    \midrule
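    % NOTE(review): totals recomputed from the per-category rows above
    % (26+45+18+17+2+1+1 = 110 and 3+11+5+12 = 31 failures out of 715); confirm against raw counts.
    TOTAL                  & 715 & 15.38\% & 4.34\%  \\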
    \bottomrule
  \end{tabular}%
  }
  \label{tab:pitsurg_failure_rates_by_category}
\end{table}

\begin{table}[H]
  \centering
  \caption{Failure rates of YOLOv8x-pose and MID-POSE across instrument-level visual condition categories in the SurgPose validation set.}
  \setlength{\tabcolsep}{6pt}
  \renewcommand{\arraystretch}{1.1}
  \scriptsize
  \resizebox{\textwidth}{!}{%
  \begin{tabular}{l c c c}
    \toprule
    Condition Category & GT Count ($n$) & YOLOv8x-pose Failure Rate & MID-POSE Failure Rate \\
    \midrule
    Semantic Ambiguity      & 8,200  & 30.10\% & 7.35\% \\
    Extreme Perspectives    & 5,741  & 43.60\% & 10.00\% \\
    Clean                   & 10,371 & 20.70\% & 0.36\% \\
    \midrule
    TOTAL                   & 24,312 & 29.30\% & 5.00\% \\
    \bottomrule
  \end{tabular}%
  }
  \label{tab:surgpose_failure_rates_by_condition}
\end{table}

\begin{figure}[H]
  \centering
  \setlength{\tabcolsep}{2pt}
  \renewcommand{\arraystretch}{0.9}
  \begin{tabular}{@{} *{4}{F} @{}}
    \toprule
    Partial Occlusion & Heavy Occlusion & Border Truncation & Extreme Perspectives \\
    \midrule

    % ---------------- GT ----------------
    \includegraphics[width=0.99\linewidth]{images/failure_PitSurg/gt_part_fail.jpg} & 
    \includegraphics[width=0.99\linewidth]{images/failure_PitSurg/gt_heavy_occlude.jpg} & 
    \includegraphics[width=0.99\linewidth]{images/failure_PitSurg/gt_border_fail.jpg} & 
    \includegraphics[width=0.99\linewidth]{images/failure_PitSurg/gt_rare_fail.jpg} \\
    {\scriptsize GT} & {\scriptsize GT} & {\scriptsize GT} & {\scriptsize GT} \\

    \addlinespace[1pt]

    % ---------------- YOLOv8x-pose ----------------
    \includegraphics[width=0.99\linewidth]{images/failure_PitSurg/base_part_fail.jpg} & 
    \includegraphics[width=0.99\linewidth]{images/failure_PitSurg/base_heavy_occlude.jpg} & 
    \includegraphics[width=0.99\linewidth]{images/failure_PitSurg/base_border_fail.jpg} & 
    \includegraphics[width=0.99\linewidth]{images/failure_PitSurg/base_rare_fail.jpg} \\
    {\scriptsize YOLOv8x-pose} & {\scriptsize YOLOv8x-pose} & {\scriptsize YOLOv8x-pose} & {\scriptsize YOLOv8x-pose} \\

    \addlinespace[1pt]

    % ---------------- MID-POSE ----------------
    \includegraphics[width=0.99\linewidth]{images/failure_PitSurg/mid_part_fail.jpg} & 
    \includegraphics[width=0.99\linewidth]{images/failure_PitSurg/mid_heavy_occlude.jpg} & 
    \includegraphics[width=0.99\linewidth]{images/failure_PitSurg/mid_border_fail.jpg} & 
    \includegraphics[width=0.99\linewidth]{images/failure_PitSurg/mid_rare_fail.jpg} \\
    {\scriptsize MID-POSE} & {\scriptsize MID-POSE} & {\scriptsize MID-POSE} & {\scriptsize MID-POSE} \\

    \midrule
    \multicolumn{4}{c}{%
      \vspace{-1mm}
      \scriptsize
      \pitlegend{suction}{suction}\quad
      \pitlegend{dural_scissors}{dural scissors}\quad
      \pitlegend{kerrisons}{kerrisons}\quad
      \pitlegend{retractable_knife}{retractable knife}\quad
      \pitlegend{ring_curette}{ring curette}\quad
      \pitlegend{pituitary_rongeurs}{pituitary rongeurs}\quad
      \pitlegend{cup_forceps}{cup forceps}
    }\\
    \bottomrule
  \end{tabular}
  
  \caption{Systematic failure audit in PitSurg across instrument-level visual condition categories. Ground-truth (GT) overlays are shown alongside predictions from YOLOv8x-pose and MID-POSE for representative cases of partial occlusion, heavy occlusion, border truncation, and extreme perspectives.}
  
  \label{fig:failure_audit_pitsurg}
\end{figure}

\begin{figure}[H]
  \centering

  \resizebox{0.85\linewidth}{!}{%
  \begin{tabular}{@{} *{2}{E} @{}}
    \toprule
    Semantic Ambiguity & Extreme Perspectives \\
    \midrule

    \includegraphics[width=0.90\linewidth]{images/failure_SurgPose/ps_gt_sim_fail.jpg} & 
    \includegraphics[width=0.90\linewidth]{images/failure_SurgPose/ps_gt_rare_fail.jpg} \\
    {\scriptsize GT} & {\scriptsize GT} \\

    \addlinespace[1pt]

    \includegraphics[width=0.90\linewidth]{images/failure_SurgPose/ps_base_sim_fail.jpg} & 
    \includegraphics[width=0.90\linewidth]{images/failure_SurgPose/ps_base_rare_fail.jpg} \\
    {\scriptsize YOLOv8x-pose} & {\scriptsize YOLOv8x-pose} \\

    \addlinespace[1pt]

    \includegraphics[width=0.90\linewidth]{images/failure_SurgPose/ps_mid_sim_fail.jpg} & 
    \includegraphics[width=0.90\linewidth]{images/failure_SurgPose/ps_mid_rare_fail.jpg} \\
    {\scriptsize MID-POSE} & {\scriptsize MID-POSE} \\

    \midrule
    \multicolumn{2}{c}{%
      \vspace{1mm}\scriptsize
      \pitlegend{LNDcolor}{LND}\quad
      \pitlegend{MNDcolor}{MND}\quad
      \pitlegend{MicroForcepscolor}{MicroForceps}\quad
      \pitlegend{Scissorcolor}{Scissor}\quad
      \pitlegend{Forcepscolor}{Forceps}\quad
      \pitlegend{Prograspcolor}{Prograsp}
    }\\
    \bottomrule
  \end{tabular}
  }% end resizebox

  \caption{Systematic failure audit in SurgPose for representative condition categories. Ground-truth (GT) overlays are shown alongside predictions from YOLOv8x-pose and MID-POSE for representative cases of semantic ambiguity and extreme perspectives.}

  
  \label{fig:failure_audit_surgpose}
\end{figure}

\section{Class-Agnostic Performance}
Table~\ref{tab:agnostic_det} reports class-agnostic results, which isolate the contribution of high-resolution feature maps from the effects of the classification design. In the PitSurg dataset, replacing the YOLOv8x encoder with HRNetV2p results in a substantial gain in detection and pose mAP from 46.3\% / 67.5\% to 84.6\% / 84.5\%, confirming that high-resolution features are the primary driver for instrument localization and pose estimation in narrow anatomical corridors. In SurgPose, the improvements are more subtle but consistent: the YOLOv8x encoder is already strong at 92.2\% / 95.3\%, since SurgPose involves robotic instruments in a more uniform field of view. Finally, comparing HRNetV2p-pose to MID-POSE shows that adding the MIC head provides a slight refinement by suppressing false positive detections. This demonstrates that while the encoder provides the spatial foundation, the dual-head design ensures semantic consistency.

\begin{table}[H]
\centering
\caption{Class-agnostic performance comparison across backbones.}
\label{tab:agnostic_det}
\resizebox{\textwidth}{!}{%
\begin{tabular}{@{}lcc@{}}
\toprule
Architecture & PitSurg - $mAP_{50\text{--}95}$ (Det/Pose) & SurgPose - $mAP_{50\text{--}95}$ (Det/Pose) \\
\midrule
YOLOv8x-pose   & 46.3\% / 67.5\% & 92.2\% / 95.3\% \\
HRNetV2p-pose  & 84.6\% / 84.5\% & 93.5\% / 96.5\% \\
MID-POSE       & 85.7\% / 85.0\% & 94.3\% / 96.8\% \\
\bottomrule
\end{tabular}%
}
\end{table}


\end{document}