\documentclass{midl}

\usepackage{mwe}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{booktabs}
\usepackage{enumitem}
\usepackage{multirow}

\jmlryear{2025}
\jmlrworkshop{Full Paper -- MIDL 2025}
\jmlrvolume{-- 129}
\editors{Accepted for publication at MIDL 2025}

\title[JHU-VPT(JEPA): Cataract]{A Vision Foundation Model for Cataract Surgery Using Joint-Embedding Predictive Architecture}

\midlauthor{
    \Name{Nisarg A. Shah\midljointauthortext{Contributed equally}\nametag{$^{1}$}} \Email{snisarg812@gmail.com}\\
    \addr $^{1}$ Johns Hopkins University, Baltimore, USA \AND
    \Name{Mingze Xia\midlotherjointauthor\nametag{$^{1}$}} \Email{mxia8@jhu.edu}\\
    \Name{Subhasri Vijay\nametag{$^{1}$}} \Email{svijay2@jhu.edu}\\
    \Name{Shameema Sikder\nametag{$^{2,3}$}} \Email{ssikder1@jhmi.edu}\\
    \addr $^{2}$ Malone Center for Engineering in Healthcare, Baltimore, USA\\
    \addr $^{3}$ Wilmer Eye Institute, Johns Hopkins University, Baltimore, USA \AND
    \Name{S. Swaroop Vedula\nametag{$^{2}$}} \Email{swaroop@jhu.edu}\\
    \Name{Vishal M. Patel\nametag{$^{1}$}} \Email{vpatel36@jhu.edu}
}

\begin{document}

\maketitle

\begin{abstract}
Vision foundation models can automate surgical video analysis, enabling applications that support patient care and training. For cataract surgery, existing models are limited by small datasets, privacy concerns, and poor generalizability.~
In this paper, we introduce {JHU-VPT(JEPA)}, a self-supervised vision foundation model leveraging Joint-Embedding Predictive Architecture (JEPA) to learn spatiotemporal representations via latent feature prediction on a large corpus of unlabeled cataract videos, without requiring extensive labeled datasets or pixel-level reconstruction.~
JHU-VPT(JEPA) is pretrained on 2591 videos from multiple sites, capturing diverse surgical techniques and styles.
Comprehensive evaluations on step recognition, surgical feedback, and skill assessment tasks show JHU-VPT(JEPA) outperforms existing methods. Its effectiveness is evident even with attentive probing using a frozen encoder, highlighting feature robustness and addressing privacy by not needing raw video access for downstream tasks.
Our approach offers a scalable, generalizable, and privacy-preserving solution for surgical video analysis, with significant potential to advance patient care and surgical education.
\end{abstract}

\begin{keywords}
Surgical Pretraining, Joint Embedding Predictive Network, Cataract Surgery
\end{keywords}

\section{Introduction}
Vision foundation models analyzing surgical videos can substantially impact global patient care. Intraoperative surgical videos offer rich data for algorithms enabling critical applications like activity recognition, and skill/feedback prediction for surgeon learning and evaluation~\cite{maier2017surgical, yu2019assessment, padoy2019machine, shah2025step}. Surgical data science has accelerated models for analyzing surgical videos. However, current models face constraints like small/convenience datasets~\cite{shah2023glsformer}, limited evaluation, and poor generalizability~\cite{lecuyer2020assisted, padoy2019machine, funke2019video}. While foundation models are rapidly being trained for applications in several domains~\cite{kang2023deblurring, lu2023multi}, vision foundation models pretrained on large surgical video datasets have not yet been developed.

Self-supervised learning (SSL) has emerged as a powerful paradigm to leverage large corpora of unlabeled video data and train vision foundation models. Traditional SSL methods for medical imaging often involve multimodal cues, e.g., textual radiology reports paired with X-ray images~\cite{boecking2022making,moon2022multi}. By contrast, surgical videos typically lack accompanying text annotations, necessitating visual self-supervised schemes. To address this, we propose a self-supervised approach for complex spatio-temporal surgical video information. Building on the Joint-Embedding Predictive Architecture (JEPA)~\cite{assran2023self, weimann2024self}, our method focuses on \emph{feature prediction} in latent space, a strategy that captures both spatio-temporal coherence and surgical scene semantics without requiring direct pixel-level reconstruction.

Unlike prior self-supervised strategies that primarily rely on contrastive learning or masked autoencoders (MAEs) \cite{he2022masked,tong2022videomae,shah2025csmae},~
our approach masks tokens and predicts them in a latent feature space rather than reconstructing raw pixel values. This reduces the effect of low-level artifacts such as reflections and blur, while emphasizing higher-level semantic details that are crucial for tasks like recognition, feedback, and skill assessment. By eliminating the need for a pixel decoder and a separate reconstruction loss, the learning process is simplified to focus solely on meaningful feature extraction. Moreover, the use of an exponential moving average (EMA) stabilizes training by reducing gradient noise, which is particularly important in surgical video analysis where lighting changes and rapid motion clips can disrupt learning.
We develop our model, \emph{JHU-VPT(JEPA):Cataract}, which we refer to as JHU-VPT(JEPA), by pretraining on a large corpus of cataract surgery videos including multiple sites and surgeons. The dataset diversity allows the learning of domain-robust embeddings. The resultant representations can be shared more readily than raw videos (protecting patient privacy), and they excel in label-scarce scenarios, reducing the need for extensive manual annotations and data-hungry fine-tuning protocols.
We comprehensively evaluate JHU-VPT(JEPA)’s learned embeddings on three key tasks: (1) \textbf{Step Recognition}, wherein the aim is to identify surgical steps or phases; (2) \textbf{Surgical Feedback}, to predict specific performance feedback for the surgeon; and (3) \textbf{Skill Assessment}, which is essential for both surgeon training and credentialing. By varying the size of the annotated subsets used for fine-tuning, we show that JHU-VPT(JEPA) achieves strong performance even with limited labels, highlighting its data efficiency. Furthermore, we validate cross-domain generalization by testing on previously unseen videos, demonstrating JHU-VPT(JEPA)’s capacity to adapt to new surgical styles or camera configurations.

\paragraph{Contributions.} In summary, our main contributions are:
\begin{itemize}[leftmargin=1.5em]
    \item \textbf{JEPA-based approach for cataract videos.} We introduce JHU-VPT(JEPA), a novel architecture for surgical video analysis employing latent space feature prediction to learn rich spatio-temporal representations. These are validated via attentive probing with a frozen encoder, confirming high transferability and effectiveness without fine-tuning.
    \item \textbf{Large-scale video pretraining using a large dataset.} We use an extensive, multi-institution unlabeled surgical video dataset, ensuring robust, domain-generalizable embeddings.~
    \item \textbf{Comprehensive downstream evaluation.} We test JHU-VPT(JEPA) on three important tasks---step recognition, surgical feedback, and skill assessment---showing notable gains with varying labeled data, underscoring its potential clinical utility.
\end{itemize}

\section{JHU-VPT(JEPA): Cataract model}
In this section, we describe our proposed \emph{Vision Foundation Model for Cataract Surgery} \textbf{JHU-VPT(JEPA)}, which builds upon the JEPA principle~\cite{garrido2024learning, bardes2024revisiting} for learning rich, robust visual representations from cataract surgery videos. Our goal is to exploit \emph{feature prediction} as a stand-alone objective, enabling the model to learn meaningful spatio-temporal embeddings without extra supervision. A high-level overview of JHU-VPT(JEPA) is shown in Figure~\ref{fig:model_arc}.

\subsection{Overview}
\label{subsec:overview}
At the core of feature prediction as a stand-alone objective, the model learns by predicting the representation of a target input \(\mathbf{y}\) from the representation of a context input \(\mathbf{x}\). Specifically, an encoder \(E_\psi(\cdot)\) projects \(\mathbf{x}\) into latent space, while a predictor \(P_\phi(\cdot)\) attempts to recover the embedding of \(\mathbf{y}\) given \(\mathbf{x}\). A conditioning variable \(\delta\), indicating the transformation or corruption that links \(\mathbf{x}\) and \(\mathbf{y}\), guides the predictor to generate distinct outputs for different transformations. In our setting, \(\mathbf{x}\) and \(\mathbf{y}\) are disjoint spatio-temporal patches from a surgical clip, and \(\delta\) encodes the masking pattern (or offset) between these two regions.

\begin{figure*}[htp!]
  \centering
  \includegraphics[width=0.55\linewidth]{JEPA_v2.pdf}
  \vskip -10pt
  \caption{Overview of the JHU-VPT(JEPA) architecture. The framework consists of Block Masking, an Encoder, a Predictor, and an EMA-updated Target Encoder. The Encoder processes the non-masked tokens, predicting their feature representations. The Predictor combines these representations with learnable mask tokens and a conditioning variable to predict the embeddings of masked regions. The Target Encoder encodes all tokens, generating target embeddings for the feature-prediction loss.}
\label{fig:model_arc}
\end{figure*}

\subsection{Training Objective}
\label{subsec:training_objective}
To learn robust representations, we train the visual encoder \(E_\psi(\cdot)\) and the predictor \(P_\phi(\cdot)\) via a feature-prediction loss. Let the \emph{context} region \(\mathbf{x}\) and \emph{target} region \(\mathbf{y}\) be two non-overlapping subsets of video tokens from a video \(\mathbf{X}\), selected according to a masking scheme (see Section~\ref{subsec:masking}). We define the loss function to encourage the predicted representation of \(\mathbf{y}\) to match the actual representation of \(\mathbf{y}\), generated by a \emph{target encoder} \(E_{\bar{\psi}}(\cdot)\). Concretely, we minimize:
\begin{equation}
\label{eq:loss_l1}
\min_{\psi,\phi} \; \big\| P_\phi\big(E_\psi(\mathbf{x}),\, \delta\big) \;-\; \mathrm{sg}\!\big(E_{\bar{\psi}}(\mathbf{y})\big) \big\|_1,
\end{equation}
where \(\mathrm{sg}(\cdot)\) is a stop-gradient blocking updates to \(E_{\bar{\psi}}(\cdot)\). In practice, \(\bar{\psi}\) is maintained as an exponential moving average (EMA) of \(\psi\), consistent with prior work that mitigates representation collapse~\cite{garrido2024learning}.~
Using L1 loss and a stop-gradient on the target encoder prevents trivial solutions (i.e., feature collapse) by forcing the encoder and predictor to capture meaningful spatio-temporal information in the surgical video.

\paragraph{Collapse Prevention.}
Combining an EMA target encoder, a stop-gradient, and a predictor prevents representation collapse in various self-supervised contexts~\cite{grill2020bootstrap,assran2023self}. Intuitively, \(\bar{\psi}\) changes more slowly than \(\psi\), compelling \(E_\psi(\mathbf{x})\) to capture detailed information needed by \(P_\phi(\cdot)\) to match the slowly evolving target representation. This strategy drives the encoder to encode distinct semantic cues (e.g., instruments, ocular structures, movements) rather than collapsing to constant outputs.

\subsection{Prediction Task and Masking Strategy}
\label{subsec:masking}
We implement the feature-prediction objective using a masked modeling approach. Each video clip is partitioned into 3D tokens, and large continuous blocks are sampled to form the masked regions \(\mathbf{y}\); the remaining tokens constitute the visible regions \(\mathbf{x}\). Applying large or continuous masks across time creates a challenging prediction task, encouraging the model to capture dynamic interactions between surgical instruments and ocular tissue.

To achieve this, we use multi-block masking~\cite{bardes2024revisiting}. First, short-range masks involve sampling several small blocks (e.g., 8) that cover about 15\% of each frame, applied consistently across all frames. This forces the model to rely on temporal cues to infer fine-grained details and quick instrument movements. Second, long-range masks involve sampling fewer, larger blocks (e.g., 2) covering approximately 70\% of each frame and extending over time, forcing the model to understand broader surgical phases and slower eye changes from limited visible areas.~
This multi-block masking strategy challenges the predictor to reconstruct features of large masked regions from small visible segments, enhancing the model's understanding of actions and anatomy in surgery videos.

\subsection{Implementation Details}
\label{subsec:implementation}

JHU-VPT(JEPA) comprises three learnable modules and an EMA-updated target encoder.

\noindent \textbf{Tokenizer:} The tokenizer converts the raw video \(\mathbf{X} \in \mathbb{R}^{T \times C \times H \times W}\) into non-overlapping 3D tokens representing spatio-temporal volumes. We apply a 3D convolutional layer with kernel and stride \((t, h, w)\), producing \(N = \tfrac{T}{t} \times \tfrac{H}{h} \times \tfrac{W}{w}\) tokens, each of dimension \(k\). Fixed 3D positional encodings~\cite{he2022masked,tong2022videomae} are added to retain spatio-temporal information.

\noindent \textbf{Encoder} \(E_\psi(\cdot)\): The encoder is a Vision Transformer (ViT) backbone~\cite{dosovitskiy2020image,arnab2021vivit} that processes the visible tokens \(\mathbf{x}\), producing an embedding \(\mathbf{F}_x \in \mathbb{R}^{|\mathbf{x}| \times d}\), where \(d\) is the embedding dimension. This embedding is passed to the predictor.

\noindent \textbf{Predictor} \(P_\phi(\cdot)\): The predictor is a lightweight transformer that maps \(\mathbf{F}_x\) to a predicted embedding \(\widetilde{\mathbf{F}}_y\). It also receives learnable mask tokens \(\mathbf{M}\) (one per masked patch) with positional encodings and the conditioning variable \(\delta\), which encodes positional offsets or transformations between \(\mathbf{x}\) and \(\mathbf{y}\). Formally,
\begin{equation}
    \widetilde{\mathbf{F}}_y = P_\phi\big(\mathbf{F}_x,\, \mathbf{M},\, \delta\big).
\end{equation}

\noindent \textbf{Target Encoder} \(E_{\bar{\psi}}(\cdot)\): The target encoder is an EMA copy of the encoder, updated at each training iteration by
\begin{equation}
    \bar{\psi} \leftarrow \alpha\,\bar{\psi} + (1 - \alpha)\,\psi,
\end{equation}
where \(\alpha \in [0,1)\) is a momentum coefficient. It processes the masked tokens \(\mathbf{y}\), generating \(\mathbf{F}_y\) for the loss in Eq.~\eqref{eq:loss_l1}.

\subsection{Pretraining Architecture Analysis}
\label{subsec:discussion}
Predicting large, masked video region representations from limited visible cues allows JHU-VPT(JEPA) to capture fine-grained details and long-range context in cataract surgery. The joint-embedding mechanism directs the encoder to focus on discriminative aspects like surgical instruments, subtle eye movements, and relevant clinical steps. The EMA target encoder, stop-gradient, and predictor prevent collapse, enabling learning of temporally coherent, anatomically relevant features. This design is scalable to various downstream tasks, including surgical phase recognition and skill assessment, and demonstrates strong generalization with minimal labeled data. Section~\ref{sec:experiments} demonstrates JHU-VPT(JEPA)'s effectiveness in capturing real-world surgical workflow complexities while maintaining low annotation requirements.

\subsection{Downstream Task Evaluation}
\label{subsec:downstream_evaluation}

After pretraining JHU-VPT(JEPA), we evaluate its representations on downstream tasks using two approaches: \emph{fine-tuning} and \emph{attentive probing}. In fine-tuning, we initialize the encoder \(E_\psi(\cdot)\) with the pretrained weights and attach a linear classification head. The entire model, including the encoder and the classification head, is then optimized jointly on the downstream dataset.

In contrast, attentive probing keeps the pretrained encoder \(E_{\bar{\psi}}(\cdot)\) fixed to assess the quality of the learned features without updating them. We introduce a learnable cross-attention layer with a query token that attends to the output features of the frozen encoder. The output of the cross-attention layer is added to the query token via a residual connection and passed through a two-layer multilayer perceptron (MLP) for prediction:
\begin{equation}
\label{eq:attentive_probing}
\mathbf{h} = \text{MLP}\big( \mathbf{q} + \text{CrossAttn}(\mathbf{q},\, E_{\bar{\psi}}(\mathbf{x})) \big),
\end{equation}
where \(\mathbf{q}\) is the learnable query token, and \(\mathbf{h}\) is the output used for classification or regression tasks. Attentive probing evaluates the robustness of the pretrained features while keeping the feature extractor unchanged, ensuring that the representation quality is not influenced by further training. This approach is useful when labeled data is limited or when data privacy restrictions prevent sharing raw videos, as it allows training downstream models on new tasks using shared features without accessing the raw video data.

\subsection{Datasets}
For \textbf{Pretraining} JHU-VPT(JEPA), we assembled a multi-institutional dataset of 2,591 unlabeled cataract surgery videos. This dataset includes 1,838 internal videos (avg. 30 min, 59~fps) and 753 from Cataract-1k~\cite{ghamsarian2024cataract} (avg. 8 min), totaling 2591 unique videos. We did not pretrain on the Cataract-1k videos for which step recognition annotations are available. All videos were subsampled to 1~\textit{fps} and resized to $250 \times 250$ pixels for pretraining, following prior protocols~\cite{gao2021trans,twinanda2016endonet}. CSMAE~\cite{shah2025csmae} was pretrained on the D-450 dataset (an extension of D99 videos), following the methodology for MAE-based pretraining~\cite{bandara2023adamae}.

We evaluated JHU-VPT(JEPA) on three downstream tasks: step recognition, surgical feedback, and skill assessment. For \textbf{step recognition}, experiments were conducted under both low-data (10\%, 25\%, 50\%) and full-data settings using four cataract surgery datasets: Cataract-101~\cite{schoeffmann2018cataract}, D99~\cite{yu2019assessment}, Cataract-1k (subset for which annotations were provided with the original dataset)~\cite{ghamsarian2024cataract}, and a larger subset of Cataract-1k which we internally annotated (referred to as Cataract-1k-JHU and includes the annotated videos in the original dataset). Cataract-101 contains 101 videos at 25~\textit{fps} with 10 annotated steps and a resolution of $720 \times 540$ pixels, split into 50 training, 10 validation, and 40 testing videos~\cite{shah2023glsformer}. D99 comprises 99 videos at 59~\textit{fps} with 12 annotated steps and a resolution of $640 \times 480$ pixels, partitioned into 60 training, 20 validation, and 19 testing videos~\cite{shah2023glsformer}. For Cataract-1k, we used 25 training, 7 validation, and 24 testing videos. For Cataract-1k-JHU, we employed 181 training, 31 validation, and 91 testing videos. All evaluation videos were subsampled to 1~\textit{fps} and resized to $250 \times 250$ pixels for consistency.

\begin{table}[htbp]
\centering
\caption{Qualitative Results on Step Recognition Task for Attentive and Linear Probing Methods across four Datasets. JHU-VPT(JEPA) consistently outperforms VideoMAE across both probes when pretrained on the same pretraining set (D-2591). Results are presented as percentage accuracies, with improvements of our method relative to VideoMAE shown in green.}
\label{tab:compact_results}
\resizebox{0.6\textwidth}{!}{%
\begin{tabular}{lcc|cc}
\toprule
\multirow{2}{*}{\textbf{Dataset}} & \multicolumn{2}{c|}{\textbf{Attentive Probing}} & \multicolumn{2}{c}{\textbf{Linear Probing}} \\
\cmidrule(lr){2-3} \cmidrule(lr){4-5}
 & \textbf{VideoMAE} & \textbf{Ours} & \textbf{VideoMAE} & \textbf{Ours} \\
\midrule
Cataract-101         & 79.31 & 89.82 (+13.3\%) & 65.49 & 67.04 (+2.4\%) \\
Cataract-1k          & 63.75 & 79.58 (+24.8\%) & 52.26 & 52.51 (+0.5\%) \\
Cataract-1k-JHU      & 70.03 & 83.65 (+19.5\%) & 58.76 & 59.04 (+0.5\%) \\
D99                  & 66.13 & 77.20 (+16.7\%) & 47.75 & 48.75 (+2.1\%) \\
\bottomrule
\end{tabular}%
}
\label{table:probe_accuracy}
\end{table}

In the \textbf{surgical feedback} task, we evaluated JHU-VPT(JEPA) on feedback items~\cite{xia2025leveraging} during the capsulorhexis step using the D99 dataset~\cite{hira2022video} of 99 videos. Frames were resized to $224 \times 224$ pixels, applying data augmentations like rotation and color jitter. Data was split into training (60\%), validation (20\%), and testing (20\%) sets; we repeated experiments with three random splits and averaged the results.

For \textbf{skill assessment} in the main incision and capsulorhexis steps, we used 56 videos from D99 and an additional 37 videos captured under consistent conditions. Expert surgeons evaluated the videos using ICO-OSCAR:Phacoemulsification~\cite{puri2017assessment}. Skill was categorized as novice (scores 2--4) and expert (score 5) for main incision, and similarly for capsulorhexis, following~\cite{hira2022video, kim2019objective}.~

\section{Experiments and Results}
\subsection{Evaluation Metrics}
\label{subsec:evaluation_metrics}
We evaluate JHU-VPT (JEPA) using Accuracy, Precision, Recall, and Jaccard Index for step recognition~\cite{shah2023glsformer,kim2019objective,shah2025step}, and Accuracy, Sensitivity, Specificity, and AUC for surgical feedback and skill assessment.

\subsection{Comparison to State-of-the-Art Cataract Pretraining Models}
\label{subsec:comparison_sota}

{We compare JHU-VPT(JEPA) with existing pretraining models on the Cataract-101, D99, Cataract-1k, and Cataract-1k-JHU datasets.}
Table~\ref{tab:compact_results} shows the model’s performance in attentive and linear probing settings (frozen encoder). Our approach consistently outperforms VideoMAE in both, especially with attentive probing (over 20\% improvement for Cataract-1k), highlighting pretrained feature effectiveness and applicability in privacy-aware training.
Table~\ref{table:split_accuracy} shows that in attentive probing, JHU-VPT (JEPA) consistently outperforms VideoMAE~\cite{tong2022videomae} (pretrained on the same set) across all four datasets, often by over 10 percentage points with full data. Even with only 10\% labeled data on Cataract-1k-JHU, JHU-VPT (JEPA) achieves 63.81\% accuracy versus VideoMAE’s 58.36\%. Strong gains on Cataract-101 and D99 (different sources) indicate better generalization, showing JHU-VPT (JEPA)'s learned features effectively capture key surgical patterns without needing encoder updates in downstream tasks.

In full fine-tuning (Tab.~\ref{tbl:table_2}, Fig.~\ref{fig:fine_tuning_results}), JHU-VPT(JEPA) is competitive with CSMAE~\cite{shah2025csmae} across several metrics/datasets. Pretraining on larger, diverse data enhances JHU-VPT(JEPA)'s performance, emphasizing data diversity's role in SSL for surgical video. While JHU-VPT(MAE) (VideoMAE on our pretraining set) often yields higher accuracy with full fine-tuning \cite{feichtenhofer2022masked}, JHU-VPT(JEPA)'s predictive objective learns abstract, high-level features that excel with attention probes (frozen encoder), showcasing robustness. In contrast, full fine-tuning adjusts all parameters, which can perturb these representations.~
Overall, JHU-VPT(JEPA)'s feature prediction with large, diverse pretraining data yields significant gains in cataract surgery analysis, and its robust features enable privacy-constrained surgical video analysis with minimal fine-tuning \cite{garrido2024learning}.

\begin{table*}[h]
\centering
\footnotesize
\caption{Quantitative results of step recognition from different methods on the Cataract-101 and D99 datasets.}
\resizebox{0.85\textwidth}{!}{%
\begin{tabular}{ccccccccc} \toprule
\multirow{2}{*}{Method} & \multicolumn{4}{c}{Cataract-101}        & \multicolumn{4}{c}{D99}                \\  \cline{2-5}\cline{6-9} 
                        & Jaccard & Precision & Recall & Accuracy & Jaccard & Precision & Recall & Accuracy   \\ \toprule
ResNet\cite{he2016deep}      & 62.58 & 76.68 & 74.73 & 82.64 & 37.98 & 54.76 & 52.28 & 72.06         \\
SV-RCNet\cite{jin2017sv}     & 66.51 & 84.96 & 76.61 & 86.13 & 39.15 & 58.18 & 54.25 & 73.39        \\
OHFM\cite{yi2019hard}        & 69.01 & 85.37 & 78.29 & 87.82 & 40.01 & 59.12 & 55.49 & 73.82           \\
TeCNO\cite{czempiel2020tecno}      & 70.18 & 86.03 & 79.52 & 88.26 & 41.31 & 61.56 & 55.81 & 74.07         \\
TMRNet\cite{jin2021temporal} & 71.83 & 85.09 & 82.44 & 89.68 & 41.42 & 61.37 & 56.02 & 75.11        \\ 
Trans-SVNet\cite{gao2021trans} & 72.32 & 86.72 & 81.12 & 89.45 & 42.06 & 60.12 & 56.36 & 74.89         \\ 
ViT\cite{dosovitskiy2020image}       & 64.77 & 78.51 & 75.62 & 84.56 & 38.18 & 55.15 & 53.60 & 72.45         \\
TimesFormer\cite{bertasius2021space} & 75.97 & 85.38 & 84.47 & 90.76 & 42.69 & 64.24 & 55.17 & 77.83         \\
STMAE\cite{feichtenhofer2022masked} & 70.54 & 81.47 & 78.67 & 85.29 & 41.67 & 59.38 & 53.22 & 74.16         \\
VideoMAE\cite{tong2022videomae} & 71.39 & 82.13 & 80.16 & 86.47 & 42.58 & 61.24 & 56.35 & 74.39         \\
CSMAE\cite{shah2025csmae}    & 76.82 & {84.26} & 86.73 & {89.83} & 43.51 & 64.32 & 52.45 & 78.14    \\
JHU-VPT(MAE) & \textbf{79.95} & 87.80 & \textbf{89.10} & \textbf{92.00} & \textbf{49.95} & \textbf{64.78} & \textbf{64.46} & \textbf{78.69}         \\
JHU-VPT(JEPA)   & 79.58 & \textbf{87.88} & 88.89 & 91.52 & 43.63 & 55.39 & 62.19 & 75.61    \\ \bottomrule
\end{tabular}}
\label{tbl:table_2}
\end{table*}

Overall, JHU-VPT(JEPA)'s feature prediction approach, enabled by large and diverse pretraining data, yields significant performance gains in cataract surgery analysis. Its robust features allow surgical video analysis in privacy-constrained scenarios with minimal fine-tuning \cite{garrido2024learning}.

\begin{table*}[htb]
\centering
\caption{Comparison of Step Recognition Accuracy across different dataset splits (100\%, 50\%, 25\%, 10\%) with Attentive Probing. Results are presented as percentage accuracies. For each split, the higher value between VideoMAE~\cite{tong2022videomae} and JHU-VPT(JEPA) - Ours is highlighted in bold.}
\label{table:split_accuracy}
\resizebox{0.75\textwidth}{!}{%
\begin{tabular}{lcc|cc|cc|cc}
\toprule
\multirow{2}{*}{\textbf{Dataset}} & \multicolumn{2}{c|}{\textbf{100\% Split}} & \multicolumn{2}{c|}{\textbf{50\% Split}} & \multicolumn{2}{c|}{\textbf{25\% Split}} & \multicolumn{2}{c}{\textbf{10\% Split}} \\
\cmidrule(lr){2-3} \cmidrule(lr){4-5} \cmidrule(lr){6-7} \cmidrule(lr){8-9}
 & \textbf{VideoMAE} & \textbf{Ours} & \textbf{VideoMAE} & \textbf{Ours} & \textbf{VideoMAE} & \textbf{Ours} & \textbf{VideoMAE} & \textbf{Ours} \\
\midrule
Cataract-101    & 79.31 & \textbf{89.82} & 72.41 & \textbf{84.79} & 70.64 & \textbf{79.73} & \textbf{58.60} & 56.95 \\
D99             & 66.13 & \textbf{77.20} & 51.10 & \textbf{71.51} & 47.56 & \textbf{63.21} & 42.10 & \textbf{45.56} \\
Cataract-1k     & 63.75 & \textbf{79.58} & \textbf{59.65} & 58.80 & \textbf{46.91} & 45.09 & \textbf{36.61} & 35.12 \\
Cataract-1k-JHU & 70.03 & \textbf{83.65} & 66.18 & \textbf{80.71} & 63.93 & \textbf{74.55} & 58.36 & \textbf{63.81} \\
\bottomrule
\end{tabular}%
}
\end{table*}
\noindent\textbf{Qualitative Results:}
As shown in Fig.~\ref{fig:qualitative}, our JHU-VPT (JEPA) model outperforms VideoMAE in overall prediction quality across all four datasets.~
We observe clearer and more accurate step-transition boundaries, as well as reduced temporal jitter, which indicate more robust and stable features from JHU-VPT (JEPA) for Attentive Probing.~
These improvements arise from the JEPA-based latent space modeling, which emphasizes richer spatiotemporal representations over pixel-level reconstructions.

\begin{figure*}[htp!]
  \centering
  \includegraphics[width=0.75\linewidth]{jepa_fig.pdf}
  \vskip -10pt
\caption{Qualitative results for the step recognition task. Subfigures A, B, C, and D correspond to the datasets Cataract-101, D99, Cataract-1k, and Cataract-1k-JHU, respectively. Within each dataset, the small subfigures a, b, and c are ribbon plots showing the prediction for VideoMAE~\cite{tong2022videomae}, JHU-VPT (JEPA) trained with Attentive Probing, and the ground truth, respectively.}
\label{fig:qualitative}
\end{figure*}

\subsection{Comparison on Feedback and Skill Performance}
Table \ref{table:combined_model_evaluation}(a) shows that our method improves feedback prediction by approximately 13\% in AUC compared to TimeSformer~\cite{bertasius2021space}. Compared to other methods, JHU-VPT(JEPA) improves specificity, i.e., reduces false positives, indicating that the model has meaningful discrimination between positive and negative labels. Table \ref{table:combined_model_evaluation}(b,c) demonstrates our model's performance on skill assessment for main incision and capsulorhexis. We observe a steady improvement of 10\%-20\% for both phases, which highlights the robustness of its learned feature representations across various phases.

\label{sec:experiments}

\begin{table*}[htb]
\caption{Model evaluation for predicting feedback items, skill assessment in main incision, and skill assessment in capsulorhexis.}
\label{table:combined_model_evaluation} 
\begin{center}
\resizebox{0.65\linewidth}{!}{%
\begin{tabular}{lcccc}
\toprule
\multicolumn{5}{c}{\textbf{Feedback Prediction (Table~\ref{table:combined_model_evaluation}a)}} \\
\textbf{Model} & \textbf{Accuracy} & \textbf{Sensitivity} & \textbf{Specificity} & \textbf{AUC} \\
\midrule
CNN-LSTM~\cite{wan2024spatial} & $76.3 \pm 1.6$ & $\textbf{94.3} \pm \textbf{1.5}$ & $15.3 \pm 1.7$ & $0.659 \pm 0.049$ \\
CNN-LSTM-GNN~\cite{xia2025leveraging} & $75.0 \pm 1.1$ & $85.6 \pm 2.6$ & $34.5 \pm 6.7$ & $0.559 \pm 0.048$ \\
JHU-VPT(MAE)~(D-2591) & $80.4 \pm 2.9$ & $93.1 \pm 8.0$ & $35.9 \pm 8.2$ & $0.817 \pm 0.032$ \\
TimeSformer~\cite{bertasius2021space} & $77.2 \pm 1.1$ & $85.7 \pm 5.9$ & $40.2 \pm 12.4$ & $0.710 \pm 0.066$ \\
JHU-VPT(JEPA) (Ours) & $\textbf{82.3} \pm \textbf{1.4}$ & $92.6 \pm 6.9$ & $\textbf{40.8} \pm \textbf{16.5}$ & $\textbf{0.842} \pm \textbf{0.045}$ \\
\midrule
\multicolumn{5}{c}{\textbf{Main Incision Skill Assessment (Table~\ref{table:combined_model_evaluation}b)}} \\
\textbf{Model} & \textbf{Accuracy} & \textbf{Sensitivity} & \textbf{Specificity} & \textbf{AUC} \\
\midrule
CNN-LSTM \cite{hira2022video} & 63.0 & 92.0 & 36.0 & 0.64 \\
ViT~\cite{dosovitskiy2020image} & 62.0 & 10.0 & 100.0 & 0.55 \\
JHU-VPT(JEPA) (Ours) & 73.0 & 60.0 & 63.0 & 0.72 \\
\midrule
\multicolumn{5}{c}{\textbf{Capsulorhexis Skill Assessment (Table~\ref{table:combined_model_evaluation}c)}} \\
\textbf{Model} & \textbf{Accuracy} & \textbf{Sensitivity} & \textbf{Specificity} & \textbf{AUC} \\
\midrule
ResNet-101~\cite{he2016deep} & 62.0 & 76.0 & 80.0 & 0.45 \\
STMAE~\cite{feichtenhofer2022masked} & 66.0 & 68.0 & 80.0 & 0.55 \\
JHU-VPT(MAE)~(D-2591) & 71.0 & 85.0 & 90.0 & 0.55 \\
JHU-VPT(JEPA)~(Ours) & 80.0 & 70.0 & 56.25 & 0.80 \\
\bottomrule
\end{tabular}
}
\end{center}
\end{table*}

\section{Conclusion}
We introduced JHU-VPT(JEPA) for cataract surgery video analysis. JHU-VPT(JEPA), using latent feature prediction, captures rich spatio-temporal representations without needing pixel-level reconstruction or extensive labeled data. It allows clinical use while preserving patient privacy. While JHU-VPT (JEPA) shows strong performance, further improvements are possible by increasing temporal resolution and incorporating finer motion features for feedback and skill assessment. Future work may explore domain adaptation to improve generalization across surgical environments.

\clearpage
\midlacknowledgments{This research was supported by a grant from the National Institutes of Health, USA; R01EY033065. The content is solely the responsibility of the authors and does not necessarily represent the official views of the National Institutes of Health. Also, we would like to thank the Johns Hopkins Research IT team in IT@JH for their support and infrastructure resources, where some of these analyses were conducted, especially DISCOVERY HPC. Their commitment to advancing research has been invaluable in the successful completion of this study.
}

\bibliography{midl25_129}


\appendix
\section{More results of comparison on Pretraining dataset and Ablations}
Table~\ref{tbl:table_1} presents a detailed comparison of JHU-VPT(JEPA) with several state-of-the-art methods for D99 Step Recognition under various data-regime settings (10\%, 25\%, 50\%, and 100\% of labeled data). The table categorizes methods by their pretraining datasets (e.g., Kinetics-400, D-450, D-2591) and the employed masking strategies (e.g., Random, Frame, Tube, Token Selection, and Multi-block).

Notably, JHU-VPT(JEPA), which is pretrained on the large and diverse D-2591 dataset using a Multi-block masking approach, achieves the highest performance in low-data regimes (62.2 at 10\%, 65.86 at 25\%, and 70.42 at 50\%). These results underscore JHU-VPT(JEPA)'s ability to learn robust representations that are particularly effective when labeled data is scarce. While some methods, such as GLSFormer, surpass JHU-VPT(JEPA) at the full data regime (100\%), our approach offers a compelling advantage in scenarios where extensive labeled data is unavailable.

Overall, these findings highlight the effectiveness of combining extensive pretraining with tailored masking strategies, positioning JHU-VPT(JEPA) as a strong candidate for applications with privacy constraints and limited annotation resources.

\begin{table*}[ht]
\centering
\caption{Comparison of JHU-VPT(JEPA) with other state-of-the-art methods on D99 Step Recognition, under different data-regime settings and pretraining datasets.}~
\resizebox{0.5\textwidth}{!}{%
\begin{tabular}{c|c|c|cccc}
\toprule
Methods & Pre-training Dataset & Masking & \multicolumn{4}{c}{Data Regime (\%)} \\
\cmidrule{4-7}
 &  &  & 10 & 25 & 50 & 100 \\
\midrule
MaskFeat~\cite{wei2022masked} & Kinetics-400 & Random & 47.28 & 59.32 & 60.47 & 72.85 \\
GLSFormer~\cite{shah2023glsformer} & Kinetics-400 & - & 47.19 & 61.54 & 63.76 & \textbf{80.24} \\
VideoMAE~\cite{tong2022videomae} & D-450 & Frame & 48.62 & 58.73 & 60.84 & 70.91 \\
STMAE~\cite{feichtenhofer2022masked} & D-450 & Random & 52.37 & 60.42 & 63.58 & 74.16 \\
VideoMAE~\cite{tong2022videomae} & Kinetics-400 & Tube & 46.16 & 59.76 & 60.99 & 73.35 \\
VideoMAE~\cite{tong2022videomae} & D-450 & Random & 50.24 & 60.89 & 62.34 & 72.98 \\
VideoMAE~\cite{tong2022videomae} & D-450 & Tube & 52.11 & 61.59 & 63.72 & 74.39 \\
CSMAE~\cite{shah2025csmae} & D-450 & Token Selection & {54.75} & {63.12} & {65.83} & 78.14 \\
JHU-VPT(JEPA) & D-2591 & Multi-block & \textbf{62.2} & \textbf{65.86} & \textbf{70.42} & 75.61 \\
\bottomrule
\end{tabular}%
}
\label{tbl:table_1}
\end{table*}

Figure~\ref{fig:fine_tuning_results} shows the step recognition accuracy across different dataset splits after full fine-tuning. Both our JHU-VPT(JEPA) and the D-2591 models (VideoMAE pretraining) consistently outperform CSMAE on the Cataract-1k and Cataract-1k-JHU datasets, reinforcing the robustness of our pretraining with a large dataset.

\begin{figure}[htb]
    \centering
    \includegraphics[width=0.35\linewidth]{output (3).png}
    \caption{Step Recognition Accuracy across different dataset splits after complete fine-tuning.}
    \label{fig:fine_tuning_results}
\end{figure}

For ablation experiments on model architecture, our results show that as the number of pretraining epochs increases, accuracy improves smoothly, demonstrating the scaling capabilities of the JEPA architecture. For instance, on D99 accuracy increased from 65.65\% to 77.20\%, and on Cataract-101 from 80.42\% to 89.82\% (see Table~\ref{tab:epoch_ablation}).

\begin{table}[htbp]
\centering
\caption{Epoch Ablation on Step Recognition Accuracy. Accuracy values (in \%) for each checkpoint are shown for the D99 and C101 datasets. The best performance for each dataset is highlighted in bold.}
\label{tab:epoch_ablation}
\resizebox{0.5\textwidth}{!}{%
\begin{tabular}{lccccc}
\toprule
\textbf{Dataset} & \textbf{60} & \textbf{100} & \textbf{200} & \textbf{240} & \textbf{300} \\
\midrule
D99  & 65.65 & 69.97 & 70.84 & 74.03 & \textbf{77.20} \\
Cataract-101  & 80.42 & 83.72 & 86.38 & 88.17 & \textbf{89.82} \\
\bottomrule
\end{tabular}%
}
\end{table}

Our ablation study evaluating various classification strategies—ranging from simple linear probing and attentive probing to full fine-tuning—shows that full fine-tuning generally achieves the best performance. However, attentive probing offers a compelling trade-off, particularly for the D99 dataset where it outperforms full fine-tuning, as it requires less computation and enables privacy-aware fine-tuning by avoiding back-propagation through the visual encoder. These findings suggest that the choice of classification strategy can be tailored based on labeled data availability, privacy requirements, and computational resources (see Table~\ref{tab:jepa_ablation}).

\begin{table}[htbp]
\centering
\caption{Ablation experiments on JHU-VPT(JEPA) for the step recognition task with Full Finetuning, Attentive Probing, and Linear Probing across four datasets.}
\label{tab:jepa_ablation}
\resizebox{0.6\textwidth}{!}{%
\begin{tabular}{lccc}
\toprule
\textbf{Dataset} & \textbf{Full Finetuning} & \textbf{Attentive Probing} & \textbf{Linear Probing} \\
\midrule
Cataract-101     & \textbf{91.52}           & 89.82                      & 67.04 \\
Cataract-1k      & \textbf{94.25}           & 79.58                      & 52.51 \\
Cataract-1k-JHU  & \textbf{86.55}           & 83.65                      & 59.04 \\
D99              & 75.61                    & \textbf{77.20}             & 48.75 \\
\bottomrule
\end{tabular}%
}
\end{table}

\end{document}
