\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{graphicx,verbatim}
\usepackage{booktabs}
\usepackage{color}
\usepackage{enumitem}
\usepackage[T1]{fontenc}
\usepackage{amssymb}
\usepackage[table]{xcolor}
\usepackage{xcolor}
\usepackage{soul}

\jmlrvolume{-- Under Review}
\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026 submission}
\editors{Under Review for MIDL 2026}

\title[CatVLM]{CatVLM: Enhancing Temporal Understanding in Cataract Surgery Videos with Boundary-Aware VLM}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Jay N. Paranjape\nametag{$^{1}$}} \orcid{0009-0007-0152-5502} \Email{jparanj1@jhu.edu}
 \AND
\Name{Nisarg Shah\nametag{$^{1}$}} \Email{snisarg812@gmail.com}
 \AND
\Name{Nanthini Narayanan\nametag{$^{1}$}} \Email{nanthinii.narayanan@gmail.com}
 \AND
\Name{Shameema Sikder\nametag{$^{2}$}} \Email{ssikder1@jhmi.edu}
 \AND
\Name{S. Swaroop Vedula\nametag{$^{1}$}} \Email{swaroop@jhu.edu}
 \AND
\Name{Vishal M. Patel\nametag{$^{1}$}} \Email{vpatel36@jhu.edu}\\
\addr $^{1}$ Johns Hopkins University\\
\addr $^{2}$ Wilmer Eye Institute\\
% \addr $^{3}$ Address 3
}

\begin{document}

\maketitle

\begin{abstract}
Recent studies have shown the effectiveness of Vision Language Models (VLMs) for understanding and analyzing videos in the medical domain and supporting various Question-Answer (QA) tasks. Yet, current VLMs fall short in addressing queries that require temporal reasoning---a critical capability for surgical video understanding. In this work, we introduce CatVLM, a boundary-aware VLM, designed to capture temporal dynamics in untrimmed cataract surgery videos. CatVLM is capable of performing three clinically relevant tasks that demand moment-level awareness: Video Moment Retrieval (VMR), Video Captioning (VC), and Counting. To facilitate the training of such a model, we generate a bank of QA annotations for each task and propose a method to integrate video clips with the timestamps at which they occur. To the best of our knowledge, this work is one of the first approaches to explicitly incorporate temporal boundary awareness into VLMs for cataracts as well as the medical domain. We evaluate CatVLM on two public cataract surgery datasets, establishing new baselines across all three tasks. All the code, model checkpoints and annotations will be released post-review.
\end{abstract}

\begin{keywords}
Cataract, Vision Language Models, Video Understanding.
\end{keywords}

\begin{figure}[htbp]
\centering
\caption{We develop CatVLM, a boundary-aware VLM for cataract surgery that is capable of performing temporal video understanding tasks like Video Moment Retrieval (VMR), Video Captioning (VC), and Counting.}
\vskip-6pt
\includegraphics[width=0.8\textwidth]{resources/intro_fig.png}
\label{introfig}
\end{figure}

\section{Introduction}

\noindent Cataract surgery is one of the most frequently performed operative procedures worldwide \cite{cat2,cat1}. In recent literature, various deep learning based approaches have been used for intra-operative and post-operative analysis of cataract surgery videos through various downstream tasks. These include skill assessment \cite{skill1,skill2}, step and instrument localization \cite{glsformer,barlow}, segmentation \cite{cat1k} and more recently, comprehensive understanding using vision language models (VLMs) \cite{cataractbot}. VLMs can process natural language queries and integrate them with video content to generate clinically relevant answers. However, current approaches are typically trained to produce summarized responses, lacking the fine-grained temporal reasoning needed to identify when key surgical events occur, which is essential for medical video analysis. 
In this work, we introduce CatVLM: a VLM that has improved temporal awareness for analysis of cataract surgery videos. Temporal understanding of surgery videos enables multiple downstream clinical applications. Examples of potential applications include retrieval of relevant segments of the video for analytics such as operative time and surgical skill, characterizing surgery based on how frequently a step is performed or sequence of activities performed during the procedure, and using the analytics to provide feedback to surgeons and enable context-aware systems. Moreover, temporally grounded QA can accelerate expert annotation by allowing annotators to focus directly on relevant video segments rather than reviewing entire procedures.
To facilitate this in the cataract surgery domain, we first use the Video-Masked Auto Encoder (VMAE) strategy~\citep{videomae} to pretrain a feature extractor on a corpus of cataract surgery videos collected from our hospital. Next, we leverage step localization captions on public datasets to generate a corpus of QA annotations spanning three temporal tasks: VMR, VC and Counting.
We then use the frozen encoder to extract video features from the public videos and generate embeddings, which capture the spatio-temporal features of the video, but are still unaware of what explicit timestamp each feature represents. To incorporate this information, we propose the simple yet effective Image-Timestamp (IM-TS) Adapter module, which concatenates timestamp embeddings with video features and refines them through a learnable linear layer to produce time-aware representations. These enriched embeddings are passed to an open-source VLM, which we fine-tune using Low-Rank Adaptation (LoRA)~\citep{lora}. To further improve task specialization, we employ separate LoRA modules for each of the three tasks. In addition, we learn a lightweight K-Nearest Neighbours (KNN) module that decides which LoRA to use based on the question type. During inference, the user can manually select the LoRA module or utilize the KNN for automatically selecting the LoRA module to plug into the VLM.
Thus, our contributions are threefold:
\begin{enumerate}[itemsep=0pt, parsep=0pt]
    \item We present CatVLM, a boundary-aware VLM capable of answering temporally grounded questions in cataract surgery videos as shown in Figure \ref{introfig}, enabled by VMAE pretraining and the proposed IM-TS Adapter.
    \item We develop a method to generate annotations to train such a VLM from public cataract video datasets.
    \item We show that our method is able to outperform existing methods for these tasks. Thus, we establish a new baseline for future research.
\end{enumerate}

\section{Related Work}
\noindent\textbf{Boundary-Aware VLMs in Literature: }In the natural vision domain, several recent works have explored empowering VLMs with the ability to capture video moments \cite{vtimellm,videochatgpt,Ren2023TimeChat}. VideoChatGPT \cite{videochatgpt} uses frozen CLIP features and trainable adapter layers to extract features, before passing them to the frozen Vicuna \cite{vicuna} backbone. TimeChat \cite{Ren2023TimeChat} introduces a time-aware Q-former architecture, combining multiple video Q-formers, adapters, and LoRA modules to make the VLM sensitive to temporal boundaries. While effective, this approach requires training a large number of modules, demanding significantly more data than is available in the cataract surgery setting. On the other hand, VTimeLLM \cite{vtimellm} uses a three-stage training approach. In the first stage, an adapter is pretrained with natural images. Next, it uses LoRA to tune the model to identify and localize all events in a given video. Finally, in the third stage, an additional LoRA is used to teach the model to respond to natural text using an instruction tuning dataset. In our work, we initialize the LLM with LoRA weights from VTimeLLM, which eliminates the need for instruction tuning and enables open-set question answering rather than template-based QA. Despite these advances, existing approaches face two critical challenges: (1) The feature extractor of CLIP is trained on natural images and is not the best choice for extracting features from cataract videos, and (2) Various important surgical moments in cataracts often last for only a few seconds before transitioning to the next evolution. In the current state, these VLMs are not able to capture such quick changes and distinguish them as separate key actions.

To address these limitations, we pretrain a Video-MAE (VMAE) \cite{videomae} feature extractor on a large corpus of private cataract videos from our hospital and use the pretrained model to extract features during training, thus initializing the model with a representative set of features. In addition, unlike existing methods which predict timestamps at a scale of 100, we design our generated annotations such that the start and end times of moments are represented on a scale of 1000, essentially encouraging the model to predict event boundaries at ten times the resolution of existing methods. This allows it to capture steps that are changing quickly.

\noindent\textbf{Video QA in the Medical Domain: }Recent advances in medical video understanding have focused on integrating VLMs for answering clinically relevant questions including surgical tool detection, procedural understanding and step recognition. This has led to datasets like MedVidQA \cite{medvidqa}, that contains QA pairs for generic medical videos, and SurgPubVideo \cite{surgpubvideo}, that evaluates conceptual understanding in surgical videos. This has, in turn, enabled various systems performing video QA in the medical domain. \cite{tascon2023localized} introduced transformer-driven architectures for surgical video QA while Surgical-VQA \cite{surgicalVQA} used hierarchical graph networks and MedicalGPT \cite{MedicalGPT} explored integration of VLMs for allowing open-set question-answering. However, existing datasets and methods primarily focus on multiple-choice or spatial/conceptual questions, overlooking tasks that require temporal localization such as video moment retrieval or captioning. To the best of our knowledge, no datasets currently address temporal video QA specifically in the cataract surgery domain, underscoring the need for temporally aware approaches tailored to this setting.

\begin{figure}[htbp]
\centering
\caption{Model Architecture. The pretraining phase involves self-supervised pretraining on a large corpus of private videos and is responsible for creating a robust feature extractor. The finetuning phase improves these features by integrating timestamp-aware embeddings and passes them to a VLM, which is trained using LoRA for each task. During inference phase, the relevant LoRA module is selected from a bank of LoRA modules trained in the finetuning phase based on the task and used to get the final prediction.} 
\vskip-6pt
\includegraphics[width=0.95\textwidth]{resources/arch2.png}
\label{arch}
\end{figure}

\section{Methodology}
\subsection{Generating Annotations}
We leverage step-localization annotations of public datasets to generate question-answer pairs for tuning CatVLM. Given a video, step localizations cover what steps are present in the video and their time boundaries. Based on these annotations, we use GPT to generate QA pairs to train CatVLM. In this work, we focus on three important question types as follows:
% \textbf{1. Video Moment Retrieval:} Q: When did $s$ happen? A: From $t_1$ to $t_2$, from $t_3$ to $t_4$\\
% \textbf{2. Video Captioning: } Q: What happened between $t_1$ and $t_2$? A: $s_1$ from $t_1$ to $t_2$, $s_2$ from $t_3$ to $t_4$, ...\\
% \textbf{3. Counting: } Q: How many times did $s$ occur? A: $s$ occurred $n$ times

\begin{enumerate}[itemsep=0pt, parsep=0pt]
    \item \textbf{Video Moment Retrieval:} Q: When did $s$ happen? A: From $t_1$ to $t_2$, from $t_3$ to $t_4$
    \item \textbf{Video Captioning: } Q: What happened between $t_1$ and $t_2$? A: $s_1$ from $t_1$ to $t_2$, $s_2$ from $t_3$ to $t_4$, ...
    \item \textbf{Counting: } Q: How many times did $s$ occur? A: $s$ occurred $n$ times
\end{enumerate}
Here, $s$ represents the step name, $t_i$ represents time instances and $n$ represents the number of times a given step was performed at separate intervals. To ensure consistency across videos of varying lengths, all timestamps are normalized to a scale of 0--1000, and are calculated as follows: $t_i = \mathrm{round}(1000\times T_i/D)$, where $T_i$ is the timestamp in seconds and $D$ is the total duration of the video in seconds. This allows the model to cater to untrimmed videos of any duration since all timestamps are scaled to the same range.
For each of these question types, we used ChatGPT to generate 50 synonymous sentence variations. Hence, for a given video, we generate $50 \times total\_steps$ annotations, significantly expanding the training corpus. VMR allows the model to learn the correlations between the visual and temporal boundaries of the given step. Similarly, Video Captioning is a more complex task since it requires the model to identify all the steps happening between two timestamps. Counting is a derived task that allows the model to learn to distinguish between the same events at different points in time. All these tasks aim to enhance the temporal understanding of the LLM and are essential for an in-depth analysis of the video.

\subsection{Pretraining Phase}
The pretraining phase is based on the VideoMAE~\cite{tong2022videomae} framework as shown in Fig.~\ref{arch}, and consists of a Tokenizer, Encoder, and Decoder. Given an input video \(X \in \mathbb{R}^{T \times C \times H \times W}\), the Tokenizer utilizes a 3D convolution layer followed by positional encoding to convert \(X\) into tokens.  A fraction of these tokens is then randomly selected based on a predetermined masking ratio. The selected tokens are passed through a Vision Transformer-based encoder to extract latent representations that capture high-level semantics of the input video. The decoder then merges these latent features with fixed representations of the masked tokens---augmented with positional encodings---and uses a transformer architecture to reconstruct the missing tokens. The model is optimized by minimizing the reconstruction error, measured as the Mean Squared Error between the reconstructed video tokens and the original normalized video tokens.

\subsection{Finetuning Phase}
Once the feature extractor is trained, we freeze its weights and utilize it to extract features from untrimmed videos, as shown in Figure \ref{arch}. These features capture the spatio-temporal structure of the video but lack explicit alignment with text-based timestamps. Moreover, since videos vary in length, the same set of features can correspond to different temporal positions. To address this, we propose a simple yet effective solution called the IM-TS Adapter. Given the duration $D$ of the video $v$ and its VMAE features $f_v \in \mathbb{R}^{N \times H}$, where $N$ is the number of features and $H$ is the hidden dimension of the features, we compute the timestamp embeddings for each feature as follows:
\begin{equation}
    f_t^i = \textit{VLM}\left( \frac{i\times D}{N} \text{  ``seconds''}\right)
\end{equation}
These embeddings are concatenated with the original features and passed through a learnable linear transformation to form the final set of features $f$ as follows:
$f = \textit{Linear}(f_v \cdot f_t)$.
Here, VLM denotes the embeddings generated by the language model for the string representation of the timestamp and \(\cdot\) represents concatenation. By concatenating timestamp embeddings with video features, the IM-TS Adapter produces time-aware representations that preserve both visual and temporal context. To adapt the VLM to the enriched features $f$ from the IM-TS module, we utilize Low Rank Adaptation (LoRA) \cite{lora} to perform parameter-efficient fine-tuning. This method allows us to adapt the LLM without tuning all parameters of the large model. We use a separate LoRA for each of the three tasks to allow for more stability during training. The role of the IM-TS adapter is to capture and integrate the timestamp information in each feature, while the LoRA module adapts the LLM to use this information to answer a particular question type. To encourage this role separation, we use the same IM-TS adapter across the three tasks with weight sharing, whereas each LoRA is task-specific.

\subsection{Inference Phase}
We learn different LoRA modules for different types of tasks, namely VMR, Counting and VC. During inference, we choose which LoRA module to use based on the question, attach it to the VLM and produce the prediction. The choice of LoRA can be manual or automatic depending on the application. In the manual approach, the user provides the LoRA module to use as an input. In the automatic approach, we use a K-Nearest Neighbours (KNN) classifier to classify the question into one of three types and select the LoRA corresponding to the prediction of the KNN. To train the KNN, we generate 50 new synonymous sentences for each question and convert them into embeddings using CLIP \cite{clip}. With this method, we achieve a $97\%$ accuracy in LoRA selection, which is also reflected in our results. An alternative design is to use a single LoRA module for all tasks. However, this approach presents two major limitations:
\begin{enumerate}[itemsep=0pt, parsep=0pt]
    \item We observed that the model performed worse on all the tasks as shown in Table \ref{ablations}. This happens because the model often confuses the output of one task for the other or gets stuck in local minima, leading to unstable training.
    \item Adding a single LoRA for all three question types limits the number of tasks that can be performed. In this study, we consider three major video understanding tasks. However, for a new type of application, the entire LoRA would have to be retrained with all training data again, to incorporate the new task.
\end{enumerate}
By contrast, maintaining multiple task-specific LoRA modules divides the training data appropriately and isolates learning patterns. In addition, it makes the system modular, allowing new tasks to be incorporated by adding corresponding LoRA experts without retraining existing modules, ensuring scalability and stability.



\section{Experiments}
%% TODO : Pretraining Dataset  :: details
\subsection{Datasets} 
We use two public cataract surgery video datasets for evaluating our method. Cataract-1K \cite{cat1k} consists of 1000 videos with an average video duration of 7.12 minutes. Since this dataset provides step annotations for only a subset of videos, we manually annotated the rest of the videos for training our model. We will be releasing these annotations post-review. Annotations were done by study personnel trained with a standard protocol designed by an expert surgeon. Reliability of annotations was assured by each annotator labeling the same set of videos and verification of consistent understanding of instructions, with a tolerance of 1 second for step boundaries. Cataract-101 \cite{cataracts101} consists of 101 cataract surgery videos performed by four different surgeons of varying expertise, with an average video length of 8.67 minutes. We annotated the videos for steps using a protocol similar to that used for Cataract-1K.
% For pretraining, we use private cataracts video dataset from a hospital, with X number of videos. We ensure that videos used for evaluations are not used during pretraining.
%% Here; ->
For pretraining, we use a private cataract video dataset from the Wilmer Eye Institute, with 450 videos~\cite{shah2025mathsf}. These videos have an average duration of 34 minutes and were originally recorded at 59 \textit{fps}. Following prior studies~\cite{gao2021trans,shah2023gated}, all videos are subsampled to 1 \textit{fps} and resized to $250\times250$ pixels for pretraining. We ensure that videos used for pretraining are not used during finetuning or inference.
% explain about Cat-1k, D99 and Cat-101. Also, for pretraining, mention the dataset used is not from any of these


\subsection{Implementation Details}
We used the Vicuna \cite{vicuna} model initialized with the weights from VTimeLLM \cite{vtimellm} since they train the LLM using instruction-tuning that allows the LLM to recognize instructions from natural language without the requirement of any template. We use Vicuna as the backbone VLM to ensure fair comparison with existing methods for temporal video QA. For the individual LoRA experts, we use a rank of 64 and an initial learning rate of 1e-4. The LoRA was trained for 1 epoch with batch size 8 on a single NVIDIA RTX A6000 GPU. 
For pretraining, we use AdamW optimizer with a base learning rate of \(1\times10^{-3}\) and a weight decay of 0.05, using a cosine decay schedule with a warmup period of 40 epochs. Training is performed over 800 epochs on 8 NVIDIA RTX A6000 GPUs with a batch size of 64, where each input consists of 16 frames.


\subsection{Metrics}
We compute the mean Average Precision (mAP), mean Average Recall (mAR) and mean Intersection-over-Union (mIoU) metrics for VMR and VC tasks and mean accuracy for the counting task. For mAP and mAR, we compute the precision and recall at IoU thresholds of 0.3, 0.5 and 0.7 and compute the average of the three values. To calculate the recall and precision, we calculate the total number of predicted timestamps that have an IoU greater than the selected threshold with a ground truth and consider them as true positives. The ground truths that are not matched with any prediction are considered as false negatives, while the predictions that are not true positives are considered as false positives.



\subsection{Results}
We compare our method against state-of-the-art (SOTA) methods from the natural vision domain, which we finetune on the cataracts training data. As seen in Table \ref{results}, we outperform the current SOTA methods for temporal LLMs on all three tasks for Cataracts-101 and for VC and VMR on Cataract-1K with comparable performance on the Counting task. The task of Counting is simpler than VC or VMR and so we see greater than 85\% accuracy in all cases, with our method reaching 96\% and 88\% accuracies on Cataract-1K and Cataracts-101 respectively. With respect to VTimeLLM, for the task of VMR, we see a rise of 6\% for Cataracts-101 and 4\% for Cataract-1K. For VC, we see a rise of 4\% in Cataracts-101 but do not see significant change for Cataract-1K. We see a slight drop in metrics going from the manual LoRA selection to the automatic LoRA selection case, which is expected since errors from the KNN are also reflected in the results for the automatic case. It is important to note, however, that manually selecting the LoRA is not infeasible during inference.

In addition, we note that there is room for improving these numbers further. We observed that this is due to the limited size of the dataset, which causes overfitting while tuning the model. This also points towards the challenging nature of these two tasks, especially in the case of limited annotations and motivates further research in this field. Figure \ref{fig:results} shows qualitative results of our method for VMR (a) and VC (b). As seen in the figure, our method performs precise predictions for these tasks, and is able to identify boundaries correctly for all steps. Part (c) of the figure shows a failure case of our model, where one event boundary is correctly predicted but it misses the other occurrence by some seconds.

\begin{table}[htbp]
\begin{center}
\normalsize
\caption{Quantitative Results on two public datasets. The first question represents VMR, the second represents Counting and the third question denotes VC. All results have a maximum p-value of 1e-5 with our method, making it statistically significant.}
\label{results}
\resizebox{0.85\columnwidth}{!}{
\begin{tabular}
{@{\extracolsep{4pt}}c c c c c c c c@{}}
\toprule
\multicolumn{8}{c}{\textbf{Cataracts-101} \cite{cataracts101}}\\
\midrule
 & \multicolumn{3}{c}{Q: \color{blue}{When did}} & Q: \color{blue}{How many times} & \multicolumn{3}{c}{Q: \color{blue}{What happened}} \\
  & \multicolumn{3}{c}{\color{blue}{$s$ happen?}} & \color{blue}{did $s$ occur?} & \multicolumn{3}{c}{\color{blue}{between $t_1$ and $t_2$?}} \\
\cline{2-4} \cline{5-5} \cline{6-8} \\
Method & mAR & mAP & mIoU & Accuracy & mAR & mAP & mIoU\\
\midrule
VideoChatGPT \cite{videochatgpt} & 0.38 & 0.36 & 0.37 & 85.4 & 0.23 & 0.24 & 0.23\\
TimeChat \cite{Ren2023TimeChat} & 0.43 & 0.45 & \textbf{0.45} & 86.7 & 0.33 & 0.19 & 0.17\\
VTimeLLM \cite{vtimellm} & 0.36 & 0.37 & 0.37 & 85.6 & 0.55 & 0.43 & 0.40\\
\rowcolor{gray!12}
Ours (Manual LoRA selection)& \textbf{0.45} & \textbf{0.46} & 0.43 & \textbf{87.8} & \textbf{0.59} & \textbf{0.45} & \textbf{0.42}\\
\rowcolor{gray!12}
Ours (Automatic LoRA selection)& 0.44 & 0.45 & 0.42 & 85.2 & 0.58 & 0.44 & 0.41\\
\midrule
\multicolumn{8}{c}{\textbf{Cataract-1K} \cite{cat1k}}\\
\midrule
VideoChatGPT \cite{videochatgpt} & 0.15 & 0.16 & 0.16 & 96.1 & \textbf{0.23} & \textbf{0.24} & 0.23\\
TimeChat \cite{Ren2023TimeChat} & 0.14 & 0.14 & 0.17 & 95.4 & 0.01 & 0.02 & 0.01\\
VTimeLLM \cite{vtimellm} & 0.18 & 0.18 & 0.18 & \textbf{96.8} & 0.21 & 0.22 & 0.22\\
\rowcolor{gray!12}
Ours (Manual LoRA Selection) & \textbf{0.22} & \textbf{0.22} & \textbf{0.21} & 95.8 & \textbf{0.23} & \textbf{0.24} & \textbf{0.24}\\
\rowcolor{gray!12}
Ours (Automatic LoRA selection) & 0.21 & 0.21 & 0.20 & 92.5 & 0.22 & 0.23 & 0.23\\

\bottomrule
\end{tabular}}
\end{center}
\end{table}

\begin{table}[htbp]
\begin{center}
\normalsize
\caption{Ablations for MoE and increasing resolution on Cataracts-101}
\label{ablations}
\resizebox{0.85\columnwidth}{!}{
\begin{tabular}
{@{\extracolsep{4pt}}c c c c c c c c c c@{}}
\toprule
% \multicolumn{8}{c}{\textbf{Cataracts-101} \cite{cataracts101}}\\
% \midrule
 & & & \multicolumn{3}{c}{Q: \color{blue}{When did}} & Q: \color{blue}{How many times} & \multicolumn{3}{c}{Q: \color{blue}{What happened}} \\
  & & & \multicolumn{3}{c}{\color{blue}{$s$ happen?}} & \color{blue}{did $s$ occur?} & \multicolumn{3}{c}{\color{blue}{between $t_1$ and $t_2$?}} \\
\cline{4-6} \cline{7-7} \cline{8-10} \\
Method & range of timestamp & LoRA style & mAR & mAP & mIoU & Accuracy & mAR & mAP & mIoU\\
\midrule
Ablation 1 & 100 & Single & 0.34 & 0.36 & 0.35 & 84.0 & 0.54 & 0.29 & 0.36\\
Ablation 2 & 100 & Multiple & 0.37 & 0.38 & 0.38 & 86.6 & 0.51 & 0.39 & 0.35\\
Ablation 3 & 1000 & Single & 0.36 & 0.39 & 0.38 & 85.1 & 0.53 & 0.43 & 0.38\\
\rowcolor{gray!12}
Ours & 1000 & Multiple & \textbf{0.45} & \textbf{0.46} & \textbf{0.43} & \textbf{87.8} & \textbf{0.59} & \textbf{0.45} & \textbf{0.42}\\
\bottomrule
\end{tabular}}
\end{center}
\end{table}

\begin{table}[htbp]
\begin{center}
\normalsize
\caption{Ablations for model components on Cataracts-101}
\label{ablations_model}
\resizebox{0.85\columnwidth}{!}{
\begin{tabular}
{@{\extracolsep{4pt}}c c c c c c c c c c@{}}
\toprule
% \multicolumn{8}{c}{\textbf{Cataracts-101} \cite{cataracts101}}\\
% \midrule
 & & & \multicolumn{3}{c}{Q: \color{blue}{When did}} & Q: \color{blue}{How many times} & \multicolumn{3}{c}{Q: \color{blue}{What happened}} \\
  & & & \multicolumn{3}{c}{\color{blue}{$s$ happen?}} & \color{blue}{did $s$ occur?} & \multicolumn{3}{c}{\color{blue}{between $t_1$ and $t_2$?}} \\
\cline{4-6} \cline{7-7} \cline{8-10} \\
Method & VMAE features & IM-TS Adapter & mAR & mAP & mIoU & Accuracy & mAR & mAP & mIoU\\
\midrule
Ablation 1 & & & 0.36 & 0.37 & 0.36 & 84.8 & 0.54 & 0.40 & 0.38\\
Ablation 2 & \checkmark & & 0.43 & 0.43 & 0.41 & 86.9 & 0.59 & 0.44 & 0.42\\
Ablation 3 & & \checkmark & 0.37 & 0.38 & 0.38 & 87.0 & 0.59 & 0.44 & 0.41\\
\rowcolor{gray!12}
Ours & \checkmark & \checkmark & \textbf{0.45} & \textbf{0.46} & \textbf{0.43} & \textbf{87.8} & \textbf{0.59} & \textbf{0.45} & \textbf{0.42}\\
\bottomrule
\end{tabular}}
\end{center}
\end{table}
\subsection{Ablations} 
\noindent \textbf{Ablation on Multiple LoRAs and Increased Resolution: }
We analyze the effects of our design choices in Table \ref{ablations} for Cataracts-101 dataset. The first row represents using the default range of $0$--$100$ for representing the timestamps, which is given by $t_{i} = \mathrm{round}(100 \times T_i/D)$, where $t_i$, $T_i$, $D$ represent the new timestamp, original timestamp and duration of the video respectively. In addition, it uses a single LoRA for tuning. The second row represents the range of 100 but incorporates a multiple LoRA approach, with a separate LoRA for each task. The third row has an increased resolution of 1000, but incorporates a single LoRA. Finally, the last row is the proposed method, which has a range of 1000 and uses multiple LoRA modules. We use the manual LoRA selection process in the case of multiple LoRA modules. As seen in the table, each of the design choices has a non-trivial contribution in raising the performance of the model.

\noindent\textbf{Ablation on Model Components: }
VMAE features and the IM-TS modules are the two major components in our method. To gauge the effectiveness of each of these, we perform an ablation by adding these one by one to the Vicuna model and evaluate performance on Cataracts-101 dataset, as seen in Table \ref{ablations_model}. In the first row, we use CLIP embeddings, similar to VTimeLLM. Next, we replace the CLIP features with VMAE features in the second row from the pretraining phase of CatVLM, which significantly increases the performance, showing the effectiveness of pretraining. In the third row, we retain the CLIP features and introduce the IM-TS adapter. Similar to row 2, we also see a rise in metrics, showing the effectiveness of the individual modules. Finally, using both components gives the best metrics as seen in the fourth row. We use a timestamp range of 0 to 1000 and multiple LoRAs with manual selection for all the rows.

\section{Conclusion}
In this work, we present CatVLM, a VLM that is more aware of the finer temporal boundaries of events and is able to perform downstream temporal tasks including Moment Retrieval, Counting and Captioning on untrimmed videos of cataract surgery. CatVLM uses discriminative VMAE features and a novel IM-TS adapter for learning spatio-temporal and timestamp-related features, as well as multiple LoRA experts allowing for a more stable training and the possibility to add more tasks easily without retraining for other tasks. Through our work, we aim to motivate further research in this field by establishing a baseline for three tasks on two datasets. In the current version, CatVLM shows room for improvement on the Video Captioning and Moment Retrieval tasks, mainly due to overfitting. Future directions of research involve extending boundary-aware LLMs to other modalities and adding more tasks to the system.

\begin{figure}[htbp]
\centering
\caption{Qualitative Results. (a) shows the VMR task. Our method captures the step with a high IoU with the GT. (b) represents VC, where the prediction from our model captures all steps with a high IoU. (c) represents a failure case for the VMR task, where one occurrence is captured but the other occurrence is not.} 
\vskip-6pt
\includegraphics[width=0.8\textwidth]{resources/results.png}
\label{fig:results}

\end{figure}

% \begin{table}[htbp]
%  % The first argument is the label.
%  % The caption goes in the second argument, and the table contents
%  % go in the third argument.
% \floatconts
%   {tab:example}%
%   {\caption{An Example Table}}%
%   {\begin{tabular}{ll}
%   \bfseries Dataset & \bfseries Result\\
%   Data1 & 0.12345\\
%   Data2 & 0.67890\\
%   Data3 & 0.54321\\
%   Data4 & 0.09876
%   \end{tabular}}
% \end{table}

% \begin{figure}[htbp]
%  % Caption and label go in the first argument and the figure contents
%  % go in the second argument
% \floatconts
%   {fig:example}
%   {\caption{Example Image}}
%   {\includegraphics[width=0.5\linewidth]{example-image}}
% \end{figure}

% \begin{algorithm2e}
% \caption{Computing Net Activation}
% \label{alg:net}
%  % older versions of algorithm2e have \dontprintsemicolon instead
%  % of the following:
%  %\DontPrintSemicolon
%  % older versions of algorithm2e have \linesnumbered instead of the
%  % following:
%  %\LinesNumbered
% \KwIn{$x_1, \ldots, x_n, w_1, \ldots, w_n$}
% \KwOut{$y$, the net activation}
% $y\leftarrow 0$\;
% \For{$i\leftarrow 1$ \KwTo $n$}{
%   $y \leftarrow y + w_i*x_i$\;
% }
% \end{algorithm2e}

\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{The authors are supported by grants from the National Institutes of Health, U.S.A.; NIH 1R01EY033065 and NIH 1R01EB038734. The content is solely the responsibility of the authors and does not necessarily represent the official views of the National Institutes of Health.}


\bibliography{catvlm}


\appendix

% \section{Proof of Theorem 1}

% This is a boring technical proof of
% \begin{equation}\label{eq:example}
% \cos^2\theta + \sin^2\theta \equiv 1.
% \end{equation}

% \section{Proof of Theorem 2}

% This is a complete version of a proof sketched in the main text.

\end{document}
