\documentclass[pmlr]{jmlr}% new name PMLR (Proceedings of Machine Learning Research)


 % The following packages will be automatically loaded:
 % amsmath, amssymb, natbib, graphicx, url, algorithm2e

 %\usepackage{rotating}% for sideways figures and tables
\usepackage{longtable}% for long tables
\usepackage{multirow}
 % The booktabs package is used by this sample document
 % (it provides \toprule, \midrule and \bottomrule).
 % Remove the next line if you don't require it.
\usepackage{booktabs}
 % The siunitx package is used by this sample document
 % to align numbers in a column by their decimal point.
 % Remove the next line if you don't require it.
\usepackage[load-configurations=version-1]{siunitx} % newer version
 %\usepackage{siunitx}

 % The following command is just for this sample document:
\newcommand{\cs}[1]{\texttt{\char`\\#1}}

 % Define an unnumbered theorem just for this sample document:
\theorembodyfont{\upshape}
\theoremheaderfont{\scshape}
\theorempostheader{:}
\theoremsep{\newline}
\newtheorem*{note}{Note}

 % change the arguments, as appropriate, in the following:
\jmlrvolume{1}
\jmlryear{2023}
\jmlrworkshop{NeurIPS 2023 Gaze Meets ML Workshop}

\title[GazeSAM: Interactive Image Segmentation with Eye Gaze and SAM]{GazeSAM: Interactive Image Segmentation with Eye Gaze and Segment Anything Model}

 % Use \Name{Author Name} to specify the name.
 % Spaces are used to separate forenames from the surname so that
 % the surnames can be picked up for the page header and copyright footer.
 
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % *** Make sure there's no spurious space before \nametag ***

 % Two authors with the same address
  % \author{\Name{Author Name1\nametag{\thanks{with a note}}} \Email{abc@sample.com}\and
  %  \Name{Author Name2} \Email{xyz@sample.com}\\
  %  \addr Address}
% \author{Anonymous}% placeholder from the anonymized submission; the author block that follows is the one in effect

 % Three or more authors with the same address:
 \author{\Name{Bin Wang} \Email{bin.wang@northwestern.edu}\\
  \Name{Armstrong Aboah} \\
  \Name{Zheyuan Zhang} \\
  \Name{Hongyi Pan}\\
  \Name{Ulas Bagci}\\
  \addr Northwestern University, Chicago, IL, 60611, USA}


 % Authors with different addresses:
 % \author{\Name{Author Name1} \Email{abc@sample.com}\\
 % \addr Address 1
 % \AND
 % \Name{Author Name2} \Email{xyz@sample.com}\\
 % \addr Address 2
 %}

% \editor{Editor's name}
 % \editors{List of editors' names}

\begin{document}

\maketitle

\begin{abstract}
Interactive image segmentation aims to assist users in efficiently generating high-quality data annotations through user-friendly interactions such as clicking, scribbling, and bounding boxes.
However, mouse-based interaction methods can induce user fatigue during large-scale dataset annotation and are not entirely suitable for some domains, such as radiology. This study introduces eye gaze as a novel interactive prompt for image segmentation, different from previous model-based applications. Specifically, leveraging the real-time interactive prompting feature of the recently proposed Segment Anything Model (SAM), we present the \textit{GazeSAM} system to enable users to collect target segmentation masks by simply looking at the region of interest.
GazeSAM tracks users' eye gaze and utilizes it as the input prompt for SAM, generating target segmentation masks in real time.
To the best of our knowledge, GazeSAM is the first work to combine eye gaze and SAM for interactive image segmentation.
Experimental results demonstrate that GazeSAM can improve efficiency by nearly 50\% in 2D natural image and 3D medical image segmentation tasks.
The code is available at \url{https://github.com/ukaukaaaa/GazeSAM}.

\end{abstract}
\begin{keywords}
Eye Gaze, Segment Anything Model,  Interactive Image Segmentation, Eye Tracking
\end{keywords}

\section{Introduction}
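% We need massive amounts of data and high-quality annotations to train machine-learning segmentation models
% To obtain high-quality annotated data quickly and efficiently, many works explore interactive segmentation
% What interactive segmentation can do and how it helps annotation
% What the current mainstream interactive segmentation approaches are
% But all of these interactive segmentation approaches are operated with a mouse

% Mouse-based interaction has some drawbacks
%12345

% Therefore we introduce the more natural eye gaze as a replacement for the mouse
% And then how this solves the problems listed above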

%

% Image segmentation is a crucial process in numerous medical applications, such as disease diagnosis, treatment planning, and surgical navigation. It involves the identification of regions of interest (ROIs) in medical images, such as organs, tumors, and lesions \citep{altini2022liver,florez2018emergence,tunali2021application}. Accurate segmentation helps radiologists to understand a patient's condition better and to develop more effective treatment plans. However, the segmentation of medical images has primarily been accomplished through a manual annotation process. This is a costly and time-consuming procedure that can take hours or days to complete. This bottleneck presents a significant barrier to the widespread adoption of image segmentation in clinical practice.

% As a result, there is a growing need for more intelligent and efficient approaches to segmenting medical images. One promising approach which has not yet been explored is the use of eye-tracking technology to perform image segmentation in real time. While previous research in eye tracking has primarily focused on understanding the relationship between human attention and cognitive decision-making \citep{wood2020eye,khosravan2019collaborative}, its potential in automating the segmentation of medical images has yet to be fully realized. By tracking radiologists' eye movements when they read medical images, it is possible to identify the ROIs that are most relevant to them using their gaze points. This rich information can be leveraged to segment the images automatically.

In the field of modern machine learning and computer vision, having large amounts of data and accurate labels is crucial for generalizable deep learning models with high accuracy.
Accurate labeling is particularly important to ensure these models work effectively. 
However, labeling data can be extremely time-consuming and labor-intensive, especially for the image segmentation task. For some fields, such as radiology, this should be done with care as it is a high-risk task and often associated with diagnostic or prognostic decisions.
Traditionally, labeling involves people manually drawing lines at the boundaries of the objects or regions of interest.
To expedite and improve the efficiency of this process, interactive image segmentation was introduced, combining human interaction input and task-specific automated models.

For the human interaction part, there are several existing types, such as clicks \citep{chen2021conditional, lin2020interactive, sofiiuk2020f, sofiiuk2022reviving, liu2022simpleclick, chen2022focalclick}, scribbles \citep{bai2014error, grady2006random, li2004lazy}, bounding boxes \citep{lempitsky2009image, rother2004grabcut, wu2014milcut}, and polygons \citep{acuna2018efficient}. 
However, these methods are all based on mouse interaction. When facing the workload of large-scale datasets with high dimensional nature and low-resolution context, such as in radiology scans, the annotator will easily get tired after repeatedly clicking for precise annotations. 
This can largely decrease the efficiency of the data labeling process and the quality of the data annotations. 
To solve this problem, one promising approach that has not yet been explored extensively is the use of eye gaze to perform interactive image segmentation. 
% This system in fact nicely overlaps with the radiology workflow where radiologists perform the screening and diagnostic tasks with their eyes, and report the findings and measurements with speech-to-text technologies.  
While previous research regarding eye gaze has primarily focused on understanding the relationship between human attention and cognitive decision-making \citep{wood2020eye,khosravan2019collaborative}, its potential in automating the interactive segmentation has yet to be fully realized. 
% We are acknowledging some previous works using the gaze patterns as an input to model based segmentation systems, but never used within the deep segmentation strategies before in real time.

By tracking annotators' eye movements when they label the images, it is possible to identify the regions of interest that are most relevant to them using their gaze points. 
This rich information can be leveraged to segment the images automatically, and there are multiple advantages to using eye gaze as the interaction input. 
\textbf{First,} it is more natural and intuitive because eye gaze-based interaction aligns with how humans naturally perceive objects by simply looking at them. If we take the eye gaze as the prompt for segmentation, it would be more intuitive and user-friendly. 
\textbf{Second,} it can reduce user fatigue significantly. Annotating large-scale datasets with a mouse requires tedious, repetitive clicking. Interacting by eye gaze is much easier and less fatiguing since users do not need to perform physical movements to mark the object or draw bounding boxes.
\textbf{Third,} eye gaze enables faster and more efficient interactions. Users can simply glance at the object they want to segment without the need for mouse clicking or drawing.
\textbf{Fourth,} using eye gaze as interaction input can generate multiple prompt input points in less than one second, which feeds more information into the automated model in a short time. This will increase the accuracy of the generated segmentation masks. In contrast, mouse-based interaction only allows annotators to generate prompt input points one click at a time.


Related to the automated model part, previous studies \citep{chen2022focalclick, liu2022simpleclick, sofiiuk2022reviving} have trained separate models for different segmentation tasks for vision and radiology applications.
This leads to redundant model training efforts and requires the annotators to switch between models for annotations, which is unnecessary.
One solution is to employ the Segment Anything Model (SAM) \citep{kirillov2023segment} instead of separate models. While SAM has shown tremendous success across various domain applications \citep{ma2023segment, wang2023sam, cen2023segment}, its potential to form part of an interactive image segmentation system has not been entirely studied yet. Thanks to SAM's ability to generalize to new domains or even unseen objects and its promptable structure, we could build a general interactive image segmentation system instead of training separate models for different tasks.
More importantly, SAM can enable the system to conduct seamless, real-time interactive segmentation because of its lightweight architecture.

Hence, we propose \textit{GazeSAM} to investigate the feasibility and efficacy of integrating eye gaze with SAM for an interactive image segmentation system. The proposed system uses eye-tracking technology to identify the regions that annotators are interested in and then passes those attention cues as prompts to SAM to segment the images accordingly. The system is designed to be user-friendly, accurate, and fast in generating segmentation results. %It is worth noting that this is the first study to utilize the power of eye gaze and SAM to automate the interactive image segmentation process in real time. 

The major contributions of this work are summarized as follows:
\begin{enumerate}
    \item We propose a novel interactive image segmentation system, called \textit{GazeSAM}, that combines eye gaze with SAM for an efficient and user-friendly data annotation process.
    \item Instead of the mainstream mouse-based interaction methods, we introduce an alternative interaction type, \textit{eye gaze}, which is more natural, intuitive, efficient, and less fatiguing.
    \item \textit{GazeSAM} utilizes SAM's zero-shot power and lightweight architecture to avoid training separate task-specific models and enable real-time interaction.
    \item Our system has the unique capability of operating with 2D and 3D images, typically used in medical settings. This is the first of its kind developed to significantly increase radiologists' work efficiency in daily clinical practice.
    \item We have evaluated \textit{GazeSAM} on the 2D image segmentation datasets GrabCut \citep{rother2004grabcut} and Berkeley \citep{martin2001database}, and on a 3D medical image segmentation dataset \citep{bilic2023liver}. Our results show a significant efficiency improvement while maintaining high accuracy.

\end{enumerate}

% Despite the fact that previous studies have indicated that the Segment Anything Model (SAM) may not be effective in segmenting medical images \citep{roy2023sam,deng2023segment}, the current research is not primarily focused on evaluating the model's performance. Instead, the research objective is to investigate the potential of SAM as a system within a framework for real-time medical image segmentation in a collaborative and interactive context. The results from this work have the potential to open up new research areas and improve the real-time application of SAM in medical image segmentation in collaborative environments, ultimately contributing to the development of more efficient and accurate methods for medical image analysis and potentially enhancing clinical practice and research.

\section{Related Work}
Interactive image segmentation has always been a popular topic in the computer vision field. The reason is that the training of the models requires large amounts of data and high-quality annotations. Traditional annotation methods involve manually marking the object boundary, which is labor-intensive and tedious. DIOS \citep{xu2016deep} incorporates the automated models into the interactive image segmentation task, which increases the efficiency of the labeling significantly. The users are able to segment the target regions by conducting clicks to input the positive and negative prompts for the automated model. Then, the model generates the segmentation mask according to the users' prompt. 

Subsequently, SimpleClick \citep{liu2022simpleclick}, FocalClick \citep{chen2022focalclick}, BRS \citep{jang2019interactive}, and f-BRS \citep{sofiiuk2020f} were proposed to conduct interactive image segmentation based on mouse clicks. 
\citet{lempitsky2009image} utilize the bounding box to replace the single mouse click.
\citet{bai2014error} introduce scribbles as the interaction type, in which the user roughly draws lines on the target object. 
\citet{acuna2018efficient} propose a polygon-based interactive method that enables users to mark the object boundary during annotation.
However, these methods are all mouse-based interaction methods, which leads to user fatigue when doing large-scale dataset annotations. In this paper, we introduce eye gaze-based interaction, which is more natural and user-friendly.

\section{Methods}
\begin{figure}[htbp]
\includegraphics[width=\textwidth]{Figs/framework.pdf}
\caption{Overview of our proposed system.} \label{framework}
\end{figure}

In this section, we describe our proposed framework \textit{GazeSAM} for real-time segmentation mask collection by utilizing a screen-based eye tracker and the Segment Anything Model (SAM). As illustrated in Fig.~\ref{framework}, \textit{GazeSAM} comprises two parts: eye-gaze data collection and segmentation. 

\subsection{Eye-Gaze Data Collection}
In this study, a Tobii Pro Nano screen-based eye tracker is used. It is a small, lightweight, and easy-to-use eye tracker with a length of 170\,mm, a weight of 59\,g, and a sampling rate of 60\,Hz.

Before the experiment, calibration of the eye tracker is required because it ensures the eye movement is tracked accurately and makes the gaze coordinate on the screen consistent with where the user is looking. Here, we adopt a five-point calibration procedure in Tobii Pro eye tracker manager. After completing the calibration, the eye-gaze data can be collected in the form of the location coordinate on the screen.

\subsection{Prompt Transformation \& Segmentation} \label{optionsec}
% SAM utilizes a prompt encoder that accepts varieties of prompt forms, including points, boxes, and text. These prompts enable users to easily and conveniently select the specific object they wish to segment by simply clicking the mouse. This feature enhances the flexibility and usability of SAM, making it an interactive system for segmentation tasks. 
The prompt encoder in SAM is designed to support a wide range of prompt formats, such as points, boxes, and text. To integrate eye-gaze data as a new type of prompt into the SAM, we need to first conduct a prompt transformation.

\begin{figure}[htbp]
\includegraphics[width=\textwidth]{Figs/Option.pdf}
\caption{Two eye-gaze prompt options for segmentation in \textit{GazeSAM}.} \label{option}
\end{figure}

Eye-gaze data can be considered as a sequence of scatter points that correspond to the eye movement over time. Hence, it is possible to convert the eye-gaze data into a point or a sequence of points, which can be utilized as the point prompt for SAM. Prior to this, it is necessary to first solve the coordinate problem. The eye-gaze point coordinates, denoted as $S_1, S_2, \dots, S_n$, are collected in the screen coordinate space. We need to transform them into the image coordinate space as follows:
\begin{equation}
    I_1, I_2, \dots, I_n = f(S_1, S_2, \dots, S_n),
\end{equation}
where $f(\cdot)$ is the mapping function between the two coordinate spaces and $I_1, I_2, \dots, I_n$ are the eye-gaze point coordinates in the image coordinate space.

Then, as illustrated in Fig.~\ref{option}, \textit{GazeSAM} supports two options for inputting the eye-gaze data as a prompt for SAM. The first option is to use the whole sequence of eye-gaze points collected over time, which can provide a more comprehensive representation of the user's gaze trajectory. The second option is to use the eye-gaze point collected at the last time point as the prompt. This approach is more appropriate when a coarse segmentation mask of a single object is desired.

It is noted that SAM might not always generate a perfect segmentation mask, especially for the boundary regions. To refine the generated mask, users need to manually add points to those regions, which can be tedious and time-consuming. In the first option, \textit{GazeSAM} simplifies this process by allowing users to add points by simply looking at the desired areas. In this way, a more efficient approach to refine the segmentation mask is offered, which has the potential to greatly enhance the user experience and speed of the whole pipeline.

Given a pre-computed image embedding and the prompt transformed from eye-gaze data, SAM can generate a segmentation mask subsequently in near real-time, making it an interactive segmentation system by using eye-tracking technology.




%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Experiments}
\begin{figure}[htbp]
\includegraphics[width=\textwidth]{Figs/setting.pdf}
\caption{Our designed user interface and experiment setting of eye-gaze data collection.} \label{setting}
\end{figure}

\subsection{Interactive Image Segmentation User Interface}
As illustrated in Fig.~\ref{setting}(a), we develop a user interface for the \textit{GazeSAM} system to conduct the interactive image segmentation. 
It incorporates multiple functions for the users. 
The function panel is situated on the left side of the interface, and each function can be easily activated either using corresponding keyboard shortcuts or clicking the buttons. 
``Select Image'' is for choosing the 2D or 3D image files to annotate. In the following experiments, we enable the user to choose a folder that contains all the image files and switch between the previous and next image on the screen by pressing the left or right key. 
Once the user clicks the ``Start Tracking'' button, the eye-gaze point is displayed as a hollow red circle, which tracks the user's eye movement in real time. 
The red circle will follow the trajectory of the user's eye as it moves across the screen. 
When the experiment is finished, ``Stop Tracking'' closes the eye tracking system. 
The ``Show Tracks'' option displays the eye movement trajectory. 
These eye movement tracks are composed of multiple eye-gaze points, shown as yellow dots in Fig.~\ref{option}(a), that can be taken as the model prompt input. 
``Clear Tracks'' enables the user to delete current eye-gaze points and restart the eye gaze recording. 
For convenience, we also allow the user to press the ``A'' key to clear the eye gaze and restart automated segmentation. 
``Load SAM'' reads the pretrained SAM weights at the beginning of the experiments. 
For segmentation, GazeSAM provides two options (``One Point'' and ``All Points'') as described in Section~\ref{optionsec}. 
Once an option is activated, a segmentation mask is generated in real time and shown in green. 
This mask is automatically updated based on the location of the user's eye gaze, allowing for dynamic adjustments to the segment target or iterative refinement of the segmentation.
Users can save the eye-captured segmentation mask displayed on the interface by using the ``Save Mask'' function at any point during the process.
For 3D medical images, a scroll bar is provided on the right side of the interface to control the slice image selection.

\subsection{Experiment Settings}
As illustrated in Fig.~\ref{setting}(b), the eye tracker is positioned directly below the lower edge of the display, and the user maintains a viewing distance of approximately 60\,cm from the screen. 

During the experiment, the annotator first loads the pretrained SAM weights and then conducts the calibration for the eye tracker. After that, the test image list is read, and the annotation time is recorded from the start of the first image until the labeling of the last image is finished. 

Due to the absence of mouse clicks in eye gaze-based interaction, we cannot employ the primary evaluation metric, Number of Clicks (NoC) at Intersection over Union (IoU), in our comparative experiment. 
Instead, we assess efficiency improvement by directly comparing annotation time costs. 
Additionally, we utilize the mean Intersection over Union (mIoU) as the metric to evaluate accuracy.

To ensure a fair comparison, we exclusively select SAM as the backbone model and assess efficiency by comparing eye gaze-based interaction with mouse-based interaction.


\subsection{2D Interactive Image Segmentation}
In this experiment, the annotator is asked to label the data on the GrabCut \citep{rother2004grabcut} and Berkeley \citep{martin2001database} datasets. 
The GrabCut dataset comprises 50 images, each containing a single object, while the Berkeley dataset consists of a total of 100 images.

\begin{table}[hbtp]
\floatconts
  {tab:example-booktabs}
  {\caption{Quantitative results on the GrabCut and Berkeley datasets.}}
  {\begin{tabular}{ccccc}
  \toprule
  \multirow{2}{*}{ Model } &  \multicolumn{2}{c}{ GrabCut } &  \multicolumn{2}{c}{ Berkeley }  \\
    & Time/s $\downarrow$ & mIoU/\% $\uparrow$ & Time/s $\downarrow$ & mIoU/\% $\uparrow$ \\
  \midrule
  GazeSAM & \textbf{125} &92.10 & \textbf{266}& 85.56\\
  SAM + Mouse & 232 & 92.31& 424& 88.33\\

  \bottomrule
  \end{tabular}}\label{table}
\end{table}
\begin{figure}[htbp]
\includegraphics[width=\textwidth]{Figs/visual.pdf}
\caption{Examples from the GrabCut and Berkeley datasets.} \label{visual}
\end{figure}
The results presented in Table~\ref{table} demonstrate that annotators can complete segmentation labeling work in nearly 50\% less time when using the eye gaze-based interaction system compared to the mouse-based interaction system. This highlights the efficiency advantages of the eye gaze-based approach.

Moreover, when we compare the mIoU scores, it becomes apparent that the efficiency gains achieved by the GazeSAM system do not come at the expense of accuracy. The accuracy score is comparable between the two systems, demonstrating that eye gaze-based interaction offers an efficiency boost while maintaining accuracy comparable to the current mainstream mouse-based interaction system.
From the visualization of generated segmentation masks in Fig.~\ref{visual}, we can also find that the accuracy performance between eye gaze-based and mouse-based interaction systems is similar.

The reason for the higher efficiency of eye gaze-based interaction is readily apparent.
As illustrated in Fig.~\ref{mousegaze}, users can rapidly obtain multiple input points simply by glancing at the target region in less than a second. 
In contrast, the mouse-based interaction requires the user to click the target area to refine the segmentation performance, which takes more than one second. 
Consequently, we can conclude that the eye gaze-based interaction system offers a distinct advantage over the mouse-based interaction system in terms of efficiency.
\begin{figure}[htbp]
\includegraphics[width=\textwidth]{jmlr/Figs/mouse and gaze.pdf}
\caption{Efficiency comparison between eye gaze-based interaction and mouse-based interaction.} \label{mousegaze}
\end{figure}



\subsection{3D Interactive Medical Image Segmentation}
\begin{figure}[htbp]
\includegraphics[width=\textwidth]{jmlr/Figs/3d.pdf}
\caption{Some examples of 3D interactive medical image segmentation.} \label{3d}
\end{figure}
The GazeSAM system extends its capabilities to support 3D image segmentation, which we have successfully applied to the field of 3D medical image segmentation. 
This system is designed to be user-friendly for radiologists, enabling them to conveniently adjust the slice image and zoom in or out to gain a clearer view of the target area.
As depicted in Fig.~\ref{3d}, we can observe that the organ has been successfully and clearly segmented. We have also provided a demo video for the 3D medical image segmentation in our GitHub repository. It is worth noting that the experience is exceptionally smooth when the radiologist stares at one specific organ and controls the slice using the mouse slicer. This leads to the potential for improving the efficiency of daily clinical workflows, offering radiologists a feasible tool for assisting precise and efficient medical image analysis.

Given that SAM is primarily trained on natural images, its ability to infer accurate segmentation on medical images is limited. While GazeSAM offers a more efficient approach to improve segmentation quality by simply looking at the desired areas and incorporating more eye-gaze prompts in regions with poor segmentation, its performance is still restricted in some cases. To overcome this limitation, fine-tuning SAM on a large-scale medical image dataset is a possible solution~\citep{ma2023segment, cheng2023sam}. Besides, we can also use the generated mask as a coarse segmentation result for further refinement.


\section{Conclusion}
In this study, we propose GazeSAM, a novel interactive image segmentation system that utilizes eye gaze as the interactive prompt instead of mouse-based interactions such as clicks, scribbles, bounding boxes, and polygons. Our system takes advantage of SAM's zero-shot power and lightweight architecture to avoid training separate task-specific models and enable real-time interaction. By evaluating GazeSAM on 2D and 3D images, we observe that it offers a significant efficiency improvement for the annotation workflow.

\acks{This study is supported by NIH R01-CA246704, R01-CA240639, R15-EB030356, R03-EB032943, U01-DK127384-02S1, and U01-CA268808.}

\bibliography{pmlr-sample}

\end{document}


\subsection{3D Medical Image Segmentation}

\begin{figure}[htbp]
\includegraphics[width=\textwidth]{Figs/3d.pdf}
\caption{Visualization of GazeSAM with 2D image and 3D image.} \label{3d}
\end{figure}






%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Discussion}
Given that SAM is primarily trained on natural images, its ability to infer accurate segmentation on medical images is limited. While GazeSAM offers a more efficient approach to improve segmentation quality by simply looking at the desired areas and incorporating more eye-gaze prompts in regions with poor segmentation, its performance is still restricted in some cases. To overcome this limitation, fine-tuning SAM on a large-scale medical image dataset is a possible solution~\citep{ma2023segment}. Besides, we can also use the generated mask as a coarse segmentation result for further refinement.


\section{Conclusion}


% \acks{Acknowledgements go here.}
\newpage
\bibliography{pmlr-sample}


\end{document}



time

            grabcut          berkeley        
SAM+Gaze    125                266
SAM+Mouse   232                424


Accuracy

            grabcut          berkeley        
SAM+Gaze    0.9210              0.8556
SAM+Mouse   0.9231              0.8833


Visualization

groundtruth     SAM+Gaze(gaze track)    SAM+Mouse(click)
                                                            grabcut
                                                            grabcut
                                                            berkeley
                                                            berkeley
                                                            medical
                                                            medical
                                                            