\documentclass[pmlr]{jmlr}% new name PMLR (Proceedings of Machine Learning Research)

 % The following packages will be automatically loaded:
 % amsmath, amssymb, natbib, graphicx, url, algorithm2e

 %\usepackage{rotating}% for sideways figures and tables
\usepackage{longtable}% for long tables
\usepackage{comment}
\usepackage{float} 
 % The booktabs package is used by this sample document
 % (it provides \toprule, \midrule and \bottomrule).
 % Remove the next line if you don't require it.
\usepackage{booktabs}
 % The siunitx package is used by this sample document
 % to align numbers in a column by their decimal point.
 % Remove the next line if you don't require it.
\usepackage[load-configurations=version-1]{siunitx} % newer version
 %\usepackage{siunitx}


% \documentclass{article}
\usepackage{graphicx}
\usepackage{amsmath}
% \usepackage{subcaption}


%%%%delete this package - athul
\usepackage{soul}

 % The following command is just for this sample document:
\newcommand{\cs}[1]{\texttt{\char`\\#1}}

 % Define an unnumbered theorem just for this sample document:
\theorembodyfont{\upshape}
\theoremheaderfont{\scshape}
\theorempostheader{:}
\theoremsep{\newline}
\newtheorem*{note}{Note}

 % change the arguments, as appropriate, in the following:
\jmlrvolume{1}
\jmlryear{2023}
\jmlrworkshop{NeurIPS 2023 Gaze Meets ML Workshop}

\title[Leveraging Multi-Modal Saliency and Fusion for Gaze Target Detection]{Leveraging Multi-Modal Saliency and Fusion for Gaze Target Detection}


 % Use \Name{Author Name} to specify the name.

 % Spaces are used to separate forenames from the surname so that
 % the surnames can be picked up for the page header and copyright footer.
 
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % *** Make sure there's no spurious space before \nametag ***

 % Two authors with the same address
  % \author{\Name{Author Name1\nametag{\thanks{with a note}}} \Email{abc@sample.com}\and
  %  \Name{Author Name2} \Email{xyz@sample.com}\\
  %  \addr Address}

 % Three or more authors with the same address:
 \author{\Name{Athul {M. Mathew}} \Email{amathew@elm.sa}\\
  \Name{Arshad Ali Khan} \Email{arkhan@elm.sa}\\
  \Name{Thariq Khalid} \Email{tkadavil@elm.sa}\\
  \Name{Faroq AL-Tam} \Email{faltam@elm.sa}\\
  \Name{Riad Souissi} \Email{rsouissi@elm.sa}\\
  \addr Elm Company, Saudi Arabia}




%\editor{Editor's name}

 % \editors{List of editors' names}
\usepackage{xcolor}
\begin{document}

\maketitle
\begin{abstract}
Gaze target detection (GTD) is the task of predicting where a person in an image is looking. This is a challenging task, as it requires the ability to understand the relationship between the person's head, body, and eyes, as well as the surrounding environment. In this paper, we propose a novel method for GTD that fuses multiple pieces of information extracted from an image. First, we project the 2D image into a 3D representation using monocular depth estimation. We then extract a depth-infused saliency module map, which highlights the most salient (\textit{attention-grabbing}) regions in the image for the subject under consideration. We also extract face and depth modalities from the image, and finally fuse all the extracted modalities to identify the gaze target. We quantitatively evaluated our method, including an ablation analysis, on three publicly available datasets, namely VideoAttentionTarget, GazeFollow and GOO-Real, and showed that it outperforms other state-of-the-art methods. This suggests that our method is a promising new approach for GTD.
\end{abstract}
\begin{keywords}
gaze target detection, gaze-following, 3D gaze, free-viewing, saliency, depth map, 3D projection, point cloud, multi-modal, fusion  
\end{keywords}

\section{Introduction}
\label{sec:intro}

It is a general phenomenon that human gaze acts as a natural cue which provides rich contextual information on the attention of individuals when it comes to social interactions, engagements and communication. Gaze is a fundamental means of human communication, since it can express emotions, feelings, and intentions~\citep{lund2007importance}. Human beings have a remarkable capability to follow the gaze of others to understand their gaze target, understand whether a person is gazing at them and determine the attention of others~\citep{connecting_gaze_scene_attention}. GTD, also known as \textit{gaze-following}, is an active research area and can have a wide range of applications, including human-computer interaction, educational assessment, and the treatment of patients with cognitive or neurological disorders, such as early diagnosis of ADHD (Attention Deficit Hyperactivity Disorder) in children.


Gaze estimation (GE) utilizes eye or facial images of a person to estimate the direction of gaze for the person~\citep{10.1007/s00138-017-0852-4}~\citep{cvpr2016_gazecapture}. These methods utilize facial features of the person, such as the eyes, nose, and mouth, to estimate the three-dimensional orientation (i.e. \textit{Yaw, Pitch} and \textit{Roll}) of the face and predict the gaze direction. These methods can be less accurate, especially in challenging conditions such as low lighting or when the person is wearing glasses. Typical gaze detection and tracking systems often require a calibration step, where the user is asked to look at various points before usage of such a system. Also, such gaze tracking systems are constrained because they are designed to monitor the gaze of one person when the person is well situated within the confined space of monitoring for the gaze tracking system. An example of such a system is a driver attention or fatigue detection system in vehicles where the system is expected to monitor the gaze of the driver seated on the driving seat. 

Unlike GE, the GTD of a given person in an image involves learning relationship between the relative position of the person within the scene and the surrounding objects that lie within the field-of-view of that person. A robust and scalable gaze target assessment system is needed to identify the salient objects that people are likely looking at. While significant progress has been made in GTD from images, incorporating depth-related contextual cues remains a challenge. Such cues can enhance the accuracy and robustness of GTD, but reconstructing these cues from 2D images in multiple scenarios is difficult. This problem is further exacerbated when detecting the gaze target when the person is looking at a point on a smartphone screen~\citep{zhang2015appearance}, or when predicting fixation on an object when the person is looking at a salient object within or outside the frame~\citep{9157393}. In this paper, we focus on in-frame GTD due to time and scope constraints.

\begin{figure}[h]
\centering
  {\includegraphics [trim=1cm 2.8cm 2cm 1cm,scale=0.25]{images/gaze_architecture_overall2.jpg}}
\caption{Overview of our multi-modal saliency and fusion architecture for gaze target detection. }
\label{Figure:gaze_target_overall}
\end{figure}

Our GTD architecture consists of two modules, as shown in~\figureref{Figure:gaze_target_overall}. Firstly, the \textbf{D}epth-\textbf{I}nfused \textbf{S}aliency \textbf{M}odule (DISM) extracts a map (binary mask) that highlights objects and artefacts that are in the line-of-sight of the subject's gaze within the scene. This map is predicted from the scene features that lie along the subject's field-of-view, based on contextual cues such as the subject's depth and spatial positioning in the scene. The second module, \textbf{M}ulti-\textbf{M}odal \textbf{F}usion (MMF), blends the DISM map with rich representations of scene, depth, and head features. This fusion process creates a unified representation of the scene that incorporates information from multiple sources. Together, the two modules infer the gaze fixation of a given subject within the image.
We further elaborate this method in \sectionref{sec:our_methodology}. It is important to mention that many visual-attention-based saliency models {~\citep{itti2001computational}~\citep{Judd_2009}}  focus on identifying visual gaze fixations of a person \textit{free-viewing} a natural image. In GTD, people in the picture might fixate on objects even when they are not the most salient \citep{nips15_recasens}. Our work goes beyond this, since we learn gaze target with respect to a person's viewpoint, and the learnt saliency map adapts based on the person's location and facial orientation within the image. In this research, our network in DISM aims to identify salient scene artefacts specific to the person in consideration within the image and this may not necessarily be the \textit{free-viewing} saliency. Additionally, our MMF module introduces dedicated attention layers to distill the learnt saliency across face, scene, and depth branches and fuses the multi-modal embeddings effectively.  Such multimodal design leads to achieving better performance when compared to those presented in similar contemporary prior-art research. 

The remainder of this paper is organized as follows: \sectionref{related_work} reviews state-of-the-art GE and GTD methods. \sectionref{sec:our_methodology} lays out our methodology, including our proposed architecture for extraction of DISM map and MMF. \sectionref{sec:experiments} discusses extensive experimental results and evaluation against established benchmarks using Area Under the Curve (AUC), Distance and Angular metrics. In~\sectionref{sec:conclusion}, we summarize our research findings, and \sectionref{sec:future} sets out our goals for extending this work in the future.

%------------------------------------------------------------------------
\section{Related Work}
\label{related_work}
The study of automatic gaze analysis can be divided into two categories: gaze estimation (GE) and gaze target detection (GTD)~\citep{9157393}~\citep{9577574}~\citep{recasens2017following}. GE estimates the direction of a person's gaze, typically in 3D, and does not necessarily focus on precisely locating the object of their interest~\citep{zhang2018training}\citep{Guo2023LiteGaze:Estimation}. Methods such as~\cite{parks2015augmented} estimate the gaze direction and do not identify the objects that are being attended to. On the other hand, ~\cite{liu20203d} uses a head-mounted eye-tracker to estimate the user's point of gaze. Similarly \cite{thakur2021predicting} also elaborate a method to detect where each person in the scene is looking by fusing videos and Inertial Measurement Unit (IMU) data. However, both these methods focus on GTD from the first-person viewpoint. There have been significant developments in gaze and saliency mapping, but robust 3D gaze orientation determination is still a challenging problem. \cite{connecting_gaze_scene_attention} rely on a 3D gaze angle regression model for GTD. Subsequently, \cite{9157393} extends the work to include temporal information to directly output an estimate of gaze uncertainty. In this paper, we focus on GTD with in-the-wild images, captured from a third-person viewpoint. 

GTD has been evolving given the adaptation of computer vision technologies in human gaze research. It has become evident that in domains where close-level iris/eye tracking is not possible, head pose is the most important feature for estimating human focus of attention, along with other semantic information. While modelling 3D gaze requires additional human annotations~\citep{JIN2022104924}, models trained on such datasets still struggle to generalize to common scenarios. \cite{9577574} have pinpointed three significant issues in previous research. Firstly, most research works explore the gaze direction in a 2D representation without encoding the depth modality. Secondly, salient objects are searched for using 2D visual cues without depth understanding. Finally, mapping functions are learnt directly from head position to gaze direction without considering the relationship between eyes and head. Thus a comprehensive understanding of 3D scenes is essential to identify candidate objects lying at different depth layers along the subject's gaze direction.

While many approaches learn the mapping function from head features to gaze direction using 2D visual cues~\citep{nips15_recasens}~\citep{connecting_gaze_scene_attention}~\citep{9157393}~\citep{recasens2017following}, estimating depth information from a RGB image is essential to accurately predicting the gaze target.~\cite{9577574} and~\cite{Tonini_2022} introduced a dedicated depth branch to embed depth-related cues within their architecture. However, conventional depth estimation is a challenging task given the ill-posed nature of estimating real depth from a single RGB image. In the context of GTD, for example, using monocular depth estimation there may be multiple solutions in terms of estimating the distance from a viewpoint for a given target. Recently, deep neural networks have mitigated this problem by exploiting multiple visual cues such as relative size, brightness, patterns, and vanishing points extracted from an RGB image~\citep{Shim2023Depth-RelativeEstimation}. New deep-learning frameworks for head localization and pose estimation on depth images are being used to tackle issues arising from poor illumination conditions, occlusion, and dynamic scenes (e.g., in low light and with illumination changes during the day). Moreover,~\cite{Shim2023Depth-RelativeEstimation} propose a transformer-based approach called Relative Depth Transformer (RED-T), which uses relative depth as guidance in self-attention, such that the model assigns high attention weights to pixels of close depth and low attention weights to pixels of distant depth. However, transformer-based models typically require longer training and inference time than CNN counterparts, especially when a multimodal solution is adopted in the inference pipeline. Another challenge is the 3D reconstruction of the scene once the depth estimation is done, in order to place objects in 3D space and align them with their estimated depths.

Saliency mapping, in depth estimation, is another aspect that involves identifying and highlighting the most visually significant regions in a scene contributing to the depth perception element. Researchers like~\cite{nips15_recasens} and~\cite{9157393} present methods that completely discard utilization of depth modality within their networks which may prevent the GTD algorithms from identifying salient regions in a scene from the perspective of a human observer. \cite{Tonini_2022} include a dedicated branch for depth modality, where the depth is processed as pixels in 2D image space. ~\cite{9577574} extract a depth-based attention map, however, they utilize only the coarse depth features for extraction of the saliency map. 

One of the challenges arising from integrating 2D annotation with 3D scene models for GTD is the semantic understanding of, and interaction with, the scene. \cite{9878884} propose a GTD method that explicitly models 3D scenes using only 2D gaze annotations. Their research is particularly interesting because it considers 3D geometry to model the scene for GTD. However, this method assumes that the front-most object is always the salient object and this may not be valid at all times. 

\cite{tu2022end} propose the Human Gaze Target detection Transformer (HGTTR), which simultaneously detects multiple human head locations and their associated gaze targets in an image (instead of salient object detection and gaze prediction separately). This approach is more computationally efficient than the traditional two-stage head location and gaze target detection pipeline. However, HGTTR reports high false positives in images with a single human gaze target, making it less attractive for users evaluating gaze target detection datasets such as GazeFollow.

In terms of interesting real-world use cases,~\cite{Senarath2022RetailEnvironments} proposed Retail Gaze, and \cite{tomas2021goo} proposed GOO, both of which are datasets for GTD in real-world retail environments. There has also been some work in the context of the classroom gaze measurement use-case. \cite{Ahuja2021} develop a new computer vision system that powers a 3D ``digital twin'' of the classroom. GTD is performed by post-processing the estimated gaze orientation of the face and ArUco markers~\citep{10.1016/j.patcog.2014.01.005} placed on objects around the classroom. Furthermore, \cite{OmerSumer2021multimodalengagement} experimented with multimodal engagement analysis from facial videos in the classroom.

%------------------------------------------------------------------------
\section{Our Methodology}
\label{sec:our_methodology}


Figure~\ref{Figure:gaze_target_overall} outlines our method, which extracts the DISM map $S_i$ using the depth map $D_i$, binary face mask $M_i$, and cropped face $F_i$. The scene image $I_i$ and the other modalities $D_i$, $F_i$, $M_i$ and $S_i$ are fused to estimate the gaze target point of any given person in an image. The complete procedure is given in \algorithmref{alg:method_algo}.

\begin{algorithm2e}[H]
\scriptsize
\caption{Method for Gaze Target Detection}
\label{alg:method_algo}

% Declare a proper "Given" keyword instead of passing the whole sentence as a keyword label.
\SetKwInput{KwGiven}{Given}

\KwIn{Image stream $I_1, \ldots, I_n$}
\KwOut{Heatmap $H_i$}
\KwGiven{Depth-Infused Saliency network $f_{ds}$, Scene Branch $f_s$, Depth Branch $f_d$, Face Branch $f_f$, Fusion Branch $f_n$}



\SetKwFunction{dism}{Depth\_Infused\_Saliency}
    \SetKwProg{Fn}{Function}{:}{}
    \Fn{\dism{$D_i$,$M_i$,$F_i$}}{
        \hspace{1.0cm}
        \textbf{return} $S_i  = f_{ds}(D_i, M_i, F_i)$ 
}
\textbf{End Function}



\SetKwFunction{mmf}{Multi\_Modal\_Fusion}
    \SetKwProg{Fn}{Function}{:}{}
    \Fn{\mmf{$S_i$,$I_i$,$D_i$,$M_i$,$F_i$}}{
        \hspace{1.0cm}
        scene\_features = $f_s$(concatenate($I_i$,  $M_i$)) \\
        \hspace{1.0cm}
        depth\_features = $f_d$(concatenate($D_i$,  $S_i$)) \\
        \hspace{1.0cm}
        face\_features = $f_f$($F_i$) \\
        \hspace{1.0cm}
        modulated\_scene\_features = modulate\_scene(scene\_features, face\_features, $S_i$) \\
        \hspace{1.0cm}
        modulated\_depth\_features = modulate\_depth(depth\_features, face\_features, $M_i$) \\
        \hspace{1.0cm}
        \textbf{gaze\_fixation} = $f_n$(modulated\_scene\_features, modulated\_depth\_features) \\
        \hspace{1.0cm}
        return \textbf{gaze\_fixation} \\
        
}
\textbf{End Function}



\SetKwFunction{gtd}{GazeTargetDetection}
    \SetKwProg{Fn}{Function}{:}{}
    \Fn{\gtd{$I_i$, $D_i$, $M_i$, $F_i$}}{
            \hspace{1.0cm}
            $S_i$ = \dism{$D_i$,$M_i$,$F_i$}\\
            \hspace{1.0cm}
            gaze\_fixation = \mmf{$S_i$,$I_i$,$D_i$,$M_i$,$F_i$} \\
            \hspace{1.0cm}
            return \textbf{gaze\_fixation}
}
\textbf{End Function}

\hspace{1.0cm}

\For{$i\leftarrow 1$ \KwTo $n$}{
    \hspace{1.0cm}
    Extract $D_i$, $M_i$, $F_i$ from $I_i$\\
    \hspace{1.0cm}
    \textbf{$H_i$} = \gtd{$I_i$, $D_i$, $M_i$, $F_i$}\\
    \hspace{1.0cm}

}
\end{algorithm2e}



%-------------------------------------------------------------------------
\subsection{Depth-infused Saliency Module (DISM)}


The depth-infused saliency network, $f_{ds}$, uses a Feature Pyramid Network (FPN) architecture~\citep{lin2017feature} to learn high-level semantic saliency for the subject of interest. The network takes a concatenated 7-channel input of the scene depth map $D_i$, binary head position mask $M_i$, and face image $F_i$. $M_i$ and $D_i$ encode the subject's relative three-dimensional position in the scene. $F_i$ helps the model to focus on scene artefacts that lie along a projection plane originating from the subject's facial position along the depth axis and are directed parallel to the facial orientation vector. The network finally predicts a DISM map, $S_i$ that highlights the most likely gaze fixation artefacts for the subject in the scene. An overview of the depth-infused saliency network $f_{ds}$ is shown in Figure~\ref{Figure:depth_infused_saliency_map}.

It is our intention to simplify the learning objective of DISM and utilize it to provide rich cues for the MMF module. We bin the human gaze direction $\theta$ along the depth plane $\theta_d$ into \textit{forward} ($\theta_{d_f}$)$(90^{\circ})$, \textit{intermediate-forward} ($\theta_{d_{if}}$)$(45^{\circ})$, \textit{same-plane} 
 ($\theta_{d_s}$)$(0^{\circ})$, \textit{intermediate-backward} ($\theta_{d_{ib}}$)$(-45^{\circ})$ and \textit{backward} ($\theta_{d_b}$)$(-90^{\circ})$ directions. The gaze direction along the image plane $\theta_{xy}$ is binned into \textit{lower-right} ($\theta_{{xy}_{lr}}$)$(30^{\circ})$, \textit{straight} ($\theta_{{xy}_s}$)$(90^{\circ})$, \textit{lower-left} ($\theta_{{xy}_{ll}}$)$(150^{\circ})$, \textit{upper-left} ($\theta_{{xy}_{ul}}$)$(220^{\circ})$ and \textit{upper-right} ($\theta_{{xy}_{ur}}$)$(320^{\circ})$ directions. At any instance, the 3D-gaze angle will comprise an image plane component and a depth plane component. That is, $\theta =[\theta_{xy}, \theta_d]$, where $\theta_{xy} \in \{\theta_{{xy}_{lr}},\theta_{{xy}_s}, \theta_{{xy}_{ll}},\theta_{{xy}_{ul}},\theta_{{xy}_{ur}} \}$ and  $\theta_d \in \{ \theta_{d_f}, \theta_{d_{if}}, \theta_{d_s}, \theta_{d_{ib}}, \theta_{d_b} \}$. 

\begin{figure}[h!]
 \centering
\floatconts
  {Figure:depth_infused_saliency_map}
  {\caption{Overview of DISM. We take 3D projection of depth map $P_d$ alongside gaze binning parameters $\theta_d$ and $\theta_{xy}$ to extract a sub-collection of filtered 3D points $P_c$. The re-projection of $P_c$ back to the image-plane serves as pseudo-labels for the FPN network, $f_{ds}$. The network provides a representation of the learned DISM map $S_i$.}}
  {\includegraphics[trim=1cm 2cm 1cm 1cm,scale=0.30]{images/depth-infused saliency map1.jpg}}
  
\end{figure}
\vspace*{-5.0mm}
\subsubsection{Data pre-processing}
\label{sec:data_preprocessing}

Pseudo-labels for DISM require relative depth between face and target point, computed using monocular depth estimation techniques such as~\cite{ranftl2020robust} and~\cite{ranftl2021vision}. Depth plane gaze angle $\theta_d$ is extracted using average depth of face $d_f$ and target points $d_t$, and empirically setting depth plane binning thresholds $\gamma_1$ and $\gamma_2$ to $3$ and $10$ respectively.
 
\begin{equation}
\label{eq:eq_1}
\theta_d =
    \begin{cases}
    \theta_{d_s} & \text{, if } \lvert d_f - d_t \rvert \le \gamma_1 \\
    \theta_{d_{if}}  & \text{, if } \gamma_1 < d_f - d_t \le \gamma_2 \\
    \theta_{d_{ib}} & \text{, if } \gamma_1 < d_t - d_f \le \gamma_2 \\
    \theta_{d_f} & \text{, if } d_f - d_t > \gamma_2 \\
    \theta_{d_b}  & \text{, if } d_t - d_f > \gamma_2
    \end{cases}
\end{equation}

The gaze direction along the image plane $\theta_{xy}$ is extracted from the pixel positions of the eye $(e_x,e_y)$ and the gaze target $(g_x,g_y)$. The image angle $\alpha$ between these two points is discretized so that $\theta_{xy}$ assumes one of the values in $\{\theta_{{xy}_{lr}},\theta_{{xy}_s}, \theta_{{xy}_{ll}},\theta_{{xy}_{ul}},\theta_{{xy}_{ur}} \}$. $\alpha$ is computed as:
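\begin{equation}
\label{eq:eq_2}
\alpha =
\operatorname{atan2}\left(g_y-e_y,\; g_x-e_x\right) \bmod 360^{\circ}
\end{equation}
\textit{where} the two-argument arctangent resolves the quadrant from the signs of $g_y-e_y$ and $g_x-e_x$, so that $\alpha$ covers the full range spanned by the image-plane bins; a single-argument $\arctan$ of the gradient $\frac{g_y-e_y}{g_x-e_x}$ between the eye location and gaze fixation points would be limited to $(-90^{\circ}, 90^{\circ})$ and is undefined when $g_x = e_x$.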
\subsubsection{Depth-infused saliency map}
\label{sec:saliency_map}

The dataset $A$ comprises $N$ images such that $A = \{I_i, D_i, M_i, F_i\}_{i=1}^N$, where $I_i \in \mathbb{R}^{H_i \times W_i \times 3}$ is the $i$-th image in the dataset. $H_i$ and $W_i$ denote the height and width of the image. $D_i$ is the depth map of $I_i$. The binary mask of the head position of the subject within the scene is denoted as $M_i$ and $F_i$ is the cropped face of the subject. The DISM map for all images is represented as $S = \{S_i\}_{i=1}^N=\{ \{s_i^m\}_{m=1}^{H_i \times W_i}\}_{i=1}^N$, where $s_i^m \in \{0,1\}$ denotes the presence of the $m$-th pixel in the DISM map of the $i$-th image, with $m \in \{1,\dots,H_i \times W_i\}$. We can represent the network $f_{ds}$ which predicts the DISM map $S_i$ as:

\begin{equation}\label{eq:f_ds}
S_i = f_{ds}(D_i, M_i, F_i)
\end{equation}

In order to extract the ground truth DISM map $S_i$ (pseudo-labels), the depth map is projected onto a 3D grid representation using focal length $(f_x, f_y)$ and optical centre $(c_x, c_y)$ of the depth camera parameters from Places dataset. The extrinsic parameters are assumed to be an identity matrix. Let $P_d$ be the collection of 3D projection points of the depth map $D_i$. For every pixel location $(a,b)$ of the depth map, with row index $a$ and column index $b$, $p$ is a point within the collection $P_d$ such that:

\begin{equation}
p  =
    \begin{cases}
    p_{x} = \frac{(b-c_x) D_i[a,b] }{f_x}\\
    p_{y}  = \frac{(a-c_y) D_i[a,b]}{f_y} \\
    p_{z} = D_i[a,b]
    \end{cases}
\end{equation}


A cuboid aligned along $\theta_{xy}$ in the XY plane and $\theta_d$ in the Z plane is projected from the face position in 3D space. The orientation of the 3D projection cuboid is determined using the depth plane and image plane gaze angle parameters $\theta_{d}$ and $\theta_{xy}$ obtained from \equationref{eq:eq_1} and \eqref{eq:eq_2}, respectively. $P_c$ is the collection of 3D points within the volume of the projected cuboid, where $P_c \subset P_d$. The collection $P_c$ is then re-projected back to the image plane as a binary mask using the depth camera parameters to finally derive the DISM map $S_i$. The network is trained with the objective of minimizing the Jaccard distance (JD) between the ground truth saliency mask, $S_i$, and the predicted saliency mask, $\hat{S_i}$. We have opted for JD as it is considered suitable for binary segmentation tasks or mask comparison, especially in our case where precise delineation of regions matters a lot. The metric provides normalized measures of IOU along with computational efficiency and interpretability benefits, which allows for meaningful comparison across different scales. The objective function to minimize JD is given by $L_j$ as:
\begin{equation}
L_j(S_i,\hat{S_i}) = 1 - \frac{(S_i \cdot \hat{S_i}) + \epsilon}{(S_i + \hat{S_i} - S_i\cdot \hat{S_i}) + \epsilon}
\end{equation}

where $\epsilon$ prevents zero division. 


 Our method in DISM generates 3D point clouds and embeds complete 3D information when modelling the DISM map and makes no assumptions regarding the spatial relationship of the salient points with respect to the subject. Our network supports deep supervision and is trainable end-to-end. Furthermore, our network is groundbreaking in its approach, as it addresses the task of modelling the likelihood of a subject's gaze location within the scene as if it were a scene segmentation problem.

%-------------------------------------------------------------------------
\subsection{Multi-Modal Fusion (MMF) module}

The MMF network, $f_{mm}$, is shown in~\figureref{Figure:gaze_target_architecture}. It outputs a heatmap $H_i \in  \mathbb{R}^{h \times w}$, where $h \times w$ is the spatial resolution of the heatmap, reflecting the probability of the gaze fixation point for a subject within the scene. We utilize the scene image $I_i$, the depth map $D_i$, the cropped face $F_i$, the binary face position mask $M_i$, and the DISM map $S_i$ to identify the gaze target. We can represent the network $f_{mm}$ mathematically as:
\begin{equation}
H_i= f_{mm}(I_i, D_i, M_i, F_i,S_i)
\end{equation}
where $S_i$ is obtained from the DISM network $f_{ds}$ in~\equationref{eq:f_ds}.

\begin{figure}[h!]
  \centering
\floatconts
  {Figure:gaze_target_architecture}
  {\caption{Our MMF module comprises three branches - face, scene, and depth. The three branches are fused in the Fusion module. The output of the module is a 2D Heatmap $H_i$ superimposed on the scene image $I_i$ here for visualization.}}
  {\includegraphics[trim=1cm 1cm 2cm 1cm,scale=0.42]{images/gaze_multimodalfusion2.jpg}}
\end{figure}

\textbf{Face Branch $f_f$} extracts facial features of dimension $1024\times7\times7$ from the cropped face image $F_i$ using the face backbone. It then average-pools the extracted features to $e_i^F$, which has dimensions $1024\times1\times1$. $e_i^F$ is then separately processed using a set of linear projections to learn the attention weights. Attention Layer 1 embeds the depth relevance by concatenating $e_i^F$ with the max-pooled and flattened DISM map $e_i^S$, which represents the salient depth information in the scene. Attention Layer 2 embeds the spatial relevance of the face by concatenating $e_i^F$ with the max-pooled and flattened binary face position mask $e_i^M$. Both attention layers ($attn_i^S$ and $attn_i^M$) are represented as a set of linear projections $f_S$ and $f_M$, respectively. These linear layers are then passed through a softmax function that applies weightage to spatial and depth-relevant cues within the image. 
 
\begin{align}
attn_i^S &= \Phi(f_S(e_i^F\oplus e_i^S))\nonumber\\
attn_i^M &= \Phi(f_M(e_i^F\oplus e_i^M))
\end{align}
where $\oplus$ denotes the concatenation operation and $\Phi$ denotes the softmax function.

\textbf{Scene Branch $f_s$} takes as input the scene image $I_i$ and the binary face position mask $M_i$. The two inputs are concatenated and passed through the scene backbone to extract the scene embedding $e_i^I$, where each embedding has a dimension of $1024\times7\times7$. $e_i^I$ is then modulated by $attn_i^S$. The dimension of $attn_i^S$ is $1\times7\times7$. The modulated scene embedding $e_i^{I*}$ has a dimension of $1024\times7\times7$ and is given by:

\begin{equation}
e_i^{I*} = e_i^I\otimes attn_i^S
\end{equation}


\textbf{Depth Branch $f_d$} takes the depth map $D_i$ and DISM map $S_i$ as inputs. The two inputs are concatenated and passed through the depth network backbone. The output depth embedding $e_i^D$ from the network has a dimension of $1024\times7\times7$. The depth embeddings are also modulated by $attn_i^M$ having a dimension of $1\times7\times7$. The modulated depth embedding $e_i^{D*}$ with the dimension of $1024\times7\times7$ is given by :

\begin{equation}
e_i^{D*} = e_i^D\otimes attn_i^M
\end{equation}
where $\otimes$ represents elementwise multiplication operation.

The modulated scene embeddings $e_i^{I*}$ and depth embeddings $e_i^{D*}$ are concatenated with face embeddings $e_i^F$ and are separately encoded using scene and depth encoders $f_e^I$ and $f_e^D$. The encodings are fused by summation and finally passed on to a decoder $f_{dec}$ for predicting the gaze target heatmap $H_i$. The MMF network $f_{mm}$ can thus alternatively be represented as:

\begin{equation}
H_i = f_{mm}(I_i, D_i, M_i, F_i,S_i) = f_{dec}\left[ f_e^I(e_i^{I*} \oplus e_i^F) + f_e^D(e_i^{D*} \oplus e_i^F)\right]
\end{equation}

The ground-truth gaze heatmap, $\hat{H_i}$, is obtained by overlaying a Gaussian weight centred around the target gaze point. The objective of the network is to minimize the Heatmap Loss $L_h$, which is computed as the Mean Squared Error (MSE) over the $N$ instances within the dataset for which the gaze target is present inside the frame.
\begin{equation}
L_h = \frac{1}{N}\sum_{i=1}^{N}\lVert H_i-\hat{H_i}\rVert_2^2
\end{equation}

%-------------------------------------------------------------------------
\subsection{Implementation Details}

We have implemented the training and inferencing pipeline of our model using the PyTorch framework. All inputs are normalized and resized to $224\times224$ pixels. DISM uses a Resnet-101~\citep{he2015deep} backbone pre-trained on ImageNet~\citep{ILSVRC15}. It has a 5-stage encoder design with 256 and 128 convolution filters in the FPN feature pyramid and segmentation blocks, respectively. All backbones in the MMF module are pre-trained similar to~\cite{connecting_gaze_scene_attention},~\cite{9157393} and~\cite{9577574}. The scene and depth backbones were pre-trained on the Places dataset~\citep{10.5555/2968826.2968881}, and the head backbone was pre-trained on the Eyediap dataset~\citep{eyediap}. The face, scene, and depth feature extractors use Resnet-50~\citep{he2015deep} backbones. The network outputs a $64\times64$ gaze heatmap. We use random crop, colour manipulation, random flip, and head bounding box jittering for data augmentation during training. We train the DISM and MMF module on GazeFollow~\citep{nips15_recasens} until convergence; then fine-tune on VideoAttentionTarget~\citep{9157393}. We also train the network from scratch on the GOO-Real~\citep{tomas2021goo} dataset. We use the Adam optimizer~\citep{adam} with a learning rate of 0.00025 and a batch size of 48.

\vspace*{-1mm}
%------------------------------------------------------------------------
\section{Experiments}
\label{sec:experiments}
We quantitatively and qualitatively evaluated our full model on the VideoAttentionTarget, GazeFollow and GOO-Real datasets. We followed the standard training/testing splits of all datasets for a fair evaluation. We demonstrate that our method surpassed the performance of prior methods across most metrics in \sectionref{sec:model_evaluation}. Moreover, we perform an ablation study in \sectionref{sec:model_ablation} to validate the effectiveness of each module within our architecture. 
\vspace*{-1mm}
\subsection{Datasets}

The \textbf{VideoAttentionTarget} dataset comprises 164,541 frame-level head bounding boxes with 109,574 in-frame gaze targets and 54,967 out-of-frame gaze annotations. 10 shows were kept aside as the test split, which comprises 31,978 gaze annotations. The \textbf{GazeFollow} dataset comprises 122,143 images and about 160,000 annotations of people's head bounding boxes and their corresponding gaze points. The \textbf{Gaze On Objects (GOO)} dataset focuses on the retail environment where several grocery items are placed on shelves to imitate a real grocery store. GOO comprises 192,000 synthetic images (\textit{GOO-Synth}) and 9,552 real images (\textit{GOO-Real}). 
\vspace*{-1mm}
\subsection{Evaluation Metrics}
We use three evaluation metrics in line with previous works such as~\cite{connecting_gaze_scene_attention,9157393, 9577574, nips15_recasens, 10.1007/978-3-030-20893-6_3, Tonini_2022} to assess our model's performance. In the GazeFollow dataset, the ground truth gaze target location is estimated by taking the average of the annotations provided by 10 different human annotators for each image and subject. \textbf{Area Under Curve (AUC):} We compare the flattened output gaze heatmap to the flattened binarized ground truth heatmap and plot the ROC curve using True Positive Rate and False Positive Rate. The AUC score is the area under this curve, with a score of $1.0$ denoting perfect agreement of the prediction with the ground truth. \textbf{L2 Distance (Dist.):} The Euclidean distance between the ground truth target location and prediction heat map maximum is measured after normalizing the image height and width to 1. For the GazeFollow dataset, we also calculate the minimum distance (Min. Dist.) between the predicted gaze point and the 10 ground truth gaze target points for each subject. \textbf{Angular error (Ang.):} This metric reports the angular difference between the predicted gaze direction and the ground truth gaze vector between face location and gaze point. We report all results from our experiments in Table~\ref{Table:videoattntarget_goo0} and Table~\ref{Table:gazefollow1}.

\subsection{Multi-Modal Saliency and Fusion Model Evaluation}
\label{sec:model_evaluation}
We compare our model to several state-of-the-art architectures~\citep{9157393,nips15_recasens,10.1007/978-3-030-20893-6_3,connecting_gaze_scene_attention,9577574,JIN2022104924,9878884,Tonini_2022} for in-frame gaze target detection. We observed that the overall performance of our model is better on VideoAttentionTarget and GOO-Real, which have higher-resolution images than GazeFollow. The improved resolution translates to better depth map representations and generation of more accurate DISM maps from the 3D projections. See~\figureref{Figure:fig_overall_example} for visualizations of depth maps, DISM maps, MMF heatmaps, and predicted gaze target points for example cases from different datasets.

\begin{figure}[h!]
\centering
\begin{tabular}{@{}
  >{\centering\arraybackslash}p{2.5cm }
  >{\centering\arraybackslash}p{2.5cm}
  >{\centering\arraybackslash}p{2.5cm}
  >{\centering\arraybackslash}p{2.5cm }
  >{\centering\arraybackslash}p{2.5cm }@{}}
\textbf{Input Image} & \textbf{Depth Map} & \textbf{DISM Map} & \textbf{MMF Heatmap} & \textbf{Target Prediction} \\

\includegraphics[width=2.9cm,height=1.79cm]{images/videoattentiontarget/00001793.jpg}
&
\includegraphics[width=2.9cm,height=1.79cm]{images/videoattentiontarget/00001793_depth.jpg}
&
\includegraphics[width=2.9cm,height=1.79cm]{images/videoattentiontarget/00001793.jpg_796_115_927_253_227_209.jpg.png}
&
\includegraphics[width=2.9cm,height=1.79cm]{images/videoattentiontarget/00001793.jpg_796_115_927_253_227_209_heatmap.jpg}
&
\includegraphics[width=2.9cm,height=1.79cm]{images/videoattentiontarget/00001793.jpg_796_115_927_253_227_209.jpg}
\\
\includegraphics[width=2.9cm,height=1.79cm]{images/videoattentiontarget/00011121.jpg}
&
\includegraphics[width=2.9cm,height=1.79cm]{images/videoattentiontarget/00011121_depth.jpg}
&
\includegraphics[width=2.9cm,height=1.79cm]{images/videoattentiontarget/00011121.jpg_497_130_647_289_1490_246.jpg.png}
&
\includegraphics[width=2.9cm,height=1.79cm]{images/videoattentiontarget/00011121.jpg_497_130_647_289_1490_246_heatmap.jpg}
&
\includegraphics[width=2.9cm,height=1.79cm]{images/videoattentiontarget/00011121.jpg_497_130_647_289_1490_246.jpg}
\\
\includegraphics[width=2.9cm,height=1.79cm]{images/gazefollow/test_0_00000139.jpg}
&
\includegraphics[width=2.9cm,height=1.79cm]{images/gazefollow/00000139_depth_gazefollow.jpg}
&
\includegraphics[width=2.9cm,height=1.79cm]{images/gazefollow/00000139.jpg_0.576364_0.422066.png}
&
\includegraphics[width=2.9cm,height=1.79cm]{images/gazefollow/00000139.jpg_347_147_415_228_0_0_heatmap.jpg}
&
\includegraphics[width=2.9cm,height=1.79cm]{images/gazefollow/00000139.jpg_347_147_415_228_0_0.jpg} 
\\
\includegraphics[width=2.9cm,height=1.79cm]{images/gazefollow/00000013.jpg}
&
\includegraphics[width=2.9cm,height=1.79cm]{images/gazefollow/00000013_depth_gazefollow.jpg}
&
\includegraphics[width=2.9cm,height=1.79cm]{images/gazefollow/00000013.jpg_0.294643_0.3025.png}
&
\includegraphics[width=2.9cm,height=1.79cm]{images/gazefollow/00000013.jpg_29_44_180_191_0_0_heatmap.jpg}
&
\includegraphics[width=2.9cm,height=1.79cm]{images/gazefollow/00000013.jpg_29_44_180_191_0_0.jpg} 
\\
\includegraphics[width=2.9cm,height=1.79cm]{images/goo/cam00000_img00370.jpg}
&
\includegraphics[width=2.9cm,height=1.79cm]{images/goo/cam00000_img00370_depth.jpg}
&
\includegraphics[width=2.9cm,height=1.79cm]{images/goo/cam00000_img00370.jpg.jpg.png}
&
\includegraphics[width=2.9cm,height=1.79cm]{images/goo/cam00000_img00370.jpg_916_281_1091_484_0_0_heatmap.jpg}
&
\includegraphics[width=2.9cm,height=1.79cm]{images/goo/cam00000_img00370.jpg_916_281_1091_484_0_0.jpg} 
\\
\includegraphics[width=2.9cm,height=1.79cm]{images/goo/cam00000_img00554.jpg}
&
\includegraphics[width=2.9cm,height=1.79cm]{images/goo/cam00000_img00554_depth.jpg}
&
\includegraphics[width=2.9cm,height=1.79cm]{images/goo/cam00000_img00554.jpg.jpg.png}
&
\includegraphics[width=2.9cm,height=1.79cm]{images/goo/cam00000_img00554.jpg_1120_775_1345_988_0_0_heatmap.jpg}
&
\includegraphics[width=2.9cm,height=1.79cm]{images/goo/cam00000_img00554.jpg_1120_775_1345_988_0_0.jpg} \\
\end{tabular}
\caption{\textbf{Visualization results}. This figure shows examples from the VideoAttentionTarget (first two rows), GazeFollow (middle two rows), and GOO-Real (last two rows) datasets. Each row shows the input image, depth map, DISM map, MMF heatmap, prediction result, and ground truth.}
\label{Figure:fig_overall_example}

\end{figure}

\subsubsection{Evaluation on VideoAttentionTarget }
Quantitative results on the VideoAttentionTarget dataset are summarized in~\tableref{Table:videoattntarget_goo0}. \textit{Random} denotes that the prediction is made with 50\% chance by sampling the values randomly from a Gaussian distribution. In \textit{Fixed bias}, the bias present in the dataset in terms of the position of the faces and the relative gaze fixation points is taken into consideration. It is to be noted that the method in~\cite{9157393} uses a spatio-temporal architecture for video-based prediction (denoted as VideoAttn). For fair comparisons, we also include the performance of its spatial-only counterpart (denoted as VideoAttn$^\dagger$).

\textbf{Ours$^\ddagger$} refers to the GazeFollow-trained model (no fine-tuning) and \textbf{Ours} refers to the GazeFollow-trained model that is fine-tuned on VideoAttentionTarget. It is interesting to note that the performance of our model surpasses the performance of~\cite{9157393} without the inclusion of any temporal features for video-based gaze target detection. Our proposed architecture also exhibits impressive generalization capabilities. The model trained purely on GazeFollow (\textbf{Ours$^\ddagger$}) outperforms all other state-of-the-art architectures that were fine-tuned on VideoAttentionTarget in the \textit{AUC} metric. Finally, the fine-tuning of our model on VideoAttentionTarget (\textbf{Ours}) results in new state-of-the-art scores for both the \textit{AUC} and \textit{Dist.} metrics. Some qualitative images are shown in~\figureref{Figure:videoattntarget_goo1}.

\begin{table}[h!]
\floatconts
  {Table:videoattntarget_goo0}
  {\caption{Evaluation on VideoAttentionTarget and GOO-Real. $^*$ indicates results taken from~\cite{tomas2021goo}. The best and second-best scores are highlighted in \textbf{\textcolor{teal}{teal}} and \textcolor{red}{red}, respectively.}}
  {\begin{tabular}{l|cc|cc}
    \hline
    \multicolumn{1}{c|}{\textbf{Method}} & \multicolumn{2}{l|}{\textbf{VideoAttentionTarget}} & \multicolumn{2}{l}{\textbf{GOO-Real}}             \\ \cline{2-5} 
    \multicolumn{1}{c|}{} & AUC$\uparrow$  & Dist.$\downarrow$  & AUC$\uparrow$  & \multicolumn{1}{l}{Dist.$\downarrow$}\\ \hline
    Random~\citep{9157393}   & 0.505      & 0.458  & -          & -       \\
    Fixed Bias~\citep{9157393} & 0.728  & 0.326 & - & -           \\
    Recasens et al.~\citep{nips15_recasens} & -   & -  & $0.850^*$ & $0.220^*$   \\
    Lian et al.~\citep{10.1007/978-3-030-20893-6_3} & - & - & $0.840^*$ & $0.321^*$\\
    Chong et al.~\citep{connecting_gaze_scene_attention} & 0.830  & 0.193 & - & - \\
    VideoAttn$^\dagger$~\citep{9157393} & 0.854  & 0.147 & -  & -     \\
    VideoAttn~\citep{9157393}  & 0.860  & 0.134 & $0.796^*$  & $0.252^*$      \\
    Tu et al.~\citep{tu2022end}& 0.893  & 0.137 & -  & -     \\
    Fang et al.~\citep{9577574}& 0.905  & \textcolor{red}{0.108} & -  & -       \\
    Jin et al.~\citep{JIN2022104924} & 0.901 & 0.116  & - & -    \\
    Bao et al.~\citep{9878884} & 0.885 & 0.120  & - & -      \\
    Tonini et al.~\citep{Tonini_2022} & 0.940 & 0.129  & \textcolor{red}{0.918}  & \textcolor{red}{0.164}   \\
    
    \textbf{Ours$^\ddagger$}& \textcolor{red}{0.958} & 0.123 & 0.876  & 0.208           \\
    \textbf{Ours} & \textbf{\textcolor{teal}{0.964}} & \textbf{\textcolor{teal}{0.100}} & \textbf{\textcolor{teal}{0.954}}  & \textbf{\textcolor{teal}{0.130}}          \\ \hline
    Human  & 0.921      & 0.051    & -  & -                     \\ 
    \hline
    \end{tabular}}
\end{table}

\begin{figure}[h!]

    \includegraphics[width=.195\linewidth,height= 0.11\linewidth]{images/videoattentiontarget/00010410.jpg_593_327_930_675_1335_403.jpg}
    \includegraphics[width=.195\linewidth,height= 0.11\linewidth]{images/videoattentiontarget/00010442.jpg_602_321_939_669_1271_353.jpg}
    \includegraphics[width=.195\linewidth,height= 0.11\linewidth]{images/videoattentiontarget/00010466.jpg_618_289_955_637_700_889.jpg}
    \includegraphics[width=.195\linewidth,height= 0.11\linewidth]{images/videoattentiontarget/00010467.jpg_616_294_953_642_422_796.jpg}
    \includegraphics[width=.195\linewidth,height= 0.11\linewidth]{images/videoattentiontarget/00010486.jpg_600_261_951_637_511_600.jpg}

    \includegraphics[width=.195\linewidth,height= 0.11\linewidth]{images/videoattentiontarget/00001307.jpg_488_133_690_364_871_902.jpg}
    \includegraphics[width=.195\linewidth,height= 0.11\linewidth]{images/videoattentiontarget/00001321.jpg_501_139_703_360_1320_209.jpg}
    \includegraphics[width=.195\linewidth,height= 0.11\linewidth]{images/videoattentiontarget/00001345.jpg_493_164_702_390_441_958.jpg}
    \includegraphics[width=.195\linewidth,height= 0.11\linewidth]{images/videoattentiontarget/00001469.jpg_445_89_732_376_956_390.jpg}
    \includegraphics[width=.195\linewidth,height= 0.11\linewidth]{images/videoattentiontarget/00001800.jpg_303_98_593_430_965_953.jpg}

    \includegraphics[width=.195\linewidth,height= 0.11\linewidth]{images/goo/cam00000_img00069.jpg_1323_581_1581_849_0_0.jpg}
    \includegraphics[width=.195\linewidth,height=0.11\linewidth]{images/goo/cam00000_img00113.jpg_1027_300_1255_577_0_0.jpg}
    \includegraphics[width=.195\linewidth,height=0.11\linewidth]{images/goo/cam00000_img00347.jpg_982_295_1217_535_0_0.jpg}
    \includegraphics[width=.195\linewidth,height= 0.11\linewidth]{images/goo/cam00000_img00547.jpg_1237_531_1409_724_0_0.jpg}
    \includegraphics[width=.195\linewidth,height= 0.11\linewidth]{images/goo/cam00001_img00370.jpg_1259_358_1461_591_0_0.jpg}

    \includegraphics[width=.195\linewidth,height= 0.11\linewidth]{images/goo/cam00001_img00135.jpg_622_140_781_326_0_0.jpg}
    \includegraphics[width=.195\linewidth,height= 0.11\linewidth]{images/goo/cam00001_img00248.jpg_750_251_938_455_0_0.jpg}
    \includegraphics[width=.195\linewidth,height= 0.11\linewidth]{images/goo/cam00001_img00450.jpg_460_191_632_337_0_0.jpg}
    \includegraphics[width=.195\linewidth,height= 0.11\linewidth]{images/goo/cam00001_img00660.jpg_941_240_1143_491_0_0.jpg}
    \includegraphics[width=.195\linewidth,height= 0.11\linewidth]{images/goo/cam00001_img00833.jpg_590_217_732_388_0_0.jpg}

  \caption{\textbf{Qualitative results}. The red and green lines denote 
  \textcolor{red}{ground truth} and \textcolor{green}{predictions} respectively. The first two rows represent the changes in gaze target points of a subject from video sequences in VideoAttentionTarget. The last two rows are images from GOO-Real for varying head poses.}
  \label{Figure:videoattntarget_goo1}
\end{figure}

\subsubsection{Evaluation on GOO-Real }
Similar to the notation scheme above, \textbf{Ours$^\ddagger$} denotes the performance of the model trained on GazeFollow and tested on GOO-Real. The performance of the model trained from scratch on GOO-Real is denoted as \textbf{Ours}. The authors of~\cite{tomas2021goo} trained and tested the architectures from~\cite{nips15_recasens},~\cite{10.1007/978-3-030-20893-6_3} and~\cite{9157393} on the GOO-Real dataset. The performance of these models is reported in~\tableref{Table:videoattntarget_goo0}. Our model trained on GazeFollow (\textbf{Ours$^\ddagger$}) was able to surpass the performance of~\cite{nips15_recasens},~\cite{10.1007/978-3-030-20893-6_3} and~\cite{9157393} when tested on GOO-Real for both the AUC and Dist. metrics even though the model was not fine-tuned for retail gaze target detection using the GOO-Real dataset. The model that was trained and tested on GOO-Real (\textbf{Ours}) sets new benchmarks for both the AUC and Dist. metrics. Some qualitative images are shown in~\figureref{Figure:videoattntarget_goo1}.

\begin{table}[h!]
\floatconts
  {Table:gazefollow1}
  {\caption{Evaluation on GazeFollow dataset}}
  {\begin{tabular}{l|cccc}
  \toprule
  \bfseries Method & \bfseries AUC$\uparrow$ & \bfseries Dist.$\downarrow$ & \bfseries Min. Dist.$\downarrow$ & \bfseries Angle$\downarrow$\\
  \midrule
  Random  & 0.504 & 0.484 & 0.391 & 69.0 \\
  Center & 0.633 & 0.313 & 0.230 & 49.0 \\
  Fixed bias & 0.674 & 0.306 & 0.219 & 48.0 \\
  Recasens et al.~\citep{nips15_recasens} & 0.878 & 0.190 & 0.113 & 24.0 \\
  Chong et al.~\citep{connecting_gaze_scene_attention} & 0.896   & 0.187 & 0.112 & -  \\
  Lian et al.~\citep{10.1007/978-3-030-20893-6_3} & 0.906 & 0.145 & 0.081 & 17.6\\
  Tu et al.~\citep{tu2022end} & 0.917 & 0.133  & 0.069 & -     \\
  VideoAttn$^\dagger$~\citep{9157393} & 0.921 & 0.137 & 0.077 & - \\
  Fang et al.~\citep{9577574}  & 0.922 & 0.124 & \textcolor{red}{0.067} & 14.9 \\
  Jin et al.~\citep{JIN2022104924}  & 0.923 & \textbf{\textcolor{teal}{0.120}}  & \textbf{\textcolor{teal}{0.064}}  & \textcolor{red}{14.8} \\
  Tonini et al.~\citep{Tonini_2022} & 0.927 & 0.141  & -  & -   \\
  Bao et al.~\citep{9878884} & \textcolor{red}{0.928} & \textcolor{red}{0.122}  & - & \textbf{\textcolor{teal}{14.6}}     \\
  \textbf{Ours} & \textbf{\textcolor{teal}{0.932}} & 0.133 & 0.073 & 19.3                           \\ \hline
  \abovestrut{2.2ex}Human & 0.924 & 0.096 & 0.040 & 11.0 \\ \hline
  \bottomrule
  \end{tabular}}
\end{table}

\subsubsection{Evaluation on GazeFollow}
The quantitative results for the GazeFollow dataset are shown in~\tableref{Table:gazefollow1}. The entry labelled \textit{Center} denotes that the metrics are calculated by considering the gaze point to always be at the center of the image. Our model achieves a new state-of-the-art in the AUC metric. Similar to~\cite{9878884}, we believe that \textit{AUC} is a better metric than \textit{Dist.} and \textit{Angle} for the GazeFollow dataset because the latter metrics are susceptible to errors introduced by averaging human annotations. For example,~\figureref{Figure:gazefollow_example} shows how averaging human annotations can lead to inconsistent estimates of the ground-truth gaze target point. Our example case highlights that averaging the human annotations causes the gaze point to drift away from the true gaze point and causes the averaged gaze point not to be consistently centred around the object that the person is looking at. We additionally include some challenging scenarios and failure cases from the GazeFollow dataset in~\figureref{Figure:gazefollow_example}. Our model generalises well and identifies the correct target gaze heatmap bin consistently, thus achieving new state-of-the-art scores on the AUC metric.


\begin{figure}[h!]
    {\centering
    \hspace{1.6cm}
    \includegraphics[width=0.25\linewidth,height=0.15\linewidth]{images/gazefollow/00001873.jpg_533_135_591_191_0_0.jpg}
    \includegraphics[width=0.25\linewidth,height=0.15\linewidth]{images/gazefollow/00003372.jpg_307_122_358_178_0_0.jpg}
    \includegraphics[width=0.25\linewidth,height=0.15\linewidth]{images/gazefollow/00002596.jpg_432_123_484_182_0_0.jpg}}
    \\
    \includegraphics[width=0.245\linewidth,height=0.15\linewidth]{images/gazefollow/00001334.jpg_354_179_431_261_0_0.jpg}
    \includegraphics[width=0.245\linewidth,height=0.15\linewidth]{images/gazefollow/00001128.jpg_57_134_161_239_0_0.jpg}
    \includegraphics[width=0.245\linewidth,height=0.15\linewidth]{images/gazefollow/00002796.jpg_510_119_549_161_0_0.jpg}
    \includegraphics[width=0.245\linewidth,height=0.15\linewidth]{images/gazefollow/00001079.jpg_362_184_440_285_0_0.jpg}
    \\
    \includegraphics[width=.245\linewidth,height=0.15\linewidth]{images/gazefollow/00003165.jpg_39_251_264_452_0_0.jpg}
    \includegraphics[width=.245\linewidth,height=0.15\linewidth]{images/gazefollow/00001337.jpg_135_118_224_193_0_0.jpg}
    \includegraphics[width=.245\linewidth,height=0.15\linewidth]{images/gazefollow/00001717.jpg_157_101_232_183_0_0.jpg}
    \includegraphics[width=.245\linewidth,height=0.15\linewidth]{images/gazefollow/00000890.jpg_441_44_481_92_0_0.jpg}


  \caption{\textbf{Qualitative results}. Red, green and blue denote \textcolor{red}{average human annotation}, \textcolor{green}{prediction}, and \textcolor{cyan}{head location}, respectively. The first row shows inconsistent human annotation, while our model predicts the same gaze target for all three subjects. The second row shows our model's performance in challenging scenarios with complex backgrounds. The last row shows some failure cases; however, the predicted gaze targets remain contextually acceptable.}
  \label{Figure:gazefollow_example}
\end{figure}

\subsection{Ablation Study}
\label{sec:model_ablation}
In order to better understand the impact of various components within our system, we conducted further analysis using the GOO-Real dataset. Firstly, we removed the attention layers $attn_i^M$ and $attn_i^S$ from the network. This is reflected as injecting $e_i^D$ instead of $e_i^{D*}$ into $f_e^D$ and $e_i^I$ instead of $e_i^{I*}$ into $f_e^I$ (Attention - None). Next, we removed the DISM map, $S_i$, and replaced it with a uniformly weighted mask (DISM - None). We then removed all inputs in relation to the depth map $D_i$. We removed the Depth branch and $attn_i^S$, which modulates the scene embedding $e_i^I$ based on the DISM map, which is inherently learned from the depth map (Depth - None). Finally, we removed the Scene branch. This means that only the output of the depth encoder $f_e^D$ is fed to the decoder $f_d$ (Scene - None). We also evaluated the multi-modal fusion module using ground-truth DISM maps (DISM pseudo-labels). The results, sorted in the order of their performance, are reported in~\tableref{Table:ablation_study}. We notice that DISM contributes significantly to the overall model performance. We also show that all components of our network are necessary to attain exceptional performance.
\vspace{-0.5cm}
\begin{table}[h!]
\floatconts
  {Table:ablation_study}
  {\caption{Ablation study on GOO-Real dataset}}
  {\begin{tabular}{l|cc}
  \toprule
  \bfseries Method & \bfseries AUC$\uparrow$ & \bfseries Dist.$\downarrow$\\
  \midrule
  DISM - None & 0.911 & 0.188  \\
    Depth - None & 0.915 & 0.186  \\
  Scene - None & 0.941 & 0.140 \\
  Attention - None  & 0.948 & 0.135 \\
  Ours - All (DISM pseudo-labels) & 0.959 & 0.128  \\
  \hline
  \abovestrut{2.2ex}\textbf{Ours - All} & \textbf{0.954} & \textbf{0.130}\\ \hline
  \bottomrule
  \end{tabular}}
\end{table}
\vspace{-0.5cm}
\section{Conclusion}
\label{sec:conclusion}
In this research, we have presented a GTD architecture comprising two key modules: Depth-Infused Saliency and Multi-Modal Fusion. The former focuses on identifying salient artefacts relevant to the subject within the scene image to generate the DISM map, and the latter leverages the generated DISM map while fusing multiple modalities to generate the gaze target heatmap. This approach has been shown to outperform similar contemporary research in terms of various state-of-the-art metrics. We presented challenging scenarios and failure cases to our model to test its generalization capabilities, and it consistently pinpointed the gaze target correctly. This research represents a significant step forward in gaze target detection, offering a robust and effective approach to understanding human gaze within complex scenes. 

\section{Future Work}
\label{sec:future}
One of our next goals is to improve our network with a dedicated component for out-of-frame gaze detection. Furthermore, we envision extending our work into the field of education, aiming to enhance student-teacher interaction and engagement. Such systems necessitate a robust face detector. This inspires us to explore transformer-based networks and self-attention mechanisms to concurrently identify faces and their associated gaze points.

\clearpage
\bibliography{pmlr-sample}

\end{document}
