\documentclass[pmlr]{jmlr}% new name PMLR (Proceedings of Machine Learning Research)

 % The following packages will be automatically loaded:
 % amsmath, amssymb, natbib, graphicx, url, algorithm2e

 %\usepackage{rotating}% for sideways figures and tables
\usepackage{longtable}% for long tables

 % The booktabs package is used by this sample document
 % (it provides \toprule, \midrule and \bottomrule).
 % Remove the next line if you don't require it.
\usepackage{booktabs}
 % The siunitx package is used by this sample document
 % to align numbers in a column by their decimal point.
 % Remove the next line if you don't require it.
\usepackage[load-configurations=version-1]{siunitx} % newer version
 %\usepackage{siunitx}

\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{tablefootnote}

 % The following command is just for this sample document:
\newcommand{\cs}[1]{\texttt{\char`\\#1}}

 % Define an unnumbered theorem just for this sample document:
\theorembodyfont{\upshape}
\theoremheaderfont{\scshape}
\theorempostheader{:}
\theoremsep{\newline}
\newtheorem*{note}{Note}

 % change the arguments, as appropriate, in the following:
\jmlrvolume{1}
\jmlryear{2023}
\jmlrworkshop{NeurIPS 2023 Gaze Meets ML Workshop}

\title[Temporal Understanding of Gaze Communication with GazeTransformer]{Temporal Understanding of Gaze Communication \titlebreak with GazeTransformer} %\titletag{\thanks{sample footnote}}

 % Use \Name{Author Name} to specify the name.

 % Spaces are used to separate forenames from the surname so that
 % the surnames can be picked up for the page header and copyright footer.
 
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % *** Make sure there's no spurious space before \nametag ***

 % Two authors with the same address
    % \author{\Name{Anonymous Submission}}

  %\author{\Name{Author Name1\nametag{\thanks{with a note}}} \Email{abc@sample.com}\and
   %\Name{Author Name2} \Email{xyz@sample.com}\\
   %\addr Address}

 % Three or more authors with the same address:
 \author{\Name{Ryan Anthony {de Belen}} \Email{r.debelen@unsw.edu.au}\\
  \Name{Gelareh Mohammadi} \Email{g.mohammadi@unsw.edu.au}\\
  \Name{Arcot Sowmya} \Email{a.sowmya@unsw.edu.au}\\
  \addr University of New South Wales, Sydney, NSW, Australia}


 % Authors with different addresses:
 % \author{\Name{Author Name1} \Email{abc@sample.com}\\
 % \addr Address 1
 % \AND
 % \Name{Author Name2} \Email{xyz@sample.com}\\
 % \addr Address 2
 %}

% \editor{Editor's name}
 % \editors{List of editors' names}

\begin{document}

\maketitle

\begin{abstract}
Gaze plays a crucial role in daily social interactions as it allows humans to communicate intentions effectively. We address the problem of temporal understanding of gaze communication in social videos in two stages. First, we develop GazeTransformer, an end-to-end module that infers atomic-level behaviours in a given frame. Second, we develop a temporal module that predicts event-level behaviours in a video using the inferred atomic-level behaviours. Compared to existing methods, GazeTransformer does not require human head and object locations as input. Instead, it identifies these locations in a parallel and end-to-end manner. In addition, it can predict the attended targets of all predicted humans and infer more atomic-level behaviours that cannot be handled simultaneously by previous approaches. We achieve promising performance on both atomic- and event-level prediction on the (M)VACATION dataset. Code will be available at \url{https://github.com/gazetransformer/gazetransformer}.
\end{abstract}
\begin{keywords}
Gaze estimation and prediction, gaze communication behaviour prediction
\end{keywords}



\section{Introduction}
Apart from enriching verbal utterances, non-verbal communication in itself plays an important role in conveying important information \citep{mehrabian2017nonverbal}. In addition, the ability to learn and understand gaze communication plays a crucial role in the development of social cognition, information processing and language \citep{brooks2015connecting, adamson2004development, de2020computer, de2023eye, de2021eyexplain, deBelenVA}. In the first years of life, infants learn to follow gaze and coordinate their attention with that of their primary caregivers \citep{brooks2015connecting}. For example, an infant may respond by following the gaze point of the parent looking at a target object. Difficulties in understanding gaze communication may result in various socio-communicative impairments during development, as the former makes it challenging to associate a word with an object \citep{mundy2018review}. This motivates researchers to systematically study gaze and responses to this primitive form of communication.

Although not the primary focus of this paper, it is worth mentioning that the mature saliency estimation domain \citep{borji2019saliency, de2022scanpathnet} bears similarity to gaze behaviour prediction. With distinct differences, both aim to computationally model the mechanisms that underlie human visual attention. Instead of inferring a person's attended target in an image, saliency estimation aims to identify the pixel locations that can attract the attention of humans while viewing images. In fact, a salient region in an image will most likely attract the attention of a person in the image. This is why most previous works on human gaze behaviour prediction contain a scene branch used for saliency estimation.

Earlier works have demonstrated the ability of neural networks to estimate the attended visual target of a person in an image \citep{recasens2015they}. The most common architecture includes two branches: a head branch that learns head pose features and a scene branch that learns salient regions in the image. Succeeding works have extended the problem to handling out-of-frame gaze targets, resulting in better performance \citep{chong2020detecting, chong2018connecting}. Furthermore, recent works have explored the problem of estimating attended visual targets in 360\textdegree{} images \citep{li2021looking}, video \citep{recasens2017following, fang2021dual} and even 3D space \citep{masse2017tracking, wei2018and, brau2018multiple, masse2019extended, hu2022we}. These approaches are also useful for understanding human gaze communication behaviours in social video.

Over the past decades, different frameworks have been proposed in psychology and neuroscience to study and understand gaze communication \citep{itier2009neural, jording2018social}. A recent framework involves breaking down gaze behaviours into their atomic- and event-level components \citep{fan2019understanding}. Atomic-level components include the following fine-grained gaze patterns: (1) \textit{Single} is the simplest and does not involve any social communication/interaction behaviour. (2) \textit{Mutual} occurs when two people look at each other. (3) \textit{Avert} happens when a person looks away from another's gaze. (4) \textit{Refer} occurs when a person tries to direct the attention of another person to an object. (5) \textit{Follow} happens when a person responds to another person's attempt to direct attention to an object. (6) \textit{Share} occurs when two people are looking at the same object. On the other hand, gaze communication events are coarse-grained and include the following: (1) \textit{Non-communicative} (2) \textit{Mutual Gaze} (3) \textit{Gaze Aversion} (4) \textit{Gaze Following} and (5) \textit{Joint Attention}. These events can be formed by temporally combining atomic-level components. In this work, we adopt a slightly different framework and provide justifications for this minor change in Section \ref{sec:Vacation+}.

Previous works have explored the capability of deep learning networks to predict gaze communication of people in images or videos \citep{fan2019understanding, guo2022mgtr, marin2019laeo, marin21pami}. Most approaches require human and object bounding boxes or use a decoupled head detector \citep{marin2019laeo, fan2019understanding, marin21pami}, resulting in sub-optimal solutions. A better solution is to use an end-to-end module that jointly learns to predict head and object bounding boxes and their corresponding gaze relationships. Recently, an end-to-end model has been proposed to estimate the location of the attended target \citep{tu2022end}, while another end-to-end model can only detect \textit{Mutual} gaze relationships \citep{guo2022mgtr}, limiting future applications for handling more complex gaze communication. In contrast, our proposed atomic-level module can predict \textit{Single}, \textit{Mutual} and \textit{Share} gaze relationships, while our temporal module can predict event-level gaze communication.

Our framework to predict human gaze communication behaviours consists of two stages:
\begin{enumerate}
    \item We develop GazeTransformer, an end-to-end module that identifies atomic-level gaze communication. The module consists of an image feature extractor backbone, a transformer encoder-decoder network and several multi-layer perceptrons that predict human head and object locations and their corresponding gaze relationships (i.e., human-target interaction (HTI) instances), in parallel. These HTI instances are elements of an adjacency matrix that we use for inferring atomic-level gaze communication. GazeTransformer achieves promising results on the (M)VACATION dataset.
    \item We develop a temporal module for predicting event-level gaze communication behaviours. The model architecture consists of a long short-term memory network and a fully connected layer for classification. It first processes all video frames and determines unique atomic labels within the video. As a result, it does not miss crucial frames which will otherwise be excluded if a widely-used uniform sub-sampling approach \citep{carreira2017quo} is adopted. This module achieves promising results on the (M)VACATION dataset.
\end{enumerate}

This paper makes the following contributions:
\begin{enumerate}
    \item We modify the VACATION dataset and include atomic-level components that are minimally sufficient to build event-level gaze events. The reasons for this change are discussed in Section \ref{sec:Vacation+}. These modifications are justified and result in a simpler, more efficient and effective training and evaluation paradigm. Furthermore, the new atomic-level labels provide a more practical categorisation since they do not require temporal information for inference. This is ideal because atomic-level labels are defined for each frame instead of being reliant on previous or succeeding frames.
    \item We present GazeTransformer, a novel end-to-end model that predicts human head and object locations in parallel. In addition, it predicts the attended targets of all the predicted humans in the scene. Furthermore, we propose a novel way to infer atomic-level labels (e.g., \textit{Single}, \textit{Mutual}, \textit{Share}) from adjacency matrices. Currently, this ability cannot be handled simultaneously by existing end-to-end models.    
    \item  We present a temporal model for predicting event-level gaze communication behaviours from the atomic-level gaze communication already detected.
\end{enumerate}




\section{Related Work}
The ability to use and understand gaze allows humans to communicate and share intentions effectively. In addition, it provides a means for evaluating another person's interest in the environment \citep{mehrabian2017nonverbal}. The white part of the eye, called the sclera, is more prominent in humans than in other mammals, allowing humans to leverage the colour difference between the sclera and the darker-coloured iris when directing their attention to a potential target for conveying intention \citep{kobayashi1997unique}. When the pupil information cannot be reliably used for communication, humans resort to using head orientation as another way to convey and infer intentions. Finally, if the eyes and head are occluded, body orientation provides a sufficient cue for communication. As a result, the ability to estimate gaze and head direction is crucial for humans in determining gaze communication in an image or video. Consider an image where two people are looking at an object (i.e., sharing attention). A person looking at this image needs to have the ability to determine the attended targets of both persons and understand that the targets are the same object. It is therefore appropriate to discuss related work under the following headings: gaze target prediction and gaze communication behaviour understanding.

% In fact, \textit{Follow} and \textit{Refer} labels are hard to define without looking at the preceding and succeeding frames, suggesting that they may not be suitable atomic gaze behaviours. 
\textbf{Gaze target prediction} Previous works focus on detecting gaze targets in an image \citep{recasens2015they, chong2018connecting, chong2020detecting, tu2022end, guan2020enhanced, lian2018believe, zhao2020learning, bao2022escnet, hu2022gaze, gupta2022modular}, 360\textdegree{} image \citep{li2021looking}, video \citep{recasens2017following, fang2021dual}, or 3D space \citep{masse2017tracking, wei2018and, brau2018multiple, masse2019extended, hu2022we}. Since our work focusses on images and videos, we review related works on these two modalities in more detail.

The earliest deep learning model follows a two-branch network approach that involves a head pathway and a scene pathway to infer a heatmap of the attended target of a person in an image \citep{recasens2015they}. An extension to this work involved considering body pose \citep{guan2020enhanced} and learning a modulation constant to identify out-of-frame targets in images, as well as in videos \citep{chong2018connecting, chong2020detecting}, resulting in improved prediction performance. Another two-stage method that aims to draw sight lines and determine the attended target by stopping at a position with high sight line strength has been proposed \citep{zhao2020learning}. A three-stage method that uses depth and 3D gaze estimation was proposed to exclude predictions that are at improper depth \citep{fang2021dual}. Another similar framework infers 3D geometry from a 2D image and parses the scene to infer the target gaze position in 2D \citep{bao2022escnet}. Another three-stage method uses a head branch, scene branch and a relational branch to identify the attended target \citep{chen2021gaze, hu2022gaze}. A modular multimodal model leverages depth and pose estimation and can be used in privacy-sensitive settings \citep{gupta2022modular}. A recent work presents a unified framework for jointly solving gaze estimation, gaze target prediction and gaze target detection \citep{wang2022gatector}. As can be observed from the above discussion, most prior approaches require ground truth locations of the human heads for accurate prediction of the attended targets, limiting their adoption to practical applications. A recent work addresses this issue using an end-to-end model that can simultaneously predict head bounding boxes and their corresponding gaze targets \citep{tu2022end}, while another determines \textit{Mutual} gaze relationships \citep{guo2022mgtr}. 
In contrast, we propose an end-to-end solution for simultaneously predicting the attended targets of each detected person in the scene, as well as inferring the corresponding \textit{Single}, \textit{Mutual} and \textit{Share} gaze relationships in this work.

\textbf{Gaze communication behaviour understanding} Previous works detect if two persons are looking at each other \citep{marin2011here, marin2014detecting, marin2019laeo, marin21pami, palmero2018automatic, doosti2021boosting, guo2022mgtr}, determine if two or more persons are sharing attention \citep{sumer2020attention}, predict the common gaze target of a group of persons \citep{fan2018inferring, zhuang2019muggle},  or recognise atomic-level (e.g., \textit{Single}, \textit{Mutual}, \textit{Share}) and event-level gaze communication behaviours (e.g., \textit{Gaze Aversion}, \textit{Joint Attention}) \citep{fan2019understanding}. Similar to the prior works on gaze target prediction, most approaches in this domain require ground truth locations of the humans in the scene.

In this work, we present GazeTransformer, an end-to-end module for atomic-level prediction. Compared to a previous model \citep{fan2019understanding} that requires human and object locations, GazeTransformer automatically predicts these locations, the attended targets, and their corresponding gaze relationships in parallel. Unlike previous end-to-end models that can only identify attended targets \citep{tu2022end} and handle \textit{Mutual} gaze \citep{guo2022mgtr}, GazeTransformer can simultaneously infer \textit{Single}, \textit{Mutual} and \textit{Share} gaze relationships. A temporal module is also built on top of GazeTransformer for event-level classification. Our experimental results show that our atomic- and event-level modules achieve promising performance on the (M)VACATION dataset, which is a modified VACATION dataset described in Section \ref{sec:Vacation+}.


\section{(M)VACATION dataset}\label{sec:Vacation+}
The Video gAze CommunicATION (VACATION) dataset \citep{fan2019understanding} is a large-scale video dataset that aims to tackle the problem of understanding human gaze communication in social videos at both the atomic and event levels. It contains 300 videos of diverse social scenes with complete annotations of the bounding box locations of objects and human faces, human attention, and both atomic- and event-level gaze communication labels.

In the VACATION dataset, atomic-level labels were categorised into six classes: \textit{Single, Mutual, Avert, Refer, Follow} and \textit{Share}. On the other hand, event-level labels were composed of \textit{Non-communicative, Mutual Gaze, Gaze Aversion, Gaze Following} and \textit{Joint Attention}. While atomic-level labels are provided for each person in each frame, event-level labels are the same for an entire video/segment. Note that there is an imbalance in the number of instances for \textit{Avert}, \textit{Refer} and \textit{Follow} atomic-level labels in the original VACATION dataset, as shown in Table \ref{table: dataset_stats}.

As can be observed, there are more \textit{Single} (i.e., no gaze interaction between persons in the scene), \textit{Mutual} (i.e., two persons are looking at each other) and \textit{Share} (i.e., two or more persons are looking at the same object) atomic-level gaze communication behaviours. On the other hand, there are substantially fewer \textit{Avert} (i.e., one person looks away after another person gazes), \textit{Refer} (i.e., one person tries to refer another person to another object by a mutual gaze followed by a look at an object) and \textit{Follow} (i.e., a person looks at where another person is looking) behaviours in the VACATION dataset.

\begin{table}[tbp]
% \resizebox{\columnwidth}{!}{%
\centering
\begin{tabular}{@{}lllllll@{}}
\toprule
\multicolumn{1}{c}{} & \multicolumn{6}{c}{Number of instances} \\     
          & \textit{Single} & \textit{Mutual} & \textit{Avert}  & \textit{Refer} & \textit{Follow} & \textit{Share}  \\ \midrule
VACATION  & 80,370 & 37,441 & 9,333 & 2,549 & 3,429  & 28,821 \\
(M)VACATION & 87,998 & 40,497  & -       &   -    &    -    & 33,452 \\ \bottomrule
\end{tabular}%
% }
\caption{Comparison of atomic-level gaze communication labels between VACATION and (M)VACATION datasets.}
\label{table: dataset_stats}
\end{table}


While the VACATION dataset undeniably provides a useful baseline to develop computational models for gaze communication behaviour understanding, we believe that it requires minor changes for an easier, more efficient and effective training and evaluation paradigm. Therefore, we introduce a modified VACATION dataset,  named (M)VACATION. The differences between the original and the modified datasets and the reasons for the modifications are outlined below: 

\begin{enumerate}
    \item the number of atomic-level gaze communication labels has been reduced to three: \textit{Single}, \textit{Mutual} and \textit{Share}. We believe that these three fine-grained components are sufficient to build more complex and coarse-grained event-level gaze communication. In fact, the removed atomic-level labels (i.e., \textit{Avert}, \textit{Refer} and \textit{Follow}) require temporal information, defeating their definition as atomic components. To illustrate, consider a \textit{Joint Attention} scenario in Figure \ref{fig:VACATION_sample} where \textit{Person1} shares attention to an object with \textit{Person2}. In row 1, the ground truth atomic-level label for \textit{Person1} is \textit{Follow} while it is \textit{Single} then \textit{Refer} for \textit{Person2} across several frames. However, it is difficult even for a human to determine these defined ground-truth atomic-level labels for each frame without looking at the surrounding frames. A similar issue can be found in row 2 of Figure \ref{fig:VACATION_sample} in which \textit{Person1} has \textit{Share} then \textit{Follow} labels while \textit{Person2} has \textit{Share} then \textit{Single} labels. Since the prediction of atomic-level labels is performed for each frame, \textit{Single}, \textit{Mutual} and \textit{Share} provide a more practical categorisation of atomic-level labels in this case.

\begin{figure}[!tbp]
  \centering
   \includegraphics[width=0.8\linewidth]{images/VACATION_sample5.jpg}

   \caption{Frames with ground truth human head/object locations with their corresponding atomic-level labels from the VACATION dataset. As shown by the directed arrows in each frame, all persons are looking at the same object. However, the ground truth may either be: \textit{Follow}, \textit{Single}, \textit{Refer}, or \textit{Share}. Since atomic-level labels are defined per frame, we believe that the mentioned labels should be \textit{Share}.} %%Hence, we re-define the atomic-level labels to only include \textit{Single}, \textit{Mutual} and \textit{Share}.}
   \label{fig:VACATION_sample}
\end{figure}

    \item the atomic-level ground truth has been modified to ensure labelling consistency. This was easily automated by constructing adjacency matrices (defined and discussed more thoroughly in Section \ref{subsection:inferring_atomic}) that denote directed gaze in a scene. Afterwards, the atomic-level labels are inferred from the adjacency matrices. As shown in Figure \ref{fig:VACATION_sample}, while consecutive frames show that the two persons \textit{Share} attention, the ground truth labels are different, making the problem unnecessarily complex. The underlying adjacency matrices, as will be discussed in Section \ref{subsection:inferring_atomic}, are the same:
    \begin{center}
        $A_1 = A_2 = A_3 = A_4 = \begin{bmatrix}
0 & 0 & 0\\
1 & 0 & 0 \\
1 & 0 & 0
\end{bmatrix}$
    \end{center}
    
    \item after the proposed modifications, the (M)VACATION dataset has a more balanced class distribution compared to the original VACATION dataset, as shown in Table \ref{table: dataset_stats}. In addition to the advantages described above, training a deep learning model on a more balanced dataset results in better performance, especially at times when it is difficult to obtain representative examples in each class.
\end{enumerate}


\section{Methodology}

\begin{figure*}[!tbp]
  \centering
   \includegraphics[width=\linewidth, height=0.28\paperheight]{images/Overview_Complete.png}

   \caption{Overview of the proposed pipeline that consists of a two-step approach: the atomic-level module predicts the atomic-level behaviours of all persons in a given frame in parallel and end-to-end. Afterwards, the event-level module infers the event-level behaviour using the unique atomic-level prediction in a given video.}
   \label{fig:overview}
\end{figure*}

%Gaze target detection is a crucial ability in solving the problem of  human gaze communication behaviour prediction. Without this ability, humans will not be able to determine if there is an intention to communicate in a given scenario. For example, humans need to be able to determine the attended target of each person in order to determine whether two people in an image are looking at each other. Similarly, this ability is needed to identify if two people are sharing attention. We refer to human-object or human-human gaze interactions as human-target interactions (HTI) instances.

As shown in Figure \ref{fig:overview}, an event, such as \textit{Joint Attention}, consists of a series of temporally changing human-human or human-object interactions, which we call human-target interaction (HTI) instances. Clearly, it is important to recognise these atomic components first before attempting to understand more complex gaze communication. Therefore, we develop a two-step approach for the temporal understanding of gaze communication, as illustrated in Figure \ref{fig:overview}:
\begin{enumerate}
    \item \textbf{Atomic-level prediction:} we develop GazeTransformer that predicts human/object locations, gaze targets and all HTI instances in a parallel and end-to-end manner. These HTI instances are elements of an adjacency matrix that we then use to infer the atomic labels of each person in the scene.
    \item \textbf{Event-level prediction:} we develop a temporal module on top of GazeTransformer to predict the event labels using the unique atomic predictions in a video.
\end{enumerate}
These two modules are trained separately to address the atomic- and event-level predictions. Afterwards, the output of the atomic-level module is passed to the event-level module. While our approach looks similar to \citet{fan2019understanding}, we neither follow a neural message passing framework of graph neural networks nor require human head/object locations to generate the atomic labels. For event-level prediction, our approach uses the predicted adjacency matrix, while \citet{fan2019understanding} uses atomic-level transition and frequency counts as input.


%The adjacency matrix for analyzing atomic-level gaze behaviour is already defined in [14] While both our work and [14] follow a two-stage approach and represent the scene as a graph using an adjacency matrix, there are significant differences between the two. First, [14] follows a neural message passing framework using graph neural networks to generate the adjacency matrices of known human/object locations, while our approach neither use graph neural networks nor require human/object locations. Second, [14] learns an additional readout function that is part of the graph neural network to generate the atomic labels, while our approach uses a simple yet effective approach for atomic-level classification. Setting aside that [14] uses a graph neural network, we believe that the additional readout function is not necessary as our proposed approach of inferring the atomic labels from adjacency matrix (see Section 4.2) is sufficient and effective for this problem \textbf{\textcolor{red}{[R1]}} \textbf{\textcolor{blue}{[R3]}}. Finally, the event-level prediction is completely different since [14] uses atomic-level transition and frequency counts as an input, while our work uses the predicted adjacency matrix as an input. Our work and [14] have clear differences - the only similarity is the use of adjacency matrices to represent the scene. We updated the manuscript to highlight these differences.


%to ensure that each component are solving the problem independently and not overfitting.
% module that predicts the attended visual target of a person. Afterwards, another module takes in the predicted visual target of attention and a mask that queries whether the attention is allocated in this area. 

\subsection{Problem Formulation} \label{sec: problem_formulation}
Similar to the problem of learning human-object interactions \citep{qi2018learning}, gaze communication behaviour prediction can be solved by analysing complete scene graphs (i.e., social graphs) and generating a sub-graph that entails the true gaze communication behaviours of persons in the scene. 

A complete social graph is represented as $\mathcal{G} = (\mathcal{V},\mathcal{E})$. Nodes $v \in \mathcal{V}$ take unique values from $\{1,\dots,|\mathcal{V}|\}$ and represent distinct entities (e.g., human, object) in the scene. Edges $e \in \mathcal{E}$ are two-tuples $e = (v,w) \in \mathcal{V} \times \mathcal{V}$ and represent directed edges $v \rightarrow w$ that show all the possible human-human gaze interactions or human-object gaze interactions (i.e., HTI instances). The sub-graph $g = (\mathcal{V}_{g}, \mathcal{E}_{g})$, where $\mathcal{V}_{g} \subseteq \mathcal{V}$ and $\mathcal{E}_{g} \subseteq \mathcal{E}$, denotes the true HTI instances in the scene. This $g$ is represented as an adjacency matrix $A \in \{0,1\}^{|\mathcal{V}|\times|\mathcal{V}|}$. While this scene representation is similar to \citet{fan2019understanding}, there are significant differences. Unlike \citet{fan2019understanding}, we consider the problem of finding the sub-graph $g$ as a set prediction problem. More specifically, the off-diagonals of $A$ are the HTI instances that the GazeTransformer predicts in a parallel and end-to-end way. Furthermore, we set the diagonals of $A$ to zero since we assume that (human) nodes do not look at themselves. In addition, we neither add a dummy node to represent the social scene nor set the maximum number of nodes for atomic classification. We also do not learn an additional readout function that is part of a graph neural network to generate the atomic labels. Instead, we use a simple, effective and practical way to infer atomic labels from an adjacency matrix (see Section \ref{subsection:inferring_atomic}).

%We define the problem of identifying atomic- and event-level human gaze communication behaviours as a two-step process: (1) predicting the entries of per-frame adjacency matrices that will help infer the atomic-level human gaze communication behaviours and (2) predicting the event-level gaze behaviours in a video using the unique atomic-level segments inferred by the previous step.

\begin{figure*}[t]
  \centering
   \includegraphics[width=\linewidth, height=0.22\paperheight]{images/Atomic-level.png}

   \caption{The proposed architecture of the atomic-level prediction module. It consists of three components: (1) an image feature extractor backbone (2) a transformer encoder-decoder network and (3) several multi-layer perceptrons to predict the HTI instances of a frame, in parallel. These HTI instances are elements of an adjacency matrix that we use to infer the atomic-level gaze communication behaviours. This resembles a set prediction problem.}
   \label{fig:gaze target}
\end{figure*}

\subsection{Atomic-level prediction module} \label{subsection:inferring_atomic}
The atomic-level prediction module, GazeTransformer, consists of three components: (1) an image feature extractor backbone (2) a transformer encoder-decoder network and (3) several multi-layer perceptrons (MLPs). Unlike a previous method \citep{fan2019understanding}, we solve the problem of atomic-level prediction in an end-to-end manner. Unlike a previous end-to-end model \citep{guo2022mgtr} that can only handle \textit{Mutual} gaze, GazeTransformer can infer \textit{Single}, \textit{Mutual} and \textit{Share} gaze. As shown in Figure \ref{fig:gaze target}, it takes in an image frame and outputs all HTI instances in parallel (similar to a set prediction problem), effectively generating the elements of the adjacency matrix $A$ of sub-graph $g$. The atomic labels are then inferred from the generated adjacency matrix using our proposed novel approach.

\textbf{Image Feature Extractor Backbone} This component consists of an arbitrary deep neural network that extracts visual features from an input frame. The input to this module is a colour image $x \in \mathbb{R}^{3\times W \times H}$, and the output is a feature map $f \in \mathbb{R}^{C\times W_{f} \times H_{f}}$. This feature map is reduced to a lower dimension using a $1\times 1$ convolution operator with $R$ channels. Since the encoder of the transformer network expects a sequence of features, the spatial dimensions of the feature map are collapsed into a single dimension using a flatten operator, resulting in a final feature map $z \in \mathbb{R}^{R \times W_{f}H_{f}}$.
%Similar to previous end-to-end models \citep{carion2020end, zou2021end, guo2022mgtr, tu2022end}, 
We compared different backbones (ResNet50~\citep{he2016deep} and ResNet101~\citep{he2016deep}) in our experiments.

\textbf{Transformer encoder-decoder network} This follows a standard transformer architecture \citep{vaswani2017attention} that consists of a multi-head self-attention and feed-forward networks for the encoder and an additional multi-head cross-attention layer for the decoder.

\textbf{Multi-layer perceptron} As shown in the lower right portion of Figure \ref{fig:gaze target}, an HTI instance is a tuple containing the human class, interaction class, target class and human and target bounding boxes (x, y, width and height). The human, target and interaction classes consist of binary labels (human/not human, object/not object, and looking/not looking, respectively). The HTI instances are decoded from the output embedding of each HTI query using several MLPs in parallel. We use separate one-layer MLPs with a final softmax layer for each confidence for the human class, target class and interaction class, while separate three-layer MLPs are used for each human and target bounding box.

\textbf{Inferring atomic-level gaze communication} We present a novel and effective way to infer atomic-level labels of each person in the scene by exploiting the interesting properties of adjacency matrices. Given an adjacency matrix $A$, each entry $A_{v_i,v_j} = 1$ corresponds to a directed edge from node $v_{i}$ to node $v_{j}$. Hence, two persons ($v_i,v_j$) have \textit{Mutual} gaze behaviours if $A_{v_i,v_j}$ and $A_{v_j,v_i}$ are both equal to 1. To determine if a person $v_i$ has \textit{Shared} attention to a human/object $v_j$ with another person, check if $A_{v_i,v_j}$ is equal to 1 and identify whether the column $v_j$ has more than one entry with a value of 1. If neither of these cases is met, the person only has an atomic-level label of \textit{Single}. To illustrate this process, we analyse the frames with adjacency matrices ($A_{t}$, $A_{t+1}$, $A_{t+2}$) in Figure~\ref{fig:overview}. $A_{t}$ has three nodes $v \in \mathcal{V}$, where $v$ can be \textit{Person1}, \textit{Person2} or \textit{Object1}. Looking at $A_{t}$, $A_{\mathit{Person1}, \mathit{Person2}}$ and $A_{\mathit{Person2}, \mathit{Person1}}$ are both equal to 1, hence the inferred atomic-level gaze communication for both \textit{Person1} and \textit{Person2} is \textit{Mutual}. Looking at $A_{t+1}$, $A_{\mathit{Person1}, \mathit{Person2}}$ and $A_{\mathit{Person2}, \mathit{Object1}}$ are both equal to 1, hence both persons have an atomic-level label of \textit{Single}. Finally, looking at $A_{t+2}$, $A_{\mathit{Person1}, \mathit{Object1}}$ and $A_{\mathit{Person2}, \mathit{Object1}}$ are both 1 (i.e., column $v_{\mathit{Object1}}$ has two entries with a value of 1), hence both persons have \textit{Shared} attention labels.

%Given a person of interest (Person1), its possible allocated attention to the other objects corresponds to a row $row_{attention}$ in the adjacency matrix. The index of $row_{attention}$ that has a value of 1 means that the attention of P1 is allocated to that object.


\subsection{Event-level prediction module}

Our event-level prediction module, as shown in Figure~\ref{fig:event-level}, consists of a long short-term memory (LSTM) architecture that takes in a list of unique adjacency matrices predicted by the atomic-level prediction module. Similar to a concept introduced elsewhere \citep{liu2021no}, the proposed network processes the entire video and only uses key frames for prediction. The key frames are defined as the frames where an entry (or entries) of the predicted adjacency matrix has (have) changed. We believe that this simple yet effective approach resembles the behaviour of humans when determining gaze communication events in a video (i.e., humans watching a video keep track of the unique atomic-level labels that happened throughout the video before predicting an event label). In addition, the transition from one atomic-level label to another provides important information about the event-level label that has transpired. Since atomic-level labels can be inferred from the adjacency matrix $A$, keeping track of the unique $A$s across time provides crucial information about the event. Once the key frames have been identified, their corresponding adjacency matrices are flattened and used as input to the LSTM network. Afterwards, the learned features are passed to a fully connected layer for event-level classification.


\begin{figure}
  \centering
   \includegraphics[width=0.75\linewidth]{images/Event-level.png}

   \caption{The proposed architecture of the event-level prediction module. It consists of a long short-term memory (LSTM) network and a fully-connected layer for event-level classification. Displayed is a sample input that contains a list of unique adjacency matrices predicted by the atomic-level prediction module and an output that is the final classification of \textit{Joint Attention}.}
   \label{fig:event-level}
\end{figure}


\section{Experiments}
\subsection{Dataset}
We used the (M)VACATION dataset to train and evaluate our atomic- and event-level prediction modules. Since we predict the atomic-level labels in a parallel and end-to-end manner, the only input to the atomic-level module is an image frame, while the human head and object bounding box labels are used as ground truth to train the model. To prevent overfitting of the atomic-level prediction module, we sample each video every 10 frames and use the sampled frames as inputs to the module. Using the atomic-level ground truth, we generate the adjacency matrices for the entire duration of each event-level label and extract the unique adjacency matrices to train our event-level module.

\subsection{Implementation Details} Our proposed modules were implemented in PyTorch. The input to the atomic-level module is a normalised RGB image, while the output is a set of HTI instances. To train our model, we used the Hungarian algorithm \citep{kuhn1955hungarian} to solve the matching of the predicted HTI instances with the ground truth.  After the optimal matching was found, we used the loss function:
\begin{equation}
    \mathcal{L}_{loss} = \beta_1\sum_{c \in \{h, t, g\}} \mathcal{L}_{class}^c + \beta_2\sum_{b \in \{h, t\}} \mathcal{L}_{bbox}^b
\end{equation}
where $\mathcal{L}_{class}^c$ is the standard cross-entropy loss between the predicted human ($h$), target ($t$) and gaze interaction ($g$) classes and their corresponding ground truth labels, and $\beta_1$ and $\beta_2$ are weighting coefficients. On the other hand, $\mathcal{L}_{bbox}^b$ is the weighted sum of the GIoU loss and the L1 loss and is computed for each human and target bounding box.

We used different feature extractor backbones (ResNet50 and ResNet101) pre-trained on the ImageNet database and froze their batch normalisation layers. We compared DETR \citep{carion2020end} and DeformableDETR \citep{zhu2020deformable} models pre-trained on COCO for our transformer encoder-decoder and MLP networks. Note that we only used the ResNet50 version of DeformableDETR since the ResNet101 version pre-trained on COCO is not publicly available. We used the default number of encoder and decoder layers, as well as the number of object (HTI in our case) queries, chosen by the original DETR and DeformableDETR models. We used the AdamW optimizer with the following parameters: the learning rate is 1e-5 for the backbone and 1e-4 for the transformer network; weight decay is set to 1e-4 and applied after 200 epochs. Similar to other DETR-like architectures, our models are trained with a long training schedule (250 epochs). %Training our network takes 40 hours on a single NVIDIA A6000 GPU on the (M)VACATION dataset.

The event-level prediction module was trained on the ground truth adjacency matrices of the training split of the (M)VACATION dataset. We compute the cross entropy loss between the predicted event-level gaze communication and the ground truth to train the model. For testing, the input of the event-level prediction module is a sequence of unique adjacency matrices predicted by the atomic-level prediction module. The length of the adjacency matrix sequence is set to a value of 5. We set the limit of the maximum number of nodes to 7 based on the (M)VACATION statistics. Smaller adjacency matrices are appended with zeros and shorter sequences are appended with the last unique matrix. %As a result, our model can handle up to seven humans and objects in the scene when predicting event-level human gaze communication behaviours.

\subsection{State-of-the-art models} \label{subsection:SOTA}
A spatiotemporal graph neural network has been proposed to represent the atomic-level gaze communication behaviours in a given frame \citep{fan2019understanding}. In addition, an event network was developed to predict the event-level gaze communication behaviours \citep{fan2019understanding}. While their approach is fundamentally similar, a fair comparison is not possible because of the following reasons: (1) their models were trained and evaluated using the VACATION dataset and (2) their models cannot be re-trained and re-evaluated on the (M)VACATION dataset due to the nature of their open-source implementation (i.e., their source code contains references to files that were not released). Re-implementing their network is out of the scope of this work. Instead, the proposed model was compared to a baseline that predicts the gaze target of a person in a scene \citep{chong2020detecting}. In particular, a pre-trained model that accepts a human head bounding box, as well as the entire image, was used to predict the probability distribution of the gaze target. Afterwards, the location with the highest probability value was utilised to determine if any human head/object falls within this location and construct an adjacency matrix that can then be used to infer atomic-level labels. This approach is fundamentally similar to the behaviour of the proposed network (i.e., GazeTransformer also predicts the attended target), with the exception that the proposed model performs automatic prediction of human head/object locations instead of using ground truth human head/object locations.

\subsection{Evaluation Metrics}
We use precision ($\mathcal{P}$), F1-score ($\mathcal{F}$) and top-1 average accuracy \citep{fan2019understanding} to evaluate both our atomic- and event-level prediction modules. We also report the $\mathcal{P}$, $\mathcal{F}$ and recall ($\mathcal{R}$) values to demonstrate GazeTransformer's performance in detecting human/object locations. A prediction is considered a true positive if and only if the model predicts a box location that has an intersection-over-union (IoU) greater than 0.5 with the ground truth. To make the number of the atomic- and event-level predictions and ground truth the same and allow for a meaningful comparison, a node without gaze interaction with other nodes is added to the predicted adjacency matrix whenever a ground-truth node is missed by the model.

\section{Results} \label{sec:Results}
We discuss both the quantitative and qualitative results on atomic-level prediction in Section \ref{subsection:atomic_level} followed by the event-level prediction in Section \ref{subsection:event_level}.

%are no prior works that detect \textit{Single}, \textit{Mutual} and \textit{Share} atomic labels.
% We compare our results with the published results of the current state-of-the-art \citep{fan2019understanding} on VACATION since we do not have access to their pre-trained weights or documented training code \citep{fan2019understanding}.

\subsection{Atomic-level prediction module} \label{subsection:atomic_level}
\textbf{Quantitative results} GazeTransformer achieved the following human/object localisation performance: ResNet50: $\mathcal{P}$=92.29\%, $\mathcal{F}$=83.58\%, $\mathcal{R}$=76.38\%, while ResNet101: $\mathcal{P}$=90.23\%, $\mathcal{F}$=83.65\%, $\mathcal{R}$=77.97\%, suggesting that the models produced few false positives but a moderate number of false negatives. This means that the models rarely generated bounding boxes that do not contain a human head or object. However, there were times when the models failed to generate bounding boxes for humans or objects that were present. In the next section, the reasons that contributed to this performance are explored, specifically the missed predictions on small objects.

\begin{table*}[t]

\centering
\resizebox{\textwidth}{!}{%
\begin{tabular}{@{}lccccccc@{}}
\toprule
\multicolumn{1}{c}{} & \multicolumn{7}{c}{Atomic-level Gaze Communication (Precision $\mathcal{P}$,  F1-score $\mathcal{F}$ \& Average Accuracy)}                                              \\ \midrule
 &
  \multicolumn{2}{c}{Single} &
  \multicolumn{2}{c}{Mutual} &
  \multicolumn{2}{c}{Share} &
  \multicolumn{1}{c}{Average Accuracy} \\
                     & $\mathcal{P} (\%) \uparrow$ & $\mathcal{F} (\%) \uparrow$ & $\mathcal{P} (\%) \uparrow$ & $\mathcal{F} (\%) \uparrow$ & $\mathcal{P} (\%) \uparrow$ & $\mathcal{F} (\%) \uparrow$ & top-1 $(\%) \uparrow$ \\
Ours*             & 79.48  & \textbf{86.28}   &  \textbf{75.57}   & 66.38   & 94.81 & 60.06  & \textbf{88.40}              \\
Ours$\dagger$                 &  \textbf{79.98}  & 85.17  & 72.78 & \textbf{68.12} & 83.10  & 60.24  & 87.88         \\
Ours$\ddagger$                 & 77.93   &  85.24 &  70.59 & 58.85  & \textbf{95.17}  &  \textbf{64.16}  & 87.54      \\

\citep{chong2020detecting}                 & 70.05   & 68.61  & 43.42 & 58.21 & 64.31   & 33.34  &  78.92        \\
%Ours$\ddagger$                  &   &   &   &   &        &        &        &        &        &        &   &   &       &       \\ 
\bottomrule
\end{tabular}
}
\caption{Quantitative performance of different models on (M)VACATION dataset for atomic-level gaze communication prediction. Model used: *DETR with ResNet50, $\dagger$DETR with ResNet101, $\ddagger$DeformableDETR with ResNet50.}
\label{table:AtomicQuantResults}
\end{table*}

The atomic-level classification results are reported in Table~\ref{table:AtomicQuantResults}. All GazeTransformer models achieved promising precision ($\mathcal{P}$) values on all atomic-level classes with the highest $\mathcal{P}$ value of 95.17\% on the \textit{Share} class, much higher than the baseline model \citep{chong2020detecting}. Similarly, the GazeTransformer models reported higher F1-scores ($\mathcal{F}$) than the baseline model. Overall, the GazeTransformer models achieved a similarly high average accuracy of around 88\%. In comparison, the baseline model achieved a lower performance across all the atomic-level gaze communication behaviours. In particular, the baseline model resulted in a precision ($\mathcal{P}$) value of 70.05\%, around 10 percentage points lower, and an F1-score ($\mathcal{F}$) of 68.61\%, around 20 percentage points lower, on the \textit{Single} class. In addition, it resulted in a precision ($\mathcal{P}$) value of 43.42\%, around 30 percentage points lower, and an F1-score ($\mathcal{F}$) of 58.21\%, around 10 percentage points lower, on the \textit{Mutual} category. The greatest difference in performance was on the \textit{Share} class, where the baseline model reported a precision ($\mathcal{P}$) value of 64.31\% and an F1-score ($\mathcal{F}$) of 33.34\%, each around 30 percentage points lower. Overall, the baseline model had an average accuracy of 78.92\%, around 10 percentage points lower. All variants of GazeTransformer consistently outperformed the baseline model \citep{chong2020detecting}. In the next section, we will focus on the GazeTransformer with ResNet50 and DETR-like architecture.

\textbf{Qualitative results} 
% The visualisation of the qualitative results of our model on the atomic-level prediction is shown in Figure \ref{fig:qualitative_atomic}.
As shown in the first two rows of Figure \ref{fig:qualitative_atomic}, the proposed GazeTransformer correctly predicted human head and object locations (in coloured solid rectangles) that are close to the ground truth (in red dotted rectangles). Directed arrows are added to show the predicted attended targets of all the detected humans. In addition, it can correctly infer \textit{Single}, \textit{Mutual} and \textit{Share} atomic behaviours in scenarios where there are exactly two persons (Columns 1-3) or even three persons (Columns 4-5). %Note that we display all the HTI instances, resulting in overlapping solid rectangles.

GazeTransformer also predicted labels that were different from the ground truth (for the succeeding discussion, refer to the last two rows of Figure \ref{fig:qualitative_atomic}). Column 1 shows that our model predictions were \textit{Mutual}, while the ground truth was \textit{Single}. Here, the subtle cue of eye gaze direction results in a drastically different atomic-level label. This illustrates why our model achieved lower $\mathcal{P}$ and $\mathcal{F}$ values for the \textit{Mutual} label. We found instances where we believe that the ground truth (\textit{Mutual}) was incorrect (column 2). Our model was penalised for predicting the correct \textit{Single} label, effectively lowering our model's \textit{Single} and \textit{Mutual} performance. As shown in columns 3-4, our model failed to correctly predict the locations of small objects, resulting in lower $\mathcal{F}$ on \textit{Share} label. This is also reflected in the lower $\mathcal{R}$ value ($\sim76\%$) of GazeTransformer's localisation performance. Finally, column 5 shows ambiguous cases that are too difficult even for humans to identify.

\begin{figure*}[h!]
\centering
   \includegraphics[width=0.95\linewidth]{images/QResults.png}
   \caption{Atomic-level prediction results. The first two rows show frames with correctly classified labels (in green), while the last two rows show frames with incorrectly classified labels (in red, above the ground truth). The dotted rectangles are ground truth labels, while solid rectangles are model predictions.}
   \label{fig:qualitative_atomic}
\end{figure*}

\subsection{Event-level prediction module}   \label{subsection:event_level}

The event-level classification results are shown in Table~\ref{table:EventQuantResults}. Our temporal module combined with different GazeTransformers has promising $\mathcal{P}$ and $\mathcal{F}$ values for \textit{Non-Communicative}, \textit{Mutual Gaze} and \textit{Joint Attention}, but low performance for \textit{Gaze Aversion} and \textit{Gaze Following}. Overall, our temporal modules have a similarly high average accuracy of around 85\%. To eliminate any compounded errors caused by GazeTransformer and illustrate the effectiveness of our temporal module alone, we fed the latter with the ground truth adjacency matrices and achieved significantly higher performance across all metrics on all event labels, highlighting that our temporal module is working as intended. On the other hand, the baseline model was not able to predict the \textit{Gaze Aversion} and \textit{Gaze Following} classes and had a low performance on \textit{Joint Attention}, resulting in a much lower average accuracy of 22.90\%.


\begin{table*}[h!]
\centering

\resizebox{\textwidth}{!}{%
\begin{tabular}{@{}lccccccccccc@{}}
\toprule
\multicolumn{1}{c}{} & \multicolumn{11}{c}{Event-level Gaze Communication (Precision $\mathcal{P}$,  F1-score $\mathcal{F}$ \& Average Accuracy)}                           \\ \midrule
 &
  \multicolumn{2}{c}{Non-Comm.} &
  \multicolumn{2}{c}{Mutual Gaze} &
  \multicolumn{2}{c}{Gaze Aversion} &
  \multicolumn{2}{c}{Gaze Following} &
  \multicolumn{2}{c}{Joint Attention} &
  \multicolumn{1}{c}{Avg. Acc.} \\
                     & $\mathcal{P} (\%) \uparrow$ & $\mathcal{F} (\%) \uparrow$  & $\mathcal{P} (\%) \uparrow$ & $\mathcal{F} (\%) \uparrow$ & $\mathcal{P} (\%) \uparrow$ & $\mathcal{F} (\%) \uparrow$ & $\mathcal{P} (\%) \uparrow$ & $\mathcal{F} (\%) \uparrow$ & $\mathcal{P} (\%) \uparrow$ & $\mathcal{F} (\%) \uparrow$ & top-1 $(\%) \uparrow$ \\
Ours*                 &   61.76     &   \textbf{62.69}    &  \textbf{68.42}      &   65.00    &    \textbf{25.00}    &   33.33    &   20.00     &   18.00     &    43.75    &  46.67  & 85.03      \\ 
Ours$\dagger$                 &    60.60    &  61.54 &  57.14      &  63.15      &   25.00     &  33.33      &   \textbf{25.00}     &  25.00    &  \textbf{71.42}      &   \textbf{58.82}     &  \textbf{85.23}      \\
Ours$\ddagger$                 &  58.82      &  57.97 &  57.14      &  58.54    &   25.00     &  33.33      &   20.00     &  18.00   &  50.00      &   53.85     &  83.71     \\

\citep{chong2020detecting}                 &  \textbf{71.43}      &  31.25 &  23.81      &  \textbf{71.43}    &  0    &  0      &   0     &  0   &  37.5      &   33.33     &  22.90    \\
\midrule
Ours$\dagger\dagger$                 &     \textbf{74.60}   &  \textbf{82.46} &  \textbf{89.66}      &  \textbf{85.25}      &   \textbf{66.67}     &  \textbf{66.67}      &   \textbf{66.67}     &  \textbf{72.73}    &  \textbf{78.57}      &   \textbf{57.89}    &  \textbf{91.90}      \\ 
%Ours$\ddagger$                 &        &        &        &        &        &        &        &        &        &        &     &      \\ 
\bottomrule
\end{tabular}
}
\caption{Quantitative evaluation results of different models on (M)VACATION dataset for event-level gaze communication prediction. Model used: *DETR with ResNet50, $\dagger$DETR with ResNet101, $\ddagger$DeformableDETR with ResNet50, $\dagger\dagger$The input is the ground-truth adjacency matrices instead of the atomic-level predictions.}
\label{table:EventQuantResults}

\end{table*}

\begin{figure*}[h]
  \centering
   \includegraphics[width=\linewidth, height=0.20\paperheight]{images/QResults-Event.png}

   \caption{Event-level prediction results. The first two rows show correctly classified atomic and event labels (in green), while the last two rows show incorrectly classified atomic and event labels (in red, above the ground truth).}
   \label{fig:qualitative_event}
\end{figure*}

Figure \ref{fig:qualitative_event} illustrates the qualitative results. A closer examination of Rows 1 and 2 reveals that all the frames depict accurate atomic-level predictions, resulting in correct event-level predictions. On the other hand, the event-level predictions in Rows 3 and 4 are incorrect because GazeTransformer either failed to identify an object in
the scene (as observed in Row 3) or incorrectly predicted the atomic-level labels of one or two persons in the scene (as depicted in Row 4). This affirms the heavy reliance of the temporal module on the GazeTransformer. More examples of atomic- and event-level predictions on the (M)VACATION dataset are currently available in the GitHub repository linked in the Abstract.

\section{Conclusion}
We have presented a two-stage approach for the temporal understanding of gaze communication. Compared to previous approaches, our first stage does not require human head/object bounding box locations. Instead, our module predicts these locations, attended targets and their corresponding gaze relationships in parallel. Unlike previous end-to-end models that can only predict attended gaze targets or identify \textit{Mutual} gaze instances, our model can infer \textit{Single}, \textit{Mutual} and \textit{Share} gaze behaviours. Afterwards, a temporal model uses the predicted atomic-level labels to identify event-level gaze communication. Both models show promising results on the (M)VACATION dataset. Despite the encouraging results, our proposed two-stage framework has some notable limitations. % that stem from both the atomic- and event-level modules. 
First, our atomic-level module has a lower localisation performance on small objects, as shown by the failure cases in Figure~\ref{fig:qualitative_atomic}, resulting in lower atomic-level classification performance. Second, our event-level module is heavily dependent on our atomic-level module since both modules are in cascade. This is substantiated by the increase in performance of the event-level module when the ground truth adjacency matrices were used as input. Despite this promising result, the event-level results may still be sub-optimal owing to the disjoint training of the two modules (due to limited data). Nevertheless, our approach offers an end-to-end solution for atomic-level prediction combined with a temporal module for event-level prediction.
%Both modules achieved promising results on the (M)VACATION dataset.


\bibliography{jmlr-sample}

% \appendix

% \section{First Appendix}\label{apd:first}

% This is the first appendix.

% \section{Second Appendix}\label{apd:second}

% This is the second appendix.

\end{document}
