\section{Proposed Method}
\subsection{Stage-I: Gaze Contrastive Learning (\textbf{\gazeclr{}}) Framework} 
\label{subsec:framework}

\begin{figure*}[t]
    \centering
    \includegraphics[width=\columnwidth]{images/arch5.pdf}
    \caption{\textbf{Method schematic.} For synchronous view frames $\{ I_{v_i}\}_{i=1}^{4}$, the above figure illustrates invariant and equivariant positive pairs anchored only for view $v_1$. The left branch shows \textit{single-view} learning ($\mathscr{L}^{I}$) and right branch illustrates \textit{multi-view} learning using four views ($\mathscr{L}^{E}$).
    % , where $I^{'}_{v_1}$ has invariant relationship to $I_{v_1}$. 
    %and corresponding other views $(I_{v_2}, I_{v_3}, I_{v_4,t})$ are selected for single-view and multi-view learning which further processed through augmentation pipeline. All images are encoded by a shared encoder. 
    All images (after augmentation, $a \in \mathcal{A}$)
    % ($\{ I_{v_i}\}_{i=1}^{4} \bigcup I^{'}_{v_1}$)
    are passed through a shared CNN encoder network, followed by MLP projectors (either $p_1$ or $p_2$) depending on the type of input positive pair. The embeddings for multi-view learning are further multiplied by an appropriate rotation matrix. More details in Section~\ref{subsec:framework}.
    %The projector $p_1$ maps single-view embeddings to invariant representation space (left branch) which are inputs to invariant contrastive loss $\mathscr{L}^{inv}$. 
    % The output of projector $p_1$ is utilized to enforce invariance using the $\mathscr{L}^{inv}$ loss function. Similarly, output of $p_2$ network is used to enforce equivariance through the $\mathscr{L}^{equiv}$ loss, after accounting for the camera viewpoint through a fixed rotation.
    }% multi-view embeddings to equivariant space which are further multiplied by a  matrix to apply equivariant loss $\mathscr{L}^{equiv}$ (right branch) as described in Section~\ref{subsec:multiview}.}
    \label{fig:arch}
\end{figure*}



\gazeclr{} is a framework to train an \textit{encoder} that learns embeddings which induce a desired set of invariances and equivariances for the gaze estimation task. As stated earlier, the key intuition of \gazeclr{} is that we want to enforce invariance using selected appearance transformations (e.g., color jitter) and equivariance using synchronous images of the same person captured from multiple camera viewpoints. Similar to previous SSL approaches~\citep{chen2020simple, spurr2021self}, we rely on the normalized temperature-scaled cross-entropy loss (NT-Xent)~\citep{chen2020simple} to encourage invariance or equivariance by maximizing the agreement between positive pairs and the disagreement between negative pairs. In particular, we devise two variants of the NT-Xent loss, namely, $\mathscr{L}^{I}$ for invariance and $\mathscr{L}^{E}$ for equivariance.

%for a given camera frame $I_{v}$, we create its augmentation $I_{v}^{'}$ and try to make them similar using normalized temperature-scaled cross entropy loss (NT-Xent)~\cite{chen2020simple} two different augmented versions of a randomly sampled frame can be used to learn invariance to appearance transformation, . Secondly, two frames of the same person, sampled from the same timestamp but from different views, are used to learn equivariance under different geometric transformations.
% A schematic overview of the \gazeclr{} framework is illustrated in Figure~\ref{fig:arch}. 
The \gazeclr{} framework has three sub-modules: a CNN-based \textit{encoder} and two \textit{projection heads} based on MLP layers, as illustrated in Figure~\ref{fig:arch}. The output of the encoder is routed to a different \textit{projection head} depending on the type of input positive pair. So that the gaze direction is left unchanged, we only consider augmentations based on \textit{appearance} transformations, denoted as $\mathcal{A}$.

Let $\{I_{v_i,t}\}_{i=1}^{|V|}$ be the synchronous frames for timestamp $t$ coming from different camera views (i.e., $\{ v_i \}_{i=1}^{|V|}$), then we create the following positive pairs:
\begin{enumerate}
    \item{\em Single-view positive pairs:} We apply two randomly sampled augmentations from $\mathcal{A}$ to create a single-view positive pair. Specifically, for any image  $I_{v_i, t}$, at a given timestamp $t$ and view $v_i$, we sample two augmentations $a$ and $a'$ from $\mathcal{A}$ and then $(a(I_{v_i, t}), a'(I_{v_i, t}))$ forms a positive pair to learn invariance. The left branch of Figure~\ref{fig:arch} shows one such positive pair for view  $v_1$.
    \item{\em Multi-view positive pairs:} We consider all unique pairs of camera viewpoints from the same timestamp $t$ and apply random augmentations from $\mathcal{A}$, i.e., $\{(a_i(I_{v_i, t}), a_j(I_{v_j, t})) \mid i,j \in \{1, \ldots,|V|\},\ i \neq j \}$. %that is, images with the same timestamp but taken from different cameras. 
    The corresponding outputs from the encoder are passed through projection head $p_2$ and multiplied by an appropriate rotation matrix to learn equivariance. 
    % In order to enforce equivariance, the output of projector $p_2$ is pre-mutiplied by an appropriate rotation matrix, as described further.
\end{enumerate}


%for a given camera frame $I_{v}$, we create its augmentation $I_{v}^{'}$ and try to make them similar using normalized temperature-scaled cross entropy loss (NT-Xent)~\cite{chen2020simple} two different augmented versions of a randomly sampled frame can be used to learn invariance to appearance transformation, . Secondly, two frames of the same person, sampled from the same timestamp but from different views, are used to learn equivariance under different geometric transformations.


%an invariant pair by applying different augmentations to a randomly sampled frame from a single view (left branch, Figure~\ref{fig:arch}),
% $I_{v, t+1}$ taken from the temporal neighbor, 
%and (ii) equivariant pairs taken from same timestamp but different camera views (right branch, Figure~\ref{fig:arch}). For creating invariant pairs, we apply only appearance-based transformations on the input image because single-view branch is responsible for inducing invariance to appearances. Additionally, in multi-view branch, we promote equivariance by inverting the geometric transformations in the latent space after which we apply our equivariance contrastive loss. We hypothesize that both invariance and equivariance are



%comprises two branches (Figure~\ref{fig:arch}). {\bf Check that the following is  correct.} 

%The Single-view branch receives in input two augmented versions of the same image and generates invariant embeddings. Similarly, the Multi-view branch takes multiple views of the same image and produces equivariant embeddings. The essence of the proposed method is to learn meaningful representations that are invariant under different appearances while simultaneously equivariant to various geometric transformations.  In this section, we detail our approach of learning self-supervised gaze representations from a large set of videos acquired from multiple synchronized and calibrated cameras (Sec.~\ref{subsec:framework}). Further, we describe two contrastive learning objectives used to learn our overall framework. After that, we provide details on how the encoder is employed for the downstream task of gaze estimation (Sec.~\ref{subsec:gazeestim}).

% types of views used in practice, i.e., learning from images of a single-view, and multiple views \my{no comma before and unless its a list or starting a new sentence}. Thereafter, we provide details on how the encoder is employed for the downstream task of gaze estimation. \my{provide overview of next two subsections}

% The encoder is trained by inducing the properties of both invariances and equivariances, trained in a multi-task manner. The core idea is that two frames coming from same time but different viewpoints will form equivariant positive pairs while frames from two different times and any random view constitutes equivariant negative pairs. Along with equivariant, the invariant positive pairs are formed by producing two distorted versions of a single frame, similar to the previous SSL approaches~\cite{}..

Next, to construct negative pairs,  we do not sample them explicitly but use all other samples in the mini-batch as negative examples, similar to~\citet{chen2020simple}. The exact formulation of both loss functions $\mathscr{L}^{I}$ and  $\mathscr{L}^{E}$ is described below. For brevity, we omit $t$ from $I_{v_i, t}$ and augmentation $a$ in the following subsections.

%\subsection{The GazeCLR Framework} 
%\label{subsec:framework}
%Give a video sequence $\{I_{v, t}\}_{t=1}^{T}$ consisting of $T$ frames taken from multiple camera viewpoints $v\in V$, we create two kind of positive pairs: (i) an invariant pair by applying different augmentations to a randomly sampled frame from a single view (left branch, Figure~\ref{fig:arch}),
% $I_{v, t+1}$ taken from the temporal neighbor, 
%and (ii) equivariant pairs taken from same timestamp but different camera views (right branch, Figure~\ref{fig:arch}). For creating invariant pairs, we apply only appearance-based transformations on the input image because single-view branch is responsible for inducing invariance to appearances. Additionally, in multi-view branch, we promote equivariance by inverting the geometric transformations in the latent space after which we apply our equivariance contrastive loss. We hypothesize that both invariance and equivariance are necessary to improve the performance of gaze estimation.

% By introducing the creation of positive pairs in this manner, our proposed framework encourages both invariance and equivariance in the learned representations. \my{too verbose ... you need mention things here much more explicitly and accurately} 

% We hypothesize that both of these properties are necessary to improve the performance of gaze estimation. 
% We discard the notation $n$ in the following subsections for simplicity.



% Let's assume the dataset $\mathcal{D}$ consists of $N$ samples $\{x_{v, t}^{(i)}\}_{i=1}^{N}$ where $v \in V^{(i)} = \{1, \ldots, v^{(i)}\}$  represents the different views of $i^{th}$ sample and $t \in T^{(i)} = \{1, \ldots, t^{(i)}\}$ indicates timestamp of the frame. Furthermore, $N = \sum_{p=1}^{P} n_p$ where $P$ is number of participants in $\mathcal{D}$ and $n_p$ indicates the number of data points for a specific participant $p$. We also assume that the relative pose between different cameras is known.  

%Our framework consists of an encoder $E$ and two projection heads $p_1(\cdot)$ and $p_2(\cdot)$ where, $p_1$ and $p_2$ are responsible for inducing invariance and equivariance, respectively. The encoder learns the mapping from an image $I$ to an embedding $z \in R^d$ i.e., $z = E(I)$. The projection heads further map the embedding $z$ to two latent representations: an invariant representation $z^{inv} \in R^{d}$, and an equivariant representation $z^{equiv} \in R^{d}$, such that $z^{inv} = p_1(z)$ and $z^{equiv} = p_2(z)$. The whole framework is trained using two contrastive learning objectives as described in the following subsections.  

\subsubsection{Single-View Learning}
%For single-view learning, the objective function is similar to that of previous works on self-supervised learning for classification tasks~\cite{chen2020simple, grill2020bootstrap}. As shown in Figure~\ref{fig:overallidea}, a randomly sampled image $I_{v_1}$ is transformed  by two distinct data augmentation operators.
%$k$, $k'$, giving us a positive pair for invariance learning ($k(I_{v_1,t}), k'(I_{v_1,t})$. Here, $k, k'\in \mathcal{K}$ is a set of augmentations. We remove notation $k$ in rest of the paper for simplicity. 
Recall that the goal of \textit{single-view} learning is to induce invariance amongst representations under various appearance transformations. Let $v_i \in V$ be any view and $b \in \{1,\ldots, B\}$ be the batch index. Given a batch size of $B$, we apply two augmentations to each sample in the batch, yielding $2B$ augmented images. For each augmented image, we therefore have one positive pair and $2(B-1)$ negative pairs stemming from the augmented versions of the remaining samples in the batch. % (i.e, $\{I_l^{v_i}, I_b^{v_i} \}_{l \neq  b } \bigcup \{I'_l^{v_i} \}_{l =  b }$ ). 
% Let $v_i \in V$ be any view, $b \in [1,\ldots, B]$ be the batch index, and ($I_b^{v_i}$, $I'_b^{v_i}$) be the data augmentations for each sample, then, there are $B$ positive samples  $\{I_b^{v_i}, I'_b^{v_i}\}_{b = 1}^{B}$.  
%\{I_l^{v_i}, I'_m^{v_i}\}_{l\neq m}$ 
%Therefore, we have $B$ positive pairs stemming for each sample, and for each such positive pair, we have ($2B-1$) negative pairs constituting of remaining augmentations from rest of the samples in the batch. 
%Let $b \in [1, \ldots, B]$ represents index over samples in the batch, then for each sample, we have  a positive pair $(a_1(I_{v_i}^{b}), a_2(I_{v_i}^{b}))$. 
Our encoder $E$ extracts representations for all $2B$ augmented images, which are further mapped by the projection head $p_1(\cdot)$, yielding embeddings $\{ z^b_{v_i}, z'^b_{v_i} \}_{b=1}^{B}$. % in latent space  where invariant contrastive loss is applied (Figure~\ref{fig:arch}, left branch).
% Following \cite{chen2020simple}, we do not sample negative pairs explicitly, but treat all other samples in the same mini-batch as negative examples.
With the above notation, for any view $v_i$, the proposed invariance loss function $\mathscr{L}^{I}$ associated with a positive pair ($z^b_{v_i}, z'^b_{v_i}$) is given as follows:
\begin{equation}
\label{loss:infonce}
    \mathscr{L}^{I} (z^{b}_{v_i}, z'^{b}_{v_i}) = - \logg \dfrac{\similarity(z_{v_i}^b, z'^b_{v_i})}{\sum_{l=1}^{B} \mathbbm{1}_{[l\neq b]}\similarity(z_{v_i}^b, z^l_{v_i}) + \sum_{l=1}^{B} \similarity(z_{v_i}^b, z'^l_{v_i}) },
\end{equation}
where $I_{v_i}^b$ and $I'^b_{v_i}$ denote the two augmented versions of the $b$-th sample, $z_{v_i}^b = p_1(E(I_{v_i}^b))$, $z'^b_{v_i} = p_1(E(I'^b_{v_i}))$, $\similarity(r, s) =\exponential\left({\dfrac{1}{\tau}\dfrac{r^{\top} s}{\lVert r\rVert \cdot \lVert s\rVert}}\right)$, $\mathbbm{1}_{[l\neq b]}$ is an indicator function, and $\tau$ is the temperature parameter. It is worth noting that minimizing the loss in Eq.~\ref{loss:infonce} requires $z^b_{v_i}$ and $z'^b_{v_i}$ to be close to each other, which aligns with our goal of learning invariance to appearance transformations. One challenge, however, is the risk of collapse (e.g., the network could simply learn each person's identity). To avoid this, we create mini-batches such that all samples in a batch are taken from a single participant.
\subsubsection{Multi-View Learning}
\label{subsec:multiview}
We encourage equivariance of the gaze representations to different camera viewpoints through multi-view learning. To do so, we transform the embeddings to a {common reference system}, chosen as the \textit{screen reference system} used during the EVE data collection. Let $R_{C_{v_i}}^{S}$ be the rotation matrix relating the camera viewpoint $v_i$ to the screen reference system.

For each sample $I^b_{v_i}$ in a batch of size $B$, the positive pair is given as $(I^b_{v_i}, I^b_{v_j})$ for two distinct camera viewpoints $(v_i, v_j)_{i\neq j}$.
%For each sample  $I^b_{v_i}$ in a batch of size B, we create positive pairs with all remaining camera views, i.e.,  $\{(I^b_{v_i}, I^b_{v_j}) | i, j \in {1,\ldots,|V|}, i\neq j\}$ (in the EVE dataset, $|V|=4$) and remaining ($2B-1$) samples are treated as negative samples. 
All images for viewpoints $v_i$ and $v_j$ are first augmented and then passed through the encoder $E$ and the projection head $p_2(\cdot)$, which gives embeddings $\hat{z}^b_{v_i}, \hat{z}^b_{v_j} \in R^{3\times d'}$. These embeddings are further multiplied by the corresponding rotation matrices $R_{C_{v_i}}^{S}$ and $R_{C_{v_j}}^{S}$ to project them into the common (screen) reference system. We denote the embeddings after rotation as $\{ \bar{z}^b_{v_i}, \bar{z}^b_{v_j} \}_{b=1}^{B}$, such that $\bar{z}^b_{v_i} = R_{C_{v_i}}^S \hat{z}^b_{v_i}$ (and analogously for $v_j$).
Therefore, for a batch of size $B$, our equivariant loss
$\mathscr{L}^{E}$ associated with the positive pair $(\bar{z}^b_{v_i}, \bar{z}^b_{v_j})$ is as follows:
\begin{equation}
\label{loss:equivinfonce}
    \mathscr{L}^{E}(\bar{z}^b_{v_i}, \bar{z}^b_{v_j}) = - \logg \dfrac{\similarity(\bar{z}^{b}_{v_i}, \bar{z}^b_{v_j})}{\sum_{l=1}^{B} \mathbbm{1}_{[l\neq b]}\similarity(\bar{z}^{b}_{v_i}, \bar{z}^{l}_{v_i}) + \sum_{l=1}^{B} \similarity(\bar{z}^{b}_{v_i}, \bar{z}^{l}_{v_j})}
\end{equation}

\paragraph{Overall loss function.} Given $|V|$ camera viewpoints, we apply both the $\mathscr{L}^{I}$ and $\mathscr{L}^{E}$ loss functions to each view. Thus, our overall objective function for a batch of size $B$ becomes:
\begin{equation}
\label{eq:overallloss}
\mathscr{L}^{O} = \dfrac{1}{2B} \sum_{i=1}^{|V|} \sum_{b=1}^{B}  \biggl(\mathscr{L}^{I}(z^b_{v_i}, z'^b_{v_i}) + \mathscr{L}^{I}(z'^b_{v_i}, z^b_{v_i}) + \sum_{j=1, j \neq i}^{|V|}  \mathscr{L}^{E}(\bar{z}^b_{v_i}, \bar{z}^b_{v_j}) \biggr)
\end{equation}


% \my{above equation is confusing as $B$ is not explicitly used anywhere though you have mentioned it. Also, explain this objective what is the summation over and why?}

% For a given image frame $I_{v, t}$ of camera view $v$ at a timestamp $t$, where $v\in V$ and $1 \leq t \leq T$,  the loss function becomes:

% \begin{equation}
% \label{eq:overalllosssingle}
%     \mathscr{L}_{v, t} = \mathscr{L}^{inv}(z_{v,t}^{inv}, z_{v,t+1}^{inv}) + \mathscr{L}_{v, t}^{equiv} 
% \end{equation}


\subsection{Stage-II: Learning For Gaze Estimation}
\label{subsec:gazeestim}
After pre-training, the encoder learned by the \gazeclr{} framework is used for the task of gaze estimation and fine-tuned on a small labeled dataset. To this end, we remove both projection heads $p_1$ and $p_2$, and replace them with MLP regressor layers that predict the 3D gaze direction. To train the MLP regressor, we use the supervised loss function given as
\begin{equation}
\mathscr{L}^{ang} = \dfrac{180}{\pi}\arccos \left(\dfrac{\pmb{g} \cdot \pmb{\hat g}}{\lVert\pmb{g}\rVert \cdot \lVert\pmb{\hat g}\rVert}\right),
\end{equation}
where $\pmb{g}$ and $\pmb{\hat g}$ are the ground-truth and predicted gaze directions, respectively. 

% \noindent


% \my{May be add notations for the adaptation/supervised network and learned representation in this sub-section then explain the above objective.}
