

\section{Experiments}
% % We first provide an analysis of quality of our technique used in positive and negative pair formation in Section~\ref{subsec:analysispdf} 

We start by detailing the experimental setup (Sec.~\ref{subsec:experimentalsetting}), followed by a brief explanation of the considered baselines (Sec.~\ref{subsec:baselines}). Next, we evaluate the performance of our pre-trained encoder and show that representations from \gazeclr{} can help train an accurate gaze estimation model even with a relatively small amount of annotations. %For this task, we evaluate using within-dataset setting (Sec.~\ref{subsec:withindataeval}). We show the transferable capability of our representations by evaluating on different domain datasets in both linear evaluation (frozen encoder) and fine-tuning settings (end-to-end training) and test with only few calibration samples from the test subject (Sec.~\ref{subsec:crossdataeval}). Thereafter, we compare GazeCLR against supervised~\cite{park2019few} and unsupervised gaze representation learning methods~\cite{yu2020unsupervised, sun2021cross}(Sec.~\ref{subsec:soacompare}). This is followed by t-SNE visualization of our learned gaze representations (Sec.~\ref{subsec:viz}). 
For this task, we consider the within-dataset setting (Sec.~\ref{subsec:withindataeval}). We assess the transferability of our representations by evaluating them on different domains in the linear layer training (frozen encoder) setting, where we consider only a few calibration samples from the test subject, as detailed in Sec.~\ref{subsec:crossdataeval}. Thereafter, we compare \gazeclr{} with existing supervised~\citep{park2019few} and unsupervised~\citep{yu2020unsupervised, sun2021cross} pre-training methods in Sec.~\ref{subsec:soacompare}. Lastly, we probe the semantics of the learned \gazeclr{} representations using the well-known t-SNE visualization technique (Sec.~\ref{subsec:viz}). Additional results and ablation studies are provided in Appendices~\ref{sec:additional-res} and~\ref{sec:ablations}. 




% \begin{figure}
%     \centering
%     \includegraphics[scale=0.45]{images/final_paper_pdf.png}
%     \caption{Histogram showing the distribution of angle differences between gaze directions of positive and negative samples for one training epoch.}
%     \label{fig:pdf}
% \end{figure}


% \subsection{Analysis of Positive vs Negative Pairs}
% \label{subsec:analysispdf}
% We analyze the quality of positive and negative pairs used for training GazeCLR self-supervised stage as explained in Section~\ref{subsec:framework}. For this, we computed the angular difference between gaze directions of all positive and negative pairs created using single-view for one training epoch. Figure~\ref{fig:pdf} shows the normalized histogram distribution of these angle differences. We observe the right-skewed distribution for positive pairs with mode value close to $0^{\circ}$. Furthermore, the negative pair distribution is almost uniform between $10^{\circ}-48^{\circ}$ with small percentage of samples below $10^{\circ}$. This shows the validity of our data pair formation technique  


\subsection{Setup}
\label{subsec:experimentalsetting}
We train our \gazeclr{} framework on the EVE~\citep{park2020towards} dataset, which has videos collected in a constrained indoor setting with four different synchronized and calibrated camera views. It has approximately 12 million frames collected from 54 participants with natural eye movements. Following the splits considered by~\citet{park2020towards}, there are 40 subjects in training and 6 subjects in the validation set. We discard the data of test subjects due to the non-availability of labels. We use training subjects for the pre-training stage, \textit{without} using any gaze annotations. For the gaze estimation stage, we evaluate on the data of validation subjects to report the performance. We use all four camera views (i.e., $|V|=4$) as well as the information about the relative pose between camera and screen ($R_{C}^S$) provided with the EVE dataset. Note that our framework can be extended to a larger number of camera views ($|V| > 4$) using the ETH-XGaze~\citep{zhang2020eth} dataset. In this paper, we consider pre-training only on the EVE dataset, as more views increase the computational demand.


%\paragraph{Data pre-processing.} We use pre-processed face patch images available in the dataset obtained after applying a data-normalization procedure~\cite{sugano2014learning, zhang2018revisiting}. The normalization pipeline transforms the gaze annotation for an image to a normalized camera space through a rotation matrix $M$. Note that we post-multiply $R_C^S$ with $M^{-1}$ as $R_C^S$ is defined w.r.t. original camera reference frame, i.e., in multi-view learning, $\bar{z}^{v} = R_{C_v}^S (M)^{-1} z_{v}^{e}$.

%\paragraph{Data pre-processing.} We use face images available in the EVE dataset, obtained after applying a data-normalization procedure~\cite{sugano2014learning, zhang2018revisiting}. The normalization pipeline transforms the gaze annotation to a normalized camera space through a rotation matrix $M$. Note that we post-multiply $R_C^S$ with $M^{-1}$ as $R_C^S$ is defined w.r.t. the original camera reference frame, i.e., $\bar{z}_{v} = R_{C_v}^S (M)^{-1} \hat{z}_{v}$.


% Therefore, we reverse the normalization matrix in the equivariance loss $\mathscr{L}^{equiv}$ to obtain embeddings in original camera reference frame. To do so, we modify Equation~\ref{eq:relequiv} to account for the normalization matrix as $\bar{z}_{v,t}^{\ equiv} = R_{C_v}^S (M)^{-1} z_{v,t}^{\ equiv}$.


%\paragraph{Architecture details.} 

%\paragraph{Training details.} GazeCLR is trained using SGD optimizer with initial learning rate $0.03$, momentum $0.9$ and cosine annealing~\cite{loshchilov2016sgdr} for learning rate decay. We train GazeCLR on a single 1080 GeForce GTX GPU with the batch size of 128 and for 50K iterations. Note that, the mini-batch is made up of samples from a single participant to avoid trivial solution. The temperature coefficient $\tau$ is set equal to 0.1. For augmentation pipeline, we apply random spatial cropping and resizing, gaussian blur, color perturbation ($p=0.8$) on  brightness, contrast, saturation and hue,  grayscale conversion ($p=0.2$) and auto-contrast ($p=0.5$). 


%TODO: I've moved these details to supplementary
%\paragraph{Training details.} \gazeclr{} is trained using SGD optimizer with initial learning rate $=0.03$, momentum $=0.9$, and cosine annealing~\cite{loshchilov2016sgdr} for the learning rate decay. We use a single 1080 GeForce GTX GPU for training, with a batch size of 128, and train for 50K iterations. Our mini-batch is made up of samples from a single participant. The temperature coefficient $\tau$ is set to $0.1$. For the augmentation transformations $\mathcal{A}$, we apply random spatial cropping and resizing, gaussian blur, color perturbation ($p=0.8$) on  brightness, contrast, saturation and hue,  grayscale conversion ($p=0.2$) and auto-contrast ($p=0.5$).


\paragraph{Data pre-processing.} We use face images available in the EVE
% \footnote{This dataset is licensed under a \href{https://creativecommons.org/licenses/by-nc-sa/4.0/}{CC BY-NC-SA 4.0.}} 
dataset, obtained after applying a data-normalization procedure~\citep{sugano2014learning, zhang2018revisiting}. The normalization pipeline transforms the gaze annotation to a normalized camera space through a rotation matrix $M$. Note that we post-multiply $R_C^S$ with $M^{-1}$ as $R_C^S$ is defined w.r.t. the original camera reference frame, i.e., $\bar{z}_{v} = R_{C_v}^S (M)^{-1} \hat{z}_{v}$.


\paragraph{Training details.} \gazeclr{} is trained using the SGD optimizer with an initial learning rate of $0.03$, momentum of $0.9$, and cosine annealing~\citep{loshchilov2016sgdr} for the learning rate decay. We use a single GeForce GTX 1080 GPU for training, with a batch size of 128, and train for 50K iterations. Our mini-batch is made up of samples from a single participant. The temperature coefficient $\tau$ is set to $0.1$. For the augmentation transformations $\mathcal{A}$, we apply random spatial cropping and resizing, Gaussian blur, color perturbation ($p=0.8$) on brightness, contrast, saturation and hue, grayscale conversion ($p=0.2$), and auto-contrast ($p=0.5$).



All experiments use ResNet-18~\citep{he2016deep} as the encoder network and take the output from the average pooling layer. The encoder is trained from scratch. Following~\citet{chen2020simple}, both projection heads $p_1(\cdot)$ and $p_2(\cdot)$ are two-layer MLP networks with ReLU non-linearity. The output dimensions for the first and second layers are $512$ and $180$, respectively. The input image size is $128\times 128$. 

We train the \gazeclr{} framework in two different settings: (i) \textit{GazeCLR (Equiv)}: where we only consider equivariance through the loss function $\mathscr{L}^{E}$
%, as mentioned in the Equation~\ref{loss:equivinfonce}, 
and (ii) \textit{GazeCLR (Inv+Equiv)}: where we consider both invariance and equivariance with equal weights using the overall objective $\mathscr{L}^{O}$. We present the performance of both training setups in all the considered experimental settings. Note that \textit{GazeCLR (Inv)}, trained with only the $\mathscr{L}^{I}$ loss function, is equivalent to the SimCLR~\citep{chen2020simple} baseline. 

\subsection{Baselines} 
\label{subsec:baselines}
%We compare our pre-trained encoder performance on gaze estimation task against six baselines: (i) \textit{w/o Pretrain} where encoder is initialized to random weights, (ii) the vanilla \textit{Autoencoder} consists of same encoder layers and five DenseNet~\cite{huang2017densely}  deconvolution blocks as decoder and trained with L2 loss, (iii) \textit{Novel View Synthesis}~\cite{rhodin2018unsupervised} framework is trained on our dataset using same architecture as the auto-encoder, (iv) BYOL~\cite{grill2020bootstrap}, (v) SimCLR~\cite{chen2020simple}  and (vi) \textit{Fully-Supervised} is a ResNet-18 model trained on whole training data and forms the upper bound for our evaluations. For SimCLR and BYOL, we used same augmentation set as in our proposed method for fair evaluation. More details are provided in the supplementary.

We compare our approach with the following six baselines: (i) \textit{w/o Pre-training}, i.e., the encoder is initialized with random weights, (ii) a vanilla \textit{Autoencoder}, which uses the same encoder layers as \gazeclr{} and five DenseNet~\citep{huang2017densely} deconvolution blocks as the decoder, and is trained with an L2 loss, (iii) the \textit{Novel View Synthesis}~\citep{rhodin2018unsupervised} framework, trained on our dataset using the same architecture as the autoencoder, (iv) BYOL~\citep{grill2020bootstrap}, (v) SimCLR~\citep{chen2020simple}, and (vi) \textit{Fully-Supervised}, a ResNet-18 model trained on the whole EVE training data, which represents a likely upper bound for the performance of \gazeclr{}. For SimCLR and BYOL, we use the same augmentation set as in our proposed method. For more experimental details, see Appendix~\ref{appendix:baselines}.

% \subsection{Results}
% We first evaluate the quality of representations learned by GazeCLR framework on gaze estimation task under within-dataset and cross-dataset settings, and compare the performance with the baseline methods.

\subsection{Within-dataset Evaluation}
\label{subsec:withindataeval}
For within-dataset evaluation, we perform pre-training on the training split of the EVE dataset without using labels. Then we adapt the pre-trained encoder for gaze estimation on a small subset of labeled data. Specifically, we take five of the 40 training subjects (around $10\%$ of the samples in the whole EVE dataset) for the supervised gaze estimation stage and call this subset ``MiniEVE''. We validate on the data of fixed subjects chosen from the training subjects and report the final performance on the validation subjects.




% Table~\ref{table:within-data-eval} shows the mean angular errors (in degrees) obtained for different pre-training baselines and the proposed \gazeclr{} method. 
% % The top-half of the Table~\ref{table:within-data-eval} shows evaluation for image-based gaze estimation where our input is a RGB image (Input Modality=I) while the bottom-half shows performance on video-based gaze estimation where we input a sequence of 30 frames (Input Modality=V). 
% To this end, we freeze the pre-trained encoder and simply train a MLP regressor using the ``MiniEVE" dataset.
% Similarly, for video-based gaze estimation, we use the frozen trained encoder to extract sequence of embeddings and train a GRU-based~\cite{chung2014empirical} sequence regressor model to output frame-wise gaze directions. 
Table~\ref{table:within-data-eval} shows the mean angular errors (in degrees) obtained for different pre-training baselines and the proposed \gazeclr{} method. To this end, we freeze the pre-trained encoder and simply train an MLP regressor using the ``MiniEVE'' dataset. Note that for two baselines, Autoencoder and BYOL, we fine-tune the whole framework end-to-end, including the encoder, because otherwise they fail to converge when only their representations are used. We indicate this in the \textit{Frozen} column of Table~\ref{table:within-data-eval}: \cmark{} if the encoder is frozen and \xmark{} otherwise.
% for fine-tuning.
% Note that, for baseline autoencoder and BYOL in image-based gaze estimation, we fine-tune the whole end-to-end framework along with encoder due to the problem of non-convergence. 

We observe that our method \gazeclr{} outperforms the other pre-training baselines by training only an MLP regressor on a small amount of labeled data (``MiniEVE'' is $\sim 10\%$ of the whole data). Specifically, it can be seen that the performance achieved by \gazeclr{} helps in closing the gap with the fully-supervised baseline. Our method \textit{\gazeclr{} (Inv+Equiv)} shows a relative improvement of $25.1\%$ over the popular contrastive learning method SimCLR. Additionally, \textit{\gazeclr{} (Equiv)} achieves a relative improvement of $26.4\%$ over SimCLR, suggesting that equivariant representations are very effective for the gaze estimation task. We hypothesize that since we utilize similar augmentation strategies for creating both single-view and multi-view positive pairs, \textit{\gazeclr{} (Equiv)} performs comparably to \textit{\gazeclr{} (Inv+Equiv)}.

\begin{table*}[t!]
\caption{\textbf{Within-dataset Evaluation.} We report the mean angular errors (MAE) in degrees for within-dataset evaluation on the gaze estimation task. ``EVE'' denotes the whole EVE dataset, while ``MiniEVE'' denotes a small subset of it. The Frozen column is \cmark{} if the pre-trained encoder is frozen and \xmark{} if it is fine-tuned. The best-performing method is shown in \textbf{bold} and the second best is \underline{underlined}.} 
\label{table:within-data-eval}
\centering
\resizebox{\columnwidth}{!}{%
	\begin{tabular}{l|c|c|c|c} 
	\hline
	\textbf{Method}  & \textbf{Pre-Train} &\textbf{Task} & \textbf{Frozen} & \textbf{MAE $\downarrow$}  \\ 
	  & \textbf{Data} & \textbf{Data} &  & \textbf{(degrees)}  \\
	\shline
	w/o Pre-training    & EVE  &  MiniEVE  & \xmark & 8.47  \\
    % \shline
    Autoencoder    & EVE  &  MiniEVE  & \xmark  & 6.91 \\
    % \hline
    Novel View Synthesis~\citep{rhodin2018unsupervised}    & EVE  &  MiniEVE & \cmark  & 6.79 \\
    % \hline
    BYOL~\citep{grill2020bootstrap}    & EVE  &  MiniEVE & \xmark &  8.35 \\ 
    SimCLR~\citep{chen2020simple}    & EVE  &  MiniEVE & \cmark & 6.57  \\ 
    % \hline
    \textbf{GazeCLR  (Equiv)}    & EVE  &  MiniEVE  &\cmark & \textbf{4.83}  \\ 
    % \hline
    \textbf{GazeCLR (Inv+Equiv)}    & EVE  &  MiniEVE & \cmark & \underline{4.92}  \\ 
    % \shline
    \textcolor{gray}{Fully-Supervised}    & -  &  EVE & \xmark & 4.15  \\ 
	\shline
	\end{tabular}
	}
\end{table*}


\subsection{Transfer Learning/Cross-dataset Evaluation} %/Cross-dataset Evaluation} 
\label{subsec:crossdataeval}
We perform a cross-dataset evaluation using few-shot personalized gaze estimation to further demonstrate the cross-data generalization capabilities of the learned representations. We evaluate \gazeclr{} representations on two datasets from domains different from the pre-training data: MPIIGaze~\citep{zhang2015appearance} and Columbia~\citep{smith2013gaze}. \textbf{MPIIGaze} is a challenging dataset that has higher inter-subject variations. We use the standard evaluation subset MPIIFaceGaze~\citep{zhang2017s}, containing around 37667 images captured from 15 subjects. The \textbf{Columbia} dataset consists of 5880 images collected from 56 subjects and is known to have high head pose variations. 
% 4-fold,
% and leave-one-out 15-fold cross validation evaluation was used for Columbia, and MPIIGaze respectively. 
\begin{figure*}[t!]
    \centering
    \includegraphics[width=\columnwidth]{images/barplot_2.png}
    \caption{\textbf{Transfer Learning Evaluation.} Performance evaluation using  \textit{Linear Layer Training (LLT)} protocol for both MPIIGaze and Columbia dataset under different few-shot settings. Each bar is computed by averaging over 10 runs. Best viewed in color.}
    \label{fig:crossdataeval}
\end{figure*}

To measure the quality of learned representations, we use \textit{Linear Layer Training (LLT)} protocol, in which we freeze the trained encoder and learn a linear regressor on the target dataset. For this experiment, we investigate under a few-shot setting where we sample a few calibration samples from the test subject for adaptation and evaluate on the remaining samples of the same test subject.
% Two evaluation protocols are used to measure the quality of learned representations: (a) \textit{Linear Layer Training (LLT)}, in which we freeze the trained encoder and learn a linear regressor on the target dataset, and (b) \textit{Finetuning (FT)} fine-tunes the entire network in an end-to-end manner on the target dataset. We investigate both protocols under a few-shot setting where we sample a few calibration samples from the test subject for adaptation, and evaluate on the remaining samples of the same test subject.
% \paragraph{Linear Layer Training (LLT).} 

Figure~\ref{fig:crossdataeval} shows the mean angular errors for the LLT protocol on 20-shot, 50-shot, and 64-shot gaze estimation. We first extract the gaze representations of a few calibration samples for each subject and learn a linear model on top of these representations. We evaluate the trained model on the remaining samples of the subject. We repeat this procedure $10$ times for each subject on both datasets and report the resulting mean angular errors in Figure~\ref{fig:crossdataeval}. 

Observe that both proposed \gazeclr{} variants outperform all other baselines in all few-shot settings for both datasets. Moreover, \textit{\gazeclr{} (Equiv)} gives a relative improvement of around $17.2\%$ over SimCLR with only $20$ calibration samples for Columbia. We hypothesize that this behavior is due to high head-pose variations within Columbia, and it suggests that: a) learning equivariance over multiple views is beneficial for the \gazeclr{} framework in improving performance, and b) \gazeclr{} representations generalize better to cross-domain datasets than those of other baselines. 
% Additional results are provided in the supplementary materials.

% \begin{table*}[]
% \caption{\textbf{Finetuning (FT).} Comparison of various baselines for the \textit{Finetuning} experimental protocol on various few-shot settings, for both MPIIGaze and Columbia. %Here, we fine-tune whole end-to-end network and utilize few calibration samples during test time. 
% The errors are computed from 10 runs and reported as (\meanstd{mean}{std}).}
% \label{table:cross-data-eval}
% \centering
% \resizebox{0.95\linewidth}{!}{%
% 	\begin{tabular}{l|c|c|c|c|c|c|c} 
% 	\hline
% 	   & \multicolumn{7}{c}{\textbf{MPIIGaze}} \\ 
%     \hline
% 	 \textbf{Method}  &  1 & 3 & 5 & 9 & 15 & 50  & 64 \\ 
% 	\shline  
% 	w/o Pre-training~\cite{chen2020offset}  & \meanstd{5.57}{1.60} & \meanstd{4.65}{0.71} & \meanstd{4.40}{0.40}  & \meanstd{4.22}{0.27}  & \meanstd{4.13}{0.17} & \meanstd{4.00}{0.04} & \meanstd{4.00}{0.04} \Tstrut{} \\
	
%     Autoencoder  & \meanstd{5.65}{1.60} & \meanstd{4.69}{0.76} & \meanstd{4.42}{0.45} & \meanstd{4.16}{0.21} & \meanstd{4.10}{0.16} & \meanstd{3.97}{0.05}  &  \meanstd{3.96}{0.04}\Tstrut \\
    
%     Novel View Synthesis~\cite{rhodin2018unsupervised}  & \meanstd{5.53}{1.32} & \meanstd{4.75}{0.63} & \meanstd{4.46}{0.40} & \meanstd{4.27}{0.25} & \meanstd{4.17}{0.15} &  \meanstd{4.06}{0.04} & \meanstd{4.06}{0.04}\Tstrut \\
    
%     BYOL~\cite{grill2020bootstrap}   & \meanstd{5.71}{1.63} &  \meanstd{4.71}{0.66}  & \meanstd{4.35}{0.31}  &  \meanstd{4.22}{0.21} & \meanstd{4.11}{0.15}  &    \meanstd{4.01}{0.05} & \meanstd{4.00}{0.04}  \Tstrut\\ 
    
%     SIMCLR~\cite{chen2020simple}   & \meanstd{4.87}{1.51} &  \meanstdred{3.93}{0.54} & \meanstd{3.74}{0.35} &  \meanstd{3.57}{0.24} &  \meanstd{3.47}{0.12} &  \meanstd{3.39}{0.04}  &  \meanstd{3.38}{0.03}\Tstrut \\ 
    

    
%     \textbf{GazeCLR (Equiv)} & \meanstdblue{4.70}{1.49} & \meanstdblue{3.77}{0.51}   & \meanstdblue{3.51}{0.32}  & \meanstdblue{3.39}{0.18}  & \meanstdblue{3.33}{0.11}  & \meanstdblue{3.25}{0.03}  & \meanstdblue{3.24}{0.02}  \Tstrut  \\ 
    
%     \textbf{GazeCLR (Inv+Equiv)} & \meanstdred{4.72}{1.33} &  \meanstd{3.93}{0.54}  &  \meanstdred{3.68}{0.34} &  \meanstdred{3.54}{0.19}  & \meanstdred{3.44}{0.11}  & \meanstdred{3.37}{0.03}    & \meanstdred{3.35}{0.03}\Tstrut \\ 
% 	\hline
% 	\hline
% 	   & \multicolumn{7}{c}{\textbf{Columbia}} \\ 
% 	\hline
% 	\hline \\[-2.6ex]
	
% 	w/o Pre-training~\cite{chen2020offset}  & \meanstd{6.96}{0.55} & \meanstd{5.73}{0.20}   & \meanstd{5.38}{0.14}  & \meanstd{5.23}{0.09}  & \meanstd{5.13}{0.05}  & \meanstd{5.04}{0.08}  &  \meanstd{5.00}{0.09} \Tstrut\\

%     Autoencoder  &  \meanstd{7.00}{0.57}  & \meanstd{5.79}{0.18}   &  \meanstd{5.49}{0.15} &  \meanstd{5.24}{0.07} & \meanstd{5.15}{0.04}  &  \meanstd{5.03}{0.08}  & \meanstd{5.03}{0.07} \Tstrut \\
    
%     Novel View Synthesis~\cite{rhodin2018unsupervised}  & \meanstd{7.38}{0.60} &  \meanstd{6.05}{0.22}  & \meanstd{5.78}{0.14}  &  \meanstd{5.51}{0.05} &  \meanstd{5.43}{0.06} &  \meanstd{5.33}{0.06}  &  \meanstd{5.27}{0.08}  \\
   
%     BYOL~\cite{grill2020bootstrap}   & \meanstd{6.09}{0.41} &   \meanstd{4.97}{0.22} & \meanstd{4.70}{0.13}  & \meanstd{4.55}{0.09}   & \meanstd{4.43}{0.04}  &  \meanstd{4.35}{0.05}   & \meanstd{4.34}{0.06}  \Tstrut \\ 

%     SIMCLR~\cite{chen2020simple}   & \meanstd{4.36}{0.20} &   \meanstd{3.67}{0.13} & \meanstd{3.44}{0.07}  &  \meanstd{3.34}{0.05} &  \meanstd{3.27}{0.04} &   \meanstd{3.21}{0.04}  &  \meanstd{3.19}{0.05} \Tstrut \\ 
    

%     \textbf{GazeCLR (Equiv)} & \meanstdblue{4.34}{0.25} &  \meanstdblue{3.60}{0.12}  &  \meanstdblue{3.42}{0.09} &  \meanstdblue{3.30}{0.04} &  \meanstdblue{3.26}{0.02} & \meanstdblue{3.17}{0.04}  & \meanstdblue{3.17}{0.02}  \Tstrut\\ 
    

%     \textbf{GazeCLR (Inv+Equiv)} & \meanstdred{4.54}{0.24} &  \meanstdred{3.75}{0.12}  & \meanstdred{3.59}{0.08}  &  \meanstdred{3.45}{0.05}  &  \meanstdred{3.39}{0.03} &  \meanstdred{3.31}{0.04}   & \meanstdred{3.31}{0.04}  \Tstrut\\ 
%     \shline
% 	\end{tabular}
% 	}
% \end{table*}

% \paragraph{Finetuning (FT).} In Table~\ref{table:cross-data-eval}, we present the results for FT on MPIIGaze and Columbia, where we fine-tune the whole end-to-end network. For this experiment, we adopt architecture from Chen et al.~\cite{chen2020offset}, where a subject-dependent bias term is learned along with an end-to-end network. 4-fold and leave-one-out (15-fold) evaluation protocols are used for Columbia and MPIIGaze, respectively. 

% Unlike \cite{chen2020offset},  our input is a full face image, and the backbone is a pre-trained encoder. We take a few calibration samples for each subject during inference and estimate the subject-dependent term. We evaluate performance on the remaining samples and repeat this calibration for 10 runs for each subject. Table~\ref{table:cross-data-eval} provides mean and standard deviation of angular errors over 10 runs. We compare the performance of our method with other baselines for various few-shots settings. Results demonstrate that our method consistently outperforms all other pre-training baselines, including \cite{chen2020offset} (w/o Pre-training) for all few-shot settings. This indicates the improved generalization capability of our learned representations, particularly on the MPIIGaze dataset. Also, we observe that our method is either superior or competitive with other baselines on the Columbia dataset. %  We suspect the reason is small size of Columbia dataset which effects fine-tuning the entire network.  


% \begin{figure}[t]%
%     \centering
%     \subfloat[\centering ]{{{\includegraphics[width=5cm]{images/faze_ssl_columbia.png}} }}%
%     \qquad
%     \subfloat[\centering ]{{{\includegraphics[width=5cm]{images/faze_ssl_mpiigaze.png}} }}%
%     \caption{\textbf{\gazeclr{} vs FAZE~\cite{park2019few}.} Comparison of \gazeclr{} with supervised baseline FAZE at different values of few-shots for Columbia dataset. The plot shows error bars with mean and standard error values.}%
%     \label{fig:vsfaze}%
% \end{figure}





\begin{table*}[t!]
\caption{\textbf{\gazeclr{} vs \citet{yu2020unsupervised, sun2021cross}:} Comparison of \gazeclr{} with other unsupervised gaze representation learning methods~\citep{yu2020unsupervised, sun2021cross} for 50-shot gaze estimation. $\dagger$ denotes the method that uses additional head pose information. The metric reported is mean angular errors averaged over 10 runs (in degrees).}
\label{table:basline-crossdata-eval}
\centering
\resizebox{0.85\columnwidth}{!}{%
	\begin{tabular}{l|c|c|c} 
	\shline
	Method & Pre-Train Data & MPIIGaze & Columbia \Tstrut{} \Bstrut{}\\
	\hline
	\citet{yu2020unsupervised}$^{\dagger}$ & Columbia & - &  8.9 \Tstrut{}\\
	\citet{sun2021cross} & MPIIGaze & 8.5 & - \\
	\citet{sun2021cross} & Columbia & - & 7.0 \Bstrut{}\\
	\hline
	\textbf{\gazeclr{} (Equiv)} & EVE & 7.0  &  \textbf{6.1} \Tstrut{}\\
    \textbf{\gazeclr{} (Inv+Equiv)} & EVE & \textbf{6.5}  & 6.6 \Bstrut{}\\
	\shline  

	\end{tabular}
	}
\end{table*}

\begin{figure}[h!]%
    \centering
    \includegraphics[scale=0.6]{images/faze_ssl_columbia.png}%
    \caption{\textbf{\gazeclr{} vs FAZE~\citep{park2019few}.} Comparison of  \gazeclr{} with supervised pre-training baseline (FAZE) for various few-shot settings, on the Columbia dataset. The plot shows mean angular error (MAE, in degrees) and standard error bars \textit{versus} number of few-shot samples, reported after $10$ runs.}%
    \label{fig:vsfaze}%
\end{figure}

 \subsection{Comparison with state-of-the-art gaze representation learning} 
\label{subsec:soacompare}
We further compare \gazeclr{} with existing state-of-the-art  unsupervised~\citep{yu2020unsupervised, sun2021cross} and  supervised~\citep{park2019few} gaze representation learning methods. For a fair comparison, we adopt the same evaluation protocols as used by these baseline methods and compare the \gazeclr{} performance against their performance. % in the original paper. 

\paragraph{\gazeclr{} vs. Unsupervised Pre-training~\citep{yu2020unsupervised, sun2021cross}.}
We follow the same evaluation protocol as \citet{yu2020unsupervised}. 5-fold and leave-one-out (15-fold) evaluations are used for the Columbia and MPIIGaze datasets, respectively. In each fold, we freeze the \gazeclr{} encoder, extract representations for $50$ randomly selected samples with annotations, and learn a simple MLP-based gaze estimator on top of them. We repeat the performance evaluation $10$ times and report mean angular errors in Table~\ref{table:basline-crossdata-eval}. Note that previous methods~\citep{yu2020unsupervised, sun2021cross} exploit left and right eye patches to get the SSL signal, whereas our approach relies on face patches obtained from multiple camera viewpoints. %To our best knowledge, we lack the baseline for unsupervised gaze representation on full face images, thus it is difficult for us to compare.  


In Table~\ref{table:basline-crossdata-eval}, we compare against the best-performing models of \citet{yu2020unsupervised} and~\citet{sun2021cross}, for the 50-shot gaze estimation. 
Notice that our method outperforms baselines with absolute improvements of $2^{\circ}$ and $0.9^{\circ}$ on MPIIGaze and Columbia, respectively. It is worth emphasizing that our method is pre-trained on a different dataset than both evaluation datasets, unlike baseline approaches. Again, this illustrates the strength of our approach in learning semantically meaningful representations that are generalizable to other domains. Moreover, note that \citet{yu2020unsupervised} use additional head-pose information, unlike our method.

\paragraph{\gazeclr{} vs. Supervised Pre-training~\citep{park2019few}.} We evaluate the effectiveness of \gazeclr{} representations using the MAML framework~\citep{finn2017model}, similar to FAZE~\citep{park2019few}. For both \gazeclr{} and FAZE, we train a MAML-based gaze estimator on the representations for subjects from the GazeCapture~\citep{krafka2016eye} dataset. Then, we adapt the gaze estimator model to each test subject of Columbia with $k$ calibration samples and test on the remaining samples. 
% Among 15 subjects of the MPIIGaze dataset, the last 500 images are reserved for testing, while $k$ calibration samples are randomly selected for adaptation purposes. 
% We randomly select $k$ calibration samples for each subject of Columbia for adaptation purpose and test on remaining samples.
Figure~\ref{fig:vsfaze} depicts the performance comparison of \gazeclr{} with FAZE~\citep{park2019few} for four different values of $k$. It can be seen that our method consistently outperforms the supervised pre-training baseline FAZE for all values of $k$. Notably, our framework uses \textit{zero} labeled information to obtain gaze representations, unlike FAZE, which is pre-trained using $\sim$2M labeled samples from the GazeCapture dataset. 



\subsection{Visualization of Gaze Representations}
\label{subsec:viz}
To further investigate the quality of learned representations, we project the gaze representations into two dimensions using the t-SNE~\citep{van2008visualizing} algorithm, as shown in Figure~\ref{fig:tsne}. In Fig.~\ref{fig:tsne}(a), we compute a 2D visualization of equivariant representations obtained after applying rotation matrices, i.e., $\bar{z}$. Projections in Fig.~\ref{fig:tsne}(a) clearly demonstrate that gaze direction is invariant to the viewpoint, as images at the same timestamp from different views are mapped closer (shown with the same color border). In Fig.~\ref{fig:tsne}(b), we apply the t-SNE algorithm on gaze representations obtained at the output of the encoder network, i.e., $z = \text{E}(\cdot)$, for images from a single camera viewpoint. Projections corresponding to roughly similar gaze directions are naturally clustered and highlighted with different background colors. Also, we observe clear patterns in the learned feature space where images within close vicinity are invariant to the subject's identity, showing invariance towards appearances.


\begin{figure*}[t]%
    \centering
    \subfloat[\centering Representations after applying rotation matrices, i.e., $\bar{z}$ ]{{\fbox{\includegraphics[width=0.45\columnwidth, height=0.45\columnwidth]{images/multiview_tsne_5.png}} }}%
    \qquad
    \subfloat[\centering Representation obtained from the output of encoder, $z = \text{E}(\cdot)$ ]{{\fbox{\includegraphics[width=0.45\columnwidth, height=0.45\columnwidth]{images/singleview_tsne.png}} }}%
    \caption{\textbf{t-SNE visualization.} Qualitative visualization of gaze representations in 2-dimensional space using the t-SNE algorithm. (a) shows the visualization of projection embeddings for multi-view images obtained after applying rotation matrices, i.e., $\bar{z}$. 
    The images with the same timestamp for all four views are highlighted using the same border color.
    (b) depicts representations for the output of encoder network, i.e., $z = \text{E}(\cdot)$ obtained for images from single camera viewpoint.
    Best viewed in color and after zooming.
    }%
    \label{fig:tsne}%
\end{figure*}
