\documentclass[pmlr]{jmlr}% new name PMLR (Proceedings of Machine Learning Research)

 % The following packages will be automatically loaded:
 % amsmath, amssymb, natbib, graphicx, url, algorithm2e

 %\usepackage{rotating}% for sideways figures and tables
%\usepackage{longtable}% for long tables

 % The siunitx package is used by this sample document
 % to align numbers in a column by their decimal point.
 % Remove the next line if you don't require it.
%\usepackage[load-configurations=version-1]{siunitx} % newer version
%\usepackage{siunitx}

\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors
\usepackage{textcomp}
\usepackage{gensymb}
\usepackage{amssymb}
\usepackage{dsfont}
\usepackage{placeins}  % for appendix figures to be in single section
%\usepackage[pdftex]{graphicx}
%\usepackage{float}          % subplot
%\usepackage{subfig}

\usepackage{amsmath}
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}


 % The following command is just for this sample document:
\newcommand{\cs}[1]{\texttt{\char`\\#1}}

 % Define an unnumbered theorem just for this sample document:
\theorembodyfont{\upshape}
\theoremheaderfont{\scshape}
\theorempostheader{:}
\theoremsep{\newline}
\newtheorem*{note}{Note}

 % change the arguments, as appropriate, in the following:
\jmlrvolume{1}
\jmlryear{2022}
\jmlrworkshop{NeurIPS 2022 Gaze Meets ML Workshop}

\title[Selection of XAI Methods Matters]{Selection of XAI Methods Matters: Evaluation of Feature Attribution Methods for Oculomotoric Biometric Identification}

\author{
 \Name{Daniel Krakowczyk\nametag{\thanks{Corresponding Author}}} \Email{daniel.krakowczyk@uni-potsdam.de}\\
 \addr Department of Computer Science, University of Potsdam, 14476 Potsdam, Germany
 \AND
 \Name{David R. Reich} \Email{david.reich@uni-potsdam.de}\\
 \addr Department of Computer Science, University of Potsdam, 14476 Potsdam, Germany
 \AND
 \Name{Paul Prasse} \Email{paul.prasse@uni-potsdam.de}\\
 \addr Department of Computer Science, University of Potsdam, 14476 Potsdam, Germany
 \AND
 \Name{Sebastian Lapuschkin} \Email{sebastian.lapuschkin@hhi.fraunhofer.de}\\
 \addr Fraunhofer Institute for Telecommunications, Heinrich Hertz Institute, 10587 Berlin, Germany
 \AND
 \Name{Tobias Scheffer} \Email{tobias.scheffer@uni-potsdam.de}\\
 \addr Department of Computer Science, University of Potsdam, 14476 Potsdam, Germany
 \AND
 \Name{Lena A. Jäger} \Email{jaeger@cl.uzh.ch}\\
 \addr Department of Computational Linguistics, University of Zurich, CH-8006 Zürich, Switzerland
}

\editor{Editor's name}
% \editors{List of editors' names}

\begin{document}

\maketitle


\begin{abstract}
    Substantial advances in oculomotoric biometric identification have been made due to deep neural networks processing non-aggregated time series data that replace methods processing theoretically motivated engineered features. However, interpretability of deep neural networks is not trivial and needs to be thoroughly investigated for future eye tracking applications. Especially in medical or legal applications, explanations can be required to be provided alongside predictions. In this work, we apply several attribution methods to a state-of-the-art model for eye movement-based biometric identification. To assess the quality of the generated attributions, we focus on a quantitative evaluation using a range of established metrics. We find that Layer-wise Relevance Propagation generates the least complex attributions, while DeepLIFT attributions are the most faithful. Due to the absence of a correlation between attributions of these two methods, we advocate considering both methods for their potentially complementary attributions.
\end{abstract}

\section{Introduction}
\label{sec:introduction}








Eye movements are known to reflect cognitive processes that include attentional mechanisms~\citep{justcarpenter1976,henderson2003human}; they are therefore considered to be a \textit{window on the mind and brain}~\citep{vanGompel2007eye}. Eye movements can serve as a basis to automatically screen for ADHD~\citep{Deng2022}, dyslexia~\citep{nilsson2016screening,raatikainen2021detection}, and autism spectrum disorder~\citep{alcaniz2022eye}. Since eye movements are also known to be highly individual~\citep{Noton1971}, they can be used as a biometric characteristic~\citep{KasprowskiOber2004,Bednarik2005,Makowski2021}. 

For medical screening applications, it is imperative that a machine-learning model can be understood to detect the actual evidence of the condition of interest rather than Clever Hans signals or social biases inherent in the training data. Also for biometric identification, it is highly relevant to analyze which signals the model reacts to in order to understand vulnerabilities, biases, and aspects of fairness of the model~\citep{fairness}.

In recent years, deep neural networks that process raw gaze-velocity data have achieved dramatic performance increases compared to machine learning on engineered features. For example, recent deep neural network architectures for oculomotoric identification reduce the time for an identification and the error rate by one order of magnitude compared to identification based on engineered saccadic features~\citep{Makowski2021,Lohr2022}.
Engineered eye-gaze features are often derived from findings from neurophysiological research~\citep{Schleicher2008,rigas2018study}, thus analyses of feature importance of models that rely on such features are meaningful for experts. For deep neural networks, many feature attribution methods have been developed~\citep{Bach2015,Sundararajan2017,Shrikumar2017,Smilkov2017}. 
Unfortunately, interpreting such complex models is hard, and deep neural networks are well known to be black boxes due to the complexity of their non-linear activations.

While a wide range of quantitative performance metrics exist for explainability methods, there is no consensus about the merit of each of these metrics. Metrics quantify the complexity~\citep{Bhatt2020,Chalasani2020,Nguyen2020}, faithfulness~\citep{Samek2015}, robustness~\citep{Alvarez-Melis2018} or localization~\citep{Kohlbrenner2020, Theiner2022} of attributions.
Image data remains the most popular application domain of these approaches. Unlike gaze data, image data naturally lends itself well to human inspection of attributions, and therefore the evaluation of explainability usually relies in no small part on the plausibility of visualizations of attributions~\citep{adebayo2018sanity,bylinskii2018different}.

Prior work on applying feature attributions to eye gaze based convolutional neural networks simply quantifies the overall importance of input channels by channel-wise aggregation of feature attributions~\citep{Deng2022}.
Nevertheless, to date there exists no extensive analysis of feature attributions applied to eye-gaze-based models.
As a first step towards trusted models and explanations in deep learning based eye gaze applications, in this paper we will therefore rigorously evaluate feature attribution methods with respect to complexity, faithfulness, robustness, and consistency across methods.

In this paper we will restrict ourselves to the task of oculomotoric biometrics, as this is where these models exhibit top performance and it is intuitive that good explanations will also need good predictions~\citep{Kindermans2019}.
We will train the state-of-the-art neural network \emph{Eye Know You Too}~\citep{Lohr2022} on the three publicly available datasets \emph{GazeBase}~\citep{Griffith2021}, \emph{JuDo1000}~\citep{Makowski2020b} and the \emph{Potsdam Textbook Corpus}~\citep{Jaeger2021b}.
We will apply a range of feature attribution methods to the trained models, namely \emph{DeepLIFT}~\citep{Shrikumar2017}, \emph{Integrated Gradients}~\citep{Sundararajan2017} and \emph{Layer-Wise Relevance Propagation}~\citep{Bach2015}, as well as \emph{Input X Gradient}~\citep{Shrikumar2016}.
We will further quantitatively evaluate these attributions by a variety of established metrics to assess desired properties like complexity, sensitivity, faithfulness and robustness.
We will finally evaluate the agreement of attributions across different attribution methods.

The main contributions of this paper are:
\begin{itemize}
    \item we generate and visualize commonly used attribution methods for a state-of-the-art oculomotoric biometric model,
    \item we present the first work to evaluate attributions of oculomotoric biometric models on several established metrics and three different real-world datasets,
    \item we evaluate the agreement between the generated attributions across the attribution methods.
\end{itemize}

\section{Materials and Methods}
\label{sec:materials-methods}

This section is structured as follows: Subsection~\ref{sec:biometric-models} introduces biometric identification and the biometric model under investigation, whereas Subsection~\ref{sec:attribution-methods} lists and briefly describes the applied attribution methods. In Subsection~\ref{sec:attribution-metrics} we give an overview of the employed attribution metrics and in Subsection~\ref{sec:data-sets} we present the datasets on which the attribution methods are evaluated. Subsection~\ref{sec:data-preprocessing} lays out the data preprocessing steps and Subsection~\ref{sec:evaluation-protocol} describes the underlying evaluation protocol for each dataset.

\subsection{Eye Tracking-Based Biometric Identification}
\label{sec:biometric-models}
We investigate the explainability of a state-of-the-art neural network model for oculomotoric biometric identification, namely \emph{EyeKnowYouToo}, developed by~\citet{Lohr2022}. Given a known population of individuals, we investigate a \emph{multi-class classification} setting, where the model is trained using eye movement recording sequences $\{(x_0,y_0), \ldots, (x_n, y_n)\}$ of each user, where $x_i$ and $y_i$ are the yaw (horizontal) and pitch (vertical) eye movement velocities.

\emph{EyeKnowYouToo}~\citep{Lohr2022} is a DenseNet-based architecture~\citep{Huang2018} that uses the raw sequences of yaw and pitch angular velocities as input. This end-to-end dilated convolutional network is trained to minimize a multi-similarity loss along with the categorical cross-entropy. We use an extended version of Dillon Lohr's model implementation in PyTorch~\citep{Lohr2022}.


\subsection{Attribution Methods}
\label{sec:attribution-methods}


Feature attribution methods are local post-hoc explainability methods that reflect feature importance by attributing positive or negative values to each input feature of a specific model prediction. This facilitates interpretability of given predictions by highlighting relevant parts of the input signal. Attribution methods can be divided into perturbation-based and backpropagation-based methods~\citep{Ancona2017}.
Due to the computationally expensive approach of perturbation-based methods like SHAP~\citep{Lundberg2017} or Occlusion~\citep{Zeiler2013} we limit this study to the evaluation of the backpropagation-based attribution methods Input x Gradient~\citep[IxG,][]{Shrikumar2016}, Integrated Gradients~\citep[IG,][]{Sundararajan2017}, DeepLIFT~\citep[DL,][]{Shrikumar2017} and Layer-wise relevance propagation~\citep[LRP,][]{Bach2015,Montavon2017,Montavon2018}. Figure~\ref{fig:attribution-methods-example} presents an example of generated attributions for each of the methods introduced in this subsection.
We use the Captum library~\citep{Kokhlikyan2020} for its IxG, IG and DL implementations and the Zennit library~\citep{Anders2021} for the implementation of LRP rules.


IxG is an early attribution method for which relevance is computed by backpropagating the prediction gradient with respect to each input feature and multiplying the gradient element-wise with the actual input.

LRP computes input relevance by backpropagating the model output to its input according to a specific set of rules. Relevance of each unit is passed down to the lower units depending on the product of activations and weights of the respective layer units and connections, while keeping the total relevance in each layer constant. Although over time a range of different relevance passing rules was proposed~\citep{Kohlbrenner2020}, we limit ourselves to the basic original LRP-$\varepsilon$~rule for a more concise presentation.
We set $\varepsilon = 0.25\,\mathrm{std}$ according to the parameter selection in \citet{Montavon2019}.
High $\varepsilon$ values will result in less attributions close to zero, and vice versa for lower $\varepsilon$ values.


DeepLIFT~\citep{Shrikumar2017} is a very similar backpropagation-based attribution method, but in contrast to LRP, an explicit baseline input is used for calculating activation reference points. Activation differences are then backpropagated layer by layer according to a set of rules.

Integrated Gradients~\citep{Sundararajan2017} is somewhat different in that it computes the gradients of the model, which makes it implementation independent. It also uses an explicit baseline, which is then stepwise linearly interpolated into the actual input at hand. For each of those interpolations and for each input feature gradients are calculated, then integrated and finally multiplied with the feature difference between baseline and actual input.

One drawback of attribution methods which use an explicit baseline is their sensitivity to its choice~\citep{Sturmfels2020}. Usually a zero or mean baseline is chosen for both of these baseline-based attribution methods, but theoretically every input which leads to a neutral output can be chosen. Another drawback of using a constant baseline is the resulting bias towards low attributions for input values close to the baseline value.



\subsection{Attribution Metrics}
\label{sec:attribution-metrics}

Apart from a qualitative visual analysis we will evaluate the generated attributions by different metrics to measure their complexity, faithfulness and robustness.
Due to the lack of ground-truth segmentation masks we omit the evaluation of attribution localization.
We use the Quantus python package~\citep{Hedstrom2022} for the calculation of all attribution metrics used in this paper.

Let $\mathbf{x}$ be an instance with $d$ features, $\mathbf{f}$ our model and $\mathbf{g}$ an explanation function, where $\mathbf{g}(\mathbf{f},\mathbf{x})$ refers to the feature attribution for the model prediction $\mathbf{f}(\mathbf{x})$. We define $\mathbf{g}(\mathbf{f},\mathbf{x})_i$ as the attribution of the $i$-th input feature. The fractional contribution distribution is defined as $\mathds{P}_\mathbf{g}(i)=\frac{\lvert\mathbf{g(f,x)}_i\rvert}{\sum_{j\in [d]}\lvert\mathbf{g(f,x)}_j\rvert}$ for $i\in [d]$, and the corresponding probability distribution is given by $\mathds{P}_\mathbf{g}=\{\mathds{P}_\mathbf{g}(i)~|~1\leq i\leq d \}$.

\paragraph{Complexity metrics} As complexity measures we use entropy~\citep[Equation~\ref{eq:entropy},][]{Bhatt2020}, sparseness~\citep[Equation~\ref{eq:sparseness},][]{Chalasani2020} and effective complexity~\citep[Equation~\ref{eq:effective-complexity},][]{Nguyen2020} which are defined by the equations below.

The entropy metric is defined as follows:
\begin{equation}
    \mu_{E}(\mathbf{f}, \mathbf{g}, \mathbf{x}) = \mathds{E} [ - \ln(\mathds{P}_\mathbf{g}) ] = - \sum_{k=1}^d \mathds{P}_\mathbf{g}(k) \ln(\mathds{P}_\mathbf{g}(k))
    \label{eq:entropy}
\end{equation}

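The sparseness metric is based on the Gini coefficient and measures the dispersion between high and low attribution values. It is defined as follows, where ${\mathbf{g}(\mathbf{f}, \mathbf{x})}_k$ denotes the $k$-th smallest absolute attribution value: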
\begin{equation}
    \mu_{S}(\mathbf{f}, \mathbf{g}, \mathbf{x}) = 1 - 2 \sum_{k=1}^d \frac{{\mathbf{g}(\mathbf{f}, \mathbf{x})}_k}{\| \mathbf{g}(\mathbf{f}, \mathbf{x}) \|_1} \frac{d - k + 0.5}{d}
    \label{eq:sparseness}
\end{equation}

Finally the effective complexity measures the amount of absolute attribution values above a certain threshold $\varepsilon$ and is defined as follows:
\begin{equation}
    \mu_{EC}(\mathbf{f}, \mathbf{g}, \mathbf{x}) = \frac{1}{d} \Bigl| \Bigl\{ ~ a \in \mathbf{g}(\mathbf{f}, \mathbf{x}) ~|~ \lvert a \rvert > \varepsilon ~ \Bigr\} \Bigr|
    \label{eq:effective-complexity}
\end{equation}

\paragraph{Faithfulness metric} To measure faithfulness we use the region perturbation metric proposed by~\citet{Samek2015}. In this metric we iteratively perturb the input instance $\mathbf{x}$ by non-overlapping patches which are sorted in descending order by the sum of their inner feature attribution values. The perturbed instance at step $i$ is denoted as $\mathbf{x}_{\text{MoRF}}^{(i)}$.

By iterating through a number of steps $N$ we can thus generate a mean perturbation curve for a set of instances. The underlying intuition is that perturbing features with high-scoring attributions should lead to a steep drop in target output $\mathbf{f}(\mathbf{x})$ if the evaluated attributions are actually faithful.

To account for the drop in target output due to model robustness properties, we create a baseline where the non-overlapping order of patches is random. 
The perturbed instance for step $i$ using this random patch drawing method is denoted as $\mathbf{x}_{\text{Random}}^{(i)}$.
The overall metric score is then quantified as the area between the ordered and the random perturbation curve~(Equation~\ref{eq:region-perturbation}).

\begin{equation}
    \mu_{RP}(\mathbf{f}, \mathbf{g}, \mathbf{x}) = \frac{1}{N + 1} \sum_{i=1}^{N} \Bigl( \mathbf{f}(\mathbf{x}_{\text{Random}}^{(i)}) - \mathbf{f}(\mathbf{x}_{\text{MoRF}}^{(i)}) \Bigr)
    \label{eq:region-perturbation}
\end{equation}

\paragraph{Robustness metric} As a robustness metric we use the local Lipschitz estimate~\citep[Equation~\ref{eq:local-lipschitz-estimate},][]{Alvarez-Melis2018}.
This metric perturbs the complete input instance by superimposing noise and measures the distance between the explanations generated for the perturbed and unperturbed input.

Denote by $\mathcal{N}(\mathbf{x})$ the set of perturbations of an input instance $\mathbf{x}$ obtained by adding noise drawn from a Gaussian distribution with a mean of 0 and a standard deviation of 0.1, and by $\hat{\mathbf{x}} \in \mathcal{N}(\mathbf{x})$ one such perturbed instance. Reusing the notation introduced above, we can define the local Lipschitz estimate as follows:
\begin{equation}
    \mu_{L}(\mathbf{f}, \mathbf{g}, \mathbf{x}) = \max_{\hat{\mathbf{x}} \in \mathcal{N}(\mathbf{x})} \frac{\| \mathbf{g}(\mathbf{f}, \mathbf{x}) - \mathbf{g}(\mathbf{f}, \hat{\mathbf{x}}) \|_2}{\| \mathbf{x} - \hat{\mathbf{x}} \|_2}
    \label{eq:local-lipschitz-estimate}
\end{equation}



\subsection{Datasets}
\label{sec:data-sets}

We evaluate the model and attributions on the three datasets \emph{GazeBase}~\citep{Griffith2021}, \emph{JuDo1000}~\citep{Makowski2020b} and the \emph{Potsdam Textbook Corpus (PoTeC)}~\citep{Jaeger2021b}, which are all recorded at a sampling rate of 1000~Hz.

GazeBase is a large scale data set which was gathered over the course of 3~years, consisting of 9~individual rounds with two sessions made on the same day.
Although a total number of 322~subjects is available, only 14 subjects participated in the last round. Therefore, we reduce the data set to the first 4~rounds where exactly 100~subjects are available.
All subjects participate in seven different tasks: horizontal saccade task~(HSS), video viewing task 1 and~2~(VD1~\&~VD2), a fixation task~(FXS), a random saccade task~(RAN), a reading task~(TEX) and the Balura Game~(BLG). We use all available stimuli for evaluation.

JuDo1000 is a single stimulus data set where each trial consists of the sequential presentation of five randomly placed dots, which the participants are instructed to follow visually.
The intervals between subsequent dot presentations range between 250~ms and 1~s.
The dataset consists of 150~subjects recorded in 4~sessions that are at least two weeks apart from each other.
In contrast to the other two data sets, data is recorded for both eyes (binocular).

PoTeC is a single session data set recorded on a reading task. All 75~participants are instructed to read 12~different short texts.

\subsection{Data Preprocessing}
\label{sec:data-preprocessing}

We base our data preprocessing pipeline on the method proposed by Lohr~\&~Komogortsev~\citep{Lohr2022}. We first transform positional data into velocity data by applying the Savitzky-Golay differentiation filter~\citep{SavitzkyGolay1964} with a window size of 7 and an order of 2.
We create non-overlapping subsequences with a rolling window approach where we use a window size of 1~second (1000~samples~@~1000~Hz) for \emph{JuDo1000} and \emph{PoTeC}, and a window size of 5~seconds (5000~samples~@~1000~Hz) for \emph{GazeBase}.
We exclude all subsequences which need padding or which include more than 50\% of missing values and clamp all velocities to~$\pm 1000~\degree/s$.
Further, we apply z-score normalization and finally replace all missing values with 0.
We make use of the pymovements package for preprocessing~\citep{pymovements}.


\subsection{Evaluation Protocol}
\label{sec:evaluation-protocol}

In order to evaluate the introduced attribution methods for this specific task, we apply the following protocol to each dataset:
We split the GazeBase dataset by a leave-one-round-out scheme, the JuDo1000 dataset by a leave-one-session-out scheme, and the PoTeC dataset by a leave-one-text-out scheme.

This results in a $k$-fold cross-validation protocol to which we adhere for the complete evaluation pipeline (\emph{GazeBase} and \emph{JuDo1000}: $k = 4$, \emph{PoTeC}: $k = 12$).
Each fold includes all data for the respective round/session/text as a test set and the remainder as the training set.
Model accuracy as well as attribution metrics are evaluated on the test set of each fold only.

We take the predicted class as the target class to create all attributions.
We normalize attribution values by the maximum absolute attribution value of the respective instance.

We evaluate every setting on an AMD EPYC 7742 CPU and an NVIDIA DGX A100 GPU.
We train all models using the PyTorch~\citep{torch2019} library utilizing the NVIDIA CUDA platform.
We implement the model evaluation framework using the scikit-learn~\citep{scikit-learn} machine learning package.
The code can be found online.\footnote{\url{https://github.com/aeye-lab/2022-nips-gmml-xai-eye-tracking-evaluation}}


\begin{figure}[htbp]
\floatconts
{fig:attribution-methods-example}
{\caption{Attributions generated by the employed attribution methods~(see Subsection~\ref{sec:attribution-methods}) for a single example instance out of the \emph{JuDo1000} dataset. 
 Each subfigure represents one of the four input velocity channels (from upper left to lower right: (a) yaw left eye, (b) yaw right eye, (c) pitch left eye, (d) pitch right eye). The respective channel signals are plotted as a continuous black line in the first row of each subfigure with y-axis scale from $-1000$ to $1000\,\degree/\mathrm{s}$. The remaining rows depict the generated feature attributions for the method labeled at the y-axis. Red represents positive attributions and blue represents negative attributions. All attributions are normalized to the range between $-1$ and $1$.}}% caption for whole figure
{%
\subfigure[yaw velocities of left eye]{%
\label{fig:attribution-methods-example-dxl}
\includegraphics[width=0.45\textwidth]{figures/attributions/attributions_judo_sr1000_sl1000_bxy_eky2_000005051_dxl.pdf}
}\qquad % space out the images a bit
\subfigure[yaw velocities of right eye]{%
\label{fig:attribution-methods-example-dxr}
\includegraphics[width=0.45\textwidth]{figures/attributions/attributions_judo_sr1000_sl1000_bxy_eky2_000005051_dxr.pdf}
}\qquad % space out the images a bit
\subfigure[pitch velocities of left eye]{%
\label{fig:attribution-methods-example-dyl}
\includegraphics[width=0.45\textwidth]{figures/attributions/attributions_judo_sr1000_sl1000_bxy_eky2_000005051_dyl.pdf}
}\qquad % space out the images a bit
\subfigure[pitch velocities of right eye]{%
\label{fig:attribution-methods-example-dyr}
\includegraphics[width=0.45\textwidth]{figures/attributions/attributions_judo_sr1000_sl1000_bxy_eky2_000005051_dyr.pdf}
}
}
\end{figure}

\section{Results}
\label{sec:results}

The literature has shown that a high-performing model is needed to obtain reliable explanations~\citep{Kindermans2019}.
To test the performance of the selected model \emph{Eye Know You Too}~\citep{Lohr2022}, we followed the evaluation protocol described in Section~\ref{sec:evaluation-protocol}, where we train our model to identify different identities in a multi-class setting. From our experiments we can conclude that our model has a reasonably high mean accuracy of above $90\,\%$ on all datasets, yielding state-of-the-art performance (see Appendix~\ref{apx:model-accuracy} for details). The remainder of this section is structured as follows: Section~\ref{sec:attribution-qualitative-anaylsis} evaluates the different attribution methods in a qualitative manner, whereas the attribution methods are evaluated in a quantitative manner in Sections~\ref{sec:attribution-complexity}--\ref{sec:attribution-robustness}. In Section~\ref{sec:attribution-agreement} we evaluate the agreement of the different attribution methods.


\subsection{Qualitative Attribution Analysis}
\label{sec:attribution-qualitative-anaylsis}

We start the evaluation of attributions by a qualitative visual analysis to put the subsequent quantitative metrics into perspective.
Due to the lack of space we unfortunately have to limit the presentation to a single instance taken from the binocular \emph{JuDo1000} dataset which is presented in Figure~\ref{fig:attribution-methods-example}.
In order to get a more complete impression of the generated feature attributions, we refer the interested reader to a selection of additional instances of all three datasets in Appendix~\ref{apx:attribution-methods-examples}.
Apart from the cases where we specifically point to a feature of the given example instance, the general observations hold true for the vast majority of instances across datasets.
%Unfortunately it is not manageable for a small team of researchers to visually inspect the complete dataset.

We first start with a short inspection of the input signal.
There exist absolute velocity peaks at the time steps at roughly 100~ms, 350~ms, 600~ms and 875~ms.
Although the peaks are reasonably time-aligned across channels, they differ in size depending on the respective input channel.
This stems from differences in yaw and pitch direction of the underlying eye movement event but also from deviations between both eyes.
We further observe some low velocity oscillations above the noise floor in the pitch channels of both eyes during the first 100~ms.

When observing the generated attributions we first note the tendency of very high absolute attribution values being time-aligned with the previously identified high velocity eye movements.
Still, depending on the attribution method, low attribution values are clearly present in-between these high velocity eye movements.
Only LRP exhibits close to zero attribution values at almost all input features not in the temporal vicinity of the identified velocity peaks.

Regarding the consistency of top attribution values among the attribution methods, we observe that the methods attribute the pitch velocity of the left eye at about 350~ms (Figure~\ref{fig:attribution-methods-example-dyl}) as the most important input feature. Further, we see consistent peaks at about 600~ms in both yaw channels (Figures~\ref{fig:attribution-methods-example-dxl}~and~\ref{fig:attribution-methods-example-dxr}).

We observe that LRP does not attribute high values to the previously identified low velocity oscillations during the first 100~ms of both pitch velocity channels, while the three other attribution methods attribute much higher values, especially for the left eye~(Figure~\ref{fig:attribution-methods-example-dyl}).

Another aspect that we note is the seeming ambivalence between positive and negative attributions.
Positive attributions express a positive influence of a specific input feature towards the target class, and vice versa with negative attributions.
We notice this ambivalence also across channels, for example the IG and LRP attributions during the velocity peak at about 300~ms (left eye pitch velocity in Figure~\ref{fig:attribution-methods-example-dyl}) have opposing signs for the rise and fall of the input velocity profile.
Taking the mean of attribution values across input channels for each time step would mitigate this issue though, as the positive attributions of each channel outweigh the negative ones at this time step.


\subsection{Attribution Complexity}
\label{sec:attribution-complexity}

\begin{figure}
\centering

\includegraphics[width=0.99\textwidth]{figures/attribution-entropy.pdf}
\caption{Attribution entropy. The lower the better. See Section~\ref{sec:attribution-metrics} for metric definitions.\label{fig:attribution-entropy}}
\end{figure}

\begin{figure}
\centering

\includegraphics[width=0.99\textwidth]{figures/attribution-sparseness.pdf}
\caption{Attribution sparseness. The higher the better. See Section~\ref{sec:attribution-metrics} for metric definitions.\label{fig:attribution-sparseness}}
\end{figure}

\begin{figure}
\centering

\includegraphics[width=0.99\textwidth]{figures/attribution-effective-complexity.pdf}
\caption{Effective complexity with a logarithmic scale for $\varepsilon$ values. The lower the curve the better. See Section~\ref{sec:attribution-metrics} for metric definitions.\label{fig:attribution-effective-complexity}}
\end{figure}



We present the results of the quantitative complexity metrics regarding entropy $\mu_{E}$ in  Figure~\ref{fig:attribution-entropy}, sparseness $\mu_S$ in  Figure~\ref{fig:attribution-sparseness} and effective complexity $\mu_{EC}$ in  Figure~\ref{fig:attribution-effective-complexity}.

For all three complexity metrics we observe that LRP sets itself apart from the other evaluated methods.
LRP attributions consistently exhibit less entropy and are more sparse.
We further notice distinctly fewer attribution values between $1\mathrm{e}{-4}$ and 0.2 than is the case for the other methods.

Nevertheless, there is a turning point below an $\varepsilon$ of about $1\mathrm{e}{-4}$.
This shows that attribution values from LRP approach 0 more gradually, while the other methods seem to omit attribution values between 0 and $1\mathrm{e}{-4}$.
This is just a minor detail, as for a practitioner higher $\varepsilon$ threshold values in the range between $1\mathrm{e}{-3}$ and $1\mathrm{e}{-1}$ are usually much more relevant.

We observe the exact same rank order for the two best performing methods (LRP and IG) across all three complexity metrics and datasets.
There are some inconsistencies across datasets for IxG and DL, with DL performing worse on JuDo1000 and better on GazeBase and PoTeC.

LRP attributions are thus by far the least complex, followed by IG.
We note a higher variance for LRP across all datasets, and an overall increase in variance for the PoTeC dataset.



\subsection{Attribution Faithfulness}
\label{sec:attribution-faithfulness}

\begin{figure}
\centering

\includegraphics[width=0.99\textwidth]{figures/attribution-region_perturbation_s3_uniform_noabs_nonorm_morf_perturbation_curves_absolute.pdf}
\caption{Mean perturbation curves for the employed attribution methods. The mean random perturbation curve is plotted as a continuous black line. The greater the area between the random perturbation curve and the attribution perturbation curve (AOPC relative to random) the better. See Section~\ref{sec:attribution-metrics} for metric definitions.\label{fig:attribution-region-perturbation-curves}}
\end{figure}

\begin{figure}
\centering

\includegraphics[width=0.99\textwidth]{figures/attribution-region_perturbation_s3_uniform_noabs_nonorm_morf_aopc_boxplot.pdf}
\caption{Boxplot for the AOPCs relative to the random perturbation curve ($\mu_{RP}$). The higher the better. See Section~\ref{sec:attribution-metrics} for metric definitions.\label{fig:attribution-region-perturbation-boxplot}}
\end{figure}


We continue our quantitative attribution evaluation with the faithfulness measure of the region perturbation metric $\mu_{RP}$.
As described in the respective metric paragraph in Subsection~\ref{sec:attribution-metrics} we increasingly perturb the input instance on non-overlapping patches and measure the model output difference for the respective target output.

Figure~\ref{fig:attribution-region-perturbation-curves} depicts the mean perturbation curves for each attribution method together with a random perturbation curve.
We observe a very similar curve shape across all attribution methods, with LRP being consistently less steep than the other methods.
This is in concordance with the boxplot in Figure~\ref{fig:attribution-region-perturbation-boxplot}, where we can further see that rank order is preserved across all three datasets.
DL and IG are thus the most faithful methods regarding this evaluation.
All perturbation curves of each method converge to the random perturbation curve before about 30~\% of perturbed input.

However, the boxplot in Figure~\ref{fig:attribution-region-perturbation-boxplot} exposes a relatively high variance for all methods, where mean scores span an interval that is less than 1.5~times the interquartile range.
This is especially true for the PoTeC dataset, which exhibits a much bigger variance than the other datasets.


\subsection{Attribution Robustness}
\label{sec:attribution-robustness}

\begin{figure}
\centering

\includegraphics[width=0.99\textwidth]{figures/attribution-local-lipschitz-estimate.pdf}
\caption{Local Lipschitz estimate. The lower the better. See Section~\ref{sec:attribution-metrics} for metric definitions.\label{fig:attribution-robustness}}
\end{figure}


We evaluate the attribution methods on the robustness metric $\mu_L$, which measures the difference in attributions on noise superimposition across the whole input.
Figure~\ref{fig:attribution-robustness} depicts the respective metric results.
We observe several inconsistencies across datasets. Most noticeable are the much less robust attributions created on the PoTeC dataset, which potentially stems from the fact that this is a single-session dataset and the trained model is more susceptible to noise superimposition.
Unfortunately rank order is not preserved across datasets. The only exception is IxG which performs worst on all three datasets.

\subsection{Agreement Across Attribution Methods}
\label{sec:attribution-agreement}



\begin{figure}
\floatconts
{fig:attribution-correlation}
{\caption{Attribution correlation using Spearman's $\rho$ (a--c) and Kendall's $\tau$ coefficient (d--f). All correlation coefficients are significant with a p-value below $\alpha = 0.001$.}}
{
\centering
\subfigure[GazeBase ($\rho$)]{\includegraphics[width=0.29\linewidth]{figures/gazebase_all_sr1000_sl5000_dxy-spearman.pdf}}\qquad
\subfigure[JuDo1000 ($\rho$)]{\includegraphics[width=0.29\linewidth]{figures/judo_sr1000_sl1000_bxy-spearman.pdf}}\qquad
\subfigure[PoTeC ($\rho$)]{\includegraphics[width=0.29\linewidth]{figures/potec_sr1000_sl1000_dxy-spearman.pdf}}
\\
\vspace{0.5cm}
\subfigure[GazeBase ($\tau$)]{\includegraphics[width=0.29\linewidth]{figures/gazebase_all_sr1000_sl5000_dxy-kendall.pdf}}\qquad
\subfigure[JuDo1000 ($\tau$)]{\includegraphics[width=0.29\linewidth]{figures/judo_sr1000_sl1000_bxy-kendall.pdf}}\qquad
\subfigure[PoTeC ($\tau$)]{\includegraphics[width=0.29\linewidth]{figures/potec_sr1000_sl1000_dxy-kendall.pdf}}
}
\end{figure}


We finally evaluate the agreement across attribution methods by correlation analysis using Spearman's $\rho$ coefficient and Kendall's $\tau$ coefficient.
The corresponding correlation matrices are depicted in Figure~\ref{fig:attribution-correlation}.

We note that there is next to no attribution correlation between LRP and the other three methods.
The highest correlation is between DL and IG, followed by the correlations between IxG and IG and between IxG and DL.
Kendall correlations are consistently lower than Spearman correlations while ranks are nevertheless preserved between both correlation methods.
The results are consistent across datasets.


\section{Discussion}
\label{sec:discussion}

We quantitatively evaluated the four attribution methods DeepLIFT (DL), Input x Gradient (IxG), Integrated Gradients (IG) and Layer-wise Relevance Propagation (LRP) for complexity, faithfulness and robustness on three real world datasets and the biometric model \emph{Eye Know You Too}~\citep{Lohr2022}.

Although we identified LRP to create the least complex attributions, their faithfulness was slightly lacking in relation to the other methods, especially to DL which was the most faithful.
This relationship was shown for all three datasets and provides evidence suggesting a trade-off between complexity on the one hand and faithfulness on the other.
Less complex attributions will probably miss some important relevance relative to more complex attributions.
Nevertheless we can also construct scenarios in which complex attributions include features that were not actually important for the model decision, thus lowering their faithfulness.

Regarding attribution robustness we found low concordance across datasets. Apart from IxG being the worst performing attribution method for this metric, we are not able to generalize these results. We further note that it is imperative to use several sessions for biometric evaluation to diminish the effect of session bias, which can be a plausible reason for the decreased robustness on the \emph{PoTeC} dataset.

The challenge that has to be tackled anew for each eye tracking application is assessing which of these aspects deserves higher priority.
This largely depends on the cognitive bias of the recipient of the explanations~\citep{Bertrand2022}. Clinical experts will be potentially able to interpret more complex attributions than lay persons.

Moreover, it will be interesting to see if we can tune some of these attribution methods in such a way that an optimal trade-off between certain metrics can be found.
Especially LRP with the scalar $\epsilon$ parameter as well as its additional layer rules seems a promising candidate for such an undertaking.
Further work will also be required to assess the model influence on the resulting attributions.

We have further shown that LRP has close to no correlation with the other attribution methods when it comes to the two employed rank correlation coefficients Spearman's $\rho$ and Kendall's $\tau$.
Although the non-correlation may seem like an issue at first, it can also be beneficial for some applications to have uncorrelated attributions due to the prospect of highlighting features that complement each other.
From the correlations we conclude that LRP can complement DL or IG well enough. Due to the slightly higher faithfulness of DL we would advocate for an ensemble of LRP and DL.

Regarding the qualitative analysis of the attribution methods we found a tendency for time-alignment of high velocity peaks and high absolute attribution values for all four attribution methods.
This is in concordance with previous literature in which the predictive quality of high velocity eye movement statistics is shown to be high~\citep{scanpath_biometric,analyze_biometrics,rigas2016biometric}.
Nevertheless, we can also identify low-velocity regions that do not correspond to common eye movement types but still exhibit low to mid attribution values across DL, IxG and IG.
We therefore expect that applying the discussed attribution methods to current and future eye tracking applications will further enhance the still ongoing visual analysis in the field and can potentially help in discovering new types of eye movement features.

We further identified some limitations of this work.
First and most obvious, qualitative visual analysis can only analyze a small subset of the data due to limited human resources.
This leaves room for undiscovered phenomena and the criticism of cherry picking example instances.
On the one hand qualitative visual analysis by human experts cannot be completely replaced when evaluating attributions, as humans will be the recipients of explanations and there is no way to generally predict human preference on such a broad research problem.

On the other hand, some aspects of the undertaken qualitative analysis can be performed computationally, especially the time-alignment analysis between high absolute velocities and attribution values.
Moreover, feature attributions by themselves lack interpretability, especially in tasks where models exhibit better-than-human performance due to the complexity of the input space.
As future work we therefore propose to employ eye movement detection algorithms to quantify attribution localizations in regard to these human interpretable features which can be extracted by computational models.

Last but not least, we visually identified attribution ambivalence for all four attribution methods, where positive and negative attribution values are in close vicinity.
This currently leads to issues in interpretability, as these contradictory attributions are hard to interpret.
Issues in interpretability of these attributions are especially severe for models with subpar prediction performance, as provision of explanations biases the recipient towards accepting the decision~\citep{Jakubik2022}.
Further analysis will be needed to correctly assess this issue together with its root cause.



\section{Conclusion}
\label{sec:conclusion}

We have quantitatively evaluated the introduced attribution methods in regard to complexity, faithfulness and robustness on three real world datasets.
While Layer-wise relevance propagation exhibits low complexity, attributions generated by DeepLIFT are most faithful.
Due to the non-correlation of both methods we advocate for considering both methods for their potentially complementary attributions.

Although we identify similarities across attribution methods through visual analysis and quantitative metrics, we also identify differences and conclude that the selection of the respective attribution method will have decisive influence on derived conclusions.

This work therefore is the starting point and a possible baseline for a line of future research in the eye tracking community.
We see future work regarding the tuning of attribution methods and models for achieving better metric results, and improving on human interpretability of attributions through existing eye movement concepts in the psychological literature.
We propose that future publications on models in eye tracking research increasingly include measures of explainability in their model evaluation protocols to facilitate and assess the usefulness of the systems in real world problems.

\acks{This work was partially funded by the German Federal Ministry of Education and Research (grant 01$\vert$S20043).}


\bibliography{bibliography}

\newpage
\appendix

\FloatBarrier

\section{Model Accuracy}
\label{apx:model-accuracy}

We present a multiclass accuracy boxplot for all three datasets.

\begin{figure}[htp]
\centering

\includegraphics[width=0.45\textwidth]{figures/model_accuracy_eky2_all_boxplot.pdf}
\caption{Multiclass accuracy boxplot for the \emph{Eye Know You Too} model across all three datasets.}
\end{figure}

\newpage
\FloatBarrier

\section{Additional Attribution Examples}
\label{apx:attribution-methods-examples}

This appendix section is dedicated to a brief showcase of the generated attributions for each of the three datasets.

\subsection{Attribution Examples for GazeBase}

\begin{figure}[htp]
\floatconts
{fig:attribution-examples-appendix-gazebase-1}% label for whole figure
{\caption{Attributions generated for a single example instance~(id~=~574) out of the GazeBase dataset. See caption in Figure~\ref{fig:attribution-methods-example} for a complete description.}}
{%
\subfigure[yaw velocity]{%
\includegraphics[width=0.45\textwidth]{figures/attributions/attributions_gazebase_all_sr1000_sl5000_dxy_eky2_000000574_dx.pdf}
}\qquad % space out the images a bit
\subfigure[pitch velocity]{%
\includegraphics[width=0.45\textwidth]{figures/attributions/attributions_gazebase_all_sr1000_sl5000_dxy_eky2_000000574_dy.pdf}
}
}
\end{figure}


\begin{figure}[htp]
\floatconts
{fig:attribution-examples-appendix-gazebase-2}% label for whole figure
{\caption{Attributions generated for a single example instance~(id~=~2333) out of the GazeBase dataset. See caption in Figure~\ref{fig:attribution-methods-example} for a complete description.}}
{%
\subfigure[yaw velocity]{%
\includegraphics[width=0.45\textwidth]{figures/attributions/attributions_gazebase_all_sr1000_sl5000_dxy_eky2_000002333_dx.pdf}
}\qquad % space out the images a bit
\subfigure[pitch velocity]{%
\includegraphics[width=0.45\textwidth]{figures/attributions/attributions_gazebase_all_sr1000_sl5000_dxy_eky2_000002333_dy.pdf}
}
}
\end{figure}


\begin{figure}[htp]
\floatconts
{fig:attribution-examples-appendix-gazebase-3}% label for whole figure
{\caption{Attributions generated for a single example instance~(id~=~13099) out of the GazeBase dataset. See caption in Figure~\ref{fig:attribution-methods-example} for a complete description.}}
{%
\subfigure[yaw velocity]{%
\includegraphics[width=0.45\textwidth]{figures/attributions/attributions_gazebase_all_sr1000_sl5000_dxy_eky2_000013099_dx.pdf}
}\qquad % space out the images a bit
\subfigure[pitch velocity]{%
\includegraphics[width=0.45\textwidth]{figures/attributions/attributions_gazebase_all_sr1000_sl5000_dxy_eky2_000013099_dy.pdf}
}
}
\end{figure}


\begin{figure}[htp]
\floatconts
{fig:attribution-examples-appendix-gazebase-4}% label for whole figure
{\caption{Attributions generated for a single example instance~(id~=~25658) out of the GazeBase dataset. See caption in Figure~\ref{fig:attribution-methods-example} for a complete description.}}
{%
\subfigure[yaw velocity]{%
\includegraphics[width=0.45\textwidth]{figures/attributions/attributions_gazebase_all_sr1000_sl5000_dxy_eky2_000025658_dx.pdf}
}\qquad % space out the images a bit
\subfigure[pitch velocity]{%
\includegraphics[width=0.45\textwidth]{figures/attributions/attributions_gazebase_all_sr1000_sl5000_dxy_eky2_000025658_dy.pdf}
}
}
\end{figure}


\begin{figure}[htp]
\floatconts
{fig:attribution-examples-appendix-gazebase-5}% label for whole figure
{\caption{Attributions generated for a single example instance~(id~=~39596) out of the GazeBase dataset. See caption in Figure~\ref{fig:attribution-methods-example} for a complete description.}}
{%
\subfigure[yaw velocity]{%
\includegraphics[width=0.45\textwidth]{figures/attributions/attributions_gazebase_all_sr1000_sl5000_dxy_eky2_000039596_dx.pdf}
}\qquad % space out the images a bit
\subfigure[pitch velocity]{%
\includegraphics[width=0.45\textwidth]{figures/attributions/attributions_gazebase_all_sr1000_sl5000_dxy_eky2_000039596_dy.pdf}
}
}
\end{figure}

\FloatBarrier
\subsection{Attribution Examples for JuDo1000}

\begin{figure}[htp]
\floatconts
{fig:attribution-examples-appendix-judo-1}% label for whole figure
{\caption{Attributions generated for a single example instance~(id~=~156) out of the JuDo1000 dataset. See caption in Figure~\ref{fig:attribution-methods-example} for a complete description.}}
{%
\subfigure[yaw velocities of left eye]{%
\includegraphics[width=0.45\textwidth]{figures/attributions/attributions_judo_sr1000_sl1000_bxy_eky2_000000156_dxl.pdf}
}\qquad % space out the images a bit
\subfigure[yaw velocities of right eye]{%
\includegraphics[width=0.45\textwidth]{figures/attributions/attributions_judo_sr1000_sl1000_bxy_eky2_000000156_dxr.pdf}
}\qquad % space out the images a bit
\subfigure[pitch velocities of left eye]{%
\includegraphics[width=0.45\textwidth]{figures/attributions/attributions_judo_sr1000_sl1000_bxy_eky2_000000156_dyl.pdf}
}\qquad % space out the images a bit
\subfigure[pitch velocities of right eye]{%
\includegraphics[width=0.45\textwidth]{figures/attributions/attributions_judo_sr1000_sl1000_bxy_eky2_000000156_dyr.pdf}
}
}
\end{figure}


\begin{figure}[htp]
\floatconts
{fig:attribution-examples-appendix-judo-2}% label for whole figure
{\caption{Attributions generated for a single example instance~(id~=~1578) out of the JuDo1000 dataset. See caption in Figure~\ref{fig:attribution-methods-example} for a complete description.}}
{%
\subfigure[yaw velocities of left eye]{%
\includegraphics[width=0.45\textwidth]{figures/attributions/attributions_judo_sr1000_sl1000_bxy_eky2_000001578_dxl.pdf}
}\qquad % space out the images a bit
\subfigure[yaw velocities of right eye]{%
\includegraphics[width=0.45\textwidth]{figures/attributions/attributions_judo_sr1000_sl1000_bxy_eky2_000001578_dxr.pdf}
}\qquad % space out the images a bit
\subfigure[pitch velocities of left eye]{%
\includegraphics[width=0.45\textwidth]{figures/attributions/attributions_judo_sr1000_sl1000_bxy_eky2_000001578_dyl.pdf}
}\qquad % space out the images a bit
\subfigure[pitch velocities of right eye]{%
\includegraphics[width=0.45\textwidth]{figures/attributions/attributions_judo_sr1000_sl1000_bxy_eky2_000001578_dyr.pdf}
}
}
\end{figure}


\begin{figure}[htp]
\floatconts
{fig:attribution-examples-appendix-judo-3}% label for whole figure
{\caption{Attributions generated for a single example instance~(id~=~6797) out of the JuDo1000 dataset. See caption in Figure~\ref{fig:attribution-methods-example} for a complete description.}}
{%
\subfigure[yaw velocities of left eye]{%
\includegraphics[width=0.45\textwidth]{figures/attributions/attributions_judo_sr1000_sl1000_bxy_eky2_000006797_dxl.pdf}
}\qquad % space out the images a bit
\subfigure[yaw velocities of right eye]{%
\includegraphics[width=0.45\textwidth]{figures/attributions/attributions_judo_sr1000_sl1000_bxy_eky2_000006797_dxr.pdf}
}\qquad % space out the images a bit
\subfigure[pitch velocities of left eye]{%
\includegraphics[width=0.45\textwidth]{figures/attributions/attributions_judo_sr1000_sl1000_bxy_eky2_000006797_dyl.pdf}
}\qquad % space out the images a bit
\subfigure[pitch velocities of right eye]{%
\includegraphics[width=0.45\textwidth]{figures/attributions/attributions_judo_sr1000_sl1000_bxy_eky2_000006797_dyr.pdf}
}
}
\end{figure}


\begin{figure}[htp]
\floatconts
{fig:attribution-examples-appendix-judo-4}% label for whole figure
{\caption{Attributions generated for a single example instance~(id~=~16066) out of the JuDo1000 dataset. See caption in Figure~\ref{fig:attribution-methods-example} for a complete description.}}
{%
\subfigure[yaw velocities of left eye]{%
\includegraphics[width=0.45\textwidth]{figures/attributions/attributions_judo_sr1000_sl1000_bxy_eky2_000016066_dxl.pdf}
}\qquad % space out the images a bit
\subfigure[yaw velocities of right eye]{%
\includegraphics[width=0.45\textwidth]{figures/attributions/attributions_judo_sr1000_sl1000_bxy_eky2_000016066_dxr.pdf}
}\qquad % space out the images a bit
\subfigure[pitch velocities of left eye]{%
\includegraphics[width=0.45\textwidth]{figures/attributions/attributions_judo_sr1000_sl1000_bxy_eky2_000016066_dyl.pdf}
}\qquad % space out the images a bit
\subfigure[pitch velocities of right eye]{%
\includegraphics[width=0.45\textwidth]{figures/attributions/attributions_judo_sr1000_sl1000_bxy_eky2_000016066_dyr.pdf}
}
}
\end{figure}


\begin{figure}[htp]
\floatconts
{fig:attribution-examples-appendix-judo-5}% label for whole figure
{\caption{Attributions generated for a single example instance~(id~=~19239) out of the JuDo1000 dataset. See caption in Figure~\ref{fig:attribution-methods-example} for a complete description.}}
{%
\subfigure[yaw velocities of left eye]{%
\includegraphics[width=0.45\textwidth]{figures/attributions/attributions_judo_sr1000_sl1000_bxy_eky2_000019239_dxl.pdf}
}\qquad % space out the images a bit
\subfigure[yaw velocities of right eye]{%
\includegraphics[width=0.45\textwidth]{figures/attributions/attributions_judo_sr1000_sl1000_bxy_eky2_000019239_dxr.pdf}
}\qquad % space out the images a bit
\subfigure[pitch velocities of left eye]{%
\includegraphics[width=0.45\textwidth]{figures/attributions/attributions_judo_sr1000_sl1000_bxy_eky2_000019239_dyl.pdf}
}\qquad % space out the images a bit
\subfigure[pitch velocities of right eye]{%
\includegraphics[width=0.45\textwidth]{figures/attributions/attributions_judo_sr1000_sl1000_bxy_eky2_000019239_dyr.pdf}
}
}
\end{figure}

\FloatBarrier

\subsection{Attribution Examples for Potsdam Textbook Corpus}

\begin{figure}[htp]
\floatconts
{fig:attribution-examples-appendix-potec-1}% label for whole figure
{\caption{Attributions generated for a single example instance~(id~=~2164) out of the PoTeC dataset. See caption in Figure~\ref{fig:attribution-methods-example} for a complete description.}}
{%
\subfigure[yaw velocity]{%
\includegraphics[width=0.45\textwidth]{figures/attributions/attributions_potec_sr1000_sl1000_dxy_eky2_000002164_dx.pdf}
}\qquad % space out the images a bit
\subfigure[pitch velocity]{%
\includegraphics[width=0.45\textwidth]{figures/attributions/attributions_potec_sr1000_sl1000_dxy_eky2_000002164_dy.pdf}
}
}
\end{figure}

\begin{figure}[htp]
\floatconts
{fig:attribution-examples-appendix-potec-2}% label for whole figure
{\caption{Attributions generated for a single example instance~(id~=~16137) out of the PoTeC dataset. See caption in Figure~\ref{fig:attribution-methods-example} for a complete description.}}
{%
\subfigure[yaw velocity]{%
\includegraphics[width=0.45\textwidth]{figures/attributions/attributions_potec_sr1000_sl1000_dxy_eky2_000016137_dx.pdf}
}\qquad % space out the images a bit
\subfigure[pitch velocity]{%
\includegraphics[width=0.45\textwidth]{figures/attributions/attributions_potec_sr1000_sl1000_dxy_eky2_000016137_dy.pdf}
}
}
\end{figure}


\begin{figure}[htp]
\floatconts
{fig:attribution-examples-appendix-potec-3}% label for whole figure
{\caption{Attributions generated for a single example instance~(id~=~19806) out of the PoTeC dataset. See caption in Figure~\ref{fig:attribution-methods-example} for a complete description.}}
{%
\subfigure[yaw velocity]{%
\includegraphics[width=0.45\textwidth]{figures/attributions/attributions_potec_sr1000_sl1000_dxy_eky2_000019806_dx.pdf}
}\qquad % space out the images a bit
\subfigure[pitch velocity]{%
\includegraphics[width=0.45\textwidth]{figures/attributions/attributions_potec_sr1000_sl1000_dxy_eky2_000019806_dy.pdf}
}
}
\end{figure}


\begin{figure}[htp]
\floatconts
{fig:attribution-examples-appendix-potec-4}% label for whole figure
{\caption{Attributions generated for a single example instance~(id~=~82639) out of the PoTeC dataset. See caption in Figure~\ref{fig:attribution-methods-example} for a complete description.}}
{%
\subfigure[yaw velocity]{%
\includegraphics[width=0.45\textwidth]{figures/attributions/attributions_potec_sr1000_sl1000_dxy_eky2_000082639_dx.pdf}
}\qquad % space out the images a bit
\subfigure[pitch velocity]{%
\includegraphics[width=0.45\textwidth]{figures/attributions/attributions_potec_sr1000_sl1000_dxy_eky2_000082639_dy.pdf}
}
}
\end{figure}


\begin{figure}[htp]
\floatconts
{fig:attribution-examples-appendix-potec-5}% label for whole figure
{\caption{Attributions generated for a single example instance~(id~=~83911) out of the PoTeC dataset. See caption in Figure~\ref{fig:attribution-methods-example} for a complete description.}}
{%
\subfigure[yaw velocity]{%
\includegraphics[width=0.45\textwidth]{figures/attributions/attributions_potec_sr1000_sl1000_dxy_eky2_000083911_dx.pdf}
}\qquad % space out the images a bit
\subfigure[pitch velocity]{%
\includegraphics[width=0.45\textwidth]{figures/attributions/attributions_potec_sr1000_sl1000_dxy_eky2_000083911_dy.pdf}
}
}
\end{figure}



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


\end{document}