\documentclass[pmlr]{jmlr}% new name PMLR (Proceedings of Machine Learning Research)

 % The following packages will be automatically loaded:
 % amsmath, amssymb, natbib, graphicx, url, algorithm2e

 %\usepackage{rotating}% for sideways figures and tables
\usepackage{longtable}% for long tables

 % The booktabs package is used by this sample document
 % (it provides \toprule, \midrule and \bottomrule).
 % Remove the next line if you don't require it.
\usepackage{booktabs}
 % The siunitx package is used by this sample document
 % to align numbers in a column by their decimal point.
 % Remove the next line if you don't require it.
\usepackage[load-configurations=version-1]{siunitx} % newer version
 %\usepackage{siunitx}
\usepackage{pbox}
\usepackage{multirow}
\usepackage{amssymb}
\usepackage{float}
\usepackage{graphicx}
\usepackage{gensymb}

\newcounter{magicrownumbers}
\newcommand\rownumber{\stepcounter{magicrownumbers}\arabic{magicrownumbers}}

 % The following command is just for this sample document:
\newcommand{\cs}[1]{\texttt{\char`\\#1}}

 % Define an unnumbered theorem just for this sample document:
\theorembodyfont{\upshape}
\theoremheaderfont{\scshape}
\theorempostheader{:}
\theoremsep{\newline}
\newtheorem*{note}{Note}

 % change the arguments, as appropriate, in the following:
\jmlrvolume{1}
\jmlryear{2023}
\jmlrworkshop{NeurIPS 2023 Gaze Meets ML Workshop}

\title[Detection of Drowsiness from Eye Movements]{Detection of Drowsiness and Impending Microsleep\titlebreak from Eye Movements}
 % Use \Name{Author Name} to specify the name.

 % Spaces are used to separate forenames from the surname so that
 % the surnames can be picked up for the page header and copyright footer.
 
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % *** Make sure there's no spurious space before \nametag ***

 % Two authors with the same address
 % \author{\Name{Silvia Makowski\nametag{\thanks{equal contribution}}} %\Email{silvia.makowski@uni-potsdam.de}\and
 %  \Name{Paul Prasse } \Email{paul.prasse@uni-potsdam.de}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \author{\Name{Author Name1} \Email{an1@sample.com}\\
 %  \Name{Author Name2} \Email{an2@sample.com}\\
 %  \Name{Author Name3} \Email{an3@sample.com}\\
 %  \Name{Author Name4} \Email{an4@sample.com}\\
 %  \Name{Author Name5} \Email{an5@sample.com}\\
 %  \Name{Author Name6} \Email{an6@sample.com}\\
 %  \Name{Author Name7} \Email{an7@sample.com}\\
 %  \Name{Author Name8} \Email{an8@sample.com}\\
 %  \Name{Author Name9} \Email{an9@sample.com}\\
 %  \Name{Author Name10} \Email{an10@sample.com}\\
 %  \Name{Author Name11} \Email{an11@sample.com}\\
 %  \Name{Author Name12} \Email{an12@sample.com}\\
 %  \Name{Author Name13} \Email{an13@sample.com}\\
 %  \Name{Author Name14} \Email{an14@sample.com}\\
 %  \addr Address}

 % Authors with different addresses:
 \author{\Name{Silvia Makowski}\thanks{Both authors contributed equally to this research.} \Email{silvia.makowski@uni-potsdam.de}\\
  \Name{Paul Prasse}$^*$ \Email{paul.prasse@uni-potsdam.de}\\  
  \addr University of Potsdam, Potsdam, Germany
  \AND
  \Name{Lena Ann Jäger} \Email{jaeger@cl.uzh.ch}\\
  \addr University of Zurich, Zurich, Switzerland\\
  \addr University of Potsdam, Potsdam, Germany
  \AND
  \Name{Tobias Scheffer} \Email{tobias.scheffer@uni-potsdam.de}\\
  \addr University of Potsdam, Potsdam, Germany
 }

\editor{Editor's name}
 % \editors{List of editors' names}

\begin{document}

\maketitle

\begin{abstract}
Drowsiness is a contributing factor in an estimated 12\% of all road traffic fatalities. It is known that drowsiness directly affects oculomotor control.
We therefore investigate whether drowsiness can be detected based on eye movements. To this end, we develop deep neural sequence models that exploit a person's raw eye-gaze and eye-closure signals to detect drowsiness. 
We explore three measures of drowsiness ground truth: a widely-used sleepiness self-assessment, reaction time, and impending microsleep in the near future. We find that our sequence models are able to detect drowsiness and outperform a baseline processing established engineered features. We also find that the risk of a microsleep event in the near future can be predicted more accurately than the sleepiness self-assessment or the reaction time. Moreover, a model that has been trained on predicting microsleep also excels at predicting self-assessed sleepiness in a cross-task evaluation, which indicates that upcoming microsleep is a less noisy proxy of the drowsiness ground truth. 
We investigate the relative contribution of eye-closure and gaze information to the model's performance. In order to make the topic of drowsiness detection more accessible to the research community, we collect and share eye-gaze data with participants in baseline and sleep-deprived states. 
\end{abstract}

\iffalse 
\begin{keywords}
List of keywords
\end{keywords}
\fi 

\section{Introduction}
\label{sec:intro}
The term \textit{drowsiness} refers to the transition from being clearly awake to being clearly asleep. This transition phase is already characterized by profound changes in motor control, cognition, brain activity, and consciousness~\citep{mcginley2015waking}. The European Commission estimates that drowsiness is a contributing factor in 12\% of all road accidents~\citep{EUfatigue}---these accidents account for 162,000 of the 1.35 million annual road traffic fatalities~\citep{CDCfatalities}. 

Driver cameras that are entering the automotive market are capable of extracting the driver's eye closure and eye gaze~\citep{halin2021survey}. It is known that fatigue and drowsiness are associated with an increased blink frequency~\citep{santamaria1987eeg,luckiesh1937eyelid,hoffman1946eye}; this connection may be understood as cessation of the attention-driven suppression of blinks~\citep{Schleicher2008}. Blink  duration~\citep{morris1996electrooculographic,hakkanen1999blink} and the individual standard deviation of blink rates and duration increase with increasing fatigue~\citep{Schleicher2008}. 
Motivated by these psychological findings, known approaches to drowsiness detection use features that are derived from the eye-lid closure---such as blink frequency, duration, and eye-lid velocity~\citep{wilkinson2013the,horng2004driver,nguyen2015eye,rumagit2017gazing}. 
The percentage of time in which the pupil is covered by the lid (PERCLOS)~\citep{skipper1986drowsy} tends to increase with increasing fatigue; however, individuals can willingly keep their eyes open despite being fatigued, and even while exhibiting indicators of sleep in the EEG~\citep{o1977comparison}. 

Psychological research has found features of gaze events, such as saccades and fixational micro-movements, to be correlated with drowsiness. The saccadic accuracy and peak saccadic velocity can be negatively impacted by fatigue, but with a high inter-subject variability~\citep{galley1989saccadic,hirvonen2010improving}.
\citet{Stasi2013} find that ocular stability decreases as a function of mental fatigue:  In a visual search task, the velocity of saccades and micro-saccades decreases under fatigue whereas the velocity of ocular drift increases. In a driving-simulator experiment, saccadic duration increases, saccadic speed decreases, and their standard deviations increase with increasing fatigue~\citep{Schleicher2008}.
These psychological findings, and the fact that the eye gaze can be measured by optical sensors, motivate us to explore eye-movements in addition to eye-closure signals as a predictor of drowsiness.


However, when developing machine learning methods for drowsiness detection, obtaining valid ground-truth labels poses a major methodological challenge. 
Whereas the state in which a person is clearly sleeping (sleep stage 2) can be unambiguously labeled in an electroencephalogram (EEG) recording that shows the characteristic sleep spindles or K complexes, drowsiness (sleep stage 1) cannot be directly observed. Although drowsiness has long been known to be associated with certain changes in the EEG signal~\citep{matousek1983method,dement1957cyclic,santamaria1987eeg}, it cannot be unambiguously detected from the EEG signal alone across different individuals, but additional motion or eye movement data is necessary~\citep{moser2009sleep,santamaria1987eeg}, which not only makes the manual labeling difficult \citep{rechtschaffen1968manual,ASMmanual2020}, but has led to an overwhelming number of features extracted from the EEG signal that, at the group level, all have been shown to correlate with drowsiness \citep{Stancin2021}. Moreover, in a task such as driving, the EEG signal is heavily contaminated by muscle artifacts. In sum, although for the detection of sleep stage 2 and higher, EEG is the gold-standard measurement technique to determine ground-truth values, it is not an ideal tool to determine drowsiness ground truth. 

%Sleep can be detected in the EEG based on the Alpha power. Drowsiness may be detectable in the EEG based on the Theta power for some individuals; however, for other individuals, Theta waves can be completely absent from the EEG signal. In addition, the EEG signal is contaminated by muscle movements; it is therefore not an ideal tool to determine fatigue ground truth. 

The \emph{Karolinska sleepiness scale (KSS)}~\citep{aakerstedt1990subjective} is a widely-used self-assessment, in which participants are asked at regular intervals to rate their fatigue on a scale from ``1---extremely alert'' to ``9---very sleepy, great effort to keep alert, fighting sleep''. The KSS self-assessment is generally considered a gold-standard proxy of drowsiness ground truth. It is correlated with driving errors and EEG-derived indicators of fatigue~\citep{kaida2006validation}. The drawback of the KSS score is that it is strongly subjective, and that participants can easily misjudge their own level of drowsiness. 

The \emph{psychomotor vigilance test  (PVT)}~\citep{PVT} measures users' reaction time during a ten-minute repetitive reaction test. While the PVT offers an objective, quantitative measure of vigilance, it interrupts any other user activity for ten minutes, and the functional relationship between vigilance and reaction time is highly individual. 

% Despite these psychological findings, and despite the fact that the eye gaze can be measured by optical sensors, macro- or micro-movements of the eyes have not previously been explored for drowsiness detection in published research.

This paper makes a number of contributions.
\begin{enumerate}
    \item We introduce upcoming microsleep events in the near future as a new proxy of drowsiness ground-truth. 
    We argue that upcoming microsleep is a highly relevant proxy of the drowsiness ground truth for applications such as driver monitoring, because prolonged driving without visual perception is objectively hazardous. 
    \item We show that the risk of impending microsleep can be predicted more accurately than both KSS self-assessment and reaction time. Moreover, a model trained to predict microsleep is at least as good at predicting KSS assessments as a model trained on KSS scores, which indicates that upcoming microsleep is a less noisy form of ground truth.
    \item We develop CNN, LSTM, and Bi-LSTM neural network architectures that directly process the raw eye-closure and eye-gaze signal to predict drowsiness by learning to extract the relevant information from the input signal in a fully data-driven way. As reference baseline, we implement an exhaustive list of published hand-crafted features that serves as input to a random-forest classifier. %We find that the neural networks outperform a random forest on these engineered features.
    \item In an ablation study, we quantify the relative contributions of eye-closure and eye-gaze features to fatigue detection.
    \item In order to make the topic of drowsiness detection more accessible to the research community, we collect and share a database of 47 participants in baseline and sleep-deprived states with KSS, PVT, and impending microsleep ground truth. We also implement an exhaustive collection of published engineered eye-closure and eye-gaze features and share their implementation.
\end{enumerate}

The remainder of this paper is structured as follows. Section~\ref{sec:problem_setting} lays out the problem setting, Section~\ref{sec:feature_extraction} introduces the drowsiness detection models, and Section~\ref{sec:Data_Collection} reports on our data collection. In Section~\ref{sec:Experimental_Results} we present the experimental results. Section~\ref{sec:Discussion} discusses the results and related work. Section~\ref{sec:Conclusions} concludes.

\section{Problem Setting}
\label{sec:problem_setting}
In all the variations of problem settings that we study, the input to the system consists of the following signals:
\begin{itemize}
    \item A sequence of raw eye gaze yaw and pitch angles of the left and right eye over the observation period, recorded by a video-based eye tracker;
    \item an eye-closure signal on a scale of zero to one, where zero indicates an aperture of 12\,mm or more, and one indicates fully closed eyes;
    \item an eye-state variable that indicates whether the pupil is covered by the eye-lid with values ``open'' (pupil not covered), ``closed'' (pupil covered), ``partially closed'', ``not visible'' (covered by an occlusion), ``downcast'' (head is pitched downward and the eyes may either be closed or looking downward), and ``not available'' (tracking failure);
    \item an eye movement events signal that indicates the presence of a fixation or saccade. Fixations are phases of relative stability during which only micro-movements occur and visual information is perceived, whereas saccades are fast relocation movements during which information uptake is suppressed. 
\end{itemize}

While drowsiness ground truth cannot be observed directly, we will study a new proxy, as well as reference proxies. The first and most common reference proxy for drowsiness ground truth is the KSS self-assessment score~\citep{aakerstedt1990subjective} on a scale from ``1--extremely alert'' to ``9--very sleepy, great effort to keep alert, fighting sleep''. We study KSS prediction as a binary classification problem where the positive class is the aggregate of scores 7 through 9---the \emph{sleepy} range---and the negative class is scores 1 through 6---the \emph{alert} range.
Due to the subjective nature of the KSS scale, it suffers from inter-subject variance that constitutes a principal upper bound on the accuracy that any system can possibly achieve.

As the second reference proxy of drowsiness ground truth, we consider the task of predicting the reaction time of the participants during the PVT task. In the PVT task, a small red dot appears in the center of a black screen after random time intervals. Participants have to press a button as soon as they recognize the dot. The reaction time is measured as the average interval between appearance of the dot and activation of the button.
A person's reaction time is highly individual and, as Section~\ref{sec:sescr} will confirm, some individuals can react faster while fighting sleep than others in their fully alert state.

We will therefore investigate the novel task of predicting impending microsleep events in a time window of the next 10 seconds. Definitions of microsleep events in the literature vary; we use a typical definition which is a continuous eye closure of at least 1,000\,ms duration. \emph{Predicting} microsleep events in the future must not be confused with the easier task of \emph{detecting} microsleep. By the time an ongoing microsleep event can be \emph{detected}, a hazardous situation is already in progress. 

We will evaluate this task in two levels of difficulty. In addition to the evaluation of \emph{all cases}, we will separately evaluate \emph{hard cases}. The latter evaluation is restricted to positives in which the observation window does \emph{not yet} contain a microsleep episode. Hard cases are first occurrences of microsleep that have to be predicted without the benefit of having observed preceding microsleep events that already provide evidence of drowsiness. 
%In order to count as a positive \emph{prediction}, the microsleep event must be predicted before the eyes actually start closing. 
Prediction of impending microsleep episodes is arguably more closely linked to applications such as driver monitoring than estimating the KSS self-assessment score, because prolonged driving without visual perception is objectively hazardous. 


%\subsection{Performance Metrics}
For all binary output signals, we measure false-positive and true-positive rates. Each time step of each evaluation sequence constitutes an instance; in our evaluation protocol, time steps progress with a stride of 5 seconds. Depending on the target variable, a positive instance is a time step in which the model estimates the KSS as 7 to 9, or predicts an impending microsleep event, respectively. If the output matches the ground truth, the instance counts as a true positive, otherwise it is a false positive. All models under investigation implement a continuous decision function; a positive output is triggered when the decision function exceeds a threshold. Adjusting this threshold changes the trade-off between false-positive and false-negative rates. The attainable pairs of true and false positive rates can be visualized in a ROC curve and aggregated in AUC values.
The PVT reaction time is a continuous signal that we model as a regression task. We evaluate this task in terms of the \emph{Root Mean Squared Error (RMSE)} and the \emph{Coefficient of Determination} $R^2$.


\section{Drowsiness Detection Models}\label{sec:feature_extraction}
This section introduces the models used to predict the impending microsleep events and the reference ground-truth proxies of the KSS self-assessment and reaction time (see Figure~\ref{fig:models}).


\subsection{Neural Networks}
This subsection presents the proposed neural networks for eye-gaze based drowsiness detection. All presented neural networks take an input sequence of 60 seconds at 200\,Hz, which results in 12,000 time steps. The input signals---described in detail in Section~\ref{sec:problem_setting}---result in 13 input channels: one eye-closure channel, seven channels that encode a discrete eye state, eye-gaze yaw and pitch velocities, and indicator channels for fixations, saccades, and missing values. The eye-state and eye-closure signals are generated by a commercial driver-monitoring system.
%as input a sequence of 60 seconds of data down-sampled to 200 Hz resulting in a sequence of 1,200 timesteps with 13 different channels. The input features can be divided into three broad categories. The first category contains the following eye closure and eye state features: eye closure, 6 eye state channels indicating the eye state into the categories $\{open, close, partially open, downcast, not visible, unknown, not available\}$. The second category contains the yaw and pitch velocities. The third category contains features indicating the presence of a fixation and saccade, respectively.
\begin{figure}[t]
\floatconts
  {fig:models}
  {\caption{Drowsiness detection models.}}
  {%
    \subfigure[Random Forest.]{\label{fig:rf}%
      \includegraphics[width=0.22\textwidth,keepaspectratio,trim = 50 50 675 50, clip]{figures/rf.pdf}}%
    \qquad
    \subfigure[CNN.]{\label{fig:cnn}%
      \includegraphics[width=0.22\textwidth,keepaspectratio,trim = 50 50 675 50, clip]{figures/cnn.pdf}}
    \subfigure[LSTM.]{\label{fig:lstm}%
      \includegraphics[width=0.22\textwidth,keepaspectratio,trim = 50 50 675 50, clip]{figures/lstm.pdf}}
    \subfigure[Bi-LSTM.]{\label{fig:bi_lstm}%
      \includegraphics[width=0.22\textwidth,keepaspectratio,trim = 50 50 675 50, clip]{figures/bi_lstm.pdf}}
  }
\end{figure}


\subsubsection{CNN}
We develop a one-dimensional CNN model architecture consisting of multiple CNN layers followed by a global average pooling layer and several fully connected layers that is designed to extract both local and global patterns in the eye-closure and eye-gaze signal (see Figure~\ref{fig:cnn}).

The first layers of the model are one-dimensional convolutional layers, which are designed to learn local patterns in the data, followed by a batch normalization and the ReLU activation function.
After the convolutional layers, we apply a global average pooling layer, which aggregates the features learned by the convolutional layers across time, reducing the spatial dimensionality of the output.
The output of the average pooling layer is then flattened and passed through several fully connected layers, which learn to classify the input data based on the extracted features. The final layer of the model is a softmax layer for the classification problems and a linear unit for the PVT regression task.

\begin{table*}[t!]
\small
\caption{Engineered features: absolute values for count features and mean, median, standard deviation, skewness, and kurtosis over all blinks in the input window of all other base features.}\label{tab:table_features_eye}
\centering
{\footnotesize\begin{tabular}{l|l|l}
 \toprule 
&Feature & Source\\
\hline
\rownumber & Time steps with eye state ``open'' (count) & Asaphus Vision\\
\rownumber &Time steps with eye state ``closed'' (count)& Asaphus Vision\\
\rownumber & Time steps with eye state ``partially open'' (count)& Asaphus Vision\\
\rownumber &Time steps with eye state ``not visible'' (count)& Asaphus Vision\\
\rownumber &Time steps with eye state ``downcast'' &\\&(closed or looking downward, count) & Asaphus Vision\\
\rownumber &Time steps with eye state ``not available'' (count)& Asaphus Vision\\
\rownumber &Number of blinks (count)& -\\
\rownumber &Blink duration from start to maximum &\\&reopening velocity& \cite{Schleicher2008}\\
\rownumber &Blink duration normalized by mean duration & \cite{Schleicher2008}\\
\rownumber &Blink duration from maximum closing to &\\&maximum opening velocity&\cite{wilkinson2013the}\\
\rownumber &Blink duration from onset of closing to full reopening&\cite{wilkinson2013the}\\
\rownumber &Time interval between two adjacent blinks & \cite{Schleicher2008}\\
\rownumber &Lid-closure amplitude & \cite{Schleicher2008}\\
\rownumber &Lid-closure amplitude normalized by mean amplitude& \cite{Schleicher2008}\\
\rownumber &Maximum closure velocity during blink& \cite{Schleicher2008}\\
\rownumber &Maximum closure velocity during blink normalized &\\&by expected velocity& \cite{Schleicher2008}\\
\rownumber &Mean closure velocity during blink &\\&normalized by expected velocity& \cite{Schleicher2008}\\
\rownumber &Delay between full closure and onset of reopening & \cite{Schleicher2008}\\
\rownumber &Percentage of time with eyes closed&\cite{wilkinson2013the}\\
\rownumber &Ratio of the max. amplitude to max. velocity &\\&of eyelid movement for the reopening phase&\cite{wilkinson2013the}\\
\rownumber &Ratio of the max. amplitude to max. velocity &\\&of eyelid movement for the closing phase&\cite{wilkinson2013the}\\
\rownumber &Percentage of time the eyes are fully closed &\\&for more than 10 ms&\cite{wilkinson2013the}\\
\rownumber &Saccade duration & \cite{Schleicher2008}\\
\rownumber &Saccade duration normalized by mean duration & \cite{Schleicher2008}\\
\rownumber &Time interval between two adjacent saccades & \cite{Schleicher2008}\\
\rownumber &Saccade amplitude & \cite{Schleicher2008}\\
\rownumber &Saccade amplitude normalized by mean amplitude& \cite{Schleicher2008}\\
\rownumber &Max velocity during saccade& \cite{Schleicher2008}\\
\rownumber &Max velocity during saccade normalized by expected velocity& \cite{Schleicher2008}\\
\rownumber &Mean velocity during saccade normalized by expected velocity& \cite{Schleicher2008}\\
\bottomrule
\end{tabular}}
\end{table*}

\subsubsection{LSTM and Bi-LSTM}

LSTM and Bi-LSTM networks are alternative architectures to the 1D-CNN to model time-series data. In order to allow the models to capture long-term dependencies in the input signals, we concatenate multiple layers of LSTM or Bi-LSTM units with fully connected layers (see Figures~\ref{fig:lstm} and~\ref{fig:bi_lstm}).

The first layers of our model are LSTM or Bi-LSTM layers, respectively, which are designed to extract local patterns in the input sequence. 
After these signal-processing layers, we apply several fully connected layers with dropout regularization. The final layer of the model is a softmax layer for the classification problems and a linear unit for the PVT regression.

%Overall, our proposed LSTM/Bi-LSTM network architecture is designed to classify microsleep events and high KSS scores from time-dependent eye closure and eye movement datat. By using LSTM/Bi-LSTM layers, the model can capture long-term dependencies in the input sequence. The fully connected layers then learn to classify the input sequence based on the extracted features.

\subsection{Reference Method}
As a baseline method that represents the state of the art, we implement 
all eye-lid movement and gaze-velocity features that we find in the published literature about drowsiness detection~\citep{Schleicher2008,wilkinson2013the}. Table~\ref{tab:table_features_eye} shows a list of base features. The complete set of features is composed of the  absolute values for count features, and mean, median, standard deviation, skewness, and kurtosis over all blinks in the input window of all other base features.
We train 
random forest (RF) classifiers~\citep{breiman2001random} on these features using the scikit-learn library~\citep{scikit-learn}.

\section{Data Collection}
\label{sec:Data_Collection}
This section reports on our data collection. The data set and code are available online\footnote{\url{https://osf.io/hmyc4/}} and will be published upon acceptance.
We record a data set of binocular eye movements and eye-closure features of 47 participants. Participants have been informed about the purpose of the research and the procedure of data collection and have given their informed consent. The study has been approved by the responsible ethics committee. 
Participants are aged 18 through 48 (mean of 24 years); each participant is recorded in three experimental sessions with a time lag of at least one week in between two sessions. While the participants are instructed to appear well-rested to two of the sessions (\emph{baseline} sessions), one of the sessions takes place under sleep deprivation (\emph{sleep-deprived} session).   The order of the experimental conditions is counter-balanced across participants.
\begin{enumerate}
    \item For the \emph{sleep-deprived} session, participants are advised to refrain from sleeping within 24 hours before the experimental session starts, though we do not monitor participants during that time to verify compliance.  
    \item For each of two \emph{baseline} sessions, participants are asked to appear well rested.
\end{enumerate}



During each of the sessions, participants complete the 10-minute Psychomotor Vigilance Task (PVT)~\citep{PVT} (PC-based reimplementation) three times, separated by two time intervals (with mean durations of 35~$\pm$~9 and 30~$\pm$~6 minutes, respectively) in which they perform cognitive and visual tasks for other experiments. 
%In the PVT task, a small red dot with a diameter of 9\,px appears in the center of a black screen after random time intervals, drawn from within the range of 2 to 10 seconds. Participants are asked to press a button as soon as they recognize the dot. When the button is pressed, the color of the dot changes to green and disappears after one second. The reaction time is measured as the time between the screen onset of the red dot and the participant's button press.
We chose this task because it requires sustained attention but no specific skills.
Before and after each PVT block, participants report their perceived level of sleepiness on the Karolinska sleepiness scale (KSS), resulting in six KSS scores per session. We linearly interpolate the reported score in order to obtain a sleepiness measure for each point in time.




\subsection{Technical Setup}
We record participants' binocular eye gaze with an Eyelink Portable Duo eye tracker (SR Research) at a sampling frequency of 2000\,Hz and a vendor-reported spatial precision of 0.01\degree. Additionally, we record participants' faces with a video camera, with a sampling frequency of 30 fps and an image resolution of 344$\times$408\,px. The camera records in the infrared spectrum and is sensitive to the infrared illumination of the eye tracker (880\,nm). During the experiment, participants sit at a height-adjustable table in front of a computer monitor (38$\times$30\,cm, 1280$\times$1024\,px) with their heads stabilized by a chin and forehead rest. 

\begin{figure}[t!]
\floatconts
  {fig:1}
  {\caption{Drowsiness self-assessment with the Karolinska sleepiness scale (KSS).}}
  {%
  \setlength{\jmlrminsubcaptionwidth}{0.43\linewidth}
    \subfigure[Distribution of the KSS score across participants for both experimental conditions.]{\label{fig:a}%
      \includegraphics[width=0.4\linewidth]{figures/descriptive_statistics/kss/kss_dist_sleep_base_ecml_format.pdf}}%
    \qquad
    \subfigure[Development of the mean KSS score across participants in a session over time. Error bars show the standard deviation.]{\label{fig:b}%
      \includegraphics[width=0.4\linewidth]{figures/descriptive_statistics/kss/kss_tests_sleep_base_ecml_format.pdf}}
  }
\end{figure}

\begin{figure}[t!]
\floatconts
  {fig:reactiontime_kss}
  {\caption{Microsleep and reaction time for each KSS level. Number of microsleep events per minute for each (rounded interpolated) KSS score. In Figures~\ref{fig:rt_kss_errorbar} and \ref{fig:rt_kss_box}, boxes display the median value and interquartile range; whiskers extend up to the most extreme data point within 1.5 IQR of the quartiles.}}% The other points are considered outliers and displayed individually.}}
  {%
  \setlength{\jmlrminsubcaptionwidth}{0.43\linewidth}
    \subfigure[Number of microsleep events per minute for each (rounded interpolated) KSS score.]{\label{fig:rt_kss_errorbar}%
      \includegraphics[width=0.4\linewidth]{figures/descriptive_statistics/kss/reaction_time_kss/sleep_kss_boxplot.pdf}}%
    \qquad
    \subfigure[PVT reaction time in ms for each (rounded interpolated) KSS score.]{\label{fig:rt_kss_box}%
      \includegraphics[width=0.4\linewidth]{figures/descriptive_statistics/kss/reaction_time_kss/reactiontime_kss_boxplot_1.pdf}}
  }
\end{figure}

\subsection{Descriptive statistics of the recorded data}\label{sec:sescr}
Figure~\ref{fig:a} shows the histogram of reported KSS scores per session type, and Figure~\ref{fig:b} shows the development of the mean KSS score over the course of a session; in summary, the data cover all levels of drowsiness. For the baseline sessions, the mean KSS score increases from 3 to 5 due to the repetitive nature of the task. In the sleep-deprived session, the mean KSS score increases from 7 to 8. 

Figure~\ref{fig:reactiontime_kss}(a) shows the number of microsleep episodes for each reported KSS level as a box plot. For any KSS score, zero microsleep events is the mode of the distribution and any data points with microsleep events are outliers. While a correlation between KSS levels and microsleep events is apparent, there is also a large overlap of the distributions of sleep events per minute at different KSS levels, especially for low KSS values. 

Figure~\ref{fig:reactiontime_kss}(b) shows the distribution of PVT reaction times per KSS level; and again, the correlation is apparent. The overlap between distributions for different KSS scores underlines the large inter-person variability. At the KSS level of 9, quite a few participants still have a lower reaction time than other participants in their fully alert state.




\section{Experimental Results}
\label{sec:Experimental_Results}
This section reports on the evaluation protocol and the experimental results.

\subsection{Evaluation Protocol}\label{sec:learning}
All models are evaluated with a nested five-fold cross-validation protocol that is stratified across persons, so that no person appears both in the training and test data at the same time. We tune the hyper-parameters using the training part of the first fold using grid search and use the best found configuration for all remaining folds (see Table~\ref{tab:hp_grid} for the list of used hyper-parameters and the best found parameters).

\begin{table}[t]
\small
\caption{Hyper-parameter grid for the models under investigation and best found values.}
  \begin{footnotesize}
  \begin{center}
    \label{tab:hp_grid}
    \footnotesize\begin{tabular}{l|l|l||l|l|l}
    \toprule
     & & & \multicolumn{3}{c}{Best values for setting} \\
      & Hyper-parameter & Search space & Microsleep & KSS & Reaction time \\
      \hline
      \multirow{4}{*}{\rotatebox[origin=c]{90}{RF}} & Num. of estimators $T$  & \{50, 100, 1000\} & 100 & 100 & 1000\\
      & Num. of features  & \{Auto, sqrt, log2\} & Auto & sqrt & Auto\\
      & Maximum depth of a tree & \{2, 4, 6, 8, None\} & 8 & None & None \\
      & Splitting criterion  & \{Gini, Entropy\} & Gini & Gini & Gini\\
      \hline 
      \multirow{7}{*}{\rotatebox[origin=c]{90}{CNN}} & Num. of conv layers $N_C$ & \{1, 2, 3\} & 1 & 1 & 1\\
      & Kernel size & \{16, 32, 64\} & [64] & [64] & [64]\\
      & Num. of filters & \{64, 128\} & [128] & [128] & [128]\\
      & Stride & \{1, 2, 4\} & 1 & 2 & 1\\
      & Num. of dense layers $M_C$& \{1, 2\} & 2 & 1 & 2\\
      & Num. of hidden dense units & \{16, 32, 64\} & [64, 32] & [32] & [64, 32]\\
      \hline
      \multirow{5}{*}{\rotatebox[origin=c]{90}{LSTM}} & Num. of LSTM layers $N_L$& \{1, 2, 3\} & 2 & 2 & 2\\
      & Num. of LSTM units & \{16, 32, 64\} & [64, 64] & [64, 64] & [64, 64]\\
      & Num. of dense layers $M_L$& \{1, 2\} & 1 & 1 & 1\\
      & Num. of hidden dense units& \{16, 32, 64\} & [32] & [32] & [32]\\
      \hline
      \multirow{5}{*}{\rotatebox[origin=c]{90}{Bi-LSTM}} & Num. of Bi-LSTM layers $N_B$& \{1, 2, 3\} & 2 & 2 & 2\\
      & Num. of Bi-LSTM units & \{16, 32, 64\} & [32, 32] & [32, 32] & [32, 32]\\
      & Num. of dense layers $M_B$& \{1, 2\} & 1 & 1 & 1\\
      & Num. of hidden dense units& \{16, 32, 64\} & [32] & [32] & [32]\\
      \bottomrule
    \end{tabular}    
  \end{center}
  \end{footnotesize}
\end{table}

\begin{table*}[t!]
\small
\caption{AUC $\pm$ standard error for prediction of the binary KSS label and impending microsleep events. A star indicates models better than the random forest baseline.}\label{tab:results_classification}
\centering
\footnotesize\begin{tabular}{l||l|l|l}
 \toprule
 %& \multicolumn{3}{c}{AUC} \\
 & KSS $\geq 7$ & \multicolumn{2}{c}{Microsleep}\\
 & & All cases & Hard cases \\
\hline
Random forest & 0.6 $\pm0.01$ & 0.93 $\pm$0.02 & 0.8 $\pm$0.01\\
CNN & \textbf{0.7} $\pm$ \textbf{0.01}$^*$ & 0.94 $\pm0.01$ & \textbf{0.87} $\pm$\textbf{0.02}$^*$\\
LSTM & 0.67 $\pm0.04$ & \textbf{0.95} $\pm$ \textbf{0.01} & 0.85 $\pm0.02$\\
Bi-LSTM & 0.66 $\pm0.04$ & 0.93 $\pm0.01$  & 0.82 $\pm0.03$\\
\bottomrule 
\end{tabular}
\end{table*}

\begin{table*}[t!]
\small
\caption{Results for predicting the reaction time. A star indicates models better than mean baseline. Mean RMSE $\pm$ standard error and mean $R^2 \pm $ standard error are shown.}\label{tab:reaction_time}
\centering
\footnotesize\begin{tabular}{l||l|l}
 \toprule 
Method & RMSE & $R^2$\\
\hline
Mean baseline & 0.27$\pm$0.04 & -0.01$\pm$0.0 \\
Random forest  & 0.26$\pm$0.04 & 0.08$\pm$0.09\\
CNN  & 0.28$\pm$0.02 & -0.83$\pm$0.93 \\
LSTM  & 0.26$\pm$0.04 & 0.06$\pm$0.02 \\
Bi-LSTM  & \textbf{0.25}$\pm$\textbf{0.04} & \textbf{0.15}$\pm$\textbf{0.03}\\
\bottomrule 
\end{tabular}
\end{table*}

\subsection{Predicting Microsleep is Easier than Predicting KSS or Reaction Time}
\label{sec:drowsiness}

Table~\ref{tab:results_classification} shows that microsleep episodes in the near future can be predicted with an AUC of around 0.95 (0.87 for hard cases in which no prior microsleep events occur in the observation window), whereas self-assessed fatigue is only detected with an AUC of around 0.7. A comparison of the ROC curves in Figure~\ref{fig:kss_class} for KSS-sleepiness and Figures~\ref{fig:sleep} and \ref{fig:sleep_exclude} confirms the conclusion that predicting upcoming microsleep events is easier than predicting the KSS self-assessment. The confusion matrix in Figure~\ref{fig:conf_kss} shows that false-positive and false-negative KSS predictions are more likely to have borderline true scores, but confusions occur across the entire KSS scale.

Table~\ref{tab:reaction_time} shows RMSE and $R^2$ metrics for prediction of the PVT reaction time. While AUC, RMSE, and $R^2$ cannot directly be compared, the values show that only 15\% of the variance in reaction time can be explained by the KSS level, whereas microsleep episodes in the next 10 seconds can be predicted with an AUC of 0.95. Our interpretation of these findings is that impending microsleeps are much more predictable from eye-closure and eye-gaze signals than the KSS self-assessment or the PVT reaction time.

\begin{figure}[t]
\floatconts
  {fig:results_kss_class}
  {\caption{Results for the binarized KSS prediction.}}
  {%
  \setlength{\jmlrminsubcaptionwidth}{0.43\linewidth}
    \subfigure[Prediction of binarized KSS score. Shaded bands show the standard error. ]{\label{fig:kss_class}%
      \includegraphics[width=0.4\linewidth]{figures/results/KSS_classification.pdf}}%
    \qquad
    \subfigure[Confusion matrix for CNN model for predicting binarized KSS levels and ground truth KSS levels.]{\label{fig:conf_kss}%
      \includegraphics[width=0.4\linewidth]{figures/results/KSS_classification_cm_kss_CNN.pdf}}
  }
\end{figure}

\begin{figure}[t]
\floatconts
  {fig:results}
  {\caption{ROC curves for prediction of impending microsleep events. Shaded bands show the standard error.}}
  {%
  \setlength{\jmlrminsubcaptionwidth}{0.43\linewidth}
    \subfigure[Prediction of microsleep events for all cases.]{\label{fig:sleep}%
      \includegraphics[width=0.4\linewidth]{figures/results/Sleep_classification.pdf}}%
    \qquad
    \subfigure[Prediction of microsleep events for hard cases.]{\label{fig:sleep_exclude}%
      \includegraphics[width=0.4\linewidth]{figures/results/Sleep_classification_exclude_same.pdf}}
  }
\end{figure}



\subsection{Training on Impending Microsleep is Better than on KSS Levels}

In the next experiment, we apply the models that have been trained to predict impending microsleep and reaction time, respectively, as decision functions for the task of predicting the binarized KSS level. Surprisingly, Table~\ref{tab:table_cross_task} and Figure~\ref{fig:cross_task} show that the model that has been trained to predict microsleep seems to be better at predicting the KSS level than the model that has been trained on KSS self-assessments. The model that has been trained to predict reaction time, on the other hand, shows a poorer performance at predicting KSS levels than the model trained on KSS levels. However, none of the differences are statistically significant.

Our interpretation of these findings is that the presence of microsleep episodes in the near future is a better indicator of sleepiness on the KSS scale than the KSS self-assessment itself. The subjective nature of the self-assessment introduces a high level of noise %into the signal 
that renders this signal less useful than the presence or absence of microsleep in the future. 

\begin{table}[t]
\small
\caption{Cross task evaluation. AUC $\pm$ standard error for predicting the binarized KSS.}% and $\dagger$ indicates models better than its counterpart trained on the KSS binary problem setting.}
\label{tab:table_cross_task}
\centering
\footnotesize\begin{tabular}{l||l|l|l}
 \toprule
 & \multicolumn{3}{c}{Training task} \\
 & same task & \multicolumn{2}{c}{Cross-task}\\
 & KSS $\geq 7$ & Microsleep & Reaction time \\
\hline
Random forest & 0.6 $\pm0.01$ & \textbf{0.65} $\pm$\textbf{0.03} & 0.6 $\pm$0.02\\
CNN & 0.7 $\pm$ 0.01 & \textbf{0.71} $\pm$\textbf{0.03} & 0.63 $\pm$0.01\\
LSTM & 0.67 $\pm0.04$ & \textbf{0.71} $\pm$\textbf{0.04} & 0.63 $\pm$0.04\\
Bi-LSTM & 0.66 $\pm0.04$ & \textbf{0.69} $\pm$\textbf{0.04} & 0.65 $\pm$0.03\\
\bottomrule 
\end{tabular}
\end{table}

\begin{figure}[t]
\floatconts
  {fig:cross_task}
  {\caption{Cross task evaluation. Results for predicting the binarized KSS score. Shaded bands show the standard error.}}
  {%
  \setlength{\jmlrminsubcaptionwidth}{0.43\linewidth}
    \subfigure[Model trained to predict impending microsleep events and evaluated on the binarized KSS score prediction.]{\label{fig:cross_sleep}%
      \includegraphics[width=0.4\linewidth]{figures/results/cross_task_train_classification_Sleep_KSS.pdf}}%
    \qquad
    \subfigure[Model trained to predict the reaction time and evaluated on the binarized KSS score prediction.]{\label{fig:cross_reaction}%
      \includegraphics[width=0.4\linewidth]{figures/results/cross_task_train_regression_reaction_time_KSS.pdf}}
  }
\end{figure}

\subsection{Neural Networks Outperform Engineered Features}

For the prediction of microsleep events and KSS levels, Table~\ref{tab:results_classification} shows that the neural networks outperform the random forest on engineered features; in two out of three cases, the difference is statistically significant with $p<0.05$ according to a paired $t$-test. The differences between the alternative network architectures are not significant, but in total the CNN gives the best overall performance picture.

For prediction of the PVT reaction time, the performance of all models is roughly equally poor. 

\subsection{All Signals under Investigation are Useful}

Figure~\ref{fig:ablation_study} and Table~\ref{tab:ablation_study} show that removing any input channel results in a lower AUC value; removing either the eye-lid channels or the eye-gaze channels results in the lowest AUC values. The deterioration is not statistically significant. 
Figure~\ref{fig:shap} shows the engineered features with highest SHAP value in the random-forest classifier. The figure confirms earlier findings~\citep{Schleicher2008,wilkinson2013the}: the strongest indicators of drowsiness are variability in eye-lid velocity, the percentage of time in which the eyes are closed, variability in blink duration, and delayed reopening during blinks.

\begin{figure}[t]
\floatconts
  {fig:ablation_study}
  {\caption{Ablation study. ROC curves for prediction of microsleep events.}}
  {%
  \setlength{\jmlrminsubcaptionwidth}{0.43\linewidth}
    \subfigure[All cases.]{\label{fig:abl_sleep}%
      \includegraphics[width=0.4\linewidth]{figures/results/Sleep_classification_ablation.pdf}}%
    \qquad
    \subfigure[Hard cases only.]{\label{fig:abl_sleep_exclude}%
      \includegraphics[width=0.4\linewidth]{figures/results/Sleep_classification_ablation_exclude_same.pdf}}
  }
\end{figure}

\begin{table*}[t]
\caption{Ablation study. AUC $\pm$ standard error for predicting microsleep events using the CNN model architecture using a subset of input channels.}\label{tab:ablation_study}
\centering
\footnotesize\begin{tabular}{l||l|l}
 \toprule
 %& \multicolumn{2}{c}{AUC} \\
 & All & Hard cases \\
\hline
All channels & 0.94 $\pm$0.01 & 0.87 $\pm$0.02\\
W/O eye closure and eye state channels& 0.92 $\pm$0.01 & 0.81 $\pm$0.03\\
W/O eye gaze channels & 0.93 $\pm$0.01 & 0.81 $\pm$0.03\\
W/O fixation and saccade channel & 0.93 $\pm$0.02 & 0.83 $\pm$0.01\\
\bottomrule 
\end{tabular}
\end{table*}

\begin{figure}[ht]
            \centering
            \includegraphics[width=0.7\textwidth]{figures/results/RF_shap.pdf}        
        \caption{Feature importance for the top 20 features (SHAP values).}%Feature importance using SHAP values. The figure shows the top 20 features with the highest importance.}
        \label{fig:shap}
\end{figure} 

\section{Discussion}
\label{sec:Discussion}
The problem setting of drowsiness detection is motivated by efforts to improve the safety of the operation of vehicles and other hazardous machinery. 
%The European Commission's estimate of drowsiness contributing to 12\% of all road accidents~\cite{EUfatigue}---or a total of for 162,000 annual road traffic fatalities~\cite{CDCfatalities}---attests to the urgency of research in drowsiness detection methods. 
The European New Car Assessment Program (EuroNCAP) has included driver fatigue and incapacitation detection in the catalog of safety functions that affect the safety rating of new vehicles~\citep{euroncap2021,euroncap2017}. 
The EU General Safety Regulation 2019/2144 makes it mandatory to introduce a range of new safety measures that also include incapacitation detection, following a fixed timetable of stages A-D, scheduled between 2022 and 2029.
The current generation of driver cameras that are entering the automotive market 
are capable of extracting the driver's eye closure and eye gaze in order to detect distraction and drowsiness~\citep{halin2021survey}. Research on drowsiness detection based on eye-closure and eye-gaze signals therefore has an immediate practical application and potential for societal benefit. 

%It also appears plausible that a longer observation duration results in a more accurate detection. However, a longer observation period also means that the result is obtained later. Given that our study is motivated by applications such as monitoring drivers or operators of other hazardous machinery, we limit the observation periods in this study to 60 seconds. 

Previous work that applies machine learning to drowsiness detection can be divided with respect to the input signals they use into physiological and vehicle-based approaches. 
A large body of research, of which an overview is given by~\citet{Stancin2021}, uses EEG signals as predictors for drowsiness. EEG input signals to predictive models are far removed from practical applicability since electrodes have to be attached to the head. The same disadvantage applies to electrooculogram (EOG) although it has been found to be more robust against noise~\citep{zhang2015novel} compared to EEG.

By contrast, image-based methods have the advantage of being unobtrusive to users. A wide range of image features have been studied, including head, facial, and eye-lid movements~\citep{Schleicher2008,wilkinson2013the}, raw images of the face~\citep{phan2021efficient} and eye crops~\citep{quddus2021using}. To the best of our knowledge, there is only one other study that uses eye-tracking features as input~\citep{zandi2019non}; however, details regarding model and implementation are undisclosed.
%to learn meaningful features end-to-end  %In \cite{quddus2021using} image crops of both eyes are processed by an LSTM network to learn a representation. 

Indirect approaches to driver monitoring based on steering wheel interaction~\citep{arefnezhad2019driver,zhenhai2017driver} and lane deviation~\citep{friedrichs2010drowsiness} have been studied extensively and are widely deployed in the market in attention assist systems. This type of indirect monitoring will become insufficient under the EU General Safety Regulation 2019/2144, and will not meet the test criteria of the European New Car Assessment Program (EuroNCAP) from 2024.

A comprehensive comparison of (combinations of) the possible input modalities for drowsiness detection is not available; any such investigation would be hampered by the lack of publicly available data and reference implementations. Nevertheless, it seems plausible that combining modalities such as vehicle interactions, eye closure, eye gaze, head and facial movements, and body posture may add to the robustness of detection systems across all users and their individual characteristics and conditions.

Although EEG is considered the gold standard for 
the collection of ground-truth labels for sleep stages 2 and higher, it does not allow for an unambiguous detection of drowsiness, and furthermore has the drawback of being susceptible to noise caused by muscle movements. While KSS self-assessment is widely regarded to be the gold-standard proxy of ground-truth drowsiness, our findings underscore the subjective nature of this self-assessment that limits the degree of accuracy with which it can be predicted. Analogously, reaction time varies widely across individuals with some persons reacting faster on the brink of sleep than others in their fully alert state. By studying the prediction of impending microsleep, this paper introduces a new proxy of drowsiness ground truth that is objectively hazardous. We interpret the fact that it can be predicted more accurately as indicating that it is a less noisy proxy of actual drowsiness.

With 47 participants, the \emph{Potsdam Binge~/~PVT data set} is not very large by machine-learning standards; it appears likely that a model trained on hundreds or thousands of participants would be considerably more accurate.
%However, drowsiness detection solely based on facial expressions may not be suitable for individuals with facial paralysis or other facial conditions.   

% These interaction-based signals may not be suitable for all driving situations, such as driving on a straight and empty road, where there may be minimal interactions with the vehicle. 


\section{Conclusions}
\label{sec:Conclusions}
Since drowsiness is an internal state of the mind, the ground truth cannot be observed directly. 
Based on our experimental findings, we conclude that upcoming microsleep episodes in the near future are a better, less noisy proxy of the ground truth than a self-assessment on the Karolinska sleepiness scale (KSS). Not only can approaching microsleep events be predicted with high accuracy, but a model that has been trained to predict microsleep events is as accurate or even more accurate at predicting  a high KSS score than a model that was trained on KSS self-assessments.

We can furthermore conclude that neural network architectures that process the raw eye-state, eye-closure, gaze-velocity, and saccade indicator signals outperform a random forest that processes a comprehensive set of engineered features derived from these signals. The differences in performance between CNN, LSTM, and Bi-LSTM architectures are too small to support any conclusion.
Removing any set of features results in a slightly but insignificantly lower performance. The SHAP values that we observe for the engineered features are consistent with earlier findings. However, the neural networks perform significantly better than the random forest on engineered features, and we therefore conclude that the signal-processing layers have learned to extract additional signals from the raw input that provide evidence of drowsiness.

A large share of research and development on drowsiness detection takes place behind closed doors in the automotive industry and remains unpublished, which impedes the progress of the field as a whole. In order to improve the accessibility of this highly relevant topic to the research community, we share a data set of participants in baseline and sleep-deprived states, a reference implementation of published engineered features, and our implementations of the neural networks.

\acks{This work was partially funded by the German Federal Ministry of Education and Research under grant 01IS20043.
%%AEye project
}


\iffalse 
\section{Cross-Referencing}

Always use \verb|\label| and \verb|\ref| (or one of the commands
described below) when cross-referencing.  For example, the next
section is Section~\ref{sec:math}. The \textsf{jmlr} class
provides some convenient cross-referencing commands:
\verb|\sectionref|, \verb|\equationref|, \verb|\tableref|,
\verb|\figureref|, \verb|\algorithmref|, \verb|\theoremref|,
\verb|\lemmaref|, \verb|\remarkref|, \verb|\corollaryref|,
\verb|\definitionref|, \verb|\conjectureref|, \verb|\axiomref|,
\verb|\exampleref| and \verb|\appendixref|. The argument of these
commands may either be a single label or a comma-separated list
of labels. Examples:

Referencing sections: \sectionref{sec:math} or
\sectionref{sec:intro,sec:math} or
\sectionref{sec:intro,sec:math,sec:tables,sec:figures}.

Referencing equations: \equationref{eq:trigrule} or
\equationref{eq:trigrule,eq:df} or
\equationref{eq:trigrule,eq:f,eq:df,eq:y}.

Referencing tables: \tableref{tab:operatornames} or
\tableref{tab:operatornames,tab:example} or
\tableref{tab:operatornames,tab:example,tab:example-booktabs}.

Referencing figures: \figureref{fig:image} or
\figureref{fig:image,fig:teximage} or
\figureref{fig:image,fig:teximage,fig:subfigex} or
\figureref{fig:image-a,fig:image-b}.

Referencing algorithms: \algorithmref{alg:gauss} or
\algorithmref{alg:gauss,alg:moore} or
\algorithmref{alg:gauss,alg:moore,alg:net}.

Referencing theorem-like environments: \theoremref{thm:eigenpow},
\lemmaref{lem:sample}, \remarkref{rem:sample}, 
\corollaryref{cor:sample}, \definitionref{def:sample},
\conjectureref{con:sample}, \axiomref{ax:sample} and
\exampleref{ex:sample}.

Referencing appendices: \appendixref{apd:first} or
\appendixref{apd:first,apd:second}.

\section{Equations}
\label{sec:math}

The \textsf{jmlr} class loads the \textsf{amsmath} package, so
you can use any of the commands and environments defined there.
(See the \textsf{amsmath} documentation for further
details.\footnote{Either \texttt{texdoc amsmath} or
\url{http://www.ctan.org/pkg/amsmath}})

Unnumbered single-lined equations should be displayed using
\verb|\[| and \verb|\]|. For example:
\[E = m c^2\]
Numbered single-line equations should be displayed using the
\texttt{equation} environment. For example:
\begin{equation}\label{eq:trigrule}
\cos^2\theta + \sin^2\theta \equiv 1
\end{equation}
This can be referenced using \verb|\label| and \verb|\equationref|.
For example, \equationref{eq:trigrule}.

Multi-lined numbered equations should be displayed using the
\texttt{align} environment.\footnote{For reasons why you 
shouldn't use the obsolete \texttt{eqnarray} environment, see
Lars Madsen, \emph{Avoid eqnarray!} TUGboat 33(1):21--25, 2012.} For example:
\begin{align}
f(x) &= x^2 + x\label{eq:f}\\
f'(x) &= 2x + 1\label{eq:df}
\end{align}
Unnumbered multi-lined equations should be displayed using the
\texttt{align*} environment. For example:
\begin{align*}
f(x) &= (x+1)(x-1)\\
&= x^2 - 1
\end{align*}
If you want to mix numbered with unnumbered lines use the
align environment and suppress unwanted line numbers with
\verb|\nonumber|. For example:
\begin{align}
y &= x^2 + 3x - 2x + 1\nonumber\\
&= x^2 + x + 1\label{eq:y}
\end{align}
An equation that is too long to fit on a single line can be
displayed using the \texttt{split} environment. 
Text can be embedded in an equation using \verb|\text| or
\verb|\intertext| (as used in \theoremref{thm:eigenpow}).
See the \textsf{amsmath} documentation for further details.

\subsection{Operator Names}
\label{sec:op}

Predefined operator names are listed in \tableref{tab:operatornames}.
For additional operators, either use \verb|\operatorname|,
for example $\operatorname{var}(X)$ or declare it with
\verb|\DeclareMathOperator|, for example
\begin{verbatim}
\DeclareMathOperator{\var}{var}
\end{verbatim}
and then use this new command. If you want limits that go above and
below the operator (like \verb|\sum|) use the starred versions
(\verb|\operatorname*| or \verb|\DeclareMathOperator*|).

\begin{table}[htbp]
\floatconts
  {tab:operatornames}%
  {\caption{Predefined Operator Names (taken from 
   \textsf{amsmath} documentation)}}%
  {%
\begin{tabular}{rlrlrlrl}
\cs{arccos} & $\arccos$ &  \cs{deg} & $\deg$ &  \cs{lg} & $\lg$ &  \cs{projlim} & $\projlim$ \\
\cs{arcsin} & $\arcsin$ &  \cs{det} & $\det$ &  \cs{lim} & $\lim$ &  \cs{sec} & $\sec$ \\
\cs{arctan} & $\arctan$ &  \cs{dim} & $\dim$ &  \cs{liminf} & $\liminf$ &  \cs{sin} & $\sin$ \\
\cs{arg} & $\arg$ &  \cs{exp} & $\exp$ &  \cs{limsup} & $\limsup$ &  \cs{sinh} & $\sinh$ \\
\cs{cos} & $\cos$ &  \cs{gcd} & $\gcd$ &  \cs{ln} & $\ln$ &  \cs{sup} & $\sup$ \\
\cs{cosh} & $\cosh$ &  \cs{hom} & $\hom$ &  \cs{log} & $\log$ &  \cs{tan} & $\tan$ \\
\cs{cot} & $\cot$ &  \cs{inf} & $\inf$ &  \cs{max} & $\max$ &  \cs{tanh} & $\tanh$ \\
\cs{coth} & $\coth$ &  \cs{injlim} & $\injlim$ &  \cs{min} & $\min$ \\
\cs{csc} & $\csc$ &  \cs{ker} & $\ker$ &  \cs{Pr} & $\Pr$
\end{tabular}\par
\begin{tabular}{rlrl}
\cs{varlimsup} & $\varlimsup$ 
& \cs{varinjlim} & $\varinjlim$\\
\cs{varliminf} & $\varliminf$ 
& \cs{varprojlim} & $\varprojlim$
\end{tabular}
}
\end{table}

\section{Vectors and Sets}
\label{sec:vec}

Vectors should be typeset using \cs{vec}. For example $\vec{x}$.
The \textsf{jmlr} class also provides \cs{set} to typeset a
set. For example $\set{S}$.

\section{Floats}
\label{sec:floats}

Floats, such as figures, tables and algorithms, are moving
objects and are supposed to float to the nearest convenient
location. Please don't force them to go in a particular place. In
general it's best to use the \texttt{htbp} specifier and don't
put the figure or table in the middle of a paragraph (that is
make sure there's a paragraph break above and below the float).
Floats are supposed to have a little extra space above and below
them to make them stand out from the rest of the text. This extra
spacing is put in automatically and shouldn't need modifying.

To ensure consistency, please \emph{don't} try changing the format of the caption by doing
something like:
\begin{verbatim}
\caption{\textit{A Sample Caption.}}
\end{verbatim}
or
\begin{verbatim}
\caption{\em A Sample Caption.}
\end{verbatim}
You can, of course, change the font for individual words or 
phrases, for example:
\begin{verbatim}
\caption{A Sample Caption With Some \emph{Emphasized Words}.}
\end{verbatim}

\subsection{Tables}
\label{sec:tables}

Tables should go in the \texttt{table} environment. Within this
environment use \verb|\floatconts| (defined by \textsf{jmlr})
to set the caption correctly and center the table contents.

\begin{table}[htbp]
 % The first argument is the label.
 % The caption goes in the second argument, and the table contents
 % go in the third argument.
\floatconts
  {tab:example}%
  {\caption{An Example Table}}%
  {\begin{tabular}{ll}
  \bfseries Dataset & \bfseries Result\\
  Data1 & 0.12345\\
  Data2 & 0.67890\\
  Data3 & 0.54321\\
  Data4 & 0.09876
  \end{tabular}}
\end{table}

If you want horizontal rules you can use the \textsf{booktabs}
package which provides the commands \verb|\toprule|, 
\verb|\midrule| and \verb|\bottomrule|. For example, see
\tableref{tab:example-booktabs}.

\begin{table}[hbtp]
\floatconts
  {tab:example-booktabs}
  {\caption{A Table With Horizontal Lines}}
  {\begin{tabular}{ll}
  \toprule
  \bfseries Dataset & \bfseries Result\\
  \midrule
  Data1 & 0.12345\\
  Data2 & 0.67890\\
  Data3 & 0.54321\\
  Data4 & 0.09876\\
  \bottomrule
  \end{tabular}}
\end{table}

If you want vertical lines as well, you can't use the
\textsf{booktabs} commands as there'll be some unwanted gaps.
Instead you can use \LaTeX's \verb|\hline|, but the rows may
appear a bit cramped.  You can add extra space above or below a
row using \verb|\abovestrut| and \verb|\belowstrut|. For example,
see \tableref{tab:example-hline}.

\begin{table}[htbp]
\floatconts
  {tab:example-hline}
  {\caption{A Table With Horizontal and Vertical Lines}}%
  {%
    \begin{tabular}{|l|l|}
    \hline
    \abovestrut{2.2ex}\bfseries Dataset & \bfseries Result\\\hline
    \abovestrut{2.2ex}Data1 & 0.12345\\
    Data2 & 0.67890\\
    Data3 & 0.54321\\
    \belowstrut{0.2ex}Data4 & 0.09876\\\hline
    \end{tabular}
  }
\end{table}

If you want to align numbers on their decimal point, you can
use the \textsf{siunitx} package. For example, see
\tableref{tab:example-siunitx}. For further details see the
\textsf{siunitx} documentation\footnote{Either \texttt{texdoc
siunitx} or \url{http://www.ctan.org/pkg/siunitx}}.

\begin{table}[htbp]
\floatconts
  {tab:example-siunitx}
  {\caption{A Table With Numbers Aligned on the Decimal Point}}
  {\begin{tabular}{lS}
  \bfseries Dataset & {\bfseries Result}\\
  Data1 & 0.12345\\
  Data2 & 10.6789\\
  Data3 & 50.543\\
  Data4 & 200.09876
  \end{tabular}}
\end{table}

If the table is too wide, you can adjust the inter-column
spacing by changing the value of \verb|\tabcolsep|. For
example:
\begin{verbatim}
\setlength{\tabcolsep}{3pt}
\end{verbatim}
If the table is very wide but not very long, you can use the
\texttt{sidewaystable} environment defined in the
\textsf{rotating} package (so use \verb|\usepackage{rotating}|).
If the table is too long to fit on a page, you should use the
\texttt{longtable} environment defined in the \textsf{longtable}
package (so use \verb|\usepackage{longtable}|).

\subsection{Figures}
\label{sec:figures}

Figures should go in the \texttt{figure} environment. Within this
environment, use \verb|\floatconts| to correctly position the
caption and center the image. Use \verb|\includegraphics|
for external graphics files but omit the file extension. Do not
use \verb|\epsfig| or \verb|\psfig|. If you want to scale the
image, it's better to use a fraction of the line width rather
than an explicit length. For example, see \figureref{fig:image}.

\begin{figure}[htbp]
 % The label goes in the first argument, the caption in the second
 % argument and the figure contents in the third argument
\floatconts
  {fig:image}
  {\caption{Example Image}}
  {\includegraphics[width=0.5\linewidth]{example-image}}
\end{figure}

If your image is made up of \LaTeX\ code (for example, commands
provided by the \textsf{pgf} package) you can include it using
\cs{includeteximage} (defined by the \textsf{jmlr} class). This
can be scaled and rotated in the same way as \cs{includegraphics}.
For example, see \figureref{fig:teximage}.

\begin{figure}[htbp]
\floatconts
  {fig:teximage}
  {\caption{Image Created Using \LaTeX\ Code}}
  {\includeteximage[angle=45]{example-teximage}}
\end{figure}

If the figure is too wide to fit on the page, you can use the
\texttt{sidewaysfigure} environment defined in the
\textsf{rotating} package.

It's best not to use \verb|\graphicspath| with the \textsf{jmlr} class
as it can cause problems with the production editing process. If the
images are contained in a subdirectory, specify this when you
include the image, for example \verb|\includegraphics{figures/mypic}|.

\subsection{Sub-Figures}
\label{sec:subfigures}

Sub-figures can be created using \verb|\subfigure|, which is
defined by the \textsf{jmlr} class. The optional argument allows
you to provide a subcaption. The label should be placed in the
mandatory argument of \verb|\subfigure|. You can reference the
entire figure, for example \figureref{fig:subfigex}, or you can
reference part of the figure using \verb|\figureref|, for example
\figureref{fig:image-a}. Alternatively you can reference the
subfigure using \verb|\subfigref|, for example
\subfigref{fig:image-a,fig:image-b} in \figureref{fig:subfigex}.

\begin{figure}[htbp]
\floatconts
  {fig:subfigex}
  {\caption{An Example With Sub-Figures.}}
  {%
    \subfigure[Image A]{\label{fig:image-a}%
      \includegraphics[width=0.2\linewidth]{example-image-a}}%
    \qquad
    \subfigure[Image B]{\label{fig:image-b}%
      \includegraphics[width=0.25\linewidth]{example-image-b}}
  }
\end{figure}

By default, the sub-figures are aligned on the baseline.
This can be changed using the second optional argument
of \verb|\subfigure|. This may be \texttt{t} (top), \texttt{c}
(centered) or \texttt{b} (bottom). For example, the subfigures
\subfigref{fig:image-a2,fig:image-b2} in \figureref{fig:subfigex2}
both have \verb|[c]| as the second optional argument.

\begin{figure}[htbp]
\floatconts
  {fig:subfigex2}
  {\caption{Another Example With Sub-Figures (with a Cramped
    Sub-Caption).}}
  {%
    \subfigure[Image A][c]{\label{fig:image-a2}%
      \includegraphics[width=0.1\linewidth]{example-image-a}}%
    \qquad
    \subfigure[Image B][c]{\label{fig:image-b2}%
      \includegraphics[width=0.3\linewidth]{example-image-b}}
  }%
\end{figure}

Note that a very narrow sub-float will leave little space for the
sub-caption, which will likely cause Underfull/Overfull hbox warnings.

You can set the length \verb|\jmlrminsubcaptionwidth| to indicate
the minimum width to be made available for the sub-captions, as in
\figureref{fig:subfigex3}.

\begin{figure}[htbp]
\floatconts
  {fig:subfigex3}
  {\caption{Another Example With Sub-Figures (with a Less Cramped
    Sub-Caption).}}
  {\setlength{\jmlrminsubcaptionwidth}{0.2\linewidth}%
    \subfigure[Image A][c]{\label{fig:image-a3}%
     \includegraphics[width=0.1\linewidth]{example-image-a}}%
    \qquad
    \subfigure[Image B][c]{\label{fig:image-b3}%
      \includegraphics[width=0.2\linewidth]{example-image-b}}
  }%
\end{figure}

\subsection{Sub-Tables}
\label{sec:subtables}
There is an analogous command \verb|\subtable| for sub-tables.
It has the same syntax as \verb|\subfigure| described above.
You can reference the table using \verb|\tableref|, for example
\tableref{tab:subtabex}, or you can reference part of the table,
for example \tableref{tab:ab}. Alternatively you can reference the
subtable using \verb|\subtabref|, for example
\subtabref{tab:ab,tab:cd} in \tableref{tab:subtabex}.

\begin{table}[htbp]
\floatconts
 {tab:subtabex}
 {\caption{An Example With Sub-Tables}}
 {%
   \subtable{%
     \label{tab:ab}%
     \begin{tabular}{cc}
     \bfseries A & \bfseries B\\
     1 & 2
     \end{tabular}
   }\qquad
   \subtable{%
     \label{tab:cd}%
     \begin{tabular}{cc}
     \bfseries C & \bfseries D\\
     3 & 4\\
     5 & 6
     \end{tabular}
   }
 }
\end{table}

By default, the sub-tables are aligned on the top.
This can be changed using the second optional argument
of \verb|\subtable|. This may be \texttt{t} (top), \texttt{c}
(centered) or \texttt{b} (bottom). For example, the sub-tables
\subtabref{tab:ab2,tab:cd2} in \tableref{tab:subtabex2}
both have \verb|[c]| as the second optional argument.

\begin{table}[htbp]
\floatconts
 {tab:subtabex2}
 {\caption{Another Example With Sub-Tables}}
 {%
   \subtable[][c]{%
     \label{tab:ab2}%
     \begin{tabular}{cc}
     \bfseries A & \bfseries B\\
     1 & 2
     \end{tabular}
   }\qquad
   \subtable[][c]{%
     \label{tab:cd2}%
     \begin{tabular}{cc}
     \bfseries C & \bfseries D\\
     3 & 4\\
     5 & 6
     \end{tabular}
   }
 }
\end{table}

\subsection{Algorithms}
\label{sec:algorithms}

Enumerated textual algorithms can be displayed using the
\texttt{algorithm} environment. Within this environment,
use an \texttt{enumerate} or nested \texttt{enumerate} environments.
For example, see \algorithmref{alg:gauss}. Note that algorithms
float like figures and tables.

\begin{algorithm}[htbp]
\floatconts
{alg:gauss}% label
{\caption{The Gauss-Seidel Algorithm}}
{% contents
\begin{enumerate}
  \item For $k=1$ to maximum number of iterations
    \begin{enumerate}
      \item For $i=1$ to $n$
        \begin{enumerate}
        \item $x_i^{(k)} = 
          \frac{b_i - \sum_{j=1}^{i-1}a_{ij}x_j^{(k)}
          - \sum_{j=i+1}^{n}a_{ij}x_j^{(k-1)}}{a_{ii}}$
        \item If $\|\vec{x}^{(k)}-\vec{x}^{(k-1)}\| < \epsilon$,
          where $\epsilon$ is a specified stopping criterion, stop.
      \end{enumerate}
    \end{enumerate}
\end{enumerate}
}
\end{algorithm}

If you'd rather have the same numbering throughout the algorithm
but still want the convenient indentation of nested 
\texttt{enumerate} environments, you can use the
\texttt{enumerate*} environment provided by the \textsf{jmlr}
class. For example, see \algorithmref{alg:moore}.

\begin{algorithm}
\floatconts
{alg:moore}% label
{\caption{Moore's Shortest Path}}%caption
{% main float content
Given a connected graph $G$, where the length of each edge is 1:
\begin{enumerate*}
  \item Set the label of vertex $s$ to 0
  \item Set $i=0$
  \begin{enumerate*}
    \item \label{step:locate}Locate all unlabelled vertices 
          adjacent to a vertex labelled $i$ and label them $i+1$
    \item If vertex $t$ has been labelled,
    \begin{enumerate*}
      \item[] the shortest path can be found by backtracking, and 
      the length is given by the label of $t$.
    \end{enumerate*}
    otherwise
    \begin{enumerate*}
      \item[] increment $i$ and return to step~\ref{step:locate}
    \end{enumerate*}
  \end{enumerate*}
\end{enumerate*}
}
\end{algorithm}

Pseudo code can be displayed using the \texttt{algorithm2e}
environment. This is defined by the \textsf{algorithm2e} package
(which is automatically loaded) so check the \textsf{algorithm2e}
documentation for further details.\footnote{Either \texttt{texdoc
algorithm2e} or \url{http://www.ctan.org/pkg/algorithm2e}}
For an example, see \algorithmref{alg:net}.

\begin{algorithm2e}
\caption{Computing Net Activation}
\label{alg:net}
 % older versions of algorithm2e have \dontprintsemicolon instead
 % of the following:
 %\DontPrintSemicolon
 % older versions of algorithm2e have \linesnumbered instead of the
 % following:
 %\LinesNumbered
\KwIn{$x_1, \ldots, x_n, w_1, \ldots, w_n$}
\KwOut{$y$, the net activation}
$y\leftarrow 0$\;
\For{$i\leftarrow 1$ \KwTo $n$}{
  $y \leftarrow y + w_i*x_i$\;
}
\end{algorithm2e}

\section{Description Lists}

The \textsf{jmlr} class also provides a description-like 
environment called \texttt{altdescription}. This has an
argument that should be the widest label in the list. Compare:
\begin{description}
\item[add] A method that adds two variables.
\item[differentiate] A method that differentiates a function.
\end{description}
with
\begin{altdescription}{differentiate}
\item[add] A method that adds two variables.
\item[differentiate] A method that differentiates a function.
\end{altdescription}

\section{Theorems, Lemmas etc}
\label{sec:theorems}

The following theorem-like environments are predefined by
the \textsf{jmlr} class: \texttt{theorem}, \texttt{example},
\texttt{lemma}, \texttt{proposition}, \texttt{remark}, 
\texttt{corollary}, \texttt{definition}, \texttt{conjecture}
and \texttt{axiom}. You can use the \texttt{proof} environment
to display the proof if need be, as in \theoremref{thm:eigenpow}.

\begin{theorem}[Eigenvalue Powers]\label{thm:eigenpow}
If $\lambda$ is an eigenvalue of $\vec{B}$ with eigenvector
$\vec{\xi}$, then $\lambda^n$ is an eigenvalue of $\vec{B}^n$
with eigenvector $\vec{\xi}$.
\begin{proof}
Let $\lambda$ be an eigenvalue of $\vec{B}$ with eigenvector
$\vec{\xi}$, then
\begin{align*}
\vec{B}\vec{\xi} &= \lambda\vec{\xi}
\intertext{premultiply by $\vec{B}$:}
\vec{B}\vec{B}\vec{\xi} &= \vec{B}\lambda\vec{\xi}\\
\Rightarrow \vec{B}^2\vec{\xi} &= \lambda\vec{B}\vec{\xi}\\
&= \lambda\lambda\vec{\xi}\qquad
\text{since }\vec{B}\vec{\xi}=\lambda\vec{\xi}\\
&= \lambda^2\vec{\xi}
\end{align*}
Therefore true for $n=2$. Now assume true for $n=k$:
\begin{align*}
\vec{B}^k\vec{\xi} &= \lambda^k\vec{\xi}
\intertext{premultiply by $\vec{B}$:}
\vec{B}\vec{B}^k\vec{\xi} &= \vec{B}\lambda^k\vec{\xi}\\
\Rightarrow \vec{B}^{k+1}\vec{\xi} &= \lambda^k\vec{B}\vec{\xi}\\
&= \lambda^k\lambda\vec{\xi}\qquad
\text{since }\vec{B}\vec{\xi}=\lambda\vec{\xi}\\
&= \lambda^{k+1}\vec{\xi}
\end{align*}
Therefore true for $n=k+1$. Therefore, by induction, true for all
$n$.
\end{proof}
\end{theorem}

\begin{lemma}[A Sample Lemma]\label{lem:sample}
This is a lemma.
\end{lemma}

\begin{remark}[A Sample Remark]\label{rem:sample}
This is a remark.
\end{remark}

\begin{corollary}[A Sample Corollary]\label{cor:sample}
This is a corollary.
\end{corollary}

\begin{definition}[A Sample Definition]\label{def:sample}
This is a definition.
\end{definition}

\begin{conjecture}[A Sample Conjecture]\label{con:sample}
This is a conjecture.
\end{conjecture}

\begin{axiom}[A Sample Axiom]\label{ax:sample}
This is an axiom.
\end{axiom}

\begin{example}[An Example]\label{ex:sample}
This is an example.
\end{example}

\section{Citations and Bibliography}
\label{sec:cite}

The \textsf{jmlr} class automatically loads \textsf{natbib}.
This sample file has the citations defined in the accompanying
BibTeX file \texttt{pmlr-sample.bib}. For a parenthetical
citation use \verb|\citep|. For example
\citep{guyon-elisseeff-03}. For a textual citation use
\verb|\citet|. For example \citet{guyon2007causalreport}.
Both commands may take a comma-separated list, for example
\citet{guyon-elisseeff-03,guyon2007causalreport}.

These commands have optional arguments and have a starred
version. See the \textsf{natbib} documentation for further
details.\footnote{Either \texttt{texdoc natbib} or
\url{http://www.ctan.org/pkg/natbib}}

The bibliography is displayed using \verb|\bibliography|.

\acks{Acknowledgements go here.}
\fi 
\bibliography{bibliography}

\appendix


\end{document}
