\documentclass[pmlr]{jmlr}% new name PMLR (Proceedings of Machine Learning Research)

 % The following packages will be automatically loaded:
 % amsmath, amssymb, natbib, graphicx, url, algorithm2e
%\usepackage{amsfonts}
%\usepackage[ruled,linesnumbered]{algorithm2e}
%\usepackage{algorithmic}
%\usepackage{algorithm}
%\usepackage{array}
\usepackage{subfig}
%\usepackage{textcomp}
%\usepackage{stfloats}
%\usepackage{verbatim}
%\usepackage{cite}
%\usepackage{amsthm}
\usepackage{multirow}
\usepackage{paralist}
\usepackage{xcolor}

 %\usepackage{rotating}% for sideways figures and tables
\usepackage{longtable}% for long tables

 % The booktabs package is used by this sample document
 % (it provides \toprule, \midrule and \bottomrule).
 % Remove the next line if you don't require it.
\usepackage{booktabs}
 % The siunitx package is used by this sample document
 % to align numbers in a column by their decimal point.
 % Remove the next line if you don't require it.
\usepackage[load-configurations=version-1]{siunitx} % newer version
 %\usepackage{siunitx}

 % The following command is just for this sample document:
\newcommand{\cs}[1]{\texttt{\char`\\#1}}

 % Define an unnumbered theorem just for this sample document:
\theorembodyfont{\upshape}
\theoremheaderfont{\scshape}
\theorempostheader{:}
\theoremsep{\newline}
\newtheorem*{note}{Note}

 % change the arguments, as appropriate, in the following:
\jmlrvolume{1}
\jmlryear{2023}
\jmlrworkshop{NeurIPS 2023 Gaze Meets ML Workshop}

\title[An Attention-based Predictive Agent]{An Attention-based Predictive Agent for Handwritten Numeral/Alphabet Recognition via Generation}

%\title[Short Title]{Full Title of Article\titlebreak This Title Has A Line Break
%\titletag{\thanks{sample footnote}}}

 % Use \Name{Author Name} to specify the name.

 % Spaces are used to separate forenames from the surname so that
 % the surnames can be picked up for the page header and copyright footer.
 
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % *** Make sure there's no spurious space before \nametag ***

 % Two authors with the same address
  \author{\Name{Bonny Banerjee} \Email{bbnerjee@memphis.edu}\\ %\and
   \Name{Murchana Baruah} \Email{murchanabaruah@gmail.com}\\
   \addr Institute for Intelligent Systems, and Department of Electrical \& Computer Engineering, University of Memphis, Memphis, TN 38152, USA
 }

 % \author{\Name{Author Name1} \Email{an1@sample.com}\\
 %   \addr Address 1
 % \AND
 %   \Name{Author Name2} \Email{an2@sample.com}\\
 %   \addr Address 2
 % \AND
 %   \Name{Author Name3} \Email{an3@sample.com}\\
 %   \addr Address 3
 %  \AND
 %   \Name{Author Name4} \Email{an4@sample.com}\\
 %   \addr Address 4}

 % Three or more authors with the same address:
 % \author{\Name{Author Name1} \Email{an1@sample.com}\\
 %  \Name{Author Name2} \Email{an2@sample.com}\\
 %  \Name{Author Name3} \Email{an3@sample.com}\\
 %  \Name{Author Name4} \Email{an4@sample.com}\\
 %  \Name{Author Name5} \Email{an5@sample.com}\\
 %  \Name{Author Name6} \Email{an6@sample.com}\\
 %  \Name{Author Name7} \Email{an7@sample.com}\\
 %  \Name{Author Name8} \Email{an8@sample.com}\\
 %  \Name{Author Name9} \Email{an9@sample.com}\\
 %  \Name{Author Name10} \Email{an10@sample.com}\\
 %  \Name{Author Name11} \Email{an11@sample.com}\\
 %  \Name{Author Name12} \Email{an12@sample.com}\\
 %  \Name{Author Name13} \Email{an13@sample.com}\\
 %  \Name{Author Name14} \Email{an14@sample.com}\\
 %  \addr Address}


 % Authors with different addresses:
 % \author{\Name{Author Name1} \Email{abc@sample.com}\\
 % \addr Address 1
 % \AND
 % \Name{Author Name2} \Email{xyz@sample.com}\\
 % \addr Address 2
 %}

\editor{Editor's name}
 % \editors{List of editors' names}

\begin{document}

\maketitle

\begin{abstract}
A number of attention-based models for either classification or generation of handwritten numerals/alphabets have been reported in the literature. However, generation and classification are done jointly in very few end-to-end models. We propose a predictive agent model that actively samples its visual environment via a sequence of glimpses. The attention is driven by the agent's sensory prediction (or generation) error. At each sampling instant, the model predicts the observation class and completes the partial sequence observed till that instant. It learns where and what to sample by jointly minimizing the classification and generation errors. Three variants of this model are evaluated for handwriting generation and recognition on images of handwritten numerals and alphabets from benchmark datasets. We show that the proposed model is more efficient in handwritten numeral/alphabet recognition than human participants in a recently published study as well as a highly-cited attention-based reinforcement model. This is the first known attention-based agent to interact with and learn end-to-end from images for recognition via generation, with a high degree of accuracy and efficiency.
\end{abstract}

\begin{keywords}
Visual attention, glimpses, perception, proprioception, multimodal, handwritten numeral/alphabet recognition and generation.
\end{keywords}

%TL;DR: An attention-based predictive agent that learns to recognize handwritten numerals and alphabets by generating them.

\section{Introduction}
\label{sec:intro}
Perception and action are inextricably tied together as, in the real world, efficiency is as important as accuracy. Nature has evolved the visual system such that, to minimize resources, it learns to selectively attend to a few locations that provide information for the task at hand. This motivates our exploration of predictive agent models that observe the visual environment via a sequence of glimpses. Such agents predict, learn and act by minimizing sensory prediction error in a closed loop. 

%Our earlier work \citep{baruah2020perception,baruah2020multimodal} explored attention-based predictive agents that learn to sequentially sample their environment for spatial and spatiotemporal data generation. In this paper, we propose an attention-based predictive agent for handwritten numeral and alphabet recognition in images. The attention (action) is driven by the agent's sensory prediction error.

A number of works have explored attention-based agents that learn to sequentially sample their environment for spatial and spatiotemporal data generation. In this paper, we propose an attention-based predictive agent for handwritten numeral and alphabet recognition in images. The attention (action) is driven by the agent's sensory prediction error.

%\textbf{Related work.} 
Attention-based models can be hard or soft \citep{xu2015show,elsayed2019saccader}. Hard-attention models make decisions by processing a part of the data, sampled via a sequence of glimpses. These models can be reinforcement-based \citep[e.g.,][]{elsayed2019saccader,mnih2014recurrent}, unsupervised \citep[e.g.,][]{gregor2015draw,eslami2016attend} or supervised \citep[e.g.,][]{zheng2015neural}. Soft-attention models process the entire data but weigh the features. Supervised \citep[e.g.,][]{fukui2019attention} and unsupervised \citep[e.g.,][]{sang2020human} variants of these models have been reported. We propose a supervised (with class labels) hard-attention model that does not use any reinforcement.

Numerous attention-based models for either classification \citep[e.g.,][]{mnih2014recurrent} or generation \citep[e.g.,][]{gregor2015draw,baruah2022attention} of handwritten numerals/alphabets have been reported in the literature. However, generation and classification are done jointly in very few end-to-end models. Two models deserve mention: semi-supervised learning with generative models proposed by \citet{kingma2014semi}, and a multimodal variational autoencoder robust to missing data introduced by \citet{wu2018multimodal}. Though both models perform generation and classification of handwritten numerals (MNIST), only classification accuracy is reported by \citet{kingma2014semi} while only generation accuracy is reported by \citet{wu2018multimodal}. Further, neither of them incorporates attention, i.e., an image is not sampled as a sequence of observations but presented in its entirety.

\textbf{Contributions.} In this paper, we propose an attention-based agent model that learns to classify handwritten numerals/alphabets from images by generating them. The novelty of this work is as follows:
\begin{compactitem}
%\begin{itemize}
  \item The proposed model implements a perception-action loop to optimize an objective function. \textit{The action (attention) is modeled as proprioception in a multimodal setting} and is guided by perceptual prediction error, not by reinforcement. This kind of agent model was first introduced in \citep{baruah2020perception}, and has since been used to learn handwriting generation from images and videos \citep{baruah2022attention}, human interaction generation \citep{baruah2020multimodal}, human interaction recognition via generation \citep{baruah2023intent}, and speech emotion recognition via generation \citep{baruah2022speech}, but not for handwriting recognition. Also, no study has evaluated such a model in comparison to human efficiency.
  \item At each sampling instant, the model simultaneously classifies and completes the partial sequence of observations. Pattern completion allows prediction error computation which decides the next sampling location. Thus, attention emerges in our model and does not require learning feature weights. %, as in \cite{baruah2022speech,baruah2023intent}.
  \item In the model, the pattern completion function maps the partial sequences of perceptual and proprioceptive observations to the class label and completed perceptual pattern. Three variants of this function are proposed. Their accuracies correlate with the number of trainable parameters.
  \item The model is more efficient than the human participants in a recently published study \citep{baruah2023attentionmnist}. On average, the study participants required 4.2, 4.7 and 4.9 samples to recognize a numeral, uppercase and lowercase alphabet respectively. When exposed to the same stimuli and conditions as the participants, our model requires 2.0, 4.5 and 4.2 samples respectively. In contrast, a highly-cited attention-based reinforcement model \citep{mnih2014recurrent} falls short of human performance. %, as reported in \citep{baruah2023attentionmnist}. 
%\end{itemize}
\end{compactitem}
The rest of the paper is organized as follows. The proposed agent model is described in Section \ref{2} and evaluated on various benchmark datasets in Section \ref{3}. The paper ends with concluding remarks in Section \ref{Sec:Conclusions}.


%\cite{baruah2022speech} Attention. The attention mechanism in our model differs from most SER models from behavioral and algorithmic perspectives. Typically, end-to-end attention-based models for SER learn all parameters (including attention weights) by optimizing an objective function. In most of these models, attention is an internal mechanism that does not have a corresponding behavior. The attention parameters play a role similar to any other parameter in the model. In our model, attention is a parameterless mechanism that emerges due to prediction error, which drives action/behavior (ref. Eq. 2–4). This mechanism is interpretable as the model simply attends to its unexpected observations.


%-------------------------------------------------------------------------
\section{Models and Methods} \label{2}
%\vspace{-2mm}
\subsection{Preliminaries} \label{2_1}
%\vspace{-1mm}
\noindent\textbf{Agent.} Anything that perceives from and acts upon its environment using sensors and actuators respectively is called an agent \citep{RussellNorvig2020}.

\noindent\textbf{Perception} is the mechanism of interpreting sensory signals from the external environment by an agent \citep{han2016assessing}.

\noindent\textbf{Proprioception} is a form of perception in which the agent's environment is its own body \citep{baruah2020perception}. Internal perception of position, movement, and motion of body parts is due to proprioception \citep{han2016assessing}.

\noindent\textbf{Generative model.} Given a set of data points $x$, a generative model $p_{model}$ with parameters $\theta$ maximizes the log-likelihood, $\log p_{model}(x;\theta)$, of the data.

\noindent\textbf{Evidence lower bound (ELBO).} Let the data $x$ be generated by a latent continuous random variable $z$. Then, computing the log-likelihood requires evaluating the marginal likelihood, $\int p_{model}(x,z)\, dz$, which is intractable \citep{kingma2013auto}. In variational inference, an approximation of the intractable posterior is optimized by defining an evidence lower bound (ELBO) on the log-likelihood, $\mathcal{L}(x;\theta) \leq \log p_{model}(x;\theta)$.

\noindent\textbf{Variational autoencoder (VAE)} is a multilayered generative model. It assumes an isotropic Gaussian prior, $p_{\theta}(z)$, and i.i.d. data samples. VAE maximizes the following ELBO \citep{kingma2013auto}: %$\mathcal{L}(x;\theta) \leq \mathbb{E}_{q_{\phi}(z|x)}[\log p_{\theta}(x|z)] - D_\textnormal{KL}[q_{\phi}(z|x),p_{\theta}(z)]$, 
\begin{equation}
\mathbb{E}_{q_{\phi}(z|x)}[\log p_{\theta}(x|z)] - D_{\textnormal{KL}}[q_{\phi}(z|x),p_{\theta}(z)]
\end{equation}
\noindent where $p_{\theta}(x|z)$ and $q_{\phi}(z|x)$ are generative and recognition models respectively, $\mathbb{E}$ denotes expectation, and $D_{\textnormal{KL}}$ denotes Kullback-Leibler divergence. The first and second terms capture accuracy and complexity respectively. The negative of this ELBO is also known as \textit{variational free energy}, minimization of which has been hypothesized as a general principle guiding brain function \citep{friston2010free}. 

\noindent\textbf{Saliency} lies in the eyes of an agent. Saliency of a location in an environment is a function of its neighborhood and an agent's internal model \citep{spratling2012predictive,friston2009reinforcement}.

%\vspace{-1mm}
\subsection{Problem Statement} \label{2_2}
Let an environment in $m$ modalities be represented by a set of observable variables $\textbf{X} = \{\textbf{X}^{(1)}, \textbf{X}^{(2)}, \ldots, \textbf{X}^{(m)}\}$. The variable representing the $i$-th modality is a sequence: $\textbf{X}^{(i)} = \langle X^{(i)}_1, X^{(i)}_2, \ldots, X^{(i)}_T \rangle$, where $T$ is the sequence length. Let $\textbf{x}_{\le t} = \{\textbf{x}^{(1)}, \textbf{x}^{(2)}, \ldots, \textbf{x}^{(m)}\}$ be a partial observation of $\textbf{X}$ such that $\textbf{x}^{(i)} = \langle x^{(i)}_1, \ldots, x^{(i)}_t \rangle$, $1\leq t\leq T$. Let $y$ represent the class label.
%\textbf{Pattern completion:} As in \citep{baruah2020perception}, we define \emph{pattern completion} as the problem of accurately generating $\textbf{X}$ from its partial observation $\textbf{x}_{\le t}$. Given $\textbf{x}_{\le t}$ and a generative model $p_{\theta}$ with parameters $\theta$ and latent variables $z_{\le t}$, the generative process of $\textbf{X}$ is given as $%\sum_{t=1}^{T} p_{\theta}(\textbf{X}|\textbf{x}_{\le t}) = \int \sum_{t=1}^{T}p_{\theta}(\textbf{X}|\textbf{x}_{\le t}, z_{\le t}; \theta)p_{\theta}(z_{\le t}) d\textbf{z} p_{\theta}(\textbf{X}|\textbf{x}_{\le t}) = \int p_{\theta}(\textbf{X}|\textbf{x}_{\le t}, z_{\le t}; \theta)p_{\theta}(z_{\le t}) dz$. The objective for pattern completion at any time $t$ is to maximize the log-likelihood of $\textbf{X}$, i.e. $\displaystyle \arg \max _{\theta} \int log(p_{\theta}(\textbf{X}|\textbf{x}_{\le t}, z_{\le t}; \theta)p_{\theta}(z_{\le t})) dz$.

\sloppy
We define \emph{pattern completion and classification} as the problem of accurately generating $\textbf{X}$ and $y$ from the partial observation $\textbf{x}_{\le t}$. Given $\textbf{x}_{\le t}$ and a generative model $p_{\theta}$ with parameters $\theta$ and latent variables $z_{\le t}$, the objective for pattern completion and classification at any time $t$ is to maximize the joint log-likelihood of $\textbf{X}$ and $y$, i.e., $\displaystyle \arg \max _{\theta} \int \log(p_{\theta}(\textbf{X},y|\textbf{x}_{\le t}, z_{\le t}; \theta)p_{\theta}(z_{\le t})) dz$.

\subsection{Models} \label{2_models}
We solve the problem in three distinct ways as follows.

\textbf{Model M1} (ref. Fig. \ref{Fig:Model1}): The completed pattern and class label are generated from the latent variables. Mathematically, $\displaystyle \arg \max _{\theta} \int \log(p_{\theta}(\textbf{X}|\textbf{x}_{\le t}, z_{\le t}; \theta)p_{\theta}(z_{\le t})) dz + \displaystyle \arg \max _{\theta} \int \log(p_{\theta}(y|\textbf{x}_{\le t}, z_{\le t}; \theta)p_{\theta}(z_{\le t})) dz$. The model is trained end-to-end.

\textbf{Model M2} (ref. Fig. \ref{Fig:Model2}): The class label is inferred from the partial observation. The latent variables are inferred from the class label and partial observation, as in \citep{kingma2014semi}. Mathematically, $\displaystyle \arg \max _{\theta} \int \log(p_{\theta}(\textbf{X}|\textbf{x}_{\le t}, z_{\le t}; \theta)p_{\theta}(z_{\le t})) dz + \arg \max _ {\phi} \log q_{\phi}(y_t|\textbf{x}_{\leq t})$, where $q_{\phi}$ is a recognition model. The model is trained end-to-end.

\textbf{Model M3} (ref. Fig. \ref{Fig:Model3}): The class label is inferred from the completed pattern which is generated from the latent variables. The pattern completion model is trained first, $\displaystyle \arg \max _{\theta} \int \log(p_{\theta}(\textbf{X}|\textbf{x}_{\le t}, z_{\le t}; \theta)p_{\theta}(z_{\le t})) dz$. Then the classification model is trained, $\displaystyle \arg \max _{\pi} \log q_{\pi}(y|\textbf{X})$.


\begin{figure}[htbp!]
  \centering
	\includegraphics[width=0.75\textwidth]{BlockDiagram_Overall.png}
	\caption{Different components of the proposed agent. Implementation of the pattern completion block is shown in Fig. \ref{Fig:Three models}.}
	\label{Fig:Block diagram}
\end{figure}

% \begin{figure*}[t!]
%   \centering
%   \subfloat[Model M1.]{\includegraphics[width=0.275\textwidth]{Model1.png}\label{Fig:Model1}}\hfill
%   \subfloat[Model M2.]{\includegraphics[width=0.365\textwidth]{Model2.png}\label{Fig:Model2}}\hfill
%   \subfloat[Model M3.]{\includegraphics[width=0.275\textwidth]{Model3.png}\label{Fig:Model3}}
%   \caption{Three variations for implementing the pattern completion block in Fig. \ref{Fig:Block diagram}.}
%   \label{Fig:Three models}
% \end{figure*}

\begin{figure*}[htbp!]
  \centering
  \hspace{-4mm}
  \subfloat[Model M1.]{\includegraphics[width=0.29\textwidth]{Model1.png}\label{Fig:Model1}}\hfill
  \subfloat[Model M2.]{\includegraphics[width=0.38\textwidth]{Model2.png}\label{Fig:Model2}}\hfill
  \subfloat[Model M3.]{\includegraphics[width=0.29\textwidth]{Model3.png}\label{Fig:Model3}}
  \caption{Three variants for implementing the pattern completion block in Fig. \ref{Fig:Block diagram}.}
  \label{Fig:Three models}
\end{figure*}


\begin{table}[b!]
  \caption{Variable dimensions as used in this paper. Here $(.)^{(1)}$, $(.)^{(2)}$ refer to visual perception and visual proprioception respectively; $T$ is maximum number of glimpses, $t$ is glimpse index or time, $n\times n$ is patch size, $M\times M$ is image size.} \label{Table:Symbols}
  \centering
       \begin{tabular}{|c|c|c|c|} \hline
        $x^{(1)}_t$ & $x^{(2)}_t$ & $X_t$ & $S_t$ \\ \hline                
        $\{0,1\}^{n \times n}$ & $\mathbb{R}^{2}$ & $\{0,1\}^{M \times M}$ & $\mathbb{R}^{M \times M}$ \\ \hline                   
        \end{tabular}
 \end{table}


\subsection{Agent Architecture} \label{2_3}
As shown in the block diagram in Fig. \ref{Fig:Block diagram}, environment, observation, pattern completion, classification, action selection and learning are the six components of the proposed agent architecture.

\textbf{1. Environment.} The environment is the source of sensory data. We consider a static environment (images) in this work.

\textbf{2. Observation.} Our agent sequentially samples its environment in two modalities: visual perception and visual proprioception. The 2D coordinates of the fixation location in the environment constitute the proprioceptive observation while the visual stimuli at that location constitute the corresponding perceptual observation, as in \citep{friston2012perceptions}. See Table \ref{Table:Symbols} for variable dimensions.

\textbf{3. Pattern completion.} At each sampling instant, the partial observation till that instant is completed using a multimodal variational recurrent neural network (MVRNN). Recognition and generation are the two processes involved in the operation of an MVRNN. %\citep{wu2018multimodal}  %(ref. Line 6 in Algorithm \ref{algo1}).

\sloppy
\textit{Recognition (Encoder).} The recognition model, $q_{\phi}(z_{t}|\textbf{x}_{\le t})$ for M1 and M3, and $q_{\phi}(z_{t}|\textbf{x}_{\le t}, y_t)$ for M2, is a probabilistic encoder \citep{kingma2013auto}. It produces a Gaussian distribution over the possible values of the code $z_{t}$ from which the given observations could have been generated.\\
\textbf{Model M1:}  Two RNNs, each with one layer of long short-term memory (LSTM) units, constitute the recognition model. Each RNN infers the parameters of the approximate posterior distribution for each modality.\\
\textbf{Model M2:} In addition to the perceptual and proprioceptive modalities, the class label is an input modality. A fully-connected layer maps the class labels (inferred label $\hat{y}$ or given label $y$) to the parameters ($\mu^{(3)}$, $\Sigma^{(3)}$) of the approximate posterior density for the class label modality (ref. Fig. \ref{Fig:Model2}).\\
\textbf{Model M3:} Same as M1.

The parameters for all modalities are combined using product of experts (PoE) \citep{wu2018multimodal} to generate the joint distribution for the approximate posterior, $q_{\phi}(z_t|\textbf{x}_{\le t})$ for M1 and M3, and $q_{\phi}(z_t|\textbf{x}_{\le t},y_{t})$ for M2. 

The prior can be sampled from a standard normal distribution $p_{\theta}(z_t) \sim \mathcal{N}(0,1)$, as in \citep{gregor2015draw}. The function of the encoder is shown in Lines 1--3 of Algorithm \ref{algo2} and Lines 3--9 of Algorithm \ref{algo3} (see Appendix \ref{Appendix:Loss function derivation and pseudo code}), where $\mathit{RNN}^{enc}_{\phi}$ represents the function of an LSTM unit, and $\varphi^{enc}$ is a function that returns the mean and the logarithm of the standard deviation as a linear function of the hidden state, as in \citep{chung2015recurrent}. % or from the model as in \citep{chung2015recurrent}. %Each RNN generates the parameters for the approximate posterior distribution and the prior distribution for each modality, as in \citep{chung2015recurrent}. The parameters from each modality and for each distribution are combined using product of experts (PoE), as in \citep{wu2018multimodal}, to generate the joint distribution parameters (see Fig. \ref{f11}) for both the prior $p_{\theta}(z_t)$ and the approximate posterior $q_{\phi}(z_t|x^{(1)}_{\le t}, x^{(2)}_{\le t})$.\\

\textit{Generation (Decoder).}\\ 
\textbf{Model M1:} The model, $p_{\theta}(X_{t}, y_{t}|\textbf{x}_{\leq t},z_{\le t})$, generates the perceptual data and the class label from the latent variables, $z_t$, at each time step. The generative model consists of two RNNs, each with one layer of hidden LSTM units.\\
%Model 2: The generative model, $p_{\theta}(X^{(1)}_{t}|\textbf{x}_{\leq t},z_{\le t},y_{\le t})$, generates the perceptual data from the latent variables, $z_t$. The generative model has one RNN with one layer of hidden LSTM units. 
\textbf{Model M2:} The model, $p_{\theta}(X_{t}|\textbf{x}_{\leq t},z_{\le t})$, generates the perceptual data from the latent variables, $z_t$. The generative model consists of one RNN with a single layer of hidden LSTM units.\\
\textbf{Model M3:} Same as M2.

Each RNN generates the parameters of the data distribution for a modality. The data is sampled from this distribution which can be multivariate Gaussian or Bernoulli. In our model, both $X_t$ and $y_t$ are sampled from a multivariate Bernoulli distribution with means inferred by the corresponding decoder RNN. In order to generate the perceptual data at any time step, the output from the perceptual RNN at the previous time step is added to the current perceptual RNN output before applying the sigmoid function, as in \citep{gregor2015draw}. The decoder equations are shown in Lines 5--8 of Algorithm \ref{algo2} and Lines 11--12 of Algorithm \ref{algo3} (see Appendix \ref{Appendix:Loss function derivation and pseudo code}), where the function $\mathit{RNN}^{dec}_{\theta}$ is the same as $\mathit{RNN}^{enc}_{\phi}$. %For time-varying data (video), $X^{(i)}_1, X^{(i)}_2$ represent a pair of adjacent frames; for static data (image), it represents the same image, for the $i^{th}$ modality.

\textbf{4. Classification.}\\
\textbf{Model M1:} The decoder infers the class label as a separate modality for each time step (ref. M1 in Generation (Decoder)).\\
\textbf{Model M2:} The class labels are inferred from the partial observations, $\textbf{x}_{\leq t}$, at every time step. An RNN with LSTM units is used as a hidden layer, along with a softmax classifier. The function of the classifier is shown in Lines 1--2 of Algorithm \ref{algo3} (see Appendix \ref{Appendix:Loss function derivation and pseudo code}).\\
\textbf{Model M3:} A classifier\footnote{We used a CNN classifier with code borrowed from \url{https://chromium.googlesource.com/external/github.com/tensorflow/tensorflow/+/r0.10/tensorflow/g3doc/tutorials/mnist/pros/index.md}.} is trained separately to infer the class labels from the perceptual data. During training, the input to the classifier is the true perceptual data. During testing, the input is the predicted perceptual data.


\textbf{5. Action selection.} In our model, action selection is to decide the location in the environment to sample from. At any time $t$, a saliency map $S_{t}$ is computed which assigns a salience score $S_{t}^{(\ell)}$ to each location $\ell$. %{\color{red}{For Model 1,}}
\begin{equation}\label{equ:saliency1}
%S_{t} = D_{KL}(p(X_{t+1}^{(1)})||p_{\theta}(X_{t+1}^{(1)}|z_{\le t},{x_{\le t}}))
S_{t}^{(\ell)} = D_{KL}(p(X_{t+1, \ell})||p_{\theta}(X_{t+1, \ell}|z_{\le t},\textbf{x}_{\le t}))
\end{equation}
\noindent where $p(X_{t+1,\ell})$ is the true data distribution at location $\ell$ and is sampled from a Bernoulli distribution. KL divergence, also known as \textit{relative entropy}, is a measure of information gain achieved by using the true distribution, $p(X_{t+1,\ell})$, instead of the predicted distribution, $p_{\theta}(X_{t+1, \ell}|z_{\le t},\textbf{x}_{\le t})$. %{\color{red}{or $p_{\theta}(X_{t+1, \ell}^{(1)}|z_{\le t},\textbf{x}_{\le t}, y_{t})$}}. 
Thus, the saliency map is a function of the prediction error. The most salient location is computed from this saliency map which constitutes the sampling location. 

The saliency map is smoothed using a Gaussian kernel $\mathcal{N}(.,\sigma)$. The sampling location is chosen as: 
\begin{equation}\label{equ:sampling location}
\ell_t = \underset{\ell\in\{1,2,\ldots,M^2\}}{\mathrm{argmax}} \; \big[\text{conv}(\mathcal{N}(\cdot,\sigma), S_{t})\big]^{(\ell)}
\end{equation}
\noindent where $\sigma=2$. Each sample is an $n\times n$ patch centered at $\ell_t$. %Eq. \ref{equ:sampling location} is shown as function $g_2$ in Line 8 of Algorithm \ref{algo1}.

The salient location $\ell_t$ at any time $t$ is the proprioceptive observation $x^{(2)}_{t+1}$ for time $t+1$. Hence, prediction error (saliency) guides the sampling of a scene in our model. Unlike typical multimodal models, the two modalities in our model interact at the observation level as the perceptual prediction error provides the observation for the visual proprioceptive modality. The most salient location is the location that yields the maximum information gain in the environment. It is the location where the agent's prediction error is the highest given all the past observations. The agent attends to such locations to update its internal model. 

\textbf{6. Learning.} The objective is to maximize Eqs. \ref{eq:obj1}, \ref{eq:obj2} and \ref{eq:obj3} for M1, M2 and M3 respectively. They can be derived from the objectives for multimodal VAE \citep{wu2018multimodal}, variational RNN \citep{chung2015recurrent} and VAE for classification \citep{kingma2014semi}. See Appendix \ref{Appendix:Loss function derivation and pseudo code} for derivations. %This objective function is obtained by modifying the objective for multimodal VAE (Eq. 2 in \citep{wu2018multimodal}) with variational RNN (Eq. 1 in \citep{chung2015recurrent}).
\begin{equation}\label{eq:obj1}
\mathbb{E}_{q_{\phi}(z_{\leq T}|\textbf{x}_{\leq T})} \Big[ \sum_{t=1}^{T} \lambda_1 \log p_{\theta}(X_{t}|z_{\leq t},\textbf{x}_{\leq t})
+ \lambda_2 \log p_{\theta}(y_t|z_{\leq t}, \textbf{x}_{\leq t}) \Big]
- \sum_{t=1}^{T} \beta D_{KL} \big( q_{\phi}(z_{t}|\textbf{x}_{\leq t}), p_{\theta}(z_{t}) \big)
\end{equation}
\noindent where $\lambda_1$, $\lambda_2$, $\beta$ are the weights balancing the terms. 
\begin{align}\label{eq:obj2}
\mathbb{E}_{q_{\phi}(z_{\leq T}|\textbf{x}_{\leq T},y_{\leq T})} \Big[ \sum_{t=1}^{T} \log p_{\theta}(X_{t}|z_{\leq t},\textbf{x}_{\leq t}) + \log p_{\theta}(y_{t})\Big]
 - \sum_{t=1}^{T} D_{KL} \big( q_{\phi}(z_{t}|\textbf{x}_{\leq t},y_{t}), p_{\theta}(z_{t}) \big) \nonumber\\ 
 + \sum_{t=1}^{T} \alpha \log q_{\phi}(y_t|\textbf{x}_{\leq t})
\end{align}
\noindent where $\alpha$ controls the relative weight between generative and purely discriminative learning.
\begin{equation}\label{eq:obj3}
\mathbb{E}_{q_{\phi}(z_{\leq T}|\textbf{x}_{\leq T})} \Big[ \sum_{t=1}^{T} \log p_{\theta}({X}_{t}|z_{\leq t},\textbf{x}_{\leq t}) \Big]
  - \sum_{t=1}^{T} D_{KL} \big( q_{\phi}(z_{t}|\textbf{x}_{\leq t}), p_{\theta}(z_{t}) \big) + \log q_{\pi}(y|X)
\end{equation}
\noindent where $q_{\pi}(y|X)$ is the classification model whose input is the entire image (completed pattern) and not a sequence of observations. Hence the subscript $t$ is dropped.

%The pseudocode of our model is stated in Algorithm \ref{algo1}.
We assume a one-to-one mapping between the agent's body and its environment, i.e., between the oculomotor muscles and the locations in the image. This assumption allows us to map from the perceptual space $\ell$ to the proprioceptive space $x^{(2)}$ using a simple function $g_{3}$ (ref. Line 9 of Algorithm \ref{algo1}).


\iffalse
\subsection{Evaluation of attention-based models}
We consider a highly-cited reinforcement model, recurrent attention model (RAM) \citep{mnih2014recurrent}, that reports experimental results on the MNIST dataset. This model sequentially samples an image and decides where to sample next at each sampling instant, making it appropriate to evaluate it using the collected data.\\
\textbf{RAM} classifies images using a sequence of glimpses. The next location is chosen stochastically from a distribution parameterized by a location network. The model is trained end-to-end by maximizing the following objective \citep{mnih2014recurrent}:
\begin{equation} \label{e2}
\frac{1}{M} \sum_{i=1}^{M} \sum_{t=1}^{T}\Delta_{\theta}\log \pi(u_{t}^{i}|x_{1:t}^i;\theta)(R^{i}_{t}-b_{t})
\end{equation}
\noindent where $M$ is the number of episodes, $T$ is the number of observations, $x_{1:t}^{i}$ \textbf{are the interaction sequences obtained by running the current agent} till $i$ episodes, $u_{t}^{i}$ is the current action, $\theta$ is the set of trainable parameters, $R^{i}_{t}$ is the cumulative reward, $b_t$ is a baseline, and $\pi(u_{t}^{i}|x_{1:t}^i;\theta)$ is the policy. RAM's behavior may be compared with the participants' by comparing the fixation maps obtained from the sequence of locations predicted by RAM and those chosen by the participants. A fixation map is computed by assigning each location a value equal to the frequency of its selection, and then normalizing those values to create a distribution over all locations.
\fi

\subsection{Metrics for comparing fixation maps}
In order to evaluate the action mechanism of our model, we compare the fixation map obtained from the sequence of locations sampled by our model with the fixation map obtained from the participants' data \citep{baruah2023attentionmnist}. A fixation map is computed by assigning each location a value equal to the frequency of its selection, and then normalizing the values to create a distribution over all locations.

For metrics comparing two fixation maps, $P$ and $Q$, we closely follow \citep{bylinskii2018different}. We use three distribution-based metrics: KL divergence (KL), Pearson correlation coefficient (CC), and Similarity (SIM), to compare the distribution of sampling locations from a model with that from the participants as recorded in the collected data.\\
\textbf{KL divergence.} \citep{bylinskii2018different} Given two image distributions, $P$ and $Q$, the KL divergence $\mathit{KL}(P,Q)$ measures the loss of information when $Q$ is used to approximate $P$. This is calculated for each pixel $k$ as: $\mathit{KL}(P_k,Q_k) = P_k \log\Big(\epsilon + \frac{P_k}{Q_k+\epsilon}\Big)$, where $\epsilon$ is a very small real number. Lower KL divergence for $k$ implies $P_k$ and $Q_k$ are similar. KL divergence is highly sensitive to zero values.\\
\textbf{CC} evaluates the linear relationship between two maps as \citep{bylinskii2018different}: $\mathit{CC}(P,Q) = \frac{\sigma(P,Q)}{\sigma(P)\sigma(Q)}$, where $\sigma(P,Q)$ is the covariance of $P$ and $Q$, and $\sigma(P)$ and $\sigma(Q)$ are their standard deviations. Since CC is symmetric, it fails to infer whether differences between fixation maps are due to false positives or false negatives.\\
\textbf{SIM} is measured as \citep{bylinskii2018different}: $\mathit{SIM}(P,Q) = \sum_{k} \min(P_{k},Q_{k})$, where $\sum_{k}P_{k} = \sum_{k}Q_{k} = 1$. Like CC, SIM is symmetric and inherits the same drawback. Also, SIM is very sensitive to missing values, and penalizes predictions that fail to account for the ground truth density.

These metrics do not compare the sequence of fixations. This is inconsequential in the current work because recognizing a numeral or alphabet does not require sampling the image in a particular order. \cite{baruah2022attention} have shown that a predictive agent saccades when exposed to images of handwritten numerals or alphabets, and tracks when exposed to videos of the formation of the same handwritten numerals. In both cases, the agent learns to complete the proprioceptive pattern or the sequence of expected salient (or sampling) locations. See Fig. 2 in \citep{baruah2022attention}. The agent model in the current work also learns to complete the proprioceptive pattern in the same way, though this is not shown.


\begin{table*}
  \caption{Evaluation of fixation maps from RAM and our model (Model 1) for the stimuli presented in the MTurk experiments, averaged over all classes and samplings. Standard deviations are included in parentheses.} \label{t2}
  \centering
  \resizebox{\columnwidth}{!}{
  \begin{tabular}{|l|l|l|l|l|l|l|}
  \hline
\multirow{2}{*}{Metric}  & \multicolumn{2}{|c|}{MNIST} & \multicolumn{2}{|c|}{EMNIST uppercase} & \multicolumn{2}{|c|}{EMNIST lowercase} \\ \cline{2-7}
     &    Our model (M1)    & RAM         &  Our model (M1)         & RAM       & Our model (M1)       & RAM             \\ \hline
KL    & $22.44 (7.50)$  & $22.50 (7.48)$  & $22.90 (7.55)$ &  $22.96 (7.24)$   & $22.30 (7.37)$  & $22.23 (7.16)$ \\ \hline
CC    & $0.02 (0.01)$   & $0.01 (0.00)$   & $0.02 (0.01)$    & $0.01 (0.00)$   & $0.02 (0.01)$    &  $0.01 (0.00)$ \\ \hline
SIM     &  $0.18 (0.11)$   & $0.17 (0.09)$    & $0.16 (0.10)$  &  $0.16 (0.07)$     & $0.18 (0.10)$  & $0.18 (0.09)$\\ \hline
  \end{tabular}
  }
\end{table*}


\begin{figure}[t!]
  \centering
  %\vspace{-5mm}
      \subfloat[Participants]{\includegraphics[width=0.25\linewidth]{mturk_dist.png}\label{r_f1_mnist_participants}}\hfill
      \subfloat[Our model (M1)]{\includegraphics[width=0.25\linewidth]{vrnn_dist.png}\label{r_f1_mnist_m1}}\hfill
       \subfloat[RAM]{\includegraphics[width=0.25\linewidth]{ram_dist.png}\label{r_f1_mnist_ram}}\\

           \subfloat[Participants]{\includegraphics[width=0.25\linewidth]{mturk_dist_e1.png}\label{r_f1_upper_participants}}\hfill   
      \subfloat[Our model (M1)]{\includegraphics[width=0.25\linewidth]{vrnn_dist_e1.png}\label{r_f1_upper_m1}}\hfill
      \subfloat[RAM]{\includegraphics[width=0.25\linewidth]{ram_dist_e1.png}\label{r_f1_upper_ram}}\\

           \subfloat[Participants]{\includegraphics[width=0.25\linewidth]{mturk_dist_e2.png}\label{r_f1_lower_participants}}\hfill
      \subfloat[Our model (M1)]{\includegraphics[width=0.25\linewidth]{vrnn_dist_e2.png}\label{r_f1_lower_m1}}\hfill
      \subfloat[RAM]{\includegraphics[width=0.25\linewidth]{ram_dist_e2.png}\label{r_f1_lower_ram}}
 
    \caption{Comparison of the distribution of the sequence of fixations over a class for different cases; classes `9', `B', `m' are shown in rows 1 to 3 respectively. The fixations are scattered in the case of RAM, whereas our model shows a pattern similar to that of the participants' data.}
  \label{r_f1}
\end{figure}

%------------------------------------------------------------------------
\section{Experimental Results} \label{3}

\subsection{Datasets} 

Our model is evaluated using the following datasets:\\
(1) MNIST \citep{lecun1998gradient} is a dataset of handwritten numerals $\{0, 1, \ldots, 9\}$, consisting of 60,000 training and 10,000 test images ($28 \times 28$ pixels).\\
(2) EMNIST \citep{cohen2017emnist} is a balanced dataset of handwritten English alphabets in uppercase and lowercase, consisting of 124,800 training and 20,800 test images ($28 \times 28$ pixels).\\
(3) AttentionMNIST \citep{baruah2023attentionmnist} is a dataset\footnote{We downloaded the AttentionMNIST dataset from https://github.com/Murchana/AttentionMNIST.} consisting of a sequence of time-stamped samples from MNIST and EMNIST datasets, collected from human participants using MTurk. Each sample consists of: (1) the location in the image selected by the participant, (2) the class(es) selected by the participant, and (3) the time taken by the participant to register the current sample (i.e. the time elapsed between registering the last and current samples). The total time allowed to each participant for sampling $T=12$ locations of an image is limited to six minutes. This data is recorded from 15 distinct stimuli from each class for MNIST, EMNIST uppercase, and EMNIST lowercase letters. The dataset is collected from 382 distinct participants. It consists of 1736 samples from MNIST, 4431 samples from EMNIST uppercase, and 4315 samples from EMNIST lowercase, and 169.1 responses per class on average.


\subsection{Experimental setup} 

The generative, recognition and classification models consist of 512, 128, 128 hidden units respectively. The latent variable dimension is 20. These parameters are estimated experimentally, and are consistent with model parameters reported in the literature. For example, the multimodal model in \citep{wu2018multimodal} uses latent variable dimension of 64 and two MLP hidden layers of 512 units each for MNIST generation and classification, the model in \citep{gregor2015draw} uses latent variable dimension of 100 and an RNN hidden layer of 256 units for MNIST generation, and the model in \citep{mnih2014recurrent} uses an RNN hidden layer of 256 units for MNIST classification. 

The maximum number of glimpses is $T=12$, and the minibatch size is 100. The parameters $\beta$ and $\lambda_{1}$ are fixed to 1, while $\lambda_{2}$ and $\alpha$ are fixed to 5000. The model is learned end-to-end using backpropagation and Adam optimization \citep{kingma2014adam} with a learning rate of $10^{-3}$. These hyperparameters are estimated via cross-validation using 10,000 images from the training set. The first observation is sampled from the center pixel of an image, as in the participants' data \citep{baruah2023attentionmnist}. 

We use a dropout probability of $0.7$ to prevent overfitting. The dropout is applied at the decoder hidden layers for all the modalities in M1 and M3, and both the decoder hidden layer and the classification hidden layer for M2. Additionally, the KL divergence term in the objective function also acts as a regularizer \citep{kingma2013auto} that prevents overfitting.

\iffalse
\subsubsection{Baseline}
Baseline refers to the case where the entire image is sampled by our model at any glimpse, i.e. it observes 100\% of the ground truth.
Here we evaluate the contribution of proprioceptive modality in our model. We define a variant of our model by eliminating the proprioceptive modality at input (observation) and output (generation), keeping rest of the model unchanged. 
\fi


\subsubsection{Evaluation} 
The quality of the generated images is evaluated using negative log-likelihood (NLL), as in \citep{gregor2015draw}, and the class prediction is evaluated by classification accuracy. The three metrics, KL, CC and SIM, are used to evaluate the fixation maps obtained from the sequence of sampled locations. The efficiency of the model is evaluated by the number of glimpses required for accurate prediction, on the sampled MNIST and EMNIST datasets \citep{baruah2023attentionmnist}.

As in \citep{baruah2023attentionmnist}, we compare the efficiency and fixation maps with a highly-cited reinforcement model, recurrent attention model (RAM) \citep{mnih2014recurrent}, that reports experimental results on the MNIST dataset. RAM classifies images using a sequence of glimpses. The next location is chosen stochastically from a distribution parameterized by a location network. For a fair comparison with the participants, in RAM\footnote{We use the RAM implementation from github.com/hehefan/Recurrent-Attention-Model.}, we fixed the sequence length at $T=12$, the first sampling location at the image center, the input observation to a $5\times 5$ patch with the selected location as its center, and modified the reward function according to the experimental setup in \citep{baruah2023attentionmnist}.
%by eq. \ref{Equ:MTurk reward}. The cumulative reward, $R_t$ in eq. \ref{e2}, is replaced by the cumulative score $\sum_{\tau=1}^{t} P_{\tau}$ obtained from eq. \ref{Equ:MTurk reward}. 

%In order to test whether training the classifier together with the generative model is effective, we remove the classification modality from Model 1. Initially, the generative model is trained. Then,\\
%\textbf{Model 3:} A convolutional neural network \footnote{We use the code from https://chromium.googlesource.com/external/github.com/tensorflow/tensorflow/+/r0.10/tensorflow/g3doc/tutorials/mnist/pros/index.md.} is used to classify the generated data. \\
In addition to the three variants (M1, M2, M3), we include one more variation of our model in which the generative model is trained as in M3, and then an RNN with LSTM units is used to classify the data from the latent variables. We refer to this as \textbf{Model M4}. Unless otherwise stated, ``our model'' refers to M1 throughout the rest of the paper.
%\textbf{Model 4:} A RNN is used to classify the data from the latent variables.

\begin{table}[t!]
  \caption{Classification accuracy and NLL on the test set reported after the final glimpse.} \label{t1}
  \centering
  \begin{tabular}{|l|l|l|l|}
  \hline
    Dataset & Variants of the proposed model & Accuracy (\%) & NLL ($\leq$)\\
%            &                       &          &      \\   
\hline \hline
   MNIST    & M1               &        96.3         & 76.5\\
                  &  M2              &        92.3         &  107.0 \\
                  &  M3 (pretrained)             &        94.6             & 76.1 \\
                   &  M4 (not end-to-end)             &       82.9           &  76.1 \\
%            & w/o pro. o/p              &  1314.62  &  \\ 
\hline \hline
   EMNIST   & M1               &       90.2      &  125.8 \\
                   & M2                &       80.4     &  82.6 \\
                   & M3 (pretrained)               &       88.5     & 78.9 \\
                   & M4 (not end-to-end)               &     75.4       & 78.9  \\
%            & w/o pro. o/p              & 1573.6                  & \\ 
\hline
  \end{tabular}
\end{table}

\begin{table}[t!]
  \caption{Classification accuracy and NLL on the stimuli presented to the participants in \citep{baruah2023attentionmnist}, reported after the final glimpse.} \label{t1_1}
  \centering
  \begin{tabular}{|l|l|l|l|}
  \hline
    Dataset & Variants of the proposed model & Accuracy (\%) & NLL ($\leq$)\\
%            &                       &          &      \\   
\hline \hline
   MNIST    & M1               &        100        & 71.3\\
                  &  M2              &        96         &  102.5 \\
                  &  M3 (pretrained)             &         98.7      & 71.8 \\
                  &  M4 (not end-to-end)              &        20.7       &  71.8\\
       %           & RAM                          &      98.7          &  -  \\
%            & w/o pro. o/p              &  1314.62  &  \\ 
\hline \hline
   EMNIST   & M1              &    98.7              & 129.7  \\
   upp.          & M2               &   90.2              &    91.7 \\
                   & M3 (pretrained)              &     98.7        &    83.9 \\
                   & M4 (not end-to-end)               &      76.9                 &   83.9 \\
             %       & RAM                          &      98.7          &  -  \\
%            & w/o pro. o/p              & 1573.6                  & \\ 
\hline\hline
   EMNIST   & M1              &   95.6              & 111.0 \\
     low.         & M2               &   85.4           & 66.8   \\
                   & M3 (pretrained)              &      96.9        &   62.3\\
                   & M4 (not end-to-end)               &    74.9          &  62.3 \\  \hline
               %     & RAM                          &      98.7          &  -  \\\hline
  \end{tabular}
\end{table}

\iffalse
\begin{table}[t!]
  \caption{Classification accuracy and the NLL on the test set reported after the final glimpse.} \label{t1}
  \centering
  \begin{tabular}{|l|l|l|l|}
  \hline
    Dataset & Variants of        &    Accuracy  & NLL\\
            & proposed model     &    \%   & $\leq$\\
%            &                       &          &      \\   
\hline \hline
   MNIST    & M1               &        95.9          & 77.9\\
                  &  M2              &        88.9         &  81.5 \\
                  &  M3 (pretrained)              &         94.0          & 77.1 \\
                   &  M4 (not end-to-end)              &        83.9          &  77.1 \\
%            & w/o pro. o/p              &  1314.62  &  \\ 
\hline \hline
   EMNIST   & M1               &       89.7      &  118.4 \\
                   & M2                &       76.9      &  82.8 \\
                   & M3 (pretrained)                &      88.6      & 75.8 \\
                   & M4 (not end-to-end)                &      76.4               & 75.8  \\
%            & w/o pro. o/p              & 1573.6                  & \\ 
\hline
  \end{tabular}
\end{table}

\begin{table}[t!]
  \caption{Classification accuracy and the NLL on the stimuli presented to the participants as in \citep{baruah2023attentionmnist} reported after the final glimpse.} \label{t1_1}
  \centering
  \begin{tabular}{|l|l|l|l|}
  \hline
    Dataset & Variants of        &    Accuracy  & NLL\\
            & proposed model     &    \%   & $\leq$\\
%            &                       &          &      \\   
\hline \hline
   MNIST    & M1               &        98.7        & 72.1\\
                  &  M2              &        94.7         &  73.8 \\
                  &  M3 (pretrained)              &         99.3       & 72.1 \\
                  &  M4 (not end-to-end)              &         21.3        &  72.1\\
       %           & RAM                          &      98.7          &  -  \\
%            & w/o pro. o/p              &  1314.62  &  \\ 
\hline \hline
   EMNIST   & M1              &    97.9              & 120.6  \\
   upp.          & M2               &   87.7              &    90.2\\
                   & M3 (pretrained)               &      98.7       &    82.6\\
                   & M4 (not end-to-end)               &     76.7                   &   82.6\\
             %       & RAM                          &      98.7          &  -  \\
%            & w/o pro. o/p              & 1573.6                  & \\ 
\hline\hline
   EMNIST   & M1              &   95.4              & 104.4 \\
     low.         & M2               &   83            & 68.3   \\
                   & M3 (pretrained)               &     96.4         &   58.7\\
                   & M4 (not end-to-end)               &      70.8         &  58.7 \\  \hline
               %     & RAM                          &      98.7          &  -  \\\hline
  \end{tabular}
\end{table}
\fi

\begin{figure*}[htbp]
  \centering
  \hspace{-2mm}
      \subfloat[MNIST]{\includegraphics[width=0.32\linewidth]{MNIST_2.png}\label{r_f1_1}}\hfill
      \subfloat[EMNIST uppercase]{\includegraphics[width=0.32\linewidth]{EMNISTU_2.png}\label{r_f1_2}}\hfill
      \subfloat[EMNIST lowercase]{\includegraphics[width=0.32\linewidth]{EMNISTL_2.png}\label{r_f1_3}}
    \caption{Errorbar plot showing the change in classification accuracy and percentage of image area observed by the participants in \citep{baruah2023attentionmnist}, RAM \citep{mnih2014recurrent} and our model (M1, MVRNN) with number of glimpses or samples.}
  \label{r_f2}
\end{figure*}

\begin{figure*}[tb!]
  \centering
  \hspace{-2mm}
   \subfloat[Participants]{\includegraphics[width=0.30\linewidth]{m_dist_all.png}\label{f2_1}}\hfill
   \subfloat[Our model (M1)]{\includegraphics[width=0.30\linewidth]{m_vrnn_dist_all.png}\label{f2_2}}\hfill
   \subfloat[RAM]{\includegraphics[width=0.30\linewidth]{m_ram_dist_all.png}\label{f2_3}}\\
   \hspace{-2mm}
    \subfloat[Participants]{\includegraphics[width=0.30\linewidth]{m_class_time_0.png}\label{f1_class_participants}}\hfill 
     \subfloat[Our model (M1)]{\includegraphics[width=0.30\linewidth]{m_vrnn_class_time_0.png}\label{f1_class_m1}}\hfill
   \subfloat[RAM]{\includegraphics[width=0.30\linewidth]{m_ram_class_time_0.png}\label{f1_class_ram}}
    \caption{(a)--(c) Distribution of sampling locations (or fixation maps) for each numeral and each sampling instant. (d)--(f) Class distribution for class `9'. Qualitatively, the participants' fixation maps are more similar to our model's than RAM's. The distributions are obtained by averaging the responses over all stimuli presented from each class. Each row corresponds to a class, and each column corresponds to a sampling instant which increases from left to right. Also see Figs. \ref{f2} and \ref{f3} in Appendix \ref{Appendix:Visualization of the fixation maps}, which show similar results for uppercase and lowercase alphabets respectively.} %The distributions are averaged over all stimuli (MVRNN and RAM) and all stimuli and participants (True) shown for MNIST.
    \label{f1}
\end{figure*}

% \begin{figure*}[tb!]
%   \centering
%    \subfloat[Participants]{\includegraphics[width=0.27\linewidth]{eu_dist_all.png}\label{f3_1}}\hfill
%   \subfloat[Our model (MVRNN)]{\includegraphics[width=0.27\linewidth]{eu_vrnn_dist_all.png}\label{f3_2}}\hfill
%    \subfloat[RAM]{\includegraphics[width=0.27\linewidth]{eu_ram_dist_all.png}\label{f3_3}}\\
%     \subfloat[Participants]{\includegraphics[width=0.27\linewidth]{eu_class_time_0.png}\label{f3_2}}\hfill 
%      \subfloat[Our model (MVRNN)]{\includegraphics[width=0.27\linewidth]{eu_vrnn_class_time_0.png}\label{f3_2}}\hfill 
%    \subfloat[RAM]{\includegraphics[width=0.27\linewidth]{eu_ram_class_time_0.png}\label{f3_2}}
%     \caption{(a)--(c) Distribution of sampling locations (or fixation maps) for each numeral and each sampling instant. Qualitatively, the participants' fixation maps are more similar to MVRNN's than RAM's. (d)--(f) Class distribution for class `B'. The distributions are averaged over all stimuli (MVRNN and RAM) and all stimuli and participants (True) shown for EMNIST uppercase. Each row corresponds to a class, each column corresponds to a sampling instant which increases from left to right.}
%     \label{f2}
% \end{figure*}

% \begin{figure*}[tb!]
%   \centering
%    \subfloat[Participants]{\includegraphics[width=0.27\linewidth]{el_dist_all.png}\label{f4_1}}\hfill
%    \subfloat[Our model (MVRNN)]{\includegraphics[width=0.27\linewidth]{el_vrnn_dist_all.png}\label{f4_2}}\hfill
%    \subfloat[RAM]{\includegraphics[width=0.27\linewidth]{el_ram_dist_all.png}\label{f4_3}}\\
%     \subfloat[Participants]{\includegraphics[width=0.27\linewidth]{el_class_time_0.png}\label{f4_2}}\hfill 
%      \subfloat[Our model (MVRNN)]{\includegraphics[width=0.27\linewidth]{el_vrnn_class_time_0.png}\label{f4_2}}\hfill
%    \subfloat[RAM]{\includegraphics[width=0.27\linewidth]{el_ram_class_time_0.png}\label{f4_2}}
%     \caption{(a)--(c) Distribution of sampling locations (or fixation maps) for each numeral and each sampling instant. Qualitatively, the participants' fixation maps are more similar to MVRNN's than RAM's. (d)--(f) Class distribution for class `m'. The distributions are averaged over all stimuli (MVRNN and RAM) and all stimuli and participants (True) shown for EMNIST lowercase. Each row corresponds to a class, each column corresponds to a sampling instant which increases from left to right.}
%     \label{f3}
% \end{figure*}

\subsection{Evaluation results}
\subsubsection{Evaluation for accuracy} 
When the classification and pattern completion modalities are trained end-to-end, as in M1 and M2, the NLL increases (ref. Tables \ref{t1}, \ref{t1_1}). Because the model must learn the generation and classification tasks at the same time, it cannot optimize for generation alone, which lowers the quality of the generated patterns. When the pattern completion and classification modalities are trained separately, the generative model is trained on the generation task only, and the NLL is the lowest (ref. Tables \ref{t1}, \ref{t1_1}). 

The classification accuracy from M1 is higher than M2 in all cases (ref. Tables \ref{t1}, \ref{t1_1}). In M1, the classification modality shares parameters with the generation modality, whereas in M2, the classification modality does not share parameters with the generation modality, though in both cases the generation modality shares parameters with the classification modality. Thus, the generation modality contributes more to the classification accuracy of M1 than of M2. The classification accuracy for M3 is very close to M1 and the classification accuracy for M4 is the lowest (ref. Tables \ref{t1}, \ref{t1_1}). M3 utilizes a CNN-based classifier; it yields higher classification accuracy than M4, which utilizes an RNN-based classifier. 

\subsubsection{Evaluation of fixation maps} 
Results from comparing the fixation maps from RAM and our model (M1) with the participants' data \citep{baruah2023attentionmnist} are shown in Table \ref{t2}. The KL values are high for both models due to the metric's sensitivity to zero values. This implies that several locations are sampled by the participants (as there are multiple participants for each stimulus) but not by RAM or our model. KL is lower for our model (M1) than for RAM in most cases. SIM and CC are either higher for our model than for RAM, or comparable for both models. 
%We obtain similar results for M2 and M3 as well (ref. supplemental material).
 
Clearly, between our model (M1) and RAM, the fixation maps generated by the former are more similar to those generated by the participants. Visualization of the fixation maps in Figs. \ref{r_f1}, \ref{f1}, \ref{f2}, \ref{f3} also shows that the maps obtained from our model are more similar to the participants'. As multiple participants responded to each stimulus, there are many more points for participants than for RAM or our model in the visualizations.

As our model is based on saliency computed using prediction error and the human brain is closely linked with predictive coding \citep{friston2010free}, this can possibly explain greater similarity of the fixation maps for our model. These experiments can be used as a baseline for evaluating locations sampled by an attention model.

The attention mechanism in our model differs from most models (including RAM) from behavioral and algorithmic perspectives. Typically, end-to-end attention-based models learn all parameters, including attention weights, by optimizing an objective function. In most of these models, attention is an internal mechanism that does not have a corresponding external behavior. The attention parameters play a role similar to any other parameter in the model. In our model, attention is a parameterless mechanism that emerges due to prediction error, which drives action/behavior (ref. Eqs. \ref{equ:saliency1}, \ref{equ:sampling location}). This mechanism is interpretable as the model simply attends to its unexpected observations.

%\cite{baruah2022speech} Attention. The attention mechanism in our model differs from most SER models from behavioral and algorithmic perspectives. Typically, end-to-end attention-based models for SER learn all parameters (including attention weights) by optimizing an objective function. In most of these models, attention is an internal mechanism that does not have a corresponding behavior. The attention parameters play a role similar to any other parameter in the model. In our model, attention is a parameterless mechanism that emerges due to prediction error, which drives action/behavior (ref. Eq. 2–4). This mechanism is interpretable as the model simply attends to its unexpected observations.

\subsubsection{Evaluation for efficiency}
In \citep{baruah2023attentionmnist}, a participant can select multiple classes at any instant. For the proposed and RAM models, instead of predicting the highest probable class, we consider the mean probability over all the classes as a threshold and predict the set of classes with probabilities greater than the threshold. We store the sampling or glimpse number after which the participant and the models select only the correct class.

The average number of samplings required by a participant to accurately predict a class is quite low. On average, it takes $4.2$, $4.7$, $4.9$ samples for MNIST, EMNIST uppercase and EMNIST lowercase images respectively \citep{baruah2023attentionmnist}. RAM requires $3.7$, $8.5$, $7.6$ samples to recognize MNIST numerals, uppercase and lowercase EMNIST alphabets respectively. Thus, in comparison to the participants, under the same experimental conditions, RAM is less efficient on the EMNIST alphabets, though not on the MNIST numerals. Our model requires $2.0$, $4.5$, $4.2$ samples to recognize MNIST numerals, uppercase and lowercase EMNIST alphabets respectively. %MVRNN - 2 requires $2.0$, $3.9$, $3.7$ samples to recognize MNIST numerals, uppercase and lowercase EMNIST alphabets, which correspond to $4.6\%$, $ 10.4\%$, $9.4\%$ of image area respectively and MVRNN - 3 requires $2.0$, $3.9$, $3.7$ samples to recognize MNIST numerals, uppercase and lowercase EMNIST alphabets, which correspond to $4.6\%$, $ 10.4\%$, $9.4\%$ of image area respectively. 

In order to yield the same accuracy, our model requires fewer glimpses than RAM and the participants (ref. Fig. \ref{r_f2}). Hence, our model is more efficient. This is also validated by the class distribution plots shown in (d--f) of Figs. \ref{f1}, \ref{f2}, \ref{f3}. We also observe that the classification accuracy over glimpses plots for RAM and our model are mostly flat (ref. Fig. \ref{r_f2}). This is because, since we are using a threshold to select multiple classes from these models as stated above, the correct class is selected in most of the glimpses, which does not change the classification accuracy much over glimpses. The proportion of area observed increases with glimpses for RAM and the participants, but it saturates after a few glimpses for our model, particularly in Fig. \ref{r_f1_1}. As there is no inhibition of return used in our model during sampling, the model is allowed to sample near the already sampled locations, which may have led to this pattern. 
%Detailed results are shown in Fig.~\ref{r_f1}.


\section{Conclusions}
\label{Sec:Conclusions}
We proposed an attention-based agent model for handwritten numeral/alphabet recognition via a sequence of glimpses. The attention is driven by the agent's sensory prediction (or generation) error. At each sampling instant, the agent completes and classifies the partial sequence observed till that instant. End-to-end attention-based models that perform simultaneous generation and classification of handwritten numerals/alphabets are scarce. Our agent model is learned by jointly minimizing the classification and generation errors. Three variants of this model are evaluated on benchmark datasets. Their accuracies are comparable and correlate with the model size. Our experiments reveal that the proposed model is more data-efficient in handwritten numeral/alphabet recognition than human participants as well as a highly-cited attention-based reinforcement model, under the same conditions and stimuli. Qualitatively, the participants' fixation maps are more similar to our model's fixation maps than the reinforcement model's. To the best of our knowledge, this is the first attention-based end-to-end agent of its kind for recognition via generation, with a high degree of accuracy and efficiency.

%\bibliographystyle{IEEEtran}
\bibliography{reference}
%\bibliography{jmlr-sample}




\newpage
%\clearpage
\renewcommand{\thepage}{A\arabic{page}} 
\setcounter{page}{1}
\renewcommand{\thesection}{A\arabic{section}}  
\setcounter{section}{0}
\renewcommand{\thetable}{A\arabic{table}}  
\setcounter{table}{0}
\renewcommand{\thefigure}{A\arabic{figure}}
\setcounter{figure}{0}
\renewcommand{\theequation}{A\arabic{equation}} 
\setcounter{equation}{0}

\appendix
\section{Loss function derivation and pseudo code}
\label{Appendix:Loss function derivation and pseudo code}

\subsection{Model M1}
\label{Appendix:Model M1}

Here we derive the objective function in Eq. \ref{eq:obj1}. The generative and recognition models are factorized as:
\begin{align*}
p_{\theta}(X_{\leq T}, y_{\le T}, z_{\le T}|\textbf{x}_{\leq T}) &= \prod_{t=1}^{T} p_{\theta}({X}_{t}, y_t|z_{\leq t},\textbf{x}_{\leq t}) p_{\theta}(z_{t})\nonumber \\
q_{\phi}(z_{\le T}|\textbf{x}_{\le T}) &= \prod_{t=1}^{T} q_{\phi}(z_{t}|\textbf{x}_{\leq t})
\end{align*}
The variational lower bound (ELBO) on the joint log-likelihood of the generated data, $\log p_{\theta}(X_{\leq T}, y_{\le T}|\textbf{x}_{\leq T})$, is derived as:
\begin{align*}
&\mathbb{E}_{q_{\phi}(z_{\leq T}|\textbf{x}_{\leq T})} \Big[ \log p_{\theta}({X}_{\leq T},y_{\leq T}|\textbf{x}_{\leq T}) \frac{q_{\phi}(z_{\leq T}|\textbf{x}_{\leq T})}{q_{\phi}(z_{\leq T}|\textbf{x}_{\leq T})} \Big]\\
&= \mathbb{E}_{q_{\phi}(z_{\leq T}|\textbf{x}_{\leq T})} \Big[ \log \frac{ p_{\theta}({X}_{\leq T}, y_{\leq T}, z_{\le T}|\textbf{x}_{\leq T})}{p_{\theta}(z_{\le T}|\textbf{x}_{\le T})} \frac{q_{\phi}(z_{\leq T}|\textbf{x}_{\leq T})}{q_{\phi}(z_{\leq T}|\textbf{x}_{\leq T})} \Big]\\
 &= \mathbb{E}_{q_{\phi}(z_{\leq T}|\textbf{x}_{\leq T})} \Big[\sum_{t=1}^{T} \log \frac{ p_{\theta}({X}_{t}, y_t|z_{\leq t},\textbf{x}_{\leq t})p_{\theta}(z_t)}{p_{\theta}(z_{t}|\textbf{x}_{\leq t})} \frac{q_{\phi}(z_{t}|\textbf{x}_{\leq t})}{q_{\phi}(z_{t}|\textbf{x}_{\leq t})} \Big]\\
 &= \mathbb{E}_{q_{\phi}(z_{\leq T}|\textbf{x}_{\leq T})} \Big[ \sum_{t=1}^{T} \Big[ \log p_{\theta}({X}_{t},y_t|z_{\leq t},\textbf{x}_{\leq t}) - \log \frac{q_{\phi}(z_{t}|\textbf{x}_{\leq t})}{p_{\theta}(z_{t})} + \log \frac{ q_{\phi}(z_{t}|\textbf{x}_{\leq t})}{p_{\theta}(z_{t}|\textbf{x}_{\leq t})} \Big] \Big]\\
 &\geq \mathbb{E}_{q_{\phi}(z_{\leq T}|\textbf{x}_{\leq T})} \Big[ \sum_{t=1}^{T} \log p_{\theta}({X}_{t}, y_t|z_{\leq t},\textbf{x}_{\leq t}) \Big] - \sum_{t=1}^{T} D_{KL} \big( q_{\phi}(z_{t}|\textbf{x}_{\leq t}), p_{\theta}(z_{t}) \big)
\end{align*}

We assume that the modalities $X_t$ and $y_t$ are conditionally independent given the common latent variables \citep{wu2018multimodal} and all observations up to the current time. Therefore,
\begin{align}%\label{25} %\label{25} has already been used for the loss/objective function equtaion
\log p_{\theta}({X}_{\leq T}, y_{\leq T}|\textbf{x}_{\leq T}) \geq ~& \mathbb{E}_{q_{\phi}(z_{\leq T}|\textbf{x}_{\leq T})} \Big[ \sum_{t=1}^{T} \lambda_1 \log p_{\theta}(X_{t}|z_{\leq t},\textbf{x}_{\leq t}) + \lambda_2 \log p_{\theta}(y_{t}|z_{\leq t},\textbf{x}_{\leq t}) \Big] \nonumber \\
& - \sum_{t=1}^{T} \beta D_{KL} \big( q_{\phi}(z_{t}|\textbf{x}_{\leq t}), p_{\theta}(z_{t}) \big)
\end{align}
\noindent where $\lambda_1$, $\lambda_2$, $\beta$ are the weights balancing the terms.


\begin{algorithm2e}[t!]
%\footnotesize
\caption{Learning the proposed network}\label{algo1}
Initialize parameters of the generative model $\theta$, recognition model $\phi$, sequence length $T$. %, number of perceptual modalities $m$, and number of proprioceptive modalities $p$.
Initialize optimizer parameters: $\beta_{1} = 0.9$, $\beta_{2} = 0.99$, $\eta = 0.001$, $\epsilon = 10^{-10}$. Initialize $x^{(1)}_1 \leftarrow F(X_{1}, \ell_0)$, $x_1^{(2)} \leftarrow g_3(\ell_0)$, where $\ell_0$ is the initial sampling location (ref. Experimental setup in Section \ref{3}), $g_{3}$ is an identity function (ref. Action selection in Section \ref{2_3}), and the function $F$ extracts a sample $x^{(1)}$ (e.g., $5\times 5$ patch) from the environment $X$ (e.g., $28\times 28$ image) at location $\ell$ (e.g., center of the image).\\
%\SetAlgoLined
\nl \While {true}
{
    \nl \For {$\tau \leftarrow 1 \ to \ T$}
    {   
        %\vspace{2mm}\\
        \textbf{Model M1 }\\
        \nl $\hat{X}_{\tau}, \hat{y}_{\tau} \leftarrow PatComClassM1(x^{(1:2)}_{1:\tau})$\\ %[ref. Algorithm \ref{algo2}]
        %\vspace{2mm}\\
        \textbf{Model M2 }\\
        \nl $\hat{X}_{\tau}, \hat{y}_{\tau} \leftarrow PatComClassM2(x^{(1:2)}_{1:\tau})$\\ %[ref. Algorithm \ref{algo2}]
        \textbf{Model M3 }\\
        \nl $\hat{X}_{\tau} \leftarrow PatComClassM1(x^{(1:2)}_{1:\tau})$\\ %[ref. Algorithm \ref{algo2}]
        \nl $\hat{y}_{\tau} \leftarrow Classifier(\hat{X}_{\tau})$\\ %[ref. Algorithm \ref{algo2}]

        \vspace{2mm}
        \textbf{Saliency Computation }\\
        \nl $S_{\tau} \leftarrow g_{1}(X_{\tau+1}, \hat{X}_{\tau})$ ~~~~~~ [ref. Eq. \ref{equ:saliency1}]\\
        \nl $\ell_{\tau} \leftarrow g_{2}(S_{\tau})$ ~~~~~~ [ref. Eq. \ref{equ:sampling location}]\\
        \nl $x_{\tau+1}^{(2)} \leftarrow g_{3}(\ell_{\tau})$\\ %~~~ [$g_{3}$ is an identity function; ref. Action selection in Section \ref{2_3}]
        \nl $x^{(1)}_{\tau+1} \leftarrow F(X_{\tau+1},\ell_\tau)$\\

        \vspace{2mm}
        \textbf{Learning }\\
        \nl Update $\{ \theta, \phi \}$ or $\{ \theta, \phi, \pi \}$ by maximizing Eq. \ref{eq:obj1}, \ref{eq:obj2} or \ref{eq:obj3}.
    }
}
\end{algorithm2e}




\subsection{Model M2}
\label{Appendix:Model M2}
Here we derive the objective function in Eq.~\ref{eq:obj2}. The generative and recognition models are factorized as:
\begin{align*}
p_{\theta}(X_{\leq T}, y_{\le T}, z_{\le T}|\textbf{x}_{\leq T}) &= \prod_{t=1}^{T} p_{\theta}({X}_{t}, y_t|z_{\leq t},\textbf{x}_{\leq t}) p_{\theta}(z_{t})\nonumber \\
q_{\phi}(z_{\le T}|\textbf{x}_{\le T}, y_{\leq T}) &= \prod_{t=1}^{T} q_{\phi}(z_{t}|\textbf{x}_{\leq t}, y_t)
\end{align*}
\sloppy
The variational lower bound (ELBO) on the log-likelihood of the generated data, $\log p_{\theta}(X_{\leq T},y_{\leq T}|\textbf{x}_{\leq T})$, when the true label is given, is derived as:
\begin{align*}
&\mathbb{E}_{q_{\phi}(z_{\leq T}|\textbf{x}_{\leq T},y_{\leq T})} \Big[ \log p_{\theta}(X_{\leq T},y_{\leq T}|\textbf{x}_{\leq T}) \frac{q_{\phi}(z_{\leq T}|\textbf{x}_{\leq T},y_{\leq T})}{q_{\phi}(z_{\leq T}|\textbf{x}_{\leq T},y_{\leq T})} \Big]\\
&= \mathbb{E}_{q_{\phi}(z_{\leq T}|\textbf{x}_{\leq T},y_{\leq T})} \Big[ \log \frac{ p_{\theta}(X_{\leq T}, z_{\le T}, y_{\leq T}|\textbf{x}_{\leq T})}{ p_{\theta}(z_{\le T}|\textbf{x}_{\le T},y_{\leq T})} \frac{q_{\phi}(z_{\leq T}|\textbf{x}_{\leq T},y_{\leq T})}{q_{\phi}(z_{\leq T}|\textbf{x}_{\leq T},y_{\leq T})} \Big]\\
 &= \mathbb{E}_{q_{\phi}(z_{\leq T}|\textbf{x}_{\leq T},y_{\leq T})} \Big[\sum_{t=1}^{T} \log \frac{ p_{\theta}(X_{t}|z_{\leq t}, \textbf{x}_{\leq t})p_{\theta}(z_t)p_{\theta}(y_t)}{p_{\theta}(z_{t}|\textbf{x}_{\leq t},y_{t})} \frac{q_{\phi}(z_{t}|\textbf{x}_{\leq t},y_{t})}{q_{\phi}(z_{t}|\textbf{x}_{\leq t},y_{t})} \Big]\\
 &= \mathbb{E}_{q_{\phi}(z_{\leq T}|\textbf{x}_{\leq T},y_{\leq T})} \Big[ \sum_{t=1}^{T} \Big[ \log p_{\theta}(X_{t}|z_{\leq t},\textbf{x}_{\leq t}) + \log p_{\theta}(y_{t}) - \log \frac{q_{\phi}(z_{t}|\textbf{x}_{\leq t},y_{t})}{p_{\theta}(z_{t})}\\ 
 &~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + \log \frac{ q_{\phi}(z_{t}|\textbf{x}_{\leq t},y_t)}{p_{\theta}(z_{t}|\textbf{x}_{\leq t},y_{t})} \Big] \Big]\\
 &\geq \mathbb{E}_{q_{\phi}(z_{\leq T}|\textbf{x}_{\leq T},y_{\leq T})} \Big[ \sum_{t=1}^{T} \log p_{\theta}(X_{t}|z_{\leq t},\textbf{x}_{\leq t}) + \log p_{\theta}(y_{t})\Big] - \sum_{t=1}^{T} D_{KL} \big( q_{\phi}(z_{t}|\textbf{x}_{\leq t},y_{t}), p_{\theta}(z_{t}) \big)\\
&= \mathbb{E}_{q_{\phi}(z_{\leq T}|\textbf{x}_{\leq T}, y_{\leq T})} \Big[ \sum_{t=1}^{T} (\log p_{\theta}(X_{t}|z_{\leq t}, \textbf{x}_{\leq t}) + \log p_{\theta}(y_t) )\Big] - \sum_{t=1}^{T} D_{KL} \big( q_{\phi}(z_{t}|\textbf{x}_{\leq t}, y_{t}), p_{\theta}(z_{t}) \big)
\end{align*}
%\noindent where $\lambda_1$, $\lambda_2$, $\beta$ are the weights balancing the terms.
After adding the classification loss, the final objective function can be written as:
\begin{align}%\label{25} %\label{25} has already been used for the loss/objective function equtaion
\nonumber
 &\mathbb{E}_{q_{\phi}(z_{\leq T}|\textbf{x}_{\leq T},y_{\leq T})} \Big[ \sum_{t=1}^{T} \log p_{\theta}(X_{t}|z_{\leq t},\textbf{x}_{\leq t}) + \log p_{\theta}(y_{t})\Big]\\
 & - \sum_{t=1}^{T} D_{KL} \big( q_{\phi}(z_{t}|\textbf{x}_{\leq t},y_{t}), p_{\theta}(z_{t}) \big) + \sum_{t=1}^{T} \alpha \log q_{\phi}(y_t|\textbf{x}_{\leq t})
\end{align}
\noindent where $\alpha$ controls the relative weight between generative and purely discriminative learning.

\begin{algorithm2e}[t!]
%\footnotesize
\caption{$PatComClassM1(x^{(1:2)}_{1:\tau})$}\label{algo2}
%\vspace{2mm}
\textbf{Recognition Model}\\
\nl \For {$i \leftarrow 1 \ to \ 2$}
{
    \nl $h^{enc_i}_{\tau} \leftarrow \mathit{RNN}^{enc}_{\phi}(x^{(i)}_{1:\tau},h^{enc_i}_{\tau-1})$\\ %~~~~~~ [ref. Section 2.1 in \citep{chung2015recurrent}]
    \nl $[\mu^{(i)}_{\tau}  \ ; \Sigma^{(i)}_{\tau}] \leftarrow \varphi^{enc}(h^{enc_i}_{\tau})$\\ %~~~~~~ [ref. Section 3 in \citep{chung2015recurrent}]
}

\vspace{2mm}
\textbf{Product of Experts}\\
\nl $z_{\tau} \sim \mathcal{N}(\mu_{\tau},\Sigma_{\tau})$, where $\Sigma_{\tau} \leftarrow \Big(\displaystyle\sum_{i=1}^{2}{\Sigma^{(i)}_{\tau}}^{-2}\Big)^{-1}, ~ \mu_{\tau} \leftarrow \Big(\displaystyle\sum_{i=1}^{2}\mu^{(i)}_{\tau}{\Sigma^{(i)}_{\tau}}^{-2}\Big) \Sigma_{\tau}$\\

\vspace{2mm}
\textbf{Generative Model}\\

Pattern completion\\
\nl $h^{dec_1}_{\tau} \leftarrow \mathit{RNN}^{dec}_{\theta}(z_{\tau},h^{dec_1}_{\tau-1})$\\
%~~~~If $X_{\tau+1}^{(i)}$ is Bernoulli (see \citep{gregor2015draw} for details):
\nl $\hat{X}_{\tau} \leftarrow f_{\sigma}(h^{dec_1}_{\tau},\hat{X}_{\tau-1})$\\

Classification Model\\ %(M1)
\nl $h^{dec_2}_{\tau} \leftarrow \mathit{RNN}^{dec}_{\theta}(z_{\tau},h^{dec_2}_{\tau-1})$\\
%~~~~If $X_{\tau+1}^{(i)}$ is Bernoulli (see \citep{gregor2015draw} for details):
\nl $\hat{y}_{\tau} \leftarrow softmax(h^{dec_2}_{\tau})$\\

%For proprioceptive modality:\\
%\STATE $h^{dec(2)}_{\tau} \leftarrow \mathit{RNN}^{dec}_{\theta}(z_{\tau},h^{dec(2)}_{\tau-1})$
%%~~~~If $X_{\tau+1}^{(i)}$ is Gaussian: %(as in interaction generation):
%\STATE $[\mu^{(2)}_{x^{(2)},\tau}  \ ; \sigma^{(2)}_{x^{(2)},\tau}] \leftarrow \varphi^{dec}(h^{dec(2)}_{\tau})$
%%\State $X_{t+1}^{(i)}|z_{t} \sim \mathcal{N}(\mu^{(i)}_{x^{(i)},t},\sigma^{(i)}_{x^{(i)},t})$
%\STATE $\hat{X}_{\tau}^{(2)} \leftarrow \mu^{(2)}_{x^{(2)},\tau}$
%\ENDFOR
\end{algorithm2e}


\subsection{Model M3}
\label{Appendix:Model M3}
Here we derive the objective function in Eq.~\ref{eq:obj3}. The generative and recognition models are factorized as:
\begin{align*}
p_{\theta}(X_{\leq T}, z_{\le T}|\textbf{x}_{\leq T}) &= \prod_{t=1}^{T} p_{\theta}({X}_{t}|z_{\leq t},\textbf{x}_{\leq t}) p_{\theta}(z_{t})\nonumber \\
q_{\phi}(z_{\le T}|\textbf{x}_{\le T}) &= \prod_{t=1}^{T} q_{\phi}(z_{t}|\textbf{x}_{\leq t})
\end{align*}

The variational lower bound (ELBO) on the log-likelihood of the generated data, $\log p_{\theta}(X_{\leq T}|\textbf{x}_{\leq T})$, is derived as:
\begin{align*}
&\mathbb{E}_{q_{\phi}(z_{\leq T}|\textbf{x}_{\leq T})} \Big[ \log p_{\theta}({X}_{\leq T}|\textbf{x}_{\leq T}) \frac{q_{\phi}(z_{\leq T}|\textbf{x}_{\leq T})}{q_{\phi}(z_{\leq T}|\textbf{x}_{\leq T})} \Big]\\
&= \mathbb{E}_{q_{\phi}(z_{\leq T}|\textbf{x}_{\leq T})} \Big[ \log \frac{ p_{\theta}({X}_{\leq T}, z_{\le T}|\textbf{x}_{\leq T})}{ p_{\theta}(z_{\le T}|\textbf{x}_{\le T})} \frac{q_{\phi}(z_{\leq T}|\textbf{x}_{\leq T})}{q_{\phi}(z_{\leq T}|\textbf{x}_{\leq T})} \Big]\\
&= \mathbb{E}_{q_{\phi}(z_{\leq T}|\textbf{x}_{\leq T})} \Big[\sum_{t=1}^{T} \log \frac{ p_{\theta}({X}_{t}|z_{\leq t},\textbf{x}_{\leq t})p_{\theta}(z_t)}{p_{\theta}(z_{t}|\textbf{x}_{\leq t})}\frac{q_{\phi}(z_{t}|\textbf{x}_{\leq t})}{q_{\phi}(z_{t}|\textbf{x}_{\leq t})} \Big]\\
&= \mathbb{E}_{q_{\phi}(z_{\leq T}|\textbf{x}_{\leq T})} \Big[ \sum_{t=1}^{T} \Big[ \log p_{\theta}({X}_{t}|z_{\leq t},\textbf{x}_{\leq t}) - \log \frac{q_{\phi}(z_{t}|\textbf{x}_{\leq t})}{p_{\theta}(z_{t})} + \log \frac{ q_{\phi}(z_{t}|\textbf{x}_{\leq t})}{p_{\theta}(z_{t}|\textbf{x}_{\leq t})} \Big] \Big]\\
&\geq \mathbb{E}_{q_{\phi}(z_{\leq T}|\textbf{x}_{\leq T})} \Big[ \sum_{t=1}^{T} \log p_{\theta}({X}_{t}|z_{\leq t},\textbf{x}_{\leq t}) \Big] - \sum_{t=1}^{T} D_{KL} \big( q_{\phi}(z_{t}|\textbf{x}_{\leq t}), p_{\theta}(z_{t}) \big)
\end{align*}
After adding the classification loss, the final objective function can be written as:
\begin{align}%\label{25} %\label{25} has already been used for the loss/objective function equtaion
\mathbb{E}_{q_{\phi}(z_{\leq T}|\textbf{x}_{\leq T})} \Big[ \sum_{t=1}^{T} \log p_{\theta}({X}_{t}|z_{\leq t},\textbf{x}_{\leq t}) \Big] - \sum_{t=1}^{T} D_{KL} \big( q_{\phi}(z_{t}|\textbf{x}_{\leq t}), p_{\theta}(z_{t}) \big) + \log q_{\pi}(y|X)
\end{align}
\noindent where $q_{\pi}(y|X)$ is the classification model whose input is the entire image (completed pattern) and not a sequence of observations. So the subscript $t$ is dropped.

\begin{algorithm2e}[t!]
%\footnotesize
\caption{$PatComClassM2(x^{(1:2)}_{1:\tau},y_{1:\tau})$}\label{algo3}
%\vspace{2mm}
\textbf{Classification Model}\\
%\IF {labels are not present}
\nl $h_\tau^{cls} \leftarrow \mathit{RNN}_\alpha^{cls}(h_{\tau-1}^{cls}, \textbf{x}_{1:\tau})$\\
\nl $\hat{y}_\tau \leftarrow softmax(h_\tau^{cls})$\\
%\ENDIF

\vspace{2mm}
\textbf{Recognition Model}\\
\nl \For {$i \leftarrow 1 \ to \ 2$}
{
    \nl $h^{enc_i}_{\tau} \leftarrow \mathit{RNN}^{enc}_{\phi}(x^{(i)}_{1:\tau},h^{enc_i}_{\tau-1})$\\ %~~~~~~ [ref. Section 2.1 in 
    \nl $[\mu^{(i)}_{\tau}  \ ; \Sigma^{(i)}_{\tau}] \leftarrow \varphi^{enc}(h^{enc_i}_{\tau})$\\ %~~~~~~ [ref. Section 3 in \citep{chung2015recurrent}]
}
\nl \uIf {labels are present}
{ 
    \nl $h^{enc_3}_{\tau} \leftarrow \mathit{tanh}(y_{\tau})$\\ %~~~~~~ [ref. Section 2.1 in 
}
\uElse
{    
    \nl $h^{enc_3}_{\tau} \leftarrow \mathit{tanh}(\hat{y}_{\tau})$\\ %~~~~~~ [ref. Section 2.1 in 
}
\nl $[\mu^{(3)}_{\tau}  \ ; \Sigma^{(3)}_{\tau}] \leftarrow \varphi^{enc}(h^{enc_3}_{\tau})$\\ %~~~~~~ [ref. Section 3 in \citep{chung2015recurrent}]

\vspace{2mm}
\textbf{Product of Experts}\\
\nl $z_{\tau} \sim \mathcal{N}(\mu_{\tau},\Sigma_{\tau})$, where $\Sigma_{\tau} \leftarrow \Big(\displaystyle\sum_{i=1}^{3}{\Sigma^{(i)}_{\tau}}^{-2}\Big)^{-1}, ~ \mu_{\tau} \leftarrow \Big(\displaystyle\sum_{i=1}^{3}\mu^{(i)}_{\tau}{\Sigma^{(i)}_{\tau}}^{-2}\Big) \Sigma_{\tau}$\\

\vspace{2mm}
\textbf{Generative Model}\\
Pattern Completion\\
%\IF {labels are present}
\nl $h^{dec_1}_{\tau} \leftarrow \mathit{RNN}^{dec}_{\theta}(z_{\tau},h^{dec_1}_{\tau-1})$\\
%\ELSE
%\STATE $h^{dec(1)}_{\tau} \leftarrow \mathit{RNN}^{dec}_{\theta}([z_{\tau}, \hat{y}_\tau],h^{dec(1)}_{\tau-1})$
%\ENDIF
%~~~~If $X_{\tau+1}^{(i)}$ is Bernoulli (see \citep{gregor2015draw} for details):
\nl $\hat{X}_{\tau} \leftarrow f_{\sigma}(h^{dec_1}_{\tau},\hat{X}_{\tau-1})$\\

%For proprioceptive modality:\\
%\IF {labels are present}
%\STATE $h^{dec(2)}_{\tau} \leftarrow \mathit{RNN}^{dec}_{\theta}([z_{\tau}, y_\tau],h^{dec(2)}_{\tau-1})$
%\ELSE
%\STATE $h^{dec(2)}_{\tau} \leftarrow \mathit{RNN}^{dec}_{\theta}([z_{\tau}, \hat{y}_\tau],h^{dec(2)}_{\tau-1})$
%\ENDIF
%%~~~~If $X_{\tau+1}^{(i)}$ is Gaussian: %(as in interaction generation):
%\STATE $[\mu^{(2)}_{x^{(2)},\tau}  \ ; \sigma^{(2)}_{x^{(2)},\tau}] \leftarrow \varphi^{dec}(h^{dec(2)}_{\tau})$
%%\State $X_{t+1}^{(i)}|z_{t} \sim \mathcal{N}(\mu^{(i)}_{x^{(i)},t},\sigma^{(i)}_{x^{(i)},t})$
%\STATE $\hat{X}_{\tau}^{(2)} \leftarrow \mu^{(2)}_{x^{(2)},\tau}$
%\ENDFOR
\end{algorithm2e}



\section{Visualization of fixation maps}
\label{Appendix:Visualization of the fixation maps}

Visualizations of the fixation maps obtained from our model (M1), RAM \citep{mnih2014recurrent}, and the participants in \citet{baruah2023attentionmnist}, on uppercase and lowercase alphabets, are shown in Figs.~\ref{f2} and~\ref{f3}, respectively.

\begin{figure*}[t!]
  \centering
   \subfloat[Participants]{\includegraphics[width=0.27\linewidth]{eu_dist_all.png}\label{f3_1}}\hfill
  \subfloat[Our model (M1)]{\includegraphics[width=0.27\linewidth]{eu_vrnn_dist_all.png}\label{f3_2}}\hfill
   \subfloat[RAM]{\includegraphics[width=0.27\linewidth]{eu_ram_dist_all.png}\label{f3_3}}\\
    \subfloat[Participants]{\includegraphics[width=0.27\linewidth]{eu_class_time_0.png}\label{f3_4}}\hfill 
     \subfloat[Our model (M1)]{\includegraphics[width=0.27\linewidth]{eu_vrnn_class_time_0.png}\label{f3_5}}\hfill 
   \subfloat[RAM]{\includegraphics[width=0.27\linewidth]{eu_ram_class_time_0.png}\label{f3_6}}
    \caption{(a)--(c) Distribution of sampling locations (or fixation maps) for each uppercase alphabet and each sampling instant. Qualitatively, the participants' fixation maps are more similar to our model's than RAM's. (d)--(f) Class distribution for class `B'. The distributions are obtained by averaging the responses over all stimuli presented from each class. Each row corresponds to a class, and each column corresponds to a sampling instant which increases from left to right.} %The distributions are averaged over all stimuli (MVRNN and RAM) and all stimuli and participants (True) shown for EMNIST uppercase.
    \label{f2}
\end{figure*}

\begin{figure*}[t!]
  \centering
   \subfloat[Participants]{\includegraphics[width=0.27\linewidth]{el_dist_all.png}\label{f4_1}}\hfill
   \subfloat[Our model (M1)]{\includegraphics[width=0.27\linewidth]{el_vrnn_dist_all.png}\label{f4_2}}\hfill
   \subfloat[RAM]{\includegraphics[width=0.27\linewidth]{el_ram_dist_all.png}\label{f4_3}}\\
    \subfloat[Participants]{\includegraphics[width=0.27\linewidth]{el_class_time_0.png}\label{f4_4}}\hfill 
     \subfloat[Our model (M1)]{\includegraphics[width=0.27\linewidth]{el_vrnn_class_time_0.png}\label{f4_5}}\hfill
   \subfloat[RAM]{\includegraphics[width=0.27\linewidth]{el_ram_class_time_0.png}\label{f4_6}}
    \caption{(a)--(c) Distribution of sampling locations (or fixation maps) for each lowercase alphabet and each sampling instant. Qualitatively, the participants' fixation maps are more similar to our model's than RAM's. (d)--(f) Class distribution for class `m'. The distributions are obtained by averaging the responses over all stimuli presented from each class. Each row corresponds to a class, and each column corresponds to a sampling instant which increases from left to right.} %The distributions are averaged over all stimuli (MVRNN and RAM) and all stimuli and participants (True) shown for EMNIST lowercase.
    \label{f3}
\end{figure*}

\end{document}