\documentclass{article} % For LaTeX2e
\usepackage{iclr2024_conference,times}

% Optional math commands from https://github.com/goodfeli/dlbook_notation.
\input{math_commands.tex}

\usepackage{hyperref}
\usepackage{url}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{graphicx}
\usepackage{xcolor}

\title{Learning Successor Representations with Distributed Hebbian Temporal Memory}
\author{
}

% highlights on
\newcommand{\add}[2][]{\textbf{#1}\textcolor{blue}{#2}}
% highlights off
% \newcommand{\add}[2][]{#2}

\begin{document}

\maketitle

\begin{abstract}
This paper presents a novel approach to address the challenge of online hidden representation learning for decision-making under uncertainty in non-stationary, partially observable environments. The proposed algorithm, Distributed Hebbian Temporal Memory (DHTM), is based on factor graph formalism and a multi-compartment neuron model. DHTM aims to capture sequential data relationships and make cumulative predictions about future observations, forming Successor Representation (SR). Inspired by neurophysiological models of the neocortex, the algorithm utilizes distributed representations, sparse transition matrices, and local Hebbian-like learning rules to overcome the instability and slow learning process of traditional temporal memory algorithms like RNN and HMM. Experimental results demonstrate that DHTM outperforms classical LSTM and performs comparably to more advanced RNN-like algorithms, speeding up Temporal Difference learning for SR in changing environments. Additionally, we compare the SRs produced by DHTM to another biologically inspired HMM-like algorithm, CSCG. Our findings suggest that DHTM is a promising approach for addressing the challenges of online hidden representation learning in dynamic environments.
\end{abstract}

\section{Introduction}
Modelling sequential data is one of the most important tasks in Artificial Intelligence as it has many applications, including decision-making and world models, natural language processing, conversational AI, time-series analysis, and video and music generation \citep{min2021recent,eraslan2019deep,dwivedi2023so,ji2020comprehensive,moerland2023model}. One of the classical approaches to modelling sequential data is forming a representation that stores and condenses the most relevant information about a sequence, and finding a general transformation rule of this information through the dimension of time \citep{lipton2015critical,harshvardhan2020comprehensive,mathys2011bayesian}. We refer to the class of algorithms that use this approach as Temporal Memory (TM) algorithms, as they essentially model the cognitive ability of complex living organisms to remember the experience and make future predictions based on this memory \citep{lstm,friston2016active,friston2018deep,parr2017working}.

This paper addresses the problem of hidden representation learning for decision-making under uncertainty, which can be formalized as agent Reinforcement Learning (RL) for a Partially Observable Markov Decision Process (POMDP) \citep{poupart2005exploiting}. Inferring the hidden state in a partially observable environment is, in effect, a sequence modelling problem as it requires processing a sequence of observations to get enough information about hidden states. One of the most efficient representations of the hidden states for discrete POMDP is the Successor Representation (SR) that disentangles hidden states and goals given by the reward function \citep{dayan1993improving}. \add{An extension of the SR into continuous POMDP is the Successor Features framework, which employs the same idea of value function decomposition, but, instead, for features of a hidden state \citep{barreto2017successor}. Temporal Memory algorithms can be leveraged to make cumulative predictions about future states and their features to form SR or SF.}

The most prominent TM algorithms, like a Recurrent Neural Network (RNN) or a Hidden Markov Model (HMM), use backpropagation to capture data relationships, which is known for its instability due to recurrent non-linear derivatives. They also require having complete sequences of data at hand during the training. Although the gradient vanishing problem can be partially circumvented in a way Receptance Weighted Key Value (RWKV) \citep{peng2023rwkv} or Linear Recurrent Unit (LRU) \citep{orvieto2023resurrecting} models do, the problem of online learning is still a viable topic. In contrast to HMM, RNN models and their descendants also lack a probabilistic theory foundation, which is beneficial for modeling sequences captured from stochastic environments \citep{salaun2019comparing,zhao2020rnn}. There is little research on TM models that can be used in fully online adaptable systems interacting with partially observable stochastic environments with access only to one sequence data point at a time, a prevalent case in Reinforcement Learning \citep{jahromi2022online}. 

We propose a Distributed Hebbian Temporal Memory (DHTM) algorithm based on the factor graph formalism and multi-compartment neuron model. The resulting graphical structure of our model is similar to one of the Factorial-HMM \citep{ghahramani1995factorial}, but with a factor graph forming online during training. We also show that depending on the graphical structure, our TM can be viewed as an HMM version of either RNN or LRU regarding information propagation in time. An important feature of our model is that transition matrices for each factor are stored as different components (segments) of artificial neurons, which makes computations very efficient in the case of sparse transition matrices. Our TM forms sequence representations fully online and employs only local Hebbian-like learning rules \citep{hebb2005organization,churchland1992computational,Lillicrap_Santoro_Marris_Akerman_Hinton_2020}, circumventing gradient drawbacks and making the learning process much faster than gradient methods.

Some key ideas for our TM algorithm are inspired by neurophysiological models of the neocortex neural circuits and pyramidal neurons \citep{george2009towards,Hawkins_Ahmad_2016,deep_predictive_learning_OReilly_2021}. For example, emission matrices for random variables are fixed to resemble the columnar structure of the neocortex layers, which significantly lessens the number of trainable parameters, speeding up learning and leading to sparse transition matrices. Another example is using multi-compartmental neurons with active dendritic segments as independent detectors of neuron pattern activity \citep{london2005dendritic}. Each dendritic segment can be viewed as a row of an HMM state transition matrix or, more generally, a value of a discrete factor function. Thus, we don't explicitly store large transition matrices, only their non-zero parts.    

The DHTM model naturally fits Successor Features in the Reinforcement Learning setup to speed up TD learning. The proposed TM is tested as a world model \citep{ha2018world,hafner2023mastering} for an RL agent architecture, making decisions in a simple Pinball-like environment and in a more challenging AnimalAI testbed \citep{pmlr-v123-crosby20a}. Our algorithm outperforms a classic RNN algorithm LSTM and a more advanced RNN-like transformer algorithm RWKV in the online Successor Feature formation task due to the combination of fast Hebbian-like learning and sparse hidden state coding. Another advantage of our algorithm is that it lends itself to implementation on neuromorphic processors, as it uses only local learning rules.

Our contribution in this work is the following:
\begin{itemize}
    \item We propose a distributed memory model DHTM based on the factor graph formalism and multicompartment neural model.
    \item Our model stores sparse factor functions in neural segments, which significantly lessens the number of trainable parameters and speeds up learning.
    \item The DHTM learns fully online employing only local Hebbian-like rules.
    \item The DHTM model fits Successor Features in the RL setup to speed up TD learning.
    \item Tested as a world model for an RL agent architecture in a Pinball environment, DHTM outperforms LSTM and RWKV in online Successor Features formation task.
\end{itemize}

\section{Background}
\label{sec:background}
This section provides basic information about some concepts necessary to follow the paper.

\subsection{Reinforcement Learning}
% POMDP
In this paper, we consider decision-making in a partially observable environment, which is usually formalized as a Partially Observable Markov Decision Process \citep{poupart2005exploiting}. A POMDP is defined as a tuple $\mathcal{M}=(S, A, P, R, O, D, \gamma)$, where $S$---state space, $A$---action space, $P(s, a, s')=\Pr(s'\ |\ s, a)$---transition function, $R(s)$---reward function, $O$---observation space, $D(a, s', o)=\Pr(o\ |\ a, s')$---sensor model and $\gamma \in [0, 1)$---discount factor, given a transition $s, a \to s'$, where $s\in S$, $a\in A$, $o\in O$. If $S, A, O$ are finite, $P, D$ can be viewed as real valued matrices, otherwise, they are conditional density functions. Here we consider deterministic rewards, which depend only on the current state, i.e. $R(s): S \to \mathbb{R}$.

The task of RL is to find a policy $\pi(a\ |\ s): S\times A \to [0, 1]$, which maximizes expected return $G = \mathbb{E}[\sum^T_{t=0} \gamma^t R_{t}]$, where $T$ is an episode length. For value based methods, it is convenient to define optimal policy via Q-function: $Q^\pi(s_t, a_t)=\mathbb{E} [\sum_{l\geq t} \gamma^{l-t} R(s_{l+1})\ |\ s_t, a_t, \pi]$. For an optimal value function $Q^*$ an optimal policy can be defined as \add{ $\pi(a\ |\ s) = \underset{a}{\mathrm{argmax}} \ Q^*(s, a)$}.

\subsection{Hidden Markov Model}
% HMM, FactorialHMM, sum-product, Baum-Whelch
A partially observable Markov process can be approximated by a Hidden Markov Model (HMM) with hidden state space $H$ and observation space $O$. $O$ is the same as in $\mathcal{M}$, but $H$ generally is not equal to $S$. Variables $H_t$ represent an unobservable (hidden) approximated state of the environment which evolves over time, and observable variables $O_t$ represent observations that depend on the same time step state $H_t$, and $h_t, o_t$ are corresponding values of these random variables. For the sake of simplicity, we suppose that actions are fully observable and information about them is included in $H_t$ variables. For the process of length $T$ with state values $h_{1:T} = (h_1, \ldots, h_T)$ and observations $o_{1:T} = (o_1, \ldots, o_T)$, the Markov property yields the following factorization of the generative model:
\begin{equation}
	p(o_{1:T}, h_{1:T}) = p(h_1)
	\prod_{t=2}^T p(h_t | h_{t-1})
	\prod_{t=1}^T p(o_t | h_t).
\end{equation}

In case of discrete hidden state, a time-independent stochastic transition matrix can be learned with Baum–Welch algorithm \citep{baum_welch_1970}, a variant of Expectation Maximization algorithm. To compute the statistics for the expectation step, it employs the forward-backward algorithm, which is a special case of sum-product algorithm \citep{sum_product}.

\subsection{Successor Representation}
\label{sec:sr}
Successor Representations are such representations of hidden states from which we can linearly infer the state value given the reward function \citep{dayan1993improving}. Here, we assume observation and state spaces are discrete.
\add{
\begin{multline}
V(h_t=i) 
    = \mathrm{E} [\sum^{\infty}_{l=0} \gamma^l R_{t+l+1} \ |\ h_t=i] 
        = \sum^{\infty}_{l=0} \gamma^l \mathrm{E} [R_{t+l+1} \ |\ h_t=i] =
        \nonumber \\
    = \sum^{\infty}_{l=0} \gamma^l \sum_j p(h_{t+l+1}=j\ | \ h_t=i) R_j 
        = \sum_{j} \sum^{\infty}_{l=0} \gamma^l p(h_{t+l+1}=j\ | \ h_t=i) R_j
        = \sum_{j} M_{ij} R_j,
        \label{eq:state_value}
\end{multline}
\noindent
where $\gamma$ is a discount factor, vector $\mathrm{SR}(h=i) = \{M_{ij}\}_{j}$ is a Successor Representation of a state $i$, and $M_{ij} = \sum^{\infty}_{l=0} \gamma^l p(h_{t+l+1}=j\ | \ h_t=i)$. $R_j$ is a reward for observing the state $j$.}
That is, SR can be computed by a TM that is able to predict future states. TM algorithms effectively predict observations only for a finite time horizon $T$. Therefore, in order to learn SR, a technique similar to TD learning in standard RL may be employed:
\begin{align}
    \delta_{ij} &= \sum^{T}_{l=0} 
        \gamma^l p(h_{t+l+1}=j\ | \ h_t=i)
         + \gamma^{T+1} \sum_{k} M_{kj} p(h_{t+T+1}=k\ |\ h_t=i)
         - M_{ij}, \\
    M_{ij} &\leftarrow M_{ij} + \alpha \delta_{ij} \label{eq:td_update},
\end{align}
\noindent
where $\alpha\in (0, 1)$ is a learning rate, $\delta_{ij}$---TD error for SR.

In partially observable environments, however, exact state values are not known, therefore we operate with state distributions or so-called belief states \citep{poupart2005exploiting}, which are inferred from observations. In that case, state value and SR are functions of hidden state variable distribution (see details in Appendix~\ref{appx:decompose}).

\subsection{Sparse Distributed Representations}
\label{sec:sdr}
% some words on how we use SP to encode observations
In our work, we design our model to operate with sparse distributed representations (SDRs) to reflect the spatiotemporal property of cortical network activity \citep{SDR_perin_2011}. In the discrete time case, SDR is a sparse binary vector in a high-dimensional space. To encode observed dense binary patterns to SDRs, we use a biologically plausible k-WTA (k-winners take all) neural network algorithm called spatial pooler with a Hebbian-like unsupervised learning method (see details in Appendix~\ref{appx:enc}).

% \section{Methods}
% This section describes our TM model and its usage for SF formation. We also outline the agent architecture that we use in our RL tasks. We use the same agent architecture and encoder-decoder pipeline for every TM we compare.

\section{Distributed Hebbian Temporal Memory}
\label{sec:dhtm}

\subsection{Factor Graph Model}
\label{sec:factor_graph}
Distributed Hebbian Temporal Memory is based on the sum-product belief propagation algorithm in a factor graph (see Figure~\ref{fig:whole_graph}). Analogously to Factorial-HMM \citep{ghahramani_factorial_1997}, we divide the hidden space $H$ into subspaces $H^k$. There are four sets of random variables (RV) in the model: $H^i_{t-1}$---latent variables representing hidden states from the previous time step (context), $H^k_t$---latent variables for the current time step, $\Phi^k_t$---feature variables, and $O^{lm}_t$---observable variables. Except for $O^{lm}_t$, all random variables have a categorical distribution. In contrast, $O^{lm}_t$ are Bernoulli variables because they represent pixels from a binary input image observation. RV state values are denoted as corresponding lowercase letters: $h^i_{t-1}$, $h^k_{t}$, $\varphi^k_t$, $o^{lm}_t$.

Each variable $\Phi^k_t$ is considered independent and has a separate graphical model for increased computational efficiency. However, hidden variables of the same time step are statistically interdependent in practice. We introduce their interdependence through a segment computation trick that goes beyond the standard sum-product algorithm (see Eq.~\ref{eq:seg_like}). 

The model also has three types of factors: $M^i_{t-1}$---messages from previous time steps, $F^k_c$---context factor (generalized transition matrix), $F^k_e$---emission factor. We assume that messages $M^i_{t-1}$ include posterior information from the time step $t-1$, therefore we don't depict observable variables for previous time steps in Figure~\ref{fig:whole_graph}.

\begin{figure}[tb]
    \centering
    \includegraphics[width=0.65\linewidth]{whole_graph.png}
    \caption{Partial factor graph for the DHTM. The input to the model is a sequence of binary images, each pixel is modelled as Bernoulli random variable $O^{lm}_t$, where $l$ and $m$ denote corresponding rows and cols of the image. The encoder block forms image categorical features $\Phi^k_t$ in an unsupervised manner. Each feature $\Phi$ has its own explaining hidden variable, which may depend on hidden variables of the other features and on itself from the previous time step. $F^k_c$ and $F^k_e$ are context and emission factors for the corresponding variables. Unary factors $M^i_{t-1}$ called messages represent accumulated information about previous time steps.  
    }
    \label{fig:whole_graph}
\end{figure}

Further, we discuss only the upper block of the graph, which is DHTM itself. The lower block---an encoder---is described in the Appendix~\ref{appx:enc}. The only requirement for the encoder is that its output should be represented as states of categorical variables (features) for the current observation.   

\subsection{Neural Implementation}
The main routine of the DHTM is to estimate distributions of the current hidden state variables given by \eqref{eq:graph_model}, the computational flow of which is schematically depicted in Figure~\ref{fig:neural_imp}:
\begin{equation}
    % \begin{split}
        p( h_{t}^{k}) \propto
        \sum _{\{h^i_{t-1}:i\in \omega_k\}}
        \prod _{i\in \omega_k} 
        M_{t-1}^{i}( h_{t-1}^{i}) 
        F_{c}^{k}( h_{t}^{k}, \{h^i_{t-1}:i\in \omega_k\}),
    % \end{split}
\label{eq:graph_model}
\end{equation}
\noindent
where $\omega_k=\{i_1, \dots, i_n\}$---set of previous time step RV indexes included in $F_c^k$ factor, $(n+1)$---factor size.

\begin{figure}[tb]
    \centering
    \includegraphics[width=0.8\linewidth]{neural_imp.png}
    \caption{
        Neuronal implementation of the DHTM. Random variables are represented by cell clusters (white circles), where each cell corresponds to a state and its spike frequency---to the probability of the state $p(h^k_t)$. Cell's dendritic segments $seg(k)$ correspond to context factor values $f_l$ for a particular combination of states (active presynaptic cells) $rec(l)$. Segments' excitations $E_l$ are combined to determine cell's spike frequency $p(h^k_t)$. Segment's synaptic weights reflect specificity of $rec(l)$ combination for the segment. Emission factors $F^k_e$ are fixed and represented by minicolumns inside a variable.      
    }
    \label{fig:neural_imp}
\end{figure}

For computational purposes, we translate the problem to the neural network architecture with Hebbian-like learning (for biological interpretation of the model, see Appendix~\ref{appx:bio}). As can be seen from Figure~\ref{fig:neural_imp}, every RV can be viewed as a set of spiking neurons representing the RV's states, that is, $p(h^k_{t})=p(c^j_t=1)$, where $j$---index of a neuron corresponding to the state $h^k_t$. Cell activity is binary $c^j_t \in \{0, 1\}$ (spike/no-spike), and the probability might be interpreted as a spike rate. Factors $F^k_c$ and $M^i_{t-1}$ can be represented as vectors, where elements are factor values for all possible combinations of RV states included in the factor. Let's denote elements of the vectors as $f_l$ and $m_u$ correspondingly, where $l$ corresponds to a particular combination of $k, h^k_t, h^{i_1}_{t-1}, \dots, h^{i_{n_l}}_{t-1}$ state values and $u$ indexes all neurons representing states of previous time step RVs. 

Drawing inspiration from biological neural networks with active dendrites, we group a neuron's connections (dendrites) into segments. A segment acts as an independent computational unit that detects a particular input pattern (a context state) defined by its own receptive field. In our model, a segment links together factor value $f_l$, the computational graph shown in Figure~\ref{fig:neural_imp}, and the excitation $E_l$ induced by the segment $l$ to the cell it is attached to. The segment is active, i.e., $s_l=1$ if all its presynaptic cells are active; otherwise, $s_l=0$. Computationally, a segment transmits its factor value $f_l$ to a cell it is attached to if the context matches the corresponding state combination.

We can now rewrite \eqref{eq:graph_model} as the following:
\begin{equation}
    p( h_{t}^{k}) \propto \sum _{l\in \mathrm{seg}(j)} L_l f^k_l,
    \label{eq:neuron_model}
\end{equation}
\noindent
where $L_l = \prod _{u\in \mathrm{rec}(l)} m_u$ is segment's likelihood as long as messages are normalized, $\mathrm{seg}(j)$---indexes of segments that are attached to cell $j$, $\mathrm{rec}(l)$---indexes of presynaptic cells that constitute receptive field of a segment with index $l$.

Initially, all factor entries are zero, meaning cells have no segments. As learning proceeds, new non-zero connections grouped into segments are grown. In \eqref{eq:neuron_model} we benefit from having sparse factor value vectors because its complexity depends linearly on the amount of non-zero components. And that's usually the case in our model due to one-step Monte-Carlo learning and specific form of emission factors $F^k_e$:
\begin{equation}
    F^k_e(h^k_t, \varphi^k_t) = \mathbb{I}[h^k_t\in \mathrm{col}(\varphi^k_t)],
\end{equation}
\noindent
where $\mathbb{I}$---indicator function, $\mathrm{col}(\varphi^k_t)$ is a set of hidden states connected to the feature state $\varphi^k_t$ that forms a column. The form of emission factor is inspired by presumably columnar structure of the neocortex and was shown to induce sparse transition matrix in HMM \citep{CSCG_George_2021}.     

Segment likelihood $L_l$, resulting from the sum-product algorithm, is calculated as if presynaptic cells are independent. However, it's not usually the case for sparse factors. To take into account, approximately, their interdependence, we substitute the following equation for segment log-likelihood: 
\begin{equation}
    \log L_l = 
    \log \sum_{u \in \mathrm{rec}(l)} 
        w_{ul} m_u 
    + \sum_{u \in \mathrm{rec}(l)} 
        (1-w_{ul})\log m_u - \log n_l, 
\label{eq:seg_like}
\end{equation}
\noindent
where $w_{ul}$---synapse efficiency or neuron specificity for segment, such that $w_{ul} = p(s_l=1 | c^u_{t-1}=1)$, and $n_l$---number of cells in segment's receptive field.

The idea that underlies the formula is to approximate between two extreme cases:
\begin{itemize}
    \item $p(s_l=1 | c^u_{t-1}=1) \to 1$ for all $u$, which means that all cells in the receptive field are dependent and are part of one cluster, i.e., they fire together. In that case, it should be $p(s_l) = m_u$ for any $u$, but we also reduce prediction variance by averaging between different $u$.
    \item $p(s_l=1 | c^u_{t-1}=1) \to 0$ for all $u$ means that presynaptic cells don't form a cluster. In that case, segment activation probability is just a product of the activation probability of each cell. 
\end{itemize}

The resulting equation for belief propagation in DHTM is the following:
\begin{equation}
    p(h^k_t) = p(c^j_t=1) = 
        \underset{j \in \mathrm{cells}[H^k_t]}{\mathrm{softmax}}(
            \underset{l \in seg(j)}{\mathrm{max}}(E_l)
        ), 
\label{eq:belief_prop}
\end{equation}
\noindent
where $E_l = \log f_l + \log L_l$, $\mathrm{cells}[H^k_t]$---indexes of cells that represent states for $H^k_t$ variable. Here, we also approximate logarithmic sum with $\mathrm{max}$ operation inspired by the neurophysiological model of segment aggregation by cell \citep{Stuart2015}. 

The next step after computing $p(h^k_t)$ distribution parameters is to incorporate information about current observations $p(h^k_t\ |\ o^k_t) \propto p(h^k_t) \mathbb{I}[h^k_t \in \mathrm{col}(o^k_t)]$. After that, the learning step is performed. The step for closing the loop of our TM algorithm is to assign the posterior for the current step $p(h^k_t\ |\ o^k_t)$ to $M^i_{t-1}$.

DHTM learns $f_l$ and $w_{ul}$ weights by Monte-Carlo Hebbian-like updates. First, $h^i_{t-1}$ and $h^k_t$ are sampled from their posterior distributions: $p(h^i_{t-1}\ |\ o^i_{t-1}) \propto M^i_{t-1}$ and $p(h^k_t\ |\ o^k_t)$ correspondingly. Then $f_l$ is updated according to the segment's $s_l$ and its cell's $c^j_t$ activity so that $f_l$ is proportional to the number of coincidences $s_l=c^j_t=1$ during the recent past, i.e., time steps at which a cell and its segment are active simultaneously. It's similar to Baum-Welch's update rule \citep{baum_welch_1970} for the transition matrix in HMM, which, in effect, counts transitions from one state to another, but, in our case, the previous state (context) is represented by a group of RVs, not just one hidden RV.

\add{Weights $w_{ul}$ are also updated by the Hebbian rule to reflect the specificity of a presynaptic cell $u$ for activating a segment $l$. That is, they are targeted to represent probability $p(s_l=1\ |\ c^u_{t-1}=1)$ that segment $s_l$ is active, given cell $u$ was active at the previous time-step. We could learn it by counting activation coincidences and mismatches. But in our algorithm it is approximated as an exponential moving average of the segment's $s_l$ activation frequency, given $c^u_{t-1}=1$: $\Delta w_{ul} = \alpha \cdot \mathbb{I}[c^u_{t-1}=1] \cdot (\mathbb{I}[s_l=1] - w_{ul})$, where $\alpha \in [0, 1)$---learning rate.}

\subsection{Agent Architecture} \label{sec:ag_arch}
We incorporate DHTM as a part of an RL agent. The agent consists of a DHTM memory model, an SF mapping from hidden space, and a feature reward function. The memory model aims to speed up SF learning by predicting cumulative future distributions of feature variables $\Phi$ according to \eqref{eq:gen}. As shown in \eqref{eq:state_value_features}, SF representations are learned to estimate state value. The $r(\varphi^k_t)$ reward function is also learned during interaction with the environment and, combined with SF representations, is used to estimate the action value function. 

\begin{algorithm}[tb]
\caption{General agent training procedure}
\label{alg:agent_train}
\begin{algorithmic}[1] %[1] enables line numbers

\FOR{episode=1..n}
    \STATE RESET\_MEMORY()
    \STATE action $\leftarrow$ null
    \WHILE{(\NOT terminal) \AND (steps $<$ max\_steps)}
        \STATE obs, reward $\leftarrow$ STEP()
        \STATE features $\leftarrow$ ENCODE(PREPROCESS(obs))
        \STATE OBSERVE(features, action)
        \STATE REINFORCE(reward, features) \label{alg:line:reinforce}
        \STATE action $\leftarrow$ SAMPLE\_ACTION()
        \STATE ACT(action)
    \ENDWHILE
\ENDFOR
\end{algorithmic}
\end{algorithm}

The agent training procedure is outlined in Algorithm~\ref{alg:agent_train}. For each episode, the memory state is reset to a fixed initial message with \texttt{RESET\_MEMORY()} and \texttt{action} variable is initialized with \texttt{null} value. An observation image returned by an environment (\texttt{obs}) is first preprocessed to get events, mimicking a simple event-based camera with a floating threshold determined from the average difference between the current and previous step image intensities. The resulting events are encoded to SDRs with a biologically inspired spatial pooling encoder described in Appendix~\ref{appx:enc}. In \texttt{OBSERVE()} routine, the memory learns to predict next feature states as described in Section~\ref{sec:dhtm} and SF learning happens according to \eqref{eq:td_feature_update}. An agent learns associations to feature states and rewards in line \ref{alg:line:reinforce}:
\begin{equation}
    r^k_i \leftarrow r^k_i + \alpha \mathbb{I}[\varphi^k_t=i] (R_t - r^k_i),
\end{equation}
\noindent
where $\alpha$ is a learning rate, $R_t$---a reward for the current time step.

\add{We include actions into the model by forcing some of the hidden variables $H^k_t$ to represent actions. That is, we assume that information about action is included in the hidden state of the model. For example, if we have 4 actions, we set 4 states for one of the hidden variables and set its state from observation of the action. We form on-policy SFs, i.e. relying on policy iteration theorem.}  

An agent has a softmax policy over predicted values: $\pi(a_t\ |\ o_{0:t}) = \mathrm{softmax}(V[p(h_{t+1}\ |\ o_{0:t}, a_t)])$. We use the model to predict the hidden state distribution for every action in the next timestep $t+1$ and then estimate its value according to \eqref{eq:state_value_features}.

\section{Experiments}
We test our model in a reinforcement learning task in a pinball-like 2D environment, where successor features are easy to interpret, and in a more challenging AnimalAI 3D environment. This section shows how different memory models affect SF learning and an RL agent's adaptability. In our work, we compare the proposed DHTM model with LSTM \citep{lstm}, RWKV \citep{peng2023rwkv}, and CSCG \citep{CSCG_George_2021} \add{(see Appendix~\ref{appx:baselines} for the details)}.

\subsection{Pinball}
\add{The first, classic maze, test is designed in the Pinball environment (see Appendix~\ref{appx:setups} for details) to qualitatively assess SFs formed by different TMs for random policy (see Fig.~\ref{fig:2dmaze}). Ball is controlled by the agent able to apply a momentum in four opposite directions. The ball and terminal state are separated by a wall with a door on the right. Each episode is maximum of 30 steps. Memories are tested in two regimes: 5-step planning (i.e. using \eqref{eq:gen} only) and prediction only (\eqref{eq:pred}). As can be seen from the heatmaps, only DHTM yields adequate value functions. However, as can be seen from the learning curves, surprise of DHTM is higher than of the other memories. LSTM's learning curve is much flatter than of the others. Five-step DHTM planning gives more abrupt value function in comparison to prediction regime, as it usually requires more than five steps to reach the goal. Heatmaps for other baselines can be found in Appendix~\ref{appx:vf}.} 

\begin{figure}[tb]
    \centering
    \includegraphics[width=\linewidth]{2d_maze_surprise.png}
    \caption{Results of 2D maze random policy experiment in the Pinball environment. Surprise learning curves for DHTM, LSTM, RWKV and CSCG. Heatmaps represent value functions for DHTM and LSTM.}
    \label{fig:2dmaze}
\end{figure}

The second test is to show how TM can enhance adaptation in changing environments. For that experiment, we use two configurations of the Pinball environment shown in Figure~\ref{fig:setups}-A. We narrow the action space to three momentum vectors: vertical, 30 degrees left and 30 degrees right from the vertical axis. Each time step, the agent gets a small negative reward and a large positive reward if the ball enters the force field in the centre. The episode finishes when the ball enters the rewarding force field or the maximum number of steps is reached. Each trial is run for 500 episodes, each a maximum of 15 steps long, and we average the results over three trials for each parameter set and memory model.

We test the accuracy of five-step SF representations by measuring their pseudo-surprise, which is surprise computed for observed states on different time steps after SF was predicted with respect to normalized SF (more details in Appendix~\ref{appx:ps}). \add{In all experiments, the encoder outputs five variables $\Phi$ with 50 states each.} As can be seen from Figure~\ref{fig:pinball_results_sr}, SRs produced by our memory model (\textrm{dhtm}) give lower surprise than SRs of LSTM (\textrm{lstm}) and RWKV (\textrm{rwkv}), and are on par with SRs produced by the Factorial version of CSCG (\textrm{fchmm}), which is just several CSCGs trained in parallel to enable handling of multiple variables outputted by the encoder.

\begin{figure}[tb]
    \centering
    \includegraphics[width=0.28\linewidth, trim={3.8cm 2.5cm 3.8cm 2.5cm}, clip]{fig03a_1step.png}
    \includegraphics[width=0.28\linewidth, trim={3.8cm 2.5cm 3.8cm 2.5cm}, clip]{fig03b_2step.png}
    \includegraphics[width=0.28\linewidth, trim={3.8cm 2.5cm 3.8cm 2.5cm}, clip]{fig03c_3step.png}
    \caption{Surprise comparison for various memory models including DHTM (ours), LSTM, RWKV, and Factorial version of CSCG (\textrm{fchmm}). The SFs generated by normalized five-step prediction models are used to calculate surprise for three future time steps.}
    \label{fig:pinball_results_sr}
\end{figure}

Then, we test how the number of prediction steps affects the agent's adaptability in the Pinball environment. In the first 500 episodes, the agent is trained to reach the target in the centre, as shown in Figure~\ref{fig:setups}-A, then the target is blocked by a random force field that applies force in a direction perpendicular to the ball's movement. The results show that an agent that uses five prediction steps during n-step TD learning of SF adapts faster to the changes in the environment than an agent that uses 1-step TD learning for SF, as seen from Figure~\ref{fig:pinball_results_rl}-A.

\begin{figure}[tb]
    \centering
    \includegraphics[width=0.7\linewidth]{rewards.png}
    \caption{A. Comparison of agent's adaptability during changes in the environment with different prediction steps during n-step TD learning of SF. At the 500th episode, the environment changes its configuration, shown in Figure~\ref{fig:setups}-A.
    B. AnimalAI changing food position experiment. Left picture shows DHTM reward curves, each averaged over five trials, for two cases: SF formed by 7-step planning using DHTM, and SF predicted using TD-learned weights and DHTM-inferred belief states. At the 300th episode, the food is moved to the opposite corridor (see Fig.~\ref{fig:setups}-C).
    }
    \label{fig:pinball_results_rl}
\end{figure}

\subsection{AnimalAI}
\add{We designed an experiment in the AnimalAI environment shown in Figure~\ref{fig:setups}-C. There are two corridors, one of which contains food (yellow circle). The agent makes a decision at the start of the trial, having three options: go to the left corridor, go to the right corridor, or stay in place turning. After the decision is made, the agent follows a fixed strategy, which brings it either to the right corridor or to the left, and it observes its movement and actions. An episode ends when the strategy is executed. Each time step, the agent gets a small negative reward, and it gets a big positive reward only if it reaches the food. After 300 episodes, the food is moved to the other corridor. Reward curves averaged over five trials for each setup are presented in Figure~\ref{fig:pinball_results_rl}-B. There are two cases on the plot: SF is formed by prediction \eqref{eq:pred} or by planning \eqref{eq:gen}. The results for DHTM show that planning allows much faster adaptation to the change of the rewarding food position.}

\section{Conclusion}
In this paper, we introduce a novel probabilistic Factorial-HMM-like algorithm DHTM for learning an observation sequence model in stochastic environments that uses local Hebbian-like learning rules, which renders it apt for running on neuromorphic processors. \add{DHTM is scalable to multiple feature variables as it employs sparse distributed representations and a sparse factor function implementation using segments, inspired by biologically plausible multicomponent neural models. In contrast to methods that use Monte-Carlo trajectory sampling for future states probability estimation, our method is able to perform belief propagation, so each prediction step adds a constant amount of computation.} We show that our memory model can quickly learn the observation sequences representation and the transition dynamics. The DHTM produces more accurate n-step Successor Features than LSTM and RWKV, which speeds up n-step TD learning of the SF in Reinforcement Learning tasks with the changing environment.

\add{One of the limitations of the DHTM is that its temporal context is random, as it is formed on the fly. That is, the mechanism of context formation doesn't allow generalizations. That is why we are forced to use feature space inferred from observations for value function decomposition, to soften this problem. Nevertheless, we believe that forming Successor Features combined with a two-level hierarchy of DHTM layers may provide the next step to circumvent this limitation, which is the direction of our further research. Another limitation is the maximum number of variables per factor. The number of segments in use grows exponentially with the number of variables per factor, especially in noisy environments. Solving this issue would require modifying the segment excitation or growth algorithms.}

%\subsubsection*{Author Contributions}
%ED developed the theoretical foundations of the memory model and its software implementation, conducted experiments, and prepared the text of the article. PK developed the encoder and decoder, prepared and configured the LSTM and RWKV baselines. AP advised and supervised the work of the team. PK and AP also helped with writing the article.

% \subsubsection*{Acknowledgments}
% Use unnumbered third level headings for the acknowledgments. All
% acknowledgments, including those to funding agencies, go at the end of the paper.

\bibliography{iclr2024_conference}
\bibliographystyle{iclr2024_conference}

\appendix

\section{Encoding and Decoding Observations}
\label{appx:enc}
Because our model is designed to work with sparse distributed representations and the testing environments do not provide observations as SDRs by default, an encoding procedure is required. For this task, we use a modified version of the Spatial Pooler (SP) \citep{spatial_pooler_2017,sp_math_2017}, a distributed noise-tolerant online clustering neural network algorithm that converts input binary patterns into SDRs with fixed sparsity while retaining pairwise similarity \citep{kuderov2023stabilize}. The SP algorithm learns a spatial specialization of neurons' receptive fields using the local Hebbian rule and k-WTA ($k$ winners take all) inhibition \citep{kwta}. Here we outline the main differences from the ``vanilla'' version of the SP algorithm described in \citet{spatial_pooler_2017}.

During an agent's decision-making process pipeline, the SP encoder accepts a current observation $o$ and transforms it to a latent state SDR $z$. In terms of processing, our SP encoder functions as a standard artificial neural network with a k-WTA binary activation function:
\begin{align}
    \text{overlaps}_i &= \beta_i W_i o 
        \label{eq:overlaps}\\
    z_i &= \mathbb{I} \left[i \in \text{kWTA}(\text{overlaps}) \right],
\end{align}
\noindent
where $o$---a binary observation vector, $W_i$---a row-vector representing $i$-th neuron's connection weights (where non-existing connections have zero weights), $\text{overlaps}_i$---a value representing the strength of the input pattern recognition with the neuron $i$ \footnote{While the name ``overlap'' does not exactly reflect its meaning in our SP modification, because it is not a binary overlap between a receptive field and an input pattern, we kept it on purpose to refer to the similar term commonly used for the original SP.}, $\beta_i$---an $i$-th neuron boosting value, $z_i$---an $i$-th bit of an output SDR, $\mathbb{I}[\dots]$---an indicator function, kWTA---a $k$-winners-take-all activation function returning $k$ indices of the neurons with the highest overlap.

One difference between the ``vanilla'' SP algorithm and ours is that we do not distinguish between potential and active neural connections. Because all [existing] connections are active, they all participate in calculating overlaps. In the overlaps calculation, non-binary, that is, real-valued weights are used, similar to artificial neural networks, as shown in \eqref{eq:overlaps}. Furthermore, each neuron has a fixed capacity to produce neurotransmitters, which it distributes between its synaptic connections. This means that we keep all neuron weights normalized and summed to one. While it achieves the same Hebbian learning with homeostatic plasticity as the original SP, the exact formula is slightly different:
\begin{align}
    \Tilde{W}_i &= W_i + \alpha z_i \frac{
        \text{RF}_i \odot o
    }{
        \sum_j {(\text{RF}_i \odot o)_j}
    } \nonumber \\
    W_i &\leftarrow \frac{\Tilde{W}_i}{\sum_j{\Tilde{W}_{ij}}},
\end{align}
\noindent
where $\Tilde{W}_i$---a row of new $i$-th neuron weights before normalization, $\alpha$---learning rate, $z_i$---a binary value representing the current activity state of the $i$-th neuron, $\text{RF}_i$---an $i$-th row of the binary connectivity matrix representing an $i$-th neuron receptive field, $\odot$---elementwise product, $o$---a binary observation vector.

The original SP algorithm has several drawbacks, including encoding instability caused by an innate homeostatic plasticity mechanism known as boosting, which helps neurons specialize and increases overall adaptability but makes memorization tasks more difficult, and slow processing on large inputs such as images, where an encoding overhead becomes noticeable when compared to overall model timings around 1k input size.

The introduction of the newborn stage, which follows the ideas proposed in \citet{dobric2022importance}, solves an encoding instability problem. The newborn stage of a spatial pooler occurs during the early stages of its learning process, when its neurons are expected to specialize. The boosting, which is intended to aid in the specialization process, is activated only during the newborn stage and its scale gradually decreases from the configured value to zero. Boosting remains turned off during an encoder's ``adulthood'', reducing the possibility of spontaneous re-specialization.

To reduce processing overhead, we use a much more sparsified connection matrix than in the original SP version. We randomly initialize connections with 40-60\% sparsity, which is typical for the ``vanilla'' SP. Then, during the newborn stage, we gradually prune the vast majority of the weakest connections, resulting in neurons that are highly specialized due to their small receptive fields. We typically configure the final receptive field size in relation to the average input pattern size (usually 25-200\% of it, resulting in 0.1-10\% connections sparsity). For example, if binary input patterns have on average 100 active bits out of 1000, we can set the target size of receptive fields to 25, which is 25\% of the active input size and corresponds to 2.5\% connection matrix sparsity. As a result, the spatial pooler's instability (and thus adaptiveness!) becomes even more limited in the adult stage.

Because of its soft discretization (from the distributed representation) and clusterization properties, we expect SP to assist the model with input sequence memorization and an environment transition dynamics generalization tasks in addition to the encoding itself. However, because the SP encoder learns online, particularly during the newborn stage, its output representation can be highly unstable during the early stages, potentially resulting in a performance drop.

To visualize and debug an encoded observation, we also learn a decoder, which is a linear neural layer learned locally with gradient descent on the MSE between the predicted reconstruction and the actual observation.

\section{Value Function Decomposition}
\label{appx:decompose}
\add{In our agent model, we approximate the reward function $R(s)$ as a sum: $R_t=\frac{1}{n}\sum^n_{k=1}{r(\varphi^k_t) \varphi^k_t}$, where $r(\varphi^k_t)$ is a reward associated with state $\varphi^k_t$, $n$---number of feature variables. Then, similarly to the Successor Representation idea (see Section~\ref{sec:sr}), the value function can be represented as:}

\add{\begin{align}
    V(h_t) 
    &= \mathrm{E} [\sum^{\infty}_{l=0} \gamma^l R_{t+l+1} \ |\ h_t] 
        = \sum^{\infty}_{l=0} \gamma^l \mathrm{E} [ 
        \frac{1}{n}\sum^n_{k=1}{r(\varphi^k_{t+l+1})} \ |\ h_t]
        \nonumber \\
    &= \frac{1}{n}\sum^n_{k=1}\sum_{j} \sum^{\infty}_{l=0} \gamma^l p(\varphi^k_{t+l+1}=j\ | \ h^k_t) r^k_j
        \nonumber \\
    &= \frac{1}{n} \sum^n_{k=1}\sum_{j} M^k_{j}(h^k_t) r^k_j,
        \label{eq:state_value_features}
\end{align}
\noindent
where $M^k_{j}(h^k_t)=\sum^{\infty}_{l=0} \gamma^l p(\varphi^k_{t+l+1}=j\ | \ h^k_t)$, $h_t=(h^1_t, ..., h^n_t)$---hidden state vector of variables $\{H^k_t\}_k$. }

\add{Then, the temporal difference for $M^k_{ij}=M^k_{j}(h^k_t=i)$ is:}

\add{\begin{align}
    \delta^k_{ij} &= \sum^{T}_{l=0} 
        \gamma^l p(\varphi^k_{t+l+1}=j\ | \ h^k_t=i)
         + \gamma^{T+1} \sum_{m} M^k_{mj} p(h^k_{t+T+1}=m\ |\ h^k_t=i)
         - M^k_{ij},
        \label{eq:td_update_feature}
\end{align}
However, in POMDP we can't observe $h^k_t$, we only have a distribution $p(h^k_t \ |\ o_{0:t})$. Therefore, we need to average out the hidden state variable $\delta^k_j=\sum_i\delta^k_{ij}\cdot p(h^k_t=i \ |\ o_{0:t})$. Assuming that we minimise $L=(\delta^k_j)^2$, we get the following update rule:
\begin{align}
        \delta^k_j &= \mathrm{gen}_{t+T}(\varphi^k=j\ |\ o_{0:t}) + 
            \gamma^{T+1}\mathrm{pred}_{t+T+1}(\varphi^k=j\ |\ o_{0:t}) -
            \sum_i M^k_{ij}p(h^k_t=i\ |\ o_{0:t}) \\
        M^k_{ij} &\leftarrow M^k_{ij} + 
        \alpha \delta^k_{j} \cdot p(h^k_t=i \ |\ o_{0:t}),
    \label{eq:td_feature_update}
\end{align}
\noindent
where $\mathrm{gen}_{t+T}$---Successor Features component, generated by temporal memory up to timestep $T$, and $\mathrm{pred}_{t+T+1}$---SF component predicted using $M^k_{ij}$ weights:
\begin{align}
\mathrm{gen}_{t+T}(\varphi^k=j\ |\ o_{0:t}) &= \sum^T_{l=0}\gamma^l
    \sum_i p(\varphi^k_{t+l+1}=j\ |\ h^k_t=i)p(h^k_t=i\ |\ o_{0:t}) \label{eq:gen} \\
    \mathrm{pred}_{t+T+1}(\varphi^k=j\ |\ o_{0:t}) &= \sum_i M^k_{ij}
    p(h^k_{t+T+1}=i\ |\ o_{0:t})
    \label{eq:pred}
\end{align}}

\section{Biological Interpretation} 
\label{appx:bio}
\add{Neural implementation of the DHTM is inspired by neocortical neural networks (see Fig.~\ref{fig:bio_intr}). Hidden variables $H^k$ may be considered as populations of excitatory pyramidal neurons in cortical layer L2/3 of somatosensory areas, with lateral inhibition modelled as $\mathrm{softmax}$ function. \citet{staiger_2021} showed that neurons in this layer are responsible for temporal context formation.}

\add{The neuronal activity at timestep $t$ can be thought to carry messages $M^k_{t-1}$. Messages are propagated through synapses of dendritic segments, which correspond to factors $F^k_c$. Dendritic segments of biological neurons are known to be coincidence detectors of their synaptic input \citep{Stuart2015}. We use the notion of dendritic segment to sparsely represent context factors $F^k_c$, as each factor value corresponds to a particular combination of states (or active cells).}

\add{Feature variables $\Phi^k_t$ may be considered to represent cells of a granular layer (L4), as they are known to be the main hub for sensory excitation for L2/3. L2/3 cells that have common sensory input from the layer L4 are modelled as columns for particular feature states $\mathrm{col}(\varphi^k_t)$ \citep{mountcastle_columnar_1997}.}
\begin{figure}[h]
    \centering
    \includegraphics[width=0.8\linewidth]{bio_intr.png}
    \caption{
        Biological view of the neural implementation of the DHTM. Variables $H^{\cdot}_{t-1}$ correspond to populations of neurons that have common sensory input and lateral inhibitory competition. Dendritic segments correspond to factor values $f_l$. Spike frequency of a neuron reflects state probability $p(h^k_t)$ of a variable.
    }
    \label{fig:bio_intr}
\end{figure}

\section{Pseudo-Surprise}
\label{appx:ps}
\add{To calculate the pseudo-surprise of SF, we do the following:
\begin{enumerate}
    \item Normalize SF by summing it over corresponding variables $\Phi$ and dividing SF by these sums. The result is the SF-induced probability distribution $p(\varphi^k)$ of $\Phi^k$ variables.
    \item Measure average surprise over future observed states $\varphi^k$, according to this distribution: $-\log p(\varphi^k=j)$, where $j$ is the observed state.
\end{enumerate}
Normalized SF represents the future observation (feature) profile for the current state. Pseudo-surprise shows whether SF is consistent with the observed feature states or not. For example, if SF doesn't predict feature $j$ ($p(\varphi^k=j) = 0$), but we observe it, this will result in infinite surprise, which means that the SF is of bad quality.}

\section{Baseline RL agents implementation} \label{appx:baselines}
\add{As mentioned in Section~\ref{sec:ag_arch}, we incorporate DHTM as a part of an RL agent, which has a memory model, an SF mapping from hidden space, and a feature reward function. Our memory model is expected to speed up SF learning for an agent. We put this hypothesis to the test by experimenting with other memory models while keeping the agent architecture the same. Thus, all tested memory models work in the same regime---they learn sequences of encoded binary observations (i.e. SDRs that we get from Spatial Pooler encoder described in Appendix~\ref{appx:enc}) concatenated with one-hot encoded actions.}

\add{LSTM baseline was implemented with a single LSTMCell from PyTorch library \citep{pytorch}. It is supported by an additional symexp-layer to encode input before passing it to the LSTM cell and a symexp-layer to decode the LSTM cell's output from the LSTM's hidden state back to the input representation, where the symexp activation function, $\mathrm{symexp}(x) = \mathrm{sign}(x) (e^{\lvert x \rvert} - 1)$, is the inverse of the symlog function: $\mathrm{symlog}(x) = \mathrm{sign}(x) \log{(\lvert x \rvert + 1)}$.}

\add{We implemented the RWKV baseline in a similar way: a single RWKV layer supported by single-layer linear encoder and decoder. The current public RWKV implementation is a fast-evolving framework \citep{peng_bo_2021_5196578}, and for increased performance it is tightly bound to the offline batch training common for transformer architectures. In our case, we needed a so-called sequential mode for online learning similar to LSTM. Thus, we adapted another public implementation mentioned in the official documentation (\href{https://github.com/BlinkDL/ChatRWKV/blob/main/RWKV_in_150_lines.py}{RWKV in 150 lines of code}).}

\add{Both RNNs were trained online with backpropagation through time (BPTT) on the observed sequences with the backpropagation update step scheduled every $k$ timesteps. We experimented with different schedules and found that $k = 20$ provides a balance between the training stability and speed. The learning rate was set to $\alpha=6 \cdot 10^{-4}$ for LSTM and $\alpha = 5 \cdot 10^{-4}$ for RWKV.}

\add{We also incorporated some notion of random variables and their states by splitting the hidden state of the tested RNNs into groups. In all experiments the hidden state represents $80$ categorical variables with $4$ states. That is, both RNNs are forced to learn $80$ categorical distributions with multi-cross-entropy loss to explain the observed sequences, which is somewhat close to the multi-categorical hidden state representation used in DreamerV2/V3 \citep{hafner2023mastering}. The idea of using symexp activation function, mentioned above, is inspired by Dreamer too, and is used to remedy the problem of learning extreme probability values. Without symexp the neural network has to represent zero probability with high negative logit values and probability one with high positive logit values, which is hard to reach with a low learning rate and may lead to instabilities. Thus, symexp function makes it faster to reach target values in log space.}

\add{CSCG baseline was implemented using code from the repository accompanying the paper (\url{https://github.com/vicariousinc/naturecomm_cscg}). In our experiments, in order to handle multiple feature variables, we trained several CSCGs independently using the same data. CSCG was trained on batches with size of 500 observation steps. We iteratively calculated exponential moving average of transition matrices obtained for different batches with smoothing coefficient $\alpha=0.8$. This smoothed transition matrix was used as initialization for the next batch training and for inference.}

\add{All baselines employed a multilayer perceptron, implemented with PyTorch, in order to map from the hidden state distribution to Successor Features because the simple linear model described in Appendix~\ref{appx:decompose} didn't work for them. The MLP had one hidden layer with size 256 units and batch size of 32 for CSCG and 256 for LSTM and RWKV with squared temporal difference as a loss function defined by \eqref{eq:td_feature_update}.}

\section{Experimental Setups}

Pinball is a partially observable environment developed in the Godot Game Engine \citep{beeching2021godot}. A ball that can move in the surface's 2D space and a surface with borders make up the environment (see Figure~\ref{fig:setups}-A). Force fields depicted as circles introduce stochasticity to the environment as they deflect the ball in random directions. An agent can apply arbitrary momentum to a ball. For each time step, the environment returns an image of the top view of the table as an observation and a reward. The agent gets the reward by entering force fields. Each force field can be configured to pass a specific reward value and to terminate an episode.

AnimalAI is a testbed inspired by experiments with animals \citep{pmlr-v123-crosby20a}. The environment consists of a 3D area surrounded by a wall and many different objects that can be placed using a configuration file, including walls, food, ramps, trees, movable obstacles and so on (see Figure~\ref{fig:setups}-C).

\label{appx:setups}
\begin{figure}[h]
    \centering
    \includegraphics[width=0.8\linewidth]{experimental_setups.png}
    \caption{
        A. Pinball experiments used two different setups. The upper image shows a setup in which the target is not blocked. The lower image depicts the setup, with the target obscured by a random field that deflects the ball perpendicular to its movement direction.
        B. Visualization of several steps in the Pinball environment. Each step is depicted by raw observation image, binary image of events, predicted events and Successor Features.
    C. AnimalAI experimental setup: two corridors, one of which contains food (yellow circle); the agent is between the corridors (letter A). Food position changes after 300 episodes. Images on the right: observations (raw), processed observations (events), predictions and Successor Features decoded back to observation space for three last steps of an episode.
    }
    \label{fig:setups}
\end{figure}

\section{2D Maze Value Functions}
\label{appx:vf}
\begin{figure}[h]
    \centering
    \includegraphics[width=0.8\linewidth]{vf_rwkv_cscg.png}
    \caption{Heatmaps representing value function in 2D maze Pinball environment setup.}
    \label{fig:vf_rwkv}
\end{figure}

\section{Glossary}
\textbf{Categorical Random Variable}---a discrete random variable that can take one of a finite number $K$ of possible states.\\
\textbf{Cortical Column or Minicolumn}---a population of neurons in the neocortex that spans across layers and shares sensory input.\\
\textbf{Dendritic segment}---a group of synapses (neuron's connections) that acts as an independent computational unit affecting the resulting neuron's activity.\\
\textbf{Factor Graph}---bipartite graph representing the factorization of a probability distribution, with one part representing factor nodes and another---random variables.\\
\textbf{Multi-compartment neuron model}---a model of neuron that divides neuron's connections into groups (segments) of different types (compartments), where each group may be considered as partly independent computational unit and groups of each compartment may affect the neuron's activity differently.\\
\textbf{Sparse Distributed Representations (SDR)}---sparse binary vector in a high-dimensional space, usually formed by k-WTA algorithms.\\
\textbf{Spatial Pooler (SP)}---a distributed noise-tolerant online clustering neural network algorithm that converts input binary patterns into SDRs with fixed sparsity while retaining pairwise similarity.\\
\textbf{Successor Representations (SR)}---a discounted sum of future [one-hot encoded] observations.\\
\textbf{Successor Features (SF)}---a generalization of SR, a discounted sum of future latent states.\\
\textbf{Temporal Memory (TM)}---in this work by this term we mean ``memory for sequences''.\\
\textbf{Hidden Markov Model (HMM)}---a statistical model of a stochastic process with unobserved (hidden) states, where the probability of a state depends only on the previous state of the process.

\end{document}

