\documentclass[accepted]{uai2022} %

\usepackage[american]{babel}
\usepackage{zref-xr}
\usepackage{nameref}

\usepackage{natbib} %
\bibliographystyle{plainnat}
\renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} %
\usepackage{booktabs} %
\usepackage{tikz} %

\usepackage{bm}
\usepackage{dsfont}
\usepackage{graphicx}
\graphicspath{{./figs/}}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{wrapfig}
\usepackage{xcolor}
\usepackage{hyperref}

\definecolor{navyb}{RGB}{0,0,128}
\definecolor{burgundy}{RGB}{150, 0, 32}
\hypersetup{
  colorlinks,
  citecolor=navyb,
  linkcolor=burgundy,
  urlcolor=navyb,
  filecolor=burgundy
  }
 

\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage[capitalize,noabbrev]{cleveref}
% Cross-document references (zref-xr): import the labels of the external document
% `erraqabi_314` as ordinary LaTeX labels (toltxlabel=true) rather than zref labels,
% so that \ref/\cref in this file can point to labels defined there.
% NOTE(review): presumably `erraqabi_314` is the main paper -- confirm.
\zxrsetup{toltxlabel=true,tozreflabel=false,verbose}
\zexternaldocument*{erraqabi_314}

% Offset the counters so that the first equation numbered here is 7 and the first
% figure is 9 (presumably continuing the main paper's numbering -- TODO confirm
% these offsets against the final main-paper counts).
\setcounter{equation}{6}
\setcounter{figure}{8}

\usepackage{algorithm}
\usepackage{algorithmic}
% \numberthis: advance the equation counter by one and tag the current display line
% with the resulting number (presumably meant for numbering a single line inside a
% starred amsmath environment such as align* -- it is not used in this file).
\newcommand\numberthis{\addtocounter{equation}{1}\tag{\theequation}}
% Text-mode labels ``hi'' and ``low'', used as sub/superscripts on high-level and
% low-level quantities (e.g. \pi_\hi, R^\hi, \pi_\low).
\newcommand{\hi}{\text{hi}}
\newcommand{\low}{\text{low}}


% \blfootnote{<text>}: footnote without a visible mark.
% Within a local group, \thefootnote is made empty before \footnote is issued, and
% the footnote counter is then decremented by one so that the numbering of the
% regular footnotes is not shifted by this unmarked one.
\newcommand\blfootnote[1]{%
  \begingroup
  \renewcommand\thefootnote{}\footnote{#1}%
  \addtocounter{footnote}{-1}%
  \endgroup
}

% Author-block layout: use a 2em horizontal gap as the separator between author
% names in all three separator macros.
% NOTE(review): these \renewcommand lines appear before \usepackage{authblk}; they
% only succeed if \Authsep/\Authand/\Authands are already defined at this point
% (presumably by the document class) -- confirm, and check that loading authblk
% afterwards does not reset them.
\renewcommand{\Authsep}{\hspace{2em}}
\renewcommand{\Authand}{\hspace{2em}}
\renewcommand{\Authands}{\leading{18pt}\hspace{2em}}

\usepackage{authblk}
% Replace authblk's internal separator between affiliations by two non-breaking
% spaces followed by the affiliation font switch.
\makeatletter
\renewcommand\AB@affilsepx{~~ \protect\Affilfont}
\makeatother


% \swap[<sep>]{<a>}{<b>}: typesets <b><sep><a>, i.e. the two mandatory arguments in
% reverse order around the optional separator (default: `-').
\newcommand{\swap}[3][-]{#3#1#2} %

\title{Temporal Abstractions-Augmented Temporally Contrastive Learning: \\An Alternative to the Laplacian in RL (Supplementary material)}

\author[1,2]{Akram~Erraqabi}
\author[4,5,6]{Marlos~C.~Machado}
\author[1,3]{Mingde~Zhao}
\author[8]{Sainbayar~Sukhbaatar}
\author[8]{\authorcr{Alessandro~Lazaric}}
\author[8]{Ludovic~Denoyer}
\author[1,2,7]{Yoshua~Bengio}
\affil[1]{%
    Mila
}\affil[2]{%
    Université de Montréal
}
\affil[3]{%
    McGill University
  }
\affil[4]{%
    Amii\protect\\
  }
\affil[5]{%
    University of Alberta
  }
\affil[6]{%
    CIFAR AI Chair
  }
\affil[7]{%
   CIFAR Fellow
  }
\affil[8]{%
    Meta AI
  }
  
\begin{document}

\maketitle
\appendix

\section{Representation Objective Augmentation: Ablation Study}
\label{app:ablation}

\subsection{Boredom augmentation helps exploration}
In order to illustrate the importance of the boredom term $\mathcal{B}$ in the final objective (\ref{eq:rep-L}), we conducted the same representation learning experiments for the three gridworld domains in the non-uniform prior setting, but this time with the non-augmented representation learning objective ($\beta'=0$).

\cref{fig:ablation_exp} shows how the agent failed at exploring the whole domain. In \textsc{T-Maze}, it focuses only on one corridor without getting curious about the other one. Regarding \textsc{U-Maze} and \textsc{4-rooms}, the agent stops exploring after discovering the end of the first corridor and the second room, respectively. This is due to the lack of incentive to visit the yet unseen states, as they are less rewarding for $\pi_\hi$ (i.e., closer in the representation space, hence smaller $R^\hi$) than the furthest explored state. The effect of the proposed augmentation compresses the representation of the explored area, say the first corridor in \textsc{U-Maze}, which makes the rest of the environment more appealing to explore for $\pi_\hi$ (i.e. further in the representation space, hence larger $R^\hi$).


\begin{figure}[ht!]
    \centering
    \begin{subfigure}[t]{0.2\columnwidth}
        \centering
        \includegraphics[width=\linewidth]{ablation_U_maze}
        \caption{\textsc{U-Maze}}
    \end{subfigure}%
    ~ \hspace{0.1\columnwidth}
    \begin{subfigure}[t]{0.2\columnwidth}
        \centering
        \includegraphics[width=\linewidth]{ablation_T_maze}
        \caption{\textsc{T-Maze}}
    \end{subfigure}
    ~ \hspace{0.1\columnwidth}
    \begin{subfigure}[t]{0.2\columnwidth}
        \centering
        \includegraphics[width=\linewidth]{ablation_4rooms}
        \caption{\textsc{4-rooms}}
    \end{subfigure}
    \caption{Learned representations in the gridworld domains with the \emph{non-augmented} objective. Without the boredom term, the agent fails to cover the state space (cf. Figure \ref{fig:progress_exp}), and may settle for incomplete representations. The colors reflect the distances in terms of the dynamics. They can be seen as quantities proportional to the length of the shortest path from $s_0$ (marked in red) to the represented state.}
    \label{fig:ablation_exp}
\end{figure}

\subsection{Boredom augmentation enforces dynamics-awareness}
\label{app:dynamic_aware}
To verify the benefit of the boredom term beyond helping exploration, we train the representation with the non-augmented objective ($\beta'=0$) but this time in the uniform prior setting, so as to factor out the exploration problem. \cref{fig:ablation_dynamics_awrns} illustrates the learned representations in the three gridworld domains.
These representations have failed to capture the dynamics. For example, in the case of \textsc{4-rooms}, the distances from the first room to the fourth and third rooms are comparable in the representation space, which indicates that the representation does not take into account the relative order in which the rooms should be visited, when moving from the first room to the last.
Similarly, in \textsc{U-Maze}, the end of the maze is closer to the initial area than the second corner is. However, in order to reach the former, one must pass by the latter.
This proves that the boredom term is not only important for the desired exploratory behavior (cf. \cref{fig:ablation_exp}), but also enhances the dynamics-awareness of our representation.

\begin{figure}[ht!]
    \centering
    \begin{subfigure}[t]{0.25\columnwidth}
        \centering
        \includegraphics[width=\linewidth]{unif_baseline_U_maze}
        \caption{U-Maze}
    \end{subfigure}%
    ~ \hspace{0.1\columnwidth}
    \begin{subfigure}[t]{0.25\columnwidth}
        \centering
        \includegraphics[width=\linewidth]{unif_baseline_T_maze}
        \caption{T-Maze}
    \end{subfigure}
    ~ \hspace{0.1\columnwidth}
    \begin{subfigure}[t]{0.25\columnwidth}
        \centering
        \includegraphics[width=\linewidth]{unif_baseline_4rooms}
        \caption{4-rooms}
    \end{subfigure}
    \caption{Learned representations when uniformly sampling over the state space. Without the boredom term, the representation does not reflect temporally-extended dynamics. The colors reflect the distances in terms of the dynamics. They can be seen as quantities proportional to the length of the shortest path from $s_0$ (marked in red) to the represented state.}
    \label{fig:ablation_dynamics_awrns}
\end{figure}

\section{A greedy high-level reward can hurt exploration}
\label{app:high_level_vs_greedy}
Our high-level reward term (\ref{eq:high_reward}) can be seen as an instance of assigning credit to skills based on their potential of affording more exploration. The greedy version of this reward, e.g. of the form $R^\hi_\text{greedy}(s^\hi_k, \bm{\delta}_k) \triangleq \|\phi(s^\hi_k) - \phi(s^\hi_{k+1}) \|_2$, would encourage each skill (and its direction) to take large steps. Taking large steps, even if it may seem intuitive, is not a good choice for exploration since the agent can get trapped in large oscillations: each skill can travel maximally, back and forth, e.g. along the diagonal of some initially explored area around $s_0$. These oscillations would still be very rewarding to the high-level policy in this greedy reward case without encouraging further exploration of the environment. Our high-level reward (\ref{eq:high_reward}) does not suffer from this limitation. Our reward choice does not force each of the intermediate skills (within the sequence of $L$ skills) to be individually ``greedy''. This allows picking skills that might seem sub-optimal, in the sense that they don't travel far on their own, but still afford/offer more promising exploration opportunities later on. For example, consider the case where the agent is in a room with a closed door and on the other side of this door there is a large space to explore. With our reward term, the skill of opening the door would be rewarded similarly to that of a skill able to travel far on the other side of the door once it is open. The high-level policy eventually learns that opening the door is valuable for exploring further areas in the environment.

\section{Implementation details}
\label{app:implem}

\subsection{GridWorld}
The states are one-hot encoded such that no positional information is provided to the agent. The domains dimensions are: \textsc{U-Maze} $30\times30$, \textsc{T-Maze} $40\times30$, \textsc{4-rooms} $21\times21$.

For all the experiments, we defined the representation network as an MLP of two hidden layers of size $128$ with tanh activations and a linear output layer of the size of the representation's dimensionality $d$. The high-level and the low-level policies are both MLPs of two hidden layers of size $128$ with tanh activations and a logsoftmax output layer of the size of their respective action spaces: the environment's $4$ actions for the low-level policy and $8$ actions for the high-level policy corresponding to the $8$ directions $\Omega = \{(\cos(2k\pi/8), \sin(2k\pi/8))\ |~ k \in \{0,\dots,7\}\}$ that define diverse skills.

The policies were trained with vanilla A2C with MC returns from the collected trajectories (Monte-Carlo estimates), i.e. no bootstrapped values were used. The skills being of a fixed size, they could be trained without any reward discount ($\gamma=1$).
The high-level and low-level policies were entropy-regularized with coefficients $0.3$ and $0.1$ respectively.

All of these networks were trained with RMSprop~\citep{hinton2012neural} and a step size of $0.001$. Environment-specific hyperparameters are provided below.

\subsubsection{Representation Learning}

\paragraph{\textsc{U-Maze}.} Our representation is learned in the non-uniform prior setting with $p_r{=}0.3$, $p_{rw}{=}0.4$ and $K{=}90$ (around the number of steps between $s_0$ and the furthest state in the maze).
We learn a 2-dimensional representation ($d=2$) using the representation learning objective (\ref{eq:rep-L}) with $\beta=0.2$ and $\beta'=2$.
We fix the skills' length to $c=30$ steps (so $L=K/c=3$), and jointly train the representation $\phi$ and the policies $(\pi_\hi,\pi_\low)$ by collecting, for each update, a batch of $N=32$ trajectories of length $c$ to fill $D_s$ and $D_{\pi_\mu}$ as described in Algorithm~\ref{alg:simultaneous_training}. We train them for $700$ epochs where each epoch corresponds to $10$ updates (convergence to the complete representation required around $500$ epochs).

\paragraph{\textsc{T-Maze}.} Our representation is learned in the non-uniform prior setting with $p_r{=}0.2$, $p_{rw}{=}0.4$ and $K{=}40$ (around the number of steps between $s_0$ and the furthest state in the maze).
We learn a 2-dimensional representation ($d=2$) using the representation learning objective (\ref{eq:rep-L}) with $\beta=0.2$ and $\beta'=2$.
We fix the skills' length to $c=20$ steps (so $L=K/c=2$),
and jointly train the representation $\phi$ and the policies $(\pi_\hi,\pi_\low)$ by collecting, for each update, a batch of $N=32$ trajectories of length $c$ to fill $D_s$ and $D_{\pi_\mu}$ as described in Algorithm~\ref{alg:simultaneous_training}. We train them for $700$ epochs where each epoch corresponds to $10$ updates (convergence to the complete representation required around $350$ epochs).

\paragraph{\textsc{4-rooms}.} Our representation is learned in the non-uniform prior setting with $p_r{=}0.25$, $p_{rw}{=}0.5$ and $K{=}60$ (around the number of steps between $s_0$ and the furthest state in the maze).
We learn a 2-dimensional representation ($d=2$) using the representation learning objective (\ref{eq:rep-L}) with $\beta=0.2$ and $\beta'=2$.
We fix the skills' length to $c=20$ steps (so $L=K/c=3$),
and jointly train the representation $\phi$ and the policies $(\pi_\hi,\pi_\low)$ by collecting, for each update, a batch of $N=32$ trajectories of length $c$ to fill $D_s$ and $D_{\pi_\mu}$ as described in Algorithm~\ref{alg:simultaneous_training}. We train them for $700$ epochs where each epoch corresponds to $10$ updates (convergence to the complete representation required around $350$ epochs).


\paragraph{The Laplacian representation} \textsc{Lap-rep} was trained in the same environments' settings described above, for both the uniform and non-uniform prior settings (of course no policy is trained here so $p_{rw}=1$, and $(s_0, p_r)$ are not relevant for the uniform prior setting). Besides the representation's dimension $d$, we used the training configuration and hyperparameters proposed by~\citet{Wu2019}. For the uniform prior setting, our online data collection does not cause any discrepancy compared to the offline scheme used in~\citet{Wu2019}. Indeed, for a minibatch size large enough, the stochastic minibatch based training of \textsc{Lap-rep} when using a uniform prior is agnostic to the data collection scheme (offline vs online) since in both cases the minibatches are sampled from the exact same state distribution.

\subsubsection{Prediction and Control}

In the prediction and control experiments, we evaluate each pretrained representation by training an actor-critic agent to solve a goal-achieving task with a sparse reward ($r=1$ upon reaching the goal, and $r=0$ otherwise). The goal positions are set to $(1,30)$ in \textsc{U-Maze}, $(25,30)$ in \textsc{T-Maze} and $(1,21)$ in \textsc{4-rooms}.
The episode size was set to $100$ steps for all the gridworld domains.

For the prediction, the critic head is a linear function in the given representation, while the actor is an MLP with two hidden layers of size $64$ and tanh activations, a logsoftmax output layer of size $4$ (discrete gridworld actions), and the actor's input is the state one-hot code. For the control experiments, the actor-critic agent is defined on top of the representation as an MLP of two hidden layers of size $64$ with tanh activations that feed two output heads: a linear critic head, and a logsoftmax action head for the $4$ actions.
The agent is trained with A2C with MC returns and a discount of $\gamma=0.98$, a batchsize of $80$ episodes, an entropy regularization with a $0.01$ coefficient, and Adam optimizer~\citep{kingma2014adam} with a learning rate of $0.001$.
\subsection{MuJoCo: AntMaze}

The Ant agent has a 29-dimensional state space and an 8-dimensional action space (4 legs with 2 joints each to control). For the sake of simplifying the RL training algorithm,
we mapped each action-dimension interval to a discrete set of 5 values equally spaced over this interval.

We used 2 mazes similar in shape to those from \citet{Wu2019} (see \cref{fig:antmaze_domains}): \textsc{AntMaze-1} defines a 3D U-shaped corridor and \textsc{AntMaze-2} is a 3D swirl-shaped corridor.

\begin{figure}[H]
\centering
\begin{subfigure}[t]{0.35\columnwidth}
    \centering
    \includegraphics[width=\linewidth]{figs/AntMaze1_with_goal}
    \caption{\textsc{AntMaze-1}}
\end{subfigure}
\hspace{0.05\columnwidth}
\begin{subfigure}[t]{0.35\columnwidth}
    \centering
    \includegraphics[width=\linewidth]{figs/AntMaze2_with_goal}
    \caption{\textsc{AntMaze-2}}
\end{subfigure}%
\caption{\textsc{Antmaze} domains. Goal positions of the evaluation tasks are shown in green.}
\label{fig:antmaze_domains}
\end{figure}

We used the same architectures for the representation and the policies as for the gridworld, with the only difference that for the low-level policy, the action head was adapted to the discretization of the action space by having $8$ logsoftmax output heads of size $5$, one for each action dimension, and the corresponding $5$ discrete values. This choice makes the training algorithm simpler as it allows using A2C here as well. 

Our representation is learned in the non-uniform prior setting with  $p_r=0.2$, $p_{rw}=0.3$ and $K=500$. We learn a 2-dimensional representation ($d=2$) using the representation learning objective (\ref{eq:rep-L}) with $\beta=0.2$ and $\beta'=5$.
We fix the skills' length to $c=100$ steps (so $L=K/c=5$),
and jointly train the representation $\phi$ and the policies $(\pi_\hi,\pi_\low)$ by collecting, for each update, a batch of $N=32$ trajectories of length $c$ to fill $D_s$ and $D_{\pi_\mu}$, as described in Algorithm~\ref{alg:simultaneous_training}. We train them for $1000$ epochs where each epoch corresponds to $10$ updates (convergence to the complete representation required around $650$ epochs).

The policies were trained with the same A2C used in gridworld domains and the same RMSprop hyperparameters. The high-level and low-level policies were entropy-regularized with the coefficients $0.15$ and $0.1$, respectively.

For the reward shaping and skills evaluation experiments, the goal positions are shown in \cref{fig:antmaze_domains}, and success is defined as being within $\epsilon$ from the goal. Here, $\epsilon=2$ which corresponds to half the size of the building blocks of the mazes.

\subsubsection{Reward Shaping}

For these experiments, \textsc{Lap-rep} was trained in both the uniform and the non-uniform prior setting. 

For the uniform prior setting, we used $d=2$ and followed the experimental framework of \citet{Wu2019}. Since our \textsc{AntMaze} environments are larger, we collected $500{,}000$ training samples (10 times more than in \citet{Wu2019}) from a uniformly random policy, then we trained the representation on this large dataset. For all other hyperparameters, we used those provided in \citet{Wu2019}. With $d=20$, our replication of \textsc{Lap-rep} did not succeed in reward shaping.

Regarding the non-uniform prior setting, we used the same setting configuration as for \textsc{TATC}, with the representation objective from~\citet{Wu2019}.
We have tested online (similar to \textsc{TATC}) and offline \citep{Wu2019} data collection for the representation training, and $d\in \{2, 20\}$. Both schemes ended up performing the same way for the reward shaping task.

Now, for the reward shaping, we train a Soft Actor-Critic (SAC)~\citep{haarnoja2018soft} agent to reach a goal area (neighbourhood around the goal state) with episodes of size $1000$ steps. We use the following hyperparameters: 
\begin{itemize}
    \item Discount $\gamma=0.99$
    \item Entropy coefficient (temperature) $\alpha=0.1$
    \item Soft critic updates with smoothing constant $\tau=0.005$
    \item Replay buffer of size $5\cdot10^6$ (equal to the number of training steps).
    \item Adam optimizer with step size of $0.0001$
\end{itemize}

As SAC is sensitive to the reward scale~\citep{haarnoja2018soft}, we grid-searched this hyperparameter in $\{10^{-5}, 10^{-4}, \dots, 1, 2, 10, 20\}$, and the best performing one for our representation was $1$ in \textsc{AntMaze-1} and $0.01$ in \textsc{AntMaze-2}.
Regarding \textsc{Lap-rep}, we found that $10$ and $0.01$ worked the best for these two mazes, respectively. All these coefficients correspond to the dense reward shaping setting. Their values were doubled for the half-half mix reward setting, to account for the $0.5$ coefficient.

\subsubsection{Skills Evaluation}
\label{app:skill_eval}

To train DCO, we first collect a dataset to estimate the second eigenvector and then use the same dataset to train a policy -- the option -- using DDPG~\citep{lillicrap2015continuous}. Each DCO option is tied to its own eigenvector estimate and its own training set of size $500{,}000$ (10 times the size used in~\citet{Jinnai2020Exploration}). As suggested by the authors of DCO \citep{Jinnai2020Exploration}, the remaining hyperparameters to estimate the eigenvectors and train their corresponding options were taken from \citet{Wu2019}. DIAYN skills were trained as recommended by \citet{Eysenbach19}. For fair comparison, we train $8$ skills for both DCO and DIAYN.

For the skills evaluation stage, we freeze the learned low-level policies and train a high-level policy to use the $8$ skills as the only available actions to reach the goal $g$ on the other end of the \textsc{AntMaze-1} environment using a sparse reward $r_t = \mathds{1}\left [\|s_{t+1}-g\|_2\leq \epsilon \right ]$ within a finite horizon of $1000$ steps. Note that this task is quite challenging given the type of reward and the length of the episode, especially in a continuous state space. As our skills offer some flexibility in their execution (they can be started anywhere and run for an arbitrary number of steps), this episode length was decomposed into $5$ skills of $200$ steps each. The high-level policy was trained with A2C with MC returns (no discount), a batch size of $8$ episodes, and RMSprop optimizer with a learning rate of $0.001$.

\section{The switching utility of the boredom term}
Note that $\mathcal{D}_s$, in \cref{eq:boredom}, may contain trajectories from skills that are not yet duly trained; for example early in the training or in a freshly discovered area. Since at that stage, these skills' trajectories are close to random walks, their contribution to the boredom term is similar to the first attractive term, in \cref{eq:base_obj_empir}. This means that a new skill trajectory initially contributes to the temporal similarity term (attractive term) in training the representation, thus making the most out of the sampled skills' trajectories while these are still early in their training.
The more a skill is trained, the more structured its trajectories become and the more they contribute to the intended ``boredom'' effect (\cref{section:aug_rep_L}), that is, encouraging exploration and dynamics awareness (Appendix~\ref{app:ablation}).

\section{Connection to Behavioral Mutual Information}
\label{app:connect_visr}

There are numerous methods in the \emph{intrinsic control} literature that aim at maximizing the mutual information between the agent's behavior and a conditioning variable that encodes the available skills.

This type of intrinsic control is achieved by training a skill conditioned policy, $\pi$, to maximize the mutual information between the skill code, $z$, and some representation of the trajectory, $\tau$, obtained from the conditioned policy $\pi(\cdot|z)$. This objective can be written as:
\begin{equation}
\label{eq:MI}
    I(z; f(\tau)) = \mathcal{H}(z) - \mathcal{H}(z| f(\tau)),
\end{equation}
where entropy is denoted by $\mathcal{H}$, and $f$ is a function of the trajectory. It is also common to assume that $z$ is sampled from a fixed prior~\citep{Eysenbach19}, which simplifies this policy training objective as a minimization of the conditional entropy term in \cref{eq:MI}. In practice, the adopted training loss can be derived as a lower bound of this quantity, using an approximate posterior, $q$:
\begin{equation}
    L_q(\pi) = - \mathbb{E}_{z,\pi}[\log q(z| f(\tau)) ].
\end{equation}
Traditionally, the integrand of this expectation defines the intrinsic reward of the skill conditioned policy, while the approximate posterior $q$ is trained to discriminate the correct $z$ based on the observed behavior $\tau$. 

Inspired by the fast inference offered by Successor Features, \citet{Hansen2020Fast} proposed to use a \emph{log-linear} discriminator in the successor representation (SR), $\phi_\text{SR}$. In other words, the skill rewards can be written as:
\begin{equation}
\label{eq:visr_rew}
    r(s) = \phi_\text{SR}(s)^\top \mathbf{w},
\end{equation}
with $\mathbf{w}$ playing the role of the skill-identifying variable (denoted by $z$ in the general case above). Note that in this case, the function $f$ maps, as is commonly the case, to the final state of the trajectory. Now, consider the case where the trajectory is instead represented by the (normalized) latent direction of the final transition $(s,s')$. This reward would be
\begin{equation}
\label{eq:visr_rew_2final}
r(s,s') = \frac{(\phi_\text{SR}(s')-\phi_\text{SR}(s))^\top \mathbf{w}}{\|\phi_\text{SR}(s')-\phi_\text{SR}(s)\|_2}.
\end{equation}
Let's recall that the SR\footnote{The SR is encoded by the matrix $(I-\gamma T)^{-1}$, with $T$ the MDP's transition matrix.} shares the same eigenvectors as the Laplacian~\citep{stanchenfeld14, Machado2018}. This implies that it can also be approximated with a temporally-contrastive objective~\citep{Wu2019}, and potentially replaced by our alternative \textsc{TATC} representation $\phi$. Finally, we can rewrite \cref{eq:visr_rew_2final} as
\begin{equation}
\label{eq:tatc_visr}
r(s,s') = \frac{(\phi(s')-\phi(s))^\top \mathbf{w}}{\|\phi(s')-\phi(s)\|_2}.
\end{equation}
This reward corresponds to our skills' intrinsic reward from \cref{eq:skill_reward}, with $\mathbf{w} \equiv \bm{\delta}$.

\bibliography{erraqabi_314}

\end{document}