% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
% version; also before submission to
% see how the non-anonymous paper
% would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
% ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\renewcommand{\thefootnote}{}
\usepackage{hyperref}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage{multirow}
\usepackage{algorithm}  
\usepackage{algorithmic} 
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams


\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Learning Robust Representation for Reinforcement Learning \\ with Distractions by Reward Sequence Prediction}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<zhouqida@mail.ustc.edu.cn>?Subject=Paper RSP in UAI 2023}{Qi Zhou}{}}
\author[1,2]{\href{mailto:<jiewangx@ustc.edu.cn>?Subject=Paper RSP in UAI 2023}{Jie Wang\textsuperscript{*}}{}}
\author[1]{Qiyuan Liu}
\author[1]{Yufei Kuang}
\author[1,2]{Wengang Zhou}
\author[1,2]{Houqiang Li}
% Add affiliations after the authors
\affil[1]{%
    CAS Key Laboratory of Technology in GIPAS,
    University of Science and Technology of China
}
\affil[2]{%
    Institute of Artificial Intelligence,
    Hefei Comprehensive National Science Center
}

  \begin{document}
\maketitle
\begin{abstract}
Reinforcement learning algorithms have achieved remarkable success in acquiring behavioral skills directly from pixel inputs. However, their application in real-world scenarios presents challenges due to their sensitivity to visual distractions (e.g., changes in viewpoint and light). A key factor contributing to this challenge is that the learned representations often suffer from overfitting to task-irrelevant information. By comparing several representation learning methods, we find that the key to alleviating overfitting in representation learning is to choose proper prediction targets. Motivated by our comparison, we propose a novel representation learning approach---namely, \textbf{r}eward \textbf{s}equence \textbf{p}rediction (RSP)---that uses reward sequences or their transforms (e.g., discrete-time Fourier transform) as prediction targets. RSP can efficiently learn robust representations as reward sequences rarely contain task-irrelevant information while providing a large number of supervised signals to accelerate representation learning. 
An appealing feature is that RSP makes no assumption about the type of distractions and thus can improve performance even when multiple types of distractions exist. 
We evaluate our approach in Distracting Control Suite. Experiments show that our method achieves state-of-the-art sample efficiency and generalization ability in tasks with distractions. 
\end{abstract}

\section{Introduction}
Recent deep reinforcement learning (RL) algorithms have achieved great success in learning behaviors directly from pixel inputs \cite{controlsuite}. 
 \footnotetext{* Corresponding author}
However, many of these algorithms suffer from obvious performance degradation in tasks with visual distractions \cite{dbc,dcs}, such as variations in background, color, and viewpoint. In the training phase, the distraction significantly reduces the sample efficiency \cite{dcs} and makes the optimization unstable. One major reason is that these algorithms cannot efficiently extract task-relevant information from pixels and thus suffer from large approximation errors. In the evaluation phase, learned policies often generalize poorly to new environments where the distractors are different from those in the training environment \cite{difficult2, dcs, psm, survey}. The poor generalization comes from overfitting to the training data. That is, the learned policies select actions based on non-causal features, but these features may significantly change in test environments \cite{difficult2}.

Recent work shows that representation learning is the key to improving robustness against distractions. Some methods improve representations by data augmentation \cite{secant,seva}. They encourage the consistency of outputs when applying different transformations to the inputs. However, many of these methods assume the type of distraction and then select proper transformations according to this assumption. Moreover, they often require a clean environment (no distractions exist) for stable optimization. However, the clean environment is unavailable in many real-world tasks. Another line of work learns robust representations by auxiliary tasks, which are additional objectives optimized simultaneously with standard RL objectives \cite{spr, slac, deepmdp}. These auxiliary tasks improve robustness by preventing representations from exploiting task-irrelevant information \cite{dbc,psm}. However, many of these methods are sample inefficient. They usually require many samples to learn robust representations and even struggle to learn good  behaviors when multiple distractions appear simultaneously. Therefore, learning robust representations with high sample efficiency remains challenging, especially when different types of distractions exist at the same time \cite{dcs}.


To tackle this problem, we propose \textbf{r}eward \textbf{s}equence \textbf{p}rediction (RSP), a novel approach that efficiently learns robust representations in tasks with distractions. First, we analyze several representation learning methods. Our results show that representation learning should follow the information bottleneck principle \cite{ibb, iba, ibc}. That is, the prediction targets used to learn representations should provide sufficient task-relevant information for sample-efficient training while containing little task-irrelevant information for good generalization. Second, we propose to use reward sequences or their transforms (e.g., discrete-time Fourier transform) as prediction targets. We show that reward sequences and their transforms provide large amounts of information about the long-term future while having a low correlation with distractors. Then, we propose a TD-style algorithm to efficiently predict long reward sequences and their transforms. This algorithm enjoys the convergence property of contraction operations like the traditional TD-learning of Q functions. Finally, we propose a method that learns the transform of reward sequences by maximizing the information in prediction targets. This method can automatically exploit the property of reward sequences in different tasks.

RSP is compatible with most visual RL algorithms. In our experiments, we combine RSP with DrQ and DrQv2 \cite{drqv2}. We evaluate our method in Distracting Control Suite \cite{dcs}. In the multi-distraction setting, RSP achieves up to three times performance improvement in average return and is much more sample-efficient than our baselines. Moreover, in the video-background setting, RSP learns policies that generalize well to unseen distractions. We also provide analyses to show that RSP can learn robust representations that hardly encode task-irrelevant information.

Our contributions consist of four parts. (1) We compare different auxiliary tasks and propose to use reward sequences as prediction targets. We show that using reward sequences as prediction targets can better satisfy the information bottleneck principle (Figure \ref{MI}) than other auxiliary tasks. (2) We propose a novel TD-style learning method for the efficient computation of prediction targets. We prove that this method enjoys the convergence properties of contraction mappings. (3) We propose a method to learn the transform of reward sequences. It avoids the manual selection of transform for different tasks. (4) Our experiments demonstrate that RSP significantly improves the sample efficiency and generalization when distractions exist. We provide extensive analyses to understand the excellent performance of RSP.



\section{Related Work}


\textbf{RL with distractions:} 
Recent work that considers distractions usually focuses on improving generalization \cite{survey, when, why,decouple,pad}. A promising approach to improve generalization is to use regularization, such as $\ell_2$ regularization \cite{noiseIB, l2reg2}, dropout \cite{noiseIB}, batch normalization \cite{noiseIB, l2reg1} and information bottleneck regularization \cite{noiseIB, IB2, IB3, multiview}. Recently, data augmentation has shown excellent potential to improve the generalization of deep RL \cite{soda, seva, rad, l2reg2, randconv, drac, paada, mixstyle, secant}. However, most of these methods assume that the training environment is without distractions, which may be impractical in real-world tasks. Moreover, they often assume the type of distractions to select transforms, which requires prior knowledge about environments. In contrast, some recent methods can directly train policies in environments with distractions. A popular idea is state abstraction \cite{sabs, bis, bis2, abstract1, abstract2}. For example, \citet{dbc} and \citet{psm} propose to group states according to $\pi$-bisimulation metric \cite{bisim} and the policy similarity metric, respectively. Another kind of method improves generalization ability via invariance. For example, \citet{ipo} introduces the ideas of invariant risk minimization \cite{irm} into policy gradient methods. Though these methods achieve promising generalization, some of them hurt the sample efficiency and asymptotic performance \cite{metarl}.
Recent work attempts to achieve high sample efficiency of model-based RL in tasks with distractions \cite{tia, tpc, dreamerpro}. However, they also suffer from low sample efficiency when multiple distractions exist (Section \ref{comp_exp}). Similar to RSP, CRESP \cite{cresp} conducts auxiliary tasks by reward signs. However, there are clear differences between CRESP and RSP. CRESP only considers the generalization ability but RSP can also improve the sample efficiency. CRESP is motivated by the invariance across different environments while RSP is motivated by the information bottleneck principle. CRESP can only predict a few steps of rewards (3--7) while RSP uses long reward sequences or their transforms as prediction targets. Section \ref{gen-exp} shows that RSP achieves better performance than CRESP.

\textbf{Auxiliary Tasks:} 
Previous work uses auxiliary tasks to improve the representations of high-dimensional observations \cite{aux1}. Reconstruction-based auxiliary tasks simultaneously learn an encoder and a decoder by minimizing the reconstruction errors \cite{plannet, dreamer, longterm, sacae, slac}. 
They encourage agents to capture all information, whether it is relevant to the control task or not. Recently, \citet{dbc} point out that task-irrelevant information can hinder the agent from learning robust representations and lead to performance degradation. Contrastive-based auxiliary tasks \cite{curl, mcurl, rcrl} minimize the distance between embeddings of similar observations while maximizing the distance between embeddings of dissimilar observations. Many of these methods require prior knowledge to define the similarity \cite{dbc}. We empirically show that contrastive-based auxiliary tasks also suffer from the distractions of task-irrelevant information (Figure \ref{MI}). Model-based auxiliary tasks \cite{deepmdp,spr,virtual,slac,dreamer,dreaming,tpc,dreamerpro} capture the information about the dynamics in latent spaces. Given current observations, these auxiliary tasks encourage the agent to discriminate the subsequent observations. However, the distractors also provide information to discriminate the subsequent observations, and we observe that model-based auxiliary tasks tend to misuse task-irrelevant information (Figure \ref{MI}). Learning value functions over multiple time horizons is also a popular auxiliary task \cite{discount}, which can improve the sample efficiency in Atari tasks. In this work, we focus on designing auxiliary tasks for deep reinforcement learning with distractions. Therefore, the auxiliary tasks should not only encourage neural networks to extract useful information but also encourage them to ignore task-irrelevant information.

\section{Preliminaries}
\subsection{Notation}
\begin{figure}[!htb]
  \centering
  \includegraphics[width=0.95\linewidth]{img/model.pdf}
  \label{SCM}
\end{figure}
An infinite-horizon Markov decision process (MDP) $\mathcal{M}$ is defined by $(\mathcal{S}, \mathcal{A}, P_s, R_s, \gamma)$, where $\mathcal{S}$ is the state space, $\mathcal{A}$ is the action space, $P_s:\mathcal{S}\times\mathcal{A}\times\mathcal{S}\rightarrow[0,1]$ is the transition probability function, $R_s:\mathcal{S}\times\mathcal{A}\rightarrow[0, 1]$ is the reward function, and $\gamma \in (0,1)$ is the discount factor. We define an MDP with distractions by $(\mathcal{M}, \mathcal{D}, P_d, \mathcal{O}, g)$. Here, $\mathcal{M}$ is the original MDP, $\mathcal{D}$ is 
a set of distractors, $P_d$ is the transition probability function of distractors, $\mathcal{O}$ is a set of observations, $g: \mathcal{S}\times\mathcal{D}\rightarrow\mathcal{O}$ is the observation function. Let $\pi: \mathcal{O} \times \mathcal{A} \rightarrow [0,1]$ denote a policy. We illustrate the process in the above figure. In this decision process, only the rewards $\{\mathbf{r}_i\}_{i=1}^\infty$ and the observations $\{\mathbf{o}_i\}_{i=0}^\infty$ can be observed. The actions $\{\mathbf{a}_i\}_{i=0}^\infty$ are selected under a given policy $\pi$. The low-dimensional states $\{\mathbf{s}_i\}_{i=0}^\infty$ and the distractors $\{\mathbf{d}_i\}_{i=0}^\infty$ cannot be observed. We assume that states in $\mathcal{S}$ are not equivalent to each other and $g(s_1, d_1) \neq g(s_2,d_2)$ if $s_1\neq s_2$. Under the assumption, an MDP with distractions still enjoys the Markov property. Moreover, the assumption guarantees that there exists a function $\phi_o:\mathcal{O}\rightarrow \mathcal{S}$ mapping observations to original states. Therefore, we can let  $P_o:\mathcal{O}\times\mathcal{A}\times\mathcal{O}\rightarrow[0,1]$ denote the transition probability function of observations and define $R_o:\mathcal{O}\times\mathcal{A}\rightarrow[0, 1]$ by $R_o(o,a) = R_s(\phi_o(o), a)$.

\subsection{DrQ and DrQv2}
Both DrQ and DrQv2 are state-of-the-art algorithms in visual control tasks. DrQ \cite{drq} is a combination of data augmentation and soft actor critic \cite{sac}. It uses optimality invariant state transformations $f$---which do not change the state-action value---to augment the training data. DrQ uses the data augmentation to improve the accuracy of target values and reduce the variance of stochastic gradients. Specifically, DrQ updates the Q function, which is a neural network parameterized by $\theta$, by minimizing the mean square error $J_Q$
\begin{align*}
&\mathbf{y}_n = \mathbf{r}_n + \frac{\gamma}{K}\sum_{k=1}^{K} Q_{\bar{\theta}}\left(f\left(\mathbf{o}^\prime_n,\mathbf{v}_k\right),\mathbf{a}^\prime_n\right), \\
&J_Q(\theta)=\frac{1}{NK}\sum_{n=1,k=1}^{N,K}\left( Q_{\theta}\left(f\left(\mathbf{o}_n,\mathbf{v}_k\right),\mathbf{a}_n\right)-\mathbf{y}_n\right)^2.
\end{align*}
Here, $(\mathbf{o}_n, \mathbf{a}_n, \mathbf{r}_n, \mathbf{o}^\prime_n)$ is uniformly sampled from a replay buffer, $\bar{\theta}$ is the parameters of target networks, $\mathbf{a}^\prime_n$ is sampled from the distribution $\pi(\cdot|\mathbf{o}^\prime_n)$, and $\mathbf{v}_k$ is a random parameter of the transform $f$. DrQ learns policies via maximum entropy RL \cite{sac} for efficient exploration and stable optimization. Therefore, DrQ actually uses the sum of $\mathbf{y}_n$ and an entropy term as the target value.

Recently, \citet{drqv2} propose DrQv2, which makes several modifications to DrQ. DrQv2 replaces the maximum entropy term \cite{sac} with a scheduled noise for adjustable exploration and borrows the idea of target policy smoothing from TD3 \cite{td3} to reduce the bias of Q functions. Furthermore, DrQv2 uses multi-step TD to learn value functions 
\begin{align*}
&\mathbf{y}_n = \sum_{t=1}^{T}\gamma^{t-1}\mathbf{r}_{n,t} + \frac{\gamma^T}{K}\sum_{k=1}^{K} Q_{\bar{\theta}}\left(f\left(\mathbf{o}_{n,T},\mathbf{v}_k\right),\mathbf{a}_{n,T}\right),
\end{align*}
where $\mathbf{r}_{n,1:T}$ is the subsequent rewards after $\mathbf{o}_n$, $\mathbf{o}_{n,1:T}$ is the subsequent observations, and $\mathbf{a}_{n,T}$ is sampled according to the policy $\pi(\cdot|\mathbf{o}_{n,T})$ and the scheduled noise.

\subsection{Discrete-Time Fourier Transform}
Discrete-time Fourier transform (DTFT) is used to analyze the frequency properties of a time series. It converts a sequence $\{c_n\}_{-\infty}^{\infty}$ into a complex-valued function $h_c(x)$ by 
$$
h_c(x) = \sum_{n=-\infty}^{\infty}c_n e^{-nxj}.
$$
As DTFT is invertible, the function $h_c(x)$ contains all information about the sequence $\{c_n\}_{-\infty}^{\infty}$ \cite{dtft}. 

% We use the standard definition of the state-action value function $Q_\pi$ and the state value function $V_\pi$. That is,
% \begin{align*}
% &Q_\pi(s, a) = \mathbb{E}_{\pi}\left[\left.\sum_{t=0}^\infty \gamma^i R(\mathbf{s_t},\mathbf{a_t}) \ \right| \ \mathbf{s_0}=s, \mathbf{a_0}=a\right],\\
% &V_\pi(s) = \mathbb{E}_{\pi}\left[\left.Q_\pi(s,\mathbf{a_0})\ \right| \ \mathbf{s_0}=s \right],
% \end{align*}
% where $\mathbb{E}_{\pi}$ means that the trajectory $(\mathbf{s_0,a_0,s_1,a_1\cdots})$ is sampled under the policy $\pi$.

\section{Reward Sequence Prediction}
In this section, we introduce a novel representation learning method---namely, \textbf{r}eward \textbf{s}equence \textbf{p}rediction (RSP)---that learns robust representations in tasks with distractions. First, we compare six methods to study how to select prediction targets for representation learning. Second, we propose RSP that uses reward sequences or their transforms as prediction targets. Then, we propose a TD-style algorithm for the efficient prediction of long reward sequences or their transforms. Finally, we propose a method that automatically learns the transform by maximizing information. In this section, actions are sampled under a fixed policy, and we omit the policy in notations for simplification. We provide proofs of propositions in Appendix 1.


\subsection{Auxiliary Task Design}\label{sec41}
\begin{figure}[!t]
  \centering
    \includegraphics[width=0.95\linewidth]{img/MI.pdf}
    \caption{Comparison between different auxiliary tasks in a visual control task with background distractions. }
    \label{MI}
\end{figure}
This part discusses the relation between the performance and the prediction target used for representation learning. To do this, we analyze how much task-relevant and task-irrelevant information is encoded when using different representation learning methods. We compare six representation learning methods, including VAE \cite{sacae}, which is a reconstruction-based auxiliary task; CURL \cite{curl}, which is based on multi-view contrastive learning; one-step CPC \cite{cpc}, which learns a model in the latent space; one-step reward prediction, which predicts one-step rewards; the combination of CPC and reward prediction; and our method RSP (detailed in the following sections). We compare them in a modified Cartpole Swingup environment, where the background is replaced with random images. We let $\phi_\theta$ denote the encoder learned by auxiliary tasks. We use the InfoNCE objective to estimate the mutual information $I\left(\phi_\theta(\mathbf{o_t});\mathbf{s_t}\right)$, which stands for task-relevant information. We train a network with a cross-entropy loss to predict background images and use the loss to estimate the mutual information $I\left(\phi_\theta(\mathbf{o_t});\mathbf{d_t}\right)$, which stands for task-irrelevant information. 

Figure \ref{MI} shows that all methods except RSP struggle to learn high-return policies. We observe that the performance will be low when either too much task-irrelevant information (VAE, CPC, CURL, and CPC+Reward) or too little task-relevant information (Reward) is encoded in the learned representations. Moreover, we notice that the prediction targets used in auxiliary tasks almost determine how much task-relevant and task-irrelevant information is encoded. For example, the prediction targets of VAE include all task-irrelevant elements, so representations learned by VAE contain the largest amount of task-irrelevant information. The one-step reward signs almost contain no task-irrelevant information, so representations learned by reward prediction contain the smallest amount of task-irrelevant information. The prediction targets of CPC+Reward contain extra reward information compared with CPC, so representations learned by CPC+Reward encode more task-relevant information than those learned by CPC only. 

Our results imply that selecting proper prediction targets for representation learning is the key to improving the robustness against distractions. \textit{The selection of prediction targets should follow the information bottleneck principle}. Specifically, prediction targets should contain as much task-relevant information as possible while being as uncorrelated with distractions as possible.

\subsection{Prediction Targets of RSP}\label{def}
As discussed in Section \ref{sec41}, one-step reward signs rarely contain information about distractions but do not provide sufficient information for representation learning. Therefore, we propose to use reward sequences as prediction targets. That is, given an observation $\mathbf{o}_t$ and an action $\mathbf{a}_t$, we encourage representations to encode information about the reward sequence $\mathbf{r}_{t+1:t+T}$, which is sampled 
under a policy $\pi$. Reward sequences provide more task-relevant information than one-step rewards as the inequality $I(\mathbf{s_t};\mathbf{r}_{t+1})\leq I(\mathbf{s}_t;\mathbf{r}_{t+1:t+T})$ always holds. We argue that reward sequences rarely contain task-irrelevant information similar to one-step rewards.
Proposition \ref{upper} provides an upper bound for the task-irrelevant information in reward sequences. 
\begin{proposition}\label{upper}
Assume that actions are sampled under a fixed policy $\pi$. Let $I(X;Y|Z)$ denote the mutual information between $X$ and $Y$ conditioned on $Z$. Then, we have
$$
I(\mathbf{d}_t;\mathbf{r}_{t+1:t+T} \  |\ \mathbf{s}_t )\leq \sum_{i=0}^{T-1} I(\mathbf{d}_{t+i};\mathbf{a}_{t+i}|\mathbf{s}_{t+i}).
$$
\end{proposition}
The proposition shows that the task-irrelevant information (left-hand side) provided by reward sequences is no larger than that used to select actions (right-hand side). Given different observations $o$ and $o^\prime$ that correspond to the same state $s$, the task-irrelevant information used to select actions is negligible if the two action distributions $\pi(\cdot|o)$ and $\pi(\cdot|o^\prime)$ are similar. Therefore, the left-hand side will be small if we control the right-hand side by regularizing policies. There are optional regularization terms to control the right-hand side. For example, the commonly used maximum entropy regularization \citep{sac} can control the right-hand side by encouraging all action distributions to be close to the same uniform distribution. In our implementation of RSP, we regularize policies by the $\ell_2$ norm of actions with a small coefficient, as discussed in Appendix 3.1.

As rewards in the long-term future rarely depend on the current observation and action, we discount rewards according to the time step, similar to the definition of $Q$ values. That is, we consider the expectation of discounted reward 
\begin{align}\label{discount}
e_n(o,a;\pi) \triangleq \mathbb{E}_{\pi}[\gamma^{n} \mathbf{r}_{n+1}\ |\ \mathbf{o}_0=o,\mathbf{a}_0=a].
\end{align}
% We note that the discount provides convergence guarantees as shown in Sections \ref{alg} and can control the variance of prediction targets. 

Based on the discounted reward, we can define two variants of RSP. The first one directly uses $\left\{e_n(o,a;\pi)\right\}_{n=0}^{L-1}$ as the $L$-dimensional prediction target $Z_{\pi,1}\left(o,a\right)$. That is, 
$$
\left[Z_{\pi,1}(o,a)\right]_i \triangleq e_i(o,a;\pi).
$$ 
The second one considers the DTFT of reward sequences and uses the values at $L$ points as the prediction targets $Z_{\pi,2}(o, a)$. Specifically, it is defined by
$$
\left[Z_{\pi,2}(o,a)\right]_i \triangleq \sum_{n=0}^\infty e_n(o,a;\pi)\exp{\left(-\frac{2ni\pi}{L}j\right)}.
$$ 
The prediction target $Z_{\pi,2}(o, a)$ contains the frequency-domain information about reward sequences. We argue that the frequency-domain information can improve performance in tasks where state/reward sequences are approximately periodic (please see discussion in Appendix 3.5). 
% Thanks to the properties of Fourier expansion, the function $h_u(x)$ includes almost all information about the sequence $\{u_i\}_{i=0}^\infty$. Then, as learn a function is difficult, we encourage the auxiliary task to predict the value of the function $h_u(x)$ at $L$ points. That is, we convert the infinite sequence $\left\{u_n\right\}_{n=0}^\infty$ to an $L$-dimensional vector $z$ by
% $$
% z_i = \sum_{n=0}^\infty u_n\exp{\left(-\frac{2n\pi n\mathbf{j}}{L}\right)}.
% $$
% We note that the transform from an infinite sequence to a finite one follows the idea of DFT. We argue that it can also extra the frequency-domain information of $u_n$ and achieve higher performance than directly predicting finite reward sequence in tasks where state sequences are approximately periodic (please refer to Appendix). Therefore, given $\mathbf{o_t},\mathbf{a_t}$, the prediction target $\mathbf{z_t}$ is the $L$-dimensional vector transformed from the infinite sequence $\left\{\left[u_{\pi}(\mathbf{o_t},\mathbf{a_t})\right]_n\right\}_{n=0}^\infty$ via the aforementioned method. 


\subsection{TD-Style Learning}\label{td-style}
This part proposes a TD-style method to efficiently predict long reward sequences. First, we note that both types of prediction targets $Z_{\pi,i}(o,a)$ in Section \ref{def} can be unified via contraction mappings $\mathcal{T}_{\pi,i}$. Proposition \ref{cont} provides the form of the contraction mapping $\mathcal{T}_{\pi,i}$.
\begin{proposition}\label{cont}
There exist contraction mappings $\mathcal{T}_{\pi,i}$ such that Equation~\eqref{zeq} holds for both $i = 1,2$:
\begin{align} 
Z_{\pi,i}(o,a)&=\left(\mathcal{T}_{\pi,i} Z_{\pi,i}\right)(o,a), \label{zeq}\\
\left(\mathcal{T}_{\pi,i} Z_{\pi,i}\right)(o,a)&=W_{i}R_o(o,a) + \Gamma_{i}\mathbb{E}_{\mathbf{o}^\prime,\mathbf{a}^\prime}\left[Z_{\pi,i}(\mathbf{o}^\prime, \mathbf{a}^\prime)\right],\notag
\end{align}
where $W_i\in \mathbb{R}^{L}$, $\Gamma_i\in\mathbb{R}^{L\times L}$, $\mathbf{o}^\prime$ is sampled with probability $P_o(\mathbf{o}^\prime|o,a)$, $\mathbf{a}^\prime$ is sampled with probability $\pi(\mathbf{a}^\prime|\mathbf{o^\prime})$, and all vectors are column vectors.
\end{proposition}
For both $i=1,2$, we provide complete expressions of $W_i$ and $\Gamma_i$ in Appendix 1.2. In tabular settings, our TD-style learning method computes prediction targets $Z_{\pi,i}(o,a)$ by repeatedly applying the operator $\mathcal{T}_{\pi,i}$. Thanks to the properties of contraction mappings, this method enjoys the exponential convergence rate similar to the TD-learning of Q values.
% The Proposition \ref{cont} holds due to the discount factor $\gamma$ used to define $e_n(o,a;\pi)$ in Equation (\ref{discount}). 

In deep RL, we train a network $Z_\theta$ to approximate $Z_{\pi,i}$. First, we sample a batch of data $\{( \mathbf{o}_n, \mathbf{a}_n, \mathbf{r}_n, \mathbf{o}^\prime_n, \mathbf{a}^\prime_n)\}_{n=1}^N$ with a size of $N$ from the replay buffer. Then, we compute prediction targets by applying the operator $\mathcal{T}_{\pi,i}$ to the current prediction, i.e., $
\mathbf{z}_n=W_{i}\mathbf{r}_n + \Gamma_{i}Z_{\theta}(\mathbf{o}^\prime_n, \mathbf{a}^\prime_n)$.
Finally, we optimize the network $Z_\theta$ by minimizing the mean square error
\begin{align}\label{jrsp}
J_{RSP} =\frac{1}{N}\sum_{n=1}^{N}\left\| Z_{\theta}\left(\mathbf{o}_n,\mathbf{a}_n\right)-\mathbf{z}_n\right\|_2^2.
\end{align}

This TD-style learning procedure can compute prediction targets without sampling long reward sequences from the buffer. Without the TD-style method, using $Z_{\pi,2}$ as prediction targets is impractical as it requires infinite sequences to compute prediction targets. Moreover, similar to the TD-learning of Q values, it significantly reduces the variance of gradients and thus can improve the sample efficiency.


\begin{figure*}[!t]
\centering
\includegraphics[width=0.8\textwidth]{img/structure.pdf}
\caption{Combination of RSP and DrQ/DrQv2. The dashed lines show how gradients flow back to model weights. We prevent the gradient of RL losses from updating the convnet. The action $\mathbf{a^\prime}$ is sampled from a replay buffer. The policy and RSP network share a linear layer. $\phi_\theta^q$ and $\phi_\theta^\pi$ can share or not share parameters. We provide a comparison in Section \ref{exp}.}
\label{str}
\end{figure*}
\subsection{Learning transform}\label{learn-trans}
Motivated by Proposition \ref{cont}, we can define many prediction targets by different $W$ and $\Gamma$. Each pair of $W$ and $\Gamma$ corresponds to a transform of reward sequences. Therefore, we can view learning the parameters $W$ and $\Gamma$ as learning transforms of reward sequences. Proposition \ref{learned} shows how to define transforms of reward sequences by defining contraction mappings.
\begin{proposition}\label{learned}
For any $W\in \mathbb{R}^{L}$, if the infinity-norm of $\Gamma\in\mathbb{R}^{L\times L}$ is less than 1, the operator $\mathcal{T}_{\pi}$ defined by
\begin{align*} 
\left(\mathcal{T}_{\pi} Z_{\pi}\right)(o,a)&=WR_o(o,a) + \Gamma\mathbb{E}_{\mathbf{o}^\prime,\mathbf{a}^\prime}\left[Z_{\pi}(\mathbf{o}^\prime, \mathbf{a}^\prime)\right],\notag
\end{align*}
is a contraction mapping. The prediction target $Z_{\pi}$ defined as the fixed point of $\mathcal{T}_{\pi}$ satisfies the equation
$$
Z_{\pi}(o,a) = \sum_{n=0}^\infty \left(\frac{\Gamma}{\gamma}\right)^nWe_n(o,a;\pi).
$$
\end{proposition}
According to Proposition \ref{learned}, we need to control the infinity-norm of $\Gamma$ to construct a transform of reward sequences. A simple method to achieve this is to express $\Gamma$ by the product of $D \in \mathbb{R}^{L\times L}$ and  $G \in \mathbb{R}^{L\times L}$, where $D$ is a diagonal matrix, $|D_{ii}|<\gamma$ and $\sum_{j=0}^{L-1}|G_{ij}|=1$ for any $0\leq i<L$. 

As the property of reward sequences varies in different tasks (e.g., sparse or dense rewards), we need to select the transforms (i.e., parameters $W$ and $\Gamma$) of reward sequences for different tasks. To automate this process, we propose to learn transforms by maximizing the information in the prediction targets. First, similarly to the TD-learning, we compute the current prediction $Z_\theta(\mathbf{o}, \mathbf{a})$ and the prediction targets $
\mathbf{z}=W\mathbf{r} + \Gamma Z_{\theta}(\mathbf{o}^\prime, \mathbf{a}^\prime)$. Then, we maximize the mutual information $I(Z_\theta(\mathbf{o}, \mathbf{a});\mathbf{z})$ to maximize the predictable information in the target $\mathbf{z}$. That is, we update parameters by the InfoNCE loss \cite{cpc}:
\begin{align}\label{j_trans}
    J_{Trans} =  \frac{1}{N}\sum_{n=1}^{N}\frac{\mathbf{Sim}(Z_{\theta}\left(\mathbf{o}_n,\mathbf{a}_n\right), \mathbf{z}_n)}{\sum_{m\neq n}^{N}\mathbf{Sim}(Z_{\theta}\left(\mathbf{o}_n,\mathbf{a}_n\right), \mathbf{z}_m)}.
\end{align}
Here, $\mathbf{Sim}$ stands for the exponential of cosine similarity.


\begin{algorithm}[!t]
   \caption{RSP}
   \label{rsp_alg}
\begin{algorithmic}
    \STATE Initialize the replay buffer $\mathcal{B}\leftarrow\emptyset$
    \STATE Initialize the parameters $\theta$ of networks
   \FOR{each episode}
   \STATE Sample actions: $ \mathbf{a}_0 \sim \pi_\theta(\cdot| \mathbf{o}_0)$ 
   \FOR{each environment step}
   \STATE Obtain rewards: $\mathbf{r}_{t+1} \leftarrow R_o(\mathbf{o}_t,  \mathbf{a}_t)$
   \STATE Sample observations: $\mathbf{o}_{t+1} \sim P_o(\cdot| \mathbf{o}_t,  \mathbf{a}_t)$
   \STATE Sample actions: $ \mathbf{a}_{t+1} \sim \pi_\theta(\cdot| \mathbf{o}_{t+1})$
   \STATE $\mathcal{B}\leftarrow \mathcal{B}\cup\{( \mathbf{o}_t, \mathbf{a}_t, \mathbf{r}_{t+1}, \mathbf{o}_{t+1}, \mathbf{a}_{t+1})\}$
   \ENDFOR
   \FOR{each training step}
   \STATE Sample a batch of training data
   \STATE Compute RL losses $J_\pi$ and $J_Q$
   \STATE Compute $J_{RSP}$ by Equation (\ref{jrsp})
   \STATE $\theta \leftarrow \theta - \lambda\nabla_\theta (J_\pi+J_Q+J_{RSP})$
   \STATE \textbf{Optional:} Compute $J_{Trans}$ by Equation (\ref{j_trans})
   \STATE \textbf{Optional:} Minimize $J_{Trans}$ by gradient descent
   \ENDFOR
   \ENDFOR
\end{algorithmic}
\end{algorithm}

\begin{figure*}[t]
\begin{center}
\centerline{\includegraphics[width=0.85\textwidth]{img/drq_and_drqv2_rsp.pdf}}
\caption{The performance during training. Here, we draw the line of ``Best of Baselines'' using the baseline that achieves the highest final score. The complete results are shown in Appendix 3.6. We use $Z_{\pi,1}(o,a)$ as prediction targets for environments in the first row and $Z_{\pi,2}(o,a)$ for those in the second row. RSP significantly improves the sample efficiency for both DrQ and DrQv2. Compared with DrQ/DrQv2, RSP(Learned) achieves \textbf{100\% improvement of final score} in all six tasks.}
\label{result1}
\end{center}
\end{figure*}

\section{Algorithm}
This part introduces the overall algorithm (Algorithm \ref{rsp_alg}) that combines our auxiliary tasks with DrQ/DrQv2. 
We let all networks share a convolutional encoder. We stop the gradients from the actor and the critic before they propagate to the shared convolutional layers. We only allow the gradients of $J_{RSP}$ to update the shared encoder. This stop-gradient trick can stabilize the optimization in some tasks. To regularize the representation used by the policy network $\pi_\theta$, we let it share the linear encoder $\phi_\theta^\pi$ with the prediction network. We illustrate the network architectures and the gradient flows in Figure \ref{str}. In this algorithm, we can choose either $Z_{\pi,1}(o,a)$ or $Z_{\pi,2}(o,a)$ as prediction targets $Z_\pi(o,a)$ according to the task. We can also learn the transform by the \textbf{Optional} steps in Algorithm \ref{rsp_alg} as discussed in Section \ref{learn-trans}. When computing prediction targets, we view the policy that collects data as the policy $\pi$ that outputs the next action $\mathbf{a}^\prime$. Therefore, we directly sample the action $\mathbf{a}^\prime$ from the buffer.



\section{Experiments}\label{exp}
This section evaluates RSP on the Distracting Control Suite (DCS). Our experiments have three goals: 1) to test whether RSP can improve the sample efficiency and generalization in tasks with distractions; 2) to analyze the effect of each component in RSP; 3) to visualize the embedding space learned by RSP. All results are reported over five random seeds. We provide details of experiments in Appendix 3. We will release our code at \url{https://github.com/QiZhou1997/MIRL}.

\textbf{Implementation} We combine RSP with two algorithms, DrQ \cite{drq} and DrQv2 \cite{drqv2}. We evaluate two variants of RSP. The first one uses a fixed transform, $Z_{\pi,1}$ or $Z_{\pi,2}$ (selected manually). The second one learns the transform as discussed in Section \ref{learn-trans}. The hyperparameters of RSP can be found in Appendix 3.

\subsection{Multiple-Distraction Setting}\label{comp_exp}
This part evaluates RSP in sample efficiency and asymptotic performance. We compare RSP with four state-of-the-art methods that learn representations with distractions. The first three are based on auxiliary tasks, including DBC \cite{dbc}, which is a contrastive-based auxiliary task, TPC \cite{tpc}, which learns a latent model without reconstruction, and TIA \cite{tia}, which is a reconstruction-based method with a pixel mask mechanism. The last one is SVEA \cite{seva}, which regularizes representations by strong data augmentation. We evaluate all methods in six visual control tasks, where agents face camera distractions, color distractions, and background distractions simultaneously. 
% We use the static setting in DCS. The camera position, the color of the robot, and the background image are randomly sampled before each episode and then kept fixed through the whole episode. We set the difficulty scales for camera and color distractions as $0.1$. The background images are sampled from 4 videos

We plot the results in Figure \ref{result1}. The solid curves correspond to the mean, and the shaded region to the standard deviation. The results show that RSP beats all baselines in all six tasks. The four representation learning methods do not beat DrQ/DrQv2, except that TIA outperforms DrQ and DrQv2 in Walker Walk. The poor performance of our baselines indicates the difficulty of learning robust representations with multiple distractions. In contrast, RSP improves the sample efficiency and asymptotic performance for DrQ and DrQv2 in all tasks, demonstrating that RSP can achieve sample-efficient and robust representation learning. Results also show that learning transforms can improve the sample efficiency of RSP in most tasks.
% The poor performances of DBC, TPC, and TIA indicate that learning bisimulation metrics, latent dynamics or reconstruction remains challenging in multi-distractions settings. 

\begin{table*}[t]
    \centering
    \begin{tabular}{|l|l|l|l|l|l|l|}
    \hline
        ~ & BiC-Catch & C-Swingup & C-Run & F-Spin & R-Easy & W-Walk \\ \hline
        DrQ & $747\pm28$ & $582\pm42$ & $220\pm12$ & $646\pm54$ & $931\pm14$ & $549\pm83$ \\ 
        DBC & $113\pm133$ & $296\pm213$ & $133\pm98$ & $154\pm149$ & $129\pm64$ & $119\pm46$ \\ 
        TPC & $573\pm182$ & $706\pm64$ & $280\pm48$ & $634\pm138$ & $936\pm61$ & $768\pm33$ \\ 
        DrQ+CRESP & $665\pm185$ & $689\pm49$ & ${327\pm54}$ & $778\pm154$ & $667\pm82$ & $794\pm83$ \\ 
        DrQ+PSE & $\mathbf{821\pm17}$ & $\mathbf{749\pm19}$ & $308\pm12$ & $779\pm49$ & ${955\pm10}$ & $789\pm28$ \\ \hline\hline
        DrQ+RSP(Learned) & $730\pm79$ & ${662\pm47}$ & $\mathbf{347\pm55}$ & ${765\pm84}$ & $\mathbf{968\pm9}$ & $\mathbf{829\pm58}$ \\ \hline
        DrQ+RSP(Fixed) & $788\pm78$ & $\mathbf{752\pm16}$ & $\mathbf{338\pm27}$ & $\mathbf{891\pm33}$ & $\mathbf{960\pm15}$ & $\mathbf{820\pm39}$ \\ \hline
    \end{tabular}
    \caption{Performance with unseen distractions at 500K steps. RSP can achieve state-of-the-art generalization. }
\label{gener}
\end{table*}
\subsection{Evaluation in Generalization}\label{gen-exp}
In this part, we consider two additional baselines, PSE \cite{psm} and CRESP \cite{cresp}. They both achieve state-of-the-art generalization ability in tasks with distractions \cite{psm}. We do not evaluate TIA in this part, as it is not concerned with generalization. We use the same settings as PSE and CRESP. For each episode, a video is sampled as background and keeps playing forwards or backwards. We use two videos for training and 30 unseen videos for evaluation. We also use DrQ as the backbone. We provide results in Table \ref{gener}. The results show that DrQ+RSP achieves state-of-the-art generalization. Moreover, we find that using a learned transform does not outperform using a fixed transform in generalization. A potential reason is that maximizing information by the loss (\ref{j_trans}) may cause networks to capture extra task-irrelevant information. 


\subsection{Stochastic Rewards}\label{stco_rew}
Standard DeepMind Control environments usually use deterministic reward functions. Here, we provide results to test whether RSP can improve performance when rewards are stochastic. We consider two kinds of stochasticity, random delay and Gaussian noise. In the random delay setting, agents receive reward signals 0--3 steps after performing actions. In the Gaussian noise setting, we add Gaussian noise with a standard deviation of 0.1 to reward signals. In all environments, multiple distractions exist. The results in Figure \ref{bar_fig} show that RSP can significantly improve the performance of DrQv2 even when rewards are stochastic.
\begin{figure}[!tbh]
\begin{center}
\centerline{\includegraphics[width=0.97\linewidth]{img/random_reward.pdf}}
\caption{Performance in tasks with stochastic rewards.}
\label{bar_fig}
\end{center}
\end{figure}


\subsection{Ablation Study}\label{abl_sec}
This part provides ablation studies in multi-distraction settings. We use fixed transforms if not otherwise stated.

\begin{figure} %this figure will be at the right
\centering
\includegraphics[width=0.97\linewidth]{img/abalation_len.pdf}
\caption{Training curves with different $L$. Here, $L$ stands for the number of dimensions of predicted targets.}
\label{abl_len}
\end{figure}

\begin{figure}[!t]
\begin{center}
\centerline{\includegraphics[width=0.97\linewidth]{img/abalation_trick.pdf}}
\caption{Comparison between different implementations of RSP. Here, ``not share'' means that the policy network and the value network do not share the linear encoder layer. ``not detach'' means that we do not prevent the gradients of $J_Q$ from updating the encoder.}
\label{abl_trick}
\end{center}
\end{figure}


\textbf{Hyperparameter $L$: } The hyperparameter $L$ controls the dimension of the prediction targets $Z_\pi(o,a)$. Figure \ref{abl_len} shows that RSP performs better as $L$ becomes larger no matter whether $Z_{\pi,1}$ or $Z_{\pi,2}$ is used. The reason is that high-dimensional prediction targets provide more information than low-dimensional counterparts. As a large $L$ does not incur obvious cost, we suggest setting $L=1024$.

\textbf{Implementation: } 
We prevent the gradient of RL losses from updating the encoder and let all networks share one linear encoder layer. Figure \ref{abl_trick} shows that the two tricks are important in multi-distraction settings. A possible reason is that training more encoder layers by only the gradient of $J_{RSP}$ brings more stable representation learning.

\textbf{Learning transforms: } 
We provide an ablation study for the transform learning method. Results in Figure \ref{abl_learn} show that the performance degrades significantly without controlling the infinity-norm of $\Gamma$ (the ``no control'' line). Results also show that updating the parameters $W$ and $\Gamma$ outperforms using the randomly initialized ones (the ``random initialized'' line), demonstrating the effectiveness of learning transforms by maximizing information. 


\begin{figure}[!t]
\begin{center}
\centerline{\includegraphics[width=0.97\linewidth]{img/abalation_learn.pdf}}
\caption{Ablation study for learning transforms.}
\label{abl_learn}
\end{center}
\end{figure}

\begin{figure}[t]
\centering
\subfigure[RSP]{\includegraphics[width=0.505\linewidth]{img/rsp-974.pdf}}
\subfigure[CPC+Reward]{\includegraphics[width=0.485\linewidth]{img/cpc+reward1308.pdf}}
\caption{t-SNE of embedding spaces after 200K-step training. The color represents the predicted state values. Different markers represent different background images. Neighboring points in the embedding space learned by RSP have similar state values, whereas no such structure is seen in the embedding learned by CPC+Reward. }\label{tsne}
\end{figure}
\subsection{Visualization}
This part visualizes the embeddings with t-SNE. We use the data that is used in Section \ref{sec41}. Figure \ref{tsne} shows that RSP maps observations with similar values to neighboring regions while CPC+Reward does not. This means that RSP can better extract task-relevant information from raw observations compared with CPC+Reward. Moreover, CPC+Reward tends to map observations with different background images to different regions while RSP does not. This means that representations learned by CPC+Reward encode more task-irrelevant information than those learned by RSP.

\section{Conclusion}
Learning behaviors with distractions has been a longstanding challenge. To address this challenge, we introduce RSP, a novel method that learns robust representations by predicting reward sequences. We compare different methods by estimating the information in representations. Compared with prior methods, representations learned by RSP encode more task-relevant information while containing less task-irrelevant information. We evaluate RSP in both multi-distraction and video-background settings. Experiments demonstrate that RSP can achieve state-of-the-art sample efficiency and generalization. A promising direction for future research is combining state abstraction with RSP to filter task-irrelevant information further.

\begin{acknowledgements} 
We would like to thank all the anonymous reviewers for their insightful comments. This work was supported in part by National Science Foundations of China grants U19B2026, U19B2044, 61836006, 61836011, and 62021001, and the Scientific and Technological Innovation 2030—“New
Generation Artificial Intelligence” Major Project 2022ZD0119801.
\end{acknowledgements}

% References
\bibliography{zhou_42}
\end{document}
