% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:

\usepackage{hyperref}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage{multirow}
\usepackage{algorithm}  
\usepackage{algorithmic} 
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
% \usepackage{xr} 
% \externaldocument{uai2023-template}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Learning Robust Representation for Reinforcement Learning \\ with Distractions by Reward Sequence Prediction\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors

\author[1]{Qi Zhou}
\author[1,2]{Jie Wang}
\author[1]{Qiyuan Liu}
\author[1]{Yufei Kuang}
\author[1,2]{Wengang Zhou}
\author[1,2]{Houqiang Li}
% Add affiliations after the authors
\affil[1]{%
    CAS Key Laboratory of Technology in GIPAS,
    University of Science and Technology of China
}
\affil[2]{%
    Institute of Artificial Intelligence,
    Hefei Comprehensive National Science Center
}
  
  \begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle

\section{Propositions}\label{proof}
\subsection{An Upper Bound for Task Irrelevant Information}
\begin{proposition}
Assume that all trajectories are sampled under a fixed stationary policy. Then, we have
$$
I(\mathbf{d}_t;\mathbf{r}_{t+1:t+N} \  |\ \mathbf{s}_t )\leq \sum_{i=0}^{N-1} I(\mathbf{d}_{t+i};\mathbf{a}_{t+i}|\mathbf{s}_{t+i}).
$$
\end{proposition}
\begin{proof}
As $\mathbf{d}_t$ is independent of $\mathbf{r}_{t+1:t+N}$ conditioned on $(\mathbf{s}_t,\mathbf{a}_{t:t+N-1})$, we have
\begin{align}
I(\mathbf{d}_t;\mathbf{r}_{t+1:t+N}|\mathbf{s}_t) &\leq I(\mathbf{d}_t;\mathbf{a}_{t:t+N-1},\mathbf{r}_{t+1:t+N}|\mathbf{s}_t)\notag\\
&= I(\mathbf{d}_t;\mathbf{a}_{t:t+N-1}|\mathbf{s}_t)+I(\mathbf{d}_t;\mathbf{r}_{t+1:t+N}|\mathbf{s}_t,\mathbf{a}_{t:t+N-1})\notag\\
&= I(\mathbf{d}_t;\mathbf{a}_{t:t+N-1}|\mathbf{s}_t)\notag\\
&= I(\mathbf{d}_t;\mathbf{a}_{t}|\mathbf{s}_t)+\sum_{i=1}^{N-1}I(\mathbf{d}_t;\mathbf{a}_{t+i}|\mathbf{s}_t,\mathbf{a}_{t:t+i-1}).\label{mi1}
\end{align}
Then, we have
\begin{align}
I(\mathbf{d}_t;\mathbf{a}_{t+i}|\mathbf{s}_t,\mathbf{a}_{t:t+i-1}) &\leq I(\mathbf{d}_t,\mathbf{d}_{t+i};\mathbf{a}_{t+i}|\mathbf{s}_t,\mathbf{a}_{t:t+i-1}) \notag\\
&= I(\mathbf{d}_{t+i};\mathbf{a}_{t+i}|\mathbf{s}_t,\mathbf{a}_{t:t+i-1}) + I(\mathbf{d}_{t};\mathbf{a}_{t+i}|\mathbf{s}_t,\mathbf{a}_{t:t+i-1},\mathbf{d}_{t+i}) \notag\\
&= I(\mathbf{d}_{t+i};\mathbf{a}_{t+i}|\mathbf{s}_t,\mathbf{a}_{t:t+i-1}) \label{mi2}\\
&\leq I(\mathbf{d}_{t+i};\mathbf{a}_{t+i},\mathbf{s}_{t+i}|\mathbf{s}_t,\mathbf{a}_{t:t+i-1}) \notag\\
&= I(\mathbf{d}_{t+i};\mathbf{s}_{t+i}|\mathbf{s}_t,\mathbf{a}_{t:t+i-1}) + I(\mathbf{d}_{t+i};\mathbf{a}_{t+i}|\mathbf{s}_t,\mathbf{a}_{t:t+i-1},\mathbf{s}_{t+i}) \notag\\
&=I(\mathbf{d}_{t+i};\mathbf{a}_{t+i}|\mathbf{s}_t,\mathbf{a}_{t:t+i-1},\mathbf{s}_{t+i}) \label{mi3}\\
&= I(\mathbf{s}_t,\mathbf{a}_{t:t+i-1},\mathbf{d}_{t+i};\mathbf{a}_{t+i}|\mathbf{s}_{t+i})-I(\mathbf{s}_t,\mathbf{a}_{t:t+i-1};\mathbf{a}_{t+i}|\mathbf{s}_{t+i}) \notag\\
&\leq  I(\mathbf{s}_t,\mathbf{a}_{t:t+i-1},\mathbf{d}_{t+i};\mathbf{a}_{t+i}|\mathbf{s}_{t+i}) \notag\\
&=  I(\mathbf{d}_{t+i};\mathbf{a}_{t+i}|\mathbf{s}_{t+i}) + I(\mathbf{s}_t,\mathbf{a}_{t:t+i-1};\mathbf{a}_{t+i}|\mathbf{s}_{t+i},\mathbf{d}_{t+i}) \notag\\
&=  I(\mathbf{d}_{t+i};\mathbf{a}_{t+i}|\mathbf{s}_{t+i}). \label{mi4}
\end{align}
\end{proof}
Equation (\ref{mi2}) holds as $\mathbf{a}_{t+i}$ is independent of $\mathbf{d}_{t}$ conditioned on $\mathbf{s}_t,\mathbf{a}_{t:t+i-1},\mathbf{d}_{t+i}$. Equation (\ref{mi3}) holds because $\mathbf{s}_{t+i}$ is independent of $\mathbf{d}_{t+i}$ conditioned on $\mathbf{s}_t,\mathbf{a}_{t:t+i-1}$. By combining (\ref{mi1}) and (\ref{mi4}), we have
\begin{align*}
I(\mathbf{d}_t;\mathbf{r}_{t+1:t+N} \  |\ \mathbf{s}_t )\leq \sum_{i=0}^{N-1} I(\mathbf{d}_{t+i};\mathbf{a}_{t+i}|\mathbf{s}_{t+i}).
\end{align*}

\subsection{Contraction Mappings}\label{form}
\begin{proposition}
There exists a contraction mapping $\mathcal{T}_{\pi,i}$ such that the following equations hold for $i = 1,2$
\begin{align} 
Z_{\pi,i}(o,a)&=\left(\mathcal{T}_{\pi,i} Z_{\pi,i}\right)(o,a),\label{new_zeq}\\
\left(\mathcal{T}_{\pi,i} Z_{\pi,i}\right)(o,a)&=W_{i}R_o(o,a) + \Gamma_{i}\mathbb{E}_{\mathbf{o}^\prime,\mathbf{a}^\prime}\left[Z_{\pi,i}(\mathbf{o}^\prime, \mathbf{a}^\prime)\right],\notag
\end{align}
where $W_i\in \mathbb{R}^{L}$, $\Gamma_i\in\mathbb{R}^{L\times L}$, $\mathbf{o}^\prime$ is sampled with probability $P_o(\mathbf{o}^\prime|o,a)$, $\mathbf{a}^\prime$ is sampled with probability $\pi(\mathbf{a}^\prime|\mathbf{o^\prime})$, and all vectors are column vectors.
\end{proposition}
\begin{proof}
First, we provide the forms of $W_i$ and $\Gamma_i$ for $i=1,2$. Then, we prove $\mathcal{T}_{\pi,i}$ is a contraction mapping. Given two functions $Z_1$ and $Z_2$, we define the distance by
$$
\mathbf{Dist}\left(Z_1, Z_2\right) = \max_{o,a}\max_{0\leq i \leq L-1} {\bigg|}\left[Z_1(o,a)\right]_i-\left[Z_2(o,a)\right]_i{\bigg|}.
$$

(1) For directly predicting reward sequences, we have
\begin{align*}
&W_1 = 
\begin{pmatrix}
1& 0& 0&\cdots &0 &0
\end{pmatrix}^T,
&\Gamma_1 = 
\begin{pmatrix}
0& 0& \cdots &0 &0 &0\\
\gamma& 0&  \cdots &0 &0 &0\\
0& \gamma& \cdots &0 &0 &0\\
\vdots& \vdots& \ddots& \vdots& \vdots &\vdots \\
0& 0& \cdots &\gamma &0 &0\\
0& 0& \cdots &0 &\gamma &0
\end{pmatrix}.
\end{align*}
Then, we have
\begin{align*}
&{\bigg|}\left[\left(\mathcal{T}_{\pi,1}Z_1\right)(o,a)\right]_n-\left[\left(\mathcal{T}_{\pi,1}Z_2\right)(o,a)\right]_n{\bigg|}\\
\leq &{\bigg|}\left[W_{1}R_o(o,a)\right]_n - \left[W_{1}R_o(o,a)\right]_n{\bigg|} + 
{\bigg|}\gamma\mathbb{E}_{\mathbf{o}^\prime,\mathbf{a}^\prime}\left[\left[Z_1(\mathbf{o}^\prime, \mathbf{a}^\prime)\right]_{n+1}-\left[Z_2(\mathbf{o}^\prime, \mathbf{a}^\prime)\right]_{n+1}\right]{\bigg|}\\
= &{\bigg|}\gamma\mathbb{E}_{\mathbf{o}^\prime,\mathbf{a}^\prime}\left[\left[Z_1(\mathbf{o}^\prime, \mathbf{a}^\prime)\right]_{n+1}-\left[Z_2(\mathbf{o}^\prime, \mathbf{a}^\prime)\right]_{n+1}\right]{\bigg|}\\
\leq &\gamma\mathbb{E}_{\mathbf{o}^\prime,\mathbf{a}^\prime}\left[{\bigg|}\left[Z_1(\mathbf{o}^\prime, \mathbf{a}^\prime)\right]_{n+1}-\left[Z_2(\mathbf{o}^\prime, \mathbf{a}^\prime)\right]_{n+1}{\bigg|}\right]\\
\leq &\gamma\mathbb{E}_{\mathbf{o}^\prime,\mathbf{a}^\prime}\left[\mathbf{Dist}\left(Z_1,Z_2\right)\right]\\
\leq &\gamma \mathbf{Dist}\left(Z_1,Z_2\right).
\end{align*}
Therefore, we have 
$$
\mathbf{Dist}\left(\mathcal{T}_{\pi,1}Z_1, \mathcal{T}_{\pi,1}Z_2\right) \leq \gamma \mathbf{Dist}\left(Z_1, Z_2\right),
$$
which implies that $\mathcal{T}_{\pi,1}$ is a contraction mapping.

(2) For predicting the DTFT of reward sequences, we have
\begin{align*}
W_2 = 
\begin{pmatrix}
1& 1& 1&\cdots &1 &1
\end{pmatrix}^T, 
\end{align*}
\begin{align*}
\Gamma_2 = 
\begin{pmatrix}
\gamma& 0& \cdots &0 &0 &0\\\\
0& \gamma\exp{\left(-\frac{2\pi}{L}j\right)}& \cdots &0 &0 &0\\\\
\vdots& \vdots &\ddots &\vdots &\vdots &\vdots\\\\
0& 0& \cdots &\gamma\exp{\left(-\frac{2(L-3)\pi}{L}j\right)} &0 &0\\\\
0& 0& \cdots &0 &\gamma\exp{\left(-\frac{2(L-2)\pi}{L}j\right)} &0\\\\
0& 0& \cdots &0 &0 &\gamma\exp{\left(-\frac{2(L-1)\pi}{L}j\right)}
\end{pmatrix}.
\end{align*}

Then, we have
\begin{align*}
&{\bigg|}\left[\left(\mathcal{T}_{\pi,2}Z_1\right)(o,a)\right]_n-\left[\left(\mathcal{T}_{\pi,2}Z_2\right)(o,a)\right]_n{\bigg|}\\
\leq &{\bigg|}\left[W_{2}R_o(o,a)\right]_n - \left[W_{2}R_o(o,a)\right]_n{\bigg|} + 
{\bigg|}\gamma\exp\left(-\frac{2n\pi}{L}j\right)\mathbb{E}_{\mathbf{o}^\prime,\mathbf{a}^\prime}\left[\left[Z_1(\mathbf{o}^\prime, \mathbf{a}^\prime)\right]_{n}-\left[Z_2(\mathbf{o}^\prime, \mathbf{a}^\prime)\right]_{n}\right]{\bigg|}\\
= &{\bigg|}\gamma\exp\left(-\frac{2n\pi}{L}j\right)\mathbb{E}_{\mathbf{o}^\prime,\mathbf{a}^\prime}\left[\left[Z_1(\mathbf{o}^\prime, \mathbf{a}^\prime)\right]_{n}-\left[Z_2(\mathbf{o}^\prime, \mathbf{a}^\prime)\right]_{n}\right]{\bigg|}\\
= &{\bigg|}\gamma\exp\left(-\frac{2n\pi}{L}j\right){\bigg|} \cdot {\bigg|}\mathbb{E}_{\mathbf{o}^\prime,\mathbf{a}^\prime}\left[\left[Z_1(\mathbf{o}^\prime, \mathbf{a}^\prime)\right]_{n}-\left[Z_2(\mathbf{o}^\prime, \mathbf{a}^\prime)\right]_{n}\right]{\bigg|}\\
= &{\bigg|}\gamma\mathbb{E}_{\mathbf{o}^\prime,\mathbf{a}^\prime}\left[\left[Z_1(\mathbf{o}^\prime, \mathbf{a}^\prime)\right]_{n}-\left[Z_2(\mathbf{o}^\prime, \mathbf{a}^\prime)\right]_{n}\right]{\bigg|}\\
\leq &\gamma\mathbb{E}_{\mathbf{o}^\prime,\mathbf{a}^\prime}\left[{\bigg|}\left[Z_1(\mathbf{o}^\prime, \mathbf{a}^\prime)\right]_{n}-\left[Z_2(\mathbf{o}^\prime, \mathbf{a}^\prime)\right]_{n}{\bigg|}\right]\\
\leq &\gamma\mathbb{E}_{\mathbf{o}^\prime,\mathbf{a}^\prime}\left[\mathbf{Dist}\left(Z_1,Z_2\right)\right]\\
\leq &\gamma \mathbf{Dist}\left(Z_1,Z_2\right).
\end{align*}
Therefore, we have 
$$
\mathbf{Dist}\left(\mathcal{T}_{\pi,2}Z_1, \mathcal{T}_{\pi,2}Z_2\right) \leq \gamma \mathbf{Dist}\left(Z_1, Z_2\right),
$$
which implies that $\mathcal{T}_{\pi,2}$ is a contraction mapping.
We note that when using $Z_{\pi,2}$ as the prediction target, the prediction network $Z_\theta$ needs to output a $2L$-dimensional vector ($L$ dimensions for the real part and $L$ dimensions for the imaginary part). In practice, we use $W_2$ and $\Gamma_2$ in the following real-valued form
$$
W_2 = 
\begin{pmatrix}
1& 0& 1& 0&\cdots &1 &0
\end{pmatrix}^T,
$$
$$
\Gamma_2 = \gamma
\begin{pmatrix}
\cos(\frac{0}{L}\pi)& \sin(\frac{0}{L}\pi)& 0& 0& \cdots &0 &0\\\\
-\sin(\frac{0}{L}\pi)& \cos(\frac{0}{L}\pi)& 0& 0& \cdots &0 &0\\\\
0& 0& \cos(\frac{2}{L}\pi)& \sin(\frac{2}{L}\pi)& \cdots &0 &0\\\\
0& 0& -\sin(\frac{2}{L}\pi)& \cos(\frac{2}{L}\pi)& \cdots &0 &0\\\\
\vdots& \vdots& \vdots& \vdots& \ddots& \vdots& \vdots\\
0& 0& 0& 0& \cdots &\cos(\frac{2L-2}{L}\pi)& \sin(\frac{2L-2}{L}\pi)\\\\
0& 0& 0& 0& \cdots &-\sin(\frac{2L-2}{L}\pi)& \cos(\frac{2L-2}{L}\pi)
\end{pmatrix}.
$$

\end{proof}
\subsection{Learning transforms}
\begin{proposition}\label{learned}
For any $W\in \mathbb{R}^{L}$, if the infinity-norm of $\Gamma\in\mathbb{R}^{L\times L}$ is less than 1, the operator $\mathcal{T}_{\pi}$ defined by
\begin{align*} 
\left(\mathcal{T}_{\pi} Z_{\pi}\right)(o,a)&=WR_o(o,a) + \Gamma\mathbb{E}_{\mathbf{o}^\prime,\mathbf{a}^\prime}\left[Z_{\pi}(\mathbf{o}^\prime, \mathbf{a}^\prime)\right],\notag
\end{align*}
is a contraction mapping. The prediction target $Z_{\pi}$ defined as the fixed point of $\mathcal{T}_{\pi}$ satisfies the equation
$$
Z_{\pi}(o,a) = \sum_{n=0}^\infty \left(\frac{\Gamma}{\gamma}\right)^nWe_n(o,a;\pi).
$$
\end{proposition}
\begin{proof}
Given two functions $Z_1$ and $Z_2$, we define the distance by
$$
\mathbf{Dist}\left(Z_1, Z_2\right) = \max_{o,a}\max_{0\leq i \leq L-1} {\bigg|}\left[Z_1(o,a)\right]_i-\left[Z_2(o,a)\right]_i{\bigg|}.
$$
Then, we have
\begin{align*}
&{\bigg\|} \left(\mathcal{T}_{\pi}Z_1\right)(o,a)-\left(\mathcal{T}_{\pi}Z_2\right)(o,a){\bigg\|}_\infty\\
\leq &{\bigg\|}WR_o(o,a) - WR_o(o,a){\bigg\|}_\infty + 
{\bigg\|}\Gamma\mathbb{E}_{\mathbf{o}^\prime,\mathbf{a}^\prime}\left[Z_1(\mathbf{o}^\prime, \mathbf{a}^\prime)-Z_2(\mathbf{o}^\prime, \mathbf{a}^\prime)\right]{\bigg\|}_\infty\\
= &{\bigg\|}\Gamma\mathbb{E}_{\mathbf{o}^\prime,\mathbf{a}^\prime}\left[Z_1(\mathbf{o}^\prime, \mathbf{a}^\prime)-Z_2(\mathbf{o}^\prime, \mathbf{a}^\prime)\right]{\bigg\|}_\infty\\
\leq &{\|}\Gamma{\|}_\infty\cdot{\bigg\|}\mathbb{E}_{\mathbf{o}^\prime,\mathbf{a}^\prime}\left[Z_1(\mathbf{o}^\prime, \mathbf{a}^\prime)-Z_2(\mathbf{o}^\prime, \mathbf{a}^\prime)\right]{\bigg\|}_\infty\\
\leq &
{\|}\Gamma{\|}_\infty \cdot \mathbf{Dist}\left(Z_1,Z_2\right).
\end{align*}
Therefore, we have 
$$
\mathbf{Dist}\left(\mathcal{T}_{\pi}Z_1, \mathcal{T}_{\pi}Z_2\right) \leq {\|}\Gamma{\|}_\infty \mathbf{Dist}\left(Z_1, Z_2\right).
$$
Because ${\|}\Gamma{\|}_\infty$ is less than 1, we have that $\mathcal{T}_{\pi}$ is a contraction mapping.

Recall that the operator $\mathcal{T}_{\pi}$ is defined by
$$
\left(\mathcal{T}_{\pi} Z_{\pi}\right)(o,a)=WR_o(o,a) + \Gamma\mathbb{E}_{\mathbf{o}^\prime,\mathbf{a}^\prime}\left[Z_{\pi}(\mathbf{o}^\prime, \mathbf{a}^\prime)\right].
$$

Then, by the definition of the fixed point, we have
\begin{align*}
Z_{\pi}(o,a)&=WR_o(o,a) + \Gamma\mathbb{E}_{\mathbf{o}^\prime,\mathbf{a}^\prime}\left[Z_{\pi}(\mathbf{o}^\prime, \mathbf{a}^\prime)\right]\\
&=WR_o(o,a) + \Gamma\mathbb{E}_{\pi}\left[Z_{\pi}(\mathbf{o_1}, \mathbf{a_1})\ |\ \mathbf{o_0}=o, \mathbf{a_0}=a\right]\\
&=WR_o(o,a) + \Gamma\mathbb{E}_{\pi}\left[WR_o(\mathbf{o_1}, \mathbf{a_1}) + \Gamma Z_{\pi}(\mathbf{o_2}, \mathbf{a_2})\ |\ \mathbf{o_0}=o, \mathbf{a_0}=a\right]\\
&=WR_o(o,a) + \Gamma W\mathbb{E}_{\pi}\left[R_o(\mathbf{o_1},\mathbf{a_1})\ |\ \mathbf{o_0}=o, \mathbf{a_0}=a\right] + \Gamma^2\mathbb{E}_{\pi}\left[ Z_{\pi}(\mathbf{o_2}, \mathbf{a_2})\ |\ \mathbf{o_0}=o, \mathbf{a_0}=a\right]\\
&=\cdots\\
&=\sum_{n=0}^\infty \Gamma^nW \mathbb{E}_{\pi}\left[R_o(\mathbf{o_n},\mathbf{a_n})\ |\ \mathbf{o_0}=o, \mathbf{a_0}=a\right]\\
&=\sum_{n=0}^\infty \left(\frac{\Gamma}{\gamma}\right)^nW e_n(o,a;\pi).
\end{align*}
We note that the infinite sum on the right-hand side converges. The reason is that the tail $\sum_{n=N}^\infty \left(\frac{\Gamma}{\gamma}\right)^nW e_n(o,a;\pi)$ converges to the zero vector as $N\to\infty$, since
$$
{\bigg\|}\sum_{n=N}^\infty \left(\frac{\Gamma}{\gamma}\right)^nW e_n(o,a;\pi){\bigg\|}_\infty 
\leq R_{\max}\sum_{n=N}^\infty{\bigg\|} \Gamma^nW {\bigg\|}_\infty
\leq R_{\max}{\bigg\|} W {\bigg\|}_\infty\frac{{\bigg\|} \Gamma {\bigg\|}_\infty^N}{1-{\bigg\|} \Gamma {\bigg\|}_\infty}.
$$
\end{proof}



% \newpage
% \section{Other Prediction Targets of RSP}\label{other}
% Note that, whenever $\|W_a\|_1 < 1$, its corresponding mapping $\mathcal{T}_{\pi,a}$ is a contraction mapping. Therefore, we can define two more prediction targets via Equation \ref{zeq}. The first one corresponds to the \textbf{multiple Q} variant in Section \ref{abl_sec}. Its prediction target $Z_{\pi,3}(o,a)$ is defined as
% $$
% \left[Z_{\pi,3}(o,a)\right]_i = \sum_{n=0}^\infty \gamma^i\left[U_{\pi}(o,a)\right]_n.
% $$
% Then, the corresponding $W_3$ and $\Gamma_3$ have the following forms
% \begin{align*}
% &W_3 = 
% \begin{pmatrix}
% 1& 1& 1&\cdots &1 &1
% \end{pmatrix},
% &\Gamma_3 = 
% \begin{pmatrix}
% \gamma& 0& 0& \cdots &0 &0\\
% 0& \gamma^2& 0&\cdots &0 &0\\
% 0& 0& \gamma^3&\cdots &0 &0\\
% \vdots& \vdots& \vdots& \ddots& \vdots& \vdots\\
% 0& 0& 0&\cdots &\gamma^{L-1} &0\\
% 0& 0& 0&\cdots &0 &\gamma^{L}
% \end{pmatrix}.
% \end{align*}

% The second one corresponds to the \textbf{random} variant in Section \ref{abl_sec}. Its prediction target $Z_{\pi_4}$ is recurrently defined by
% \begin{align*} 
% Z_{\pi,4}(o,a)&=\left(\mathcal{T}_{\pi,4} Z_{\pi,4}\right)(o,a),\\
% \left(\mathcal{T}_{\pi,4} Z_{\pi,4}\right)(o,a)&=W_{4}R_o(o,a) + \Gamma_{4}\mathbb{E}_{\mathbf{o}^\prime,\mathbf{a}^\prime}\left[Z_{\pi,4}(\mathbf{o}^\prime, \mathbf{a}^\prime)\right],\notag
% \end{align*}
% We sample $W_4$ and $\Gamma_4$ before training and then keep them fixed. Specifically, we sample $W_4$ and $\Gamma_4$ via the following method (Algorithm \ref{sample_w}) to ensure that $\mathcal{T}_{\pi,4}$ is a contraction mapping.

% {\centering
% \begin{minipage}{0.5\textwidth}
% \begin{algorithm}[H]
%    \caption{Sample $W_4$ and $\Gamma_4$}
%    \label{sample}
% \begin{algorithmic}
%     \STATE $W4 \leftarrow$ torch.rand(L,1)\\
%     \STATE $\Gamma_4 \leftarrow$ torch.rand(L,L)\\
%     \STATE $\Gamma_4 \leftarrow$ torch.softmax($\Gamma4$, dim=-1)\\
%     \STATE $\Gamma_4 \leftarrow \gamma \Gamma_4$
% \end{algorithmic}
% \label{sample_w}
% \end{algorithm}
% \end{minipage}
% }

\section{Details for Experiments in Section 4.1}
In this section, we provide additional information about the experiments in Section 4.1.
\subsection{Experimental Setting}
We evaluate all auxiliary tasks in a modified Cartpole Swingup environment. In each episode, the background images are sampled from two videos and then kept fixed through the whole episode. We label an observation according to the video that its background image is sampled from.  We use the InfoNCE objective to estimate the mutual information $I\left(\phi_\theta(\mathbf{o_t});\mathbf{s_t}\right)$. 
That is,
$$
\mathcal{J}_{NCE} = -\mathbb{E}\left[\log\left(
\frac{\|f_w(\phi_\theta(\mathbf{o}_i))-f^\prime_w(\mathbf{s}_i)\|_2^2}{\sum_{k=1}^{N}\|f_w(\phi_\theta(\mathbf{o}_k))-f^\prime_w(\mathbf{s}_k)\|_2^2}
\right)\right],
$$
where $f_w$ and $f_w^\prime$ are two networks, $\left((\mathbf{o}_1, \mathbf{s}_1),\cdots,(\mathbf{o}_N, \mathbf{s}_N)\right)$ is a batch of samples. We train these two networks via maximizing $\mathcal{J}_{NCE}$. We use the final loss as an estimate for $I\left(\phi_\theta(\mathbf{o_t});\mathbf{s_t}\right)$. We train a network with a cross-entropy loss to predict background images and use the loss to estimate the mutual information $I\left(\phi_\theta(\mathbf{o_t});\mathbf{d_t}\right)$. That is, 
$$
\mathcal{J}_{CE} =  -\mathbb{E}\left[\frac{1}{N}\sum_{i=1}^{N}\log q_w(\mathbf{d}_i|\phi_\theta(\mathbf{o}_i))\right],
$$
where $q_w$ is the classification network, and $\mathbf{d}_i$ is the label of the background image. We train the network $q_w$ by minimizing $\mathcal{J}_{CE}$. We use $\log 2 - \mathcal{J}_{CE}$ as an estimate of $I\left(\phi_\theta(\mathbf{o_t});\mathbf{d_t}\right)$. For all methods, we optimize the policy network and value network via 200K-step online training and then estimate the mutual information using the saved data. Note that all auxiliary tasks are combined with DrQ.

\subsection{Additional Results}
We show the performance during training in Figure \ref{anaper}. Results show RSP significantly outperforms other auxiliary tasks. In our experiments, the VAE-based auxiliary task tends to minimize reconstruction losses by reconstructing the background images as shown in Figure \ref{vae}. 
\begin{figure}[!ht]
\begin{center}
\centerline{\includegraphics[width=0.6\columnwidth]{img/analyze_performance.pdf}}
\caption{The performance of different auxiliary tasks during training in the Cartpole Swingup task.}
\label{anaper}
\end{center}
\end{figure}
\begin{figure}[!ht]
\begin{center}
\centerline{\includegraphics[width=0.95\columnwidth]{img/vae.pdf}}
\caption{Ground-truth and reconstructed images. Results show that representations learned by VAE mainly encode information about the background images, which is irrelevant to the control task.}
\label{vae}
\end{center}

\end{figure}

\section{Experiments}\label{exp_more}
\subsection{Multi-Distraction Setting}\label{exp_detail}
Figure \ref{dcs} shows snapshots of all six environments. In these tasks, robots face multiple distractions at the same time. We implement DrQ and DrQv2 using PyTorch. We run all experiments on one GPU, a GeForce 2080Ti. Our implementation of DrQ is slightly different from the official implementation of DrQ. (1) Similar to DrQv2, our implementation does not use a target encoder. (2) We use a smaller batch size of 256 instead of 512 for a fair comparison. (3) We use a smaller learning rate of 5e-4 instead of 1e-3. (4) We use a larger replay buffer whose size is 500K instead of 100K. The first two modifications are to improve the computational efficiency, while the last two are to unify the hyperparameters used in DrQ and DrQv2. In Table \ref{drq_imp}, we provide a comparison between our implementation and that used in DCS. Note that these modifications do not reduce the performance of DrQ, and even improve it in some environments. Our implementation of DrQv2 also uses a batch size of 256 and a learning rate of 5e-4 for a fair comparison. Moreover, for DrQv2, we set the action repeat hyperparameter to be the same as in DrQ. We directly use the official implementations of DBC, TIA, and TPC for experiments in Section 6.1. All hyperparameters of RSP are listed in Table \ref{hyper}. Please note that RSP also predicts one-step rewards and we regularize the outputs of policy networks by the $\ell_2$ norm with a small coefficient 0.01. The action regularization can control the task-irrelevant information used by the early exploration policy. This trick cannot improve the performance of DrQ/DrQv2, but can reduce the performance variance of RSP with different random seeds in some tasks. For computational efficiency, we use a smaller batch size of 128 instead of 256 for all ablation studies.

\begin{figure}[!bht]

\begin{center}
\centerline{\includegraphics[width=0.55\columnwidth]{img/dcs.pdf}}
\caption{The six environments used in our experiments. Agents face camera distractions, color distractions, and background distractions simultaneously.}
\label{dcs}
\end{center}

\end{figure}
\begin{table*}[t]
    \centering
    \begin{tabular}{|l|l|l|l|l|l|l|}
    \hline
        ~ & BiC-Catch & C-Swingup & C-Run & F-Spin & R-Easy & W-Walk \\ \hline
        DrQ (DCS) & $138\pm20$ & $334\pm29$ & $4\pm2$ & $378\pm125$ & $113\pm22$ & $28\pm1$ \\ \hline
        DrQ (Our) & ${99\pm99}$ & ${341\pm52}$ & $211\pm64$ & $543\pm245$ & ${168\pm63}$ & $30\pm8$ \\ \hline
    \end{tabular}
    \caption{Comparison between different implementations in multi-distraction environments. Our implementation achieves similar or better performance than that used in DCS.}\label{drq_imp}
\end{table*}

\begin{table*}[!tbh]
    \centering
    \begin{tabular}{|l|c|}
    \hline
        \textbf{Hyperparameter} & \textbf{Setting}\\ \hline
        Input dimension & 3$\times$84$\times$84\\ \hline
        Stacked frames & 3\\ \hline
        Discount factor & 0.99\\ \hline
        % Episode length & 1000\\ \hline
        Replay buffer size & 500K\\ \hline
        Batch size & 256\\ \hline
        % Optimizer & Adam\\ \hline
        Learning rate &5e-4\\ \hline
        Random cropping padding & 4\\ \hline
        Seed steps & 4000 \\ \hline
        % Init temperature & 0.1 \\ \hline
        Encoder conv layers & 4\\ \hline
        Encoder conv strides & [2,1,1,1]\\ \hline
        Encoder conv channels & 32\\ \hline
        Encoder feature dim & 50\\ \hline
        Actor/Critic head MLP layers & 3\\ \hline
        Actor/Critic head MLP hidden dim & 1024\\ \hline
        Actor update frequency & 2\\ \hline
        % Critic head MLP layers & 3\\ \hline
        % Critic head MLP hidden dim & 1024\\ \hline
        Critic target update frequency & 2\\ \hline
        Critic soft-update rate & 0.01\\ \hline
        DrQv2: noise schedule & linear(1.0, 0.1, 500000)\\ \hline
        * RSP network: prediction layers & 3\\ \hline
        * RSP network: hidden dim & 256\\ \hline
        * RSP network: output dim & 1024\\ \hline
        * RSP: share the first linear layer & True\\ \hline
        * RSP: stop gradients of RL losses & True\\ \hline
        \multirow{2}*{* RSP: prediction target} & $Z_{\pi,1}$ in R-Easy, BiC-Catch, W-Walk\\ 
        ~ & $Z_{\pi,2}$ in C-Swingup, F-Spin, C-Run\\ \hline
    \end{tabular}
    \caption{Hyperparameters used in our experiments. The marker * denotes the extra hyperparameters used in RSP. The noise schedule ``linear(1.0, 0.1, 500000)'' used for DrQv2 means that the exploration noise decays linearly from 1.0 to 0.1 over the first 500K environment steps.}\label{hyper}
\end{table*}

\subsection{Results in no-distraction environments}
Many methods considering distraction perform worse than DrQ in standard DMC environments. However, the final scores of DrQ+RSP are comparable with those of DrQ (Table \ref{nodis}).
\begin{table*}[!tbh]
    \centering
    \begin{tabular}{|l|l|l|l|l|l|l|}
    \hline
        ~ & BiC-Catch & C-Swingup & C-Run & F-Spin & R-Easy & W-Walk \\ \hline
        DrQ & $963\pm9$ & $868\pm10$ & $660\pm96$ & $938\pm103$ & $942\pm71$ & $921\pm45$ \\ \hline
        DrQ+RSP & $963\pm7$ & ${864\pm12}$ & ${642\pm46}$ & $981\pm3$ & ${973\pm4}$ & ${950\pm18}$ \\ \hline
    \end{tabular}
    \caption{500K step scores in no-distraction environments.}
\label{nodis}
\end{table*}

\begin{table*}[t]
    \centering
    \begin{tabular}{|l|c|}
    \hline
        \textbf{Hyperparameter} & \textbf{Setting}\\ \hline
        Input dimension & 3$\times$168$\times$168\\ \hline
        Episode length & 500\\ \hline
        Policy learning rate &1e-3\\ \hline
        Critic learning rate &1e-4\\ \hline
        Random cropping padding & 8\\ \hline
        Seed steps & 2000 \\ \hline
        Critic target update frequency & 4\\ \hline
        Regularization coef & 0.05\\ \hline
        DrQv2: noise schedule & linear(1.0, 0.1, 100000)\\ \hline
        RSP: prediction target & $Z_{\pi,1}$\\ \hline
    \end{tabular}
    \caption{Hyperparameters used in the Door Opening task.}\label{newhyper}
\end{table*}
\begin{figure}[ht]

\begin{center}
\includegraphics[width=0.65\columnwidth]{img/robosuite.pdf}
\caption{Door Opening in Robosuite benchmarking. We illustrate observations in different episodes.}
\label{door}
\end{center}

\end{figure}
\begin{figure}[!htb]

\begin{center}
\includegraphics[width=0.4\columnwidth]{img/robosuite_result.pdf}
\caption{Comparison between DrQv2+RSP and DrQv2 in the Door Opening environment. Results show that RSP significantly improves the sample efficiency and final performance.}\label{robo_fig}
\end{center}

\end{figure}

\subsection{Comparison in Door opening}
{ Here, we compare DrQv2+RSP and DrQv2 in a Robosuite task, Door Opening. Compared with DCS environments, the Door Opening environment simulates a more realistic robotic scenario, where a robot arm learns to turn a handle and then open the door. The dimension of observations ($3\times168\times168$) in Door Opening is also higher than that ($3\times84\times84$) in DCS. In our experiments, three kinds of distractions exist during the training phase, including color, light, and camera distractions. We illustrate the environment in Figure \ref{door}. We use hyperparameters similar to those of SECANT. The hyperparameters that differ from those in Table \ref{hyper} are shown in Table \ref{newhyper}. We report results over five random seeds. Figure \ref{robo_fig} shows the performance after 150K environment steps (300 episodes). RSP provides a significant performance improvement ($+736\%$) in the Door Opening task. 
}

\subsection{Ablation Results and Visualization}\label{more_vis}
Figure \ref{vismore} visualizes the latent spaces learned by six different auxiliary tasks, using the same data as in Section 4.1. Figures \ref{near_rsp} and \ref{near_cpc} provide more comparisons between RSP and CPC+Reward.

\begin{figure}[ht]

\begin{center}
\subfigure[RSP]{\includegraphics[width=0.49\columnwidth]{img/rsp-974.pdf}}
\subfigure[VAE]{\includegraphics[width=0.49\columnwidth]{img/vae-222.pdf}}
\subfigure[CURL]{\includegraphics[width=0.49\columnwidth]{img/curl-130.pdf}
}
\subfigure[CPC]{\includegraphics[width=0.49\columnwidth]{img/cpc1320.pdf}
}
\subfigure[Reward]{\includegraphics[width=0.49\columnwidth]{img/reward5-10.pdf}
}
\subfigure[CPC+Reward]{\includegraphics[width=0.49\columnwidth]{img/cpc+reward1308.pdf}
}
\caption{Embedding spaces learned by different auxiliary tasks. Results show that RSP can capture information about state values and tends to map observations with different background images to the same region. }
\label{vismore}
\end{center}

\end{figure}

\begin{figure}[ht]

\begin{center}
\centerline{\includegraphics[width=0.97\columnwidth]{img/tsne_all_rsp.pdf}}
\caption{T-SNE of the embedding space learned by RSP. We randomly sample four observations (corresponding to four colors in the T-SNE figure) and match them with their nearest neighbors respectively (shown in the bottom subfigures). The results show that neighboring points in the embedding space learned by RSP have similar states.}
\label{near_rsp}
\end{center}

\end{figure}

\begin{figure}[ht]

\begin{center}
\centerline{\includegraphics[width=0.97\columnwidth]{img/tsne_all_cpc.pdf}}
\caption{T-SNE of the embedding space learned by CPC+Reward. CPC+Reward tends to map observations with similar background images to neighboring regions, even though those observations may correspond to dissimilar states.}
\label{near_cpc}
\end{center}

\end{figure}



\subsection{Comparison between Variants of RSP}\label{hypothesize}
In Figures \ref{drq_comp} and \ref{drqv2_comp}, we provide comparisons between the prediction targets $Z_{\pi,1}$ and $Z_{\pi,2}$, which correspond to \textbf{direct} and \textbf{Fourier} respectively. Figures \ref{cartpole_ana}--\ref{ball_ana} further visualize reward sequences and state sequences to understand potential reasons why \textbf{Fourier} outperforms \textbf{direct} in some tasks. The results show that the variant \textbf{Fourier} outperforms \textbf{direct} in Cheetah Run, Cartpole Swingup, and Finger Spin environments. We observe the approximate periodicity of reward sequences in Cartpole Swingup and Finger Spin. We do not observe the periodicity of reward sequences in Cheetah Run. However, some dimensions of states are approximately periodic. We hypothesize that \textbf{Fourier} outperforms \textbf{direct} in Cheetah Run due to the approximate periodicity of states. In the Ball in Cup environment, we do not observe periodicity of reward sequences or state sequences. Therefore, \textbf{Fourier} performs worse than \textbf{direct} in the Ball in Cup environment.
\begin{figure}[!htb]

\begin{center}
\centerline{\includegraphics[width=0.75\columnwidth]{img/drq.pdf}}
\caption{Comparison between the variants \textbf{direct} and \textbf{Fourier} based on DrQ.}
\label{drq_comp}
\end{center}

\end{figure}
\begin{figure}[!htb]

\begin{center}
\centerline{\includegraphics[width=0.75\columnwidth]{img/drqv2.pdf}}
\caption{Comparison between the variants \textbf{direct} and \textbf{Fourier} based on DrQv2.}
\label{drqv2_comp}
\end{center}

\end{figure}

\begin{figure}[!htb]

\begin{center}
\centerline{\includegraphics[width=0.7\columnwidth]{img/cartpole.pdf}}
\caption{State sequences and the corresponding reward sequence in the Cartpole Swingup task.}
\label{cartpole_ana}
\end{center}

\end{figure}
\begin{figure}[!htb]

\begin{center}
\centerline{\includegraphics[width=0.7\columnwidth]{img/finger.pdf}}
\caption{State sequence and the corresponding reward sequence in the Finger Spin task.}
\label{finger_ana}
\end{center}

\end{figure}
\begin{figure}[!htb]

\begin{center}
\centerline{\includegraphics[width=0.7\columnwidth]{img/cheetah.pdf}}
\caption{State sequences and the corresponding reward sequence in the Cheetah Run task. }
\label{cheetah_ana}
\end{center}

\end{figure}
\begin{figure}[!htb]

\begin{center}
\centerline{\includegraphics[width=0.72\columnwidth]{img/ball.pdf}}
\caption{State sequences and the corresponding reward sequence in the Ball in Cup Catch task.}
\label{ball_ana}
\end{center}

\end{figure}


\subsection{Complete results of our baselines}
This part provides complete results (Figure~\ref{complete_fig}) of our baselines in multi-distraction settings. Results show that the baselines hardly improve the performance compared with DrQ/DrQv2, indicating the difficulty of learning representations when multiple distractions exist.
\begin{figure}[!htb]
\begin{center}
\centerline{\includegraphics[width=0.72\columnwidth]{img/baselines.pdf}}
\caption{Complete results of our baselines in multi-distraction settings.}
\label{complete_fig}
\end{center}
\end{figure}
\end{document}
