%\documentclass{uai2022} % for initial submission
 \documentclass[accepted]{uai2022} % after acceptance, for a revised
% version; also before submission to
% see how the non-anonymous paper
% would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
% ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

\usepackage{booktabs}
\usepackage[utf8]{inputenc}
\usepackage{makecell}
\usepackage{graphicx}
\usepackage{amsmath, amssymb}
\usepackage{geometry}
\usepackage{subfigure}

\graphicspath{{figures/}} % Directory in which figures are stored

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

\title{Self-Supervised Representations for Multi-View Reinforcement Learning (Supplementary Material)}

\author[1]{Huanhuan Yang}
\author[2,3,1]{Dianxi Shi \thanks{Corresponding author (dxshi@nudt.edu.cn).}}
\author[4]{Guojun Xie}
\author[1]{Yingxuan Peng}
\author[2]{Yi Zhang}
\author[3]{Yantai Yang}
\author[1]{Shaowu Yang}

% Add affiliations after the authors
\affil[1]{%
	College of Computer, National University of Defense Technology, Changsha, China
}
\affil[2]{%
	Artificial Intelligence Research Center, Defense Innovation Institute, Beijing, China
}
\affil[3]{%
	Tianjin Artificial Intelligence Innovation Center, Tianjin, China
}
\affil[4]{%
	College of Computer Science and Technology, Nanjing University of Aeronautics and Astronautics, Nanjing, China
}

\begin{document}
\onecolumn
\maketitle
\setcounter{page}{11}
\setcounter{figure}{8}
\setcounter{table}{1}
\setcounter{equation}{14}
\appendix
\section{Derivation of the Two-view CEB Loss} \label{sec:2-CEB loss}
As mentioned in the main text, the two-view CEB objective is defined as:
\begin{equation}
\begin{array}{l}
\displaystyle{\bf obj:}\quad  \min_{Z,Z_1,Z_2} \beta_1I(X_1;Z_1|Y_1)+\beta_2I(X_2;Z_2|Y_2)-I(Z;Y) \\
\displaystyle \qquad \ \ =\min_{z,z_1,z_2} \beta_1I(s_1;z_1|{z}'_1,r,a) + 
\beta_2 I(s_2;z_2|{z}'_2,r,a) - I(z;{z}',r|a) \qquad\qquad\qquad\qquad\qquad\qquad\quad\\
\displaystyle{\bf s.t.:}\quad Z= f_\theta(Z_1,Z_2) \Rightarrow z = f_\theta(z_1,z_2)
\end{array}
\label{eq1}
\end{equation}
Since $I(X_1;Z_1|Y_1)=I(X_1;Z_1)-I(Z_1;Y_1)$ and $I(X_2;Z_2|Y_2)=I(X_2;Z_2)-I(Z_2;Y_2)$, the original two-view CEB objective can be rewritten as:
\begin{equation}
\begin{array}{l}
\displaystyle{\bf obj:}\quad \min_{Z,Z_1,Z_2} \beta_1[I(X_1;Z_1)-I(Z_1;Y_1)]+\beta_2[I(X_2;Z_2)-I(Z_2;Y_2)]-I(Z;Y)\\
\displaystyle \qquad \ \ \ =\min_{z,z_1,z_2} \beta_1(I(s_1;z_1)-I(z_1;{z}'_1,r|a)) + 
\beta_2 (I(s_2;z_2) - I(z_2;{z}'_2,r|a)) - I(z;{z}',r|a) \qquad\quad\ \\
\displaystyle{\bf s.t.:}\quad Z = f_\theta(Z_1,Z_2) \Rightarrow z = f_\theta(z_1,z_2)
\end{array}
\label{eq2}
\end{equation}
To begin with, we give the joint probability density function of the variables $s_1$, $s_2$, $z_1$, $z_2$, $z$, ${z}'_1$, ${z}'_2$, ${z}'$, $r$, and $a$. Since $z_1$ is learned from $s_1$, $z_2$ is learned from $s_2$, and $z$ is fused from $z_1$ and $z_2$, the chain rule of probability together with these conditional dependencies allows this joint probability density function to be expressed as:
\begin{equation}
\begin{aligned}
p(s_1, s_2, z_1, z_2, z, {z}'_1, {z}'_2, {z}', r, a) &=p(z|s_1, s_2, z_1, z_2, {z}'_1, {z}'_2, {z}', r, a)\ {\cdot} \ p(z_1|s_1, s_2, z_2, {z}'_1, {z}'_2, {z}', r, a)\ \cdot \\
&\quad\ p(z_2|s_1, s_2, {z}'_1, {z}'_2, {z}', r, a) \ {\cdot} \ p(s_1, s_2, {z}'_1, {z}'_2, {z}', r, a)\\
&= p(z|z_1, z_2)\ {\cdot} \ p(z_1|s_1) \ \cdot \ p(z_2|s_2)\ \cdot \ p(s_1, s_2, {z}'_1, {z}'_2, {z}', r, a)
\end{aligned}
\label{eq3}
\end{equation}
Then, we analyze the first term $I(s_1;z_1)$ in Eq.~(\ref{eq2}). According to the standard definition of mutual information, the mutual information between $s_1$ and $z_1$ is:
\begin{equation}
\begin{aligned}
I(s_1;z_1)=\int d_{s_1}d_{z_1}\ p(s_1,z_1)\log\frac{p(z_1|s_1)}{p(z_1)}\qquad\qquad\qquad\qquad\qquad\qquad\qquad\qquad\qquad\qquad\qquad\quad
\end{aligned}
\label{eq4}
\end{equation}
Since $p(z_1)$ is intractable, we use the variational distribution $q_1(z_1)$ to approximate it. Considering the non-negativity of the Kullback-Leibler divergence (KL-divergence), we can infer that:
\begin{equation}
KL(p(z_1)||q_1(z_1))\ \ge \ 0\Longrightarrow\int d_{z_1}\ p(z_1)\log{p}(z_1)\ \ge \ \int d_{z_1}\ p(z_1)\log {q_1}(z_1)\qquad\qquad\qquad\qquad\quad
\label{eq5}
\end{equation}
Substituting Eq. (\ref{eq5}) into Eq. (\ref{eq4}), we have:
\begin{equation}
\begin{aligned}
I(s_1;z_1)\le& \int d_{s_1}d_{z_1}\ p(s_1,z_1)\log\frac{p(z_1|s_1)}{q_1(z_1)} \\
=&\int d_{s_1} d_{s_2}d_{{z}'_1}d_{{z}'_2} d_{{z}'}d_{r}d_{a}d_{z_1}\ p(s_1,s_2,{z}'_1, {z}'_2, {z}',r, a,z_1)\log\frac{p(z_1|s_1)}{q_1(z_1)}  \qquad\qquad\ \qquad\qquad
\end{aligned}
\label{eq6}
\end{equation}
Considering that variable $z_1$ depends only on variable $s_1$, we get the following variational bound for the term $I(s_1;z_1)$:
\begin{equation}
I(s_1;z_1)\ \le\ \int d_{s_1}d_{s_2}d_{{z}'_1} d_{{z}'_2}d_{{z}'}d_{r}d_{a}\ p(s_1,s_2, {z}'_1, {z}'_2, {z}', r, a) \int d_{z_1}\ p(z_1|s_1)\log\frac{p(z_1|s_1)}{q_1(z_1)}\qquad\qquad
\label{eq7}
\end{equation}
Similarly, for the third term $I(s_2;z_2)$ in Eq. (\ref{eq2}), its variational bound is:
\begin{equation}
I(s_2;z_2)\ \le\ \int d_{s_1} d_{s_2}d_{{z}'_1} d_{{z}'_2}d_{{z}'}d_{r}d_{a}\ p(s_1,s_2, {z}'_1, {z}'_2, {z}', r, a) \int d_{z_2} \ p(z_2|s_2)\log\frac{p(z_2|s_2)}{q_2(z_2)}\qquad\qquad
\label{eq8}
\end{equation}
Next, we focus on the second term $I(z_1;{z}'_1, r | a)$ in Eq.~(\ref{eq2}). According to the definition, the conditional mutual information of variables $z_1$, ${z}'_1$ and $r$ given $a$ is:
\begin{equation}
I(z_1;{z}'_1, r | a) =\int d_{z_1}d_{{z}'_1}d_{r}d_{a} \ p(z_1, {z}'_1, r, a) \log \frac{p({z}'_1, r|z_1, a)}{p({z}'_1, r | a)} \qquad\qquad\qquad\qquad\qquad\qquad\qquad\quad\ \
\label{eq9}
\end{equation}
Since it is difficult to compute $p({z}'_1, r|z_1, a)$, we use a distribution $g_{\omega_1}({z}'_1, r|z_1, a)$ learned from a neural network to approximate it. Since the KL-divergence between distributions $p({z}'_1, r|z_1, a)$ and $g_{\omega_1}({z}'_1, r|z_1, a)$ is always non-negative, we have:
\begin{equation}
\begin{aligned}
KL(p({z}'_1, r|z_1, a)||g_{\omega_1}({z}'_1, r|z_1, a))\ \ge0 \  \Longrightarrow& \int d_{z_1}d_{{z}'_1}d_{r}d_{a} \ p(z_1, {z}'_1, r, a) \log{p}({z}'_1, r|z_1, a) \ \ge \\ 
&\int d_{z_1}d_{{z}'_1}d_{r}d_{a} \ p(z_1, {z}'_1, r, a) \log{g_{\omega_1}}({z}'_1, r|z_1, a)\qquad\quad\ \ 
\end{aligned}
\label{eq10}
\end{equation}
Substituting Eq. (\ref{eq10}) into Eq. (\ref{eq9}), $I(z_1;{z}'_1, r | a)$ is reshaped as:
\begin{equation}
\begin{aligned}
I(z_1;{z}'_1, r | a)\ \ge \ &\int d_{z_1}d_{{z}'_1}d_{r}d_{a} \ p(z_1,{z}'_1, r, a) \log\frac{{g_{\omega_1}}({z}'_1, r|z_1, a)}{p({z}'_1, r | a)} \\
=&\int d_{z_1}d_{{z}'_1} d_{r}d_{a} \ p(z_1,{z}'_1, r,a) \log{g_{\omega_1}({z}'_1, r|z_1, a)} - \underbrace{\int d_{{z}'_1} d_{r}d_{a}\ p({z}'_1, r,a)\log{p({z}'_1, r | a) }}_{\text{dropped}}
\end{aligned}
\label{eq11}
\end{equation}
Notice that the $\int d_{{z}'_1} d_{r}d_{a}\ p({z}'_1, r,a)\log{p({z}'_1, r | a) } $ term in Eq. (\ref{eq11}) is independent of the optimization of the S2R model, so we can directly drop it. Then, Eq. (\ref{eq11}) is equivalent to:
\begin{equation}
\begin{aligned}
I(z_1;{z}'_1, r | a)\  \ge \ &\int d_{z_1}d_{{z}'_1} d_{r}d_{a} \ p(z_1,{z}'_1, r,a)\log{g_{\omega_1} ({z}'_1, r|z_1, a)} \\
=&\int d_{s_1} d_{s_2}d_{{z}'_1}d_{{z}'_2}d_{{z}'}d_{r}d_{a}d_{z_1}\ p(s_1,s_2,{z}'_1, {z}'_2, {z}', r, a,z_1)\log{g_{\omega_1}({z}'_1, r|z_1, a)}\qquad\qquad\quad
\end{aligned}
\label{eq12}
\end{equation}
Considering that variable $z_1$ depends only on variable $s_1$, we get the following variational bound for the term $I(z_1;{z}'_1, r | a)$:
\begin{equation}
I(z_1;{z}'_1, r | a) \ \ge \ \int d_{s_1} d_{s_2}d_{{z}'_1}d_{{z}'_2}d_{{z}'}d_{r}d_{a}\ p(s_1,s_2,{z}'_1, {z}'_2, {z}', r, a) \int d_{z_1} \ p(z_1|s_1)\log{g_{\omega_1}({z}'_1, r|z_1, a)}\quad\ 
\label{eq13}
\end{equation}
Similarly, for the fourth term $I(z_2;{z}'_2, r | a)$ in Eq. (\ref{eq2}), its variational bound is:
\begin{equation}
I(z_2;{z}'_2, r | a) \ \ge \ \int d_{s_1} d_{s_2}d_{{z}'_1}d_{{z}'_2}d_{{z}'}d_{r}d_{a}\ p(s_1,s_2,{z}'_1, {z}'_2, {z}', r, a) \int d_{z_2} \ p(z_2|s_2)\log{g_{\omega_2}({z}'_2, r|z_2, a)}\quad\ 
\label{eq14}
\end{equation}
Finally, we derive the variational bound for the last term $I(z;{z}', r | a)$ in Eq.~(\ref{eq2}). According to the definition, the conditional mutual information of variables $z$, ${z}'$ and $r$ given $a$ is:
\begin{equation}
I(z;{z}', r | a) =\int d_{z}d_{{z}'}d_{r}d_{a} \ p(z, {z}', r , a) \log \frac{p({z}', r|z, a)}{p({z}', r | a)}
\qquad\qquad\qquad\qquad\qquad\qquad\qquad\qquad\qquad\ \ \ 
\label{eq15}
\end{equation}
Since $p({z}', r|z, a)$ is intractable, we use a distribution $g_{\omega_{12}}({z}', r|z, a)$ learned from a neural network to approximate it. Considering the non-negativity of the KL-divergence between distributions $p({z}', r|z, a)$ and $g_{\omega_{12}}({z}', r|z, a)$, we have:
\begin{equation}
\begin{aligned}
KL(p({z}', r|z, a)||g_{\omega_{12}}({z}', r|z, a))\ \ge0 \  \Longrightarrow & \int d_{z}d_{{z}'}d_{r}d_{a}\ p(z, {z}', r, a) \log{p}({z}', r | z, a) \ \ge \\ 
&\int d_{z}d_{{z}'}d_{r}d_{a}\ p(z, {z}', r, a) \log{g}_{\omega_{12}}({z}', r|z, a)\qquad\qquad\quad\ \ \ \ 
\end{aligned}
\label{eq16}
\end{equation}
Therefore, $I(z;{z}', r | a)$ is bounded by:
\begin{equation}
\begin{aligned}
I(z;{z}', r | a) \ \ge \ &\int d_{z}d_{{z}'}d_{r}d_{a}\ p(z,{z}', r, a)\log\frac{{g_{\omega_{12}}}({z}', r|z, a)}{p({z}', r | a)} \\
=&\int d_{z}d_{{z}'} d_{r}d_{a}\ p(z,{z}', r,a) \log{g_{\omega_{12}}({z}', r|z, a)} - \underbrace{ \int d_{{z}'} d_{r}d_{a}\ p({z}', r,a)\log{p({z}', r | a)}} _{\text{dropped}} \qquad\quad
\end{aligned}
\label{eq17}
\end{equation}
In Eq.~(\ref{eq17}), the $\int d_{{z}'} d_{r}d_{a}\ p({z}', r,a)\log{p({z}', r | a)}$ term can likewise be ignored, and we then have:
\begin{equation}
\begin{aligned}
I(z;{z}', r | a)\ \ge \ & \int d_{z}d_{{z}'} d_{r}d_{a}\ p(z,{z}', r,a)\log{g_{\omega_{12}}({z}', r|z, a)}\\
=&\int d_{s_1} d_{s_2}d_{{z}'_1}d_{{z}'_2}d_{{z}'} d_{r}d_{a}d_{z_1}d_{z_2}d_{z} \ p(s_1,s_2,{z}'_1, {z}'_2, {z}', r, a,z_1, z_2, z) \log{g_{\omega_{12}}({z}', r|z, a)}\quad\enspace\ \ 
\end{aligned}
\label{eq18}
\end{equation}
By using the joint probability density function in Eq. (\ref{eq3}), the variational bound for term $I(z;{z}', r | a)$ is:
\begin{equation}
\begin{aligned}
I(z;{z}', r | a) \ \ge \ & \int d_{s_1} d_{s_2}d_{{z}'_1}d_{{z}'_2}d_{{z}'} d_{r}d_{a}\ p(s_1,s_2,{z}'_1, {z}'_2, {z}', r, a)  \ \cdot \\
& \int d_{z_1}d_{z_2}d_z \ p(z_1|s_1)p(z_2|s_2) p(z|z_1,z_2)\log{g_{\omega_{12}}({z}', r|z, a)} \qquad\qquad\qquad\qquad\qquad\quad\quad\ \:
\end{aligned}
\label{eq19}
\end{equation}
With the variational bounds listed in Eq. (\ref{eq7}), Eq. (\ref{eq8}), Eq. (\ref{eq13}), Eq. (\ref{eq14}) and Eq. (\ref{eq19}), the final variational upper bound of the two-view CEB objective in Eq. (\ref{eq1}) is summarized as:
\begin{align}
&\beta_1I(s_1;z_1|{z}'_1,r,a) + 
\beta_2 I(s_2;z_2|{z}'_2,r,a) - I(z;{z}',r|a) \ \le \notag\\
&\qquad\int d_{s_1} d_{s_2}d_{{z}'_1}d_{{z}'_2}d_{{z}'} d_{r}d_{a}\ p(s_1,s_2,{z}'_1, {z}'_2, {z}', r, a) \Big (\beta_1\int d_{z_1}\ p(z_1|s_1) \left[\log \frac{p(z_1| s_1)}{q_1(z_1)} - \log g_{\omega_1}({z}'_1, r|z_1,a)\right] \ + \beta_2\notag \\
&\qquad \int d_{z_2}\ p(z_2|s_2) \left[\log\frac {p(z_2|s_2)}{q_2(z_2)} - \log{g_{\omega_2}({z}'_2,r| z_2,a)}\right] \ - \int d_{z_1} d_{z_2}d_z \ p(z_1|s_1) p(z_2|s_2) p(z|z_1, z_2)\log{g_{\omega_{12}} ({z}',r|z,a)}\Big )
\label{eq20}
\end{align}
In the actual implementation, we use Monte Carlo sampling, drawing empirical samples of $s_1$, $s_2$, ${z}'_1$, ${z}'_2$, ${z}'$, $r$ and $a$ to approximate the outer integral; Eq.~(\ref{eq20}) is then simplified as:
\begin{align}
&\beta_1I(s_1;z_1|{z}'_1,r,a) + 
\beta_2 I(s_2;z_2|{z}'_2,r,a) - I(z;{z}',r|a) \ \le \notag \\
&\qquad\quad\frac{1}{M} \sum^{M} \Big (\beta_1 \left[D_{KL}\left(p(z_1| s_1)||q_1(z_1)\right)\ - \  \mathbb{E}_{z_{1}\sim p(z_1| s_1)} \log g_{\omega_1}({z}'_1,r|z_1,a)\ \right] \ + \notag\\
&\qquad\qquad\quad\quad\ \ \ \beta_2 \left[D_{KL}\left(p(z_2| s_2)||q_2(z_2)\right)\ - \  \mathbb{E}_{z_{2}\sim p(z_2| s_2)}\log{g_{\omega_2} ({z}'_2,r| z_2,a)}\ \right] \ - \  \qquad\qquad \notag\\ 
&\qquad\qquad\qquad\quad \mathbb{E}_{z_{1}\sim p(z_1| s_1)}\mathbb{E}_{z_{2}\sim p(z_2| s_2)} \mathbb{E}_{z\sim p(z|z_1,z_2)}\left[ \log{g_{\omega_{12}}({z}',r|z,a)}\right]\Big ) \qquad\qquad\qquad\qquad\qquad\qquad\qquad
\label{eq21}
\end{align}
where $M$ is the size of the sampled data.

\section{Implementation Details}
In our implementation, both the \textbf{actor and critic} are parameterized by a 3-layer fully connected network of 256 units with ReLU activations. The \textbf{encoder and target encoder} both consist of four convolutional layers followed by ReLU activations. The kernel size of each convolutional layer is $3 \times 3$. We use stride 2 for the first layer and stride 1 for the remaining layers. The output of the last convolutional layer is fed into a fully connected layer that projects it to a 50-dimensional feature vector, which is then passed through layer normalization. The \textbf{feature fusion module and MLPs} share the same architecture and are implemented as three dense layers of 256 units with ReLU activations. For the \textbf{view-specific predictor and multi-view predictor}, we implement them as one network, i.e., three dense layers of 256 units with ReLU activations. Table~\ref{detail parameters} gives the full list of hyper-parameters used in our experiments.

	\begin{table}[!ht]
		\newcommand{\tabincell}[2]{\begin{tabular}{@{}#1@{}}#2\end{tabular}}
		\caption{Full list of hyper-parameters used for the DMControl suite.}
		\label{detail parameters}
		\centering
		\begin{tabular}{ll}
			\hline
			\textbf{Hyperparameter} & \textbf{Value}\\ \hline
			Augmentation & Crop \\
			Image states &  $100 \times 100$\\
			Cropped image states & $84 \times 84$ \\
			Replay buffer capacity &  $10 ^ 5$\\
			Initial steps &  1000 \\
			Total training steps &  $500000$ \\
			Stacked frames & 3 \\
			Action repeat & \tabincell{l}{2 finger spin; walker walk \\ 4 cheetah run; ball-in-cup catch; reacher easy; walker run \\ 8 cartpole swingup} \\
			Evaluation episodes &  10 \\
			Optimizer  & Adam \\
			\tabincell{l}{Learning rate (encoder/policy/Q Function)} & \tabincell{l}{$2 \times 10^{-4}$ cheetah run; $10^{-3}$ otherwise} \\
			Learning rate ($\alpha$) & $10^{-4}$ \\
			Batch size ($M$) & 512 \\
			View number ($N$) & 2 \\
			Q Function EMA $\tau_{\varphi_i}, i=1,2$ & 0.01 \\
			Encoder EMA $\tau_\rho$ & 0.05 \\
			Critic/encoder target update freq & 2 \\
			MCEB $\beta_j$ (default setting), $ j\in[1,N]$ & \tabincell{l}{$10^{-4} \to 10^{-3}$ cheetah run; $10^{-3} \to 10^{-2}$ otherwise} \\ 
			MCEB $\beta_j$ (random image setting), $ j\in[1,N]$ & $10^{-4} \to 10^{-2}$ \\
			MCEB $\beta_j$ (natural video setting), $ j\in[1,N]$ & $10^{-4} \to 10^{-2}$ \\
			Convolutional layers & 4 \\
			Number of filters & 32 \\
			Non-linearity & ReLU \\
			Encoder feature  dimension & 50 \\
			Discount factor $\gamma$ & 0.99 \\
			Initial temperature & 0.1 \\ \hline
		\end{tabular}
	\end{table}

\section{Additional DMControl Results}
For the image distractor setting and natural video setting, Fig.~\ref{Image Setting} and Fig.~\ref{Video Setting} show the performance of S2R + SAC, RAD, and DBC on 6 DMControl tasks. In both settings, S2R + SAC performs comparably to or better than RAD, and substantially outperforms DBC, showing its ability to learn efficient and robust representations. As expected, the MCEB objective urges the S2R + SAC agent to pay attention to the robot control task itself, ignore task-independent details in the environment background, and thus be more robust to the visual noise in the environment.

For ablation studies, we choose the cheetah run and walker walk tasks and compare the performance of S2R with its variants in Fig.~\ref{Ablation Setting}, including the regularization factors, predictive data ($Y_1$, $Y_2$, and $Y$), optimization objectives, and the number of views included in the MCEB objective. Results confirm the suitability of the values of MCEB $\beta_j$ given in Table~\ref{detail parameters}, the rationality of learning representations based on the latent transition function and reward function, and the efficiency of integrating multi-view data with CEB in the MCEB objective. These are all significant factors for the design and success of S2R. Besides, the MCEB objective in S2R can take advantage of the multi-view data to learn robust representations.

\begin{figure}[!hb]
	\centering
	\subfigure{
		\includegraphics[scale=0.31]{Image_ball_in_cup.png}
		\label{Image ball-in-cup, catch}
	}
	\quad
	\subfigure{
		\includegraphics[scale=0.31]{Image_cartpole.png}
		\label{Image cartpole, swingup}
	}
	\quad
	\subfigure{
		\includegraphics[scale=0.31]{Image_cheetah.png}
		\label{Image cheetah, run}
	}
	\quad
	\subfigure{
		\includegraphics[scale=0.31]{Image_reacher.png}
		\label{Image reacher, easy}
	}
	\quad
	\subfigure{
		\includegraphics[scale=0.31]{Image_walker_run.png}
		\label{Image walker, run}
	}
	\quad
	\subfigure{
		\includegraphics[scale=0.31]{Image_walker.png}
		\label{Image walker, walk}
	}
	\caption{Performance of S2R + SAC over five seeds with mean and one standard error in the image distractor DMControl setting. We benchmark it against RAD and DBC. S2R + SAC performs comparably to or better than RAD and significantly outperforms DBC on all 6 pixel-based control tasks.}
	\label{Image Setting}
\end{figure}

\begin{figure}
	\centering
	\subfigure{
		\includegraphics[scale=0.31]{Video_ball_in_cup.png}
		\label{Video ball-in-cup, catch}
	}
	\quad
	\subfigure{
		\includegraphics[scale=0.31]{Video_cartpole.png}
		\label{Video cartpole, swingup}
	}
	\quad
	\subfigure{
		\includegraphics[scale=0.31]{Video_cheetah.png}
		\label{Video cheetah, run}
	}
	\quad
	\subfigure{
		\includegraphics[scale=0.31]{Video_reacher.png}
		\label{Video reacher, easy}
	}
	\quad
	\subfigure{
		\includegraphics[scale=0.31]{Video_walker_run.png}
		\label{Video walker, run}
	}
	\quad
	\subfigure{
		\includegraphics[scale=0.31]{Video_walker.png}
		\label{Video walker, walk}
	}
	\caption{Performance of S2R + SAC over five seeds with mean and one standard error in the natural video DMControl setting. We benchmark it against RAD and DBC. S2R + SAC again performs comparably to or better than RAD and DBC on all 6 pixel-based control tasks.}
	\label{Video Setting}
\end{figure}

\begin{figure}[!ht]
	\centering
	\subfigure[MCEB regularization factors]{
		\includegraphics[scale=0.31]{Ablation_lr_cheetah.png}
		\label{Ablation_lr_cheetah, run}
	}
	\quad
	\subfigure[MCEB regularization factors]{
		\includegraphics[scale=0.31]{Ablation_lr_walker.png}
		\label{Ablation_lr_walker, walk}
	}
	\quad
	\subfigure[MCEB predictive data]{
		\includegraphics[scale=0.31]{Ablation_preob_cheetah.png}
		\label{Ablation_preob_cheetah, run}
	}
	\quad
	\subfigure[MCEB predictive data]{
		\includegraphics[scale=0.31]{Ablation_preob_walker.png}
		\label{Ablation_preob_walker, walk}
	}
	\quad
	\subfigure[MCEB optimization objectives]{
		\includegraphics[scale=0.31]{Ablation_s2rob_cheetah.png}
		\label{Ablation_s2rob_cheetah, run}
	}
	\quad
	\subfigure[MCEB optimization objectives]{
		\includegraphics[scale=0.31]{Ablation_s2rob_walker.png}
		\label{Ablation_s2rob_walker, walk}
	}
	\subfigure[Number of views]{
		\includegraphics[scale=0.31]{Ablation_viewn_cheetah.png}
		\label{Ablation_viewn_cheetah, run}
	}
	\subfigure[Number of views]{
		\includegraphics[scale=0.31]{Ablation_viewn_walker.png}
		\label{Ablation_viewn_walker, walk}
	}
	\caption{Performance of S2R + SAC over five seeds with mean and one standard error in the default DMControl setting for ablation studies. We compare the performance of S2R with its variants, i.e., regularization factors in (a) and (b), predictive data ($Y_1$, $Y_2$, and $Y$) in (c) and (d), optimization objectives in (e) and (f), and the number of views $N$ in (g) and (h). Results show that choosing suitable values for the regularization factors and simultaneously predicting the latent transition function and reward function, together with the MCEB objective, are significant for the success of S2R. Besides, increasing $N$ generally yields performance comparable to or better than that of the two-view case.}
	\label{Ablation Setting}
\end{figure}

\end{document}
