

\section{Appendix}

\subsection{Details about \mname{} Surrogate Objective}
\label{appendix:lwb}

In this subsection, we elaborate on the derivation of the lower bound of the sequential multi-view total correlation. Using this lower bound as a surrogate objective, we propose a reinforcement learning framework for visual control problems that learns a complete and concise representation from multi-view observations. We aim to maximize the sequential multi-view total correlation between the multi-view observation sequence and the representation sequence, conditioned on the action sequence. First of all, we give a formal definition of the sequential multi-view total correlation.

\begin{definition}
    Given the sequences of multi-view observations, representations, and actions $\vec{O}_{1:T}$, $Z_{1:T}$, $A_{1:T}$, define the sequential multi-view total correlation as follows:
    \begin{equation}
        SMTC(\vec{O}_{1:T}; Z_{1:T}  \mid A_{1:T})
    = \sum_{v=1}^{V} I(O^{v}_{1:T}; Z_{1:T} \mid A_{1:T}) 
    - I(\vec{O}_{1:T}; Z_{1:T} \mid A_{1:T}) 
    \end{equation}
    where $T$ is temporal length and $V$ is number of views.
\end{definition}


With the definition of sequential multi-view total correlation, we have the following lemmas and theorem to derive a tractable lower bound of mutual information between sequences of observation and representation on condition of action sequence.
    
\hspace{1in}
\begin{figure}[htb]
    \centering
    \includegraphics[width=0.6\columnwidth]{fig/graphical_model_crop.pdf}
    \caption{Graphical Model of Random Variables where $o$ denotes observation, $z$ denotes representation and $a$ denotes action.}
    \label{fig:graphical_model}
\end{figure}




\begin{lemma}\label{thm:lem1}
Let $O_{1:T}$ and $Z_{1:T}$ be random variables with joint distribution
\[p(O_{1:T}, Z_{1:T}) = \prod_{t=1}^{T} p(O_t, Z_t \mid O_{t-1}, Z_{t-1}, A_{t-1})\] where $A_{1:T}$ denotes the random variables of the action sequence and $p(O_1, Z_1 \mid O_0, Z_0, A_0) = p(O_1, Z_1)$. Then it follows that 
\begin{equation}
    I(O_{1:T}; Z_{1:T} \mid A_{1:T}) 
    \geq
    \sum_{t=1}^{T} I(O_{t}; Z_{t} \mid Z_{t-1}, A_{t-1}) 
    \label{eq:ineq_1}
\end{equation}
where $T$ is temporal length.
\end{lemma}

\begin{proof}
According to information theory, we have 
\begin{align*}
&I(X; Y) = H(X) - H(X \mid Y),  \\
&H(X_1, X_2, \cdots, X_n) = \sum_{i=1}^{n} H(X_i \mid X_{i-1}, \cdots, X_1), \\
&I(X ; Y \mid Z) = \mathbb{E}_{Z} 
% D_{KL} ( P_{(X, Y) \mid Z}  ||  P_{X \mid Z} \otimes P_{Y \mid Z} ).
\infdiv{ P_{(X, Y) \mid Z} }{ P_{X \mid Z} \otimes P_{Y \mid Z} }
\end{align*}
Let $\tau = (o_{1:t-1}, z_{1:t-1}, a_{1:T})$, according to the definition of conditional mutual information, we have
\begin{equation}
    I( O_{t} ; Z_{t} \mid O_{1:t-1}, Z_{1:t-1}, A_{1:T} ) = 
    \int_{\tau} \int_{o_t} \int_{z_t} 
    p(o_t, z_t \mid \tau ) \log \frac{
        p(o_t, z_t \mid \tau )
    }{
        p(o_t \mid \tau ) \cdot p(z_t \mid \tau )
    }
    d z_t d o_t d \tau
    \label{eq:cond_mut_info}
\end{equation}
With the Markovian property, i.e., the hidden state or representation $z_t$ at time step $t$ is determined only by the previous hidden state or representation $z_{t-1}$ and the action $a_{t-1}$ at time step $t-1$, it follows that $p(o_t, z_t \mid \tau) = p(o_t, z_t \mid z_{t-1}, a_{t-1})$. Applying this result to Equation \ref{eq:cond_mut_info}, we obtain the following result. 
\begin{multline}
    I( O_{t} ; Z_{t} \mid O_{1:t-1}, Z_{1:t-1}, A_{1:T} ) 
    = \int_{\tau} \int_{o_t} \int_{z_t} 
    p(o_t, z_t \mid z_{t-1}, a_{t-1} )  
    \log \frac{
        p(o_t, z_t \mid  z_{t-1}, a_{t-1} )
    }{
        p(o_t \mid z_{t-1}, a_{t-1}) \cdot p(z_t \mid z_{t-1}, a_{t-1} )
    } \\
    d z_t d o_t d \tau 
    = I( O_{t} ; Z_{t} \mid Z_{t-1}, A_{t-1} )
\end{multline}

Finally, we derive a lower bound of mutual information between subsequences of observations and representations as
\begin{align}
    I(O_{1:T}; Z_{1:T} \mid A_{1:T}) &= H(Z_{1:T} \mid A_{1:T}) - H(Z_{1:T} \mid O_{1:T}, A_{1:T}) \\
    &= \sum_{t=1}^{T} H(Z_{t} \mid Z_{1:t-1}, A_{1:T} ) - H(Z_{t} \mid Z_{1:t-1}, O_{1:T}, A_{1:T} ) \\
    &= \sum_{t=1}^{T} I( O_{1:T} ; Z_{t} \mid Z_{1:t-1}, A_{1:T}  ) \\
    &= \sum_{t=1}^{T} H( O_{1:T} \mid Z_{1:t-1}, A_{1:T} ) - H( O_{1:T} \mid Z_{t}, Z_{1:t-1}, A_{1:T} ) \\
    &= \sum_{t=1}^{T} \sum_{s=1}^{T} H( O_{s} \mid O_{1:s-1}, Z_{1:t-1}, A_{1:T} ) 
        - H( O_{s} \mid O_{1:s-1}, Z_{t}, Z_{1:t-1}, A_{1:T} ) \\
    &= \sum_{t=1}^{T} \sum_{s=1}^{T} I( O_{s} ; Z_{t} \mid O_{1:s-1}, Z_{1:t-1}, A_{1:T} ) \\
    &\geq 
    \sum_{t=1}^{T} I( O_{t} ; Z_{t} \mid O_{1:t-1}, Z_{1:t-1}, A_{1:T} ) \\
    &= \sum_{t=1}^{T} I( O_{t} ; Z_{t} \mid Z_{t-1}, A_{t-1} ) 
\end{align}
Hence, we have that 
\begin{equation}
    I(O_{1:T}; Z_{1:T} \mid A_{1:T}) \geq \sum_{t=1}^{T} I(O_{t}; Z_{t} \mid Z_{t-1}, A_{t-1})
\end{equation}
where $T$ is temporal length.
    
\end{proof}




\begin{lemma}\label{thm:lem2}
Let $\vec{O}_{1:T}$ be multi-view observation sequence with $\text{dim } \vec{O} = V$ and temporal length $T$, it follows that
    \begin{equation}
        I(\vec{O}_{1:T}; Z_{1:T} \mid A_{1:T})
        \leq \sum_{t=1}^{T} \sum_{s=1}^{T}
    \mathbb{E}_{
        p(\vec{o}_s)
    } \Bigg[
        \infdiv{ p(z_t \mid \vec{o}_s, \iota) }{ r_{\phi}(z_t \mid \iota) }
        % D_{KL} \big( p(z_t \mid \iota ) || r_{\phi}(z_t \mid \iota)  \big)
    \Bigg]  
    \label{eq:ineq_2}
    \end{equation}
    where $\iota = (\vec{o}_{1:s-1}, z_{1:t-1}, a_{1:T})$ and prior distribution $r_{\phi}(z_{t}) \approx p(z_{t})$ is an approximate distribution with $\phi$.
\end{lemma}

\begin{proof}
    It is easy to verify that 
\begin{equation}
I(\vec{O} ; Z) \leq \mathbb{E}_{
    p(\vec{o})
} [
    % D_{KL} ( p(z \mid \vec{o}) || r_{\phi}(z) )
    \infdiv{p(z \mid \vec{o})}{r_{\phi}(z)} 
]            
\end{equation}
since 
\begin{multline}
    I(\vec{O} ; Z) 
    = \mathbb{E}_{
        p(\vec{o}, z)
    } \left[ \log \left( \frac{
        p(z \mid \vec{o} )    
    }{
        p(z)
    }  \cdot \frac{r_{\phi}(z)}{r_{\phi}(z)} \right) \right] 
    = \mathbb{E}_{
        p(\vec{o})
    } 
    % D_{KL} ( p(z \mid \vec{o}) || r_{\phi}(z) )
    \infdiv{ p(z \mid \vec{o}) }{ r_{\phi}(z) }  
    % - D_{KL}( p(z) || r_{\phi}(z) ) \\
    - \infdiv{ p(z) }{ r_{\phi}(z) }   \\
    \leq 
    \mathbb{E}_{
        p(\vec{o})
    } [
        % D_{KL} ( p(z \mid \vec{o}) || r_{\phi}(z) )
        \infdiv{ p(z \mid \vec{o}) }{ r_{\phi}(z) }  
    ]   
\end{multline}
Using this technique, we can derive an upper bound of $I(\vec{O}_{1:T}; Z_{1:T} \mid A_{1:T})$ as follows:
\begin{equation}
    I(\vec{O}_{1:T}; Z_{1:T} \mid A_{1:T}) = 
    \sum_{t=1}^{T} \sum_{s=1}^{T}
    I( \vec{O}_{s} ; Z_{t} \mid \vec{O}_{1:s-1} , 
     Z_{1:t-1}, A_{1:T} ) 
    \leq 
    \sum_{t=1}^{T} \sum_{s=1}^{T} \mathbb{E}_{
        p(\vec{o}_s)
    } [  
    % D_{KL} ( \tilde{q} || \tilde{r}  )   ]  
    \infdiv{ \tilde{q} }{  \tilde{r} }    ]  
\end{equation}
where 
\begin{align}
\tilde{q} &= p(z_t \mid \vec{o}_{1:s}, z_{1:t-1}, a_{1:T})  \\
\tilde{r} &= r_{\phi}(z_t \mid \vec{o}_{1:s-1}, z_{1:t-1}, a_{1:T}).
\end{align}
Hence, we derive an upper bound of mutual information between sequence of multi-view observations and representations on condition of action sequence.

\end{proof}



\begin{lemma}
\label{lem:non-posi-entropy}
    For any view $v$ and time step $t$, we have 
    \begin{equation}
    H(O^{v}_{t} \mid Z_{t}, Z_{t-1}, A_{t-1}) \leq 0
\end{equation}
where $O^{v}_{t}$ is the observation of view $v$ at time step $t$, $Z_t$ and $Z_{t-1}$ are the representations at time steps $t$ and $t-1$, and $A_{t-1}$ is the action at time step $t-1$.
\end{lemma}

\begin{proof}
    According to the definition of entropy, we have 
\begin{equation}
    H(O^{v}_{t} \mid Z_{t}, Z_{t-1}, A_{t-1}) = - \int_{o^{v}_{t}} \int_{z_t} \int_{z_{t-1}} 
    \int_{a_{t-1}} f(o^{v}_{t}, z_t, z_{t-1}, a_{t-1}) \log  g(o^{v}_{t} \ \mid \ z_t, z_{t-1}, a_{t-1}) 
    d o^{v}_{t} d z_t d z_{t-1} d a_{t-1}
\end{equation}
where $f$ is joint probability density function and $g$ is conditional probability density function.

If there does not exist independent noise in the observation with respect to the representation, then according to the formulation of an episode rollout of a Markov Decision Process, we assert that at time step $t$ the representation $Z_t$ and the multi-view observation $O^v_t$ are mutually determined.
Since observation $o^{v}_{t}$ is fully determined by representation $z_t$, it follows that $f(o^{v}_{t}, z_t, z_{t-1}, a_{t-1}) = f(z_t, z_{t-1}, a_{t-1})$.
Using this result, we could reduce the integral into
\begin{multline}
    H(O^{v}_{t} \mid Z_{t}, Z_{t-1}, A_{t-1}) = -  \int_{z_t} \int_{z_{t-1}} \int_{a_{t-1}} 
    f( z_t, z_{t-1}, a_{t-1}) \Bigg[
     \int_{o^{v}_{t}} \log  g(o^{v}_{t} \ \mid \ z_t, z_{t-1}, a_{t-1}) d o^{v}_{t}
    \Bigg] \\
    d z_t d z_{t-1} d a_{t-1} = 0
\end{multline}
where the second equality holds since there exists only one observation instance $o^v_t$ in observation space which corresponds to the representation condition $z_t$. Otherwise, the probability density function $g$ is zero.

If there exists independent noise in observation with respect to representation, it follows that
\begin{multline}
    H(O^{v}_{t} \mid Z_{t}, Z_{t-1}, A_{t-1}) = -  \int_{z_t} \int_{z_{t-1}} \int_{a_{t-1}} 
    f( z_t, z_{t-1}, a_{t-1}) \Bigg[
     \int_{o^{v}_{t}} \log  g(o^{v}_{t} \ \mid \ z_t, z_{t-1}, a_{t-1}) d o^{v}_{t}
    \Bigg] 
    d z_t d z_{t-1} d a_{t-1} \\
    = - C \int_{z_t} \int_{z_{t-1}} \int_{a_{t-1}} 
    f^2(z_t, z_{t-1}, a_{t-1}) 
    d z_t d z_{t-1} d a_{t-1} 
    \leq 0
\end{multline}
where $C$ is a positive constant.

Specifically, given some view index $v$ and time step $t$, it follows from Bayes' theorem that 
\begin{equation}\label{eq:bayasian_thm}
    Pr(O^{v}_{t} \mid Z_t) = \frac{Pr(Z_t \mid O^{v}_{t}) \cdot Pr(O^{v}_{t}) }{ \int_{O'} Pr(Z_t \mid O') \cdot Pr(O') d O' }.
\end{equation}
According to the encode mapping, we know that $Z \sim \mathcal{N}(\mathbf{\mu}, \mathbf{\Sigma})$ where $\mathbf{\mu}$ is mean vector and $\mathbf{\Sigma}$ is covariance matrix. The mean and covariance matrix are determined by multi-view observations, i.e. $\mathcal{N}(\mathbf{\mu}, \mathbf{\Sigma}) = h(\vec{O})$ for some encoding mapping $h$. Note that an encoding mapping is mapping from observational space into representation space. In multi-view scenario, the observational space consists of multi-view observations instead of single-view observation.

According to graphical model illustrated in Figure \ref{fig:graphical_model} and Equation \ref{eq:bayasian_thm}, it holds that 
\begin{align}
    Pr(O^{v}_{t} \mid Z_t, Z_{t - 1}, A_{t - 1}) 
    &= Pr(Z_t, Z_{t-1}, A_{t - 1}) 
    \cdot Pr(O^{v}_{t} \mid Z_t) \cdot Pr(O^{v}_{t} \mid Z_{t - 1}, A_{t - 1}) \\
    &= \frac{Pr(Z_t \mid O^{v}_{t}) \cdot Pr(O^{v}_{t}) }{ \int_{O'} Pr(Z_t \mid O') \cdot Pr(O') d O' } \cdot Pr(O^{v}_{t} \mid Z_{t - 1}, A_{t - 1}) 
    \cdot Pr(Z_t, Z_{t-1}, A_{t - 1}) \\
    &= \frac{Pr(Z_t \mid O^{v}_{t}) \cdot Pr(O^{v}_{t}) }{ Pr(Z_t) } 
    \cdot Pr(O^{v}_{t} \mid Z_{t - 1}, A_{t - 1}) \cdot Pr(Z_t, Z_{t-1}, A_{t - 1}).
\end{align}
With this result, we could derive 
\begin{multline}
    \int_{o^{v}_{t}} \log  g(o^{v}_{t} \ \mid \ z_t, z_{t-1}, a_{t-1}) d o^{v}_{t} =  f(z_t, z_{t-1}, a_{t-1})  \cdot \int_{o^{v}_{t}} \log 
     \left( \frac{p_1(z_t \mid o^{v}_{t}) \cdot p_2(o^{v}_{t}) }{ p_3(z_t) } \cdot p_4(o^{v}_{t} \mid z_{t - 1}, a_{t - 1}) \right) d o^{v}_{t} \\ = (T_1 + T_2 - T_3 + T_4) \cdot f(z_t, z_{t-1}, a_{t-1})
\end{multline}
where
\begin{align}
 T_1 &:= \int_{o^{v}_{t}} \log p_1(z_t \mid o^{v}_{t})  d o^{v}_{t}, \\
 T_2 &:= \int_{o^{v}_{t}} \log p_2(o^{v}_{t})  d o^{v}_{t}, \\
 T_3 &:= \int_{o^{v}_{t}} \log p_3(z_t)  d o^{v}_{t} = \log p_3(z_t), \\
 T_4 &:= \int_{o^{v}_{t}} \log p_4(o^{v}_{t} \mid z_{t - 1}, a_{t - 1})  d o^{v}_{t}.
\end{align}
According to the definition of the encoder architecture, which includes an IVW structure, $p_1$ is a multi-dimensional Gaussian distribution. We assume that the observation distribution, representation distribution and transition probability distribution are all Gaussian, which is consistent with the model-based methods of previous works \cite{dreamerv2,DBC}. Since $T_1 + T_2 - T_3 = 0$ and $T_4 \geq 0$ due to the property of transition probability, it follows that the aforementioned positive value $C$ does exist.










\end{proof}



\begin{theorem}
The sequential multi-view total correlation between sequences of multi-view observation and representation on condition of action sequence has the following lower bound:
\begin{multline}
    SMTC(\vec{O}_{1:T}; Z_{1:T}  \mid A_{1:T}) 
    \geq 
    \sum_{v=1}^{V} \sum_{t=1}^{T}
    \bigg[ 
    H(O_{t}^{v} \mid Z_{t-1}, A_{t-1}) 
    + 
    \mathbb{E}_{
        p(z_{t}, o_{t}^{v} \mid z_{t-1}, a_{t-1} ) 
    }  
    \ln q_{\psi}^{v}(o_{t}^{v} \mid z_{t}, z_{t-1}, a_{t-1} )
    \bigg] \\   
    - \sum_{t=1}^{T} \sum_{s=1}^{T}
    \mathbb{E}_{
        p(\vec{o}_s)
    } \bigg[ 
    % D_{KL} ( p(z_t \mid o_s, \iota)  || r_{\phi}(z_t \mid \iota) )
    \infdiv{ p(z_t \mid \vec{o}_s, \iota) }{ r_{\phi}(z_t \mid \iota) }  
    \bigg]
\end{multline}
where the posterior distribution $q_{\psi}^{v}(o^{v}_{t} \mid z_{t}, z_{t-1}, a_{t-1}) \approx p(o^{v}_{t} \mid z_{t}, z_{t-1}, a_{t-1})$ is an approximate distribution parameterized by $\psi$, and $\iota$ and $r_{\phi}$ are defined in Lemma \ref{thm:lem2}.
\end{theorem}

\begin{proof}
    Applying Equation \ref{eq:ineq_1} in Lemma \ref{thm:lem1}, we could derive 
    \begin{align}
    SMTC(\vec{O}_{1:T}; Z_{1:T}  \mid A_{1:T}) 
    &=     
    \sum_{v=1}^{V} I(O^{v}_{1:T}; Z_{1:T} \mid A_{1:T}) - I(\vec{O}_{1:T}; Z_{1:T} \mid A_{1:T}) \\
    &\geq 
    \sum_{v=1}^{V} \sum_{t=1}^{T}
    I(O^{v}_{t}; Z_{t} \mid Z_{t-1}, A_{t-1}) 
    - I(\vec{O}_{1:T}; Z_{1:T} \mid A_{1:T})
    \end{align}

    Then by applying inequality \ref{eq:ineq_2} in Lemma \ref{thm:lem2}, we have
    \begin{equation}
    - I(\vec{O}_{1:T}; Z_{1:T} \mid A_{1:T}) 
    \geq
    - \sum_{t=1}^{T} \sum_{s=1}^{T}
    \mathbb{E}_{
        p(\vec{o}_s)
    } [
        % D_{KL} ( p(z_t \mid \iota ) || r_{\phi}(z_t \mid \iota) )
        \infdiv{ p(z_t \mid \vec{o}_s, \iota ) }{ r_{\phi}(z_t \mid \iota) }
    ]  
    \end{equation}
    where $\iota = (\vec{o}_{1:s-1}, z_{1:t-1}, a_{1:T})$.

    We split the first summand $I(O^{v}_{t}; Z_{t} \mid Z_{t-1}, A_{t-1})$ into subtraction of two entropy terms and derive a lower bound using Lemma \ref{lem:non-posi-entropy} as follows.
    \begin{equation} \label{eq:lwb_1}
    I(O^{v}_{t}; Z_{t} \mid Z_{t-1}, A_{t-1})     
    = H(O^{v}_{t} \mid Z_{t-1}, A_{t-1}) 
    - H(O^{v}_{t} \mid Z_{t}, Z_{t-1}, A_{t-1})  
    \geq H(O^{v}_{t} \mid Z_{t-1}, A_{t-1})
    \end{equation}

    According to Appendix A.1 Equation (11) and (12) in \cite{MVTC}, we could further lower-bound the negated double sum of the expected KL divergence between the true prior distribution $p$ and the approximate prior distribution $r_{\phi}$.
    \begin{multline} \label{eq:lwb_2}
    - \sum_{t=1}^{T} \sum_{s=1}^{T}
    \mathbb{E}_{
        p(\vec{o}_s)
    } [
        % D_{KL} ( p(z_t \mid \iota ) || r_{\phi}(z_t \mid \iota) )
        \infdiv{ p(z_t \mid \vec{o}_s, \iota ) }{ r_{\phi}(z_t \mid \iota) }
    ]  
    \geq      
    \sum_{v=1}^{V} \sum_{t=1}^{T}
     \mathbb{E}_{
        p(z_{t}, o_{t}^{v} \mid z_{t-1}, a_{t-1} ) 
    } [ \ln q_{\psi}^{v}(o_{t}^{v} \mid z_{t}, z_{t-1}, a_{t-1} )
    ] \\
    - \sum_{t=1}^{T} \sum_{s=1}^{T}
    \mathbb{E}_{
        p(\vec{o}_s)
    } [
        % D_{KL} ( p(z_t \mid \iota ) || r_{\phi}(z_t \mid \iota) )
        \infdiv{ p(z_t \mid \vec{o}_s, \iota ) }{ r_{\phi}(z_t \mid \iota) }
    ] 
    \end{multline} 
    
    Summing up inequalities \ref{eq:lwb_1} and \ref{eq:lwb_2}, we could have a tractable lower bound of sequential multi-view total correlation between sequences of multi-view observations and representations on condition of action sequence as follows.
    
    \begin{multline}
    SMTC(\vec{O}_{1:T}; Z_{1:T}  \mid A_{1:T}) 
    \geq 
    \sum_{v=1}^{V} \sum_{t=1}^{T}
    \big[ 
    H(O_{t}^{v} \mid Z_{t-1}, A_{t-1}) 
    + 
    \mathbb{E}_{
        p(z_{t}, o_{t}^{v} \mid z_{t-1}, a_{t-1} ) 
    }  
    \ln q_{\psi}^{v}(o_{t}^{v} \mid z_{t}, z_{t-1}, a_{t-1} )
    \big] \\   
    - \sum_{t=1}^{T} \sum_{s=1}^{T}
    \mathbb{E}_{
        p(\vec{o}_s)
    } \big[
    \infdiv{ p(z_t \mid \vec{o}_s, \iota) }{ r_{\phi}(z_t \mid \iota) }
    \big]
    \end{multline}
       

where $T$ is temporal length and $V$ is number of views.
\end{proof}









\subsection{Network Architectures and Hyperparameters}
\label{appendix:param}

\begin{table}[htb]
    \centering
    \begin{tabular}{lr}
        \toprule
        Parameter                & Value   \\
        \midrule
        learning rate            & $0.001$ \\
        optimizer                & Adam    \\
        number of views $V$      & $10  $  \\
        temporal length $T$      & $10   $  \\
        batch size               & $256 $  \\
        representation dimension & $64  $  \\
        discount factor $\gamma$  & $0.99$  \\
        number of random seeds    & 5 \\
        \bottomrule
    \end{tabular}
    \caption{Hyperparameters for \mname{} framework}
    \label{tab:params}
\end{table}






The hyperparameters are set empirically based on previous works; some of them are listed in Table \ref{tab:params}. In particular, it is sufficient to restrict $V$ and $T$ to small integers. Since $V$ and $T$ are small integers, the lower bound derived above is computationally efficient for approximating the sequential multi-view total correlation even though its form contains a double sum. In the following, we elaborate on the design of the network architectures for the encoder, decoder, actor and critic.










\textbf{Encoder Networks} 
The encoder architecture consists of three convolutional layers with $3 \times 3$ kernels, $32$ channels, stride $2$ and padding $1$, just like the auto-encoder architecture. ReLU is applied after each convolutional layer as the activation function. The output of the last convolutional layer is flattened and fed into a fully-connected layer, generating a $2048$-dimensional feature vector. This feature vector is passed to two further fully-connected layers, which produce the $64$-dimensional mean and the $64$-dimensional variance, respectively. 

\textbf{Decoder Networks} The decoder architecture starts with a fully-connected layer, transforming $64$-dimensional representation $Z$ into $2048$-dimensional feature vector. Then it consists of three transposed convolutional layers with $3 \times 3$ kernels, $32$ channels, stride $2$ and padding $1$. 


\textbf{Actor and Critic Networks} We follow the common implementation of SAC \cite{sacori}. The actor and critic networks are implemented by MLPs with $256$-dimensional hidden layers. The actor network has two separate output layers, a mean output layer and a variance output layer, whereas the critic network has a single $1$-dimensional output layer.






\subsection{Additional Experimental Results} 


\begin{table*}[htb]
    \centering
    \resizebox{\textwidth}{!}{
    \begin{tabular}{llllllll}
        \toprule
        Scores at 500k Steps    & DrQ           & RAD           & DreamerV2              & PI-SAC                & SLAC                   & DRIBO          & \mname{}                \\
        \midrule
		Cheetah, run & $797 \pm 116$ & $880 \pm 104$ & $841 \pm 57$ & $802 \pm 119$ & $881 \pm 116$ & $864 \pm 52$ & $ \mathbf{ 1019 \pm 107} $   \\ 
		Walker, walk & $930 \pm 46$ & $858 \pm 82$ & $966 \pm 117$ & $959 \pm 103$ & $930 \pm 107$ & $881 \pm 90$ & $ \mathbf{ 1036 \pm 30} $   \\ 
		Ball in cup, catch & $958 \pm 102$ & $-9 \pm 80$ & $955 \pm 82$ & $963 \pm 75$ & $983 \pm 98$ & $ \mathbf{ 1006 \pm 39} $ & $853 \pm 115$   \\ 
		Finger, spin & $738 \pm 79$ & $880 \pm 32$ & $366 \pm 105$ & $787 \pm 112$ & $947 \pm 58$ & $960 \pm 53$ & $ \mathbf{ 969 \pm 52} $   \\ 
		Acrobot, swingup & $228 \pm 50$ & $163 \pm 34$ & $209 \pm 106$ & $246 \pm 85$ & $ \mathbf{ 256 \pm 45} $ & $242 \pm 67$ & $247 \pm 51$   \\ 
		Humanoid, run & $470 \pm 60$ & $375 \pm 117$ & $436 \pm 104$ & $482 \pm 37$ & $453 \pm 117$ & $497 \pm 40$ & $ \mathbf{ 507 \pm 105} $   \\ 
		Hopper, hop & $454 \pm 89$ & $357 \pm 44$ & $347 \pm 52$ & $431 \pm 67$ & $453 \pm 41$ & $ \mathbf{ 488 \pm 38} $ & $484 \pm 33$   \\ 
		Fish, swim & $729 \pm 90$ & $546 \pm 114$ & $697 \pm 109$ & $694 \pm 102$ & $750 \pm 101$ & $ \mathbf{ 787 \pm 38} $ & $748 \pm 114$   \\ 
		Basic Manipulation (uh) & $130 \pm 30$ & $310 \pm 34$ & $108 \pm 39$ & $168 \pm 45$ & $246 \pm 34$ & $84 \pm 42$ & $ \mathbf{ 377 \pm 45} $   \\ 
		Basic Manipulation (ud) & $59 \pm 34$ & $ \mathbf{ 366 \pm 32} $ & $41 \pm 38$ & $198 \pm 38$ & $242 \pm 42$ & $93 \pm 43$ & $221 \pm 41$   \\ 
		Basic Manipulation (hd) & $68 \pm 45$ & $105 \pm 31$ & $78 \pm 33$ & $173 \pm 32$ & $177 \pm 33$ & $70 \pm 48$ & $ \mathbf{ 358 \pm 42} $   \\ 
		Basic Manipulation (uhd) & $86 \pm 47$ & $368 \pm 36$ & $62 \pm 40$ & $47 \pm 33$ & $136 \pm 30$ & $101 \pm 38$ & $ \mathbf{ 446 \pm 34} $   \\ 
		\midrule
        Scores at 100k Steps   & DrQ           & RAD           & DreamerV2              & PI-SAC                & SLAC                   & DRIBO          & \mname{}                \\
        \midrule
		Cheetah, run & $723 \pm 102$ & $577 \pm 109$ & $532 \pm 77$ & $683 \pm 100$ & $632 \pm 49$ & $496 \pm 119$ & $ \mathbf{ 940 \pm 76} $   \\ 
		Walker, walk & $725 \pm 83$ & $651 \pm 100$ & $923 \pm 98$ & $826 \pm 70$ & $452 \pm 97$ & $532 \pm 81$ & $ \mathbf{ 976 \pm 109} $   \\ 
		Ball in cup, catch & $959 \pm 65$ & $543 \pm 35$ & $ \mathbf{ 988 \pm 48} $ & $963 \pm 113$ & $612 \pm 114$ & $547 \pm 77$ & $813 \pm 68$   \\ 
		Finger, spin & $744 \pm 36$ & $679 \pm 87$ & $479 \pm 30$ & $761 \pm 49$ & $715 \pm 114$ & $262 \pm 34$ & $ \mathbf{ 932 \pm 78} $   \\ 
		Acrobot, swingup & $216 \pm 61$ & $139 \pm 53$ & $187 \pm 111$ & $227 \pm 78$ & $158 \pm 33$ & $112 \pm 52$ & $ \mathbf{ 230 \pm 103} $   \\ 
		Humanoid, run & $435 \pm 77$ & $318 \pm 114$ & $399 \pm 60$ & $435 \pm 109$ & $266 \pm 32$ & $248 \pm 93$ & $ \mathbf{ 479 \pm 47} $   \\ 
		Hopper, hop & $403 \pm 84$ & $327 \pm 101$ & $333 \pm 86$ & $394 \pm 75$ & $274 \pm 82$ & $223 \pm 61$ & $ \mathbf{ 456 \pm 91} $   \\ 
		Fish, swim & $671 \pm 58$ & $444 \pm 104$ & $634 \pm 98$ & $651 \pm 118$ & $470 \pm 108$ & $376 \pm 101$ & $ \mathbf{ 708 \pm 33} $   \\ 
		Basic Manipulation (uh) & $103 \pm 46$ & $147 \pm 38$ & $54 \pm 46$ & $117 \pm 40$ & $187 \pm 42$ & $56 \pm 32$ & $ \mathbf{ 234 \pm 31} $   \\ 
		Basic Manipulation (ud) & $47 \pm 47$ & $174 \pm 30$ & $20 \pm 41$ & $137 \pm 37$ & $ \mathbf{ 184 \pm 48} $ & $62 \pm 30$ & $137 \pm 31$   \\ 
		Basic Manipulation (hd) & $54 \pm 33$ & $50 \pm 41$ & $39 \pm 40$ & $119 \pm 32$ & $134 \pm 44$ & $47 \pm 35$ & $ \mathbf{ 222 \pm 47} $   \\ 
		Basic Manipulation (uhd) & $68 \pm 44$ & $175 \pm 32$ & $31 \pm 45$ & $32 \pm 37$ & $104 \pm 43$ & $67 \pm 37$ & $ \mathbf{ 277 \pm 46} $   \\ 

    \bottomrule
    \end{tabular}
    }
    \caption{Evaluation scores at 100k/500k steps.}
    \label{tab:scores}
\end{table*}


\textbf{Evaluation Scores at different Stages.}
We also summarize the evaluation scores of \mname{} and other baselines at 100k and 500k steps in Table \ref{tab:scores}. At 500k steps, \mname{} achieves the best performance among all methods in 7 out of 12 tasks. At 100k steps,
\mname{} achieves the best performance among all methods in 10 out of 12 tasks. This implies that \mname{} has a significant advantage in sample efficiency over other baselines, despite its less dominant performance at 500k steps.


\textbf{Predictability}. 
We provide predictions of future observations using the deconvolutional decoder in Figure \ref{fig:prediction}, showing that our method has good predictive ability for future states.


\begin{figure}[htb]
    \centering
    \includegraphics[width=0.7\columnwidth]{fig/prediction.pdf}
    \caption{Prediction of future states (observations)
    from two representations of observations in task: Walker, walk.}
    \label{fig:prediction}
    
\end{figure}


\subsection{Reasons for Poor Performance on Ball Catch}




We posit that one possible reason why \mname{} performs worse than other baselines in ``Ball in cup, catch'' is that there is too little difference among the multi-view observations in this scenario. Nevertheless, even though \mname{} may not be optimal for this single task, this does not impair its advantages over other baselines. Likewise, DRIBO \cite{DRIBO}
does not outperform other baselines in finger spin. We would like to emphasize that \mname{} is not designed to outperform all existing model-free RL methods, but rather to introduce a novel approach to image representation learning and demonstrate its effectiveness in a set of benchmark tasks.


\subsection{Summary of Visual Control Methods}

\begin{table}[htb]
    \centering
    \resizebox{0.8\textwidth}{!}{
    \begin{tabular}{cccc}
        \toprule
        Method    & Model-free & Representation & Information theoretic \\
        \midrule
        \mname{}  & \cmark     & \cmark         & \cmark                  \\
        DreamerV2 \cite{dreamerv2} & \xmark     & \xmark         & \xmark                  \\
        RAD \cite{RAD}      & \cmark     & \xmark         & \xmark                  \\
        DrQ \cite{drq}      & \cmark     & \xmark         & \xmark                  \\
        PI-SAC \cite{PISAC}   & \cmark     & \cmark         & \cmark                  \\
        SLAC \cite{SLAC}     & \cmark     & \cmark         & \xmark                  \\
        DRIBO \cite{DRIBO}    & \cmark     & \cmark         & \cmark                  \\
        \bottomrule
    \end{tabular}
    }
    \caption{Comparison among \mname{} and other baselines.}
    \label{tab:comp}
\end{table}


