% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{abbrvnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions

\usepackage{amsthm,amsmath,amssymb,amsfonts,exscale,latexsym,float,eucal}
\usepackage{xspace}
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{algorithm,algpseudocode}
\usepackage{multirow}
\usepackage{wrapfig}

\usepackage{url}
\usepackage{hyperref}
\hypersetup{
    colorlinks=true,
    linkcolor=blue,
    citecolor=cyan,
    filecolor=green,      
    urlcolor=magenta,
}

\usepackage{Definitions}

% Cross-reference shorthands: each expands to the object name, a non-breaking
% space, and \ref{<label>}.
% Algorithm reference, capital.
\def\Algref#1{Algorithm~\ref{#1}}
% Figure reference, lower-case.
\def\figref#1{figure~\ref{#1}}
% Figure reference, capital. For start of sentence
\def\Figref#1{Figure~\ref{#1}}
% Section reference, lower-case.
\def\secref#1{section~\ref{#1}}
% Section reference, capital.
\def\Secref#1{Section~\ref{#1}}


% \usepackage{algorithmic}


%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

% Adjust spacing inside the reference list by wrapping the existing
% thebibliography environment and overriding two lengths once it has started.
% \bibitemsep is a rubber length (.2\baselineskip, stretch/shrink .05\baselineskip);
% \bibparskip is fixed at 0pt.
\newlength{\bibitemsep}\setlength{\bibitemsep}{.2\baselineskip plus .05\baselineskip minus .05\baselineskip}
\newlength{\bibparskip}\setlength{\bibparskip}{0pt}
% Save the current definition so the wrapper below can delegate to it.
\let\oldthebibliography\thebibliography
\renewcommand\thebibliography[1]{%
  \oldthebibliography{#1}%
  % NOTE(review): \parskip receives \bibitemsep and \itemsep receives
  % \bibparskip; the names look crossed -- confirm this is intended.
  \setlength{\parskip}{\bibitemsep}%
  \setlength{\itemsep}{\bibparskip}%
}

\usepackage{xr}

\makeatletter
% \addFileDependency{<file>}: write "(<file>)" to the log/terminal, add <file>
% to LaTeX's file list via \@addtofilelist, and emit "No file <file>." if the
% file does not exist.
% Every line of the body ends with % so that the macro contributes no space
% tokens of its own, wherever it is called.
\newcommand*{\addFileDependency}[1]{% argument=file name and extension
  \typeout{(#1)}%
  \@addtofilelist{#1}%
  \IfFileExists{#1}{}{\typeout{No file #1.}}%
}
\makeatother

% \myexternaldocument{<basename>}: call xr's \externaldocument on <basename>
% and register <basename>.tex and <basename>.aux through \addFileDependency.
% NOTE(review): presumably this lets build tools (e.g. latexmk) notice the
% dependency on the external document -- confirm against the build setup.
\newcommand*{\myexternaldocument}[1]{%
    \externaldocument{#1}%
    \addFileDependency{#1.tex}%
    \addFileDependency{#1.aux}%
}

\myexternaldocument{Zhang_322-supp}

\title{Energy-based Predictive Representations \\for Partially Observed Reinforcement Learning}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1, 2, $^\star$]{\href{mailto:<tianjunz@berkeley.edu>?Subject=Your UAI 2023 paper}{Tianjun Zhang}}
\author[1, 3, $^\star$]{\href{mailto:<tongzheng@utexas.edu>?Subject=Your UAI 2023 paper}{Tongzheng Ren}}
\author[4]{Chenjun Xiao}
\author[2]{Wenli Xiao}
\author[2]{\\Joseph E. Gonzalez}
\author[1, 4]{Dale Schuurmans}
\author[1, 5]{\href{mailto:<bodai@google.com>?Subject=Your UAI 2023 paper}{Bo Dai}}
% Add affiliations after the authors
\affil[1]{%
    Google DeepMind
}
\affil[2]{%
    Department of EECS, University of California, Berkeley
}
\affil[3]{
    Department of Computer Science, University of Texas, Austin
  }
\affil[4]{Department of Computer Science, University of Alberta}
\affil[5]{School of Computational Science and Engineering, Georgia Tech}

  
\begin{document}
\maketitle

\begin{abstract}
In real-world applications, handling partial observability is a common requirement for reinforcement learning algorithms, which is not captured by a Markov decision process (MDP). Although partially observable Markov decision processes (POMDPs) have been specifically designed to address this requirement, they present significant computational and statistical challenges in learning and planning. In this work, we introduce the \emph{Energy-based Predictive Representation~(\algabb)} to provide a unified approach for designing practical reinforcement learning algorithms in both the MDP and POMDP settings. This framework enables coherent handling of \emph{learning, exploration, and planning} tasks. The proposed framework leverages a powerful neural energy-based model to extract an adequate representation, allowing for efficient approximation of Q-functions. This representation facilitates the efficient computation of confidence, enabling the implementation of optimism or pessimism in planning when faced with uncertainty. Consequently, it effectively manages the trade-off between exploration and exploitation. Experimental investigations demonstrate that the proposed algorithm achieves state-of-the-art performance in both MDP and POMDP settings.
\end{abstract}

\begingroup\let\thefootnote\relax\footnotetext{$^\star$ Equal Contribution}\endgroup

\section{Introduction}\label{sec:intro}
Reinforcement learning (RL) based on Markov Decision Processes (MDPs) has proven to be highly effective in various real-world decision-making problems~\citep{levine2016end, jiang2021towards}. However, the success of most RL algorithms~\citep{ren2022free,zhang2022making} heavily relies on the assumption that the agent has full observability of the environment state. In practice, this assumption is easily violated due to observational noise. To tackle this issue, Partially Observable Markov Decision Processes (POMDPs)~\citep{aastrom1965optimal} have been proposed to capture the inherent uncertainty resulting from partial observations.

However, the flexibility of POMDPs introduces significant challenges in terms of statistical and computational complexity for planning, exploration, and learning. Specifically, the presence of partial observability leads to a non-Markovian dependence on the \emph{entire history}, which expands the space of observation sequences and state space distributions, thereby posing substantial representation challenges. In fact, it has been proven that even the planning of finite-horizon tabular POMDPs is NP-hard without additional structural assumptions~\citep{papadimitriou1987complexity}, and the sample complexity for learning POMDPs can grow exponentially with respect to the horizon~\citep{jin2020sample}.
These complexities only become more demanding in continuous state spaces and real-world scenarios. 

On the other hand, despite the theoretical hardness in the worst case, there are sufficient structures in real-world POMDPs that can be exploited to bypass the aforementioned complexities. 
Recently, observable POMDPs with invertible emissions have been investigated to justify the finite-length sliding window heuristic in tabular cases~\citep{azizzadenesheli2016reinforcement,guo2016pac,jin2020sample, golowich2022learning},
which has been further extended with function approximation for large and continuous state POMDPs~\citep{wang2022embed,uehara2022provably}. Although these algorithms can exploit particular structure efficiently in terms of the sample complexity, they rely on unrealistic computation oracles, and are thus not applicable in practice. In this paper, we consider the following natural question:
\begin{center}
    \emph{How can one design {\bf efficient} and {\bf practical} algorithms for {\bf structured} POMDPs?}
\end{center}
In particular, we would like to exploit special structures that allow approximation to bypass inherent worst-case difficulties. By ``efficient'' we mean considering {\bf learning}, {\bf planning} and {\bf exploration} in a unified manner that can balance errors in each component and reduce unnecessary computation, while by ``practical'' we mean the algorithm retains sufficient flexibility and can be easily implemented and deployed in real-world scenarios. 

There have been many attempts to address this question. 
The most straightforward idea is to extend model-free RL methods, including policy gradient and $Q$-learning, with a memory-limited parametrization, \eg, recurrent neural networks~\citep{wierstra2007solving,hausknecht2015deep,zhu2017improving}. 
Alternatively, in model-based RL~\citep{kaelbling1998planning}, an approximation of the latent dynamics can be estimated and a posterior over latent states (\ie, beliefs) maintained, in principle allowing an optimal policy to be extracted via dynamic programming upon beliefs.
Following this idea,~\citet{deisenroth2012solving} and~\citet{igl2018deep,gregor2019shaping,zhang2019solar,lee2020stochastic} consider Gaussian process or deep model parametrizations, respectively. Such methods are designed based on implicit assumptions about structure through the parameterization choices of the models. However, these approaches suffer from sub-optimal performance due to several compounding factors: {\bf i), } approximation error from inaccurate parametrizations of the learnable components (policy, value function, model, belief), {\bf ii),} a sub-optimal policy induced by approximated planning (through policy gradient or dynamic programming), and {\bf iii),} the neglect of exploration when interacting with the environment.

Spectral representation approaches provide an alternative strategy based on extracting a sufficient representation that can support learning, planning and exploration.
In this vein~\citet{azizzadenesheli2016reinforcement} investigate spectral methods
for latent variable model estimation in POMDPs, but only consider tabular scenarios with finite state and action cases. Predictive State Representations~(PSR)~\citep{littman2001predictive} also leverage spectral decomposition, but instead of recovering an underlying latent variable model, they learn an \emph{equivalent sufficient} representation of belief. 
These methods have been extended to real-world settings with continuous observations and actions by exploiting kernel embeddings~\citep{boots2013hilbert} or deep models~\citep{downey2017predictive,venkatraman2017predictive, guo2018neural}. 
However, tractable planning and efficient exploration upon spectral representations have yet to be thoroughly developed~\citep{zhan2022pac}. 

In this paper, we propose \emph{Energy-based Predictive Representation~(\algabb)} to support efficient and tractable learning, planning, and exploration in POMDPs (and  MDPs), as a solution to the aforementioned question. More specifically:
\begin{itemize}[leftmargin=20pt, parsep=0pt, partopsep=0pt]
    \item We propose a flexible nonlinear energy-based model for induced belief-state MDPs \emph{without explicit parameterization of beliefs}, providing a principled \emph{linear sufficient} representation for the state-action value function. 
    \item We reveal the connection between~\algabb and PSR, while also illustrating the differences, to demonstrate the modeling ability of the proposed~\algabb. 
    \item We provide \emph{computationally-tractable} learning and planning algorithms for \algabb that implement the principles of optimism and pessimism in the face of uncertainty for online and offline RL, balancing exploration and exploitation.  
    \item We conduct a comprehensive comparison to existing state-of-the-art RL algorithms in both MDP and POMDP benchmarks, demonstrating superior empirical performance of the proposed~\algabb. 
\end{itemize}

\section{Preliminaries}\label{sec:prelim}
In this section, we introduce POMDPs and their degenerate case of MDPs, identifying the special structures that will be used to derive the proposed representation learning method. 

\paragraph{Partially Observable Markov Decision Processes.}
Formally, we define a partially observable Markov decision process~(POMDP) as a tuple $\Pcal = \rbr{\Scal, \Acal, \Ocal, r, H, \mu, P, O}$, where $H$ is a positive integer denoting the length of horizon; $\mu$ is the initial distribution of state, $r:\mathcal{S} \times \mathcal{A} \to [0, 1]$, the reward function, and $\Scal, \Acal, \Ocal$ denote the state, action and observation space, respectively. $P(\cdot|s, a): \Scal\times\Acal\rightarrow \Delta(\Scal)$ is the transition kernel, capturing the dynamics between states, and $O(\cdot|s):\Scal\rightarrow \Delta(\Ocal)$ is the emission kernel, where $\Delta\rbr{\cdot}$ denotes the set of probability measures over the support. 

Initially, given a state $s_1\sim \mu(s)$ as a starting point, at each step $h\in [1, H]$, the agent takes an action $a_h\in \Acal$ and receives reward $r(s_h, a_h)$; a new state is generated as $s_{h+1}\sim P(\cdot|s_h, a_h)$, and the agent observes $o_{h+1}\sim O(\cdot|s_{h+1})$. Due to partial observability, the dependence between observations is non-Markovian; hence, we define a policy $\pi = \{\pi_t\}$ where $\pi_t :\mathcal{O} \times (\mathcal{A} \times \mathcal{O})^{t} \to \Delta(\mathcal{A})$ to depend on the whole history, \ie, $x_t = \{o_0, \{a_i, o_{i+1}\}_{i=0}^{t-1}\}$. The corresponding value for policy $\pi$ can be defined as $ V^\pi = \EE_\pi\sbr{\sum_{h=1}^H r(s_h, a_h)}$, and the objective is to find the optimal policy $\pi^* = \argmax_\pi V^\pi$. 

Markov decision processes~(MDPs) are a degenerate case of POMDPs, where $\Scal = \Ocal$ and $O(o|s) = \delta(o=s)$, and can be specified as $\Mcal = \rbr{\Scal, \Acal, r, H, \mu, P}$. 
One can also convert a POMDP to an MDP by treating the whole history $x_t = \{o_0, \{a_i, o_{i+1}\}_{i=0}^{t-1}\}$ as the state. Specifically, following~\citet{kaelbling1998planning}, we define the belief  
$b: \mathcal{O} \times (\mathcal{A} \times \mathcal{O})^{t} \to \Delta(\mathcal{S})$, $\forall t\in \mathbb{N}^{+}$, which can be recursively defined as: $b(s_1|o_1) = P(s_1 | o_1)$, and
\begin{align} \label{eq:belief}
\textstyle
    b(s_{t+1}|x_{t+1}) = \frac{\int b(s_t|x_{t}) P(s_{t+1}|s_t, a_t) O(o_{t+1}|s_{t+1}) \dif s_{t}}{\iint b(s_t|x_{t}) P(s'|s_t, a_t) O(o_{t+1}|s') \dif s_{t} \dif s'}.
\end{align}

Each entry of the belief state describes the probability of the underlying state given the past history. 
Furthermore, with a slight abuse of notation, we use $b_t$ to denote the belief state at step $t$. Then, one can construct the equivalent belief MDP $\Mcal_b = \rbr{\Xcal, \Acal, R_h, H, \mu_b, T_b}$ with $\mathcal{X}$ denoting the set of possible histories, and 
\begin{small}
\begin{align}
\textstyle
    \mu_b &\defeq \int b(s|o_1)\mu(o_1)do_1, \\
    R_h(b, a) &= \int b_h(s_h) r(s_h, a) ds_h, \\
    T_b\rbr{b_{t+1}|b(x_t), a_t} &\defeq \int_{\mathcal{O}} \mathbf{1}_{b_{t+1} = b(x_{t+1})}P(o_{t+1}|b(x_t), a_t) \dif o_{t+1}. \label{eq:belief_mdp}
\end{align}
\end{small}

Therefore, the corresponding value function $V_{h}^{\pi}(b_h)$ and $Q_{h}^\pi(b_h, a_h)$ for the belief MDP given a policy $\pi$ can be defined as:
\begin{align}
\textstyle
    V_h^\pi(b_h) &= \EE\left[\sum_{t=h}^{H} R_t(b_t, a_t)|x_h\right], \\
    Q_h^{\pi}(b_h, a_h) &= \mathbb{E}\left[\sum_{t=h}^{H} R_t(b_t, a_t)|b_h, a_h\right]. 
\end{align}
Following the MDP perspective, we also have the Bellman recursive equation:
\begin{align}\label{eq:bellman}
    V_h^\pi(b_h) &= \mathbb{E}_\pi\left[Q_h^\pi(b_h, a_h)\right], \\
    Q_h^\pi(b_h, a_h) &= R_{h}(b_h, a_h) + \mathbb{E}_{T_b}\left[V_{h+1}^\pi(b_{h+1})\right].
\end{align}
One can still apply a dynamic programming style approach to solve POMDPs according to~\eq{eq:bellman}; however, since the belief depends on the entire history, the number of possible beliefs can still be infinite even if the number of states is finite.

To combat these essential difficulties, we will leverage two particular structures, observability and linearity, as introduced below. 

\paragraph{Observability in POMDPs.}
It has been shown% ~\citep{even2007value,
~\citep{golowich2022planning,uehara2022provably} that for POMDPs with an observability assumption, one can relax the history dependence with a short window, bypassing the exponential sample and planning complexity w.r.t.\ horizon length~\citep{golowich2022learning, golowich2022planning}. Specifically, the observability property for POMDPs is defined as follows.
\begin{assumption}[\citep{golowich2022planning}]\label{asmp:observability}
The POMDP with emission model $O$ satisfies $\kappa$-observability if for arbitrary beliefs $b$ and $b'$ over states, $\nbr{\inner{O}{b} - \inner{O}{b'}}_1\ge \kappa\nbr{b - b'}_1$, where $\inner{O}{b}\defeq \int O(o|s) b(s)ds$. 
\end{assumption}
A key consequence of observability is that the belief can be well approximated with a short history window~\citep{golowich2022planning}, and one can construct an approximate MDP based on a finite belief history, which eliminates the exponential complexity induced by full history dependence. Specifically, we denote $L$ as the length of the window. Then, defining $x_t^L = \cbr{o_{t-L}, \cbr{a_i, o_{i+1}}_{i=t-L}^{t-1}}\in\Xcal^L$, the approximated beliefs $b^L$ follow the same recursive definition as~\eq{eq:belief} but with only finite history $x_t^L$ starting from the uniform belief. This immediately induces an approximate MDP $\Mcal_b^L = \rbr{\Xcal^L,\Acal, R_h^L, H, \mu_b, T_b^L}$ according to~\eq{eq:belief_mdp} with $b^L$, instead of $b$. Theorem 2.1 in \citet{golowich2022learning} proves that the approximation error of the finite-memory belief MDP is small for observable POMDPs. Hence, with slight abuse of notation, we still use $b$ to represent $b^L$ throughout the paper.

\paragraph{Linearity in MDPs.}
To handle the complexity induced by large state spaces, linear/low-rank structures have been introduced in MDPs~\citep{jin2020provably} for effective  function approximation, which leverages spectral factorization of the transition dynamics and reward: 
\begin{align}
    P(s'|s, a) = \inner{\phi(s, a)}{\mu(s')}, \, r(s, a) = \inner{\phi(s, a)}{\theta},
\end{align}
where $\phi: \Scal \times \Acal\rightarrow \Hcal$, $\mu:\Scal \rightarrow \Hcal$ are two feature maps to a Hilbert space $\Hcal$. Under such an assumption, we can represent the state-action value function $Q^\pi$ for an arbitrary policy $\pi$ with discount factor $\gamma$ by: 
\begin{align}
\textstyle
    Q^\pi(s, a) &= r(s, a) + \gamma \int V^\pi(s')P(s'|s, a)ds' \\ 
    &= \inner{\phi(s, a)}{\theta + \gamma \int V^\pi(s')\mu(s')ds'},
\end{align}
which implies that instead of a complicated function space defined on the raw state space, one can design a computationally efficient planning and sample efficient exploration algorithm in the space linearly spanned by $\phi$. 
In fact, from the correspondence between policy and $Q$-function as discussed by~\citet{ren2023spectral}, $\phi$ can be understood as representing primitives for skill set construction. Efficient and practical algorithms have been designed for exploiting linearity in MDPs~\citep{zhang2022making, qiu2022contrastive}, which inspires us to exploit similar properties in POMDPs. 

\paragraph{Energy-based Models.} Energy-based models are among the most flexible models for representing a conditional probability measure. They take the form $p(y|x) = \exp(-f(x, y))/Z(x)$, where $f(x, y)$, which can be parametrized by deep models, is the energy of $(x, y)$ and $Z(x)$ is a partition function that depends only on $x$ and guarantees that $p(y|x)$ is a valid probability measure. When $y$ is discrete, we have $p(y|x) = \exp(-f(x, y))/\sum_{y'} \exp(-f(x, y'))$, which corresponds to the standard softmax probability with logits $-f(x, y)$. We refer interested readers to \citet{song2021train} for training methods for energy-based models.

\section{Energy-based Predictive Representation}\label{sec:epr}
We propose \emph{\AlgName~(\algabb)}, which introduces linear structure into finite-history approximated POMDPs, reducing the complexity induced by large state spaces and long histories, and thus, yielding improved efficiency for learning, planning and exploration. We emphasize that %although this discussion focuses on POMDPs, 
the proposed method is also applicable to MDPs. 

The approach builds upon recent progress in large-state MDPs~\citep{zhang2022making,qiu2022contrastive}
that leverages linear structure in the dynamics, $P(s'|s, a) = \inner{\phi(s, a)}{\mu(s')}$, to obtain an efficient and practical framework for learning, planning and exploration. 
Recall the construction of a finite-memory belief MDP to approximate a POMDP discussed in~\Secref{sec:prelim}, which avoids full history dependence.
For such a constructed belief MDP, a natural idea is to apply linear MDP algorithms, 
\ie, extracting the linear decomposition for $T_b^L(b^\prime|b, a) = \inner{\phi(b, a)}{\mu({b^\prime})}$, to handle the hardnesses of POMDPs mentioned in~\Secref{sec:intro}. 
However, there are several difficulties in such a straightforward extension:
\begin{itemize}[leftmargin=20pt, parsep=0pt, partopsep=0pt]
    \item[{\bf i},] the set of beliefs is proportional to the number of states, which could be infinite, which leads to difficulty in the parametrization and calculation of the belief;
    
    \item[{\bf ii},] the learning and factorization of the transition dynamics~\eq{eq:belief_mdp} in the equivalent belief MDP is difficult.
\end{itemize}
These difficulties hinder the extension of linear MDPs to observable POMDPs. However, note that we never explicitly require the beliefs and their dynamics, but only the representation $\phi(b, a)$. As beliefs are functions over finite-window histories, the representation can also be rewritten as $\phi(x_t, a_t)$, which suggests that one might bypass the inherent difficulties by a \emph{reparameterization trick}. 
Consider the energy-based parametrization
for $P(o_{t+1}|b(x_t), a_t)$ where $b(x_t)$ is the belief for history $x_t$:
\begin{align}
\textstyle
    % &T_b^L\rbr{b(x_{t+1})|b(x_t), a_t} 
    & P(o_{t+1}|b(x_t), a_t) \nonumber \\
    &=  p(o_{t+1}) \exp\rbr{f(x_t, a_t)^\top \rbr{g(o_{t+1}) + \lambda f(x_t, a_t)}}, \nonumber\\
    &\EE_{o_{t+1}}\sbr{\exp\rbr{f(x_t, a_t)^\top \rbr{g(o_{t+1}) + \lambda f(x_t, a_t)}}} = 1, \nonumber \\
    & \forall \rbr{x_t, a_t}\in \Xcal^L, \label{eq:energy_transition}
\end{align}
where $\lambda$ is a scalar, $p(o)$ is a fixed distribution and the normalization condition enforces that the energy-based model $P(o_{t+1}|b(x_t), a_t)$ is a valid distribution. 
We avoid any explicit parametrization and computation of beliefs $b$, while preserving dependence through $f$ and $g$, which will be learned jointly. 
Compared to standard parametrizations, we do not need to specify unnecessary model parameters for the transition dynamics $P$ and emission $O$, 
and bypass any learning and approximation of beliefs that induce compounding errors. 
As a special case, we note that the observable Linear-Quadratic Gaussian~(LQG) actually follows~\eq{eq:energy_transition} with a specific $\lambda$ and $p(o)$. See~\appref{appendix:lqg} for details.

Meanwhile, this approach also provides a linear factorization of $T_b(b_{t+1}|b_t, a_t)$ almost for free. By viewing the proposed parameterization~\eq{eq:energy_transition} as a kernel and following the random Fourier feature trick~\citep{rahimi2007random,ren2022free}, one can write
\begin{align}\label{eq:random_expansion}
\textstyle
    & P(o_{t+1} | b(x_t), a_t)
    =  \EE_{\omega}\sbr{\phi_{\omega}(x_{t}, a_t)\psi_\omega(o_{t+1})} \\
    &= \langle\phi_{\omega}(x_{t}, a_t), \psi_\omega(o_{t+1}) \rangle_{p(\omega)}, \nonumber
\end{align}
where $\omega_i\sim \mathcal{N}(0, I_d)$ and
\begin{align}\label{eq:random_phi}
\textstyle
    \phi_\omega(x_t, a_t) =&\left[\exp\left(\left(\lambda-\frac{1}{2}\right) \|f(x_t, a_t)\|^2 + \omega_i^\top f(x_t, a_t)\right)\right]_{i=1}^d, \\ 
    \psi_\omega(o_{t+1}) =& \left[ p(o_{t+1})\exp\left(\omega_i^\top g(o_{t+1}) - \frac{\|g(o_{t+1})\|^2}{2}\right ) \right]_{i=1}^d.
\end{align}
We provide a detailed derivation in Appendix~\ref{appendix:rf_deriviation}.

Substituting~\eq{eq:random_expansion} into~\eq{eq:belief_mdp} yields the factorization of $T_b$ as
\begin{align}
    &T_b\rbr{b_{t+1}|b_t, a_t} \nonumber \\
    =& \int_{\mathcal{O}} \mathbf{1}_{b_{t+1}= b(x_{t+1})} \EE_{\omega}\sbr{\phi_{\omega}(x_{t}, a_t)\psi_\omega(o_{t+1})} \dif o_{t+1} \nonumber\\
    =& \EE_\omega\sbr{\phi_{\omega}(x_{t}, a_t){\mu(b_{t+1})}}, \nonumber 
\end{align} 
where $\mu(b_{t+1})\defeq {\int_{\mathcal{O}} \mathbf{1}_{b_{t+1} = b(x_{t+1})}\psi_\omega(o_{t+1})\dif o_{t+1}}$. 
If $b_{t+1}$ can only be induced by a unique history $x_{t+1}$,
we obtain a valid linear representation $\phi(b_t, a_t) = \phi_\omega\rbr{x_t, a_t}$ with $\omega\sim\Ncal\rbr{0, I_d}$ as \AlgName~(\algabb) for a belief MDP without any explicit calculation of beliefs. Although the linear representation $\phi_\omega\rbr{x_t, a_t}$ is infinite dimensional, it can be approximated by Monte-Carlo approximation easily~\citep{rahimi2007random,ren2022free}. 
Even though we cannot obtain the original $\mu\rbr{\cdot}$, as discussed in~\Secref{sec:prelim}, we only need $\phi\rbr{b, a}$ to construct the linear space of $Q$-functions. 

To learn the \algabb given data $\Dcal\defeq\cbr{o_{t-1}, a_t, r_t}_{t=1}^H$, we exploit maximum likelihood estimation~(MLE) of~\eq{eq:energy_transition}, 
\begin{align}\label{eq:mle}
    \min_{f, g}\,\,& -\widehat\EE_\Dcal\sbr{f(x_t, a_t)^\top \rbr{g(o_{t+1})+\lambda f(x_t, a_t) }}, \\
    \st & \EE_{p(o_{t+1})}\sbr{ \exp\rbr{f(x_t, a_t)^\top \rbr{g(o_{t+1}) + \lambda f(x_t, a_t)}} } = 1, \nonumber \\ & 
    \forall \rbr{x_t, a_t}\in \Xcal^L. \nonumber
\end{align}
To ensure the constraints, we add a regularization term 
\begin{align}
\textstyle
    &\left(\log\left(\EE_{o}\sbr{\exp(f(x_t, a_t)^\top \rbr{g(o)+\lambda f(x_t, a_t) })}\right)\right)^2 \nonumber\\
    \approx&\left(\log\left(\frac{1}{m}\sum_{i=1}^m \exp(f(x_t, a_t)^\top \rbr{g(o_i)+\lambda f(x_t, a_t) })\right)\right)^2,\nonumber
\end{align}
with $o_i \sim p$. The objective will be
\begin{multline}
\textstyle
    \min_{f, g} \,\, \widehat\EE_\Dcal\bigg[-f(x_t, a_t)^\top \rbr{g(o_{t+1})+\lambda f(x_t, a_t) }\\
    + \alpha \rbr{\log\rbr{\frac{1}{m}\sum_{i=1}^m \exp(f(x_t, a_t)^\top  \rbr{g(o_i) +\lambda f(x_t, a_t) })}}^2\bigg]. \nonumber
\end{multline}
In practice, we can further simplify the objective by normalizing $f$, \ie, $\ftil(x_t, a_t) = \frac{f(x_t, a_t)}{\nbr{f(x_t, a_t)}_2}$, obtaining the final objective 
\begin{align}\label{eq:mle_epr}
\textstyle
    \min_{\ftil, g} \,\, & \widehat\EE_\Dcal \left[-\ftil(x_t, a_t)^\top {g(o_{t+1})} -\lambda + \right. \\ & \left. \alpha \left(\log\left(\frac{1}{m}\sum_{i=1}^m \exp(\ftil(x_t, a_t)^\top {g(o_i)} +\lambda)\right)\right)^2\right],\nonumber
\end{align}
which reduces to a contrastive loss that can be optimized by stochastic gradient descent with a deep network parameterization of $\ftil$ and $g$. We obtain negative samples $\cbr{o_i}_{i=1}^m\sim p(o)$ from a mixture of replay buffer and collected trajectories. 

Before we introduce an exploration-exploitation mechanism with~\algabb in~\Secref{subsec:online_offline}, we first discuss the relationship between the proposed~\algabb, predictive state representations~(PSR)~\citep{littman2001predictive,singh2004predictive}, and spectral dynamics embedding~(SPEDE)~\citep{ren2022free}. 

\paragraph{Connection to PSR~\citep{littman2001predictive,singh2004predictive}:} 
The predictive state representation~(PSR) was proposed to bypass belief calculation by factorizing the transition dynamics operator. Specifically, given the history $\rbr{x_t, a_t}$, the probability for observing a test, \ie, the finite sequence of events $ y = (a_{t+1}, o_{t+1}, \cdots, a_{t+k}, o_{t+k})$ with $k\in \NN$, is $p(y|x) \defeq  p(o_{t+1}^{t+k}|x_t, a_t^{t+k})$. For time step $t$, one can construct a set of core tests $U = \cbr{u_1, \ldots, u_k}$ as sufficient statistics for history $x_t$, such that for any test $\tau$, $p(\tau|x_t) = \inner{p(U|x_t)}{w_{\tau}}$ for some weights $w_{\tau}\in \RR^{\abr{U}}$. The  forward dynamics
can be represented in a PSR by Bayes' rule: 
$
p(\tau | x_t, a_t, o_{t+1}) = \frac{w_{\rbr{\tau, a_t, o_{t+1}}}^\top p(U|x_t)}{w_{\rbr{a_t, o_{t+1}}}^\top p(U|x_t)},
$
which implies that a PSR updates with new observations and actions by repeating a calculation for each $u_i\in U$. 
Although originally defined for tabular cases, PSRs have been extended to continuous observations by introducing kernels~\citep{boots2013hilbert} or neural networks~\citep{guo2018neural,downey2017predictive,hefny2018recurrent}. 

The proposed~\algabb shares similarities with PSR. Both factorize conditional distributions defined by the dynamics. However, these representations are designed for different purposes, and thus, with different usages and updates. Concretely, \algabb is proposed for seeking a linear space that can represent the $Q$-function.  The representation is designed to preserve linearity with successive observations without the need for Bayesian updates, which induce extra nonlinearity in PSRs. This linear property leads to efficient exploration and planning in \algabb; while an efficient exploration and planning algorithm has not yet been discussed for PSR.

\paragraph{Connection to SPEDE~\citep{ren2022free}:}  Linear random features have been proposed for solving planning in MDPs with nonlinear dynamics by~\citet{ren2022free}, where the transition operator is defined as $T\rbr{s'|s, a} \propto \exp\rbr{-\nbr{s' - f(s, a)}_2^2/(2\sigma^2)}$, corresponding to dynamics $s' = f(s, a) + \epsilon$ with Gaussian noise $\epsilon\sim \Ncal\rbr{0, \sigma^2 I}$. In addition to the generalization of~\algabb for POMDPs, even in an MDP, \algabb considers a general energy-based model, $T(s'|s, a) \propto p(s')\exp\rbr{f(s, a)^\top \rbr{g(s')+\lambda f(s, a)}}$ for the dynamics, which is far more flexible than the Gaussian perturbation model considered in SPEDE.  

\subsection{Online Exploration and Offline Policy Optimization with~\algabb}\label{subsec:online_offline}

With an~\algabb $\phi(x_t, a_t)$ learned for a POMDP, we can represent the $Q$-function linearly for the approximated belief MDP, and thus, achieve computationally efficient planning, while calculating confidence bounds for implementing the optimism/pessimism in the face of uncertainty.

\begin{algorithm}[tb] 
\caption{\AlgName}
\label{alg:prototype}
\begin{algorithmic}[1]
  \State \textbf{Input:} History Embedding $f(x, a)$, Observation Embedding $g(o)$, Random Feature $\{\omega_i\}_{i=1}^n$ where $\omega_i \sim \mathcal{N}(0, I_d)$, Initial Random Policy $\pi_0$, Initial Dataset $\mathcal{D} = \emptyset$ for online setting.
  \For{Episode $i=0, \cdots, K-1$}\label{step:iteration}
  \State Collect data $\{(x_{i, j}, a_{i, j}, o_{i, j}, r_{i, j})\}_{j=1}^{H}$ with $\tilde\pi_{i} = \xi \pi_i + (1 - \xi)\pi_0$, and add the data to $\mathcal{D}$.
  \State Optimize $f$ and $g$ with~\eq{eq:mle_epr} using the data from $\mathcal{D}$.
  \State Obtain the representation $\phi(x_t, a_t)$ via~\eq{eq:random_phi} using $\{\omega_i\}_{i=1}^n$.
  \State Add the bonus~\eq{eq:bonus} to the reward and obtain the optimal policy $\pi_{i+1}$ via SAC~\citep{haarnoja2018soft}, with $Q(x_t, a_t)$ parameterized upon $\phi(x_t, a_t)$ and optimized via FQI.\label{step:bonus}
  \EndFor
  \State \textbf{Return } $\pi_{K}$.
\end{algorithmic}
\end{algorithm}

\paragraph{Exploration and Exploitation with Elliptical Confidence Bound.}
Given the learned representation $\phi(x_t, a_t)$, the confidence bounds can be calculated efficiently, which allows efficient implementation of optimism/pessimism in the face of uncertainty via upper/lower confidence bound~\citep{abbasi2011improved, jin2020provably, uehara2021representation}. 
This is achieved simply by adding an additional elliptical bonus to the reward $R(x, a)$. Specifically, given the collected dataset $\mathcal{D} = \{(x^L_i, a_i, R_i, o_{i+1})\}_{i=1}^n$, we calculate the confidence bound as the bonus,
%\vspace{-2mm}
\begin{align}
    b(x_t, a_t) = \sqrt{\phi(x_t, a_t)^\top \Sigma_n^{-1} \phi(x_t, a_t)},
    \label{eq:bonus}
\end{align}
where $\lambda$ is a pre-specified hyperparameter, and $\Sigma_n = \lambda I + \sum_{i=1}^n \phi(x^L_i, a_i) \phi(x^L_i, a_i)^\top$. 
One can then implement UCB/LCB by adding/subtracting the bonus to $R(x_t, a_t)$, and performing planning on the modified reward function.

\paragraph{Planning with Obtained Representation.}
Planning can be conducted by Bellman recursion within the linear space spanned by $\phi(x^L_t, a_t)$ without a bonus. 
However, with an additional bonus term, the $Q^\pi$ no longer lies in the linear space of $\phi$, since 
\[
Q^\pi\rbr{x^L_t, a_t} = R(x^L_t, a_t) + b(x^L_t, a_t) + \EE_{T_b^L\pi}\sbr{Q^\pi\rbr{x_{t+1}, a_{t+1}}}. 
\]
As discussed in~\citep{zhang2022making}, one can augment the feature space $\psi\rbr{x, a}\defeq \cbr{\phi(x, a), b(x, a)}$ to ensure the $Q$-functions can still be linearly represented but with an extra $\Ocal\rbr{d^2}$ memory cost. 
In practice, we perform fitted $Q$ iteration with a nonlinear component extending the linear parameterization, \ie, $Q(x, a) = \cbr{w_1, w_2}^\top \sbr{\phi(x, a), \sigma\rbr{w_3^\top \phi(x, a)}}$. 

We provide an outline of our implementation of UCB in~\Algref{alg:prototype}. LCB for pessimistic offline RL is similar, but uses a pre-collected dataset $\hat\Dcal$ without data collection in Step~\ref{step:iteration}, and subtracts the bonus in Step~\ref{step:bonus}. Our algorithm follows the standard interaction paradigm between the agent and the environment, where the agent executes the policy and logs the data to the dataset. Then we perform representation learning and optimistic planning with the $Q$ function parameterized upon the learned representation. The optimistic planning can be done via soft actor-critic~\citep{haarnoja2018soft}. 

We also remark that the representation learning part of our algorithm can easily exploit extra offline data as a free warm start for the online improvement.


\section{Related Work}\label{sec:related_work}

\paragraph{Partial Observability in Reinforcement Learning.}

Despite the essential hardness of POMDPs in terms of learning, planning and exploration~\citep{papadimitriou1987complexity, vlassis2012computational, jin2020provably}, the study of reinforcement learning with partial observations, from both theoretical and empirical aspects, is still attractive due to its practical importance. 

Algorithmically, model-based/-free algorithms have been extended to POMDPs, explicitly or implicitly exploiting structure information. Model-based RL algorithms parameterize and learn latent dynamics with an emission model explicitly, and plan through simulation upon the learned models. A variety of deep models have been proposed recently for better modeling~\citep{watter2015embed,karl2016deep,igl2018deep,zhang2019solar,lee2020stochastic,hafner2019dream,hafner2020mastering}. Although deep models indeed provide better approximation ability, 
they also bring new challenges in terms of planning and exploration, which have not been fully handled. On the other hand, model-free RL algorithms have been extended for POMDPs by learning history-dependent value functions and/or policies, through temporal-difference algorithms or policy gradients. For example, deep $Q$-learning~\citep{mnih2013playing} concatenates $4$ consecutive frames as the input of a deep neural $Q$-net, which is then improved by recurrent neural networks for longer windows~\citep{bakker2001reinforcement,hausknecht2015deep,zhu2017improving}. Recurrent neural networks have also been exploited for history-dependent policies~\citep{schmidhuber1990reinforcement, bakker2001reinforcement,wierstra2007solving,heess2015memory} with policy gradient as well as actor-critic approaches~\citep{ni2021recurrent,meng2021memory}. Model-free RL for POMDPs bypasses the planning complexity of model-based RL algorithms. However, the difficulty in exploration remains, which leads to suboptimal performance in practice. By contrast, the proposed~\algabb not only can be efficiently learned, but is also equipped with principled planning and exploration methods, which has not been previously achieved. 

\paragraph{Representation Learning for RL.}
Successful vision-based representation learning methods have been extended to RL for extracting compact and invariant \emph{state-only} information from raw-pixels, \eg,~\citep{kostrikov2020image}. However, such vision-based features are not specially designed for capturing properties in POMDPs/MDPs essential for decision making.
To reveal structure that is particularly helpful for RL, many representation learning methods have been designed for different purposes, such as bisimulation~\citep{ferns2004metrics,gelada2019deepmdp,zhang2020learning}, successor features~\citep{dayan1993improving,barreto2017successor,kulkarni2016deep}, spectral decomposition of transition operators~\citep{mahadevan2007proto,wu2018laplacian,duan2019state}, latent future prediction~\citep{schwarzer2020data} and contrastive learning~\citep{oord2018representation,mazoure2020deep,nachum2021provable, yang2021trail}. These representation methods ignore the requirement of planning tractability. Moreover, they are learned from a pre-collected dataset, which ignores the exploration issue. 

Features that are able to represent value functions are desirable for efficient planning and exploration. Based on the linear MDP structure~\citep{jin2020provably}, several theoretical algorithms~\citep{agarwal2020flambe,uehara2021representation} have been developed. 
\citet{ren2022free,ren2023spectral, zhang2022making,qiu2022contrastive} bridge the gap between theory and practice and bypass computational intractability via different techniques, demonstrating advantages empirically. The proposed~\algabb is inspired by this class of representations, but extends it to POMDPs, which is highly non-trivial. 

\paragraph{Provable RL for POMDPs.} Besides the statistical and computational hardness results for learning and planning in POMDPs, most recent theoretical research focuses on overcoming the statistical complexity from the ``curse of history'' by considering tractable POMDPs~\citep{krishnamurthy2016pac,azizzadenesheli2016reinforcement,guo2016pac,jin2020sample, liu2022partially}. Similar to the \emph{observability} structure we exploit in our algorithm, these works bypass the curse of history via different special structures, reducing the whole history dependency to finite-length memory. Recently, \citet{uehara2022provably,wang2022embed} generalized these special structures with function approximation beyond tabular cases. 
\citet{golowich2022learning} consider the complexity of planning and exploration together with learning, but their results are only valid for tabular MDPs. 

\section{Experiments}

\begin{table*}[h]
\caption{Performance on various MuJoCo control tasks. All the results are averaged across 4 random seeds and a window size of 10K. Results marked with $^*$ are adopted from MBBL. \algabb achieves strong performance compared with baselines.}
\scriptsize
\setlength\tabcolsep{3.5pt}
\label{tab:MuJoCo_results}
\centering
\begin{tabular}{p{2cm}p{1.8cm}p{1.7cm}p{1.7cm}p{1.7cm}p{1.7cm}p{1.7cm}p{1.7cm}}
\toprule
& & HalfCheetah & Reacher & Humanoid-ET & Pendulum & I-Pendulum \\ 
\midrule  
\multirow{4}{*}{Model-Based RL} & ME-TRPO$^*$ & 2283.7$\pm$900.4 & -13.4$\pm$5.2 & 72.9$\pm$8.9 & \textbf{177.3$\pm$1.9} & -126.2$\pm$86.6\\
& PETS-RS$^*$  & 966.9$\pm$471.6 & -40.1$\pm$6.9 & 109.6$\pm$102.6 & 167.9$\pm$35.8 & -12.1$\pm$25.1\\
& PETS-CEM$^*$  & 2795.3$\pm$879.9 & -12.3$\pm$5.2 & 110.8$\pm$90.1 & 167.4$\pm$53.0 & -20.5$\pm$28.9\\
& Best MBBL & 3639.0$\pm$1135.8 & \textbf{-4.1$\pm$0.1} & 1377.0$\pm$150.4 & \textbf{177.3$\pm$1.9} & \textbf{0.0$\pm$0.0}\\
\midrule
\multirow{3}{*}{Model-Free RL} & PPO$^*$ & 17.2$\pm$84.4 & -17.2$\pm$0.9 & 451.4$\pm$39.1 & 163.4$\pm$8.0 & -40.8$\pm$21.0 \\
& TRPO$^*$ & -12.0$\pm$85.5 & -10.1$\pm$0.6 & 289.8$\pm$5.2 & 166.7$\pm$7.3 & -27.6$\pm$15.8 \\
& SAC$^*$ (3-layer)  & 4000.7$\pm$202.1 & -6.4$\pm$0.5 & \textbf{1794.4$\pm$458.3} & 168.2$\pm$9.5 & -0.2$\pm$0.1\\
\midrule
\multirow{3}{*}{Representation RL} & DeepSF & 4180.4$\pm$113.8 & -16.8$\pm$3.6 & 168.6$\pm$5.1 & 168.6$\pm$5.1 & -0.2$\pm$0.3\\
& SPEDE & 4210.3$\pm$92.6 & -7.2$\pm$1.1 & 886.9$\pm$95.2 & 169.5$\pm$0.6 & 0.0$\pm$0.0\\
& {\bf \algabb} & \textbf{5107.6$\pm$195.4} & \textbf{-5.6$\pm$0.3} & 1494.6$\pm$131.3 & {169.4$\pm$4.1} & \textbf{0.0$\pm$0.0}\\
\bottomrule 
\end{tabular}
\centering
\begin{tabular}{p{2cm}p{1.8cm}p{1.7cm}p{1.7cm}p{1.7cm}p{1.7cm}p{1.7cm}p{1.7cm}}
& & Ant-ET & Hopper-ET & S-Humanoid-ET & CartPole & Walker-ET \\ 
\midrule  
\multirow{4}{*}{Model-Based RL} & ME-TRPO$^*$ & 42.6$\pm$21.1 & 1272.5$\pm$500.9 & -154.9$\pm$534.3 & 160.1$\pm$69.1 & -1609.3$\pm$657.5\\
& PETS-RS$^*$ & 130.0$\pm$148.1 &  205.8$\pm$36.5 & 320.7$\pm$182.2 & 195.0$\pm$28.0 & 312.5$\pm$493.4 \\
& PETS-CEM$^*$ & 81.6$\pm$145.8 & 129.3$\pm$36.0 & 355.1$\pm$157.1 & 195.5$\pm$3.0 & 260.2$\pm$536.9 \\
& Best MBBL & 275.4$\pm$309.1 & 1272.5$\pm$500.9 & \textbf{1084.3$\pm$77.0} & \textbf{200.0$\pm$0.0} & 312.5$\pm$493.4\\
\midrule
\multirow{3}{*}{Model-Free RL} & PPO$^*$ & 80.1$\pm$17.3  & 758.0$\pm$62.0 & 454.3$\pm$36.7 & 86.5$\pm$7.8 & 306.1$\pm$17.2\\
& TRPO$^*$ & 116.8$\pm$47.3  & 237.4$\pm$33.5 & 281.3$\pm$10.9 & 47.3$\pm$15.7 & 229.5$\pm$27.1\\
& SAC$^*$ (3-layer) & 2012.7$\pm$571.3  & 1815.5$\pm$655.1 & 834.6$\pm$313.1 & \textbf{199.4$\pm$0.4} & \textbf{2216.4$\pm$678.7}\\
\midrule
\multirow{3}{*}{Representation RL} & DeepSF & 768.1$\pm$44.1  & 548.9$\pm$253.3 & 533.8$\pm$154.9 & 194.5$\pm$5.8 & 165.6$\pm$127.9\\
& SPEDE & 806.2$\pm$60.2  & 732.2$\pm$263.9 & 986.4$\pm$154.7 & 138.2$\pm$39.5 & 501.6$\pm$204.0\\
& {\bf \algabb} & \textbf{4081.3$\pm$973.9} & \textbf{2191.4$\pm$502.8}  & \textbf{1326.3$\pm$20.8} & \textbf{200.8$\pm$0.1} & \textbf{1975.4$\pm$751.3}\\
\bottomrule 
\end{tabular}
\end{table*}
Our experiments investigate how our algorithm performs in robotic locomotion simulation environments. We extensively evaluate the proposed approach on MuJoCo tasks in OpenAI Gym and on the DeepMind Control Suite. We conduct experiments in both the fully observable (MDP) and partially observable (POMDP) settings. 
\subsection{Fully Observable MDP}
\paragraph{Dense-Reward MuJoCo Tasks.} We first conduct experiments in the fully observable MDP setting on MuJoCo locomotion tasks. This is a test suite commonly used for both model-free and model-based RL algorithms. We compare \algabb with model-based RL baselines PETS~\citep{chua2018deep} and ME-TRPO~\citep{kurutach2018model}, and model-free RL baselines SAC~\citep{haarnoja2018soft}, TRPO~\citep{schulman2015trust} and PPO~\citep{schulman2017proximal}. In addition, we also compare to the representation learning RL baselines Deep Successor Feature (DeepSF)~\citep{kulkarni2016deep} and SPEDE~\citep{ren2022free}. We list the best model-based RL results (except for iLQR~\citep{li2004iterative}) in MBBL~\citep{wang2019benchmarking} for comparison. All algorithms are run for 200K environment steps. The results are averaged across four random seeds with a window size of 10K. As shown in Tab.~\ref{tab:MuJoCo_results}, \algabb significantly outperforms all the baselines including the strong previous SoTA model-free algorithm SAC. 

In particular, we observe that most model-based algorithms have a hard time learning the walking and hopping behaviors in the Walker and Hopper environments, respectively. We suspect that this is because the quality of the data is poor during the initial data collection process (e.g., the agent often falls to the ground or has a hard time standing up). As a result, the behavior learned by most model-based algorithms can be suboptimal. For example, some model-based algorithms only learn to stand up without hopping in the Hopper environment. In contrast, \algabb achieves SoTA performance in the Hopper and Ant tasks, demonstrating effective exploration in the task domain. 

\paragraph{Sparse-Reward DM Control Tasks.} Manually designed dense reward functions are extremely hard to obtain, and it is difficult to gain access to a good dense reward function in practical real-robot settings. Thus, exploration in sparse-reward settings is a key consideration for the success of RL in robotics. We compare our algorithm \algabb with SAC and PPO in such cases. Here we also compare with DeepSF as an additional representation RL baseline. Note that the critic networks used in SAC and PPO are deeper than that of \algabb. From Tab.~\ref{tab:DM_results}, we see that \algabb achieves a particularly large gain compared to SAC and PPO in the sparse-reward task \texttt{walker-run-sparse}.

\begin{table*}[h]
\caption{Performance on various DeepMind Control Suite tasks. All the results are averaged across four random seeds and a window size of 10K. Compared with SAC, our method achieves even better performance on sparse-reward tasks.}
\setlength\tabcolsep{3.5pt}
\label{tab:DM_results}
\centering
\begin{tabular}{p{3cm}p{2.4cm}p{1.8cm}p{2.8cm}p{1.8cm}p{2.8cm}p{1.8cm}p{1.8cm}}
\toprule
& & cheetah\_run & cheetah\_run\_sparse & walker\_run & walker\_run\_sparse & humanoid\_run \\ 
\midrule
Model-Based RL & Dreamer & 542.0 $\pm$ 27.7 & 499.9$\pm$73.3 & 337.7$\pm$67.2 & 95.4$\pm$54.7 & 1.0$\pm$0.2\\
\midrule
\multirow{3}{*}{Model-Free RL} & PPO & 227.7$\pm$57.9 & 5.4$\pm$10.8 & 51.6$\pm$1.5 & 0.0$\pm$0.0 & 1.1$\pm$0.0\\
& SAC (2-layer)  & 222.2$\pm$41.0 & 32.4$\pm$27.8 & 183.0$\pm$23.4 & 53.5$\pm$69.3 & 1.3$\pm$0.1\\
& SAC (3-layer)  & 595.2$\pm$96.0 & 419.5$\pm$73.3 & 700.9$\pm$36.6 & 311.5$\pm$361.4 & 1.2$\pm$0.1\\
\midrule  
\multirow{3}{*}{Representation RL} & DeepSF & 295.3$\pm$43.5 & 0.0$\pm$0.0 & 27.9$\pm$2.2 & 0.1$\pm$0.1 & 0.9$\pm$0.1\\
& Proto RL &  305.5$\pm$37.9 &  0.0$\pm$0.0 & 433.5$\pm$56.8 &  46.9$\pm$34.1 &  0.3$\pm$0.6\\
& {\bf \algabb} & \textbf{611.6$\pm$53.5} & \textbf{469.8$\pm$30.6} & \textbf{792.8$\pm$35.7} & \textbf{701.8$\pm$30.4} & \textbf{11.5$\pm$5.4}\\
\bottomrule 
\end{tabular}
\end{table*}

\subsection{Partially Observable MDP with Masked Velocities}
\paragraph{MuJoCo.} Often in practice, it is hard to recover a full observation of the states. Thus, the ability to handle a partially observed MDP (POMDP) is also important if we can only recover partial observations. To conduct experiments in this setting, we adopt a commonly used approach~\citep{ni2021recurrent, gangwani2020learning, weigand2021reinforcement} of masking the velocities in the observations.
We compare to algorithms with different embedding approaches that map a given history sequence to a latent representation, which is used as the input for a SAC planner.  

We consider four embedding methods as baselines: Transformer (Trans), which uses causal encoding with one layer and one head, and positional encoding (details in Appendix~\ref{appendix:experiments}); GRU; PSR~\citep{guo2018neural}; and a simple MLP baseline for sanity checking. The MLP baseline concatenates the history sequence and maps it to a latent feature using an MLP. 
We find that this setting is very challenging and the performance of all algorithms degrades compared to the fully observable setting, as shown in Tab.~\ref{tab:MuJoCo_results_POMDP}. Nevertheless, the proposed algorithm still achieves SoTA performance in tasks like HalfCheetah, Ant, and SlimHumanoid. This demonstrates the capability of \algabb to handle partial observability, which can have an important effect in practice. 

\begin{table*}[h]
\caption{Performance on various MuJoCo control tasks {\color{blue}with partial observation}. All the results are averaged across 4 random seeds and a window size of 10K. Results marked with $^*$ are adopted from MBBL. \algabb achieves strong performance compared with baselines.}
\setlength\tabcolsep{3.5pt}
\label{tab:MuJoCo_results_POMDP}
\centering
\begin{tabular}{p{3cm}p{2.5cm}p{2.5cm}p{2.5cm}p{2.5cm}p{2cm}p{2cm}p{2cm}}
\toprule
& & HalfCheetah & Humanoid-ET & Walker-ET\\ 
\midrule  
\multirow{2}{*}{Representation RL}
& {\bf \algabb} & \textbf{3441.6 $\pm$ 993.0} & \textbf{865.6 $\pm$ 107.3} & 416.6 $\pm$ 145.6\\
& { PSR} & 2679.75 $\pm$ 386 & 534.4 $\pm$ 36.6 & 862.4 $\pm$ 355.3 \\
\midrule
\multirow{3}{*}{Model-Free RL}
& { MLP} & 1612.0 $\pm$ 223 & 614.15 $\pm$ 67.6 & 236.5 $\pm$ 65.6 \\
& { Trans} & 1443.5 $\pm$ 227.2 & 387.1 $\pm$ 8.4 & 388.7 $\pm$ 224.9 \\
& { GRU} & 1664.3 $\pm$ 431.2 & 467.7 $\pm$ 43.4 &  \textbf{1020.6 $\pm$ 364.9} \\
\bottomrule 
\end{tabular}
\centering
\begin{tabular}{p{3cm}p{2.5cm}p{2.5cm}p{2.5cm}p{2.5cm}p{2cm}p{2cm}p{2cm}}
& & Ant-ET & Hopper-ET & S-Humanoid-ET \\ 
\midrule  
\multirow{2}{*}{Representation RL}
& {\bf \algabb} & \textbf{1508.1 $\pm$ 594.4} & \textbf{1059.4 $\pm$ 582.5} & \textbf{805.7 $\pm$ 65.9}\\
& { PSR } & 1128.3 $\pm$ 166.6    & 818.8 $\pm$ 87.2 & 493.3 $\pm$ 65.2 \\
\midrule
\multirow{3}{*}{Model-Free RL}
& { MLP} & 1262.0 $\pm$ 68.7 & 260.6 $\pm$ 63.7 & 294.17 $\pm$ 49.4 \\
& { Trans} & 928.5 $\pm$ 44.2 & 470.8 $\pm$ 50.3 & 447.9 $\pm$ 112.6 \\
& { GRU} & 1190.8 $\pm$ 79.4 & 777.5 $\pm$ 113.3 & 485.9 $\pm$ 25.8  \\
\bottomrule 
\end{tabular}
\end{table*}


\begin{table*}[h]
\caption{Performance on various DeepMind Control Suite tasks {\color{blue}with partial observation}. All the results are averaged across four random seeds and a window size of 10K. Our method achieves even better performance compared with the existing methods.}
\setlength\tabcolsep{3.5pt}
\label{tab:DM_results_POMDP}
\centering
\begin{tabular}{p{2cm}p{1cm}p{2.5cm}p{2.5cm}p{2.5cm}p{2.5cm}p{1.5cm}p{1.5cm}}
\toprule
& & cheetah\_run & cheetah\_run\_sparse & walker\_run & walker\_run\_sparse & humanoid\_run \\ 
\midrule
\multirow{1}{*}{Explicit Model}
& {SLAC} & {105.1 $\pm$ 30.1} & {0.0 $\pm$ 0.0} & {139.2 $\pm$ 3.4} & {0.0 $\pm$ 0.0} & {0.9 $\pm$ 0.1}\\
\midrule
\multirow{2}{*}{Model-Free RL} 
& MLP & \textbf{743.3 $\pm$ 7.2} & 0.0 $\pm$ 0.0 & 279.8 $\pm$ 190.6 & 0.0 $\pm$ 0.0 & 1.2 $\pm$ 0.1 \\
& Trans & 379.6 $\pm$ 80 & 0.0 $\pm$ 0.0 & 68.06 $\pm$ 39.9 & 0.0 $\pm$ 0.0 & 0.92 $\pm$ 0.1 \\
\midrule  
\multirow{2}{*}{Representation RL} 
& PSR & 173.7 $\pm$ 25.7 & 0.0 $\pm$ 0.0 & 57.4 $\pm$ 7.4 & 0.0 $\pm$ 0.0 & 0.89 $\pm$ 0.1 \\
& {\bf \algabb} & 526.5$\pm$61.1 & \textbf{411.0$\pm$51.6} & \textbf{509.8$\pm$24.4} & \textbf{460.3$\pm$51.6} & \textbf{6.1$\pm$2.5} \\
\bottomrule 
\end{tabular}
\end{table*}

\paragraph{DM Control Suite.} Correspondingly, we conduct POMDP experiments in the DM Control Suite. However, we find that masking all the velocities is very challenging, leading to trivial performance for all the competitors. We therefore mask only the last 3 dimensions of the velocity. As shown in Tab.~\ref{tab:DM_results_POMDP}, our~\algabb significantly outperforms the other competitors. 

\begin{figure}[h]
    \centering
    \includegraphics[width=0.45\textwidth]{figures/fetch_reach_image.png}
    \caption{\textbf{\algabb in image-based environment:} \algabb achieves good performance compared to SPR and SAC+AE.}
    \label{fig:reach}
\end{figure}

\subsection{Image-Based Environments}
To test our method on image-based environments, we conduct an additional experiment on MetaWorld~\citep{yu2020meta}. We choose one of the \texttt{fetch-reach} tasks and compare against the model-free algorithm SAC+AE~\citep{yarats2021improving} and a popular representation learning method SPR~\citep{schwarzer2020data}. We show the results in Fig.~\ref{fig:reach} and note that the minimum distance between the current state and the goal is used as the evaluation metric (a smaller distance means better performance). We can see that \algabb manages to reach the distant goal within 100K steps. Compared to SAC+AE, \algabb strictly dominates in performance. Compared to SPR, which learns faster at the beginning, \algabb has better final performance.

\section{Conclusion}

We exploit~\AlgName~(\algabb) for linearly representing value functions for arbitrary policies and supporting reinforcement learning in partially observed environments with finite memories. The proposed~\algabb shows that planning and  strategic  exploration can be implemented efficiently. The coherent design of each component brings  empirical advantages in  RL benchmarks considering both the MDP and POMDP settings. Such superior performance makes the theoretical understanding of~\algabb more intriguing, which we leave as future work. 


\bibliography{ref}

\end{document}
