% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \R{K}   -> upright "Regret" followed by the argument in parentheses.
% \gap{h} -> upright "gap" carrying the argument as a subscript.
\newcommand*{\R}[1]{\text{Regret}(#1)}
\newcommand*{\gap}[1]{\text{gap}_{#1}}
% Argument-free name shorthands.  \HLS keeps a trailing space inside its
% definition; \algname ends in \xspace, so the xspace package has to be
% loaded before \algname is first expanded.
\newcommand*{\HLS}{\text{HLS} }
\newcommand*{\image}{\text{Im}}
\newcommand*{\algname}{\text{ReLEX}\xspace}
\usepackage{xspace} 

%-----------------------------------------------------------------%
%
% The latex macro of the SMiLe group.
%                                       by Han Liu
%
%-----------------------------------------------------------------%

\RequirePackage{amsmath}
\RequirePackage{amssymb}
\RequirePackage{amsthm}
\RequirePackage{bm} 
\RequirePackage{url}
\usepackage{multirow}
\usepackage{natbib}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{makecell}
\usepackage{booktabs}
\usepackage{array}
\usepackage{url}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{dsfont}

% Auto-sized angle brackets.  \la opens with \left and \ra closes with \right,
% so the two must always be used as a matched pair.
\newcommand{\la}{\left\langle}
\newcommand{\ra}{\right\rangle}
% Replace the labels of the algorithmic package's require/ensure lines with
% bold "Input:" / "Output:".
\renewcommand{\algorithmicrequire}{\textbf{Input:}}
\renewcommand{\algorithmicensure}{\textbf{Output:}}
% \eps: shorthand for \varepsilon.
\def\eps{\varepsilon}


% From this point on, every \hat and \tilde in the document is typeset as the
% wide variant (\widehat / \widetilde); the narrow accents are no longer
% reachable under their usual names.
\let\hat\widehat
\let\tilde\widetilde
% \given: vertical bar with a thin space on each side, for conditioning.
\def\given{{\,|\,}}
% \ds: shorthand for \displaystyle.
\def\ds{\displaystyle}
% \wtilde: calligraphic W with a \sim stacked above it.  The W is \smash'ed and
% a zero-width rule of height 1.1ex sets the distance to the \sim.
\newcommand\wtilde{\stackrel{\sim}{\smash{\mathcal{W}}\rule{0pt}{1.1ex}}}

%----- bold fonts -----%

% Upright bold lowercase letters: \<letter>b -> \mathbf{<letter>}.
% Irregular names in this table: \bbb (b), \cbb (c), \nbb (n), \sbb (s).
\newcommand*{\ab}{\mathbf{a}}
\newcommand*{\bbb}{\mathbf{b}}
\newcommand*{\cbb}{\mathbf{c}}
\newcommand*{\db}{\mathbf{d}}
\newcommand*{\eb}{\mathbf{e}}
\newcommand*{\fb}{\mathbf{f}}
\newcommand*{\gb}{\mathbf{g}}
\newcommand*{\hb}{\mathbf{h}}
\newcommand*{\ib}{\mathbf{i}}
\newcommand*{\jb}{\mathbf{j}}
\newcommand*{\kb}{\mathbf{k}}
\newcommand*{\lb}{\mathbf{l}}
\newcommand*{\mb}{\mathbf{m}}
\newcommand*{\nbb}{\mathbf{n}}
\newcommand*{\ob}{\mathbf{o}}
\newcommand*{\pb}{\mathbf{p}}
\newcommand*{\qb}{\mathbf{q}}
\newcommand*{\rb}{\mathbf{r}}
\newcommand*{\sbb}{\mathbf{s}}
\newcommand*{\tb}{\mathbf{t}}
\newcommand*{\ub}{\mathbf{u}}
\newcommand*{\vb}{\mathbf{v}}
\newcommand*{\wb}{\mathbf{w}}
\newcommand*{\xb}{\mathbf{x}}
\newcommand*{\yb}{\mathbf{y}}
\newcommand*{\zb}{\mathbf{z}}

% Bold math-italic lowercase letters via the bm package: \b<letter> -> \bm{<letter>}.
% The letters f and m are named \bbf and \bbm because \bf and \bm are already
% taken (\bm is the very command used in these definitions).
\newcommand{\ba}{\bm{a}}
\newcommand{\bb}{\bm{b}}
\newcommand{\bc}{\bm{c}}
\newcommand{\bd}{\bm{d}}
\newcommand{\be}{\bm{e}}
\newcommand{\bbf}{\bm{f}}
\newcommand{\bg}{\bm{g}}
\newcommand{\bh}{\bm{h}}
% Was \bmf{i}: \bmf is not defined anywhere, so \bi failed with
% "Undefined control sequence" on first use.
\newcommand{\bi}{\bm{i}}
\newcommand{\bj}{\bm{j}}
\newcommand{\bk}{\bm{k}}
\newcommand{\bl}{\bm{l}}
\newcommand{\bbm}{\bm{m}}
\newcommand{\bn}{\bm{n}}
\newcommand{\bo}{\bm{o}}
\newcommand{\bp}{\bm{p}}
\newcommand{\bq}{\bm{q}}
\newcommand{\br}{\bm{r}}
\newcommand{\bs}{\bm{s}}
\newcommand{\bt}{\bm{t}}
\newcommand{\bu}{\bm{u}}
\newcommand{\bv}{\bm{v}}
\newcommand{\bw}{\bm{w}}
\newcommand{\bx}{\bm{x}}
\newcommand{\by}{\bm{y}}
\newcommand{\bz}{\bm{z}}




% Upright bold uppercase letters: \<Letter>b -> \mathbf{<Letter>}.
% The letter S is named \Sbb; every other letter follows the pattern.
\newcommand*{\Ab}{\mathbf{A}}
\newcommand*{\Bb}{\mathbf{B}}
\newcommand*{\Cb}{\mathbf{C}}
\newcommand*{\Db}{\mathbf{D}}
\newcommand*{\Eb}{\mathbf{E}}
\newcommand*{\Fb}{\mathbf{F}}
\newcommand*{\Gb}{\mathbf{G}}
\newcommand*{\Hb}{\mathbf{H}}
\newcommand*{\Ib}{\mathbf{I}}
\newcommand*{\Jb}{\mathbf{J}}
\newcommand*{\Kb}{\mathbf{K}}
\newcommand*{\Lb}{\mathbf{L}}
\newcommand*{\Mb}{\mathbf{M}}
\newcommand*{\Nb}{\mathbf{N}}
\newcommand*{\Ob}{\mathbf{O}}
\newcommand*{\Pb}{\mathbf{P}}
\newcommand*{\Qb}{\mathbf{Q}}
\newcommand*{\Rb}{\mathbf{R}}
\newcommand*{\Sbb}{\mathbf{S}}
\newcommand*{\Tb}{\mathbf{T}}
\newcommand*{\Ub}{\mathbf{U}}
\newcommand*{\Vb}{\mathbf{V}}
\newcommand*{\Wb}{\mathbf{W}}
\newcommand*{\Xb}{\mathbf{X}}
\newcommand*{\Yb}{\mathbf{Y}}
\newcommand*{\Zb}{\mathbf{Z}}

% Bold math-italic uppercase letters via the bm package: \b<Letter> -> \bm{<Letter>}.
\newcommand*{\bA}{\bm{A}}
\newcommand*{\bB}{\bm{B}}
\newcommand*{\bC}{\bm{C}}
\newcommand*{\bD}{\bm{D}}
\newcommand*{\bE}{\bm{E}}
\newcommand*{\bF}{\bm{F}}
\newcommand*{\bG}{\bm{G}}
\newcommand*{\bH}{\bm{H}}
\newcommand*{\bI}{\bm{I}}
\newcommand*{\bJ}{\bm{J}}
\newcommand*{\bK}{\bm{K}}
\newcommand*{\bL}{\bm{L}}
\newcommand*{\bM}{\bm{M}}
\newcommand*{\bN}{\bm{N}}
\newcommand*{\bO}{\bm{O}}
\newcommand*{\bP}{\bm{P}}
\newcommand*{\bQ}{\bm{Q}}
\newcommand*{\bR}{\bm{R}}
\newcommand*{\bS}{\bm{S}}
\newcommand*{\bT}{\bm{T}}
\newcommand*{\bU}{\bm{U}}
\newcommand*{\bV}{\bm{V}}
\newcommand*{\bW}{\bm{W}}
\newcommand*{\bX}{\bm{X}}
\newcommand*{\bY}{\bm{Y}}
\newcommand*{\bZ}{\bm{Z}}


%----- calligraphic fonts -----%

% Calligraphic uppercase letters: \c<Letter> -> \mathcal{<Letter>}.
% \cS and \cT carry an extra pair of braces around the \mathcal call; those
% braces are kept as they are.
\newcommand*{\cA}{\mathcal{A}}
\newcommand*{\cB}{\mathcal{B}}
\newcommand*{\cC}{\mathcal{C}}
\newcommand*{\cD}{\mathcal{D}}
\newcommand*{\cE}{\mathcal{E}}
\newcommand*{\cF}{\mathcal{F}}
\newcommand*{\cG}{\mathcal{G}}
\newcommand*{\cH}{\mathcal{H}}
\newcommand*{\cI}{\mathcal{I}}
\newcommand*{\cJ}{\mathcal{J}}
\newcommand*{\cK}{\mathcal{K}}
\newcommand*{\cL}{\mathcal{L}}
\newcommand*{\cM}{\mathcal{M}}
\newcommand*{\cN}{\mathcal{N}}
\newcommand*{\cO}{\mathcal{O}}
\newcommand*{\cP}{\mathcal{P}}
\newcommand*{\cQ}{\mathcal{Q}}
\newcommand*{\cR}{\mathcal{R}}
\newcommand*{\cS}{{\mathcal{S}}}
\newcommand*{\cT}{{\mathcal{T}}}
\newcommand*{\cU}{\mathcal{U}}
\newcommand*{\cV}{\mathcal{V}}
\newcommand*{\cW}{\mathcal{W}}
\newcommand*{\cX}{\mathcal{X}}
\newcommand*{\cY}{\mathcal{Y}}
\newcommand*{\cZ}{\mathcal{Z}}




%----- blackboard bold fonts-----%

% Blackboard-bold letters: doubled letter name -> \mathbb{<Letter>}.
% The letter S is named \SSS.  Only the letters listed here are provided.
\newcommand*{\CC}{\mathbb{C}}
\newcommand*{\EE}{\mathbb{E}}
\newcommand*{\VV}{\mathbb{V}}
\newcommand*{\II}{\mathbb{I}}
\newcommand*{\KK}{\mathbb{K}}
\newcommand*{\LL}{\mathbb{L}}
\newcommand*{\MM}{\mathbb{M}}
\newcommand*{\NN}{\mathbb{N}}
\newcommand*{\PP}{\mathbb{P}}
\newcommand*{\QQ}{\mathbb{Q}}
\newcommand*{\RR}{\mathbb{R}}
\newcommand*{\SSS}{\mathbb{S}}
\newcommand*{\TT}{\mathbb{T}}
\newcommand*{\ZZ}{\mathbb{Z}}
\newcommand*{\XX}{\mathbb{X}}
\newcommand*{\YY}{\mathbb{Y}}
% NOTE(review): \mathbb is applied to a Greek letter here; confirm that the
% loaded fonts actually provide a blackboard Omega before using \OOmega.
\newcommand*{\OOmega}{\mathbb{\Omega}}




%----- bold greek fonts -----%

% Bold Greek letters via the bm package: \b<name> -> \bm{\<name>}.
% Lowercase letters first, uppercase letters after the blank line.
\newcommand*{\balpha}{\bm{\alpha}}
\newcommand*{\bbeta}{\bm{\beta}}
\newcommand*{\bgamma}{\bm{\gamma}}
\newcommand*{\bepsilon}{\bm{\epsilon}}
\newcommand*{\bvarepsilon}{\bm{\varepsilon}}
\newcommand*{\bzeta}{\bm{\zeta}}
\newcommand*{\btheta}{\bm{\theta}}
\newcommand*{\bvartheta}{\bm{\vartheta}}
\newcommand*{\bkappa}{\bm{\kappa}}
\newcommand*{\blambda}{\bm{\lambda}}
\newcommand*{\bmu}{\bm{\mu}}
\newcommand*{\bnu}{\bm{\nu}}
\newcommand*{\bxi}{\bm{\xi}}
\newcommand*{\bpi}{\bm{\pi}}
\newcommand*{\bvarpi}{\bm{\varpi}}
% NOTE(review): unlike its neighbours, \brho expands to the *variant* glyph
% \varrho rather than \rho -- confirm this is intended before relying on it.
\newcommand*{\brho}{\bm{\varrho}}
\newcommand*{\bsigma}{\bm{\sigma}}
\newcommand*{\bvarsigma}{\bm{\varsigma}}
\newcommand*{\btau}{\bm{\tau}}
\newcommand*{\bupsilon}{\bm{\upsilon}}
\newcommand*{\bphi}{\bm{\phi}}
\newcommand*{\bvarphi}{\bm{\varphi}}
\newcommand*{\bchi}{\bm{\chi}}
\newcommand*{\bpsi}{\bm{\psi}}
\newcommand*{\bomega}{\bm{\omega}}

\newcommand*{\bGamma}{\bm{\Gamma}}
\newcommand*{\bDelta}{\bm{\Delta}}
\newcommand*{\bTheta}{\bm{\Theta}}
\newcommand*{\bLambda}{\bm{\Lambda}}
\newcommand*{\bXi}{\bm{\Xi}}
\newcommand*{\bPi}{\bm{\Pi}}
\newcommand*{\bSigma}{\bm{\Sigma}}
\newcommand*{\bUpsilon}{\bm{\Upsilon}}
\newcommand*{\bPhi}{\bm{\Phi}}
\newcommand*{\bPsi}{\bm{\Psi}}
\newcommand*{\bOmega}{\bm{\Omega}}


%----- Some standard definitions -----%

% Operators built directly on \mathop: in display style their sub- and
% superscripts are set as limits (below/above the operator name).
\newcommand{\argmin}{\mathop{\mathrm{argmin}}}
\newcommand{\argmax}{\mathop{\mathrm{argmax}}}
\newcommand{\minimize}{\mathop{\mathrm{minimize}}}

\newcommand{\sign}{\mathop{\mathrm{sign}}}
\newcommand{\tr}{\mathop{\mathrm{tr}}}

% Variance / correlation / covariance.  \DeclareMathOperator already typesets
% the operator name upright, so the obsolete two-letter \rm switch that used
% to wrap these names has been dropped.
\DeclareMathOperator{\Var}{Var}
\DeclareMathOperator{\Cor}{Corr}
\DeclareMathOperator{\Cov}{Cov}
\DeclareMathOperator{\ind}{\mathds{1}}  % Indicator
% \smallfrac{a}{b}: fraction forced into text style, e.g. inside a display.
\newcommand{\smallfrac}[2]{{\textstyle \frac{#1}{#2}}}

% Bold 0 and 1.  The outer braces let the macros be used directly as a
% sub- or superscript.
\newcommand*{\zero}{{\bm{0}}}
\newcommand*{\one}{{\bm{1}}}

% \diag is now a proper math operator (upright name, operator spacing)
% instead of a bare {\rm diag} group.
\DeclareMathOperator{\diag}{diag}



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


%%%%% Norms

% \norm{x}, \bignorm{x}: double-bar norm at normal and \bigg size.  These use
% amsmath's \lVert/\rVert so the bars are a real double-bar delimiter with
% opening/closing spacing, matching the \|...\| written out in the text,
% rather than two single | symbols placed side by side.
\newcommand{\norm}[1]{\lVert #1\rVert}
\newcommand{\bignorm}[1]{\biggl\lVert #1\biggr\rVert}
% \opnorm{A}{p}: triple-bar norm with subscript p, assembled from single bars
% pulled together with negative thin spaces.
\newcommand{\opnorm}[2]{| \! | \! | #1 | \! | \! |_{{#2}}}

%%%%% Dot product
% \dotp{x}{y}: fixed-size angle brackets (compare \inner below, which auto-sizes).
\newcommand{\dotp}[2]{\langle{#1},{#2}\rangle}

%%%%  brackets
% Auto-sized (\left...\right) delimiters around a single argument:
%   \inner{x}{y} <x,y>   \rbr (x)   \sbr [x]   \cbr {x}   \nbr ||x||   \abr |x|
\newcommand{\inner}[2]{\left\langle #1,#2 \right\rangle}
\newcommand{\rbr}[1]{\left(#1\right)}
\newcommand{\sbr}[1]{\left[#1\right]}
\newcommand{\cbr}[1]{\left\{#1\right\}}
\newcommand{\nbr}[1]{\left\|#1\right\|}
\newcommand{\abr}[1]{\left|#1\right|}

%%%%%%%%%  Other commands

% Reviewer notes in \tiny type: \mcomment puts the note in the margin,
% \fcomment puts it in a footnote.
\newcommand{\mcomment}[1]{\marginpar{\tiny #1}}
\newcommand{\fcomment}[1]{\footnote{\tiny #1}}
% \overbar{x}: \overline shortened by 1.5mu on each side, so adjacent bars do
% not touch.  (An earlier variant used 2mu.)
%\newcommand{\overbar}[1]{\mkern 2mu\overline{\mkern-2mu#1\mkern-2mu}\mkern 2mu}
\newcommand{\overbar}[1]{%
  \mkern 1.5mu\overline{\mkern-1.5mu#1\mkern-1.5mu}\mkern 1.5mu}
% \ud: upright differential d preceded by a thin space.
\newcommand*{\ud}{{\,\mathrm{d}}}


%%%%%%%%  amsmath %%%%%%%%%%
% Theorem style used by the \newtheorem declarations that follow:
% upright body text, bold head terminated by a period, \topsep above and below.
\newtheoremstyle{mytheoremstyle} % name
    {\topsep}                    % Space above
    {\topsep}                    % Space below
    {\normalfont}                   % Body font
    {}                           % Indent amount
    {\bfseries}                   % Theorem head font
    {.}                          % Punctuation after theorem head
    {.5em}                       % Space after theorem head
    {}  % Theorem head spec (can be left empty, meaning `normal')

% Make the style above the active one for every subsequent \newtheorem.
\theoremstyle{mytheoremstyle}

% Fallback end-of-proof marks and proof environment.  Each definition is made
% only when the name is still undefined at this point (\ifx against the
% undefined control sequence \undefined), so anything already supplied by the
% class or a package takes precedence.

% \BlackBox: solid 1.5ex square.
\ifx\BlackBox\undefined
\newcommand{\BlackBox}{\rule{1.5ex}{1.5ex}}  % end of proof
\fi

% \QED: small solid square, then a paragraph break and \medskip.
\ifx\QED\undefined
\def\QED{~\rule[-1pt]{5pt}{5pt}\par\medskip}
\fi

% proof: bold "Proof" head, closed by a right-flushed \BlackBox.
% NOTE(review): amsthm is loaded earlier in this preamble and normally provides
% its own proof environment, in which case this branch is skipped -- confirm.
\ifx\proof\undefined
\newenvironment{proof}{\par\noindent{\bf Proof\ }}{\hfill\BlackBox\\[2mm]}
%\newenvironment{proof}{\emph{Proof. }}{ \hfill \QED}
\fi

% Theorem-like environments.  Each one is declared only if its name is still
% undefined at this point, so a declaration made earlier (e.g. by the document
% class) is left untouched.
% All environments except `property' share the `theorem' counter; `property'
% is declared without the optional argument and therefore has its own counter.
\ifx\theorem\undefined
\newtheorem{theorem}{Theorem}
\fi
\ifx\example\undefined
\newtheorem{example}[theorem]{Example}
\fi
\ifx\property\undefined
\newtheorem{property}{Property}
\fi
\ifx\lemma\undefined
\newtheorem{lemma}[theorem]{Lemma}
\fi
\ifx\proposition\undefined
\newtheorem{proposition}[theorem]{Proposition}
\fi
\ifx\remark\undefined
\newtheorem{remark}[theorem]{Remark}
\fi
\ifx\corollary\undefined
\newtheorem{corollary}[theorem]{Corollary}
\fi
\ifx\definition\undefined
\newtheorem{definition}[theorem]{Definition}
\fi
\ifx\conjecture\undefined
\newtheorem{conjecture}[theorem]{Conjecture}
\fi
\ifx\fact\undefined
\newtheorem{fact}[theorem]{Fact}
\fi
\ifx\claim\undefined
\newtheorem{claim}[theorem]{Claim}
\fi
\ifx\assumption\undefined
\newtheorem{assumption}[theorem]{Assumption}
\fi
\ifx\condition\undefined
\newtheorem{condition}[theorem]{Condition}
\fi
% Number equations and the shared theorem counter within each section
% (section.number), resetting at every new \section.
\numberwithin{equation}{section}
\numberwithin{theorem}{section}



\title{Provably Efficient Representation Selection in Low-rank Markov Decision Processes: From Online to Offline RL}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<weightzero@ucla.edu>?Subject=Your UAI 2023 paper}{Weitong~Zhang}{}}
\author[1]{Jiafan~He}
\author[1]{Dongruo~Zhou}
\author[2,3]{Amy~Zhang}
\author[1]{Quanquan~Gu}
% \author[3]{Further~Coauthor}
% \author[3,1]{Further~Coauthor}
% % Add affiliations after the authors
\affil[1]{%
    Department of Computer Science\\
    University of California, Los Angeles\\
    California, USA
}
\affil[2]{%
    Department of Electrical and Computer Engineering\\
    University of Texas at Austin\\
    Texas, USA
}
\affil[3]{%
    Facebook AI Research
  }
  \begin{document}
\maketitle
\begin{abstract}
The success of deep reinforcement learning (DRL) lies in its ability to learn a representation that is well-suited for the exploration and exploitation task. To understand how the choice of representation can improve the efficiency of reinforcement learning (RL), we study representation selection for a class of low-rank Markov Decision Processes (MDPs) where the transition kernel can be represented in a bilinear form. We propose an efficient algorithm, called \algname, for representation learning in both online and offline RL. Specifically, we show that the online version of \algname, called \algname-UCB, always performs no worse than the state-of-the-art algorithm without representation selection, and achieves a strictly better constant regret if the representation function class has a ``coverage'' property over the entire state-action space. For the offline counterpart, \algname-LCB, we show that the algorithm can find the optimal policy if the representation class can cover the state-action space and achieves gap-dependent sample complexity. This is the first result with constant sample complexity for representation learning in offline RL.
\end{abstract}

\section{Introduction}
Reinforcement Learning (RL) has achieved impressive results in game-playing \citep{mnih2013playing}, robotics \citep{kober2013reinforcement}, and many other tasks. However, most current RL tasks are challenging due to large state-action spaces that make traditional tabular methods intractable. Instead, function approximation methods can be applied to tackle this challenge. In this scheme, the state-action pairs are compressed to provide some compact \emph{representations} that leverage the underlying structure in the MDP, and therefore allow the algorithm to generalize to unseen states.

In modern approaches, deep neural networks are often used as feature extractors to generate these representations. Since different feature extractors powered by different pretrained neural networks can be used, multiple valid representations are generated to encode the same state-action pair. However, how to select the \emph{best} representation for different scenarios is not well addressed in the literature. Nonetheless, this task is crucial in many applications such as robotics, where a robot is usually equipped with different types of sensors working through different physical phenomena \citep{de2018integrating}, like accelerometers, magnetic sensors, or laser sensors. These sensors estimate the current state of the robot and provide a representation of the current state as the output. However, the accuracy and robustness of these sensors vary in different states. Thus an intelligent system should utilize the most accurate and robust sensor in different states to achieve the best performance.

For online reinforcement learning, existing works on representation learning \citep{jiang2017contextual, agarwal2020flambe, modi2021model, uehara2021representation, sun2019model, du2021bilinear} often assumed that the transition dynamic can be represented as a linear function of an unknown representation, and they proposed algorithms to learn a single representation with provable sample complexity guarantees. They do not consider the possibility of using different representations for different scenarios (i.e., state-action pairs). On the other hand, for offline reinforcement learning, representation learning is much less studied, with only a few notable exceptions \citep{uehara2021representation, zhang2022making}. Nevertheless, neither of these works considers selecting different representations for different scenarios.

Based on the above motivation, we are interested in the following research question:

\begin{center}
\emph{Can selecting a good representation improve sample efficiency in (online and offline) RL?}
\end{center}

In this paper, we answer the above question affirmatively for a class of low-rank Markov Decision Processes (MDPs) named bilinear MDP \citep{yang2020reinforcement}, where the transition kernel $\PP(s' | s, a)$ can be written as a bilinear form of a known feature map $\bphi(s, a)$, unknown matrix $\Mb^*$, and known feature map $\bpsi(s')$. Our goal is to select the best representation $\bphi(s,a)$ from a finite representation class $\Phi$ for different $(s,a)$ such that the resulting RL algorithm outperforms that using a single representation for all state-action pairs. For both online and offline reinforcement learning, we propose an algorithm called \algname, which can select the best representation in a representation function class in different scenarios. The key idea behind the representation selection is to choose the representation which gives the smallest optimistic Q-value function\footnote{For offline RL, our algorithm chooses the representation which gives the largest pessimistic Q-value function.}. Our contributions are summarized as follows:

\begin{itemize}[leftmargin=*,nosep]
\item In the context of online reinforcement learning, we propose a novel algorithm named \algname-UCB, which capitalizes on the benefits of representation selection. Our results show that \algname-UCB performs as well as the state-of-the-art algorithms that do not select representations, and attains a strictly superior regret bound when the representation function class has good coverage for all state-action pairs under the optimal policy.
\item For offline reinforcement learning, we introduce \algname-LCB as a counterpart to \algname-UCB for the online setting. We demonstrate that \algname-LCB is capable of identifying the optimal policy with gap-dependent sample complexity of the offline data. Furthermore, when the representation function class satisfies certain coverage assumptions under the behavior policy, our algorithm enjoys a constant sample complexity, which represents a novel contribution to this line of research.
\item To validate the effectiveness of representation selection and the superiority of our algorithms, we conduct empirical studies on various MDPs with different representation functions. Our experimental results demonstrate that both \algname-UCB and \algname-LCB outperform any single representation function in the respective settings, thus confirming the power of representation selection and the advantages of our proposed algorithms.
\end{itemize}

\noindent\textbf{Notation.} Scalars and constants are denoted by lower and upper case letters, respectively. Vectors are denoted by lower case boldface letters $\xb$, and matrices by upper case boldface letters $\Ab$. We denote by $[k]$ the set $\{1, 2, \cdots, k\}$ for positive integers $k$. For two non-negative sequences $\{a_n\}, \{b_n\}$, $a_n = \cO(b_n)$ means that there exists a positive constant $C$ such that $a_n \le Cb_n$, and we use $\tilde \cO(\cdot)$ to hide the $\log$ factor in $\cO(\cdot)$ except for the episode number $k$. We denote by $\|\cdot\|_2$ the Euclidean norm of vectors and the spectral norm of matrices and by $\|\cdot\|_{\mathrm F}$ the Frobenius norm of a matrix. We denote the Loewner ordering between two symmetric matrices as $\Ab \succeq \Bb$ if $\Ab - \Bb \succeq \zero$. For a vector $\xb \in \RR^d$, we denote by $\xb_{[i]}$ the $i$-th element of $\xb$; for a matrix $\Ab \in \RR^{d \times d}$, we denote by $\Ab_{[ii]}$ the $i$-th diagonal element. For any symmetric matrix $\Ab \in \RR^{d \times d}$ and vector $\xb \in \RR^d$, we denote %$\|\xb\|_{\Ab}^2 = \xb^\top \Ab \xb$ and 
$\|\xb\|_{\Ab} = \sqrt{\xb^\top \Ab \xb}$. We define $\Ib$ as the identity matrix. We denote the image space of a matrix $\Ab$ as $\mathrm{Im}(\Ab)$, and a vector is in the image space $\xb \in \mathrm{Im}(\Ab)$ if there exists a vector $\yb$ such that $\xb = \Ab \yb$.

\section{Related Works}
In this section, we discuss related works on representation learning and selection in both online and offline RL. Additional related works are discussed in Appendix~1.

Learning good representations in reinforcement learning has a long history. One of the earliest methods for aggregating different states and generating a compressed representation for those states is state aggregation \citep{michael1995reinforcement, dean1997model, ravindran2002model, abel2016near}. In deep RL, deep neural networks have been used to learn good representations in different settings \citep{diuk2008object, stooke2020decoupling, yang2020offline}. Several theoretical works \citep{du2019provably1, misra2020kinematic, foster2020instance} have studied the Block MDP where the dynamics are governed by a discrete latent state space and proposed algorithms based on decoding the latent state space from the observations. \citet{Du2020Is} showed that having a good approximate representation for the Q-function, transition kernel, or optimal Q-function is not sufficient for efficient learning, and can still have an exponential sample complexity unless the quality of the approximation is above a certain threshold. In the linear function approximation setting, several representation learning algorithms have been proposed. For example, \citet{jiang2017contextual} proposed a model-free algorithm called OLIVE, which can learn the correct representation from a representation function class (in the realizable setting). \citet{modi2021model} improved the OLIVE algorithm by proposing the MOFFLE algorithm, which is computationally efficient. On the other hand, \citet{agarwal2020flambe} proposed a model-based algorithm, FLAMBE, which can find the correct representation from the representation function class. \citet{uehara2021representation} improved FLAMBE by combining the maximum likelihood estimator and optimistic estimation (resp. pessimistic estimation) for representation learning in online RL (resp. offline RL). 
Some recent works \citep{qiu2022contrastive, zhang2022making} have used contrastive learning instead of the maximum likelihood estimator in \citet{agarwal2020flambe,uehara2021representation} to obtain more practical algorithms.

All the aforementioned works focus on learning the ``correct'' representation, which can well approximate the underlying transition kernel. In contrast, we pursue a different objective, which is to select a good representation adaptively for different state-action pairs from a class of correct representations, which can potentially lead to better performance. To achieve this objective, \citet{papini2021leveraging} proposed an algorithm, LEADER, which leverages good representations in linear contextual bandits. Independent of our work, \citet{papinireinforcement} extended the representation selection in linear contextual bandits to linear MDPs \citep{jin2020provably}. The differences between their work and ours are as follows. First, they considered the linear MDP setting, which yields a linear dependence on the size of the representation class (i.e., $|\Phi|$) in their regret, while we studied a special linear MDP called bilinear MDP \citep{yang2020reinforcement}, which enjoys a logarithmic dependency on $|\Phi|$ (i.e., $\log(|\Phi|)$) in our regret. Second, we also consider representation learning in offline RL, which, to our knowledge, has not been considered before in the literature. Compared to previous results on representation learning in offline RL \citep{zhang2022making, uehara2021representation}, we provide the first gap-dependent sample complexity.

\section{Preliminaries}
We consider time-inhomogeneous episodic Markov Decision Processes (MDP), denoted by $\cM(\cS, \cA, H, \{r_h\}_{h=1}^H, \{\PP_h\}_{h=1}^H)$. Here, $\cS$ is the state space, $\cA$ is the finite action space, $H$ is the length of each episode, $r_h: \cS \times \cA \mapsto [0, 1]$ is the reward function at step $h$, and $\PP_h(s'| s, a)$ denotes the probability for state $s$ to transition to state $s'$ with action $a$ at step $h$. We further assume that the initial state $s_1$ is randomly sampled from a distribution $\mu$.

Given the MDP, we consider a deterministic policy $\pi = \{\pi_h\}_{h=1}^H$ as a sequence of functions where $\pi_h: \cS \mapsto \cA$ maps a state $s$ to an action $a$. For each state-action pair $(s, a) \in \cS \times \cA$ at time-step $h$, given the policy $\pi$, we denote the Q-function and value function as follows:
\begin{align*}
Q_h^\pi(s, a) &= r_h(s, a) + \EE\left[\sum_{h' = h + 1}^H r_{h'}(s_{h'},\pi_{h'}(s_{h'}))\right], \\
V_h^\pi(s) &= Q_h^\pi(s, \pi_h(s)),
\end{align*}
where $s_h = s, a_h = a$ and for all $h' \in [h, H]$, the distribution of $s_{h' + 1}$ is given by $\PP_{h'}(\cdot | s_{h'}, a_{h'})$ with $a_{h'} = \pi_{h'}(s_{h'})$ for $h' > h$. Both $Q_h^\pi(s, a)$ and $V_h^\pi(s)$ are bounded in $[0, H]$ by definition. We further define the optimal value function as $V_h^*(s) := \sup_{\pi} V^\pi_h(s)$ and the optimal Q-function as $Q_h^*(s, a) := \sup_{\pi} Q^\pi_h(s, a)$. The optimal policy is denoted by $\pi_h^*(s):= \argmax_\pi V^\pi_h(s)$, and we assume the optimal policy function $\pi^*$ is unique.

For simplicity, we define $[\PP_h V](s, a) = \EE_{s' \sim \PP_h(s' | s, a)} V(s')$ for any function $V: \cS \mapsto \RR$. With this notation, we have the following Bellman equation, as well as the Bellman optimality equation:
\begin{align}
Q_h^\pi(s, a) &= r_h(s, a) + [\PP_h V_{h+1}^\pi](s, a), \notag \\
Q_h^*(s, a) &= r_h(s, a) + [\PP_h V_{h+1}^*](s, a),\label{eq:bellman}
\end{align}
where $V_{H+1}^*$ and $V_{H+1}^\pi$ are set to be zero for any state $s$ and policy $\pi$.

We will focus on learning the structure of the MDP in an online manner. The algorithm is designed to run for $K$ episodes, where for each episode $k \in [K]$, the first step is to determine a policy $\pi^k = \{\pi_h^k\}_{h=1}^H$ based on the knowledge collected from the environment. The agent then follows the policy and the dynamics of the MDP. Specifically, at each step $h \in [H]$, the agent observes the state $s_h^k$, selects an action $a_h^k$ using the policy $\pi_h^k$, transitions to the next state $s_{h+1}^k$ generated by the MDP, and receives the reward $r_{h+1}^k$.

We define the cumulative regret for the first $K$ episodes as $\R{K} = \sum_{k=1}^K \big[V_1^*(s_1^k) - V_1^{\pi^k}(s_1^k)\big]$, where $V_1^*(s_1^k)$ is the optimal value of the initial state in episode $k$ and $V_1^{\pi^k}(s_1^k)$ is the value of the initial state in episode $k$ under the policy $\pi^k$.

The aim of this paper is to establish a problem-dependent regret bound. To achieve this goal, we require the assumption of a strictly positive minimal sub-optimality gap~\citep{simchowitz2019non, yang2021q, he2020logarithmic}. This assumption ensures that the difference between the value of the optimal policy and the value of any other policy is not too small, which is essential for proving the regret bound.

\begin{assumption}\label{asm:gap}
We have $\gap{\min}>0$, where
\begin{align}
     \gap{h}(s, a) &:= V_h^*(s) - Q_h^*(s, a), \notag \\
     \gap{\min} &:= \inf_{h, s, a}\big\{\gap{h}(s, a): \gap{h}(s, a) \neq 0\big\}. \label{def:gap}
\end{align}
\end{assumption}
We consider the bilinear MDPs in~\citet{yang2020reinforcement}, where the probability transition kernel is a bi-linear function of the feature vectors.
\begin{definition}[Bilinear MDPs, \citealt{yang2020reinforcement}]\label{asm:lin}
    For each state-action-state triple $(s, a, s') \in \cS \times \cA \times \cS$,  vectors $\bphi(s, a) \in \RR^d, \bpsi(s') \in \RR^{d'}$ are known as the feature vectors. There exists an unknown matrix $\Mb^*_h \in \RR^{d \times d'}$ for all $h \in [H]$ such that $\PP_h( s' | s, a) = \bphi^\top(s, a)\Mb^*_h\bpsi(s')$. We denote $\Kb_{\bpsi} = \sum_{s \in \cS}\bpsi(s)\bpsi^\top(s)$ which is assumed to be invertible. Let $\bPsi = (\bpsi(s_1), \bpsi(s_2), \cdots, \bpsi(s_{|\cS|}))^\top \in \RR^{|\cS| \times d'}$ be the matrix of all $\bpsi$ features. We assume that for all $h \in [H]$, 
    $\|\Mb_h^*\|_F^2 \le C_{\Mb}d$, for all $(s, a) \in \cS \times \cA$, $\|\bphi(s, a)\|_2^2 \le C_{\bphi}d$, and for all $\vb \in \RR^{|\cS|}$, $\|\bPsi^\top \vb\|_{2} \le C_{\bpsi}\|\vb\|_{\infty}$ and $\|\bPsi\Kb_{\bpsi}^{-1}\|_{2, \infty} \le C'_{\bpsi}$, where $C_{\Mb}, C_{\bphi}, C_{\bpsi}$ and $C_{\bpsi}'$ are all positive constants. 
\end{definition}

In this work, we focus on the bilinear MDP, which is a specific case of the low-rank MDP or linear MDP. In the low-rank MDP framework \citep{yang2019sample,jin2020provably}, the transition kernel is assumed to be a bilinear function of the state-action feature vector and an unknown measure $\btheta_h(s')$ of dimension $d$, i.e., $\PP_h(s'| s, a) = \la \bphi(s, a), \btheta_h(s') \ra$. In our approach, we model $\btheta_h(s')$ as a product of an unknown matrix $\Mb_h^*$ and a feature vector $\bpsi(s')$. In contrast, in the linear MDP, the reward function is assumed to be a linear function of the state-action feature vector $\bphi(s, a)$, whereas we assume it is known to simplify the presentation. As noted by \citet{yang2020reinforcement}, we can replace this assumption with a linear function of the representation $\bphi(s, a)$ and add an optimistic reward function estimation step similar to LinUCB \citep{chu2011contextual} without significantly altering the analysis.

Given the linear function representation of the MDP, we aim to learn a good representation $\bphi(s, a)$ for different state-action pairs in the representation function class $\Phi$, in both online and offline settings. To this end, we introduce the definition of an \emph{admissible} representation function class.

\begin{definition}[Admissible Function Class]\label{def:admissible}
A representation function class $\Phi$ is admissible if every $\bphi \in \Phi$ satisfies Definition~\ref{asm:lin} with a different dimension $d_{\bphi}$, a different parameter $\Mb_{h, \bphi}^*$, a different constant $C_{\bphi}$, and the same context $\bpsi(s')$. In other words, for any representation function $\bphi \in \Phi$, the same transition kernel can be represented as $\PP_h(s' | s, a) = \bphi^\top(s, a)\Mb_{h, \bphi}^*\bpsi(s')$.
\end{definition}

\begin{remark}
Definition~\ref{def:admissible} suggests that the same transition kernel can be represented in different ways, which is quite common in practice. For example, one can always represent a bilinear MDP with finite state and action spaces by a tabular MDP, which can be further represented by another bilinear MDP with $d_{\bphi} = |\cS| \times |\cA|$. However, different representations $\bphi$ may have different learning complexities. For instance, the linear representations with a lower dimension $d_{\bphi}$ are easier to learn than the tabular representation with $d_{\bphi} = |\cS| \times |\cA|$. Thus, our goal is to select a good representation from the admissible function class for different state-action pairs.
\end{remark}

\begin{remark}
    In the rest of our paper, we assume that the representation functions $\bphi \in \Phi$ are given to the algorithm. In real-world applications, however, such a function class can be chosen as hand-crafted features or pre-trained neural networks.
\end{remark}
\begin{remark}\label{rm:1}
Although one can also consider learning the representation function tuple $(\bphi, \bpsi) \in \Phi \times \Psi$ simultaneously, there is no difference compared with assuming the $\bpsi$ function is fixed and known in terms of the algorithm and the analysis. This is because the $Q$-function is a linear function of $\bphi$ (see~\eqref{eq:q}), and the confidence radius of the estimated $Q$-function only depends on $\bphi$ (see~\eqref{eq:estq}). Therefore, we only select the representation of $\bphi$ instead of both $\bphi$ and $\bpsi$ without loss of generality.
\end{remark}

\section{Representation Selection for Online RL}
\subsection{\algname-UCB Algorithm}
We present the \emph{Representation seLection for EXploration and EXploitation} with upper confidence bound (\algname-UCB) algorithm for selecting a good representation from a finite representation function class $\Phi$ for different state-action pairs. The algorithm, shown in Algorithm~\ref{alg:main}, maintains a different model parameter estimate for each individual representation $\bphi \in \Phi$. Under Definition~\ref{asm:lin}, we have the following property for each representation $\bphi$:
\begin{align}
&\left[ \PP_h \bpsi(\cdot)^\top\Kb_{\bpsi}^{-1}\right ](s, a) = \sum_{s' \in \cS} \PP_h(s' | s, a) \bpsi^\top(s')\Kb_{\bpsi}^{-1} \notag\\
&\quad= \sum_{s' \in \cS} \bphi^\top(s, a)\Mb_{h, \bphi}^*\bpsi(s') \bpsi^\top(s')\Kb_{\bpsi}^{-1}\notag \\
&\quad= \bphi^\top(s, a)\Mb_{h, \bphi}^*,\label{hhelp}
\end{align}
where the last equality uses the fact that $\Kb_{\bpsi} = \sum_{s' \in \cS} \bpsi(s')\bpsi^\top(s')$. Equation \eqref{hhelp} suggests that we can build $\Mb_{h, \bphi}^k$, the estimate of $\Mb^*_{h, \bphi}$, as the solution to the ridge regression problem analytically, given the sampled triples $\{s_h^j, a_h^j, s_{h+1}^j\}_{j=1}^{k-1}$ in Line~\ref{ln:1} of Algorithm~\ref{alg:main}.

\begin{remark}
    The computation of $\Kb_{\bpsi}$ requires only one pass of the state space since it does not depend on the round $k$ or the representation $\bphi$. Thus, it is not computationally expensive and would not be a bottleneck in the algorithm's computational complexity. Additionally, when the state space is infinite, $\Kb_{\bpsi}$ can be efficiently approximated using Monte Carlo integration techniques, as demonstrated in prior works such as \citet{zhou2020provably} and \citet{yang2020reinforcement}.
\end{remark}

With the estimate $\Mb_{h, \bphi}^k$, Algorithm~\ref{alg:main} recursively estimates the $Q$-function starting from $Q_{H+1}^k = 0$. The $Q$-function at step $h$ can be deduced as $Q_{h, \bphi}^k(s, a) = r(s, a) + [\PP_h V_{h+1}^k](s, a)$, where $V_{h+1}^k$ is the estimate of the value function at step $h+1$. Using the Bellman equation~\eqref{eq:bellman}, we can further write $Q_{h, \bphi}^k(s, a)$ as
\begin{align}
Q_{h, \bphi}^k(s, a) &= r(s, a) + \sum_{s' \in \cS} \bphi^\top(s, a) \Mb_{h, \bphi}^k\bpsi(s')V_{h+1}^k(s') \label{eq:q}.
\end{align}
To construct an optimistic estimation of the $Q$-function, we follow the approach proposed by \citet{yang2020reinforcement} and add an optimism bonus term to the right-hand side of \eqref{eq:q}. The optimism bonus is defined as $C_{\bpsi}H\sqrt{\beta_{k, \bphi}}\|\bphi(s, a)\|_{(\Ub_{h, \bphi}^k)^{-1}}$, where $C_{\bpsi}$ and $\beta_{k, \bphi}$ are user-defined hyperparameters, and $\Ub_{h, \bphi}^k$ is the covariance matrix calculated in Line~\ref{ln:2}. This results in the following optimistic estimation of the $Q$-function:
\begin{align}
Q_{h, \bphi}^k(s, a) &= r(s, a) + \sum_{s' \in \cS} \bphi^\top(s, a) \Mb_{h, \bphi}^k\bpsi(s')V_{h+1}^k(s') \notag \\
&\quad+ C_{\bpsi}H\sqrt{\beta_{k, \bphi}}\|\bphi(s, a)\|_{(\Ub_{h, \bphi}^k)^{-1}}.\label{eq:estq}
\end{align}

By following the standard analysis for optimistic estimation \citep{abbasi2011improved}, it can be shown that $Q_{h, \bphi}^k(s, a)$ defined in~\eqref{eq:estq} is an upper confidence bound for the optimal $Q$-function for each representation $\bphi \in \Phi$. In other words, $Q_{h, \bphi}^k(s, a) \ge Q_h^*(s, a)$. In Line~\ref{ln:4}, the algorithm selects the representation with the smallest optimistic estimation, which should be considered as the tightest optimistic estimation given the current covariance matrix $\Ub_{h, \bphi}^k$. Alternatively, this step can be interpreted as selecting the representation with the minimal uncertainty, which is measured by $\|\bphi(s, a)\|_{(\Ub_{h, \bphi}^k)^{-1}}$. This approach is intuitive and ensures that the algorithm chooses the representation that provides the best possible estimate of the value function for the current state-action pair.

As a result, Line~\ref{ln:4} selects different representations $\bphi$ for different state-action pairs implicitly by minimizing the optimistic $Q$-function, which results in a tighter optimistic estimation of the $Q$-function. The algorithm then executes the greedy policy and obtains the optimistic value function defined in Line~\ref{ln:3}.

Our algorithm offers a distinct advantage over \citet{yang2020reinforcement} in that it enables the selection of different representations for different state-action pairs. This is in contrast to representation learning, which seeks a \emph{universal} representation that works well for \emph{all} state-action pairs. For instance, our algorithm can adaptively choose a representation that yields accurate value function estimates for certain state-action pairs, even if its performance is suboptimal for others. By doing so, our algorithm outperforms \citet{yang2020reinforcement} which relies on a single representation for all state-action pairs. This demonstrates the benefits of representation selection in online RL.

\begin{algorithm}[t!]
\caption{Online Representation seLection for EXploration and EXploitation with Upper Confidence Bound (ReLEX-UCB)}\label{alg:main}
\begin{algorithmic}[1]
    \STATE Initialize $Q_{H+1, \bphi}^k(s, a) = 0$ for all $(s, a, k, \bphi)$
    \FOR{episodes $k=1,\ldots,K$}
    \STATE Receive the initial state $s_1^k$.
    \FOR{step $h=H,\ldots,1$}
    \FOR{representation $\bphi \in \Phi$}
    \STATE $\Mb_{h, \bphi}^k=\argmin_{\Mb} \big(\sum_{j=1}^{k-1} \|\bpsi^\top(s_{h+1}^j)\Kb_{\bpsi}^{-1} - \bphi^\top(s_h^j, a_h^j)\Mb\|_2^2 + \|\Mb\|_F^2\big)$ \label{ln:1}
    \STATE $\Ub_{h, \bphi}^k = \Ib + \sum_{j=1}^{k-1}\bphi(s_h^j, a_h^j)\bphi^\top(s_h^j, a_h^j)$\label{ln:2}
    \STATE Calculate $Q_{h, \bphi}^k(s, a)$ as~\eqref{eq:estq}
    \ENDFOR
    \STATE Set $Q_h^k(s, a) = \min_{\bphi \in \Phi} \{Q_{h, \bphi}^k(s, a)\}$, \label{ln:4}
    \STATE Set $V_h^k(s) = \max\{0, \min\{\max_a Q_h^k(s, a), H\}\}$ \label{ln:3}
    \ENDFOR
    \FOR{step $h=1,\ldots,H$}
    \STATE Take action $a_h^k\leftarrow \argmax_{a} Q_h^k(s_h^k,a)$
    \STATE Receive next state $s_{h+1}^k$ \label{algorithm:line5}
    \ENDFOR
\ENDFOR
\end{algorithmic}
\end{algorithm}
\subsection{Constant Regret Bounds}
We present the regret bound for \algname-UCB, which demonstrates the advantage of representation selection in a rigorous way. To do so, we require the following assumption:

\begin{assumption}\label{asm:hls}
Suppose that the representation function class $\Phi$ is admissible. For any $(s, a, h) \in \cS \times \cA \times [H]$, there exists a representation $\bphi \in \Phi$ such that $\bphi(s, a) \in \image(\bLambda_{h, \bphi})$, where
\begin{align*}
\bLambda_{h, \bphi} := \EE_{d_{\pi^*}}[\bphi(s_h, \pi_h^*(s_h))\bphi^\top(s_h, \pi_h^*(s_h))]
\end{align*}
with $d_{\pi^*}$ representing the state visitation distribution induced by the optimal policy $\pi^*$. We also denote $\sigma_{h, \bphi}$ as the minimal non-zero eigenvalue of $\bLambda_{h, \bphi}$ and $\sigma_{\bphi} = \min_{h \in [H]} \sigma_{h, \bphi}$.
\end{assumption}
\begin{remark}
Several related assumptions, known as \emph{diversity assumptions}, have been proposed to lower bound the minimum eigenvalue of the term $\bphi\bphi^\top$. These assumptions are discussed in detail in \citet{papini2021leveraging}. Here, we extend the assumption from the linear bandit to the reinforcement learning setting, where the state distribution at time-step $h$ is defined by the optimal policy. We note that a similar but stronger assumption, called `uniformly excited features', is made by \citet{wei2021learning} in the infinite time-horizon average reward MDP setting. There, they assume $\bLambda$ is strictly positive definite for \emph{all} possible policies $\pi$. In contrast, we only require $\bLambda$ to be strictly positive for the distribution induced by the \emph{optimal} policy, which is a weaker assumption. This implies that the states that rarely occur in the optimal policy do not significantly impact the quality of the representation.
\end{remark}

\begin{remark}
We notice that a similar assumption, called UniSoft-mixing, is made in~\citet{papinireinforcement}, where they assume that for all $(s, a) \in \cS \times \cA$, there exists a $\bphi \in \Phi$ such that $\bphi(s, a) \in \text{span }\{\bphi(s, \pi^*(s)): d_{\pi^*}(s) > 0\}$. The difference between our assumption and their assumption is that they filter out the states which are \emph{almost surely} never visited by the optimal policy $\pi^*$. In contrast, we take the expectation with respect to $d_{\pi^*}(s)$ without explicitly filtering out the never-visited states.
\end{remark}%}
Now we are ready to present the regret bound result.
\begin{theorem}\label{thm:main}
Under Assumptions~\ref{asm:gap} and~\ref{asm:hls}, set $\beta_{k, \bphi} = c(C_{\Mb} + {C'_{\bpsi}}^{2})d_{\bphi}\log(kHC_{\bphi} |\Phi| / \delta)$ in Algorithm~\ref{alg:main}, where $c$ is an absolute positive constant, then with probability at least $1 - 5\delta$, there exists a threshold
\begin{align}
    k^*= \max_{\bphi \in \Phi}\Big\{\text{poly}(d_{\bphi}, \sigma_{\bphi}^{-1}, H, \log(|\Phi| / \delta), \gap{\min}^{-1} \notag \\
    \quad, C_{\bphi}, C_{\bpsi}, C_{\Mb}, C_{\bpsi}')\Big\} \label{eq:k-star}
\end{align} 
independent from episode number $k$. The regret for the first $k$ episodes is upper bounded by 
\begin{align*}
    \R{k} &\le 2 + \min_{\bphi \in \Phi}\bigg\{ \frac{128C_{\bpsi}^2H^5d_{\bphi}^2c(C_{\Mb} + {C'_{\bpsi}}^{2})}{\gap{\min}}\\
    &\qquad \times \log\big(1 + C_{\bphi}\tilde kd_{\bphi}\big)\log\big(\tilde kHC_{\bphi} |\Phi| / \delta\big)\bigg\}\\
    &\quad + \frac{96H^4\log\big(2\tilde k (1 + \log(H \gap{\min}^{-1}))|\Phi|/ \delta\big)}{\gap{\min}}\\
    &\quad+ \frac{16H^2\log\big(\big(1 + \log\big(H{\tilde k}\big)\big){\tilde k}^2|\Phi|/ \delta\big)}3,
\end{align*}
where we denote $\tilde k := \min\{k, k^*\}$.
\end{theorem}

\begin{remark}
The regret bound exhibits a phase transition as the episode number $k$ increases. When $k \le k^*$, the regret is upper bounded by $\tilde \cO(d^2H^5\log(k)\gap{\min}^{-1})$, which is exactly the logarithmic regret bound (given by Lemma 3.3 in Appendix). However, when $k \ge k^*$, the regret bound becomes $\tilde \cO(d^2H^5\log(k^*)\gap{\min}^{-1})$. Since $k^*$ is independent of $k$ (as shown in~\eqref{eq:k-star}), the regret bound turns into a problem-dependent constant regret bound that no longer grows as the total number of episodes $k$ increases. This result aligns with our intuition: once we have a fixed, strictly positive sub-optimality gap, the regret might initially increase over the first few episodes. However, once the agent collects enough data, it can learn the environment well and will no longer incur any additional regret.
\end{remark}

\begin{remark}
If Assumption~\ref{asm:hls} does not hold, then $k^*=\infty$, and our regret bound degenerates to the gap-dependent regret bound. Similar bounds have been proved in~\citet{he2020logarithmic} for both linear MDPs and linear mixture MDPs. Our bound has the same dependency on $H$, $\gap{\min}$, and episode number $k$ as the bounds in~\citet{he2020logarithmic}. However, in terms of $d$, our dependency is $\cO(d^2)$, while the dependency is $\cO(d^3)$ for linear MDPs in the LSVI-UCB algorithm~\citep{jin2020provably}. This difference arises from estimating the MDP parameter $\Mb_{h, \bphi}^*$, which is similar to that in the UCRL-VTR algorithm~\citep{ayoub2020model} for learning linear mixture MDPs. Furthermore, since our regret bound minimizes over all $\bphi \in \Phi$, the performance of \algname-UCB is always competitive with the best one using any single representation $\bphi$ in that function class, ignoring the logarithmic terms.
\end{remark}

\begin{remark}\label{rm:logphi}
We note that our regret bound includes an additional $\log(|\Phi|)$ factor, which reflects the cost of representation selection to guarantee that all $|\Phi|$ regressions can be learned well by the union bound. This term is caused by the worst-case scenario and may be eliminated in practice by considering the average-case scenario instead. By doing so, we can potentially reduce the impact of the $\log(|\Phi|)$ factor on the regret bound. Additionally, it's worth noting that this dependency on $|\Phi|$ is better than the one in the regret bound of \citet{papinireinforcement}, which has a $|\Phi|$ factor. The reason for the better dependency in our result is that the bilinear MDP structure we consider is simpler than the linear MDP structure considered in~\citet{papinireinforcement}. When applying our algorithm to linear MDPs, we still need a $|\Phi|$ factor to cover the value function class, which degenerates to the result in~\citet{papinireinforcement}. Furthermore, the $\log(|\Phi|)$ dependency allows us to extend our result to some infinite representation function classes with bounded statistical complexity~\citep{agarwal2020flambe}.
\end{remark}

\begin{remark}
When $|\Phi| = 1$, i.e., there is only one representation function, Assumption~\ref{asm:hls} provides a criterion for a `good representation' and such a `good representation' can improve the problem-dependent regret bound from $\cO(\log(k))$~\citep{he2020logarithmic} to a constant regret bound. 
\end{remark}

\section{Representation Selection for Offline RL}
\subsection{\algname-LCB Algorithm}
We present an offline version of $\algname$ that selects a good representation based on the offline data generated from a behavior policy. In this version, the algorithm estimates the parameter and its covariance matrix for each representation function $\bphi$ in Lines~\ref{ln:1-offline} and~\ref{ln:2-offline} of Algorithm~\ref{alg:main-offline}, using the offline data $\cD_h$ for the $h$-th step, which consists of triplets $(s, a, s')$ of state, action, and next state. The estimated $\Mb_{h, \bphi}$ can therefore be written as
\begin{align}
\Mb_{h, \bphi} & =\argmin_{\Mb} \|\Mb\|_F^2 \notag \\
&\quad + \sum_{(s, a, s') \in \cD_h} \|\bpsi^\top(s')\Kb_{\bpsi}^{-1} - \bphi^\top(s, a)\Mb\|_2^2.  \label{eq:offline-M}
\end{align}
The algorithm then provides a pessimistic estimation of the $Q$-function, following a similar method as~\eqref{eq:estq} in Lines~\ref{ln:4-offline} and~\ref{ln:5-offline}, which is widely used in offline reinforcement learning to provide a robust estimation for later planning. In detail, a confidence radius $\Gamma$ is subtracted from the estimated $Q$-function, where $\Gamma$ is defined by
\begin{align}
\Gamma_{h, \bphi}(s, a) = C_{\bpsi}H\sqrt{\beta_{\bphi}\bphi^\top(s, a)\Ub_{h, \bphi}^{-1}\bphi(s, a)} \label{eq:offline-C}
\end{align}
and thus the estimated $Q$-function can be written as
\begin{align}
Q_{h, \bphi}(s, a) &= r(s, a) + \sum_{s' \in \cS} \bphi^\top(s, a) \Mb_{h, \bphi}\bpsi(s')V_{h+1}(s') \notag \\
&\quad - \Gamma_{h, \bphi}(s, a) \label{eq:offline-Q}.
\end{align}

Unlike the online version, where a smaller estimation of $Q$ is preferred, the offline version adopts a pessimistic estimation ($Q_{h, \bphi} \le Q_h^*$), where a larger estimation is considered more accurate. Therefore, in Line~\ref{ln:3-offline}, the algorithm selects the maximum $Q$-function over all representation functions $\bphi$, and in Line~\ref{algorithm:line5-offline}, it takes the greedy policy based on the selected $Q$-function from the offline training. Similar to the online version, $\algname$-LCB selects different representation functions for different state-action pairs instead of a single representation for the entire environment, thereby leveraging the advantage of different representation functions to provide a good estimation for the underlying MDP.
\begin{algorithm}[t!]
\caption{Offline Representation seLection for EXploration and EXploitation with Lower Confidence Bound (ReLEX-LCB)}\label{alg:main-offline}
\begin{algorithmic}[1]
\STATE \texttt{// offline training}
\FOR{$(h, \bphi) \in [H] \times \Phi$}
\STATE Calculate $\Mb_{h, \bphi}$ as in~\eqref{eq:offline-M} \label{ln:1-offline}
\STATE Calculate $\Ub_{h, \bphi} = \Ib + \sum_{(s, a, s') \in \cD_h}\bphi(s, a)\bphi^\top(s, a)$\label{ln:2-offline}
\ENDFOR
\STATE \texttt{// offline planning}
\STATE Initialize $Q_{H+1, \bphi}(s, a) = 0$ for all $(s, a, \bphi)$
\FOR {$h = H, H - 1, \cdots, 1$}
\STATE Calculate $\Gamma_{h, \bphi}(s, a)$ as in~\eqref{eq:offline-C}\label{ln:4-offline}
\STATE Calculate $Q_{h, \bphi}(s, a)$ as in~\eqref{eq:offline-Q}\label{ln:5-offline}
\STATE Set $Q_h(s, a) = \max_{\bphi \in \Phi} \{Q_{h, \bphi}(s, a)\}$
\label{ln:3-offline}
\STATE Set $V_h(s) = \max\{0, \min\{\max_a Q_h(s, a), H\}\}$
\STATE Set $\pi_h(s) = \argmax_a Q_h(s, a)$ \label{algorithm:line5-offline}
\ENDFOR
\ENSURE Policy $\pi = \{\pi_h\}_{h=1}^H$
\end{algorithmic}
\end{algorithm}

\subsection{Gap-dependent Sample Complexity}
In this section, we provide the sample complexity of Algorithm~\ref{alg:main-offline}. Similarly to its online counterpart, we start with a coverage assumption for offline RL, which suggests that the representation function class $\Phi$ can provide a good representation for all possible state-action pairs in the offline training data.

\begin{assumption}\label{asm:offline}
Suppose the representation function class $\Phi$ is admissible, and for any $(s, a, h) \in \cS \times \cA \times [H]$, there exists a representation function $\bphi \in \Phi$ such that
\begin{align*}
\bphi(s, a) \in \image(\tilde \bLambda_{h, \bphi}),\ \tilde \bLambda_{h, \bphi} := \EE_{d_h^{\hat \pi}}[\bphi(s, a)\bphi(s, a)^\top],
\end{align*}
where $d_h^{\hat \pi}$ is the state-action visitation distribution in the offline dataset on step $h$ induced by some behavior policy $\hat \pi$ in the underlying MDP for the offline data. We denote the minimal non-zero eigenvalue of $\tilde \bLambda_{h, \bphi}$ as $\tilde \sigma_{h, \bphi}$.
\end{assumption}

\begin{remark}
Similar assumptions have been made in the offline RL literature~\citep{wang2020statistical,jin2021pessimism, min2021variance, uehara2021representation, yin2022nearoptimal}, which require that the offline dataset can provide good coverage of the entire state-action space. Notably, thanks to representation selection, we only require that the representations in the function class $\Phi$ can together cover the state-action space, rather than every single representation covering the state-action space perfectly. This relaxes existing assumptions by allowing every single representation to not provide perfect coverage. For example, it is possible to define two representations $\{\bphi_1, \bphi_2\}$ such that neither representation alone satisfies Assumption~\ref{asm:offline}, but the representation function class $\Phi = \{\bphi_1, \bphi_2\}$ does. For more details about this example, please refer to Appendix~2.2, or Appendix G in \citet{papinireinforcement}.
\end{remark}

We also need the following assumption, which is standard in the literature.
\begin{assumption}\label{asm:iid}
    The trajectories in the offline dataset are i.i.d. sampled, i.e., different trajectories are generated by the same behavior policy $\hat \pi$ independently.
\end{assumption}

Now we are ready to present the sample complexity result.
\begin{theorem}\label{thm:offline}
Set $\beta_{\bphi} = Cd_{\bphi}\log(2KH|\Phi|/\delta)$, where $C$ is an absolute positive constant. Then, under Assumptions~\ref{asm:offline} and~\ref{asm:iid}, with probability at least $1 - \delta$, the sub-optimality of the policy $\pi$ output by Algorithm~\ref{alg:main-offline} can be bounded by
% \begin{smaller}
\begin{align}
    &V_h^*(s) - V_h^\pi(s) \le 2C_{\bpsi}H\notag \\
    &\qquad\times\sum_{h' = h}^H\EE_{\pi^*}\Big[\min_{\bphi \in \Phi}\big\{\sqrt{\beta_{\bphi}}\|\bphi(s, a)\|_{\Ub_{h', \bphi}^{-1}}\big\}\,\Big|\, s_h = s\Big].\label{eq:main}
\end{align}
% \end{smaller}
Furthermore, under Assumption~\ref{asm:gap}, if the size $K$ of the offline dataset satisfies
% \begin{small}
\begin{align*}
    K &> \max_{\bphi \in \Phi, h \in [H]} \left\{ \frac{32C_{\bphi}^2d_{\bphi}^2\log(Hd_{\bphi}|\Phi|/\delta)}{\tilde \sigma^{2}_{h, \bphi}}\right.\\
    &\qquad\qquad\quad\left. \times\left(1 + \frac{C_{\bpsi}^2H^4\beta_{\bphi}C_{\bphi}\tilde \sigma_{h, \bphi}}{4\gap{\min}^2C_{\bphi}^2d_{\bphi}\log(Hd_{\bphi}|\Phi|/\delta)}\right)\right\},
\end{align*}
% \end{small}
then Algorithm~\ref{alg:main-offline} is guaranteed to output the optimal policy $\pi = \pi^*$.
\end{theorem}

\begin{remark}
Our error bound in \eqref{eq:main} contains the $\min$ operator, which suggests that our result should be no worse than using any single representation, compared with the offline RL algorithm using a single representation~\citep{jin2021pessimism, yin2022nearoptimal}. 
\end{remark}

\begin{remark}
    The bound of $\sqrt{\beta_{\bphi}}\|\bphi(s, a)\|_{\Ub_{h', \bphi}^{-1}}$ cannot decrease to $0$ without other further assumptions. \cite{jin2021pessimism,yin2022nearoptimal} require a `uniform coverage' assumption to make the sub-optimality decrease at a $1/\sqrt{K}$ rate. This `uniform coverage' suggests that the covariance matrix under the behavior policy can cover the entire state-action space. In sharp contrast, according to Assumption~\ref{asm:offline}, our results only require the representations in the function class to together cover the state-action space, even if any single representation cannot.
\end{remark}

\begin{remark}
Our `gap-dependent sample complexity' is also aligned with the gap-dependent sample complexity for offline RL in the tabular setting under the condition $(P, \text{gap}_{\min})$ in \citet{wang2022gap}. In their setting, $P$ stands for a uniform optimal policy coverage coefficient in the tabular MDP, which is analogous to our $\tilde{\sigma}_{h, \bphi}^{-1}$ in the linear function approximation setting. Our result has the same inverse dependence on $\text{gap}_{\min}$.
\end{remark}

\section{Experiments}
\begin{table}
\caption{Cumulative regret ($\text{mean} \pm \text{dev.}$) after 5M episodes for \algname-UCB vs.\ UC-MatrixRL and $\epsilon$-greedy using a single representation} \label{tab:reg}
\centering
\begin{tabular}{cc}
\toprule
Alg. + Rep.  & Cumulative regret \\ 
\midrule
UC-MatrixRL + $\bphi$ (oracle) & $2534.9 \pm 26.6$ \\ 
\midrule
UC-MatrixRL + $\bphi^{(1)}$ & $11459.5 \pm 225.7$ \\
UC-MatrixRL + $\bphi^{(2)}$ & $13838.5 \pm 266.2$ \\
$\epsilon$-greedy + $\bphi$ & $15305.9 \pm 245.7$ \\ 
$\epsilon$-greedy + $\bphi^{(1)}$ & $15745.8 \pm 408.0$ \\
$\epsilon$-greedy + $\bphi^{(2)}$ & $15652.9 \pm 471.2$ \\
\algname-UCB + $\{\bphi^{(1)}, \bphi^{(2)}\}$ & $\boldsymbol{6765.0 \pm 146.6}$\\
\bottomrule
\end{tabular}
\end{table}
\subsection{Online RL}
To showcase the efficacy of representation selection by \algname-UCB, we conduct the following experiments on an environment with $|\cS| = 20, |\cA| = 3$, $H = 10$, and $d = d' = 5$. We generate the feature functions $\bphi: \cS \times \cA \mapsto \RR^d$ and $\bpsi: \cS \mapsto \RR^{d'}$ such that for all $h \in [H]$, there exists a matrix $\Mb_h \in \RR^{d \times d}$ where $\PP_h(s' | s, a) = \bphi(s, a)^\top \Mb_h \bpsi(s')$. The generated $\bphi$ satisfies Assumption~\ref{asm:hls}. We set the reward function such that $r_H(s, a) \sim \text{Bernoulli}(0.5)$ and $r_h(s, a) = 0$ for all $h < H$, forcing the algorithm to learn the transition kernel in order to achieve good performance.

Furthermore, we generate two additional representations $\bphi^{(1)}$ and $\bphi^{(2)}$ such that neither $\bphi^{(1)}$ nor $\bphi^{(2)}$ satisfies Assumption~\ref{asm:hls}, but their union $\Phi = \{\bphi^{(1)}, \bphi^{(2)}\}$ does. Appendix~2.1 contains a detailed definition of these representations.


We evaluated the performance of \algname-UCB using the feature map class $\Phi = \{\bphi^{(1)}, \bphi^{(2)}\}$ with $K = 5{,}000{,}000$ episodes. We also reported the performance of UC-MatrixRL~\citep{yang2020reinforcement} and $\epsilon$-greedy using the feature map $\bphi$, $\bphi^{(1)}$, and $\bphi^{(2)}$ separately.

We repeated the experiment on the same environment eight times and reported the mean and standard deviation of the cumulative regret in Table~\ref{tab:reg}. Our experiment results showed that \algname-UCB outperformed both $\epsilon$-greedy and UC-MatrixRL using $\bphi^{(1)}$ or $\bphi^{(2)}$, which verifies the effectiveness of representation selection. More results, including the figure of cumulative regret, are deferred to Appendix 2.4.1.

% Figure~\ref{fig:my_label} plots the cumulative regret with respect to the episode number, with the standard deviation indicated by the shadows. We observed that the cumulative regrets for both UC-MatrixRL using $\bphi$ and \algname-UCB grew very slowly after the first one million episodes. As a comparison, UC-MatrixRL using $\bphi^{(1)}$ or $\bphi^{(2)}$ had a sub-linear regret growth instead of near-constant regret. As for the $\epsilon$-greedy algorithm, although the greedy policy can learn very fast at the beginning, it eventually had a much higher cumulative regret since it could not explore the environment well.

% \begin{figure}[t]
%     \centering
%     \includegraphics[width=.7\columnwidth ]{aistats/z-crop.pdf}
%     \caption{Cumulative regret over 5M episodes for \algname-UCB v.s. UC-MatrixRL and $\epsilon$-greedy using a single representation.}
%     \label{fig:my_label}
% \end{figure}

\subsection{Offline RL}
\begin{table}
\caption{Relative sub-optimality of \algname-LCB over 500K episodes} \label{tab:2}
\begin{center}
\begin{tabular}{cc}
\toprule
\makecell{Representation}  & \makecell{Final sub-optimality \\$(\text{mean} \pm \text{dev.}) \times 10^{-3}$} \\ 
\midrule
\makecell{$\bphi$ (oracle)} & $1.288 \pm 0.807$ \\ 
$\bphi^{(1)}$ & $3.424 \pm 1.455$ \\
$\bphi^{(2)}$ & $3.336 \pm 1.624$ \\
$\{\bphi^{(1)}, \bphi^{(2)}\}$ & $\boldsymbol{1.292 \pm 0.806}$\\
\bottomrule
\end{tabular}
\end{center}
\end{table}
In this subsection, we present experiments to demonstrate the performance of \algname-LCB. We use a setup similar to the online RL setting with one oracle representation $\bphi$ satisfying Assumption~\ref{asm:offline} and two representations $\bphi^{(1)}$ and $\bphi^{(2)}$. Neither $\bphi^{(1)}$ nor $\bphi^{(2)}$ satisfies Assumption~\ref{asm:offline}, but the union of these two representations satisfies the assumption. We collect $K = 500{,}000$ episodes of offline trajectories using a fixed randomly-generated behavior policy and evaluate the sub-optimality of Algorithm~\ref{alg:main-offline} using different sizes of offline training data. The rest of the parameter settings are the same as in the online RL setting.

We report the performance of Algorithm~\ref{alg:main-offline} using (1) the oracle representation $\bphi$, (2) the representation function class $\{\bphi^{(1)}, \bphi^{(2)}\}$, (3) $\bphi^{(1)}$, and (4) $\bphi^{(2)}$, respectively. We use the relative sub-optimality over the initial policy, i.e., $(V_1^*(s) - V_1^{\pi_k}(s)) / (V_1^*(s) - V_1^{\pi_1}(s))$ as a performance measure. We repeat the experiment 32 times and report the mean and standard deviation of the relative sub-optimality in Table~\ref{tab:2}.

We observe that by selecting over two imperfect representations, \algname-LCB can match the performance of the oracle algorithm using a single perfect representation, even if using the two representations separately leads to a larger ($\sim 2.5\times$) sub-optimality on the same offline data. More results, including the figures comparing the sub-optimality over different algorithms, are deferred to Appendix 2.4.1.

% \begin{figure}[t]
% \centering
% \includegraphics[width=.85\columnwidth]{NeurIPS2022/offline.pdf}
% \caption{Relative sub-optimality of \algname-LCB after 500K offline episodes}
% \label{fig:2}
% \end{figure}

\section{Conclusion and Future Work}
\label{sec:conclusion}

In this paper, we have explored representation selection for reinforcement learning by focusing on a special class~\citep{yang2020reinforcement} of low-rank MDPs~\citep{yang2019sample,jin2020provably}. Our proposed \algname algorithm has demonstrated the ability to improve performance in both online and offline RL settings. The promising theoretical and empirical results suggest that there is potential in combining our work with FLAMBE~\citep{agarwal2020flambe} or MOFFLE~\citep{modi2021model}. By integrating our approach with these methods that select the \emph{correct} representations, we can further select the \emph{good} representation from a class of \emph{correct} representations. This may help in designing more practical, theory-backed representation learning algorithms for reinforcement learning.
\begin{acknowledgements} % will be removed in pdf for initial submission,
						 % (without ‘accepted’ option in \documentclass)
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
    We thank the anonymous reviewers for their helpful comments. WZ, JH, DZ and QG are supported in part by the National Science Foundation CAREER Award 1906169 and research fund from UCLA-Amazon Science Hub. The views and conclusions contained in this paper are those of the authors and should not be interpreted as representing any funding agencies.
\end{acknowledgements}
% \clearpage
% References
\bibliography{zhang_466}
\end{document}
