% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{abbrvnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr}
\input{packages/defs}
\input{packages/header}
\input{packages/math_commands}
\newcommand{\est}{\operatorname{EST}}
\newcommand{\rfe}{\operatorname{RFE}}
\newcommand{\env}{\operatorname{env}}
\renewcommand{\opt}{\operatorname{OPT}}
\newcommand{\rl}{\operatorname{RL}}
\newcommand{\bc}{\operatorname{BC}}
\externaldocument{xu_380}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Provably Efficient Adversarial Imitation Learning with Unknown Transitions\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author{}
% \author[1, 4]{\href{mailto:<xut@lamda.nju.edu.cn>?}{Tian Xu\thanks{Equal contribution. Author ordering is determined randomly using a coin flip.}}{}}
% \author[2, 3]{\href{mailto:<ziniuli@link.cuhk.edu.cn>?}{Ziniu Li{$^*$}}}
% \author[1, 4]{\href{mailto:<yuy@nju.edu.cn>?}{{Yang Yu\thanks{Corresponding author.}}}}
% \author[2, 3]{\href{mailto:<luozq@cuhk.edu.cn>?}{Zhi-Quan Luo{$^\dag$}}}
% \author[1]{Further~Coauthor}
% \author[3]{Further~Coauthor}
% \author[3,1]{Further~Coauthor}
% Add affiliations after the authors
% \affil[1]{%
%     National Key Laboratory for Novel Software Technology, Nanjing University
% }
% \affil[2]{%
% The Chinese University of Hong Kong, Shenzhen
% }
% \affil[3]{%
% Shenzhen Research Institute of Big Data
%   }
% \affil[4]{%
% Polixir.ai
%   }
  
\begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle
% \newpage
\section{Notation}
\label{sec:notation}

\begin{table}[h]
\caption{Notations}
\label{table:notations}
\centering
%\resizebox{\textwidth}{!}{%
%\resizebox{\columnwidth}{!}{%
\begin{tabular}{@{}ll@{}}
\toprule
Symbol                     & Meaning \\ \midrule
% $\gS$                & the state space  \\
% $\gA$                   & the action space   \\
% $P = \lb P_1, \cdots, P_{H} \rb$ & the transition function \\
% $H$              & the planning horizon  \\
% $\rho$                & the initial state distribution    \\
% $r= \lb r_1, \cdots, r_{H} \rb$ & the reward function    \\
% $\pi = \lb \pi_1, \cdots, \pi_{h} \rb$ & non-stationary policy \\
$\piE$ & the expert policy \\
$V^{\pi, P, r}$ & policy value under the transition model $P$ and reward $r$ \\
$\varepsilon$ & the imitation gap \\
$\delta$ & failure probability \\
$d^{\pi}_h (s)$ & state distribution \\
$d^{\pi}_h (s, a)$ & state-action distribution \\
$\tr = \lp s_1, a_1, \cdots, s_{H}, a_{H} \rp$ & the trajectory \\
$\tr_h = \lp s_1, a_1, \cdots, s_{h}, a_{h} \rp$ & the truncated trajectory \\
$\tr_h(\cdot)$ & the state at time step $h$ in $\tr$ \\
$\tr_h(\cdot, \cdot)$ & the state-action pair at time step $h$ in $\tr$ \\
$\tr (a_h)$ & the action at time step $h$ in $\tr$ \\
$\gD$ & expert dataset \\
$m$   & number of expert trajectories \\
$\widehat{d}_h^{\piE} (s, a)$ & maximum likelihood estimator of $d^{\piE}_h$ in \cref{eq:estimate_by_count}
\\
$\widetilde{d}_h^{\piE} (s, a)$ & transition-aware estimator in \cref{eq:new_estimator_unknown_transition}
\\
$\sP^{\piE}(\tr)$ & probability of the trajectory $\tr$ under the expert policy $\piE$
\\
$\sP^{\piE}(\tr_h)$ & probability of the truncated trajectory $\tr_h$ under the expert policy $\piE$
\\
$\gS_h (\gD)$ & the set of states visited in time step $h$ in dataset $\gD$
\\
$\Tr_h^{\gD}$ & the trajectories along which each state has been visited in $\gD$ up to time step $h$
\\
$\pi^{(t)}$ & the policy obtained in the iteration $t$ \\
$w^{(t)}$ & the reward function learned in the iteration $t$ \\
$\eta^{(t)}$ & the step size in the iteration $t$ \\
$f^{(t)} (w)$ & the objective function in the iteration $t$ in \cref{eq:objective_w} \\
$\widebar{d}_h (s, a)$ & the averaged state-action distribution in \cref{algo:gradient_based_optimization} \\
$\widebar{\pi}$ & the policy derived by the averaged state-action distribution in \cref{algo:gradient_based_optimization} \\
$\Pi_{\text{BC}} \lp \gD_{1} \rp$ & the set of policies which take the expert action on states covered in $\gD_{1}$
\\
$\widehat{P}$ & the empirical transition function
\\
$d^{\pi, \widehat{P}}_h (s, a)$ & the state-action distribution of $\pi$ under the empirical transition function $\widehat{P}$
\\\bottomrule
\end{tabular}%
% }
%}
\end{table}

% \newpage
\section{From Regret Guarantee to PAC Guarantee}
\label{sec:from_regret_to_pac}

\citet{shani2022online} proved a regret guarantee for their OAL algorithm. In particular, \citet{shani2022online} showed that with probability at least $1 - \delta^\prime$, we have 
\begin{align}   \label{eq:oal_regret}
     \sum_{k=1}^{K} \lp V^{\piE} - V^{\pi^{k}} \rp \leq  \widetilde{\gO}\lp \sqrt{H^4 |\gS|^2 |\gA|K} + \sqrt{H^3 |\gS| |\gA| K^2 /m} \rp,
\end{align}
where $\pi^{k}$ is the policy obtained at episode $k$, $K$ is the number of interaction episodes, and $m$ is the number of expert trajectories. We would like to comment that the second term in \eqref{eq:oal_regret} involves the statistical estimation error about the expert policy. Furthermore, this term reduces to $\widetilde{\gO}(\sqrt{H^2 |\gS| K^2 /m})$ under the assumption that the expert policy is deterministic. 



To further convert this regret guarantee to the PAC guarantee considered in this paper, we can apply Markov's inequality as suggested by \citet{jin18qlearning}. Concretely, let $\widebar{\pi}$ be the policy chosen uniformly at random from $\{\pi^{1}, \pi^{2}, \cdots, \pi^{K}\}$; then we have 
\begin{align*}
    \sP \lp V^{\piE} - V^{\widebar{\pi}} \geq \varepsilon \rp \leq \frac{1}{\varepsilon} \expect \ls \frac{1}{K} \sum_{k=1}^{K} \lp V^{\piE} - V^{\pi^{k}} \rp \rs \leq \frac{1}{\varepsilon} \lp \widetilde{\gO} \lp \sqrt{ \frac{H^4 |\gS|^2 |\gA|}{K} } + \sqrt{H^2 |\gS| /m}\rp + \delta^\prime H  \rp.
\end{align*}
Therefore, if we set $\delta^{\prime} = \varepsilon \delta / (3H)$, and
\begin{align*}
    K = \widetilde{\gO} \lp \frac{H^4 |\gS|^2 |\gA|}{\varepsilon^2 \delta^2} \rp, \quad m = \widetilde{\gO} \lp \frac{H^2 |\gS|}{\varepsilon^2 \delta^2} \rp,
\end{align*}
we obtain that $\sP ( V^{\piE} - V^{\widebar{\pi}} \geq \varepsilon ) \leq \delta $. 

% As commented in \citep{menard20fast-active-learning}, this transformation leads to a worse dependence on failure probability $\delta$, but the sample complexity dependence on other terms does not change.

\section{Proof of Results in Section \ref{sec:warm-up}}
\label{sec:proof_of_sec:warm-up}


\subsection{Proof of Lemma \ref{lem:1}}

\begin{proof}
The proof starts with the dual representation of policy value (see \cref{eq:dual_of_policy_value}).
\begin{align*}
    V^{\piE} - V^{\widebar{\pi}} & = \sum_{h=1}^H \sum_{(s, a) \in \gS \times \gA} \lp d^{\piE}_h (s, a) - d^{\widebar{\pi}}_h (s, a) \rp r_h (s, a)
    \\
    &\overset{(a)}{\leq} \sum_{h=1}^H \lnorm d^{\piE}_h - d^{\widebar{\pi}}_h   \rnorm_1
    \\
    &\leq \sum_{h=1}^H \lnorm d^{\piE}_h  - \widetilde{d}^{\piE}_h   \rnorm_1 + \sum_{h=1}^H \lnorm  d^{\widebar{\pi}}_h  - \widetilde{d}^{\piE}_h  \rnorm_1, 
\end{align*}
where inequality $(a)$ is based on the assumption that $r_h (s, a) \in [0, 1]$. For the two terms in RHS, according to \cref{def:estimation} and \cref{def:distribution_matching_error}, we have
\begin{align*}
    \sum_{h=1}^H \lnorm d^{\piE}_h  - \widetilde{d}^{\piE}_h  \rnorm_1 \leq \varepsilon_{\est}, \; \sum_{h=1}^H \lnorm  d^{\widebar{\pi}}_h - \widetilde{d}^{\piE}_h   \rnorm_1 \leq \min_{\pi \in \Pi} \sum_{h=1}^H \lnorm  d^{\pi}_h - \widetilde{d}^{\piE}_h    \rnorm_1 + \varepsilon_{\opt}.  
\end{align*}
With the above two inequalities, we further obtain
\begin{align*}
    V^{\piE} - V^{\widebar{\pi}} &\leq \varepsilon_{\est} + \min_{\pi \in \Pi} \sum_{h=1}^H \lnorm  d^{\pi}_h  - \widetilde{d}^{\piE}_h   \rnorm_1 + \varepsilon_{\opt}
    \\
    &\overset{(a)}{\leq} \varepsilon_{\est} + \sum_{h=1}^H \lnorm  d^{\piE}_h - \widetilde{d}^{\piE}_h    \rnorm_1 + \varepsilon_{\opt}
    \\
    &\leq 2 \varepsilon_{\est} + \varepsilon_{\opt}.  
\end{align*}
Inequality $(a)$ holds since $\piE \in \Pi$. We complete the proof.

\end{proof}

\section{Proof of Results in Section \ref{sec:main_result}}
\label{sec:proof:results_in_sction_main_result}







\subsection{Proof of Proposition \ref{prop:connection}}
\label{subsec:proof_of_proposition_connection}


\begin{proof}
Let $\widetilde{d}^{\piE}_h(s, a)$ be an expert state-action distribution estimator and $\widehat{P}$ be a transition model learned by a reward-free method. Notice that reward-free exploration methods also enable uniform policy evaluation with respect to \emph{any} reward function; see \cref{defn:uniform_policy_evaluation}. That is, with probability at least $1-\delta_{\rfe}$, for any reward function $r$ and policy $\pi$, we have $\vert V^{\pi, P, r} - V^{\pi, \widehat{P}, r} \vert \leq \varepsilon_{\rfe}$. Then we define the following two events.
\begin{align*}
    &E_{\mathrm{EST}}:= \lb \sum_{h=1}^H \lnorm \widetilde{d}^{\piE}_h - d^{\piE}_h  \rnorm_{1} \leq \varepsilon_{\mathrm{EST}} \rb,
    \\
    & E_{\mathrm{RFE}}:= \lb \forall r = (r_1, \ldots, r_H), \forall \pi \in \Pi:  \left\vert V^{\pi, P, r} - V^{\pi, \widehat{P}, r} \right\vert \leq \varepsilon_{\mathrm{RFE}} \rb. 
\end{align*}
According to assumptions (a) and (b), we have that $\sP \lp E_{\mathrm{EST}}  \rp \geq 1 - \delta_{\mathrm{EST}}$ and $\sP \lp E_{\mathrm{RFE}}  \rp \geq 1 - \delta_{\mathrm{RFE}}$. Applying the union bound yields
\begin{align*}
    \sP \lp E_{\mathrm{EST}}  \cap E_{\mathrm{RFE}} \rp \geq 1 - \delta_{\mathrm{EST}} -  \delta_{\mathrm{RFE}}.
\end{align*}
The following analysis is established on the event $E_{\mathrm{EST}}  \cap E_{\mathrm{RFE}}$. Let $\widebar{\pi}$ be the output of Algorithm \ref{algo:framework}.
\begin{align*}
    \left\vert V^{\piE, P} - V^{\widebar{\pi}, P} \right\vert \leq \left\vert V^{\piE, P} - V^{\widebar{\pi}, \widehat{P}} \right\vert + \left\vert V^{\widebar{\pi}, \widehat{P}} - V^{\widebar{\pi}, P} \right\vert \leq \left\vert V^{\piE, P} - V^{\widebar{\pi}, \widehat{P}} \right\vert + \varepsilon_{\mathrm{RFE}}. 
\end{align*}
The last inequality follows the event $E_{\mathrm{RFE}}$. Then we consider the error $\vert V^{\piE, P} - V^{\widebar{\pi}, \widehat{P}} \vert$. From the dual form of the policy value in \cref{eq:dual_of_policy_value}, we have that
\begin{align*}
    \left\vert V^{\piE, P} - V^{\widebar{\pi}, \widehat{P}} \right\vert &= \left\vert \sum_{h=1}^H \sum_{(s, a) \in \gS \times \gA} \lp d^{\piE, P}_h (s, a) - d^{\widebar{\pi}, \widehat{P}}_h (s, a) \rp r_h (s, a)  \right\vert \leq \sum_{h=1}^H \lnorm d^{\piE, P}_h - d^{\widebar{\pi}, \widehat{P}}_h   \rnorm_1,
\end{align*}
where $d^{\widebar{\pi}, \widehat{P}}_h (s, a)$ is the state-action distribution of the policy $\widebar{\pi}$ under the transition model $\widehat{P}$. Then we get that
\begin{align*}
    \sum_{h=1}^H \lnorm d^{\piE, P}_h - d^{\widebar{\pi}, \widehat{P}}_h   \rnorm_1 & \leq \sum_{h=1}^H \lnorm d^{\piE, P}_h - \widetilde{d}^{\piE}_h   \rnorm_1 + \sum_{h=1}^H \lnorm \widetilde{d}^{\piE}_h - d^{\widebar{\pi}, \widehat{P}}_h   \rnorm_1
    \\
    &\leq \varepsilon_{\mathrm{EST}} + \sum_{h=1}^H \lnorm \widetilde{d}^{\piE}_h - d^{\widebar{\pi}, \widehat{P}}_h   \rnorm_1. 
\end{align*}
The last inequality follows the event $E_{\mathrm{EST}}$. Combining the above three inequalities yields
\begin{align*}
    \left\vert V^{\piE, P} - V^{\widebar{\pi}, P} \right\vert \leq \sum_{h=1}^H \lnorm \widetilde{d}^{\piE}_h - d^{\widebar{\pi}, \widehat{P}}_h   \rnorm_1 + \varepsilon_{\mathrm{EST}} +  \varepsilon_{\mathrm{RFE}}.
\end{align*}
According to assumption (c), with the estimator $\widetilde{d}^{\piE}_h (s, a)$ and transition model $\widehat{P}$, algorithm C solves the optimization problem in \cref{eq:ail_with_model} up to an error $\varepsilon_{\opt}$ and $\widebar{\pi}$ is the output of the algorithm C. Formally,
\begin{align*}
     \sum_{h=1}^H \lnorm \widetilde{d}^{\piE}_h - d^{\widebar{\pi}, \widehat{P}}_h   \rnorm_1 \leq \min_{\pi \in \Pi} \sum_{h=1}^H \lnorm \widetilde{d}^{\piE}_h - d^{\pi, \widehat{P}}_h   \rnorm_1 + \varepsilon_{\opt}.
\end{align*}
Then we get that
\begin{align*}
    \left\vert V^{\piE, P} - V^{\widebar{\pi}, P} \right\vert &\leq \sum_{h=1}^H \lnorm \widetilde{d}^{\piE}_h - d^{\widebar{\pi}, \widehat{P}}_h   \rnorm_1 + \varepsilon_{\mathrm{EST}} +  \varepsilon_{\mathrm{RFE}}
    \\
    &\leq \min_{\pi \in \Pi} \sum_{h=1}^H \lnorm \widetilde{d}^{\piE}_h - d^{\pi, \widehat{P}}_h  \rnorm_1 +  \varepsilon_{\opt} + \varepsilon_{\mathrm{EST}} +  \varepsilon_{\mathrm{RFE}}
    \\
    &\overset{(a)}{\leq} \sum_{h=1}^H \lnorm \widetilde{d}^{\piE}_h - d^{\piE, \widehat{P}}_h   \rnorm_1 +  \varepsilon_{\opt} + \varepsilon_{\mathrm{EST}} +  \varepsilon_{\mathrm{RFE}}
    \\
    &\leq \sum_{h=1}^H \lnorm \widetilde{d}^{\piE}_h - d^{\piE, P}_h   \rnorm_1 + \sum_{h=1}^H \lnorm d^{\piE, P}_h - d^{\piE, \widehat{P}}_h   \rnorm_1  +  \varepsilon_{\opt} + \varepsilon_{\mathrm{EST}} +  \varepsilon_{\mathrm{RFE}}
    \\
    &\overset{(b)}{\leq} \sum_{h=1}^H \lnorm d^{\piE, P}_h - d^{\piE, \widehat{P}}_h   \rnorm_1  +  \varepsilon_{\opt} + 2\varepsilon_{\mathrm{EST}} +  \varepsilon_{\mathrm{RFE}},
\end{align*}
where inequality $(a)$ holds since $\piE \in \Pi$ and inequality $(b)$ follows the event $E_{\mathrm{EST}}$. With the dual representation of $\ell_1$-norm, we have that
\begin{align*}
    \sum_{h=1}^H \lnorm d^{\piE, P}_h - d^{\piE, \widehat{P}}_h \rnorm_1 &= \max_{w \in \gW} \sum_{h=1}^H \sum_{(s, a) \in \gS \times \gA} w_h (s, a) \lp d^{\piE, P}_h (s, a) - d^{\piE, \widehat{P}}_h (s, a)  \rp
    \\
    &= \max_{w \in \gW} \lp V^{\piE, P, w} - V^{\piE, \widehat{P}, w} \rp \leq \varepsilon_{\mathrm{RFE}},
\end{align*}
where $\gW = \{w = (w_1, \ldots, w_H): w_h \in \reals^{|\gS| \times |\gA|}, \|w_h \|_{\infty} \leq 1 \}$ and $V^{\piE, \widehat{P}, w}$ is the value of policy $\piE$ with the transition model $\widehat{P}$ and reward function $w$. The last inequality follows from the event $E_{\mathrm{RFE}}$. Then we prove that
\begin{align*}
    \left\vert V^{\piE, P} - V^{\widebar{\pi}, P} \right\vert \leq  2\varepsilon_{\mathrm{EST}} + 2 \varepsilon_{\mathrm{RFE}} + \varepsilon_{\opt}. 
\end{align*}
\end{proof}





% \subsection{The Application of FEM on Proposition \ref{prop:connection}}
% \label{subsec:application_of_fem}

% In this part, we discuss how to extend the framework in \cref{algo:framework} and \cref{prop:connection} under other metrics beyond the $\ell_1$-norm. In particular, we focus on the $\ell_2$-norm used in FEM. Under our framework, the assumption (b) becomes: with probability at least $1-\delta_{\est}$, we have
% \begin{align*}
%     \sum_{h=1}^H \lnorm \widetilde{d}^{\piE}_h - d^{\piE}_h \rnorm_2 \leq \varepsilon_{\est}.
% \end{align*}
% Besides, the assumption (c) becomes: with estimation $\widetilde{d}_h^{\piE} (s, a)$ and transition model $\widehat{P}$, the policy $\widebar{\pi}$ output by FEM satisfies
% \begin{align*}
%     \sum_{h=1}^H \lnorm \widetilde{d}^{\piE}_h - d^{\widebar{\pi}, \widehat{P}}_h   \rnorm_2 \leq \min_{\pi \in \Pi}\sum_{h=1}^H \lnorm \widetilde{d}^{\piE}_h - d^{\pi, \widehat{P}}_h   \rnorm_2 + \varepsilon_{\opt}.
% \end{align*}
% Following the same idea in the proof of Proposition \ref{prop:connection}, we can get that
% \begin{align*}
%     \left\vert V^{\piE, P} - V^{\widebar{\pi}, P} \right\vert &\leq \left\vert V^{\piE, P} - V^{\widebar{\pi}, \widehat{P}} \right\vert + \varepsilon_{\rfe} 
%     \\
%     &\leq \sum_{h=1}^H \lnorm d^{\piE, P}_h  - d^{\widebar{\pi}, \widehat{P}}_h  \rnorm_1 + \varepsilon_{\rfe}
%     \\
%     &\leq \sum_{h=1}^H \lnorm d^{\piE, P}_h  - \widetilde{d}^{\piE}_h  \rnorm_1 + \sum_{h=1}^H \lnorm \widetilde{d}^{\piE}_h  - d^{\widebar{\pi}, \widehat{P}}_h  \rnorm_1 + \varepsilon_{\rfe}.
% \end{align*}
% For an arbitrary vector $x \in \reals^n$, we have that $\lnorm x \rnorm_2 \leq \lnorm x \rnorm_1 \leq \sqrt{n} \lnorm x \rnorm_2$. Then we show that
% \begin{align*}
%     \lnorm d^{\piE, P}_h  - \widetilde{d}^{\piE}_h  \rnorm_1 \leq \sqrt{\vert \gS \vert \vert \gA \vert} \lnorm d^{\piE, P}_h  - \widetilde{d}^{\piE}_h  \rnorm_2 \leq \sqrt{\vert \gS \vert \vert \gA \vert} \varepsilon_{\est}.
% \end{align*}
% Then we continue to consider the policy value gap.
% \begin{align*}
%     \left\vert V^{\piE, P} - V^{\widebar{\pi}, P} \right\vert &\leq \sum_{h=1}^H \lnorm \widetilde{d}^{\piE}_h  - d^{\widebar{\pi}, \widehat{P}}_h  \rnorm_1 + \sqrt{\vert \gS \vert \vert \gA \vert} \varepsilon_{\est} + \varepsilon_{\rfe}
%     \\
%     &\leq \sqrt{\vert \gS \vert \vert \gA \vert} \sum_{h=1}^H \lnorm \widetilde{d}^{\piE}_h  - d^{\widebar{\pi}, \widehat{P}}_h  \rnorm_2 + \sqrt{\vert \gS \vert \vert \gA \vert} \varepsilon_{\est} + \varepsilon_{\rfe}
%     \\
%     &\leq \sqrt{\vert \gS \vert \vert \gA \vert} \lp \min_{\pi \in \Pi} \sum_{h=1}^H \lnorm \widetilde{d}^{\piE}_h  - d^{\pi, \widehat{P}}_h  \rnorm_2 +  \varepsilon_{\opt} \rp + \sqrt{\vert \gS \vert \vert \gA \vert} \varepsilon_{\est} + \varepsilon_{\rfe}.
% \end{align*}
% The last inequality holds because of assumption (c). Then we have that
% \begin{align*}
%     \left\vert V^{\piE, P} - V^{\widebar{\pi}, P} \right\vert &\leq \sqrt{\vert \gS \vert \vert \gA \vert} \sum_{h=1}^H \lnorm \widetilde{d}^{\piE}_h  - d^{\piE, \widehat{P}}_h  \rnorm_2 + \sqrt{\vert \gS \vert \vert \gA \vert} \lp  \varepsilon_{\opt} +  \varepsilon_{\est} \rp + \varepsilon_{\rfe}
%     \\
%     &\leq \sqrt{\vert \gS \vert \vert \gA \vert} \lp \sum_{h=1}^H \lnorm \widetilde{d}^{\piE}_h  - d^{\piE, P}_h  \rnorm_2 + \sum_{h=1}^H \lnorm d^{\piE, P}_h  - d^{\piE, \widehat{P}}_h  \rnorm_2  \rp + \sqrt{\vert \gS \vert \vert \gA \vert} \lp  \varepsilon_{\opt} +  \varepsilon_{\est} \rp + \varepsilon_{\rfe}
%     \\
%     &\leq \sqrt{\vert \gS \vert \vert \gA \vert} \lp \varepsilon_{\mathrm{EST}} + \sum_{h=1}^H \lnorm d^{\piE, P}_h  - d^{\piE, \widehat{P}}_h  \rnorm_2  \rp + \sqrt{\vert \gS \vert \vert \gA \vert} \lp  \varepsilon_{\opt} +  \varepsilon_{\est} \rp + \varepsilon_{\rfe}
%     \\
%     &\leq \sqrt{\vert \gS \vert \vert \gA \vert} \sum_{h=1}^H \lnorm d^{\piE, P}_h  - d^{\piE, \widehat{P}}_h  \rnorm_1 + \sqrt{\vert \gS \vert \vert \gA \vert} \lp  \varepsilon_{\opt} +  2 \varepsilon_{\est} \rp + \varepsilon_{\rfe}
%     \\
%     &\leq \sqrt{\vert \gS \vert \vert \gA \vert} \lp  \varepsilon_{\opt} +  2 \varepsilon_{\est} + \varepsilon_{\rfe} \rp + \varepsilon_{\rfe}.
% \end{align*}
% In the last inequality, we use the dual representation of $\ell_1$-norm and policy value. Furthermore, $\widehat{P}$ satisfies that for any policy $\pi \in \Pi$ and reward $r \in \gS \times \gA \rar [0, 1]$, $\vert V^{\pi, P, r} - V^{\pi, \widehat{P}, r} \vert \leq \varepsilon_{\rfe}$. 

% \begin{rem}
% Note that the additional factor $\sqrt{|\gS||\gA|}$ is partially caused by the $\ell_2$-norm. In particular, the original assumption in FEM \citep{pieter04apprentice} is that there exist certain $\phi_h, w_h \in \real^{d}$ such that $r_h(s, a) = w_h^{\top} \phi_h(s, a)$. When $\phi_h(s, a)$ is the one-hot feature used in the tabular MDP in this paper, $w_h(s, a) = r_h(s, a)$. According to our assumption that $r_h(s, a) \in [0, 1]$, such an $w_h$ satisfies $\Vert w_h \Vert_2 \leq \sqrt{|\gS| |\gA|}$, which is different from the assumption $\Vert w_h \Vert_2 \leq 1$ in \citep{pieter04apprentice}.

% % However, this mismatch may not be a big issue since the concentration rate for $\ell_2$-norm metric is faster than $\ell_1$-norm when the estimation error is small.
% \end{rem}


\subsection{Reward-free Exploration Method}


In this part, we present the RF-Express algorithm of \citet{menard20fast-active-learning} in our notation; see \cref{algo:rf_express}.

\begin{algorithm}[htbp]
\caption{RF-Express}
\label{algo:rf_express}
\begin{algorithmic}[1]
\REQUIRE{Failure probability $\delta$, function $\beta(n, \delta) = \log (3 \vert \gS \vert \vert \gA \vert H / \delta)+ \vert \gS \vert \log (8 e(n+1))$.}
\FOR{$t = 0, 1, 2, \cdots$}
\STATE{Update the counter and the empirical transition model:}
\begin{align*}
    & n^{t}_h (s, a) = \sum_{i=1}^{t} \indict \{ s^{i}_h = s, a^i_h = a \}, \; n^{t}_h (s, a, s^\prime) = \sum_{i=1}^{t} \indict \{ s^{i}_h = s, a^i_h = a, s^{i}_{h+1} = s^\prime  \},
    \\
    & \widehat{P}^{t}_h (s^\prime|s, a) = \frac{n^{t}_h (s, a, s^\prime)}{n^{t}_h (s, a)}, \; \text{if $n^{t}_h (s, a) > 0$ and } \widehat{P}^{t}_h (s^\prime|s, a) = \frac{1}{\vert \gS \vert}, \; \forall s^\prime \in \gS \text{ otherwise}. 
\end{align*}
\STATE{Define $W^{t}_{H+1} (s, a) = 0, \; \forall (s, a) \in \gS \times \gA$.}
\FOR{$h = H, H-1, \cdots, 1$}
\STATE{$W^{t}_h (s, a) = \min \left(H, 15 H^{2} \frac{\beta\left(n_{h}^{t } (s, a), \delta\right)}{n_{h}^{t } (s, a)}+\left(1+\frac{1}{H}\right) \sum_{s^{\prime} \in \gS} \widehat{P}_{h}^{t} (s^{\prime} | s, a ) \max_{a^{\prime}} W_{h+1}^{t }\left(s^{\prime}, a^{\prime}\right)\right)$.}
\ENDFOR
\STATE{Derive the greedy policy: $\pi_{h}^{t+1}(s)=\argmax_{a \in \mathcal{A}} W_{h}^{t }(s, a), \forall s \in \gS, \forall h \in[H]$.}
% \STATE{Define $W^{t }: = \expect_{x \sim \rho^{}} [ W_{1}^{t} (x, \pi_{1}^{t+1, }(x)) ]$.}
\IF{$3 e \sqrt{ W_{1}^{t} (s_1, \pi_{1}^{t+1}(s_1)) }+ W_{1}^{t} (s_1, \pi_{1}^{t+1}(s_1)) \leq \varepsilon / 2$}
\BREAK
\ENDIF
\STATE{Roll out $\pi^{t+1}$ to collect a trajectory $\tau^{t+1} = (s^{t+1}_1, a^{t+1}_1, s^{t+1}_2, a^{t+1}_2, \cdots, s^{t+1}_H, a^{t+1}_H)$.}
\ENDFOR
\ENSURE{Transition model $\widehat{P}^{t}$.}
\end{algorithmic}
\end{algorithm}



\subsection{Proof of Lemma \ref{lemma:sample_complexity_of_new_estimator_unknown_transition}}

Prior to proving \cref{lemma:sample_complexity_of_new_estimator_unknown_transition}, we first prove that the estimator shown in \eqref{eq:new_estimator_unknown_transition} is an unbiased estimator. We consider the following decomposition of $d_h^{\piE} (s, a)$.
\begin{align} 
d_h^{\piE}(s, a) &= {\sum_{\tr_h \in \Tr_h^{\gD_1}} \sP^{\piE}(\tr_h) \indict\lb \tr_h(\cdot, \cdot) = (s, a) \rb} + {\sum_{\tr_h \notin \Tr_h^{\gD_1}} \sP^{\piE}(\tr_h) \indict\lb \tr_h(\cdot, \cdot) = (s, a) \rb}  \nonumber 
\\
&= {\sum_{\tr_h \in \Tr_h^{\gD_1}} \sP^{\pi^\prime}(\tr_h) \indict\lb \tr_h(\cdot, \cdot) = (s, a) \rb} + {\sum_{\tr_h \notin \Tr_h^{\gD_1}} \sP^{\piE}(\tr_h) \indict\lb \tr_h(\cdot, \cdot) = (s, a) \rb},  \label{eq:proof_1}
\end{align}
where $\pi^{\prime} \in \Pi_{\text{BC}} \lp \gD_1 \rp$ and the last equality follows from Lemma \ref{lemma:unknown-transition-unbiased-estimation}.

% Let $\Pi_{\text{BC}} \lp \gD_1 \rp$ denote the set of policies, each of which exactly takes expert action on states contained in $\gD_{1}$. Fix $\pi \in \Pi_{\text{BC}} \lp \gD_{1} \rp$, $h \in [H]$ and $(s, a) \in \gS \times \gA$, we consider the probability $\sP^{\piE} \lp \tr_h \rp$ of a truncated trajectory $\tr_h \in \Tr^{\gD_{1}}_h$. Since $\pi$ exactly takes expert action on states contained in $\gD_{1}$, we have

% Therefore, we obtain that
% \begin{align*}
%     \sum_{\tr_h \in \Tr_h^{\gD_1}} \sP^{\piE}(\tr_h) \indict\lb \tr_h(\cdot, \cdot) = (s, a) \rb = \sum_{\tr_h \in \Tr_h^{\gD_1}} \sP^{\pi}(\tr_h) \indict\lb \tr_h(\cdot, \cdot) = (s, a) \rb,
% \end{align*}

\begin{lem} \label{lemma:unknown-transition-unbiased-estimation}
We define $\Pi_{\text{BC}} \lp \gD_1 \rp$ as the set of policies, each of which takes expert action on states contained in $\gD_{1}$. For each $\pi \in \Pi_{\text{BC}} \lp \gD_{1} \rp$, $\forall h \in [H]$ and $(s, a) \in \gS \times \gA$, we have
\begin{align*}
    \sum_{\tr_h \in \Tr_h^{\gD_1}} \sP^{\piE}(\tr_h) \indict\lb \tr_h(\cdot, \cdot) = (s, a) \rb 
    =\sum_{\tr_h \in \Tr_h^{\gD_1}} \sP^{\pi}(\tr_h) \indict\lb \tr_h(\cdot, \cdot) = (s, a) \rb.
\end{align*}
\end{lem}

\begin{proof}
The proof is based on the fact that any $\pi \in \Pi_{\text{BC}} \lp \gD_1 \rp$ takes the same actions as the expert on trajectories in $\Tr_h^{\gD_1}$. More concretely, for any $\tr_h \in \Tr_h^{\gD_1}$, we have 
\begin{align*}
    &\quad \sP^{\piE}(\tr_h) \\
    &= \rho (\tr_h(s_1)) \piE_1 \lp \tr_h(a_1)| \tr_h(s_1) \rp \prod_{\ell=1}^{h-1}  P_{\ell} \lp \tr_h(s_{\ell+1}) | \tr_h(s_{\ell}), \tr_h(a_{\ell}) \rp \piE_{\ell+1} \lp \tr_h(a_{\ell+1}) | \tr_h(s_{\ell+1}) \rp
    \\
    &= \rho (\tr_h(s_1)) \pi_1 \lp \tr_h(a_1)| \tr_h(s_1) \rp \prod_{\ell=1}^{h-1}  P_{\ell} \lp \tr_h(s_{\ell+1}) | \tr_h(s_{\ell}), \tr_h(a_{\ell}) \rp \pi_{\ell+1} \lp \tr_h(a_{\ell+1}) | \tr_h(s_{\ell+1}) \rp
    \\
    &= \sP^{\pi}(\tr_h),
\end{align*}
which completes the proof.
\end{proof}

% Recall the definition of the new estimator.
% \begin{align*}
% \widetilde{d}_h^{\piE} (s, a) = {\frac{\sum_{\tr_h \in \gD^\prime_{\mathrm{env}}} \indict \{ \tr_h (\cdot, \cdot) = (s, a), \tr_h \in \Tr_h^{\gD_1} \}}{|\gD^\prime_{\mathrm{env}}|}} + {\frac{  \sum_{\tr_h \in \gD_1^c}  \indict\{ \tr_h (\cdot, \cdot) = (s, a), \tr_h \not\in \Tr_h^{\gD_1}  \} }{|\gD_1^c|}},
% \end{align*}
% where $\gD^\prime_{\mathrm{env}}$ is the dataset collected by the policy $\pi^\prime \in \Pi_{\text{BC}} (\gD_1)$. Notice that the two terms in RHS are Monte Carlo estimations of ${\sum_{\tr_h \in \Tr_h^{\gD_1}} \sP^{\pi^\prime}(\tr_h) \indict\lb \tr_h(\cdot, \cdot) = (s, a) \rb}$ and ${\sum_{\tr_h \notin \Tr_h^{\gD_1}} \sP^{\piE}(\tr_h) \indict\lb \tr_h(\cdot, \cdot) = (s, a) \rb}$ based on the dataset $\gD^\prime_{\mathrm{env}}$ and $\gD_1^c$, respectively. Therefore, $\widetilde{d}_h^{\piE} (s, a)$ is an unbiased estimation of $d_h^{\piE}(s, a)$.  


% Similarly, we utilize the decomposition of $d_h^{\piE}(s, a)$ in \cref{eq:key_decomposition}.  
% \begin{align*}
%     d_h^{\piE}(s, a) = \sum_{\tr_h \in \Tr_h^{\gD_1}} \sP^{\piE}(\tr_h) \indict\lb  \tr_h (\cdot, \cdot) = (s, a)  \rb + \sum_{\tr_h \notin \Tr_h^{\gD_1}} \sP^{\piE}(\tr_h) \indict\lb  \tr_h (\cdot, \cdot) = (s, a)  \rb.
% \end{align*}
% Then, for any $h \in [H]$ and $(s, a) \in \gS \times \gA$, we have


% Recall that dataset $\gD^\prime_{\env}$ is collected by the policy $\pi \in \Pi_{\text{BC}} \lp \gD_{1} \rp$ with $|\gD^\prime_{\env}| = n^\prime$, and $\sum_{\tr_h \in \gD^\prime_{\env}} \indict \{ \tr_h (\cdot, \cdot) = (s, a), \tr_h \in \Tr_h^{\gD_1} \} /  | \gD^\prime_{\env} |$ is a maximum likelihood estimator for 
% \begin{align*}
%     \sum_{\tr_h \in \Tr_h^{\gD_1}} \sP^{\pi}(\tr_h) \indict \lb  \tr_h (\cdot, \cdot) = (s, a)  \rb.
% \end{align*}
% Moreover, \cref{lemma:unknown-transition-unbiased-estimation} indicates that
% \begin{align*}
%    \sum_{\tr_h \in \Tr_h^{\gD_1}} \sP^{\piE}(\tr_h) \indict \lb  \tr_h (\cdot, \cdot) = (s, a)  \rb = \sum_{\tr_h \in \Tr_h^{\gD_1}} \sP^{\pi}(\tr_h) \indict \lb  \tr_h (\cdot, \cdot) = (s, a)  \rb.
% \end{align*}
% Therefore, we obtain that $\sum_{\tr_h \in \gD^\prime_{\env}} \indict \{ \tr_h (\cdot, \cdot) = (s, a), \tr_h \in \Tr_h^{\gD_1} \} /  | \gD^\prime_{\env} |$ is a maximum likelihood estimator for
% \begin{align*}
%      \sum_{\tr_h \in \Tr_h^{\gD_1}} \sP^{\piE}(\tr_h) \indict \lb  \tr_h (\cdot, \cdot) = (s, a)  \rb.
% \end{align*}
% Let ${E^\prime}^{s}_h$ be the event that $\tr_h$ agrees with expert policy at state $s$ at time step $h$ and appears in $\Tr_h^{\gD_1}$. Formally, 
% \begin{align*}
%     {E^\prime}_h^{s} = \indict\{\tr_h (\cdot, \cdot) = (s, \piE_h (s)) \cap \tr_h \in \Tr_h^{\gD_1}\}.
% \end{align*}


Now we proceed to prove \cref{lemma:sample_complexity_of_new_estimator_unknown_transition}.
\begin{proof}[Proof of \cref{lemma:sample_complexity_of_new_estimator_unknown_transition}]
    
We aim to upper bound the estimation error $\sum_{h=1}^H \Vert \widetilde{d}^{\piE}_h - d^{\piE}_h  \Vert_{1}$. Recall the definition of the estimator $\widetilde{d}^{\piE}_h(s, a)$ in \cref{eq:new_estimator_unknown_transition}:
\begin{align*}
    \widetilde{d}_h^{\piE} (s, a) := \frac{\sum_{\tr_h \in \gD^\prime_{\env}} \indict \lb \tr_h (\cdot, \cdot) = (s, a), \tr_h \in \Tr_h^{\gD_1} \rb }{ | \gD^\prime_{\env} |} + \frac{\sum_{\tr_h \in \gD_1^{c}} \indict \lb \tr_h (\cdot, \cdot) = (s, a), \tr_h \notin \Tr_h^{\gD_1} \rb}{| \gD_1^{c} |}.
\end{align*}
Using \cref{eq:proof_1}, for any $h \in [H]$ and $(s, a) \in \gS \times \gA$, we have
\begin{align*}
    &\quad \labs \widetilde{d}^{\piE}_h(s, a) - d^{\piE}_h (s, a)  \rabs
    \\
    &\leq \labs \frac{\sum_{\tr_h \in \gD^\prime_{\env}} \indict \lb \tr_h (\cdot, \cdot) = (s, a), \tr_h \in \Tr_h^{\gD_1} \rb }{ | \gD^\prime_{\env} |} - \sum_{\tr_h \in \Tr_h^{\gD_1}} \sP^{\pi^\prime}(\tr_h) \indict\lb  \tr_h (\cdot, \cdot) = (s, a)  \rb  \rabs 
    \\
    & + \labs \frac{\sum_{\tr_h \in \gD_1^{c}} \indict \lb \tr_h (\cdot, \cdot) = (s, a), \tr_h \notin \Tr_h^{\gD_1} \rb}{| \gD_1^{c} |} - \sum_{\tr_h \notin \Tr_h^{\gD_1}} \sP^{\piE}(\tr_h) \indict\lb  \tr_h (\cdot, \cdot) = (s, a)  \rb  \rabs .
\end{align*}
Thus, we can upper bound the estimation error.
\begin{align*}
    &\quad \sum_{h=1}^H \lnorm \widetilde{d}^{\piE}_h - d^{\piE}_h  \rnorm_{1}
    \\
    &\leq \underbrace{\sum_{h=1}^H \sum_{(s, a) \in \gS \times \gA} \labs \frac{\sum_{\tr_h \in \gD^\prime_{\env}} \indict \lb \tr_h (\cdot, \cdot) = (s, a), \tr_h \in \Tr_h^{\gD_1} \rb }{ | \gD^\prime_{\env} |} - \sum_{\tr_h \in \Tr_h^{\gD_1}} \sP^{\pi^{\prime}}(\tr_h) \indict\lb  \tr_h (\cdot, \cdot) = (s, a)  \rb  \rabs}_{\text{Error A}}
    \\
    &+ \underbrace{\sum_{h=1}^H \sum_{(s, a) \in \gS \times \gA} \labs \frac{\sum_{\tr_h \in \gD_1^{c}} \indict \lb \tr_h (\cdot, \cdot) = (s, a), \tr_h \notin \Tr_h^{\gD_1} \rb}{| \gD_1^{c} |} - \sum_{\tr_h \notin \Tr_h^{\gD_1}} \sP^{\piE}(\tr_h) \indict\lb  \tr_h (\cdot, \cdot) = (s, a)  \rb  \rabs}_{\text{Error B}}. 
\end{align*}
We first analyze the term $\text{Error A}$. Trajectories in $\gD^{\prime}_{\env}$ are collected by $\pi^{\prime}$ via interacting with the environment. Thus, the estimator in $\text{Error A}$ is unbiased, i.e., for any $(s, a) \in \gS \times \gA$ and $h \in [H]$,
\begin{align*}
    \expect_{}\ls  \frac{\sum_{\tr_h \in \gD^\prime_{\env}} \indict \lb \tr_h (\cdot, \cdot) = (s, a), \tr_h \in \Tr_h^{\gD_1} \rb }{ | \gD^\prime_{\env} |}  \rs =  \sum_{\tr_h \in \Tr_h^{\gD_1}} \sP^{\pi^{\prime}}(\tr_h) \indict\lb  \tr_h (\cdot, \cdot) = (s, a)  \rb,
\end{align*}
where the expectation is taken over the randomness of collecting $\gD_{\env}^{\prime}$. The above equality holds because the stochastic processes on both sides are induced by $\pi^{\prime}$. Then we leverage Chernoff's bound to upper bound $\text{Error A}$.
\begin{lem}[Chernoff's bound \citep{vershynin2018high}]   \label{lemma:chernoff_bound}
Let $\widebar{X} = {1}/{n} \cdot \sum_{i=1}^{n} X_i$, where $X_i$ is a Bernoulli random variable with $\sP(X_i = 1) = p_i$ and $\sP(X_i = 0) = 1 - p_i$, for $i \in [n]$. Furthermore, assume these random variables are independent. Let $\mu = \expect[\widebar{X}] = {1}/{n} \cdot \sum_{i=1}^{n} p_i$. Then for $0 < t \leq 1$, 
\begin{align*}
    \sP\lp  \labs  \widebar{X} - \mu \rabs  \geq t \mu  \rp \leq 2 \exp\lp -\frac{\mu n t^2}{3}  \rp.
\end{align*}
\end{lem}
First, for each $s \in \gS$ and $h \in [H]$, for any non-expert action $a \not= \piE_h (s)$, we have that
\begin{align*}
     \sum_{\tr_h \in \Tr_h^{\gD_1}} \sP^{\pi^{\prime}}(\tr_h) \indict\lb  \tr_h (\cdot, \cdot) = (s, a)  \rb = 0.
\end{align*}
This is because on the trajectory $\tr_h \in \Tr^{\gD_1}_h$, the state $s$ in time step $h$ is covered in $\gD_1$. As a result, the BC policy $\pi^\prime$ learned from $\gD_1$ must take the expert action $\piE_h (s)$ on such a state and thus $\sP^{\pi^{\prime}}(\tr_h) \indict\lb  \tr_h (\cdot, \cdot) = (s, a)  \rb = 0$. Second, the estimator
\begin{align*}
    \frac{\sum_{\tr_h \in \gD^\prime_{\env}} \indict \lb \tr_h (\cdot, \cdot) = (s, a), \tr_h \in \Tr_h^{\gD_1} \rb }{ | \gD^\prime_{\env} |}
\end{align*}
is unbiased and non-negative almost surely. Since its expectation is zero by the first claim, for each $s \in \gS$ and $h \in [H]$, for any non-expert action $a \not= \piE_h (s)$, with probability $1$,  
\begin{align*}
    \frac{\sum_{\tr_h \in \gD^\prime_{\env}} \indict \lb \tr_h (\cdot, \cdot) = (s, a), \tr_h \in \Tr_h^{\gD_1} \rb }{ | \gD^\prime_{\env} |} = 0.
\end{align*}
Based on the above two claims, we have that
\begin{align*}
    \text{Error A} = \sum_{h=1}^H \sum_{s \in \gS} \labs \frac{\sum_{\tr_h \in \gD^\prime_{\env}} \indict \lb \tr_h (\cdot, \cdot) = (s, \piE_h (s)), \tr_h \in \Tr_h^{\gD_1} \rb }{ | \gD^\prime_{\env} |} - \sum_{\tr_h \in \Tr_h^{\gD_1}} \sP^{\pi^{\prime}}(\tr_h) \indict\lb  \tr_h (\cdot, \cdot) = (s, \piE_h (s))  \rb  \rabs. 
\end{align*}

Let ${E^\prime}^{s}_h$ be the event that $\tr_h \in \gD_{\env}^{\prime}$ agrees with expert policy at state $s$ at time step $h$ and also appears in $\Tr_h^{\gD_1}$. Formally, 
\begin{align*}
    {E^\prime}_h^{s} = \indict\{\tr_h (\cdot, \cdot) = (s, \piE_h (s)) \cap \tr_h \in \Tr_h^{\gD_1}\}.
\end{align*}

By Lemma \ref{lemma:chernoff_bound}, for each $s \in \gS$ and $h \in [H]$, with probability at least $1 - \frac{\delta}{2 |\gS| H}$ over the randomness of $\gD_{\env}^\prime$, we have
\begin{align*}
    &\quad \labs \frac{\sum_{\tr_h \in \gD^\prime_{\env}} \indict \lb \tr_h (\cdot, \cdot) = (s, \piE_h(s)), \tr_h \in \Tr_h^{\gD_1} \rb }{ | \gD^\prime_{\env} |}  - \sum_{\tr_h \in \Tr_h^{\gD_1}} \sP^{\pi^{\prime}}(\tr_h) \indict\lb  \tr_h (\cdot, \cdot) = (s, \piE_h (s))  \rb  \rabs
    \\
    &\leq \sqrt{ \sP^{\pi^{\prime}} \lp {E^\prime}^{s}_h  \rp  \frac{3 \log \lp 4 |\gS| H / \delta \rp}{n^\prime}}.
\end{align*}
By union bound, with probability at least $1-\frac{\delta}{2}$ over the randomness of $\gD^\prime_{\env}$, we have
\begin{align*}
    \text{Error A} &\leq  \sum_{h=1}^H \sum_{s \in \gS} \sqrt{ \sP^{\pi^{\prime}} \lp {E^\prime}^{s}_h  \rp  \frac{3 \log \lp 4 |\gS| H / \delta \rp}{n^\prime}}
    \\
    &\leq \sum_{h=1}^H \sqrt{|\gS|} \sqrt{\sum_{s \in \gS} \sP^{\pi^{\prime}} \lp {E^\prime}^{s}_h  \rp  \frac{3 \log \lp 4 |\gS| H / \delta \rp}{n^\prime} }
\end{align*}
The last inequality follows from the Cauchy--Schwarz inequality. It remains to upper bound $\sum_{s \in \gS}  \sP^{\pi^{\prime}}({E^\prime}_{h}^{s})$ for all $h \in [H]$. To this end, we define the event ${G^\prime}_h^{\gD_1}$ that policy $\pi^{\prime}$ visits states covered in $\gD_1$ up to time step $h$. Formally, ${G^\prime}_h^{\gD_1} = \indict\{ \forall h^{\prime} \leq h,  s_{h^{\prime}} \in \gS_{h^{\prime}} (\gD_1) \}$, where $\gS_{h}(\gD_1)$ is the set of states in $\gD_1$ at time step $h$ and $s_{h^{\prime}}$ comes from $\tr_h \in \gD_{\env}^{\prime}$. Then, for all $h \in [H]$, we have 
\begin{align*}
    \sum_{s \in \gS} \sP^{\pi^{\prime}} \lp {E^\prime}_h^{s}  \rp = \sP^{\pi^{\prime}}({G^\prime}_h^{\gD_1}) \leq \sP^{\pi^{\prime}}({G^\prime}_1^{\gD_1}) \leq 1.
\end{align*}
The first inequality holds since ${G^\prime}_h^{\gD_1} \subseteq {G^\prime}_1^{\gD_1}$ for all $h \in [H]$, and the second holds since a probability is at most one. Then we have that
\begin{align*}
    \text{Error A} \leq H \sqrt{\frac{3 |\gS| \log \lp 4 |\gS| H / \delta \rp}{n^\prime}}.
\end{align*}
When the interaction complexity satisfies that $n^\prime \gtrsim \frac{| \gS | H^{2}}{\varepsilon^2} \log\lp  \frac{|\gS| H}{\delta} \rp$, with probability at least $1-\frac{\delta}{2}$ over the randomness of $\gD^\prime_{\env}$, we have $\text{Error A} \leq \frac{\varepsilon}{2}$. 



% The last inequality follows the Cauchy-Schwartz inequality. It remains to upper bound $\sum_{s \in \gS}  \sP^{\piE}({E^\prime}_{h}^{s})$ for all $h \in [H]$. To this end, we define the event ${G^\prime}_h^{\gD_1}$ that expert policy $\piE$ visits states covered in $\gD_1$ up to time step $h$. Formally, ${G^\prime}_h^{\gD_1} = \indict\{ \forall h^{\prime} \leq h,  s_{h^{\prime}} \in \gS_{h^{\prime}} (\gD_1) \}$, where $\gS_{h}(\gD_1)$ is the set of states in $\gD_1$ at time step $h$. Then, for all $h \in [H]$, we have \begin{align*}
%     \sum_{s \in \gS} \sP^{\piE} \lp {E^\prime}_h^{s}  \rp = \sP^{\piE}({G^\prime}_h^{\gD_1}) \leq \sP({G^\prime}_1^{\gD_1}).
% \end{align*}


For the term $\text{Error B}$, we utilize \citep[Lemma A.11]{rajaraman2020fundamental}. When the expert sample complexity satisfies that $m \gtrsim \frac{|\gS| H^{3/2}}{\varepsilon} \log \lp \frac{|\gS| H}{\delta} \rp$, with probability at least $1-\frac{\delta}{2}$ over the randomness of $\gD$, we have $\text{Error B} \leq \frac{\varepsilon}{2}$. Applying union bound finishes the proof.

\end{proof}


\subsection{Proof of Lemma \ref{lemma:approximate-minimax}}
Before we prove Lemma \ref{lemma:approximate-minimax}, we first state the following key lemma.
\begin{lem}\label{lemma:regret_of_ogd}
Consider Algorithm \ref{algo:gradient_based_optimization}, we have
\begin{align*}
    \sum_{t=1}^T f^{(t)} \lp w^{(t)} \rp - \min_{w \in \gW} \sum_{t=1}^T f^{(t)} (w) \leq 2H \sqrt{2 |\gS| |\gA| T},
\end{align*}
where $f^{(t)}(w) = \sum_{h=1}^{H} \sum_{(s, a) \in \gS \times \gA} w_h(s, a) ( d^{\pi^{(t)}, \widehat{P}}_h (s, a) - \widetilde{d}^{\piE}_h (s, a) )$.
\end{lem}
\begin{proof}
Lemma \ref{lemma:regret_of_ogd} is a direct consequence of the regret bound of online gradient descent \citep{shalev12online-learning}. To apply such a regret bound, we need to verify that 1) the iterate norm $\lnorm w \rnorm_2$ has an upper bound; 2) the gradient norm $\Vert \nabla_{w}  f^{(t)}(w) \Vert_2$ also has an upper bound. The first point is easy to show, i.e., $\lnorm w \rnorm_2 \leq \sqrt{H |\gS| |\gA|}$ by the condition that $w \in \gW = \{ w = (w_1, \ldots, w_H): \Vert w_h \Vert_{\infty} \leq 1, \; \forall h \in [H] \}$. For the second point, let $\widetilde{d}^{1}_h$ and $\widetilde{d}^{2}_h$ be the first and the second part in $\widetilde{d}^{\piE}_h$ defined in \eqref{eq:new_estimator_unknown_transition}. Then, 
\begin{align*}
    \lnorm \nabla_{w} f^{(t)} (w) \rnorm_{2} &= \sqrt{\sum_{h=1}^H \sum_{(s, a) \in \gS \times \gA} \lp d^{\pi^{(t)}, \widehat{P}}_h (s, a) - \widetilde{d}^{\piE}_h (s, a) \rp^2 }
    \\
    &= \sqrt{\sum_{h=1}^H \sum_{(s, a) \in \gS \times \gA} \lp d^{\pi^{(t)}, \widehat{P}}_h (s, a) - \widetilde{d}^{1}_h(s, a) - \widetilde{d}^{2}_h(s, a) \rp^2 } 
    \\
    &\leq \sqrt{\sum_{h=1}^H  3 \sum_{(s, a) \in \gS \times \gA}   \Bigl( \lp d^{\pi^{(t)}, \widehat{P}}_h (s, a) \rp^2 + \lp \widetilde{d}^{1}_h(s, a) \rp^2 + \lp \widetilde{d}^{2}_h(s, a)  \rp^2 \Bigr) }
    \\
     &\leq \sqrt{\sum_{h=1}^H  3  \lp \lnorm d^{\pi^{(t)}, \widehat{P}}_h \rnorm_1 + \lnorm \widetilde{d}^{1}_h \rnorm_1 + \lnorm \widetilde{d}^{2}_h \rnorm_1  \rp }
    \\
    &\leq 2\sqrt{ H },
\end{align*}
where the first inequality follows $(a+b+c)^2 \leq 3(a^2+b^2+c^2)$ and the second inequality is based on that $ x ^2 \leq \vert x \vert$ if $0 \leq x \leq 1$.

Invoking Corollary 2.7 in \citep{shalev12online-learning} with $B = \sqrt{H |\gS| |\gA|}$ and $L = 2 \sqrt{H}$ finishes the proof. 
\end{proof}

\begin{proof}[Proof of \cref{lemma:approximate-minimax}]
    
 With the dual representation of $\ell_1$-norm, we have
\begin{align*}
    \min_{\pi \in \Pi} \sum_{h=1}^H \lnorm d^{\pi, \widehat{P}}_h - \widetilde{d}^{\piE}_h \rnorm_{1} = \min_{\pi \in \Pi} \max_{w \in \gW} \sum_{h=1}^H \sum_{(s, a) \in \gS \times \gA} w_{h} (s, a) \lp \widetilde{d}^{\piE}_h(s, a) - d^{\pi, \widehat{P}}_h (s, a) \rp. 
\end{align*}
Since the above objective is linear w.r.t.\ both $w$ and $d^{\pi, \widehat{P}}_h$, invoking the minimax theorem \citep{bertsekas2016nonlinear} yields
\begin{align*}
    &\quad \min_{\pi \in \Pi} \max_{w \in \gW} \sum_{h=1}^H \sum_{(s, a) \in \gS \times \gA} w_{h} (s, a) \lp \widetilde{d}^{\piE}_h(s, a) - d^{\pi, \widehat{P}}_h (s, a) \rp
    \\
    &= \max_{w \in \gW} \min_{\pi \in \Pi} \sum_{h=1}^H \sum_{(s, a) \in \gS \times \gA} w_{h} (s, a) \lp \widetilde{d}^{\piE}_h(s, a) - d^{\pi, \widehat{P}}_h (s, a) \rp
    \\
    &= - \min_{w \in \gW} \max_{\pi \in \Pi} \sum_{h=1}^H \sum_{(s, a) \in \gS \times \gA} w_h (s, a) \lp d^{\pi, \widehat{P}}_h (s, a) -  \widetilde{d}^{\piE}_h(s, a) \rp, 
\end{align*}
where the last step follows the property that for a function $f$, $- \max_{x} f(x) = \min_{x} - f(x)$. Therefore, we have
\begin{align} \label{eq:l1_dual_representation}
    \min_{\pi \in \Pi} \sum_{h=1}^H \lnorm d^{\pi, \widehat{P}}_h  - \widetilde{d}^{\piE}_h \rnorm_{1} = - \min_{w \in \gW} \max_{\pi \in \Pi} \sum_{h=1}^H \sum_{(s, a) \in \gS \times \gA} w_h (s, a) \lp d^{\pi, \widehat{P}}_h (s, a) -  \widetilde{d}^{\piE}_h(s, a) \rp.
\end{align}
Then we consider the term $\min_{w \in \gW} \max_{\pi \in \Pi} \sum_{h=1}^H \sum_{(s, a) \in \gS \times \gA} w_h (s, a) \lp d^{\pi, \widehat{P}}_h (s, a) -  \widetilde{d}^{\piE}_h(s, a) \rp$.
\begin{align*}
    &\quad \min_{w \in \gW} \max_{\pi \in \Pi} \sum_{h=1}^H \sum_{(s, a) \in \gS \times \gA} w_h (s, a) \lp d^{\pi, \widehat{P}}_h (s, a) -  \widetilde{d}^{\piE}_h(s, a) \rp
    \\
    &\leq \max_{\pi \in \Pi} \sum_{h=1}^H \sum_{(s, a) \in \gS \times \gA} \lp \frac{1}{T} \sum_{t=1}^T w^{(t)}_h (s, a) \rp \lp d^{\pi, \widehat{P}}_h (s, a) -  \widetilde{d}^{\piE}_h(s, a) \rp
    \\
    &\leq \frac{1}{T} \sum_{t=1}^T \max_{\pi \in \Pi} \sum_{h=1}^H \sum_{(s, a) \in \gS \times \gA} w^{(t)}_h (s, a) \lp d^{\pi, \widehat{P}}_h (s, a) -  \widetilde{d}^{\piE}_h(s, a) \rp. 
\end{align*}
At iteration $t$, $\pi^{(t)}$ is the approximately optimal policy regarding reward function $w^{(t)}$ with an optimization error of $\varepsilon_{\rl}$. Then we obtain that
\begin{align*}
    &\quad \frac{1}{T} \sum_{t=1}^T \max_{\pi \in \Pi} \sum_{h=1}^H \sum_{(s, a) \in \gS \times \gA} w^{(t)}_h (s, a) \lp d^{\pi, \widehat{P}}_h (s, a) -  \widetilde{d}^{\piE}_h(s, a) \rp
    \\
    &\leq \frac{1}{T} \sum_{t=1}^T \sum_{h=1}^H \sum_{(s, a) \in \gS \times \gA} w^{(t)}_h (s, a) \lp d^{\pi^{(t)}, \widehat{P}}_h (s, a) -  \widetilde{d}^{\piE}_h(s, a) \rp + \varepsilon_{\rl}.
\end{align*}
Applying Lemma \ref{lemma:regret_of_ogd} yields that
\begin{align*}
    &\quad \frac{1}{T} \sum_{t=1}^T \sum_{h=1}^H \sum_{(s, a) \in \gS \times \gA} w^{(t)}_h (s, a) \lp d^{\pi^{(t)}, \widehat{P}}_h (s, a) -  \widetilde{d}^{\piE}_h(s, a) \rp
    \\
    & \leq \min_{w \in \gW} \frac{1}{T} \sum_{t=1}^T \sum_{h=1}^H \sum_{(s, a) \in \gS \times \gA} w_h (s, a) \lp d^{\pi^{(t)}, \widehat{P}}_h (s, a) -  \widetilde{d}^{\piE}_h(s, a) \rp + 2H \sqrt{ \frac{2 |\gS| |\gA|}{T} }
    \\
    &= \min_{w \in \gW}  \sum_{h=1}^H \sum_{(s, a) \in \gS \times \gA} w_h (s, a) \lp \frac{1}{T} \sum_{t=1}^T d^{\pi^{(t)}, \widehat{P}}_h (s, a) -  \widetilde{d}^{\piE}_h(s, a) \rp + 2H \sqrt{ \frac{2 |\gS| |\gA|}{T} }
    \\
    &= \min_{w \in \gW}  \sum_{h=1}^H \sum_{(s, a) \in \gS \times \gA} w_h (s, a) \lp d^{\widebar{\pi}, \widehat{P}}_h (s, a) -  \widetilde{d}^{\piE}_h(s, a) \rp + 2H \sqrt{ \frac{2 |\gS| |\gA|}{T} }.
\end{align*}
Note that $\widebar{\pi}$ is induced by the mean state-action distribution, i.e., $\widebar{\pi}_h (a|s) = \widebar{P}_h(s, a) / \sum_{a^\prime} \widebar{P}_h(s, a^\prime)$, where $\widebar{P}_h (s, a) = \frac{1}{T} \sum_{t=1}^T d^{\pi^{(t)}, \widehat{P}}_h (s, a)$. Based on Proposition 3.1 in \citep{ho2016gail}, we have that $d^{\widebar{\pi}, \widehat{P}}_h (s, a) = \widebar{P}_h (s, a)$, and hence the last equation holds. Combined with \cref{eq:l1_dual_representation}, we have that
\begin{align*}
    &\quad \min_{\pi \in \Pi} \sum_{h=1}^H \lnorm d^{\pi, \widehat{P}}_h - \widetilde{d}^{\piE}_h \rnorm_{1}
    \\
    &\geq - \min_{w \in \gW}  \sum_{h=1}^H \sum_{(s, a) \in \gS \times \gA} w_h (s, a) \lp d^{\widebar{\pi}, \widehat{P}}_h (s, a) -  \widetilde{d}^{\piE}_h(s, a) \rp - 2H \sqrt{ \frac{2 |\gS| |\gA|}{T} } - \varepsilon_{\rl}
    \\
    &= \max_{w \in \gW} \sum_{h=1}^H \sum_{(s, a) \in \gS \times \gA} w_h (s, a) \lp  \widetilde{d}^{\piE}_h(s, a) - d^{\widebar{\pi}, \widehat{P}}_h (s, a)  \rp - 2H \sqrt{ \frac{2 |\gS| |\gA|}{T} } - \varepsilon_{\rl}
    \\
    &= \sum_{h=1}^H \lnorm \widetilde{d}^{\piE}_h - d^{\widebar{\pi}, \widehat{P}}_h \rnorm_{1} - 2H \sqrt{ \frac{2 |\gS| |\gA|}{T}} - \varepsilon_{\rl},
\end{align*}
where the last step again utilizes the dual representation of $\ell_1$-norm.  If we take $\varepsilon_{\rl} \leq \varepsilon/2$, $T \gtrsim H^2 |\gS||\gA|/\varepsilon^2$ and $\eta^{(t)} := \sqrt{|\gS||\gA| / (8T)}$, then we have
\begin{align*}
\sum_{h=1}^H \lnorm d^{\widebar{\pi}, \widehat{P}}_h - \widetilde{d}^{\piE}_h \rnorm_{1} \leq \min_{\pi \in \Pi} \sum_{h=1}^H \lnorm d^{\pi, \widehat{P}}_h - \widetilde{d}^{\piE}_h \rnorm_{1} + \varepsilon. 
\end{align*} 
We complete the proof.
\end{proof}

\subsection{Proof of Theorem \ref{theorem:sample-complexity-unknown-transition}}


\begin{proof}[Proof of Theorem \ref{theorem:sample-complexity-unknown-transition}]

Firstly, we verify assumption (a) in Proposition \ref{prop:connection}. With \cref{lem:reward_free}, when the number of trajectories collected by \textnormal{RF-Express} satisfies
\begin{align*}
    n \gtrsim  \frac{H^{3} |\gS| |\gA| }{\varepsilon^2}    \lp |\gS| + \log\lp\frac{|\gS| H}{\delta} \rp \rp,
\end{align*}
for any policy $\pi \in \Pi$ and reward function $w : \gS \times \gA \rar [0, 1]$, with probability at least $1-\delta/2$, $| V^{\pi, P, w} - V^{\pi, \widehat{P}, w} | \leq \varepsilon / 16 = \varepsilon_{\rfe}$. In a word, the assumption (a) in Proposition \ref{prop:connection} holds with $\delta_{\mathrm{RFE}} = \delta / 2$ and $\varepsilon_{\mathrm{RFE}} = \varepsilon / 16$. 


Secondly, we note that the assumption (b) in Proposition \ref{prop:connection} holds by Lemma \ref{lemma:sample_complexity_of_new_estimator_unknown_transition}. More concretely, if the expert sample complexity and interaction complexity satisfy
\begin{align*}
    m \gtrsim   \frac{H^{3/2} | \gS | }{\varepsilon} \log\lp  \frac{|\gS| H}{\delta} \rp, \; n^\prime \gtrsim \frac{H^{2} | \gS |}{\varepsilon^2} \log\lp  \frac{|\gS| H}{\delta} \rp,
\end{align*}
with probability at least $1-\delta/2$, $\sum_{h=1}^H \Vert \widetilde{d}^{\piE}_h - d^{\piE}_h  \Vert_{1} \leq \varepsilon / 16 = \varepsilon_{\est}$. Hence, the assumption (b) in Proposition \ref{prop:connection} holds with $\delta_{\mathrm{EST}} = \delta / 2$ and $\varepsilon_{\mathrm{EST}} = \varepsilon / 16$.

Thirdly, we aim to verify that the assumption (c) in Proposition \ref{prop:connection} holds with $\widetilde{d}^{\piE}_h (s, a)$ and $\widehat{P}$. When $\varepsilon_{\rl} \leq \varepsilon / 2$ and $T \gtrsim |\gS| |\gA| H^2 / \varepsilon^2$ such that $2 H \sqrt{2 |\gS| |\gA| / T} \leq \varepsilon / 4$, we have that
\begin{align*}
    \sum_{h=1}^H \lnorm d^{\widebar{\pi}, \widehat{P}}_h - \widetilde{d}^{\piE}_h \rnorm_{1} - \min_{\pi \in \Pi} \sum_{h=1}^H \lnorm d^{\pi, \widehat{P}}_h - \widetilde{d}^{\piE}_h \rnorm_{1} \leq \frac{3\varepsilon}{4}  = \varepsilon_{\opt}.
\end{align*}
Therefore, the assumption (c) in Proposition \ref{prop:connection} holds with $\varepsilon_{\opt} = 3\varepsilon / 4$. Now, we summarize the conditions that we have obtained.
\begin{itemize}
    \item The assumption (a) in Proposition \ref{prop:connection} holds with $\delta_{\mathrm{RFE}} = \delta / 2$ and $\varepsilon_{\mathrm{RFE}} = \varepsilon / 16$.
    \item The assumption (b) in Proposition \ref{prop:connection} holds with $\delta_{\mathrm{EST}} = \delta / 2$ and $\varepsilon_{\mathrm{EST}} = \varepsilon / 16$.
    \item The assumption (c) in Proposition \ref{prop:connection} holds with $\varepsilon_{\opt} = 3\varepsilon / 4$. 
\end{itemize}
Applying Proposition \ref{prop:connection} finishes the proof. With probability at least $1-\delta$,
\begin{align*}
    V^{\piE} - V^{\widebar{\pi}} \leq 2 \varepsilon_{\mathrm{RFE}} + 2 \varepsilon_{\mathrm{EST}} + \varepsilon_{\opt} = \varepsilon.  
\end{align*}

\end{proof}

% \begin{rem}
% We conjecture that the dependence of the interaction complexity on $\vert \gS \vert$ and $H$ is tight in the worst case. This is because that IL without a known transition is intrinsically harder than online RL problem and the minimax optimal complexity of online RL is already $\widetilde{\gO} \lp H^3 \vert \gS \vert \vert \gA \vert / \varepsilon^2 \rp$~\citep{menard20fast-active-learning}. As for an instance dependent complexity, one observation is that the reward functions in AIL are not arbitrary as in reward-free exploration. If we can leverage this property, we may derive sharper results.    
% \end{rem}


\section{Proof of Results in Section \ref{sec:mbtail_state_abstraction}}
\label{sec:proof_of_mbtail_state_abstraction}

\subsection{Problem Setup} 

To facilitate later analysis, we introduce some useful notations widely used in the literature \citep{li2006towards, jiang2015abstraction}. In this part, for a function $f$ that operates on the original state space $\gS$, we add a superscript $\phi$ (i.e., $f^{\phi}$) to denote the counterpart that operates on the abstract state space $\Phi$. Inversely, for a function $f^{\phi}$ that operates on the abstract state space, we use $[f^{\phi}]^{M}$ to denote its lifted version, which is defined as $[f^{\phi}]^{M} (s) = f^{\phi} (\phi (s))$. Notice that $[f^{\phi}]^{M}$ is a function over $\gS$.



\begin{defn}[Abstract MDP]
\label{def:abstract_mdp}
Under \cref{asmp:state_abstraction}, for the original MDP $\gM = (\gS, \gA, P, r, H, \rho)$, we define the abstract MDP $\gM^{\phi} = (\Phi, \gA, P^{\phi}, r^{\phi}, H, \rho^{\phi})$. In particular, 
\begin{itemize}
    \item $P^{\phi}_h (x^\prime | x, a) = \sum_{s^\prime \in \phi^{-1}_{h+1} (x^\prime)} P_h (s^\prime|s, a)$, for an arbitrary $s \in \phi^{-1}_h (x)$.
    \item $r^{\phi}_h (x, a) = r_h (s, a)$, for an arbitrary $s \in \phi^{-1}_h (x)$.
    \item $\rho^{\phi} (x) = \sum_{s \in \phi^{-1}_1 (x)} \rho (s)$.
\end{itemize}
Here $\phi^{-1}_h (x) = \{ s \in \gS: \phi_h (s) = x \}$. 
\end{defn}
We clarify that there is no ambiguity in \cref{def:abstract_mdp} because of \cref{asmp:state_abstraction}. The bisimulation condition enables that $s \in \phi_h^{-1}(x)$ are equivalent under the reward-consistent and transition-consistent conditions. With the abstract MDP $\gM^{\phi}$, for any abstract policy $\pi^{\phi}$, we utilize $V^{\pi^{\phi}, \gM^{\phi}}_h (x)$ to denote the corresponding value function. Similarly, with the original MDP $\gM$, for any policy $\pi$, we use $V^{\pi, \gM}_h (s)$ to denote the corresponding value function. 


\begin{defn}[Abstract Expert Policy]
\label{def:abstract_expert_policy}
    Under \cref{asmp:state_abstraction}, for the original expert policy $\piE$, we define the abstract expert policy $\pi^{\expert, \phi}$. In particular, for any $(x, h) \in \Phi \times [H]$, it holds that
    \begin{align*}
        \pi^{\expert, \phi}_h (x) = \piE_h (s), \; \text{for an arbitrary } s \in \phi^{-1}_h (x). 
    \end{align*}
\end{defn}







Besides, for any policy $\pi \in \Pi$, we utilize $d^{\pi, \phi}_h \in \Delta (\Phi \times \gA)$ to denote the abstract state-action distribution.
\begin{align*}
    d^{\pi, \phi}_h (x, a) = \sP^{\pi} ( \phi_h (s_h) = x, a_h = a | P) = \sum_{s \in \phi^{-1}_h (x)} d^{\pi}_h (s, a). 
\end{align*}
For any abstract policy $\pi^{\phi} \in \Pi^{\phi}$ and abstract transition function $P^{\phi}$, we utilize $d^{\pi^{\phi}, P^{\phi}}_h \in \Delta (\Phi \times \gA)$ to denote the abstract state-action distribution induced by $\pi^{\phi}$ in $P^{\phi}$. In particular,
\begin{align*}
    d^{\pi^{\phi}, P^{\phi}}_h (x, a) = \sP^{\pi^{\phi}} (x_h = x, a_h = a| P^{\phi}).
\end{align*}


\subsection{MB-TAIL with State Abstraction}
\label{subsec:appendix_mbtail_with_state_abstraction_algorithm}



Before presenting MB-TAIL with state abstraction, we first develop a meta-algorithm for AIL with state abstractions when the transition function is unknown. 


% Then, we leverage analysis techniques in state abstraction \citep{li2006towards, li2009unifying, jiang2015abstraction} to transfer the results from the abstract space to the original one. 




\begin{algorithm}[htbp]
\caption{Meta-algorithm for AIL with State Abstractions and Unknown Transitions}
\label{algo:framework_state_abstraction}
\begin{algorithmic}[1]
\REQUIRE{Expert demonstrations $\gD$, a set of state abstractions $\{ \phi_h \}_{h=1}^H$.}
\STATE{$\widehat{P}^{\phi} \lar$ Invoke a reward-free exploration method to collect $n$ trajectories and learn an \emph{abstract} transition model. \label{alg_line:reward_free}}
\STATE{$\widetilde{d}_h^{\piE, \phi} \lar $ Estimate the \emph{abstract} expert state-action distribution. \label{alg_line:estimation}}
\STATE{$\widebar{\pi}^{\phi} \lar$ Apply an AIL approach to perform imitation with the expert estimation $\widetilde{d}_h^{\piE, \phi}$ under transition model $\widehat{P}^{\phi}$. \label{alg_line:optimization}}
\ENSURE{Policy $[\widebar{\pi}^{\phi}]^{M}$.}
\end{algorithmic}
\end{algorithm}

% Now, we develop a meta-algorithm (\cref{algo:framework_state_abstraction}) for AIL with state abstractions when the transition function is unknown.






In the sequel, we present three main algorithmic designs that appeared in Line \ref{alg_line:reward_free}, Line \ref{alg_line:estimation} and Line \ref{alg_line:optimization} in Algorithm \ref{algo:framework_state_abstraction} in the setting with state abstraction.


\textbf{The Transition-aware Estimator with State Abstraction. } Here we present the transition-aware estimator with state abstraction. The key idea of the construction of the estimator is similar to that discussed in \cref{subsec:transition_aware_estimator}. However, unlike the original estimator in \eqref{eq:new_estimator_unknown_transition}, the transition-aware estimator with state abstraction is a distribution over the abstract space $\Phi \times \gA$. We present our adaptations to the setting with state abstraction in the following part.    




Similar to the procedure presented in \cref{subsec:transition_aware_estimator}, we randomly divide the expert dataset into two equal parts, i.e., $\gD = \gD_1 \cup \gD_1^{c}$ and $\gD_1 \cap \gD_1^{c} = \emptyset$ with $|\gD_1 |=  |\gD_1^{c}| = m / 2$. First, with state abstractions $\{ \phi_h \}_{h=1}^H$, we apply BC on $\gD_1$ to learn the abstract policy $\pi^{\prime, \phi}$.
\begin{align}
\label{eq:bc_policy_state_abstraction}
\pi^{\prime, \phi}_h(a|x) = \left\{ \begin{array}{ll}
      \frac{n_h^{1}(x, a)}{n_h^{1}(x)}  & \text{if } n_h^{1}(x) > 0  \\
        \frac{1}{|\gA|} & \text{otherwise}  
    \end{array} \right.
\end{align}
Here $n^1_h(x, a) = \sum_{\tr \in \gD_1} \indict \{ \phi_h (\tr_h (\cdot)) = x, \tr_h (a_h) = a \}$ and $n^1_h(x) = \sum_{\tr \in \gD_1} \indict \{ \phi_h (\tr_h (\cdot)) = x \}$. Intuitively, $n^1_h(x, a)$ ($n^1_h(x)$) is the number of abstract-state-action (abstract state) pairs that appeared in $\gD_1$ in step $h$.


Second, we utilize the lifted policy $[\pi^{\prime, \phi}]^M$ to interact with the environment to collect a new dataset $\gD_{\env}^\prime$. Notice that $[\pi^{\prime, \phi}]^M$ is a policy defined in the original state space $\gS$. Finally, we can establish the following estimator with state abstractions $\{ \phi_h \}_{h=1}^H$.
\begin{align}
    \widetilde{d}^{\piE, \phi}_h (x, a) &= \frac{\sum_{\tr_h \in \gD^{\prime}_{\env}} \indict \{ \phi_h (\tr_h (\cdot)) = x, \tr_h (a_h) = a, \tr_h \in \Tr^{\gD_1, \phi}_h   \}}{\vert \gD^{\prime}_{\env} \vert} \nonumber 
    \\
    &+ \frac{\sum_{\tr_h \in \gD_1^c} \indict \{ \phi_h (\tr_h (\cdot)) = x, \tr_h (a_h) = a, \tr_h \not\in  \Tr^{\gD_1, \phi}_h\}}{\labs \gD^{c}_1 \rabs}. \label{eq:new_estimator_unknown_transition_state_abstraction}
\end{align}
Here
\begin{align*}
     \Tr^{\gD, \phi}_h = \{ \tr_h = (s_1, a_1, \ldots, s_h, a_h): \phi_{\ell} (s_{\ell}) \in \Phi_{\ell} (\gD), \forall \ell \in [h] \}, \; \Phi_h (\gD) = \{ x \in \Phi: \exists \tr \in \gD, \phi_h (\tr_h (\cdot)) = x\}.
\end{align*}
Intuitively, $\Phi_h (\gD)$ is the set of abstract states visited in $\gD$ in time step $h$. $\Tr^{\gD, \phi}_h$ is the set of truncated trajectories of length $h$, along which each abstract state is visited in $\gD$.


\textbf{Reward-free Exploration with State Abstraction.} In this part, we adapt the reward-free exploration method RF-Express to the setting with state abstraction; see \cref{algo:rf_express_state_abstraction}. The main difference is that we learn the abstract transition model and abstract exploration policy. Nevertheless, when interacting with the original environment, we need to transfer the abstract policy $\pi^{\phi, t+1}$ to the lifted version $[\pi^{\phi, t+1}]^{M}$.  




\begin{algorithm}[htbp]
\caption{RF-Express with State Abstraction}
\label{algo:rf_express_state_abstraction}
\begin{algorithmic}[1]
\REQUIRE{A set of state abstractions $\{ \phi_h \}_{h=1}^H$, failure probability $\delta$, and function $\beta(n, \delta) = \log (3 \vert \Phi \vert \vert \gA \vert H / \delta)+ \vert \Phi \vert \log (8 e(n+1))$.}
\FOR{$t = 0, 1, 2, \cdots$}
\STATE{Update the abstract counter and abstract empirical transition model:}
\begin{align*}
    & n^{t}_h (x, a) = \sum_{i=1}^{t} \indict \{ \phi_h (s^{i}_h) = x, a^i_h = a \}, \; n^{t}_h (x, a, x^\prime) = \sum_{i=1}^{t} \indict \{ \phi_h (s^{i}_h) = x, a^i_h = a, \phi_{h+1} (s^{i}_{h+1}) = x^\prime  \},
    \\
    & \widehat{P}^{\phi, t}_h (x^\prime|x, a) = \frac{n^{t}_h (x, a, x^\prime)}{n^{t}_h (x, a)}, \; \text{if $n^{t}_h (x, a) > 0$ and } \widehat{P}^{\phi, t}_h (x^\prime|x, a) = \frac{1}{\vert \Phi \vert}, \; \forall x^\prime \in \Phi \text{ otherwise}. 
\end{align*}
\STATE{Define $W^{t}_{H+1} (x, a) = 0, \; \forall (x, a) \in \Phi \times \gA$.}
\FOR{$h = H, H-1, \cdots, 1$}
\STATE{$W^{t}_h (x, a) = \min \left(H, 15 H^{2} \frac{\beta\left(n_{h}^{t } (x, a), \delta\right)}{n_{h}^{t } (x, a)}+\left(1+\frac{1}{H}\right) \sum_{x^{\prime} \in \Phi} \widehat{P}_{h}^{\phi, t} (x^{\prime} | x, a ) \max_{a^{\prime}} W_{h+1}^{t }\left(x^{\prime}, a^{\prime}\right)\right)$.}
\ENDFOR
\STATE{Derive the greedy policy: $\pi_{h}^{\phi, t+1}(x)=\argmax_{a \in \mathcal{A}} W_{h}^{ t}(x, a), \forall x \in \Phi, \forall h \in[H]$.}
% \STATE{Define $W^{t }: = \expect_{x \sim \rho^{}} [ W_{1}^{t} (x, \pi_{1}^{t+1, }(x)) ]$.}
\IF{$3 e \sqrt{ W_{1}^{t} ( \phi_1(s_1), \pi_{1}^{\phi, t+1} (\phi_1 (s_1))) }+ W_{1}^{t} (\phi_1(s_1), \pi_{1}^{\phi, t+1}( \phi_1 (s_1))) \leq \varepsilon / 2$}
\BREAK
\ENDIF
\STATE{Rollout $[\pi^{\phi, t+1}]^{M}$ to collect a trajectory $\tau^{t+1} = (s^{t+1}_1, a^{t+1}_1, s^{t+1}_2, a^{t+1}_2, \cdots, s^{t+1}_H, a^{t+1}_H)$. \label{alg_line:data_collection}}
\ENDFOR
\ENSURE{Transition model $\widehat{P}^{\phi, t}$.}
\end{algorithmic}
\end{algorithm}


\textbf{Gradient-based Optimization.} For Line \ref{alg_line:optimization} in Algorithm \ref{algo:framework_state_abstraction}, we aim to solve the following state-action distribution matching problem.
\begin{align*}
        \min_{\pi^\phi \in \Pi^{\phi}} \sum_{h=1}^H \lnorm \widetilde{d}^{\piE, \phi}_h - d^{\pi^{\phi}, \widehat{P}^{\phi}}_h \rnorm_1.
\end{align*}
Notice that this is precisely the optimization problem of projecting $\widetilde{d}^{\piE, \phi}_h$ on the set of all feasible \emph{abstract} state-action distributions. We can still apply \cref{algo:gradient_based_optimization} with inputs of $\widehat{P}^{\phi}$ and $\widetilde{d}^{\piE, \phi}_h$ to solve this optimization problem.



Finally, we combine the above three algorithmic designs under the developed framework (\cref{algo:framework_state_abstraction}), which yields the final algorithm.




\begin{algorithm}[htbp]
\caption{Model-based Transition-aware AIL with State Abstractions}
\label{algo:mbtail-state-abstraction}
\begin{algorithmic}[1]
\REQUIRE{Expert demonstrations $\gD$, and a set of state abstractions $\{ \phi_h \}_{h=1}^H$. }
\STATE{Randomly split $\gD$ into two equal parts: $\gD = \gD_1 \cup \gD_1^{c}$.}
\STATE{Learn an abstract policy $\pi^{\prime, \phi} \in \Pi_{\text{BC}} \lp \gD_{1} \rp$ by BC with $\{ \phi_h \}_{h=1}^H$ and roll out $[\pi^{\prime, \phi}]^{M}$ to obtain dataset $\gD_{\env}^\prime$ with $|\gD_{\env}^\prime| = n^{\prime}$ \label{alg_line:mbtail-state-abstraction-rollout-bc}}.
\STATE{Obtain the abstract estimator $\widetilde{d}_h^{\piE, \phi}$ in \eqref{eq:new_estimator_unknown_transition_state_abstraction} with $\gD$, $\gD_{\env}^\prime$ and $\{ \phi_h \}_{h=1}^H$.}
\STATE{Invoke \cref{algo:rf_express_state_abstraction} to collect $n$ trajectories and learn an abstract empirical transition function $\widehat{P}^{\phi}$.}
\STATE{$\widebar{\pi}^{\phi} \lar$ Apply \cref{algo:gradient_based_optimization} with the estimation $\widetilde{d}_h^{\piE, \phi}$ under transition model $\widehat{P}^{\phi}$.}
\ENSURE{Policy $[\widebar{\pi}^{\phi}]^{M}$.}
\end{algorithmic}
\end{algorithm}




\subsection{Proof of Theorem \ref{theorem:sample-complexity-unknown-transition-state-abstraction}}



% We give the definition of reward-free exploration in the abstract model, which is analogous to \cref{defn:reward_free}. 
% \begin{defn}   \label{defn:reward_free_abstract_model}
% Given an abstract MDP $\gM^{\phi}$ without reward function $r^{\phi}$, an algorithm is said to be $(\varepsilon, \delta)$-PAC for reward-free exploration (RFE) in the abstract model if 
% \begin{align*}
%     \sP(&\text{for any abstract reward function $r^{\phi} = (r^{\phi}_1, \ldots, r^{\phi}_H), \; r^{\phi}_h: \Phi \times \gA \rar [0, 1] $}, |V^{\pi^*_{r^\phi}, P^{\phi}} - V^{\widehat{\pi}_{r^\phi}^*, P^{\phi}}| \leq \varepsilon) \geq 1 - \delta,
% \end{align*}
% where $\pi^*_{r^\phi}$ is the abstract optimal policy in the abstract MDP with the reward function $r^{\phi}$, and $\widehat{\pi}_{r^\phi}^*$ is the optimal policy in the abstract MDP with the learned transition model $\widehat{P}^{\phi}$ by RFE and reward function $r^\phi$.
% \end{defn}
% Similarly, reward-free exploration also enables uniform policy evaluation with respect to any abstract reward function in the abstract MDP.


% \begin{lem}  \label{lem:uniform_policy_evaluation_abstract_model}
% An $(\varepsilon, \delta)$-PAC reward-free exploration (RFE) method enables the uniform policy evaluation property, i.e., with probability at least $1-\delta$, for any abstract reward function $r^{\phi}$ and abstract policy $\pi^{\phi}$, we have 
% \begin{align*}
%    \labs  V^{\pi^{\phi}, P^{\phi}, r^{\phi}} - V^{\pi^{\phi}, \widehat{P}^{\phi}, r^{\phi}}  \rabs \leq \varepsilon,
% \end{align*}
% where $V^{\pi^{\phi}, P^{\phi}, r^{\phi}}$ and $V^{\pi^{\phi}, \widehat{P}^{\phi}, r^{\phi}}$ are the policy values of abstract policy $\pi^{\phi}$ with reward function $r^{\phi}$ under the real abstract transition model $P^{\phi}$ and recovered abstract transition model $\widehat{P}^{\phi}$ by RFE, respectively.
% \end{lem}
Prior to proving \cref{theorem:sample-complexity-unknown-transition-state-abstraction}, we provide a theoretical guarantee for the meta-algorithm presented in \cref{algo:framework_state_abstraction}. The algorithm constructs an abstract transition model, an abstract state-action distribution and an abstract policy. Finally, the algorithm outputs a policy that can operate in the original state space. To accomplish this, we introduce specialized analysis tools to connect these concepts in both the original and abstract spaces.      

\begin{prop}   \label{prop:connection_state_abstraction}
Suppose that 
\begin{itemize}  % \vspace{-0.15cm}
    \item[(a)] an algorithm A solves the reward-free exploration problem on the \emph{abstract} MDP $\gM^{\phi}$ (see Definition \ref{defn:reward_free}) up to an error $\varepsilon_{\rfe}$ with probability at least $1-\delta_{\rfe}$.
    \item[(b)] an algorithm B has an \emph{abstract} state-action distribution estimator for $d^{\piE, \phi}_h$, which satisfies $\sum_{h=1}^H \Vert \widetilde{d}^{\piE, \phi}_h - d^{\piE, \phi}_h  \Vert_{1} \leq \varepsilon_{\est}$, with probability at least $1-\delta_{\est}$;
    \item[(c)] with the \emph{abstract} transition model in (a) and the \emph{abstract} estimator in (b), an algorithm C solves the following optimization problem up to an error $\varepsilon_{\opt}$.
    \begin{align}
    \label{eq:ail_with_model_state_abstraction}
        \min_{\pi^\phi \in \Pi^{\phi}} \sum_{h=1}^H \lnorm \widetilde{d}^{\piE, \phi}_h - d^{\pi^{\phi}, \widehat{P}^{\phi}}_h \rnorm_1,
    \end{align}
    where $\Pi^{\phi}$ is the set of all abstract policies and $d^{\pi^{\phi}, \widehat{P}^{\phi}}_h$ is the abstract state-action distribution induced by the abstract policy $\pi^{\phi}$ and abstract transition function $\widehat{P}^{\phi}$.
\end{itemize}
% \vspace{-0.15cm}
Then applying algorithms A, B and C under the framework in Algorithm \ref{algo:framework_state_abstraction} returns a policy $[\widebar{\pi}^{\phi}]^{M}$ whose policy value gap (i.e., $V^{\piE} - V^{[\widebar{\pi}^{\phi}]^{M}}$) is at most $2 \varepsilon_{\est} + 2 \varepsilon_{\rfe} + \varepsilon_{\opt}$, with probability at least $1-\delta_{\est} - \delta_{\rfe}$.
\end{prop}
\begin{proof}
The proof idea is similar to that in \cref{subsec:proof_of_proposition_connection}. Additionally, we leverage the analysis techniques in state abstraction. We want to upper bound the imitation gap $V^{\piE, \gM} - V^{[\widebar{\pi}^{\phi}]^{M}, \gM}$, where $V^{\pi, \gM}$ represents the policy value of $\pi$ on the original MDP $\gM$. We consider the following two events.
\begin{align*}
    & E_{\est} = \lb \sum_{h=1}^H \lnorm \widetilde{d}^{\piE, \phi}_h - d^{\piE, \phi}_h  \rnorm_1 \leq \varepsilon_{\est}  \rb
    \\
    & E_{\rfe} = \lb \forall \pi^{\phi} \in \Pi^{\phi}, \; r^{\phi} = (r^{\phi}_1, \ldots, r^{\phi}_H), \; r^{\phi}_h: \Phi \times \gA \rar [0, 1] , \; | V^{\pi^{\phi}, P^{\phi}, r^{\phi}} - V^{\pi^{\phi}, \widehat{P}^{\phi}, r^{\phi}} | \leq \varepsilon_{\rfe} \rb. 
\end{align*}
With condition $(a)$ and condition $(b)$, we obtain $\sP (E_{\est} \cap E_{\rfe}) \geq 1 - \delta_{\rfe} - \delta_{\est}$. The following analysis is established on the event $E_{\est} \cap E_{\rfe}$.





By \cref{lemma:policy_value_irrelevant}, we have $V^{[\widebar{\pi}^{\phi}]^{M}, \gM} = V^{\widebar{\pi}^{\phi}, \gM^{\phi}}$, where $\gM^{\phi}$ is the abstract MDP in \cref{def:abstract_mdp}. Then we can upper bound the term $V^{\piE, \gM} - V^{\widebar{\pi}^{\phi}, \gM^{\phi}}$. On the event $E_{\rfe}$, we further have
\begin{align*}
    V^{\piE, \gM} - V^{\widebar{\pi}^{\phi}, \gM^{\phi}} &\leq  V^{\piE, \gM} - V^{\widebar{\pi}^{\phi}, \widehat{P}^{\phi}, r^{\phi}} + \varepsilon_{\rfe}
    \\
    &= \sum_{h=1}^H \sum_{(s, a) \in \gS \times \gA} d^{\piE}_h (s, a) r_h (s, a) - \sum_{h=1}^H \sum_{(x, a) \in \Phi \times \gA} d^{\widebar{\pi}^{\phi}, \widehat{P}^{\phi}}_h (x, a) r^{\phi}_h (x, a)+ \varepsilon_{\rfe}
    \\
    &= \sum_{h=1}^H \sum_{(x, a) \in \Phi \times \gA} d^{\piE, \phi}_h (x, a) r^{\phi}_h (x, a) - \sum_{h=1}^H \sum_{(x, a) \in \Phi \times \gA} d^{\widebar{\pi}^{\phi}, \widehat{P}^{\phi}}_h (x, a) r^{\phi}_h (x, a)+ \varepsilon_{\rfe} . 
\end{align*}
Here $d^{\widebar{\pi}^{\phi}, \widehat{P}^{\phi}}_h$ is the abstract state-action distribution of $\widebar{\pi}^{\phi}$ in $\widehat{P}^{\phi}$ and $d^{\piE, \phi}_h (x, a) = \sum_{s \in \phi^{-1}_h (x)} d^{\piE}_h (s, a)$. The last equation holds due to the reward-consistent condition in \eqref{eq:reward_consistent}. Then we can obtain
\begin{align*}
    V^{\piE, \gM} - V^{\widebar{\pi}^{\phi}, \gM^{\phi}} &\leq  \sum_{h=1}^H \sum_{(x, a) \in \Phi \times \gA} d^{\piE, \phi}_h (x, a) r^{\phi}_h (x, a) - \sum_{h=1}^H \sum_{(x, a) \in \Phi \times \gA} d^{\widebar{\pi}^{\phi}, \widehat{P}^{\phi}}_h (x, a) r^{\phi}_h (x, a)+ \varepsilon_{\rfe}
    \\
    &\overset{(a)}{\leq} \sum_{h=1}^H \lnorm d^{\piE, \phi}_h - d^{\widebar{\pi}^{\phi}, \widehat{P}^{\phi}}_h \rnorm_1+ \varepsilon_{\rfe}
    \\
    &\leq \sum_{h=1}^H \lnorm d^{\piE, \phi}_h  - \widetilde{d}^{\piE, \phi}_h  \rnorm_1 + \sum_{h=1}^H \lnorm \widetilde{d}^{\piE, \phi}_h  - d^{\widebar{\pi}^{\phi}, \widehat{P}^{\phi}}_h \rnorm_1+ \varepsilon_{\rfe}
    \\
    &\overset{(b)}{\leq}  \sum_{h=1}^H \lnorm \widetilde{d}^{\piE, \phi}_h - d^{\widebar{\pi}^{\phi}, \widehat{P}^{\phi}}_h  \rnorm_1 + \varepsilon_{\est}+ \varepsilon_{\rfe}. 
\end{align*}
Inequality $(a)$ holds due to the dual representation of $\ell_1$-norm and inequality $(b)$ holds due to the event $E_{\est}$. Because $\widebar{\pi}^{\phi}$ is an $\varepsilon_{\opt}$-optimal solution of the optimization problem in \eqref{eq:ail_with_model_state_abstraction}, we get that
\begin{align*}
    V^{\piE, \gM} - V^{\widebar{\pi}^{\phi}, \gM^{\phi}} &\leq \min_{\pi^{\phi} \in \Pi^{\phi}} \sum_{h=1}^H \lnorm \widetilde{d}^{\piE, \phi}_h  - d^{\pi^{\phi}, \widehat{P}^{\phi}}_h  \rnorm_1 + \varepsilon_{\est}+ \varepsilon_{\rfe} + \varepsilon_{\opt}. 
\end{align*}
We consider the abstract expert policy $\pi^{\expert, \phi}$ in \cref{def:abstract_expert_policy}. Since $\pi^{\expert, \phi} \in \Pi^{\phi}$, it holds that
\begin{align*}
    V^{\piE, \gM} - V^{\widebar{\pi}^{\phi}, \gM^{\phi}} &\leq \sum_{h=1}^H \lnorm \widetilde{d}^{\piE, \phi}_h  - d^{\pi^{\expert, \phi}, \widehat{P}^{\phi}}_h  \rnorm_1 + \varepsilon_{\est}+ \varepsilon_{\rfe} + \varepsilon_{\opt}
    \\
    &\leq \sum_{h=1}^H \lnorm \widetilde{d}^{\piE, \phi}_h - d^{\pi^{\expert, \phi}, P^{\phi}}_h  \rnorm_1 + \sum_{h=1}^H \lnorm d^{\pi^{\expert, \phi}, P^{\phi}}_h  - d^{\pi^{\expert, \phi}, \widehat{P}^{\phi}}_h  \rnorm_1 + \varepsilon_{\est}+ \varepsilon_{\rfe} + \varepsilon_{\opt}.
\end{align*}
Then we upper bound the term $\sum_{h=1}^H \Vert d^{\pi^{\expert, \phi}, P^{\phi}}_h - d^{\pi^{\expert, \phi}, \widehat{P}^{\phi}}_h  \Vert_1$ as follows.
\begin{align*}
    \sum_{h=1}^H \lnorm d^{\pi^{\expert, \phi}, P^{\phi}}_h - d^{\pi^{\expert, \phi}, \widehat{P}^{\phi}}_h  \rnorm_1 &= \max_{r^{\phi} \in \gW^{\phi}} \sum_{h=1}^H \sum_{(x, a) \in \Phi \times \gA} \lp d^{\pi^{\expert, \phi}, P^{\phi}}_h (x, a) - d^{\pi^{\expert, \phi}, \widehat{P}^{\phi}}_h (x, a) \rp r^{\phi}_h (x, a)
    \\
    &= \max_{r^{\phi} \in \gW^{\phi}} V^{\pi^{\expert, \phi}, P^{\phi}, r^{\phi}} - V^{\pi^{\expert, \phi}, \widehat{P}^{\phi}, r^{\phi}}
    \\
    &\leq \varepsilon_{\rfe}.
\end{align*}
Here $\gW^{\phi} = \{ w^{\phi} = (w^{\phi}_1, \ldots, w^{\phi}_H), \; w^{\phi}_h: \Phi \times \gA \rar [0, 1], \forall h \in [H] \}$. The last inequality holds due to the event $E_{\rfe}$. Then we obtain
\begin{align*}
    V^{\piE, \gM} - V^{\widebar{\pi}^{\phi}, \gM^{\phi}} &\leq \sum_{h=1}^H \lnorm \widetilde{d}^{\piE, \phi}_h - d^{\pi^{\expert, \phi}, P^{\phi}}_h  \rnorm_1  + \varepsilon_{\est}+ 2\varepsilon_{\rfe} + \varepsilon_{\opt}. 
\end{align*}
Applying \cref{lemma:state_action_distribution_irrelevant} on $\pi^{\expert, \phi}$ and $P^{\phi}$ yields $d^{\pi^{\expert, \phi}, P^{\phi}}_h = d^{[\pi^{\expert, \phi}]^M, P, \phi}_h$. Combined with $[\pi^{\expert, \phi}]^M = \piE$ in \cref{lemma:reward_transition_irrelevant}, we obtain
\begin{align*}
    V^{\piE, \gM} - V^{\widebar{\pi}^{\phi}, \gM^{\phi}} &\leq \sum_{h=1}^H \lnorm \widetilde{d}^{\piE, \phi}_h  - d^{\pi^{\expert, \phi}, P^{\phi}}_h  \rnorm_1  + \varepsilon_{\est}+ 2\varepsilon_{\rfe} + \varepsilon_{\opt}
    \\
    &= \sum_{h=1}^H \lnorm \widetilde{d}^{\piE, \phi}_h - d^{\piE, \phi}_h \rnorm_1  + \varepsilon_{\est}+ 2\varepsilon_{\rfe} + \varepsilon_{\opt}
    \\
    &\leq 2\varepsilon_{\est}+ 2\varepsilon_{\rfe} + \varepsilon_{\opt}, 
\end{align*}
where the last inequality holds due to the event $E_{\est}$. We finish the proof.
\end{proof}


Now, we proceed to prove Theorem \ref{theorem:sample-complexity-unknown-transition-state-abstraction}.  
\begin{proof}[Proof of Theorem \ref{theorem:sample-complexity-unknown-transition-state-abstraction}]
First, we verify condition $(a)$ in Proposition \ref{prop:connection_state_abstraction}. We want to demonstrate that Algorithm \ref{algo:rf_express_state_abstraction} is equivalent to applying RF-Express (Algorithm \ref{algo:rf_express}) on the abstract MDP $\gM^{\phi}$. The only difference lies in the data-collection process. On one hand, in line \ref{alg_line:data_collection} in Algorithm \ref{algo:rf_express_state_abstraction}, we roll out the lifted policy $[\pi^{\phi, t+1}]^{M}$ on the original MDP $\gM$. On the other hand, when applying RF-Express (Algorithm \ref{algo:rf_express}) on the abstract MDP $\gM^{\phi}$, we roll out the abstract policy $\pi^{\phi, t+1}$ on the abstract MDP $\gM^{\phi}$. We will prove that in the above two data-collection processes, the corresponding abstract-state-action distributions are actually the same. Consequently, Algorithm \ref{algo:rf_express_state_abstraction} can be regarded as applying RF-Express (Algorithm \ref{algo:rf_express}) on the abstract MDP $\gM^{\phi}$.   


In the first process, conditioned on $\pi^{\phi, t+1}$, we consider the probability distribution of $(\phi_h (s^{t+1}_h), a^{t+1}_h)$. Recall the definition: 
\begin{align*}
    d^{[\pi^{\phi, t+1}]^{M}, P, \phi}_h (x, a) := 
    \sP \lp \phi_h (s^{t+1}_h) = x, a^{t+1}_h = a | [\pi^{\phi, t+1}]^{M}, P \rp = \sum_{s \in \phi^{-1}_h (x)} \sP \lp s^{t+1}_h = s, a^{t+1}_h = a | [\pi^{\phi, t+1}]^{M}, P \rp. 
\end{align*}
By \cref{lemma:state_action_distribution_irrelevant}, we have that
\begin{align*}
    d^{[\pi^{\phi, t+1}]^{M}, P, \phi}_h (x, a) = d^{\pi^{\phi, t+1}, P^{\phi}}_h (x, a).
\end{align*}
Notice that the distribution $d^{\pi^{\phi, t+1}, P^{\phi}}_h (x, a)$ is exactly the abstract state-action distribution of $\pi^{\phi, t+1}$ in the abstract MDP $\gM^{\phi}$. Therefore, in the mentioned two data-collection processes, the corresponding abstract-state-action distributions are actually the same. Then we can apply \cref{lem:reward_free} on the abstract MDP. When the number of trajectories collected by Algorithm \ref{algo:rf_express_state_abstraction} satisfies
\begin{align*}
    n \gtrsim  \frac{H^{3} |\Phi| |\gA| }{\varepsilon^2}    \lp |\Phi| + \log\lp\frac{|\Phi| H}{\delta} \rp \rp,
\end{align*}
with probability at least $1-\delta/2$, for any policy $\pi^{\phi} \in \Pi^{\phi}$ and reward function $r^{\phi} = (r^{\phi}_1, \ldots, r^{\phi}_H), \; r^{\phi}_h: \Phi \times \gA \rar [0, 1]$, we have $| V^{\pi^{\phi}, P^{\phi}, r^{\phi}} - V^{\pi^{\phi}, \widehat{P}^{\phi}, r^{\phi}} | \leq \varepsilon / 16 = \varepsilon_{\rfe}$. In summary, the assumption (a) in Proposition \ref{prop:connection_state_abstraction} holds with $\delta_{\mathrm{RFE}} = \delta / 2$ and $\varepsilon_{\mathrm{RFE}} = \varepsilon / 16$.



Second, we verify the condition $(b)$ in Proposition \ref{prop:connection_state_abstraction}. Note that the assumption (b) in Proposition \ref{prop:connection_state_abstraction} holds by Lemma \ref{lemma:sample_complexity_of_new_estimator_unknown_transition_state_abstraction}. More concretely, if the expert sample complexity and interaction complexity satisfy
\begin{align*}
    m \gtrsim   \frac{H^{3/2} | \Phi | }{\varepsilon} \log\lp  \frac{|\Phi| H}{\delta} \rp, \; n^\prime \gtrsim \frac{H^{2} | \Phi |}{\varepsilon^2} \log\lp  \frac{|\Phi| H}{\delta} \rp,
\end{align*}
with probability at least $1-\delta/2$, $\sum_{h=1}^H \Vert \widetilde{d}^{\piE, \phi}_h - d^{\piE, \phi}_h  \Vert_{1} \leq \varepsilon / 16 = \varepsilon_{\est}$. Hence, the assumption (b) in Proposition \ref{prop:connection_state_abstraction} holds with $\delta_{\mathrm{EST}} = \delta / 2$ and $\varepsilon_{\mathrm{EST}} = \varepsilon / 16$.



% We aim to upper bound the following estimation error.
% \begin{align*}
%     \sum_{h=1}^H \Vert \widetilde{d}^{\piE, \phi}_h - d^{\piE, \phi}_h  \Vert_{1} \leq \varepsilon_{\est}.
% \end{align*}
% Our proof idea is to connect the above estimation error with the one in \cref{lemma:sample_complexity_of_new_estimator_unknown_transition} when applying for abstract MDP $\gM^{\phi}$.



% Then we have that
% \begin{align*}
%     d^{\piE, \phi}_h (x, a) \overset{(a)}{=} d^{[\pi^{\expert, \phi}]^M, \phi}_h (x, a) \overset{(b)}{=} d^{\pi^{\expert, \phi}, P^{\phi}}_h (x, a).  
% \end{align*}
% Equation $(a)$ follows $\piE = [\pi^{\expert, \phi}]^{M}$ in \cref{lemma:reward_transition_irrelevant} and equation $(b)$ follows \cref{lemma:state_action_distribution_irrelevant}. Then we have that
% \begin{align*}
%     \sum_{h=1}^H \Vert \widetilde{d}^{\piE, \phi}_h - d^{\piE, \phi}_h  \Vert_{1} = \sum_{h=1}^H \Vert \widetilde{d}^{\piE, \phi}_h - d^{\pi^{\expert, \phi}, P^{\phi}}_h  \Vert_{1}. 
% \end{align*}
% Notice that the estimator $\widetilde{d}^{\piE, \phi}_h$ in \eqref{eq:new_estimator_unknown_transition_state_abstraction} is a deterministic function of $\gD$ and $\gD^{\prime}$. Given the dataset $\gD$ and $\gD^{\prime}$, we construct another two abstract dataset $\gD^{\phi}$ and $\gD^{\prime, \phi}$. In the construction of $\gD^{\phi}$, for each trajectory in $\gD$, we map these states to abstract states via $\{ \phi_h \}_{h=1}^H$ and then add the resultant trajectory to the dataset $\gD^{\phi}$. The other dataset $\gD^{\prime, \phi}$ is generated by $\gD^{\prime}$ in the same way. With $\gD^{\phi}$ and $\gD^{\prime, \phi}$, we can calculate the estimator in \eqref{eq:new_estimator_unknown_transition} by regarding $\Phi$ as the state space. The resultant estimator is denoted as $\widetilde{d}$. By comparing \cref{eq:new_estimator_unknown_transition} and \cref{eq:new_estimator_unknown_transition_state_abstraction}, we observe that $\widetilde{d} = \widetilde{d}^{\piE, \phi}_h$ given $\gD$ and $\gD^{\prime}$. Then we obtain that
% \begin{align*}
%     \sum_{h=1}^H \Vert \widetilde{d}^{\piE, \phi}_h - d^{\pi^{\expert, \phi}, P^{\phi}}_h  \Vert_{1} = \sum_{h=1}^H \Vert \widetilde{d} - d^{\pi^{\expert, \phi}, P^{\phi}}_h  \Vert_{1}. 
% \end{align*}
% Now we consider the abstract state-action distributions in $\gD^{\phi}$ and $\gD^{\prime, \phi}$. We want to prove that 



% First, we calculate the abstract state-action distribution in $\gD^{\prime, \phi}$. Given $\gD_1$ where $\gD = \gD_1 \cup \gD_1^c$, recall that $\pi^{\prime, \phi}$ is the abstract BC's policy on $\gD_1$ (see \eqref{eq:bc_policy_state_abstraction}). Notice that the dataset $\gD^{\prime}$ is collected by $[\pi^{\prime, \phi}]^M$ in $\gM$ (see Line \ref{alg_line:mbtail-state-abstraction-rollout-bc} in \cref{algo:mbtail-state-abstraction}). Then we can formulate the abstract state-action distribution as
% \begin{align*}
%     \sP (x_h = x, a_h = a | [\pi^{\prime, \phi}]^M, P) = \sP (\phi_h (s_h) = x, a_h = a | [\pi^{\prime, \phi}]^M, P) = d^{[\pi^{\prime, \phi}]^M, P, \phi}_h (x, a). 
% \end{align*}
% The last equation is due to the definition. By \cref{lemma:state_action_distribution_irrelevant}, we further have
% \begin{align*}
%     \sP (x_h = x, a_h = a | [\pi^{\prime, \phi}]^M, P) = d^{[\pi^{\prime, \phi}]^M, P, \phi}_h (x, a) = d^{\pi^{\prime, \phi}, P^{\phi}}_h (x, a).  
% \end{align*}
% In a word, the abstract state-action distribution in $\gD^{\prime, \phi}$ is exactly the state-action distribution of $\pi^{\prime, \phi}$ 


Third, we validate the condition $(c)$ in Proposition \ref{prop:connection_state_abstraction}. In particular, we apply \cref{algo:gradient_based_optimization} to solve the following abstract state-action distribution matching problem.
\begin{align*}
        \min_{\pi^\phi \in \Pi^{\phi}} \sum_{h=1}^H \lnorm \widetilde{d}^{\piE, \phi}_h - d^{\pi^{\phi}, \widehat{P}^{\phi}}_h \rnorm_1.
\end{align*}
Therefore, we can apply \cref{lemma:approximate-minimax}. In particular, when $\varepsilon_{\rl} \leq \varepsilon / 2$ and $T \gtrsim |\Phi| |\gA| H^2 / \varepsilon^2$ such that $2 H \sqrt{2 |\Phi| |\gA| / T} \leq \varepsilon / 4$, we have that
\begin{align*}
    \sum_{h=1}^H \lnorm \widetilde{d}^{\piE, \phi}_h - d^{\widebar{\pi}^{\phi}, \widehat{P}^{\phi}}_h \rnorm_1 - \min_{\pi^\phi \in \Pi^{\phi}} \sum_{h=1}^H \lnorm \widetilde{d}^{\piE, \phi}_h - d^{\pi^{\phi}, \widehat{P}^{\phi}}_h \rnorm_1 \leq \frac{3\varepsilon}{4}  = \varepsilon_{\opt}.
\end{align*}
In summary, we have established the following conditions:
\begin{itemize}
    \item Assumption (a) in Proposition \ref{prop:connection_state_abstraction} holds with $\delta_{\mathrm{RFE}} = \delta / 2$ and $\varepsilon_{\mathrm{RFE}} = \varepsilon / 16$.
    \item Assumption (b) in Proposition \ref{prop:connection_state_abstraction} holds with $\delta_{\mathrm{EST}} = \delta / 2$ and $\varepsilon_{\mathrm{EST}} = \varepsilon / 16$.
    \item Assumption (c) in Proposition \ref{prop:connection_state_abstraction} holds with $\varepsilon_{\opt} = 3\varepsilon / 4$. 
\end{itemize}
By applying Proposition \ref{prop:connection_state_abstraction}, we complete the proof. With probability at least $1-\delta$, we have
\begin{align*}
    V^{\piE} - V^{[\widebar{\pi}^{\phi}]^M} \leq 2 \varepsilon_{\mathrm{RFE}} + 2 \varepsilon_{\mathrm{EST}} + \varepsilon_{\opt} = \varepsilon.  
\end{align*}
\end{proof}


\subsection{Useful Lemmas}
In this part, we develop specialized analysis tools for AIL with state abstraction. The lemma below indicates that under \cref{asmp:state_abstraction}, the lifted versions of the abstract reward function and abstract transition function are identical to the original reward function and transition function, respectively.   
\begin{lem}
\label{lemma:reward_transition_irrelevant}
For the original MDP $\gM = (\gS, \gA, P, r, H, \rho)$ and expert policy $\piE$ that satisfy \cref{asmp:state_abstraction}, we consider the abstract MDP $\gM^{\phi} = (\Phi, \gA, P^{\phi}, r^{\phi}, H, \rho^{\phi})$ in \cref{def:abstract_mdp}. Then we have that
\begin{align*}
\forall h \in [H], \; (s, a) \in \gS \times \gA, \; x^\prime \in \Phi, \; r_h (s, a) = [r^{\phi}]^{M}_h (s, a), \; \sum_{s^\prime \in \phi^{-1}_{h+1} (x^\prime)} P_h (s^\prime |s, a) = [P^{\phi}]^{M}_h (x^\prime|s, a).    
\end{align*}
Here $[r^{\phi}]^{M}_h (s, a) = r^{\phi}_h (\phi_h (s), a)$ and $[P^{\phi}]^{M}_h (x^\prime|s, a) = P^{\phi}_h (x^\prime | \phi_h (s), a)$. Furthermore, we consider the abstract expert policy $\pi^{\expert, \phi}$ in \cref{def:abstract_expert_policy}. Then we have that 
\begin{align*}
    \forall h \in [H], \; s \in \gS, \; \piE_h (s) = [\pi^{\expert, \phi}]^{M}_h (s), 
\end{align*}
where $[\pi^{\expert, \phi}]^{M}_h (s) = \pi^{\expert, \phi}_h (\phi_h (s)) $.
\end{lem}

\begin{proof}
For the reward function, we have
\begin{align*}
    [r^{\phi}]^{M}_h (s, a) = r^{\phi}_h (\phi_h (s), a) \overset{x:= \phi_h (s)}{=} r^{\phi}_h (x, a). 
\end{align*}
Notice that $r^{\phi}_h (x, a) = r_h (\widehat{s}, a)$ for an arbitrary $\widehat{s} \in \phi^{-1}_h (x)$. Moreover, since $s, \; \widehat{s} \in \phi^{-1}_h (x)$ and $r$ satisfies \eqref{eq:reward_consistent}, we have $r_h (\widehat{s}, a) = r_h (s, a)$. 

For the transition function, we have
\begin{align*}
    [P^{\phi}]^{M}_h (x^\prime|s, a) = P^{\phi}_h (x^\prime | \phi_h (s), a) \overset{x := \phi_h (s)}{=} P^{\phi}_h (x^\prime | x, a).  
\end{align*}
According to \cref{def:abstract_mdp}, we have $P^{\phi}_h (x^\prime | x, a) = \sum_{s^\prime \in \phi^{-1}_{h+1} (x^\prime)} P_h (s^\prime| \widetilde{s}, a)$ for an arbitrary $\widetilde{s} \in \phi^{-1}_h (x)$. Furthermore, because $s, \; \widetilde{s} \in \phi^{-1}_h (x)$ and $P$ satisfies \eqref{eq:transition_consistent}, we have 
\begin{align*}
\sum_{s^\prime \in \phi^{-1}_{h+1} (x^\prime)} P_h (s^\prime| \widetilde{s}, a) = \sum_{s^\prime \in \phi^{-1}_{h+1} (x^\prime)} P_h (s^\prime| s, a).
\end{align*}
Finally, for the expert policy, it holds that 
\begin{align*}
    [\pi^{\expert, \phi}]^{M}_h (s) = \pi^{\expert, \phi}_h (\phi_h (s)) \overset{x:= \phi_h (s)}{=} \pi^{\expert, \phi}_h (x).  
\end{align*}
According to \cref{def:abstract_expert_policy}, we have $\pi^{\expert, \phi}_h (x) = \piE_h (\widetilde{s})$ for an arbitrary $\widetilde{s} \in \phi^{-1}_h (x)$. Notice that $s, \; \widetilde{s} \in \phi^{-1}_h (x)$ and $\piE$ satisfies \eqref{eq:expert_consistent}. Therefore, we have $\pi^{\expert, \phi}_h (x) = \piE_h (s)$. We finish the proof.
\end{proof}


\begin{lem}
\label{lemma:state_abstraction_summation}
For any functions $f: \Phi \rightarrow \reals$, $g: \gS \rightarrow \reals$ and a state abstraction $\phi: \gS \rightarrow \Phi$, we define $g^{\phi} (x) := \sum_{s \in \phi^{-1}(x)} g (s)$. Then we have
\begin{align*}
    \sum_{x \in \Phi} g^{\phi} (x) f(x) = \sum_{s \in \gS } g(s) [f]^{M} (s), 
\end{align*}
where $[f]^{M} (s) = f (\phi (s))$.
\end{lem}
\begin{proof}
    \begin{align*}
    \sum_{x \in \Phi} g^{\phi} (x) f(x) &= \sum_{x \in \Phi} \sum_{s \in \phi^{-1}(x)} g (s) f(x)
    \\
    &= \sum_{x \in \Phi} \sum_{s \in \gS} \indict \lb s \in \phi^{-1}(x)  \rb g (s) f(x)
    \\
    &= \sum_{s \in \gS} \sum_{x \in \Phi}  \indict \lb x = \phi (s)    \rb g (s) f(x)
    \\
    &= \sum_{s \in \gS} g (s) f(\phi (s))
    \\
    &= \sum_{s \in \gS} g (s) [f]^{M} (s). 
\end{align*}
We complete the proof.
\end{proof}




\cref{lemma:policy_value_irrelevant} indicates that for any abstract policy $\pi^{\phi} \in \Pi^{\phi}$, the value function of $[\pi^{\phi}]^M$ on $P$ equals the lifted version of the value function of $\pi^{\phi}$ on $P^{\phi}$.
\begin{lem}
\label{lemma:policy_value_irrelevant}
For the original MDP $\gM = (\gS, \gA, P, r, H, \rho)$ and expert policy $\piE$ that satisfy \cref{asmp:state_abstraction}, we consider the abstract MDP $\gM^{\phi} = (\Phi, \gA, P^{\phi}, r^{\phi}, H, \rho^{\phi})$ in \cref{def:abstract_mdp}. Then, for any abstract policy $\pi^{\phi} \in \Pi^{\phi}$, we have
\begin{align*}
V^{[\pi^{\phi}]^{M}, \gM}_h (s) = [V^{\pi^{\phi}, \gM^{\phi}}]^{M}_h (s), \forall s \in \gS, h \in [H],    
\end{align*}
where $[V^{\pi^{\phi}, \gM^{\phi}}]^{M}_h (s) := V^{\pi^{\phi}, \gM^{\phi}}_h (\phi_h (s))$ and $[\pi^{\phi}]^{M}_h (a|s) = \pi^{\phi}_h (a|\phi_h (s))$. Here $V^{\pi^{\phi}, \gM^{\phi}}_h (x)$ is the value function of $\pi^{\phi}$ on $\gM^{\phi}$ at the abstract state $x \in \Phi$, and $V^{[\pi^{\phi}]^{M}, \gM}_h (s)$ is the value function of $[\pi^{\phi}]^{M}$ on $\gM$. Furthermore, it holds that $V^{[\pi^{\phi}]^{M}, \gM} = V^{\pi^{\phi}, \gM^{\phi}}$.
\end{lem}

\begin{proof}
The proof is based on backward induction. For the base case, we prove that
\begin{align*}
    V^{[\pi^{\phi}]^{M}, \gM}_H (s) = [V^{\pi^{\phi}, \gM^{\phi}}]^{M}_H (s), \; \forall s \in \gS.
\end{align*}
In particular,
\begin{align*}
    [V^{\pi^{\phi}, \gM^{\phi}}]^{M}_H (s) &= V^{\pi^{\phi}, \gM^{\phi}}_H (\phi_H (s))
    \\
    &= \sum_{a \in \gA} \pi^{\phi}_H (a| \phi_H (s)) r^{\phi}_H (\phi_H (s), a)
    \\
    &= \sum_{a \in \gA} [\pi^{\phi}]^{M}_H (a| s) [r^{\phi}]^{M}_H (s, a)
    \\
    &\overset{(a)}{=}  \sum_{a \in \gA} [\pi^{\phi}]^{M}_H (a| s) r_H (s, a)
    \\
    &= V^{[\pi^{\phi}]^{M}, \gM}_{H} (s).
\end{align*}
Equation $(a)$ follows \cref{lemma:reward_transition_irrelevant}. We finish the proof of the base case and continue to prove the induction stage. Assume that $V^{[\pi^{\phi}]^{M}, \gM}_{h+1} (s) = [V^{\pi^{\phi}, \gM^{\phi}}]^{M}_{h+1} (s), \forall s \in \gS$, we consider the time step $h$.

\begin{align*}
    [V^{\pi^{\phi}, \gM^{\phi}}]^{M}_{h} (s) &= V^{\pi^{\phi}, \gM^{\phi}}_{h} (\phi_h (s))
    \\
    &= \expect_{a \sim  \pi^{\phi}_h (\cdot|\phi_h (s))} \ls r^{\phi}_h (\phi_h (s), a) + P^{\phi}_{h+1} V^{\pi^{\phi}, \gM^{\phi}}_{h+1} (\phi_h (s), a) \rs.
\end{align*}
Here $P^{\phi}_{h+1} V^{\pi^{\phi}, \gM^{\phi}}_{h+1} (\phi_h (s), a) = \expect_{x^\prime \sim P^{\phi}_{h+1}(\cdot|\phi_h (s), a )} \ls V^{\pi^{\phi}, \gM^{\phi}}_{h+1} (x^\prime) \rs$. For the first term in RHS, we have

\begin{align*}
    \expect_{a \sim  \pi^{\phi}_h (\cdot|\phi_h (s))} \ls r^{\phi}_h (\phi_h (s), a) \rs = \expect_{a \sim  [\pi^{\phi}]^{M}_h (\cdot|s)} \ls [r^{\phi}]^{M}_h (s, a) \rs = \expect_{a \sim  [\pi^{\phi}]^{M}_h (\cdot|s)} \ls r_h (s, a) \rs. 
\end{align*}
The last equation utilizes \cref{lemma:reward_transition_irrelevant}. For the term $P^{\phi}_{h+1} V^{\pi^{\phi}, \gM^{\phi}}_{h+1} (\phi_h (s), a)$, we obtain

\begin{align*}
    P^{\phi}_{h+1} V^{\pi^{\phi}, \gM^{\phi}}_{h+1} (\phi_h (s), a) &= \sum_{x^\prime \in \Phi} P^{\phi}_{h+1} (x^\prime|\phi_h (s), a) V^{\pi^{\phi}, \gM^{\phi}}_{h+1} (x^\prime)
    \\
    &= \sum_{x^\prime \in \Phi} \lp \sum_{s^\prime \in \phi^{-1}_{h+1} (x^\prime)} P_{h+1} (s^\prime|\widetilde{s}, a) \rp V^{\pi^{\phi}, \gM^{\phi}}_{h+1} (x^\prime), \; \text{for an arbitrary } \widetilde{s} \in \phi^{-1}_h (x). 
\end{align*}
In the last equation, we define $x = \phi_h (s)$. According to $s, \; \widetilde{s} \in \phi^{-1}_h (x)$ and \eqref{eq:transition_consistent} in \cref{asmp:state_abstraction}, we have 

\begin{align*}
    P^{\phi}_{h+1} V^{\pi^{\phi}, \gM^{\phi}}_{h+1} (\phi_h (s), a) &= \sum_{x^\prime \in \Phi} \lp \sum_{s^\prime \in \phi^{-1}_{h+1} (x^\prime)} P_{h+1} (s^\prime|\widetilde{s}, a) \rp V^{\pi^{\phi}, \gM^{\phi}}_{h+1} (x^\prime)
    \\
    &= \sum_{x^\prime \in \Phi}  \lp \sum_{s^\prime \in \phi^{-1}_{h+1} (x^\prime)} P_{h+1} (s^\prime|s, a) \rp V^{\pi^{\phi}, \gM^{\phi}}_{h+1} (x^\prime). 
\end{align*}
Applying \cref{lemma:state_abstraction_summation} with $f (x) = V^{\pi^{\phi}, \gM^{\phi}}_{h+1} (x), g (s^\prime) = P_{h+1} (s^\prime|s, a), \phi = \phi_{h+1}$ yields that
\begin{align*}
    P^{\phi}_{h+1} V^{\pi^{\phi}, \gM^{\phi}}_{h+1} (\phi_h (s), a) &= \sum_{s^\prime \in \gS} P_{h+1} (s^\prime|s, a) \ls V^{\pi^{\phi}, \gM^{\phi}} \rs^{M}_{h+1} (s^\prime)
    \\
    &\overset{(a)}{=} \sum_{s^\prime \in \gS} P_{h+1} (s^\prime|s, a) V^{[\pi^{\phi}]^{M}, \gM}_{h+1} (s^\prime)
    \\
    &= P_{h+1} V^{[\pi^{\phi}]^{M}, \gM}_{h+1} (s, a). 
\end{align*}
In equation $(a)$, we leverage the induction hypothesis at time step $h+1$. Then we obtain
\begin{align*}
    [V^{\pi^{\phi}, \gM^{\phi}}]^{M}_{h} (s) &= \expect_{a \sim  \pi^{\phi}_h (\cdot|\phi_h (s))} \ls r^{\phi}_h (\phi_h (s), a) + P^{\phi}_{h+1} V^{\pi^{\phi}, \gM^{\phi}}_{h+1} (\phi_h (s), a) \rs
    \\
    &= \expect_{a \sim  [\pi^{\phi}]^{M}_h (\cdot|s)} \ls r_h (s, a) + P_{h+1} V^{[\pi^{\phi}]^{M}, \gM}_{h+1} (s, a) \rs
    \\
    &= V^{[\pi^{\phi}]^{M}, \gM}_{h} (s). 
\end{align*}
We prove the induction stage and thus finish the proof of the first claim. Furthermore, according to the definition of $\rho^{\phi}$, we have

\begin{align*}
    V^{\pi^{\phi}, \gM^{\phi}} = \expect_{x \sim \rho^{\phi}} \ls  V^{\pi^{\phi}, \gM^{\phi}}_1 (x)  \rs = \sum_{x \in \Phi} \rho^{\phi} (x) V^{\pi^{\phi}, \gM^{\phi}}_1 (x) = \sum_{s \in \gS} \rho (s) \ls V^{\pi^{\phi}, \gM^{\phi}} \rs^{M}_1 (s).  
\end{align*}
In the last equation, we apply \cref{lemma:state_abstraction_summation} with $f (x) = V^{\pi^{\phi}, \gM^{\phi}}_1 (x)$, $g (s) = \rho (s)$ and $\phi = \phi_1$. We have proved that $[ V^{\pi^{\phi}, \gM^{\phi}} ]^{M}_1 (s) = V^{[\pi^{\phi}]^{M}, \gM}_{1} (s)$. Then it holds that
\begin{align*}
    V^{\pi^{\phi}, \gM^{\phi}} = \sum_{s \in \gS} \rho (s) V^{[\pi^{\phi}]^{M}, \gM}_{1} (s) = V^{[\pi^{\phi}]^{M}, \gM},  
\end{align*}
which completes the proof.
\end{proof}


\begin{lem}
\label{lemma:state_action_distribution_irrelevant}
For the original MDP $\gM = (\gS, \gA, P, r, H, \rho)$ and expert policy $\piE$ that satisfy \cref{asmp:state_abstraction}, we consider the abstract MDP $\gM^{\phi} = (\Phi, \gA, P^{\phi}, r^{\phi}, H, \rho^{\phi})$ in \cref{def:abstract_mdp}. Then, for any abstract policy $\pi^{\phi} \in \Pi^{\phi}$,
\begin{align*}
    \forall h \in [H], \; (x, a) \in \Phi \times 
    \gA, \; d^{\pi^{\phi}, P^{\phi}}_h (x, a) = d^{[\pi^{\phi}]^{M}, P, \phi}_h (x, a).
\end{align*}
Here $d^{\pi^{\phi}, P^{\phi}}_h (x, a) = \sP ( x_h = x, a_h = a | \pi^{\phi}, P^{\phi} )$ and $d^{[\pi^{\phi}]^{M}, P, \phi}_h (x, a) = \sP (\phi_h (s_h) = x, a_h = a | [\pi^{\phi}]^{M}, P) = \sum_{s \in \phi^{-1}_h (x)} d^{[\pi^{\phi}]^{M}, P}_h (s, a)$.
\end{lem}


\begin{proof}
We first prove that for any fixed $x \in \Phi, h \in [H]$, 
\begin{align*}
    d^{\pi^{\phi}, P^{\phi}}_h (x) = d^{[\pi^{\phi}]^{M}, P, \phi}_h (x),
\end{align*}
where $d^{\pi^{\phi}, P^{\phi}}_h (x) = \sP \lp x_h = x | \pi^{\phi}, P^{\phi}  \rp$ and $d^{[\pi^{\phi}]^{M}, P, \phi}_h (x) = \sP \lp \phi_h (s_h) = x | [\pi^{\phi}]^{M}, P  \rp$. Consider any fixed $x \in \Phi, h \in [H]$, we construct an abstract reward function $\widetilde{r}^{\phi}$.
\begin{align*}
    &\widetilde{r}^{\phi}_{h} (x, a) = 1, \forall a \in \gA,
    \\
    &\widetilde{r}^{\phi}_{\ell} (\tilde{x}, a) = 0, \forall (\tilde{x}, \ell) \in (\Phi \times [H]) \setminus \{ (x, h) \}, a \in \gA. 
\end{align*}
Furthermore, we consider $[\widetilde{r}^{\phi}]^{M}$, which is the lifted version of $\widetilde{r}^{\phi}$.
\begin{align*}
    & \ls \widetilde{r}^{\phi} \rs^{M}_{h} (s, a) = 1, \forall s \in \phi^{-1}_h (x), a \in \gA,
    \\
    &\ls \widetilde{r}^{\phi} \rs^{M}_{\ell} (s, a) = 0, \forall (s, \ell) \in (\gS \times [H]) \setminus ( \phi^{-1}_h (x) \times \{ h \} ), a \in \gA. 
\end{align*}



On the one hand, according to the dual formulation of policy value in \eqref{eq:dual_of_policy_value}, we can get that $d^{\pi^{\phi}, P^{\phi}}_h (x) = V^{\pi^{\phi}, P^{\phi}, \widetilde{r}^{\phi}}$. On the other hand, it holds that

\begin{align*}
    d^{[\pi^{\phi}]^{M}, P, \phi}_h (x) = \sum_{s \in \phi^{-1}_h (x)} d^{[\pi^{\phi}]^{M}, P}_h (s) = V^{[\pi^{\phi}]^{M}, P, [\widetilde{r}^{\phi}]^{M}}.
\end{align*}

The last equation still follows the dual representation of policy value. Notice that $[\widetilde{r}^{\phi}]^{M}$ satisfies the reward-consistent condition (i.e., \eqref{eq:reward_consistent} in \cref{asmp:state_abstraction}). With \cref{lemma:policy_value_irrelevant}, we get that $V^{[\pi^{\phi}]^{M}, P, [\widetilde{r}^{\phi}]^{M}} = V^{\pi^{\phi}, P^{\phi}, \widetilde{r}^{\phi}}$, which implies that $d^{\pi^{\phi}, P^{\phi}}_h (x) = d^{[\pi^{\phi}]^{M}, P, \phi}_h (x)$. Then we have that
\begin{align*}
    d^{\pi^{\phi}, P^{\phi}}_h (x, a) = d^{\pi^{\phi}, P^{\phi}}_h (x) \pi^{\phi}_h (a|x) = d^{[\pi^{\phi}]^{M}, P, \phi}_h (x) \pi^{\phi}_h (a|x) &=   d^{[\pi^{\phi}]^{M}, P, \phi}_h (x) \ls \pi^{\phi} \rs^{M}_h (a|s) =  d^{[\pi^{\phi}]^{M}, P, \phi}_h (x, a),
\end{align*}
where $s \in \phi^{-1}_h (x)$. We finish the proof.    
\end{proof}


\begin{lem} \label{lemma:sample_complexity_of_new_estimator_unknown_transition_state_abstraction}
Given the expert dataset $\gD$, let $\gD$ be divided into two equal subsets, i.e., $\gD = \gD_{1} \cup \gD_{1}^c$ and $\gD_1 \cap \gD_1^{c} = \emptyset$ with $\labs \gD_1 \rabs = \labs \gD_1^{c} \rabs = m / 2$. Let $\pi^{\prime, \phi}$ be the abstract BC's policy on $\gD_1$. Fix $\pi^{\prime, \phi}$, let $\gD^\prime_{\mathrm{env}}$ be the dataset collected by $[\pi^{\prime, \phi}]^M$ and $|\gD^\prime_{\mathrm{env}} | = n^\prime$. Fix $\varepsilon \in (0, 1)$ and $\delta \in (0, 1)$; suppose $H \geq 5$. Consider the abstract state-action distribution estimator $\widetilde{d}^{\piE, \phi}_h$ shown in \eqref{eq:new_estimator_unknown_transition_state_abstraction}, if the expert sample complexity ($m$) and the interaction complexity ($n^\prime$) satisfy
\begin{align*}
    m \gtrsim   \frac{H^{3/2} | \Phi | }{\varepsilon} \log\lp  \frac{|\Phi| H}{\delta} \rp, \; n^\prime \gtrsim \frac{H^{2} | \Phi |}{\varepsilon^2} \log\lp  \frac{|\Phi| H}{\delta} \rp,
\end{align*}
then with probability at least $1-\delta$, we have
\begin{align*}
    \sum_{h=1}^H \lnorm \widetilde{d}^{\piE, \phi}_h - d^{\piE, \phi}_h  \rnorm_{1} \leq \varepsilon.
\end{align*}
\end{lem}

\begin{proof}
First, we can obtain that
\begin{align*}
    \sum_{h=1}^H \lnorm \widetilde{d}^{\piE, \phi}_h - d^{\piE, \phi}_h  \rnorm_1 &= \sum_{h=1}^H \sum_{(x, a) \in \Phi \times \gA} \left\vert \widetilde{d}^{\piE, \phi}_h (x, a) - d^{\piE, \phi}_h (x, a)  \right\vert
    \\
    &=\sum_{h=1}^H \sum_{x \in \Phi} \left\vert \widetilde{d}^{\piE, \phi}_h (x, \pi^{\expert, \phi}_h (x)) - d^{\piE, \phi}_h (x, \pi^{\expert, \phi}_h (x))  \right\vert.
\end{align*}
Here $\pi^{\expert, \phi}$ is the abstract expert policy in \cref{def:abstract_expert_policy}. The last equation holds since $\piE$ is a deterministic policy and satisfies \eqref{eq:expert_consistent} in \cref{asmp:state_abstraction}. Recall the abstract state-action distribution estimator $\widetilde{d}^{\piE, \phi}_h$ shown in \eqref{eq:new_estimator_unknown_transition_state_abstraction}.
\begin{align*}
    \widetilde{d}^{\piE, \phi}_h (x, \pi^{\expert, \phi}_h (x)) &= \frac{\sum_{\tr_h \in \gD^{\prime}_{\env}} \indict \{ \phi_h (\tr_h (\cdot)) = x, \tr_h (a_h) = \pi^{\expert, \phi}_h (x), \tr_h \in \Tr^{\gD_1, \phi}_h   \}}{\vert \gD^{\prime}_{\env} \vert} \nonumber 
    \\
    &+ \frac{\sum_{\tr_h \in \gD_1^c} \indict \{ \phi_h (\tr_h (\cdot)) = x, \tr_h (a_h) = \pi^{\expert, \phi}_h (x), \tr_h \not\in  \Tr^{\gD_1, \phi}_h\}}{\labs \gD^{c}_1 \rabs}.
\end{align*}
Given $\gD_1$, for $d^{\piE, \phi}_h$, we have the following decomposition.
\begin{align*}
    &\quad d^{\piE, \phi}_h (x, \pi^{\expert, \phi}_h (x)) 
    \\
    &= \sum_{\tr_h} \sP^{\piE} (\tr_h) \indict \{ \phi_h (\tr_h (\cdot)) = x, \tr_h (a_h) = \pi^{\expert, \phi}_h (x) \}
    \\
    &= \sum_{\tr_h \in \Tr^{\gD_1, \phi}_h} \sP^{\piE} (\tr_h) \indict \{ \phi_h (\tr_h (\cdot)) = x, \tr_h (a_h) = \pi^{\expert, \phi}_h (x) \} 
    \\
    &+ \sum_{\tr_h \not\in \Tr^{\gD_1, \phi}_h} \sP^{\piE} (\tr_h) \indict \{ \phi_h (\tr_h (\cdot)) = x, \tr_h (a_h) = \pi^{\expert, \phi}_h (x) \} .
\end{align*}
Then we have that
\begin{align*}
    & \quad | \widetilde{d}^{\piE, \phi}_h (x, \pi^{\expert, \phi}_h (x)) - d^{\piE, \phi}_h (x, \pi^{\expert, \phi}_h (x)) |  
    \\
    & \leq \bigg\vert \frac{\sum_{\tr_h \in \gD^{\prime}_{\env}} \indict \{ \phi_h (\tr_h (\cdot)) = x, \tr_h (a_h) = \pi^{\expert, \phi}_h (x), \tr_h \in \Tr^{\gD_1, \phi}_h   \}}{\vert \gD^{\prime}_{\env} \vert}
    \\
    &- \sum_{\tr_h \in \Tr^{\gD_1, \phi}_h} \sP^{\piE} (\tr_h) \indict \{ \phi_h (\tr_h (\cdot)) = x, \tr_h (a_h) = \pi^{\expert, \phi}_h (x) \}   \bigg\vert
    \\
    & + \bigg\vert \frac{\sum_{\tr_h \in \gD_1^c} \indict \{ \phi_h (\tr_h (\cdot)) = x, \tr_h (a_h) = \pi^{\expert, \phi}_h (x), \tr_h \not\in  \Tr^{\gD_1, \phi}_h\}}{\labs \gD^{c}_1 \rabs}
    \\
    &- \sum_{\tr_h \not\in \Tr^{\gD_1, \phi}_h} \sP^{\piE} (\tr_h) \indict \{ \phi_h (\tr_h (\cdot)) = x, \tr_h (a_h) = \pi^{\expert, \phi}_h (x) \}  \bigg\vert.
\end{align*}
We denote the first term in RHS as $\text{EA}_h (x)$ and the second term in RHS as $\text{EB}_h (x)$. We have that
\begin{align*}
    \sum_{h=1}^H \lnorm d^{\piE, \phi}_h  -  \widetilde{d}^{\piE, \phi}_h \rnorm_{1} &\leq \underbrace{\sum_{h=1}^H \sum_{x \in \Phi} \text{EA}_h (x)}_{\text{Error A}} + \underbrace{\sum_{h=1}^H \sum_{x \in \Phi} \text{EB}_h (x)}_{\text{Error B}}.
\end{align*}





First, we analyze the term Error A. Let ${E^\prime}^{x}_h$ be the event that $\tr_h$ agrees with expert policy at abstract state $x$ in time step $h$ and appears in $\Tr_h^{\gD_1, \phi}$. Formally, 
\begin{align*}
    {E^\prime}_h^{x} = \{ \phi_h (\tr_h (\cdot)) = x \cap \tr_h (a_h) = \pi^{\expert, \phi}_h (x) \cap \tr_h \in \Tr_h^{\gD_1, \phi}\}.
\end{align*}

Then we leverage Chernoff's bound to upper bound $\text{EA}_h (x)$. By Lemma~\ref{lemma:chernoff_bound}, for each $x \in \Phi$ and $h \in [H]$, with probability at least $1 - \frac{\delta}{2 |\Phi| H}$ over the randomness of $\gD^\prime_{\env}$, we have
\begin{align*}
   \text{EA}_h (x) \leq \sqrt{ \sP^{\piE} \lp {E^\prime}^{x}_h  \rp  \frac{3 \log \lp 4 |\Phi| H / \delta \rp}{n^\prime}}.
\end{align*}
By union bound, with probability at least $1-\frac{\delta}{2}$ over the randomness of $\gD^\prime_{\env}$, we have
\begin{align*}
    \sum_{h=1}^H \sum_{x \in \Phi} \text{EA}_h (x) &\leq  \sum_{h=1}^H \sum_{x \in \Phi} \sqrt{ \sP^{\piE} \lp {E^\prime}^{x}_h  \rp  \frac{3 \log \lp 4 |\Phi| H / \delta \rp}{n^\prime}}
    \\
    &\leq \sum_{h=1}^H \sqrt{|\Phi|} \sqrt{\sum_{x \in \Phi} \sP^{\piE} \lp {E^\prime}^{x}_h  \rp  \frac{3 \log \lp 4 |\Phi| H / \delta \rp}{n^\prime} }.
\end{align*}
The last inequality follows from the Cauchy--Schwarz inequality. It remains to upper bound $\sum_{x \in \Phi}  \sP^{\piE}({E^\prime}_{h}^{x})$ for all $h \in [H]$. To this end, we define the event ${G^\prime}_h^{\gD_1}$ that the expert policy $\piE$ visits only abstract states covered in $\gD_1$ up to time step $h$. Formally, ${G^\prime}_h^{\gD_1} = \{ \forall h^{\prime} \leq h,  \phi_{h^\prime} (s_{h^{\prime}}) \in \Phi_{h^{\prime}} (\gD_1) \}$, where $\Phi_{h}(\gD_1)$ is the set of abstract states in $\gD_1$ at time step $h$. Then, for all $h \in [H]$, we have
\begin{align*}
    \sum_{x \in \Phi} \sP^{\piE} \lp {E^\prime}_h^{x}  \rp = \sP^{\piE}({G^\prime}_h^{\gD_1}) \leq \sP^{\piE}({G^\prime}_1^{\gD_1}).
\end{align*}
The last inequality holds since ${G^\prime}_h^{\gD_1} \subseteq {G^\prime}_1^{\gD_1}$ for all $h \in [H]$. Then we have that
\begin{align*}
    \sum_{h=1}^H \sum_{x \in \Phi} \text{EA}_h (x) \leq H \sqrt{\frac{3 |\Phi| \log \lp 4 |\Phi| H / \delta \rp}{n^\prime}}.
\end{align*}
When the interaction complexity satisfies that $n^\prime \gtrsim \frac{| \Phi | H^{2}}{\varepsilon^2} \log\lp  \frac{|\Phi| H}{\delta} \rp$, with probability at least $1-\frac{\delta}{2}$ over the randomness of $\gD^\prime_{\env}$, we have $\sum_{h=1}^H \sum_{x \in \Phi} \text{EA}_h (x) \leq \frac{\varepsilon}{2}$. 



Second, we upper bound the term Error B. Similarly, we can leverage Chernoff's bound to characterize its concentration rate. For a trajectory $\tr_h$, let $E^x_h$ be the event that $\tr_h$ agrees with expert policy at abstract state $x$ at time step $h$ but is not in $\Tr^{\gD_1, \phi}_h$, that is,
\begin{align*}
    E^x_h = \{ \phi_h (\tr_h (\cdot)) = x \cap \tr_h (a_h) = \pi^{\expert, \phi}_h (x) \cap \tr_h \not\in  \Tr^{\gD_1, \phi}_h \}.
\end{align*}
We measure $E^x_h$ under the stochastic process induced by the expert policy $\piE$. Accordingly, its probability is
denoted as $\sP^{\piE} (E^x_h)$. We see that $\sP^{\piE} (E^x_h)$ is equal to the second term in $\text{EB}_h (x)$. Moreover, the first term in $\text{EB}_h (x)$ is an empirical estimation for $\sP^{\piE} (E^x_h)$. After applying Chernoff's bound, with probability at least $1-\delta/(2 \vert \Phi \vert H)$ with $\delta \in (0, 1)$ (over the randomness of the expert demonstrations $\gD_1^c$), for each $h \in [H], x \in \Phi$, we have
\begin{align*}
    \text{EB}_h (x) \leq \sqrt{\sP ^{\piE} \left(E_{h}^{x}\right) \frac{3 \log (4|\Phi| H / \delta)}{m}}.
\end{align*}
Therefore, with probability at least $1 - \delta/2$, we have
\begin{align*}
     \sum_{h=1}^{H} \sum_{x \in \Phi} \text{EB}_h (x)  &\leq \sum_{h=1}^{H} \sum_{x \in \Phi} \sqrt{\sP ^{\piE} \left(E_{h}^{x}\right) \frac{3 \log (4|\Phi| H / \delta)}{m}}
     \\
     &\leq \sum_{h=1}^{H}  \sqrt{\sum_{x \in \Phi} \sP ^{\piE} \left(E_{h}^{x}\right) \frac{3 \vert \Phi \vert \log (4|\Phi| H / \delta)}{m}},
\end{align*}
where the last step follows from the Cauchy--Schwarz inequality. It remains to upper bound $\sum_{x \in \Phi} \sP ^{\piE} \left(E_{h}^{x}\right)$ for all $h \in [H]$. To this end, we define the event $G^{\gD_1}_h$: the expert policy visits certain abstract states uncovered in $\gD_1$ up to time step $h$. Formally, $G^{\gD_1}_h = \{ \exists h^\prime \leq h, \phi_{h^\prime} (s_{h^\prime}) \not \in \Phi_{h^\prime} (\gD_1) \}$, where $\Phi_{h^\prime} (\gD_1)$ is the set of abstract states in $\gD_1$ at time step $h^\prime$. Then, for all $h \in [H]$, we have
\begin{align*}
   \sum_{x \in \Phi} \sP ^{\piE} \lp E_{h}^{x} \rp = \sP^{\piE} \lp G^{\gD_1}_h \rp \leq \sP^{\piE} \lp G^{\gD_1}_{H} \rp,  
\end{align*}
where the first equality is true because $\cup_{x \in \Phi} E_{h}^{x} $ corresponds to the event that $\piE$ has visited some state uncovered in $\gD_1$, and the last inequality holds since $G^{\gD_1}_h \subseteq G^{\gD_1}_{H}$ for all $h \in [H]$. Conditioned on $\gD_1$, we further have
\begin{align*}
    \sP^{\piE}(G_H^{\gD_1}) \leq \sum_{h=1}^{H} \sum_{x \in \Phi} d^{\piE, \phi}_h(x) \indict\lb x \notin \Phi_h(\gD_1)  \rb.
\end{align*}

We first consider the expectation $\expect[\sum_{h=1}^{H} \sum_{x \in \Phi} d^{\piE, \phi}_h(x) \indict\lb x \notin \Phi_h(\gD_1)  \rb]$, where the expectation is taken over the expert dataset $\gD_{1}$. 
\begin{align*}
    \expect\ls \sum_{h=1}^{H} \sum_{x \in \Phi} d^{\piE, \phi}_h(x) \indict\lb x \notin \Phi_h(\gD_1)  \rb  \rs \leq \sum_{h=1}^{H} \sum_{x \in \Phi}  d_h^{\piE, \phi}(x) \lp 1 - d_h^{\piE, \phi}(x)  \rp^{m/2} \leq \frac{8 |\Phi| H}{9m},
\end{align*}
where the last step uses the numerical inequality\footnote{The first inequality is based on basic calculus (the maximum is attained at $x = 1/(m+1)$), and the second inequality uses ${1}/{(1+m)} \cdot \lp 1 - {1}/{(m+1)} \rp^{m} = {1}/{m} \cdot \lp 1 - {1}/{(m+1)} \rp^{m+1}$ together with the fact that $(1 - 1/x)^{x} \leq 1/e \leq 4/9$ when $x \geq 1$.} $\max_{x \in [0, 1]} x (1-x)^{m} \leq {1}/{(1+m)} \cdot \lp 1 - {1}/{(m+1)}  \rp^{m} \leq {4}/{(9m)}$. By \citet[Lemma A.3]{rajaraman2020fundamental}, with probability at least $1-\delta$ with $\delta \in (0, \min\{1, H/5\})$, we have
\begin{align*}
    \sum_{h=1}^{H} \sum_{x \in \Phi} d^{\piE, \phi}_h(x) \indict\lb x \notin \Phi_h(\gD_1)  \rb \leq \frac{8|\Phi| H}{9m} + \frac{6 \sqrt{|\Phi|} H \log(H/\delta)}{m}.
\end{align*}
Then we have 
\begin{align*}
\sum_{h=1}^{H} \sum_{x \in \Phi} \text{EB}_h (x) &\leq \sum_{h=1}^{H} \sqrt{ \lp \frac{8|\Phi| H}{9m} + \frac{6 \sqrt{|\Phi|} H \log(2H/\delta)}{m}  \rp \frac{3 |\Phi| \log (4 |\Phi| H/\delta)}{m}} \\
&\leq \frac{H^{3/2} |\Phi|}{m} \log^{1/2}\lp \frac{4|\Phi| H}{\delta}  \rp \sqrt{ \frac{8}{3} + 18 \log (2H/\delta)  }.
\end{align*}
When the expert sample complexity satisfies that $m \gtrsim   \frac{H^{3/2} | \Phi | }{\varepsilon} \log\lp  \frac{|\Phi| H}{\delta} \rp$, with probability at least $1-\frac{\delta}{2}$ over the randomness of $\gD$, we have $\sum_{h=1}^H \sum_{x \in \Phi} \text{EB}_h (x) \leq \frac{\varepsilon}{2}$. Then, with union bound, with probability at least $1-\delta$, we can obtain
\begin{align*}
    \sum_{h=1}^H \lnorm \widetilde{d}^{\piE, \phi}_h - d^{\piE, \phi}_h  \rnorm_1 \leq \sum_{h=1}^H \sum_{x \in \Phi} \text{EA}_h (x) +  \sum_{h=1}^H \sum_{x \in \Phi} \text{EB}_h (x) \leq \varepsilon,
\end{align*}
which completes the proof.
\end{proof}


% \newpage
\section{Experiment Details}
\label{section:experiment_details}
% The methods for comparison include BC, OAL and \mbalgname. We use the optimal policy to collect expert demonstrations. All methods are provided the same number of expert demonstrations. All experiments run with $20$ random seeds.  All experiments run on the machine with $32$ CPU cores, $128$ GB RAM and NVIDIA GeForce RTX $2080$ Ti.


\textbf{Environment.} The Reset Cliff MDP is from \citep{rajaraman2020fundamental, xu2021error}. The state space is $\gS = \{1, 2, \ldots, |\gS| - 1, b \}$ and the action space is $\gA = \{1, 2, \ldots, |\gA| - 1,  a^{\expert} \}$, where $b$ is a unique absorbing state and $a^{\expert}$ is the expert action. An example with three states and two actions is shown in \cref{fig:reset_cliff}, where the expert action is shown in green. Only the expert action has a reward $+1$. All non-expert actions have the same transitions and rewards. The initial state distribution is $\rho = (1/m, \ldots, 1/m,  1-|\gS|/m+2/m, 0)$. 

\begin{figure}[htbp]
    \centering
    \includegraphics[width=0.5\linewidth]{figs/three-cliffwalking.pdf}
    \caption{An example of Reset Cliff with three states and two actions. Arrows indicate the transitions and digits indicate the reward values.}
    \label{fig:reset_cliff}
\end{figure}

In our experiments, we implement the Reset Cliff MDP with 20 states and 5 actions. The planning horizon is 20. All algorithms are provided with 100 expert trajectories. All experiments run with $20$ random seeds. 


% \begin{table}[htbp]
% \centering
% \caption{Performance of the trained policies by TAIL and MIMIC-MD on Reset Cliff with different horizons. Numbers correspond to the averaged evaluation return over 20 random seeds ($\pm$ indicates the standard deviation). N.A. indicates that the experiment terminates due to running out of memory.}
% \label{tab:noisy_expert}
% \begin{tabular}{@{}c|llllllll}
% \toprule
%             & 100  & 200 & 300 & 400 & 500 & 600 & 700 &800 \\ \midrule
% TAIL      &                          \\  
% MIMIC-MD    &  & N.A.  & N.A.  & N.A.  & N.A.  & N.A.  & N.A.  & N.A.      \\
% \bottomrule
% \end{tabular}
% \end{table}




% \RED{The description about Reset Cliff and Grid World.}
% Table \ref{table:task_information_unknown_transition_setting} summarizes the detailed information on tasks of Reset Cliff and Grid World.
% \begin{table}[htbp]
% \caption{Information about Reset Cliff and Grid World.}
% \label{table:task_information_unknown_transition_setting}
% \centering
% % \resizebox{\columnwidth}{!}{%
% \begin{tabular}{@{}lllll@{}}
% \toprule
% Tasks           & Number of states & Number of actions & Horizon & Number of expert demonstrations \\ \midrule
% Reset Cliff          & 20              & 5      &  20          & 100           \\
% \midrule 
% Grid World   & 25             & 4 & 12  & 4          \\ \bottomrule
% \end{tabular}%
% %
% % }
% \end{table}



\textbf{Algorithm Implementation.} BC directly estimates the expert policy from expert demonstrations. Since the expert policy is deterministic, BC copies the expert action on visited states and takes a uniform policy on non-visited states. The implementation of FEM and GTAL follows the description in \citet{pieter04apprentice} and \citet{syed07game}, respectively. 

% On the other hand, MB-TAIL and OAL iteratively update the policy and reward function. The number of iterations $T$ of MB-TAIL is set as 500. Notice that OAL performs one update after each interaction. Therefore, the number of iterations $T$ equals the number of interactions in OAL.



MB-TAIL first establishes the estimator in \cref{eq:new_estimator_unknown_transition} with $20 \%$ of the environment interactions and learns an empirical transition model by invoking RF-Express~\citep{menard20fast-active-learning} to collect the remaining $80 \%$ trajectories. Subsequently, MB-TAIL performs policy and reward optimization with the recovered transition model. In particular, MB-TAIL utilizes value iteration to obtain the optimal policy (Line 2 of \cref{algo:gradient_based_optimization}). Besides, MB-TAIL utilizes online gradient descent to update the reward function. To utilize the optimization structure, we implement an adaptive step size~\citep{Orabona19a_modern_introduction_to_ol} rather than the constant step size:
\begin{align*}
    \eta_{t} = \frac{D}{ \sqrt{\sum_{i=1}^t \lnorm \nabla_{w} f^{(i)} \lp w^{(i)} \rp \rnorm_2^2}},
\end{align*}
where $D = \sqrt{2H |\gS| |\gA|}$ is the diameter of the set $\gW$. The conclusions about the sample complexity and computational complexity are not affected by this adaptive step size. The number of iterations $T$ of MB-TAIL is 500.


% On the other hand, OAL is a model-based method and uses mirror descent (MD) \citep{Orabona19a_modern_introduction_to_ol} to optimize policy and reward. During the interaction, OAL maintains an empirical transition model to estimate Q-function for policy optimization. 


% in experiments and hence, OAL requires too many interactions to reach a good and stable performance.

To encourage exploration, OAL adds a bonus function to the Q-function. The bonus function used in the theoretical analysis of~\citet{shani2022online} is too large to be practical. Therefore, we simplify their bonus function from $b_{h}^k (s, a)=\sqrt{ \frac{4 H^{2} |\gS| \log \lp 3 H^{2} |\gS| |\gA| n / \delta \rp}{ n_{h}^{k}(s, a) \vee 1}}$ to  $b_{h}^k (s, a)=\sqrt{\frac{ \log \lp H |\gS| |\gA| n / \delta \rp}{n_{h}^{k}(s, a) \vee 1}}$, where $n$ is the total number of interactions, $\delta$ is the failure probability,  $n^{k}_h (s, a)$ is the number of times visiting $(s, a)$ at time step $h$ until episode $k$, and $n^{k}_h(s, a) \vee 1 = \max\{ n^{k}_h(s, a), 1 \}$. With the learned transition model and Q-function, OAL uses mirror descent (MD) to optimize the policy and reward function. The step sizes of MD are set by the results in the theoretical analysis of~\citet{shani2022online}. The number of iterations $T$ of OAL is also 500.


% After the training process, we evaluate the policy value via exact Bellman update.








% \input{appendix/notation}
% \input{appendix/regret_to_pac}
% \input{appendix/proof_warm_up}
% \input{appendix/proof_main_result}
% \input{appendix/proof_state_abstraction}
% \input{appendix/experiment}
\bibliography{xu_380-supp}

\end{document}
