% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{abbrvnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
% \usepackage{xr}
\usepackage{tikz} % nice language for creating drawings and diagrams
% \AtBeginDocument{\setlength\abovedisplayskip{4.5pt}}
% \AtBeginDocument{\setlength\belowdisplayskip{4.5pt}}
% Spacing hooks: both macros are currently empty, so they expand to nothing.
% NOTE(review): presumably meant to be redefined (e.g., with a negative
% \vspace) to tune the space around section headings -- confirm before use.
\newcommand{\sectionspace}{}
\newcommand{\subsectionspace}{}
\input{packages/defs}
\input{packages/header}
\input{packages/math_commands}
% \meanstd{<mean>}{<std>}: typesets "<mean> ± <std>", with the "± <std>" part
% set in \scriptscriptstyle.
% \ensuremath (instead of a hard-coded $...$) keeps the text-mode output
% unchanged while also allowing the macro inside an existing math environment.
\newcommand{\meanstd}[2]{\ensuremath{#1 {\scriptscriptstyle \pm #2}}}
% \externaldocument{xu_380-supp}
% \usepackage{todonotes}


% Upright operator names for use in math mode.
\DeclareMathOperator{\est}{EST}
\DeclareMathOperator{\rfe}{RFE}
\DeclareMathOperator{\env}{env}
\DeclareMathOperator{\rl}{RL}
\DeclareMathOperator{\bc}{BC}
% \opt is overridden rather than declared: \renewcommand requires that \opt
% already exists, and \DeclareMathOperator cannot redefine a command.
\renewcommand{\opt}{\operatorname{OPT}}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>}: typesets <b><sep><a>, i.e., the two mandatory
% arguments in reverse order, joined by the optional separator (default "-").
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Provably Efficient Adversarial Imitation Learning with Unknown Transitions}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
% Each author name links to a mailto: URI. The address is written bare (no
% angle brackets, no trailing `?') so that the link target is a valid URI.
% NOTE(review): the manual marks $^*$ and $^\dag$ presumably repeat the
% \thanks marks of the first and third author -- confirm in the output.
\author[1, 4]{\href{mailto:xut@lamda.nju.edu.cn}{Tian Xu\thanks{Equal contribution. Author ordering is determined randomly using a coin flip.}}}
\author[2, 3]{\href{mailto:ziniuli@link.cuhk.edu.cn}{Ziniu Li{$^*$}}}
\author[1, 4]{\href{mailto:yuy@nju.edu.cn}{{Yang Yu\thanks{Corresponding author.}}}}
\author[2, 3]{\href{mailto:luozq@cuhk.edu.cn}{Zhi-Quan Luo{$^\dag$}}}
% \author[1]{Further~Coauthor}
% \author[3]{Further~Coauthor}
% \author[3,1]{Further~Coauthor}
% Add affiliations after the authors
% The bracketed index of each \affil matches the indices passed to \author.
\affil[1]{%
    National Key Laboratory for Novel Software Technology, Nanjing University
}
\affil[2]{%
    The Chinese University of Hong Kong, Shenzhen
}
\affil[3]{%
    Shenzhen Research Institute of Big Data
}
\affil[4]{%
    Polixir.ai
}
  
  \begin{document}
\maketitle

% optimizes policies and rewards adversarially

% However, compared with recent advances in AIL with known transitions,   There have been lots of theoretical studies of AIL in the ideal known transition setting where AIL can evaluate policies accurately. However, the transition function is unavailable for many applications, and  the theoretical foundation of AIL with unknown transitions remains under-developed. 
 
\begin{abstract}
% Imitation learning (IL) directly learns good policies from expert demonstrations. As a broad class of IL methods, adversarial imitation learning (AIL) performs well in many applications. However, the theoretical foundation of AIL (with unknown transitions) remains under-developed. In this paper, we theoretically explore this direction, where the main challenge is that environment transitions are stochastic and uncertain. We study both expert sample complexity and interaction complexity required to recover good policies, which are of great interest in practice. First, we establish a general framework that connects reward-free exploration, an emerging topic in online reinforcement learning, with AIL. Under this framework, we design an algorithm MB-TAIL, which achieves the expert sample complexity $\widetilde{\mathcal{O}} (H^{3/2} |\mathcal{S}|/\varepsilon)$ and interaction complexity $\widetilde{\mathcal{O}} (H^{3} |\mathcal{S}|^2 |\mathcal{A}|/\varepsilon^2)$. Here $H$ is the planning horizon, $|\gS|$ is the state space size, $|\gA|$ is the action space size, and $\varepsilon$ is the desired imitation gap. To our best knowledge, MB-TAIL is the first to achieve the minimax optimal expert sample complexity in the unknown transition setting. Besides, MB-TAIL further improves the interaction complexity than the best-known algorithm OAL by $\gO (H)$. Finally, we extend MB-TAIL to the function approximation setting. In particular, we prove that MB-TAIL can achieve the expert sample and interaction complexity independent of $|\gS|$, which demonstrates its generalization ability. 

% The process of learning good policies from expert demonstrations, known as imitation learning (IL), has been proven effective in many applications. This paper explores the theoretical underpinnings of AIL in this context, where the primary challenge is the stochastic and uncertain nature of environment transitions. , which are of great practical interest.
Imitation learning (IL) has proven to be an effective method for learning good policies from expert demonstrations. Adversarial imitation learning (AIL), a subset of IL methods, is particularly promising, but its theoretical foundation in the presence of unknown transitions has yet to be fully developed. This paper explores the theoretical underpinnings of AIL in this context, where the stochastic and uncertain nature of environment transitions presents a challenge. We examine the expert sample complexity and interaction complexity required to recover good policies. To this end, we establish a framework connecting reward-free exploration and AIL, and propose an algorithm, MB-TAIL, that achieves the minimax optimal expert sample complexity of $\widetilde{\mathcal{O}} (H^{3/2} |\mathcal{S}|/\varepsilon)$ and an interaction complexity of $\widetilde{\mathcal{O}} (H^{3} |\mathcal{S}|^2 |\mathcal{A}|/\varepsilon^2)$. Here, $H$ is the planning horizon, $|\gS|$ is the state space size, $|\gA|$ is the action space size, and $\varepsilon$ is the desired imitation gap. MB-TAIL is the first algorithm to achieve this level of expert sample complexity in the unknown transition setting and improves upon the interaction complexity of the best-known algorithm, OAL, by a factor of $\gO (H)$. Additionally, we demonstrate the generalization ability of MB-TAIL by extending it to the function approximation setting and proving that it can achieve expert sample and interaction complexity independent of $|\gS|$.
% In this work, we design provably efficient imitation learning algorithms that learn policies from expert demonstrations. First, in the known transition setting, the recent method MIMIC-MD achieves the minimax optimal sample complexity $\widetilde{\mathcal{O}} (H^{3/2} |\mathcal{S}|/\varepsilon)$ but suffers a quadratic space complexity $\mathcal{O} ( (|\gS| |\gA| H)^2)$, which hinders its application on large-scale problems. Here $H$ is the horizon, $|\mathcal{S}|$ is the state space size, $|\gA|$ is the action space size and $\varepsilon$ is the desired imitation gap. To address this computation issue, we develop a memory-efficient method TAIL with a better linear space complexity $\mathcal{O} (|\gS| |\gA| H)$, which also achieves the same sample complexity with MIMIC-MD. Second, when the transition function is unknown but the interaction is allowed, we develop an interaction-efficient method MB-TAIL, which enjoys the sample complexity $\widetilde{\mathcal{O}} (H^{3/2} |\mathcal{S}|/\varepsilon)$ and interaction complexity $\widetilde{\mathcal{O}} (H^{3} |\mathcal{S}|^2 \mathcal{A}/\varepsilon^2)$. In particular, MB-TAIL is significantly better than the best-known OAL algorithm in terms of both sample and interaction complexity. Technically, MB-TAIL is built upon a novel framework that connects reward-free exploration and adversarial imitation learning. Finally, we investigate TAIL and MB-TAIL with function approximation. With proper state abstractions, we prove that they can achieve a sample complexity independent of $|\mathcal{S}|$, which demonstrates their generalization properties.

% In this work, we design provably more efficient imitation learning algorithms that learn policies from expert demonstrations. Firstly, when the transition function is known, built upon the method MIMIC-MD \citep{rajaraman2020fundamental}, we propose to relax a complex projection operator. Based on this relaxation, we develop an adversarial imitation learning method named \emph{TAIL} with a gradient-based optimization procedure. Accordingly, TAIL still keeps the same sample complexity (i.e., the number of expert trajectories) $\widetilde{\mathcal{O}} (H^{3/2} |\mathcal{S}|/\varepsilon)$ with MIMIC-MD, where $H$ is the planning horizon, $|\mathcal{S}|$ is the state space size and $\varepsilon$ is the desired imitation gap. This implies TAIL could be better than conventional AIL methods with the sample complexity $\widetilde{\mathcal{O}} (H^2 |\mathcal{S}|/\varepsilon^2)$ in the worst case. In addition, TAIL is more practical than MIMIC-MD as the former has a linear space complexity $\mathcal{O} (|\mathcal{S} | |\mathcal{A}| H)$ while the latter suffers a quadratic one $\mathcal{O} ((|\mathcal{S} | |\mathcal{A}| H)^2)$. Secondly, when the transition function is unknown but the interaction is allowed, we propose an algorithmic framework that connects reward-free exploration and AIL. Under this framework, we develop an extension of TAIL named \emph{MB-TAIL}, which still enjoys the sample complexity $\widetilde{\mathcal{O}} (H^{3/2} |\mathcal{S}|/\varepsilon)$ and interaction complexity $\widetilde{\mathcal{O}} (H^{3} |\mathcal{S}|^2 \mathcal{A}/\varepsilon^2)$. In particular, MB-TAIL is significantly better than the best-known OAL algorithm \citep{shani2022online} in both sample complexity and interaction complexity. To our understanding, MB-TAIL is the first algorithm that shifts the advances in the known transition setting to the unknown transition setting. 


% In this work, we design provably (more) efficient imitation learning algorithms that directly optimize policies from expert demonstrations. Firstly, when the transition function is known, we build on the nearly minimax optimal algorithm MIMIC-MD \citep{rajaraman2020fundamental} and relax a projection operator in it. Based on this change, we develop an adversarial imitation learning (AIL) algorithm named \emph{TAIL} with a gradient-based optimization procedure. Accordingly, TAIL has the same sample complexity (i.e., the number of expert trajectories) $\widetilde{\mathcal{O}}(H^{3/2} |\mathcal{S}|/\varepsilon)$ with MIMIC-MD, where $H$ is the planning horizon, $|\mathcal{S}|$ is the state space size and $\varepsilon$ is desired policy value gap. This implies TAIL could be better than conventional AIL methods such as FEM and GTAL in the worst-case since they have a sample complexity $\widetilde{\mathcal{O}}(H^2 |\mathcal{S}| / \varepsilon^2)$. In addition, TAIL is more practical than MIMIC-MD as the former has a space complexity $\mathcal{O} (|\mathcal{S}||\mathcal{A}|H)$ while the latter's is about $\mathcal{O} (|\mathcal{S}|^2 |\mathcal{A}|^2 H^2)$. Secondly, under the scenario where the transition function is unknown but the interaction is allowed, we present an extension of TAIL named \emph{MB-TAIL}. The sample complexity of MB-TAIL is still $\widetilde{\mathcal{O}}(H^{3/2} |\mathcal{S}|/\varepsilon)$ while the interaction complexity (i.e., the number of interaction episodes) is $\widetilde{\mathcal{O}} (H^3 |\mathcal{S}|^2 |\mathcal{A}| / \varepsilon^2)$. In particular, MB-TAIL is significantly better than the best-known OAL algorithm in \citep{shani2022online}, which has a sample complexity $\widetilde{\mathcal{O}}(H^{2} |\mathcal{S}|/\varepsilon^2)$ and interaction complexity $\widetilde{\mathcal{O}} (H^4 |\mathcal{S}|^2 |\mathcal{A}| / \varepsilon^2)$. The advances in MB-TAIL are based on a new framework that connects reward-free exploration and AIL. 
To the best of our knowledge, MB-TAIL is the first algorithm that transfers the advances in the known transition setting to the unknown transition setting.
\end{abstract}

% \sectionspace
\section{Introduction}\label{sec:intro}
% \sectionspace


% \todo[inline]{\dquote{Most previous studies ...}: this claim is not accurate. Indeed, many empirical works under the unknown transition setting. A better writing could be: compared with recent advances in IL with known transitions, the theoretical foundation of AIL with unknown transition remains under-developed.}


% \todo[inline]{The exact improvement over OAL should be stated clearly. For instance, \dquote{improves OAL by $\gO(H)$}.}

% \todo[inline]{In the abstract, remove some technique details and show the contribution clearly.}

% Sequential decision-making tasks are ubiquitous in real life, where agents implement policies to maximize the long-term return. One paradigm to obtain good policies is reinforcement learning (RL)~\citep{sutton2018reinforcement}, which learns from trial and error in unknown environments; it usually requires million of samples to achieve satisfying performance in practice~\citep{mnih2015human, duan2016benchmarking}. On the other hand, imitation learning (IL)~\citep{argall2009survey, osa2018survey} directly optimizes policies from expert demonstrations, which is more sample-efficient and has been successfully demonstrated in many applications \citep{shi2019taobao, levin16_end_to_end, jang2022bc}.

Sequential decision-making tasks, in which agents devise policies to maximize the long-term return, are ubiquitous in real-life scenarios. Reinforcement learning (RL)~\citep{sutton2018reinforcement} is a popular paradigm for learning effective policies through trial and error in unknown environments. However, RL often requires a large number of samples and laborious reward engineering to achieve satisfactory performance in practice. Alternatively, imitation learning (IL)~\citep{argall2009survey, osa2018survey} provides a more sample-efficient approach to policy optimization by directly learning from expert demonstrations, and has proven successful in various applications~\citep{levin16_end_to_end, shi2019taobao, jang2022bc}. By leveraging existing expert knowledge, IL methods enable efficient policy learning in situations where RL might be infeasible or expensive. Therefore, IL has become an increasingly popular and practical alternative for real-world applications.


% The target of IL is to optimize the policy \emph{without} the environment reward function. To achieve this, given expert demonstrations, it minimizes the policy value gap between the expert policy and the imitated policy~\citep{ross2010efficient, xu2020error, rajaraman2020fundamental}. Representative IL methods include behavioral cloning (BC)~\citep{Pomerleau91bc, ross11dagger} and adversarial imitation learning (AIL)~\citep{pieter04apprentice, syed07game}. In particular, BC aims to imitate the expert policy distribution. More specifically, BC builds on \emph{supervised learning} to minimize the policy distribution discrepancy with the expert policy on the given expert dataset. In contrast, AIL mainly follows the principle of \emph{state-action distribution matching}~\citep{pieter04apprentice, syed07game, ho2016gail}. In other words, AIL aims to optimize the policy such that the induced state-action distribution could match the counterpart generated by the expert. Algorithmically, it solves a min-max problem: the learner infers an adversarial reward function to maximize the policy value gap and subsequently learns a policy to minimize the policy value gap with the recovered reward function. Based on the above two principles, many practical algorithms \citep{Torabi18bco, Brantley20disagreement, fu2018airl, ho2016gail,  ke19imitation_learning_as_f_divergence, Kostrikov19dac, Kostrikov20value_dice} have been developed.


% optimize the policy \emph{without} the environment reward function. To achieve this, given expert demonstrations, it minimizes

% The target of IL is to minimize the policy value gap between the expert policy and the imitated policy~\citep{ross2010efficient, xu2020error, rajaraman2020fundamental}. Representative IL methods include behavioral cloning (BC)~\citep{Pomerleau91bc, ross2010efficient} and adversarial imitation learning (AIL)~\citep{pieter04apprentice, syed07game, ziebart2008MEIRL, ho2016gail}. More specifically, BC adopts supervised learning to minimize the policy distribution discrepancy with the expert policy. In contrast, AIL mainly follows the principle of \emph{state-action distribution matching}~\citep{pieter04apprentice, syed07game, ho2016gail}. Algorithmically, it solves a min-max problem: the learner infers an adversarial reward function that maximizes the policy value gap, and subsequently learns a policy to minimize the policy value gap with the inferred reward function. Based on the above two principles, many practical algorithms \citep{Torabi18bco,  fu2018airl,  ke19imitation_learning_as_f_divergence, Kostrikov19dac, Brantley20disagreement, garg2021iqlearn, dadashi2021primal} have been developed. 

Imitation learning (IL) is a framework that aims to minimize the difference between the expert policy and the imitated policy~\citep{ross2010efficient, xu2020error, rajaraman2020fundamental}. The two prominent IL methods are behavioral cloning (BC)~\citep{Pomerleau91bc, ross2010efficient} and adversarial imitation learning (AIL)~\citep{pieter04apprentice, syed07game,ziebart2008MEIRL, ho2016gail}. BC employs supervised learning to minimize the discrepancy between the policy distribution of the imitated policy and the expert policy. On the other hand, AIL focuses on state-action distribution matching, where the learner estimates an adversarial reward function that maximizes the policy value gap and then learns a policy to minimize the gap with the inferred reward function through a min-max optimization. Practical algorithms that build upon these principles have been developed and applied to various domains \citep{Torabi18bco, fu2018airl,  ke19imitation_learning_as_f_divergence, Kostrikov19dac, Brantley20disagreement, garg2021iqlearn, dadashi2021primal, viano2022proximal}. 
% These algorithms provide efficient and effective solutions to learn policies from expert demonstrations, which is particularly useful when direct optimization through RL is infeasible or too costly.


% \vspace{-0.1cm}
% \begin{table*}[tbp]
%     \centering
%     \caption{Sample complexity and interaction complexity to achieve an $\varepsilon$-optimal policy for different IL algorithms. We use $\widetilde{\gO}$ and $\widetilde{\Omega}$ to hide logarithmic factors. Here \dquote{\xmark} means the algorithm/bound is not applicable under the respective setting.}
%     \label{table:summary-of-results}
%     { % \small
%     \resizebox{\textwidth}{!}{%
%     \begin{tabular}{l|l|l|l}
%     \toprule
%     \multirow{2}{*}{} &
%       \multicolumn{1}{c|}{Known Transition Setting} &
%       \multicolumn{2}{c}{Unknown Transition Setting} \\  \cline{2-4}%  \cmidrule(l){2-4} 
%      &
%       Sample Complexity &
%       Sample Complexity &
%       Interaction Complexity \\ \hline
%     BC~\citep{rajaraman2020fundamental} &
%       $\widetilde{\Theta} \lp \frac{H^2 |\gS|}{\varepsilon} \rp$ &
%       $\widetilde{\Theta} \lp \frac{H^2 |\gS|}{\varepsilon} \rp$ &
%       $0$ \\ \hline
%     FEM~\citep{pieter04apprentice} &
%       $\widetilde{\Theta} \lp \frac{H^{2} |\gS|}{\varepsilon^2} \rp$ &
%       $\widetilde{\gO} \lp \frac{H^{2} |\gS|}{\varepsilon^2} + \frac{H^8 |\gS|^3 |\gA| }{\varepsilon^5} \rp $ &
%       $0$ \\ \hline
%     GTAL~\citep{syed07game} &
%       $\widetilde{\Theta} \lp \frac{H^{2} |\gS|}{\varepsilon^2} \rp$ &
%       $\widetilde{\gO} \lp \frac{H^{2} |\gS|}{\varepsilon^2} + \frac{H^6 |\gS|^3 |\gA| }{\varepsilon^3} \rp$ &
%       $0$ \\ \hline
%     OAL~\citep{shani2022online}        & $\widetilde{\Theta} \lp \frac{H^{2} |\gS|}{\varepsilon^2} \rp$ & $\widetilde{\gO} \lp \frac{H^{2} |\gS|}{\varepsilon^2} \rp$ & $\widetilde{\gO} \lp \frac{H^4 |\gS|^2 |\gA| }{\varepsilon^2} \rp$ \\ \hline
%     MIMIC-MD~\citep{rajaraman2020fundamental} &
%       $\widetilde{\gO} \lp \frac{H^{3/2} |\gS|}{\varepsilon} \rp$ &
%       \xmark &
%       \xmark \\ \hline
%     \textbf{TAIL} (Algorithm \ref{algo:main_aglorithm}) & $\widetilde{\gO} \lp \frac{H^{3/2} |\gS|}{\varepsilon} \rp$     & \xmark    & \xmark \\ \hline
%     \textbf{MB-TAIL} (Algorithm \ref{algo:mbtail-abstract}) & \xmark & $\widetilde{\gO} \lp \frac{H^{3/2} |\gS|}{\varepsilon} \rp$     & $\widetilde{\gO}\lp \frac{H^3 |\gS|^2 |\gA| }{\varepsilon^2} \rp$ \\ \hline
%     Lower Bound \citep{nived2021provably} &
%       $\widetilde{\Omega} \lp \frac{H^{3/2} }{\varepsilon} \rp$ &
%       \xmark &
%       \xmark \\ \bottomrule
%     \end{tabular}
%    }
%    }
% \end{table*}

% \vspace{-0.1cm}

% \textbf{Context.} 

% In the ideal known transition setting, there have been lots of theoretical studies \citep{pieter04apprentice, syed07game, syed08lp, Zahavy20al_via_frank-wolfe, rajaraman2020fundamental} toward understanding AIL. When the transition function is known, the learner can exactly evaluate a policy for any reward function, which is crucial for the policy optimization step in AIL. When performing state-action distribution matching, conventional AIL methods such as FEM \citep{pieter04apprentice} and GTAL \citep{syed07game} utilize the maximum likelihood estimator of the expert's state-action distribution. They can achieve the sample complexity (i.e., the number of expert trajectories) of $\widetilde{\gO} ( H^2|\gS|/\varepsilon^2)$\footnote{We translate the results in  \citep{pieter04apprentice,syed07game} from the infinite-horizon setting to the episodic setting by 1) replacing the effective planning horizon $1 / (1-\gamma)$ with the finite planning horizon $H$; 2) instantiating the linear feature with the one-hot feature under the tabular setting.} in the worst case, where $H$ is the planning horizon, $|\gS|$ is the state space size and $\varepsilon$ is the desired policy value gap. On the other hand, the lower bound under this scenario is $\widetilde{\Omega} (H^{3/2} /\varepsilon)$ \citep{nived2021provably}, suggesting conventional AIL is not minimax optimal.


% \todo[inline]{It is still rapid in giving the context. I would give more background information and motivation: first, I would say AIL performs better than BC in practice, which motivates to study AIL rather than BC. Second, I would say theoretical analysis of AIL is hard, so many prior works simplified the problem and focused on the known transition setting (I could review these works in the related work section). Given the above information, I would say this paper will study AIL with unknown transition, an important and practical problem. Then, I review some prior works on AIL with unknown transition. Finally, I tell readers the contributions of this paper, compared with the mentioned prior works. }

% Recently, there has been some progress in developing the minimax optimal method in the known transition setting. In particular, \citet{rajaraman2020fundamental} propose a new type of IL method MIMIC-MD, which can be viewed as a combination of BC and AIL. More specifically, with a transition-aware estimator, MIMIC-MD solves a constrained state-action distribution matching problem subject to the set of BC policies. Importantly, MIMIC-MD achieves a sample complexity of $\widetilde{\gO}(H^{3/2}|\gS|/\varepsilon)$, which matches the lower bound in terms of $H$ and $\varepsilon$. 

% Interestingly, many empirical studies \citep{ho2016gail,  ke19imitation_learning_as_f_divergence, Kostrikov19dac, ghasemipour2019divergence} demonstrate that AIL often outperforms BC by a wide margin. This phenomenon motivates a lot of theoretical studies \citep{zhang2020gail, Chen20on_computation_and_generalization_of_gail, rajaraman2020fundamental, rajaraman2021value, xu2020error, liu2021provably, li2022rethinking, xu2022understanding} toward understanding AIL. However, analyzing AIL is challenging because both expert policy and environment transitions are unknown, meaning that the expert estimation and policy evaluation/optimization are inaccurate. The complicated running procedure of AIL\footnote{That is, AIL repeatedly explores the environment and imitates the expert over iterations.} further makes the theoretical analysis difficult. To this end, some prior works \citep{pieter04apprentice, syed08lp, rajaraman2020fundamental, rajaraman2021value, xu2022understanding} have assumed that the transition function is known, which significantly simplifies the problem. 

A remarkable observation from empirical studies~\citep{ho2016gail, Kostrikov19dac, ghasemipour2019divergence} is that adversarial imitation learning (AIL) often outperforms behavioral cloning (BC) by a significant margin. This phenomenon has spurred numerous theoretical investigations~\citep{zhang2020gail, Chen20on_computation_and_generalization_of_gail, rajaraman2020fundamental, rajaraman2021value, xu2020error, liu2021provably, xu2022understanding} aimed at understanding the mechanisms of AIL. However, analyzing AIL is challenging because both the expert policy and environment transitions are unknown, making expert estimation and policy optimization/evaluation inaccurate. The complex min-max implementation of AIL further compounds the difficulty of the theoretical analysis. As a result, several prior works~\citep{pieter04apprentice, syed08lp, rajaraman2020fundamental, rajaraman2021value, xu2022understanding} have made the simplifying assumption of a known transition function to facilitate the analysis.


% by bypassing the environment exploration difficulty.


% To conduct theoretical analysis, many prior works \citep{pieter04apprentice, syed07game, rajaraman2020fundamental, rajaraman2021value, xu2022understanding} simplify the problem and focus on the known transition setting, which bypasses the exploration difficulty. 


% However, analyzing of AIL is challenging due to the complicated running procedure of AIL (i.e., AIL iteratively imitates and explores when performing state-action distribution matching). To conduct theoretical analysis, many prior works \citep{pieter04apprentice, syed07game, rajaraman2020fundamental, rajaraman2021value, xu2022understanding} simplify the problem and focus on the known transition setting, which bypasses the exploration difficulty. 





% In many practical tasks such as robotics control and recommendation system, however, it is often challenging to characterize the exact form of environment transitions \citep{duan2016benchmarking, shi2019taobao}. This motivates the study of the unknown transition setting. In this paper, we investigate AIL with unknown transitions, in which the learner does not know the transition function a prior but can interact with the environment to collect trajectories. This setup is widely considered in empirical studies \citep{ho2016gail, fu2018airl,  ke19imitation_learning_as_f_divergence, Kostrikov19dac, Brantley20disagreement, garg2021iqlearn}. From a theoretical perspective, we care about both the \emph{expert sample complexity} (i.e., the number of expert trajectories) and \emph{interaction complexity} (i.e., the number of environment interactions) required to recover good policies, which are of much practical interest. 

However, the characterization of environment transitions is often challenging in practical tasks, as noted in previous studies \citep{duan2016benchmarking, shi2019taobao}. Therefore, there has been growing interest in investigating AIL with unknown transitions, where the learner does not have prior knowledge of the transition function but can collect trajectories by interacting with the environment. This setup is widely used in empirical studies \citep{ho2016gail, fu2018airl,  ke19imitation_learning_as_f_divergence, Kostrikov19dac, Brantley20disagreement, garg2021iqlearn, li2022rethinking}. From a theoretical perspective, it is important to understand both the expert sample complexity (i.e., the number of trajectories collected by the expert) and the interaction complexity (i.e., the number of trajectories collected by the online learner) to achieve good policies, as these are of practical interest. In this paper, we investigate AIL with unknown transitions and focus on analyzing the required expert sample and interaction complexity.



% Compared with advances in IL with \emph{known} transitions, the theoretical foundation of AIL with \emph{unknown} transitions remains largely under-developed. In particular, seminal AIL works such as FEM \citep{abbeel05exploration-and-ap} and GTAL \citep{syed07game} directly leverage expert demonstrations to estimate the transition function to perform imitation. As a consequence, their algorithms are impractical as the expert sample complexity is unacceptably large; refer to \cref{table:summary-of-results}. To our best knowledge, only the online apprenticeship learning (OAL) algorithm in \citep{shani2022online} is promising, which updates the policy and reward function using no-regret algorithms when interacting with the environment. In particular,  OAL achieves an expert sample complexity $\widetilde{\gO}(H^2|\gS|/\varepsilon^2)$  and interaction complexity $\widetilde{\gO} (H^4 |\gS|^2 |\gA| / \varepsilon^2)$\footnote{In \citep{shani2022online}, a regret $\widetilde{\gO}(\sqrt{H^4 |\gS|^2 |\gA|K} + \sqrt{H^3 |\gS| |\gA| K^2 /m})$ is proved, where $K$ is the number of interaction episodes and $m$ is the number of expert trajectories. We convert this regret guarantee to the sample complexity guarantee (see Appendix \ref{sec:from_regret_to_pac}).}.  Here $|\gS|$ is the state space size, $|\gA|$ is the action space size, $H$ is the planning horizon, and $\varepsilon = V^{\piE} - V^{\pi}$ is the desired imitation gap. Regardless of the (large) interaction complexity of OAL, the sample complexity of OAL is not even optimal in the sense that the best expert sample complexity is $\widetilde{\gO}(H^{3/2}|\gS|/\varepsilon)$ in the known transition setting \citep{rajaraman2020fundamental}. In other words, even if OAL interacts with environments infinitely, its expert sample complexity is not optimal. Therefore, how to improve AIL with unknown transitions is of great significance.

% \ref{sec:from_regret_to_pac})
Compared with the progress made in IL with known transitions, AIL with unknown transitions still lacks a well-developed theoretical foundation. Earlier works, such as FEM \citep{abbeel05exploration-and-ap} and GTAL \citep{syed07game}, estimated the transition function from expert demonstrations for imitation, rendering their algorithms impractical due to the prohibitively large expert sample complexity (as shown in \cref{table:summary-of-results}). To the best of our knowledge, the online apprenticeship learning (OAL) algorithm of \citet{shani2022online} is a promising approach that updates the policy and reward function using no-regret algorithms during environment interaction. In particular, OAL achieves an expert sample complexity $\widetilde{\gO}(H^2|\gS|/\varepsilon^2)$ and interaction complexity $\widetilde{\gO} (H^4 |\gS|^2 |\gA| / \varepsilon^2)$\footnote{\citet{shani2022online} prove a regret bound of $\widetilde{\gO}(\sqrt{H^4 |\gS|^2 |\gA|K} + \sqrt{H^3 |\gS| |\gA| K^2 /m})$, where $K$ is the number of interaction episodes and $m$ is the number of expert trajectories. We convert this regret guarantee to the sample complexity guarantee (see the Appendix).}, where $|\gS|$ and $|\gA|$ are the state and action space sizes, $H$ is the planning horizon, and $\varepsilon = V^{\piE} - V^{\pi}$ is the desired imitation gap. However, even with infinite environment interactions, OAL's expert sample complexity is sub-optimal, as the best expert sample complexity in the known transition setting is $\widetilde{\gO}(H^{3/2}|\gS|/\varepsilon)$ \citep{rajaraman2020fundamental}. Thus, improving AIL with unknown transitions is a significant area of research.


% More importantly, previous methods adopt dedicated strategies to address the challenges from the unknown transition, which is of less generality. To this end, we ask the following research questions: can we develop a general framework to overcome the challenges from unknown transitions in a unified way? can we design a more interaction-efficient AIL algorithm that can simultaneously achieve the minimax optimal sample complexity?          
% This paper focuses on the theoretical foundations of IL approaches. Concretely, we care about the \emph{(expert) sample} and \emph{(environment) interaction} complexity, which refer to the number of \emph{expert trajectories} and \emph{interaction episodes with the environment} to achieve a small policy value gap, respectively. These two metrics are of interest in practice \citep{ho2016gail,Kostrikov19dac, Kostrikov20value_dice}.


% First, we consider the known transition setting where the learner has an access to the environment transition function. In this setting, the learner can exactly evaluate a policy (i.e., compute the expected return) for any reward function, which is crucial for AIL methods. Besides, BC can also operate in this setting although it does not leverage the transition information. In this setting, we focus on the metric of expert sample complexity.  


% There have been lots of finite sample complexity studies in the known transition setting. For BC, with high probability, the sample complexity is $\widetilde{\Theta} (H^2 |\gS| /\varepsilon)$ \citep{rajaraman2020fundamental}, where $H$ is the planning horizon, $|\gS|$ is the state space size and $\varepsilon$ is the desired policy value gap. Besides, the sample complexity of conventional\footnote{Here \dquote{conventional} means the state-action distribution estimator is based on the maximum likelihood estimation as in \eqref{eq:estimate_by_count}.} AIL methods such as FEM \citep{pieter04apprentice} and GTAL \citep{syed07game} is $\widetilde{\gO} ( H^2|\gS|/\varepsilon^2)$\footnote{We translate the results in  \citep{pieter04apprentice,syed07game} from the infinite-horizon setting to the episodic setting by 1) replacing the effective planning horizon $1 / (1-\gamma)$ with the finite planning horizon $H$; 2) instantiating the linear feature with the one-hot feature under the tabular setting.}. This sample complexity is also proven to be tight for conventional AIL methods in the worst case. On the other hand, the lower bound under this scenario is $\widetilde{\Omega} (H^{3/2} /\varepsilon)$ \citep{nived2021provably}, suggesting BC and conventional AIL are not minimax optimal.




% Interestingly, another type of IL algorithm MIMIC-MD \citep{rajaraman2020fundamental} has a sample complexity $\widetilde{\gO}(H^{3/2}|\gS|/\varepsilon)$, which matches the lower bound in terms of $H$ and $\varepsilon$. MIMIC-MD can be viewed as a combination of BC and AIL. With a transition-aware estimator (see \eqref{eq:new_estimator}), MIMIC-MD solves a \emph{constrained} state-action distribution problem subject to the set of BC policies. Due to the projection operator over BC policies, \citet{nived2021provably} propose a linear programming (LP) based optimization procedure for MIMIC-MD.


% Unfortunately, the LP formulation for MIMIC-MD leads to a poor quadratic space complexity $\widetilde{\gO}((|\gS| |\gA| H)^2)$, which is unbearable in practice when the state space (or the planning horizon) is large (see the evidence in \RED{Appendix XXX}). A natural question is: (\textbf{Q1}) can we design a \emph{memory-efficient} AIL method that can also achieve the minimax optimal sample complexity\footnote{It is fundamental to design algorithms that achieve the minimax optimal sample complexity and have a low space complexity in the RL community. See e.g. \citep{jin18qlearning} (especially Table 1 in \citep{jin18qlearning}) for the discussion.}? 



% Second, we consider a practical setting where the transition function is unknown but the environment interaction is allowed. In this setting, the main challenge is that the agent can \emph{not} exactly evaluate a policy any longer. Therefore, AIL methods need to efficiently explore and imitate. In this setting, we care about both the expert sample complexity and interaction complexity.


% In the unknown transition setting, several theoretical guarantees of the extensions of conventional AIL methods are well-known (shown in Table \ref{table:summary-of-results}). In particular, seminal works such as FEM and GTAL directly leverage expert demonstrations to estimate the transition function to perform imitation. Correspondingly, their algorithms are impractical as their sample complexity is unacceptably large. To our best knowledge, only the online apprenticeship learning (OAL) algorithm in \citep{shani2022online} is promising, which updates the policy and reward function using no-regret algorithms when interacting with the environment. In particular,  OAL achieves an expert sample complexity $\widetilde{\gO}(H^2|\gS|/\varepsilon^2)$  and interaction complexity $\widetilde{\gO} (H^4 |\gS|^2 |\gA| / \varepsilon^2)$\footnote{In \citep{shani2022online}, a regret  $\widetilde{\gO}(\sqrt{H^4 |\gS|^2 |\gA|K} + \sqrt{H^3 |\gS| |\gA| K^2 /m})$ is proved, where $K$ is the number of interaction episodes and $m$ is the number of expert trajectories. We convert this regret guarantee into the PAC guarantee (see Appendix \ref{sec:from_regret_to_pac}).}.


% Despite the large interaction complexity of OAL, the sample complexity of OAL is not optimal in the sense that the best sample complexity is $\widetilde{\gO}(H^{3/2}|\gS|/\varepsilon)$ in the known transition setting. To this end, we ask the question: \textbf{(Q2)} can we design algorithms with better sample complexity and interaction complexity under the unknown transition case?


% \RED{Related work on generalization.}


\begin{table}[htbp]
\centering
\caption{Expert sample complexity and interaction complexity of BC \citep{rajaraman2020fundamental}, FEM \citep{pieter04apprentice}, GTAL \citep{syed07game}, OAL \citep{shani2022online}, and MB-TAIL (ours) with unknown expert and transitions. We use $\widetilde{\gO}$ to hide logarithmic factors.}
\label{table:summary-of-results}
\begin{tabular}{@{}c|c|c@{}}
\toprule
 & \begin{tabular}[c]{@{}l@{}}Expert Sample \\ Complexity\end{tabular} & \begin{tabular}[c]{@{}l@{}}Interaction \\ Complexity\end{tabular} \\ \midrule
BC   & $\widetilde{\gO} \lp \frac{H^2 |\gS|}{\varepsilon} \rp$  & 0 \\
 FEM  & $\widetilde{\gO} \lp \frac{H^{2} |\gS|}{\varepsilon^2} + \frac{H^8 |\gS|^3 |\gA| }{\varepsilon^5} \rp $  & 0  \\
 GTAL & $\widetilde{\gO} \lp \frac{H^{2} |\gS|}{\varepsilon^2} + \frac{H^6 |\gS|^3 |\gA| }{\varepsilon^3} \rp$  & 0  \\
 OAL  & $\widetilde{\gO} \lp \frac{H^{2} |\gS|}{\varepsilon^2} \rp$  & $\widetilde{\gO} \lp \frac{H^4 |\gS|^2 |\gA| }{\varepsilon^2} \rp$ \\  \hline
 MB-TAIL  &  $\widetilde{\gO} \lp \frac{H^{3/2} |\gS|}{\varepsilon} \rp$ & $\widetilde{\gO}\lp \frac{H^3 |\gS|^2 |\gA| }{\varepsilon^2} \rp$   \\ 
 \bottomrule
\end{tabular}
\end{table}


\textbf{Contribution.} This paper presents a new and general framework (\cref{algo:framework}) that overcomes the challenge of unknown transitions and unknown expert policies. At a high level, our framework establishes a connection between AIL and reward-free exploration (RFE) \citep{chi20reward-free, menard20fast-active-learning, chen2022rewardfree}, which is an emerging topic in online RL. We prove that any effective AIL algorithm that works with known transitions can be transferred to the unknown transition setting using an efficient RFE method, as shown in \cref{prop:connection}.

% In this paper, we overcome the difficulty of unknown transitions (and unknown expert policies) with a new and general framework (\cref{algo:framework}). At a high level, this framework builds a connection between AIL and reward-free exploration (or RFE for short) \citep{chi20reward-free, menard20fast-active-learning, chen2022rewardfree}, which is an emerging topic in online RL. Specifically, we prove that any proper AIL algorithm that works with known transitions could be provably transferred to the unknown transition setting by leveraging an efficient RFE method (\cref{prop:connection}).

% Specifically, 


% under this framework, any proper AIL algorithm with known transitions could be provably transferred to the unknown transition setting by leveraging a reward-free exploration method. 


% establish a meta-framework (\cref{algo:framework}), under which any proper AIL algorithm that works in the known transition setting could be provably transferred to the unknown transition setting by leveraging 


% Under this framework, we design an AIL algorithm named MB-TAIL (\cref{algo:mbtail-abstract}), which can achieve the minimax optimal sample complexity and a better interaction complexity.


% In particular, we develop the general framework by establishing the connection between AIL and reward-free exploration \citep{chi20reward-free, menard20fast-active-learning}. Briefly, reward-free exploration methods (see Definition \ref{defn:reward_free}) usually establish a transition model by interacting with the environment in the first step; then, they evaluate a policy with the constructed transition model and hope the evaluation is uniformly accurate for \emph{any} reward function. Since AIL requires optimizing the policy under several recovered reward functions, we can incorporate the idea of reward-free exploration to tackle the challenge from the unknown transition. Technically, we present an error decomposition analysis, which serves as a theoretical foundation for this framework.

% Then we develop a novel algorithm named MB-TAIL under this framework. In particular, MB-TAIL incorporates three main designs: a fine-grained state-action distribution estimator, a reward-free exploration method and an efficient min-max optimization procedure. 

Further, we also introduce a new algorithm called MB-TAIL\footnote{MB-TAIL stands for model-based transition-aware adversarial imitation learning.}, which incorporates recent advances in AIL with known transitions and RFE. MB-TAIL builds on MIMIC-MD \citep{rajaraman2020fundamental} and RF-Express \citep{menard20fast-active-learning} but requires new designs to apply their main ideas in the unknown transition setting. Notably, MB-TAIL achieves an expert sample complexity of $\widetilde{\gO}(H^{3/2}|\gS|/\varepsilon)$, meeting the lower bound $\Omega(H^{3/2}/\varepsilon)$ \citep{nived2021provably} in $H$ and $\varepsilon$. This sample complexity is nearly minimax optimal and the first to be achieved in the unknown transition setting. Additionally, MB-TAIL has an interaction complexity of $\widetilde{\gO} (H^3 |\gS|^2 |\gA|  / \varepsilon^2 )$, which improves upon the best-known OAL algorithm by a factor of $\gO(H)$.


% an algorithm named MB-TAIL\footnote{MB-TAIL standards for model-based transition-aware adversarial imitation learning.} is developed, which incorporates recent advances in AIL with known transitions and RFE, thanks to our framework. Concretely, MB-TAIL builds on two algorithms, MIMIC-MD \citep{rajaraman2020fundamental} and RF-Express \citep{menard20fast-active-learning}. However, to apply the main punchlines from these two algorithms to the unknown transition setting, new designs are proposed, which we will discuss in the main text. Importantly, MB-TAIL achieves the expert sample complexity $\widetilde{\gO}(H^{3/2}|\gS|/\varepsilon)$, which meets the lower bound $\Omega(H^{3/2}/\varepsilon)$ \citep{nived2021provably} in $H$ and $\varepsilon$. To our best knowledge, MB-TAIL is the first algorithm to achieve this nearly minimax optimal sample complexity in the unknown transition setting. Moreover, MB-TAIL enjoys an interaction complexity of $\widetilde{\gO} (H^3 |\gS|^2 |\gA|  / \varepsilon^2 )$, which improves the best-known OAL algorithm by $\gO (H)$.  

% In addition, we corroborate our theoretical results with experiments.


Finally, we extend the MB-TAIL algorithm to the function approximation setting and demonstrate its ability to achieve the expert sample and interaction complexity independent of the state space size $|\gS|$. Specifically, we investigate the case of state abstraction \citep{li2006towards}, which involves approximating functions using piecewise constant functions. By employing appropriate state abstractions, MB-TAIL can estimate the abstract state-action distribution instead of the tabular counterpart, which is crucial for generalization.


% we extend MB-TAIL to the function approximation setting and prove that MB-TAIL can achieve the expert sample and interaction complexity independent of the state space size $|\gS|$. In particular, we investigate the state abstraction case, which corresponds to the function approximation with piecewise constant functions. With proper state abstractions, MB-TAIL can only estimate the \emph{abstract} state-action distribution instead of the tabular counterpart, which serves as the key step to achieving generalization.

% Technically, we leverage the analysis tools in state abstraction to prove that MB-TAIL can achieve the expert sample and interaction complexity independent of $|\mathcal{S}|$.


% To answer (Q1), we develop a memory-efficient AIL method named \emph{TAIL}, which can also exhibit the minimax optimal sample complexity. Our key insight is that the projection operator in MIMIC-MD can be relaxed without sacrificing sample complexity through a new analysis. In particular, as a pure AIL method, TAIL removes the projection operator in MIMIC-MD and solves an \emph{unconstrained} state-action distribution matching problem with the transition-aware estimator; see Remark \ref{remark:projection_choice}. Based on this relaxation, we leverage the tool of online gradient descent \citep{shalev12online-learning} to solve the min-max problem under the AIL framework. Computationally, TAIL is solvable in polynomial time and further has a better space complexity ${\gO}(|\gS| |\gA| H)$ compared with MIMIC-MD. Statistically, based on a new analysis, TAIL still enjoys the nearly minimax optimal sample complexity of $\widetilde{\gO}(H^{3/2}|\gS|/\varepsilon)$. 





% To answer (Q2), we develop an interaction-efficient AIL method named \emph{MB-TAIL}, which beats the best-known algorithm in terms of both sample and interaction complexity. Technically, MB-TAIL is built upon a novel framework that connects reward-free exploration and AIL. Briefly, reward-free exploration methods (see Definition \ref{defn:reward_free}) usually establish a transition model by interacting with the environment in the first step; then, they evaluate a policy with the constructed transition model and hope the evaluation is uniformly accurate for \emph{any} reward function. Since AIL requires optimizing the policy under several recovered reward functions, we can incorporate the idea of reward-free exploration to tackle the challenge from the unknown transition. 





% Under the developed framework, we combine TAIL and RF-Express~\citep{menard20fast-active-learning} (a SOTA reward-free exploration method), and obtain the new algorithm named \mbalgname. The sample complexity of \mbalgname is $\widetilde{\gO} ( H^{3/2} |\gS|  / \varepsilon )$ while its interaction complexity is $\widetilde{\gO} (H^3 |\gS|^2 |\gA|  / \varepsilon^2 )$. Compared with the mentioned OAL algorithm, our algorithm has significant improvements in both sample and interaction complexity. We further corroborate our theoretical results with numerical studies. To our understanding, MIMIC-MD is limited to the known transition setting, and our algorithm is the first one to shift the advances from the known transition setting to the unknown transition setting.



 


% \sectionspace
\section{Related Work}
% \sectionspace

% Researchers have analyzed IL algorithms based on the \emph{error bound} analysis (i.e., under the \dquote{infinite} samples setting). To name a few, \citet{ross2010efficient} reveal that BC has the compounding error issue, indicating that its error bound is $\gO ( H^2)$.  DAgger~\citep{ross11dagger} improves this error bound to $\gO ( H )$ with active expert queries. By state-action distribution matching, AIL methods can also address the compounding error issue and have the error bound $\gO (H) $~\citep{Chen20on_computation_and_generalization_of_gail, xu2020error} without additional expert queries. 


% However, the error bound analysis does not fully characterize the sample efficiency. Later on, there emerge lots of finite sample complexity studies \citep{xu2020error, rajaraman2020fundamental, nived2021provably, rashidinejad2021bridge, shani2022online}. In addition to results in Table \ref{table:summary-of-results}, in the pure offline setting where the transition is unknown and \emph{no} interaction is allowed, \citet{rajaraman2020fundamental} prove that BC has a sample complexity $\widetilde{\gO} \lp H^2 |\gS| / \varepsilon \rp$. This upper bound matches the lower bound $\widetilde{\Omega} \lp H^2 |\gS|/\varepsilon \rp$ under this setting \citep{rajaraman2020fundamental}. Interestingly, under the active setting where the agent can query the expert, \citet{rajaraman2020fundamental} show that DAgger does not improve the sample complexity compared with BC; see \citep{rajaraman2020fundamental} for a detailed explanation.  


% \RED{function approximation?}






% \textbf{Reward Exploration.} 


% In the known transition setting, there are lots of theoretical studies on the expert sample complexity of AIL \citep{pieter04apprentice, syed07game, Zahavy20al_via_frank-wolfe, rajaraman2020fundamental, swamy2022minimax, xu2021error, xu2022understanding}. To name a few, conventional apprenticeship learning algorithms such as FEM and GTAL have the expert sample complexity $\widetilde{\gO} ( H^2|\gS|/\varepsilon^2)$\footnote{We translate the results in  \citep{pieter04apprentice,syed07game} from the infinite-horizon setting to the episodic setting by 1) replacing the effective planning horizon $1 / (1-\gamma)$ with the finite planning horizon $H$; 2) instantiating the linear feature with the one-hot feature under the tabular setting.}. This upper bound is proved to be tight in the worst case \citep{xu2022understanding, swamy2022minimax}. Furthermore, \citet{rajaraman2020fundamental} propose a new AIL method called MIMIC-MD, which explicitly exploits the transition function. Compared with classical AIL methods, MIMIC-MD achieves an improved expert sample complexity $\widetilde{\gO} (H^{3/2} |\gS| /\varepsilon)$. The information-theoretic lower bound of expert sample complexity with known transitions is $\widetilde{\Omega} (H^{3/2} /\varepsilon)$ \citep{nived2021provably}. As one can see, MIMIC-MD matches this lower bound in terms of $H$ and $\varepsilon$. The horizon-free expert sample complexity is recently studied in \citep{xu2022understanding}, which provides an explanation of superior performance of AIL with known transitions. The theoretical studies of AIL with unknown transitions are limited. Since we have reviewed them in the previous section, we do not repeat them here.

In the realm of AIL with known transitions, there have been numerous theoretical investigations into expert sample complexity \citep{pieter04apprentice, syed07game, Zahavy20al_via_frank-wolfe, rajaraman2020fundamental, swamy2022minimax, xu2021error, xu2022understanding}. For example, FEM and GTAL, which are traditional AIL algorithms, have an expert sample complexity of $\widetilde{\gO} (H^2|\gS|/\varepsilon^2)$\footnote{The results of \citet{pieter04apprentice} and \citet{syed07game} are translated from the infinite-horizon setting to the episodic setting by 1) substituting the effective planning horizon $1 / (1-\gamma)$ with the finite planning horizon $H$ and 2) instantiating the linear feature with the one-hot feature under the tabular setting.}. This upper bound is proven to be tight in the worst case \citep{xu2022understanding, swamy2022minimax}. Additionally, \citet{rajaraman2020fundamental} proposed a novel AIL technique, MIMIC-MD, which leverages the transition function to obtain an enhanced expert sample complexity of $\widetilde{\gO} (H^{3/2} |\gS| /\varepsilon)$. MIMIC-MD meets the information-theoretic lower bound of expert sample complexity with known transitions, which is $\widetilde{\Omega} (H^{3/2} /\varepsilon)$ \citep{nived2021provably}, in terms of both $H$ and $\varepsilon$. Recently, \citet{xu2022understanding} studied horizon-free expert sample complexity, which explains the superior performance of AIL with known transitions. However, there are only a limited number of theoretical investigations into AIL with unknown transitions. We have already discussed these in the previous section and thus will not repeat them here.



% \BLUE{

% However, our focus differs from these studies as we concentrate on the IL problem where the target is to learn a high-quality policy.  

% A separate line of works studies sample complexity in the problem of inverse reinforcement learning where the goal is to infer a reward function \citep{metelli2021provably, lindner2022active, metelli2023towards}. In particular, \citep{lindner2022active} proposed to employ an upper confidence approach that actively explores the environment and expert policy to learn the reward function. However, our focus differs from these studies as we concentrate on the IL problem where the target is to learn a high-quality policy.  
% }

% In particular, when performing distribution matching, conventional AIL methods such as FEM \citep{pieter04apprentice} and GTAL \citep{syed07game} utilize the maximum likelihood estimator of the expert's state-action distribution. They have a sample complexity upper bound $\widetilde{\gO} ( H^2|\gS|/\varepsilon^2)$\footnote{We translate the results in  \citep{pieter04apprentice,syed07game} from the infinite-horizon setting to the episodic setting by 1) replacing the effective planning horizon $1 / (1-\gamma)$ with the finite planning horizon $H$; 2) instantiating the linear feature with the one-hot feature under the tabular setting.}. For conventional AIL, this sample complexity is proven to be tight in the worst case \citep{xu2022understanding}. Furthermore, \citet{rajaraman2020fundamental} propose a new AIL method MIMIC-MD, which utilizes a transition-aware estimator. Compared with conventional AIL methods, MIMIC-MD achieves an improved sample complexity $\widetilde{\gO} (H^{3/2} |\gS| /\varepsilon)$, which matches the lower bound $\widetilde{\Omega} (H^{3/2} /\varepsilon)$ in $H$ and $\varepsilon$.  




% Our work bridges adversarial imitation learning and reward-free exploration, an emerging topic in online reinforcement learning that we briefly review below. The reward-free exploration framework is firstly proposed in \citep{chi20reward-free} with the goal of 1) isolating the exploration issue and planning issue under a standard RL framework and 2) learning a \dquote{robust} environment to cover all possible training scenarios. Following \citep{chi20reward-free}, there are many advances in this direction \citep{Kaufmann21adaptive-rfe, wang2020rewardfree, zhang2021reward, chen2022rewardfree}, where the minimax rate under the tabular setting is achieved by \citep{menard20fast-active-learning}.

Our research establishes a connection between adversarial imitation learning and reward-free exploration, which is an emerging area of interest in online reinforcement learning. The reward-free exploration framework was introduced by \citet{chi20reward-free} with two primary goals: 1) isolating the exploration and planning problems within a standard RL framework and 2) learning an environment that is robust enough to cover all possible training scenarios. Since then, several advances have been made in this field \citep{Kaufmann21adaptive-rfe, wang2020rewardfree, zhang2021reward, chen2022rewardfree}. Specifically, \citet{menard20fast-active-learning} achieved the minimax rate in the tabular setting.


It is worth noting that AIL is closely related to inverse reinforcement learning (IRL) \citep{ng00irl}, which aims to infer the ground truth reward function from expert demonstrations. Recent works in IRL include \citet{metelli2021provably}, who studied the error propagation of the obtained policy's performance when transferring the reward function to a new environment, and \citet{zeng2022maximum}, who developed a single-loop algorithm to recover the reward function under the maximum entropy IRL formulation. Additionally, \citet{lindner2022active} proposed an upper confidence approach that actively explores the environment and expert policy to learn the reward function. However, our focus differs from these studies as our goal is to solve the imitation learning problem by learning a high-quality policy, rather than inferring the reward function.


% To name a few recent works, \citep{metelli2021provably} studied error propagation of the learned policy’s performance when transferring the reward function to a different environment. \citep{zeng2022maximum} developed a single-loop algorithm to recover the reward function under the maximum entropy IRL formulation. Moreover, \citep{lindner2022active} proposed to employ an upper confidence approach that actively explores the environment and expert policy to learn the reward function. However, our focus differs from these studies as our goal is to solve the imitation learning problem and learn a high-quality policy, rather than inferring the reward function.

% For imitation learning, our meta-framework in Section \ref{sec:main_result} connects reward-free exploration and adversarial imitation learning.


% \sectionspace
\section{Background}
% \sectionspace

\textbf{Episodic Markov Decision Process.} In this paper, we consider an episodic Markov decision process (MDP), which can be described by the tuple $\gM = (\gS, \gA, P, r, H, \rho)$. Here $\gS$ and $\gA$ are the state and action space, respectively. $H$ is the planning horizon and $\rho$ is the initial state distribution. $P = \{P_1, \cdots, P_{H}\}$ specifies the non-stationary transition function of this MDP; concretely, $P_h(s_{h+1}|s_h, a_h)$ gives the probability of transitioning to state $s_{h+1}$ conditioned on state $s_h$ and action $a_h$ at time step $h$, for $h \in [H]$, where $[x]$ denotes the set of integers from $1$ to $x$. Similarly, $r = \{r_1, \cdots, r_{H}\}$ specifies the reward function of this MDP; without loss of generality, we assume that $r_h: \gS \times \gA \rar [0, 1]$, for $h \in [H]$. A non-stationary policy is denoted by $\pi = \lb \pi_1, \cdots, \pi_H \rb$ with $\pi_h: \gS \rar \Delta(\gA)$, where $\Delta(\gA)$ is the probability simplex over $\gA$ and $\pi_h (a|s)$ gives the probability of selecting action $a$ on state $s$ at time step $h$, for $h \in [H]$.

The sequential decision process runs as follows: at the beginning of an episode, the environment is reset to an initial state according to $\rho$; then, at each time step $h$, the agent observes a state $s_h$ and takes an action $a_h$ based on $\pi_h(a_h|s_h)$; consequently, the environment makes a transition to the next state $s_{h+1}$ according to $P_h(s_{h+1}|s_h, a_h)$ and sends a reward $r_h(s_h, a_h)$ to the agent. The episode ends after $H$ steps.

The quality of a policy is measured by its \emph{policy value} (i.e., the expected long-term return): 
\begin{align*}
    V^{\pi} = \expect \bigg[ &\sum_{h=1}^{H} r_h(s_h, a_h) | s_1\sim \rho; a_h \sim \pi_h (\cdot|s_h), \\ 
    &s_{h+1} \sim P_h(\cdot|s_h, a_h), \forall h \in [H] \bigg].
\end{align*}
To facilitate later analysis, we introduce the state-action distribution induced by a policy $\pi$:
\begin{align*}
    d_h^{\pi}(s, a) = \sP ( &s_h = s, a_h = a | s_1 \sim \rho; a_\ell \sim \pi_\ell (\cdot|s_\ell),
    \\
    &s_{\ell+1} \sim P_{\ell} (\cdot|s_{\ell}, a_{\ell}),\; \forall \ell \in [h] ).
\end{align*}
In other words, $d_h^{\pi}(s, a)$ quantifies the visitation probability of state-action pair $(s, a)$ at time step $h$. In this way, we get an equivalent dual form of the policy value \citep{puterman2014markov}:
\begin{align}   \label{eq:dual_of_policy_value}
    V^{\pi} = \sum_{h=1}^{H} \sum_{(s, a) \in \gS \times \gA} d_h^{\pi}(s, a) r_h(s, a),
\end{align}
which will be used in later analysis.

\textbf{Imitation Learning.} The goal of IL is to learn a high-quality policy \emph{without} the environment reward function. To this end, we often assume there is a nearly optimal expert policy $\piE$ that could interact with the environment to generate a dataset (i.e., $m$ trajectories of length $H$):
\begin{align*}
    \gD =& \{ \tr = \lp s_1, a_1, s_2, a_2, \cdots, s_H, a_H \rp; s_1 \sim \rho;
    \\
    &a_h \sim \piE_h(\cdot|s_h), s_{h+1} \sim P_h(\cdot|s_h, a_h), \forall h \in [H] \}.
\end{align*}
Then, the learner can use the dataset $\gD$ to mimic the expert and to obtain a good policy. The quality of imitation is measured by the \emph{imitation gap}~\citep{pieter04apprentice, ross2010efficient, rajaraman2020fundamental}: $V^{\piE} - V^{\pi}$, where $\pi$ is the learned policy. That is, we hope the learned policy can perfectly imitate the expert such that the imitation gap is small. In this paper, we assume the expert policy is deterministic, which is common in the literature \citep{rajaraman2020fundamental, swamy2022minimax, xu2022understanding}. 

\textbf{Notation.} We denote $\Pi$ as the set of all stochastic policies for the learner. Furthermore, $|\gD|$ is the number of trajectories in $\gD$. We reserve the symbol $m$ to denote the number of expert trajectories. We write $a(n) \gtrsim b(n)$ if there exist constants $C > 0, n_0 \geq 1$ such that  $a(n) \geq Cb(n)$ for $n \geq n_0$.



\section{Warm-up: AIL with Known Transitions}
\label{sec:warm-up}


To imitate the expert policy, AIL methods solve the state-action distribution matching problem \citep{ho2016gail, ke19imitation_learning_as_f_divergence, xu2020error}. As an introduction to general readers, we consider the known transition setting in this section. Our starting point is the following state-action distribution matching problem: 
\begin{align} \label{eq:ail_known_transition} 
   \min_{\pi \in \Pi}  \sum_{h=1}^{H} \lnorm d^{\pi}_h - \widetilde{d}^{\piE}_h \rnorm_1,
\end{align}
where $\widetilde{d}^{\piE}_h$ is an estimate of the expert state-action distribution $d^{\piE}_h$.  We can explain why \cref{eq:ail_known_transition} is a good learning objective with the following two definitions.  

% For the above distribution matching problem, the state-action distribution estimation $\widetilde{d}^{\piE}_h$ and the obtained optimal policy are two important components. We introduce the following definitions to quantitatively measure these two components.  

\begin{defn}
\label{def:estimation}
An estimator $\widetilde{d}^{\piE}_h$ is said to be $\varepsilon_{\est}$-accurate for $d^{\piE}_h$ if $ \sum_{h=1}^{H} \Vert \widetilde{d}^{\piE}_h - d^{\piE}_h \Vert_1 \leq \varepsilon_{\est}$. 
% \begin{align*}
%     \sum_{h=1}^{H} \lnorm \widetilde{d}^{\piE}_h - d^{\piE}_h \rnorm_1 \leq \varepsilon_{\est}.  
% \end{align*}
\end{defn}

\begin{defn}
\label{def:distribution_matching_error}
For optimization problem \eqref{eq:ail_known_transition}, a policy $\widebar{\pi}$ is said to be $\varepsilon_{\opt}$-optimal if $    \sum_{h=1}^{H} \Vert d^{\widebar{\pi}}_h - \widetilde{d}^{\piE}_h \Vert_1 \leq \min_{\pi \in \Pi}  \sum_{h=1}^{H} \Vert d^{\pi}_h - \widetilde{d}^{\piE}_h \Vert_1 + \varepsilon_{\opt}$. 
% \begin{align*}
%     \sum_{h=1}^{H} \lnorm d^{\widebar{\pi}}_h - \widetilde{d}^{\piE}_h \rnorm_1 \leq \min_{\pi \in \Pi}  \sum_{h=1}^{H} \lnorm d^{\pi}_h - \widetilde{d}^{\piE}_h \rnorm_1 + \varepsilon_{\opt}. 
% \end{align*}
\end{defn}

% Based on the above definitions, we can explain why \cref{eq:ail_known_transition} is a good learning objective. 

\begin{lem}  \label{lem:1}
Given an $\varepsilon_{\est}$-accurate estimator $\widetilde{d}^{\piE}_h$, suppose that $\widebar{\pi}$ is $\varepsilon_{\opt}$-optimal for problem \eqref{eq:ail_known_transition}, then we have that $V^{\piE} - V^{\widebar{\pi}} \leq  \varepsilon_{\opt} + 2\varepsilon_{\est}$.
\end{lem}

% The proof of \cref{lem:1} is deferred to Appendix; same as other theoretical results. \cref{lem:1} puts the state-action distribution matching on firm theoretical footing. We clarify that \cref{lem:1} is not first created by us; instead, different versions of \cref{lem:1} have appeared in many prior works, see, e.g., \citep{syed07game, rajaraman2020fundamental}. For ease of presentation, we defer how to control the estimation and optimization errors in the next section.  


% As mentioned, adequate theoretical advances have been achieved in the known transition setting. In particular, this setup allows one to accurately compute $d^{\pi}_h$ for any policy $\pi$. However, this assumption is quite impractical in many applications, where transitions are actually unknown. To be more practical, many empirical works have considered the unknown transition setting, where the analytic form of transition function is unavailable, but the interaction with environments is allowed. In this scenario, besides the expert sample complexity, the interaction complexity is also of great interest. We explore this direction in the next section. 
 
The proof of \cref{lem:1}, along with those of the other theoretical results, can be found in the Appendix. This lemma establishes a strong theoretical foundation for state-action distribution matching. It is worth noting that similar versions of this lemma have been presented in prior works; see, e.g., \citep{syed07game, rajaraman2020fundamental}. We will discuss how to control the estimation and optimization errors in the next section.

While significant theoretical progress has been made in the known transition setting, this assumption is not always practical in real-world applications where the transition function is unknown. In such cases, empirical studies have been carried out under the unknown transition setting, where the interaction with environments is allowed but the analytic form of transition function is not available. In addition to expert sample complexity, the interaction complexity is also of great interest in this scenario, which we will explore in the next section.





% Next, we discuss how to properly control the estimation and optimization errors separately.




\section{Main Results: AIL with Unknown Transitions}
\label{sec:main_result}

In this section, we consider the unknown transition setting where $d^{\pi}_h$ is not accessible, rendering the learning objective in \cref{eq:ail_known_transition} inapplicable. A sound solution is to replace $d^{\pi}_h$ with its estimated version $\widehat{d}^{\pi}_h$ in \cref{eq:ail_known_transition}. We highlight that the unknown transition leads to the exploration-and-exploitation trade-off, which is shared with online RL \citep{agarwal2022rlbook}. The prior work OAL addresses this challenge by an optimistic estimation of the value function \citep{shani2022online}.  



In this paper, we explore an alternative model-based approach: we first learn the transition function from collected trajectories and subsequently estimate $d^{\pi}_h$ based on the recovered transition model. The key challenge is how to recover a good transition model such that policy evaluation/optimization can be conducted accurately. To this end, we propose a general algorithmic framework, which connects AIL with reward-free exploration (or RFE for short)~\citep{chi20reward-free, menard20fast-active-learning}, which is an emerging topic in online RL. Under this framework, a proper AIL algorithm that works under the known transition setting could be transferred to the unknown transition setting by leveraging an efficient RFE method. Before presenting the details of our framework, we formally introduce RFE.


% Before we give details of this framework, let us formally introduce RFE.


\begin{defn}[\citep{menard20fast-active-learning}]   \label{defn:reward_free}
Given an MDP $\gM$ without reward function $r$, an algorithm is said to be $(\varepsilon, \delta)$-PAC for reward-free exploration (RFE) if 
\begin{align*}
    \sP \big( &\text{for any reward function $r$}, |V^{\pi^*_{r}} - V^{\widehat{\pi}_{r}^*}| \leq \varepsilon \big) \geq 1 - \delta,
\end{align*}
where $\pi^*_{r}$ is the optimal policy in the MDP with the reward function $r$, and $\widehat{\pi}_{r}^*$ is the optimal policy in the MDP with the learned transition model $\widehat{P}$ by RFE and reward function $r$.
\end{defn}


% The key insights for bridging adversarial imitation learning with reward-free exploration are two-folded. First, reward-free exploration guarantees \emph{uniform policy optimization} with respect to any reward function. Based on such a learned transition model, an AIL algorithm can accurately perform policy optimization with respect to different reward functions across different iterations. Second, this definition suggests the reward-free exploration methods could achieve \emph{uniform policy evaluation} after the exploration. Formally, a reward-free exploration method can ensure that $\forall r = \{r_1, \ldots, r_H \}, \; r_h : \gS \times \gA \rightarrow [0, 1], \; \pi \in \Pi$, we have $\vert V^{\pi, \gP,r} - V^{\pi, \widehat{\gP}, r} \vert \leq \varepsilon$, where $V^{\pi, \gP, r}$ is the policy value under transition $\gP$ and reward $r$ \citep{chi20reward-free}. It exactly suffices for estimating the state-action distribution of the learner's policy in AIL. To see this, consider estimating the state-action distribution $d^{\pi}_h$ by $\widehat{d}^{\pi}_h$ calculated in the learned transition model $\widehat{\gP}$. Then we can upper bound the $\ell_1$-norm-based estimation error by
% \begin{align*}
% &\quad \sum_{h=1}^H \| \widehat{d}^{\pi}_h -  d^{\pi}_h  \|_1
% \\
% &=\max_{w \in \gW} \sum_{h=1}^H \sum_{(s, a)} w_h (s, a) (\widehat{d}^{\pi}_h (s, a) -  d^{\pi}_h (s, a) )
% \\
% &= \max_{w \in \gW} V^{\pi, \widehat{\gP}, w} - V^{\pi, \gP, w} \leq \varepsilon
% \end{align*}

% \cref{defn:reward_free} implies RFE enables the \emph{uniform policy evaluation} with respect to any reward function

By algorithmic designs, RFE methods usually satisfy the so-called uniform policy evaluation property, which is crucial for the discussion of AIL.
\begin{defn}  \label{defn:uniform_policy_evaluation}
Given an MDP $\gM$ without reward function $r$, an algorithm is said to be $(\varepsilon, \delta)$-PAC for uniform policy evaluation if 
\begin{align*}
    \sP \big( &\text{for any reward function $r$ and policy $\pi$}, \\
    & \quad |V^{\pi, P, r} - V^{\pi, \widehat{P}, r}| \leq \varepsilon \big) \geq 1 - \delta,
\end{align*}  
where $V^{\pi, P, r}$ and $V^{\pi, \widehat{P}, r}$ are the policy values of policy $\pi$ with reward function $r$ under the real transition model $P$ and recovered transition model $\widehat{P}$, respectively.
\end{defn}

Examples of algorithms that satisfy \cref{defn:uniform_policy_evaluation} include RF-RL-Explore \citep{chi20reward-free} (see their Lemma 3.6), RF-UCRL \citep{Kaufmann21adaptive-rfe} (see their Lemma 1 and the stopping rule) and RF-Express in \citep{menard20fast-active-learning} (see their Lemma 1 and the stopping rule).

% exhibit the uniform policy evaluation property

% \BLUE{
% % Importantly, RFE methods RF-RL-Explore \citep{chi20reward-free}, RF-UCRL \citep{Kaufmann21adaptive-rfe} and RF-Express satisfy the uniform policy evaluation property.
% \begin{lem}[]   \label{lem:uniform_policy_evaluation}
% The RFE methods RF-RL-Explore \citep{chi20reward-free}, RF-UCRL \citep{Kaufmann21adaptive-rfe} and RF-Express in \citep{menard20fast-active-learning} exhibit the uniform policy evaluation property, i.e., with probability at least $1-\delta$, for any reward function $r$ and policy $\pi$, we have 
% \begin{align*}
%    \labs  V^{\pi, P, r} - V^{\pi, \widehat{P}, r}  \rabs \leq \varepsilon,
% \end{align*}
% where $V^{\pi, P, r}$ and $V^{\pi, \widehat{P}, r}$ are the policy values of policy $\pi$ with reward function $r$ under the real transition model $P$ and recovered transition model $\widehat{P}$ by RF-Express, respectively.
% \end{lem}
% }


% \begin{lem}[Lemma 3.6 of \cite{chi20reward-free}]   \label{lem:uniform_policy_evaluation}
% An $(\varepsilon, \delta)$-PAC reward-free exploration (RFE) method enables the uniform policy evaluation property, i.e., with probability at least $1-\delta$, for any reward function $r$ and policy $\pi$, we have 
% \begin{align*}
%    \labs  V^{\pi, P, r} - V^{\pi, \widehat{P}, r}  \rabs \leq \varepsilon,
% \end{align*}
% where $V^{\pi, P, r}$ and $V^{\pi, \widehat{P}, r}$ are the policy values of policy $\pi$ with reward function $r$ under the real transition model $P$ and recovered transition model $\widehat{P}$ by RFE, respectively.
% \end{lem}

\cref{defn:uniform_policy_evaluation} is connected with AIL in the following way:
\begin{align*}
&\quad \sum_{h=1}^H \| \widehat{d}^{\pi}_h -  d^{\pi}_h  \|_1
\\
&=\max_{w \in \gW} \sum_{h=1}^H \sum_{(s, a)} w_h (s, a) (\widehat{d}^{\pi}_h (s, a) -  d^{\pi}_h (s, a) )
\\
&= \max_{w \in \gW} V^{\pi, \widehat{P}, w} - V^{\pi, P, w} \leq \varepsilon.
\end{align*}
Here the first equality follows from the dual representation of the $\ell_1$-norm, where $\gW = \{  w: \Vert w \Vert_{\infty} \leq 1\}$ is the unit ball of the $\ell_\infty$-norm. The second equality follows from \cref{eq:dual_of_policy_value}, and the last inequality follows from \cref{defn:uniform_policy_evaluation}. In plain language, the above derivation shows that we can obtain an accurate estimate of $d^{\pi}_h$ based on the model recovered by RFE.


% , where $V^{\pi, \widehat{P}, w}$ is the policy value under the recovered transition $\widehat{P}$ with reward function $w$ and $V^{\pi, P, w}$ is the policy value with the real transition $P$ and reward function $w$.

Based on the above relation, with a transition model learned by RFE, AIL can be implemented as if this empirical transition function were the same as the true transition function. More specifically, the state-action distribution matching problem \cref{eq:ail_known_transition} becomes
\begin{align}   \label{eq:ail_with_model}
 \min_{\pi \in \Pi} \sum_{h=1}^{H} \lnorm  \widetilde{d}^{\piE}_h -  d^{\pi, \widehat{P}}_h \rnorm_1,
\end{align}
where $d^{\pi, \widehat{P}}_h$ is the state-action distribution of policy $\pi$ with the transition model $\widehat{P}$. We outline the whole procedure in \cref{algo:framework} and the theoretical guarantee is provided below.

% The following analysis ensures that a proper AIL method under the known transition setting can be transferred to the unknown transition setting with theoretical guarantees.


\begin{prop}   \label{prop:connection}
Suppose that 
\begin{itemize}  % \vspace{-0.15cm}
    % \item[(a)] an algorithm A solves the reward-free exploration problem (see Definition \ref{defn:reward_free}) up to an error $\varepsilon_{\rfe}$ with probability at least $1-\delta_{\rfe}$;
    \item[(a)] a reward-free exploration algorithm A satisfies the uniform policy evaluation property (see \cref{defn:uniform_policy_evaluation}) up to an error $\varepsilon_{\rfe}$ with probability at least $1-\delta_{\rfe}$;
    \item[(b)] an algorithm B has a state-action distribution estimator for $d^{\piE}_h$, which satisfies $\sum_{h=1}^H \Vert \widetilde{d}^{\piE}_h - d^{\piE}_h  \Vert_{1} \leq \varepsilon_{\est}$, with probability at least $1-\delta_{\est}$;
    \item[(c)] with the transition model in (a) and the estimator in (b), an algorithm C solves the optimization problem in \cref{eq:ail_with_model} up to an error $\varepsilon_{\opt}$.
\end{itemize}
% \vspace{-0.15cm}
Then applying algorithms A, B and C under the framework in Algorithm \ref{algo:framework} could return a policy $\widebar{\pi}$, which has a policy value gap (i.e., $V^{\piE} - V^{\widebar{\pi}}$) at most $2 \varepsilon_{\est} + 2 \varepsilon_{\rfe} + \varepsilon_{\opt}$, with probability at least $1-\delta_{\est} - \delta_{\rfe}$.
\end{prop}

% \vspace{-0.15cm}
\begin{algorithm}[htbp]
\caption{Meta-algorithm for AIL with Unknown Transitions}
\label{algo:framework}
\begin{algorithmic}[1]
\REQUIRE{Expert demonstrations $\gD$.}
\STATE{$\widehat{P} \lar$ Invoke a reward-free exploration method to collect $n$ trajectories and learn a transition model.}
\STATE{$\widetilde{d}_h^{\piE} \lar $ Estimate the expert state-action distribution.}
\STATE{$\widebar{\pi} \lar$ Apply an AIL approach to perform imitation with the expert estimation $\widetilde{d}_h^{\piE}$ under transition model $\widehat{P}$.}
\ENSURE{Policy $\widebar{\pi}$.}
\end{algorithmic}
\end{algorithm}
% \vspace{-0.15cm}


% \begin{rem}
% We claim that a general class of AIL algorithms that utilize other distance measures can also be applied under the developed framework. This is because that the metric (e.g., $\ell_1$-norm) used in the estimation problem (assumption (b)) and the optimization problem (assumption (c)) is not unique. For instance, FEM \citep{pieter04apprentice} uses the $\ell_2$-norm metric in its algorithm but FEM can be also applied under this framework. As a result, the policy value gap becomes $\gO ( \sqrt{\vert \gS \vert \vert \gA \vert} ( \varepsilon_{\est} + \varepsilon_{\rfe} + \varepsilon_{\opt} ) )$; refer to Appendix \ref{subsec:application_of_fem} for a formal argument and the explanation of the additional factor $\sqrt{|\gS||\gA|}$. 
% \end{rem}

% \todo[inline]{Remark 1 can be removed if we need more space to explain other things}

Next, we show how to substantiate the framework in \cref{algo:framework} with detailed procedures. We will consider the tabular formulation, where the space of parameterized value functions spans all possible functions. In this scenario, expert policies and reward functions are realizable. We discuss how to control $\varepsilon_{\rfe}$, $\varepsilon_{\est}$, and $\varepsilon_{\opt}$ in a sequential order. 


\subsection{Controlling Reward-free Exploration Error}
 



% We leverage the advanced algorithm called RF-Express in \citep{menard20fast-active-learning} to control $\varepsilon_{\rfe}$ required by condition (a) in \cref{prop:connection}. We state the theoretical property of RF-Express below.

To ensure that condition (a) in \cref{prop:connection} is satisfied, we make use of the RF-Express algorithm, as described in \citep{menard20fast-active-learning}. This advanced algorithm allows us to control $\varepsilon_{\rfe}$ effectively. Below, we provide the theoretical property of RF-Express.

% \ref{algo:rf_express}
\begin{lem}[Theorem 1 in \citep{menard20fast-active-learning}]
\label{lem:reward_free}
Fix $\varepsilon \in \lp 0, 1 \rp$ and $\delta \in (0, 1)$. Consider the \textnormal{RF-Express} algorithm (see Algorithm 1 in Appendix), and let $\widehat{P}$ be the empirical transition function built on the collected trajectories. If the number of trajectories collected by \textnormal{RF-Express} satisfies 
\begin{align*}
    n \gtrsim  \frac{H^{3} |\gS| |\gA| }{\varepsilon^2}    \lp |\gS| + \log\lp\frac{|\gS| H}{\delta} \rp \rp,
\end{align*}
then with probability at least $1-\delta$, for any policy $\pi$ and any bounded reward function $r$ taking values in $[-1, 1]$, we have $| V^{\pi, P, r} - V^{\pi, \widehat{P}, r} | \leq {\varepsilon}/{2}$; furthermore, for any bounded reward function $r$ taking values in $[-1, 1]$, we have $ \max_{\pi \in \Pi} V^{\pi, P, r} \leq V^{\widehat{\pi}_{r}^{*}, P, r} + \varepsilon$, where $\widehat{\pi}_{r}^{*}$ is the optimal policy under the empirical transition function $\widehat{P}$ with reward function $r$.

% and $V^{\pi, P, r}$ is the policy value under the real transition function $P$ with reward function $r$. 
\end{lem}

% That is, RF-Express satisfies condition (a) in \cref{prop:connection}.


% \footnote{This is implied by the stopping rule in RF-Express algorithm and Lemma 1 in \citep{menard20fast-active-learning}.}


\subsection{Controlling Expert State-action Distribution Estimation Error}
\label{subsec:transition_aware_estimator}

In this part, we talk about how to control the expert state-action distribution estimation error. Quite often, the maximum likelihood estimator (MLE) is considered in the literature \citep{pieter04apprentice, syed07game, shani2022online}. Mathematically, MLE counts how frequently a state-action pair appears in the observed expert trajectories: 
\begin{align}   \label{eq:estimate_by_count}
    \widehat{d}^{\piE}_h(s, a) = \frac{  \sum_{\tr \in \gD}  \indict\lb \tr_h(\cdot, \cdot) = (s, a) \rb }{|\gD|}, 
\end{align}
where $\tr_h(\cdot, \cdot)$ indicates the specific state-action pair of trajectory $\tr$ in time step $h$. The sample complexity of MLE is well-known. 
\begin{lem}[\cite{rajaraman2020fundamental}]   \label{lem:sample_complexity_mle}
Fix $\varepsilon \in (0, H)$ and $\delta \in (0, 1)$, if the number of expert trajectories in $\gD$ satisfies
\begin{align*}
    m \gtrsim \frac{H^2 \vert \gS \vert}{\varepsilon^2} \log \lp \frac{H}{\delta} \rp,
\end{align*}
then with probability at least $1-\delta$, we have $\sum_{h=1}^H \Vert \widehat{d}^{\piE}_h - d^{\piE}_h   \Vert_{1} \leq \varepsilon$.
\end{lem}

% In this paper, we argue that the sample complexity in \cref{lem:sample_complexity_mle} is tight. 

% \begin{prop}    \label{prop:lower_bound_mle}
% Fix $\varepsilon \in (0, H)$ and $\delta \in (0, 1)$, there exists an MDP instance such that  
% \end{prop}


% We notice that a similar lower bound is also presented in \citep{swamy2022minimax}. The main difference is that our lower bound holds for any $|\gS|$ while the lower bound in \citep{swamy2022minimax} is only true when $|\gS| = 2$.



% Despite its simplicity, MLE, however, is not optimal from a theoretical perspective. 
{The above sample complexity of MLE is tight in the worst case; see, e.g., \citep[Lemma 8]{kamath2015learning}. Though MLE can be implemented under our framework, this estimator cannot lead to the minimax optimal expert sample complexity $\Theta(H^{3/2}|\gS|/\varepsilon)$.} To address this issue, in light of \citep{rajaraman2020fundamental}, we develop a new estimator. For a better presentation, let us introduce the following notations.
\begin{itemize}  % \vspace{-0.15cm}
    \item Similar to $\tr_h(\cdot, \cdot)$,  $\tr_h(\cdot)$ indicates the specific state of trajectory $\tr$ in time step $h$.
    \item Without $(\cdot)$ or $(\cdot, \cdot)$, $\tr_h$ is the truncated version of trajectory $\tr$ up to time step $h$, i.e., $\tr_h = (s_1, a_1, \cdots, s_h, a_h)$.
    \item $\gS_{h}(\gD) = \{s: \exists \tr \in \gD \text{ such that } s = \tr_h(\cdot) \}$ is the set of states visited at time step $h$ in $\gD$. 
    % \item $\Tr_h^{\gD} = \lb \tr^h: \tr^h_{\ell} (\cdot) \in \gS_{\ell} (\gD), \forall \ell \in [h] \rb$: the trajectories along which each state has been visited in $\mathcal{D}$ up to time step $h$.
    \item $\Tr_h^{\gD} = \{ \tr_h = (s_1, a_1, \ldots, s_h, a_h): s_\ell \in \gS_{\ell}(\gD), \forall \ell \in [h] \}$ is the set of truncated  trajectories (that may not appear in $\gD$), along which each state has been visited in $\mathcal{D}$ up to time step $h$.
\end{itemize}


From the definition of state-action distribution, we have  
\begin{align}
   d^{\pi}_h(s, a)  &= d^{\pi}_h(s) \pi_h(a|s) \nonumber  \\
   &=  \big[ \sum_{s^\prime, a^\prime} d^{\pi}_{h-1}(s^\prime, a^\prime) P_{h-1}(s|s^\prime, a^\prime) \big]  \pi_h(a|s).  \label{eq:flow}
\end{align}
This equation offers another perspective on visitation probability: $d^{\pi}_h(s, a)$ represents the weighted average of flows. Specifically, each flow path is determined by ancestral state-action sequences that lead to the target state-action pair $(s, a)$, and the weight of this flow is influenced by both the transition probability and the policy distribution.

However, when dealing with a finite sample regime, only a subset of trajectories executed by the expert policy is observed, while others remain unobserved. We can use the transition function to calculate the visitation probability for the observed trajectories, but we require statistical estimation for the non-observed ones. This idea has been exploited in \citep{rajaraman2020fundamental} in the known transition setting.



Now, suppose that the dataset $\gD$ is randomly divided into two equal parts, i.e., $\gD = \gD_1 \cup \gD_1^{c}$ and $\gD_1 \cap \gD_1^{c} = \emptyset$ with $|\gD_1 |=  |\gD_1^{c}| = m / 2$. We have the following decomposition:
\begin{align} 
&d_h^{\piE}(s, a) =  \underbrace{\sum_{\tr_h \in \Tr_h^{\gD_1}} \sP^{\piE}(\tr_h) \indict\lb \tr_h(\cdot, \cdot) = (s, a) \rb}_{:= \clubsuit} \nonumber \\
&+ \underbrace{\sum_{\tr_h \notin \Tr_h^{\gD_1}} \sP^{\piE}(\tr_h) \indict\lb \tr_h(\cdot, \cdot) = (s, a) \rb}_{:= \spadesuit}, \label{eq:key_decomposition}
\end{align}
where $\sP^{\piE} (\tr_h)$ is the probability of the truncated trajectory $\tr_h$ induced by the deterministic expert policy $\piE$. As we have mentioned, if the transition function is known, we can calculate  $\sP^{\piE} (\tr_h)$ directly: $\sP^{\piE} (\tr_h)  = \rho(s_1) \prod_{\ell=1}^{h-1} P_{\ell}(s_{\ell + 1}|s_{\ell}, a_{\ell})$ with $\tr_h = (s_1, a_1, \cdots, s_h, a_h)$.

% Now, consider the dataset $\gD$ is randomly divided into two equal parts, i.e., $\gD = \gD_1 \cup \gD_1^{c}$ and $\gD_1 \cap \gD_1^{c} = \emptyset$ with $|\gD_1 |=  |\gD_1^{c}| = m / 2$. We start from the following decomposition:
% \begin{align} 
% &d_h^{\piE}(s, a) =  \underbrace{\sum_{\tr_h \in \Tr_h^{\gD_1}} \sP^{\piE}(\tr_h) \indict\lb \tr_h(\cdot, \cdot) = (s, a) \rb}_{:= \clubsuit} \nonumber \\
% &+ \underbrace{\sum_{\tr_h \notin \Tr_h^{\gD_1}} \sP^{\piE}(\tr_h) \indict\lb \tr_h(\cdot, \cdot) = (s, a) \rb}_{:= \spadesuit}, \label{eq:key_decomposition}
% \end{align}
% where $\sP^{\piE} (\tr_h)$ is the probability of the truncated trajectory $\tr_h$ induced by the deterministic expert policy $\piE$. If the transition function is known, we can calculate  $\sP^{\piE} (\tr_h)$ directly: $\sP^{\piE} (\tr_h)  = \rho(s_1) \prod_{\ell=1}^{h-1} P_{\ell}(s_{\ell + 1}|s_{\ell}, a_{\ell})$ with $\tr_h = (s_1, a_1, \cdots, s_h, a_h)$


We explain the two terms in \cref{eq:key_decomposition} separately. On the one hand, term $\clubsuit$ can be calculated exactly if we know both the transition function and $\gD_1$, as explained previously. However, this is not applicable in our case as the transition function is unknown. We will discuss how to deal with this issue later. On the other hand, term $\spadesuit$ accounts for non-observed trajectories, which is not easy to compute (because we have no clue about expert actions on non-observed states). To address this issue, \citet{rajaraman2020fundamental} proposed to use trajectories in $\gD_1^{c}$ to make a maximum likelihood estimation. This is because $\gD_1^{c}$ is statistically independent of $\gD_1$ and therefore can be viewed as a new dataset. We follow the approach in \citep{rajaraman2020fundamental} to estimate term $\spadesuit$.

% This is the key idea of MIMIC-MD \citep{rajaraman2020fundamental}: we can break the sample barrier with known transitions.

Now, we explain how to estimate term $\clubsuit$ in the unknown transition setting. Our solution has two steps. The first step is to apply BC on $\gD_1$ to learn policy $\pi^\prime$:
\begin{align*}
\pi^{\prime}_h(a|s) = \left\{ \begin{array}{cc}
  \frac{n^1_h(s, a)}{n^1_h (s)}   &  \text{ if } {n^1_h(s) > 0} \\
   \frac{1}{|\gA|}  & \text{ otherwise}  
\end{array} \right.
\end{align*}
Here $n^1_h(s, a)$ ($n^1_h(s)$) is the number of state-action (state) pairs that appeared in $\gD_1$ in step $h$. This step recovers the expert behaviors on visited states in $\gD_1$. The second step is to let $\pi^\prime$ interact with the environment to collect a new dataset $\gD_{\env}^\prime$, from which we can estimate term $\clubsuit$ by MLE. To get a better sense, we mention that the uncertainty of estimating term $\clubsuit$ comes from the transition function, rather than the expert policy. Furthermore, by our design, trajectories in $\gD^{\prime}_{\env}$ are collected as if the expert policy were rolled out (because $\pi^\prime$ can perfectly match $\piE$ on $\gS(\gD_1)$), so the randomness of MLE is only caused by the stochastic transitions. 

% see Lemma \ref{lemma:unknown-transition-unbiased-estimation} in Appendix for more details)

% \todo[inline]{How to learn a policy by BC on a dataset is not explained.}

In summary, we arrive at the following estimator:
\begin{align}
    &\widetilde{d}^{\piE}_h(s, a) = {\frac{\sum_{\tr_h \in \gD_{\env}^\prime} \indict \{ \tr_h (\cdot, \cdot) = (s, a), \tr_h \in \Tr_h^{\gD_1} \}}{|\gD^\prime_{\env}|}} \nonumber \\
    &+ {\frac{  \sum_{\tr_h \in \gD_1^c}  \indict\{ \tr_h (\cdot, \cdot) = (s, a), \tr_h \not\in \Tr_h^{\gD_1}  \} }{|\gD_1^c|}}. \label{eq:new_estimator_unknown_transition}
\end{align}
The two terms in \cref{eq:new_estimator_unknown_transition} estimate terms $\clubsuit$ and $\spadesuit$ in \cref{eq:key_decomposition}, respectively. {It is important to note that the state-action distribution largely depends on the transition probability, as shown in \cref{eq:flow}. In contrast to the MLE in \cref{eq:estimate_by_count}, our proposed estimator additionally leverages the transition information from the online interactions; see the first term on the right-hand side of \cref{eq:new_estimator_unknown_transition}. This advancement leads to a more accurate estimation of the expert's state-action distribution.} 
\begin{lem} \label{lemma:sample_complexity_of_new_estimator_unknown_transition}
Given the expert dataset $\gD$, let $\gD$ be divided into two equal subsets, i.e., $\gD = \gD_{1} \cup \gD_{1}^c$ and $\gD_1 \cap \gD_1^{c} = \emptyset$ with $\labs \gD_1 \rabs = \labs \gD_1^{c} \rabs = m / 2$. Fix $\pi^{\prime} \in \Pi_{\text{BC}} \lp \gD_1 \rp$, let $\gD^\prime_{\mathrm{env}}$ be the dataset collected by $\pi^\prime$ and $|\gD^\prime_{\mathrm{env}} | = n^\prime$. Fix $\varepsilon \in (0, 1)$ and $\delta \in (0, 1)$; suppose $H \geq 5$. Consider the estimator $\widetilde{d}^{\piE}_h$ shown in \eqref{eq:new_estimator_unknown_transition}, if the expert sample complexity ($m$) and the interaction complexity ($n^\prime$) satisfy
\begin{align*}
    m \gtrsim   \frac{H^{3/2} | \gS | }{\varepsilon} \log\lp  \frac{|\gS| H}{\delta} \rp, \; n^\prime \gtrsim \frac{H^{2} | \gS |}{\varepsilon^2} \log\lp  \frac{|\gS| H}{\delta} \rp,
\end{align*}
then with probability at least $1-\delta$, we have
\begin{align*}
    \sum_{h=1}^H \lnorm \widetilde{d}^{\piE}_h - d^{\piE}_h  \rnorm_{1} \leq \varepsilon.
\end{align*}
\end{lem}

To the best of our knowledge, the estimator \eqref{eq:new_estimator_unknown_transition} is the first to enjoy a better expert sample complexity than MLE in the unknown transition setting. The nature of unknown transitions raises a technical difficulty in analyzing the estimation error of the two sub-estimators in \eqref{eq:new_estimator_unknown_transition}. We highlight that the classical concentration inequality, used to analyze the MLE estimator in \cref{lem:sample_complexity_mle}, cannot be used to upper bound this estimation error, as the distributions involved are not valid. To overcome this obstacle, we employ Chernoff's bound and additional statistical arguments. 

% We overcome this challenge by applying Chernoff's bound and additional statistical arguments.


% After solving the estimation issue, we design computation-efficient algorithms to control $\varepsilon_{\opt}$.


% Next, we present how to apply \algname in Algorithm \ref{algo:main_aglorithm} under our framework. Notice that the estimator in \eqref{eq:new_estimator} involves the exact transition function and cannot be directly applied here. Luckily, we could address this issue by a dataset $\gD^{\prime}_{\env}$ collected by rolling out a BC policy (obtained from $\gD_1$) with the environment; see Appendix \ref{subsec:explanation_of_the_estimation_unknown_transition} for more explanation. Based on this trick, the new estimator is formulated as $\widetilde{P}_h^{\piE} (s, a) =$
% \begin{equation}
%     \begin{aligned}
% & {\frac{\sum_{\tr_h \in \gD_{\env}^\prime} \indict \{ \tr_h (s_h, a_h) = (s, a), \tr_h \in \Tr_h^{\gD_1} \}}{|\gD^\prime_{\env}|}} + 
% \\
% & {\frac{  \sum_{\tr_h \in \gD_1^c}  \indict\{ \tr_h (s_h, a_h) = (s, a), \tr_h \not\in \Tr_h^{\gD_1}  \} }{|\gD_1^c|}}. 
% \end{aligned}
% \label{eq:new_estimator_unknown_transition}
% \end{equation}


\subsection{Controlling Optimization Error}

 We now consider the optimization issue. Again, we utilize the dual representation of $\ell_1$-norm and the min-max theorem~\citep{bertsekas2016nonlinear} to obtain the following max-min optimization problem:
\begin{align}   \label{eq:new_algo_max_min}
     \max_{w \in \gW} \min_{\pi \in \Pi} \sum_{h=1}^H \sum_{(s, a)} w_h (s, a) ( \widetilde{d}^{\piE}_h (s, a) -  d^{\pi, \widehat{P}}_h (s, a) ),
\end{align}
where $\gW = \{w: \lnorm w \rnorm_\infty \leq 1 \}$ is the unit ball. We see that the inner problem in \eqref{eq:new_algo_max_min} is to maximize the policy value of $\pi$ given the reward function $w_h(s, a)$ (see \cref{eq:dual_of_policy_value} for the dual form of policy value). For the outer optimization problem, we can use online gradient descent methods \citep{shalev12online-learning} so that the overall objective can finally reach an approximate saddle point. Formally, let us define the objective $f^{(t)}(w)$:
\begin{align}   
    & \underbrace{\sum_{h=1}^{H} \sum_{(s, a) \in \gS \times \gA} w_h(s, a) \lp d^{\pi^{(t)}, \widehat{P}}_h (s, a) - \widetilde{d}^{\piE}_h (s, a)  \rp}_{:= f^{(t)}(w)}, \label{eq:objective_w}
\end{align}
where $\pi^{(t)}$ is the optimized policy in iteration $t$. Then the update rule for $w$ is:
\begin{align*}   
    w^{(t+1)} := \gP_{\gW} ( w^{(t)} - \eta^{(t)}  \nabla f^{(t)}(w^{(t)}) ), 
\end{align*}
where $\eta^{(t)} > 0$ is the stepsize to be chosen later, and $\gP_{\gW}$ is the Euclidean projection on the unit  ball $\gW$, i.e., $\gP_{\gW}(w) := \argmin_{z \in \gW} \lnorm z- w \rnorm_2$. The procedure for solving \eqref{eq:new_algo_max_min} is outlined in Algorithm~\ref{algo:gradient_based_optimization}.

% \vspace{-0.15cm}
% \begin{algorithm}[htbp]
% \caption{Transition-aware AIL (\algname)}
% \label{algo:main_aglorithm}
% {
% \begin{algorithmic}[1]
% \REQUIRE{expert demonstrations $\gD$, number of iterations $T$, step size $\eta^{(t)}$, and initialization $w^{(1)}$.}
% \STATE{Randomly split $\gD$ into two equal parts: $\gD = \gD_1 \cup \gD_1^{c}$ and obtain the estimation $\widetilde{P}_h^{\piE}$ in \eqref{eq:new_estimator}.}
% \FOR{$t = 1, 2, \cdots, T$}
% \STATE{$\pi^{(t)} \lar $ solve the optimal policy with the reward function $w^{(t)}$ up to an error of $\varepsilon_{\mathrm{opt}}$.}
% \STATE{Compute the state-action distribution $P^{\pi^{(t)}}_h$ for $\pi^{(t)}$.}
% \STATE{Update $ w^{(t+1)} := \gP_{\gW}\lp w^{(t)} - \eta^{(t)}  \nabla f^{(t)}(w^{(t)}) \rp$ with $f^{(t)}(w)$ defined in \eqref{eq:objective_w}.}
% \ENDFOR
% \STATE{Compute the mean state-action distribution $\widebar{P}_h(s, a) = \sum_{t=1}^{T} P^{\pi^{(t)}}_h(s, a) / T$.}
% \STATE{Derive $\widebar{\pi}_h (a|s) \lar \widebar{P}_h(s, a) / \sum_{a} \widebar{P}_h(s, a)$.}
% \ENSURE{policy $\widebar{\pi}$.}
% \end{algorithmic}
% }
% \end{algorithm}
% \vspace{-0.15cm}

\begin{algorithm}[htbp]
\caption{Gradient-based Optimization}
\label{algo:gradient_based_optimization}
\begin{algorithmic}[1]
\REQUIRE{Transition model $\widehat{P}$, and expert state-action distribution estimator $\widetilde{d}^{\piE}_h$.}
\FOR{$t = 1, 2, \cdots, T$}
\STATE{$\pi^{(t)} \lar $ Solve the optimal policy with the transition model $\widehat{P}$ and reward function $w^{(t)}$ up to an error $\varepsilon_{\rl}$.}
\STATE{Compute the state-action distribution $d^{\pi^{(t)}, \widehat{P}}_h$ for $\pi^{(t)}$.}
\STATE{Update $ w^{(t+1)} := \gP_{\gW}\lp w^{(t)} - \eta^{(t)}  \nabla f^{(t)}(w^{(t)}) \rp$ with $f^{(t)}(w)$ defined in \cref{eq:objective_w}.}
\ENDFOR
\STATE{Compute the mean state-action distribution $\widebar{d}_h(s, a) = \sum_{t=1}^{T} d^{\pi^{(t)}, \widehat{P}}_h(s, a) / T$.}
\STATE{Derive $\widebar{\pi}_h (a|s) \lar \widebar{d}_h(s, a) / \sum_{a} \widebar{d}_h(s, a)$.}
\ENSURE{Policy $\widebar{\pi}$.}
\end{algorithmic}
\end{algorithm}

% Line 2 in \cref{algo:gradient_based_optimization}  amounts to a typical RL optimization problem. As a general claim, we allow $\pi^{(t)}$ is $\varepsilon_{\rl}$-optimal w.r.t. the optimal policy with reward function $w^{(t)}$, i.e., $V^{\pi^{(t)}, \widehat{P}, w^{(t)}} \geq V^{\pi^{*}_{w^{(t)}}, \widehat{P}, w^{(t)}} - \varepsilon_{\rl}$. In the tabular case, $\varepsilon_{\rl} = 0$ by value iteration with finite computation steps, whilst approximate methods (e.g., policy gradient ascent) can also be used if they can guarantee $\varepsilon_{\rl}$ is small with cheap computation.

Line 2 in \cref{algo:gradient_based_optimization} formulates a typical reinforcement learning (RL) optimization problem. We allow $\pi^{(t)}$ to be $\varepsilon_{\rl}$-optimal with respect to the optimal policy with reward function $w^{(t)}$, i.e., $V^{\pi^{(t)}, \widehat{P}, w^{(t)}} \geq V^{\pi^{*}_{w^{(t)}}, \widehat{P}, w^{(t)}} - \varepsilon_{\rl}$. In the tabular case, $\varepsilon_{\rl} = 0$ by value iteration with finite and polynomial computation steps. For approximate methods such as policy gradient ascent, we require that they can guarantee $\varepsilon_{\rl}$ is small with low computational cost.

\begin{lem} \label{lemma:approximate-minimax}
Fix $\varepsilon > 0$. Consider the gradient-based optimization procedure in \cref{algo:gradient_based_optimization} with $\varepsilon_{\rl} \leq \varepsilon/2$.  If we take $T \gtrsim H^2 |\gS||\gA|/\varepsilon^2$ and $\eta^{(t)} := \sqrt{|\gS||\gA| / (8T)}$, then we have
\begin{align*}
\sum_{h=1}^H \lnorm d^{\widebar{\pi}, \widehat{P}}_h - \widetilde{d}^{\piE}_h \rnorm_{1} \leq \min_{\pi \in \Pi} \sum_{h=1}^H \lnorm d^{\pi, \widehat{P}}_h - \widetilde{d}^{\piE}_h \rnorm_{1} + \varepsilon. 
\end{align*}
\end{lem}



\subsection{MB-TAIL: Putting It All Together}


Combining all of the above pieces, we obtain the final approach, called MB-TAIL, presented in Algorithm~\ref{algo:mbtail-abstract}. Here \dquote{MB-TAIL} stands for model-based transition-aware adversarial imitation learning.


\begin{algorithm}[htbp]
\caption{Model-based Transition-aware AIL}
\label{algo:mbtail-abstract}
\begin{algorithmic}[1]
\REQUIRE{Expert demonstrations $\gD$.}
\STATE{Invoke \textnormal{RF-Express} to collect $n$ trajectories and learn an empirical transition function $\widehat{P}$.}
\STATE{Randomly split $\gD$ into two equal parts: $\gD = \gD_1 \cup \gD_1^{c}$.}
\STATE{Learn $\pi^{\prime} \in \Pi_{\text{BC}} \lp \gD_{1} \rp$ by BC and roll out $\pi^{\prime}$ to obtain dataset $\gD_{\env}^\prime$ with $|\gD_{\env}^\prime| = n^{\prime}$.}
\STATE{Obtain the estimator $\widetilde{d}_h^{\piE}$ in \eqref{eq:new_estimator_unknown_transition} with $\gD$ and $\gD_{\env}^\prime$.}
\STATE{$\widebar{\pi} \lar$ Apply \cref{algo:gradient_based_optimization} with the estimation $\widetilde{d}_h^{\piE}$ under transition model $\widehat{P}$.}
\ENSURE{Policy $\widebar{\pi}$.}
\end{algorithmic}
\end{algorithm}


\begin{thm}\label{theorem:sample-complexity-unknown-transition}
Fix $\varepsilon \in \lp 0, 1 \rp$ and $\delta \in (0, 1)$; suppose $H \geq 5$. Under the unknown transition setting, consider \mbalgname displayed in Algorithm~\ref{algo:mbtail-abstract} with $\widebar{\pi}$ being the output policy. Assume that the RL error $\varepsilon_{\rl} \leq \varepsilon / 2$, and that the number of iterations and the step size are the same as in \cref{lemma:approximate-minimax}. If the expert sample complexity and the interaction complexity satisfy
\begin{align*}
&m \gtrsim  \frac{ H^{3/2} |\gS|}{\varepsilon} \log\lp\frac{H |\gS|}{\delta} \rp, n^{\prime} \gtrsim  \frac{ H^2 |\gS|}{\varepsilon^2} \log \lp \frac{H |\gS| }{\delta} \rp,
\\
& \quad n \gtrsim \frac{H^3 |\gS| |\gA|}{\varepsilon^2} \lp |\gS| + \log \lp \frac{H |\gS| |\gA|}{\delta \varepsilon} \rp \rp,
\end{align*}
then with probability at least $1-\delta$, we have $V^{\piE} - V^{\widebar{\pi}} \leq \varepsilon $.
\end{thm}



\begin{rem}
% For MB-TAIL, the expert sample complexity is $m = \widetilde{\gO}(H^{3/2} |\gS| / \varepsilon)$ and the total interaction complexity is $n + n^\prime = \widetilde{\gO}(H^{3} |\gS|^2 |\gA| / \varepsilon^2)$. Recall that the OAL algorithm in \citep{shani2022online} has the expert sample complexity $\widetilde{\gO} (H^{2} |\gS| / \varepsilon^2 )$ and interaction complexity $\widetilde{\gO} ( H^4 |\gS|^2 |\gA| / \varepsilon^2 )$ under the unknown transition case. Theorem \ref{theorem:sample-complexity-unknown-transition} implies our approach has substantial improvements over the OAL algorithm in both expert sample complexity and interaction complexity. 

Our MB-TAIL algorithm achieves expert sample complexity $m = \widetilde{\gO}(H^{3/2} |\gS| / \varepsilon)$ and total interaction complexity $n + n^\prime = \widetilde{\gO}(H^{3} |\gS|^2 |\gA| / \varepsilon^2)$, even in the case of unknown transitions. In comparison, the OAL algorithm in \citep{shani2022online} has expert sample complexity $\widetilde{\gO} (H^{2} |\gS| / \varepsilon^2 )$ and interaction complexity $\widetilde{\gO} ( H^4 |\gS|^2 |\gA| / \varepsilon^2 )$ in the same scenario. Theorem \ref{theorem:sample-complexity-unknown-transition} validates that our approach provides significant improvements over OAL in terms of both expert sample complexity and interaction complexity. 

The success of this improvement hinges on the design of our algorithm. Unlike OAL, which uses a maximum likelihood estimate of the expert's state-action distribution for imitation, MB-TAIL leverages transition information to construct a more accurate estimator. In addition, OAL uses a tailored optimistic value function in a model-free manner for exploration, but MB-TAIL employs a model-based, reward-free exploration method to efficiently explore the state-action space. These algorithmic designs yield substantial enhancements in both expert sample complexity and interaction complexity.

% We explain this improvement in terms of algorithm design below. We note that OAL uses the simple MLE of the expert's state-action distribution. With such an MLE, OAL simultaneously performs exploration (to learn transitions) and imitation (to learn policies and rewards). To perform exploration, OAL uses a tailored optimistic value function in a model-free manner. For imitation, OAL applies online mirror descent to update rewards and policies. In contrast, the state-action estimator in MB-TAIL leverages the transition information and has a much better theoretical guarantee. Furthermore, an efficient reward-free exploration method (that is model-based) is used in MB-TAL to solve the exploration issue in AIL. 
\end{rem}


% \ref{fig:reset_cliff}  \ref{section:experiment_details}
\textbf{Simulation Studies.} Finally, we conclude by validating our theoretical results through experiments, where we compare the performance of MB-TAIL with four other state-of-the-art algorithms: BC \citep{Pomerleau91bc}, FEM \citep{abbeel05exploration-and-ap}, GTAL \citep{syed07game}, and OAL \citep{shani2022online}. All algorithms are given 100 expert trajectories, and we evaluate their performance on the Reset Cliff MDP (shown in Figure 1 in the Appendix), which is known to be challenging for imitation learning algorithms \citep{rajaraman2020fundamental, xu2021error}. We conduct experiments with $20$ random seeds, and provide more experimental details in the Appendix. The code to reproduce our results is available at our GitHub repository.\footnote{\url{https://github.com/tianxusky/tabular-ail}}



\begin{figure}[htbp]
    \centering
    \includegraphics[width=0.75\linewidth]{figs/cliffwalking.pdf}
    \caption{The imitation gap (i.e., $V^{\piE} - V^{\pi}$) in Reset Cliff.}
    \label{fig:unknown_transition_result}
\end{figure}

Figure \ref{fig:unknown_transition_result} shows the imitation gap for each algorithm. Note that BC, FEM, and GTAL do not leverage environment interactions. Our results show that MB-TAIL outperforms FEM and GTAL when the number of interactions exceeds $500$. Additionally, we observe that MB-TAIL outperforms OAL with the same number of interactions, which confirms the superior theoretical bounds of MB-TAIL.


% we corroborate our theoretical result with experiments. We compare MB-TAIL with BC \citep{Pomerleau91bc}, FEM \citep{abbeel05exploration-and-ap}, GTAL \citep{syed07game}, OAL \citep{shani2022online}. All algorithms are provided with the same number of expert trajectories. We consider the Reset Cliff MDP, which is known to be hard for imitation learning algorithms \citep{rajaraman2020fundamental, xu2021error}. All experiments run with $20$ random seeds. Appendix \ref{section:experiment_details} gives more experimental details.


% \todo[inline]{Due to the space limit, I believe the simulation on the Reset Cliff is sufficient. The performance of FEM and GTAL should be also plotted.}


% Figure \ref{fig:unknown_transition_result} displays the imitation gap of all methods. Notice that BC, FEM, and GTAL do not utilize environment interactions. First, MB-TAIL outperforms FEM and GTAL when the number of interactions exceeds $500$. Besides, we observe that MB-TAIL outperforms OAL with the same number of interactions. This corroborates the better theoretical bounds of MB-TAIL.

% our theoretical results that MB-TAIL is more provably efficient than OAL. 

% , suggesting that it is inefficient to leverage expert dataset to learn transitions

% \section{MB-TAIL with State Abstraction}





% \sectionspace
% \section{An Improved Transition-aware AIL Algorithm with Known Transitions}
% \label{sec:main_results}
% \sectionspace

% In this part, we assume the transition function is known, so the agent can exactly evaluate a policy for any reward function. Therefore, we only focus on the expert sample complexity in this section and defer the further study of interaction complexity with unknown transitions to the next section. 

% \subsection{An Analysis Framework for AIL Methods}

% Our starting point is the following unconstrained state-action distribution matching problem: 
% \begin{align} \label{eq:l1_norm_imitation_with_estimator} 
%    \min_{\pi \in \Pi}  \sum_{h=1}^{H} \lnorm P^{\pi}_h - \widetilde{P}^{\piE}_h \rnorm_1.
% \end{align}
% where $\Pi$ is the set of all non-stationary policies and $\widetilde{P}^{\piE}_h$ is an estimation of the expert state-action distribution $P^{\piE}_h$. For the above distribution matching problem, the state-action distribution estimation and the corresponding solved policy are two important components. First, we introduce the following two definitions to measure the qualities of these components.   






% \RED{The discussion with MIMIC-MD}



% \todo[inline]{The above overview is important for general readers. If we directly present technical discussion, they may find it is hard to read and lose interests.}


% \textbf{(a section of estimation error)}


% Conventional methods like FEM and GTAL use the direct maximum likelihood estimation (MLE):
% \begin{align}   \label{eq:estimate_by_count}
%     \frac{  \sum_{\tr \in \gD}  \indict\lb \tr(s_h, a_h) = (s, a) \rb }{|\gD|}, 
% \end{align}
% which is statistically unbiased. With the standard concentration inequalities, one can prove that the MLE leads to an estimation error $\gO()$; see, e.g., .  Despite the simplicity of MLE, perhaps surprisingly, recent studies have showed that this estimator is not optimal in the known-transition setting. 


% In our work, we choose to use the fine-grained estimation proposed in  \citep{rajaraman2020fundamental}. (Details below).

% XXXXX

% This estimator looks rather complicated and we explain the intuition below. 

% XXXXXXXXX



% \textbf{(a section of optimization error)}

% Now, we investigate the optimization error. Since the analytic solution to \cref{eq:l1_norm_imitation_with_estimator} is not available, we prefer to use iterative/numerical algorithms that can provide an approximately optimal solution with cheap computation. In this scenario, we care about the iteration complexity and memory complexity used by iterative algorithms.  (Details below)


% Our key insight is to utilize the dual representation of $\ell_1$-norm and the min-max theorem

% XXXXXXXX



% \subsection{The Transition-Aware Estimator}


% Our method leverages the estimator proposed in \citep{rajaraman2020fundamental} for the state-action distribution $P^{\piE}_h$. For a better presentation, let us introduce the following notations.
% \begin{itemize}
%     \item $\tr_h$: the truncated trajectory up to time step $h$, i.e., $\tr_h = (s_1, a_1, \cdots, s_h, a_h)$.
%     \item $\gS_{h}(\gD)$: the set of states visited at time step $h$ in $\gD$. 
%     \item $\Tr_h^{\gD} = \lb \tr_h: \tr_h (s_\ell) \in \gS_{\ell} (\gD), \forall \ell \in [h] \rb$: the trajectories along which each state has been visited in $\mathcal{D}$ up to time step $h$.
% \end{itemize}


% Now, consider the dataset $\gD$ is randomly divided into two equal parts, i.e., $\gD = \gD_1 \cup \gD_1^{c}$. The estimator in \citep{rajaraman2020fundamental} is\footnote{To help readers understand this estimator, we provide a detailed example in Appendix \ref{sec:an_example}.}:
% \begin{equation}
% \label{eq:new_estimator}
% \begin{aligned} 
% &\widetilde{P}_h^{\piE}  (s, a) =   { \sum_{\tr_h \in \Tr_h^{\gD_1} } \sP^{\piE}(\tr_h) \indict\lb \tr_h(s_h, a_h) = (s, a)\rb}
% \\
% &+ {\frac{  \sum_{\tr_h \in \gD_1^c}  \indict\{ \tr_h (s_h, a_h) = (s, a), \tr_h \not\in \Tr_h^{\gD_1}  \} }{|\gD_1^c|}}, 
% \end{aligned}
% \end{equation}
% where $\sP^{\piE} (\tr_h)$ is the probability of the truncated trajectory $\tr_h$ induced by the deterministic expert policy $\piE$. 



% To get a better intuition of this estimator, consider the following key decomposition of $P^{\piE}_h (s, a)$:
% \begin{equation}
% \label{eq:key_decomposition}
% \begin{aligned} 
% P_h^{\piE}(s, a) &= {\sum_{\tr_h \in \Tr_h^{\gD_1}} \sP^{\piE}(\tr_h) \indict\lb \tr_h(s_h, a_h) = (s, a) \rb}
% \\
% &+ {\sum_{\tr_h \notin \Tr_h^{\gD_1}} \sP^{\piE}(\tr_h) \indict\lb \tr_h(s_h, a_h) = (s, a) \rb}. 
% \end{aligned}
% \end{equation}


% First of all, we see that the first term in the estimator \eqref{eq:new_estimator} is exactly the first part in \eqref{eq:key_decomposition}. For this part, all state-action pairs up to time step $h$ are known from $\gD_1$. Therefore, we can compute $\sP^{\piE} (\tr_h)$ exactly as the transition function is known; specifically, $\sP^{\piE}(\tr_h) = \rho(\tr(s_1)) \prod_{\ell=1}^{h-1} \gP_{\ell}(\tr(s_{\ell+1})|\tr(s_\ell), \tr(a_\ell))$. Secondly, for the second term in \eqref{eq:key_decomposition} (i.e., $\tr_h \not\in \Tr_h^{\gD_1}$), we cannot exactly calculate $\sP^{\piE} (\tr_h)$ since we do not known some actions in $\tr_h$ from $\gD_1$. Fortunately, we can leverage the complementary dataset $\gD^c_1$ to establish an estimator. In fact, the second term in \eqref{eq:new_estimator} is a maximum likelihood estimation for the associated part in \eqref{eq:key_decomposition}.



% Differently, conventional AIL methods such as FEM \citep{pieter04apprentice}, GTAL \citep{syed07game} and GAIL \citep{ho2016gail} adopt the maximum likelihood estimator of the whole probability term $P_h^{\piE}(s, a)$.
% \begin{align}   \label{eq:estimate_by_count}
%    \widehat{P}_h^{\piE} (s, a) := \frac{  \sum_{\tr \in \gD}  \indict\lb \tr(s_h, a_h) = (s, a) \rb }{|\gD|}, 
% \end{align}
% Since the estimator in \eqref{eq:new_estimator} utilizes the transition function information explicitly, it has a better statistical guarantee.





% \begin{rem}  
% This lemma indicates that when $\varepsilon \leq \sqrt{H}$, the estimator in \eqref{eq:new_estimator} is better than the maximum likelihood estimation in conventional AIL approaches such as FEM \citep{pieter04apprentice}, GTAL \citep{syed07game} and GAIL \citep{ho2016gail}:
% \begin{align}   \label{eq:estimate_by_count}
%    \widehat{P}_h^{\piE} (s, a) := \frac{  \sum_{\tr \in \gD}  \indict\lb \tr(s_h, a_h) = (s, a) \rb }{|\gD|}, 
% \end{align}
% which has a  sample complexity $\widetilde{\gO}(H^{2}|\gS|/\varepsilon)$ (see Appendix~\ref{sec:proof:results_in_sction_main_results} for a proof).
% \end{rem}



% \subsection{An Efficient Optimization Procedure}

% With the improved estimator in \eqref{eq:new_estimator}, we arrive at the following state-action distribution matching problem:
% \begin{align} \label{eq:l1_norm_imitation_with_estimator} 
%    \min_{\pi \in \Pi} \frac{1}{H}  \sum_{h=1}^{H} \lnorm P^{\pi}_h - \widetilde{P}^{\piE}_h \rnorm_1.
% \end{align}


% \begin{rem}   \label{remark:projection_choice}
% % The problem we consider is different from the one in MIMIC-MD. 
% Differently, MIMIC-MD is a combination of AIL and BC.
% \begin{align} \label{eq:l1_norm_imitation_with_estimator} 
%    \min_{\pi \in \Pi_{\text{BC}} (\gD_1)} \frac{1}{H}  \sum_{h=1}^{H} \lnorm P^{\pi}_h - \widetilde{P}^{\piE}_h \rnorm_1.
% \end{align}
% That is, when performing state-action distribution matching, MIMIC-MD restricts candidate policies to $\Pi_{\text{BC}} (\gD_1) = \{ \pi \in \Pi: \pi_h (s) = \piE_h (s), \forall h \in [H], s \in \gS_h (\gD_1) \}$, which is the set of BC policies on $\gD_1$. The intuition in MIMIC-MD is that the expert actions are known on $\gD_1$ so that direct projection is feasible. Based on the choice of $\Pi_{\text{BC}} (\gD_1)$, \citet{nived2021provably} develop a linear programming based optimization procedure for MIMIC-MD. However, as the policy set $\Pi_{\text{BC}} (\gD_1)$ is complex, it is computationally expensive to take a projection step onto $\Pi_{\text{BC}} (\gD_1)$. 


% In contrast, we consider an unconstrained state-action distribution objective, which relaxes this complex projection operator. We will show that our choice allows a computationally efficient gradient-based optimization procedure. Technically, we perform a different analysis which enables that this relaxation does not scarify any statistical guarantee.
% \end{rem}


% \todo[inline]{Give this objective first, then explain the estimator? That is, exchange the order of section 4.2 and 4.1.}



% \begin{thm} \label{theorem:final_sample_complexity} 
% Fix $\varepsilon \in \lp 0, H \rp$ and $\delta \in (0, 1)$; suppose $H \geq 5$. Consider the approach \algname in Algorithm \ref{algo:main_aglorithm} with $\widebar{\pi}$ being the output policy. Assume that the optimization error $\varepsilon_{\mathrm{opt}} \leq \varepsilon / 2$, the number of iterations $T \gtrsim H^2 |\gS|  |\gA| / \varepsilon^2$, and the step size $\eta^{(t)} :=  \sqrt{|\gS||\gA| / (8T)}$. If the number of expert trajectories satisfies
% \begin{align*}
% m \gtrsim  \frac{ H^{3/2} |\gS|}{\varepsilon} \log\lp\frac{H |\gS|}{\delta} \rp,
% \end{align*}
% then with probability at least $1-\delta$, we have $V^{\piE} - V^{\widebar{\pi}} \leq \varepsilon$.
% \end{thm}


% \begin{rem}
% The optimization problem in line $3$ of Algorithm \ref{algo:main_aglorithm} can be solved efficiently to an acceptable error by value iteration \citep{puterman2014markov} or policy gradient based methods~\citep{agarwal2020pg}. 
% \end{rem}


% \begin{rem}
% Note that the sample complexity of Algorithm \ref{algo:main_aglorithm} matches the lower bound of $\widetilde{\Omega} \lp H^{3/2} / \varepsilon \rp$ \citep{nived2021provably} in terms of $H$ and $\varepsilon$. Under the AIL framework, it improves the existing sample complexity bound $\widetilde{\Theta}(H^{2}|\gS|/\varepsilon^2)$ for pure conventional AIL methods such as FEM and GTAL.
% \end{rem}

% \begin{rem}  
% Computationally, TAIL enjoys a linear space complexity $\gO (\vert \gS \vert \vert \gA \vert H )$ for updating $w^{(t)}$ and $\pi^{(t)}$. In contrast, MIMIC-MD has a quadratic space complexity $\gO \lp  (\vert \gS \vert \vert \gA \vert H)^2 \rp$ \footnote{MIMIC-MD requires a dense matrix with shape $(|\gS||\gA|H) \times (|\gS||\gA|H)$ to save the constraints in LP.}. Therefore, MIMIC-MD is hard to be applied to large-scale IL problems (see \RED{Appendix XXX}). Besides, TAIL can be efficiently solved in polynomial time\footnote{The computation complexity (in terms of arithmetic operations) of TAIL is $\widetilde{\mathcal{O}}(|\mathcal{S}|^3 |\mathcal{A}|^2 H^3/\varepsilon^2)$ while MIMIC-MD by LP is about $\widetilde{\mathcal{O}}(d^{2.5})$ where {$d = 2|\mathcal{S}| |\mathcal{A}| H$}. We see TAIL is asymptotically more efficient than MIMIC-MD when $\varepsilon > \widetilde{\mathcal{O}}((|\mathcal{S}|H/|\mathcal{A}|)^{0.25})$.}.
% \end{rem}


\section{MB-TAIL with State Abstraction}
\label{sec:mbtail_state_abstraction}


Previously, we considered the tabular representation, which leads to theoretical bounds that depend on the size of the problem $|\gS|$. However, as suggested by the lower bounds in \citep[Theorem 6.1, 6.2]{rajaraman2020fundamental}, this dependence is inevitable and could be unacceptable when $|\gS|$ is huge. In this section, we investigate the use of state abstractions \citep{li2006towards} within MB-TAIL, so the dependence on $|\gS|$ can be eliminated.


% In particular, we investigate the use of state abstractions \citep{li2006towards} within MB-TAIL. State abstractions correspond to function approximations using a series of piecewise constant functions \citep{chen2019information}. 

Specifically, we assume that we have a set of state abstractions $\{ \phi_h \}_{h=1}^H$, where $\phi_h: \gS \rightarrow \Phi$ and $\Phi$ is the abstract state space. State abstractions correspond to function approximations using a series of piecewise constant functions \citep{chen2019information}. The abstract state space is much smaller than the original state space, i.e., $|\Phi| \ll |\gS|$. We also assume that $\{ \phi_h \}_{h=1}^H$ satisfies a condition that is common in the literature \citep{li2006towards, jiang2015abstraction}.


% we consider tabular representations. As a result, the theoretical bounds depend on the problem size $|\gS|$. In fact, this dependence is inevitable as the lower bounds in \citep[Theorem 6.1, 6.2]{rajaraman2020fundamental} suggest. This could be unacceptable if $|\gS|$ is huge. In this part, we discuss how to extend MB-TAIL to the function approximation setting, which can eliminate the dependence on $|\gS|$.



% In particular, we investigate MB-TAIL with  state abstractions \citep{li2006towards}, which correspond to the function approximation with a series of piecewise constant functions \citep{chen2019information}. To be more specific, assume we have access to a set of state abstractions $\{ \phi_h \}_{h=1}^H$, where $\phi_h: \gS \rightarrow \Phi$ for each $h \in [H]$ and $\Phi$ is the abstract state space. The size of abstract state space is much smaller than that of original state space, i.e., $|\Phi| \ll |\gS|$. We assume that the state abstractions satisfy the following condition, which is common in the literature \citep{li2006towards, jiang2015abstraction}.


% Previously, we develop TAIL in the tabular setting and the corresponding sample complexity depends on the state space size $\gS$. In this part, we extend TAIL to the function approximation setting, which can eliminate the dependence of $|\gS|$.


% Besides, the lower bounds \citep[Theorem 6.1, 6.2]{rajaraman2020fundamental} imply that the dependence of $\vert \gS \vert$ is inevitable for all IL algorithms if no additional information is provided. In this part, we discuss that if provided with a set of state abstractions \citep{li2006towards}, how to avoid the dependence of $\vert \gS \vert$ on sample complexity. In particular, state abstractions correspond to the function approximation with a series of piecewise constant functions \citep{chen2019information}.

\begin{asmp}
\label{asmp:state_abstraction}
There exists a set of known state abstractions $\{ \phi_h \}_{h=1}^H$, which satisfies $\forall h \in [H]$, for any $s^{1}, s^{2} \in \gS$ such that $\phi_h (s^{1}) = \phi_h (s^{2})$,
\begin{align}
    &\text{bisimulation}: \forall a \in \gA, x^\prime \in \Phi, r_h (s^{1}, a) = r_h (s^{2}, a), \label{eq:reward_consistent}
    \\
    &\sum_{s^\prime \in \phi_h^{-1} (x^\prime)} P_h (s^\prime |s^1, a) = \sum_{s^\prime \in \phi_h^{-1} (x^\prime)} P_h (s^\prime |s^2, a); \label{eq:transition_consistent}
    \\
    &\piE\text{-irrelevant}: \; \piE_h (s^1) = \piE_h (s^2), \label{eq:expert_consistent}
\end{align}
where $\phi_h^{-1} (x^\prime) = \{ s^\prime \in \gS: \phi_h (s^\prime) = x^\prime \}$.
\end{asmp}
% \begin{rem}
%     When the set of abstractions is a bisimulation, if $\piE$ is the optimal policy, then the $\piE$-irrelevant condition naturally holds \citep{li2006towards}. 
% \end{rem}

% \begin{rem}
{In bisimulation, the reward-consistent condition in \eqref{eq:reward_consistent} ensures that two different states mapped to the same abstract state share an identical reward. We highlight that this condition is important for MB-TAIL to avoid the dependence of \emph{expert sample complexity} on $\vert \gS \vert$.} In particular, the bottleneck of the sample complexity of AIL methods is the estimation of $d^{\piE}_h(s, a)$. Under the reward-consistent condition, we can calculate the expert policy value as
\begin{align*}
    V^{\piE} &= \sum_{h=1}^H \sum_{(s, a) \in \gS \times \gA} r_h (s, a)  d^{\piE}_h (s, a)
    \\
    &= \sum_{h=1}^H \sum_{(x, a) \in \Phi \times \gA} r^{\phi}_h (x, a) d^{\piE, \phi}_h (x, a), 
\end{align*}
where $r^{\phi}_h (x, a) = r_h (s, a)$ for an arbitrary $s \in \phi^{-1}_h (x)$ and $d^{\pi, \phi}_h (x, a) = \sP^{\pi} (\phi_h (s_h) = x, a_h = a) = \sum_{s \in \phi_h^{-1} (x)} d^{\pi}_h (s, a)$ is the {abstract state-action distribution}. With the above formulation, to estimate the expert policy value, we can estimate the \emph{abstract} state-action distribution rather than the tabular counterpart, which can remove the dependence on $\vert \gS \vert$. {Analogously, the transition-consistent condition in \eqref{eq:transition_consistent} guarantees that two different states mapped to the same abstract state share identical transition probabilities to every abstract state.} This condition is crucial for removing the dependence of \emph{interaction complexity} on $\vert \gS \vert$ since it allows estimating the \dquote{abstract transition function}.
% \end{rem}

% Now we extend MB-TAIL to the setting with state abstraction. Due to the space limit, we defer the detailed description of the algorithm to \cref{subsec:appendix_mbtail_with_state_abstraction_algorithm}. We prove that under \cref{asmp:state_abstraction}, MB-TAIL exhibits the expert sample and interaction complexity independent of $|\gS|$. The proof is not straightforward and the main difficulty lies in bridging the state-action distributions in the original and abstract MDP. We defer the discussion of the dedicated analysis tools in Appendix.       
We now extend MB-TAIL to the state abstraction setting, which we describe in detail in the Appendix due to space limitations. We prove that under \cref{asmp:state_abstraction}, MB-TAIL achieves expert sample and interaction complexities that are independent of $|\gS|$. However, the proof is not straightforward, and the primary challenge is to connect the state-action distributions in the original and abstract MDPs. We provide a detailed discussion of the specialized analysis tools in the Appendix.


% \ref{algo:mbtail-state-abstraction}
\begin{thm}\label{theorem:sample-complexity-unknown-transition-state-abstraction}
Under \cref{asmp:state_abstraction}, fix $\varepsilon \in \lp 0, 1 \rp$ and $\delta \in (0, 1)$; suppose $H \geq 5$. Under the unknown transition setting, consider Algorithm 2 in the Appendix, with $[\widebar{\pi}^{\phi}]^{M}$ being the output policy. Assume that the RL error $\varepsilon_{\rl} \leq \varepsilon / 2$, the number of iterations $T \gtrsim H^2 |\Phi|  |\gA| / \varepsilon^2$, and the step size $\eta^{(t)} :=  \sqrt{|\Phi||\gA| / (8T)}$. If the number of expert trajectories ($m$), the number of interaction trajectories for estimation ($n^\prime$), and the number of interaction trajectories for reward-free exploration ($n$) satisfy
\begin{align*}
&m \gtrsim  \frac{  |\Phi| H^{3/2}}{\varepsilon} \log\lp\frac{ |\Phi| H}{\delta} \rp, n^{\prime} \gtrsim  \frac{  |\Phi| H^2}{\varepsilon^2} \log \lp \frac{ |\Phi| H }{\delta} \rp,
\\
& n \gtrsim \frac{ |\Phi| |\gA| H^3}{\varepsilon^2} \lp |\Phi| + \log \lp \frac{ |\Phi| |\gA| H}{\delta \varepsilon} \rp \rp,
\end{align*}
then with probability at least $1-\delta$, we have the imitation gap $V^{\piE} - V^{[\widebar{\pi}^{\phi}]^{M}} \leq \varepsilon $.
\end{thm}




% \sectionspace
% \section{An Improved Transition-aware AIL Algorithm With Unknown Transitions}
% \label{sec:unknown_transition}
% \sectionspace
% Previously, we consider the known transition setting where the learner can almost exactly perform policy optimization and policy evaluation. In this part, we consider a more practical setting where the learner does not know the transition function in advance but can interact with the environment. Under this scenario, in addition to the number of expert demonstrations, we also care about the number of environment interactions. Here we refer to the above two measures as \emph{(expert) sample complexity} and \emph{interaction complexity}, respectively. The main challenge under this scenario is that the agent cannot exactly evaluate and optimize a policy any longer since the transition function is unknown.

% \subsection{A Framework For AIL Under the Unknown Transition Setting}


% To address the challenge from the unknown transition, we need to consider the online exploration issue in RL. Without a smart exploration strategy, the agent may take exponentially large interactions to make progress. To overcome this difficulty, we propose a general algorithmic framework to connect reward-free exploration~\citep{chi20reward-free, menard20fast-active-learning} and adversarial imitation learning. Under our framework, a proper AIL algorithm that works under the known transition setting could be provably transferred to the unknown transition setting. In the sequel, let us formally introduce the reward-free exploration methods.





% \begin{rem}  \label{remark:mimic_md_fail}
% Unfortunately, we could not apply the MIMIC-MD algorithm under our framework for two reasons. Firstly, the estimator in MIMIC-MD does not satisfy the assumption (b) under the unknown transition setting. We will discuss this later. More importantly, the projection in MIMIC-MD is restrictive to $\Pi_{\text{BC}}(\gD_1)$ (see Remark \ref{remark:projection_choice}), which are not compatible with the assumption (c).
% \end{rem}




% \subsection{Model-based Transition-aware Adversarial Imitation Learning}

% Next, we present how to apply \algname in Algorithm \ref{algo:main_aglorithm} under our framework. Notice that the estimator in \eqref{eq:new_estimator} involves the exact transition function and cannot be directly applied here. Luckily, we could address this issue by a dataset $\gD^{\prime}_{\env}$ collected by rolling out a BC policy (obtained from $\gD_1$) with the environment; see Appendix \ref{subsec:explanation_of_the_estimation_unknown_transition} for more explanation. Based on this trick, the new estimator is formulated as $\widetilde{P}_h^{\piE} (s, a) =$
% \begin{equation}
%     \begin{aligned}
% & {\frac{\sum_{\tr_h \in \gD_{\env}^\prime} \indict \{ \tr_h (s_h, a_h) = (s, a), \tr_h \in \Tr_h^{\gD_1} \}}{|\gD^\prime_{\env}|}} + 
% \\
% & {\frac{  \sum_{\tr_h \in \gD_1^c}  \indict\{ \tr_h (s_h, a_h) = (s, a), \tr_h \not\in \Tr_h^{\gD_1}  \} }{|\gD_1^c|}}. 
% \end{aligned}
% \label{eq:new_estimator_unknown_transition}
% \end{equation}



% With the estimator in \eqref{eq:new_estimator_unknown_transition}, we develop an extension of TAIL named \emph{MB-TAIL} presented in Algorithm \ref{algo:mbtail-abstract}. 
% \vspace{-0.15cm}
% \begin{algorithm}[htbp]
% \caption{Model-based Transition-aware AIL (\mbalgname)}
% \label{algo:mbtail-abstract}
% \begin{algorithmic}[1]
% \REQUIRE{expert demonstrations $\gD$.}
% \STATE{Randomly split $\gD$ into two equal parts: $\gD = \gD_1 \cup \gD_1^{c}$.}
% \STATE{Learn $\pi \in \Pi_{\text{BC}} \lp \gD_{1} \rp$ by BC and roll out $\pi$ to obtain dataset $\gD_{\env}^\prime$ with $|\gD_{\env}^\prime| = n^{\prime}$}.
% \STATE{Obtain the estimator $\widetilde{P}_h^{\piE}$ in \eqref{eq:new_estimator_unknown_transition} with $\gD$ and $\gD_{\env}^\prime$.}
% \STATE{Invoke \textnormal{RF-Express} to collect $n$ trajectories and learn an empirical transition function $\widehat{\gP}$.}
% \STATE{$\widebar{\pi} \lar$ apply TAIL to perform imitation with the estimation $\widetilde{P}_h^{\piE}$ under transition model $\widehat{\gP}$.}
% \ENSURE{policy $\widebar{\pi}$.}
% \end{algorithmic}
% \end{algorithm}
% \vspace{-0.15cm}

% \begin{thm}\label{theorem:sample-complexity-unknown-transition}
% Fix $\varepsilon \in \lp 0, 1 \rp$ and $\delta \in (0, 1)$; suppose $H \geq 5$. Under the unknown transition setting, consider \mbalgname displayed in Algorithm \ref{algo:mbtail-abstract} and $\widebar{\pi}$ is output policy, assume that the optimization error $\varepsilon_{\mathrm{opt}} \leq \varepsilon / 2$, the number of iterations and the step size are the same as in Theorem \ref{theorem:final_sample_complexity}, if the expert sample complexity and the total interaction complexity satisfy
% \begin{align*}
% &m \gtrsim  \frac{ H^{3/2} |\gS|}{\varepsilon} \log\lp\frac{H |\gS|}{\delta} \rp, n^{\prime} \gtrsim  \frac{ H^2 |\gS|}{\varepsilon^2} \log \lp \frac{H |\gS| }{\delta} \rp,
% \\
% & \quad n \gtrsim \frac{H^3 |\gS| |\gA|}{\varepsilon^2} \lp |\gS| + \log \lp \frac{H |\gS| |\gA|}{\delta \varepsilon} \rp \rp.
% \end{align*}
% Then with probability at least $1-\delta$, we have $V^{\piE} - V^{\widebar{\pi}} \leq \varepsilon $.
% \end{thm}




% \subsection{Simulation Results}
        

% \sectionspace
\section{Conclusion}
% \sectionspace


This paper contributes to the establishment of theoretical foundations for AIL with unknown transitions. We propose a new and general framework that enables AIL to explore and imitate efficiently. As mentioned, AIL methods can have much better theoretical guarantees on structured instances, such as the horizon-free bounds suggested by \citet{xu2022understanding}. Thus, we believe that investigating AIL with unknown transitions on structured instances is an interesting and valuable direction for future research.


% \sectionspace
\section*{Acknowledgment}
% \sectionspace
Tian Xu would like to thank Zhilong Zhang, Fanming Luo, and Jingcheng Pang for reading the manuscript and providing helpful comments. The work of Yang Yu is supported by the National Key Research and Development Program of China (2020AAA0107200), NSFC (61876077), and the Collaborative Innovation Center of Novel Software Technology and Industrialization. The work of Zhi-Quan Luo is supported in part by the National Key Research and Development Project under grant 2022YFA1003900, and in part by the Guangdong Provincial Key Laboratory of Big Data Computing.


% While AIL methods can enjoy better bounds on structured instances with known transitions, such as horizon-free bounds suggested in [Xu et al., 2022], our framework addresses the challenging problem of AIL with unknown transitions. We believe that investigating AIL with unknown transitions on structured instances is an interesting and valuable direction for future research.



% This paper contributes to establishing theoretical foundations for AIL with unknown transitions. We propose a general framework that enables AIL to efficiently explore and imitate. As mentioned, AIL methods can enjoy much better bounds (e.g., horizon-free bounds) on structured instances with known transitions as suggested in \citep{xu2022understanding}. Thus, it is interesting and valuable to investigate AIL with unknown transitions on structured instances. 


% This paper presents the worst-case theoretical results of AIL. To this end, an interesting future direction is to investigate whether we can obtain better bounds on structured instances as suggested in \citep{xu2022understanding}.

% develop an  analysis for AIL \citep{xu2022understanding} with unknown transitions.} 


%  Under this framework, we design an AIL algorithm MB-TAIL which achieves improved sample and interaction complexities. Finally, we extend MB-TAIL to the function approximation setting and prove the sample and interaction complexities independent of $|\gS|$, demonstrating its generalization ability.  



% In the future, we will investigate adversarial imitation learning with more general function approximation such as low rank MDPs \citep{agarwal2020flambe}.   
% To summarize, the main contributions of this paper are two-fold. First, in the known transition setting, we propose a new AIL method TAIL, which is more statistically and computationally efficient. Second, in the unknown transition setting, we develop a general framework that enables known-transition AIL approaches to efficiently explore and imitate. The resultant approach MB-TAIL outperforms the best-known algorithm in both sample complexity and interaction complexity.

% \todo[inline]{It is not very clear why to consider more general function approximation. Maybe we can put a conjecture that the interaction complexity achieved by AIL is tight. Further, we can also conjecture that the interaction complexity may be small on instances where horizon-free expert sample complexity holds.}


% While our results have several desirable features, they also have shortcomings. One limitation is that we mainly consider the tabular case in this paper. In Section \ref{subsec:tail_state_abstraction}, we present an extension of TAIL in the state abstraction setting, which corresponds to a special case of function approximation. In the future, we will investigate how to extend our results to the setting with general function approximation.     
% References
% \bibliographystyle{abbrvnat}
% \newpage 
\bibliography{xu_380}
% \input{UAI-2023/xu_380-supp}
% \input{appendix/appendix.tex}
\end{document}
