\documentclass[accepted]{uai2022} % for initial submission
% \documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \bibpunct{(}{)}{;}{a}{,}{,}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
%\usepackage{tikz} % nice language for creating drawings and diagrams


\usepackage{textcase}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)




%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\usepackage{amsmath, amsthm, amssymb, amsfonts, mathtools, graphicx, enumitem}
\usepackage{algorithm,algorithmic}
\renewcommand{\algorithmicrequire}{\textbf{Input:}}
\renewcommand{\algorithmicensure}{\textbf{Output:}}
\usepackage{mkolar_definitions}

\newtheorem*{theorem*}{Theorem}

\usepackage{comment}



%%%% Drawing
\usepackage{tikz}
\usepackage{bbm}
\usetikzlibrary{automata, arrows}
\usetikzlibrary{positioning}


\usepackage{xr}
\makeatletter
\newcommand*{\addFileDependency}[1]{% argument=file name and extension
	\typeout{(#1)}
	\@addtofilelist{#1}
	\IfFileExists{#1}{}{\typeout{No file #1.}}
}
\makeatother

\newcommand*{\myexternaldocument}[1]{%
	\externaldocument{#1}%
	\addFileDependency{#1.tex}%
	\addFileDependency{#1.aux}%
}
\myexternaldocument{chen_628}



%%%% cref, Cref
\usepackage[capitalise,nameinlink]{cleveref}
\Crefname{equation}{Eq.}{Eqs.}
\Crefname{assumption}{Assumption}{Assumptions}
\Crefname{condition}{Condition}{Conditions}


%%% Allow math equation to cross pages
\allowdisplaybreaks


\newcommand{\defeq}{:=}
%\newcommand{\gapmin}{\mathrm{gap}_{\mathrm{min}}}
\newcommand{\gapmin}{C_\mathrm{gap}}
\newcommand{\gap}{\mathrm{gap}}
\newcommand{\cgap}{C_\mathrm{gap}}
\newcommand{\gapq}{\mathrm{gap}(Q^*)}
\newcommand{\estat}{\varepsilon_{\mathrm{stat},n}}

%\newcommand{\mainalg}{\text{AlgName}\xspace}
%\newcommand{\algunknown}{\text{AlgNameUnknown}\xspace}


\hypersetup{
  colorlinks   = true, %Colours links instead of ugly boxes
  urlcolor     = blue, %Colour for external hyperlinks
  linkcolor    = blue, %Colour of internal links
  citecolor   = blue  %Colour of citations
}

\newcount\Comments  % 0 suppresses notes to selves in text
\Comments=0 % TODO: change to 0 for final version
\definecolor{darkred}{rgb}{0.7,0,0}
\definecolor{darkgreen}{rgb}{0,0.5,0}
\definecolor{orange}{rgb}{0.7,0.4,0}
\definecolor{purple}{rgb}{0.8,0.0,0.8}
%\newcommand{\kibitz}[2]{\ifnum\Comments=1{\textcolor{#1}{\textsf{\footnotesize #2}}}\fi}
\newcommand{\jc}[1]{\textcolor{orange}{[JC: #1]}}
\newcommand{\nj}[1]{\textcolor{red}{[NJ: #1]}}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\title{Offline Reinforcement Learning Under Value and \\ Density-Ratio Realizability: The Power of Gaps \\ (Supplementary Material)}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author{Jinglin Chen, Nan Jiang}
%\author{Jinglin Chen}
%\author[1]{Nan Jiang\thanks{jinglinc@illinois.edu, nanjiang@illinois.edu}}
% Add affiliations after the authors
\affil{%
    Department of Computer Science\\
    University of Illinois Urbana-Champaign\\
    Urbana, IL, USA
    
    %\textrm{jinglinc@illinois.edu, nanjiang@illinois.edu}
}
%\affil[2]{%
%    Second Affiliation\\
%    Address\\
%    …
%}
%\affil[3]{%
%    Another Affiliation\\
%    Address\\
%    …
%  }
  
\begin{document}
\onecolumn

\maketitle

\appendix

\section{Proof of Main Results}
In this section, we provide the complete proofs of our main results in \pref{sec:main}. We start with some helper lemmas in \pref{app:helper_lemma_main}. Then we show the proof of \pref{thm:find_v_star} in \pref{app:proof_find_v_star}. Finally, we provide the proof of \pref{thm:main} in \pref{app:proof_main}.

\subsection{Helper Lemmas}
\label{app:helper_lemma_main}
\begin{lemma}[Concentration]
\label{lem:conc}
With probability at least $1-\delta$, for any $f\in\Fcal,w\in\Wcal,h\in[H]$ we have,
\[
\abr{\Lcal_{\Dcal}(f,w,h)-\EE[\Lcal_{\Dcal}(f,w,h)]}\le 2CH\sqrt{\frac{\log(2|\Fcal||\Wcal|H/\delta)}{2n}}=:\estat.
\]
\end{lemma}
\paragraph{Remark}
Here we apply Hoeffding's inequality to show the concentration result. Similar to \citet{xie2020q}, we can also apply Bernstein's inequality, but the dominating rate would be the same.
%\end{remark}
\begin{proof}
Firstly, we fix
$f\in\Fcal,w\in\Wcal,h\in[H]$. From the boundedness assumptions (\pref{assum:bound_q} and \pref{assum:bound_w}), for any sample $(x_h^{(i)},a_h^{(i)},r_h^{(i)},x_{h+1}^{(i)})$ in the dataset, we have
\begin{align*}
\abr{w_h(x_h^{(i)},a_h^{(i)})(f_h(x_h^{(i)},a_h^{(i)})-r_h^{(i)}- f_{h+1}(x_{h+1}^{(i)},\pi_f(x_{h+1}^{(i)})))}\le CH.
\end{align*}
Then since our dataset is i.i.d., applying Hoeffding's inequality yields that with probability at least $1-\delta/(|\Fcal||\Wcal|H)$,
\begin{align*}
\abr{\Lcal_{\Dcal}(f,w,h)-\EE[\Lcal_{\Dcal}(f,w,h)]}\le 2CH\sqrt{\frac{\log(2|\Fcal||\Wcal|H/\delta)}{2n}}.
\end{align*}
Finally, union bounding over $f\in\Fcal,w\in\Wcal,h\in[H]$ gives us that with probability at least $1-\delta$, for any $f\in\Fcal,w\in\Wcal,h\in[H]$,
\[
\abr{\Lcal_{\Dcal}(f,w,h)-\EE[\Lcal_{\Dcal}(f,w,h)]}\le 2CH\sqrt{\frac{\log(2|\Fcal||\Wcal|H/\delta)}{2n}}=:\estat.
\]
This completes the proof.
\end{proof}


\begin{lemma}[Population loss and average Bellman error]
	\label{lem:trans}
For any $f\in\Fcal,w\in\Wcal,h\in[H]$, we have
\[
\EE[\Lcal_{\Dcal}(f,w,h)]=\EE_{(x_h,a_h)\sim d^D_h}[w_h(x_h,a_h)(f_h(x_h,a_h)- (\Tcal_h f_{h+1})(x_h,a_h))]
	\]
and
\[
\EE[\Lcal_{\Dcal}(f,w^*,h)]=\Ecal(f,\pi^*,h)=\EE[f_h(x_h,a_h)-R_h(x_h,a_h)-f_{h+1}(x_{h+1},a_{h+1})\mid a_{0:h}\sim\pi^*, a_{h+1}\sim \pi_f],
	\]
where $\Ecal(\cdot)$ is the Q-type average Bellman error \citep{jin2021bellman,du2021bilinear} \[\Ecal(f,\pi,h)=\EE[f_h(x_h,a_h)-R_h(x_h,a_h)-f_{h+1}(x_{h+1},a_{h+1})\mid a_{0:h}\sim\pi, a_{h+1}\sim \pi_f]. 	\]
\end{lemma}
\begin{proof}
These equations can be simply shown from the data generating process and the definition of population loss and empirical loss. For any $f\in\Fcal,w\in\Wcal,h\in[H]$, we have
\begin{align*}
	&~\EE[\Lcal_{\Dcal}(f,w,h)]\\
	=&~\EE\sbr{\frac{1}{n}\sum_{i=1}^n[w_h(x_h^{(i)},a_h^{(i)})(f_h(x_h^{(i)},a_h^{(i)})-r_h^{(i)}-f_{h+1}(x_{h+1}^{(i)},\pi_f(x_{h+1}^{(i)})))]}
	\\
	=&~\EE_{(x_h,a_h)\sim d^D_h,x_{h+1}\sim P_h(\cdot\mid x_h,a_h)}[w_h(x_h,a_h)(f_h(x_h,a_h)-r_h-f_{h+1}(x_{h+1},\pi_f(x_{h+1})))]
	\\
	=&~\EE_{(x_h,a_h)\sim d^D_h}[w_h(x_h,a_h)(f_h(x_h,a_h)-R_h(x_h,a_h)-\EE_{x_{h+1}\sim P_h(\cdot\mid x_h,a_h)}[f_{h+1}(x_{h+1},\pi_f(x_{h+1}))])]
	\\
	=&~\EE_{(x_h,a_h)\sim d^D_h}[w_h(x_h,a_h)(f_h(x_h,a_h)- (\Tcal_h f_{h+1})(x_h,a_h))].
\end{align*}

For any $f\in\Fcal,h\in[H]$, we similarly have
\begin{align*}
	\EE[\Lcal_{\Dcal}(f,w^*,h)]&=~\EE\sbr{\frac{1}{n}\sum_{i=1}^n[w_h^*(x_h^{(i)},a_h^{(i)})(f_h(x_h^{(i)},a_h^{(i)})-r_h^{(i)}-f_{h+1}(x_{h+1}^{(i)},\pi_f(x_{h+1}^{(i)})))]}
	\\
	&=~\EE_{(x_h,a_h)\sim d^D_h,x_{h+1}\sim P_h(\cdot\mid x_h,a_h)}[w_h^*(x_h,a_h)(f_h(x_h,a_h)-r_h-f_{h+1}(x_{h+1},\pi_f(x_{h+1})))]
	\\
	&=~\EE_{(x_h,a_h)\sim d^*_h,x_{h+1}\sim P_h(\cdot\mid x_h,a_h)}[f_h(x_h,a_h)-r_h-f_{h+1}(x_{h+1},\pi_f(x_{h+1}))]
	\\
	&=~\EE[f_h(x_h,a_h)-R_h(x_h,a_h)-f_{h+1}(x_{h+1},a_{h+1})\mid a_{0:h}\sim\pi^*, a_{h+1}\sim \pi_f].
\end{align*}
This completes the proof.
\end{proof}

\subsection{Proof of Theorem~\ref{thm:find_v_star}}
\label{app:proof_find_v_star}
\begin{theorem*}[Sample complexity of identifying $v^*$, restatement of \pref{thm:find_v_star}]
%\label{thm:find_v_star}
Suppose  \pref{assum:realizablity_q}, \pref{assum:realizablity_w}, \pref{assum:bound_q}, \pref{assum:bound_w} hold and the total number of samples $nH$ satisfies \[nH\ge \frac{8C^2H^5\log(2|\Fcal||\Wcal|H/\delta)}{\varepsilon^2}.
\]
Then with probability at least $1-\delta$, running \pref{alg:pess_alg} with $\gapmin=0$ and $\alpha=\varepsilon/(2H)$ guarantees 
\[|V_{\hat f}(x_0)-v^*|\le \varepsilon.
\]
\end{theorem*}

\begin{proof}
From our choice of $n$ and \pref{lem:conc}, with probability at least $1-\delta$, for any $f\in\Fcal,w\in\Wcal,h\in[H]$, we have
\[\abr{\Lcal_{\Dcal}(f,w,h)-\EE[\Lcal_{\Dcal}(f,w,h)]}\le\estat\le\varepsilon/(2H).\]
Throughout the proof, we condition on this high probability event. 

From \pref{lem:trans}, for any $w\in\Wcal,h\in[H]$, we have
\begin{align*}
\EE[\Lcal_{\Dcal}(Q^*,w,h)]&=~\EE_{(x_h,a_h)\sim d^D_h}[w_h(x_h,a_h)(Q^*_h(x_h,a_h)-(\Tcal_h Q^*_{h+1})(x_h,a_h))]
\\
&=~\EE_{(x_h,a_h)\sim d^D_h}[w_h(x_h,a_h)\cdot 0]%\tag{Bellman optimality equation}
\\
&=~0.
\end{align*}
Therefore, we further have
\[
\Lcal_\Dcal(Q^*,w,h)\le \EE[\Lcal_\Dcal(Q^*,w,h)]+\estat\le\varepsilon/(2H)=\alpha,
\]
which means $Q^*$ satisfies all the constraints.

Then we show that any value function satisfying all constraints (though it may have large average Bellman errors under some distributions) cannot be much more pessimistic than $Q^*$. 

From \pref{lem:conc} and \pref{lem:trans}, we know that for any $f\in\Fcal,h\in[H]$,
\begin{align*}
&~\abr{\Ecal(f,\pi^*,h)}
\\
=&~|\EE[f_h(x_h,a_h)-R_h(x_h,a_h)-f_{h+1}(x_{h+1},a_{h+1})\mid a_{0:h}\sim\pi^*, a_{h+1}\sim \pi_f]|
\\
=&~|\EE[\Lcal_{\Dcal}(f,w^*,h)]|
\\
\le&~ \Lcal_{\Dcal}(f,w^*,h)+\estat
\\
\le&~ \alpha+\estat\le \varepsilon/H.
\end{align*}
Therefore, we have
\begin{align*}
V_f(x_0)&=~f_0(x_0,\pi_f(x_0))
\\
&\ge~f_0(x_0,\pi^*(x_0))
\\
&\ge~\EE[R_0(x_0,a_0)+f_1(x_1,a_1)\mid a_{0}\sim\pi^*,a_{1}\sim\pi_f]- \varepsilon/H \tag{$|\Ecal(f,\pi^*,0)|\le \varepsilon/H$}
\\
&\ge~\EE[R_0(x_0,a_0)\mid a_{0}\sim\pi^*]+\EE[f_1(x_1,a_1)\mid a_{0:1}\sim\pi^*] - \varepsilon/H
\\
&\ge~\EE[R_0(x_0,a_0)\mid a_{0}\sim\pi^*]+\EE[R_1(x_1,a_1)+f_2(x_2,a_2)\mid a_{0:1}\sim\pi^*,a_2\sim \pi_f]-2 \varepsilon/H \tag{$|\Ecal(f,\pi^*,1)|\le \varepsilon/H$}
\\
&\ge~\ldots\\
&\ge~\EE\left[\sum_{h=0}^{H-1}R_h(x_h,a_h)\mid a_{0:H-1}\sim\pi^*\right]-H\times  \varepsilon/H=V^*_0(x_0)- \varepsilon.
\end{align*}
Combining the two arguments above, we know that the pessimistic value function $\hat f$ found by the algorithm satisfies \[v^*-\varepsilon=V^*_0(x_0)-\varepsilon  \le V_{\hat f}(x_0) \le V^*_0(x_0)=v^*,\]
where the second inequality is due to pessimism. This completes the proof.
\end{proof}

\subsection{Proof of Theorem~\ref{thm:main}}
\label{app:proof_main}
\begin{theorem*}[Sample complexity of learning a near-optimal policy, restatement of \pref{thm:main}]
%\label{thm:main}
	Suppose  \pref{assum:realizablity_q}, \pref{assum:realizablity_w}, \pref{assum:bound_q}, \pref{assum:bound_w}, \pref{assum:gap_plus} hold and the total number of samples $nH$ satisfies 
	\[nH\ge \frac{8C^2H^7\log(2|\Fcal||\Wcal|H/\delta)}{\varepsilon^2 \gapq^2}.
	\]
	Then with probability at least $1-\delta$, running \pref{alg:pess_alg} with $\alpha=\varepsilon\gapq/(2H^2)$ and $\gapmin=\gapq$ guarantees 
	\[
	v^{\pi_{\hat f}} \ge v^*-\varepsilon.
	\]
\end{theorem*}

\begin{proof}	
From our choice of $n$ and \pref{lem:conc}, we know that with probability at least $1-\delta$, for any $f\in\Fcal,w\in\Wcal,h\in[H]$, we have
\[\abr{\Lcal_{\Dcal}(f,w,h)-\EE[\Lcal_{\Dcal}(f,w,h)]}\le\estat\le\varepsilon\gapq/(2H^2).\]
Throughout the proof, we condition on this high probability event. 

From the definition of $\gapq$, we know that prescreening will not eliminate $Q^*$, i.e., $Q^*\in\Fcal(\gapq)$. Then, similar to the proof of \pref{thm:find_v_star}, we have 
\[
\Lcal_\Dcal(Q^*,w,h)\le \EE[\Lcal_\Dcal(Q^*,w,h)]+\estat=\estat\le\varepsilon\gapq/(2H^2)=\alpha,
\]
which means that $Q^*$ satisfies all the constraints.


For any $f\in\Fcal(\gapq)$ that satisfies all the constraints and any $h\in[H]$, we have
\begin{align*}
&~\abr{\Ecal(f,\pi^*,h)}
\\
=&~|\EE[f_h(x_h,a_h)-R_h(x_h,a_h)-f_{h+1}(x_{h+1},a_{h+1})\mid a_{0:h}\sim\pi^*, a_{h+1}\sim \pi_f]|
\\
=&~|\EE[\Lcal_{\Dcal}(f,w^*,h)]|
\\
\le&~ \Lcal_{\Dcal}(f,w^*,h)+\estat
\\
\le&~\alpha+\estat
\\
\le&~\varepsilon\gapq/H^2.%\le \varepsilon\gapmin/H^2.
\end{align*}
Now we have the following stronger result compared with the proof of \pref{thm:find_v_star}
\begin{align*}
	&~V_f(x_0)\\
	=&~f_0(x_0,\pi_f(x_0))
	\\
	\ge&~f_0(x_0,\pi^*(x_0)) + \gapq \one\{\pi_f(x_0)\neq \pi^*(x_0)\}
	\\
	\ge&~\EE[R_0(x_0,a_0)+f_1(x_1,a_1)\mid a_{0}\sim\pi^*,a_{1}\sim\pi_f]\\
	&\quad + \gapq \one\{\pi_f(x_0)\neq \pi^*(x_0)\}-\varepsilon \gapq/H^2 \tag{$|\Ecal(f,\pi^*,0)|\le \varepsilon \gapq/H^2$}
	\\
	\ge&~\EE[R_0(x_0,a_0)\mid a_{0}\sim\pi^*]+\EE[f_1(x_1,\pi^*(x_1))+\gapq \one\{\pi_f(x_1)\neq \pi^*(x_1)\}\mid a_{0}\sim\pi^*] \\
	&\quad + \gapq \one\{\pi_f(x_0)\neq \pi^*(x_0)\}-\varepsilon \gapq/H^2
	\\
	=&~\EE[R_0(x_0,a_0)\mid a_{0}\sim\pi^*]+\EE[f_1(x_1,a_1)\mid a_{0:1}\sim\pi^*]+\gapq\EE[\one\{\pi_f(x_1)\neq \pi^*(x_1)\}\mid a_{0}\sim\pi^*] \\
	&\quad + \gapq \one\{\pi_f(x_0)\neq \pi^*(x_0)\}-\varepsilon \gapq/H^2
	\\
	\ge&~\EE[R_0(x_0,a_0)\mid a_{0}\sim\pi^*]+\EE[R_1(x_1,a_1)+f_2(x_2,a_2)\mid a_{0:1}\sim\pi^*,a_2\sim \pi_f]\\
	& \quad  + \gapq [\one\{\pi_f(x_0)\neq \pi^*(x_0)\}+\EE[\one\{\pi_f(x_1)\neq \pi^*(x_1)\}\mid a_{0}\sim \pi^*]]
	\\
	& \quad-2\varepsilon \gapq/H^2 \tag{$|\Ecal(f,\pi^*,1)|\le \varepsilon \gapq/H^2$}
	\\
	\ge&~\ldots\\
	\ge&~\EE\left[\sum_{h=0}^{H-1}R_h(x_h,a_h)\mid a_{0:H-1}\sim\pi^*\right]+\gapq \EE\left[\sum_{h=0}^{H-1}\one\{\pi_f(x_h)\neq \pi^*(x_h)\}\mid a_{0:H-1}\sim\pi^*\right]\\
	&\quad-H\times \varepsilon \gapq/H^2\\
	=&~V^*_0(x_0)+\gapq \EE\left[\sum_{h=0}^{H-1}\one\{\pi_f(x_h)\neq \pi^*(x_h)\}\mid a_{0:H-1}\sim\pi^*\right]- \varepsilon \gapq/H.
\end{align*}
This implies the pessimistic value function $\hat f$ found by \pref{alg:pess_alg} satisfies
\[
	V^*_0(x_0) \ge V_{\hat f}(x_0)\ge V^*_0(x_0)+\gapq \EE\left[\sum_{h=0}^{H-1}\one\{\pi_{\hat f}(x_h)\neq \pi^*(x_h)\}\mid a_{0:H-1}\sim\pi^*\right]- \varepsilon \gapq/H
\]
and thus
\begin{equation}
\label{eq:sl_error}
\EE\left[\sum_{h=0}^{H-1}\one\{\pi_{\hat f}(x_h)\neq \pi^*(x_h)\}\mid a_{0:H-1}\sim\pi^*\right]\le \varepsilon/H.
\end{equation}


On the other hand, define each trajectory $\tau$ as $(x_0,a_0,r_0,\ldots,x_{H-1},a_{H-1},r_{H-1},x_H)$, the return of $\tau$ as $\mathrm{Return}(\tau)=r_0+\ldots+r_{H-1}$, and the probability of $\tau$ under policy $\pi$ (i.e., $a_h=\pi(x_h),\forall h\in[H]$) as $\Pr\nolimits_\pi(\tau)$. For any  $f\in\Fcal$, we can decompose the entire trajectory space into three disjoint sets $\Ccal_1=\{\tau=(x_0,a_0,r_0,\ldots,x_{H-1},a_{H-1},r_{H-1},x_H):\forall h\in[H],a_h=\pi^*(x_h)=\pi_f(x_h)\}$,  $\Ccal_2=\{\tau=(x_0,a_0,r_0,\ldots,x_{H-1},a_{H-1},r_{H-1},x_H):\forall h\in[H],a_h=\pi^*(x_h),\exists h\in[H],\pi_f(x_h)\neq\pi^*(x_h)\}$, $\Ccal_3=(\Ccal_1\bigcup\Ccal_2)^\complement$.

Then we calculate $V^{\pi^*}$ and $V^{\pi_f}$ with the definition of these three sets 
\begin{align*}
	V^{\pi^*}_0(x_0)&=~\sum_{\tau\in\Ccal_1\bigcup\Ccal_2\bigcup\Ccal_3}\Pr\nolimits_{\pi^*}(\tau)\text{Return}(\tau)
	\\
	&=~\sum_{\tau\in\Ccal_1}\Pr\nolimits_{\pi^*}(\tau)\text{Return}(\tau)+\sum_{\tau\in\Ccal_2}\Pr\nolimits_{\pi^*}(\tau)\text{Return}(\tau) \tag{Because  $\pi^*$ is greedy policy, any trajectory $\tau\in\Ccal_3$ has 0 probability}
	\\
	&=~\sum_{\tau\in\Ccal_1}\Pr\nolimits_{\pi_f}(\tau)\text{Return}(\tau)+\sum_{\tau\in\Ccal_2}\Pr\nolimits_{\pi^*}(\tau)\text{Return}(\tau) \tag{Definition of $\Ccal_1$}
	\\
	&\le~\sum_{\tau\in\Ccal_1}\Pr\nolimits_{\pi_f}(\tau)\text{Return}(\tau) +\sum_{\tau\in\Ccal_2}\Pr\nolimits_{\pi^*}(\tau)H \tag{$\text{Return}(\tau)\le H$}
	\\
	&\le~\sum_{\tau\in\Ccal_1\bigcup\Ccal_2\bigcup\Ccal_3}\Pr\nolimits_{\pi_f}(\tau)\text{Return}(\tau) +\sum_{\tau\in\Ccal_2}\Pr\nolimits_{\pi^*}(\tau)H \tag{$\text{Return}(\tau)\ge 0$}
	\\
	&=~V^{\pi_f}_0(x_0) +\sum_{\tau\in\Ccal_2}\Pr\nolimits_{\pi^*}(\tau)H.
\end{align*}
It remains to show that $\Pr\nolimits_{\pi^*}(\Ccal_2)=\sum_{\tau\in\Ccal_2}\Pr\nolimits_{\pi^*}(\tau)$ is small. From the definition, any trajectory $\tau=(x_0,a_0,r_0,\ldots,x_{H-1},a_{H-1},r_{H-1},x_H)\in \Ccal_2$ satisfies that $\forall h\in[H], a_h=\pi^*(x_h)$ and $\exists h\in[H], a_h\neq \pi_f(x_h)$. Then for any $\tau\in\Ccal_2$, we can find a unique index $h'\in[H]$ such that $a_0=\pi^*(x_0)=\pi_f(x_0),\ldots,a_{h'-1}=\pi^*(x_{h'-1})=\pi_f(x_{h'-1})$, $a_{h'}= \pi^*(x_{h'})\neq \pi_f(x_{h'})$ (i.e., $h'$ is the smallest index at which $\pi_f$ differs from $\pi^*$ in trajectory $\tau$). This implies that $\Ccal_2\subseteq \bigcup_{h'=0}^{H-1}\Ccal_2^{h'}$, where $\Ccal^{h'}_2=\{\tau=(x_0,a_0,r_0,\ldots,x_{H-1},a_{H-1},r_{H-1},x_H):a_0=\pi^*(x_0)=\pi_f(x_0),\ldots,a_{h'-1}=\pi^*(x_{h'-1})=\pi_f(x_{h'-1})$, $a_{h'}= \pi^*(x_{h'})\neq \pi_f(x_{h'})\}$. Since $\Pr\nolimits_{\pi^*}(\Ccal_2^{h'})\le\EE[\one\{\pi_f(x_{h'})\neq \pi^*(x_{h'})\}\mid a_{0:h'-1}\sim\pi^*]$ (the event $\Ccal_2^{h'}$ additionally requires that $\pi_f$ agrees with $\pi^*$ at all earlier steps), we have 
\begin{align*}
\sum_{\tau\in\Ccal_2}\Pr\nolimits_{\pi^*}(\tau)\le\sum_{h'=0}^{H-1}\sum_{\tau\in\Ccal_2^{h'}}\Pr\nolimits_{\pi^*}(\tau)&\le~\sum_{h=0}^{H-1}\EE\left[\one\{\pi_{f}(x_h)\neq \pi^*(x_h)\}\mid a_{0:h-1}\sim\pi^*\right]\\
&=~\EE\left[\sum_{h=0}^{H-1}\one\{\pi_{f}(x_h)\neq \pi^*(x_h)\}\mid a_{0:H-1}\sim\pi^*\right].
\end{align*}
Finally, combining all the results above gives us
\begin{align}
\label{eq:final_err}
	V^{\pi_{\hat f}}_0(x_0)&~\ge V^*_0(x_0)-\sum_{\tau\in\Ccal_2}\Pr\nolimits_{\pi^*}(\tau)H \notag
	\\
	&~\ge V^*_0(x_0)-H\EE\left[\sum_{h=0}^{H-1}\one\{\pi_{\hat f}(x_h)\neq \pi^*(x_h)\}\mid a_{0:H-1}\sim\pi^*\right] \notag
	\\
	&~\ge v^* - H \times \varepsilon/H =v^* - \varepsilon.
\end{align}
This completes the proof.
\end{proof}

\paragraph{Remark}
We notice that \Cref{eq:sl_error} is the error of supervised learning (SL) with 0/1 loss. Therefore, we can directly use the RL to SL reduction in the imitation learning literature (e.g., Theorem 2.1 in \citet{ross2010efficient}) to translate it to the final performance difference, which gives the same result as \Cref{eq:final_err}. The second part of our proof is different from the one in \citet{ross2010efficient} and is potentially easier to understand. We believe that it is also of independent interest.


\section{Proof of Robustness Results}
In this section, we provide the complete proofs for the misspecified cases in \pref{sec:appx_error}. We start with some helper lemmas in \pref{app:helper_lemma_approx}. Then we show the proof of \pref{thm:find_v_star_appx} in \pref{app:proof_find_v_star_approx} and the proof of \pref{thm:main_appx} in \pref{app:proof_main_approx}.

\subsection{Helper Lemmas}
\label{app:helper_lemma_approx}
\begin{lemma}[Population loss bound for approximately realizable $\Wcal$]
\label{lem:appx_w}
Recall that the definitions of $\varepsilon_\Wcal$ and $\tilde w^*$ are
\begin{align*}
\varepsilon_{\Wcal}=~\min_{w\in\Wcal}\max_{f\in\Fcal}\max_{h\in[H]}\abr{\EE_{d^D_h}[w_h\cdot (f_h-\Tcal_h f_{h+1})] -\EE_{d^*_h}[f_h-\Tcal_h f_{h+1}]}
\end{align*}
and 
\begin{align*}
\tilde w^*=~\argmin_{w\in\Wcal}\max_{f\in\Fcal}\max_{h\in[H]}\abr{\EE_{d^D_h}[w_h\cdot (f_h-\Tcal_h f_{h+1})] -\EE_{d^*_h}[f_h-\Tcal_h f_{h+1}]}.
\end{align*}
For any $f\in\Fcal,h\in[H]$, we have
\begin{align*}
\abr{\Ecal(f,\pi^*,h)} \le \abr{\EE[\Lcal_{\Dcal}(f,\tilde w^*,h)]} + \varepsilon_\Wcal,
\end{align*}
where $\Ecal(\cdot)$ is the Q-type average Bellman error \citep{jin2021bellman,du2021bilinear} \[\Ecal(f,\pi,h)=\EE[f_h(x_h,a_h)-R_h(x_h,a_h)-f_{h+1}(x_{h+1},a_{h+1})\mid a_{0:h}\sim\pi, a_{h+1}\sim \pi_f]. 	\]
\end{lemma}
\begin{proof}
For any $f\in\Fcal,h\in[H]$, we have
\begin{align*}
&~\abr{\Ecal(f,\pi^*,h)}
\\
=&~\abr{\EE[f_h(x_h,a_h)-R_h(x_h,a_h)-f_{h+1}(x_{h+1},a_{h+1})\mid a_{0:h}\sim\pi^*, a_{h+1}\sim \pi_f]}
\\
=&~\abr{\EE_{(x_h,a_h)\sim d^*_h,x_{h+1}\sim P_h(\cdot\mid x_h,a_h)}[f_h(x_h,a_h)-R_h(x_h,a_h)- f_{h+1}(x_{h+1},\pi_f(x_{h+1}))]}
	\\
=&~\abr{\EE_{(x_h,a_h)\sim d^*_h}[f_h(x_h,a_h)-(\Tcal_h f_{h+1})(x_h,a_h)]}
\\
=&~\abr{\EE_{d^*_h}[f_h-\Tcal_h f_{h+1}]}
\\
%=&~\abr{\EE_{(x_h,a_h)\sim d^D_h}[w_h^*(x_h,a_h)(f(x_h,a_h)-R_h(x_h,a_h)- \EE_{x_{h+1}\sim P_h(\cdot\mid x_h,a_h)}f(x_{h+1},\pi_f(x_{h+1})))]}
%\\
%=&~\abr{\EE_{(x_h,a_h)\sim d^D_h}[w_h^*(x_h,a_h)(f(x_h,a_h)-(\Tcal_h f_{h+1})(x_h,a_h))]}
%\\
\le&~\abr{\EE_{d^D_h}[\tilde w_h^*\cdot(f_h-\Tcal_h f_{h+1})]}+\abr{\EE_{d^D_h}[\tilde w_h^*\cdot (f_h-\Tcal_h f_{h+1})] -\EE_{d^*_h}[f_h-\Tcal_h f_{h+1}]} 
\\
\le&~\abr{\EE[\Lcal_{\Dcal}(f,\tilde w^*,h)]} + \varepsilon_\Wcal,
\end{align*}
which completes the proof.
\end{proof}

\begin{lemma}[$\varepsilon_\Fcal$ is weaker than $\ell_\infty$ approximation error] 
\label{lem:appx_f_vs_infty}
Recall that the definitions of $\varepsilon_\Fcal$ and $\tilde Q_{\Fcal}^*$ are
\[
\varepsilon_{\Fcal}=\min_{f\in\Fcal}\max_{w\in\Wcal}\max_{h\in[H]}\left(\abr{\EE_{d^D_h}[w_h\cdot (f_h -\Tcal_h f_{h+1})]}+\abr{f_0(x_0,\pi_f(x_0))-Q^*_0(x_0,\pi^*(x_0))}\right)
\]
and 
\[
\tilde Q_{\Fcal}^*=\argmin_{f\in\Fcal}\max_{w\in\Wcal}\max_{h\in[H]}\left(\abr{\EE_{d^D_h}[w_h\cdot (f_h -\Tcal_h f_{h+1})]}+\abr{f_0(x_0,\pi_f(x_0))-Q^*_0(x_0,\pi^*(x_0))}\right).
\]
Suppose additionally we have mild regularity assumptions on $\Wcal$, i.e., for any $w\in\Wcal,h\in[H]$, $\EE_{d^D_h}[w_h] = 1$ and $w_h\in (\Xcal\times\Acal\rightarrow[0,\infty))$.
Then we have
\[
\varepsilon_\Fcal \le 3 \min_{f\in\Fcal}\max_{h\in[H]}\|f_h-Q^*_h\|_\infty.
\]
\end{lemma}

\begin{proof}
For any $f\in\Fcal,w\in\Wcal,h\in[H]$, we have the following
\begin{align}
\label{eq:proof_loss_to_inf_tran}
&\abr{\EE_{d^D_h}[w_h\cdot (f_h -\Tcal_h f_{h+1})]}\notag\\
\le&~ \abr{\EE_{d^D_h}[w_h\cdot (f_h-Q^*_h -\Tcal_h f_{h+1}+\Tcal_h Q^*_{h+1})]}+\abr{\EE_{d^D_h}[w_h\cdot (Q^*_h -\Tcal_h Q^*_{h+1})]}
\notag\\
\le&~\abr{\EE_{d^D_h}[w_h\cdot (f_h-Q^*_h)]}+\abr{\EE_{d^D_h}[w_h\cdot (\Tcal_h f_{h+1}-\Tcal_h Q^*_{h+1})]}+0\notag\\
\le&~\EE_{d^D_h}[w_h\cdot\|f_h-Q^*_h\|_\infty]+\abr{\EE_{(x_h,a_h)\sim d^D_h,x_{h+1}\sim P_h(\cdot\mid x_h,a_h)}[w_h\cdot (f_{h+1}(x_{h+1},\pi_f(x_{h+1}))- Q^*_{h+1}(x_{h+1},\pi^*(x_{h+1})))]}
\notag\\
\le&~ \|f_h-Q^*_h\|_\infty + \EE_{(x_h,a_h)\sim d^D_h\cdot w_h,x_{h+1}\sim P_h(\cdot\mid x_h,a_h)}[|f_{h+1}(x_{h+1},\pi_f(x_{h+1}))- Q^*_{h+1}(x_{h+1},\pi^*(x_{h+1}))|],
%\\
%\le&~  \|f_h-Q^*_h\|_\infty + \EE_{(x_h,a_h)\sim d^D_h,x_{h+1}\sim P_h(\cdot\mid x_h,a_h)}[|f_{h+1}(x_{h+1},\pi_f(x_{h+1})- Q^*_{h+1}(x_{h+1},\pi^*(x_{h+1}))|]
\end{align}
where the last inequality is due to the $\EE_{d_h^D}[w_h]=1$ and $w_h\ge 0$.

Now, we bound the second term in \Cref{eq:proof_loss_to_inf_tran}. Using $\varepsilon'$ to denote $\max_{h\in[H]}\|f_h-Q^*_h\|_\infty$, we have
\begin{align*}
&~Q^*_{h+1}(x_{h+1},\pi^*(x_{h+1}))- \varepsilon'\le f_{h+1}(x_{h+1},\pi^*(x_{h+1}))\\
\le&~ f_{h+1}(x_{h+1},\pi_f(x_{h+1})) \le Q^*_{h+1}(x_{h+1},\pi_f(x_{h+1})) + \varepsilon'\le  Q^*_{h+1}(x_{h+1},\pi^*(x_{h+1}))+ \varepsilon'.
\end{align*}
This implies that
\begin{align*}
|f_{h+1}(x_{h+1},\pi_f(x_{h+1}))- Q^*_{h+1}(x_{h+1},\pi^*(x_{h+1}))|\le \varepsilon'=\max_{h\in[H]}\|f_h-Q^*_h\|_\infty.
\end{align*}
Therefore, we have
\begin{align*}
&\abr{\EE_{d^D_h}[w_h\cdot (f_h -\Tcal_h f_{h+1})]}\le  \|f_h-Q^*_h\|_\infty+\EE_{(x_h,a_h)\sim d^D_h\cdot w_h,x_{h+1}\sim P_h(\cdot\mid x_h,a_h)}[ \|f_{h+1}-Q^*_{h+1}\|_\infty].
\end{align*}
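Since $w_h\ge 0$ and $\EE_{d_h^D}[w_h]=1$, we know that $d^D_h\cdot w_h$ is a valid probability distribution over $(x_h,a_h)$, so $\EE_{(x_h,a_h)\sim d^D_h\cdot w_h,x_{h+1}\sim P_h(\cdot\mid x_h,a_h)}[\cdot]$ is an expectation with respect to a probability distribution over $x_{h+1}$. This implies that 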
\begin{align*}
&\abr{\EE_{d^D_h}[w_h\cdot (f_h -\Tcal_h f_{h+1})]}\le 2 \max_{h\in[H]}\|f_h-Q^*_h\|_\infty.
\end{align*}
Similarly, we have $\abr{f_0(x_0,\pi_f(x_0))-Q^*_0(x_0,\pi^*(x_0))}\le \max_{h\in[H]}\|f_h-Q^*_h\|_\infty$, thus
\[
\abr{\EE_{d^D_h}[w_h\cdot (f_h -\Tcal_h f_{h+1})]}+\abr{f_0(x_0,\pi_f(x_0))-Q^*_0(x_0,\pi^*(x_0))}\le 3\max_{h\in[H]}\|f_h-Q^*_h\|_\infty.
\]
Taking $\max$ over $h\in[H],w\in\Wcal$ and then taking $\min$ over $f\in\Fcal$ on both sides completes the proof.
\end{proof}




\subsection{Proof of Theorem~\ref{thm:find_v_star_appx}}
%\subsection{Robustness Result for Estimating the Optimal Expected Return}
\label{app:proof_find_v_star_approx}
\begin{theorem*}[Robust version of \pref{thm:find_v_star}, Restatement of \pref{thm:find_v_star_appx}]
%\label{thm:find_v_star_appx}
Suppose  \pref{assum:bound_q}, \pref{assum:bound_w} hold and the total number of samples $nH$ satisfies 
\[nH\ge \frac{8C^2H^5\log(2|\Fcal||\Wcal|H/\delta)}{\varepsilon ^2}.
\]
Then with probability at least $1-\delta$, running \pref{alg:pess_alg} with $\alpha=\varepsilon /(2H)+\varepsilon_{\Fcal}$ and $\gapmin=0$ guarantees 
\[|V_{\hat f}(x_0)-v^*| \le \varepsilon + H\varepsilon_{\Fcal}+H\varepsilon_\Wcal.
\]
\end{theorem*}


\begin{proof}
From \pref{lem:conc} and our choice $n\ge \frac{8C^2H^4\log(2|\Fcal||\Wcal|H/\delta)}{\varepsilon^2}$, with probability at least $1-\delta$, for any $f\in\Fcal,w\in\Wcal,h\in[H]$, we have
\[
\abr{\Lcal_{\Dcal}(f,w,h)-\EE[\Lcal_{\Dcal}(f,w,h)]}\le\estat\le \varepsilon /(2H).
\]
Throughout the proof, we will condition on this high probability event.

From \pref{lem:trans}, we have
\begin{align*}
|\EE[\Lcal_{\Dcal}(\tilde Q^*_{\Fcal},w,h)]|&=~\abr{\EE_{(x_h,a_h)\sim d^D_h}[w_h(x_h,a_h)(\tilde Q^*_{\Fcal,h}(x_h,a_h)-(\Tcal_h \tilde Q^*_{\Fcal,h+1})(x_h,a_h))]}
\\
&\le~\abr{\EE_{(x_h,a_h)\sim d^D_h}[w_h(x_h,a_h)(\tilde Q^*_{\Fcal,h}(x_h,a_h)-(\Tcal_h \tilde Q^*_{\Fcal,h+1})(x_h,a_h))]}\\
&\quad +\abr{\tilde Q^*_{\Fcal,0}(x_0,\pi_{\tilde Q^*_{\Fcal}}(x_0))-Q^*_0(x_0,\pi^*(x_0))}\\
&\le~ \varepsilon_{\Fcal}.
\end{align*}
When using the relaxed constraints by setting $\alpha=\varepsilon/(2H)+\varepsilon_{\Fcal}$, we can incorporate the approximation errors. More specifically, we have
\begin{align*}
\abr{\Lcal_\Dcal(\tilde Q^*_{\Fcal},w,h)}\le \abr{\EE[\Lcal_\Dcal(\tilde Q^*_{\Fcal},w,h)]}+\estat\le \varepsilon_{\Fcal} +\estat \le \varepsilon/(2H)+\varepsilon_{\Fcal}= \alpha,
\end{align*}
which implies that $\tilde Q^*_{\Fcal}$ will satisfy all constraints.

In addition, for any $f\in\Fcal$ that satisfies all constraints, we have that for any $w\in\Wcal,h\in[H]$,
\[
|\EE[\Lcal_{\Dcal}(f,w,h)]|\le\abr{\Lcal_{\Dcal}(f,w,h)}+\estat\le \alpha + \estat\le\varepsilon /H+\varepsilon_{\Fcal}.%+\varepsilon_\Wcal:=\varepsilon'.
\]
From \pref{lem:appx_w}, we further have
\[
|\Ecal(f,\pi^*,h)|\le\abr{\EE[\Lcal_{\Dcal}(f,\tilde w^*,h)]} + \varepsilon_\Wcal.
\]
Since $\tilde w^*\in\Wcal$, we get
\[
|\Ecal(f,\pi^*,h)|\le\abr{\EE[\Lcal_{\Dcal}(f,\tilde w^*,h)]} + \varepsilon_\Wcal\le\varepsilon /H+\varepsilon_{\Fcal}+\varepsilon_\Wcal=:\varepsilon'.
\]
Following the telescoping step in the proof of \pref{thm:find_v_star}, for any $f\in\Fcal$ that satisfies all constraints, we have
\begin{align*}
V_f(x_0)=f_0(x_0,\pi_f(x_0))\ge V^*_0(x_0)- H\varepsilon'.
\end{align*}
Therefore, we have 
\begin{align*}
V^*_0(x_0)+\varepsilon_{\Fcal}=Q^*_0(x_0,\pi^*(x_0)) + \varepsilon_{\Fcal}\ge \tilde Q^*_{\Fcal,0}(x_0,\pi_{\tilde Q^*_{\Fcal}}(x_0))\ge\hat f_0(x_0,\pi_{\hat f}(x_0))\ge V^*_0(x_0)- H\varepsilon',
\end{align*}
where the first inequality is due to the definition of approximation error $\varepsilon_{\Fcal}$ and the second inequality is due to pessimism.
This gives us
\[
|V_{\hat f}(x_0)-v^*| \le \max\{H\varepsilon', \varepsilon_{\Fcal}\}\le \varepsilon + H\varepsilon_{\Fcal}+H\varepsilon_\Wcal,
\]
which completes the proof.
\end{proof}

\subsection{Proof of Theorem~\ref{thm:main_appx}}
%\subsection{Robustness Result for Learning a Near-Optimal policy}
\label{app:proof_main_approx}
\begin{theorem*}[Robust version of \pref{thm:main}, restatement of \pref{thm:main_appx}]
%\label{thm:main_appx}
Suppose  \pref{assum:bound_q}, \pref{assum:bound_w} hold and the total number of samples $nH$ satisfies \[nH\ge \frac{8C^2H^7\log(2|\Fcal||\Wcal|H/\delta)}{\varepsilon^2 \gapmin^2}.\] Then with probability at least $1-\delta$, running \pref{alg:pess_alg} with a user-specified $\gapmin$ and $\alpha=\varepsilon \gapmin/(2H^2)+\varepsilon_{\Fcal(\gapmin)}$ guarantees \[v^{\pi_{\hat f}} \ge v^*-\varepsilon  - \frac{(H^2+H)\varepsilon_{\Fcal(\gapmin)}+H^2\varepsilon_\Wcal}{\gapmin}.\]
\end{theorem*}


\begin{proof}
From \pref{lem:conc} and our choice $n\ge \frac{8C^2H^6\log(2|\Fcal||\Wcal|H/\delta)}{\varepsilon^2 \gapmin^2}$, with probability at least $1-\delta$, for any $f\in\Fcal,w\in\Wcal,h\in[H]$, we have
\[
\abr{\Lcal_{\Dcal}(f,w,h)-\EE[\Lcal_{\Dcal}(f,w,h)]}\le\estat\le \varepsilon \gapmin/(2H^2).
\]
Throughout the proof, we will condition on this high probability event.

From \pref{lem:trans}, we have
\begin{align*}
|\EE[\Lcal_{\Dcal}(\tilde Q^*_{\Fcal(\gapmin)},w,h)]|&=~\abr{\EE_{(x_h,a_h)\sim d^D_h}[w_h(x_h,a_h)(\tilde Q^*_{\Fcal(\gapmin),h}(x_h,a_h)-(\Tcal_h \tilde Q^*_{\Fcal(\gapmin),h+1})(x_h,a_h))]}
\\
&\le~\abr{\EE_{(x_h,a_h)\sim d^D_h}[w_h(x_h,a_h)(\tilde Q^*_{\Fcal(\gapmin),h}(x_h,a_h)-(\Tcal_h \tilde Q^*_{\Fcal(\gapmin),h+1})(x_h,a_h))]}\\
&\quad +\abr{\tilde Q^*_{\Fcal(\gapmin),0}(x_0,\pi_{\tilde Q^*_{\Fcal(\gapmin)}}(x_0))-Q^*_0(x_0,\pi^*(x_0))}\\
&\le~ \varepsilon_{\Fcal(\gapmin)}.
\end{align*}
When using the relaxed constraints of $\alpha=\varepsilon \gapmin/(2H^2)+\varepsilon_{\Fcal(\gapmin)}$, we can incorporate the approximation errors. More specifically, we have
\begin{align*}
\abr{\Lcal_\Dcal(\tilde Q^*_{\Fcal(\gapmin)},w,h)}&\le~ \abr{\EE[\Lcal_\Dcal(\tilde Q^*_{\Fcal(\gapmin)},w,h)]}+\estat
\\
&\le~ \varepsilon_{\Fcal(\gapmin)} +\estat
\\
&\le~ \varepsilon \gapmin/(2H^2)+\varepsilon_{\Fcal(\gapmin)}= \alpha,
\end{align*}
which implies that $\tilde Q^*_{\Fcal(\gapmin)}$ will satisfy all constraints.

In addition, for any $f\in\Fcal(\gapmin)$ that satisfies all constraints, we have that for any $w\in\Wcal,h\in[H]$,
\[
|\EE[\Lcal_{\Dcal}(f,w,h)]|\le\abr{\Lcal_{\Dcal}(f,w,h)}+\estat\le \alpha + \estat\le\varepsilon \gapmin/H^2+\varepsilon_{\Fcal(\gapmin)}.%+\varepsilon_\Wcal:=\varepsilon'.
\]

From \pref{lem:appx_w}, we further have
\[
|\Ecal(f,\pi^*,h)|\le\abr{\EE[\Lcal_{\Dcal}(f,\tilde w^*,h)]} + \varepsilon_\Wcal.
\]
Since $\tilde w^*\in\Wcal$, we get
\[
|\Ecal(f,\pi^*,h)|\le\abr{\EE[\Lcal_{\Dcal}(f,\tilde w^*,h)]} + \varepsilon_\Wcal\le\varepsilon \gapmin/H^2+\varepsilon_{\Fcal(\gapmin)}+\varepsilon_\Wcal=:\varepsilon'.
\]

Since we run the algorithm on $\Fcal(\gapmin)$, the gap parameter will be $\gapmin$ instead of $\gapq$ in \pref{thm:main}. Following the proof of \pref{thm:main}, for any $f\in\Fcal(\gapmin),h\in[H]$ that satisfies all constraints, we have
\begin{align*}
V_f(x_0)=f_0(x_0,\pi_f(x_0))\ge Q^*_0(x_0,\pi^*(x_0))+\gapmin \EE\left[\sum_{h=0}^{H-1}\one\{\pi_f(x_h)\neq \pi^*(x_h)\}\mid a_{0:H-1}\sim\pi^*\right]- H\varepsilon'.
\end{align*}
Therefore, we have 
\begin{align*}
&~Q^*_0(x_0,\pi^*(x_0)) + \varepsilon_{\Fcal(\gapmin)}
\\
\ge&~ \tilde Q^*_{\Fcal(\gapmin),0}(x_0,\pi_{\tilde Q^*_{\Fcal(\gapmin)}}(x_0)) \tag{Definition of approximation error $\varepsilon_{\Fcal(\gapmin)}$}
\\
\ge&~ \hat f_0(x_0,\pi_{\hat f}(x_0)) \tag{Pessimism}
\\
\ge&~ Q^*_0(x_0,\pi^*(x_0))+\gapmin \EE\left[\sum_{h=0}^{H-1}\one\{\pi_{\hat f}(x_h)\neq \pi^*(x_h)\}\mid a_{0:H-1}\sim\pi^*\right]- H\varepsilon',
\end{align*}
which yields
\[
\EE\left[\sum_{h=0}^{H-1}\one\{\pi_{\hat f}(x_h)\neq \pi^*(x_h)\}\mid a_{0:H-1}\sim\pi^*\right]\le \rbr{H\varepsilon'  + \varepsilon_{\Fcal(\gapmin)}}/\gapmin.
\]
This translates to the performance difference bound of
\[
V^{\pi_{\hat f}}_0(x_0)\ge v^*-H\rbr{H\varepsilon' + \varepsilon_{\Fcal(\gapmin)}}/\gapmin\ge v^*-\varepsilon - \frac{(H^2+H)\varepsilon_{\Fcal(\gapmin)}+H^2\varepsilon_\Wcal}{\gapmin},
\]
which completes the proof.
\end{proof}

\subsection{Corollary from Theorem \ref{thm:main_appx}}
\label{sec:corr_main_appx}

\pref{thm:main_appx} gives us a convenient way to set the gap parameter $\gapmin$. We show that it can easily handle the case that $\ell_\infty$ approximation error of $\Fcal$ and $\gapq$ are known. We formally define $\ell_\infty$ approximation error and the corresponding best approximator w.r.t. $\Fcal$ as
\[\varepsilon_{\Fcal,\infty}=\min_{f\in\Fcal}\max_{h\in[H]}\|f_h-Q^*_h\|_\infty, \quad \tilde Q^*_{\Fcal,\infty}=\argmin_{f\in\Fcal}\max_{h\in[H]}\|f_h-Q^*_h\|_\infty.
\]
Similarly, we can define the version for $\Fcal(\gapq)$.

Then we have the following corollary.
\begin{corollary}[Corollary from \pref{thm:main_appx}]
\label{corr:main_appx}
Suppose \pref{assum:bound_q} and \pref{assum:bound_w} hold, and the weight function class satisfies the additional mild regularity assumptions stated in \pref{lem:appx_f_vs_infty}. Assume we are given $\varepsilon_{\Fcal,\infty},\gapq$ and $2\varepsilon_{\Fcal,\infty}<\gapq$. If the total number of samples $nH$ satisfies 
\[
nH\ge \frac{8C^2H^7\log(2|\Fcal||\Wcal|H/\delta)}{\varepsilon^2 (\gapq-2\varepsilon_{\Fcal,\infty})^2},
\]
then with probability $1-\delta$, running \pref{alg:pess_alg} with $\gapmin=\gapq-2\varepsilon_{\Fcal,\infty}$ and $\alpha=\varepsilon (\gapq-2\varepsilon_{\Fcal,\infty})/(2H^2)+2\varepsilon_{\Fcal,\infty}$  guarantees 
\[v^{\pi_{\hat f}} \ge v^*-\varepsilon  - \frac{(2H^2+H)\varepsilon_{\Fcal,\infty}+H^2\varepsilon_\Wcal}{\gapq-2\varepsilon_{\Fcal,\infty}}.\]
\end{corollary}

\begin{proof}
From the definition of $\gapq$, $\varepsilon_{\Fcal,\infty}$ and $\tilde Q^*_{\Fcal,\infty}$, we know that 
\[
\gap(\tilde Q^*_{\Fcal,\infty})\ge \gapq-2\varepsilon_{\Fcal,\infty}>0.
\]
Therefore, we have $\tilde Q^*_{\Fcal,\infty}\in \Fcal(\gapq-2\varepsilon_{\Fcal,\infty})$. Together with the definition that $\tilde Q^*_{\Fcal,\infty}$ is the best approximator of $Q^*$ within $\Fcal$ (under $\ell_\infty$ norm), we know that $\tilde Q^*_{\Fcal,\infty}$ is also the best approximator within $\Fcal(\gapq-2\varepsilon_{\Fcal,\infty})$ (under $\ell_\infty$ norm). This implies that \[\varepsilon_{\Fcal(\gapq-2\varepsilon_{\Fcal,\infty}),\infty}=\varepsilon_{\Fcal,\infty}.
\]
In addition, under the mild regularity assumptions stated in \pref{lem:appx_f_vs_infty}, applying \pref{lem:appx_f_vs_infty} tells us
\[
\varepsilon_{\Fcal(\gapq-2\varepsilon_{\Fcal,\infty})} \le 3 \min_{f\in\Fcal(\gapq-2\varepsilon_{\Fcal,\infty})}\max_{h\in[H]}\|f_h-Q^*_h\|_\infty=3\varepsilon_{\Fcal(\gapq-2\varepsilon_{\Fcal,\infty}),\infty}=3\varepsilon_{\Fcal,\infty}.
\]
The remaining part of the proof follows a similar approach as the proof of \pref{thm:main_appx}. Firstly, we have the $1-\delta$ high probability event that for any $f\in\Fcal,w\in\Wcal,h\in[H]$
\[
\abr{\Lcal_{\Dcal}(f,w,h)-\EE[\Lcal_{\Dcal}(f,w,h)]}\le\estat\le \varepsilon (\gapq-2\varepsilon_{\Fcal,\infty})/(2H^2).
\]
Then following the proof of \pref{lem:appx_f_vs_infty}, we have
\begin{align*}
|\EE[\Lcal_{\Dcal}(\tilde Q^*_{\Fcal,\infty},w,h)]|&=~\abr{\EE_{ d^D_h}[w_h\cdot (\tilde Q^*_{\Fcal,\infty,h}-\Tcal_h\tilde Q^*_{\Fcal,\infty,h+1})]}
\\
&\le~\abr{\EE_{d^D_h}[w_h\cdot (\tilde Q^*_{\Fcal,\infty,h}-Q^*_h)]}+\abr{\EE_{d^D_h}[w_h\cdot (\Tcal_h \tilde Q^*_{\Fcal,\infty,h+1}-\Tcal_h Q^*_{h+1})]}+0\notag\\
&\le~2\max_{h\in[H]}\|\tilde Q^*_{\Fcal,\infty,h}-Q^*_h\|_\infty=2\varepsilon_{\Fcal,\infty}.
\end{align*}
The empirical loss of $\tilde Q^*_{\Fcal,\infty}$ satisfies
\begin{align*}
\abr{\Lcal_\Dcal(\tilde Q^*_{\Fcal,\infty},w,h)}&\le~ \abr{\EE[\Lcal_\Dcal(\tilde Q^*_{\Fcal,\infty},w,h)]}+\estat
\\
&\le~ \varepsilon (\gapq-2\varepsilon_{\Fcal,\infty})/(2H^2)+2\varepsilon_{\Fcal,\infty}= \alpha,
\end{align*}
which implies that $\tilde Q^*_{\Fcal,\infty}$ will satisfy all constraints.

In addition, for any $f\in\Fcal(\gapq-2\varepsilon_{\Fcal,\infty})$ that satisfies all constraints, we have that for any $w\in\Wcal,h\in[H]$,
\[
|\EE[\Lcal_{\Dcal}(f,w,h)]|\le\abr{\Lcal_{\Dcal}(f,w,h)}+\estat\le \alpha + \estat\le\varepsilon (\gapq-2\varepsilon_{\Fcal,\infty})/H^2+2\varepsilon_{\Fcal,\infty}.%+\varepsilon_\Wcal:=\varepsilon'.
\]
Similarly, we further have
\[
|\Ecal(f,\pi^*,h)|\le\abr{\EE[\Lcal_{\Dcal}(f,\tilde w^*,h)]} + \varepsilon_\Wcal\le\varepsilon (\gapq-2\varepsilon_{\Fcal,\infty})/H^2+2\varepsilon_{\Fcal,\infty}+\varepsilon_\Wcal:=\varepsilon'.
\]
The final performance difference bound is
\[
V^{\pi_{\hat f}}_0(x_0)\ge v^*-H\rbr{H\varepsilon' + \varepsilon_{\Fcal,\infty} }/(\gapq-2\varepsilon_{\Fcal,\infty})\ge v^*-\varepsilon - \frac{(2H^2+H)\varepsilon_{\Fcal,\infty}+H^2\varepsilon_\Wcal}{\gapq-2\varepsilon_{\Fcal,\infty}},
\]
where the difference compared with the derivation in the proof of \pref{thm:main_appx} is that we use $\ell_\infty$ bound to get 
\[
Q_0^*(x_0,\pi^*(x_0))+\varepsilon_{\Fcal,\infty}\ge \tilde Q^*_{\Fcal,\infty,0}(x_0,\pi_{\tilde Q^*_{\Fcal,\infty}}(x_0)).
\]
This completes the proof.
\end{proof}




\section{Proof of the Unknown Gap Parameter Setting}
\label{app:proof_main_unknown_gap}
In this section, we present the formal proof of \pref{thm:main_unknown}. We start with a standard helper lemma in \pref{app:helper_lemma_unknown}, which shows the concentration result of Monte Carlo estimate. Then we show the proof of \pref{thm:main_unknown} in \pref{app:proof_main_unknown}.

\subsection{A Helper Lemma}
\label{app:helper_lemma_unknown}
\begin{lemma}[Concentration for Monte Carlo estimate]
\label{lem:conc_mc}
Assume we run policy $\pi$ and collect $m$ trajectories $\cbr{x_0^{(i)},a_0^{(i)},r_0^{(i)},\ldots,x_{H-1}^{(i)},a_{H-1}^{(i)},r_{H-1}^{(i)}}_{i=1}^m$ and our Monte Carlo estimate is defined as 
\[\hat v^\pi:=\frac{1}{m}\sum_{i=1}^m\sum_{h=0}^{H-1} r_h^{(i)}.\]
Then with probability at least $1-\delta$, we have 
\[
\abr{\hat v^\pi-v^{\pi}}\le 2H\sqrt{\frac{\log(2/\delta)}{2m}}.
\]
\end{lemma}
\begin{proof}
Define random variable $Y_i:=\sum_{h=0}^{H-1}r_h^{(i)}$. From the definition, we know that $Y_i$ are i.i.d. samples with mean $v^{\pi}$. Applying Hoeffding's inequality and noticing that $|Y_i|\le H$ gives us with probability $1-\delta$,
\[
\abr{\frac{1}{m}\sum_{i=1}^m Y_i-v^{\pi}}\le 2H\sqrt{\frac{\log(2/\delta)}{2m}}.
\]
This completes the proof.
\end{proof}

\subsection{Proof of Theorem~\ref{thm:main_unknown}}
\label{app:proof_main_unknown}
\begin{theorem*}[Sample complexity of finding a near-optimal policy with unknown $\gapq$, restatement of \pref{thm:main_unknown}]
%\label{thm:main_unknown}
Suppose  \pref{assum:realizablity_q}, \pref{assum:realizablity_w}, \pref{assum:bound_q}, \pref{assum:bound_w}, \pref{assum:gap_plus} hold but $\gapq$ is unknown. Assume we have a dataset $\Dcal$ with size $n$ for each $\Dcal_h$ and additional online access to collect
\[(\log(2H/\gapq))^2\cdot \frac{n\log(24/\delta)}{C^2 H}=\tilde O\rbr{\frac{n\log(1/\delta)}{C^2H}}\]
samples. Then with probability at least $1-\delta$, the output policy $\hat \pi$ from \pref{alg:unknown_gap} satisfies
\begin{align*}
%\label{eq:unknonw_q_accu}
v^{\hat \pi}\ge v^* - 5\sqrt{\frac{32C^2H^6\iota(\log(2H/\gapq))}{n\gapq^2}},    
\end{align*}
where $\iota(t)=\log(24|\Fcal||\Wcal|H\cdot 2^t/\delta)$.
\end{theorem*}


\begin{proof}
For \pref{thm:find_v_star}, \pref{thm:main} and Monte Carlo roll out estimate at iteration $t$, we set their high probability event parameter as $\delta'_t:=\delta/(6 \times 2^t)$. Then union bounding over all of them gives us $1-\delta$ high probability event. Our following analysis is conditioned on these high probability events. %From \pref{alg:unknown_gap}, we know that there is at most $\log(H/B_{\mathrm{gap}}) + 1$ iterations.

Firstly, we show that \pref{alg:unknown_gap} will terminate once our guess $\mathrm{gap}^{\mathrm{guess}}_t$ drops below the true $\gapq$. From \pref{thm:find_v_star}, we know that $|\hat v_t^*-v^*|\le \varepsilon_t$. Further, when $\mathrm{gap}^{\mathrm{guess}}_t\le \gapq$, we can guarantee that $Q^*\in\Fcal(\mathrm{gap}^{\mathrm{guess}}_t)$. Therefore, \pref{thm:main} tells us $v^{\hat \pi_t} \ge v^*-\varepsilon_t$. Finally, for Monte Carlo estimate $\hat v^{\hat \pi_t}$, we have $|\hat v^{\hat \pi_t}-v^{\hat \pi_t}|\le \varepsilon_t$. Combining them together yields
\[
\hat v^{\hat \pi_t} \ge v^{\hat \pi_t} - \varepsilon_t \ge v^*-\varepsilon_t-\varepsilon_t\ge \hat v^*_t -\varepsilon_t-\varepsilon_t-\varepsilon_t=\hat v^*_t-3\varepsilon_t,
\]
which means our algorithm will stop in this iteration.

So if we assume the algorithm terminates at iteration $T$, then $T$ satisfies $H/2^T\ge \gapq/2$, thus
\[
T\le \log(2H/\gapq).
\]
Then we prove that the output policy $\hat\pi_T$ satisfies $v^{\hat\pi_T}\ge v^*-5\varepsilon_T$. This can be seen from
\[
v^{\hat \pi_T} \ge \hat v^{\hat \pi_T} -\varepsilon_T \ge \hat v^*_T -3\varepsilon_T -\varepsilon_T \ge  v^* -\varepsilon_T -3\varepsilon_T -\varepsilon_T =v^*-5\varepsilon_T.
\]
Notice that $\varepsilon_t$ will increase as $t$ increases. Therefore, if our algorithm terminates before $\mathrm{gap}^{\mathrm{guess}}_t$ drops below $\gapq$, we will have a better performance guarantee. More specifically, we have \[\varepsilon_T\le\varepsilon_{\log(2H/\gapq)}=\sqrt{\frac{32C^2H^6\iota(\log(2H/\gapq))}{n\gapq^2}}.
\]
Therefore, $\hat\pi_T$ satisfies 
\[v^{\hat \pi_T}\ge v^* - 5\sqrt{\frac{32C^2H^6\iota(\log(2H/\gapq))}{n\gapq^2}},
\]
which has the same order of the accuracy as running \pref{alg:pess_alg} with known $\gapq$ in \pref{thm:main} up to polylog terms.

Finally we calculate the required number of online samples. For iteration $t$, applying \pref{lem:conc_mc}, we require \[H\cdot\frac{2H^2\log(12\times 2^t/\delta)}{\varepsilon_t^2}\le  \frac{2H^3\log(12\times 2^T/\delta)}{\varepsilon_t^2}=\frac{n\log(12\times 2^T/\delta)}{4C^2 H\iota(t) 2^{2t}}\le\frac{n\log(12\times 2^T/\delta)}{C^2 H}\le\frac{nT\log(12\times 2/\delta)}{C^2 H}
\]
samples. Then since we have at most $\log(2H/\gapq)$ iterations, the required number of online samples is at most 
\[
\log(2H/\gapq)\cdot \frac{nT\log(12\times 2/\delta)}{C^2 H}\le(\log(2H/\gapq))^2\cdot \frac{n\log(24/\delta)}{C^2 H}.
\]
This completes the proof.
\end{proof}



\section{Lagrangian Form Algorithm and Results}
\label{app:lang}
In this section, we introduce the Lagrangian form variant of PABC (\pref{alg:pess_alg}) and its sample complexity guarantees. We start with showing its variant PABC-L (\pref{alg:pess_lang_alg}) in \pref{app:alg_lang}. Then we provide the main results of PABC-L in \pref{app:main_lang} and its robustness results in \pref{app:appx_error_lang}.

\subsection{Algorithm}
\label{app:alg_lang}
In this part, we introduce the PABC-L (PABC with Lagrangian form) algorithm as shown in \pref{alg:pess_lang_alg}. Compared with PABC (\pref{alg:pess_alg}), PABC-L does not take the threshold $\alpha$ as input. In addition, it moves the constraints (\Cref{eq:constraint}) to the objective (\Cref{eq:objective}). Furthermore, to estimate $v^*$, it returns  $\hat f_0(x_0, \pi_{\hat f}(x_0))+ H\cdot \max_{w\in\Wcal,h\in[H]} |\Lcal_{\Dcal}(\hat f,w,h)|$ instead of $\hat f_0(x_0, \pi_{\hat f}(x_0))$.
\begin{algorithm}[hbt]
	\caption{PABC-L (PABC with Lagrangian form)}\label{alg:pess_lang_alg}
	\begin{algorithmic}[1]
	    \REQUIRE gap factor $\cgap$, function class $\Fcal$, weight function class $\Wcal$, and dataset $\Dcal$.
	    \STATE Perform prescreening according to input  $\cgap$: %\label{line:prescreen}
	    \begin{align} %\label{line:prescreen}
	    \Fcal(\cgap):=\{f \in \Fcal: \gap(f)\ge\cgap\}.
	    \end{align}
		\STATE Find the pessimistic value function in  $\Fcal(\cgap)$ with the Lagrangian form objective %\label{line:pess_select}
		\begin{align}
		\label{eq:objective}
		&\hat f=\argmin_{f\in\Fcal(\cgap)} \rbr{f_0(x_0,\pi_{f}(x_0))+ H\cdot \max_{w\in\Wcal,h\in[H]} |\Lcal_{\Dcal}(f,w,h)|}
    	%\text{s.t.}& \max_{w\in\Wcal,h\in[H]} |\Lcal_{\Dcal}(f,w,h)|\le\alpha,
    	\end{align}
    	where the empirical loss $\Lcal_{\Dcal}(f,w,h)$ is defined as
    	\begin{align}
        \Lcal_{\Dcal}(f,w,h) &%&~=\EE_{\Dcal}[w_h(x_h,a_h)(f_h(x_h,a_h)-r_h-f_{h+1}(x_{h+1},\pi_f(x_{h+1})))].\\
        =\frac{1}{n}\sum_{i=1}^n[w_h(x_h^{(i)},a_h^{(i)})(f_h(x_h^{(i)},a_h^{(i)})-r_h^{(i)}-f_{h+1}(x_{h+1}^{(i)},\pi_f(x_{h+1}^{(i)})))]. %\label{eq:LD}
        \end{align}
		\ENSURE policy $\pi_{\hat f}$ and return estimation $\hat f_0(x_0, \pi_{\hat f}(x_0))+ H\cdot \max_{w\in\Wcal,h\in[H]} |\Lcal_{\Dcal}(\hat f,w,h)|$.
	\end{algorithmic}
\end{algorithm}

\paragraph{Remark} In the objective (\Cref{eq:objective}), we can also use
\begin{align}
\label{eq:objective_another}
&\hat f=\argmin_{f\in\Fcal(\cgap)} \rbr{f_0(x_0,\pi_{f}(x_0))+ \sum_{h=0}^{H-1} \max_{w\in\Wcal} |\Lcal_{\Dcal}(f,w,h)|}.
	%\text{s.t.}& \max_{w\in\Wcal,h\in[H]} |\Lcal_{\Dcal}(f,w,h)|\le\alpha,
\end{align}
From the detailed proofs in the subsequent parts, it is easy to see that the theoretical results hold under this objective (\Cref{eq:objective_another}).


\subsection{Main Guarantees}
\label{app:main_lang}
In this part, we present the main sample complexity results of PABC-L (\pref{alg:pess_lang_alg}). In parallel with \pref{sec:main}, we show that PABC-L can identify $v^*$ without the gap assumption in \pref{app:find_v_star_lang} and show that PABC-L with the gap assumption learns a near-optimal policy in \pref{app:find_near_optimal_lang}. 

\subsubsection{ESTIMATING OPTIMAL EXPECTED RETURN}
\label{app:find_v_star_lang}
We show the sample complexity bound and the proof for PABC-L to identify $v^*$. The bound is the same as that of PABC (\pref{thm:find_v_star}).
\begin{theorem}[Sample complexity of identifying $v^*$, Lagrangian version]
\label{thm:find_v_star_lang}
Suppose  \pref{assum:realizablity_q}, \pref{assum:realizablity_w}, \pref{assum:bound_q}, \pref{assum:bound_w} hold and the total number of samples $nH$ satisfies \[nH\ge \frac{8C^2H^5\log(2|\Fcal||\Wcal|H/\delta)}{\varepsilon^2}.
\]
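Then with probability at least $1-\delta$, running \pref{alg:pess_lang_alg} with $\gapmin=0$ guarantees that the returned estimate satisfies
\[\abr{V_{\hat f}(x_0)+H\cdot \max_{w\in\Wcal,h\in[H]} |\Lcal_{\Dcal}(\hat f,w,h)|-v^*}\le \varepsilon.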
\]
\end{theorem}

\begin{proof}
The proof mostly follows the proof of \pref{thm:find_v_star}, and we only show the different and crucial steps here. We still condition on the high probability event from concentration (\pref{lem:conc}).

From the concentration result and the choice of $n$, we have the bound for $Q^*$:
\[
V^*_0(x_0) + H\cdot \max_{w\in\Wcal,h\in[H]} |\Lcal_{\Dcal}(Q^*,w,h)| \le V^*_0(x_0) + H\estat,
\]
where $\estat\le \varepsilon/H$.

From pessimism and the objective in \pref{alg:pess_lang_alg}, we have 
\begin{align*}
V^*_0(x_0) + H\cdot \max_{w\in\Wcal,h\in[H]} |\Lcal_{\Dcal}(Q^*,w,h)| &\ge~ V_{\hat f}(x_0)+H\cdot \max_{w\in\Wcal,h\in[H]} |\Lcal_{\Dcal}(\hat f,w,h)|.
\end{align*}
Therefore, we get
\begin{align}
\label{eq:find_v_star_lang_1}
V_0^*(x_0)+H\estat \ge V_{\hat f}(x_0)+H\cdot \max_{w\in\Wcal,h\in[H]} |\Lcal_{\Dcal}(\hat f,w,h)|.
\end{align}
For any $f\in\Fcal$, following the telescoping step in the proof of \pref{thm:find_v_star}, we know that
\begin{align*}
V_f(x_0)&=~f_0(x_0,\pi_f(x_0))
\\
&\ge~f_0(x_0,\pi^*(x_0))
\\
&=~\EE[R_0(x_0,a_0)+f_1(x_1,a_1)\mid a_{0}\sim\pi^*,a_{1}\sim\pi_f]+ \Ecal(f,\pi^*,0)
\\
&\ge~\EE[R_0(x_0,a_0)\mid a_{0}\sim\pi^*]+\EE[f_1(x_1,a_1)\mid a_{0:1}\sim\pi^*] + \Ecal(f,\pi^*,0)
\\
&\ge~\EE[R_0(x_0,a_0)\mid a_{0}\sim\pi^*]+\EE[R_1(x_1,a_1)+f_2(x_2,a_2)\mid a_{0:1}\sim\pi^*,a_2\sim \pi_f]+ \Ecal(f,\pi^*,1)+ \Ecal(f,\pi^*,0)
\\
&\ge~\ldots
\\
&\ge~\EE\left[\sum_{h=0}^{H-1}R_h(x_h,a_h)\mid a_{0:H-1}\sim\pi^*\right]+\sum_{h=0}^{H-1}\Ecal(f,\pi^*,h)
\\
&\ge~ V^*_0(x_0)-  \sum_{h=0}^{H-1}|\Ecal(f,\pi^*,h)|.
\end{align*}
Therefore, we get
\begin{align}
\label{eq:find_v_star_lang_2}
&~V_{\hat f}(x_0)+H\cdot \max_{w\in\Wcal,h\in[H]} |\Lcal_{\Dcal}(\hat f,w,h)|
\notag\\
\ge&~ V_0^*(x_0)-  \sum_{h=0}^{H-1}|\Ecal(\hat f,\pi^*,h)|+H\cdot \max_{w\in\Wcal,h\in[H]} |\Lcal_{\Dcal}(\hat f,w,h)|\notag
\\
\ge&~ V_0^*(x_0)-\sum_{h=0}^{H-1}|\Ecal(\hat f,\pi^*,h)|+H\cdot \max_{w\in\Wcal,h\in[H]} |\EE[\Lcal_{\Dcal}(\hat f,w,h)]|-H\estat
\notag\\
\ge &~V_0^*(x_0)-\sum_{h=0}^{H-1}|\Ecal(\hat f,\pi^*,h)|+ \sum_{h=0}^{H-1}|\EE[\Lcal_{\Dcal}(\hat f,w^*,h)]|-H\estat
\notag\\
= &~V_0^*(x_0)-\sum_{h=0}^{H-1}|\Ecal(\hat f,\pi^*,h)|+\sum_{h=0}^{H-1}|\Ecal(\hat f,\pi^*,h)|-H\estat
\notag\\
= &~V_0^*(x_0)-H\estat.
\end{align}
Combining \Cref{eq:find_v_star_lang_1} and \Cref{eq:find_v_star_lang_2} yields
\[
|V_{\hat f}(x_0)+H\cdot \max_{w\in\Wcal,h\in[H]} |\Lcal_{\Dcal}(\hat f,w,h)|-v^*|=|V_{\hat f}(x_0)+H\cdot \max_{w\in\Wcal,h\in[H]} |\Lcal_{\Dcal}(\hat f,w,h)|-V_0^*(x_0)|\le H\estat\le \varepsilon,
\]
which completes the proof.
\end{proof}



\subsubsection{LEARNING A NEAR-OPTIMAL POLICY}
\label{app:find_near_optimal_lang}
Here we present the result for learning a near optimal policy. Compared with its counterpart (\pref{thm:main}), the sample complexity only differs in the constant.
\begin{theorem}[Sample complexity of learning a near-optimal policy, Lagrangian version]
\label{thm:main_lang}
	Suppose  \pref{assum:realizablity_q}, \pref{assum:realizablity_w}, \pref{assum:bound_q}, \pref{assum:bound_w}, \pref{assum:gap_plus} hold and the total number of samples $nH$ satisfies 
	\[nH\ge \frac{32C^2H^7\log(2|\Fcal||\Wcal|H/\delta)}{\varepsilon^2 \gapq^2}.
	\]
	Then with probability at least $1-\delta$, running \pref{alg:pess_lang_alg} with $\gapmin=\gapq$ guarantees 
	\[
	v^{\pi_{\hat f}} \ge v^*-\varepsilon.
	\]
\end{theorem}

\begin{proof}
The proof mostly follows the proof of \pref{thm:main} and \pref{thm:find_v_star_lang}, and we only show the different and crucial steps here. We still condition on the high probability event from concentration (\pref{lem:conc}).

Similar as the proof of \pref{thm:find_v_star_lang}, from pessimism, we have
\begin{align}
\label{eq:main_lang_2}
V_0^*(x_0)+H\estat \ge V_{\hat f}(x_0)+H\cdot \max_{w\in\Wcal,h\in[H]} |\Lcal_{\Dcal}(\hat f,w,h)|,
\end{align}
where $\estat\le \varepsilon\gapq/(2H^2)$.

On the other hand, following the proof of \pref{thm:main} and \pref{thm:find_v_star_lang}, we have
\begin{align}
\label{eq:main_lang_1}
&~V_{\hat f}(x_0)+H\cdot \max_{w\in\Wcal,h\in[H]} |\Lcal_{\Dcal}(\hat f,w,h)|
\notag\\
\ge&~ V^*_0(x_0)+\gapq \EE\left[\sum_{h=0}^{H-1}\one\{\pi_{\hat f}(x_h)\neq \pi^*(x_h)\}\mid a_{0:H-1}\sim\pi^*\right]- \sum_{h=0}^{H-1}|\Ecal(\hat f,\pi^*,h)|+H\cdot \max_{w\in\Wcal,h\in[H]} |\Lcal_{\Dcal}(\hat f,w,h)|
\notag\\
\ge&~V^*_0(x_0)+\gapq \EE\left[\sum_{h=0}^{H-1}\one\{\pi_{\hat f}(x_h)\neq \pi^*(x_h)\}\mid a_{0:H-1}\sim\pi^*\right]- \sum_{h=0}^{H-1}|\Ecal(\hat f,\pi^*,h)|+\sum_{h=0}^{H-1}|\Ecal(\hat f,\pi^*,h)|-H\estat
\notag\\
\ge&~V^*_0(x_0)+\gapq \EE\left[\sum_{h=0}^{H-1}\one\{\pi_{\hat f}(x_h)\neq \pi^*(x_h)\}\mid a_{0:H-1}\sim\pi^*\right]-H\estat.
\end{align}
Combining \Cref{eq:main_lang_2} and \Cref{eq:main_lang_1} yields
\[\EE\left[\sum_{h=0}^{H-1}\one\{\pi_{\hat f}(x_h)\neq \pi^*(x_h)\}\mid a_{0:H-1}\sim\pi^*\right]\le 2H\estat /\gapq\le\varepsilon/H.
\]
The remaining steps are followed from the proof of \pref{thm:main}.
\end{proof}


\subsection{Robustness to Misspecification}
\label{app:appx_error_lang}
In this part, we present the sample complexity results of PABC-L (\pref{alg:pess_lang_alg}) under misspecification. In parallel with \pref{sec:appx_error}, we show that PABC-L can identify $v^*$ in \pref{app:find_v_star_appx_lang}  and show its results for learning a near-optimal policy in \pref{app:find_near_optimal_appx_lang}. The major advantage of PABC-L is that it does not take $\alpha$ as input; therefore, we no longer require the knowledge of approximation errors. 

\subsubsection{ESTIMATING OPTIMAL EXPECTED RETURN}
\label{app:find_v_star_appx_lang}
We present the result for identifying $v^*$. The sample complexity of PABC-L is the same as its counterpart (\pref{thm:find_v_star_appx}).
\begin{theorem}[Robust version of \pref{thm:find_v_star_lang}]
\label{thm:find_v_star_appx_lang}
Suppose  \pref{assum:bound_q}, \pref{assum:bound_w} hold and the total number of samples $nH$ satisfies 
\[nH\ge \frac{8C^2H^5\log(2|\Fcal||\Wcal|H/\delta)}{\varepsilon ^2}.
\]
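Then with probability $1-\delta$, running \pref{alg:pess_lang_alg} with $\gapmin=0$ guarantees that the returned estimate satisfies
\[\abr{V_{\hat f}(x_0)+H\cdot \max_{w\in\Wcal,h\in[H]} |\Lcal_{\Dcal}(\hat f,w,h)|-v^*} \le \varepsilon + H\varepsilon_{\Fcal}+H\varepsilon_\Wcal.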
\]
\end{theorem}

\begin{proof}
The proof mostly follows the proof of \pref{thm:find_v_star_appx} and \pref{thm:find_v_star_lang}, and we only show the different and crucial steps here. We still condition on the high probability event from concentration (\pref{lem:conc}).

For $\tilde Q^*_{\Fcal}$, from the concentration result and the definition of $\varepsilon_{\Fcal}$, we get
\[
\tilde Q^*_{\Fcal,0}(x_0,\pi_{\tilde Q^*_{\Fcal}}(x_0)) + H\cdot \max_{w\in\Wcal,h\in[H]} |\Lcal_{\Dcal}(\tilde Q^*_{\Fcal},w,h)| \le V^*_0(x_0) +H\varepsilon_{\Fcal}+ H\estat,
\]
where $\estat\le \varepsilon/H$.

From pessimism and the objective in  \pref{alg:pess_lang_alg}, we have 
\begin{align*}
\tilde Q^*_{\Fcal,0}(x_0,\pi_{\tilde Q^*_{\Fcal}}(x_0)) + H\cdot \max_{w\in\Wcal,h\in[H]} |\Lcal_{\Dcal}(\tilde Q^*_{\Fcal},w,h)| &\ge~ V_{\hat f}(x_0)+H\cdot \max_{w\in\Wcal,h\in[H]} |\Lcal_{\Dcal}(\hat f,w,h)|.
\end{align*}
Therefore, we get
\begin{align}
\label{eq:find_v_star_appx_lang_1}
V_0^*(x_0)+H\varepsilon_{\Fcal}+H\estat \ge V_{\hat f}(x_0)+H\cdot \max_{w\in\Wcal,h\in[H]} |\Lcal_{\Dcal}(\hat f,w,h)|.
\end{align}
For any $f\in\Fcal$, following the telescoping step in the proof of \pref{thm:find_v_star_lang}, we know that
\begin{align*}
    V_f(x_0)\ge V^*_0(x_0)-  \sum_{h=0}^{H-1}|\Ecal(f,\pi^*,h)|.
\end{align*}
Therefore, similar as the proof of \pref{thm:find_v_star_lang} and applying \pref{lem:appx_w}, we get
\begin{align}
\label{eq:find_v_star_appx_lang_2}
&~V_{\hat f}(x_0)+H\cdot \max_{w\in\Wcal,h\in[H]} |\Lcal_{\Dcal}(\hat f,w,h)|
\notag\\
\ge&~ V_0^*(x_0)-\sum_{h=0}^{H-1}|\Ecal(\hat f,\pi^*,h)|+H\cdot \max_{w\in\Wcal,h\in[H]} |\EE[\Lcal_{\Dcal}(\hat f,w,h)]|-H\estat
\notag\\
\ge &~V_0^*(x_0)-\sum_{h=0}^{H-1}|\Ecal(\hat f,\pi^*,h)|+ \sum_{h=0}^{H-1}|\EE[\Lcal_{\Dcal}(\hat f,\tilde w^*,h)]|-H\estat
\notag\\
\ge &~V_0^*(x_0)-\sum_{h=0}^{H-1}|\Ecal(\hat f,\pi^*,h)|+\sum_{h=0}^{H-1}|\Ecal(\hat f,\pi^*,h)|-H\varepsilon_{\Wcal}- H\estat
\notag\\
= &~V_0^*(x_0)-H\varepsilon_{\Wcal}-H\estat.
\end{align}
Combining \Cref{eq:find_v_star_appx_lang_1} and \Cref{eq:find_v_star_appx_lang_2} yields
\begin{align*}
|V_{\hat f}(x_0)+H\cdot \max_{w\in\Wcal,h\in[H]} |\Lcal_{\Dcal}(\hat f,w,h)|-v^*|&=~|V_{\hat f}(x_0)+H\cdot \max_{w\in\Wcal,h\in[H]} |\Lcal_{\Dcal}(\hat f,w,h)|-V_0^*(x_0)|
\\
&\le~ H(\varepsilon_{\Fcal}+ \varepsilon_{\Wcal}+\estat)
\\
&\le~  \varepsilon + H(\varepsilon_{\Fcal}+ \varepsilon_{\Wcal}),
\end{align*}
which completes the proof.
\end{proof}



\subsubsection{LEARNING A NEAR-OPTIMAL POLICY}
\label{app:find_near_optimal_appx_lang}
In this part, we show the results for learning a near-optimal policy. Compared with the ones for PABC (\pref{thm:main_appx} and \pref{corr:main_appx}), the differences are only the constants.

\begin{theorem}[Robust version of \pref{thm:main_lang}]
\label{thm:main_appx_lang}
Suppose  \pref{assum:bound_q}, \pref{assum:bound_w} hold and the total number of samples $nH$ satisfies 
\[nH\ge \frac{32C^2H^7\log(2|\Fcal||\Wcal|H/\delta)}{\varepsilon^2 \gapmin^2}.\]
Then with probability $1-\delta$, running \pref{alg:pess_lang_alg} with a user-specified $\gapmin$ guarantees 
\[v^{\pi_{\hat f}} \ge v^*-\varepsilon  - \frac{H^2\varepsilon_{\Fcal(\gapmin)}+H^2\varepsilon_\Wcal}{\gapmin}.\]
\end{theorem}

\begin{proof}
The proof mostly follows the proof of \pref{thm:main_lang} and \pref{thm:find_v_star_appx_lang}, and we only show the different and crucial steps here. We still condition on the high probability event from concentration (\pref{lem:conc}).

Similar as the proof of \pref{thm:find_v_star_appx_lang}, we have
\begin{align}
\label{eq:main_appx_lang_1}
V_0^*(x_0)+H\varepsilon_{\Fcal(\gapmin)}+H\estat \ge V_{\hat f}(x_0)+H\cdot \max_{w\in\Wcal,h\in[H]} |\Lcal_{\Dcal}(\hat f,w,h)|,
\end{align}
where $\estat\le \varepsilon\gapmin/(2H^2)$.

On the other hand, following the proof of \pref{thm:main_lang} and \pref{thm:find_v_star_appx_lang}, we have
\begin{align}
\label{eq:main_appx_lang_2}
&~V_{\hat f}(x_0)+H\cdot \max_{w\in\Wcal,h\in[H]} |\Lcal_{\Dcal}(\hat f,w,h)|
\notag\\
\ge&~ V^*_0(x_0)+\gapmin \EE\left[\sum_{h=0}^{H-1}\one\{\pi_{\hat f}(x_h)\neq \pi^*(x_h)\}\mid a_{0:H-1}\sim\pi^*\right]- \sum_{h=0}^{H-1}|\Ecal(\hat f,\pi^*,h)|
\notag\\
&~\quad +H\cdot \max_{w\in\Wcal,h\in[H]} |\Lcal_{\Dcal}(\hat f,w,h)|
\notag\\
\ge&~V^*_0(x_0)+\gapmin \EE\left[\sum_{h=0}^{H-1}\one\{\pi_{\hat f}(x_h)\neq \pi^*(x_h)\}\mid a_{0:H-1}\sim\pi^*\right]-H\varepsilon_{\Wcal}-H\estat.
\end{align}
Combining \Cref{eq:main_appx_lang_1} and \Cref{eq:main_appx_lang_2} yields
\[\EE\left[\sum_{h=0}^{H-1}\one\{\pi_{\hat f}(x_h)\neq \pi^*(x_h)\}\mid a_{0:H-1}\sim\pi^*\right]\le H(2\estat+\varepsilon_{\Wcal}+\varepsilon_{\Fcal(\gapmin)}) /\gapmin.
\]
The remaining steps can be followed from the proof of \pref{thm:main}.
\end{proof}


\begin{corollary}[Corollary from \pref{thm:main_appx_lang}]
\label{corr:main_appx_alg}
Suppose \pref{assum:bound_q} and \pref{assum:bound_w} hold, and the weight function class satisfies the additional mild regularity assumptions stated in \pref{lem:appx_f_vs_infty}. Assume we are given $\varepsilon_{\Fcal,\infty},\gapq$ and $2\varepsilon_{\Fcal,\infty}<\gapq$. If the total number of samples $nH$ satisfies 
\[
nH\ge \frac{8C^2H^7\log(2|\Fcal||\Wcal|H/\delta)}{\varepsilon^2 (\gapq-2\varepsilon_{\Fcal,\infty})^2},
\]
then with probability $1-\delta$, running \pref{alg:pess_lang_alg} with $\gapmin=\gapq-2\varepsilon_{\Fcal,\infty}$ guarantees 
\[v^{\pi_{\hat f}} \ge v^*-\varepsilon  - \frac{2H^2\varepsilon_{\Fcal,\infty}+H^2\varepsilon_\Wcal}{\gapq-2\varepsilon_{\Fcal,\infty}}.\]
\end{corollary}

\begin{proof}
The proof mostly follows the proof of \pref{corr:main_appx} and \pref{thm:main_appx_lang}, and we only show the different and crucial steps here. We still condition on the high probability event from concentration (\pref{lem:conc}).

Similar as the proof of \pref{corr:main_appx} and \pref{thm:main_appx_lang}, we have
\begin{align}
\label{eq:corr_appx_lang_1}
V_0^*(x_0)+2H\varepsilon_{\Fcal,\infty}+H\estat \ge V_{\hat f}(x_0)+H\cdot \max_{w\in\Wcal,h\in[H]} |\Lcal_{\Dcal}(\hat f,w,h)|.
\end{align}
On the other hand, following the proof of \pref{thm:main_appx_lang}, we have
\begin{align}
\label{eq:corr_appx_lang_2}
&~V_{\hat f}(x_0)+H\cdot \max_{w\in\Wcal,h\in[H]} |\Lcal_{\Dcal}(\hat f,w,h)|
\notag\\
%\ge&~ V^*_0(x_0)+\gap(\Fcal(\gapmin)) \EE\left[\sum_{h=0}^{H-1}\one\{\pi_f(x_h)\neq \pi^*(x_h)\}\mid a_{0:H-1}\sim\pi^*\right]- \sum_{h=0}^{H-1}|\Ecal(\hat f,w^*,h)|+H\cdot \max_{w\in\Wcal,h\in[H]} |\Lcal_{\Dcal}(\hat f,w,h)|
%\notag\\
\ge&~V^*_0(x_0)+(\gapq-2\varepsilon_{\Fcal,\infty}) \EE\left[\sum_{h=0}^{H-1}\one\{\pi_{\hat f}(x_h)\neq \pi^*(x_h)\}\mid a_{0:H-1}\sim\pi^*\right]-H\varepsilon_{\Wcal}-H\estat.
\end{align}
Combining \Cref{eq:corr_appx_lang_1} and \Cref{eq:corr_appx_lang_2} yields
\[\EE\left[\sum_{h=0}^{H-1}\one\{\pi_{\hat f}(x_h)\neq \pi^*(x_h)\}\mid a_{0:H-1}\sim\pi^*\right]\le H(2\estat+\varepsilon_{\Wcal}+2\varepsilon_{\Fcal,\infty}) /(\gapq-2\varepsilon_{\Fcal,\infty}).
\]
The remaining steps can be followed from the proof of \pref{thm:main}.
\end{proof}



\section{Discussion on the Data Coverage Assumption}
\label{app:conc_example}

In this section, we provide an example that shows our data coverage assumption is more relaxed than the $\pi^*$-concentrability assumption in \citet{zhan2022offline} (their Assumption 1) based on raw density ratios. Notice that their assumption translates into $d^*_h(x_h,a_h)/d^D_h(x_h,a_h)\le C,\forall h\in[H],x_h\in\Xcal_h,a_h\in\Acal$ in our finite-horizon episodic setting. We will show an instance where there exists some $h,(x_h,a_h)$ such that $d^*_h(x_h,a_h)/d^D_h(x_h,a_h)=\infty$ and $w^*$ does not even exist (thus $w^*\notin \Wcal$), but we still have $\varepsilon_\Wcal=0$. Therefore, our robust version of sample complexity results can give us meaningful guarantees, whereas the (robustness) results in \citet{zhan2022offline} cannot be applied.

\begin{figure}[h!]
	\centering
	\begin{tikzpicture}[scale=3]
		\node[state] (s0) at (0,0) {$x_0$};
		%\node[state] (s1) [below left=7em of s0]  {$\mathrm{Null}$};
		\node[state] (s2) [below=4.2em of s0] {$\mathrm{Null}$};
		%\node[state] (s3) [below right=7em of s0] {$\mathrm{Null}$};
		
		%\draw[->] (s0) -> node[near start,above = .2 em,left=.3em] {$\pi^*,\textrm{L}$} (s1);
		\draw[->] (s0) edge[bend right=60] node[near start,above = .2 em,left=.3em] {$\pi^*,\textrm{L}$} (s2);
		\draw[->] (s0) -> node[near start,below = .2 em,right=.em] {$\textrm{M}$} (s2);
		\draw[->] (s0) edge[bend left=60] node[near start,below = .2 em,right=.3em] {$\textrm{R}$} (s2);
		%\draw[->] (s0) -> node[near start,below = .2 em,right=.3em] {$\textrm{R}$} (s3);
	\end{tikzpicture}
	\caption{Example for comparison with $\pi^*$-concentrability assumption \citep{zhan2022offline}.}
	\label{fig:conc_example}
\end{figure}


\renewcommand{\arraystretch}{1.25}
\begin{table}[htb]
\begin{center}
	\begin{tabular}{ |c|c|c|c| } 
		\hline
		&$(x_0,\mathrm{L})$ & $(x_0,\mathrm{M})$ & $(x_0,\mathrm{R})$  \\ 
		\hline
		$R$ & 0.8 & 0.6 & 0.3  \\ 
		$Q^*$ & 0.8 & 0.6 & 0.3 \\ 
		$f$ & 0.7 & 0.3 & 0.8 \\ 
		\hline 
		$d^*$ & 1 & 0 & 0 \\ 
		$d^D$ & 0  & 0.5 & 0.5 \\
		$w$ & 0 & 1 & 1 \\ 
		\hline
	\end{tabular}
\end{center}
\caption{Example for comparison with $\pi^*$-concentrability assumption \citep{zhan2022offline}.}
\label{table:conc_example}
\end{table}

As shown in \pref{fig:conc_example}, circles denote states and arrows denote actions with deterministic transitions. In this MDP, the length of horizon is $H=1$ and taking any action $\mathrm{L}$, $\mathrm{M}$, or $\mathrm{R}$ at the initial state $x_0$ transitions to the $\mathrm{Null}$ terminal state. Since $H=1$, in the following discussion we drop the subscript $h$ for simplicity. In \Cref{table:conc_example}, we show the reward function, the optimal value function $Q^*$, the bad function $f$, the state-action distribution of the optimal policy $d^*$, the data distribution $d^D$, and the weight function $w$. We construct a singleton weight function class $\Wcal=\{w\}$ and a realizable function class $\Fcal=\{Q^*,f\}$. One can easily verify that $d^*(x_0,\mathrm{L})/d^D(x_0,\mathrm{L})=\infty$, $w^*$ does not exist, and the approximation error $\varepsilon_\Wcal$ as defined in \Cref{eq:appx_w} is 0. 



\bibliography{refs}

%\input{appendix}


\end{document}
