\documentclass{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage{microtype}
\usepackage{graphicx}
\usepackage{color}
\usepackage{subcaption}
\usepackage{wrapfig}
\usepackage{float}
\usepackage{enumitem}
\usepackage[font=small,labelfont=bf]{caption}
\usepackage{algorithmic}
\usepackage{algorithm}
\usepackage[algo2e]{algorithm2e} 
\usepackage{mathrsfs}
\usepackage{dsfont}
\usepackage{enumerate}
\usepackage{amssymb}
\usepackage{amsfonts,amsmath}
\usepackage{wrapfig}
\usepackage{multirow}
\newtheorem{theorem}{Theorem}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
% \theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}{Assumption}
% \newtheorem{corollary}{Corollary}
\newtheorem{remark}{Remark}
% \newcommand{\theHalgorithm}{\arabic{algorithm}}
\input{math_qi.tex}
\usepackage{enumitem}
\usepackage{hyperref}

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{zref-xr,zref-user}
\zexternaldocument*{yang_24}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Pessimistic Model Selection for Offline Deep Reinforcement Learning\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2023 paper}{Jane~J.~von~O'L\'opez}{}}
\author[1]{Harry~Q.~Bovik}
\author[1,2]{Further~Coauthor}
\author[3]{Further~Coauthor}
\author[1]{Further~Coauthor}
\author[3]{Further~Coauthor}
\author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
    Computer Science Dept.\\
    Cranberry University\\
    Pittsburgh, Pennsylvania, USA
}
\affil[2]{%
    Second Affiliation\\
    Address\\
    …
}
\affil[3]{%
    Another Affiliation\\
    Address\\
    …
  }
  
  \begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle

\appendix
\onecolumn
\section{Comments on Asymptotic Results}\label{sec: asymptotic comments}
We remark here that all theoretical justification in this paper is based on asymptotics. It might be possible to investigate finite sample regimes when one has an exact confidence interval or a non-asymptotic bound. However, having an exact confidence interval might require some model specification of the value function, and using non-asymptotic bounds might require additional tuning steps (e.g., constants in many concentration inequalities), which is beyond the scope of this paper. In addition, as seen from our empirical evaluations below, with a relatively large sample size, the proposed model selection approach performs well.
\section{Technical Proofs}
\textit{Notations}: The notation $\xi(N) \lesssim \theta(N)$ (resp. $\xi(N) \gtrsim \theta(N)$) means that there exists a sufficiently large (resp. small) constant $c_1>0$ (resp. $c_2>0$) such that $\xi(N) \leq c_1 \theta(N)$ (resp. $\xi(N) \geq c_2 \theta(N)$) for some sequences $\theta(N)$ and $\xi(N)$ related to $N$. In the following proofs, $N$ often refers to some quantity related to $n$ and $T$.


\textbf{Lemma~\ref{thm: EIF} and its proof }:
Let $J$ denote some index set of our batch data $\calD_n$. Define
\begin{align*}
\phi(J, Q^\pi, \omega^{\pi, \nu}, \pi) = \frac{1}{|J|} \sum_{(i, t) \in J}  \omega^{\pi, \nu}(S_{i, t}, A_{i, t})\left( R_{i, t} + \gamma \sum_{a'\in \calA}\pi(a' | S_{i, t+1})Q^\pi(S_{i, t+1}, a') -Q^\pi(S_{i, t}, A_{i, t})\right),
\end{align*}
where $|J|$ is the cardinality of the index set $J$, e.g., $|J_o| = \frac{nT}{O}$ for every $1 \leq o \leq O$. Then we have the following Lemma~\ref{thm: EIF} as an intermediate result to Theorem~.
\begin{lemma}\label{thm: EIF}
	Under Assumptions , for every $1 \leq l \leq L$ and $1 \leq o \leq O-1$, the following asymptotic equivalence holds.
	\begin{align}\label{eqn: ARL}
	\sqrt{\frac{nT}{O}}\left\{\hat \calV_{\calD_{o+1}}(\hat \pi^{(o)}_l) - \calV(\hat \pi^{(o)}_l)\right\} = \sqrt{\frac{nT}{O}}\phi(J, Q^{\hat \pi^{\ast (o)}}, \omega^{\hat \pi^{(o)}_l, \nu}, \hat \pi^{(o)}_l) + o_p(1),
	\end{align}
	where $o_p(1)$ refers to a quantity that converges to $0$ in probability as $n$ or $T$ goes to infinity.
\end{lemma}


The proof is similar to that of Theorem 7 in \cite{kallus2019efficiently}. First, notice that 
\begin{align*}
&\sqrt{\frac{nT}{O}}\left\{\hat \calV_{\calD_{o+1}}(\hat \pi^{(o)}_l) - \calV(\hat \pi^{(o)}_l)\right\}\\
 = & \sqrt{\frac{nT}{O}}\left\{\phi(J, \widehat Q^{\hat \pi^{\ast (o)}}, \widehat \omega^{\hat \pi^{(o)}_l, \nu}, \hat \pi^{(o)}_l) - \phi(J, Q^{\hat \pi^{\ast (o)}}, \omega^{\hat \pi^{(o)}_l, \nu}, \hat \pi^{(o)}_l) \right.\\
 +&\left.  (1-\gamma)\EE_{S_0 \sim \nu}[\sum_{a\in \calA}\hat \pi^{(o)}_l(a | S_0)\widehat Q^{\hat \pi^{(o)}_l}(S_0, a)] - (1-\gamma)\EE_{S_0 \sim \nu}[\sum_{a\in \calA} \hat \pi^{(o)}_l(a | S_0)Q^{ \hat \pi^{(o)}_l}(S_0, a)]\right\}\\
 + & \sqrt{\frac{nT}{O}} \phi(J, Q^{\hat \pi^{\ast (o)}}, \omega^{\hat \pi^{(o)}_l, \nu}, \hat \pi^{(o)}_l).
\end{align*}
Then it suffices to show that the term in the first bracket is $o_p(\sqrt{O/(nT)})$, i.e., it converges to $0$ faster than $\sqrt{O/(nT)}$. Notice that
\begin{align*}
	&\left\{\phi(J, \widehat Q^{\hat \pi^{\ast (o)}}, \widehat \omega^{\hat \pi^{(o)}_l, \nu}, \hat \pi^{(o)}_l) - \phi(J, Q^{\hat \pi^{\ast (o)}}, \omega^{\hat \pi^{(o)}_l, \nu}, \hat \pi^{(o)}_l) \right.\\
	+&\left.  (1-\gamma)\EE_{S_0 \sim \nu}[\sum_{a\in \calA}\hat \pi^{(o)}_l(a | S_0)\widehat Q^{\hat \pi^{(o)}_l}(S_0, a)] - (1-\gamma)\EE_{S_0 \sim \nu}[\sum_{a\in \calA} \hat \pi^{(o)}_l(a | S_0)Q^{ \hat \pi^{(o)}_l}(S_0, a)]\right\}\\
	=& E_1 + E_2 + E_3,
\end{align*}
where
\begin{align*}
E_1 =& \frac{O}{nT}\sum_{(i, t) \in J_{o+1}}(\widehat \omega^{\hat \pi^{(o)}_l, \nu}(S_{i, t}, A_{i, t}) - \omega^{\hat \pi^{(o)}_l, \nu}(S_{i, t}, A_{i, t}))(R_{i, t} - Q^{\hat \pi^{(o)}_l}(S_{i, t}, A_{i, t}) \\
+& \gamma\sum_{a \in \calA}\hat \pi^{(o)}_l(a | S_{i, t+1})Q^{\hat \pi^{(o)}_l}(S_{i, t+1}, a)), 
\end{align*}
\begin{align*}
E_2 =& \frac{O}{nT}\sum_{(i, t) \in J_{o+1}} \omega^{\hat \pi^{(o)}_l, \nu}(S_{i, t}, A_{i, t})(\widehat Q^{\hat \pi^{(o)}_l}(S_{i, t}, A_{i, t}) -  Q^{\hat \pi^{(o)}_l}(S_{i, t}, A_{i, t}) \\
+& \gamma\sum_{a \in \calA}\hat \pi^{(o)}_l(a | S_{i, t+1})(\widehat Q^{\hat \pi^{(o)}_l}(S_{i, t+1}, a) - Q^{\hat \pi^{(o)}_l}(S_{i, t+1}, a))), 
\end{align*}
and
\begin{align*}
E_3 =& \frac{O}{nT}\sum_{(i, t) \in J_{o+1}} (\widehat \omega^{\hat \pi^{(o)}_l, \nu}(S_{i, t}, A_{i, t}) - \omega^{\hat \pi^{(o)}_l, \nu}(S_{i, t}, A_{i, t}))(\widehat Q^{\hat \pi^{(o)}_l}(S_{i, t}, A_{i, t}) -  Q^{\hat \pi^{(o)}_l}(S_{i, t}, A_{i, t}) \\
+& \gamma\sum_{a \in \calA}\hat \pi^{(o)}_l(a | S_{i, t+1})(\widehat Q^{\hat \pi^{(o)}_l}(S_{i, t+1}, a) - Q^{\hat \pi^{(o)}_l}(S_{i, t+1}, a))). 
\end{align*}
Next, we bound each of the above three terms. For term $E_1$, it can be seen that
$$
\EE[E_1 | \bar J_{o}] = 0.
$$
In addition, by the previous Assumptions, we can show
$$
\Var[E_1] = \EE[\Var(E_1 | \bar J_{o})] \lesssim  \frac{O}{nT} (nT/O)^{-2\kappa_2},
$$
where the inequality follows because the summands in $E_1$ are uncorrelated with each other. 
Then by Markov's inequality, we can show
$$
|E_1| = O_p((\frac{nT}{O})^{-1/2-\kappa_2}).
$$
Similarly, we can show $$
|E_2| = O_p((\frac{nT}{O})^{-1/2-\kappa_1}).
$$
For term $E_3$, by the Cauchy--Schwarz inequality and similar arguments as before, we can show
$$
|E_3| = O_p((\frac{nT}{O})^{-(\kappa_2 + \kappa_1)}).
$$
Therefore, as long as $(\kappa_2 + \kappa_1) > 1/2$, we have $E_1 + E_2 + E_3 = o_p(\sqrt{O/(nT)})$, which concludes our proof.

\noindent
\textbf{Proof of Theorem }
	We aim to show that
	\begin{align*}
	\frac{\sqrt{nT(O-1)/O}\left(\hat \calV(\hat \pi_l) - \calV( \hat \pi_l)\right)}{\hat \sigma(l)} \Longrightarrow \calN(0, 1).
	\end{align*}
	It can be seen that
	\begin{align*}
	\frac{\sqrt{nT(O-1)/O}\left(\hat \calV(\hat \pi_l) - \calV( \hat \pi_l)\right)}{\hat \sigma(l)} & = \sqrt{\frac{nT}{O(O-1)}}\left(\sum_{o = 1}^{O-1}\frac{\hat \calV_{\calD_{o+1}}(\hat \pi^{(o)}_l) - \calV(\hat \pi_l)}{\hat \sigma_{o+1}(\hat \pi^{(o)}_l)}\right)\\
	& = \sqrt{\frac{nT}{O(O-1)}}\left(\sum_{o = 1}^{O-1}\frac{\hat \calV_{\calD_{o+1}}(\hat \pi^{(o)}_l) - \calV(\hat \pi^{(o)}_l)}{\hat \sigma_{o+1}(\hat \pi^{(o)}_l)}\right) \\
	& + \sqrt{\frac{nT}{O(O-1)}}\left(\sum_{o = 1}^{O-1}\frac{\calV(\hat \pi^{(o)}_l) - \calV(\hat \pi_l)}{\hat \sigma_{o+1}(\hat \pi^{(o)}_l)}\right). 
	\end{align*}
	Define
	\begin{align*}
		\phi(J, Q^\pi, w^\pi, \pi) = \frac{1}{|J|} \sum_{(i, t) \in J}  w^{\pi, \nu}(S_{i, t}, A_{i, t})\left( R_{i, t} + \gamma \sum_{a'\in \calA}\pi(a' | S_{i, t+1})Q^\pi(S_{i, t+1}, a') -Q^\pi(S_{i, t}, A_{i, t})\right),
	\end{align*}
	where $|J|$ is the cardinality of the index set $J$, i.e., $|J| = \frac{nT}{O}$. Then by Lemma~\ref{thm: EIF}, we show that
	\begin{align}\label{eqn: estimated variance consistency}
		\sqrt{\frac{nT}{O}}\frac{\hat \calV_{\calD_{o+1}}(\hat \pi^{(o)}_l) - \calV(\hat \pi^{(o)}_l)}{\hat \sigma_{o+1}(\hat \pi^{(o)}_l)} = \sqrt{\frac{nT}{O}}\frac{\phi(J_{o+1}, Q^{\hat \pi^{(o)}_l}, w^{\hat \pi^{(o)}_l}, \hat \pi^{(o)}_l)}{\hat \sigma_{o+1}(\hat \pi^{(o)}_l)} + o_p(1).
	\end{align}
	If we can show that
	$$
	\max_{1 \leq o \leq (O-1)}\left|\frac{\hat \sigma_{o+1}(\hat \pi^{(o)}_l)}{\sigma_{o+1}(\hat \pi^{(o)}_l)} - 1\right| = o_p(1),
	$$
	which will be shown later,
	then by Slutsky's theorem, we can show that
	\begin{align*}
		&\sqrt{\frac{nT}{O(O-1)}}\left(\sum_{o = 1}^{O-1}\frac{\hat \calV_{\calD_{o+1}}(\hat \pi^{(o)}_l) - \calV(\hat \pi^{(o)}_l)}{\hat \sigma_{o+1}(\hat \pi^{(o)}_l)}\right) \\
		= & \underbrace{\sqrt{\frac{nT}{O(O-1)}}\left(\sum_{o = 1}^{O-1}\frac{\phi(J_{o+1}, Q^{\hat \pi^{(o)}_l}, w^{\hat \pi^{(o)}_l}, \hat \pi^{(o)}_l)}{\sigma_{o+1}(\hat \pi^{(o)}_l)} \right)}_{(I)}+ o_p(1).
	\end{align*}
For $(I)$, we can see that
\begin{align}
(I) & = \sqrt{\frac{O}{nT(O-1)}}(\sum_{o = 1}^{O-1}\sum_{(i, t) \in J_{o+1}}  w^{\hat \pi^{(o)}_l, \nu}(S_{i, t}, A_{i, t})( R_{i, t} \\
& +\gamma \sum_{a'\in \calA}\hat \pi^{(o)}_l(a' | S_{i, t+1})Q^{\hat \pi^{(o)}_l}(S_{i, t+1}, a') -Q^{\hat \pi^{(o)}_l}(S_{i, t}, A_{i, t}))/\sigma_{o+1}(\hat \pi^{(o)}_l) ).
\end{align}
By the sequential structure of our proposed algorithm, $(I)$ forms a mean zero martingale. Then we use Corollary 2.8 of \citep{mcleish1974dependent} to show its asymptotic distribution. First of all, by the uniformly bounded assumption on Q-function, ratio function and the variance, we can show that
\begin{align*}
\sqrt{\frac{O}{nT(O-1)}}\max_{1 \leq o \leq (O-1)} \max_{(i, t) \in J_{o+1}}\left| w^{\hat \pi^{(o)}_l, \nu}(S_{i, t}, A_{i, t})( R_{i, t} +\gamma \sum_{a'\in \calA}\hat \pi^{(o)}_l(a' | S_{i, t+1})Q^{\hat \pi^{(o)}_l}(S_{i, t+1}, a') -\right.\\
\left.Q^{\hat \pi^{(o)}_l}(S_{i, t}, A_{i, t}))/\sigma_{o+1}(\hat \pi^{(o)}_l)\right| = o_p(1).
\end{align*}
Next, we aim to show that
\begin{align} \label{eqn: consistency of sigma}
&\frac{O}{nT(O-1)}\left|(\sum_{o = 1}^{O-1}\sum_{(i, t) \in J_{o+1}}  \{w^{\hat \pi^{(o)}_l, \nu}(S_{i, t}, A_{i, t})( R_{i, t} \right.\\
&\left.+\gamma \sum_{a'\in \calA}\hat \pi^{(o)}_l(a' | S_{i, t+1})Q^{\hat \pi^{(o)}_l}(S_{i, t+1}, a') -Q^{\hat \pi^{(o)}_l}(S_{i, t}, A_{i, t}))\}^2/\sigma^2_{o+1}(\hat \pi^{(o)}_l) ) - 1\right| = o_p(1)\nonumber.
\end{align}
Notice that the left hand side of the above is bounded above by
\begin{align} 
&\frac{O}{nT}\max_{1 \leq o \leq (O-1)}\left|(\sum_{(i, t) \in J_{o+1}}  \{w^{\hat \pi^{(o)}_l, \nu}(S_{i, t}, A_{i, t})( R_{i, t} \right.\\
&\left.+\gamma \sum_{a'\in \calA}\hat \pi^{(o)}_l(a' | S_{i, t+1})Q^{\hat \pi^{(o)}_l}(S_{i, t+1}, a') -Q^{\hat \pi^{(o)}_l}(S_{i, t}, A_{i, t}))\}^2/\sigma^2_{o+1}(\hat \pi^{(o)}_l) ) - 1\right|.
\end{align}
Because, for each $1 \leq o \leq (O-1)$,
\begin{align} 
&\frac{O}{nT}\left\{(\sum_{(i, t) \in J_{o+1}}  \{w^{\hat \pi^{(o)}_l, \nu}(S_{i, t}, A_{i, t})( R_{i, t} \right.\\
&\left.+\gamma \sum_{a'\in \calA}\hat \pi^{(o)}_l(a' | S_{i, t+1})Q^{\hat \pi^{(o)}_l}(S_{i, t+1}, a') -Q^{\hat \pi^{(o)}_l}(S_{i, t}, A_{i, t}))\}^2 - \EE[\{w^{\hat \pi^{(o)}_l, \nu}(S, A)( R \right.\\
&\left.+\gamma \sum_{a'\in \calA}\hat \pi^{(o)}_l(a' | S')Q^{\hat \pi^{(o)}_l}(S', a') -Q^{\hat \pi^{(o)}_l}(S, A))\}^2]/\sigma^2_{o+1}(\hat \pi^{(o)}_l) )\right\},
\end{align}
forms a mean zero martingale, we apply Freedman’s inequality in \citep{freedman1975tail} with Assumptions to show it is bounded by $O_p(\sqrt{\frac{O}{nT}})$. Applying union bound shows \eqref{eqn: consistency of sigma} is $o_p(1)$ and furthermore consistency of $\hat \sigma(\hat \pi_l)$ in \eqref{eqn: estimated variance consistency} holds.
Then we apply the martingale central limit theorem to show
\begin{align*}
	\sqrt{\frac{nT}{O(O-1)}}\left(\sum_{o = 1}^{O-1}\frac{\phi(J_{o+1}, Q^{\hat \pi^{(o)}_l}, w^{\hat \pi^{(o)}_l}, \hat \pi^{(o)}_l)}{\sigma_{o+1}(\hat \pi^{(o)}_l)} \right) \Longrightarrow \calN(0, 1).
\end{align*}
The remaining is to show
$$
\sqrt{\frac{nT}{O(O-1)}}\left(\sum_{o = 1}^{O-1}\frac{\calV(\hat \pi^{(o)}_l) - \calV(\hat \pi_l)}{\hat \sigma_{o+1}(\hat \pi^{(o)}_l)}\right)
$$
is asymptotically negligible. Consider
\begin{align}
	&\EE\left|\calV(\hat \pi^{(o)}_l) - \calV(\hat \pi_l) \right| \\
	\leq & \EE\left|\calV(\hat \pi^{(o)}_l) - \calV( \pi^{\ast}_l) \right| +  \EE\left|\calV(\hat \pi_l) - \calV( \pi^{\ast}_l) \right| \\
	\leq & (nTo)^{-\kappa} O^{\kappa} +  (nT)^{-\kappa},
\end{align}
where we use Assumption for the last inequality. Summarizing together, we can show that 
\begin{align*}
	& \sqrt{\frac{nT}{O(O-1)}}\EE\left|\sum_{o = 1}^{O-1}\calV(\hat \pi^{(o)}_l) - \calV(\hat \pi_l)\right| \\
	\leq &\sqrt{\frac{nT}{O(O-1)}} \sum_{o = 1}^{O-1} (nTo)^{-\kappa} O^{\kappa} +  \sqrt{\frac{nT(O-1)}{O}}(nT)^{-\kappa}\\
	\leq &\sqrt{\frac{nTO^2}{O(O-1)}} \sum_{o = 1}^{O-1} (nT)^{-\kappa} +  \sqrt{\frac{nT(O-1)}{O}}(nT)^{-\kappa}\\
	=&o(1),
\end{align*}
where we obtain the second inequality by that $\sum_{o=1}^{O-1}o^{-\kappa} \leq 1 + \int_{1}^{O}o^{-\kappa}do \lesssim O^{1 - \kappa}$. In the last inequality, we use $\kappa > 1$ in Assumption. Then Markov inequality gives that 
$$
\sqrt{\frac{nT}{O(O-1)}}\left(\sum_{o = 1}^{O-1}\calV(\hat \pi^{(o)}_l) - \calV(\hat \pi_l)\right) = o_p(1).
$$
Moreover, by Assumption that $\inf_{1 \leq o \leq O-1}\hat \sigma_{o+1}(\hat \pi^{(o)}_l) \geq c$ for some constant $c>0$, we can further show that
$$
\sqrt{\frac{nT}{O(O-1)}}\left(\sum_{o = 1}^{O-1}\frac{\calV(\hat \pi^{(o)}_l) - \calV(\hat \pi_l)}{\hat \sigma_{o+1}(\hat \pi^{(o)}_l)}\right) = o_p(1),
$$
which completes our proof.

\noindent
\textbf{Proof of Corollary}
Denote the sets $E_l = \{ | \calV(\hat \pi_l) -\hat \calV(\hat \pi_l)   |\leq \hat u(l) \} $, $l=1,\ldots,L$, where $\hat u(l) = z_{\alpha / 2}\sqrt{O/(nT(O-1))}\,\hat \sigma(l)$.
Note that $\lim \inf_{nT\rightarrow \infty} \Pr( \cap_{j=1}^{L} E_j) \geq 1-L \alpha$ and
   \begin{align*}
  & \Pr (\calV(\hat \pi_{\hat l}) \geq \max_{1\leq l \leq L} \calV(\hat \pi_l) - 2 \hat u(l)  )\\
   = & \Pr (\calV(\hat \pi_{\hat l}) - \hat \calV(\hat \pi_{\hat l}) + \hat \calV(\hat \pi_{\hat l}) \geq \max_{1\leq l \leq L} \calV(\hat \pi_l) - \hat \calV(\hat \pi_{l}) - 2 \hat u(l)  + \hat \calV(\hat \pi_{l}) )\\
   \geq & \Pr (\calV(\hat \pi_{\hat l}) - \hat \calV(\hat \pi_{\hat l}) + \hat \calV(\hat \pi_{\hat l}) \geq \max_{1\leq l \leq L} \calV(\hat \pi_l) - \hat \calV(\hat \pi_{l}) - 2 \hat u(l)  + \hat \calV(\hat \pi_{l}) |\cap_{j=1}^{L} E_j ) \Pr(\cap_{j=1}^{L} E_j)\\
   \geq &  \Pr ( \hat \calV(\hat \pi_{\hat l}) -\hat u(\hat l) \geq \max_{1\leq l \leq L}  \hat \calV(\hat \pi_{l}) - \hat u(l)  |\cap_{j=1}^{L} E_j )   \Pr(\cap_{j=1}^{L} E_j) \\
   = & \Pr(\cap_{j=1}^{L} E_j),
   \end{align*}
   where the last inequality holds because given the event $\cap_{j=1}^{L} E_j$, one has $- \hat u (\hat l) \leq \calV(\hat \pi_{\hat l}) - \hat \calV(\hat \pi_{\hat l})$ and $\calV(\hat \pi_{l}) - \hat \calV(\hat \pi_{l}) \leq \hat u (l)$ for any $l$.
This completes the proof by taking $\lim\inf$ on both sides. 

\noindent
\textbf{Proof of Theorem on Bias}
To show the results in Theorem, it can be seen that
	\begin{align*}
	\left|\frac{\sqrt{nT(O-1)/O}\left(\hat \calV(\hat \pi_l) - \calV( \pi^\ast)\right)}{\hat \sigma(l)}\right| & \leq \left|\sqrt{\frac{nT}{O(O-1)}}\left(\sum_{o = 1}^{O-1}\frac{\hat \calV_{\calD_{o+1}}(\hat \pi^{(o)}_l) - \calV(\hat \pi^{(o)}_l)}{\hat \sigma_{o+1}(\hat \pi^{(o)}_l)}\right)\right| \\
	& + \sqrt{\frac{nT}{O(O-1)}}\left(\sum_{o = 1}^{O-1}\frac{\calV(\hat \pi^{(o)}_l) -  \calV( \pi^\ast)}{\hat \sigma_{o+1}(\hat \pi^{(o)}_l)}\right)\\
	& \leq \underbrace{\left|\sqrt{\frac{nT}{O(O-1)}}\left(\sum_{o = 1}^{O-1}\frac{\hat \calV_{\calD_{o+1}}(\hat \pi^{(o)}_l) - \calV(\hat \pi^{(o)}_l)}{\hat \sigma_{o+1}(\hat \pi^{(o)}_l)}\right)\right|}_{(I)} \\
	& + B(l)\sqrt{\frac{nT}{O(O-1)}}\left(\sum_{o = 1}^{O-1}\frac{1}{\hat \sigma_{o+1}(\hat \pi^{(o)}_l)}\right).
	\end{align*}
	Then by results in the proof of Theorem, we can show that
	\begin{align}%\label{eqn: estimated variance consistency}
	\underset{nT \rightarrow \infty}{\lim} \Pr((I) > z_{\alpha/2}) = \alpha.
	\end{align}
	This implies that
		\begin{align}
		&\underset{nT \rightarrow \infty}{\liminf} \Pr\left(|\calV(\pi^\ast) - \hat \calV(\hat \pi_l) |\leq z_{\alpha / 2}\sqrt{O/nT(O-1)}\hat \sigma(l) + B(l)\right) \\
		\geq & \underset{nT \rightarrow \infty}{\lim} \Pr((I) \leq z_{\alpha/2}) = 1-\alpha,
	\end{align}
	which concludes our proof.
	
\textbf{Proof of Corollary}: We mainly show the proof of the second claim in the corollary, based on which the first claim can be readily seen. Define an event $E$ on which, for all $1\leq l \leq L$, $|\calV(\hat \pi_l) - \hat \calV(\hat \pi_l)| \leq c(\delta)\log(L)\hat \sigma(l)/\sqrt{nT}$ and $|\calV(\pi^\ast) - \hat \calV(\hat \pi_l) |\leq z_{\alpha / (2L)}\sqrt{O/nT(O-1)} \hat \sigma(l) + B(l)$. Based on the assumption given in Corollary  and Theorem , we have $\underset{nT \rightarrow \infty}{\liminf}P(E) \geq 1 - \delta - \alpha$. In the following, we suppose event $E$ holds.

Inspired by the proofs of Corollary 1 in \citet{mathe2006lepskii} and Theorem 3 of \citet{su2020adaptive}, we define $\tilde{l} = \max \{l: B(l) \leq u_1(l) + u_2(l) \}$, where $u_1(l) = z_{\alpha/(2L)}\sqrt{O/nT(O-1)} \hat \sigma(l)$ and $u_2(l) = c(\delta)\log(L)\hat \sigma(l)/\sqrt{nT}$. By Assumption , for $l \leq \tilde l$, 
$$
B(l) \leq B(\tilde l) \leq u_1(\tilde l) \leq u_1(l),$$
which further implies that for any $l \leq \tilde l$,
$$
|\hat \calV(\hat \pi_l) - \calV(\pi^\ast)| \leq B(l) + u_1(l) \leq 2 u_1(l).
$$
Then $\calV(\pi^\ast) \in I(l)$ based on the construction of $I(l)$ for all $l \leq \tilde l$. In addition, we have for $l \leq \tilde l$
\begin{align}\label{eqn 1}
    |\calV(\hat \pi_l) - \calV(\pi^\ast)| \leq 2 u_1(l) + u_2(l),
\end{align}
by triangle inequality and event $E$.
Since $I(l)$ share at least one common element for $1 \leq l \leq \tilde l$, we have $\hat{i} \geq \tilde l$. Moreover, there must exist an element $x$ such that $x \in I(\tilde l) \cap I(\hat i)$, where $|\hat \calV(\hat \pi_{\tilde l})- x| \leq u_1(\tilde l)$ and $|\hat \calV(\hat \pi_{\hat i})- x| \leq u_1(\hat i)$. This indicates that
\begin{align}\label{eqn 2}
    |\hat \calV(\hat \pi_{\hat i})- \calV(\pi^\ast)| &\leq |\hat \calV(\hat \pi_{\hat i})- x| + |\hat \calV(\hat \pi_{\tilde l})- x| + |\hat \calV(\hat \pi_{\tilde l})- \calV(\pi^\ast)|\\
    & \leq u_1(\hat i) + 2u_1(\tilde l) \leq 3 u_1(\tilde l),
\end{align}
by again triangle inequality and Assumption , and
\begin{align}\label{eqn 3}
    |\calV(\hat \pi_{\hat i})- \calV(\pi^\ast)| &\leq u_2(\hat i)+ 3 u_1(\tilde l) \leq u_2(\tilde l)+ 3 u_1(\tilde l),
\end{align}
by event $E$ and Assumption . Define $l^\ast = \arg\min_{l} \{B(l) + u_1(l) + u_2(l)\}$. Then, following a proof similar to that of \citet{su2020adaptive}, we consider two cases:

\textbf{Case 1:} If $l^\ast \leq \tilde l$, then we have
$$
u_2(\tilde l) + B(\tilde l) + u_1(\tilde l) \leq 2u_1(l^\ast) + u_2(l^\ast) \leq 2u_1(l^\ast) + 2 B(l^\ast) + u_2(l^\ast),
$$
where we use Assumption .

\textbf{Case 2:} If $l^\ast > \tilde l$, then we have
$$
\zeta (u_2(\tilde l) + u_1(\tilde l)) \leq (u_2(\tilde l+1) + u_1(\tilde l+1)) \leq B(\tilde l+1) \leq B(l^\ast),
$$
where we use Assumption . This implies that
$$
u_2(\tilde l) + u_1(\tilde l) + B(\tilde l) \leq (1+ 1/\zeta)B(l^\ast).
$$
Combining two cases, we can show that
$$
u_2(\tilde l) + u_1(\tilde l) + B(\tilde l) \leq (1+ 1/\zeta)(B(l^\ast)+u_1(l^\ast) + u_2(l^\ast)),
$$
as $\zeta < 1$. Together with \eqref{eqn 3}, we have
\begin{align}\label{eqn 31}
    |\calV(\hat \pi_{\hat i})- \calV(\pi^\ast)| &\leq u_2(\hat i)+ 3 u_1(\tilde l) \leq 3(1+ 1/\zeta)(B(l^\ast)+u_1(l^\ast) + u_2(l^\ast)),
\end{align}
which concludes our proof.



\section{More Details on DQN Environments}
\label{apendix:dqn}




We introduce our deployed DQN environments in this section, which include four environments with discrete actions ($\mathbf{E}_1$ to $\mathbf{E}_4$) and two environments ($\mathbf{E}_5$ to $\mathbf{E}_6$) with continuous actions. These environments cover a wide range of applications, including tabular learning ($\mathbf{E}_1$), navigation to a target object in a geometrical space ($\mathbf{E}_2$), digital gaming ($\mathbf{E}_3$ to $\mathbf{E}_4$), and continuous control ($\mathbf{E}_5$ to $\mathbf{E}_6$).


\begin{table}[ht!]
\begin{minipage}[t]{.47\textwidth}
\begin{center}
   \includegraphics[width=0.80\linewidth]{latex/Figure/index_env.png}
\end{center}
\vspace{-0.5cm}
    \captionof{figure}{Policy selection using top-k ranking regret score in $\mathbf{E}_1$ (Frozen Lake).} 
\label{fig:figure:env1}
\end{minipage}
\hfill
\begin{minipage}[t]{.47\textwidth}
% \vspace{-20.5mm}
\begin{center}
   \includegraphics[width=0.80\linewidth]{latex/Figure/pre_env_1.png}
\end{center}
\vspace{-0.5cm}
   \captionof{figure}{Policy selection using top-k ranking precision in $\mathbf{E}_1$ (Frozen Lake).}
\label{pre:figure:env1}
\end{minipage}

\end{table}




\textbf{$\mathbf{E}_1$: Frozen Lake:} The Frozen Lake is a maze environment in which an agent must walk from a starting point (S) to a goal point without falling into a hole (H). We use \emph{FrozenLake-v0} from OpenAI Gym~\citep{brockman2016openai}. We provide top-5 regret and precision results in Figures~\ref{fig:figure:env1} and~\ref{pre:figure:env1}.

\textbf{$\mathbf{E}_2$: Banana Collector:} The Banana Collector is a popular 3D-graphical navigation environment that comprises discrete actions and states, provided as an open-source DQN benchmark from Unity\footnote{\url{https://www.youtube.com/watch?v=heVMs3t9qSk}} ML-Agents v0.3~\citep{juliani2018unity}. The DRL agent controls an automatic vehicle with 37 dimensions of state observations, including velocity and ray-based perception information about objects around the agent. The targeted reward is $12.0$ points, obtained by collecting correct yellow bananas ($+1$) and avoiding purple bananas ($-1$) from a first-person point of view as shown in Fig(b). We provide the related top-5 regret and precision results in Figures~\ref{fig:figure:env2} and~\ref{pre:figure:env2}.


% Different from MuJoCo simulator, the Unity environment is open resource. \iclr{We provide the related top-5 regret performance results shown as in Figure ~\ref{fig:figure:env6}. }


% The Banana collector shown in Figure \ref{fig:new:env} (a) is one of the Unity 3D baseline~\citep{juliani2018unity}. Different from the MuJoCo simulators with continuous actions, the Banana collector is controlled by four ``discrete'' actions corresponding to moving directions. The targeted reward is $12.0$ points by accessing correct bananas ($+1$). The state-space has 37 dimensions included velocity and a ray-based perception of objects around the agent.


\begin{table}[ht!]
\begin{minipage}[t]{.47\textwidth}
\begin{center}
   \includegraphics[width=0.80\linewidth]{latex/Figure/index_env2.png}
\end{center}
\vspace{-0.5cm}
    \captionof{figure}{Policy selection using top-k ranking regret score in $\mathbf{E}_2$ (Banana Collector).} 
\label{fig:figure:env2}
\end{minipage}
\hfill
\begin{minipage}[t]{.47\textwidth}
% \vspace{-20.5mm}
\begin{center}
   \includegraphics[width=0.80\linewidth]{latex/Figure/pre_env_2.png}
\end{center}
\vspace{-0.5cm}
   \captionof{figure}{Policy selection using top-k ranking precision in $\mathbf{E}_2$ (Banana Collector).} 
\label{pre:figure:env2}
\end{minipage}

\end{table}

\begin{table}[ht!]
\begin{minipage}[t]{.47\textwidth}
\begin{center}
   \includegraphics[width=0.80\linewidth]{latex/Figure/index_env3.png}
\end{center}
\vspace{-0.5cm}
    \captionof{figure}{Policy selection using top-k ranking regret score in $\mathbf{E}_3$ (Pong).}  
\label{fig:figure:env3}
\end{minipage}
\hfill
\begin{minipage}[t]{.47\textwidth}
% \vspace{-20.5mm}
\begin{center}
   \includegraphics[width=0.80\linewidth]{latex/Figure/pre_env_3.png}
\end{center}
\vspace{-0.5cm}
   \captionof{figure}{Policy selection using top-k ranking precision in $\mathbf{E}_3$ (Pong).} 
\label{pre:figure:env3}
\end{minipage}
% 
\end{table}

\textbf{$\mathbf{E}_3$: Pong:} Pong is one Atari game environment from OpenAI Gym~\citep{brockman2016openai} as shown in (c). We provide its top-5 regret and precision results shown in Figure \ref{fig:figure:env3} and \ref{pre:figure:env3}. 


\begin{table}[ht!]
\begin{minipage}[t]{.47\textwidth}
\begin{center}
   \includegraphics[width=0.80\linewidth]{latex/Figure/index_env4.png}
\end{center}
\vspace{-0.5cm}
    \captionof{figure}{Policy selection using top-k ranking regret score  in $\mathbf{E}_4$ (Breakout).}  
\label{fig:figure:env4}
\end{minipage}
\hfill
\begin{minipage}[t]{.47\textwidth}
% \vspace{-20.5mm}
\begin{center}
   \includegraphics[width=0.80\linewidth]{latex/Figure/pre_env_4.png}
\end{center}
\vspace{-0.5cm}
   \captionof{figure}{Policy selection using top-k ranking precision in $\mathbf{E}_4$ (Breakout).} 
\label{pre:figure:env4}
\end{minipage}

\end{table}

\begin{table}[ht!]
\begin{minipage}[t]{.47\textwidth}
\begin{center}
   \includegraphics[width=0.80\linewidth]{latex/Figure/index_env5.png}
\end{center}
\vspace{-0.5cm}
    \captionof{figure}{Policy selection using top-k ranking regret score  in $\mathbf{E}_5$ (HalfCheetah-v1).}  
\label{fig:figure:env5}
\end{minipage}
\hfill
\begin{minipage}[t]{.47\textwidth}
% \vspace{-20.5mm}
\begin{center}
   \includegraphics[width=0.80\linewidth]{latex/Figure/pre_env_5.png}
\end{center}
\vspace{-0.5cm}
   \captionof{figure}{Policy selection using top-k ranking precision in $\mathbf{E}_5$ (HalfCheetah-v1).} 
\label{pre:figure:env5}
\end{minipage}

\end{table}

% \begin{figure}[!ht]
% \vspace{-0.2cm}
% \begin{center}
%   \includegraphics[width=0.6\linewidth]{latex/Figure/index_env6.png}
% \end{center}
%   \caption{\iclr{Policy selection using top-k ranking scores in $\mathbf{E}_6$ with different numbers of trajectory (Trajs). }} 
% \label{fig:figure:env6}
% \end{figure}

\begin{table}[ht!]
\begin{minipage}[t]{.47\textwidth}
\begin{center}
   \includegraphics[width=0.80\linewidth]{latex/Figure/index_env6.png}
\end{center}
\vspace{-0.5cm}
    \captionof{figure}{Policy selection using top-k ranking regret score  in $\mathbf{E}_6$ (Walker2d-v1).}  
\label{fig:figure:env6}
\end{minipage}
\hfill
\begin{minipage}[t]{.47\textwidth}
% \vspace{-20.5mm}
\begin{center}
   \includegraphics[width=0.80\linewidth]{latex/Figure/pre_env_6.png}
\end{center}
\vspace{-0.5cm}
   \captionof{figure}{Policy selection using top-k ranking precision in $\mathbf{E}_6$ (Walker2d-v1).} 
\label{pre:figure:env6}
\end{minipage}

\end{table}

\textbf{$\mathbf{E}_4$: Breakout:} Breakout is one Atari game environment from OpenAI Gym~\citep{brockman2016openai} as shown in Fig~\ref{fig:figure:env4}(d). We provide the related top-5 regret and precision results shown in Figure \ref{fig:figure:env4} and \ref{pre:figure:env4}. 

\textbf{$\mathbf{E}_5$: HalfCheetah-v1:} HalfCheetah is a continuous action and state environment for controlling an agent whose movements are simulated by the MuJoCo simulator, as shown in (e). We provide the related top-5 regret and precision results in Figures~\ref{fig:figure:env5} and~\ref{pre:figure:env5}.

\textbf{$\mathbf{E}_6$: Walker2d-v1:} Walker2d-v1 is a continuous action and state environment for controlling an agent whose movements are simulated by the MuJoCo simulator, as shown in (f). We provide the related top-5 regret and precision results in Figures~\ref{fig:figure:env6} and~\ref{pre:figure:env6}.



\section{Hyper-Parameters Information}
\label{sub:hyper}
We select a total of $70$ DQN based models for each environment. We will open source the model and implementation for future studies. Table~\ref{tab:1}, Table~\ref{tab:2}, and Table~\ref{tab:3} summarize their hyper-parameter and setups. In addition, Figure~\ref{fig:alpha} and Figure~\ref{fig:O} provide ablation studies on different scales of $\alpha$ and $O$ selection in PMS experiments for the deployed DRL navigation task ($\mathbf{E}_2$). From the experimental results, a more pessimistic $\alpha$ (e.g., 0.001) is associated with a slightly better attained top-5 regret. Meanwhile, the selection of $O$ does not produce much different performance on selected policies but slightly affects the range of the selected policies.


\begin{table}[ht!]
    \caption{Hyper-parameters information for DQN models used in $\mathbf{E}_1$ to $\mathbf{E}_2$.}
    \label{tab:1}
    \centering
   \begin{tabular}{ll}
\hline Hyper-parameters & Values \\
\hline Hidden layers & $\{1,~2\}$ \\
Hidden units & $\{16,~32,~64,~128\}$ \\
Learning rate & $\{1\times 10^{-3},~5\times 10^{-4}\}$ \\
DQN training iterations & $\{100,~500,~1k,~2k\}$ \\
Batch size & $\{64\}$ \\
\hline
\end{tabular}
\end{table}

\begin{table}[ht!]
    \caption{Hyper-parameters information for DQN models used in $\mathbf{E}_3$ to $\mathbf{E}_4$.}
    \label{tab:2}
    \centering
   \begin{tabular}{ll}
\hline Hyper-parameters & Values \\
\hline 
Convolutional layers & $\{~2,3\}$ \\
Convolutional units & $\{16,~32\}$ \\
Hidden layers & $\{~2,3\}$ \\
Hidden units & $\{64,~256,~512\}$ \\
Learning rate & $\{1\times 10^{-3},~5\times 10^{-4}\}$ \\
DQN training iterations & $\{4M,~4.5M,~5M\}$ \\
Batch size & $\{64\}$ \\
\hline
\end{tabular}
\end{table}

\begin{table}[ht!]
\caption{Hyper-parameters information for double DQN (DDQN) models~\citep{van2016deep} with a prioritized replay~\citep{schaul2015prioritized} used in $\mathbf{E}_5$ to $\mathbf{E}_6$.}
    \label{tab:3}
    \centering
   \begin{tabular}{ll}
\hline Hyper-parameters & Values \\
\hline Hidden layers & $\{4,~5,~6\}$ \\
Hidden units & $\{64,~128,~256,~512\}$ \\
Learning rate & $\{1\times 10^{-3},~5\times 10^{-4}\}$ \\
DDQN training frames & $\{40M,~45M,~50M\}$ \\
Batch size & $\{256\}$ \\
Buffer size & $\{10^{6}\}$ \\
Updated target & $\{1000\}$ \\
\hline
\end{tabular}
\end{table}

\begin{table}[ht!]
\begin{minipage}[t]{.47\textwidth}
\begin{center}
   \includegraphics[width=0.80\linewidth]{latex/Figure/alpha_diff.pdf}
\end{center}
\vspace{-0.5cm}
    \captionof{figure}{Different $\alpha$ for PMS selection.}  
\label{fig:alpha}
\end{minipage}
\hfill
\begin{minipage}[t]{.47\textwidth}
% \vspace{-20.5mm}
\begin{center}
   \includegraphics[width=0.80\linewidth]{latex/Figure/differen_O.pdf}
\end{center}
\vspace{-0.5cm}
   \captionof{figure}{Different $O$ for PMS selection.} 
\label{fig:O}
\end{minipage}
%\vspace{-6mm}
\end{table}


% \begin{figure}[!ht]
% \vspace{-0.2cm}
% \begin{center}
%   \includegraphics[width=0.6\linewidth]{latex/Figure/alpha_diff.pdf}
% \end{center}
%   \caption{Different $\alpha$. } 
% \label{fig:figure:alpha}
% \end{figure}

% \begin{figure}[!ht]
% \vspace{-0.2cm}
% \begin{center}
%   \includegraphics[width=0.6\linewidth]{latex/Figure/differen_O.pdf}
% \end{center}
%   \caption{Different $O$. } 
% \label{fig:figure:O}
% \end{figure}



\section{Broader Impact}
There are also some limitations of the proposed PMS as one of the preliminary attempts on model selection for offline reinforcement learning. While the benchmark environments (excluding Atari games) rely on simulated environments to collect the true policy~\citep{barth2018distributed, siegel2019keep}, more real-world-based environments could be customized and studied in future work. For example, the experimental setup needs to be carefully controlled in clinical settings~\citep{tang2021model} or in resilience-oriented~\citep{yang2021causal} reinforcement learning.
\bibliography{uai2023-template}

\end{document}
