\documentclass[accepted]{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage{microtype}
\usepackage{graphicx}
\usepackage{color}
\usepackage{subcaption}
\usepackage{wrapfig}
\usepackage{float}
\usepackage{enumitem}
\usepackage[font=small,labelfont=bf]{caption}
\usepackage{algorithmic}
\usepackage{algorithm}
\usepackage[algo2e]{algorithm2e} 
\usepackage{mathrsfs}
\usepackage{dsfont}
\usepackage{enumerate}
\usepackage{amssymb}
\usepackage{amsfonts,amsmath}
\usepackage{wrapfig}
\usepackage{multirow}
\newtheorem{theorem}{Theorem}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
% \theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}{Assumption}
% \newtheorem{corollary}{Corollary}
\newtheorem{remark}{Remark}
% \newcommand{\theHalgorithm}{\arabic{algorithm}}
\input{math_qi.tex}
\usepackage{enumitem}
\usepackage{hyperref}
% For theorems and such
% \usepackage{amsmath}
% \usepackage{amssymb}
% \usepackage{amsthm}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example
\begin{document}
\title{Pessimistic Model Selection for Offline Deep Reinforcement Learning}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<huckiyang@gatech.edu>}{Chao-Han Huck Yang$^*$}{}}
\author[2]{\href{mailto:<qizhengling@email.gwu.edu>}{Zhengling Qi$^*$}{}}
\author[3]{Yifan Cui}
\author[4]{Pin-Yu Chen}

% Add affiliations after the authors
\affil[1]{%
    Georgia Institute of Technology\\
    USA\quad $^2$George Washington University, USA
}
\affil[3]{%
    Zhejiang University, China \quad $^4$IBM Research AI, USA
}
% \affil[3]{%
%     Another Affiliation\\
%     Address\\
%     …
%   }
  
  

\maketitle

\begin{abstract}
Deep Reinforcement Learning (DRL) has demonstrated great potential in solving sequential decision-making problems in many applications. Despite its promising performance, practical gaps exist when deploying DRL in real-world scenarios. One main barrier is the over-fitting issue that leads to poor generalizability of the policy learned by DRL.
In particular, for offline DRL with observational data, model selection is a challenging task as there is no ground truth available for performance validation, in contrast with the online setting with simulated environments. In this work, we propose a pessimistic model selection (PMS) approach for offline DRL with a theoretical guarantee, which features a provably effective framework for finding the best policy among a set of candidate models. Two refined approaches are also proposed to address the potential bias of DRL models in identifying the optimal policy. Numerical studies demonstrate the superior performance of our approach over existing methods.
\end{abstract}

\section{Introduction}
The success of deep reinforcement learning~\citep{mnih2013playing, henderson2018deep} (DRL) often relies on extensive training data and considerable effort to select effective neural architectures. Deploying online simulation to learn useful representations for DRL is not always realistic and feasible, especially in some high-stakes environments, such as automatic navigation~\citep{kahn2018self, hase2020ultrasound}, dialogue learning~\citep{jaques2020human}, and clinical applications~\citep{tang2020clinician}. \emph{Offline reinforcement learning} (OffRL)~\citep{singh1996reinforcement,levine2020offline, agarwal2020optimistic} has prompted strong interest~\citep{paine2020hyperparameter, kidambi2020morel} to empower DRL toward problem-solving involving notable costs and risks. The idea of OffRL is to train DRL models with only logged data and recorded trajectories. However, with given observational data, designing a successful neural architecture in OffRL is often expensive~\citep{levine2020offline}, requiring intensive experiments, time, and computing resources.

%\iclr{Meanwhile, automated machine learning~\citep{hutter2019automated} (AutoML) shows many advantages to accelerate the development cycle for given data and desired tasks. AutoML has been used to provide simpler solutions, faster creation of those solutions, and specific models and hyperparameters that perform equally or better than hand-designed architectures from domain experts. AutoML demonstrates many competitive solutions, such as neural architectures search~\citep{zoph2016neural}, hyperparameter optimization~\citep{bergstra2011algorithms, maclaurin2015gradient}, and model selection~\citep{farahmand2011model, fard2010pac, jiang2015abstraction,xie2021batch}, for images classification, language modeling, and RL respectively.}

Unlike most aforementioned applications with online interaction, \emph{offline} tasks for reinforcement learning often face the challenges of insufficient observational data from offline collection to construct a universal approximated model for fully capturing the temporal dynamics. Therefore, relatively few attempts in the literature have been presented for providing a provably effective pipeline to automate the development process for model selection and neural architecture search in OffRL settings. Here, model selection refers to selecting the best model (e.g., the policy learned by a trained neural network) among a set of candidate models (e.g., different neural network hyperparameters).

\begin{figure}[t]
\centering
	\begin{subfigure}{0.22\textwidth} % width of left subfigure
	    \centering
	    \includegraphics[width=\textwidth]{latex/Figure/fig1_pms.pdf}
	   % \caption{Cartpole$^{adversarial}_{vector}$.} % subcaption
	\end{subfigure}
	\quad
	\begin{subfigure}{0.22\textwidth} % width of right subfigure
	    \centering
		\includegraphics[width=\textwidth]{latex/Figure/fig1_wis.pdf}
% 		\caption{Banana$^{adversarial}$. } % subcaption
	\end{subfigure}
	\quad
	\begin{subfigure}{0.22\textwidth} % width of right subfigure
	    \centering
		\includegraphics[width=\textwidth]{latex/Figure/fig1_Am.pdf}
%  		\caption{ } % subcaption
	\end{subfigure}
	\quad
	\begin{subfigure}{0.22\textwidth} % width of right subfigure
	    \centering
		\includegraphics[width=\textwidth]{latex/Figure/fig1_fqe.pdf}
%  		\caption{ } % subcaption
	\end{subfigure}
	%\vspace{-3mm}
	\caption{Comparisons of model selection algorithms for offline deep Q-network (DQN) learning: (a) proposed pessimistic model selection (PMS); (b) weighted importance sampling (WIS)~\citep{gottesman2018evaluating}; (c) approximate model (AM)~\citep{voloshin2019empirical}; (d) fitted Q evaluation (FQE)~\citep{le2019batch}. In this figure, the algorithms are trained and evaluated in a navigation task ($\mathbf{E}_2$) discussed in Section~\ref{sec:exp:7} and Appendix. Vertical axis shows the offline policy evaluation (OPE) value. Higher Pearson's correlation coefficient $\rho$ means better performance.} % caption for whole figure
	\label{fig:1:intro}
	%\vspace{-4mm}
\end{figure}

In this work, we propose a novel model selection approach to automate the OffRL development process, which provides an evaluation mechanism to identify the best DRL model given offline data. Our method utilizes statistical inference to provide uncertainty quantification on the ``optimal'' value functions trained by different DRL models, based on which a pessimistic rule is incorporated to select the best model/policy. In addition, two refined approaches are further proposed to address the possible biases of DRL models in identifying the optimal policy. In this work, we mainly focus on deep Q-network (DQN)~\citep{mnih2013playing, mnih2015human} based architectures, while our proposed methods can be flexibly extended to other settings. Figure~\ref{fig:1:intro} demonstrates the superior performance of the proposed pessimistic model selection (PMS) method in identifying the best model among $70$ DRL models of different algorithms on one navigation task (see Appendix for details), compared with the model selection method by \citet{tang2021model},
which uses three offline policy evaluation (OPE) estimates for validation. Specifically, based on the derived confidence interval of the OPE value for each candidate model, the final selected model by our PMS method is the one that has the largest lower confidence limit, which is exactly the one with the largest true OPE value among all candidate models. In contrast, none of the three OPE estimates used for model selection by \cite{tang2021model} can identify the best model due to the inevitable overfitting issue during the validation procedure.

To close this section, we summarize the contributions of this work as follows:
\begin{itemize}[leftmargin=*]
    \item We propose a novel PMS framework, which targets finding the best policy from given candidate models (e.g., neural architecture, hyperparameters, etc.) with offline data for DQN learning. Unlike many existing methods, our approach essentially does not involve additional hyperparameter tuning except for two interpretable parameters.
    \item Leveraging asymptotic analysis in statistical inference, we provide uncertainty quantification on each candidate model, based on which our method can guarantee that the worst performance of the finally selected model is the best among all candidate models. See Corollary~\ref{cor: selection} for our key insight.
    \item To address potential biases of candidate models in identifying the optimal policy, two refined approaches are proposed, one of which can be shown to have regret bounded by the smallest error bound among all candidate models under some technical conditions (see Corollary~\ref{cor: refine selection}). To the best of our knowledge, this is the first model-selection method in offline DRL with such a guarantee.
    %We provide theoretical justifications based on uncertainty quantification to address value function estimation for worse-case analysis. \PY{(Add some sentence to highlight novelty or importance in theory)}
    \item The numerical results demonstrate that the proposed PMS shows  superior performance in six different DQN benchmark environments. %\PY{(Highlight some numbers to show improvement)}
\end{itemize}

% The desire of incorporating 

% Deep reinforcement learning (DRL) encompasses wide and salient applications in human society, such as robotics, automatic navigation, and clinical resource allocation. The success

% However, training a DRL model would be expensive and risky in these real-world applications.  



\section{Related Work}
% To learn a policy from interaction with the environment, Q-learning 
\textbf{Model Selection for Reinforcement Learning:} Model selection has been studied in online decision-making environments~\citep{fard2010pac, lee2014exact}. Searching for a nearly optimal online model is a critical topic for online bandit problems with limited information feedback. For linear contextual bandits, \cite{abbasi2011improved, chu2011contextual} aim to find the best worst-case bound when the optimal model class is given. For model-based reinforcement learning, \cite{pacchiano2020optimism} introduces advantages of using noise augmented Markov Decision Processes (MDP) to achieve a competitive regret bound to select an individual model with constraints for ensemble training. Recently, \cite{lee2021online} utilizes an online algorithm to select a low-complexity model based on a statistical test. However, most of the previous model selection approaches are focused on the online RL
setting.  Only a few works including \cite{farahmand2011model,paine2020hyperparameter,su2020adaptive,yang2020offline,kuzborskij2021confident,tang2021model,xie2021batch} study the offline setting. %\textcolor{blue}{See more discussion on the aforementioned work in Appendix \ref{sec: more related work}}. 
In particular, \citet{su2020adaptive,yang2020offline,kuzborskij2021confident} focus on model selection for the OPE problem. \citet{farahmand2011model,xie2021batch} propose to select the best model/policy based on minimizing the Bellman error, while the first approach requires additional tuning and the latter does not. \citet{paine2020hyperparameter,tang2021model} propose several criteria to perform model selection in OffRL and mainly focus on empirical studies. In this work, we provide one of the first model selection approaches based on statistical inference for RL tasks with offline data collection. 


\textbf{Offline-Policy Learning:} Training a DRL agent with offline data collection often relies on batch-wise optimization. Batch-Constrained deep Q-learning~\citep{fujimoto2019off} (BCQ) is considered one OffRL benchmark that uses a generative model to minimize the distance of selected actions to the batch-wise data with a perturbation model to maximize its value function. Other popular OffRL approaches, such as behavior regularized actor-critic (BRAC) \citep{wu2019behavior}, and random ensemble mixture~\citep{agarwal2020optimistic} (REM) (as an optimistic perspective on large datasets), have also been studied in RL Unplugged (RLU)~\citep{gulcehre2020rl} benchmark together with behavior cloning (BC)~\citep{bain1995framework, ross2010efficient}, DQN, and DQN with quantile regression (QR-DQN)~\citep{dabney2018distributional}. RLU suggests a naive approach based on human experience for offline policy selection, which requires independent modification with shared domain expertise (e.g., Atari environments) for tuning each baseline. Meanwhile, how to design a model selection algorithm for OffRL remains an open question. Motivated by the benefits and the challenges as mentioned earlier of the model selection for \emph{offline} DRL, we aim to develop a unified approach for model selection in offline DRL with theoretical guarantee and interpretable tuning parameters.

\section{Background and Notations}
Consider a time-homogeneous Markov decision process (MDP) characterized by a tuple $\calM = (\calS, \calA, p, r, \gamma)$, where $\calS$ is the state space, $\calA$ is the finite action space, $p$ is the transition kernel, i.e., $p(s' | s, a)$ is the probability mass (density) of transiting to $s'$ given current state-action $(s, a)$, $r$ is the reward function, i.e., $	\EE(R_{t}|S_{t}=s,A_{t}=a)=r(s,a)$ for $t \geq 0$, and $0 \leq \gamma <1$ is a discount factor. For ease of presentation, we assume $\calA$ and $\calS$ are both finite. But our method can also be applied in continuous cases. Under this MDP setting, it is sufficient to consider stationary Markovian policies for optimizing discounted sum of rewards \citep{puterman1994markov}. Denote $\pi$ as a stationary Markovian policy mapping from the state space $\calS$ into a probability distribution over the action space. For example, $\pi(a | s)$ denotes the probability of choosing action $a$ given the state value $s$. One essential goal of RL is to learn an optimal policy that maximizes the value function. Define $V^{\pi}(s)=\sum_{t=0}^{+\infty} \gamma^t \EE^{\pi}[R_t| S_0 = s]$ and then the optimal policy is defined as
 %\begin{align}%\label{optimal pi}
 $\pi^\ast \in \argmax_{\pi} \{\calV(\pi)\triangleq (1-\gamma)\sum_{s\in \calS} V^{\pi}(s) \nu(s)\}$,
 %\end{align}
 where $\nu$ denotes some reference distribution function over $\calS$. In addition, we denote the Q-function as  $Q^{\pi}(s,a)=\sum_{t=0}^{+\infty} \gamma^t \EE^{\pi} (R_{t}|A_{0}=a,S_{0}=s)$ for $s \in \calS$ and $a \in \calA$. In this work, we consider the OffRL setting. The observed data consist of $n$ trajectories, corresponding to $n$ independent and identically distributed copies of $\{(S_{t},A_{t},R_{t})\}_{t\ge 0}$. For any $i \in \{1,\cdots, n\}$, data collected from the $i$th trajectory can be summarized by $\{(S_{i,t},A_{i,t},R_{i,t},S_{i,t+1})\}_{0\le t< T}$, where $T$ denotes the termination time. We assume that the data are generated by some fixed stationary policy denoted by $b$. 
 
 Among many RL algorithms, we focus on Q-learning type of methods. The foundation is the optimal Bellman equation given below,
 {\small
 \begin{align}\label{eqn: optimal bellman}
 	Q^\ast(s, a) =  \EE[R_{t} + \gamma\max_{a' \in \calA} Q^\ast(S_{t+1}, a') \, | \, S_t = s, A_t = a]
 \end{align}
 }%
 where $Q^\ast$ is called the optimal Q-function, i.e., the Q-function under $\pi^\ast$. Among others, fitted Q-iteration (FQI) is one of the most popular RL algorithms \citep{ernst_tree-based_2005}. FQI leverages supervised learning techniques to iteratively solve the optimal Bellman equation \eqref{eqn: optimal bellman} and shows competitive performance in OffRL.
 
 To facilitate our model-selection algorithm, we introduce the discounted visitation probability, motivated by the marginal importance sampling estimator in \citep{liu2018breaking}. For any $t\ge 0$, let $p_t^{\pi}(s, a)$ denote the $t$-step visitation probability $\Pr^{\pi}(S_{t}=s, A_t = a)$ assuming the actions are selected according to $\pi$ at time $0,\cdots,t$. In particular, $p_0^\pi(s, a) = \nu(s)\pi(a | s)$. We define the discounted visitation probability function as $d^{\pi}(s, a)=(1-\gamma)\sum_{t\ge 0} \gamma^{t} p_t^{\pi}(s,a)$. To adjust the distribution from behavior policy to any target policy $\pi$, we use the discounted probability ratio function defined as
 \begin{equation}\label{eqn:condw}
 \omega^{\pi, \nu}(s,a)=\frac{d^{\pi}(s,a)}{\frac{1}{T}\sum_{t=0}^{T-1}p_t^b(s, a)},
 \end{equation}
 where $p_t^b(s, a)$ is the $t$-step visitation probability under the behavior policy $b$, i.e., $\Pr^b(S_t = s, A_t = a)$. The ratio function $\omega^{\pi, \nu}(s,a)$ is always assumed well defined, where $\nu$ is the distribution of the initial state in $d^\pi$. The estimation of ratio function is motivated by the observation that for every measurable function $f$ defined over $\calS \times \calA$, 
{ \small
\begin{align}\label{eq: Estimating Equation for ratio}
&\EE[\frac{1}{T}\sum_{t = 0}^{T-1}\omega^{\pi, \nu}(S_t, A_t)(f(S_t, A_t) \\
&~~~- \gamma \sum_{a' \in \calA}\pi(a'\mid S_{t+1})f(S_{t+1}, a'))] \nonumber \\
&= (1-\gamma)\EE_{S_0 \sim \nu}[ \sum_{a\in \calA} \pi(a\mid S_0)f(S_0, a)],\nonumber
\end{align}
}%
based on which several min-max estimation methods have been proposed, such as \citep{liu2018breaking,nachum2019dualdice,uehara2019minimax}. We refer to Lemma~1 of \citep{uehara2019minimax} for a formal proof of equation~\eqref{eq: Estimating Equation for ratio}.

Finally, because our proposed model selection algorithm relies on an efficient evaluation of any target policy using batch data, we introduce three types of model-free offline policy evaluation estimators in the existing RL literature. The first type is called the direct method, which estimates the Q-function and is based on the relationship that $\calV(\pi) = (1-\gamma)\sum_{s\in \calS, a\in \calA}\pi(a | s)Q^{\pi}(s, a)\nu(s)$. The second type is motivated by importance sampling \citep{precup2000eligibility}. Based on the definition of the ratio function, we can see $\calV(\pi) = \EE[\frac{1}{T}\sum_{t = 0}^{T-1}\omega^{\pi, \nu}(S_t, A_t)R_t]$, from which a plugin estimator can be constructed. The last type of OPE methods combines the first two types of methods and constructs a so-called doubly robust estimator \citep{kallus2019efficiently,Tang2020Doubly}. This estimator is motivated by the efficient influence function of $\calV(\pi)$ under a transition-sampling setting and the model that consists of the set of all observed data distributions given by arbitrarily
varying the initial, transition, reward, and behavior policy distributions, subject to certain minimal regularity and identifiability conditions \citep{kallus2019efficiently}, i.e.,
 { \small
 \begin{align}\label{eqn: eif}
 	&\frac{1}{T}\sum_{t=0}^{T-1}\omega^{\pi, \nu}(S_t, A_t)(R_{t} + \gamma \sum_{a \in \calA}\pi(a | S_{t+1})Q^\pi(S_{t+1}, a)- \nonumber\\
 	& Q^\pi(S_t, A_t)) + (1- \gamma)\EE_{S_0 \sim \nu}[\sum_{a \in \calA}\pi(a | S_0)Q^\pi(S_0, a)] - \calV(\pi).
 \end{align}
 }%
 A nice property of doubly robust estimators is that as long as either  the Q-function $Q^{\pi}$ or the ratio function $\omega^{\pi, \nu}$ can be consistently estimated, the final 
 estimator of $\calV(\pi)$ is consistent \citep{robins1994,jiang2015doubly,kallus2019efficiently,Tang2020Doubly}.
Furthermore, a doubly robust estimator based on \eqref{eqn: eif} can achieve semi-parametric efficiency under the conditions proposed by \citep{kallus2019efficiently}, even if nuisance parameters are estimated via black box models such as deep neural networks. Therefore, such an estimator is particularly suitable under the framework of DRL.
Our proposed algorithm will rely on this doubly robust type of OPE estimator.

\section{Pessimistic Model Selection (PMS) for Best Policy}
In this section, we discuss our pessimistic model selection approach. For the ease of presentation, we focus on the framework of (deep) Q-learning, where policy optimization is performed via estimating the optimal Q-function. While this has already covered a wide range of state-of-the-art RL algorithms such as FQI~\citep{ernst_tree-based_2005}, DQN~\citep{mnih2013playing} and QR-DQN~\citep{dabney2018distributional}, we remark that our method is not restricted to this class of algorithms. 

Suppose we have a total of $L$ candidate models for policy optimization, where each candidate model will output an estimated policy, say $\hat \pi_l$ for $1 \leq l \leq L$. Our goal is to select the best policy among $L$ policies during our training procedure. Note that these $L$ models can be different deep neural network architectures, hyper-parameters, and various classes of functions for approximating the optimal Q-function or policy, etc.

\subsection{Difficulties and Challenges}

Given a candidate $l$ among $L$ models, we can apply for example FQI using the whole batch data $\calD_n$ to learn an estimate of $Q^{\ast}$ as $\widehat Q_{l}$ and an estimated optimal policy $\hat \pi_l$ defined as
$
\hat \pi_l(a | s) \in \argmax_{a\in \calA} \widehat Q_{l}(s, a),
$
for every $s \in \calS$.
In order to select the final policy, one may use a naive greedy approach to choose some $\tilde{l}$ such that 
$
\tilde{l} \in \argmax_{l} \EE_{S_0 \sim \nu}[\sum_{a\in \calA}\hat \pi_l(a | S_0)\widehat Q_{l}(S_0, a)],
$
as our goal is to maximize $\calV(\pi)$.
However, using this criterion will lead to over-fitting. Specifically, due to the distributional mismatch between the behavior policy and target policies, which is regarded as a fundamental challenge in OffRL \citep{levine2020offline}, we may easily overestimate Q-function, especially when some state-action pairs are not sufficiently visited in the batch data. This issue becomes more serious when we apply max-operator during our policy optimization procedure. Such observations have already been noticed in recent works, such as \citep{kumar2019stabilizing,kumar2020conservative,paine2020hyperparameter,yu2020mopo,tang2021model,jin2021pessimism}. Therefore, it may be inappropriate to use this criterion for selecting the
best policy among $L$ models.





%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% APPENDIX
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
One may also use a cross-validation procedure to address the issue of over-fitting or overestimating the Q-function for model selection.
For example, one can use OPE approaches on the validation dataset to evaluate the performance of estimated policies from the training dataset (see \cite{tang2021model} for more details). However, since there is no ground truth for the value function of any policy, the OPE procedure on the validation dataset cannot avoid involving additional tuning on hyperparameters. Therefore, such a procedure may still incur a large variability due to the over-fitting issue. In addition, arbitrarily splitting the dataset for cross-validation and ignoring the Markov dependent structure will cause additional errors, which should be seriously taken care of.

\subsection{Sequential Model Selection}
In the following, we propose a pessimistic model selection algorithm for finding an optimal policy among $L$ candidate models. Our goal is to develop an approach to estimate the value function under each candidate model during our policy optimization procedure with theoretical guarantee. The proposed algorithm is motivated by recent developments in statistical inference of sequential decision making \citep{luedtke2016statistical,shi2020statistical}. The idea is to first estimate the optimal Q-function $Q^\ast$, the optimal policy $\pi^\ast$ and the resulting ratio function based on a chunk of data, and evaluate the performance of the estimated policy on the next chunk of data using previously estimated nuisance functions. Then, we combine the first two chunks of data, perform the same estimation procedure and evaluation on the next chunk of data. The framework of MDP provides a natural way of splitting the data. 

Specifically, denote the index of our batch dataset $\calD_n$ as $J_0 = \left\{(i, t): 1 \leq i \leq n,  0 \leq t < T \right\}$. We divide $J_0$ into $O$ number of non-overlapping subsets, denoted by $J_1, \cdots, J_O$ and the corresponding data subsets are denoted by $\calD_1, \cdots, \calD_O$. Without loss of generality, we assume these data subsets have equal size. We require that for any $1 \leq o_1 < o_2 \leq O$, any $(i_1, t_1) \in J_{o_1}$ and $(i_2, t_2) \in J_{o_2}$, either $i_2 \neq i_1$ or $t_1 < t_2$. For $1 \leq o \leq O$, denote the aggregate chunks of data as
$
\bar \calD_o = \left\{(S_{i, t}, A_{i, t}, R_{i, t}, S_{i, t+1}), (i, t) \in \bar J_o = J_1 \cup \cdots \cup J_o \right\}.	
$

We focus on FQI algorithm for illustrative purpose and it should be noticed that our algorithm can be applied to other RL algorithms. Starting from the first chunk of our batch data, at the $o$-th step ($o = 1, \cdots, O-1)$, for each candidate model $l = 1, \cdots, L$, we apply FQI on $\bar \calD_o$ to compute  $\widehat Q_l^{(o)}$ as an estimate of optimal Q-function and obtain $\hat \pi^{(o)}_l$ correspondingly such that $\hat \pi^{(o)}_l(a | s) \in \argmax_{a \in \calA} \widehat Q_l^{(o)}(s, a)$ for every $s \in \calS$. Additionally, we compute an estimate of ratio function $\omega^{\hat \pi^{(o)}_l, \nu}$ using $\bar \calD_{o}$ by many existing algorithms such as \cite{nachum2019dualdice}. Denote the resulting estimator as $\widehat \omega^{\hat \pi^{(o)}_l, \nu}$. The purpose of estimating this ratio function is to improve the efficiency and robustness of our value function estimation for each candidate model. Then, we compute the estimated value function of $\hat \pi^{(o)}_l$ on $\calD_{o+1}$ as
{\small
\begin{align}\label{eq: estimated value on next dataset}
	&\hat \calV_{\calD_{o+1}}(\hat \pi^{(o)}_l)\nonumber\\
	=& (1-\gamma)\EE_{S_0 \sim \nu}[\sum_{a_0\in \calA}\hat \pi^{(o)}_l(a_0 | S_0)\widehat Q_l^{(o)}(S_0, a_0)]\nonumber\\
	+& \EE_{\calD_{o+1}}[\widehat \omega^{\hat \pi^{(o)}_l, \nu}(S, A)( R+ \gamma \sum_{a'\in \calA}{\hat \pi^{(o)}_l}(a' | S')\widehat Q_l^{(o)}(S', a'))]\nonumber\\
	-&\EE_{\calD_{o+1}}[\widehat \omega^{\hat \pi^{(o)}_l, \nu}(S, A)\widehat Q_l^{(o)}(S, A)],
\end{align}
}%
where $\EE_{\calD_{o+1}}$ denotes the empirical average over the $(o+1)$-th chunk of the dataset and $(S, A, R, S')$ is one transition tuple in $\calD_{o+1}$. While one can aggregate $\hat \calV_{\calD_{o+1}}(\hat \pi^{(o)}_l)$ for $1\leq o \leq (O-1)$ to evaluate the performance of $L$ models, the uncertainty of these estimates due to the finite sample estimation should not be ignored. Therefore, in the following, we derive an uncertainty quantification of our estimated value function for each candidate model, for performing model selection. Based on equation~\eqref{eqn: eif} and conditioning on $\bar{\calD}_o$, the variance of $\hat \calV_{\calD_{o+1}}(\hat \pi^{(o)}_l)$ is
{\small
\begin{align}\label{eq: variance}
	&\sigma^2(\hat \pi^{(o)}_l)\nonumber \\
	=&\EE\left[\{\widehat \omega^{\hat \pi^{(o)}_l, \nu}(S, A)( R + \gamma \sum_{a'\in \calA}\hat \pi^{(o)}_l(a' | S')\widehat Q_l^{(o)}(S', a')\nonumber\right.\\
	&\left.-\widehat Q_l^{(o)}(S, A))\}^2\right],
\end{align}
}where $(S, A, R, S')$ is a transition tuple with $(S, A)$ following some stationary distribution. See Assumption~\ref{ass: stationary}.
Correspondingly we have an estimator defined as
{\small
\begin{align}\label{eq: estimated variance}
	&\hat \sigma^2_{o+1}(\hat \pi^{(o)}_l) \nonumber\\ =&\EE_{\calD_{o+1}}[\{\widehat \omega^{\hat \pi^{(o)}_l, \nu}(S, A)( R + \gamma \sum_{a'\in \calA}\hat \pi^{(o)}_l(a' | S')\widehat Q_l^{(o)}(S', a') \nonumber\\
	-&\widehat Q_l^{(o)}(S, A))\}^2].
\end{align}
}The estimation procedure stops once we have used all our offline data and denote the final estimated policy as $\hat \pi_l$ for each $l = 1, \cdots, L$. Notice that $\hat \pi_l = \hat \pi^{(O)}_l$.
Finally, we compute the weighted average of all the intermediate value functions as our final evaluation of the estimated policy $\hat \pi_l$, i.e.,
{\small
\begin{align}\label{weighted optimal value}
	\hat \calV(\hat \pi_l) = \left(\sum_{o = 1}^{O-1}\frac{1}{\hat \sigma_{o+1}(\hat \pi^{(o)}_l)}\right)^{-1}\left(\sum_{o = 1}^{O-1}\frac{\hat \calV_{\calD_{o+1}}(\hat \pi^{(o)}_l)}{\hat \sigma_{o+1}(\hat \pi^{(o)}_l)}\right).
\end{align}
}%
In Section \ref{sec: theory}, we show that under some technical conditions, the following asymptotic result holds:
{\small
\begin{align}\label{eqn: asymptotic normal}
	\frac{\sqrt{nT(O-1)/O}\left(\hat \calV(\hat \pi_l) - \calV(\hat \pi_l)\right)}{\hat \sigma(l)} \Longrightarrow \calN(0, 1),
\end{align}
}% 
where $\hat \sigma(l) = (O-1)(\sum_{o = 1}^{O-1}\{\hat \sigma_{o+1}(\hat \pi^{(o)}_l)\}^{-1})^{-1}$, $\Longrightarrow$ refers to weak convergence when either $n$ or $T$ goes to infinity, and $\calN(0, 1)$ refers to the standard normal distribution. Based on the asymptotic result in \eqref{eqn: asymptotic normal}, we can construct a confidence interval for the value function of each policy $\hat \pi_l$. Given a confidence level $\alpha$, for each $l$, we can compute $U(l) =  \hat \calV(\hat \pi_l) - z_{\alpha / 2}\sqrt{O/\{nT(O-1)\}}\,\hat \sigma(l)$,
where $z_{\alpha / 2}$ is the $(1-\frac{\alpha}{2})$-quantile of the standard normal distribution. Our final selected model is $\hat l \in \argmax_{1 \leq l \leq L} \, \, U(l)$. 

The use of $U(l)$ is motivated by the recently proposed pessimistic idea to address the overestimation issue of the value (or Q) function in the OffRL setting. See \cite{kumar2019stabilizing,kumar2020conservative,jin2021pessimism,xie2021bellman,uehara2021pessimistic,zanette2021provable} for details. The final output of our algorithm is $\hat \pi_{\hat l}$ and an outline of the proposed algorithm can be found in Algorithm~\ref{alg:policy learning}. As we can see, our algorithm is nearly tuning-free, which provides great flexibility in real-world applications.  The only two adjustable parameters are $O$ and $\alpha$ and they are directly interpretable. The size of $O$ balances the computational cost and the finite-sample accuracy of evaluating each candidate model. Specifically, we can indeed show that the variance of the estimated value function by our algorithm can achieve the semi-parametric efficiency bound, which is the best one can hope for. So in the asymptotic sense, the effect of $O$ is negligible. In the finite-sample setting, it is rational to assume the performance will be discounted by a factor $\sqrt{(O-1)/O}$. Therefore, if $O$ is large enough, $\sqrt{(O-1)/O}$ will have only a minor effect on the performance. See Theorem~\ref{thm: normal} for more details. However, using a large $O$ will result in a large computational cost. As a sacrifice for the nearly tuning-free algorithm, we need to apply OffRL algorithms $O$ times for each candidate model. The parameter $\alpha$ determines how pessimistic a lower bound on the performance of each policy we use to evaluate that policy. See Corollary~\ref{cor: selection} for more insights.
% %\vspace{-2mm}
\begin{algorithm}[t] %\SetAlgoNoLine
	\SetAlgoLined
	\caption{Pessimistic Model Selection (PMS) for OffRL} \label{alg:policy learning}
	\KwIn{Dataset $\calD_n$ and $L$ candidate models for estimating optimal Q-function and policy; We divide $\calD_n$ into non-overlapping subsets denoted by $\calD_1, \cdots, \calD_O$. We require that for any $1 \leq o_1 < o_2 \leq O$, any $(i_1, t_1) \in J_{o_1}$ and $(i_2, t_2) \in J_{o_2}$, either $i_2 \neq i_1$ or $t_1 \leq t_2$.}
	\For{$l = 1, \ldots, L$}{
	\For{$o = 1$ \textbf{to} $O-1$}
	{\ShowLn For the $l$-th model, construct the optimal $\widehat Q_l^{(o)}$ and $\hat \pi^{(o)}_l$ using the data subset $\bar \calD_{o}$.\\
		\ShowLn Compute $\widehat \omega^{\hat \pi^{(o)}_l, \nu}$ using $\bar \calD_{o}$ by \cite{nachum2019dualdice} and min-max solver for \eqref{eq: Estimating Equation for ratio}.\\
		\ShowLn Compute $\hat \calV_{\calD_{o+1}}(\hat \pi^{(o)}_l)$ and $\hat \sigma^2_{o+1}(l)$ using $ \calD_{o+1}$ given in \eqref{eq: estimated value on next dataset} and \eqref{eq: estimated variance} respectively.\\}
	\ShowLn For $l$-th model, we compute 
	$
	U(l) =  \hat \calV(\hat \pi_l) - z_{\alpha / 2}\sqrt{O/nT(O-1)}\hat \sigma(l),
	$ 
	where $\hat \calV(\hat \pi_l)$ and $\hat \sigma(l)$ are given in \eqref{weighted optimal value} and \eqref{eqn: asymptotic normal} respectively.
}
	\ShowLn Pick  $\hat l=\arg\max_{l} U(l)$ as the selected model and run the algorithm on full dataset to obtain $\hat \pi_{\hat l}$.
	
	\textbf{Return}  $\hat \pi_{\hat l}$.
\end{algorithm}
%\vspace{-0.3cm}

\section{Theoretical Results}\label{sec: theory}
In this section, we justify our asymptotic result given in \eqref{eqn: asymptotic normal}. We use $O_p$ to denote the stochastic boundedness. Before that, we make several technical assumptions:
%%\vspace{-0.2cm}
\begin{assumption}\label{ass: stationary}
	The stochastic process $\{A_t, S_t\}_{t \geq 0}$ is stationary with stationary distribution $p_\infty$.
\end{assumption}
% %\vspace{-0.6cm}
\begin{assumption}\label{ass: value consistent}
	For every $1 \leq l \leq L$ and $1 \leq o \leq O$, we have
	$
	\EE|\calV(\hat \pi^{(o)}_l) - \calV(\pi^\ast)|  \leq C_0 (nT/O)^{-\kappa},
	$
	for some constant $C_0$ and $\kappa > 1/2$.
\end{assumption}
% %\vspace{-0.2cm}
Assumption \ref{ass: stationary} is standard in the existing literature such as \citep{kallus2019efficiently}. Assumption \ref{ass: value consistent} is key to our asymptotic results. This assumption essentially states that all candidate models are good enough so that eventually their value functions will converge to that of the optimal one. This implies that there is no asymptotic bias in estimating the optimal policy. While this is reasonable thanks to the capability of deep neural networks, which have demonstrated their empirical success in many RL applications, such an assumption could still be strong. In Section~\ref{sec: refined}, we aim to relax this assumption and provide two remedies for addressing possibly biased estimated policies.  In addition, Assumption \ref{ass: value consistent} also requires that the convergence rates of value functions under estimated policies are fast enough. This has been shown to hold under the margin condition on $\pi^\ast$, see e.g., \citep{hu2021fast} for more details. 
% %\vspace{-0.3cm}
\begin{assumption}\label{ass: Q-function}
	For every $1 \leq l \leq L$ and $1 \leq o \leq O-1$, suppose $\EE_{(S,A)\sim p_{\infty}} |\widehat Q_l^{(o)}(S, A)-Q^{\hat \pi_l^{ (o)}}(S, A)|^2=O_p\{(nT/O)^{-2\kappa_1}\}$ for some constant $\kappa_1\ge 0$. In addition, $\widehat Q_l^{(o)}$ is uniformly bounded almost surely. 
\end{assumption}
% %\vspace{-0.6cm}
\begin{assumption}\label{ass: ratio}
	For every $1 \leq l \leq L$ and $1 \leq o \leq O-1$, suppose $ \EE_{(S,A) \sim p_{\infty}} |\widehat \omega^{\hat \pi^{(o)}_l, \nu}(S, A)-\omega^{\hat \pi^{(o)}_l, \nu}(S, A)|^2=O_p\{(nT/O)^{-2\kappa_2}\}$ for some constant $\kappa_2\ge 0$. In addition, both $\omega^{\hat \pi^{(o)}_l, \nu}$ and $\widehat \omega^{\hat \pi^{(o)}_l, \nu}$ are uniformly bounded above and below away from $0$ almost surely.
\end{assumption}
% %\vspace{-0.6cm}
\begin{assumption}\label{ass: sigma}
	For every $1 \leq l \leq L$ and $1 \leq o \leq O-1$, $\sigma^2(\hat \pi^{(o)}_l)$ and $\hat \sigma^2_{o+1}(\hat \pi^{(o)}_l)$ are bounded above and bounded below away from $0$ almost surely.
\end{assumption}
% %\vspace{-0.2cm}
Assumptions \ref{ass: Q-function} and \ref{ass: ratio} impose high-level conditions on two nuisance functions. Our theoretical results only require $\kappa_1 + \kappa_2 > 1/2$, which is a mild assumption. For example, if parametric models are considered for both the Q-function and the ratio function, then $\kappa_1 = \kappa_2 = 1/2$. If nonparametric models such as deep neural networks are considered for these two nuisance functions, then $1/4 < \kappa_{1}, \kappa_2 < 1/2$ can be obtained under some regularity conditions. See \cite{yang2020theoretical} and \cite{liao2020batch,uehara2021finite} for the convergence rates of Q-function and ratio function by non-parametric models respectively. In addition, Assumption \ref{ass: sigma} is a mild assumption, mainly for theoretical justification.
Then, we have the following main theorem as a foundation of our proposed algorithm.
% %\vspace{-2mm}
\begin{theorem}\label{thm: normal}
	Under Assumptions \ref{ass: stationary}-\ref{ass: sigma}, we have
	{\small
	\begin{align}\label{eqn: asymptotic normal in thm}
	(\sqrt{nT(O-1)/O}(\hat \calV(\hat \pi_l) - \calV(\hat \pi_l)))/\hat \sigma(l) \Longrightarrow \calN(0, 1).
	\end{align}
	}%
	%Furthermore, 
	%\begin{align}\label{eqn: asymptotic normal 2}
	%\frac{\sqrt{nT(O-1)/O}\left(\hat \calV(\hat \pi_l) - %\calV(\pi^\ast)\right)}{\hat \sigma(l)} \Longrightarrow \calN(0, 1).
	%\end{align}
\end{theorem}
% %\vspace{-2mm}
Theorem~\ref{thm: normal} provides an uncertainty quantification of each candidate model used in policy optimization. Such uncertainty quantification is essential in OffRL as data are often limited. We highlight the importance of such results in the Appendix. A result following from Theorem~\ref{thm: normal} validates the proposed Algorithm~\ref{alg:policy learning}:
% %\vspace{-2mm}
\begin{corollary}\label{cor: selection}
   $ \underset{{nT\rightarrow \infty}}{\liminf}\, \Pr (\calV(\hat \pi_{\hat l}) \geq \max_{1\leq l \leq L} \{\calV(\hat \pi_l) - 2z_{\alpha / 2}\sqrt{O/nT(O-1)}\hat \sigma(l)\}  ) \geq 1-L\alpha $  under Assumptions \ref{ass: stationary}-\ref{ass: sigma}.
\end{corollary}
% %\vspace{-2mm}
As can be seen clearly from Corollary~\ref{cor: selection} and the proposed PMS method, with a high probability (by letting $\alpha$ small), we consider the worst performance of each candidate model $\hat \pi_l$ in the sense of the lower confidence limit of the value function, and then select the best one among all models. 


\section{Two Refined Approaches}\label{sec: refined}
In this section, we relax Assumption \ref{ass: value consistent} by allowing possibly non-negligible bias in estimating the optimal policy and introduce two refined approaches for addressing this issue. Instead of imposing Assumption \ref{ass: value consistent}, we make an alternative assumption below.
% %\vspace{-0.3cm}
\begin{assumption}\label{ass: allow bias}
	For $1 \leq l \leq L$, there exists $B(l)$ such that $\max_{1 \leq o \leq (O-1)} |\calV(\hat \pi^{(o)}_l) - \calV(\pi^\ast)| \leq B(l)$ almost surely.
\end{assumption}
% %\vspace{-0.2cm}
Assumption \ref{ass: allow bias} is a very mild assumption. It essentially states that the biases for all our intermediate value function estimates are bounded by some constant, which is much weaker than Assumption \ref{ass: value consistent}. In this case, the asymptotic results in \eqref{eqn: asymptotic normal in thm} may not hold in general. Correspondingly, we have the following result.
% %\vspace{-2mm}
\begin{theorem}\label{thm: bias + normal}
	Under Assumptions \ref{ass: stationary}, \ref{ass: Q-function}-\ref{ass: allow bias}, for every $1 \leq l \leq L$, the following inequality holds:
	{\small
	\begin{align}\label{eqn: high probability bound}
		&\underset{nT \rightarrow \infty}{\liminf} \Pr\left(|\calV(\pi^\ast) -\hat \calV(\hat \pi_l) |\nonumber \right.\\
		&\left.\leq z_{\alpha / 2}\sqrt{O/nT(O-1)}\hat \sigma(l) + B(l)\right)\geq 1- \alpha.
	\end{align}
	}%
\end{theorem}
% %\vspace{-2mm}
Motivated by Lepski's principle \citep{lepski1997optimal} from nonparametric statistics and \citep{su2020adaptive} studying the model selection of OPE, we consider the following refined model-selection procedure to find the best policy. We first rank the $L$ candidate models in a non-increasing order based on the value of $\hat \sigma(l)$, i.e., for $1 \leq i < j \leq L$, $\hat \sigma(i) \geq \hat \sigma(j)$. Then for the $l$-th model, we construct an interval as
% $$
% I(l) = \left[\hat \calV(\hat \pi_l)-2z_{\alpha/(2L)}\sqrt{O/nT(O-1)}\hat \sigma(l),  \hat \calV(\hat \pi_l)+2z_{\alpha/(2L)}\sqrt{O/nT(O-1)}\hat \sigma(l)\right]. 
% $$
$
I(l) = [\hat \calV(\hat \pi_l)-2z_{\alpha/(2L)}\sqrt{O/nT(O-1)}\hat \sigma(l),  \hat \calV(\hat \pi_l)+2z_{\alpha/(2L)}\sqrt{O/nT(O-1)}\hat \sigma(l)]. 
$
Finally the optimal model/policy we choose is $\hat \pi_{\hat i}$ such that
$
\hat i = \max\{i : 1 \leq i \leq L, \cap_{1 \leq j \leq i}I(j) \neq \emptyset  \}.
$
 To show this procedure is valid, we need to make one additional assumption.
 %%\vspace{-2mm}
\begin{assumption}\label{ass: Monotonicity}
	There exists a $\zeta < 1$ such that for $1 \leq i \leq L-1$, $B(i) \leq B(i+1)$ and $\zeta \hat \sigma(i) \leq \hat \sigma(i+1)  \leq \hat \sigma(i)$ almost surely.
\end{assumption}
%\vspace{-2mm}
While this assumption is borrowed from \cite{su2020adaptive}, we consider model selection for policy learning instead of OPE in \cite{su2020adaptive}, which is substantially more challenging.  This assumption essentially states that, after sorting the models based on $\hat \sigma(l)$, the bias of the estimated policy is monotonically increasing and the standard deviation is monotonically decreasing but not too quickly. This is commonly seen when all candidate estimators exhibit some bias-variance trade-off phenomena. Define the following event
\begin{align*}
\calE &= \left\{ |\hat \calV(\hat \pi_{\hat i}) - \calV(\pi^\ast)| \leq 6(1+\zeta^{-1})\nonumber\right.\\
& \left.\times \min_{1 \leq i \leq L} \{B(i) +  z_{\alpha/(2L)}\sqrt{O/nT(O-1)} \hat \sigma(i)\}\right\}.
\end{align*}
Then we have the following theoretical guarantee for our refined procedure.
% %\vspace{-2mm}
\begin{corollary}\label{cor: refine selection}
	Under Assumptions \ref{ass: stationary}, \ref{ass: Q-function}-\ref{ass: sigma}, and \ref{ass: Monotonicity}, we have $ \underset{nT \rightarrow \infty}{\liminf} \Pr(\calE) \geq 1 -\alpha$. If we further assume that for any $\delta > 0$, with probability at least $1-\delta$, for every $1\leq i \leq L$, $|\calV(\hat \pi_i) - \hat \calV(\hat \pi_i)| \leq c(\delta)\log(L)\hat \sigma(i)/\sqrt{nT}$ for some constant $c(\delta)$, then  $\underset{nT \rightarrow \infty}{\liminf} \Pr(\overline{ \calE}) \geq 1 -\alpha - \delta$, where
	{\small
	\begin{align*}
&\overline \calE = \{|\calV(\hat \pi_{\hat i}) - \calV(\pi^\ast)| \leq 3(1+\zeta^{-1}) \times \\
& \min_{1 \leq i \leq L} \{B(i) +  (c(\delta)\log(L) +z_{\alpha/(2L)})\sqrt{O/nT(O-1)} \hat \sigma(i)\}\}.\nonumber
\end{align*}
}%
\end{corollary}
%\vspace{-2mm}
\begin{figure}[ht!]
%%\vspace{-0.2cm}
\begin{center}
   \includegraphics[width=0.97\linewidth]{latex/Figure/ICLR22-PMS-Figure1.pdf}
\end{center}
    %\vspace{-4mm}
%%\vspace{-0.6cm}
   \caption{DQN environments in our studies: (a) $\mathbf{E}_1$: \emph{FrozenLake-v0}; (b)  $\mathbf{E}_2$: \emph{Banana Collectors} (3D geometrical navigation task); (c)  $\mathbf{E}_3$: \emph{Pong-v0}; (d)  $\mathbf{E}_4$: \emph{Breakout-v0}; (e)  $\mathbf{E}_5$: \emph{Halfcheetah-v1}; (f)  $\mathbf{E}_6$: \emph{Walker2d-v1}. } 
\label{fig:figure:env}
%\vspace{-4mm}
\end{figure}

\begin{figure}[ht!]
    \centering
    \includegraphics[width=0.99\linewidth]{latex/Figure/pms_f3.pdf}
    %\vspace{-4mm}
    \caption{Box plots of model selection performance from offline learning in each DRL algorithm for $\mathbf{E}_2$. } 
\label{fig:imprv}
\end{figure}


\begin{figure*}[ht!]
    \centering
    \includegraphics[width=0.8\linewidth]{latex/Figure/epi.pdf}
        %\vspace{-4mm}
    \caption{Sensitivity analysis for different training data size. PMS attains the best performance and has the least sensitivity.} 
\label{fig:eps}
\end{figure*}
\begin{figure}[ht!]
 %\vspace{-4mm}
  \begin{center}
    \includegraphics[width=0.46\textwidth]{latex/Figure/pms_rfs.png}
  \end{center}
   %\vspace{-4mm}
  \caption{PMS and its refinements (R1/R2).}
  \label{fig:pms:re}
%   %\vspace{-5mm}
\end{figure}


The additional assumption (i.e., the high probability bound) in Corollary \ref{cor: refine selection} can be shown to hold by  the empirical process theory under some technical conditions \citep{van2000empirical}. Hence Corollary \ref{cor: refine selection} provides a strong guarantee that the regret of the final selected policy is bounded by the smallest error bound among all $L$ candidate policies. Note that Assumption \ref{ass: Q-function} imposed here could be strong. 

\textbf{Another refined approach:} Notice that the above refined approach indeed focuses on OPE estimates to select the best policy with a regret guarantee. The motivation behind it is to find a policy that has the smallest estimation error to the optimal one. However, such a procedure may not directly match the goal of maximizing the value function in OffRL. To relieve this issue, we can alternatively choose the final policy as $\hat \pi_{\hat{\hat i}}$ such that
$
\hat{\hat i} = \argmax_{1 \leq i \leq \hat i} \{\hat \calV(\hat \pi_i) - 2z_{\alpha / 2}\sqrt{O/nT(O-1)}\hat \sigma(i)\},
$
where the $\argmax$ is taken over $1$ to $\hat i$ models. This approach can be viewed as a combination of PMS and the above refined approach. By adopting this approach, candidate models with large biases are first removed by the truncation on $\hat i$. Then, we use the idea of PMS to select the model having the best worst-case performance among the remaining candidates. Unfortunately, we do not have a theoretical guarantee for this combined approach.




\section{Experimental Results}
\label{sec:exp:7}
We select six DQN environments ($\mathbf{E}_1$ to  $\mathbf{E}_6$)  from open-source benchmarks~\citep{brockman2016openai, juliani2018unity} to conduct numerical experiments, as shown in Fig.~\ref{fig:figure:env}. These tasks of deployed environments cover different domains that include tabular learning (Fig.~\ref{fig:figure:env}(a)); automatic navigation in a geometry environment with a physical ray-tracker (Fig.~\ref{fig:figure:env}(b)); Atari digital gaming (Fig.~\ref{fig:figure:env}(c) and (d)), and continuous control (Fig.~\ref{fig:figure:env}(e) and (f)). We provide detailed task descriptions and the targeted reward for each environment in the Appendix. We will also provide our reproducible code and implementations.



%\subsection{Model Selection of Neural Architectures \& Training Hyperparameters}

\textbf{Experiment setups.} To evaluate the performance of PMS with DQN models in offRL, 
%To investigate performance benefits of deployed PMS for model selection from DQN models for offline, 
we choose different neural network architectures under five competitive DRL algorithms including DQN by \citep{mnih2013playing,mnih2015human}, BCQ by \citep{fujimoto2019off}, BC by \citep{bain1995framework, ross2010efficient}, BRAC by \citep{wu2019behavior} from RLU benchmarks, and REM by \citep{agarwal2020optimistic}. 
Within each architecture, 70 candidate models are created by assigning  different hyperparameters and training setups. See Appendix for details.
We then conduct performance evaluation of different OffRL model selection methods on these generated candidate models. 
%After genraerated these DQN candidates,we %then use our proposed PMS for online DQN
%learning. 




\textbf{Evaluation procedure.} We utilize validation scores from OPE for each model selection algorithm, which picks the best (or a good) policy from the candidate set of size $L$ based on its own criterion. %associated with different model parameters such as different model parameters and architectures. 
Regret is used as the evaluation metric for each candidate. The regret for model $l$ is defined as 
$\calV\left(\pi_{l^\ast}\right)-\calV\left(\hat \pi_{l}\right)$, where $l^{*}=\arg \max_{l^{\prime}=1 \ldots L} \calV \left(\pi_{l^{\prime}}\right)$ corresponds to the candidate policy with the best OPE validation performance. In our implementation, we treat $\pi_{l^\ast}$ as $\pi^\ast$, the oracle but unknown best possible policy. A small regret is desirable after model selection. Note that the optimal regret is not zero since we can only use data to obtain $\hat \pi_{l}$ instead of $\pi_{l}$ for each model. We provide additional top-k regret and precision results in the Appendix.

\textbf{Performance comparison.}
As highlighted in Fig.~\ref{fig:1:intro} in the introduction, we report estimated OPE values by different model selection approaches, i.e., PMS and three methods by \citep{tang2021model}, versus the true OPE values. In this experiment, we consider 70 DQN models under the above mentioned five DRL algorithms, i.e., $14$ models are considered for each architecture. We use fewer models for each DRL algorithm mainly for clear presentation. By using the confidence interval constructed by our PMS procedure, our method is able to correctly select the top models, while the other three methods fail. To further investigate the performance of PMS, we implement model selection among $70$ models within each DRL algorithm separately.
Fig.~\ref{fig:imprv} shows the box plots of averaged regret over six environments after OPE per neural network architecture. Each subfigure contains results from one particular DRL algorithm with different  hyperparameters or training setups. The left box plot refers to the regrets of all $70$ models and the right one represents the regrets of top $10\%$ models selected by the proposed PMS method. Note that the right box plot is a subset of the left one.
%We first generate 70 variants for those mentioned above five different DQN architectures in each environment and report their averaged regret for six environments after OPE, as the box plots shown in Fig.~\ref{fig:imprv}. 
The results show that our proposed PMS successfully helps to select models with the best policies and improve the average regret by a significant margin. In particular, PMS-REM-based models attain the lowest regrets, due to the benefit from its ensemble process. Detailed results for each environment are given in the Appendix, where $\alpha=0.01$ and $O=20$ are used in all experiments. 




% We provide our quantitative evaluation metrics including: (1) Regret, defined as $\max _{k=1 \ldots K} v\left(\pi_{k}\right)-v\left(\pi_{k^{*}}\right)$ where $k^{*}=\underset{k^{\prime}=1 \ldots K}{\arg \max } \hat{v}\left(\pi_{k^{\prime}}\right)$ corresponds to the candidate policy with the best OPE validation score. This measures how far the identified ``best'' policy is from the actual best policy among the candidate set; (2) Spearman's rank correlation $\rho$ between OPE scores and ground-truth policy values.



\textbf{Sensitivity analysis.}
Fig.~\ref{fig:eps} compares different selection algorithms with varying training data size. PMS outperforms the others across all scales, and 
a larger number of episodes gives smaller variation and lower sensitivity. 

\textbf{PMS algorithm with refinements.} We replicate our experiments in the offline navigation task in $\mathbf{E}_2$ (\emph{Banana Collector}) 30 times and report regrets of top $10\%$ models selected by PMS and two refinements in Fig.~\ref{fig:pms:re}. As we can see, while the overall performances of the proposed three model selection methods are similar, the two refined approaches have better regrets than PMS in terms of median, demonstrating their potential in identifying the best model. OPE results have also been evaluated in DRL tasks with $\mathbf{E}_1$ and $\mathbf{E}_3$ to $\mathbf{E}_6$, where the refinement algorithms (PMS R1/R2) have only a small relative performance difference ($\pm 0.423\%$) compared to the original PMS setup.




% \begin{figure}[ht!]
% \begin{center}
%   \includegraphics[width=0.40\linewidth]{latex/Figure/index.png}
% \end{center}
% %\vspace{-0.4cm}
%   \caption{PMS selection from online Q-networks. } 
% \label{fig:imprv}
% %\vspace{-0.2cm}
% \end{figure}

% \begin{figure}[ht!]
% \begin{center}
%   \includegraphics[width=0.40\linewidth]{latex/Figure/epi.pdf}
% \end{center}
% %\vspace{-0.4cm}
%   \caption{Sensitivity analysis for different data size. } 
% \label{fig:eps}
% %\vspace{-0.2cm}
% \end{figure}

\section{Conclusion}
We propose a new theory-driven model selection framework (PMS) for offline deep reinforcement learning based on statistical inference. The proposed pessimistic mechanism warrants that the worst performance of the selected model is the best among all candidate models. Two refined approaches are further proposed to address the biases of DRL models.
Extensive experimental results on six DQN environments with varying network architectures and training hyperparameters demonstrate that our proposed PMS method consistently yields improved model selection performance over existing baselines. The results suggest the effectiveness of PMS as a powerful and provably effective tool toward automating model selection in offline DRL.

% References
\small
\bibliography{uai2023-template}
\clearpage


\end{document}
