%\documentclass{uai2022} % for initial submission
 \documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[sep]{a}{b} typesets "b sep a": the two mandatory arguments are emitted
% in reverse order with the optional separator (default "-") between them.
\newcommand{\swap}[3][-]{#3#1#2} % just an example
\usepackage{microtype}
\usepackage{graphicx}
\usepackage{graphics}
\usepackage{subfigure}
\usepackage{booktabs} % for professional tables
\usepackage{amsmath,amsthm,amssymb}
\usepackage{amsfonts}
\usepackage{dsfont}


% Theorem-like environments.
% `theorem` is numbered within sections ([section]); every other environment
% below is declared with [theorem] and therefore shares the theorem counter,
% so all statements are numbered in one consecutive sequence per section.
% No \theoremstyle is selected in this preamble, so all of them use whatever
% style is active at this point (presumably amsthm's default `plain` style --
% TODO confirm the class does not change it).
\newtheorem{theorem}{Theorem}[section]
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{claim}[theorem]{Claim}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{example}[theorem]{Example}
\newtheorem{remark}[theorem]{Remark}
\newtheorem{assumption}[theorem]{Assumption}
\newtheorem{question}[theorem]{Question}


% --- General math / colour helper macros ---------------------------------
% \norm{x}: x between auto-sized double vertical bars.
\newcommand\norm[1]{\left\lVert#1\right\rVert}
% \mypink{text}: text in the colour `mypink3`. The colour is declared with
% \definecolor further down in this preamble; it only has to exist by the time
% \mypink is first used. The argument is picked up by \textcolor itself.
\newcommand{\mypink}{\textcolor{mypink3}}
% Upright operator names wrapped in \mathop (so limits can be attached).
\newcommand{\argmin}{\mathop{\mathrm{argmin}}}
\newcommand{\argmax}{\mathop{\mathrm{argmax}}}
\newcommand{\minimize}{\mathop{\mathrm{minimize}}}
% \red{text} / \blue{text}: coloured text.
\newcommand{\red}[1]{\textcolor{red}{#1}}
\newcommand{\blue}[1]{\textcolor{blue}{#1}}
% \defeq: the "defined as" relation ":=", built as a single \mathrel.
\newcommand{\defeq}{\mathrel{\mathop:}=}
% \explaineq{text}{rel}: relation `rel` with the upright text `text` stacked
% above it; \mathclap (mathtools) gives the text zero width so it does not
% widen the relation.
\newcommand\explaineq[2]{\stackrel{\mathclap{\normalfont\mbox{#1}}}{#2}}
% \sumSA / \sumS: double sum over h = 0..infinity and over (s,a), respectively
% s, with the index restricted to s \neq g.
\newcommand{\sumSA}{\sum_{h=0}^{\infty}\sum_{\substack{s,a \\ s\neq g}}}
\newcommand{\sumS}{\sum_{h=0}^{\infty}\sum_{\substack{s \\ s\neq g}}}


% --- Notation shortcuts ---------------------------------------------------
% All shortcuts below are argument-less and are defined with \def, which
% performs no "already defined" check. In particular \P and \dim replace
% LaTeX's built-in \P (pilcrow sign) and \dim (math operator), and \th
% presumably shadows the kernel's thorn text command -- TODO confirm nothing
% in the document relies on the original meanings.

% Paper-specific symbols: hatted / tilded estimates, operators and policies.
\def\ini{\mathrm{init}}
\def\amdp{\{s,u^\star_t\}}
\def\hoV{\widehat{V}^\star}
\def\hP{\widehat{P}}
\def\tP{\widetilde{P}}
\def\tB{\widetilde{B}}
\def\hT{\widehat{\mathcal{T}}}
\def\hV{\widehat{V}}
\def\Vb{V^{\bar{\pi}}}
\def\tT{\widetilde{\mathcal{T}}}
\def\tin{\text{in}}
\def\PP{\textit{\textbf{P}}}
\def\hpi{\widehat{\pi}}
% Probability: blackboard-bold E and P, upright Cov and Var.
\def\E{\mathbb{E}}
\def\P{\mathbb{P}}
\def\Cov{\mathrm{Cov}}
\def\Var{\mathrm{Var}}
% \half expands to the fraction 1/2.
\def\half{\frac{1}{2}}
% Upright multi-letter names. These are plain \mathrm text, not \mathop
% operators, so no operator spacing is inserted around them.
\def\th{\mathrm{th}}
\def\tr{\mathrm{tr}}
\def\df{\mathrm{df}}
\def\dim{\mathrm{dim}}
\def\col{\mathrm{col}}
\def\row{\mathrm{row}}
\def\nul{\mathrm{null}}
\def\rank{\mathrm{rank}}
\def\nuli{\mathrm{nullity}}
\def\sign{\mathrm{sign}}
\def\supp{\mathrm{supp}}
\def\diag{\mathrm{diag}}
\def\aff{\mathrm{aff}}
\def\regret{\mathrm{Regret}}
% Hatted / tilded regression-style symbols.
\def\hy{\hat{y}}
\def\ty{\tilde{y}}
\def\hbeta{\hat{\beta}}
\def\tbeta{\tilde{\beta}}
\def\htheta{\hat{\theta}}
\def\halpha{\hat{\alpha}}
\def\hf{\hat{f}}
% These expand to the bare symbols 1, 2, \infty and 0 (presumably intended as
% norm subscripts -- the names suggest l1/l2/l-inf/l0).
\def\lone{1}
\def\ltwo{2}
\def\linf{\infty}
\def\lzero{0}
% \T expands to the superscript ^T, so it must directly follow its base.
\def\T{^T}
% Blackboard-bold R.
\def\R{\mathbb{R}}
% Calligraphic capitals: \cX -> \mathcal{X}.
\def\cA{\mathcal{A}}
\def\cB{\mathcal{B}}
\def\cD{\mathcal{D}}
\def\cE{\mathcal{E}}
\def\cF{\mathcal{F}}
\def\cG{\mathcal{G}}
\def\cH{\mathcal{H}}
\def\cM{\mathcal{M}}
\def\cN{\mathcal{N}}
\def\cP{\mathcal{P}}
\def\cR{\mathcal{R}}
\def\cS{\mathcal{S}}
\def\cT{\mathcal{T}}
\def\cW{\mathcal{W}}
\def\cX{\mathcal{X}}
\def\cY{\mathcal{Y}}
\def\cZ{\mathcal{Z}}
% Upright "TV".
\def\TV{\mathrm{TV}}
% --- Inline reviewer notes --------------------------------------------------
% \ming{text}: gray italic note tagged "[ming]"; always printed.
\newcommand{\ming}[1]{\textit{\textcolor{gray}{[ming]: #1}}} % Ming's notes
% \yw{text}: red italic note tagged "[yuxiang]"; printed only under \drafttrue.
% \ifdraft is not declared anywhere in this preamble, so the first use of \yw
% would stop with "Undefined control sequence". Declare the switch here
% (initially false, i.e. notes hidden) unless the class or a package already
% provides it. The \csname form is deliberate: a literal \ifdraft token inside
% a skipped branch would be counted by TeX as a nested conditional.
\expandafter\ifx\csname ifdraft\endcsname\relax
  \expandafter\newif\csname ifdraft\endcsname
\fi
\newcommand{\yw}[1]{\ifdraft\textit{\textcolor{red}{[yuxiang]: #1}}\fi} % YW's notes

\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors


\usepackage{amsmath,amsthm,amssymb,bbm}
\usepackage{mathtools}
\usepackage{cases}
\usepackage{dsfont}
\usepackage{microtype}
\usepackage{tablefootnote}

\allowdisplaybreaks

\usepackage{subfigure}
\usepackage{algorithm,algorithmic}
\usepackage{color}
\usepackage{appendix}
% \usepackage{lmodern}
% \usepackage[lining,semibold]{libertine}
% \usepackage[T1]{fontenc}
% \usepackage[libertine]{newtxmath}
% \usepackage{bm}
\usepackage{pdfpages}
\usepackage{bm}
\usepackage{subfigure}
\usepackage{algorithm,algorithmic}
\usepackage{color}
\usepackage{booktabs}       % professional-quality tables
\usepackage{appendix}
\usepackage{authblk}
\usepackage{comment}

%\usepackage[authoryear]{natbib}
\usepackage{hyperref}
% Inside PDF strings (e.g. bookmarks), make \Cref{label} expand to its bare
% argument instead of the full cross-reference.
% NOTE(review): cleveref is not loaded in the visible preamble, so this looks
% like a leftover -- confirm before removing.
\pdfstringdefDisableCommands{\def\Cref#1{#1}}

\usepackage{xcolor}
% Coloured link text (colorlinks) instead of boxes; internal links and
% citations both use the dark blue mix blue!50!black.
\hypersetup{
	colorlinks,
	linkcolor={blue!50!black},
	citecolor={blue!50!black},
}
% Named colour `linkequation` (plain blue); it is not referenced in the visible
% part of this file.
\colorlet{linkequation}{blue}


%\externaldocument[]{privacy_supp}[privacy_supp.pdf]
% Custom colours: `maroon` (RGB) and `mypink3` (CMYK; used by \mypink).
\definecolor{maroon}{RGB}{192,80,77}
\definecolor{mypink3}{cmyk}{0, 0.7808, 0.4429, 0.1412}
% \maroon{text}: text in the `maroon` colour.
\newcommand{\maroon}[1]{\textcolor{maroon}{#1}}
% \explain{expr}{note}: typesets `note` below `expr` with an upward arrow
% between them; \mathclap gives the annotation zero width so the surrounding
% formula is not stretched.
\newcommand{\explain}[2]{\underset{\mathclap{\overset{\uparrow}{#2}}}{#1}}
% \explainup{expr}{note}: mirror image -- `note` above `expr` with a downward
% arrow between them.
\newcommand{\explainup}[2]{\overset{\mathclap{\underset{\downarrow}{#2}}}{#1}}





\title{Offline Stochastic Shortest Path: Learning, Evaluation and Towards Optimality}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1,2]{Ming Yin\thanks{Equal contribution.}}
\author[3]{Wenjing Chen$^*$}
\author[4]{Mengdi Wang}
\author[1]{Yu-Xiang Wang}
%\author[1]{Further~Coauthor}
%\author[3]{Further~Coauthor}
%\author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
	Department of Computer Science\\
	UC Santa Barbara
}
\affil[2]{%
	Department of Statistics and Applied Probability\\
	UC Santa Barbara
}
\affil[3]{%
	 Department of Electrical and Computer Engineering\\
	Texas A\&M University
}

\affil[4]{%
	Department of Electrical and Computer Engineering\\
	Princeton University
}
  
  \begin{document}
  	

  	
\maketitle



\begin{abstract}
	Goal-oriented Reinforcement Learning, where the agent needs to reach the goal state while simultaneously minimizing the cost, has received significant attention in real-world applications. Its theoretical formulation, \emph{stochastic shortest path} (SSP), has been intensively researched in the online setting. Nevertheless, it remains understudied when such an online interaction is prohibited and only historical data is provided. In this paper, we consider the \emph{offline stochastic shortest path} problem when the state space and the action space are finite. We design the simple \emph{value iteration}-based algorithms for tackling both \emph{offline policy evaluation (OPE)} and \emph{offline policy learning} tasks. Notably, our analysis of these simple algorithms yields strong instance-dependent bounds which can imply worst-case bounds that are near-minimax optimal. We hope our study could help illuminate the fundamental statistical limits of the offline SSP problem and motivate further studies beyond the scope of current consideration.
	
\end{abstract}



\section{Introduction}\label{sec:introduction}

Goal-oriented reinforcement learning aims at entering a goal state while minimizing its expected cumulative cost. The interplay between the agent and the environment keeps continuing when the target/goal state is not reached and this causes trajectories to have variable lengths among different trials, which makes it different from (or arguably more challenging than) the finite-horizon RL. In particular, this setting naturally subsumes the \emph{infinite-horizon $\gamma$-discounted} case as one can make up a ``ghost'' goal state $g$ and set $1-\gamma$ probability to enter $g$ at each timestep for the latter. 

Goal-oriented RL covers many popular reinforcement learning tasks, such as navigation problems (e.g., Mujoco mazes), Atari games (\emph{e.g.} Breakout) and solving Rubik's cube \citep{akkaya2019solving} (also see Figure~\ref{fig:main} for more examples). Parallel to its empirical popularity, the theoretical formulation, \emph{stochastic shortest path} (SSP), has been studied from the control perspective (\emph{i.e.} with known transition) since \cite{bertsekas1991analysis}. Recently, there has been a surge of interest in studying SSP from the data-driven aspects (\emph{i.e.} with unknown transition) and the existing literature formulates SSP into the \emph{online reinforcement learning} framework \citep{tarbouriech2020no,rosenberg2020near,cohen2021minimax,chen2021finding,tarbouriech2021stochastic}. On the other hand, there exists no literature (to the best of our knowledge) that formally studies the \emph{offline} behavior of the stochastic shortest path problem.

\begin{figure}[H]
	\centering     %%% not \center
	\subfigure{\label{fig:different_n}\includegraphics[width=20mm]{1}}
	\subfigure{\label{fig:different_H}\includegraphics[width=20mm]{2}}
	\subfigure{\label{fig:example_3}\includegraphics[width=20mm]{3}}
	\caption{Examples of Goal-oriented RL tasks in OpenAI-Gym environment. The robot can be asked to move-fetch to a position, orient a block or play with a pen.}
	\label{fig:main}
\end{figure}

In this paper, we study the offline counterpart of the stochastic shortest path (SSP) problem. Unlike its online version, we have no access to further explore new strategies (policies) and the data provided are historical trajectories. The goal is to come up with a cost-minimizing policy that can enter the goal state (\emph{policy learning}) or to evaluate the performance of a target policy (\emph{policy evaluation}).

\paragraph{Why should we study offline SSP?} Online SSP provides a suitable learning framework for goal-oriented tasks with cheap experiments (\emph{e.g.} Atari games). However, real-world applications usually have high-stakes experiments, which makes online interactions infeasible. For instance, in the application of logistics transportation, goods need to be delivered to their destinations. How to minimize the transportation cost should be decided/learned beforehand using the logged data. In aircraft planning, changing flight routes instantaneously could be dangerous and designing routes based on historical records is more appropriate for optimizing the flying operation budget. In those scenarios, \emph{offline SSP} suffices for treating the practical challenges as it only learns from historical data.


\paragraph{Our contributions.} In this paper, we provide the first systematic study of the offline stochastic shortest path problem, and consider both \emph{offline policy evaluation} (OPE) and \emph{offline policy learning} tasks. As an initial attempt, we design simple \emph{value iteration}-based algorithms to tackle the problems and obtain strong statistical guarantees. Concretely, our contributions are fourfold.

\begin{itemize}
	\item For the offline policy evaluation task, we design VI-OPE algorithm (Algorithm~\ref{alg:VI_OPE}) under the coverage Assumption~\ref{assum:ope}. In particular, our algorithm is \emph{parameter-free} (requires no knowledge about $T^\pi$/$B^\pi$) and fully executed by the offline data. Theorem~\ref{thm:ope} provides the first statistical guarantee for offline SSP evaluation and nearly matches the statistical efficiency of its finite horizon counterpart (see discussion in Section~\ref{sec:discussion});
	
	\item For the offline learning task, we propose the \emph{pessimism}-based algorithm PVI-SSP (Algorithm~\ref{alg:OPO}) under Assumptions~\ref{assum:opl} and~\ref{assum:PC}. Our result (Theorem~\ref{thm:OPL}) has several merits: it is instance-dependent (as opposed to the worst-case guarantees in the existing online SSP works), enjoys faster $\widetilde{O}(1/n)$ convergence when the system is deterministic, and is also minimax-rate optimal. We believe Theorem~\ref{thm:OPL} is (in general) unimprovable for the current tabular setting.
	
	\item To understand the statistical limit of offline SSP, we prove the minimax lower bound $\Omega(B_\star\sqrt{\frac{SC^\star}{n}})$ (Theorem~\ref{thm:lower_main}) under the marginal coverage concentrability $\max_{s,a,s\neq g}\frac{d^{\pi^\star}(s,a)}{d^{\mu}(s,a)}\leq C^\star$. Our Theorem~\ref{thm:OPL} can match this rate (up to the logarithmic factor).
	
	\item Along the way for solving the problem, we highlight two new technical observations: Lemma~\ref{lem:T_pi} and Lemma~\ref{lem:HD}. The first one depicts the connection between the expected time $T^\pi$ and marginal coverage $d^\pi(s,a)$. As a result, we can express our result without using $T^\pi$ but the ratio-based quantity $\frac{d^\pi(s,a)}{d^\mu(s,a)}$, which matches the flavor of previous finite-horizon RL studies (also see Remark~\ref{remark:T_pi}). The second one is a general dependence improvement lemma that works with arbitrary policy $\pi$ and is the key for guaranteeing minimax optimal rate (also see Remark~\ref{remark:HD}). Both Lemmas are general and may be of independent interest.
	
\end{itemize}



\subsection{Related works.}

Stochastic shortest path itself is a broad topic and we are not aiming for the exhaustive review. Here we discuss two aspects that are most relevant to us.

\textbf{Online SSP.} The previous literature intensively focuses on the online aspect of SSP learning. Earlier works consider two types of problems: the online shortest path routing problem with deterministic dynamics, which can be solved using the combinatorial bandit technique (\emph{e.g.} \cite{gyorgy2007line,talebi2017stochastic}); or SSP with stochastic transitions but adversarial feedback \citep{neu2012adversarial,zimin2013online,rosenberg2019online,chen2021finding,chen2021minimax}. Recently, \cite{tarbouriech2020no} starts investigating the general online SSP learning problem and introduces the UC-SSP algorithm, which is the first to achieve the no-regret bound $\widetilde{O}(DS\sqrt{ADK})$.\footnote{Here the diameter of SSP is defined as $D:=\max_{s\in\mathcal{S}}\min_{\pi\in\Pi}T^\pi_s$ and by Lemma~2 of \cite{tarbouriech2020no} $B_\star:=\norm{V^\star}_\infty\leq c_{\max}D$. In this paper, we consider the dependence on $B_\star$ only since our $c_{\max}=1$ and this implies $B_\star\leq D$.} \cite{rosenberg2020near} improves this result to $\widetilde{O}(B_\star S\sqrt{AK})$ via a UCRL2-style algorithm with Bernstein-type bonus for exploration. Later, \cite{cohen2021minimax} eventually achieves the minimax rate $\widetilde{O}(B_\star\sqrt{SAK})$ by a reduction from SSP to finite-horizon MDP. However, the reduction technique requires the knowledge of $B_\star$ and $T_\star$. Most recently, \cite{tarbouriech2021stochastic} proposes EB-SSP which recovers the minimax rate but gets rid of the parameter knowledge (\emph{parameter-free}). When the parameters are known, their results can be \emph{horizon-free}.

Other than the general tabular SSP learning, there are also other threads, \emph{e.g.} Linear MDPs \citep{min2021learning,vial2021regret,chen2021improved} and posterior sampling \citep{jafarnia2021online}. Nevertheless, no analysis has been conducted for offline SSP yet.

\textbf{Offline tabular RL.} In the offline RL regime, there are fruitful results under different types of assumptions. \cite{yin2021near} first achieves the minimax rate $\widetilde{O}(\sqrt{H^3/nd_m})$ for non-stationary MDP with the strong uniform coverage assumption. \cite{ren2021nearly} improves the result to $\widetilde{O}(\sqrt{H^2/nd_m})$ for the stationary MDP setting. Later, \cite{rashidinejad2021bridging,xie2021policy,li2022settling} use the weaker single concentrability assumption and achieve the minimax rate $\widetilde{O}(\sqrt{H^3SC^\star/n})$ (or $\widetilde{O}(\sqrt{(1-\gamma)^{-3}SC^\star/n})$). Recently, this is further subsumed by the tighter instance-dependent result \citep{yin2021towards}. For the offline policy evaluation (OPE) task, statistical efficiency has been achieved in tabular \citep{yin2020asymptotically}, linear \citep{duan2020minimax} and differentiable function approximation settings \citep{zhang2022off}.




























\section{Problem setup }\label{sec:formulation}
\paragraph{Stochastic Shortest Path.} An SSP problem consists of a \emph{Markov decision process} (MDP) together with an initial state $s_{\mathrm{init}}$ and an extra goal state $g$ and it is denoted by the tuple $M:=\langle \mathcal{S},\mathcal{A},P,c,s_\ini,g\rangle$. In particular, we denote $\mathcal{S}':=\mathcal{S}\cup\{g\}$. Each state-action pair $(s,a)$ incurs a bounded random cost (within $[0,1]$) drawn i.i.d. from a distribution with expectation $c(s,a)$ and will transition to the next state $s'\in\mathcal{S}'$ according to the probability distribution $P(\cdot|s,a)$. Here $\sum_{s^{\prime} \in \mathcal{S}^{\prime}} P\left(s^{\prime} \mid s, a\right)=1$. The goal state $g$ is a termination state with absorbing property and has cost zero (\emph{i.e.} $P(g|g,a)=1,c(g,a)=0$ for all $a\in\mathcal{A}$). 

The optimal behavior of the agent is characterized by a stationary, deterministic and proper policy that minimizes the expected total cost of reaching the goal state from \emph{any} state $s$. A stationary policy $\pi:\mathcal{S}\rightarrow \Delta^\mathcal{A}$ is a mapping from state $s$ to a probability distribution over action space $\mathcal{A}$, where $\Delta^\mathcal{A}$ is the set of probability distributions over $\mathcal{A}$. Proper policies are defined as follows.

\begin{definition}[Proper policies]\label{def:proper}
	A policy $\pi$ is proper if playing $\pi$ reaches the goal state with probability $1$ when starting from any state. A policy is improper if it is not proper. Denote the set of proper policies as $\Pi_{\mathrm{prop}}$.
\end{definition}

\paragraph{Value and $Q$-functions in SSP.} Any policy $\pi$ induces a \emph{cost-to-go} value function $V^\pi:\mathcal{S}\mapsto [0,\infty]$ defined as 
\[
V^\pi(s):=\lim_{T\rightarrow\infty} \E^\pi\left[\sum_{t=0}^T c(s_t,a_t)|s_0=s\right],\;\;\forall s\in\mathcal{S}
\]
and the Q-function is defined as $\forall s,a\in\mathcal{S}\times\mathcal{A}$,
\[
Q^\pi(s,a):=\lim_{T\rightarrow\infty} \E^\pi\left[\sum_{t=0}^T c(s_t,a_t)|s_0=s,a_0=a\right],
\]
where the expectation is taken w.r.t. the random trajectory of states generated by executing $\pi$ and transitioning according to $P$. Also, we denote $T^\pi_s:=\lim_{T\rightarrow\infty}\E[\sum_{t=0}^T\mathbf{1}[s_t\neq g]|s_0=s]=\E[\sum_{t=0}^\infty\mathbf{1}[s_t\neq g]|s_0=s]$ to be the expected time that $\pi$ takes to enter $g$ starting from $s$. By Definition~\ref{def:proper}, $\pi$ is proper if $T^\pi_s<\infty$ for all $s$, and improper if $T^\pi_s=\infty$ for some state $s$. Moreover, by definition it follows $V^\pi(g)=Q^\pi(g,a)=0$ for all $\pi$ and action $a$. The next proposition is the Bellman equation for the SSP problem.

\begin{proposition}[Bellman equations for SSP problem \citep{bertsekas1991analysis}]\label{prop:Bellman}
	Suppose there exists at least one proper policy and that for every improper policy $\pi'$ there exists at least one state $s\in\mathcal{S}$ such that $V^{\pi'}(s)=+\infty$. Then the optimal policy $\pi^\star$ is stationary, deterministic, and proper. Moreover, $V^\star=V^{\pi^\star}$ is the unique solution of the equation $V^\star = \mathcal{L}V^\star$, where 
	\[
	\mathcal{L} V(s):=\min _{a \in \mathcal{A}}\left\{c(s, a)+P_{s, a} V\right\}\quad \forall V\in\R^{S'}.
	\]
	Similarly, for a proper policy $\pi$, $V^\pi$ is the unique solution of $V^\pi=\mathcal{L}^\pi V^\pi$ with $\mathcal{L}^\pi V(s):= \E_{a\sim\pi(\cdot|s)}[c(s, a)+P_{s, a} V],\;\forall V\in\R^{S'}$. Furthermore, it holds
	\begin{equation}\label{eqn:bellman}
	\begin{aligned}
	Q^{\star}(s, a)=c(s, a)+P_{s, a} V^{\star}, \; &V^{\star}(s)=\min_{a \in \mathcal{A}} Q^{\star}(s, a), \\
	Q^{\pi}(s, a)=c(s, a)+P_{s, a} V^{\pi}, \; &V^{\pi}(s)=\E_{a\sim\pi(\cdot|s)} [Q^{\pi}(s, a)].
	\end{aligned}
	\end{equation}
\end{proposition}

We use $T^\star_s$ to denote the expected arriving time when coupled with the optimal policy $\pi^\star$ and the proof of Proposition~\ref{prop:Bellman} can be found in Appendix~\ref{sec:gen_Bellman}.

\paragraph{The Offline SSP task.} The goal of offline SSP is to reach the goal state but also minimize the cost using offline data $\mathcal{D}:=\{(s^{(i)}_0,a^{(i)}_0,c^{(i)}_0,s^{(i)}_1,\ldots,s^{(i)}_{T_i})\}_{i=1,\ldots,n}$, which is collected by a proper (possibly stochastic) behavior policy $\mu$. The optimal policy is a proper policy $\pi^\star$ (the existence of $\pi^\star$ is guaranteed by the Proposition~\ref{prop:Bellman}) which minimizes the value function for all states, \emph{i.e.}, 
\begin{align}
\pi^\star(s)=\arg\min_{\pi\in\Pi_{\text{prop}}}V^{\pi}(s).
\end{align}
The final learning objective is to come up with a (proper) policy $\widehat{\pi}$ using $\mathcal{D}$ such that the suboptimality gap $V^{\widehat{\pi}}(s_{\text{init}})-V^\star(s_{\text{init}})<\epsilon$ for a given accuracy $\epsilon>0$.

\paragraph{Some Notations.} In the paper, we may abuse the notation $V^\star$ with $V^{\pi^\star}$, and define $B_\star:=\max_s\left\{V^\star(s)\right\}$. In addition, we denote $\xi_h^{\pi}(s,a)$ to be the marginal state-action occupancy at time step $h$ under the policy $\pi$ and $\xi_h^{\pi}(s)$ the marginal state occupancy at time $h$. Furthermore, we define the \emph{marginal coverage} $d^\pi$ as (given the initial state is $s_{\text{init}}$):
\begin{equation}\label{def:mar_cov}
d^\pi(s,a):=\sum_{h=0}^\infty \xi^\pi_h(s,a),\;\;\forall s,a\in\mathcal{S}\times\mathcal{A}.
\end{equation}
\begin{remark} 
	The notation of marginal coverage mirrors the marginal state-action occupancy in the infinite horizon $\gamma$-discounted setting but without normalization. Therefore, it is likely that $d^\pi(s,a)>1$ (or even $\infty$) for the offline SSP problem. Nevertheless, the key Lemma~\ref{lem:T_pi} guarantees $d^\pi(s,a)$ is finite when $\pi$ is a proper policy. This feature helps formalize the following assumptions in offline SSP.
\end{remark}





\subsection{Assumptions}
Offline learning/evaluation in SSP is impossible without assumptions. We now present three required assumptions.

\begin{assumption}[offline policy evaluation (OPE)]\label{assum:ope}
	We assume both the target policy $\pi$ and behavior policy $\mu$ are proper. In this case, we have $
	\Pi_{\text{prop}}\neq\emptyset$. Moreover, we assume behavior policy $\mu$ can cover the exploration (state-action) space of $\pi$, i.e. $\forall s,a\in\mathcal{S}\times\mathcal{A}$ s.t. $d^\pi_{\bar{s}}(s,a):=\sum_{h=0}^\infty \xi^\pi_{h,\bar{s}}(s,a)>0$, it implies $d^\mu_{\bar{s}}(s,a):=\sum_{h=0}^\infty \xi^\mu_{h,\bar{s}}(s,a)>0$, where $d^\pi_{\bar{s}}(s,a)$ is the marginal coverage and $\xi^\pi_{h,\bar{s}}(s,a)$ the marginal state-action occupancy given the initial state $\bar{s}$. In particular, when $\bar{s}=s_{\text{init}}$, we suppress the subscript and use $d^\pi,\xi^\pi_h$ only.
\end{assumption}

There are two remarks that are in order.

Assumption~\ref{assum:ope} requires that the behavior policy $\mu$ can explore all the state-action locations that are explored by $\pi$ and this mirrors the necessary OPE assumption made in the standard RL setting (\emph{e.g.} \cite{thomas2016data,yin2020asymptotically,uehara2020minimax}). Otherwise, policy evaluation for SSP would incur constant suboptimality gap even when \emph{infinitely many} trajectories are collected.

Moreover, instead of making an assumption only on $d^\mu_{s_\text{init}}$, Assumption~\ref{assum:ope} assumes $\mu$ can cover $\pi$ when starting from any state $\bar{s}$ (\emph{i.e.} $d^\pi_{\bar{s}}>0$ implies $d^\mu_{\bar{s}}>0$ for all $\bar{s}$). This extra requirement is mild since, by Definition~\ref{def:proper}, a proper policy can reach goal state $g$ with probability $1$ when starting from any state $\bar{s}$. Similarly, we need the assumptions for offline learning tasks.


\begin{assumption}[offline policy learning]\label{assum:opl}
	We assume there exists a deterministic proper policy and the behavior policy $\mu$ is (possibly random) proper. Next, by Proposition~\ref{prop:Bellman}, we know there exists a deterministic optimal proper policy $\pi^\star$. We assume behavior policy $\mu$ can cover the exploration (state-action) space of $\pi^\star$, i.e. $\forall s,a\in\mathcal{S}\times\mathcal{A}$ s.t. $d^{\pi^\star}_{\bar{s}}(s,a):=\sum_{h=0}^\infty \xi^{\pi^\star}_{h,\bar{s}}(s,a)>0$, it implies $d^\mu_{\bar{s}}(s,a):=\sum_{h=0}^\infty \xi^\mu_{h,\bar{s}}(s,a)>0$, where $d^{\pi^\star}_{\bar{s}}(s,a)$ and $\xi^{\pi^\star}_{h,\bar{s}}(s,a)$ are the same notions as used in Assumption~\ref{assum:ope}. In particular, when $\bar{s}=s_{\text{init}}$, we suppress the subscript and use $d^{\pi^\star},\xi^{\pi^\star}_h$ only.
\end{assumption}

Assumption~\ref{assum:opl} provides the offline learning version of Assumption~\ref{assum:ope}. It echoes its offline RL counterpart assumed in \cite{liu2019off,yin2021towards,uehara2021pessimistic}. Similar to the offline RL setting (\emph{e.g.} see \cite{yin2021towards} for detailed explanations), this assumption is also required for the tabular offline SSP problem.

\begin{assumption}[Positive cost \citep{rosenberg2020near}]\label{assum:PC}
	There exists $c_{\min}>0$ such that $c(s,a)\geq c_{\min}$ for every $(s,a)\in\mathcal{S}\times\mathcal{A}$.\footnote{Note this assumption only holds for $(s,a)\in\mathcal{S}\times\mathcal{A}$. For goal state $g$, it always has $c(g,a)=0$ for all $a\in\mathcal{A}$.}
\end{assumption}

This assumption guarantees there is no \emph{``free-cost''} state. With Assumption~\ref{assum:PC}, any policy that does not reach the goal state has infinite cost, and this certifies the condition in Proposition~\ref{prop:Bellman} that for every improper policy $\pi'$ there exists at least one state $s$ such that $V^{\pi'}(s)=+\infty$. When $c_{\min}$ is $0$, a simple workaround is to solve a perturbed SSP instance with all observed costs clipped to $\epsilon$ if they are below some $\epsilon>0$, and in this case $c_{\min}=\epsilon>0$. This will cause only an additive term of order $O(\epsilon)$ (see \cite{tarbouriech2020no} for online SSP). Therefore, as the first attempt for the offline SSP problem, we stick to this assumption throughout the paper. Last but not least, Assumption~\ref{assum:PC} is only used in the offline learning problem (Section~\ref{sec:OPL}) and our OPE analysis (Section~\ref{sec:discussion}) can work well with zero cost.














\section{Off-Policy Evaluation in SSP} \label{sec:discussion}
\begin{algorithm}
	\caption{VI-OPE (Value Iteration for OPE problem of Stochastic Shortest Path)}
	\label{alg:VI_OPE}
	\begin{algorithmic}[1]
		\STATE {\bfseries Input:} $\epsilon_{\text{OPE}}$, $\mathcal{D}:=\{(s^{(i)}_1,a^{(i)}_1,c^{(i)}_1,s^{(i)}_2,\ldots,s^{(i)}_{T_i})\}_{i=1}^n$.
		\FOR{$(s,a,s')\in \mathcal{S}\times\mathcal{A}\times\mathcal{S}'$}
		\STATE Set $n(s,a) =  \sum_{i=1}^n\sum_{j=1}^{T_i}\mathbb{I}(s_j^{(i)}=s\text{, }a_j^{(i)}=a)$.
		\IF{$n(s,a)>0$}
		\STATE Calculate $\widehat{c}(s,a)=\frac{\sum_{i=1}^n\sum_{j=1}^{T_i}\mathbb{I}(s_j^{(i)}=s\text{, }a_j^{(i)}=a)c^{(i)}_j}{n(s,a)}$
		\STATE  $\widehat{P}(s'|s,a)=\frac{\sum_{i=1}^n\sum_{j=1}^{T_i}\mathbb{I}(s_j^{(i)}=s\text{, }a_j^{(i)}=a\text{, }s_{j+1}^{(i)}=s')}{n(s,a)}$, 
		\ELSE
		\STATE $\widehat{c}(s,a)\leftarrow c_\text{min}$, $\widehat{P}(s'|s,a)\leftarrow \mathbb{I}(s'=g)$.
		\ENDIF
		\STATE \blue{$\diamond$ Perturb the estimated transition kernel} 
		\STATE $\widetilde{P}(s'|s,a)=\frac{n(s,a)}{n(s,a)+1}\widehat{P}(s'|s,a)+\frac{\mathbb{I}[s'=g]}{n(s,a)+1}$
		\ENDFOR
		\STATE \blue{$\diamond$ Value Iteration for SSP problem} 
		\STATE {\bfseries Initialize:} ${V}^{(-1)}(\cdot)\leftarrow -\infty$, ${V}^{(0)}(\cdot)\leftarrow\mathbf{0}$, $i=0$.
		\WHILE{$\norm{V^{(i)}-V^{(i-1)}}_\infty>\epsilon_{\text{OPE}}$}
		\FOR{$(s,a)\in \mathcal{S}\times\mathcal{A}$}
		\STATE $Q^{(i+1)}(s,a)=\widehat{c}(s,a)+\widetilde{P}_{s,a}V^{(i)}$
		\STATE  $V^{(i+1)}(s)=\langle \pi(\cdot|s), Q^{(i+1)}(s,\cdot)\rangle$
		\ENDFOR
		\STATE $i\leftarrow i+1$
		\ENDWHILE
		\STATE \textbf{Output}: $V^{(i)}(\cdot)\in\R^S$, $V^{(i)}(s_{\text{init}})$.
	\end{algorithmic}
\end{algorithm}

In this section, we assume that Assumption~\ref{assum:ope} holds and consider \emph{offline policy evaluation} (OPE) for the \emph{stochastic shortest path} (SSP) problem. Our algorithmic design follows the natural idea of \emph{approximate value iteration} \citep{munos2005error} and is named \textbf{VI-OPE} (Algorithm~\ref{alg:VI_OPE}). Specifically, VI-OPE approximates \eqref{eqn:bellman} by solving for the fixed point of the empirical Bellman equation associated with estimated cost $\widehat{c}$ and transition $\widetilde{P}$. One highlight is that we construct $\widetilde{P}$ to be the skewed version of the vanilla empirical estimation $\widehat{P}$ by injecting $\frac{1}{n(s,a)+1}$ probability to state $g$ (Line~11 of Algorithm~\ref{alg:VI_OPE}).\footnote{This treatment is also used in \cite{tarbouriech2021stochastic}.} By such a shift, the empirical Bellman operator $\widehat{\mathcal{T}}^{\pi}(\cdot):=\widehat{c}^\pi+\widetilde{P}^\pi(\cdot) $ becomes a contraction with rate $\rho:=\max_{\substack{s,a \\ s\neq g}}(\frac{n_{s,a}}{n_{s,a}+1})<1$ (see Lemma~\ref{lem:contraction} for details). Hence, the \emph{contraction mapping theorem} \citep{diaz1968fixed} guarantees the loop (Lines~15--21) will end after $O(\log(\epsilon_{\text{OPE}})/\log(\rho))$ iterations for any $\epsilon_{\text{OPE}}>0$. We have the following main result for VI-OPE, whose proof can be found in Appendix~\ref{sec:proof_ope}.


\begin{theorem}[Offline Policy Evaluation in SSP]
	\label{thm:ope}
	Denote $d_m:=\min\{\sum_{h=0}^\infty \xi^\mu_h(s,a):s.t. \sum_{h=0}^\infty \xi^\mu_h(s,a)>0\}$, and $T^\pi_s$ to be the expected time to hit $g$ when starting from $s$. Define $\bar{T}^\pi=\max_{\bar{s}\in\mathcal{S}}T^\pi_{\bar{s}}$ and the quantity ${T}_{\max}=\max_{i\in[n]} T_i$. Then when $n\geq \max\{\frac{49 S\iota}{9d_m}, 64(\bar{T}^\pi)^2\frac{S\iota}{d_m},O(\iota/d_m),O( {T}^2_{\max}\log(SA/\delta)/d_m^2)\}$, we have with probability $1-\delta$, the output of Algorithm~\ref{alg:VI_OPE} satisfies ($\iota=O(\log(SA/\delta))$)
	%\ming{specify $\iota$}
	\begin{align*}
	&|V^{(i)}(s_\mathrm{init})-V^\pi(s_\mathrm{init})|\\
	\leq &4\sum_{s,a,s\neq g} d^\pi(s,a)\sqrt{\frac{2\Var_{P_{s,a}}[V^\pi+c]\iota}{n\cdot d^\mu(s,a)}}
	+\widetilde{O}(\frac{1}{n})+\frac{\epsilon_{\mathrm{OPE}}}{1-\rho}.
	\end{align*}
	where the $\widetilde{O}$ absorbs Polylog term and higher order terms.
\end{theorem}

\textbf{On statistical efficiency.} First of all, when VI-OPE converges exactly (\emph{i.e.} $\epsilon_\text{OPE}=0$), the output $\widehat{V}^\pi:=\lim_{i\rightarrow\infty}V^{(i)}$ possesses no optimization error (\emph{i.e.} $\epsilon_{\text{OPE}}/(1-\rho)=0$) and the (non-squared) statistical rate achieved by VI-OPE is dominated by $O(\sum_{s,a} d^\pi(s,a)\sqrt{\frac{\Var_{P_{s,a}}[V^\pi+c]\iota}{n\cdot d^\mu(s,a)}})$. As a comparison, for the well-studied finite-horizon tabular MDP problem, the statistical limit $O(\sqrt{\sum_{h=1}^H\sum_{s,a} d^\pi_h(s,a)^2\frac{\Var_{P_h}[V^\pi_{h+1}+c]}{n\cdot d^\mu_h(s,a)}})$ has been achieved by \cite{yin2020asymptotically,duan2020minimax,kallus2020double} which matches the previously proven lower bound \citep{jiang2016doubly}. Therefore, it is natural to conjecture that the statistical lower bound for SSP-OPE problem has the rate $O(\sqrt{\sum_{s,a} d^\pi(s,a)^2\frac{\Var_{P_{s,a}}[V^\pi+c]}{n\cdot d^\mu(s,a)}})$. Our simple VI-OPE algorithm nearly matches this conjectured lower bound and only has the expectation outside of the square root. How to obtain the Cram\'er--Rao-style lower bound for SSP OPE problem and how to close the gap are beyond this initial attempt. We leave these as future work.

\textbf{Parameter-free.} Different from the standard MDPs (\emph{e.g.} finite-horizon, discounted), the SSP formulation generally has variable horizon length which yields no explicit bound on $\norm{V^\pi}_\infty$. Consequently, most of the previous literature that study SSP problem requires the knowledge of expected running time $T^\pi/T^\star$ or $B^\pi/B_\star$, the upper bound on $\norm{V^\pi}/\norm{V^\star}$ (\emph{e.g.} \cite{tarbouriech2020no,rosenberg2020near,cohen2021minimax,chen2021finding,chen2021improved}). In contrast, VI-OPE is fully parameter-free as it requires no prior information about either $T^\pi$ or $B^\pi$ and the main term of our bound does not explicitly scale with those parameters. Last but not least, VI-OPE does not rely on the positive cost Assumption~\ref{assum:PC}.






\section{Offline Learning in SSP} \label{sec:OPL}
In this section, we consider the offline policy optimization problem. Similar to previous work, we assume the knowledge of an upper bound on $B_\star:=\norm{V^\star}_\infty$, which is denoted as $\tB$. How to deal with the case when $\tB$ is unknown is discussed in Section~\ref{sec:knowladge}. Throughout the section, we suppose Assumption~\ref{assum:opl} and Assumption~\ref{assum:PC} hold.


We introduce our algorithm in Algorithm~\ref{alg:OPO}. The main idea behind the algorithm is the pessimistic update of the value function via adding a bonus function to $V^{(i)}$. Here the \textbf{bonus function} 
$b_{s,a}(V):=\sqrt{\frac{2\hat{c}(s,a)\iota}{n(s,a)}}+\frac{7\iota}{3n(s,a)}+\frac{\tB}{n(s,a)}+\frac{16\tB\iota}{3n(s,a)}+\max\{2\sqrt{\frac{\Var(\tP',V)\iota}{n(s,a)}},4\frac{\tB\iota}{n(s,a)}\}+180\sqrt{\frac{3\widetilde{T}\tB S}{2n(s,a)n_{\min}}}(\sqrt{\tB}+1)\iota$ $\forall(s,a)\in\mathcal{S}\times\mathcal{A}$, where $n_{\text{max}}=\max_{s,a}n(s,a)$ and $n_{\text{min}}=\min_{s,a}\{n(s,a):n(s,a)>0\}$. For the goal state $b_{g,a}(V)=0$ $\forall a\in\mathcal{A}$. Here $\widetilde{T}$ is an upper bound of $T^\star$.\footnote{We point out that the design of $b_{s,a}$ requires $\widetilde{T}$ in addition to $\widetilde{B}$. However, this is not essential as (by Assumption~\ref{assum:PC}) $\widetilde{T}$ can be bounded by $\widetilde{B}/c_\text{min}$.}

\begin{algorithm}[tbh]
	\caption{PVI-SSP (Pessimistic Value Iteration for SSP)}
	\label{alg:OPO}
	\begin{algorithmic}[1]
		\STATE {\bfseries Input:} $\epsilon_{\text{OPL}}$, $\mathcal{D}:=\{(s^{(i)}_1,a^{(i)}_1,c^{(i)}_1,s^{(i)}_2,\ldots,s^{(i)}_{T_i})\}_{i=1}^n$. $\tB$ and $\iota=O(\log(SA/\delta))$. $n_{\text{max}}$ and $b_{s,a}$ see above.
		\FOR{$(s,a,s')\in \mathcal{S}\times\mathcal{A}\times\mathcal{S}'$}
		\STATE Set $n(s,a) =  \sum_{i=1}^n\sum_{j=1}^{T_i}\mathbb{I}(s_j^{(i)}=s\text{, }a_j^{(i)}=a)$.
		\IF{$n(s,a)>0$}
		\STATE Calculate $\widehat{c}(s,a)=\frac{\sum_{i=1}^n\sum_{j=1}^{T_i}\mathbb{I}(s_j^{(i)}=s\text{, }a_j^{(i)}=a)c^{(i)}_j}{n(s,a)}$
		\STATE $\widehat{P}(s'|s,a)=\frac{\sum_{i=1}^n\sum_{j=1}^{T_i}\mathbb{I}(s_j^{(i)}=s\text{, }a_j^{(i)}=a\text{, }s_{j+1}^{(i)}=s')}{n(s,a)}$,  
		\ELSE
		\STATE $\widehat{c}(s,a)\leftarrow c_{\min}$, $\widehat{P}(s'|s,a)\leftarrow\mathbb{I}(s'=g)$.
		\ENDIF
		\STATE $\widetilde{P}'(s'|s,a)=\frac{n_{\text{max}}}{n_{\text{max}}+1}\widehat{P}(s'|s,a)+\frac{\mathbb{I}[s'=g]}{n_{\text{max}}+1}$
		\ENDFOR
		\STATE \blue{$\diamond$ Pessimistic Value Iteration for offline learning} 
		\STATE {\bfseries Initialize:} ${V}^{(-1)}(\cdot)\leftarrow \infty$, ${V}^{(0)}(\cdot)\leftarrow\tB\cdot\mathbf{1}$, $i=0$.
		\WHILE{$\norm{V^{(i)}-V^{(i-1)}}_\infty> 0 \blue{(\epsilon_{\text{OPL}})}$}
		\FOR{$(s,a)\in \mathcal{S}'\times\mathcal{A}$}
		\STATE $Q^{(i+1)}(s,a)=\min\{\widehat{c}(s,a)+\widetilde{P}'_{s,a}V^{(i)}+b_{s,a}(V^{(i)})\text{ , }\tB\}$
		\STATE  $V^{(i+1)}(s)=\min_a Q^{(i+1)}(s,a)$
		\ENDFOR
		\STATE $i\leftarrow i+1$
		\ENDWHILE
		\STATE Calculate $\bar{\pi}(\cdot)=\argmin_{a}Q^{(i)}(\cdot,a)$
		\STATE \textbf{Output}: $\bar{\pi}$, $\bar{V}(\cdot)=\min_{a}Q^{(i)}(\cdot,a)$
	\end{algorithmic}
\end{algorithm}



The use of value iteration to approximate the underlying Bellman optimality equation $V^{\star}(s)=\min_{a\in\mathcal{A}}\{c(s, a)+P_{s, a} V^{\star}\},\;\forall s\in\mathcal{S}'$ is natural when model components $P,c$ are accurately estimated by $\widetilde{P}',\widehat{c}$. Moreover, compared to VI-OPE, there are several differences for PVI-SSP. First, $\widetilde{P}'$ is chosen according to $n_\text{max}$ (instead of $n(s,a)$), which makes $\widetilde{P}'$ ``closer'' to $\widehat{P}$ but preserves the positive one-step transition to $g$. More importantly, a pessimistic bonus $b_{s,a}$ is added to the value update differently at each state-action location which measures the uncertainty learnt so far from the offline data. Actions with higher uncertainty are less likely to be chosen for the next update. Concretely, $\sqrt{\frac{\Var(\widetilde{P}',V^{(i)})}{n}}$ measures the uncertainty of $V^{(i)}$ and $\sqrt{\frac{\widehat{c}}{n}}$ measures the uncertainty of per-step cost $\widehat{c}$.\footnote{This is due to $\Var(c)\leq E[c^2]\leq E[c]$ for r.v. $c\in[0,1]$.} However, to guarantee proper pessimism, we require the knowledge of $\widetilde{B}$ in the design of $b_{s,a}$.

In addition, for analysis purpose we state our result under the regime where the iteration converges exactly and the output $\bar{V}$ (in Line 22) is a fixed point of the operator $\widetilde{\mathcal{T}}$ (see Appendix~\ref{sec:converge_OPO} for details). In practice, one can stop the iteration when the update difference is smaller than $\epsilon_{\text{OPL}}$. We have the following offline learning guarantee for $\bar{\pi}$, which is our major contribution. The proof is deferred to Appendix~\ref{sec:proof_OPO}.





\begin{theorem}[Offline policy learning in SSP]\label{thm:OPL}
	Denote $d_m:=\min\{\sum_{h=0}^\infty \xi^\mu_h(s,a):s.t. \sum_{h=0}^\infty \xi^\mu_h(s,a)>0\}$, and $T^\pi_s$ to be the expected time to hit $g$ when starting from $s$. Define $\bar{T}^\pi=\max_{\bar{s}\in\mathcal{S}}T^\pi_{\bar{s}}$. Then when $n\geq n_0$, we have with probability $1-\delta$, the output $\bar{\pi}$ of Algorithm~\ref{alg:OPO} is a proper policy and satisfies ($\iota=O(\log(SA/\delta))$)
	{\begin{align*}
		0\leq &V^{\bar{\pi}}(s_\mathrm{init})-V^\star(s_\mathrm{init})\\
		\leq&4\sum_{s,a,s\neq g} d^\star(s,a)\sqrt{\frac{2\Var_{P_{s,a}}[V^\star+c]\iota}{n\cdot d^\mu(s,a)}}+\widetilde{O}(\frac{1}{n}),
		\end{align*}
	}where the quantity $d_\text{max}=\max_{s,a}d^\mu(s,a)$, the quantity ${T}_{\max}=\max_{i\in[n]} T_i$ and we define $n_0:=\max\{\frac{4B_\star-2c_\text{min}}{c_\text{min}d_{\text{max}}}, \frac{26^2\times 2S\iota(\bar{T}^\star)^2(\sqrt{B_\star}+1)^2}{d_m}, \frac{10^6(\sqrt{\tB}+1)^4S\iota\bar{T}^\star\widetilde{T}}{B_\star(\sqrt{B_\star}+1)^2d_m}, $ $O( {T}_{\max}^2\log(SA/\delta)/d_m^2)\}$.
\end{theorem}




\textbf{On guarantee for policy.} Existing online SSP works measure the algorithm performance using \emph{regret} $R^{\text{SSP}}_{K}:=\sum_{k=1}^{K} \sum_{h=1}^{I^{k}} c_{h}^{k}-K \cdot \min_{\pi \in \Pi_{\text {proper }}} V^{\pi}\left(s_{\text{init}}\right)$ (\emph{e.g.} \citealp{tarbouriech2021stochastic}), which is different from the policy-based regret measurement $R_K:=\sum_{k=1}^{K} V_{1}^{\star}\left(x_{k, 1}\right)-V_{1}^{\pi_{k}}\left(x_{k, 1}\right)$ (\emph{e.g.} \cite{azar2017minimax}) in standard RL. The notion of $R^{\text{SSP}}_{K}$ provides the flexibility for policy update even within the episode (since it suffices to minimize $\sum_{h=1}^{I^{k}} c_{h}^{k}$), and is therefore unable to output a concrete stationary policy for the policy learning purpose. In contrast, Theorem~\ref{thm:OPL} provides a policy learning result via bounding the performance of output policy $\bar{\pi}$ explicitly.

\textbf{Instance-dependent bound.} Prior online SSP studies focus on deriving better worst-case regret (\emph{e.g.} the minimax rate is of order $\Theta(B_\star\sqrt{SAK})$) where the bounds are expressed by the parameters $B_\star/D,S,A$ that lack the characterization of individual instances. In offline SSP, the main term of PVI-SSP is fully expressed by the system quantities with marginal coverage $d^\star$ and $d^\mu$, conditional variance over transition $P$ and cost function $c$. This instance-adaptive result characterizes the hardness of learning better since the magnitude of the bounds changes with the instances. It fully avoids the explicit use of worst-case parameters $B_\star,S,A$.

\textbf{Faster convergence.} When the SSP system is deterministic for both cost $c$ and transition $P$, the conditional variances $\Var_{P_{s,a}}[V^\star+c]$ are always zero. In these scenarios, Theorem~\ref{thm:OPL} automatically guarantees faster convergence rate $\widetilde{O}(1/n)$ in deterministic SSP learning. Such a feature is not enjoyed by the existing worst-case studies in online SSP as their regrets are dominated by the statistical rate $\widetilde{O}(\sqrt{K})$ even for deterministic systems.


\textbf{On optimality.} While instance-dependent, it is still of great interest to understand whether this result is optimal. We provide the affirmative answer by showing a (nearly) matching minimax lower bound under the single concentrability condition in the next section.  



\section{SSP Minimax Lower Bound}\label{sec:lower}

In this section, we study the statistical limit of offline policy learning in SSP. Concretely, we consider the family of problems satisfying bounded partial coverage, \emph{i.e.} $\max_{s,a,s\neq g}\frac{d^{\pi^\star}(s,a)}{d^{\mu}(s,a)}\leq C^\star$, where $d^{\pi}(s,a)=\sum_{h=0}^\infty \xi^\pi_h(s,a)<\infty$ for all $s,a$ (excluding $g$) for any proper policy $\pi$. This $C^\star$ formally defines the maximum ratio between $\pi^\star$ and $\mu$ in Assumption~\ref{assum:opl}. Consequently, we have the following result (the full proof is in Appendix~\ref{sec:lower_proof}):

\begin{theorem}\label{thm:lower_main}
	We define the following family of SSPs:
	\[
	\mathrm{SSP}(C^\star)=\{(s_{\mathrm{init}},\mu,P,c)|\max_{s,a,s\neq g}\frac{d^{\pi^\star}(s,a)}{d^{\mu}(s,a)}\leq C^\star\},
	\]
	where $d^\pi(s,a)=\sum_{h=0}^\infty\xi^\pi_h(s,a)$. Then for any $C^\star\geq 1$, $\norm{V^\star}_\infty=B_\star>1$, it holds (for some universal constant $c$)
	\begin{align*}
	&\inf_{\widehat{\pi} \;\mathrm{proper}}\sup_{(s_{\mathrm{init}},\mu,P,c)\in\mathrm{SSP}(C^\star)}\E_{\mathcal{D}}[V^{\widehat{\pi}}(s_{\mathrm{init}})-V^\star(s_{\mathrm{init}})]\\
	&\geq c\cdot B_\star\sqrt{\frac{SC^\star}{n}}.
	\end{align*}
\end{theorem}

Theorem~\ref{thm:lower_main} reveals for the family with proper policy $\pi^\star$ and $\mu$ with bounded ratio $C^\star$, the minimax lower bound is $\Omega(B_\star\sqrt{\frac{SC^\star}{n}})$. In particular, the dominant term in Theorem~\ref{thm:OPL} directly implies this rate (recall $\pi^\star$ is deterministic by Assumption~\ref{assum:opl}) by the following calculation (assuming $B_\star>1$ just like Theorem~\ref{thm:lower_main}):
{
	\begin{equation}
	\begin{aligned}
	&\sum_{s,a,s\neq g} d^\star(s,a)\sqrt{\frac{\Var_{P_{s,a}}[V^\star+c]}{n\cdot d^\mu(s,a)}}\\
	=&\sum_{s,s\neq g} d^\star(s,\pi^\star(s))\sqrt{\frac{\Var_{P_{s,\pi^\star(s)}}[V^\star+c]}{n\cdot d^\mu(s,\pi^\star(s))}}\\
	\leq&\sqrt{\sum_{s,s\neq g}\frac{d^\star(s,\pi^\star(s))}{d^\mu(s,\pi^\star(s))}\cdot\sum_{s,s\neq g}\frac{d^\star(s,\pi^\star(s))\Var_{P_{s,\pi^\star(s)}}[V^\star+c]}{n}}\\
	\leq&\sqrt{\sum_{s,s\neq g}C^\star\cdot \frac{B_\star^2}{n}}=B_\star\sqrt{\frac{SC^\star}{n}} \;\;(\text{also see Proposition~\ref{prop:simplified}}),
	\end{aligned}
	\end{equation}
}where the first inequality uses CS inequality and the second one uses the key Lemma~\ref{lem:HD}.\footnote{Here since $B_\star>1$, when applying Lemma~\ref{lem:HD}, $B_\star$ will dominate $c\in[0,1]$.} This verifies PVI-SSP is near-optimal up to the logarithmic and higher order terms.










\section{Sketch of the analysis}\label{sec:pf_overview}
In this section, we sketch the proofs of our main theorems. In particular, we focus on describing the procedure of offline policy learning Theorem~\ref{thm:OPL}. First of all, when the condition $n\geq n_0$ holds, the output $\bar{\pi}$ is proper with high probability and following this one can conduct standard decomposition:
\[
V^{\bar{\pi}}-V^\star = (V^{\bar{\pi}}-\bar{V})+(\bar{V}-V^\star)
\]
where $V^\star$ is the solution of Bellman optimality operator $\mathcal{T}$ and $\bar{V}$ is the fixed point solution of the operator $\tT(V)(s)=\min_{a}\left\{\min\{\widehat{c}(s,a)+\widetilde{P}'_{s,a}V+b_{s,a}(V)\text{ , }\tB\}\right\}$ (Lemma~\ref{lem:contraction2}). Also, $V^{\bar{\pi}}$ satisfies general Bellman equation (Lemma~\ref{lem:general_Bellman}) therefore we first decompose $V^{\bar{\pi}}-\bar{V}$ using a \emph{simulation-lemma} style decomposition (Lemma~\ref{lem:Vpi-V}):
\begin{align*}
\Vb-\bar{V}=&\sumS\xi_{h}^{\bar{\pi}}(s)\bigg\{(P_{s,\bar{\pi}(s)}-\tP'_{s,\bar{\pi}(s)})\bar{V}\\
+&c(s,\bar{\pi}(s))-\hat{c}(s,\bar{\pi}(s))-b_{s,\bar{\pi}(s)}(\bar{V})\bigg\}
\end{align*}
By the careful design of $b_{s,a}(\cdot)$, the pessimism guarantees $V^{\bar{\pi}}-\bar{V}\leq 0$ (Lemma~\ref{lem:pessimism}). For $\bar{V}-V^\star$, a similar \emph{simulation-lemma} style SSP decomposition (Lemma~\ref{lem:bar_V-V*}) follows:
\begin{equation}\label{eqn:decomp_star}
\begin{aligned}
\bar{V}-V^{\star}\leq&\sumS \xi^\star_h(s)\bigg\{(\tP'_{s,\pi^\star(s)}-P_{s,\pi^\star(s)})\bar{V}\\
+&\widehat{c}(s,\pi^\star(s))-c(s,\pi^\star(s))+b_{s,\pi^\star(s)}(\bar{V})\bigg\}.
\end{aligned}
\end{equation}




Before we proceed to explain about how to bound the residual summations, we present two new lemmas, which help characterize the key features of stochastic shortest path problem.

\begin{lemma}[Informal version of Lemma~\ref{lem:propT}]\label{lem:T_pi}
	Let $T^{\pi}$ be the expected time of arrival to goal state $g$ when applying proper policy $\pi$ and starting from $s_\mathrm{init}$, then 
	\[
	T^{\pi}=\sum_{h=0}^{\infty}\sum_{\substack{s,a \\ s\neq g}}\xi_{h}^{\pi}(s,a)=\sum_{\substack{s,a \\ s\neq g}}d^{\pi}(s,a).
	\]
\end{lemma}

\begin{remark}\label{remark:T_pi}
	Lemma~\ref{lem:T_pi} explicitly reflects the connection between the expected arriving time $T^\pi$ and marginal coverage $d^\pi(s,a)$. Unlike the finite-horizon problem where $d^\pi_h$ are probability measures (\emph{e.g.} see \cite{yin2021towards}), for SSP $d^\pi(s,a)$ can be arbitrarily large (for a general policy $\pi$) due to Definition~\ref{def:mar_cov}. Lemma~\ref{lem:T_pi} guarantees $d^\pi(s,a)<\infty$ for proper policy $\pi$ since by Definition~\ref{def:proper} $T^\pi<\infty$, and, as a result, makes our bound in Theorem~\ref{thm:OPL} valid. Note that a similar result is of less interest in the standard finite-horizon episodic RL since it holds trivially that $H=\sum_{h=1}^H\sum_{s,a}d^\pi_h(s,a)$, whereas in SSP this becomes important as we have undetermined horizon length. With Lemma~\ref{lem:T_pi}, we can get away with estimating the aggregated measure $T^\pi/T^\star$ (like previous online SSP papers did) and use sub-component $d^\pi(s,a)/d^\star(s,a)$ to reflect the behaviors of individual state-action pairs and achieve more instance-dependent results. 
\end{remark}

\begin{lemma}[Informal version of Lemma~\ref{lem:bound_sum_var}]\label{lem:HD}
	For any probability transition matrix $P$, policy $\pi$, 
	and any cost function $c\in[0,1]$ associated with ${SSP}(P,\pi)$. Suppose $V\in\R^{S+1}$ is any value function satisfying order property (where $V(g)=0$), i.e., $V(s)\geq \sum_{a}\pi(a|s)P_{s,a}V$ for all $s\in\mathcal{S}$, then we have 
	{
		\[
		\sum_{h=0}^{\infty}\sum_{\substack{s,a \\ s\neq g}}\xi_h^{\pi}(s,a)\Var_{P_{s,a}}(V)\leq2\norm{V}_{\infty}\cdot V(s_\mathrm{init})\leq2\norm{V}_{\infty}^2.
		\]}
\end{lemma}
\begin{remark}\label{remark:HD}
	Lemma~\ref{lem:HD} can be viewed as a dependence improvement result for SSP problem since it guarantees Theorem~\ref{thm:OPL} to achieve the minimax rate via \eqref{eqn:minimax_derivation}. More critically, it widely applies to arbitrary policies assuming the ordering condition holds for $V$. For instance, a direct upper bound using Lemma~\ref{lem:T_pi} would yield $T^\pi\norm{V}^2_\infty$ and $T^\pi$ could be very large or even $\infty$. In contrast, Lemma~\ref{lem:HD} always upper bounds by $2\norm{V}^2_\infty$ without extra dependence. Similar result was previously derived in RL, \emph{e.g.} Lemma~3.4 of \cite{yin2020asymptotically} and also \cite{ren2021nearly}, but their result only applies to $V^\pi$ due to the analysis via law of total variances and ours applies to all $V$ (satisfying ordering condition) through only the telescoping sum.
\end{remark}

Now we go back to bounding \eqref{eqn:decomp_star}. First of all, by leveraging Lemma~\ref{lem:T_pi}, we are able to bound the $\infty$-norm of $\bar{V}-V^\star$ as (see Theorem~\ref{thm:crude_PO})
\begin{equation}\label{eqn:crude}
\norm{\bar{V}-V^{\star}}_\infty\leq30\sqrt{\frac{\bar{T}^\star B_\star\iota}{nd_m}}(\sqrt{B_\star}+1)
\end{equation}
which is a crude/suboptimal bound that serves as an intermediate step for the final bound. 

\textbf{What gives rise to instance-dependencies.} Next, we apply \emph{empirical Bernstein inequality} for structure $(\tP'_{s,\pi^\star(s)}-P_{s,\pi^\star(s)})\bar{V}$ and $\widehat{c}(s,\pi^\star(s))-c(s,\pi^\star(s))$ separately. In particular, since both $\bar{V}$ and $\tP'_{s,\pi^\star(s)}$ depend on data, Bernstein concentration cannot be directly applied. Informally, we can surpass this hurdle by decomposing
\[
(\tP'-P)\bar{V}=(\tP'-P)(\bar{V}-V^\star)+(\tP'-P){V}^\star.
\]
In this scenario, concentration can be readily applied to $(\tP'-P){V}^\star$ and crude bound \eqref{eqn:crude} is leveraged here for bounding $(\tP'-P)(\bar{V}-V^\star)\leq \norm{\tP'-P}_1\norm{\bar{V}-V^\star}_\infty$. As explained by \cite{zanette2019tighter}, the use of Bernstein concentration is the key for characterizing the structure of problem instance via the expression of conditional variance $\Var_{P_{s,a}}(V^\star)$.


\textbf{On the proof for VI-OPE.} At a high level, the proof for VI-OPE (Theorem~\ref{thm:ope}) shares the same flavor as that of Theorem~\ref{thm:OPL}. Ideally, in finite horizon setting the tighter analysis could be conducted by following the pipeline of Section~B.7 in \cite{duan2020minimax}, where the dominant error of $\widehat{V}^\pi-V^\pi$ (where $\widehat{V}^\pi=\lim_{i\rightarrow\infty} V^{(i)}$ in Algorithm~\ref{alg:VI_OPE}) can be decomposed as:
{
	\[
	\frac{1}{n}\sum_{i=1}^n\sum_{h=0}^\infty\frac{\xi^\pi_h(s^{(i)}_h,a^{(i)}_h)}{\xi^\mu_h(s^{(i)}_h,a^{(i)}_h)}\left(Q^\pi-(c+V^\pi)\right)(s^{(i)}_h,a^{(i)}_h)
	\]
}Applying Freedman's inequality for the above martingale structure, one can hope for a tighter rate $O(\sqrt{\sum_{s,a} d^\pi(s,a)^2\frac{\Var_{P_{s,a}}[V^\pi+c]}{n\cdot d^\mu(s,a)}})$. However, such a procedure runs into technical issues for the SSP problem since: (1) SSP has stationary transition $P$ and $n(s,a)$ is computed via collecting all the transitions that encounter $s,a$ for tighter dependence. This breaks the sequential ordering that is needed for martingale.\footnote{Note \cite{duan2020minimax} Corollary~1 considers time-inhomogeneous MDP and each $P_t$ can be estimated stage-wisely so the decomposition forms a martingale.} (2) Even if we have a martingale, the martingale difference will incorporate an infinite sum that could be arbitrarily large. Both facts indicate Freedman's inequality cannot be directly applied due to the technical hurdle.

Lastly, the lower bound proof uses a generalized Fano's argument (Lemma~\ref{lem:gen_Fano}), followed by reducing estimation problem to testing. The packing set of hard MDP instances is based on the modification of \cite{rashidinejad2021bridging} so that Gilbert-Varshamov Lemma~\ref{lem:GV} can be applied.



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Discussions }

\subsection{The knowledge of $B_\star/\widetilde{B}$}\label{sec:knowladge}

While VI-OPE (Algorithm~\ref{alg:VI_OPE}) is parameter-free, our policy learning algorithm PVI-SSP (Algorithm~\ref{alg:OPO}) requires $\widetilde{B}$ in the pessimistic bonus design. Since $\widetilde{B}$ is an upper bound of $B_\star$, one natural idea is to use VI-OPE to provide an upper bound estimation given that a proper policy is provided. This idea is summarized as below. 

\begin{proposition}[Alternative offline learning algorithm VI-OPE+PVI-SSP]
	Suppose we are provided with an arbitrary proper policy $\pi$ (\emph{e.g.} some previously deployed strategy). In this scenario, one can equally halve the data $\mathcal{D}$ into $\mathcal{D}_1$ and $\mathcal{D}_2$, and use $\mathcal{D}_1$ to evaluate $V^\pi$. The $\infty$-norm of VI-OPE output serves as surrogate for $\widetilde{B}$ and is used as an input for computing $b_{s,a}$. Next, use $\mathcal{D}_2$ to run PVI-SSP (with calculated $b_{s,a}$).
\end{proposition}

The above procedure will not deteriorate the theoretical guarantee since $\widetilde{B}$ is only used in $O(1/n)$ terms and the estimation error can only be higher order terms. This means we will end up with the same dominant term as Theorem~\ref{thm:OPL}.

\subsection{On higher order terms.}
In our analysis of Theorem~\ref{thm:OPL}, while the dominant $\widetilde{O}(\sqrt{1/n})$ term is near-optimal, the higher order term $\widetilde{O}(1/n)$ is not and depends on the parameters including $\widetilde{T}$, $\widetilde{B}$ and $d_m$ (\emph{e.g.} check the last line of \eqref{eqn:final_derivation}). In particular, if one can remove the polynomial dependence of $\widetilde{T}$, then the result is called \emph{horizon-free} \citep{tarbouriech2021stochastic}. One potential approach for addressing the higher order dependence could be the recent development of robust estimation in RL \citep{wagenmaker2021first}. As the initial attempt for offline SSP, this is beyond our scope and we leave it as the future work. 


\subsection{Future directions}

\textbf{SSP under weaker conditions.} Following previous works, we consider stochastic shortest path problem with a discrete action space $\mathcal{A}$ and non-negative cost bounded by $c\in[0,1]$. However, the convergence of SSP can hold under much weaker conditions. For instance, \cite{bertsekas2013stochastic} shows under \emph{compactness and continuity condition}, \emph{i.e.} for each state $s$ the admissible action set $\mathcal{A}(s)$ is a  compact metric space and a subset of $\mathcal{A}$ where (for all $s'$) transition $P(s'|s,\cdot)$ are continuous functions over $\mathcal{A}(s)$ and the cost function $c(s,\cdot)$ is lower semi-continuous over $\mathcal{A}(s)$, value iteration/policy iteration will still work under mild assumptions. This extends our setting (\emph{e.g.} cost can even be negative) and how to conduct SSP learning in this case remains open. 

\textbf{Extension to linear MDP case.} Another natural and promising generalization of the current study is the offline linear MDP for SSP problem. In the study of offline RL with linear MDPs, \cite{jin2021pessimism} shows the provable efficiency, \cite{zanette2021provable} improves the result in the \emph{linear Bellman complete} setting and \cite{yin2022near} leverages variance-reweighting for least square objective to obtain the near-optimal result. Adapting their results to the offline SSP problem is a promising direction.




\section{Conclusion}\label{sec:conclusion}

In this paper, we initiate the study of \emph{offline stochastic shortest path} problem. We consider both \emph{offline policy evaluation} (OPE) and \emph{offline policy learning} tasks and propose the simple value-iteration-based algorithms (VI-OPE and PVI-SSP) that yield strong theoretical guarantees for both evaluation and learning tasks. To complement the discussion, we also provide an information-theoretical lower bound and it certifies PVI-SSP is minimax rate optimal. We hope our work can draw further attention for studying offline SSP setting. 




\begin{acknowledgements} % will be removed in pdf for initial submission,
   Ming Yin and Yu-Xiang Wang are partially supported by NSF Awards \#2007117 and \#2003257. MY would like to thank Tongzheng Ren for helpful discussions.
\end{acknowledgements}
\bibliography{yin_665}


\begin{comment}
\appendix
\onecolumn
\begin{center}
{\LARGE \textbf{Appendix}}
%{\LARGE Appendix}
\end{center}
\input{sections/appendix}
\end{comment}

% NOTE: necessary when ptmx or no mathfont class option is given
\providecommand{\upGamma}{\Gamma}
\providecommand{\uppi}{\pi}







\end{document}
