


%%%%%%%% ICML 2021 EXAMPLE LATEX SUBMISSION FILE %%%%%%%%%%%%%%%%%

\documentclass[onecolumn]{article}
\usepackage{fullpage}

\usepackage{authblk}
% Recommended, but optional, packages for figures and better typesetting:
\usepackage{microtype}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{booktabs} % for professional tables
% Options must be handed to hyperref BEFORE it is loaded for the first time:
% once the package has been read, \PassOptionsToPackage no longer affects it.
\PassOptionsToPackage{unicode}{hyperref}
\PassOptionsToPackage{naturalnames}{hyperref}
\usepackage{hyperref}


\usepackage[round]{natbib}


%\renewcommand{\bibname}{References}
%\renewcommand{\bibsection}{\subsubsection*{\bibname}}




\bibliographystyle{apalike}




% hyperref makes hyperlinks in the resulting PDF.
% If your build breaks (sometimes temporarily if a hyperlink spans a page)
% please comment out the following usepackage line and replace
% \usepackage{icml2021} with \usepackage[nohyperref]{icml2021} above.





% Attempt to make hyperref and algorithmic work together better:
% \theHalgorithm is defined as the arabic value of the `algorithm` counter.
% NOTE(review): the only algorithm package loaded in this preamble is
% algorithm2e — confirm that an `algorithm` counter actually exists, otherwise
% this definition is never used.
\newcommand{\theHalgorithm}{\arabic{algorithm}}

% Use the following line for the initial blind version submitted for review:
% \usepackage{icml2023}

% If accepted, instead use the following line for the camera-ready submission:
%\usepackage[accepted]{icml2021}

% The \icmltitle you define below is probably too long as a header.
% Therefore, a short form for the running title is supplied here:
% \icmltitlerunning{Towards Minimax Optimality of Model-based Robust Reinforcement Learning}




%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\usepackage{tikz}
% \checkmark: a small filled tick mark, drawn as a closed TikZ path scaled by 0.4.
% NOTE(review): \def overwrites any existing \checkmark without warning —
% confirm that no loaded package is expected to provide this symbol.
\def\checkmark{\tikz\fill[scale=0.4](0,.35) -- (.25,0) -- (1,.7) -- (.25,.15) -- cycle;}



% comments 
% Inline review notes, typeset in colour inside square brackets:
%   \mfg{text} -> red   "[mfg: text]"
%   \mfh{text} -> green "[Pierre: text]"
\newcommand{\mfg}[1]{{\color{red}[mfg: #1]}}
\newcommand{\mfh}[1]{{\color{green}[Pierre: #1]}}

\usepackage{hyperref}







%\renewcommand{\bibname}{References}
%\renewcommand{\bibsection}{\subsubsection*{\bibname}}
%\bibliographystyle{apalike}
%\bibliographystyle{apalike}
%\usepackage{tikz}
%\def\checkmark{\tikz\fill[scale=0.4](0,.35) -- (.25,0) -- (1,.7) -- (.25,.15) -- cycle;}




\usepackage{amsmath}
\usepackage{graphicx}
%\usepackage{hyperref}
\usepackage{xcolor}    
%\usepackage{subcaption}
\usepackage{lipsum}
\usepackage{graphicx}
\usepackage{comment}
\usepackage{amsthm}

\PassOptionsToPackage{unicode}{hyperref}
\PassOptionsToPackage{naturalnames}{hyperref}

\usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors
\usepackage[linesnumbered,ruled,vlined]{algorithm2e}

%\usepackage{algorithm}


% If your paper is accepted, change the options for the package
% aistats2024 as follows:
%
%\usepackage[accepted]{aistats2024}
%
% This option will print headings for the title of your paper and
% headings for the authors names, plus a copyright note at the end of
% the first column of the first page.

% If you set papersize explicitly, activate the following three lines:
%\special{papersize = 8.5in, 11in}
%\setlength{\pdfpageheight}{11in}
%\setlength{\pdfpagewidth}{8.5in}

% If you use natbib package, activate the following three lines:
%\usepackage[round]{natbib}
%\renewcommand{\bibname}{References}
%\renewcommand{\bibsection}{\subsubsection*{\bibname}}

% If you use BibTeX in apalike style, activate the following line:
%\bibliographystyle{apalike}

\begin{document}
% Q-symbol shorthands: a plain or hatted Q carrying a policy superscript
% (\pi, \pi_k, \pi^*, \hat{\pi}) or a bare star; \Qsapi and \Qspi additionally
% carry an upright "sa" / "s" subscript.
% NOTE(review): presumably the hat marks quantities of the empirical model and
% sa/s the rectangularity type — confirm against the definitions in the body.
\newcommand{\Qpik}{Q^{\pi_k} }
\newcommand{\Qhatpik}{\hat{Q}^{\pi_k} }
\newcommand{\Qhats}{\hat{Q}^* }

\newcommand{\Qs}{Q^* }
\newcommand{\Qhatpistar}{\hat{Q}^{\pi^*}}
\newcommand{\Qhatpihat}{\hat{Q}^{\hat{\pi}}}
\newcommand{\Qpihat}{Q^{\hat{\pi}}}
\newcommand{\Qhatpi}{\hat{Q}^\pi}
\newcommand{\Qsapi}{Q_{\mathrm{sa}}^\pi}
\newcommand{\Qspi}{Q_{\mathrm{s}}^\pi}
\newcommand{\Qpi}{Q^\pi}

%

% \kappai{x}: typesets \kappa with the argument x set underneath it (\underset).
\newcommand{\kappai }[1]{{\underset{#1}{\kappa}}    }



% Theorem-like environments:
%   theorem    - numbered within each section;
%   corollary  - own counter, numbered within the current theorem;
%   lemma, assumption - share the theorem counter;
%   definition - own counter, numbered within each section.
\newtheorem{theorem}{Theorem}[section]
\newtheorem{corollary}{Corollary}[theorem]
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{assumption}[theorem]{Assumption}

\newtheorem{definition}{Definition}[section]


% V-symbol shorthands: a plain or hatted V carrying a policy superscript
% (\pi, \pi_k, \pi^*, \hat{\pi}) or a bare star; \Vsapi and \Vspi additionally
% carry an upright "sa" / "s" subscript.
% NOTE(review): \Vhatpik uses \widehat while the other hatted symbols use \hat —
% confirm whether the difference is intended.
\newcommand{\Vpik}{V^{\pi_k} }
\newcommand{\Vhatpik}{\widehat{V}^{\pi_k} }
\newcommand{\Vhats}{\hat{V}^* }
% \defn: the "is defined as" relation (:=). \coloneqq is provided by mathtools,
% which this preamble does not load, so a fallback is supplied here. If a package
% already defines \coloneqq, \providecommand leaves that definition untouched.
\providecommand{\coloneqq}{\mathrel{\mathop{:}}=}
\newcommand{\defn}{\coloneqq}

\newcommand{\Vs}{V^* }
\newcommand{\Vhatpistar}{\hat{V}^{\pi^*}}
\newcommand{\Vhatpihat}{\hat{V}^{\hat{\pi}}}
\newcommand{\Vpihat}{V^{\hat{\pi}}}
\newcommand{\Vhatpi}{\hat{V}^\pi}

\newcommand{\Vsapi}{V_{\mathrm{sa}}^\pi}
\newcommand{\Vspi}{V_{\mathrm{s}}^\pi}
\newcommand{\Vpi}{V^\pi}


% Policy symbols: \pi with a hat and/or a star.
\newcommand{\pihat}{\hat{\pi}}
\newcommand{\pistar}{ \pi^{*} }
\newcommand{\pihatstar}{\hat{\pi}^{*}}
% r symbols. The superscript is either a policy (\rpihat, \rpistar), a
% state-action pair (s,a) or a state s; the subscript, when present, names the
% Q symbol (and possibly the policy) the quantity is attached to.
% NOTE(review): the "sa"/"s" subscripts of Q are italic here but upright
% (\mathrm) in \Qsapi / \Qspi — confirm which form is intended.
\newcommand{\rpihat}{ r^{\hat{\pi}} }
\newcommand{\rpistar}{ r^{\pi^{*}} }
\newcommand{\rsa}{ r^{(s,a)}_{Q} }
\newcommand{\rs}{ r^{s}_{Q,\pi} }
\newcommand{\rsapi}{ r^{(s,a)}_{Q^\pi_{sa}} }
\newcommand{\rspi}{ r^{s}_{Q^\pi_{s}} }
\newcommand{\rspihat}{ r^{s}_{Q^{\hat{\pi}}_{s}} }
\newcommand{\rspistar}{ r^{s}_{Q^{\pi^*}_{s}} }


% Same r symbols with a hatted Q in the subscript, policy \pi.
\newcommand{\rsahatpi}{ r^{(s,a)}_{\hat{Q}^\pi_{sa}} }
\newcommand{\rshatpi}{ r^{s}_{\hat{Q}^\pi_{s}} }


% Same r symbols with a hatted Q in the subscript, policies \hat{\pi} and \pi^*.
\newcommand{\rshatpihat}{ r^{s}_{\hat{Q}^{\hat{\pi}}_{s}} }
\newcommand{\rshatpistar}{ r^{s}_{\hat{Q}^{\pi^*}_{s}} }

% P symbols: \Pzero is P_0 (the text calls P_0 the nominal model); the other
% macros attach a policy superscript to P_0, P or \hat{P}.
% NOTE(review): presumably \hat{P} is the kernel estimated from samples — confirm.
\newcommand{\Pzero}{P_0}
\newcommand{\Pzeropistar}{\Pzero^{\pi^*}}
\newcommand{\Pzeropihat}{\Pzero^{\hat{\pi}}}
\newcommand{\Ppistar}{P^{\pi^*}}
\newcommand{\Ppihat}{P^{\hat{\pi}}}
\newcommand{\Phatpihat}{\hat{P}^{\hat{\pi}}}
\newcommand{\Phatpihatstar}{\hat{P}^{\hat{\pi}^*}}

\newcommand{\Phatpi}{\hat{P}^{\pi}}
\newcommand{\Phatpistar}{\hat{P}^{\pi^*}}
% \Cbet: the symbol C with subscript \beta.
\newcommand{\Cbet}{C_\beta}


% Norm macros. Each takes one argument and wraps it in auto-sized double bars
% (\left\lVert ... \right\rVert); the macros differ only in the subscript.
%   \norm                      - no subscript
%   \normq, \normqbar          - subscript q (both expand to the same output)
%   \normpbar                  - subscript p
%   \normqbarpi, \normqpi      - subscript q,\pi (same output)
%   \normpbarpi                - subscript p,\pi
%   \normcq, \normc            - subscripts \tilde{q} and \tilde{\infty}
%   \norminf, \norminfpi       - subscripts \infty and \infty,\pi
%   \normqpis, \normqpik       - subscripts q,\pi^* and q,\pi_k
%   \normqpihat                - subscript q,\hat{\pi}^*
% NOTE(review): \normqpihat prints \hat{\pi}^* whereas \snormqpihat prints
% \hat{\pi} — confirm which is intended.
\newcommand{\norm}[1]{\left\lVert#1\right\rVert}
\newcommand{\normq}[1]{\left\lVert#1\right\rVert_q}
\newcommand{\normqbar}[1]{\left\lVert#1\right\rVert_{q}}
\newcommand{\normpbar}[1]{\left\lVert#1\right\rVert_{p}}
\newcommand{\normqbarpi}[1]{\left\lVert#1\right\rVert_{q,\pi}}
\newcommand{\normpbarpi}[1]{\left\lVert#1\right\rVert_{p,\pi}}
\newcommand{\normcq}[1]{\left\lVert#1\right\rVert_{\tilde{q}}}
\newcommand{\normc}[1]{\left\lVert#1\right\rVert_{\tilde{\infty}}}
\newcommand{\norminf}[1]{\left\lVert#1\right\rVert_\infty}
\newcommand{\norminfpi}[1]{\left\lVert#1\right\rVert_{\infty,\pi}}
\newcommand{\normqpi}[1]{\left\lVert#1\right\rVert_{q,\pi}}
\newcommand{\normqpis}[1]{\left\lVert#1\right\rVert_{q,\pi^*}}
\newcommand{\normqpik}[1]{\left\lVert#1\right\rVert_{q,\pi_k}}
\newcommand{\normqpihat}[1]{\left\lVert#1\right\rVert_{q,\hat{\pi}^*}}



% "qbar" policy-weighted norms. The barred index \bar{q} of the earlier
% definition (kept commented out below) was replaced by a plain q in every other
% "qbar" macro of this file; \normqbarpihat is aligned with them here so that no
% \bar{q} is left in the output.
%\newcommand{\normqbarpi}[1]{\left\lVert#1\right\rVert_{\bar{q},\pi}}
\newcommand{\normqbarpis}[1]{\left\lVert#1\right\rVert_{q,\pi^*}}
\newcommand{\normqbarpik}[1]{\left\lVert#1\right\rVert_{q,\pi_k}}
\newcommand{\normqbarpihat}[1]{\left\lVert#1\right\rVert_{q,\hat{\pi}^*}}

% "sp" macros: an upright "sp" applied to the argument, with the index
% (q, \infty, policy) as a subscript.
% NOTE(review): presumably "sp" is a span semi-norm — confirm against its
% definition in the body.
% The \pi_k variants use a subscript k, matching \normqpik, \Qpik and \Vpik.
% NOTE(review): \snorminf and \sdnorm place their subscript after the argument,
% unlike the other sp macros — confirm that this is intended.
\newcommand{\snormq}[1]{\mathrm{sp}_q(#1)}
\newcommand{\snormqbar}[1]{\mathrm{sp}_{q}(#1)}
\newcommand{\snorminf}[1]{\mathrm{sp}(#1)_\infty}
\newcommand{\snorminfpi}[1]{\mathrm{sp}_{\infty,\pi}(#1)}
\newcommand{\snormqpi}[1]{\mathrm{sp}_{q,\pi}(#1)}
\newcommand{\snormqpis}[1]{\mathrm{sp}_{q,\pi^*}(#1)}
\newcommand{\snormqpik}[1]{\mathrm{sp}_{q,\pi_k}(#1)}
\newcommand{\snormqpihat}[1]{\mathrm{sp}_{q,\hat{\pi}}(#1)}


% "qbar" aliases: same output as the corresponding \snormq... macros above.
\newcommand{\snormqbarpis}[1]{\mathrm{sp}_{q,\pi^*}(#1)}
\newcommand{\snormqbarpik}[1]{\mathrm{sp}_{q,\pi_k}(#1)}
\newcommand{\snormqbarpi}[1]{\mathrm{sp}_{q,\pi}(#1)}
\newcommand{\snormqbarpihat}[1]{\mathrm{sp}_{q,\hat{\pi}}(#1)}


% Index-free sp, and the starred variants of sp and of the double-bar norm.
\newcommand{\snorm}[1]{\mathrm{sp}(#1)}
\newcommand{\sdnorm}[1]{\mathrm{sp}(#1)_{*}}
\newcommand{\dnorm}[1]{\left\lVert#1\right\rVert_{*}}



% Sets and symbols
% \dimS: |S| raised to the power 1/q.
\newcommand{\dimS}{\vert S \vert^{\frac{1}{q}}}

% \horizon: the symbol H (the text sets H = (1-\gamma)^{-1}).
\newcommand{\horizon}{H}
% \Snorm, \Anorm: |S| and |A|, which the text uses for the cardinalities of S and A.
\newcommand{\Snorm}{\vert S \vert }
\newcommand{\Anorm}{\vert A \vert }
% \newcommand{\Snorm}{|S|}
% \newcommand{\Anorm}{|A|}



% If your paper is accepted and the title of your paper is very long,
% the style will print as headings an error message. Use the following
% command to supply a shorter title of your paper so that it can be
% used as headings.
%
%\runningtitle{I use this title instead because the last one was very long}

% If your paper is accepted and the number of authors is large, the
% style will print as headings an error message. Use the following
% command to supply a shorter version of the authors names so that
% they can be used as headings (for example, use only the surnames)
%
%\runningauthor{Surname 1, Surname 2, Surname 3, ...., Surname n}




%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\title{Towards Minimax Optimality \\ of Model-based Robust Reinforcement Learning}

\date{}

\author[1,2]{Pierre Clavier}
\author[1]{Erwan Le Pennec\footnote{Contributed equally.}}
\newcommand\CoAuthorMark{\footnotemark[\arabic{footnote}]} % get the current value
\author[3]{Matthieu Geist \protect\CoAuthorMark}
\affil[1]{Ecole polytechnique, CMAP}
\affil[2]{INRIA Paris, Inserm, HeKA team}
\affil[3]{Google DeepMind}







\maketitle


\begin{abstract}
%\end{abstract}
  We study the sample complexity of obtaining an $\epsilon$-optimal policy in \emph{Robust} discounted Markov Decision Processes (RMDPs), given only access to a generative model of the nominal kernel. 
  % This problem is widely studied in the non-robust case, and it is known that any planning approach applied to an empirical MDP estimated with $\tilde{\mathcal{O}}(\frac{H^3  |S||A|}{\epsilon^2})$ samples provides an $\epsilon$-optimal policy, which is minimax optimal. Results in the robust case are much more scarce. 
  This problem is widely studied in the non-robust case, but results are much more scarce in the robust case.
  For $sa$- (resp.\ $s$-) rectangular uncertainty sets, until recently the best-known sample complexity was $\tilde{\mathcal{O}}(\frac{H^4  |S|^2|A|}{\epsilon^2})$ (resp.\ $\tilde{\mathcal{O}}(\frac{H^4  | S |^2| A |^2}{\epsilon^2})$), %for specific algorithms and 
  when the uncertainty set is based on the total variation (TV), the KL or the Chi-square divergences. Here, %In this paper, 
  we consider uncertainty sets defined with an $L_p$-ball (recovering the TV case), and study the sample complexity of any planning algorithm (with high accuracy guarantee on the solution) applied to an empirical RMDP estimated using the generative model. In the general case, we prove a sample complexity of $\tilde{\mathcal{O}}(\frac{H^4  | S || A |}{\epsilon^2})$ for both the $sa$- and $s$-rectangular cases (improvements of $| S |$ and $| S || A |$ respectively). When the size of the uncertainty set is small enough, we improve the sample complexity to $\tilde{\mathcal{O}}(\frac{H^3 | S || A | }{\epsilon^2})$, matching for the first time the lower bound of the non-robust case, as well as a robust lower bound. Finally, we also introduce simple and efficient algorithms for solving the studied $L_p$-RMDPs.
  \looseness=-1
\end{abstract}





\section{Introduction}
Reinforcement learning (RL) \citep{sutton2018reinforcement}, often modelled as learning and decision-making in a Markov decision process (MDP), has attracted increasing interest in recent years due to its remarkable success in practice. A major goal of RL is to find a strategy or policy, based on a collection of data samples, that maximises the expected cumulative reward in an
MDP, without direct access to a detailed description of the underlying model. However, \citet{mannor2004bias} showed that the policy and the value function could sometimes be sensitive to estimation errors of the reward and transition probabilities, meaning that a  very small perturbation of the reward and transition probabilities could lead to  a significant change in the value function.
\looseness=-1

\vspace{0.3cm}

Robust MDPs  \citep{iyengar2005robust,nilim2005robust} (RMDPs) have been proposed to handle these problems by letting the transition probability vary in an uncertainty (or ambiguity) set. In this way, the solution of robust MDPs is less sensitive to model estimation errors with a properly chosen uncertainty set.
%RMDPs focuses on developing control strategies that are robust to parameter uncertainties arising from the discrepancy between the simulator model and the real world.%%
An RMDP problem is usually formulated as a max-min problem, where the objective is to find the policy that maximises the value function for the worst possible model that lies within an uncertainty set around a nominal model. Initially, RMDPs \citep{iyengar2005robust,nilim2005robust} were developed because the solution of MDPs can be very sensitive to the model parameters \citep{zhao2019investigating,packer2018assessing}. However, as solving robust MDPs is NP-hard for general uncertainty sets \citep{nilim2005robust}, the uncertainty set is usually assumed to be rectangular (meaning that it can be decomposed as a product of uncertainty sets for each state or state-action pair), which allows tractability \citep{iyengar2005robust,ho2021partial}. 
%Two assumptions of rectangularity exist, the $s-$ and $sa-$rectangularity assumptions on the uncertainty set. A fundamental difference between $sa$- and $s$-rectangular robust MDPs is that the greedy and optimal policy in $sa$-rectangular robust MDPs 
These two kinds of sets are called respectively $s$- and $sa$-rectangular sets. A fundamental difference between them is that the greedy and optimal policy in $sa$-rectangular robust MDPs 
 is deterministic, as in non-robust MDPs, but can be stochastic in the $s$-rectangular case \citep{wiesemann2013robust}.
 Compared to $sa$-rectangular robust MDPs, $s$-rectangular robust MDPs are less restrictive but much more difficult to handle. Under this rectangularity assumption, many structural properties of MDPs remain intact \citep{iyengar2005robust} and methods such as robust value iteration, robust modified policy iteration, or partial robust policy iteration \citep{ho2021partial}  can be used to solve them. It is also known that the uncertainty in the reward can be easily handled, while handling uncertainty in the transition kernel is much more difficult \citep{kumar2022efficient,derman2021twice}.
 \looseness=-1

 \vspace{0.3cm}

 In this work, we consider robust MDPs, with both $sa$- and $s$-rectangular uncertainty sets, consisting of $L_p$-balls centred around the nominal model $P_0$. We assume access to a generative model, which can sample a next state from any state-action pair from the nominal model. The question we address is to know how many samples are required to compute an $\epsilon$-optimal policy. This classic abstraction, which allows studying the sample complexity of planning over a long horizon, is widely studied in the non-robust setting \citep{singh1994upper, sidford2018near, gheshlaghi2013minimax, agarwal2020model, li2020breaking, kozuno2022kl}, but much less in the robust setting \citep{yang2021towards,panaganti2022sample,shi2022distributionally,xu2023improved,shi2023curious}. We consider more specifically model-based robust RL. We call the generative model the same number of times for each state-action pair, to build a maximum likelihood estimate of the nominal model, and use any planning algorithm for robust MDPs (with high accuracy guarantee on the solution) on this empirical model. This setting will be discussed further later, but we stress right away that it is especially meaningful in the robust setting, as an abstraction of sim-to-real. The research question we address is: \textit{How many samples are required for guaranteeing an $\epsilon$-optimal policy with high probability?}
 \looseness=-1

 \vspace{0.3cm}

 Our \textbf{first contribution} is to prove that for both $s$- and $sa$-rectangular sets based on $L_p$-balls, the sample complexity of the proposed approach is $\tilde{\mathcal{O}}(\frac{H^4  \Snorm\Anorm}{\epsilon^2})$, with $H=(1-\gamma)^{-1}$ being the horizon term. Previous works \citep{yang2021towards,panaganti2022sample,shi2022distributionally,xu2023improved} study different sets, based on the Kullback-Leibler (KL) divergence, Chi-square divergence, and total variation (TV). We have the TV in common ($L_1$-ball up to a normalizing factor), and, in this case, we improve these existing results by $\Snorm$ for the $sa$-rectangular case, and by $\Snorm\Anorm$ for the $s$-rectangular case, which is significant for large state-action spaces. On the technical side, our results build heavily upon the dual view of robust Bellman operators~\citep{derman2021twice,kumar2022efficient}.  However, we deviate from this line of work by enforcing the uncertainty set to belong to the simplex. This allows ensuring that the robust operators are not overly conservative while ensuring they are $\gamma$-contractions, which is important for the theoretical analysis. On the negative side, the algorithms they introduce are no longer applicable, which calls for new algorithmic design.
\looseness=-1

\vspace{0.3cm}
 
 Our \textbf{second contribution} is to show that, if the uncertainty set is small enough, then we have a sample complexity of $\tilde{\mathcal{O}}(\frac{H^3  \Snorm\Anorm}{\epsilon^2})$. This is a further improvement by $H$ of the previous bound, and it matches the known lower bound for the non-robust case \citep{gheshlaghi2013minimax}. On the technical side, it again builds upon the dual  view of robust Bellman operators with the deviation mentioned above. %\citep{derman2021twice,kumar2022efficient}. 
 In addition to that, it adapts two proof techniques of the non-robust case: The total variance technique of \citet{gheshlaghi2013minimax} to reduce the dependency to the horizon, and the \emph{absorbing MDP} construction of \citet{agarwal2020model} to allow for a wider range of valid $\epsilon$.

\vspace{0.3cm}

As mentioned earlier, the algorithms of \citet{derman2021twice,kumar2022efficient} are not applicable to the more realistic uncertainty sets we consider. Our \textbf{third contribution} is an algorithm, $\mathtt{DRVI}~\mathtt{L_P}$ (Distributionally Robust Value Iteration for $L_p$-balls; see Alg.~\ref{alg:cvi-dro-infinite} and~\ref{alg:cvi-dro-infinites}), for the $sa$- and $s$-rectangular cases. It solves RMDPs exactly when the robust transitions are constrained to be valid, i.e., to belong to the simplex, contrary to the algorithms of \citet{kumar2022efficient}.
 %\vspace{-0.3cm}
% Several previous papers have studied robust MDPs from a regularisation perspective \cite{derman2021twice,husain2021regularized,derman2020distributional}. Recently, \cite{kumar2022efficient} has established a concrete equivalence between s-rectangular robust MDPs and policy-regularised MDPs. Finally, efficient algorithms for different  uncertainty sets  have been proposed \cite{xu2010distributionally, mannor2016robust,wiesemann2013robust,clavier2022robust} However, these works are empirical in nature and do not provide any theoretical guarantees for the learned policies. In particular, there are few works that provide robust RL algorithms with provable (non-asymptotic) finite-sample performance guarantees

% In this paper, we address the problem of developing a
% robust RL algorithm using model-based generative framework having finite sample guarantees on its performance. To do so, we characterise the sample complexity in a PAC (probably approximately correct) sense. Here, the RMDP framework assumes that the real-world model lies within
% an uncertainty set around a called nominal (simulator) model. The goal of the algorithm here is to learn a policy $\pi$ that is the best under the worst possible model in this uncertainty set. We do not assume that the algorithm knows the exact simulator model (and hence the exact uncertainty set) like previous work on classical MDPs or RL setting.  Indeed we only assume that the algorithm has access to
% a generative sampling model that can generate next state samples for all state-action pairs according to the nominal simulator model in the model-based setting.  Roughly speaking, there are at least two common algorithmic approaches: a model-based approach and a model-free approach. In the model-based approach, the tasks of model estimation and policy learning are decoupled. More specifically, one first estimates the unknown model using the available data samples and then uses the fitted model to carry out the planning. We will be looking at sample complexity in this framework.
% The natural question taht arises is : how many samples
% from the nominal simulator model are needed to learn
% an $\epsilon
% $-optimal robust policy with  probability $1-\delta$?
% Sample complexity have been studied in many works in the non-robust setting such that \cite{singh1994upper},\cite{sidford2018near},\cite{gheshlaghi2013minimax}\cite{agarwal2020model} \cite{li2020breaking} or \cite{kozuno2022kl}. From the work of \cite{gheshlaghi2013minimax}, we know that the sample complexity of a generative model in model based setting for classical MDPS is minimax optimal with a bound of $\tilde{\mathcal{O}}(\frac{H^3\Snorm\Anorm}{\epsilon^2})$. However, the range of $\epsilon$ where the bound is true was depending on the size of the uncertainty set. The work of \cite{agarwal2020model} \cite{li2020breaking} considerably improve the range of $\epsilon$ where this bound is true first to $ \epsilon \in [0, H^{1/2} ) $  and finally to $\epsilon \in [0, H)$. In our work, we will match this upper bound under some condition RMDPs and reconcile the bounds in robust and non-robust case.
% The sample complexities described in the previous papers do not encourage the use of robust algorithms because of their very high sample complexity, even in the case of a generative model which is usally smaller than other context. The issue at stake is :
% \textit{Does the Robust MDPs problem with (s) and (sa)
% rectangular uncertainty set have the same sample complexity as classical MDP ?}

% \textbf{Our contribution :}

% In this paper, the following contributions are made:
% \begin{itemize}
%     \item New algorithms linking regularisation with value functions and robustness \cite{kumar2022efficient} are analysed. We show here the interest of these algorithms not only on a practical level but also as a theoretical tool that allows us to analyse the RMDPs.
%     \item It is shown that for the case of $L_p$-balls, the complexity of s and sa rectangular is the same, which was not the case in previous work. \cite{yang2021towards}. We make a first bound for the $L_p$ distance, and we considerably improve the bounds for $TV$ which corresponds to $L1$ by gaining a factor $\Snorm$ or $\Snorm\Anorm$ depending on (sa) or (s) rectangularity assumption.
% \item Finally, we show that for a set of uncertainty on the transitions that is not too large (see Thm. \ref{toward}, we can obtain a sample complexity for the generative model that is again improved by a factor $H$ compared to our previous bound. The latest upper bound is minimax optimal in the non-robust case as our upper bound match the lower bound of MDPs in terms of $\Snorm,\Anorm, \H$ and $ \epsilon$. This is the first time to our knowledge that an RMDPs algorithm is known to have the sample complexity has classical MDPs. Moreover, our bounds is independent to the size of uncertainty set  which is not always the case in other works such as \cite{yang2021towards, panaganti2022sample}.
% \end{itemize}
\section{Related Work}
The question of sample complexity when having access to a generative model has been widely studied in the non-robust setting \citep{singh1994upper, sidford2018near, gheshlaghi2013minimax, agarwal2020model, li2020breaking, kozuno2022kl}. Notably, \citet{gheshlaghi2013minimax} provide a lower-bound of this sample complexity, $\tilde{\Omega}
(\frac{\Snorm\Anorm H^3}{\epsilon^2})$, and show that (tabular) model-based RL reaches this lower-bound, making it minimax optimal (up to polylog factors). This bound relies on the so-called total variance technique, that we adapt to the robust setting. However, their result is only true for small enough $\epsilon$, in the range $(0,\sqrt{H/\Snorm})$. This was later improved to $(0,\sqrt{H})$ by \citet{agarwal2020model}, thanks to a novel \emph{absorbing MDP} construction, that we also adapt to the robust setting.

Closer to our contributions are the works that study the sample complexity in the \emph{robust} setting \citep{yang2021towards,panaganti2022sample,xu2023improved,shi2022distributionally}. The sample complexity of specific algorithms (respectively empirical robust value iteration and Robust Phased Value Learning) is studied by \citet{panaganti2022sample} and \citet{xu2023improved}, while our results apply to any planning oracle (applied to the empirical model), as long as it provides a solution with enough accuracy. We consider both $s$- and $sa$-rectangular uncertainty sets, as \citet{yang2021towards}, while \citet{panaganti2022sample,xu2023improved,shi2022distributionally} only consider the simpler $sa$-rectangular sets. They all study either TV, KL or Chi-square balls, while we study $L_p$-balls. \citet{shi2022distributionally} improved the KL bound compared to \citet{yang2021towards,panaganti2022sample} in the $sa$-rectangular case. The framework of \citet{xu2023improved} is slightly different, as they consider a finite horizon, which adds a factor $H$ to all bounds. None of these previous results is minimax optimal in terms of the horizon factor.

% Our results build upon the equivalent formulation of robust MDPs with twice regularized MDPs \citep{derman2021twice}, extending classic regularized MDPs \citep{geist2019theory} to both policy and value-regularization (corresponding respectively to reward and transition uncertainty sets).
% We rely more specifically on a simple scalar optimisation dual expression of the minimisation problem over models. As such, we do not cover the KL and Chi-square cases, which do not have such a simple form even if they can also be written as simple scalar optimisation problem. However, we have in common  with \cite{yang2021towards,panaganti2022sample} the total variation case, which corresponds to a (scaled) $L_1$-ball. For this case, we can compare our sample complexities. Without assumption on the size of the uncertainty set, we improve the existing sample complexities by $\Snorm$ and $\Snorm\Anorm$ respectively (for $sa$- or $s$-rectangularity). Also, our bounds have no dependency on the size of the uncertainty set. Notice that as we consider a generic oracle planning algorithm, our bounds apply to the algorithms of \cite{panaganti2022sample,xu2023improved}. If we further assume that the uncertainty set is small enough, then we improve the bound by an additional $H$ factor, reaching the minimax sample complexity of the non-robust case. Table \ref{tableau1} summarizes the difference in sample complexity, and we'll discuss them again after stating our theorems.

\vspace{0.3cm}

    We rely more specifically on a reformulation of the minimisation problem over models as a much simpler scalar dual optimisation problem. Note that even if the KL and Chi-square cases can also be written as a simple scalar optimisation problem \citep{panaganti2022sample}, our proof cannot be adapted directly to this setting. However, we have in common with \citet{yang2021towards,panaganti2022sample} the total variation case, which corresponds to a (scaled) $L_1$-ball. For this case, we can compare our sample complexities. Without assumption on the size of the uncertainty set, we improve the existing sample complexities by $\Snorm$ and $\Snorm\Anorm$ respectively (for $sa$- or $s$-rectangularity). Also, our bounds have no dependency on the size of the uncertainty set. Notice that as we consider a generic oracle planning algorithm, our bounds apply to the algorithms of \citet{panaganti2022sample,xu2023improved}. If we further assume that the uncertainty set is small enough, then we improve the bound by an additional $H$ factor, reaching the minimax sample complexity of the non-robust case. Table~\ref{tableau1} summarizes the differences in sample complexity, and we will discuss them again after stating our theorems.
 \looseness=-1

\vspace{0.3cm}

 Finally, the archival version of this contribution predates the concurrent work of \citet{shi2023curious}, which studies the sample complexity of RMDPs for the TV and $\chi^2$ divergences. In the very specific case of $sa$-rectangular TV uncertainty sets, where TV coincides with the $L_1$ norm, \citet{shi2023curious} retrieve our upper bound, which is minimax optimal in the regime where the radius of the uncertainty set is small, and improve our result in the regime where the radius of the uncertainty set is bigger than $1-\gamma$. However, our results hold more generally for the $s$-rectangular case, and are still state-of-the-art for the $s$-rectangular case with $p\geq1$ and for the $sa$-rectangular case with $p>1$. 
Notice also that the proof techniques are very different, and it is an interesting research direction to know if their bound for the 
% regime where the radius of the uncertainty set is bigger than $1-\gamma$ 
large radius regime
or their lower-bound would extend to the more general case studied here.\looseness=-1

\vspace{0.3cm}
% \begin{itemize}
%     \item  \textbf{Sample Complexity of Robust MDPs. } Sample complexity of  RMDPs have been studied recently by \cite{panaganti2022sample} and \cite{yang2021towards} which are the two more close work. In  \cite{panaganti2022sample}, only sa-rectangular is discussed in comparison  to \cite{yang2021towards} and our paper. On both papers, an analysis is conducted for $TV$, $KL$ and $\chi^2$ distances/divergences, which have similar sample complexity in terms of $\Snorm, \Anorm $and $H$. In our paper we only use the $L_p$ distance for any $p$ and have in common the $TV$ which is in fact $L_1$ for positive measure. We are not treating   $KL$ and $\chi^2$ has they have no closed form solution of the minimisation problem. However, for this particular distance we obtain very  tight bounds compare to others papers. Indeed, for $sa$ case we improve the bound of \cite{panaganti2022sample} from a factor $\Snorm$ and from a factor $\Snorm\Anorm$ for s-rectangular, which is considerable in large MDPs. Moreover, we obtain similar rate for $S$ and $sa$ rectangular which was not the case in \cite{yang2021towards}. One can go even further by also gaining a horizon factor $H$ in both sa and s rectangular case when the uncertainty set is not too large.  Our analysis obtain better results than others works mainly because we use regularised form of MDPS which are equivalent to robust RMPDs in certain cases \cite{derman2021twice}, which is not the case in other papers.  A table summarising the differences in sample complexity is available in table \ref{tableau}.  Finally, \cite{neufeld2022robust} is interested in Wasserstein based uncertainty sets. To our knowledge, there is no other proof matching our sample complexity.     
%  \item \textbf{Regularised MDPs} : \cite{geist2019theory} are becoming more and more successful due to their link between robustness and achieve very good results on real environments.  Here the algorithm studied is a regularisation not only in terms of policy  but also in Q function. The regularized forms of the RMDPs are the key to our analysis here.
% \end{itemize}

\begin{table*} \small
\caption{Sample complexity for TV uncertainty sets in the $sa$- and $s$-rectangular cases, with $\beta$ (see Def.~\ref{def:beta}) the radius of the uncertainty set (see also Tab.~\ref{tableau2} in the appendix for a complete table with different norms).
 \label{tableau1}}
\begin{tabular}
{ |p{0.9cm}|p{3.0cm}|p{2.8cm}|p{1.9cm}|p{3.3cm}|p{2.4cm}|   }
%\hline
% \multicolumn{5}{|c|}{\textbf{Sample Complexity of RMDPs with a generative model}} \\
\hline
& \cite{panaganti2022sample}  &\cite{yang2021towards} & Our   $\beta \geq 0$ & Our $ 1/(2H\gamma) >\beta>0$ & \cite{shi2023curious} 
\\
\hline
$sa$-rect. & $\tilde{\mathcal{O}}\left(\frac{\Snorm^2\Anorm  \horizon^4}{\epsilon^2}\right)$ &$\tilde{\mathcal{O}}\left(\frac{\Snorm^2\Anorm  \horizon^4(2+\beta)^2}{\epsilon^2\beta^2}\right)$&$\tilde{\mathcal{O}}\left(\frac{\Snorm\Anorm  \horizon^4}{\epsilon^2}\right)$ &$\tilde{\mathcal{O}}\left(\frac{\Snorm\Anorm \horizon^3}{\epsilon^2}\right)$& $\tilde{\mathcal{O}}\left(\frac{\Snorm\Anorm  \horizon^2}{\epsilon^2 \min(1/H, \beta)}\right)$ \\
\hline
$s$-rect. & $\times$&$\tilde{\mathcal{O}}\left(\frac{\Snorm^2\Anorm^2  \horizon^4(2+\beta)^2}{\epsilon^2\beta^2}\right)$&$\tilde{\mathcal{O}}\left(\frac{\Snorm\Anorm  \horizon^4}{\epsilon^2}\right)$  &$\tilde{\mathcal{O}}\left(\frac{\Snorm\Anorm  \horizon^3}{\epsilon^2}\right)$& $\times$   \\
\hline
\end{tabular}
%\vspace{-10pt}
\end{table*}
\section{Preliminaries}
%\subsection{Notations}
For finite sets $S$ and $A$, we write respectively $\Snorm$ and $\Anorm$ their cardinality. We write $\Delta_{A}:=\{p: A\rightarrow \mathbb{R} \mid p(a) \geq 0, \sum_{a \in A} p(a)=1\}$ the simplex over $A$. For $v\in\mathbb{R}^S$ the classic $L_q$ norm is $\normq{v}^q= \sum_s |v(s)|^q$. For the conditional distribution $\pi\in\Delta_A^S$, we define the $\pi$-weighted $L_q$ norm $ \normqpi{u}^q:=\sum_s| \sum_a \pi(a|s) u(s,a)|^q$, with $u\in\mathbb{R}^{S\times A}$. %Moreover, we define the normalised norm and normalised $\pi$-weighted norm  as $\normqbar{v}^q=\frac 1 \Snorm\sum_s v(s)^q$  and $ \normqbarpi{u}^q=  \frac 1 \Snorm \sum_s| \sum_a \pi(a|s) u(s,a)\vert^q$. 
Finally, we denote by $\tilde{\mathcal{O}}$ the $\mathcal{O} $ notation up to logarithmic factors. \looseness=-1

% For $S, A,$ the space and action set, $ \Snorm $ and respectively $\Anorm$ denotes its cardinality, 
% , $\Delta_{A}:=\left\{\pi: A\rightarrow \mathbb{R} \mid \pi(a\vert s) \geq 0, \forall s \sum_{a \in A}, \pi(a\vert s)=1\right\}$ is the probability simplex over $A$. We define also $\pi$-weighted $L_q$ norm $ \normqpi{u}^q:=\sum_s| \sum_a \pi(a|s) u(s,a)|^q$ and the classical $L_q$ norm  $\normq{v}^q= \sum_s v(s)^q$ for $\pi \in \Delta_{A}$. Moreover we define the normalised norm and normalised $\pi$-weighted norm for $v \in \mathbb{R}^{\Snorm}$ and $q \in \mathbb{R}^{\Snorm\Anorm}$ as $\normqbar{v}^q=\sum_s v(s)^q/\Snorm$  and $ \normqbarpi{u}=  \sum_s| \sum_a \pi(a|s) u(s,a)\vert^q/\Snorm$ .  Finnaly  $\mathbb{I}$ denotes all ones vector. Finally we write $\langle u, v\rangle_A:=\sum_{a \in A} u(a) v(a)$.
% We will see later in next section why we normalised the classic  $L_p$ by the dimention of the considered state space.
\subsection{ Markov Decision Process}
A Markov Decision Process (MDP) is defined by $M=(\mathcal{S}, \mathcal{A}, P, R, \gamma, \mu)$ where $\mathcal{S}$ and $\mathcal{A}$ are the finite state and action spaces, $P: \mathcal{S} \times \mathcal{A} \rightarrow \Delta_{\mathcal{S}}$ is the transition kernel, $R: \mathcal{S} \times \mathcal{A} \rightarrow [0,1]$ is the reward function, $\mu \in \Delta_{\mathcal{S}}$ is the initial distribution over states and $\gamma \in[0,1)$ is the discount factor.  %We denote by $H=1/(1-\gamma)$ the horizon factor useful in the following analysis.  In MDPs framework, we assume $S$ and $A$ are finite sets with cardinalities $\Snorm$ and $\Anorm$, respectively. Moreover, w
% We assume that the immediate reward $R(s, a)$ is taken from the interval $[0,1] $.  
A stationary policy $\pi: \mathcal{S} \rightarrow \Delta_{\mathcal{A}}$ maps states to probability distributions over actions. 
%And $\pi(a \mid s), P\left(s^{\prime} \mid s, a\right), R(s, a)$ denotes the probability of selecting action $a$ in state $s$, transition probability to state $s^{\prime}$ in state $s$ under action $a$, and reward in state s under. 
We write $P_{s, a}$ the vector $P(\cdot | s, a)$. We also define $P^\pi$ to be the transition matrix on state-action pairs induced by a policy $\pi$: 
$
P_{(s, a),(s^{\prime}, a^{\prime})}^\pi=P(s^{\prime} | s, a) \pi(a'| s') 
$.
Slightly abusing notations, for $V \in \mathbb{R}^{S}$, we define the vector $\operatorname{Var}_P(V) \in \mathbb{R}^{\mathcal{S} \times A}$ as 
$
\operatorname{Var}_P(V)(s, a):=\operatorname{Var}_{P(\cdot \mid s, a)}(V)$,  so that $\operatorname{Var}_P(V)=P(V)^2-(P V)^2$ (with the square understood component-wise).
%In this case, the framework of the MDPs is used and the following assumptions are made in the rest of the paper.
Usually, the goal is to estimate the value function  defined as:
$V_{P, R}^\pi(s):=\mathbb{E}\left[\sum_{n=0}^{\infty} \gamma^n R\left(s_n, a_n\right) \mid s_0=s,\pi ,P\right]. $
The value function $V_{P, R}^\pi$ of policy $\pi$ is the fixed point of the Bellman operator $\mathcal{T}_{P, R}^\pi$, defined as
$
\mathcal{T}_{P, R}^\pi V(s)=\sum_a \pi(a | s)[R(s, a)+\gamma \sum_{s^{\prime}} P\left(s^{\prime} | s, a\right) V\left(s^{\prime}\right)]$. 
We also define the optimal Bellman operator: $\mathcal{T}_{P, R}^* V(s)=\max _{\pi_s \in \Delta_{\mathcal{A}}} \left(\mathcal{T}_{P, R}^{\pi_s} V\right)(s).$
Both optimal and classical Bellman operators are $\gamma$-contractions~\citep{sutton2018reinforcement}. This is why the sequences $\left\{V_n^\pi \mid n \geq 0\right\}$ and $\left\{V_n^* \mid n \geq 0\right\}$, defined as
$
V_{n+1}^\pi:=\mathcal{T}_{P, R}^\pi V_n^\pi \text{ and } V_{n+1}^*:=\mathcal{T}_{P, R}^* V_n^*, 
$ converge linearly to $V_{P, R}^\pi$ and $V_{P, R}^*$, respectively the value function following $\pi$ and the optimal value function. 
Finally, we can  define the Q-function, 
$
Q_{P, R}^\pi(s,a):=\mathbb{E}\left[\sum_{n=0}^{\infty} \gamma^n R\left(s_n, a_n\right) \mid s_0=s, a_0=a,\pi ,P\right]. $
The value function and Q-function are linked by the relation $V_{P, R}^\pi(s)=\langle\pi_s,Q_{P, R}^\pi(s)\rangle_A$.  With these notations, we can write the Q-function for transition kernel $P$ following policy $\pi$ as 
$$Q_{P, R}^\pi=R+\gamma P V_{P, R}^\pi=R+\gamma P^\pi Q_{P, R}^\pi
=\left(I-\gamma P^\pi\right)^{-1} R.$$
%or equivalently  $$Q_{P, R}^\pi=\left(I-\gamma P^\pi\right)^{-1} R.$$
\subsection{Robust Markov Decision Process}
Once classical MDPs are defined, we can define the robust (optimal) Bellman operators $\mathcal{T}^\pi_{\mathcal{U}}$ and  $\mathcal{T}_{\mathcal{U}}^*$,
% $$\mathcal{T}^\pi_{\mathcal{U}}(s):=\min _{R, P \in \mathcal{U}}\left(\mathcal{T}_{P, R}^\pi V\right)(s)\quad 
% $$  
% %\vspace{-15\lineskip}
% $$\left(\mathcal{T}_{\mathcal{U}}^* V\right)(s):=\max _{\pi_s \in \Delta_{\mathcal{A}}} \min _{R, P \in \mathcal{U}}\left(\mathcal{T}_{P, R}^{\pi_s} V\right)(s),$$
\begin{align*}
    \left(\mathcal{T}^\pi_{\mathcal{U}} V\right)(s)&:=\min _{R, P \in \mathcal{U}}\left(\mathcal{T}_{P, R}^\pi V\right)(s),
    \\
    \left(\mathcal{T}_{\mathcal{U}}^* V\right)(s)&:=\max _{\pi_s \in \Delta_{\mathcal{A}}} \min _{R, P \in \mathcal{U}}\left(\mathcal{T}_{P, R}^{\pi_s} V\right)(s),
\end{align*}
%\vspace{-0.0cm}
where $P$ and $R$ belong  to the uncertainty set $\mathcal{U}$. The optimal robust Bellman operator $\mathcal{T}_{\mathcal{U}}^*$ and robust Bellman operator $\mathcal{T}^\pi_{\mathcal{U}}$ are $\gamma$-contraction maps for any policy $\pi$ \citep[Thm.~3.2]{iyengar2005robust} if  the uncertainty set $\mathcal{U}$ is a subset of $\Delta_s$ so that the transition kernel is valid.
%(see Thm-3.2  of \cite{iyengar2005robust}, that is :
% $$
% \begin{aligned}
% &\left\|\mathcal{T}_{\mathcal{U}}^* v-\mathcal{T}_{\mathcal{U}}^* u\right\|_{\infty} \leq \gamma\|u-v\|_{\infty}, \quad \quad  \\
% &\left\|\mathcal{T}_{\mathcal{U}}^\pi v-\mathcal{T}_{\mathcal{U}}^\pi u\right\|_{\infty} \leq \gamma\|u-v\|_{\infty}, \quad \forall \pi.
% \end{aligned}
% $$
%
Finally, for any initial values $V_0^\pi, V_0^*$, sequences defined as
$
V_{n+1}^\pi:=\mathcal{T}_{\mathcal{U}}^\pi V_n^\pi$ and $V_{n+1}^*:=\mathcal{T}_{\mathcal{U}}^* V_n^*
$
converge linearly to their respective fixed points, that is $V_n^\pi \rightarrow V_{\mathcal{U}}^\pi$ and $V_n^* \rightarrow V_{\mathcal{U}}^*$. This makes robust value iteration an attractive method for solving robust MDPs. In order to obtain tractable forms of RMDPs, one has to make assumptions about the uncertainty sets and give them a rectangularity structure \citep{iyengar2005robust}. In the following, we will use an $L_p$ norm as the distance between distributions.  The $s$- and $sa$-rectangular assumptions can be defined as follows, with $R_0$ and $P_0$ being called the nominal reward and kernel. \looseness=-1
\begin{assumption}($sa$-rectangularity)
\label{sa rectangle}
  We define $sa$-rectangular $L_p$-constrained uncertainty set as{\small
\begin{align*}
&\mathcal{U}_p^{\text {$sa$}}:=  
    \left(R_0+\mathcal{R}\right) \times\left(\Pzero +\mathcal{P}\right),  
\mathcal{R}=\times_{s \in \mathcal{S}, a \in \mathcal{A}} \mathcal{R}_{s, a}, 
 \mathcal{P}=\times_{s \in \mathcal{S}, a \in \mathcal{A}} \mathcal{P}_{s, a},  \mathcal{R}_{s, a}=\left\{r_{s, a} \in \mathbb{R} \mid \vert r_{s, a} \vert \leq \alpha_{s, a}\right\} \\
&\mathcal{P}_{s, a}=\{P_{s, a}: \mathcal{S} \rightarrow \mathbb{R} \mid \sum_{s'} P_{s,a}(s')=0, \Pzero+P_{s,a}\geq0, 
\normpbar{P_{s, a}} \leq \beta_{s, a}\}
\end{align*}}
\end{assumption}
    

\begin{assumption}
    ($s$-rectangularity)
    \label{s rectangle}
We define s-rectangular $L_p$-constrained uncertainty set as
%\vspace{-0.1cm}
{\small
\begin{align*}\small
&\mathcal{U}_p^{\mathbf{s}}=\left(R_0+\mathcal{R}\right) \times\left(\Pzero+\mathcal{P}\right),   \mathcal{P}=\times_{s \in \mathcal{S}} \mathcal{P}_s, 
\mathcal{R}=\times_{s \in \mathcal{S}} \mathcal{R}_s, \quad \mathcal{R}_s=\left\{r_s: \mathcal{A} \rightarrow \mathbb{R} \mid \normpbar{r_s} \leq \alpha_s\right\} \\
&\mathcal{P}_s=\{P_s: \mathcal{S} \times \mathcal{A} \rightarrow \mathbb{R} \mid \sum_{s'} P_s(s',a)=0 , %\mspace{200mu}
P_s(\cdot,a)+\Pzero\geq0   ,\normpbar{P_s} \leq \beta_s\} 
\end{align*}}

\end{assumption}
%\vspace{-0.3cm}
We write \label{def:beta}$\beta = \sup_{s,a}\beta_{s,a}$ for $sa$-rectangular assumptions or $\beta = \sup_{s}\beta_s$ for $s$-rectangular assumptions and in the same manner $ \alpha=\sup_{s,a}\alpha_{s,a}$. Moreover, we write $ P\in \mathcal{P}_{0,s,a}$ for   $P=P_{0,s,a}+ P'$ with $P'\in \mathcal{P}_{s,a}$ and $ P\in \mathcal{P}_{0,s}$ for  $P=P_{0,s}^\pi+ P'$ with $P'\in \mathcal{P}_{s}$, $P_{0,s}^\pi(s')  =\sum_a \pi(a\vert s) P_{0,s,a}(s') \in \mathbb{R}^S$. The conditions $\sum_{s'} P_s(s',a)=0 ,  P_s(\cdot,a)+\Pzero\geq0$ or $\sum_{s'} P_{s,a}(s')=0 ,  P_{s,a}(s')+\Pzero\geq0$ ensure that the robust kernel is in the simplex. In comparison to $sa$-rectangular robust MDPs, $s$-rectangular robust MDPs are less restrictive but much more difficult to deal with. Using rectangular assumptions and constraints defined with  $L_p$-balls, it is possible to derive simple dual forms for the (optimal) robust Bellman operators for the minimization problem that involves the seminorm defined below:
\looseness=-1
 \begin{definition}[Span seminorm \citep{puterman1990markov}] \label{span}
Let $q$ be the H\"older conjugate of $p$, i.e. $\frac{1}{p}+\frac{1}{q}=1$. Let the span-seminorm function $\mathrm{sp}_q: \mathbb{R}^{S} \rightarrow \mathbb{R}$ and the $q$-mean function $\omega_q: \mathbb{R}^{S} \rightarrow \mathbb{R}$ be defined as
$$
\snormqbar{v}:=\min _{\omega \in \mathbb{R}}\|v-\omega \mathbf{1}\|_{q}, \quad \omega_{q}(v):=\arg \min _{\omega \in \mathbb{R}}\|v-\omega \mathbf{1}\|_{q} .
$$
\end{definition}
%\vspace{-0.5cm}
One can think of those span-seminorms as semi-mean-centered-norms. These quantities represent the dispersion of a distribution around its mean, and there are no order relations for this type of object. Seminorms appear in the (non-robust) RL community for other reasons \citep{puterman1990markov,scherrer2013performance}. For $p=1$, $2$ and $\infty$, closed forms can be derived, corresponding to median, variance and range. This is not the case for arbitrary $p$ but span-seminorms can be efficiently computed in practice, see~\citet{kumar2022efficient}. Once span-seminorms are defined, we introduce the dual of the inner minimisation problem.
\looseness=-1
\begin{lemma}[Duality for $sa$ rectangular case with $L_p$ norm] \label{saduality}  For any $V\in \mathbb{R}^S, P_{0,s,a}=P_0(.\vert s,a) \in  \mathbb{R}^S   $  and $\mu \in \mathbb{R}^S$ \looseness=-1
    \begin{align*}
        &\min _{  P\in  \mathcal{P}_{0,s,a}   }PV=\max _{\mu\geq 0}  P_{0,s,a}(V-\mu) -\beta_{s,a} \snormqbar{V-\mu} 
=\max _{\alpha \in [V_{min}, V_{max}]} P_{0,s,a}[V]_\alpha - \beta_{s,a} \snormqbar{ [V]_\alpha  }  . 
    \end{align*}
    \looseness=-1
    % with $[V]_\alpha(s):= \alpha, \text { if } V(s)>\alpha , \text{ and } V(s), \text { otherwise. }$ and $V_{min}, V_{max}$ respectively the minimum and 
    with $[V]_\alpha(s):= \alpha$ if $ V(s)>\alpha$ and $V(s)$ otherwise,  and $V_{min}$, $V_{max}$ respectively the minimum and 
    the maximum value taken by $V$.
    \looseness=-1
\end{lemma}
\begin{lemma}[Duality for $s$ rectangular case.] 
Consider  the probability kernel $P^\pi_{0,s} =\Pi^\pi P_{0,s,a} \in \mathbb{R}^{S}$ with $\Pi^{\pi}$ the projection matrix associated with a  policy $\pi$ such that 
$P_{0,s}^\pi(s')  =\sum_a \pi(a\vert s) P_{0,s,a}(s') \in \mathbb{R}^S  $. For any $V\in \mathbb{R}^S :$
    \begin{align*}
        &\min _{ P\in \mathcal{P}_{0,s }}PV= 
        \max _{\mu\geq 0}  P_{0,s}^{\pi}(V-\mu) -\beta_{s}\normqbar{\pi_s} \snormqbar{V-\mu}  
        =\max _{\alpha \in [V_{min}, V_{max}]}  \Big(P^{\pi}_{0,s}[V]_\alpha - \beta_{s} \normqbar{\pi_s}\snormqbar{ [V]_\alpha  } \Big)
    \end{align*}
\end{lemma}




 
 %%%%%%% bouger 


 
%  \cite{derman2021twice,kumar2022efficient}. Indeed,  $\min _{R, P \in \mathcal{U}} V_{P, R}(s)$, the inner minimisation problem, has a simple form which allows having practical and efficient algorithms. Instead of considering a robust algorithm, we are only interested in a regularised form of MDPs, which has been more widely studied. The main difference is that the regularisation involves the current value function, whereas classical regularisations are only concerned with the current policy $\pi$, such as in entropy \cite{haarnoja2018soft} or KL-regularised MDP \cite{vieillard2020leverage}. 
% \ The infimum of optimisation problem of RMPDs below involve these quantities.



Proofs can be found in Appendices~\ref{saduality3} and~\ref{sduality}.
These results allow computing robust value and Q-functions. Close to our work, \cite{derman2021twice,kumar2022efficient} also consider $L_p$-norms but do not assume that the robust kernel belongs to the simplex. In that sense, their formulation is a relaxation of the framework of RMDPs. Using this relaxation, closed forms of the robust Bellman operator can be obtained, see \citet[Thm.~1]{kumar2022efficient}.  In our work, we assume a valid transition kernel in the simplex ($P_{s, a}\geq 0$ or $P_{s }\geq 0$ for respectively the $sa$- or $s$-rectangular cases), leading to a dual form that has no closed form but which is a simple scalar optimisation problem. A complete discussion can be found in Appendix~\ref{comparaison}. \looseness=-1
 % \cite{derman2021twice} introduced penalties invoking the classical norms and not span seminorms for the value function. However, assumptions on the transition kernels were not realistic, which was corrected by \citet{kumar2022efficient} to get a kernel that sums to $1$ but this still suffers from non-positivity of the kernel .
%%%%%%% 
%%%%%%%%%%%%%%%
% \begin{theorem}[\citet{kumar2022efficient}] The  $sa$-rectangular robust Bellman operator is equivalent to a regularised non-robust Bellman operator: for $r^{(s,a)}_{V}(s,a)=-\alpha_{s, a}-\gamma \beta_{s, a} \snormqbar{V}+R_0(s, a)$
% $$
% \begin{aligned}
% &\mathcal{T}_{\mathcal{U}_p^{\text {sa }}}^\pi V(s)=\langle \pi_s,r^{(s,a)}_{V}(s,a)+\gamma \sum_{s^{\prime}} \Pzero\left(s^{\prime} \mid s, a\right) V\left(s^{\prime}\right)\rangle_A\\
% &\mathcal{T}_{\mathcal{U}_p^{\text {sa }}}^* V(s)=\max _{a \in \mathcal{A}}\left[r^{(s,a)}_{V}(s,a)+\gamma \sum_{s^{\prime}} \Pzero\left(s^{\prime} \mid s, a\right) V\left(s^{\prime}\right)\right].
% \end{aligned}
% $$
% \end{theorem}
% \begin{theorem}[\citet{kumar2022efficient}]
%   The $s$-rectangular Robust Bellman operator is equivalent to a regularised non-robust Bellman operator: for $r^{s}_{V,\pi}(s,a)=-\left(\alpha_s+\gamma \beta_s \snormqbar{V}\right)\normq{\pi_s}+R_0(s,a)$, where $\left\|\pi_s\right\|_{q}$ is $q$-norm of the vector $\pi(\cdot \mid s) \in \Delta_{\mathcal{A}}$, we have
% $$
% \begin{aligned}
% &\mathcal{T}_{\mathcal{U}_p^s}^\pi V(s)=\langle \pi_s ,r^{s}_{V,\pi}(s,a)+\gamma \sum_{s^{\prime}} \Pzero\left(s^{\prime} \mid s, a\right) V\left(s^{\prime}\right)\rangle_A; \quad 
% \mathcal{T}_{\mathcal{U}_p^s}^*  V(s)=\max _{\pi \in \Delta_{\mathcal{A}}}\mathcal{T}_{\mathcal{U}_p^s}^\pi V(s).
% \end{aligned}
% $$
% \end{theorem}
% However, this requires access to seminorms, which in practice is difficult for arbitrary $p$ . For $p=$1, 2 and $\infty$, closed form can be derived, corresponding to median, variance and range~\citep{kumar2022efficient}.
% It is important to note that in the article of \citep{derman2021twice}, penalties invoking the classical norm and note span seminorms for the vale function are derived. However, assumptions on the $P$ the transition kernels are not realistic, which is corrected in \cite{kumar2022efficient}.
% These operators give an easy way to tackle the robust value iteration problem if one is able to estimate $p$-span semi norm, which is not obvious for a general $p$ even if algorithms are proposed in \cite{kumar2022efficient}. (Here we consider an oracle, and we assume we have access to $p$-semi norms.) 
Finally, we denote the robust $Q$-functions for the $sa$- and $s$-rectangular cases by $Q_{sa}^\pi$ and $Q_s^\pi$ respectively, and we define them from the robust value functions $V_{sa}^\pi$ and $V_{s}^\pi$ through \looseness=-1
\begin{align*}
    V_{s}^\pi (s)=\sum_a \pi(a\vert s) Q_{s}^\pi(s,a), \quad V_{sa}^\pi(s)=\sum_a \pi(a\vert s) Q_{sa}^\pi(s,a).
\end{align*}
% and 
% \begin{align*}
%     &Q_{s}^\pi (s)= R_0(s,a)+ \gamma \min _{  P\in P_{0,s}^\pi + \mathcal{P}_{s}   }(P V_{s}^\pi )(s) \\
%     & Q_{sa}^\pi (s)= R_0(s,a)+ \gamma \min _{  P\in P_{0,s,a}^\pi + \mathcal{P}_{s,a}   }(P V_{sa}^\pi )(s)
% \end{align*}
\begin{lemma}
For $sa-$ and $s-$ rectangular,
    \label{Q robust}
\begin{align*}
    &Q_{sa}^\pi (s,a)= \rsapi+\gamma P_{0,s,a}   V_{sa}^\pi, 
     Q_{s}^\pi (s,a)= \rspi +\gamma P_{0,s,a}   V_{s}^\pi 
\end{align*}
\vspace*{-20\lineskip}
\begin{align*}
     &\text{with  }\quad \rsapi= R_0(s,a)-  \alpha_{s,a} +\gamma \min _{  P\in  \mathcal{P}_{s,a}   } P  V_{sa}^\pi , \quad
     \rspi= R_0(s,a) -\Big( \frac{\pi_s(a)}{\normq{\pi_s}}\Big)^{q-1}  \alpha_{s} +\gamma \min _{  P^\pi \in \mathcal{P}_{s}   }P^\pi  V_{s}^\pi
\end{align*}
\end{lemma}
% In the following, we will write $V_s^\pi$ and $Q_s^\pi$ for robust value and robust Q-functions for Assumption \ref{s rectangle} and  $V_{sa}^\pi$ and $Q_{sa}^\pi$ for Assumption \ref{sa rectangle}. Finally, with these notations, we can write the robust Q-function Bellman equations for $sa$-  and $s$- rectangular assumptions as:
% \begin{align*}
% \label{Q robust}
% &\Qsapi(s,a)=-\alpha_{s, a}-\gamma 
%   \min _{P\in \Delta_s, \normpbar{P}\leq\beta_{s,a}  } P \Vpi  +R_0(s, a) \\&+\gamma \Pzero^\pi \Qsapi(s,a) ,\\
% &\Qspi(s,a)=  (-\alpha_{s}-\gamma \min _{P\in \Delta_s, \normpbar{P}\leq\beta_{s}  } P \Vpi)( \frac{\pi_s(a)}{\normq{\pi_s}})^{q-1}  \\
% & +R_0(s, a)+\gamma \Pzero^\pi \Qspi(s,a) 
% \end{align*}
%
% $$\snormqbar{\Vsapi}:=\snormqbarpi{\Qsapi}=\snormqbar{\langle \pi,\Qsapi\rangle_A}$$ and 
% $$\snormqbar{\Vspi}:=\snormqbarpi{\Qspi}=\snormqbar{\langle \pi,\Qspi\rangle_A}.$$

%We will use the closed forms of the problem updates  to analyse the $L_p$-constrained RMDPs. Furthermore, we will show that we can gain factors in the analysis of the sample complexity by a factor $\Snorm$ and a factor $H\Snorm$ in the second part of the paper, with a condition on $\beta$. This analysis justify the use of these algorithms which have a good sample complexity to tackle  the RMDPs problem.
%\vspace*{-0.4cm}



%Similar resuls can be found in \cite{kumar2023policy}. 
Robust $Q$ functions and dual forms of the robust Bellman operators will be central to our analysis of the sample complexity of model-based robust RL. They allow improving the bound by a factor $\Snorm$ or $\Snorm\Anorm$ compared to existing results (Sec.~\ref{sec:H4}). With additional technical subtleties, adapted from the non-robust setting, and assuming the uncertainty set is small enough, they even allow improving the bound by a factor $\Snorm H$ or $\Snorm\Anorm H$ (Sec.~\ref{toward}).
\looseness=-1
%% paragrap, pas subsection
%%% talbeau mettre juste tv en caption
%%  
\subsection{Generative Model Framework }
We consider the setting where we have access to a generative model, or sampler, that gives us samples $s^{\prime} \sim \Pzero(\cdot \mid s, a)$, from the nominal model and from arbitrary state-action couples. Suppose we call our sampler $N$ times on each state-action pair $(s,a)$. Let $\widehat{P}$ be our empirical model, the maximum likelihood estimate of $\Pzero$,
$
\widehat{P}(s^{\prime} \mid s, a)= 
\widehat{P}_{s,a}(s') =\frac{\operatorname{count}(s^{\prime}, s, a)}{N},
$
where $\operatorname{count}(s^{\prime}, s, a)$ represents the number of times the state-action pair $(s, a)$ transitions to state $s^{\prime}$. Moreover, we define $\widehat{M}$ as the empirical RMDP identical to the original $M$ except that it uses $\widehat{P}$ instead of $\Pzero$ for the transition kernel. We denote  by $\widehat{V}^\pi$ and $\widehat{Q}^\pi$ the value functions of a policy $\pi$ in $\widehat{M}$, and $\widehat{\pi}^{\star}, \widehat{Q}^{\star}$ and $\widehat{V}^{\star}$ denote the optimal policy and its value functions in $\widehat{M}$. It is assumed that the reward function $R_0$ is known and deterministic and therefore exactly identical in $M$ and $\widehat{M}$. 
Moreover, we write $ P\in \hat{\mathcal{P}}_{s,a}$ for   $P=\hat{P}_{s,a}+ P'$ with $P'\in \mathcal{P}_{s,a}$ and $ P\in \hat{\mathcal{P}}_{s}$ for  $P=\hat{P}_{s}^\pi+ P'$ with $P'\in \mathcal{P}_{s}$, $\hat{P}_{s}^\pi(s')  =\sum_a \pi(a\vert s) \hat{P}_{s,a}(s') \in \mathbb{R}^S  $.


\vspace{0.3cm}

Notice that our analysis would easily account for an estimated reward (the hard part being handling the estimated transition model). 
This generative model framework, when we can only sample from the nominal kernel, is classic and appears for both non-robust and robust MDPs~\citep{agarwal2020model,panaganti2022robust,gheshlaghi2013minimax,xu2023improved}. In the robust case, it is especially relevant as an abstraction of ``sim-to-real'', the simulator giving access to the nominal kernel for learning a robust policy to be deployed in the real world (assumed to belong to the uncertainty set). 
\looseness=-1
% Generative model framework is a classical framework where we are only able to sample from the nominal kernel and is present in both non-robust and robust MDPS \citep{agarwal2020model,panaganti2022robust,gheshlaghi2013minimax,xu2023improved}. Many works on sample complexity is based on this assumption, which is relevant in Sim-to-Real RL, where the simulator has only access to the nominal kernel.

\vspace{0.3cm}

The question of how to solve RMDPs and the related computational complexity are complementary to, but different from, Theorems~\ref{h4} and~\ref{h3}. Indeed, an important point that differentiates us from \citet{panaganti2022sample} is the use of a
\emph{robust optimisation oracle}. 
In (model-based) sample complexity analysis, the goal is to determine the smallest sample size $N$ such that a planner executed in $\widehat{M}$ yields a near-optimal policy in the RMDP $M$. In order to decouple the statistical and computational aspects of planning with respect to an approximate model $\widehat{M}$, we will use an optimisation oracle that takes as input an (empirical) RMDP and returns a policy $\hat{\pi}$ that satisfies
$\|\hat{Q}^* - \hat{Q}^{\hat{\pi}}\|_\infty \leq \epsilon_{\text{opt}}$. Our final bound will depend on $\epsilon$, the error made from finite sample complexity, and $\epsilon_{\text{opt}}$. In practice, the error $\epsilon_{\text{opt}}$ is typically decreasing at a linear rate of $\gamma^k$ at the $k^\mathrm{th}$ iteration of the algorithm, as in classical MDPs, because (optimal) Bellman operators are $\gamma$-contractions in both classic and robust settings when the robust kernel belongs to the simplex. 

\vspace{0.3cm}


The computational cost of RMDPs is addressed by  
\citet{iyengar2005robust} but not in the $L_p$ case. \cite{kumar2022efficient} address this question, using the regularised form of robust MDPs obtained with relaxed hypothesis on the kernel (See Appendix \ref{comparaison}).
 The conclusions of the latter are that $L_p$ robust MDPs are computationally as easy as non-robust MDPs for regularised forms, at least for some choices of $p$ for their relaxation. However, their analysis requires the robust Bellman operator to be a $\gamma$-contraction, which does not hold for sufficiently large $\beta$. Indeed, when the robust kernel is no longer in the simplex, the robust Bellman operator is no longer a $\gamma$-contraction but an $\epsilon$-contraction for $\epsilon$ close to $1$ and only for a small range of $\beta$, see \citet[Thm.~5.1]{derman2021twice}.
We address the question of solving RMDPs in the $L_p$ case with a valid robust kernel in Alg.~\ref{alg:cvi-dro-infinite} and~\ref{alg:cvi-dro-infinites},
as a possible approach to obtain an $\epsilon_{\text{opt}}$-optimal solution in our analysis.
% as it is required to obtain $\epsilon_{ops}$ solution in our analysis. 










% The conclusions of the latter are that $L_p$ robust MDPs are computationally as easy as non-robust MDPs for regularised forms, at least for some choices of $p$ for their relaxation. 

% These questions are  not central to this paper, as we consider an optimisation oracle in the sample analysis but we derive an algorithm to solve $L_p$ RMDPs with valid robust kernel in Alg. \ref{alg:cvi-dro-infinite}



%  The computational cost of  non-regularised form of RMDPs is addressed by  
% \citet{iyengar2005robust}  while this question for the regularised form of RMDPs is raised by \citet{kumar2022efficient}. The conclusions of the latter are that $L_p$ robust MDPs are computationally as easy as non-robust MDPs for regularised forms, at least for some choices of $p$ for their relaxation. Notice that the regularized form and the classic one being equivalent, any approach can be chosen. 
% % One can  use algorithms such as robust value iteration \citep{panaganti2022sample} and determine the minimum number of steps required to have an error $\epsilon_{\text {opt }}$.
% % Existing analyses  consider a specific algorithm.

 



 
%\section{ First sample complexity for RMDPs with $L_p$-balls.}
\section{First Sample Complexity }
\label{sec:H4}

% The aim of this section is to obtain an upper-bound on the sample complexity of RMDPs. This result is true for $sa$- and $s$-rectangular sets and for any $L_p$ norm  with  $p\geq1.$
The aim here is to obtain a general upper-bound on the sample complexity of RMDPs. This result is true for $sa$- and $s$-rectangular sets and for any $L_p$ norm  with  $p\geq1.$
\looseness=-1
\begin{theorem}
\label{h4}
Assume $\delta>0$, $\epsilon >0$ and $\beta >0$. Let $\widehat{\pi}$ be any $\epsilon_{\text {opt }}$-optimal policy for $\widehat{M}$, i.e. $\|\widehat{Q}^{\widehat{\pi}}-\widehat{Q}^{\star}\|_{\infty} \leq \epsilon_{\text {opt } }$.
With $N$ calls to the sampler per state-action pair, such that
$
N \geq \frac{C \gamma^2
\log \left(|\mathcal{S}||\mathcal{A}|(1-\gamma)^{-1} \delta^{-1}\right)}{(1-\gamma)^4 \epsilon^2}, 
$
we obtain the following guarantee for policy $\hat{\pi}$,
$ \norminf{\Qs-\Qpihat}\leq \epsilon +
\epsilon_{\text{opt}}     $
with probability at least $1-\delta$, where $C$ is an absolute constant. Finally, for 
$N_{\text {total }}=N|\mathcal{S}||\mathcal{A}|$ and  $H=1/(1-\gamma)$, we get an overall complexity of $N_{\text {total }}=\tilde{\mathcal{O}}\left(   \frac{H^4\Snorm\Anorm}{\epsilon^2}    \right).$
%
\end{theorem}
\subsection{Discussion}
%In practice, for value iteration $\epsilon$ is bigger than $\epsilon_{o p t   }$, as $\epsilon_{o p t   }$  is typically decreasing at a speed of $\gamma^k$ for value or policy iteration, such as \citet{panaganti2022robust} who do not assume having access to an oracle but consider empirical robust value iteration. Our theorem is true for any $\epsilon>0$, which is not always the case in other works, such as \citet{panaganti2022sample} whose result is true for $\epsilon$ in the range $(0,\gamma/(1-\gamma) )$.

This result says that the policy $\hat{\pi}$ computed by the planner on the empirical RMDP $\hat{M}$ will be $(\epsilon_{\text{opt}}+\epsilon)$-optimal in the original RMDP $M$. As explained before, there exist planning algorithms for RMDPs that guarantee arbitrarily small $\epsilon_{\text{opt}}$, such as robust value iteration considered by \citet{panaganti2022sample} or our Algorithms~\ref{alg:cvi-dro-infinite} and~\ref{alg:cvi-dro-infinites}. It will also apply to future planners, as long as they come with a convergence guarantee.
% In the non-robust case, the sample complexity is lower-bounded by $\Omega\left(\frac{H^3 \Anorm \Snorm}{\epsilon^2}\right)$ . Here in Theorem \ref{h4} we  show that without any conditions on $\beta$ or any uncertainty set, we have an overall sample complexity of about $\tilde{\mathcal{O}}\left(\frac{H^4 \Anorm \Snorm}{\epsilon^2}\right)$ for both s and sa-rectangular assumptions, which is a factor $H$ different from the lower-bound for the non-robust setting. We will show later that this bound is not tight but can be improved in section \ref{toward}, by adding a condition on $\beta$.
The error term $\epsilon$ is controlled by the number of samples: $N_\text{tot} = \tilde{\mathcal{O}}(H^4 \Snorm\Anorm\epsilon^{-2})$ calls to the generative models allow guaranteeing an error $\epsilon$.
%closed form solution of our inner minimisation problem gives  us 
This is a gain in terms of sample complexity of a factor $\Snorm$ compared to \citet{panaganti2022sample}, for the $sa$-rectangular assumption. Our bound also holds for both $s$- and $sa$-rectangular uncertainty sets. \citet{panaganti2022robust} do not study the $s$-rectangular case, while \citet{yang2021towards} do, but have a worse dependency on $\Anorm$ in this case. Their bounds also have additional dependencies on the size of the uncertainty set, which we do not have. We recall that we do not cover the same cases: we do not analyse the KL and Chi-Square robust sets, while they do not analyse the $L_p$ robust sets for $p>1$. However, the above comparison holds for the total variation case that we have in common ($p=1$). These bounds are clearly stated in Table~\ref{tableau1}. In the non-robust setting, \citet{gheshlaghi2013minimax} show that there exist MDPs where the sample complexity is at least $\tilde{\Omega}\left(\frac{H^3 \Anorm \Snorm}{\epsilon^2}\right)$.
Section \ref{toward} gives a new upper-bound in $H^3$ which matches this lower-bound for non-robust MDPs with an extra condition on the range of $\beta$ (the uncertainty set should be small enough).
 \looseness=-1
\subsection{Sketch of Proof}
\label{subsec:sketch_H4}
% The first idea is to make a bound that is as simple as possible with Hoeffding's concentration arguments. This is not a bound that is optimal with respect to the $H=1/(1-\gamma) $ horizon. On the other hand, this bound has the advantage of being valid for any range $\alpha$ and $\beta$, while our future bound in $H^3$ is optimal in $H$ but imposes a more restrictive condition on $\beta$.

This first proof is the simpler one; it relies notably on Hoeffding's concentration arguments. We provide a sketch; the full proof can be found in Appendix~\ref{annex B}. The resulting bound is not optimal in terms of the horizon $H$, but it also does not impose any condition on the range of $\epsilon$ or $\beta$, contrary to the (better) bound of Sec.~\ref{toward}.
% Note also that our bound does not depend on $\beta$, which allows us to reconcile our robust and non-robust complexity proof. Indeed, in the work of \citet{yang2021towards} or \citet{panaganti2022sample} for bounds using $\mathrm{KL}$, $\mathrm{TV}$ or $\chi^2$, when the uncertainty set grew, it increased or decreased the sample complexity, which is not the case in our work, where the complexity is very much independent of $\beta$. (See Table \ref{tableau}.)
We would like to bound the supremum norm of the difference between the optimal Q-function and the one of the policy computed by the planner in the empirical RMDP, according to the true RMDP, $\|Q^* - Q^{\hat{\pi}}\|_\infty$. Using a simple decomposition and the fact that $\pi^*$ is not necessarily optimal in the empirical RMDP ($\hat{Q}^{\pi^*} \leq \hat{Q}^* = \hat{Q}^{\hat{\pi}^*}$), we have that \looseness=-1
\begin{equation*}
    Q^* - Q^{\hat{\pi}} = Q^* - \hat{Q}^* + \hat{Q}^* - \hat{Q}^{\hat{\pi}} + \hat{Q}^{\hat{\pi}} - Q^{\hat{\pi}}.
\end{equation*}
As $Q^* - \hat{Q}^* \leq  Q^* - \hat{Q}^{\pi^*}$, a triangle inequality yields
%we get %and a triangle inequality, we get
\begin{equation*}
    \|Q^* - Q^{\hat{\pi}}\|_\infty
    \leq \|Q^* - \hat{Q}^{\pi^*}\|_\infty +
    \|\hat{Q}^* - \hat{Q}^{\hat{\pi}}\|_\infty + \|\hat{Q}^{\hat{\pi}} - Q^{\hat{\pi}}\|_\infty.
\end{equation*}
% The bound can be decomposed into three terms using triangular inequality.
% \begin{align*}
% \norminf{\Qs-\Qpihat} &\leq \norminf{\Qs-\Qhatpistar} + \norminf{ \Qhatpistar-\Qhatpihat} \\
% &+ \norminf{ \Qhatpihat-\Qpihat}
% \end{align*}
% where $\hat{\pi}$ is the policy output of the oracle, $\Qhatpihat$ is  the empirical robust Q function and $\Qhatpistar$ is the empirical robust Q function following the best policy of the true MDPs. Proof can be found in Appendix \ref{decomposition}
The second term is easy to bound, by the assumption of the planning oracle we have $\|\hat{Q}^* - \hat{Q}^{\hat{\pi}}\|_\infty\leq \epsilon_\text{opt}$. The two other terms are similar in nature. They compare the Q-functions of the same policy (either $\pi^*$ the optimal one of the original RMDP, or $\hat{\pi}$ the output of the planning algorithm) but for different RMDPs, either the original one or the empirical one.
% By definition of our problem, we can bound the second term $\norminf{\Qs-\Qhatpistar}$ by $\epsilon_{\mathrm{opt}}$ as it is the term depending on the method chosen to solve the RMDPs. Value or policy iteration algorithm is possible, but we are in a more general case here. The two last terms $\norminf{\Qs-\Qhatpistar}$ and $\norminf{ \Qhatpistar-\Qhatpihat}$ compare Q-function for different dynamics and depend on how many data we collect for each state-action pairs.  Then, for any set $\mathcal{D}$ and a vector $v$, let define
For bounding the remaining terms, we need to introduce the following notation. For any set $\mathcal{D}$ and a vector $v$, let us define
$
\kappa_{\mathcal{D}}(v)=\inf \left\{u^{\top} v: u \in \mathcal{D}\right\} .
$
This quantity corresponds to the $\inf$ form of the robust Bellman operator.  The following lemma provides a data-dependent bound of the two terms of interest.
\looseness=-1
\begin{lemma} 
%Upper bound on the norm of $Q^*(s,a)-\widehat{Q}^*(s,a)$ and $Q^*\left(s, a\right)-\Qhatpistar\left(s, a\right) $. 
With $\mathcal{P}_{s,a}$ defined in Assumption \ref{sa rectangle} and $\hat{\mathcal{P}}_{s,a}$ the robust set centred around the empirical MDPs,
% \begin{align*}
%     \norminf{\Qpihat(s,a)-\Qhatpihat(s,a)}\leq& \frac{\gamma}{1-\gamma}  \max _{s, a}\left|\kappa_{\hat{\mathcal{P}}_{s, a}^{\mathrm{}}}(\Vpihat)-\kappa_{\mathcal{P}_{s, a}}(\Vpihat)\right|\\
%  \norminf{Q^*\left(s, a\right)-\Qhatpistar\left(s, a\right) }\leq& \frac{\gamma }{1-\gamma}\max _{s, a}\left|\kappa_{\widehat{\mathcal{P}}_{s, a}}(V^*) -\kappa_{\mathcal{P}_{s, a}}(V^*)\right|
% \end{align*}
\begin{align*}
    &\|\Qpihat-\Qhatpihat\|_\infty 
    \leq \frac{\gamma}{1-\gamma}  \max_{s, a}|\kappa_{\hat{\mathcal{P}}_{s, a}^{\mathrm{}}}(\Vhatpihat)-\kappa_{\mathcal{P}_{0,s, a}}(\Vhatpihat)|
    \\
    &\|Q^*-\Qhatpistar \|_\infty
    \leq \frac{\gamma }{1-\gamma}\max_{s, a}|\kappa_{\widehat{\mathcal{P}}_{s, a}}(V^*) -\kappa_{\mathcal{P}_{0,s, a}}(V^*)|.
\end{align*}
%\pierre{faire rentrer les equations}
\end{lemma}
For proving these inequalities, we rely on fundamental properties of the (robust) Bellman operator, such as $\gamma$-contraction. This lemma is written for the $sa$-rectangular assumption, but it is also true for the $s$-rectangular one, replacing the robust set $\mathcal{P}_{s,a}$ by $\mathcal{P}_{s}$. 
% Proof can be found in Appendix \ref{upper hat}. In these two inequalities, we use the fundamental properties of the Bellman optimal operator, such as $\gamma$-contraction. The strength of these lemmas is that they do not involve manipulation of  $\beta$ and $\alpha$ and allow no conditioning on the size of the uncertainty set. This makes it possible to avoid paying an additional $\Snorm$ factor, as in \cite{panaganti2022robust}. The following lemma link concentration of robust MDPs to classical MDPs and gives a first bound with probability $1-\delta$.
Now, we need to bound the resulting terms, which is done by the following lemma. \looseness-1
\begin{lemma}
With probability at least $1-\delta$, we have
\label{close_form_main}
\begin{align*}
    &\max _{s, a}|\kappa_{\hat{\mathcal{P}}_{s, a}^{\mathrm{}}}(\Vhatpihat)-\kappa_{\mathcal{P}_{0,s, a}}(\Vhatpihat)|
 \leq    \sqrt{\frac{256 \log \left(L\right)}{(1-\gamma)^2N}}+ 2 \epsilon_{\mathrm{opt}} \\
 &  \max _{s, a}|\kappa_{\hat{\mathcal{P}}_{s, a}^{\mathrm{}}}(\Vhatpistar)-\kappa_{\mathcal{P}_{0,s, a}}(\Vhatpistar)|
 \leq    \sqrt{\frac{256 \log \left(L\right)}{(1-\gamma)^2N}}
\end{align*}
with $L=\log (8|\mathcal{S}||\mathcal{A}| /((1-\gamma) \delta))$.
% \begin{align*}
%     \max _{s, a}\left|\kappa_{\hat{\mathcal{P}}_{s, a}^{\mathrm{}}}(\Vpihat)-\kappa_{\mathcal{P}_{s, a}}(\Vpihat)\right|&=\max _{(s, a)}\left|\left(P_{s, a} -\widehat{P}_{s, a}\right)\Vpihat \right|\\
% &\leq  H \sqrt{\frac{\log (2|\mathcal{S} \| \mathcal{A}| / \delta)}{2 N}}\\
% \max _{s, a}\left|\kappa_{\widehat{\mathcal{P}}_{s, a}}(V^*) -\kappa_{\mathcal{P}_{s, a}}(V^*)\right|=& \max _{(s, a)}\left|\left(P_{s, a} -\widehat{P}_{s, a}\right)  \Vs\right]\\
% &\leq H  \sqrt{\frac{\log (2|\mathcal{S} \| \mathcal{A}| / \delta)}{2 N}}
% \end{align*}
\end{lemma}

 Again, this also holds for $s$-rectangular sets.
%
 This inequality relies on a classic Hoeffding concentration argument coupled with  absorbing MDPs of \cite{agarwal2020model}. Putting everything together, we have just shown that
\begin{equation*}
    \|Q^* - Q^{\hat{\pi}}\|_\infty \leq \frac{3\gamma \epsilon_{\mathrm{opt}} }{1-\gamma}+
    \sqrt{\frac{1024\gamma^2\log (L)}{ (1-\gamma)^4N}}
\end{equation*}
Solving in $\epsilon$ for the second term of the right-hand side gives the stated result. \looseness=-1
%\pierre{corrigé la présentation graphique ou enlever? }

% This lemma is central in that it shows that the concentration of robust rectangular MDPs with $L_p$ norms  is nothing else than the concentration difference of the non-robust form in the right hand side of the equation. This lemma is the key to the proof of Theorem \ref{h4}. This is mainly due to the closed forms of the RMDP optimisation problem. Hoeffding's inequality gives us the bound in the last inequality and the bound of Theorem \ref{h4}.
% One of the differences with previous analyses is that the other bounds are improved by a factor of $\Snorm$. This is only due to the fact that we are working with a closed form for our inner minimisation problem, which is not the case in the other cases where there is a dual form treated with $\epsilon$ covering arguments that cost this extra $\Snorm$ factor.
\section{Toward minimax optimal sample complexity} % in $H^3$ for RMDPs.}
\label{toward}
Now, we provide a better bound  in terms of the horizon $H$, reaching (up to log factors) the lower-bound in $H^3$ for non-robust MDPs.
%and thus to show that one approaches a minimax-optimal bound as classically with a horizon factor proportional to $H^3$. \pierre{on est minimax pour une condition quand meme en robust donc je laisse?}
\looseness=-1
Recall $\beta = \sup_{s,a}\beta_{s,a}$ for the $sa$-rectangular assumption or $\beta = \sup _{s}\beta_s$ for the $s$-rectangular assumption.
For the following result to hold, we need to assume that the uncertainty set is small enough: we will require $
    \beta \leq \frac{1-\gamma}{2\gamma\Snorm^{1/q}}=\frac{1}{2(H-1)\Snorm^{1/q}}$. For $p=1$, $q=\infty$ and $\Snorm^{1/q}=1$ and we retrieve results of Table \ref{tableau1}.
% then the condition ensured by $\beta$ in the derivation of the proof is $$\beta< \frac{1-\gamma}{4\gamma}=\frac{1}{4(H-1)}=B. $$
%
The following theorem is true for both $sa$- and $s$-rectangular uncertainty sets, and for any $L_p$ norm with $p\geq 1$.

\begin{theorem}
\label{h3}
Let $\beta_0 \in (0,\frac{1}{2(H-1)\Snorm^{1/q}} ]$. For
any $\kappa>0$ and
any $\epsilon_0 \leq \kappa \sqrt{H}$,
there exists a $C>0$ independent of $H$
such that for 
any $\beta \in (0,\beta_0)$ and
any $\epsilon \in (0,\epsilon_0)$,
 whenever $N$ the number of calls to the sampler per state-action pair satisfies
$
N \geq 
C 
\frac{L\gamma^2 H^3}{\epsilon^2}
$
where $L=\log (8|\mathcal{S}||\mathcal{A}| /((1-\gamma) \delta))$, it holds that 
if $\widehat{\pi}$ is any $\epsilon_{\mathrm{opt}}$-optimal policy for $\widehat{M}$, that is when $\|\widehat{Q}^{\widehat{\pi}}-\widehat{Q}^{\star}\|_{\infty} \leq \epsilon_{\mathrm{opt}}$,
then
$
\norminf{\Qs-\Qpihat}\leq \epsilon  +\frac{8\epsilon_\text{opt}}{1-\gamma}
%+\frac{9 \epsilon_{o p t   }}{1-\gamma}
$
with probability at least $1-\delta$. The overall sample complexity $N_{\text {total }}=N|\mathcal{S}||\mathcal{A}|$ is therefore $\displaystyle \tilde{\mathcal{O}}\left(   \frac{H^3\Snorm\Anorm}{\epsilon^2}    \right)$
for any $\epsilon<\epsilon_0$.
\end{theorem} \looseness=-1
\subsection{Discussion} % of Theorem \ref{h3}. }
The constants of Theorem~\ref{h3} are explicitly given in Appendix~\ref{annex c}. For instance, for $\beta_0 = \frac{1}{8(H-1)}$ 
and
$\epsilon_0
= \sqrt{16 H} 
$, we have $C=1024$, other choices being possible. Recall that in the non-robust case, the lower-bound is $\tilde{\Omega}\left(   \frac{H^3\Snorm\Anorm}{\epsilon^2}    \right)$ \citep{gheshlaghi2013minimax}. Our theorem states that any model-based robust RL approach, in the generative model setting, with an accurate enough planner applied to the empirical RMDP, reaches this lower bound, up to log terms. As far as we know, it is the first time that one shows that solving an RMDP in this setting does not require more samples than solving a non-robust MDP, provided that the uncertainty set is small enough.
% When $\beta$ is not too large, we gain a horizon factor $H$. We can interpret this result as if our MDPs, which is not too modified because of regularisation, then it reaches the optimal complexity sample which is minimax optimal for the non-robust case of $N_{\text {total }}=\tilde{\mathcal{O}}\left(   \frac{H^3\Snorm\Anorm}{\epsilon^2}    \right)$ \cite{gheshlaghi2013minimax}. 
Our range for $\epsilon$ in the robust case is similar to the one of
\citet{agarwal2020model}
for the non-robust case, namely $[0, \sqrt{H})$;
we differ only by giving more flexibility in the choice of the constant $C$.
 The  best range of $\epsilon$ for non-robust MDPs is $(0, H)$ \citep{li2020breaking}; we leave its extension to the robust case for future work. 
 %Notice that our range of $\epsilon$ is much better than the one of \citet{gheshlaghi2013minimax} for the non-robust case, which is $(0,\sqrt{H/\Snorm})$.
%The  best rate in general for Robust MDPs is for $\epsilon<H$ and it is maybe possible to improve this expression with argument close to the one of \citet{li2020breaking}.
%To our knowledge, this is the first proof for robust MDPs that has such a low sample complexity. Generalising our theorem for larger value of $\beta$  is left for future work. 
So far, we discussed the lower-bound for the non-robust case, that we reach. Indeed, non-robust MDPs can be considered as a special case of RMDPs with $\beta=0$. As far as we know, the very first robust-specific lower-bounds on the sample complexity have been proposed by \citet{yang2021towards}. They propose two lower-bounds accounting for the size of the uncertainty set, one for the Chi-square case, and one for the total variation case, the latter coinciding with our $L_p$ framework for $p=1$. This bound is
%
% A question that naturally comes to mind is: are there lower bounds for RMDPs, and do these upper bounds, that matches the lower bound of non-robust MDPs match the robust lower bound? The only work in our existence that gives us a robust lower bond is that of \citet{yang2021towards}. Two lower bounds in the $\chi^2$ and $L_1$ cases are given. The one in $L_1$ distance that coincides with our analysis in $L_p$ is:
 %
$
\tilde{\Omega}\left(\frac{|\mathcal{S}||\mathcal{A}|(1-\gamma)}{\varepsilon^2} \min \left\{\frac{1}{(1-\gamma)^4}, \frac{1}{\beta^4}\right\}\right).
$
This lower-bound has two cases, depending on the size of the uncertainty set. If $\beta\leq(1-\gamma) = 1/H$,
we  retrieve the non-robust lower bound $\tilde{\Omega}\left(\frac{|\mathcal{S}||\mathcal{A}|H^3}{\varepsilon^2}\right) $. Therefore, for an $L_1$-ball, our upper-bound matches the lower-bound, and we have proved that model-based robust RL in the generative model setting is minimax optimal for any accurate enough planner. Their condition for this bound,   $\beta\leq 1/H $, is close to our condition, $\beta\leq 1/(2(H-1))$.
This suggests that our condition on $\beta$ is not just a proof artefact.
% We notice that this lower bound has two cases. In the first one where $\beta\leq(1-\gamma) $,
% we can retrieve the non-robust lower bound of $N=\tilde{\Omega}\left(\frac{|\mathcal{S}||\mathcal{A}|H^3}{\varepsilon^2}\right) $. So for this case of $L_1$-ball, our upper bound matches the lower bound, and we have minimax optimality of the sample complexity for RMDPs. Their condition for this bound of   $\beta\leq 1/H $ is close to our condition, $\beta<(1/(4(H-1)) $ as the factor $4$ is mainly artificial, and comes from  inequalities and artefact of proof. This reinforces the idea that our condition on $\beta$ is not just due to a proof artefact.
% To the best of our knowledge, this work delivers the first minimax-optimal guarantees for RMDPs here assuming $L_p$ metric for the uncertainty set.
 In the second case, if $\beta >1-\gamma $, the lower-bound is  $\widetilde{\Omega}\left(\frac{|\mathcal{S}||\mathcal{A}|(1-\gamma)}{\varepsilon^2 \beta^4}\right)$.
In this case, our theorem does not hold, and we only currently get a bound in $H^4$ (see Sec.~\ref{sec:H4}), which does not match this lower-bound. 
\looseness=-1

\vspace{0.3cm}

In the case of TV, we know from subsequent work \citep{shi2023curious} that it is possible to get a tighter bound in the regime $\beta>1-\gamma$, but in the case of the $L_p$ norm it is still an open question, which is left as future work. In the case where $\beta$ is too large, the question arises whether RMDPs are useful, since there is little to control when the transition kernel can be too arbitrary.
% our transition kernel may differ greatly from the original one, which creates solutions that are far from the optimal one.
To sum up, to the best of our knowledge, with a small enough uncertainty set, our work delivers the first ever minimax-optimal guarantee for RMDPs according to the non-robust lower-bound for $L_p$-balls, and the first ever minimax-optimal guarantee  according to the robust lower-bound for the total variation case for sufficiently small radius of the uncertainty set, which has been later refined on the larger set of $\beta$ by \cite{shi2023curious}.
\looseness=-2
\subsection{Sketch of proof}

The full proof is provided in Appendix~\ref{annex c}. As in Sec.~\ref{subsec:sketch_H4}, we start from the inequality
\begin{equation*}
    \|Q^* - Q^{\hat{\pi}}\|_\infty
    \leq \|Q^* - \hat{Q}^{\pi^*}\|_\infty +
    \|\hat{Q}^* - \hat{Q}^{\hat{\pi}}\|_\infty + \|\hat{Q}^{\hat{\pi}} - Q^{\hat{\pi}}\|_\infty,
\end{equation*}
where the second term of the right-hand side can again be readily bounded, $\|\hat{Q}^* - \hat{Q}^{\hat{\pi}}\|_\infty \leq \epsilon_\text{opt}$. To bound the remaining two terms, if we want to obtain a tighter final bound, the contracting property of the robust Bellman  operator will not be enough, we need a finer analysis. To achieve this, we rely on the total variance technique introduced by \citet{gheshlaghi2013minimax} for the non-robust case, combined with the \emph{absorbing MDP} construction of \citet{agarwal2020model}, also for the non-robust case, which allows improving the range of valid $\epsilon$. The key underlying idea is to rely on a Bernstein concentration inequality rather than a Hoeffding one, therefore considering the variance of the random variable rather than its range, tightening the bound. Working with a Bernstein inequality will require controlling the variance of the return. A key result was provided by \citet{gheshlaghi2013minimax}, that we extend to the robust setting,
 \looseness=-1
\begin{equation}
 \label{total_variance}
     \left\|\left(I-\gamma P_0^\pi\right)^{-1} \sqrt{\operatorname{Var}_{P_0}\left(V^\pi\right)}\right\|_{\infty} \leq \sqrt{\frac{2}{(1-\gamma)^3}}.
\end{equation}
   Naively bounding the left-hand side would provide a bound in $H^2$, while this (non-obvious) bound in $\sqrt{H^3}$ is crucial for obtaining an overall dependency in $H^3$ in the end.
% In this analysis, we do a finer error propagation following the ideas of \citet{agarwal2020model} for the Q function to  refine our bounds. To our knowledge, this is a only way  to get a chance to have bound in $H^3$. Classical idea from \cite{gheshlaghi2013minimax,agarwal2020model,li2020breaking} are related to the fact that we can upper bound the variance of returns as follows. This technique is called total variance technique, and it is the key in non-robust MDPs to go from $H^4$ to $H^3$. The inequality is as follows. For any policy $\pi$:
% %
%  \begin{equation}
%  \label{total_variance}
%      \left\|\left(I-\gamma P^\pi\right)^{-1} \sqrt{\operatorname{Var}_P\left(V^\pi\right)}\right\|_{\infty} \leq \sqrt{\frac{2}{(1-\gamma)^3}}.
%  \end{equation}
% %
% If the left-hand side is upper bounded too sharply, we obtain an expression homogeneous to $H^2$,  wherease the right-hand side is homogeneous to $H^{3/2}$ in this analysis which is a consequence of Jensen's inequality. 
Now, we come back to the terms $\|Q^* - \hat{Q}^{\pi^*}\|_\infty$ and $\|Q^{\hat{\pi}} - \hat{Q}^{\hat{\pi}}\|_\infty$ that we have to bound. This bound should involve a term proportional to $(I-\gamma P_0^\pi)^{-1}$ to leverage later Eq.~\eqref{total_variance}. The following lemma is inspired by \citet{agarwal2020model}, and its proof relies crucially on having a simple dual of the robust Bellman operator.
 \looseness=-1
% To obtain a form proportional  to $\left(I-\gamma P^\pi\right)^{-1}$ such as in  Eq.~\eqref{total_variance} in our analysis, we have to do a propagation error of Q function involving this term such as in \cite{agarwal2020model} as follows :
\begin{lemma} %Upper bound on $\Qs-\Qhatpistar$ and on $\Qpihat-\Qhatpihat$
\label{lemma_pis_main}
\begin{align*}
    \| \Qpihat-\Qhatpihat \|_\infty \leq & \gamma \|(I-\gamma\Pzeropihat)^{-1}(\Pzero-\widehat{P}) \Vhatpihat\|_\infty 
    + \frac{2\gamma \beta \Snorm^{1/q} }{1-\gamma}  \|\Qpihat- \Qhatpihat\|_\infty.
\end{align*}%
% \small{
% \begin{align*}
%  \norminf{\Qs-\Qhatpistar}\leq& \gamma\norminf{\left(I-\gamma\Ppistar\right)^{-1}(P-\widehat{P}) \Vhatpistar }\\
%  +& \frac{2\gamma \beta  }{1-\gamma}   \norminf{\Qs- \Qhatpistar}
%  \\
%   \norminf{ \Qpihat-\Qhatpihat }\leq& \norminf{ \gamma\left(I-\gamma\Ppihat\right)^{-1}(P-\widehat{P}) \Vhatpihat}\\ 
%    +& \frac{2\gamma \beta  }{1-\gamma}  \norminf{\Qpihat- \Qhatpihat}
% \end{align*}
% }
%\label{lemma pi hat main}
\end{lemma}

This lemma also holds when replacing $\hat{\pi}$ by $\pistar$. We see that the term $\beta$ appears in the bound. This comes from the need to control the difference in penalisation between seminorms of value functions, from a technical viewpoint. Indeed, the terms $\frac{2\gamma \beta \Snorm^{1/q}}{1-\gamma} \|Q^\pi- \hat{Q}^\pi\|_\infty$ (with $\pi$ being either $\hat{\pi}$ or $\pi^*$) are not present in the non-robust version of the bound, and are one of the main differences from the derivation of \citet{agarwal2020model}. The first term of the right-hand side of each bound $\| (I-\gamma P_0^\pi)^{-1}(P_0-\widehat{P}) \hat{V}^\pi \|_\infty$ (with $\pi$ being either $\hat{\pi}$ or $\pi^*$) will be upper-bounded using a Bernstein argument, leveraging also Eq.~\eqref{total_variance}. The resulting lemma is the following.
 \looseness=-1
% The major problem in obtaining $H^3$ bounds in robust MDPs is that it involves $\beta$ in the proof. At some point, we have to control the difference in penalisation between the semi-norms of the value functions, which imposes a condition on $\beta$ not too large in order to always have the bound. We cannot use the Lemma  \ref{close_form_main} as in the $H^4$ case.
% The residual term related to $   \frac{2\gamma \beta  }{1-\gamma}   \norminf{\Qpihat- \Qhatpihat}$ or $    \frac{2\gamma \beta  }{1-\gamma}  \norminf{\Qpihat- \Qhatpihat}$ has to be sufficiently small, which impose the condition on $\beta$ in the following lemma. These terms are not present in the non-robust case and are one of the main difference in the derivation of \cite{agarwal2020model}. The two last terms $ \gamma\left(I-\gamma\Ppihat\right)^{-1}(P-\widehat{P}) \Vhatpihat$ and $ \gamma\left(I-\gamma\Ppistar\right)^{-1}(P-\widehat{P}) \Vhatpistar$ will be upper bounded using concentration arguments. From these two lemma \ref{lemma_pis_main},  we can obtain Lemma \ref{concentration_end} using concentration of the variance of the value function with Bernstein's inequality argument.  Hoeffding's concentration \citep{hoeffding1994probability}, which is only based on the range of the random variable, is generally less tight than Bernstein's inequality arguments bounding the variance (See Eq.~\eqref{total_variance}) of the random variable. Hoeffding's argument here does not work as we retrieve the first Theorem \ref{h4}.
\begin{lemma}
%Upper bound and concentration arguments.
With probability at least $1-\delta$,  we have
\label{concentration_end}
\begin{align*}
   & \left\|Q^{\widehat{\pi}}-\widehat{Q}^{\widehat{\pi}}\right\|_{\infty}
    < (C_{N}+C_\beta) \|\Qpihat- \Qhatpihat\|_\infty
    +4\gamma \sqrt{\frac{ L}{N(1-\gamma)^3}} + \frac{\gamma \Delta_{\delta, N}^{\prime}}{1-\gamma}+ \frac{\gamma \epsilon_{\mathrm{opt}}}{1-\gamma}\left(2+\sqrt{\frac{8 L}{N}}\right),
    %+\frac{\gamma \epsilon_{\mathrm{opt}}}{1-\gamma}\left(2+\sqrt{\frac{8 L}{N}}\right) 
\end{align*}
% \begin{align*}
%     \left\|Q^{\widehat{\pi}}-\widehat{Q}^{\widehat{\pi}}\right\|_{\infty}&<\left(C_{N}+C_\beta\right) \norminf{\Qpihat- \Qhatpihat} \\
%     +4\gamma \sqrt{\frac{ L}{N(1-\gamma)^3}}+&\frac{\gamma \Delta_{\delta, N}^{\prime}}{1-\gamma}+\frac{\gamma \epsilon_{\mathrm{opt}}}{1-\gamma}\left(2+\sqrt{\frac{8 L}{N}}\right) . \\ \text{    } \\
%     \norminf{\Qs- \Qhatpistar}&<\left(C_{N}+C_\beta\right)  \norminf{\Qs- \Qhatpistar} \\
%    +&4\gamma \sqrt{\frac{ L}{N(1-\gamma)^3}}+\frac{\gamma \Delta_{\delta, N}^{\prime}}{1-\gamma} .
% \end{align*}
%
with $C_{\beta}=\frac{2\gamma\beta\Snorm^{1/q}}{1-\gamma}$  and $C_{N}=\frac{\gamma}{1-\gamma} \sqrt{\frac{8 L}{N}}$ 
and where
$
\Delta_{\delta, N}^{\prime}=\sqrt{\frac{c L}{N}}+\frac{c L}{(1-\gamma) N}$
with $L=\log (8|\mathcal{S}||\mathcal{A}| /((1-\gamma) \delta))$.
\end{lemma}



For this result to be exploitable, we have to ensure that $C_N+ C_{\beta} < 1$, which leads 
to $\beta \leq \frac{1-\gamma}{2\gamma \Snorm^{1/q}}$, and then $C_N + C_\beta < 1$ leads to a constraint on
$N$ in Theorem~\ref{h3}. Finally, injecting the result of this last lemma in the initial bound, keeping the dominant term in $1/\sqrt{N}$ and solving for $\epsilon$ provides the stated result, cf.\ Appendix~\ref{annex c}.
 \looseness=-1

% Here we see that the condition on $C_N+C_\beta < 1$ impose a range of validity of the theorem on $\beta $ coming from $C_\beta< 1/2$. The range where the theorem is valid in terms of $\epsilon$ comes from imposing that $C_N<1/2$. The constant $1/2$ is arbitrary, and it is possible to change it as long as it belongs to $(0,1)$ if we want to improve the conditions on $\beta$ and degrade those on $\epsilon$ or reversely. 


% Lastly, it is important to notice in this bound, we have tried carefully to obtain the largest $\beta$ where the theorem is valid. 
% % If we do not renormalise the norms in the definition of uncertainty set in Assumtions \ref{s rectangle}, \ref{sa rectangle}, we suffer from an additional factor $\Snorm^{1/q}$ that comes from Holder's inequality but which is in fact only depends on the dimension of the RMDPs. 
% By reasoning on homogeneous problems using normalised norms, we obtain our condition on $\beta<(1-\gamma)/(4\gamma)$. Finally, Theorem \ref{h3} is a direct consequence of Lemma \ref{concentration_end}.


\section{Conclusion}

In this paper, we have studied the question of the sample complexity of model-based robust reinforcement learning.  We have considered the classic (in non-robust RL) generative model setting, where a sampler can provide next-state samples from the nominal kernel and from arbitrary state-action couples. We focused our study more specifically on $sa$- and $s$-rectangular uncertainty sets corresponding to $L_p$-balls around the nominal. We ensure $\gamma$-contraction of the robust Bellman operator in order not to be too conservative, contrary to \cite{derman2021twice} and \cite{kumar2022efficient}, and propose two algorithms (Algorithms~\ref{alg:cvi-dro-infinite} and~\ref{alg:cvi-dro-infinites}) to solve $L_p$ RMDP problems.
 \looseness=-1
 
Without any restriction on the maximum level of suboptimality ($\epsilon$) or the size of uncertainty set ($\beta$), we have shown that the sample complexity of the studied general setting is $\tilde{\mathcal{O}}(\frac{\Snorm\Anorm H^4}{\epsilon^2})$, already significantly improving  existing results \citep{yang2021towards,panaganti2022sample}. Our bound holds for both the $sa$- and $s$-rectangular cases, and improves existing results (for the total variation) by respectively $\Snorm$ and $\Snorm\Anorm$. By assuming a small enough uncertainty set, and for a small enough $\epsilon$, we further improved this bound to $\tilde{\mathcal{O}}(\frac{\Snorm\Anorm H^3}{\epsilon^2})$, adapting proof techniques from the non-robust case \citep{gheshlaghi2013minimax, agarwal2020model}. This is a significant improvement. Our bound again holds for both the $sa$- and $s$-rectangular cases, it matches the lower-bound for the non-robust case~\citep{gheshlaghi2013minimax}, and it matches the total variation lower-bound for the robust case when the uncertainty set is small enough \citep{yang2021towards}. We think this is an important step towards minimax optimal robust reinforcement learning. 

There are a number of natural perspectives, such as knowing if we could extend our results to other kinds of uncertainty sets, or to extend our last bound to larger uncertainty sets (despite the fact that if the dynamics is too unpredictable, there may be little left to be controlled). Our results build heavily on the simple dual form of the robust Bellman operator, which prevents us from considering, for the moment, uncertainty sets based on the KL or Chi-square divergence. Beyond their theoretical advantages such as contraction of the Robust Bellman Operator, these simple dual forms also provide practical and computationally efficient planning algorithms.  Therefore, another interesting research direction would be to know if one could derive additional useful uncertainty sets relying primarily on the regularization viewpoint.
 \looseness=-1


\bibliography{main}
\appendix
\newpage




\section{Overview and useful inequalities}
%\subsection{Overview}
The appendix is organised as follows:
\begin{itemize}
     \item In Appendix~\ref{tableau_com}, we give a comprehensive table with state-of-the-art sample complexities for every distance.
     \item In Appendix~\ref{comparaison}, we provide more details/explanations on the difference between our formulation and the ones of \cite{kumar2022efficient} and \cite{derman2021twice}.
     \item In Appendix~\ref{DRVI}, we give more details about our algorithm, $\mathtt{DRVI}~\mathtt{L_P}$.
     
     \item In Appendix~\ref{annex A}, we give some useful inequalities frequently used in the proofs.
    \item In Appendix~\ref{annex B}, we prove Theorem~\ref{h4}.
    \item In Appendix~\ref{annex c}, we prove Theorem~\ref{h3}.
    \item In Appendix~\ref{sec:complex}, we establish the complexity of Algorithms~\ref{alg:cvi-dro-infinite} and~\ref{alg:cvi-dro-infinites}.
    %\item  In Appendix  \ref{annex_annex}, we show how to adapt the concentration result of \citet{agarwal2020model} to the robust setting.
\end{itemize}

Finally, the proofs for the $s$-rectangular and $sa$-rectangular cases are often very similar. When this is the case, we combine them in a single proof, detailing the two cases where needed.


\subsection{Table of sample Complexity}
\label{tableau_com}
\begin{table*}[h!]
\tiny
\caption{Sample complexity for different metrics and $s$- or $sa$-rectangular assumptions, with $\beta$ the radius of the uncertainty set, $H$ the horizon factor, $\epsilon$ the precision, $\beta_{0,p}= (1-\gamma)/(2\gamma \Snorm^{1/q})$, and $\bar{p}$ the smallest positive state transition probability of the nominal kernel visited by the optimal robust policy (see \citet{yang2021towards}).
 \label{tableau2}}
\begin{tabular}{ |p{0.4cm}|p{2.4cm}|p{2.6cm}|p{1.5cm}|p{1.6cm}|p{1.6cm}|  p{1.7cm} |p{1.7cm} | }
%\hline
% \multicolumn{5}{|c|}{\textbf{Sample Complexity of RMDPs with a generative model}} \\
\hline
& \cite{panaganti2022sample}  &\cite{yang2021towards} & \cite{shi2022distributionally}&Our  $ \beta \geq0$ & Our $ \beta_{0,p} >\beta>0$ &    \cite{shi2023curious} $\beta>1-\gamma$ & \cite{shi2023curious}   $0<\beta< 1-\gamma$
\\
\hline
TV ($sa$) & $\tilde{\mathcal{O}}\left(\frac{\Snorm^2\Anorm  \horizon^4}{\epsilon^2}\right)$ &$\tilde{\mathcal{O}}\left(\frac{\Snorm^2\Anorm  \horizon^4(2+\beta)^2}{\epsilon^2\beta^2}\right)$& $\times$ &$\tilde{\mathcal{O}}\left(\frac{\Snorm\Anorm  \horizon^4}{\epsilon^2}\right)$ &$\tilde{\mathcal{O}}\left(\frac{\Snorm\Anorm \horizon^3}{\epsilon^2}\right)$  &    $\tilde{\mathcal{O}}\left(\frac{\Snorm\Anorm \horizon^2}{\epsilon^2 \beta}\right)$  &   $\tilde{\mathcal{O}}\left(\frac{\Snorm\Anorm \horizon^3}{\epsilon^2}\right)$ \\
\hline
TV ($s$)& $\times$&$\tilde{\mathcal{O}}\left(\frac{\Snorm^2\Anorm^2  \horizon^4(2+\beta)^2}{\epsilon^2\beta^2}\right)$& $\times$&$\tilde{\mathcal{O}}\left(\frac{\Snorm\Anorm  \horizon^4}{\epsilon^2}\right)$  &$\tilde{\mathcal{O}}\left(\frac{\Snorm\Anorm  \horizon^3}{\epsilon^2}\right)$ & $\times$ & $\times$\\
\hline
$L_p$  ($sa$)& $\times$&$\times$ & $\times$ &$\tilde{\mathcal{O}}\left(\frac{\Snorm\Anorm  \horizon^4}{\epsilon^2}\right)$  & $\tilde{\mathcal{O}}\left(\frac{\Snorm\Anorm  \horizon^3}{\epsilon^2}\right)$ &$\times$ &$\times$ \\
\hline
$L_p$ ($s$)& $\times$&$\times$ & $\times$ &$\tilde{\mathcal{O}}\left(\frac{\Snorm\Anorm  \horizon^4}{\epsilon^2}\right)$  & $\tilde{\mathcal{O}}\left(\frac{\Snorm\Anorm \horizon^3}{\epsilon^2}\right)$&$\times$ &$\times$ \\
\hline
$\chi^2$  ($sa$)  & $\tilde{\mathcal{O}}\left(\frac{\Snorm^2\Anorm \beta  \horizon^4}{\epsilon^2}\right)$  & $\tilde{\mathcal{O}}\left(\frac{|\mathcal{S}|^2|\mathcal{A}|(1+\beta)^2H^4}{\varepsilon^2(\sqrt{1+\beta}-1)^2}\right)$ &$\times$ & $\times$ & $\times$ &   $\tilde{\mathcal{O}}\left(\frac{\Snorm\Anorm \beta  \horizon^4}{\epsilon^2}\right)$  & $\tilde{\mathcal{O}}\left(\frac{\Snorm\Anorm \beta  \horizon^4}{\epsilon^2}\right)$  \\
\hline
$\chi^2 $ ($s$)  & $\times$& $\tilde{\mathcal{O}}\left(\frac{|\mathcal{S}|^2|\mathcal{A}|^3(1+\beta)^2H^4}{\varepsilon^2(\sqrt{1+\beta}-1)^2}\right)$ & $\times$& $\times$  &$\times$& $\times$ &$\times$ \\
\hline
$\mathrm{KL}$  ($sa$)& $  \tilde{\mathcal{O}}\left(\frac{|\mathcal{S}|^2|\mathcal{A}| \exp (H)H^4 }{\beta^2 \varepsilon^2}\right) $&$\tilde{\mathcal{O}}\left(\frac{\Snorm^2\Anorm   \horizon^4}{\bar{p}^2\epsilon^2\beta^2}\right)$& $\tilde{\mathcal{O}}\left(\frac{\Snorm\Anorm   \horizon^4}{\bar{p}\epsilon^2\beta^4}\right)$ & $\times$ & $\times$& $\times$ &$\times$ \\
\hline
$\mathrm{KL}$  ($s$)& $  \times$&$\tilde{\mathcal{O}}\left(\frac{\Snorm^2\Anorm^2   \horizon^4}{\bar{p}^2\epsilon^2\beta^2}\right)$& $\times$& $\times$ & $\times$&$\times$ &$\times$ \\
\hline
\end{tabular}
\end{table*}

\newpage



\subsection{Relation with related work}
\label{comparaison}


In the work of \cite{derman2021twice},
closed forms for RMDPs with $L_p$ norms are derived assuming the following uncertainty set:

\begin{assumption}
    ($sa$-rectangularity in \cite{derman2021twice})
$$
\begin{gathered}
\mathcal{U}_p^{\text {$sa$}}:=\left(R_0+\mathcal{R}\right) \times\left(\Pzero +\mathcal{P}\right),  
\mathcal{R}=\times_{s \in \mathcal{S}, a \in \mathcal{A}} \mathcal{R}_{s, a}, \mathcal{R}_{s, a}=\left\{r_{s, a} \in \mathbb{R} \mid\normpbar{r_{s, a}} \leq \alpha_{s, a}\right\} \\
\text {  } \mathcal{P}=\times_{s \in \mathcal{S}, a \in \mathcal{A}} \mathcal{P}_{s, a}, \quad
\mathcal{P}_{s, a}=\{P_{s, a}: \mathcal{S} \rightarrow \mathbb{R} \mid \normpbar{P_{s, a}} \leq \beta_{s, a}\}
\end{gathered}
$$
\end{assumption}


Using these uncertainty sets leads to the following Bellman Operator :





\begin{theorem}[\citet{derman2021twice}]
  The $sa$-rectangular Robust Bellman operator is equivalent to a regularised non-robust Bellman operator: for $r^{s,a}_{V,\pi}(s,a)=-\left(\alpha_{s,a}+\gamma \beta_{s,a} \normqbar{V}\right)+R_0(s,a)$, where $\left\|\pi_s\right\|_{q}$ is the $q$-norm of the vector $\pi(\cdot \mid s) \in \Delta_{\mathcal{A}}$, we have
$$
\begin{aligned}
&\mathcal{T}_{\mathcal{U}_p^{sa}}^\pi V(s)=\langle \pi_s ,r^{s,a}_{V,\pi}(s,a)+\gamma \sum_{s^{\prime}} \Pzero\left(s^{\prime} \mid s, a\right) V\left(s^{\prime}\right)\rangle_A
\end{aligned}
$$
\end{theorem}

Using this formulation, they  get a closed form for the inner minimisation problem and for the Robust Bellman Operator


The work of \citet{kumar2022efficient} modifies that of \cite{derman2021twice} by using kernels that sum to $1$, i.e., by imposing $\sum_{s'}P_{s,a}(s')=0$ on the perturbation in their definition. However, with this uncertainty set, it is still possible to obtain a robust kernel outside the simplex. Using this formulation, they also get a closed form for the inner minimisation problem and for the Robust Bellman Operator.

% In the work of \cite{kumar2022efficient}, close forms for  RMDPs with $L_p$ norms are derived  assuming the following uncertainty set :
\begin{assumption}
    ($sa$-rectangularity in \cite{kumar2022efficient})
    $$
\begin{gathered}
\mathcal{U}_p^{\text {$sa$}}:=\left(R_0+\mathcal{R}\right) \times\left(\Pzero +\mathcal{P}\right),  
\mathcal{R}=\times_{s \in \mathcal{S}, a \in \mathcal{A}} \mathcal{R}_{s, a}, \mathcal{R}_{s, a}=\left\{r_{s, a} \in \mathbb{R} \mid\normpbar{r_{s, a}} \leq \alpha_{s, a}\right\} \\
\text {  } \mathcal{P}=\times_{s \in \mathcal{S}, a \in \mathcal{A}} \mathcal{P}_{s, a} 
\mathcal{P}_{s, a}=\{P_{s, a}: \mathcal{S} \rightarrow \mathbb{R} \mid \sum_{s^{\prime}} P_{s, a}\left(s^{\prime}\right)=0,\normpbar{P_{s, a}} \leq \beta_{s, a}\}
\end{gathered}
$$
\end{assumption}
    



With these uncertainty sets, the robust kernel may no longer belong to the simplex, as they do not assume $P_{0}+P_{s,a}\geq 0$. This leads to the following Bellman operator:



\begin{theorem}[\citet{kumar2022efficient}]
  The $sa$-rectangular Robust Bellman operator is equivalent to a regularised non-robust Bellman operator: for $r^{s,a}_{V,\pi}(s,a)=-\left(\alpha_{s,a}+\gamma \beta_{s,a} \snormqbar{V}\right)+R_0(s,a)$, where $\left\|\pi_s\right\|_{q}$ is the $q$-norm of the vector $\pi(\cdot \mid s) \in \Delta_{\mathcal{A}}$, we have
$$
\begin{aligned}
&\mathcal{T}_{\mathcal{U}_p^{sa}}^\pi V(s)=\langle \pi_s ,r^{s,a}_{V,\pi}(s,a)+\gamma \sum_{s^{\prime}} \Pzero\left(s^{\prime} \mid s, a\right) V\left(s^{\prime}\right)\rangle_A
\end{aligned}
$$
\end{theorem}

where $\snormqbar{V}$ is defined in Def.~\ref{span}. These results are due to the following lemma. 

\begin{lemma}[ \cite{kumar2022efficient}.  Duality for the minimisation problem for $sa$ rectangular case with $L_p$ norm without simplex constraint] 
    \begin{equation*}
\inf _{P : \sum_{s'}P(s')=1, \, \normpbar{P-\hat{P}_{s,a}}\leq \beta_{s,a}  }PV= \widehat{P}_{s,a}V -\beta_{s,a} \snormqbar{V} 
    \end{equation*}
    
\end{lemma}



Our analysis assumes the positivity of the kernel function, $P_0+P_s\geq 0$ in s-rectangular or  $P_{0}+P_{s,a}\geq 0$  for $sa$-rectangular case. Using this more realistic assumption, we can not obtain a closed form of the robust Bellman operator. However, we are still able to compute a dual form for the inner minimisation problem of RMDPs.
With our definition of rectangularity in the simplex:



\begin{assumption}($sa$-rectangularity)
\label{sa rectangle2}
  We define $sa$-rectangular $L_p$-constrained uncertainty set as
\begin{align*}
&\mathcal{U}_p^{\text {$sa$}}:=  
    \left(R_0+\mathcal{R}\right) \times\left(\Pzero +\mathcal{P}\right),  
\mathcal{R}=\times_{s \in \mathcal{S}, a \in \mathcal{A}} \mathcal{R}_{s, a}, 
 \mathcal{P}=\times_{s \in \mathcal{S}, a \in \mathcal{A}} \mathcal{P}_{s, a},  \mathcal{R}_{s, a}=\left\{r_{s, a} \in \mathbb{R} \mid \vert r_{s, a} \vert \leq \alpha_{s, a}\right\} \\
&\mathcal{P}_{s, a}=\{P_{s, a}: \mathcal{S} \rightarrow \mathbb{R} \mid \sum_{s'} P_{s,a}(s')=0, \Pzero+P_{s,a}\geq0, 
\normpbar{P_{s, a}} \leq \beta_{s, a}\}
\end{align*}

\end{assumption}
    


and using  $
\kappa_{\mathcal{D}}(v)=\inf \left\{u^{\top} v: u \in \mathcal{D}\right\}
$, we obtain:
\begin{lemma}[Duality for the minimisation problem for $sa$ rectangular case with $L_p$ norm] \label{saduality2}
    \begin{equation*}
        \kappa_{\hat{\mathcal{P}}_{s,a}^{\mathrm{}}}(V)=\max _{\mu\geq 0}  \{\widehat{P}_{s,a}(V-\mu) -\beta_{s,a} \snormqbar{V-\mu} \} =\max _{\alpha \in [V_{min}, V_{max}]} \widehat{P}_{s,a}[V]_\alpha - \beta_{s,a} \snormqbar{ [V]_\alpha  }  .
    \end{equation*}
    with $[V]_\alpha(s):= \begin{cases}\alpha, & \text { if } V(s)>\alpha \\ V(s), & \text { otherwise. }\end{cases}$
\end{lemma}

The proof can be found in Appendix~\ref{saduality3}.








Contrary to the previous lemma from \cite{kumar2022efficient}, there is an additional $\max$ operator in our dual formulation.  Interestingly, their formulation is a relaxation of our Lemma~\ref{saduality}, as it does not assume the positivity of the kernel. This relaxation allows practical algorithms with closed forms, but it yields a non-exact formulation of RMDPs, with robust kernels that may not lie in the simplex.


One crucial point in our analysis is that Bellman Operator for RMDPs is a $\gamma$- contraction for  robust kernel in the simplex for any radius $\beta$ (see \cite{iyengar2005robust}). For \cite{kumar2022efficient}
and \cite{derman2021twice} the range of $\beta$ where their Robust Bellman Operator is a contraction is smaller than $\frac{1-\gamma}{\gamma \Snorm^{1/q}} $ (see Proposition 4 of \cite{derman2021twice}) which is the range where we have minimax optimality in our Theorem \ref{h3}.
For $\beta>\frac{1-\gamma}{\gamma \Snorm^{1/q}} $, there is no contraction anymore.
 In the following, we will assume that robust kernels belong to the simplex, so as to use the $\gamma$-contraction property in our proof of sample complexity and to ensure convergence, for any $\beta$, of the following distributionally robust value iteration for $L_p$ norms (Algorithm~\ref{alg:cvi-dro-infinite}).



\subsection{Model-based algorithm   }
\label{DRVI}
\begin{algorithm}
	\textbf{input:} empirical nominal transition kernel $\widehat{P}_{0}$; reward function $r$; uncertainty level $\beta$. \\ 
	\textbf{initialization:} $\widehat{Q}_0(s,a)= 0$, $\widehat{V}_0(s)=0$ for all $(s,a) \in S\times A$. \\
   \For{$t = 1,2,\cdots, T$}
	{
		
		\For{$ \forall s\in S, a\in A$}{
			Set $\widehat{Q}_t(s, a)$ according to \eqref{eq:vi-iteration} for $sa-$rectangular ;

		}
		\For{$\forall s\in S$}{
			Set $\widehat{V}_t(s) = \max_a \widehat{Q}_t(s, a)$;
		}
	}

	\textbf{output:} $\widehat{Q}_T$, $\widehat{V}_T$ and $\widehat{\pi}$ obeying $\widehat{\pi}(s) = \arg\max_a \widehat{Q}_T(s,a)$.
	\caption{ $ \mathtt{DRVI }~ \mathtt{L_P}$: Distributionally robust value iteration DRVI for $L_P$ norms with $sa-$rectangular assumptions}
 \label{alg:cvi-dro-infinite}
\end{algorithm} 




We propose Alg.~\ref{alg:cvi-dro-infinite} to solve robust MDPs in the case of $L_P$ norms using value iteration with $sa$-rectangularity assumptions. First, we remark that directly solving the primal form of the update \eqref{eq:vi-iteration} is computationally costly, as
 it requires an optimization over an $S$-dimensional probability simplex at each iteration, especially when the dimension of the state space $S$ is large. However, using strong duality, as \cite{iyengar2005robust} did for the $TV$ distance, one can instead solve the dual problem, which yields \eqref{eq:vi-iteration}. The equivalence between the two formulations can be found in Lemma~\ref{saduality}. Using the dual form, the optimization reduces to a scalar optimization problem that can be solved efficiently using any $1$-dimensional solver if there exists an analytic form of the span semi-norm. Then the iterates $\big\{\widehat{Q}_t \big\}_{t\geq0}$ of DRVI for $L_P$ norms converge linearly to the fixed point $\widehat{Q}^{\star}$, 
owing to the appealing $\gamma$-contraction property of robust MDPs in the simplex. From an initialization $\widehat{Q}_0 = 0$, the update rule at the $t$-th ($t\geq 1$) iteration can be formulated, for the $sa$-rectangular case, as:

\begin{align}
\forall (s,a)\in  S\times A :\quad \widehat{Q}_t(s,a)=&r(s,a)+\gamma\max _{\mu\geq 0} \Big\{ \widehat{P}_{s,a}(\widehat{V}_{t-1}-\mu) -\beta_{s,a} \snormqbar{\widehat{V}_{t-1}-\mu} \Big\} \\=&r(s,a)+\gamma\max _{\alpha \in [V_{min}, V_{max}]} \Big\{ \widehat{P}_{s,a}[\widehat{V}_{t-1}]_\alpha - \beta_{s,a} \snormqbar{ [\widehat{V}_{t-1}]_\alpha  } \Big\}  , \label{eq:vi-iteration}
\end{align}
% or for $s-$rectangular case as :
% \begin{align}
% \forall (s,a)\in & S\times A :\quad \widehat{Q}_t(s,a) = r(s,a) + \gamma  \kappa_{\hat{\mathcal{P}}_{s}^{\mathrm{}}}(\widehat{V}_{t-1}) =r(s,a)+\max _{\alpha \in [V_{min}, V_{max}]} \widehat{P}_{}[\widehat{V}_{t-1}]_\alpha -\normqbar{\pi_s} \beta_{s,a} \snormqbar{ [\widehat{V}_{t-1}]_\alpha  }  \label{eq:vi-iteration2}
% \end{align}

 
 The specific form of the dual problem depends on the choice of the norm. In the case of $L_1$, $L_2$, or $L_\infty$, 
 span semi-norms involved in dual problems have closed forms (respectively equal to the median, variance, or span),
 and equation~\eqref{eq:vi-iteration} corresponds to a $1$-D maximisation problem.
 
 But in the general case, one has to compute span semi-norms, which can be done easily using binary search, solving $$\sum_s \operatorname{sign}\left(v(s)-\omega_q(v)\right)\left|v(s)-\omega_q(v)\right|^{\frac{1}{p-1}}=0$$ 
 to compute $\omega_q(v)$ and then setting the semi-norm $\snormqbar{v}=\|v-\omega_q(v)\mathbf{1}\|_{q}$. Recall that
the $q$-variance function $\mathrm{sp}_q: \mathcal{S} \rightarrow \mathbb{R}$ and the $q$-mean function $\omega_q: \mathcal{S} \rightarrow \mathbb{R}$ are defined as
$$
\snormqbar{v}:=\min _{\omega \in \mathbb{R}}\|v-\omega \mathbf{1}\|_{q}, \quad \omega_{q}(v):=\arg \min _{\omega \in \mathbb{R}}\|v-\omega \mathbf{1}\|_{q} .
$$
See \cite{kumar2022efficient} for discussion about computing span semi norms. So in the general case, we can also compute the maximum solving : 

\begin{align}
\forall (s,a)\in  S\times A :\quad \widehat{Q}_t(s,a)=&\nonumber r(s,a)+\gamma\max _{\alpha \in [V_{min}, V_{max}]} \max_{w \in \mathbb{R}} \Big\{  \widehat{P}_{s,a}[\widehat{V}_{t-1}]_\alpha - \beta_{s,a} \normqbar{ [\widehat{V}_{t-1}]_\alpha -w } \Big\} .
\end{align}
 Any $2$-D convex optimization algorithm solves this problem, as it is jointly concave in $(\alpha,w)$: 
  $(\alpha,w)\rightarrow-\normq{[\widehat{V}_{t-1}]_\alpha -w}$ is concave using norm property, and so is
$(\alpha,w)\rightarrow  \widehat{P}_{}[\widehat{V}_{t-1}]_\alpha $. Hence the sum is concave.




Finally,  in the $sa$-case we compute the best policy which is the greedy policy of the final Q-estimates   $\widehat{Q}_T$ as the final policy $\widehat{\pi}$:
\begin{align*}
	 \forall s \in S: \quad \widehat{\pi}(s) = \arg\max_a \widehat{Q}_T(s,a).
\end{align*}






In the $s$-rectangular case, recall from Lemma~\ref{bellman} that the robust Bellman operator is:
\begin{align*}  \mathcal{T}_{\mathcal{U}_p^{\text {$s$}}}^\pi V(s)  =    - \normq{\pi_s}\alpha_{s} +\gamma  \min _{P^\pi\in  \mathcal{P}_{s}  } P^\pi  V      +     \sum_a \pi(a\vert s) \Big( R_0(s,a) +\gamma  P_0(s'\vert s,a)V(s')  \Big)
\end{align*}
Using Lemma~\ref{sduality} to compute the dual of the minimization problem, we obtain:





\begin{equation}
    \label{eq:vi-iterations}\mathcal{T}_{\mathcal{U}_p^{\text {$s$}}}^* V(s)
    = \max_{\pi_s \in \Delta_\mathcal{A}}\max_{\alpha \in [V_{min}, V_{max}]}  \sum_a \pi(a\vert s) \Big( R_0(s,a) +\gamma \sum_{s'} P_0(s'\vert s,a) [V]_\alpha(s')   \Big)  - \normq{\pi_s} \Big(\alpha_{s}  + \gamma  \beta_{s} \snormqbar{ [V]_\alpha  } \Big)  
\end{equation}

   



\label{DRVIs}
\begin{algorithm}
	\textbf{input:} empirical nominal transition kernel $\widehat{P}_{0}$; reward function $r$; uncertainty level $\beta$. \\ 
	\textbf{initialization:}  $\widehat{V}_0(s)=0$ for all $s \in S$. \\
   \For{$t = 1,2,\cdots, T$}
	{
		
		\For{$ \forall s\in S, $}{
			Set $\widehat{V}_{t}(s) = \mathcal{T}_{\mathcal{U}_p^{\text {$s$}}}^* \widehat{V}_{t-1}(s)$  for $s-$rectangular with operator defined in \eqref{eq:vi-iterations}

		}
		
	}

	\textbf{output:}  $\widehat{V}_T$ and $\pi_T(a \mid s) \propto(A_T (s, a))^{p-1} \mathbf{1}(A_T (s, a) \geq 0),$ with $A$ defined in \ref{avantage2}.
	\caption{ $ \mathtt{DRVI }~ \mathtt{L_P}$: Distributionally robust value iteration DRVI for $L_P$ norms with $s-$rectangular assumptions}
 \label{alg:cvi-dro-infinites}
\end{algorithm} 

Without any analytic form of the semi-norm,
we obtain the following problem, which is still concave:


\begin{equation}
    \mathcal{T}_{\mathcal{U}_p^{\text {$s$}}}^* V(s)
    = \max_{\pi_s \in \Delta_\mathcal{A}}\max_{\alpha \in [V_{min}, V_{max}]}  \max_{w \in \mathbb{R}}    \sum_a \pi(a\vert s) \Big( R_0(s,a) +\gamma \sum_{s'} P_0(s'\vert s,a) [V]_\alpha(s')   \Big)  - \normq{\pi_s} \Big(\alpha_{s}  + \gamma  \beta_{s} \normqbar{ [V]_\alpha -w } \Big)  
\end{equation}



 % We have a problem that is concave in $\alpha$ as it is the dual of a  (convex) function, which is always concave. Moreover, it is also concave in $\pi$ as the sum of linear (so concave) and concave function :($\pi\longrightarrow -\normq{\pi}$) which is still a concave function. 
 So we obtain a concave maximisation problem of dimension $\Anorm+1$, which can be solved naively using any convex optimizer if we have access to an analytic form of the span semi-norm (which is the case for $p=1,2,\infty$), and a concave maximisation problem of dimension $\Anorm+2$ otherwise. 
The resulting algorithm consists in iterating:
\begin{equation}
    V_{k+1} = \mathcal{T}_{\mathcal{U}_p^{\text {$s$}}}^* V_k
\end{equation}
where each step can be solved naively using any convex optimizer for this problem of dimension $\Anorm +2$. However, following \cite{kumar2022efficient}, it is possible to get a simple solution for the maximisation problem over $\pi$. At iteration $t$, we first need to solve

\begin{equation}
    \max_{\alpha \in [V_{min}, V_{max}]}  \max_{w \in \mathbb{R}}    \sum_a \pi(a\vert s) \Big( R_0(s,a) +\gamma \sum_{s'} P_0(s'\vert s,a) [V]_\alpha(s')   \Big)  - \normq{\pi_s} \Big(\alpha_{s}  + \gamma  \beta_{s} \normqbar{ [V]_\alpha -w } \Big)  
\end{equation}
as in the classical $sa$-rectangular case, using a $2$-D convex solver. Then, once $w^*$ and $\alpha^*$ are found, we can plug them into the previous expression, which gives:


\begin{equation}
    \max_{\pi_s \in \Delta_\mathcal{A}}    \sum_a \pi(a\vert s) \Big( R_0(s,a) +\gamma \sum_{s'} P_0(s'\vert s,a) [V]_{\alpha^*}(s')   \Big)  - \normq{\pi_s} \Big(\alpha_{s}  + \gamma  \beta_{s} \snormqbar{ [V]_{\alpha^*}  } \Big)  .
\end{equation}
Using Theorems 10 and 11 of \cite{kumar2022efficient},
we obtain that the optimal robust Bellman operator can be evaluated easily. Indeed, the Robust Bellman Operator $\left(\mathcal{T}_{\mathcal{U}_p^s}^* V\right)(s)$ is the solution $x$ of the following equation, which can be found using binary search over the interval $\left[\max _a Q(s, a)-\sigma, \max _a Q(s, a)\right]$,
$$
\sum_a(Q(s, a)-x)^p \mathbf{1}(Q(s, a) \geq x)=\sigma^p .
$$

where $\sigma=\alpha_s+\gamma \beta_s \snormqbar{[V]_{\alpha^*}}$, and $Q(s, a)=R_0(s, a)+\gamma \sum_{s^{\prime}} P_0\left(s^{\prime} \mid s, a\right) [V]_{\alpha^*}\left(s^{\prime}\right)$.



Moreover, the greedy policy $\pi$ w.r.t.\ the value function $V$, defined by $\mathcal{T}_{\mathcal{U}_p^s}^* V=\mathcal{T}_{\mathcal{U}_p^s}^\pi V$, is a threshold policy. It takes only those actions that have positive advantage, with probability proportional to the $(p-1)^\text{th}$ power of their advantage. That is, the optimal policy for $s$-rectangular MDPs can be expressed as:
\begin{equation*}
    \pi(a \mid s) \propto(A(s, a))^{p-1} \mathbf{1}(A(s, a) \geq 0),
\end{equation*}


where \begin{equation}
\label{avantage}
    A(s, a)=R_0(s, a)+\gamma \sum_{s^{\prime}} P_0\left(s^{\prime} \mid s, a\right) [V]_{\alpha^*}\left(s^{\prime}\right)-\left(\mathcal{T}_{\mathcal{U}_p^s}^* V\right)(s)
\end{equation}
Using these results, we avoid an $(\Anorm+2)$-dimensional convex optimisation problem at each step. The complexity of our algorithm is the same as that of \cite{kumar2023policy}, except that we solve an additional $1$-dimensional optimisation problem to find the maximum over $\alpha$, or a $2$-dimensional optimisation problem when both the span semi-norm and $\alpha$ are not known.
The final policy is taken as:

\begin{align}
  &\pi_T(a \mid s) \propto(A_T(s, a))^{p-1} \mathbf{1}(A_T(s, a) \geq 0),\\
   & \label{avantage2}A_T(s, a)=R_0(s, a)+\gamma \sum_{s^{\prime}} \hat{P}\left(s^{\prime} \mid s, a\right) [\hat{V}_T]_{\alpha^*}\left(s^{\prime}\right)-\left(\mathcal{T}_{\mathcal{U}_p^s}^* \hat{V}_T\right)(s).
\end{align}
Finally, complexity of our algorithm can be found in annex \ref{sec:complex}.
% \mfg{
% Let have a look at the $s$-rectangular case. The robust Bellman evaluation is
% \begin{align}
%     \mathcal{T}_{\mathcal{U}_p^{\text {$s$}}}^\pi V(s)  &=    - \normq{\pi_s}\alpha_{s} +\gamma  \min _{P^\pi\in  \mathcal{P}_{s}  } P^\pi  V      +     \sum_a \pi(a\vert s) \Big( R_0(s,a) +\gamma  P_0(s'\vert s,a)V(s')  \Big)
%     \\
%     &=    - \normq{\pi_s}\alpha_{s} +\gamma  \max _{\alpha \in [V_{min}, V_{max}]}  \Big(P^{\pi}_{0,s}[V]_\alpha - \beta_{s} \normqbar{\pi_s}\snormqbar{ [V]_\alpha  } \Big)     +     \sum_a \pi(a\vert s) \Big( R_0(s,a) +\gamma  P_0(s'\vert s,a)V(s')  \Big)
%     \\
%     &= \max_{\alpha \in [V_{min}, V_{max}]} \sum_a \pi(a\vert s) \Big( R_0(s,a) +\gamma  P_0(s'\vert s,a)(V(s') + [V]_\alpha(s'))   \Big)  - \normq{\pi_s} (\alpha_{s}  + \gamma \beta_{s} \snormqbar{ [V]_\alpha  } )\Big)       
% \end{align}
% Note that $V_\text{min}$ and $V_\text{max}$ should be defined at some point. Note that the notation $P^\pi\in  \mathcal{P}_{s}$ is not homogeneous (written differently in Lemma~\ref{sduality}). 

% From this, we can define the Bellman optimality operator as
% \begin{equation}
%     \mathcal{T}_{\mathcal{U}_p^{\text {$s$}}}^* V(s)
%     = \max_{\pi_s \in \Delta_\mathcal{A}}\max_{\alpha \in [V_{min}, V_{max}]}  \sum_a \pi(a\vert s) \Big( R_0(s,a) +\gamma  P_0(s'\vert s,a)(V(s') - [V]_\alpha(s'))   \Big)  - \normq{\pi_s} (\alpha_{s}  + \gamma  \beta_{s} \snormqbar{ [V]_\alpha  } )\Big)  
% \end{equation}
% So we have an optimization problem, which is concave in $\pi$, and concave in $\alpha$ (is this true? to be checked; not sure of it, and if not true, will complicate our argumentation, we can no longer call a convex solver, that is a problem), and one could call a solver. However, this is true only if we can compute the span semi-norm analytically. If it's not the case, we have a third intricated level of optimization (because the of the dependency of the span-seminorm to $\alpha$). It should be still be a third $\max$ in the end (from $-\min$), to be checked, I'll let you do this Pierre.

% The resulting value iteration would be
% \begin{equation}
%     V^{k+1} = \mathcal{T}_{\mathcal{U}_p^{\text {$s$}}}^* V^k
% \end{equation}
% The corresponding greedy policy is
% \begin{equation}
%     \pi_{k+1}(\cdot|s) = \arg\max_{\pi_s \in \Delta_\mathcal{A}}\max_{\alpha \in [V^k_{min}, V^k_{max}]}  \sum_a \pi(a\vert s) \Big( R_0(s,a) +\gamma  P_0(s'\vert s,a)(V^k(s') - [V^k]_\alpha(s'))   \Big)  - \normq{\pi_s} (\alpha_{s}  + \beta_{s} \snormqbar{ [V^k]_\alpha  } )\Big)
% \end{equation}
% Notice that what would be a relevant $Q$-function in this context is not obvious, the problem being from the term $\normqbar{\pi_s}\snormqbar{ [V]_\alpha  }$, I think.

% It is worth doing it for the $sa$-rectangular case too. We have the robust operator
% \begin{align}
%     \mathcal{T}_{\mathcal{U}_p^{\text {$sa$}}}^\pi V(s)  
%     &= \sum_a \pi(a\vert s) \Big( -\alpha_{s,a} +R_0(s,a) +\gamma \sum_{s'} P_0(s',s,a)v(s') + \gamma  \min _{P\in  \mathcal{P}_{s,a}  } P  V \Big)
%     \\
%     &= \sum_a \pi(a\vert s) \Big( -\alpha_{s,a} +R_0(s,a) +\gamma \sum_{s'} P_0(s',s,a)v(s') + \gamma  \max _{\alpha \in [V_{min}, V_{max}]}\Big( P_{0,s,a}[V]_\alpha - \beta_{s,a} \snormqbar{ [V]_\alpha  }\Big) \Big)
%     \\
%     &= \max _{\alpha \in [V_{min}, V_{max}]} \sum_a \pi(a\vert s) \Big( -\alpha_{s,a} +R_0(s,a) +\gamma \sum_{s'} P_0(s',s,a)(v(s') + [V]_\alpha(s')) - \gamma \beta_{s,a} \snormqbar{ [V]_\alpha  } \Big)
%     \\
%     &=  \sum_a \pi(a\vert s) \max _{\alpha \in [V_{min}, V_{max}]} \Big( -\alpha_{s,a} +R_0(s,a) +\gamma \sum_{s'} P_0(s',s,a)(v(s') + [V]_\alpha(s')) - \gamma \beta_{s,a} \snormqbar{ [V]_\alpha  } \Big)
% \end{align}
% So here we only have a linear dependency to the policy. For solving the inner optimization problem, it is scalar if we manage to compute analytically the span semi-norm, otherwise some more work is required (we have the same problem as before).

% From this point, we can define the greedy operator as before, thanks to the linearity in the policy it just amount to search for the maximizing action (greedy policy).

% So, in the end, we do not need $Q$ from a practical viewpoint (it's not even clear if we do have some practical algorithm, for the general $s$-rectangular case and for the $sa$-rectangular case when the span-seminorm cannot be computed analytically. So maybe we could have written the whole paper with value functions only (I guess the proofs would work for value functions instead of Q-functions, but it's too late for this I think, not a big deal though).
% }

\subsection{Useful Inequalities}

\label{annex A}
Here we present some useful inequalities used frequently in the derivation. 
Consider any $P$ a transition matrix 
%\pierre{tu aimais pas P je mets $P_0$? \mfg{Ca dépend du contexte. Pour tout $P$ ok, mais quand $\hat{P}$ estime spécifiquement $P_0$ il faudrait être clair.}} 
and $\beta_s$ for $s$ rectangular uncertain sets or $\beta_{\mathrm{sa}}$ for $sa$- uncertainty sets, then for $\mathbb{I}=(1,1,...,1)^{\top}$ :

\begin{equation}
\label{1}
   (1-\gamma P)^{-1}\left(\gamma \beta_s\right) \mathbb{I}< \frac{\beta}{1-\gamma} \mathbb{I} \text{ and } (1-\gamma P)^{-1} \mathbb{I}\leq \frac{1}{1-\gamma} \mathbb{I}
\end{equation}  

\begin{equation}
\label{2}
    \forall q\in \mathbb{N}^*, \quad  \snormqbar{.}\leq 2 \normqbar{.}\leq 2 \Snorm^{1/q}  \norminf{.} , \quad \snorminf{.}\leq 2  \norminf{.}
\end{equation}

\begin{equation}
\label{3}
    \snormqbarpi{.} \leq 2 \normqbarpi{.} \leq 2 \normqbar{.}
\end{equation}

Eq.~\eqref{1} is true, taking the supremum norm of the left-hand side inequality. Eq.~\eqref{2} and  Eq.~\eqref{3} come from properties of norms, see Eq.~(1) from \citet{scherrer2013performance}. 







\subsection{Robust Bellman Operator and robust Q values}
This is proof of Lemma \ref{Q robust}: 
\begin{lemma}
\label{bellman}
    Robust Bellman Operator for $sa-$ and $s-$ rectangular are :
\begin{align*}
    &\mathcal{T}_{\mathcal{U}_p^{\text {$sa$}}}^\pi V(s)  = \sum_a \pi(a\vert s) \Big( -\alpha_{s,a} +R_0(s,a) +\gamma \sum_{s'} P_0(s'\vert s,a)V(s') + \gamma  \min _{P\in  \mathcal{P}_{s,a}  } P  V \Big) \\
    & \mathcal{T}_{\mathcal{U}_p^{\text {$s$}}}^\pi V(s)  =    - \normq{\pi_s}\alpha_{s} +\gamma  \min _{P^\pi\in  \mathcal{P}_{s}  } P^\pi  V      +     \sum_a \pi(a\vert s) \Big( R_0(s,a) +\gamma \sum_{s'} P_0(s'\vert s,a)V(s')  \Big)
\end{align*}
\end{lemma}


\begin{proof}




For $sa$-rectangular: by rectangularity
\begin{align*}
&\mathcal{T}_{\mathcal{U}_p^{\text {$sa$}}}^\pi V(s)  = \sum_a \pi(a\vert s) \Big( -\alpha_{s,a} +R_0(s,a) +\gamma \min _{P\in P_0+ \mathcal{P}_{s,a}  } P  V \Big)\\
& = \sum_a \pi(a\vert s) \Big( -\alpha_{s,a} +R_0(s,a) +\gamma \min _{P\in  \mathcal{P}_{s,a}  } P  V  +\gamma P_{0,s,a}V\Big)
\end{align*}
For $s-$rectangular case :

\begin{align*}
    \mathcal{T}_{\mathcal{U}_p^{\text {$s$}}}^\pi V(s)  &=   
    \min _{P^\pi\in P_0^\pi+ \mathcal{P}_{s}  }  \gamma   P^\pi  V      +     \min _{R\in R_0+ \mathcal{R}_{s}  }  \sum_a \pi(a\vert s)R(s,a)   \\
    &= \sum_a \pi(a\vert s) R_0(s,a) +  \min _{R\in \mathcal{R}_{s} }  \sum_a \pi(a\vert s)R(s,a) + \sum_a \pi(a\vert s) \gamma\sum_{s'} P_0(s'\vert s,a)V(s') +    \min _{P^\pi\in  \mathcal{P}_{s}  }  \gamma   P^\pi  V\\
    &\stackrel{(a)}{=} \sum_a \pi(a\vert s) \Big( R_0(s,a) + \gamma \sum_{s'} P_0(s'\vert s,a)V(s')\Big) -\alpha_s \normq{\pi_s} +\min _{P^\pi\in  \mathcal{P}_{s}  }  \gamma   P^\pi  V
\end{align*}
where (a) comes from H\"older's inequality.
\end{proof}


\begin{lemma}
For $sa-$ and $s-$ rectangular,
    
\begin{align*}
    &Q_{sa}^\pi (s,a)= \rsapi+\gamma P_{0,s,a}   V_{sa}^\pi, \\
    & Q_{s}^\pi (s,a)= \rspi +\gamma P_{0,s,a}   V_{s}^\pi 
\end{align*}
with
\begin{align*}
     &\rsapi= R_0(s,a)-  \alpha_{s,a} +\gamma \min _{P\in  \mathcal{P}_{s,a}  } P  V_{sa}^\pi \\
     &\rspi= R_0(s,a) -\Big( \frac{\pi_s(a)}{\normq{\pi_s}}\Big)^{q-1}  \alpha_{s} +\gamma \min _{P^\pi\in  \mathcal{P}_{s}  } P^\pi  V_{s}^\pi
\end{align*}

% \mfg{
% XXX

% Let fix $\pi$, define $Q(s,a)$ and $V(s)$ as $\sum_a \pi(a|s) Q(s,a)$
% \begin{align}
%     T^\pi Q(s,a) &= R_0(s,a) -\Big( \frac{\pi_s(a)}{\normq{\pi_s}}\Big)^{q-1}  \alpha_{s} +\gamma \min _{P^\pi\in  \mathcal{P}_{s}  } P^\pi  V_{s}^\pi) + +\gamma P_{0,s,a}   V_{s}^\pi 
% \end{align}

% XXX
% }

\end{lemma}

\begin{proof}
   The result comes directly as for $sa$-rectangular the following relations hold, 


\begin{align*}
       V_{sa}^\pi(s)=\sum_a \pi(a\vert s) Q_{sa}^\pi(s,a),
\end{align*}
and for $s$-rectangular case
\begin{align*}
    V_{s}^\pi (s)=\sum_a \pi(a\vert s) Q_{s}^\pi(s,a).
\end{align*}
Then using fixed point equation of Bellman operator:
$\mathcal{T}_{\mathcal{U}_p^{\text {$s$}}}^\pi V^\pi_s(s)=V^\pi_s(s) $ or $\mathcal{T}_{\mathcal{U}_p^{\text {$sa$}}}^\pi V^\pi_{sa}(s)=V^\pi_{sa}(s) $ and previous Lemma \ref{bellman} for the expression of $\mathcal{T}_{\mathcal{U}_p^{\text {$s$}}}^\pi V^\pi_s(s) $,
we can identify the robust $Q$ values that give the result 







    
\end{proof}




\section{A first bound}
\label{annex B}

To lighten notations, we remove subscript $\mathrm{s}$ in most places and denote for example  $V^\pi$ instead of $\Vspi$ for $s$-rectangular sets.


\begin{lemma}[Decomposition of the bound]
\label{firstineq}
\label{decomposition}
\begin{align*}
    \norminf{\Qs -\Qpihat} \leq \norminf{\Qs-\Qhatpistar} + \norminf{  \Qhatpistar-\Qhatpihat} + \norminf{ \Qhatpihat-\Qpihat}
\end{align*}

\end{lemma}
\begin{proof}
    \begin{align*}
    0 \leq Q^* - Q^{\hat{\pi}} &=  Q^* - \underbrace{\hat{Q}^*}_{\geq \hat{Q}^{\pi_*}} + \hat{Q}^* - \hat{Q}^{\hat{\pi}} + \hat{Q}^{\hat{\pi}} - Q^{\hat{\pi}} 
    \\
    &\leq Q^* - \hat{Q}^{\pi_*} +  \hat{Q}^* - \hat{Q}^{\hat{\pi}} + \hat{Q}^{\hat{\pi}} - Q^{\hat{\pi}} 
    \\
    \Rightarrow 
    \|Q^* - Q^{\hat{\pi}}\|_\infty &\leq \| Q^* - \hat{Q}^{\pi_*}\|_\infty +  \|\hat{Q}^* - \hat{Q}^{\hat{\pi}}\|_\infty + \|\hat{Q}^{\hat{\pi}} - Q^{\hat{\pi}}\|_\infty
\end{align*}
\end{proof}
This decomposition is the starting point of our proofs for both Theorems~\ref{h4} and~\ref{h3}.
In this decomposition, the second term satisfies $\|\hat{Q}^* - \hat{Q}^{\hat{\pi}}\|_\infty\leq \epsilon_{\mathrm{opt}}$ by definition. This term goes to $0$ exponentially fast as the robust Bellman operator is a $\gamma$-contraction. The two remaining terms, $\|\Qs-\Qhatpistar\|_\infty$ and $\| \Qhatpihat-\Qpihat\|_\infty$, need to be controlled using concentration inequalities between the true MDP and the estimated one. To do so, we need concentration inequalities such as the following Lemma~\ref{hoeffding}.
\looseness=-1

\begin{lemma}[Hoeffding's inequality for $V$]
\label{hoeffding}
For any $V \in \mathbb{R}^{|\mathcal{S}|}$ with $\norminf{V} \leq H$, with probability at least $1-\delta$, we have
$$ 
\max _{(s, a)}\left|P_{0} V-\widehat{P}_{} V\right| \leq H \sqrt{\frac{2\log (2|\mathcal{S}||\mathcal{A}| / \delta)}{N}}.
$$
\end{lemma}

\begin{proof}
    For any $(s, a)$ pair,  assume a discrete random variable taking value $V(i)$ with probability $P_{0,s, a}(i)$ for all $i \in\{1,2, \cdots,|\mathcal{S}|\}$. Using Hoeffding's inequality \citep{hoeffding1994probability} and  $\norminf{V} \leq H$:
$$
\mathbb{P}\left(P_{0} V-\widehat{P}_{} V \geq \varepsilon\right) \leq \exp \left(- N \varepsilon^2 / (2H^2)\right) \quad \text{ and } \quad \mathbb{P}\left(\widehat{P}_{} V-P_{0}
V \geq \varepsilon\right) \leq \exp \left(- N \varepsilon^2 /(2H^2)\right) .
$$
Then, taking $\varepsilon= H \sqrt{\frac{2\log (2|\mathcal{S}||\mathcal{A}| / \delta)}{ N}}$, we get $$\mathbb{P}\left(\left|P_{0} V-\widehat{P}_{} V\right| \geq H \sqrt{\frac{2\log (2|\mathcal{S}||\mathcal{A}| / \delta)}{ N}}\right) \leq \frac{\delta}{|\mathcal{S}||\mathcal{A}|}.$$ Finally, using a union bound:
%
$$
\mathbb{P}\left(\max _{(s, a)}\left|P_{0} V-\widehat{P}_{} V\right| \geq H \sqrt{\frac{2\log (2|\mathcal{S}||\mathcal{A}| / \delta)}{ N}}\right) \leq \sum_{s, a} \mathbb{P}\left(\left|P_{0} V-\widehat{P}_{} V\right| \geq H\sqrt{\frac{2\log (2|\mathcal{S}||\mathcal{A}| / \delta)}{ N}}\right) \leq \delta.
$$
\end{proof}
This completes the concentration proof. Next we will look at the contraction argument of the robust Bellman operator.
\begin{lemma}[Contraction of infimum operator]
\label{contration}
For $\mathcal{D}=\mathcal{P}_{s,a}$ or $\mathcal{P}_s$, the function 
$$
\forall s,a, \quad v \mapsto \kappa_{\mathcal{D}}(v)=\inf \left\{u^{\top} v: u \in \mathcal{D}\right\}
$$
is 1-Lipschitz.
%
\end{lemma}
\begin{proof}
We have that
$$
\begin{aligned}
\forall(s,a) \in \mathcal{S}\times \mathcal{A}, \quad \kappa_{\mathcal{P}_{s, a}}\left(V_2\right)-\kappa_{\mathcal{P}_{s, a}}\left(V_1\right) &=\inf _{p \in \mathcal{P}_{s, a}} p^{\top} V_2-\inf _{\tilde{p} \in \mathcal{P}_{s, a}} \tilde{p}^{\top} V_1=\inf _{p \in \mathcal{P}_{s, a}} \sup _{\tilde{p} \in \mathcal{P}_{s, a}} p^{\top} V_2-\tilde{p}^{\top} V_1 \\
& \geq \inf _{p \in \mathcal{P}_{s, a}} p^{\top}\left(V_2-V_1\right)=\kappa_{\mathcal{P}_{s, a}}\left(V_2-V_1\right) .
\end{aligned}
$$
%
Then $\forall \varepsilon>0 $, there exists $P_{s, a} \in \mathcal{P}_{s, a}$ such that
$$
P_{s, a}^{\top}\left(V_2-V_1\right)-\varepsilon \leq \kappa_{\mathcal{P}_{s, a}}\left(V_2-V_1\right).
$$
Using those two properties,
$$
\kappa_{\mathcal{P}_{s, a}}\left(V_1\right)-\kappa_{\mathcal{P}_{s, a}}\left(V_2\right) \leq P_{s, a}^{\top}\left(V_1-V_2\right)+\varepsilon \stackrel{}{\leq}\left\|P_{s, a}\right\|_1\left\|V_1-V_2\right\|+\varepsilon=\left\|V_1-V_2\right\|+\varepsilon,
$$
where we used H\"older's inequality. Since $\varepsilon$ is arbitrarily small, we obtain $\kappa_{\mathcal{P}_{s, a}}\left(V_1\right)-\kappa_{\mathcal{P}_{s, a}}\left(V_2\right) \leq\left\|V_1-V_2\right\|$. Exchanging the roles of $V_1$ and $V_2$ gives the result.

The proof is similar for $\mathcal{P}_s$.
\end{proof}

Note that an immediate consequence is the already known $\gamma$- contraction of the robust Bellman operator. 

\begin{lemma}[Upper-bounds of $\norminf{\Qpihat-\Qhatpihat}$ and $\norminf{Q^*-\Qhatpistar }$]
\label{upper hat}


\begin{align*}
    \norminf{\Qpihat-\Qhatpihat}\leq& \frac{\gamma}{1-\gamma}  \max _{s, a}\left|\kappa_{\hat{\mathcal{P}}_{s, a}^{\mathrm{}}}(\Vhatpihat)-\kappa_{\mathcal{P}_{0,s, a}}(\Vhatpihat)\right|,
    \\
 \norminf{Q^*-\Qhatpistar }\leq& \frac{\gamma }{1-\gamma}\max _{s, a}\left|\kappa_{\widehat{\mathcal{P}}_{s, a}}(V^*) -\kappa_{\mathcal{P}_{0,s, a}}(V^*)\right|.
\end{align*}
\end{lemma}

\begin{proof}
For the first inequality, we can rewrite the robust Q-function for any uncertainty set on the dynamics as $\Qpihat\left(s, a\right)=r-\alpha_{s,a}+ \gamma \kappa_{\mathcal{P}_{0,s, a}}\left(\Vpihat\right)$ (see Eq.~\eqref{Q robust}), replacing $\alpha_{s,a}$ by $\alpha_{s}\Big(\frac{\pihat_s(a)}{\normq{\pihat_s}}\Big)^{q-1}$ in the $s$-rectangular case. Hence:
\begin{align*}
   \Qpihat\left(s, a\right)-\Qhatpihat\left(s, a\right)&\stackrel{(a)}{=}\gamma \kappa_{\mathcal{P}_{0,s, a}}\left(\Vpihat\right)-\gamma \kappa_{\hat{\mathcal{P}}_{s, a}}\left(\Vhatpihat\right)
   \\ 
   &=\gamma\left(\kappa_{\mathcal{P}_{0,s, a}}\left(\Vpihat\right)-\kappa_{\mathcal{P}_{0,s, a}}\left(\Vhatpihat\right)\right)+\gamma\left(\kappa_{\mathcal{P}_{0,s, a}}\left(\Vhatpihat\right)-\kappa_{\hat{\mathcal{P}}_{s, a}}\left(\Vhatpihat\right)\right) 
   \\
\end{align*} 
with $\mathcal{P}_{s,a}$ defined in Assumption \ref{sa rectangle} and $\hat{\mathcal{P}}_{s,a}$ with the same definition but centred around the empirical MDP. Hence, taking the supremum norm $\norminf{.}$,
   \begin{align*}
   \norminf{\Qpihat-\Qhatpihat}  &= \max_{s,a}\left|\gamma\left(\kappa_{\mathcal{P}_{0,s, a}}\left(\Vpihat\right)-\kappa_{\mathcal{P}_{0,s, a}}\left(\Vhatpihat\right)\right)+\gamma\left(\kappa_{\mathcal{P}_{0,s, a}}\left(\Vhatpihat\right)-\kappa_{\hat{\mathcal{P}}_{s, a}}\left(\Vhatpihat\right)\right)  \right| 
   \\
   &\stackrel{(b)}{\leq}  \gamma\norminf{\Vpihat-\Vhatpihat}  +   \max_{s,a}\left|{\gamma\left(\kappa_{\mathcal{P}_{0,s, a}}\left(\Vhatpihat\right)-\kappa_{\hat{\mathcal{P}}_{s, a}}\left(\Vhatpihat\right)\right)}\right|
\\ 
& \leq\gamma\norminf{\Vpihat-\Vhatpihat}+ \gamma  \max _{s, a}\left|\kappa_{\hat{\mathcal{P}}_{s, a}^{\mathrm{}}}(\Vhatpihat)-\kappa_{\mathcal{P}_{0,s, a}}(\Vhatpihat)\right| 
   \\
   &\stackrel{(c)}{\leq} \gamma \norminf{\Qpihat-\Qhatpihat}+ \gamma \max _{s, a}\left|\kappa_{\hat{\mathcal{P}}_{s, a}^{\mathrm{}}}(\Vhatpihat)-\kappa_{\mathcal{P}_{0,s, a}}(\Vhatpihat)\right| .
   \end{align*}
Line (a) comes from the rectangularity assumption, (b) uses the triangle inequality and the 1-Lipschitz property of the infimum in Lemma~\ref{contration}, and (c) uses the fact that $
\|V^\pi-\widehat{V}^\pi\|_{\infty} \leq\|Q^\pi-\widehat{Q}^\pi\|_{\infty}
$ for any $\pi$. As $\gamma <1$, rearranging the terms gives the first stated result.



One can note that the proof holds for any policy, so it holds for both $\pihat$ and $\pistar$, which concludes the proof. This proof is written for the $sa$-rectangular assumption; it is also true for the $s$-rectangular case with slightly different notations,
replacing $ \mathcal{D} = \mathcal{P}_{0,s,a}$ by  $\mathcal{D} = \mathcal{P}_{0,s}$. We now need to find a new form for $\kappa$ under both the $s$- and $sa$-rectangular assumptions.




For the second claim, 
\begin{align*}
    \norminf{Q^*-\Qhatpistar }\leq& \frac{\gamma }{1-\gamma}\max _{s, a}\left|\kappa_{\widehat{\mathcal{P}}_{s, a}}(V^*) -\kappa_{\mathcal{P}_{0,s, a}}(V^*)\right|.
\end{align*}
we are using a slightly different modification: 


\begin{align*}
   \Qs\left(s, a\right)-\Qhatpistar\left(s, a\right)&\stackrel{(a)}{=}\gamma \kappa_{\mathcal{P}_{0,s, a}}\left(\Vs\right)-\gamma \kappa_{\hat{\mathcal{P}}_{s, a}}\left(\Vhatpistar\right)
   \\ 
   &=\gamma \kappa_{\mathcal{P}_{0,s, a}}\left(\Vs\right)-  \gamma \kappa_{\hat{\mathcal{P}}_{s, a}}\left(\Vs\right)+ \gamma \kappa_{\hat{\mathcal{P}}_{s, a}}\left(\Vs\right)
   -\gamma \kappa_{\hat{\mathcal{P}}_{s, a}}\left(\Vhatpistar\right)
   \\
   &\leq \gamma \max _{s, a}\left|\kappa_{\widehat{\mathcal{P}}_{s, a}}(V^*) -\kappa_{\mathcal{P}_{0,s, a}}(V^*)\right| + \gamma \norminf{\Qs-\Qhatpistar}
\end{align*} 

 using the same arguments as in the first inequality. Solving gives the result.
\end{proof}

\begin{lemma}[Duality for the minimisation problem for $sa$ rectangular case.] Denoting $\widehat{P}$ the vector $\widehat{P}_{s,a}$ or $P_0$ for $P_{0,s,a}$ ,\label{saduality3}
    \begin{equation*}
        \kappa_{\hat{\mathcal{P}}_{s,a}^{\mathrm{}}}(\Vhatpihat)=\max _{\mu\geq 0}  \{\widehat{P}_{}(\Vhatpihat-\mu) -\beta_{s,a} \snormqbar{\Vhatpihat-\mu} \} =\max _{\alpha \in [V_{min}, V_{max}]} \widehat{P}_{}[\Vhatpihat]_\alpha - \beta_{s,a} \snormqbar{ [\Vhatpihat]_\alpha  }  .
    \end{equation*}
    \begin{equation*}
        \kappa_{\mathcal{P}_{0,s, a}}(\Vs)=\max _{\mu\geq 0}  \{P_{0}(\Vs-\mu) -\beta_{s,a} \snormqbar{\Vs-\mu} \} =\max _{\alpha \in [V_{min}, V_{max}]} P_{0}[\Vs]_\alpha - \beta_{s,a} \snormqbar{ [\Vs]_\alpha  }  .
    \end{equation*}
    with $[V]_\alpha(s):= \begin{cases}\alpha, & \text { if } V(s)>\alpha \\ V(s), & \text { otherwise. }\end{cases}$
\end{lemma}


\begin{proof}
   First,  we will show that  
    \begin{equation*}
        \kappa_{\hat{\mathcal{P}}_{s, a}^{\mathrm{}}}(\Vhatpihat)=\max _{\mu\geq 0}  \{\widehat{P}_{}(\Vhatpihat-\mu) -\beta_{s,a} \snormqbar{\Vhatpihat-\mu} \} 
    \end{equation*}
    The second equation of this lemma is the same as the first one, replacing the centre of the ball constraint $\widehat{P}_{s, a}$ by $P_{0,s, a}$ and $\hat{\pi}$ by $\pistar$. By definition, 
    
\begin{align*}
        \kappa_{\hat{\mathcal{P}}_{s, a}} (\Vhatpihat)=  \min _{P\in \Delta_s , \normpbar{P-   \widehat{P}_{}  }\leq \beta_{s,a}   } \sum_{s'} P(s')\Vhatpihat(s')=  \widehat{P}_{s, a}\Vhatpihat+  \min _{y , \normpbar{y    } \leq \beta_{s,a}  , \mathrm{1}y=0, y\geq -\hat{P} }   \sum_{s'} y(s')\Vhatpihat(s')  
\end{align*}
where we use the change of variable $y(s')= P(s')-\hat{P}_{}(s')$.
Then, writing the Lagrangian with multipliers $\mu\in \mathbb{R}_{+}^{\Snorm}$ and $\nu\in \mathbb{R}$, we get:
\begin{align}
    &\widehat{P}_{}\Vhatpihat+ \max _{\mu\geq0, \nu\in \mathbb{R} }  \min _{y: \normpbar{y}\leq   \beta_{s,a}} {   -\sum_{s'}\mu(s') \hat{P}_{}(s')   +\sum_{s'} y(s') (\Vhatpihat(s')-\mu(s')-\nu)  } \label{first} \\
    & \stackrel{(a)}{=}\widehat{P}_{}\Vhatpihat +  \max _{\mu\geq0, \nu\in \mathbb{R} }  -\sum_{s'}\mu(s') \hat{P}_{}(s')  -\beta_{s,a} \normqbar{(\Vhatpihat(s')-\mu(s')-\nu)} \label{second} \\
    &\stackrel{(b)}{=} \max _{\mu\geq 0} \widehat{P}_{}(\Vhatpihat-\mu) - \beta_{s,a}  \snormqbar{  \Vhatpihat-\mu } \label{third} 
\end{align}
where (a) is true using the equality case of H\"older's inequality and (b) is the definition of the span semi-norm (see Def.~\ref{span}). The value of $\nu$ that solves the inner maximization problem in Eq.~\eqref{second} is, by definition, the $q$-mean (see Def.~\ref{span}), denoted $w_q$.

Now the aim is to prove that 
\begin{equation*}
    \max _{\mu\geq 0}  \{\widehat{P}_{}(\Vhatpihat-\mu) -\beta_{s,a} \snormqbar{\Vhatpihat-\mu} \} =\max _{\alpha \in [V_{min}, V_{max}]} \widehat{P}_{}[\Vhatpihat]_\alpha - \beta_{s,a} \snormqbar{ [\Vhatpihat]_\alpha  }  .
\end{equation*}
In this equality, the optimisation over the vector $\mu \geq 0$ reduces to a bounded scalar optimisation over $\alpha$.
First, we remark that in Eq.~\eqref{first}, the minimum is attained for:

\begin{equation}
    y^{*}(s')=-\frac{\beta_{s,a}z(s')}{\normpbar{z}}
\end{equation}
because we are doing linear minimization under convex constraints.
The value of vector
$z$ is $z(s')=(\Vhatpihat(s')-\mu(s')-w_q)\widehat{P}_{}(s')$
with $w_q$ defined as the $q$-mean. 
The quantity $z/\normpbar{z}$ has unit $p$-norm and its sign is determined by $(\Vhatpihat(s')-\mu(s')-w_q)$.  We can choose any multiplicative scalar value as the vector is normalized; here we choose $\widehat{P}_{}(s')$.

Complementary slackness in Eq.~\eqref{third} gives that for all $s'$ such that $\mu(s')>0$, $y^*(s')=-\widehat{P}_{}(s')$, or equivalently:
\begin{equation*}
    y^*(s')=-\widehat{P}_{}(s') \iff  \Vhatpihat(s')-\mu(s')=w_q +\normpbar{z}/\beta_{s,a}=\alpha 
\end{equation*}
with $\alpha $ a constant. Since the optimal value of the initial problem is at least $ \min _{s'} \Vhatpihat(s')$ and lower than  $ \max _{s'} \Vhatpihat(s')$ , we have $\max _{s'} \Vhatpihat(s')\geq \alpha\geq\min _{s'} \Vhatpihat(s')$. The value of  $\alpha$ is not known in practice but we can recognise that the optimal value of $\mu$ is :


$$\mu^*(s)= \begin{cases}\Vhatpihat(s)-\alpha, & \Vhatpihat(s) \geq \alpha \\ 0, & \text { otherwise }\end{cases}$$
Then the dual optimisation problem Eq~\ref{third} reduces to 

\begin{equation*}
    \max _{\mu\geq 0} \widehat{P}_{}(\Vhatpihat-\mu) - \beta_{s,a}  \snormqbar{  \Vhatpihat-\mu }=  \max _{\alpha \in [V_{min}, V_{max}]} \widehat{P}_{}[\Vhatpihat]_\alpha - \beta_{s,a} \snormqbar{ [\Vhatpihat]_\alpha  }  .
\end{equation*}

with $[V]_\alpha(s):= \begin{cases}\alpha, & \text { if } V(s)>\alpha \\ V(s), & \text { otherwise. }\end{cases}$

The thing which is of capital importance is that the second part of the equation $\snormqbar{ [V]_\alpha} $ does not depend on $\widehat{P}_{}$.

\end{proof}

\begin{lemma}[Duality for the minimisation problem for $s$ rectangular case.] \label{sduality} 
Considering a projection matrix associated with a given  policy $\pi$ such that 
$P_s^\pi(s')  =\sum_a \pi(a\vert s) P_{s,a}(s')  $ and denoting $\widehat{P}^\pi \in \mathbb{R}^s$ the vector $\widehat{P}^\pi_s(.)$ or $P_0^\pi$ for $P_{0,s}^\pi(.)$, we have:
    \begin{equation*}
        \kappa_{\hat{\mathcal{P}}_{s}^{\mathrm{}}}(\Vhatpihat)= \max _{\alpha \in [V_{min}, V_{max}]}  \Big(\hat{P}^{\pi}[\Vhatpihat]_\alpha - \beta_{s} \normqbar{\pi_s}\snormqbar{ [\Vhatpihat]_\alpha  } \Big)
    \end{equation*}
    \begin{equation*}
        \kappa_{\mathcal{P}_{0,s}}(\Vs)=\max _{\alpha \in [V_{min}, V_{max}]}  \Big(P_{0}^\pi[\Vs]_\alpha - \beta_{s} \normqbar{\pi_s}\snormqbar{ [\Vs]_\alpha  } \Big) .
    \end{equation*}
     with $[V]_\alpha(s):= \begin{cases}\alpha, & \text { if } V(s)>\alpha \\ V(s), & \text { otherwise. }\end{cases}$
\end{lemma}


\begin{proof}
    We will first  show that  
    \begin{equation*}
         \kappa_{\hat{\mathcal{P}}_{s}^{\mathrm{}}}(\Vhatpihat)=  \hat{P}^\pi{}\Vhatpihat  +\max _{\mu\geq 0}\Bigg(    
  (-\widehat{P}^\pi_{}\mu) -  \beta_s \normqbar{\pi_s}\snormq{\Vhatpihat-\mu} \Bigg)
    \end{equation*}
    The second equation is the same, replacing the centre of the ball constraint $\widehat{P}_{s}^\pi$ by $P_{0}^\pi$ and $\hat{\pi}$ by $\pistar$. By definition, 
    
\begin{align*}
        \kappa_{\hat{ \mathcal{P}}_{s, }} (\Vhatpihat)(s)=&  \min _{P\in \Delta_s , P\in \hat{\mathcal{P}}_s   } \Vhatpihat(s)=  \min _{P\in \Delta_s , P\in \hat{\mathcal{P}}_s   } \sum_a \pihat(a\vert s)    P_{}\Vhatpihat  \\&
        \stackrel{(a)}{=} \sum_a \pihat(a\vert s) \hat{P}_{s,a}\Vhatpihat    +  \min _{ \normpbar{\beta_{s,a}} \leq \beta_s   }  \sum_a  \pihat(a\vert s) \min _{y , \normpbar{y    } \leq \beta_{s,a}  , \mathrm{1}y=0, y\geq -\hat{P} } \sum_{s'} y(s') \Vhatpihat  \\
    % &   \stackrel{(b}{=} \sum_a \pihat(a\vert s) (r(s,a) +\gamma \hat{P}_{s,a}\Vhatpihat )      + \gamma \min _{ \normpbar{\beta_{s,a}} \leq \beta_s   }  \sum_a  \pihat(a\vert s)   \min _{y , \normpbar{y    } \leq \beta_{s,a}  , \mathrm{1}y=0, y\geq -\hat{p} } \sum_{s'} y(s') \Vhatpihat 
\end{align*}
where we use the change of variable $y(s')= P(s')-\hat{P}_{}(s')$ in (a). Then we can use the previous lemma for the $sa$-rectangular assumption, Lemma~\ref{saduality3}. Then, 
\begin{align*}
   & \min _{ \normpbar{\beta_{s,a}} \leq \beta_s   }  \sum_a  \pihat(a\vert s) \min _{y , \normpbar{y    } \leq \beta_{s,a}  , \mathrm{1}y=0, y\geq -\hat{P} } \sum_{s'} y(s') \Vhatpihat =   \min _{ \normpbar{\beta_{s,a}} \leq \beta_s   }  \sum_a  \pihat(a\vert s)  \max _{\mu\geq 0} \Big(-\widehat{P}_{}\mu - \beta_{s,a}  \snormqbar{  \Vhatpihat-\mu }\Big) \\
   &=   \max _{\mu\geq 0}\Bigg(   \sum_a  \pihat(a\vert s)    
  (-\widehat{P}_{}\mu) -  \max _{ \normpbar{\beta_{s,a}} \leq \beta_s   } \sum_a \pihat(a\vert s)   \beta_{s,a}  \snormqbar{  \Vhatpihat-\mu }\Bigg) \\
  &=    \max _{\mu\geq 0}\Bigg(  \sum_a  \pihat(a\vert s)    
  (-\widehat{P}_{}\mu) -   \beta_s \normqbar{\pi_s}\snormq{\Vhatpihat-\mu} \Bigg)\\
\end{align*}
 where, in the second line, we can exchange the min and the max as the problem is concave-convex in $\beta_{s,a}$ and $\mu$, and we use H\"older's inequality in the last line. Finally, we obtain: 

 \begin{align*}
      \kappa_{\hat{ \mathcal{P}}_{s}} (\Vhatpihat)=&  
  \max _{\mu\geq 0} \sum_a \pihat(a\vert s) \Big( \hat{P}_{}(\Vhatpihat -\mu)  -  \beta_s \normqbar{\pi_s}\snormq{\Vhatpihat-\mu} \Big) \\
  \stackrel{(a)}{=}
  &  \max _{\alpha \in [V_{min}, V_{max}]}  \sum_a \pihat(a\vert s) \Big(\hat{P}_{}[\Vhatpihat]_\alpha - \beta_{s} \normqbar{\pi_s}\snormqbar{ [\Vhatpihat]_\alpha  } \Big)
  \end{align*}
 where in (a) we use Lemma~\ref{saduality3}.
The second claim is obtained in the same way, replacing $\Vhatpihat$ by $\Vs$, $\pihat$ by $\pistar$ and $\hat{P}_{}$ by $P_{0}$.

% Then writing the lagrangian we get for $\mu\in \mathbb{R}_{+}^{\Snorm}$,$\gamma\in \mathbb{R}$ the lagrangian variables:
% \begin{align}
%     &\widehat{P}_{s, a}\Vhatpihat+ \max _{\mu\geq0, \nu\in \mathbb{R} }  \min _{y: \normpbar{y}\leq   \beta_{s,a}} {   -\sum_{s'}\mu(s) \hat{P}_{s,a}(s')   +\sum_{s'} (y(s') (\Vhatpihat(s')-\mu(s')-\nu)  } \label{fourth} \\
%     & \stackrel{(a)}{=}\widehat{P}_{s, a}\Vhatpihat +  \max _{\mu\geq0, \nu\in \mathbb{R} }  -\sum_{s'}\mu(s') \hat{P}_{s,a}(s')  -\beta_{s,a} \normqbar{(\Vhatpihat(s')-\mu(s')-\nu)} \label{5th} \\
%     &\stackrel{(b)}{=} \max _{\mu\geq 0} \widehat{P}_{s, a}(\Vhatpihat-\mu) - \beta_{s,a}  \snormqbar{  \Vhatpihat-\mu } \label{6th} 
% \end{align}
% where (a) is true using the equality case of cauchy-swartz inequality and (b)  is the definition of of the seminorm. The value that maximize the inner maximization problem in \ref{5th} in $\nu$ is the $q$-mean $w_p$. 

% Then we change optimisation in terms of $\mu \in \mathbb{R}^+$ to scalar bounded optimisation, this part is central in our proof.
% First we have to remark that in Eq \ref{fourth}, the minimisation problem is attainted for :

% \begin{equation}
%     y^{\*}(s')=-\frac{\beta_{s,a}z(s')}{\normpbar{z}}
% \end{equation}
% with the value of vector $z(s')=(\Vhatpihat(s')-\mu(s')-w_q)\widehat{P}_{s, a}(s')$
% with $w_q$ defined as the $q$-mean.
% For the vector $z$, the importance is the sign of $(\Vhatpihat(s')-\mu(s')-w_q)$, we can multiply it by every scalar value as the vector is then normalised, here we choose $\widehat{P}_{s, a}(s')$.

% Using complementary slackness in equation \ref{6th} arguments for the value of $s'$ such that $\mu(s')>0$ implies $ y^*(s')=-q(s') $ or equivalently :
% \begin{equation*}
%     y^*(s')=-q(s') \iff  \Vhatpihat(s')-\mu(s')=w_p +\normpbar{z}/\beta_{s,a}=\alpha 
% \end{equation*}
% with $\alpha $ a constant. Since the optimal value of the initial problem is at least $ \min _{s'} \Vhatpihat(s')$ and lower than  $ \max _{s'} \Vhatpihat(s')$ , we have $\max _{s'} \Vhatpihat(s')\geq \alpha\geq\min _{s'} \Vhatpihat(s')$. The value of  $\alpha$ is not known in practice but we can recognise that the optimal value of $\mu$ is :


% $$\mu^*(s)= \begin{cases}\Vhatpihat(s)-\alpha, & \Vhatpihat(s) \geq \alpha \\ 0, & \text { otherwise }\end{cases}$$
% Then the dual optimisation problem eq \ref{third} reduces to 

% \begin{equation*}
%     \max _{\mu\geq 0} \widehat{P}_{s, a}(\Vhatpihat-\mu) - \beta_{s,a}  \snormqbar{  \Vhatpihat-\mu }=  \max _{\alpha \in [V_{min}, V_{max}]} \widehat{P}_{s, a}[\Vhatpihat]_\alpha - \beta_{s,a} \snormqbar{ [\Vhatpihat]_\alpha  }  .
% \end{equation*}

% with $[V]_\alpha(s):= \begin{cases}\alpha, & \text { if } V(s)>\alpha \\ V(s), & \text { otherwise. }\end{cases}$

% The thing which is of capital importance is that  $\snormqbar{ [V]_\alpha} $ does not depend on $\widehat{P}_{s, a}$, contrary to divergence contrained such as $\chi^2$.

\end{proof}


\begin{lemma} \label{trickalpha}
For $s$ and $sa$ rectangular assumptions,
   \begin{equation}
       \left|\kappa_{\hat{\mathcal{P}}_{s, a}^{\mathrm{}}}(\Vhatpihat)-\kappa_{\mathcal{P}_{0,s, a}}(\Vhatpihat)\right|\leq    \max _{s,a} \vert (\widehat{P}_{s,a} -P_{0,s,a})\Vhatpihat\vert
   \end{equation} 
      \begin{equation}
       \left|\kappa_{\hat{\mathcal{P}}_{s}^{\mathrm{}}}(\Vs)-\kappa_{\mathcal{P}_{0,s}}(\Vs)\right| \leq  \max _{s,a} \vert (\widehat{P}_{s,a} -P_{0,s,a})\Vs\vert
   \end{equation} 
\end{lemma}
\begin{proof}
    \begin{align*}
         \left|\kappa_{\hat{\mathcal{P}}_{s, a}^{\mathrm{}}}(\Vhatpihat)-\kappa_{\mathcal{P}_{0,s, a}}(\Vhatpihat)\right|&\stackrel{(a)}{=} \Big\vert \max _{\alpha \in [V_{min}, V_{max}]} \Big( \widehat{P}_{s,a}[\Vhatpihat]_\alpha - \beta_{s,a} \snormqbar{ [\Vhatpihat]_\alpha  } \Big)   \\ 
         &\quad -\max _{\alpha \in [V_{min}, V_{max}]} \Big( P_{0,s,a}[\Vhatpihat]_\alpha - \beta_{s,a} \snormqbar{ [\Vhatpihat]_\alpha  } \Big)   \Big\vert \\
         &\stackrel{(b)}{\leq } \max _{\alpha \in [V_{min}, V_{max}]} \vert (\widehat{P}_{} -P_{0,s,a})[\Vhatpihat]_\alpha\vert \\
         &\stackrel{(c)}{\leq }   \vert (\widehat{P}_{s,a} -P_{0,s,a})\Vhatpihat\vert \leq  \max _{s,a} \vert (\widehat{P}_{s,a} -P_{0,s,
         a})\Vhatpihat\vert
    \end{align*}
\end{proof}
where (a) is the previous lemma, (b) is the 1-Lipschitz property of the $\max$ operator, and (c) uses the triangle inequality and the fact that the maximum is attained for $\alpha=V_{max}$. For the $s$-rectangular case,

\begin{align*}
         \left|\kappa_{\hat{\mathcal{P}}_{s}^{\mathrm{}}}(\Vhatpihat)-\kappa_{\mathcal{P}_{0,s}}(\Vhatpihat)\right|&\stackrel{(a)}{=} \Big\vert \max _{\alpha \in [V_{min}, V_{max}]}  \sum_a \pihat(a\vert s) \Big(\hat{P}_{s,a}[\Vhatpihat]_\alpha - \beta_{s} \normqbar{\pi_s}\snormqbar{ [\Vhatpihat]_\alpha  } \Big)  \\ 
         &\quad -\max _{\alpha \in [V_{min}, V_{max}]}  \sum_a \pihat(a\vert s) \Big(P_{0,s,a}[\Vhatpihat]_\alpha - \beta_{s} \normqbar{\pi_s}\snormqbar{ [\Vhatpihat]_\alpha  } \Big) \Big\vert  \\
         &\stackrel{(b)}{\leq } \max _{\alpha \in [V_{min}, V_{max}]} \vert \sum_a \pihat(a\vert s) (\widehat{P}_{s,a} -P_{0,s,a})[\Vhatpihat]_\alpha\vert \\
         &\stackrel{}{\leq }  \max _{s,a} \vert (\widehat{P}_{s,a} -P_{0,s,a})\Vhatpihat\vert
    \end{align*}


Note that at this point, the quantities for the $s$- and $sa$-rectangular cases are the same, as the span semi-norm terms cancel. The main problem is now that we cannot apply the classical Hoeffding's inequality, since $\widehat{P}_{}$ and $\Vhatpihat$ both depend on the data. We need to decouple $\Vhatpihat$ from $\widehat{P}_{}$ using $s$-absorbing MDPs as in \cite{agarwal2020model}, but with Hoeffding arguments. The proof of the second claim is similar.






% \begin{lemma} Upper bound on $Q^*\left(s, a\right)-\Qhats\left(s, a\right) $
% 
% $$\norminf{Q^*\left(s, a\right)-\Qhats\left(s, a\right) }\leq \frac{\gamma }{1-\gamma}\left(\max _{s, a}\left|\kappa_{\widehat{\mathcal{P}}_{s, a}}(V^*) -\kappa_{\mathcal{P}_{s, a}}(V^*)\right|\right) $$

% \end{lemma}

% \begin{proof}
%     Same as 
% \end{proof}

% \begin{proof}
% $$
% \begin{aligned}
% Q^*\left(s, a\right)-\Qhats\left(s, a\right) &\stackrel{(a)}{\leq} Q^*\left(s, a\right)-\Qhatpistar\left(s, a\right)
% \\
% &\stackrel{}{=} \gamma \kappa_{\mathcal{P}_{s, a}^{\mathrm{}}}\left(V^*\right)-\gamma \kappa_{\widehat{\mathcal{P}}_{s, a}^{\mathrm{}}}\left(\Vhatpistar\right)     
% \\
%  &=               \gamma \kappa_{\mathcal{P}_{s, a}^{\mathrm{}}}\left(V^*\right)     -\gamma \kappa_{\widehat{\mathcal{P}}_{s, a}^{\mathrm{}}}\left(\Vs\right) +\gamma \kappa_{\widehat{\mathcal{P}}_{s, a}^{\mathrm{}}}\left(\Vs\right)
% -\gamma \kappa_{\widehat{\mathcal{P}}_{s, a}^{\mathrm{}}}\left(\Vhatpistar\right)
% \\
% &\stackrel{(b)}{\leq} \gamma \kappa_{\mathcal{P}_{s, a}^{\mathrm{}}}\left(V^*\right)-\gamma \kappa_{\widehat{\mathcal{P}}_{s, a}^{\mathrm{}}}\left(\Vs\right) +\gamma \norminf{V^*-\Vhatpistar}
% \\
% &\leq \gamma  \max _{s, a}\left|\kappa_{\widehat{\mathcal{P}}_{s, a}}(V^*) -\kappa_{\mathcal{P}_{s, a}}(V^*)\right|+\gamma\norminf{V^*-\Vhatpistar}
% \\
% &\stackrel{(c)}{\leq}\gamma  \max _{s, a}\left|\kappa_{\widehat{\mathcal{P}}_{s, a}}(V^*) -\kappa_{\mathcal{P}_{s, a}}(V^*)\right|+\gamma \norminf{\Qs-\Qhatpistar}
% \end{aligned}
% $$
% where (a) is the greediness property and (b) comes from 1-contraction or \ref{contration} , (c) that $
% \left\|V^\pi-\widehat{V}^\pi\right\|_{\infty} \leq\left\|Q^\pi-\widehat{Q}^\pi\right\|_{\infty}
% $ for any $\pi$. As $1-\gamma <1$, we get the result.
% \end{proof}





\begin{lemma}[$s$-absorbing MDPs for Hoeffding's concentration Inequalities]
\label{absorbing}
\end{lemma}

As in Agarwal paper \cite{agarwal2020model}, we define for a state $s$ and a scalar $u$, the MDP called $M_{s, u}$ such that: $M_{s, u}$ is identical to $M$ except that state $s$ is absorbing in $M_{s, u}$, i.e. $P_{M_{s, u}}(s \mid s, a)=1$ for all $a$, and the  reward at state $s$ in $M_{s, u}$ is $(1-\gamma) u$. The remainder of the transition model and reward function are identical to those in $M$.  In the following, we will use $V_{s, u}^\pi$ to denote the value function $V_{M_{s, u}}^\pi$ and correspondingly for $Q$ and reward and transition functions  to avoid notational clutter.  Then, we have that for all policies $\pi$ :
$$
V_{s, u}^\pi(s)=u
$$
because $s$ is absorbing with  reward $(1-\gamma) u$.
For some state $s$, we will only consider the MDP $M_{s, u}$ for $u$ in a finite set $U_s$ with
$$
U_s \subset\left[V^{\star}(s)-\Delta_{\delta, N},\; V^{\star}(s)+\Delta_{\delta, N}\right] .
$$
with 
$\Delta_{\delta, N}:=\frac{\gamma}{(1-\gamma)^2} \sqrt{\frac{2 \log (2|\mathcal{S}||\mathcal{A}| / \delta)}{N}}$.
 The set $U_s$ consists of evenly spaced elements in this interval, where we set the size of $\left|U_s\right|$ appropriately later on. As before, we let $\widehat{M}_{s, u}$ denote the MDP that uses the empirical model $\widehat{P}$ instead of $P$, at all non-absorbing states and abbreviate the value functions in $\widehat{M}_{s, u}$ as $\widehat{V}_{s, u}^\pi$.
Then, for a fixed state $s$, action $a$, finite set $U_s$, and $\delta> 0$, with probability greater than $1-\delta$, it holds for all $u\in U_s$ that


\begin{equation} \label{hoeffdingabsorb}
   \vert  (\widehat{P}_{s, a} -P_{0,s, a})\Vpihat_u\vert\leq \frac{1}{(1-\gamma)}\sqrt{\frac{2 \log \left(4\left|U_s\right| / \delta\right)}{N}}
\end{equation}
This is just Hoeffding's inequality applied over the finite set $U_s$, since $\Vpihat_u$ and $\widehat{P}_{s, a}$ are now independent.



\begin{lemma}[\cite{agarwal2020model}, Lemma 7] \label{sameabsorb}
Let $u^{\star}=V_M^{\star}(s)$ and $u^\pi=V_M^\pi(s)$. We have
$$
V_M^{\star}=V_{s, u^{\star}}^{\star}, \quad \text { and for all policies } \pi, \quad V_M^\pi=V_{s, u^\pi}^\pi .
$$
\end{lemma}
Proof can be found in \cite{agarwal2020model}, Lemma 7.
\begin{lemma} \label{stabilityabsorb} For any $u,u',s$ and policy $\pi$:
$$\left\|Q_{s, u}^\pi-Q_{s, u^{\prime}}^\pi\right\|_{\infty} \leq\left|u-u^{\prime}\right|$$    
\end{lemma}


\begin{proof}

    To obtain the result in our robust MDP setting, we need a stability property similar to Lemma 8 of \cite{agarwal2020model},
but for the robust value functions. It turns out that this is a direct consequence of the property for classical MDPs. \cite{agarwal2020model} show Eq.~\eqref{robustabsorb} for classical MDPs; since taking an infimum or a supremum is a 1-Lipschitz operation, we then have for RMDPs: 
\begin{align}
&|Q_{M_{s,u}}^{\pi}(s,a) - Q_{M_{s,u'}}^{\pi}(s,a)| \leq |u-u'| \label{robustabsorb}\\
\Rightarrow &|\inf_{M} Q_{M_{s,u}}^{\pi}(s,a) - \inf_{M} Q_{M_{s,u'}}^{\pi}(s,a)| \leq |u-u'|\\
\Rightarrow &|\sup_{\pi} \inf_{M} Q_{M_{s,u}}^{\pi}(s,a) - \sup_{\pi}\inf_{M} Q_{M_{s,u'}}^{\pi}(s,a)| \leq |u-u'|,
\end{align}
which concludes the proof for RMDPs.
\end{proof}



\begin{lemma}[Hoeffding's Concentration for dependent variables] Removing  $s,a$ notations for kernels, \label{concetrationu}
    \begin{equation}
        \left|\left(P_{0}-\widehat{P}_{}\right) \cdot \widehat{V}^{\star}\right| \leq  \frac{1}{(1-\gamma)}\sqrt{\frac{2 \log \left(4\left|U_s\right| / \delta\right)}{N}} + 2 \min _{u \in U_s} \left|\widehat{V}^{\star}(s)-u\right|
    \end{equation}
    % \begin{equation}
    %      \left|\left(P_{0}-\widehat{P}_{}\right) \cdot \widehat{V}^{\star}\right| \leq  \frac{1}{(1-\gamma)}\sqrt{\frac{2 \log \left(4\left|U_s\right| / \delta\right)}{N}} + 2 \min _{u \in U_s} \left|\widehat{V}^{\star}(s)-u\right|
    % \end{equation}
\end{lemma}
\begin{proof}

    \begin{align}\left|\left(P_{0}-\widehat{P}_{}\right) \cdot \widehat{V}^{\star}\right| 
    & =\left|\left(P_{0}-\widehat{P}_{}\right) \cdot\left(\widehat{V}^{\star}-V_{s, u}^{\star}+V_{s, u}^{\star}\right)\right| \\ 
    & \leq\left|\left(P_{0}-\widehat{P}_{}\right) \cdot\left(\widehat{V}^{\star}-V_{s, u}^{\star}\right)\right|+\left|\left(P_{0}-\widehat{P}_{}\right) \cdot\left(V_{s, u}^{\star}\right)\right| \\
    & \stackrel{(a)}{\leq}  2  \norminf{ \widehat{V}^{\star}-V_{s, u}^{\star} }+\frac{1}{(1-\gamma)}\sqrt{\frac{2 \log \left(4\left|U_s\right| / \delta\right)}{N}}  \\
    &  \stackrel{(b)}{\leq}  2 \left|\widehat{V}^{\star}(s)-u\right|+\frac{1}{(1-\gamma)}\sqrt{\frac{2 \log \left(4\left|U_s\right| / \delta\right)}{N}}
    \end{align}

where $(a)$ uses Eq.~\eqref{hoeffdingabsorb}, i.e., Hoeffding's inequality for $s$-absorbing MDPs, together with H\"older's inequality.
By Lemmas \ref{sameabsorb} and \ref{stabilityabsorb},
$$
\left\|\widehat{V}^{\star}-V_{s, u}^{\star}\right\|_{\infty}=\left\|\widehat{V}_{s, \widehat{V}^{\star}(s)}^{\star}-V_{s, u}^{\star}\right\|_{\infty} \leq\left|\widehat{V}^{\star}(s)-u\right| .
$$
which is point $(b)$. The last $\min$ operator in the result comes from the fact that the previous equation holds for all $u \in U_s$, we  take the best possible choice, which completes the proof of the first claim. The proof of the second claim is analogous.
\end{proof}




\begin{lemma}[Crude bound for Robust MDPs]\label{crude} This lemma is needed for next Lemma \ref{agarwalu} but the proof  differs from the classical MDP setting.
For $s$ and $sa$ rectangular assumptions,
    \begin{equation*}
        \norminf{\Qs- \Qhatpistar}\leq \Delta_{\delta,N} \text{ and }  \norminf{\Qs- \Qhats}\leq \Delta_{\delta,N}  \quad \text{with }  \quad \Delta_{\delta,N} =\frac{\gamma}{(1-\gamma)^2}\sqrt{\frac{2\log(2\Snorm\Anorm/\delta )}{N}}
    \end{equation*}
\end{lemma}
\begin{proof}
    For the first claim : 
       \begin{align*}
   \norminf{\Qpi-\Qhatpi}  &= \max_{s,a}\left|\gamma\left(\kappa_{\mathcal{P}_{0,s, a}}\left(\Vpi\right)-\kappa_{\hat{\mathcal{P}}_{s, a}}\left(\Vpi\right)\right)+\gamma\left(\kappa_{\hat{\mathcal{P}}_{s, a}}\left(\Vpi\right)-\kappa_{\hat{\mathcal{P}}_{s, a}}\left(\Vhatpi\right)\right) \right| 
   \\
   &\stackrel{(a)}{\leq} \max_{s,a}\left|{\gamma\left(\kappa_{\mathcal{P}_{0,s, a}}\left(\Vpi\right)-\kappa_{\hat{\mathcal{P}}_{s, a}}\left(\Vpi\right)\right)}\right|+\gamma\norminf{\Vpi-\Vhatpi}  
\\ 
   &\stackrel{(b)}{\leq} \gamma \max _{s, a}\left|\kappa_{\hat{\mathcal{P}}_{s, a}^{\mathrm{}}}(\Vpi)-\kappa_{\mathcal{P}_{0,s, a}}(\Vpi)\right| +\gamma \norminf{\Qpi-\Qhatpi}.
   \end{align*}
\end{proof}
where we use the triangle inequality and the 1-Lipschitz property of $\kappa$ (Lemma~\ref{contration}) in (a), and $\norminf{\Vpi-\Vhatpi}\leq \norminf{\Qpi-\Qhatpi}$ for any $\pi$ in (b).
Solving we get :
\begin{equation*}
    \norminf{\Qpi-\Qhatpi}\leq  \frac{\gamma}{1-\gamma} \max _{s, a}\left|\kappa_{\hat{\mathcal{P}}_{s, a}^{\mathrm{}}}(\Vpi)-\kappa_{\mathcal{P}_{0,s, a}}(\Vpi)\right| 
\end{equation*}
Then using Lemma \ref{trickalpha}, we obtain :

\begin{equation*}
    \norminf{\Qpi-\Qhatpi}\leq  \frac{\gamma}{1-\gamma} \max _{s, a}\left|\kappa_{\hat{\mathcal{P}}_{s, a}^{\mathrm{}}}(\Vpi)-\kappa_{\mathcal{P}_{0,s, a}}(\Vpi)\right| \leq \frac{\gamma}{1-\gamma} \norminf{(\hat{P}_{}-P_{0})V^\pi}
\end{equation*}
Taking $\pi=\pistar$, in the quantity $\norminf{(\hat{P}_{}-P_{0})V^{\pistar}}$, $V^{\pistar}$ is independent of the data and we can use classical Hoeffding inequality, Lemma \ref{hoeffding}.
Finally, we have
\begin{equation*}
    \norminf{\Qs-\Qhatpistar}  \leq \frac{\gamma}{1-\gamma} \norminf{(\hat{P}_{}-P_{0})V^{\pistar}} \leq \frac{\gamma}{(1-\gamma)^2}\sqrt{\frac{2\log(2\Snorm\Anorm/\delta )}{N}}.
\end{equation*}

For the second point, using $s$ or $sa$ rectangular assumptions, 

\begin{align*}
     \norminf{\Qs- \Qhats}\leq& \norminf{\mathcal{T}^{\pistar}_{\mathcal{U}_p^{sa}} \Qs -      \hat{\mathcal{T}}^{\pihatstar}_{\mathcal{U}_p^{sa}   }  \Qs             +    \hat{\mathcal{T}}^{\pihatstar}_{\mathcal{U}_p^{sa}   }  \Qs -\hat{\mathcal{T}}^{\pihatstar}_{\mathcal{U}_p^{sa}   }  \Qhats  } \\
    &\leq \norminf{   \mathcal{T}^{\pistar}_{\mathcal{U}_p^{sa}} \Qs -      \hat{\mathcal{T}}^{\pihatstar}_{\mathcal{U}_p^{sa}  }  \Qs}        +       \norminf{\hat{\mathcal{T}}^{\pihatstar}_{\mathcal{U}_p^{sa}   }  \Qs -\hat{\mathcal{T}}^{\pihatstar}_{\mathcal{U}_p^{sa}} \Qhats} \\
    &\stackrel{(a)}{\leq} 
    \norminf{   \mathcal{T}^{\pistar}_{\mathcal{U}_p^{sa}} \Qs -      \hat{\mathcal{T}}^{\pihatstar}_{\mathcal{U}_p^{sa}  } \Qs }        +       \gamma\norminf{\Qs-\Qhats} \\
    &\stackrel{(b)}{\leq} \gamma \norminf{ \kappa_{\hat{\mathcal{P}}_{s, a}}(\Vs) -\kappa_{\mathcal{P}_{0,s, a}}(V^*) }   +       \gamma\norminf{\Qs-\Qhats}
\end{align*}
Then using Lemma \ref{trickalpha}, and solving we get :
\begin{align*}
    \norminf{\Qs-\Qhats} \leq \frac{\gamma}{1-\gamma}\norminf{ \kappa_{\hat{\mathcal{P}}_{s, a}}(\Vs) -\kappa_{\mathcal{P}_{0,s, a}}(V^*) } \leq \frac{\gamma}{1-\gamma} \norminf{ (P_{0} -\hat{P}_{})V^*  }
\end{align*}
Finally using Lemma \ref{hoeffding}, we obtain 
\begin{equation*}
     \norminf{\Qs-\Qhats} \leq \frac{\gamma}{(1-\gamma)^2}\sqrt{\frac{2\log(2\Snorm\Anorm/\delta )}{N}}
\end{equation*}
which concludes the proof.




\begin{lemma}[Similar to Agarwal, \cite{agarwal2020model} lemma 9 but for RMDPs] \label{agarwalu} 
With probability $1-\delta$, we have: 
\begin{equation*}
     \min _{u \in U_s}\left|\widehat{V}^{\star}(s)-u\right| \leq 4 \gamma \sqrt{\frac{4 \log (4|\mathcal{S}||\mathcal{A}| / \delta)}{N}}
\end{equation*}


\end{lemma}

\begin{proof}
    The proof can be found in \cite{agarwal2020model} and is similar for RMDPs and for classical MDPs. It consists in 
choosing $U_s$ to be the evenly spaced elements in the interval $\left[V^{\star}(s)-\Delta_{\delta / 2, N},\; V^{\star}(s)+\Delta_{\delta / 2, N}\right]$, with the size of $U_s$ chosen to be $\left|U_s\right|=\frac{1}{(1-\gamma)^2}$. With probability greater than $1-\delta / 2$, we have $\widehat{V}^{\star}(s) \in\left[V^{\star}(s)-\Delta_{\delta / 2, N},\; V^{\star}(s)+\Delta_{\delta / 2, N}\right]$ for all $s$ according to Lemma~\ref{crude}. This implies that $\widehat{V}^{\star}(s)$ will land in one of $\vert U_s\vert -1$ evenly sized sub-intervals of length $2 \Delta_{\delta / 2, N}/(\vert U_s\vert -1)$
:
    $$    \min _{u \in U_s}\left|\widehat{V}^{\star}(s)-u\right| \leq \frac{2 \Delta_{\delta / 2, N}}{\left|U_s\right|-1}=\frac{2}{\left|U_s\right|-1} \frac{\gamma}{(1-\gamma)^2} \sqrt{\frac{4 \log (4|\mathcal{S}||\mathcal{A}| / \delta)}{N}} \leq 4 \gamma \sqrt{\frac{4 \log (4|\mathcal{S}||\mathcal{A}| / \delta)}{N}}$$
\end{proof}

\begin{lemma}[Relation between concentration of robust and non-robust MDPs]

With probability $1-\delta$, we get:
\label{close_form}
\begin{align*}
    &\max _{s, a}\left|\kappa_{\hat{\mathcal{P}}_{s, a}^{\mathrm{}}}(\Vpihat)-\kappa_{\mathcal{P}_{0,s, a}}(\Vpihat)\right|\leq\max _{s, a}\left|\left(P_{0} -\widehat{P}_{}\right)\Vpihat \right|
    %\stackrel{(a)}{\leq}
    \leq 
     \frac{8}{(1-\gamma)}\sqrt{\frac{4 \log \left(8 \Snorm\Anorm /((1-\gamma) \delta)\right)}{N}} +2\epsilon_{opt}. \\
    &\max _{s, a}\left|\kappa_{\widehat{\mathcal{P}}_{s, a}}(V^*) -\kappa_{\mathcal{P}_{0,s, a}}(V^*)\right|\leq \max _{s, a}\left|\left(P_{0} -\widehat{P}_{}\right)  \Vs\right|
    %\stackrel{(a)}{\leq}
    \leq
     \frac{8}{(1-\gamma)}\sqrt{\frac{4 \log \left(8 \Snorm\Anorm /((1-\gamma) \delta)\right)}{N}}.
\end{align*}
\end{lemma}


\begin{proof}
    Using Lemma~\ref{trickalpha}, we directly have the first inequality of the first statement:
    \begin{equation*}
        \max _{s, a}\left|\kappa_{\hat{\mathcal{P}}_{s, a}^{\mathrm{}}}(\Vhatpihat)-\kappa_{\mathcal{P}_{0,s, a}}(\Vhatpihat)\right|\leq\max _{(s, a)}\left|\left(P_{0} -\widehat{P}_{}\right)\Vhatpihat \right| \leq \max _{(s, a)} \vert  ( P_{0} -\widehat{P}_{} ) (\Vhatpihat- \Vhats)    \vert  + \max _{(s, a)} \vert  ( P_{0} -\widehat{P}_{} )  \Vhats    \vert 
    \end{equation*}
  Then, combining Lemma \ref{concetrationu} and \ref{agarwalu}, using $\vert U_s  \vert=\frac{1}{(1-\gamma)^2} $ , with probability $1-\delta$,  we have :

  \begin{align*}
      \vert \left(P_{0} -\widehat{P}_{}\right)\Vhatpihat \vert \leq&  4 \gamma \sqrt{\frac{4 \log (4|\mathcal{S}||\mathcal{A}| / \delta)}{N}} + \frac{1}{(1-\gamma)}\sqrt{\frac{4 \log \left(8 \Snorm\Anorm /((1-\gamma) \delta)\right)}{N}} + 2\epsilon_{opt} \\
      \leq&  \frac{8}{(1-\gamma)}\sqrt{\frac{4 \log \left(8 \Snorm\Anorm /((1-\gamma) \delta)\right)}{N}} + 2\epsilon_{opt}.
  \end{align*}
   
   The proof is exactly the same by replacing $\hat{\pi}$ by $\pi^*$ but without the $2\epsilon_{opt}$ , which gives the second stated result.
    Again, this proof is written for the $sa$-rectangular assumption, it is also true for the $s$-rectangular case with slightly different notations,
replacing $ \mathcal{D} = \mathcal{P}_{0,s,a}$ by  $\mathcal{D} = \mathcal{P}_{0,s}$.
\end{proof}

% \begin{proof}
% This is a consequence of the  definition of the infimum with $L_p$ constraints with rectangularity. Here we see the main advantage and of regularised form of  robust MDPs. Using Lemma \ref{hoeffding} in the inequality (a). Indeed, for $sa$-rectangular uncertainty set
% \begin{align*}
%     \kappa_{\hat{\mathcal{P}}_{s, a}^{\mathrm{}}}(\Vpihat)-\kappa_{\mathcal{P}_{s, a}}(\Vpihat)&=\left(R_0-\alpha_{\mathrm{sa}}-\beta_{\mathrm{sa}}\gamma \snormqbar{\Vpihat}+\widehat{P}_{s, a}\Vpihat\right)  -\left(R_0-\alpha_{\mathrm{sa}}-\beta_{\mathrm{sa}}\gamma \snormqbar{\Vpihat}+P_{s, a}\Vpihat\right)\\
%     &=\left( \widehat{P}_{s, a} - P_{s, a}\right) \Vpihat
% \end{align*}
% $$ $$
% it works also for $s$-rectangular with the same idea and with $\Vs$ as:
% \begin{align*}
%     \kappa_{\hat{\mathcal{P}}_{s, a}^{\mathrm{}}}(\Vpihat)-\kappa_{\mathcal{P}_{s, a}}(\Vpihat)&= R_0-\alpha_s\normqbar{\pihat}-\gamma \beta_s \normqbar{\pihat} \snormqbarpihat{\Qpihat} + \widehat{P}_{s, a}\Vpihat -(R_0-\alpha_s\normqbar{\pihat}-\gamma \beta_s \normqbar{\pihat} \snormqbarpihat{\Qpihat}+ \widehat{P}_{s, a}\Vpihat)\\
%     &= \left(\widehat{P}_{s, a}-P_{s, a} \right) \Vpihat.
% \end{align*}

% \end{proof}
These two inequalities are the core of our proof, as the closed form solution of the $\min$ problem in the robust setting only depends on $\alpha,\beta $ and the current value function.




\begin{theorem}

Suppose $\delta>0$, $\epsilon >0$ and $\beta >0$, let $\widehat{\pi}$ be any $\epsilon_{\text {opt }}$-optimal policy for $\widehat{M}$, i.e. $\left\|\widehat{Q}^{\widehat{\pi}}-\widehat{Q}^{\star}\right\|_{\infty} \leq \epsilon_{\text {opt } }$. If
$$
N \geq \frac{C \gamma^2
\log \left(|\mathcal{S}||\mathcal{A}| (1-\gamma)^{-1} \delta^{-1}\right)}{(1-\gamma)^4 \epsilon^2}, 
$$
we get
$$ \norminf{\Qs-\Qpihat}\leq \epsilon +
\frac{3\gamma \epsilon_{\mathrm{opt}} }{1-\gamma}     $$
with probability at least $1-\delta$, where $C$ is an absolute constant. Finally, for 
$N_{\text {total }}=N|\mathcal{S}||\mathcal{A}|$ and  $H=1/(1-\gamma)$, we get an overall complexity of $$N_{\text {total }}=\tilde{\mathcal{O}}\left(   \frac{H^4\Snorm\Anorm}{\epsilon^2}    \right).$$
\end{theorem}


\begin{proof}

\begin{align*}
 \norminf{\Qs -\Qpihat} &\stackrel{(a)}{\leq} \norminf{\Qs-\Qhats} + \norminf{  \Qhats
-\Qhatpihat} + \norminf{ \Qhatpihat-\Qpihat} 
\\
&\stackrel{(b)}{\leq} \epsilon_{\mathrm{opt}}+\frac{\gamma}{(1-\gamma)}\left(\max _{s, a}\left|\kappa_{\hat{\mathcal{P}}_{s, a}}\left(V^*\right)-\kappa_{\mathcal{P}_{s, a}}\left(V^*\right)\right|+\max _{s, a}\left|\kappa_{\hat{\mathcal{P}}_{s, a}}\left(\Vpihat\right)-\kappa_{\mathcal{P}_{s, a}}\left(\Vpihat\right)\right|\right) 
 \\
 &\stackrel{(c)}{\leq}  \frac{16 \gamma}{(1-\gamma)^2} \sqrt{\frac{4\log (8|\mathcal{S}||\mathcal{A}| /((1-\gamma) \delta))}{ N}} +\epsilon_{\mathrm{opt}} + \frac{2\gamma \epsilon_{\mathrm{opt}} }{1-\gamma} 
\\
&\stackrel{}{\leq}\frac{16\gamma}{(1-\gamma)^2}\sqrt{\frac{4\log (8|\mathcal{S}||\mathcal{A}| /((1-\gamma) \delta))}{ N}}+\epsilon_{\mathrm{opt}} +  \frac{2\gamma \epsilon_{\mathrm{opt}} }{1-\gamma}  
\\
&\stackrel{(d)}{\leq} \epsilon + \frac{3\gamma \epsilon_{\mathrm{opt}} }{1-\gamma} 
\end{align*}
Inequality (a) is due to  Lemma~\ref{decomposition}. Inequality (b) comes from Lemma~\ref{upper hat}.   Finally, inequality (c) comes from Lemma \ref{close_form} and inequality (d) from the form of $N$ in the theorem. Note that this proof holds for both $s$- and $sa$-rectangular assumptions.
\end{proof}

% In the next section, we are doing the proof of Theorem \ref{h3} to improve the bound in term of $H$.
%\section{Proof for minimax optimal bound}
\section{Towards minimax optimal bounds}
\label{annex c}


We start from the same decomposition as the proof of Theorem~\ref{h4} proved in Lemma~\ref{decomposition}: 
\begin{align*}
    \norminf{\Qs -\Qpihat} \leq \norminf{\Qs-\Qhatpistar} + \norminf{  \Qhatpistar-\Qhatpihat} + \norminf{ \Qhatpihat-\Qpihat}.
\end{align*}
However, we need tighter concentration arguments for this proof.

In the following, we will frequently use the fixed-point equation for $Q^\pi$, which holds for any policy $\pi$ and is written below for the $s$-rectangular case (a similar expression can be obtained for the $sa$-rectangular case, adapting the regularized reward):
\begin{equation}
Q^\pi=\left(I-\gamma P_0^\pi\right)^{-1} (R_0-\alpha_s \Big(\pi_s/ \normq{\pi_s}\Big)^{q-1}+ \gamma \inf _{P^\pi\in\mathcal{P}_s }P^\pi V^\pi)
\label{eq:Q_fixed_point}
\end{equation}


 It will be applied notably to $\hat{\pi}$ and $\pi^*$ (recall that $Q^* = Q^{\pi^*}$), in the RMDP but also in the empirical one.

% as in the non-robust case we have $Q^*=(I-\gamma \Pzeropistar)R_0
% $  or the same identity for a different policy, such as $\pihat$.


\begin{lemma}
\label{lemma_simplex}    

 For $s$-rectangular we have
\begin{align*}
\left(I-\gamma \Pzero^\pi\right)^{-1} \rshatpi-\left(I-\gamma \widehat{P}^\pi\right)^{-1}\rshatpi
&\stackrel{(a)}{=}\left(I-\gamma \Pzero^\pi\right)^{-1}\left(\left(I-\gamma \widehat{P}^\pi\right)-\left(I-\gamma \Pzero^\pi\right)\right) \widehat{Q}^\pi_s \\
&=\gamma\left(I-\gamma \Pzero^\pi\right)^{-1}\left(\Pzero^\pi-\widehat{P}^\pi\right) \widehat{Q}^\pi_s \\
&=\gamma\left(I-\gamma \Pzero^\pi\right)^{-1}(\Pzero-\widehat{P}) \widehat{V}^\pi_s
\end{align*}

and for optimal policy 

\begin{align}
\label{4}
    \left(I-\gamma \Pzeropistar \right)^{-1} \rshatpistar -\left(I-\gamma \Phatpistar \right)^{-1} \rshatpistar &= \gamma\left(I-\gamma\Pzeropistar\right)^{-1}(\Pzero-\widehat{P}) \Vhatpistar_s
\\
\label{5}
     \left(I-\gamma\Pzeropihat \right)^{-1} \rshatpihat -\left(I-\gamma \Phatpihat\right)^{-1} \rshatpihat &= \gamma\left(I-\gamma \Pzeropihat \right)^{-1}(\Pzero-\widehat{P}) \Vhatpihat_s
\end{align}
 \end{lemma}
The solution is a bit different, as $\rshatpi$ is the regularised form of the $L_p$ optimisation problem with simplex constraints, which corresponds to
$\rshatpi=R_0 -\Big(\frac{\pi_s}{\normq{\pi_s}} \Big)^{q-1}   \alpha_{s} +\gamma \inf _{P^\pi\in\mathcal{P}_s }P^\pi \hat{V}^\pi $
or, for the $sa$-rectangular case, to $\rsahatpi=R_0- 
 \alpha_{\mathrm{sa}}+ \gamma  \inf _{P^\pi\in\mathcal{P}_s }P^\pi \hat{V}^\pi $.


Indeed, even without a closed form, we can write the problem with an expectation over the nominal kernel and an infimum problem. 


\begin{lemma}[Upper bound on $\Qs-\Qhatpistar$ and on $\Qpihat-\Qhatpihat$; all $Q$-values are now robust $Q$-values under simplex constraints.]
\label{lemma_pis}
\begin{align*}
 \norminf{\Qs-\Qhatpistar}\leq&   \gamma\norminf{(I-\gamma\Pzeropistar)^{-1}(\Pzero-\widehat{P}) \Vhatpistar }+ \frac{2\gamma \beta \Snorm^{1/q} }{1-\gamma}   \norminf{\Qs- \Qhatpistar} 
 \\
 \norminf{\Qpihat-\Qhatpihat} \leq&  \gamma \norminf{\left(I-\gamma\Pzeropihat\right)^{-1}(\Pzero-\widehat{P}) \Vhatpihat }+ \frac{2\gamma \beta\Snorm^{1/q}  }{1-\gamma}  \norminf{\Qpihat- \Qhatpihat}
\end{align*}
\end{lemma}



\begin{proof}
   

\begin{align*}
&\Qs-\Qhatpistar\\& = \left(I-\gamma \Pzeropistar\right)^{-1} ( R_0 -\Big(\frac{\pistar_s}{\normq{\pistar_s}} \Big)^{q-1}   \alpha_{s} + \gamma \inf _{P^\pi\in\mathcal{P}_s } P^\pi \Vs )
\\
&- \left(I-\gamma \Phatpistar\right)^{-1} ( R_0 -\Big(\frac{\pistar_s}{\normq{\pistar_s}} \Big)^{q-1}   \alpha_{s} +\gamma \inf _{P^\pi\in\mathcal{P}_s }  P^\pi \Vhatpistar )
\\
\stackrel{}{=}&  
\left(I-\gamma \Pzeropistar\right)^{-1} ( R_0 -\Big(\frac{\pistar_s}{\normq{\pistar_s}} \Big)^{q-1}   \alpha_{s} +\gamma  \inf _{P^\pi\in\mathcal{P}_s } P^\pi \Vs) 
\\
&-\left(I-\gamma \Pzeropistar\right)^{-1} ( R_0 -\Big(\frac{\pistar_s}{\normq{\pistar_s}} \Big)^{q-1}   \alpha_{s} +\gamma  \inf _{P^\pi\in\mathcal{P}_s } P^\pi  \Vhatpistar) 
\\
&+ \left(I-\gamma \Pzeropistar\right)^{-1} ( R_0 -\Big(\frac{\pistar_s}{\normq{\pistar_s}} \Big)^{q-1}   \alpha_{s} +\gamma  \inf _{P^\pi\in\mathcal{P}_s } P^\pi \Vhatpistar) 
\\
&- \left(I-\gamma \Phatpistar\right)^{-1} ( R_0 -\Big(\frac{\pistar_s}{\normq{\pistar_s}} \Big)^{q-1}   \alpha_{s} +\gamma \inf _{P^\pi\in\mathcal{P}_s } P^\pi \Vhatpistar) 
 \\
 \stackrel{(a)}{=}& \gamma\left(I-\gamma\Pzeropistar\right)^{-1}(\Pzero-\widehat{P}) \Vhatpistar
 +\left(I-\gamma \Pzeropistar\right)^{-1}  \gamma \left( \inf _{P^\pi\in\mathcal{P}_s } P^\pi \Vs - \inf _{P^\pi\in\mathcal{P}_s } P^\pi \Vhatpistar 
  \right) 
 \end{align*}
where in (a) we use  previous Lemma \ref{lemma_simplex}.
 \end{proof}

Hence, taking the supremum norm $\norminf{.}$, 
 \begin{align*}
&\norminf{\Qs-\Qhatpistar}  \stackrel{}{=}\\
&
\norminf{ \gamma\left(I-\gamma\Pzeropistar\right)^{-1}(\Pzero-\widehat{P}) \Vhatpistar+\left(I-\gamma \Pzeropistar\right)^{-1}  \gamma \left(\inf _{P^\pi\in\mathcal{P}_s } P^\pi \Vs -\inf _{P^\pi\in\mathcal{P}_s } P^\pi \Vhatpistar 
  \right)  }
 \\
 \stackrel{(b)}{\leq}& \norminf{\gamma\left(I-\gamma\Pzeropistar\right)^{-1}(\Pzero-\widehat{P}) \Vhatpistar }+ 
 \norminf{\left(I-\gamma \Pzeropistar\right)^{-1} \gamma \left( \inf _{P^\pi\in\mathcal{P}_s } P^\pi \Vs - \inf _{P^\pi\in\mathcal{P}_s } P^\pi \Vhatpistar 
  \right)}
 \\
\stackrel{(c)}{\leq}& \norminf{  \gamma\left(I-\gamma\Pzeropistar\right)^{-1}(\Pzero-\widehat{P}) \Vhatpistar }+ \frac{\gamma  }{1-\gamma}    \mid \inf _{P^\pi\in\mathcal{P}_s } P^\pi \Vs - \inf _{P^\pi\in\mathcal{P}_s } P^\pi  \Vhatpistar  \mid 
  \\
 \stackrel{(d)}{\leq}&  \norminf{ \gamma\left(I-\gamma\Pzeropistar\right)^{-1}(\Pzero-\widehat{P}) \Vhatpistar }+ \frac{\gamma  }{1-\gamma} \sup _{P^\pi\in\mathcal{P}_s } P^\pi \mid \Vs- \Vhatpistar \mid
  \\
  \stackrel{(e)}{\leq}&  \norminf{  \gamma\left(I-\gamma\Pzeropistar\right)^{-1}(\Pzero-\widehat{P}) \Vhatpistar} + \frac{\gamma   }{1-\gamma}  \sup _{P: \normpbar{P}\leq\beta_{s} ,\sum_s P(s)=0} P \mid \Vs- \Vhatpistar \mid \\
   \stackrel{(f)}{\leq}&  \norminf{  \gamma\left(I-\gamma\Pzeropistar\right)^{-1}(\Pzero-\widehat{P}) \Vhatpistar} - \frac{\gamma   }{1-\gamma}  \inf _{P: \normpbar{P}\leq\beta_{s} ,\sum_s P(s)=0} -P \mid \Vs- \Vhatpistar \mid \\
    \stackrel{(g)}{\leq}&  \norminf{  \gamma\left(I-\gamma\Pzeropistar\right)^{-1}(\Pzero-\widehat{P}) \Vhatpistar} + \frac{\gamma \beta \Snorm^{1/q}  }{1-\gamma}  \snormqbarpis{ \Qs -\Qhatpistar}\\
     \stackrel{(h)}{\leq}&  \norminf{  \gamma\left(I-\gamma\Pzeropistar\right)^{-1}(\Pzero-\widehat{P}) \Vhatpistar} + \frac{2\gamma \beta \Snorm^{1/q}  }{1-\gamma}  \norminf{ \Qs -\Qhatpistar}
\end{align*}


where (b) is the triangle inequality, (c) comes from Eq.~\eqref{1}, (d) is the inequality $\left|\inf _A f-\inf _A g\right| \leq \sup _A|f-g|$, (e) is a relaxation, (f) is the relation between $\sup$ and $\inf$, (g) is Lemma 1 of \cite{kumar2022efficient}, and (h) is the inequality between seminorms and norms, Eq.~\eqref{2}.









% \begin{lemma}
% \label{lemma_2}
% For any policy $\pi$,
% with $\rsahatpi=R_0-\alpha_{\mathrm{sa}}-\gamma \beta_{\mathrm{sa}} \snormqbarpi{\Qhatpi}$
% for $sa$-rectangular sets or  $\rshatpi=R_0-\Big(\pi_s/ \normq{\pi_s}\Big)^{q-1}\normq{\pi_s}-\gamma \beta_s \Big(\pi_s/ \normq{\pi_s}\Big)^{q-1}\snormqbarpi{\Qhatpi}$ for $s$-rectangular uncertainty sets,  denote $r_{\hat{Q}^\pi}$ any of these, we have
% \begin{align*}
% \left(I-\gamma \Pzero^\pi\right)^{-1} r_{\hat{Q}^\pi}-\left(I-\gamma \widehat{P}^\pi\right)^{-1}r_{\hat{Q}^\pi}
% &=\gamma\left(I-\gamma \Pzero^\pi\right)^{-1}(\Pzero-\widehat{P}) \widehat{V}^\pi.
% \end{align*}
% %\begin{align*}
% % \left(I-\gamma \Pzero^\pi\right)^{-1} \rshatpi-\left(I-\gamma \widehat{P}^\pi\right)^{-1}\rshatpi
% % &=\gamma\left(I-\gamma \Pzero^\pi\right)^{-1}(\Pzero-\widehat{P}) \widehat{V}^\pi.
% % \end{align*}
% \end{lemma}
% \begin{proof}
% For the $s$-rectangular case, we have 
% \begin{align*}
% \left(I-\gamma \Pzero^\pi\right)^{-1} \rshatpi-\left(I-\gamma \widehat{P}^\pi\right)^{-1}\rshatpi
% &\stackrel{(a)}{=}\left(I-\gamma \Pzero^\pi\right)^{-1}\left(\left(I-\gamma \widehat{P}^\pi\right)-\left(I-\gamma \Pzero^\pi\right)\right) \widehat{Q}^\pi_s \\
% &=\gamma\left(I-\gamma \Pzero^\pi\right)^{-1}\left(\Pzero^\pi-\widehat{P}^\pi\right) \widehat{Q}^\pi_s \\
% &=\gamma\left(I-\gamma \Pzero^\pi\right)^{-1}(\Pzero-\widehat{P}) \widehat{V}^\pi_s
% \end{align*}
% where we have used in (a) the fact that $\widehat{Q}_s^\pi=\left(I-\gamma \widehat{P}^\pi\right)^{-1}\rshatpi$.
% The proof is exactly the same for the $sa$-rectangular case.
% \end{proof}

% This lemma will allow controlling the terms $\|\Qs-\Qhatpistar\|_\infty$ and $\| \Qhatpihat-\Qpihat\|_\infty$ in Lemma \ref{firstineq}. So for $\pihat$ and $\pistar$ we get respectively :
% \begin{align}
% \label{4}
%     \left(I-\gamma \Pzeropistar \right)^{-1} \rshatpistar -\left(I-\gamma \Phatpistar \right)^{-1} \rshatpistar &= \gamma\left(I-\gamma\Pzeropistar\right)^{-1}(\Pzero-\widehat{P}) \Vhatpistar_s
% \\
% \label{5}
%      \left(I-\gamma\Pzeropihat \right)^{-1} \rshatpihat -\left(I-\gamma \Phatpihat\right)^{-1} \rshatpihat &= \gamma\left(I-\gamma \Pzeropihat \right)^{-1}(\Pzero-\widehat{P}) \Vhatpihat_s
% \end{align}
% Again, we have the exact same formulation for the $sa$-rectangular case.



% \begin{lemma}[Upper bound on $\Qs-\Qhatpistar$ and on $\Qpihat-\Qhatpihat$]
% \label{lemma_pis}
% \begin{align*}
%  \norminf{\Qs-\Qhatpistar}\leq&   \gamma\norminf{(I-\gamma\Pzeropistar)^{-1}(\Pzero-\widehat{P}) \Vhatpistar }+ \frac{2\gamma \beta  }{1-\gamma}   \norminf{\Qs- \Qhatpistar} 
%  \\
%  \norminf{\Qpihat-\Qhatpihat} \leq&  \gamma \norminf{\left(I-\gamma\Pzeropihat\right)^{-1}(\Pzero-\widehat{P}) \Vhatpihat }+ \frac{2\gamma \beta  }{1-\gamma}  \norminf{\Qpihat- \Qhatpihat}
% \end{align*}

% This Lemma is true for both $sa$- and $s$- rectangular assumptions.
% \label{lemma pi hat}
% \end{lemma}

% \begin{proof}
% Considering for example the $s$-rectangular case (same derivation for the $sa$-case, up to the slightly different regularized reward), and making use of Eq.~\eqref{eq:Q_fixed_point}, we have
% % For the first inequality,
% % using that $\Qs=\left(I-\gamma \Pzeropistar\right)^{-1} (R_0-\alpha_s\normq{\pistar_s}-\gamma \beta_s \normq{\pistar_s} \snormqbarpis{\Qs}) $ as in the non robust case we have $Q^*=(I-\gamma \Pzeropistar)R_0
% % $, but replacing the non robust reward by a robust reward, we get for $s-$ rectangular assumption: 
% %
% \begin{align*}
% &\Qs-\Qhatpistar\\& = \left(I-\gamma \Pzeropistar\right)^{-1} (R_0-\alpha_s \Big(\frac{\pistar_s}{\normq{\pistar_s}} \Big)^{q-1}-\gamma \beta_s \Big(\frac{\pistar_s}{\normq{\pistar_s}} \Big)^{q-1}\snormqbarpis{\Qs}) 
% \\
% &- \left(I-\gamma \Phatpistar\right)^{-1} (R_0-\alpha_s \Big(\frac{\pistar_s}{\normq{\pistar_s}} \Big)^{q-1}-\gamma \beta_s \Big(\frac{\pistar_s}{\normq{\pistar_s}} \Big)^{q-1} \snormqbarpis{\Qhatpistar})
% \\
% \stackrel{}{=}&  
% \left(I-\gamma \Pzeropistar\right)^{-1} (R_0-\alpha_s \Big(\frac{\pistar_s}{\normq{\pistar_s}} \Big)^{q-1}-\gamma \beta_s \Big(\frac{\pistar_s}{\normq{\pistar_s}} \Big)^{q-1} \snormqbarpis{\Qs})
% \\
% &-\left(I-\gamma \Pzeropistar\right)^{-1} (R_0-\alpha_s \Big(\frac{\pistar_s}{\normq{\pistar_s}} \Big)^{q-1}-\gamma \beta_s \Big(\frac{\pistar_s}{\normq{\pistar_s}} \Big)^{q-1}\snormqbarpis{\Qhatpistar}) 
% \\
% &+ \left(I-\gamma \Pzeropistar\right)^{-1} (R_0-\alpha_s\Big(\frac{\pistar_s}{\normq{\pistar_s}} \Big)^{q-1}-\gamma \beta_s \Big(\frac{\pistar_s}{\normq{\pistar_s}} \Big)^{q-1}\snormqbarpis{\Qhatpistar})  
% \\
% &- \left(I-\gamma \Phatpistar\right)^{-1} (R_0-\alpha_s\Big(\frac{\pistar_s}{\normq{\pistar_s}} \Big)^{q-1}-\gamma \beta_s \Big(\frac{\pistar_s}{\normq{\pistar_s}} \Big)^{q-1} \snormqbarpis{\Qhatpistar}) 
%  \\
%  \stackrel{(a)}{=}& \gamma\left(I-\gamma\Pzeropistar\right)^{-1}(\Pzero-\widehat{P}) \Vhatpistar
%  +\left(I-\gamma \Pzeropistar\right)^{-1} \left(\gamma \beta_s \Big(\frac{\pistar_s}{\normq{\pistar_s}} \Big)^{q-1}(\snormqbarpis{\Qhatpistar} -\snormqbarpis{\Qs}
%  ) \right) 
%  \end{align*}
% Hence, taking the supremum norm $\norminf{.}$, 
%  \begin{align*}
% &\norminf{\Qs-\Qhatpistar}  \stackrel{}{=}\\
% &
% \norminf{ \gamma\left(I-\gamma\Pzeropistar\right)^{-1}(\Pzero-\widehat{P}) \Vhatpistar+\left(I-\gamma \Pzeropistar\right)^{-1} \left(\gamma \beta_s \Big(\frac{\pistar_s}{\normq{\pistar_s}} \Big)^{q-1}(\snormqbarpis{\Qhatpistar} -\snormqbarpis{\Qs}
%  ) \right) }
%  \\
%  \stackrel{(b)}{\leq}& \norminf{\gamma\left(I-\gamma\Pzeropistar\right)^{-1}(\Pzero-\widehat{P}) \Vhatpistar }+ \frac{\gamma \beta }{1-\gamma} \left(\vert ( \snormqbarpis{\Qs}-\snormqbarpis{\Qhatpistar} \vert 
%  ) \right)
%  \\
% \stackrel{(c)}{\leq}& \norminf{  \gamma\left(I-\gamma\Pzeropistar\right)^{-1}(\Pzero-\widehat{P}) \Vhatpistar }+ \frac{\gamma \beta }{1-\gamma} \left(  \snormqbarpis{\Qs- \Qhatpistar}
%   \right) 
%   \\
%  \stackrel{(d)}{\leq}&  \norminf{ \gamma\left(I-\gamma\Pzeropistar\right)^{-1}(\Pzero-\widehat{P}) \Vhatpistar} + \frac{2\gamma \beta }{1-\gamma}  \normqbarpis{\Qs- \Qhatpistar}\\
%  \stackrel{(e)}{\leq}&  \norminf{ \gamma\left(I-\gamma\Pzeropistar\right)^{-1}(\Pzero-\widehat{P}) \Vhatpistar }+ \frac{2\gamma \beta }{1-\gamma} \left(  \normqbar{\Qs- \Qhatpistar}
%   \right) 
%   \\
%   \stackrel{(f)}{\leq}&  \norminf{  \gamma\left(I-\gamma\Pzeropistar\right)^{-1}(\Pzero-\widehat{P}) \Vhatpistar} + \frac{2\gamma \beta  }{1-\gamma}   \norminf{\Qs- \Qhatpistar}
% \end{align*}
% where (a) is due to Lemma \ref{lemma_2} and  $\Big(\frac{\pistar_s}{\normq{\pistar_s}} \Big)^{q-1}\leq1$, (b) comes from the triangular inequality and Eq.~\eqref{1}, (c) is the triangular inequality for seminorms, (d) comes from Eq.~\eqref{2}, (e) comes from Eq.~\eqref{3}  and (f) is by the Holder's inequality.
% One can note that the proof is true for any policy, so it is also true for both $\pihat$ and $\pistar$ which concludes the proof.

% \end{proof}



% \begin{lemma} Upper bound on $\Qpihat-\Qhatpihat$.

% \begin{equation}
% \label{lemma pi hat}
%     \Qpihat-\Qhatpihat \leq  \gamma\left(I-\gamma\Ppihat\right)^{-1}(P-\widehat{P}) \Vhatpihat + \frac{2\gamma \beta  }{1-\gamma}  \norminf{\Qpihat- \Qhatpihat}
% \end{equation}
% \end{lemma}
% \begin{proof}

% $$
% \begin{aligned} 
%  =&\left(I-\gamma \Ppihat\right)^{-1} (R-\alpha_s\normqbar{\pihat}-\gamma \beta_s \normqbar{\pihat} \snormqbarpihat{\Qpihat}) 
%  \\
%  -& \left(I-\gamma \Phatpihat\right)^{-1} (R-\alpha_s \normqbar{\pihat}-\gamma \beta_s \normqbar{\pihat} \snormqbarpihat{\Qhatpihat})
%  \\
% \stackrel{}{=}&  
% \left(I-\gamma \Ppistar\right)^{-1} (R-\alpha_s\normqbar{\pihat}-\gamma \beta_s \normqbar{\pihat} \snormqbarpis{\Qs})
% -\left(I-\gamma \Ppihat\right)^{-1} (R-\alpha_s \normqbar{\pihat}-\gamma \beta_s \normqbar{\pihat} \snormqbarpihat{\Qhatpihat}) 
% \\
% +& \left(I-\gamma \Ppihat\right)^{-1} (R-\alpha_s\normqbar{\pihat}-\gamma \beta_s \normqbar{\pihat} \snormqbarpihat{\Qhatpihat})  
% + \left(I-\gamma \Phatpihat\right)^{-1} (R-\alpha_s\normqbar{\pihat}-\gamma \beta_s \normqbar{\pihat} \snormqbarpihat{\Qhatpihat}) )
% \\
%  \stackrel{a}{=}& \gamma\left(I-\gamma\Ppihat\right)^{-1}(P-\widehat{P}) \Vhatpihat
%  +\left(I-\gamma \Ppihat\right)^{-1} \left(\gamma \normqbar{\pihat} \beta_s ( \snormqbarpihat{\Qpihat}-\snormqbarpihat{\Qhatpihat}
%  ) \right) 
%  \\
%  \stackrel{b}{\leq}& \gamma\left(I-\gamma\Ppihat\right)^{-1}(P-\widehat{P}) \Vhatpihat + \frac{\gamma \beta }{1-\gamma} \left(\vert ( \snormqbarpihat{\Qpihat}-\snormqbarpihat{\Qhatpihat} \vert 
%  ) \right)
%  \\
% \stackrel{c}{\leq}&   \gamma\left(I-\gamma\Ppihat\right)^{-1}(P-\widehat{P}) \Vhatpihat + \frac{\gamma \beta }{1-\gamma} \left(  \snormqbarpihat{\Qpihat- \Qhatpihat}
%   \right) 
%   \\
%  \stackrel{d}{\leq}&   \gamma\left(I-\gamma\Ppihat\right)^{-1}(P-\widehat{P}) \Vhatpihat + \frac{2\gamma \beta }{1-\gamma}   \normqbarpis{\Qpihat- \Qhatpihat} \leq   \gamma\left(I-\gamma\Ppihat\right)^{-1}(P-\widehat{P}) \Vhatpihat + \frac{2\gamma \beta }{1-\gamma} \left(  \normqbar{\Qpihat- \Qhatpihat}
%   \right) 
%   \\
%   \stackrel{e}{\leq}&    \gamma\left(I-\gamma\Ppihat\right)^{-1}(P-\widehat{P}) \Vhatpihat + \frac{2\gamma \beta  }{1-\gamma}   \norminf{\Qpihat- \Qhatpihat}
% \end{aligned}
% $$
%  (a) is the lemma \ref{lemma_2} and $\normqbar{\pihat}\leq1$, (b) comes from lemma \ref{1} (c) is the triangular inequality for seminorms, (d) comes from \ref{2}  and (e) is Holder's inequality.
% \end{proof}


For brevity in the remaining analysis, let us define the shorthand:
$$
L=\log (8|\mathcal{S}||\mathcal{A}| /((1-\gamma) \delta)).
$$
%
Recall that, slightly abusing the notation, for $V \in \mathbb{R}^{\mathcal{S}}$, we define the vector $\operatorname{Var}_P(V) \in \mathbb{R}^{\mathcal{S} \times \mathcal{A}}$ as $\operatorname{Var}_P(V)=P(V)^2-(P V)^2$. 
\begin{lemma}[\citet{agarwal2020model}, Lemma 9]
\label{agarwal}
With probability greater than $1-\delta$,
$$
\begin{aligned}
\left|(\Pzero-\widehat{P}) \widehat{V}^{\star}\right| & \leq \sqrt{\frac{8 L}{N}} \sqrt{\operatorname{Var}_{\Pzero}\left(\widehat{V}^{\star}\right)}+\Delta_{\delta, N}^{\prime} \mathbb{I} \\
\left|(\Pzero-\widehat{P}) \widehat{V}^{\pi^{\star}}\right| & \leq \sqrt{\frac{8 L}{N}} \sqrt{\operatorname{Var}_{\Pzero}\left(\widehat{V}^{\pi^{\star}}\right)}+\Delta_{\delta, N}^{\prime} \mathbb{I}\\
\text{where } \Delta_{\delta, N}^{\prime} & =\sqrt{\frac{c L}{N}}+\frac{c L}{(1-\gamma) N}
\text{ and $c$ is a universal constant smaller than $16$}.
\end{aligned}
$$
% where
% $$
% \Delta_{\delta, N}^{\prime}=\sqrt{\frac{c L}{N}}+\frac{c L}{(1-\gamma) N}
% $$
% with $c$ being a constant.
\end{lemma}
%


\begin{proof} The proof of \citet{agarwal2020model} holds for classical MDPs but can be adapted to the robust setting using the lemmas previously proved for the bound in $H^4$. Lemmas~\ref{sameabsorb}, \ref{stabilityabsorb}, \ref{crude}, \ref{agarwalu} and \ref{robustabsorb} are needed, but the main difference is that we use Bernstein's inequality instead of Hoeffding's inequality.
The idea is first, as in the previous proof, to apply Bernstein's inequality to independent variables using $s$-absorbing MDPs, and then to use Lemma~\ref{agarwalu}.
\begin{proof}
    Similar to \cite{agarwal2020model}, we first show that
    $$
\begin{aligned}
\left|\left(P_{0}-\widehat{P}_{}\right) \cdot \widehat{V}^{\star}\right| \leq & \sqrt{\frac{2 \log \left(4\left|U_s\right| / \delta\right)}{N}} \sqrt{\operatorname{Var}_{P_{0}}\left(\widehat{V}^{\star}\right)} \\
& +\min _{u \in U_s}\left|\widehat{V}^{\star}(s)-u\right|\left(1+\sqrt{\frac{2 \log \left(4\left|U_s\right| / \delta\right)}{N}}\right)+\frac{2 \log \left(4\left|U_s\right| / \delta\right)}{(1-\gamma) 3 N} \\
\left|\left(P_{0}-\widehat{P}_{}\right) \cdot \widehat{V}^{\pi^{\star}}\right| \leq & \sqrt{\frac{2 \log \left(4\left|U_s\right| / \delta\right)}{N}} \sqrt{\operatorname{Var}_{P_{0}}\left(\widehat{V}^{\pi^{\star}}\right)} \\
& +\min _{u \in U_s}\left|\widehat{V}^{\pi^{\star}}(s)-u\right|\left(1+\sqrt{\frac{2 \log \left(4\left|U_s\right| / \delta\right)}{N}}\right)+\frac{2 \log \left(4\left|U_s\right| / \delta\right)}{(1-\gamma) 3 N}
\end{aligned}
$$
First, with probability greater than $1-\delta$, we have that for all $u \in U_s$.
$$
\begin{aligned}
& \left|\left(P_{0}-\widehat{P}_{}\right) \cdot \widehat{V}^{\star}\right|=\left|\left(P_{0}-\widehat{P}_{}\right) \cdot\left(\widehat{V}^{\star}-V_{s, u}^{\star}+V_{s, u}^{\star}\right)\right| \\
& \stackrel{(a)}{\leq}\left|\left(P_{0}-\widehat{P}_{}\right) \cdot\left(\widehat{V}^{\star}-V_{s, u}^{\star}\right)\right|+\left|\left(P_{0}-\widehat{P}_{}\right) \cdot\left(V_{s, u}^{\star}\right)\right| \\
& \stackrel{(b)}{\leq}\left\|\widehat{V}^{\star}-V_{s, u}^{\star}\right\|_{\infty}+\sqrt{\frac{2 \log \left(4\left|U_s\right| / \delta\right)}{N}} \sqrt{\operatorname{Var}_{P_{0}}\left(V_{s, u}^{\star}\right)}+\frac{2 \log \left(4\left|U_s\right| / \delta\right)}{(1-\gamma) 3 N} \\
& \stackrel{(c)}{=}\left\|\widehat{V}^{\star}-V_{s, u}^{\star}\right\|_{\infty}+\sqrt{\frac{2 \log \left(4\left|U_s\right| / \delta\right)}{N}} \sqrt{\operatorname{Var}_{P_{0}}\left(\widehat{V}^{\star}-V_{s, u}^{\star}-\widehat{V}^{\star}\right)}+\frac{2 \log \left(4\left|U_s\right| / \delta\right)}{(1-\gamma) 3 N} \\
& \stackrel{(d)}{\leq}\left\|\widehat{V}^{\star}-V_{\widehat{M}_{s, u}}^{\star}\right\|_{\infty}\left(1+\sqrt{\frac{2 \log \left(4\left|U_s\right| / \delta\right)}{N}}\right)+\sqrt{\frac{2 \log \left(4\left|U_s\right| / \delta\right)}{N}} \sqrt{\operatorname{Var}_{P_{0}}\left(\widehat{V}^{\star}\right)}+\frac{2 \log \left(4\left|U_s\right| / \delta\right)}{(1-\gamma) 3 N} \\
&
\end{aligned}
$$
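using the triangle inequality in (a), the classical Bernstein inequality in (b), the identity $\operatorname{Var}(X)=\operatorname{Var}(-X)$ in (c), and, in (d), the triangle inequality for the standard deviation together with Lemmas~\ref{sameabsorb} and~\ref{stabilityabsorb}, which give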
$$
\left\|\widehat{V}^{\star}-V_{s, u}^{\star}\right\|_{\infty}=\left\|\widehat{V}_{s, \widehat{V}^{\star}(s)}^{\star}-V_{s, u}^{\star}\right\|_{\infty} \leq\left|\widehat{V}^{\star}(s)-u\right| .
$$
This holds for every $u \in U_s$, so we may take the best possible choice of $u$ (the minimum over $U_s$), which completes the proof of the first claim. The proof of the second claim is similar.
Then using Lemma \ref{agarwalu} gives the final concentration theorem.
\end{proof}

% Given an  empirical MDP $M$ with transition probability $P$ and reward $R$, for any state $s$, they construct a modified 
% MDP $M_{s,u}$ for which the state $s$ is absorbing and associated to a reward $u$. 
% They apply this construction to the empirical MDP $\hat{M}$ based on the empirical transition probability $\hat{P}$.
% One verifies easily that for any policy $\pi$ $Q_{\hat{M}_{s,u}}^{\pi}(s,a)$ is independent of $\hat{P}_{s,a}$ by construction. 
% They use this to obtain a Bernstein type inequality for $
% (P_{s,a} - \hat{P}_{s,a}) Q_{\hat{M}_{s,u}}^{\pi}(s,a)$ for any $u$ in a finite set. The idea is then to show that 
% this set always contains a value close to $Q^{\pi}(s,a)$ which combined to a stability property of $Q_{M_{s,u}}$ 
% $$
% |Q_{M_{s,u}}^{\pi}(s,a) - Q_{M_{s,u'}}^{\pi}(s,a)| \leq \frac{2}{1-\gamma} |u-u'|
% $$
% and its consequence
% $$
% |\sup_{\pi} Q_{M_{s,u}}^{\pi}(s,a) - \sup_{\pi} Q_{M_{s,u'}}^{\pi}(s,a)|
% = |Q_{M_{s,u}}^{*}(s,a) - Q_{M_{s,u'}}^{*}(s,a)| 
% \leq \frac{2}{1-\gamma} |u-u'|
% $$
% for any MDP yields the bound of Lemma~\ref{agarwal}.

% To obtain the result in our robust MDP setting, we need a similar stability property 
% for the robust value functions. It turns out that this a direct consequence of the property for classical MDP:
% \begin{align*}
% &|Q_{M_{s,u}}^{\pi}(s,a) - Q_{M_{s,u'}}^{\pi}(s,a)| \leq \frac{2}{1-\gamma} |u-u'|\\
% \Rightarrow &|\inf_{M} Q_{M_{s,u}}^{\pi}(s,a) - \inf_{M} Q_{M_{s,u}}^{\pi}(s,a)| \leq \frac{2}{1-\gamma} |u-u'|\\
% \Rightarrow &|\sup_{\pi} \inf_{M} Q_{M_{s,u}}^{\pi}(s,a) - \sup_{\pi}\inf_{M} Q_{M_{s,u}}^{\pi}(s,a)| \leq \frac{2}{1-\gamma} |u-u'|.
% \end{align*}
\end{proof}


\begin{lemma}[\citet{gheshlaghi2013minimax}, Lemma 7]
\label{variance}
 This is an adaptation of \citet{gheshlaghi2013minimax} to RMDPs. For any policy $\pi$,
$$
\left\|\left(I-\gamma P_0^\pi\right)^{-1} \sqrt{\operatorname{Var}_{P_0}\left(V^\pi\right)}\right\|_{\infty} \leq \sqrt{\frac{2}{(1-\gamma)^3}},
$$
where $P_0$ is the nominal transition model of $M$. 

\end{lemma}
\begin{proof}
    This proof is exactly the same for Robust and non robust MDPs, as it uses only standard computations such as the Jensen inequality and no robust form which are specific to this problem. The main difference is that we are doing the proof on the nominal of our robust set $P_0$, considering the regularized robust Bellman operator and associated regularized reward functions. 

    \citet{gheshlaghi2013minimax}  introduce the variance of the sum of discounted rewards starting at state-action $(s,a)$,
    $$\Sigma^\pi(s,a):= \mathbb{E}[|\sum_{t \geq 0} \gamma^t R_0(s_t,a_t)-Q^\pi(s,a)|^2 \vert s_0=s, a_0=a],$$
    and we defined the same variance for robust MDPs using robust rewards $\rsapi$ and $\rspi$ and using robust Q-function instead of classical Q-function in the definition of $\Sigma$.
    %
    Then, in their Lemma 6 they show that, for any $\pi$:
    $$\Sigma^\pi=\operatorname{Var}_{P_0}\left(V^\pi\right) +\gamma^2 P_0^\pi \Sigma^\pi,$$
    which is, in fact, a Bellman equation for the variance. The proof is exactly the same for RMDPs considering our robust reward $\rsapi$ or $\rspi$ and not classical $R_0$. Note that this is thanks to the regularised form of robust RMDPs.
    %
    Finally, Lemma \ref{variance} is the same as their Lemma 7 considering robust rewards. This lemma is usually called the total variance lemma. This completes the proof.
\end{proof}

\begin{lemma}
The following upper bound holds with probability $1-\delta$:
\label{fin2}
\begin{equation}
    \left\|Q^{\widehat{\pi}}-\widehat{Q}^{\widehat{\pi}}\right\|_{\infty}<\left(C_{N}+C_\beta\right) \norminf{\Qpihat- \Qhatpihat} +\gamma 4\sqrt{\frac{ L}{N(1-\gamma)^3}}+\frac{\gamma \Delta_{\delta, N}^{\prime}}{1-\gamma}+\frac{\gamma \epsilon_{\mathrm{opt}}}{1-\gamma}\left(2+\sqrt{\frac{8 L}{N}}\right)
\end{equation}
with $C_{N}=\frac{\gamma}{1-\gamma} \sqrt{\frac{8 L}{N}}$ and $C_\beta=\frac{2\gamma \beta \Snorm^{1/q} }{1-\gamma}   $.
\end{lemma}
\begin{proof}
$$
\begin{aligned}
&\left\|Q^{\widehat{\pi}}- \widehat{Q}^{\widehat{\pi}}\right\|_{\infty}\\
&\stackrel{(a)}{\leq}  \gamma\left\|\left(I-\gamma \Pzero^{\widehat{\pi}}\right)^{-1}(\Pzero-\widehat{P}) \widehat{V}^{\widehat{\pi}} \right\|_{\infty} +\frac{2\gamma \beta \Snorm^{1/q}  }{1-\gamma}   \norminf{\Qpihat- \Qhatpihat}
\\
&\stackrel{(b)}{\leq} \gamma\left\|\left(I-\gamma \Pzero^{\widehat{\pi}}\right)^{-1}(\Pzero-\widehat{P}) \widehat{V}^{\star}\right\|_{\infty}+\gamma\left\|\left(I-\gamma \Pzero^{\widehat{\pi}}\right)^{-1}(\Pzero-\widehat{P})\left(\widehat{V}^{\widehat{\pi}}-\widehat{V}^{\star}\right)\right\|_{\infty}+\frac{2\gamma \beta \Snorm^{1/q} }{1-\gamma}   \norminf{\Qpihat- \Qhatpihat} 
\\
&\stackrel{(c)}{\leq} \gamma\left\|\left(I-\gamma \Pzero^{\widehat{\pi}}\right)^{-1}(\Pzero-\widehat{P}) \widehat{V}^{\star}\right\|_{\infty}+\frac{2\gamma \epsilon_{\mathrm{opt}}}{1-\gamma} + \frac{2\gamma \beta \Snorm^{1/q}  }{1-\gamma}   \norminf{\Qpihat- \Qhatpihat}
\\
&\stackrel{(d)}{\leq} \gamma\left\|\left(I-\gamma \Pzero^{\widehat{\pi}}\right)^{-1}\left|(\Pzero-\widehat{P}) \widehat{V}^{\star}\right|\right\|_{\infty}+\frac{2\gamma \epsilon_{\mathrm{opt}}}{1-\gamma} +\frac{2\gamma \beta \Snorm^{1/q} }{1-\gamma}   \norminf{\Qpihat- \Qhatpihat}
\\
&\stackrel{(e)}{\leq} \gamma \sqrt{\frac{8 L}{N}}\left\|\left(I-\gamma \Pzero^{\widehat{\pi}}\right)^{-1} \sqrt{\operatorname{Var}_{\Pzero}\left(\widehat{V}^{\star}\right)}\right\|_{\infty}+\frac{\gamma \Delta_{\delta, N}^{\prime}}{1-\gamma}+\frac{2\gamma \epsilon_{\mathrm{opt}}}{1-\gamma}+ \frac{2\gamma \beta \Snorm^{1/q}  }{1-\gamma}   \norminf{\Qpihat- \Qhatpihat} 
\\
&\stackrel{(f)}{\leq} \gamma \sqrt{\frac{8 L}{N}}\left\|\left(I-\gamma \Pzero^{\widehat{\pi}}\right)^{-1}\left(\sqrt{\operatorname{Var}_{\Pzero}\left(V^{\widehat{\pi}}\right)}+\sqrt{\operatorname{Var}_{\Pzero}\left(V^{\widehat{\pi}}-\widehat{V}^{\widehat{\pi}}\right)}+\sqrt{\operatorname{Var}_{\Pzero}\left(\widehat{V}^{\widehat{\pi}}-\widehat{V}^{\star}\right)}\right)\right\|_{\infty}
\\
&\quad+\frac{\gamma \Delta_{\delta, N}^{\prime}}{1-\gamma}+\frac{2\gamma \epsilon_{\mathrm{opt}}}{1-\gamma} +\frac{2\gamma \beta \Snorm^{1/q} }{1-\gamma}   \norminf{\Qpihat- \Qhatpihat}
\\
&\stackrel{(g)}{\leq} \gamma \sqrt{\frac{8 L}{N}}\left(\sqrt{\frac{2}{(1-\gamma)^3}}+\frac{\sqrt{\left\|V^{\widehat{\pi}}-\widehat{V}^{\widehat{\pi}}\right\|_{\infty}^2}}{1-\gamma}+\frac{\epsilon_{\mathrm{opt}}}{1-\gamma}\right)+\frac{\gamma \Delta_{\delta, N}^{\prime}}{1-\gamma}+\frac{2\gamma \epsilon_{\mathrm{opt}}}{1-\gamma}+\frac{2\gamma \beta \Snorm^{1/q} }{1-\gamma}   \norminf{\Qpihat- \Qhatpihat} 
\\
&\stackrel{(h)}{\leq} \gamma \sqrt{\frac{8 L}{N}}\left(\sqrt{\frac{2}{(1-\gamma)^3}}+\frac{\left\|Q^{\widehat{\pi}}-\widehat{Q}^{\widehat{\pi}}\right\|_{\infty}}{1-\gamma}+\frac{\epsilon_{\mathrm{opt}}}{1-\gamma}\right)+\frac{\gamma \Delta_{\delta, N}^{\prime}}{1-\gamma}+\frac{2\gamma \epsilon_{\mathrm{opt}}}{1-\gamma}+\frac{2\gamma \beta \Snorm^{1/q} }{1-\gamma}   \norminf{\Qpihat- \Qhatpihat} 
\\
&=\gamma \sqrt{\frac{8 L}{N}}\left(\sqrt{\frac{2}{(1-\gamma)^3}}+\frac{\left\|Q^{\widehat{\pi}}-\widehat{Q}^{\widehat{\pi}}\right\|_{\infty}}{1-\gamma}\right)+\frac{\gamma \Delta_{\delta, N}^{\prime}}{1-\gamma}+\frac{\gamma \epsilon_{\mathrm{opt}}}{1-\gamma}\left(2+\sqrt{\frac{8 L}{N}}\right) +\frac{2\gamma \beta \Snorm^{1/q}  }{1-\gamma}   \norminf{\Qpihat- \Qhatpihat}
\\
&=\left(C_{N}+C_\beta\right) \norminf{\Qpihat- \Qhatpihat} +4\gamma \sqrt{\frac{ L}{N(1-\gamma)^3}}+\frac{\gamma \Delta_{\delta, N}^{\prime}}{1-\gamma}+\frac{\gamma \epsilon_{\mathrm{opt}}}{1-\gamma}\left(2+\sqrt{\frac{8 L}{N}}\right)
\end{aligned}
$$
with $C_{N}=\frac{\gamma}{1-\gamma} \sqrt{\frac{8 L}{N}}$ and $C_\beta=\frac{2\gamma \beta \Snorm^{1/q} }{1-\gamma}   $.

We have that (a) is true by Lemma~\ref{lemma_pis}, (b) is by the triangle inequality using $ \widehat{V}^{\widehat{\pi}} = \widehat{V}^{\widehat{\pi}} +\widehat{V}^{\star}   -\widehat{V}^{\star}$, (c) is from the definition of $\epsilon_{\mathrm{opt}}$ and Eq.~\eqref{1}, (d) is by the entrywise non-negativity of the classic horizon inverse matrix, that is $(I-\gamma P)^{-1} =\sum_{t \geq 0} \gamma^t P^t \geq 0$,
    %\pierre{pas compris ta remarque \mfg{Ben que c'est pas vrai pour toute matrice, faut que le rayon spectral soit plus petit que 1 et que les éléments soient positifs (je dirais juste parce que $(I-\gamma P)^{-1} \sum \gamma^t P^t > 0$}}., 
    (e) is by Lemma \ref{agarwal}, (f) is by the triangular inequality for the variance (which is, in fact, a seminorm) and decomposing  $\widehat{V}^{\star}=\widehat{V}^{\star}+\widehat{V}^{\widehat{\pi}} -\widehat{V}^{\widehat{\pi}} +V^{\widehat{\pi}}- V^{\widehat{\pi}}$, (g) is by Lemma \ref{variance}, uses the definition of $\epsilon_{\mathrm{opt}}$ and takes the $\sup$ over $(s,a)$ of the variance in the second term, and eventually (h) is because we have that $\|V^\pi-\widehat{V}^\pi\|_{\infty} \leq\|Q^\pi-\widehat{Q}^\pi\|_{\infty}
$ for any $\pi$.

\end{proof}


\begin{lemma}
\label{fin}
The following upper bound holds
with probability $1-\delta$:
\begin{equation}
   \norminf{\Qs- \Qhatpistar}<\left(C_{N}+C_\beta\right)  \norminf{\Qs- \Qhatpistar} +\gamma 4\sqrt{\frac{ L}{N(1-\gamma)^3}}+\frac{\gamma \Delta_{\delta, N}^{\prime}}{1-\gamma}.
\end{equation}
with $C_{N}=\frac{\gamma}{1-\gamma} \sqrt{\frac{8 L}{N}}$ and $C_\beta=\frac{2\gamma \beta  \Snorm^{1/q} }{1-\gamma}   $.
\end{lemma}



\begin{proof}


$$
\begin{aligned}
 \norminf{\Qs- \Qhatpistar}
&\stackrel{(a)}{\leq} \gamma\left\|\left(I-\gamma \Pzeropistar\right)^{-1}(\Pzero-\widehat{P}) \Vhatpistar\right\|_{\infty} +\frac{2\gamma \beta \Snorm^{1/q}  }{1-\gamma}    \norminf{\Qs- \Qhatpistar} 
\\
&\stackrel{(b)}{\leq} \gamma\left\|\left(I-\gamma \Pzeropistar\right)^{-1}\left|(\Pzero-\widehat{P})  \Vhatpistar\right|\right\|_{\infty}+\frac{2\gamma \beta \Snorm^{1/q} }{1-\gamma}    \norminf{\Qs- \Qhatpistar}\\
&\stackrel{(c)}{\leq} \gamma \sqrt{\frac{8 L}{N}}\left\|\left(I-\gamma \Pzeropistar\right)^{-1} \sqrt{\operatorname{Var}_{\Pzero}\left( \Vhatpistar\right)}\right\|_{\infty}+\frac{\gamma \Delta_{\delta, N}^{\prime}}{1-\gamma}+ \frac{2\gamma \beta \Snorm^{1/q}  }{1-\gamma}    \norminf{\Qs- \Qhatpistar} 
\\
&\stackrel{(d)}{\leq} \gamma \sqrt{\frac{8 L}{N}}\left\|\left(I-\gamma \Pzeropistar\right)^{-1}\left(\sqrt{\operatorname{Var}_{\Pzero}\left(\Vs\right)}+\sqrt{\operatorname{Var}_{\Pzero}\left(\Vs-\Vhatpistar\right)}\right)\right\|_{\infty} 
\\
&\quad+\frac{\gamma \Delta_{\delta, N}^{\prime}}{1-\gamma} +\frac{2\gamma \beta  \Snorm^{1/q}}{1-\gamma}    \norminf{\Qs- \Qhatpistar}
\\
&\stackrel{(e)}{\leq} \gamma \sqrt{\frac{8 L}{N}}\left(\sqrt{\frac{2}{(1-\gamma)^3}}+\frac{\sqrt{\left\|\Vs-\Vhatpistar\right\|_{\infty}^2}}{1-\gamma}\right)+\frac{\gamma \Delta_{\delta, N}^{\prime}}{1-\gamma}+\frac{2\gamma \beta  \Snorm^{1/q}}{1-\gamma}    \norminf{\Qs- \Qhatpistar} 
\\
&\leq\gamma \sqrt{\frac{8 L}{N}}\left(\sqrt{\frac{2}{(1-\gamma)^3}}+\frac{ \norminf{\Qs- \Qhatpistar}}{1-\gamma}\right) +\frac{\gamma \Delta_{\delta, N}^{\prime}}{1-\gamma} +\frac{2\gamma \beta \Snorm^{1/q} }{1-\gamma}    \norminf{\Qs- \Qhatpistar}
\\
&=\left(C_{N}+C_\beta\right)  \norminf{\Qs- \Qhatpistar} +4\gamma \sqrt{\frac{ L}{N(1-\gamma)^3}}+\frac{\gamma \Delta_{\delta, N}^{\prime}}{1-\gamma}
\end{aligned}
$$
with $C_{N}=\frac{\gamma}{1-\gamma} \sqrt{\frac{8 L}{N}}$ and $C_\beta=\frac{2\gamma \beta \Snorm^{1/q} }{1-\gamma}   $.

We have that (a) is true by Lemma~\ref{lemma_pis}, (b) is by the entrywise non-negativity of the classic horizon inverse matrix, (c) is by Lemma~\ref{agarwal}, (d) is by the triangle inequality for the standard deviation (which is a seminorm), (e) is by Lemma~\ref{variance} and taking the $\sup$ over $(s,a)$ of the variance in the second term, and finally the last inequality holds because $
\|V^\pi-\widehat{V}^\pi\|_{\infty} \leq\|Q^\pi-\widehat{Q}^\pi\|_{\infty}
$ for any $\pi$.

\end{proof}

As the high-probability event on which the bounds involving $\Delta'_{\delta,N}$ hold is the same in Lemma~\ref{fin2} and Lemma~\ref{fin}, we can combine them to obtain the following. % Theorem.


\begin{theorem}
For $0< C_\beta \leq 1/2$ and $0 < C_N+C_\beta<1$, with probability $1-\delta$, we get:
 $$\norminf{\Qs- \Qpihat}<\frac{1}{1-(C_N+C_\beta)}\left(8 \gamma \sqrt{\frac{ L}{N(1-\gamma)^3}}+\frac{2\gamma \Delta_{\delta, N}^{\prime}}{1-\gamma}+\frac{\gamma \epsilon_{\mathrm{opt}}}{1-\gamma}\left(2+\sqrt{\frac{8 L}{N}}\right)\right) + \epsilon_{\mathrm{opt}}.
$$
\end{theorem}

\begin{proof}
    This result is obtained by combining the two previous Lemmas~\ref{fin2} and~\ref{fin} and passing the term in $\left(C_{N}+C_\beta\right)$ to the left-hand side.
\end{proof}

Note that $C_{\beta}  + C_{N} < 1$ implies
$C_{\beta} = \frac{2\gamma\beta \Snorm^{1/q}}{1-\gamma}<1$
and hence $\beta < \frac{1-\gamma}{2\gamma \Snorm^{1/q}}$.
%
Now we need to pick 
$C_N < 1 - C_\beta$. 
Let $C_N \leq 1 - C_\beta - \eta$, 
for any $0 < \eta < 1 - C_\beta$
the previous inequality becomes
$$\norminf{\Qs- \Qpihat}< \frac{8}{\eta} \gamma \sqrt{\frac{ L}{N(1-\gamma)^3}}+\frac{2\gamma \Delta_{\delta, N}^{\prime}}{\eta (1-\gamma)}+\frac{\gamma \epsilon_{\mathrm{opt}}}{\eta(1-\gamma)}\left(2+\sqrt{\frac{8 L}{N}}\right) + \epsilon_{\mathrm{opt}}.
$$

As $\Delta'_{\delta,N} =\sqrt{\frac{c L}{N}}+\frac{c L}{(1-\gamma) N}$, the term in $1/\sqrt{N}$ is given 
by
$\frac{8\gamma\sqrt{L} H^{3/2}}{\eta\sqrt{N}}
\left( 1 + 1/4 \sqrt{c/H}
\right)
$
and is smaller than $\epsilon$
whenever
$$
N \geq \frac{64 \gamma^2 L H^3(1+1/4\sqrt{c/H})^2}{\eta^2 \epsilon^2}.
$$
We will use $c<16$ and $H\geq 1$ and use the stronger constraint
$$
N \geq \frac{256 \gamma^2 L H^3}{\eta^2 \epsilon^2}.
$$
%
Along the same lines,
the term in $1/N$ is
$\frac{2\gamma c L H^2}{\eta N}$
which is smaller than $\epsilon$
whenever
$$
N \geq \frac{2 \gamma c L H^2}{\eta \epsilon}.
$$
%
Now, $C_N < 1  - \eta - C_{\beta}$ means 
$$ \frac{\gamma}{1-\gamma}  \sqrt{\frac{8L}{N}}
< 1  - \eta - C_{\beta}$$
hence
$$
N >\frac{8L\gamma^2 H^2}{(1-\eta-C_{\beta})^2}.
$$
We deduce that whenever 
\begin{align*}
    N
&\geq  
\max\left(
\frac{256\gamma^2LH^3}{\eta^2\epsilon^2}
,
\frac{2 \gamma c L H^2}{\eta \epsilon} 
,
\frac{8L\gamma^2 H^2}{(1-\eta-C_{\beta})^2}
\right)\\
&= \frac{
256\gamma^2 L H^3}
{\eta^2}
\max
\left(
\frac{1}{\epsilon^2},
\frac{c \eta }{128 H \gamma \epsilon} 
,
\frac{\eta^2}{64H(1-\eta-C_{\beta})^2}
\right)
\end{align*}
the error is smaller than $2\epsilon$ up to the $\epsilon_{\mathrm{opt}}$ terms.


This bound reduces to
$$
N \geq \frac{
C\gamma^2 L H^3
}
{\epsilon^2}
$$
with $C=256/\eta^2$
if
$$
\epsilon
\leq
\min\left(
\frac{128 H }{\eta}
,
\sqrt{64 H} \frac{1-\eta-C_\beta}{\eta}
\right).
$$
Note that $\epsilon \in [0, H)$ and $\eta <1$ so that the previous condition simplifies to
$$
\epsilon
\leq
\sqrt{64 H}\frac{1-\eta-C_\beta}{\eta} = \epsilon_0.
$$

If we want to obtain an arbitrary $\epsilon_0$, it suffices thus to take $\eta$ arbitrarily small
leading to the constant 
$C=256/\eta^2$ 
to be arbitrarily large.

Note that if $\epsilon_0 \geq O(H^{1/2+\delta})$
then $1/\eta > O(H^{\delta})$,
which adds an $H^{2\delta}$ factor to the bound on $N$.

However, for any $\kappa > 0$ and for any $C_{\beta}$, there exists an $\eta$ independent of $H$ such that
$\epsilon_0 = 8\sqrt{ H}\frac{1-\eta-C_\beta}{\eta} =\kappa \sqrt{H}$,
hence the result stated in Theorem~\ref{h3}.

Now, as $L=\log (8|\mathcal{S}||\mathcal{A}| /((1-\gamma) \delta))$,
the previous condition can be summarized by
$$
N_{\text {total }}=N|\mathcal{S}||\mathcal{A}| = \tilde{\mathcal{O}}\left(\frac{H^3|\mathcal{S}||\mathcal{A}| }{\epsilon^2}\right)
$$
provided $\epsilon < \epsilon_0$.

Finally, taking
$\beta_0 = \frac{1-\gamma}{8\gamma \Snorm^{1/q}}$, which gives $C_\beta=1/4$, and 
$\eta=1/2$ 
so that $C_N\leq 1/4$,
we obtain
$C=1024$ and
$\epsilon_0
= \sqrt{16H}$.


\section{Time Complexity}

\subsection{sa-rectangular case.}
In this section, we discuss the time complexity of our algorithm compared to non-robust algorithms and to that of~\cite{kumar2022efficient}.

\label{sec:complex}

 In the $sa$-rectangular case, the optimal Bellman operator is:
$$
(\mathcal{T}^* V)(s)=\underbrace{\max _a}_{\text {action cost }} \underbrace{\max _\alpha}_{\text {$\alpha$ opt }}   [R(s, a)-\alpha_{s, a}- \beta_{s,a}\gamma   \underbrace{\snormqbar{[V]_{\alpha}}}_{\text {reward penalty/cost }}+\gamma \underbrace{\sum_{s^{\prime}} P\left(s^{\prime} \mid s, a\right) [V]_{\alpha}\left(s^{\prime}\right)}_{\text {sweep }}] .
$$
First, the `sweep' requires $O(S)$ iterations and the `action cost' requires $O(A)$ iterations. Note that the span seminorm depends on the state and action and is computed only once per value iteration for all states. The update then requires:
$$
O( (\text{$\alpha$ opt })( S(\text { action cost })(\text { sweep cost })+\text { reward cost }))=O\left( (\text{$\alpha$ opt })(S^2 A+\text { reward cost })\right).
$$
Since the value-iteration operator is a contraction, obtaining a value that is $\epsilon$-close to the optimal one requires $O\left(\log \left(\frac{1}{\epsilon}\right)\right)$ full value updates. An additional $O\left(\log \left(\frac{1}{\epsilon}\right)\right)$ factor is required for the binary search in $\alpha$, so the complexity is:
$$
O\left(\log \left(\frac{1}{\epsilon}\right)^2\left(S^2 A+\text { reward cost }\right)\right).
$$
In non robust MDPs, the complexity is:
$$
O\left(\log \left(\frac{1}{\epsilon}\right)\left(S^2 A\right)\right)
$$
as there is no optimisation in $\alpha$ and no reward cost. The reward cost is of order $O\left(S \log \left(\frac{S}{\epsilon}\right)\right)$ according to~\cite{kumar2022efficient} in the general case, and $O(S)$ when there is an analytic form for it. So, in the $sa$-rectangular case, the complexity is:

$$O(\log \left(\frac{1}{\epsilon}\right)^2 S^2 A+S\left(\log \left(\frac{1}{\epsilon}\right)\right)^3)$$

when the span-seminorm is not known and only 

$$O(\log \left(\frac{1}{\epsilon}\right)^2 S^2 A)$$ otherwise, when there is an analytic form of the span seminorm.

\subsection{s-rectangular case}
According to Lemma 4 of~\cite{kumar2022efficient}, the complexity of the $s$-rectangular algorithm with no analytic form of the span seminorm and without optimisation in $\alpha$ is:

$$O\left(\log \left(\frac{1}{\epsilon}\right)\left(S^2 A+S A \log \left(\frac{A}{\epsilon}\right)\right)\right).$$

In our case, an additional optimisation in $\alpha$ is required, which adds a factor $O\left(\log \left(\frac{1}{\epsilon}\right)\right)$ and leads to a total cost of:

$$O\left(\log \left(\frac{1}{\epsilon}\right)^2\left(S^2 A+S A \log \left(\frac{A}{\epsilon}\right)\right)\right).$$

% \subsection{Adaptation for the formulation. of \citet{kumar2022efficient}} \label{kumar}


% The proof for the adaptation. \cite{kumar2022efficient} is the same for the proof with upper bound in $H^3$ with only adaptation of this lemma :






% \begin{lemma}[Upper bound on $\Qs-\Qhatpistar$ and on $\Qpihat-\Qhatpihat$, all Q values are now without robust under simplex constraints.]
% \label{lemma_pis}
% \begin{align*}
%  \norminf{\Qs-\Qhatpistar}\leq&   \gamma\norminf{(I-\gamma\Pzeropistar)^{-1}(\Pzero-\widehat{P}) \Vhatpistar }+ \frac{2\gamma \beta \Snorm^{1/q} }{1-\gamma}   \norminf{\Qs- \Qhatpistar} 
%  \\
%  \norminf{\Qpihat-\Qhatpihat} \leq&  \gamma \norminf{\left(I-\gamma\Pzeropihat\right)^{-1}(\Pzero-\widehat{P}) \Vhatpihat }+ \frac{2\gamma \beta\Snorm^{1/q}  }{1-\gamma}  \norminf{\Qpihat- \Qhatpihat}
% \end{align*}
% \end{lemma}

% \begin{proof}
   

% \begin{align*}
% &\Qs-\Qhatpistar\\& = \left(I-\gamma \Pzeropistar\right)^{-1} ( R_0 -\Big(\frac{\pistar_s}{\normq{\pistar_s}} \Big)^{q-1}   (\alpha_{s} - \inf _{P : \normpbar{P}\leq\beta_{s}} P \Vs) )
% \\
% &- \left(I-\gamma \Phatpistar\right)^{-1} ( R_0 -\Big(\frac{\pistar_s}{\normq{\pistar_s}} \Big)^{q-1}   (\alpha_{s} - \inf _{P: \normpbar{P}\leq\beta_{s}} P \Vhatpistar) )
% \\
% \stackrel{}{=}&  
% \left(I-\gamma \Pzeropistar\right)^{-1} ( R_0 -\Big(\frac{\pistar_s}{\normq{\pistar_s}} \Big)^{q-1}   (\alpha_{s} - \inf _{P: \normpbar{P}\leq\beta_{s}} P \Vs) )
% \\
% &-\left(I-\gamma \Pzeropistar\right)^{-1} ( R_0 -\Big(\frac{\pistar_s}{\normq{\pistar_s}} \Big)^{q-1}   (\alpha_{s} - \inf _{P: \normpbar{P}\leq\beta_{s}} P \Vhatpistar) )
% \\
% &+ \left(I-\gamma \Pzeropistar\right)^{-1} ( R_0 -\Big(\frac{\pistar_s}{\normq{\pistar_s}} \Big)^{q-1}   (\alpha_{s} - \inf _{P \ : \normpbar{P}\leq\beta_{s}} P \Vhatpistar) )
% \\
% &- \left(I-\gamma \Phatpistar\right)^{-1} ( R_0 -\Big(\frac{\pistar_s}{\normq{\pistar_s}} \Big)^{q-1}   (\alpha_{s} - \inf _{P \ : \normpbar{P}\leq\beta_{s}} P \Vhatpistar) )
%  \\
%  \stackrel{(a)}{=}& \gamma\left(I-\gamma\Pzeropistar\right)^{-1}(\Pzero-\widehat{P}) \Vhatpistar
%  +\left(I-\gamma \Pzeropistar\right)^{-1} \gamma\Big(\frac{\pistar_s}{\normq{\pistar_s}} \Big)^{q-1}  \left( \inf _{P: \normpbar{P}\leq\beta_{s}} P \Vs - \inf _{P: \normpbar{P}\leq\beta_{s}} P \Vhatpistar 
%   \right) 
%  \end{align*}
% whera in (a) we use  previous lemma \ref{lemma_simplex}.
%  \end{proof}

% Hence, taking the supremum norm $\norminf{.}$, 
%  \begin{align*}
% &\norminf{\Qs-\Qhatpistar}  \stackrel{}{=}\\
% &
% \norminf{ \gamma\left(I-\gamma\Pzeropistar\right)^{-1}(\Pzero-\widehat{P}) \Vhatpistar+\left(I-\gamma \Pzeropistar\right)^{-1} \gamma\Big(\frac{\pistar_s}{\normq{\pistar_s}} \Big)^{q-1}  \left( \inf _{P: \normpbar{P}\leq\beta_{s}} P \Vs - \inf _{P \ : \normpbar{P}\leq\beta_{s}} P \Vhatpistar 
%   \right)  }
%  \\
%  \stackrel{(b)}{\leq}& \norminf{\gamma\left(I-\gamma\Pzeropistar\right)^{-1}(\Pzero-\widehat{P}) \Vhatpistar }+ 
%  \norminf{\left(I-\gamma \Pzeropistar\right)^{-1} \gamma\Big(\frac{\pistar_s}{\normq{\pistar_s}} \Big)^{q-1}  \left( \inf _{P: \normpbar{P}\leq\beta_{s}} P \Vs - \inf _{P: \normpbar{P}\leq\beta_{s}} P \Vhatpistar 
%   \right)}
%  \\
% \stackrel{(c)}{\leq}& \norminf{  \gamma\left(I-\gamma\Pzeropistar\right)^{-1}(\Pzero-\widehat{P}) \Vhatpistar }+ \frac{\gamma  }{1-\gamma}    \mid \inf _{P : \normpbar{P}\leq\beta_{s}} P \Vs - \inf _{P: \normpbar{P}\leq\beta_{s}} P \Vhatpistar  \mid 
%   \\
%  \stackrel{(d)}{\leq}&  \norminf{ \gamma\left(I-\gamma\Pzeropistar\right)^{-1}(\Pzero-\widehat{P}) \Vhatpistar }+ \frac{\gamma  }{1-\gamma} \sup _{P: \normpbar{P}\leq\beta_{s}} P \mid \Vs- \Vhatpistar \mid
%   \\
%    \stackrel{(f)}{\leq}&  \norminf{  \gamma\left(I-\gamma\Pzeropistar\right)^{-1}(\Pzero-\widehat{P}) \Vhatpistar} - \frac{\gamma   }{1-\gamma}  \inf _{P: \normpbar{P}\leq\beta_{s}} -P \mid \Vs- \Vhatpistar \mid \\
%     \stackrel{(g)}{\leq}&  \norminf{  \gamma\left(I-\gamma\Pzeropistar\right)^{-1}(\Pzero-\widehat{P}) \Vhatpistar} + \frac{\gamma \beta \Snorm^{1/q}  }{1-\gamma}  \snormqbarpis{ \Qs -\Qhatpistar}\\
%      \stackrel{(h)}{\leq}&  \norminf{  \gamma\left(I-\gamma\Pzeropistar\right)^{-1}(\Pzero-\widehat{P}) \Vhatpistar} + \frac{2\gamma \beta \Snorm^{1/q}  }{1-\gamma}  \norminf{ \Qs -\Qhatpistar}
% \end{align*}


% where (b) is the triangular inequality, (c)  Eq.~\eqref{1},  (d) is $\left|\inf _A f-\inf _A g\right| \leq \sup _A|f-g| .$, (e)  is a relaxation (f) is relation between sup and inf, (g) is Water Pouring lemma (see Lemma 2 in  \cite{kumar2022efficient}), (h) is ineuquality for semi norms and norms \eqref{2}.





% \pierre{le theorem est bon pas la demo, voir fin du doc}
% \begin{theorem}
% \label{h3}
% Let  $\beta=\Omega(1-\gamma)$, for
% any $\kappa>0$ and
% any $\epsilon_0 \leq \kappa \sqrt{H}$
% it exists a $C>0$ independent of $H$
% and
% any $\epsilon \in (0,\epsilon_0)$,
%  whenever $N$ the number of calls to the sampler per state-action pair satisfies
% $
% N \geq 
% C 
% \frac{L\gamma^2 H^2}{\epsilon^2\beta}
% $
% where $L=\log (8|\mathcal{S}||\mathcal{A}| /((1-\gamma) \delta))$, it holds that 
% if $\widehat{\pi}$ is any $\epsilon_{\text {opt }}$-optimal policy for $\widehat{M}$, that is when $\|\widehat{Q}^{\widehat{\pi}}-\widehat{Q}^{\star}\|_{\infty} \leq \epsilon_{o p t   }$,
% then
% $
% \norminf{\Qs-\Qpihat}\leq 2\epsilon  +\frac{8\epsilon_\text{opt}}{1-\gamma}
% %+\frac{9 \epsilon_{o p t   }}{1-\gamma}
% $
% with probability at least $1-\delta$.

% This implies that $N_{\text {total }}=N|\mathcal{S}||\mathcal{A}|$ as an overall sample complexity $\displaystyle \tilde{\mathcal{O}}\left(   \frac{H^2\Snorm\Anorm}{\epsilon^2 \beta}    \right)$
% for any $\epsilon<\epsilon_0$.
% \end{theorem}
% \begin{proof}
%   The factor  $C=\frac{1}{1-(C_N+C_\beta)}$  need to be control for different regime.
%   In the assymptotic regime for N, the  term $C_N=\gamma/(1-\gamma) \sqrt{8L/N}$ small compare to  other term of the denominator of $C$. Then 
%   \begin{equation*}
%        C=\frac{1}{1-(C_N+C_\beta)} \rightarrow _{\infty}   \frac{1}{1-C_\beta}=\frac{1-\gamma}{(1-\gamma)-2\gamma\beta}
%   \end{equation*}
%   In the regime when $\beta=O(1-\gamma)$ , we have $ \beta<C_1(1-\gamma) $. Then,
%   \begin{equation*}
%        C=\frac{1-\gamma}{(1-\gamma)-2\gamma\beta}<\frac{1}{1-2 C_1 \gamma   }=C_2
%   \end{equation*}
% So this term is constant regarding $(1-\gamma),N$ and $\beta$ in this regime.


% \vspace{0.3cm}

% In the other regime, where we have also $(1-\gamma)=O(\beta)$ ie $\beta=\Omega(1-\gamma)$ and $C_1(1-\gamma)\leq\beta  \leq C_3 (1-\gamma)$ for $C_1, C_3>0$.
% We have that :

% \begin{equation*}
%    C= \frac{1-\gamma}{(1-\gamma)-2\gamma\beta} \stackrel{(a)}{\leq} \frac{C_1(1-\gamma)}{2\gamma\beta}
% \end{equation*}
%   where in (a) we use that $(1-\gamma) C_1\geq \beta$ which is verified for $C_1=1/(4\gamma)$. Then, plugging the previous theorem becomes 

%   $$\norminf{\Qs- \Qpihat}< \frac{C_1(1-\gamma)}{2\gamma\beta}\Bigg( 8\gamma \sqrt{\frac{ L}{N(1-\gamma)^3}}+\frac{2\gamma \Delta_{\delta, N}^{\prime}}{ (1-\gamma)}+\frac{\gamma \epsilon_{\mathrm{opt}}}{(1-\gamma)}\left(2+\sqrt{\frac{8 L}{N}}\right) \Bigg)+ \epsilon_{\mathrm{opt}}.
% $$
% The term that gives the bigger condition on $N$ is the first term of this sum. The conditions becomes

% \begin{equation}
% \frac{C_5}{H\beta }.\sqrt{\frac{LH^3}{N}}\leq \epsilon
% \end{equation}
% which results in the condition 
% \begin{equation}
% \label{final_bound}
%     N\geq   \frac{H}{\beta^2\epsilon^2C_5^2}. 
% \end{equation}
% Using that $\beta H<C_3$. However, a lower bound exists in this regime that state that $N\leq  H^2/(\beta\epsilon^2 )$ . Equation \ref{final_bound} becomes $N> \frac{H^2}{\beta \epsilon^2   }$ in this regimes, which gives an overall complexity of

% $$
% N_{\text {total }}=N|\mathcal{S}||\mathcal{A}| = \tilde{\mathcal{O}}\left(\frac{H^2|\mathcal{S}||\mathcal{A}| }{\epsilon^2 \beta}\right)
% $$
% Discussion on the range is valid similar to previous proof.
% \end{proof}




% \section{A correction to be tight for rectangular and Lp norms}



% \begin{lemma}[\citet{shi2023curious} that is tighter than \citet{gheshlaghi2013minimax} for RMDPS]
% \label{laixi}
%   For any policy $\pi$,
% $$
% \left(I-\gamma P_0^\pi\right)^{-1} \sqrt{\operatorname{Var}_{P_0}\left(V^\pi\right)} \leq \sqrt{\frac{16\snorminf{V^\pi}}{(1-\gamma)^2\gamma^2}}\stackrel{(a)}{\leq} \sqrt{\frac{16}{(1-\gamma)^2\gamma^3\max \{1-\gamma, \beta\}}} 
% $$

% where $P_0$ is the nominal transition model of $M$. In (a) we use Lemma\ref{lemma_tight}. This is tighter than \citet{gheshlaghi2013minimax}  in a sense that $\snorminf{V^\pi}<\norminf{V^\pi}<1/(1-\gamma) $.
 
% \end{lemma}


% \begin{lemma}
% \label{lemma_tight}
% For every $Lp$ ball, such as $V^\pi$ is the robust value function associated, we have for $s$ and $sa$ rectangular case:
%     \begin{equation}
%         \snorminf{V^\pi}\leq \frac{1}{\gamma \max \{1-\gamma, \beta\}}
%     \end{equation}
% \end{lemma}
%  with $ \snorminf{V^\pi}=\frac{\max _s V^\pi(s)- \min _s V^\pi(s)}{2}$. 
% \begin{proof}
% First using Bellman recursion,
%     \begin{align*}
%         \max _s  V^\pi(s) =  \max _s \mathbb{E} _{a\sim \pi} [ r(s,a) + \gamma  \kappa_{\mathcal{P}_{s} }(V^\pi) ]  \leq \max _{s,a} 1+ \gamma \kappa_{\mathcal{P}_{s} }(V^\pi)
%     \end{align*}
%     where we use the fact that the reward function is bounded by 1. We are doing the proof for $s$ rectangular but is is the same for $sa$, replacing $P_{s}(a,.)$ by $P_{s,a}(.)$.
% Let call $P_{s}(a,.)$ then minimum for the s rectangular case. Then, for any $s,a \in \mathcal{S} \times \mathcal{A}$, there exists some $\widetilde{P}_{s, a} \in \mathbb{R}^S$ constructed by reducing the values of some elements of $P_{s, a}$ to obey $P_{s}(a,.) \geq \widetilde{P}_{s}(a,.) \geq 0$ and $\sum_{s^{\prime},a}\left(P_{s }\left(s^{\prime},a\right)-\widetilde{P}_{s}\left(s^{\prime},a\right)\right)^p=(\beta/2)^p$ where $\beta$ is the radius of the ball. 


% %\pierre{je normalise par 2 mes normes a la TV? Non avec beta sur 2 ça marche}
% This implies $\widetilde{P}_{s}(a,.)+\beta e_{s_0}^{\top} \in \mathcal{P}_s$, where $e_{s_0}$ is the standard basis vector supported on $s_0$, since $\left\|\widetilde{P}_{s}+(\beta/2) e_{s_0}^{\top}-P_{s}\right\|_p \leq \left\|\widetilde{P}_{s}-P_{s}\right\|_p+\frac{\beta}{2}=\beta$. Consequently,
% $$
% \begin{aligned}
% \kappa_{\mathcal{P}_{s} }(V^\pi) \leq\left(\widetilde{P}_{s}+(\beta/2) e_{s_0}^{\top}\right) V^{\pi} & \leq\left\|\widetilde{P}_{s}\right\|_1\left\|V^{\pi}\right\|_{\infty}+\sigma V^{\pi}\left(s_0\right) \\
% & \leq(1-\beta/2) \max _{s \in \mathcal{S}} V^{\pi }(s)+(\beta/2) \min _{s \in \mathcal{S}} V^{\pi}(s)
% \end{aligned}
% $$
% where the second inequality holds by 
% \begin{align*}
%     \left\|\widetilde{P}_{s}\right\|_1=\sum_{s^{\prime},a} \widetilde{P}_{s}\left(s^{\prime},a\right) &=-\sum_{s^{\prime},a}\left(P_{s}\left(s^{\prime},a\right)-\widetilde{P}_{s}\left(s^{\prime},a\right)\right)+\sum_{s^{\prime},a} P_{s}\left(s^{\prime},a\right)\\
%     &=1- \norm{P_s -\widetilde{P}_{s}}_1\leq 1- \norm{P_s -\widetilde{P}_{s}}_p=1-\beta/2
% \end{align*}
% Plugging this inequality in the first one, we obtain 
% \begin{align*}
%     \max _s V^\pi \leq 1 + (1-\beta/2)\gamma \max _s V^\pi(s) + \gamma (\beta/2) \min _s V^\pi(s)
% \end{align*}
% Finally: 
%     \begin{align*}
%         \max _s  V^\pi(s) \leq& \frac{1+\gamma (\beta/2) \min _s  V^\pi(s)}{1-\gamma(1-\beta/2) }\leq \frac{2}{\gamma \max \{1-\gamma, \beta\}} +  \min _s  V^\pi(s) \\
%         \iff& \snorminf{V^\pi} \leq  \frac{1}{\gamma \max \{1-\gamma, \beta\}}.
%     \end{align*}
% \end{proof}


% \begin{lemma}
% The following upper bound holds with probability $1-\delta$:
% \label{fin2}
% \begin{equation}
%     \left\|Q^{\widehat{\pi}}-\widehat{Q}^{\widehat{\pi}}\right\|_{\infty}<\left(C_{N}+C_\beta\right) \norminf{\Qpihat- \Qhatpihat} +8\sqrt{2}\sqrt{\frac{ L}{N(1-\gamma)^2\max \{1-\gamma, \beta\} \gamma}}+\frac{\gamma \Delta_{\delta, N}^{\prime}}{1-\gamma}+\frac{\gamma \epsilon_{\mathrm{opt}}}{1-\gamma}\left(2+\sqrt{\frac{8 L}{N}}\right)
% \end{equation}
% with $C_{N}=\frac{\gamma}{1-\gamma} \sqrt{\frac{8 L}{N}}$ and $C_\beta=\frac{2\gamma \beta  }{1-\gamma}   $.
% \end{lemma}
% \begin{proof}
% $$
% \begin{aligned}
% &\left\|Q^{\widehat{\pi}}- \widehat{Q}^{\widehat{\pi}}\right\|_{\infty}\\
% &\stackrel{(a)}{\leq}  \gamma\left\|\left(I-\gamma \Pzero^{\widehat{\pi}}\right)^{-1}(\Pzero-\widehat{P}) \widehat{V}^{\widehat{\pi}} \right\|_{\infty} +\frac{2\gamma \beta  }{1-\gamma}   \norminf{\Qpihat- \Qhatpihat}
% \\
% &\stackrel{(b)}{\leq} \gamma\left\|\left(I-\gamma P^{\widehat{\pi}}\right)^{-1}(\Pzero-\widehat{P}) \widehat{V}^{\star}\right\|_{\infty}+\gamma\left\|\left(I-\gamma \Pzero^\pi\right)^{-1}(\Pzero-\widehat{P})\left(\widehat{V}^{\widehat{\pi}}-\widehat{V}^{\star}\right)\right\|_{\infty}+\frac{2\gamma \beta  }{1-\gamma}   \norminf{\Qpihat- \Qhatpihat} 
% \\
% &\stackrel{(c)}{\leq} \gamma\left\|\left(I-\gamma \Pzero^{\widehat{\pi}}\right)^{-1}(\Pzero-\widehat{P}) \widehat{V}^{\star}\right\|_{\infty}+\frac{2\gamma \epsilon_{\mathrm{opt}}}{1-\gamma} + \frac{2\gamma \beta  }{1-\gamma}   \norminf{\Qpihat- \Qhatpihat}
% \\
% &\stackrel{(d)}{\leq} \gamma\left\|\left(I-\gamma \Pzero^{\widehat{\pi}}\right)^{-1}\left|(\Pzero-\widehat{P}) \widehat{V}^{\star}\right|\right\|_{\infty}+\frac{2\gamma \epsilon_{\mathrm{opt}}}{1-\gamma} +\frac{2\gamma \beta  }{1-\gamma}   \norminf{\Qpihat- \Qhatpihat}
% \\
% &\stackrel{(e)}{\leq} \gamma \sqrt{\frac{8 L}{N}}\left\|\left(I-\gamma \Pzero^{\widehat{\pi}}\right)^{-1} \sqrt{\operatorname{Var}_{\Pzero}\left(\widehat{V}^{\star}\right)}\right\|_{\infty}+2\frac{\gamma \Delta_{\delta, N}^{\prime}}{1-\gamma}+\frac{2\gamma \epsilon_{\mathrm{opt}}}{1-\gamma}+ \frac{2\gamma \beta  }{1-\gamma}   \norminf{\Qpihat- \Qhatpihat} 
% \\
% &\stackrel{(f)}{\leq} \gamma \sqrt{\frac{8 L}{N}}\left\|\left(I-\gamma \Pzero^{\widehat{\pi}}\right)^{-1}\left(\sqrt{\operatorname{Var}_{\Pzero}\left(V^{\widehat{\pi}}\right)}+\sqrt{\operatorname{Var}_{\Pzero}\left(V^{\widehat{\pi}}-\widehat{V}^{\widehat{\pi}}\right)}+\sqrt{\operatorname{Var}_{\Pzero}\left(\widehat{V}^{\widehat{\pi}}-\widehat{V}^{\star}\right)}\right)\right\|_{\infty}
% \\
% &\quad+\frac{\gamma \Delta_{\delta, N}^{\prime}}{1-\gamma}+\frac{2\gamma \epsilon_{\mathrm{opt}}}{1-\gamma} +\frac{2\gamma \beta  }{1-\gamma}   \norminf{\Qpihat- \Qhatpihat}
% \\
% &\stackrel{(g)}{\leq} \gamma \sqrt{\frac{8 L}{N}}\left(\sqrt{\frac{16}{(1-\gamma)^2\gamma^3\max \{1-\gamma, \beta\}}} +\frac{\sqrt{\left\|V^{\widehat{\pi}}-\widehat{V}^{\widehat{\pi}}\right\|_{\infty}^2}}{1-\gamma}+\frac{2\epsilon_{\mathrm{opt}}}{1-\gamma}\right)+\frac{\gamma \Delta_{\delta, N}^{\prime}}{1-\gamma}+\frac{2\gamma \epsilon_{\mathrm{opt}}}{1-\gamma}+\frac{2\gamma \beta  }{1-\gamma}   \norminf{\Qpihat- \Qhatpihat} 
% \\
% &\stackrel{(h)}{\leq} \gamma \sqrt{\frac{8 L}{N}}\left( \sqrt{\frac{16}{(1-\gamma)^2\gamma^3\max \{1-\gamma, \beta\}}} +\frac{\left\|Q^{\widehat{\pi}}-\widehat{Q}^{\widehat{\pi}}\right\|_{\infty}}{1-\gamma}+\frac{2\epsilon_{\mathrm{opt}}}{1-\gamma}\right)+\frac{\gamma \Delta_{\delta, N}^{\prime}}{1-\gamma}+\frac{2\gamma \epsilon_{\mathrm{opt}}}{1-\gamma}+\frac{2\gamma \beta  }{1-\gamma}   \norminf{\Qpihat- \Qhatpihat} 
% \\
% &=\gamma \sqrt{\frac{8 L}{N}}\left( \sqrt{\frac{16}{(1-\gamma)^2\gamma^3\max \{1-\gamma, \beta\}}} +\frac{\left\|Q^{\widehat{\pi}}-\widehat{Q}^{\widehat{\pi}}\right\|_{\infty}}{1-\gamma}\right)+\frac{\gamma \Delta_{\delta, N}^{\prime}}{1-\gamma}+\frac{\gamma \epsilon_{\mathrm{opt}}}{1-\gamma}\left(2+\sqrt{\frac{8 L}{N}}\right) +\frac{2\gamma \beta  }{1-\gamma}   \norminf{\Qpihat- \Qhatpihat}
% \\
% &=\left(C_{N}+C_\beta\right) \norminf{\Qpihat- \Qhatpihat} +8\sqrt{2}\sqrt{\frac{ L}{N(1-\gamma)^2\max \{1-\gamma, \beta\} \gamma}}+\frac{\gamma \Delta_{\delta, N}^{\prime}}{1-\gamma}+\frac{\gamma \epsilon_{\mathrm{opt}}}{1-\gamma}\left(2+\sqrt{\frac{8 L}{N}}\right)
% \end{aligned}
% $$
% with $C_{N}=\frac{\gamma}{1-\gamma} \sqrt{\frac{8 L}{N}}$ and $C_\beta=\frac{2\gamma \beta  }{1-\gamma}   $.

% We have that (a) is true by Lemma \ref{lemma pi hat}, (b) is by the triangle inequality using $ \widehat{V}^{\widehat{\pi}} = \widehat{V}^{\widehat{\pi}} +\widehat{V}^{\star}   -\widehat{V}^{\star}$, (c) is from the definition of $\epsilon_{\mathrm{opt}}$ and Eq.~\eqref{1}, (d) is by the entrywise nonnegativity of the classic horizon inverse matrix, that is $(I-\gamma P)^{-1} =\sum_{t\geq 0} \gamma^t P^t \geq 0$,
%     %\pierre{pas compris ta remarque \mfg{Ben que c'est pas vrai pour toute matrice, faut que le rayon spectral soit plus petit que 1 et que les éléments soient positifs (je dirais juste parce que $(I-\gamma P)^{-1} \sum \gamma^t P^t > 0$}}., 
%     (e) is by Lemma \ref{laixi}, (f) is by the triangle inequality for the square root of the variance (which is, in fact, a seminorm) and decomposing  $\widehat{V}^{\star}=\widehat{V}^{\star}+\widehat{V}^{\widehat{\pi}} -\widehat{V}^{\widehat{\pi}} +V^{\widehat{\pi}}- V^{\widehat{\pi}}$, (g) is by Lemma \ref{variance}, uses the definition of $\epsilon_{\mathrm{opt}}$ and takes the $\sup$ over $(s,a)$ of the variance in the second term, and finally (h) is because we have that $\|V^\pi-\widehat{V}^\pi\|_{\infty} \leq\|Q^\pi-\widehat{Q}^\pi\|_{\infty}
% $ for any $\pi$.

% \end{proof}



% \begin{lemma}
% \label{fin}
% The following upper bound holds
% with probability $1-\delta$:
% \begin{equation}
%    \norminf{\Qs- \Qhatpistar}<\left(C_{N}+C_\beta\right)  \norminf{\Qs- \Qhatpistar} +8\sqrt{2}\sqrt{\frac{ L}{N(1-\gamma)^2\max \{1-\gamma, \beta\} \gamma}}+\frac{\gamma \Delta_{\delta, N}^{\prime}}{1-\gamma}.
% \end{equation}
% with $C_{N}=\frac{\gamma}{1-\gamma} \sqrt{\frac{8 L}{N}}$ and $C_\beta=\frac{2\gamma \beta  }{1-\gamma}   $.
% \end{lemma}



% \begin{proof}


% $$
% \begin{aligned}
%  \norminf{\Qs- \Qhatpistar}
% &\stackrel{(a)}{\leq} \gamma\left\|\left(I-\gamma \Pzeropistar\right)^{-1}(\Pzero-\widehat{P}) \Vhatpistar\right\|_{\infty} +\frac{2\gamma \beta  }{1-\gamma}    \norminf{\Qs- \Qhatpistar} 
% \\
% &\stackrel{(b)}{\leq} \gamma\left\|\left(I-\gamma \Pzeropistar\right)^{-1}\left|(\Pzero-\widehat{P})  \Vhatpistar\right|\right\|_{\infty}+\frac{2\gamma \beta  }{1-\gamma}    \norminf{\Qs- \Qhatpistar}\\
% &\stackrel{(c)}{\leq} \gamma \sqrt{\frac{8 L}{N}}\left\|\left(I-\gamma \Pzeropistar\right)^{-1} \sqrt{\operatorname{Var}_{\Pzero}\left( \Vhatpistar\right)}\right\|_{\infty}+2\frac{\gamma \Delta_{\delta, N}^{\prime}}{1-\gamma}+ \frac{2\gamma \beta  }{1-\gamma}    \norminf{\Qs- \Qhatpistar} 
% \\
% &\stackrel{(d)}{\leq} \gamma \sqrt{\frac{8 L}{N}}\left\|\left(I-\gamma \Pzeropistar\right)^{-1}\left(\sqrt{\operatorname{Var}_{\Pzero}\left(\Vs\right)}+\sqrt{\operatorname{Var}_{\Pzero}\left(\Vs-\Vhatpistar\right)}\right)\right\|_{\infty} 
% \\
% &\quad+\frac{\gamma \Delta_{\delta, N}^{\prime}}{1-\gamma} +\frac{2\gamma \beta  }{1-\gamma}    \norminf{\Qs- \Qhatpistar}
% \\
% &\stackrel{(e)}{\leq} \gamma \sqrt{\frac{8 L}{N}}\left(\sqrt{\frac{16}{(1-\gamma)^2\gamma^3\max \{1-\gamma, \beta\}}} +\frac{\sqrt{\left\|\Vs-\Vhatpistar\right\|_{\infty}^2}}{1-\gamma}\right)+\frac{\gamma \Delta_{\delta, N}^{\prime}}{1-\gamma}+\frac{2\gamma \beta  }{1-\gamma}    \norminf{\Qs- \Qhatpistar} 
% \\
% &\leq\gamma \sqrt{\frac{8 L}{N}}\left(\sqrt{\frac{16}{(1-\gamma)^2\gamma^3\max \{1-\gamma, \beta\}}} +\frac{ \norminf{\Qs- \Qhatpistar}}{1-\gamma}\right) +\frac{\gamma \Delta_{\delta, N}^{\prime}}{1-\gamma} +\frac{2\gamma \beta  }{1-\gamma}    \norminf{\Qs- \Qhatpistar}
% \\
% &=\left(C_{N}+C_\beta\right)  \norminf{\Qs- \Qhatpistar} +8\sqrt{2}\sqrt{\frac{L}{N(1-\gamma)^2\gamma\max \{1-\gamma, \beta\}}} +\frac{\gamma \Delta_{\delta, N}^{\prime}}{1-\gamma}
% \end{aligned}
% $$
% with $C_{N}=\frac{\gamma}{1-\gamma} \sqrt{\frac{8 L}{N}}$ and $C_\beta=\frac{2\gamma \beta  }{1-\gamma}   $.

% We have that (a) is true by Lemma \ref{lemma_pis}, (b) is by the positivity of the classic horizon inverse matrix, (c) is by Lemma~\ref{agarwal}, (d) is by the triangle inequality for the square root of the variance (which is a seminorm), (e) is by Lemma \ref{variance} and taking the $\sup$ over $(s,a)$ of the variance in the second term, and finally the last inequality holds because $
% \|V^\pi-\widehat{V}^\pi\|_{\infty} \leq\|Q^\pi-\widehat{Q}^\pi\|_{\infty}
% $ for any $\pi$.

% \end{proof}

% \begin{theorem}
% For $0< C_\beta \leq 1/2$ and $0 < C_N+C_\beta<1$, with probability $1-\delta$, we get:
%  $$\norminf{\Qs- \Qpihat}<\frac{1}{1-(C_N+C_\beta)}\left(16\sqrt{2}  \sqrt{\frac{ L}{N(1-\gamma)^2\gamma \max \{1-\gamma, \beta\}     }}+\frac{2\gamma \Delta_{\delta, N}^{\prime}}{1-\gamma}+\frac{\gamma \epsilon_{\mathrm{opt}}}{1-\gamma}\left(2+\sqrt{\frac{8 L}{N}}\right)\right) + \epsilon_{\mathrm{opt}}.
% $$
% \end{theorem}

% \begin{proof}
%     This result is obtained by combining the two previous Lemmas~\ref{fin2} and~\ref{fin} and passing the term in $\left(C_{N}+C_\beta\right)$ to the left-hand side.
% \end{proof}

% Note that $C_{\beta}  + C_{N} < 1$ implies
% $C_{\beta} = \frac{2\gamma\beta}{1-\gamma}<1$
% and hence $\beta < \frac{1-\gamma}{2\gamma}$.
% %
% Now we need to pick 
% $C_N < 1 - C_\beta$. 
% Let $C_N \leq 1 - C_\beta - \eta$, 
% for any $0 < \eta < 1 - C_\beta$
% the previous inequality becomes
% $$\norminf{\Qs- \Qpihat}< \frac{16\sqrt{2}}{\eta} \sqrt{\frac{ L}{N(1-\gamma)^2 \max \{1-\gamma, \beta\} \gamma }}+\frac{2\gamma \Delta_{\delta, N}^{\prime}}{\eta (1-\gamma)}+\frac{\gamma \epsilon_{\mathrm{opt}}}{\eta(1-\gamma)}\left(2+\sqrt{\frac{8 L}{N}}\right) + \epsilon_{\mathrm{opt}}.
% $$

% As $\Delta'_{\delta,N} =\sqrt{\frac{c L}{N}}+\frac{c L}{(1-\gamma) N}$, the term in $1/\sqrt{N}$ is given 
% by
% $\frac{16\sqrt{2}\sqrt{L} H^{}}{\eta\sqrt{N\gamma\max \{1-\gamma, \beta\}}}
% \left( 1 + 1/4 \sqrt{c/H}
% \right)
% $
% and is smaller than $\epsilon$
% whenever
% $$
% N \geq \frac{512 \gamma^2 L H^2(1+1/4\sqrt{c/H})^2}{\eta^2 \epsilon^2 \max \{1-\gamma, \beta\} \gamma}.
% $$
% We will use $c<16$ and $H\geq 1$ and use the stronger constraint
% $$
% N \geq \frac{2048  L H^2}{\eta^2 \gamma \epsilon^2\max \{1-\gamma, \beta\}}.
% $$
% %
% Along the same line,
% the term in $1/N$ is
% $\frac{2\gamma c L H^2}{\eta N}$
% which is smaller than $\epsilon$
% whenever
% $$
% N \geq \frac{2 \gamma c L H^2}{\eta \epsilon}.
% $$
% %
% Now, $C_N < 1  - \eta - C_{\beta}$ means 
% $$ \frac{\gamma}{1-\gamma}  \sqrt{\frac{8L}{N}}
% < 1  - \eta - C_{\beta}$$
% hence
% $$
% N >\frac{8L\gamma^2 H^2}{(1-\eta-C_{\beta})^2}.
% $$
% We deduce that whenever 
% \begin{align*}
%     N
% &\geq  
% \max\left(
% \frac{2048  L H^2}{\eta^2 \gamma \epsilon^2\max \{1-\gamma, \beta\}}
% ,
% \frac{2 \gamma c L H^2}{\eta \epsilon} 
% ,
% \frac{8L\gamma^2 H^2}{(1-\eta-C_{\beta})^2}
% \right)\\
% &=\frac{2048  L H^2}{\eta^2 \gamma \max \{1-\gamma, \beta\}}
% \max
% \left(
% \frac{1}{\epsilon^2},
% \frac{c \eta \gamma^2 \max \{1-\gamma, \beta\} }{1024   \epsilon} 
% ,
% \frac{\eta^2 \gamma^3 \max \{1-\gamma, \beta\}}{256(1-\eta-C_{\beta})^2}
% \right)
% \end{align*}
% the error is smaller than $2\epsilon$ up to the $\epsilon_{\mathrm{opt}}$ terms.


% This bound reduces to
% $$
% N \geq \frac{
% C L H^2
% }
% {\epsilon^2 \gamma \max \{1-\gamma, \beta\}}
% $$
% with $C=2048/\eta^2$
% if
% $$
% \epsilon
% \leq
% \min\left(
% \frac{1024  }{ \max \{1-\gamma, \beta\}\eta \gamma^2}
% ,
% 16  \frac{1-\eta-C_\beta}{\eta \sqrt{ \gamma^3\max \{1-\gamma, \beta\}}}
% \right).
% $$
% Note that $\epsilon \in [0, H)$ and $\eta <1$ so that the previous condition simplifies to
% $$
% \epsilon
% \leq
% 16\frac{1-\eta-C_\beta}{\eta\sqrt{\gamma^3  \max \{1-\gamma, \beta\}}} = \epsilon_0.
% $$

% If we want to obtain an arbitrary $\epsilon_0$, it suffices thus to take $\eta$ arbitrarily small
% leading to the constant 
% $C=2048/\eta^2$ 
% to be arbitrarily large.

% Note that if $\epsilon_0 \geq O(H^{1/2+\delta})$
% then $1/\eta > O(H^{\delta})$
% which adds an $H^{2\delta}$ factor to the bound on $N$.

% However, for any $\kappa \sqrt{H}$ and for any $C_{\beta}$, there exists an $\eta$ independent of $H$ so that
% $\epsilon_0 = 16\sqrt{ \frac{1}{\max \{1-\gamma, \beta\}}}\frac{1-\eta-C_\beta}{\eta \sqrt{\gamma^3}} =\kappa \sqrt{\frac{1}{\max \{1-\gamma, \beta\}}}$,
% hence the result stated in Theorem~\ref{h3}.

% Now, as $L=\log (8|\mathcal{S}||\mathcal{A}| /((1-\gamma) \delta))$,
% the previous condition can be summarized by
% $$
% N_{\text {total }}=N|\mathcal{S}||\mathcal{A}| = \tilde{\mathcal{O}}\left(\frac{H^2|\mathcal{S}||\mathcal{A}| }{\epsilon^2\max \{1-\gamma, \beta\}}\right)
% $$
% provided $\epsilon < \epsilon_0$.

% Finally, taking
% $\beta_0 = \frac{1-\gamma}{8\gamma}$ which gives $C_\beta=1/4$ and 
% $\eta=1/2$ 
% so that $C_N\leq 1/4$,
% we obtain
% $C=2048$ and
% $\epsilon_0
% = 16\sqrt{\frac{1}{\max \{1-\gamma, \beta\}}}$.


% %\newpage


% \section{Adaptation of the proof of the concentration result (Lemma~\ref{agarwal}) from \citet{agarwal2020model}}
% \label{annex_annex}




% Recall, the result we're interested in is that with probability greater than $1-\delta$, assuming that $C_\beta=\frac{2\gamma \beta}{1-\gamma} \leq  1/2 $,
% %
% \begin{align*}
% \left|(\Pzero-\widehat{P}) \widehat{V}^{\star}\right| & \leq \sqrt{\frac{8 L}{N}} \sqrt{\operatorname{Var}_{\Pzero}\left(\widehat{V}^{\star}\right)}+\Delta_{\delta, N}^{\prime} \mathbb{I} \\
% \left|(\Pzero-\widehat{P}) \widehat{V}^{\pi^{\star}}\right| & \leq \sqrt{\frac{8 L}{N}} \sqrt{\operatorname{Var}_{\Pzero}\left(\widehat{V}^{\pi^{\star}}\right)}+\Delta_{\delta, N}^{\prime} \mathbb{I}
% \\
% \text{where } \Delta_{\delta, N}^{\prime} & =\sqrt{\frac{c L}{N}}+\frac{c L}{(1-\gamma) N}.
% \end{align*}
% %
% Here, we only detail the point that differs from the concentration argument of \citet{agarwal2020model}, as they consider classic MDPs and not RMDPs. To prove this concentration, they use the so-called absorbing MDPs that we define below. In practice, we do for the nominal of our RMDPs exactly what they do for their classic MDPs. The main difference is that our proofs are for $\widehat{V}^{\star}$ and $\widehat{V}^{\pi^{\star}}$, which are robust value functions, and that we assume that our nominal is an absorbing MDP.


% For a state $s$ and a scalar $u$, define the absorbing RMDP $M_{s, u}$ as follows: $M_{s, u}$ is identical to $M$ except that state $s$ is absorbing in $M_{s, u}$, i.e. $P_{M_{s, u}}(s \mid s, a)=1$ for all $a$, and the instantaneous reward at state $s$ in $M_{s, u}$ is $(1-\gamma) u/2$; the remainder of the transition model and reward function are identical to those in $M$. In order to avoid notational clutter, we use $V_{s, u}^\pi$ to denote the robust value function $V_{M_{s, u}}^\pi$ and correspondingly for robust $Q$-function and reward and transition functions. This implies that for all policies $\pi$:
% $$
% V_{s, u}^\pi(s)=u/2,
% $$
% since $s$ is absorbing with instantaneous reward $(1-\gamma) u/2$.

% Only one part of this proof changes and it is detailed here. The sole difference with \citet{agarwal2020model} is that we need a factor $2$ in the definition for both $(1-\gamma) u/2$ and $V_{s, u}^\pi(s)=u/2$ that comes from Eq.~\eqref{adaptation} (see below).

% The proof using absorbing MDPS \cite{agarwal2020model} is exactly the same, the only difference being their lemma 9, stating that
% %
% for all states $s, u, u^{\prime} \in \mathbb{R}$, and policies $\pi$,
% $$
% \left\|Q_{s, u}^{\star}-Q_{s, u^{\prime}}^{\star}\right\|_{\infty} \leq\left|u-u^{\prime}\right| \text { and }\left\|Q_{s, u}^\pi-Q_{s, u^{\prime}}^\pi\right\|_{\infty} \leq\left|u-u^{\prime}\right|.
% $$
% We show this result for the robust case below.
% \begin{proof}
% First, as the nominal of our robust MDPs is absorbing, we observe
% $$
% \left\|R_{0,s, u}-R_{0,s, u^{\prime}}\right\|_{\infty}=(1-\gamma)\left|u-u^{\prime}\right|/2,
% $$
% since these two reward functions differ only in state $s$, in which case $R_{0,s, u}(s, a)=(1-\gamma) u/2$ and $R_{0,s, u^{\prime}}(s, a)=$ $(1-\gamma) u^{\prime}/2$. Let $\pi_{s, u}$ be the optimal policy in $M_{s, u}$. Note
% %
% \begin{align}
% Q_{s, u}^{\star}-Q_{s, u^{\prime}}^{\star} & =Q_{s, u}^{\star}-\max _\pi\left(I-\gamma P_{0,s, u^{\prime}}^\pi\right)^{-1} \Big(R_{0,s, u^{\prime}}    -\alpha_{s,a} -\gamma\beta_{s,a}\snormqbar{Q_{s, u^{\prime}}^{\star}} \Big) \nonumber
% \\
% \stackrel{(a)}{\leq}& Q_{s, u}^{\star}-\left(I-\gamma P_{0,s, u^{\prime}}^{\pi_{s, u}}\right)^{-1} \left(R_{0,s, u^{\prime}}    -\alpha_{s,a} -\gamma\beta_{s,a}\snormqbar{Q_{s, u^{\prime}}^{\star}} \right)
% \nonumber
% \\
% \stackrel{(b)}{=}&\left(I-\gamma P_{0,s, u^{}}^{\pi_{s, u}}\right)^{-1} \Big(  R_{0,s, u^{}}    -\alpha_{s,a} -\gamma\beta_{s,a}\snormqbar{Q_{s, u^{}}^{\star}}      \Big)  -\left(I-\gamma P_{0,s, u^{'}}^{\pi_{s, u}}\right)^{-1} \left(R_{0,s, u^{'}}    -\alpha_{s,a} -\gamma\beta_{s,a}\snormqbar{Q_{s, u^{'}}^{\star}} \right) \nonumber
% \\ 
% \stackrel{(c)}{=}&\left(I-\gamma P_{0,s, u^{'}}^{\pi_{s, u}}\right)^{-1} \Big(  R_{0,s, u^{}}    -\alpha_{s,a} -\gamma\beta_{s,a}\snormqbar{Q_{s, u^{}}^{\star}}      \Big)  -\left(I-\gamma P_{0,s, u^{'}}^{\pi_{s, u}}\right)^{-1} \left(R_{0,s, u^{'}}    -\alpha_{s,a} -\gamma\beta_{s,a}\snormqbar{Q_{s, u^{'}}^{\star}} \right)  \nonumber
% \\ 
% \stackrel{(d)}{\leq}&\left(I-\gamma P_{0,s, u^{\prime}}^{\pi_{s, u}}\right)^{-1}\left(R_{0,s, u}-R_{0,s, u^{\prime}}\right)  +\frac{2\gamma\beta}{1-\gamma} \normqbar{Q_{s, u}^{\star}-Q_{s, u^{\prime}}^{\star}} \nonumber
% \\
% \stackrel{(e)}{\leq}&   \left(I-\gamma P_{0,s, u^{\prime}}^{\pi_{s, u}}\right)^{-1}\left(R_{0,s, u}-R_{0,s, u^{\prime}}\right)  +\frac{2\gamma\beta}{1-\gamma} \norminf{Q_{s, u}^{\star}-Q_{s, u^{\prime}}^{\star}}
% \nonumber\\
% \stackrel{(f)}{\leq}&   \left(I-\gamma P_{0,s, u^{\prime}}^{\pi_{s, u}}\right)^{-1}\vert \left(R_{0,s, u}-R_{0,s, u^{\prime}}\right) \vert   +\frac{2\gamma\beta}{1-\gamma} \norminf{Q_{s, u}^{\star}-Q_{s, u^{\prime}}^{\star}}
% \nonumber
% \end{align}
% %



% The first inequality (a) is Lemma \ref{subtil_lema}.
% Point (b) comes from the regularised form of robust Q-functions. Point (c) follows, since $P_{s, u}$ only depends on the state $s$ and not on the value $u$, so we can write $P_{s, u}=P_{s, u'}$. Then inequality (d) comes from the classic  Eq.~\eqref{1} and the triangular inequality for seminorms, and finally domination of the supremum norm over $\snormq{.}$. Inequality (e) is domination of the supremum norm from Eq. \eqref{2}.
% Then (f) is due to the positivity of $(I-\gamma P_{0,s, u^{\prime}}^{\pi_{s, u}})^{-1}$ as it can be written as an infinite sum of positive matrices, as well as to the triangular inequality.

% The problem is symmetric with respect to $u$ and $u'$, so the same argument also yields a lower bound on $Q_{s, u}^{\star}-Q_{s, u^{\prime}}^{\star}$, and we can therefore take absolute values.

% Finally,

% \begin{align*}
%     \vert Q_{s, u}^{\star}-Q_{s, u^{\prime}}^{\star} \vert &\leq \left(I-\gamma P_{0,s, u^{\prime}}^{\pi_{s, u}}\right)^{-1}\vert \left(R_{0,s, u}-R_{0,s, u^{\prime}}\right) \vert   +\frac{2\gamma\beta}{(1-\gamma)} \norminf{Q_{s, u}^{\star}-Q_{s, u^{\prime}}^{\star}}\\
%     =&  \left(I-\gamma P_{0,s, u^{\prime}}^{\pi_{s, u}}\right)^{-1}\frac{(1-\gamma)\left|u-u^{\prime}\right|}{2}  +\frac{2\gamma\beta}{(1-\gamma)} \norminf{Q_{s, u}^{\star}-Q_{s, u^{\prime}}^{\star}}\\
%     \leq&   \frac{1}{1-\gamma}\frac{(1-\gamma)\left|u-u^{\prime}\right|}{2}  +\frac{2\gamma\beta}{(1-\gamma)} \norminf{Q_{s, u}^{\star}-Q_{s, u^{\prime}}^{\star}}
% \end{align*}
% The last two lines come from the first equality of the proof and the upper bound from Eq.~\eqref{1}. The right-hand side of the inequality does not depend on $(s,a)$, so taking the sup norm, we get


% \begin{align*}
%     \norminf{Q_{s, u}^{\star}-Q_{s, u^{\prime}}^{\star}}   \leq \frac{\vert u-u'\vert }{2}  +\frac{2\gamma\beta}{1-\gamma} \norminf{Q_{s, u}^{\star}-Q_{s, u^{\prime}}^{\star}}
% \end{align*}
% Finally, for the condition that $\beta\leq (1-\gamma)/(4\gamma)$  or equivalently  $C_\beta=\frac{2\gamma \beta}{1-\gamma} \leq  1/2 $, we get the result:


% \begin{align}
%     \label{adaptation}
%     \norminf{Q_{s, u}^{\star}-Q_{s, u^{\prime}}^{\star}} \stackrel{}{\leq}& \frac{1}{1-C_\beta}\,\frac{\left|u-u^{\prime}\right|}{2}\leq \frac{2\left|u-u^{\prime}\right|}{2}=\left|u-u^{\prime}\right|
% \end{align}




%  The rest of the proof is similar to \citet{agarwal2020model}. Here the proof is done for the $sa$-rectangular case, but it is almost the same for the $s$-rectangular case, as the only things that change are the notation and the use of $\normq{\pi_s}\leq 1$.
%  %
%  The proof for a fixed policy $\pi$ follows exactly the same line, without the greediness argument at the beginning.
% \end{proof}

% The rest of the proof is similar, considering robust MDPS in place of classical MDPs.

% Finally, we need to prove the following lemma for the adaptation of \cite{agarwal2020model}.

% \begin{lemma}
% \label{subtil_lema}
% \begin{align*}
%      \left(I-\gamma P_{0,s, u^{\prime}}^{\pi_{s, u}}\right)^{-1}\Big(R_{0,s, u^{\prime}}    -\alpha_{s,a} -\gamma\beta_{s,a}\snormqbar{Q_{s, u^{\prime}}^{\star}} \Big)  \leq   \max _\pi
%      \left(I-\gamma P_{0,s, u^{\prime}}^{\pi_{}}\right)^{-1} \left(R_{0,s, u^{\prime}}    -\alpha_{s,a} -\gamma\beta_{s,a}\snormqbar{Q_{s, u^{\prime}}^{\star}} \right)
% \end{align*}


% \end{lemma}

% \begin{proof}
%     First, $Q_{s, u^{\prime}}^{\star}$ is the fixed point of a robust Bellman operator, so
%     \begin{align*}Q_{s, u^{\prime}}^{\star}= \left(I-\gamma P_{0,s, u^{\prime}}^{\pi_{s, u}}\right)^{-1}\Big(R_{0,s, u^{\prime}}    -\alpha_{s,a} -\gamma\beta_{s,a}\snormqbar{Q_{s, u^{\prime}}^{\star}}\Big).
%     \end{align*}
% Then, fixing the robust reward $R_{0,s, u^{\prime}}    -\alpha_{s,a} -\gamma\beta_{s,a}\snormqbar{Q_{s, u^{\prime}}^{\star}}$ independently of the Q-function $Q_{s, u^{\prime}}^{\star}$, we have that $Q_{s, u^{\prime}}^{\star}$ is also the fixed point of a non-robust Bellman optimality operator whose reward is equal to the one in the robust case. In the non-robust case, by greediness, it is true that for any reward $r$:
% \begin{align*}
%     \left(I-\gamma P_{0,s, u^{\prime}}^{\pi_{s, u}}\right)^{-1} r\leq   \max _\pi
%      \left(I-\gamma P_{0,s, u^{\prime}}^{\pi_{}}\right)^{-1}    r 
% \end{align*}
% So this is also true in the robust case, as $Q_{s, u^{\prime}}^{\star}$ is the solution of both the robust Bellman operator and the non-robust one with the same reward.

% \end{proof}




%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%




%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%




% \section{ Any norm duality for Laixi}


% Assume we have an RMDP whose uncertainty sets are balls for an arbitrary norm $\norm{.}$. We define the dual norm $\dnorm{.}$ by $\|y\|_*=\max _{x:\, \|x\| \leq 1} x^{\top} y$, the dual span seminorm by $\sdnorm{v}= \min _{\omega \in \mathbb{R}}\|v-\omega \mathbf{1}\|_{*} $, and the generalised mean by $\omega(v):=\arg \min _{\omega \in \mathbb{R}}\|v-\omega \mathbf{1}\|_{*}$ (for $L_2$, we retrieve the classical mean). Moreover, we define:
% $\forall s,a, \quad v \mapsto \kappa_{\mathcal{D}}(v)=\inf \left\{u^{\top} v: u \in \mathcal{D}\right\}$

%  We turn an optimisation problem over $\mathbb{R}^\Snorm$ into a one-dimensional optimisation problem. (This works for norms; for $f$-divergences, we are lucky with $\chi^2$.)
% \begin{lemma}[Duality for the minimisation problem for $sa$ rectangular case.] 


%     \begin{equation*}
%         \kappa_{\hat{\mathcal{P}}_{s,a}^{\mathrm{}}}(\Vpihat)=\max _{\mu\geq 0}  \{\widehat{P}_{}(\Vpihat-\mu) -\beta_{s,a} \sdnorm{\Vpihat-\mu} \} =\max _{\alpha \in [V_{min}, V_{max}]} \widehat{P}_{}[\Vpihat]_\alpha - \beta_{s,a} \sdnorm{ [\Vpihat]_\alpha  }  .
%     \end{equation*}
%     \begin{equation*}
%         \kappa_{\mathcal{P}_{s, a}}(\Vs)=\max _{\mu\geq 0}  \{P_{0}(\Vs-\mu) -\beta_{s,a} \sdnorm{\Vs-\mu} \} =\max _{\alpha \in [V_{min}, V_{max}]} P_{0}[\Vs]_\alpha - \beta_{s,a} \sdnorm{ [\Vs]_\alpha  }  .
%     \end{equation*}
%     with $[V]_\alpha(s):= \begin{cases}\alpha, & \text { if } V(s)>\alpha \\ V(s), & \text { otherwise. }\end{cases}$
% \end{lemma}


% \begin{proof}
%    First,  we will show that  
%     \begin{equation*}
%         \kappa_{\hat{\mathcal{P}}_{s, a}^{\mathrm{}}}(\Vpihat)=\max _{\mu\geq 0}  \{\widehat{P}_{}(\Vpihat-\mu) -\beta_{s,a} \sdnorm{\Vpihat-\mu} \} 
%     \end{equation*}
%     The second equation is the same replacing the center of the ball constrain $\widehat{P}_{s, a}$ by $P_{0,s, a}$ and $\hat{\pi}$ by $\pistar$. By definition, 
    
% \begin{align*}
%         \kappa_{\hat{\mathcal{P}}_{s, a}} (\Vpihat)=  \min _{P\in \Delta_s , \norm{P-   \widehat{P}_{}  }\leq \beta_{s,a}   } \sum_{s'} P(s')\Vpihat(s')=  \widehat{P}_{s, a}\Vpihat+  \min _{y , \norm{y    } \leq \beta_{s,a}  , \mathrm{1}y=0, y\geq -\hat{P} }   \sum_{s'} y(s')\Vpihat(s')  
% \end{align*}
% where we use the change of variable $y(s')= P(s')-\hat{P}_{}(s')$.
% Then, writing the Lagrangian with multipliers $\mu\in \mathbb{R}_{+}^{\Snorm}$ and $\nu\in \mathbb{R}$, we get:
% \begin{align}
%     &\widehat{P}_{}\Vpihat+ \max _{\mu\geq0, \nu\in \mathbb{R} }  \min _{y: \norm{y}\leq   \beta_{s,a}} {   -\sum_{s'}\mu(s') \hat{P}_{}(s')   +\sum_{s'} y(s') (\Vpihat(s')-\mu(s')-\nu)  } \label{first2} \\
%     & \stackrel{(a)}{=}\widehat{P}_{}\Vpihat +  \max _{\mu\geq0, \nu\in \mathbb{R} }  -\sum_{s'}\mu(s') \hat{P}_{}(s')  -\beta_{s,a} \dnorm{(\Vpihat(s')-\mu(s')-\nu)} \label{second2} \\
%     &\stackrel{(b)}{=} \max _{\mu\geq 0} \widehat{P}_{}(\Vpihat-\mu) - \beta_{s,a}  \sdnorm{  \Vpihat-\mu } \label{third2} 
% \end{align}
% where (a) uses the equality case of the Cauchy--Schwarz inequality for the dual norm, the minimum being attained, and (b) is the definition of the span seminorm (see Def.~\ref{span}). By definition, the value of $\nu$ that maximizes the inner problem in Eq.~\eqref{second2} is the generalised mean, denoted $\omega$.

% Now the aim is to prove that 
% \begin{equation*}
%     \max _{\mu\geq 0}  \{\widehat{P}_{}(\Vpihat-\mu) -\beta_{s,a} \sdnorm{\Vpihat-\mu} \} =\max _{\alpha \in [V_{min}, V_{max}]} \widehat{P}_{}[\Vpihat]_\alpha - \beta_{s,a} \sdnorm{ [\Vpihat]_\alpha  }  .
% \end{equation*}
% In this equality, optimisation reduces in terms of $\mu \in \mathbb{R}^+$ to scalar bounded optimization in $\alpha$.
% First, we remark that in Eq.~\eqref{first2}, the minimum is attained for:

% \begin{equation}
%     y^{*}(s')=-\frac{\beta_{s,a}z(s')}{\norm{z}}
% \end{equation}
% because we are doing linear minimization under convex constraints.
% The value of vector
% $z$ is $z(s')=(\Vpihat(s')-\mu(s')-\omega)\widehat{P}_{}(s')$
% with $\omega$ defined as the generalised mean for any norm.
% The quantity $z/\norm{z}$ has unit norm and its sign is determined by $(\Vhatpihat(s')-\mu(s')-\omega)$. We can choose any positive multiplicative factor since the vector is normalized; here we choose $\widehat{P}_{}(s')$.

% Complementary slackness in Eq.~\eqref{third2} gives that for all $s'$ such that $\mu(s')>0$, $ y^*(s')=-q(s') $, or equivalently:
% \begin{equation*}
%     y^*(s')=-q(s') \iff  \Vpihat(s')-\mu(s')=w_p +\norm{z}/\beta_{s,a}=\alpha 
% \end{equation*}
% with $\alpha $ a constant. Since the optimal value of the initial problem is at least $ \min _{s'} \Vpihat(s')$ and lower than  $ \max _{s'} \Vpihat(s')$ , we have $\max _{s'} \Vpihat(s')\geq \alpha\geq\min _{s'} \Vpihat(s')$. The value of  $\alpha$ is not known in practice but we can recognise that the optimal value of $\mu$ is :


% $$\mu^*(s)= \begin{cases}\Vpihat(s)-\alpha, & \Vpihat(s) \geq \alpha \\ 0, & \text { otherwise }\end{cases}$$
% Then the dual optimisation problem in Eq.~\eqref{third2} reduces to

% \begin{equation*}
%     \max _{\mu\geq 0} \widehat{P}_{s, a}(\Vpihat-\mu) - \beta_{s,a}  \sdnorm{  \Vpihat-\mu }=  \max _{\alpha \in [V_{min}, V_{max}]} \widehat{P}_{s, a}[\Vpihat]_\alpha - \beta_{s,a} \sdnorm{ [\Vpihat]_\alpha  }  .
% \end{equation*}

% with $[V]_\alpha(s):= \begin{cases}\alpha, & \text { if } V(s)>\alpha \\ V(s), & \text { otherwise. }\end{cases}$

% The crucial point is that $\snormqbar{ [V]_\alpha} $ does not depend on $\widehat{P}_{}$, contrary to divergence constraints such as $\chi^2$.

% \end{proof}

% \begin{lemma}[Duality for the minimisation problem for $s$ rectangular case.] \label{sduality} 
%     \begin{equation*}
%         \kappa_{\hat{\mathcal{P}}_{s}^{\mathrm{}}}(\Vpihat)= \max _{\alpha \in [V_{min}, V_{max}]}  \sum_a \pihat(a\vert s) \Big(r(s,a) + \gamma\big(\hat{P}_{}[\Vpihat]_\alpha - \beta_{s} \dnorm{\pi_s}\sdnorm{ [\Vpihat]_\alpha  } \big)\Big)
%     \end{equation*}
%     \begin{equation*}
%         \kappa_{\mathcal{P}_{s}}(\Vs)=\max _{\alpha \in [V_{min}, V_{max}]}  \sum_a \pistar(a\vert s) \Big(r(s,a) + \gamma\big(P_{0}[\Vs]_\alpha - \beta_{s} \dnorm{\pi_s}\sdnorm{ [\Vs]_\alpha  } \big)\Big) .
%     \end{equation*}
%      with $[V]_\alpha(s):= \begin{cases}\alpha, & \text { if } V(s)>\alpha \\ V(s), & \text { otherwise. }\end{cases}$
% \end{lemma}


% \begin{proof}
%     We will first  show that  
%     \begin{equation*}
%          \kappa_{\hat{\mathcal{P}}_{s}^{\mathrm{}}}(\Vpihat)= \sum_a \pihat(a\vert s) (r(s,a) +\gamma \hat{P}_{}\Vpihat ) +\max _{\mu\geq 0}\Bigg(\gamma   \sum_a  \pihat(a\vert s)    
%   (-\widehat{P}_{}\mu) -  \gamma \beta_s \dnorm{\pi_s}\sdnorm{\Vpihat-\mu} \Bigg)
%     \end{equation*}
%     The second equation is the same, replacing the center of the ball constrain $\widehat{P}_{s}$ by $P_{0}$ and $\hat{\pi}$ by $\pistar$. By definition, 
    
% \begin{align*}
%         \kappa_{\hat{ \mathcal{P}}_{s, }} (\Vpihat)(s)=&  \min _{P\in \Delta_s , P\in \hat{\mathcal{P}}_s   } \mathcal{T}^{\pihat} \Vpihat(s)=  \min _{P\in \Delta_s , P\in \hat{\mathcal{P}}_s   } \sum_a \pihat(a\vert s)  (r(s,a) 
%         + \gamma     P_{}\Vpihat)  \\&
%         \stackrel{(a)}{=} \sum_a \pihat(a\vert s) (r(s,a) +\gamma \hat{P}_{s,a}\Vpihat )   + \gamma \min _{ \norm{\beta_{s,a}} \leq \beta_s   }  \sum_a  \pihat(a\vert s) \min _{y , \normpbar{y    } \leq \beta_{s,a}  , \mathrm{1}y=0, y\geq -\hat{P} } \sum_{s'} y(s') \Vpihat  \\
%     % &   \stackrel{(b}{=} \sum_a \pihat(a\vert s) (r(s,a) +\gamma \hat{P}_{s,a}\Vpihat )      + \gamma \min _{ \normpbar{\beta_{s,a}} \leq \beta_s   }  \sum_a  \pihat(a\vert s)   \min _{y , \normpbar{y    } \leq \beta_{s,a}  , \mathrm{1}y=0, y\geq -\hat{p} } \sum_{s'} y(s') \Vpihat 
% \end{align*}
% where we use the change of variable $y(s')= P(s')-\hat{P}_{}(s')$ in (a). Then we can use the previous lemma for the $sa$-rectangular assumption, Lemma~\ref{saduality}. Then, 
% \begin{align*}
%    &\gamma \min _{ \norm{\beta_{s,a}} \leq \beta_s   }  \sum_a  \pihat(a\vert s) \min _{y , \norm{y    } \leq \beta_{s,a}  , \mathrm{1}y=0, y\geq -\hat{P} } \sum_{s'} y(s') \Vpihat =  \gamma \min _{ \norm{\beta_{s,a}} \leq \beta_s   }  \sum_a  \pihat(a\vert s)  \max _{\mu\geq 0} \Big(-\widehat{P}_{}\mu - \beta_{s,a}  \sdnorm{  \Vpihat-\mu }\Big) \\
%    &=   \max _{\mu\geq 0}\Bigg(\gamma   \sum_a  \pihat(a\vert s)    
%   (-\widehat{P}_{}\mu) -  \max _{ \norm{\beta_{s,a}} \leq \beta_s   } \gamma\sum_a \pihat(a\vert s)   \beta_{s,a}  \sdnorm{  \Vpihat-\mu }\Bigg) \\
%   &=    \max _{\mu\geq 0}\Bigg(\gamma   \sum_a  \pihat(a\vert s)    
%   (-\widehat{P}_{}\mu) -  \gamma \beta_s \dnorm{\pi_s}\sdnorm{\Vpihat-\mu} \Bigg)\\
% \end{align*}
%  Here, we can exchange the min and the max in the second line because the problem is concave-convex in $\beta_{s,a}$ and $\mu$, and we use the Cauchy--Schwarz inequality in the last line. Finally, we obtain: 

%  \begin{align*}
%       \kappa_{\hat{ \mathcal{P}}_{s, }} (\Vpihat)=&  
%   \max _{\mu\geq 0} \sum_a \pihat(a\vert s) \big(r(s,a) +\gamma \hat{P}_{}(\Vpihat -\mu)\big)  - \gamma \beta_s \dnorm{\pi_s}\sdnorm{\Vpihat-\mu} \\
%   =
%   &  \sum_a \pihat(a\vert s) \Big(r(s,a) +\gamma\big( \max _{\mu\geq 0}\hat{P}_{} (\Vpihat -\mu) -\beta_s \dnorm{\pi_s}\sdnorm{\Vpihat-\mu}\big)\Big)\\
%   \stackrel{(a)}{=}
%   & \sum_a \pihat(a\vert s) \Big(r(s,a) +\gamma\big(\max _{\alpha \in [V_{min}, V_{max}]} \hat{P}_{}[\Vpihat]_\alpha - \beta_{s} \dnorm{\pi_s}\sdnorm{ [\Vpihat]_\alpha  } \big)\Big) \\
%   =
%    & \max _{\alpha \in [V_{min}, V_{max}]}  \sum_a \pihat(a\vert s) \Big(r(s,a) + \gamma\big(\hat{P}_{}[\Vpihat]_\alpha - \beta_{s} \dnorm{\pi_s}\sdnorm{ [\Vpihat]_\alpha  } \big)\Big)
%   \end{align*}
%  where in (a) we use Lemma \ref{saduality}.
% Second claim is the same replacing $\Vpihat$ by $\Vs$, $\pihat$ by $\pistar$ and $\hat{P}_{}$ by $P_{0}$.

% % Then writing the lagrangian we get for $\mu\in \mathbb{R}_{+}^{\Snorm}$,$\gamma\in \mathbb{R}$ the lagrangian variables:
% % \begin{align}
% %     &\widehat{P}_{s, a}\Vpihat+ \max _{\mu\geq0, \nu\in \mathbb{R} }  \min _{y: \normpbar{y}\leq   \beta_{s,a}} {   -\sum_{s'}\mu(s) \hat{P}_{s,a}(s')   +\sum_{s'} (y(s') (\Vpihat(s')-\mu(s')-\nu)  } \label{fourth} \\
% %     & \stackrel{(a)}{=}\widehat{P}_{s, a}\Vpihat +  \max _{\mu\geq0, \nu\in \mathbb{R} }  -\sum_{s'}\mu(s') \hat{P}_{s,a}(s')  -\beta_{s,a} \normqbar{(\Vpihat(s')-\mu(s')-\nu)} \label{5th} \\
% %     &\stackrel{(b)}{=} \max _{\mu\geq 0} \widehat{P}_{s, a}(\Vpihat-\mu) - \beta_{s,a}  \snormqbar{  \Vpihat-\mu } \label{6th} 
% % \end{align}
% % where (a) holds by the equality case of the Cauchy--Schwarz inequality and (b) is the definition of the seminorm. The value of $\nu$ that maximizes the inner maximization problem in~\eqref{5th} is the $q$-mean $w_p$.

% % Then we change optimisation in terms of $\mu \in \mathbb{R}^+$ to scalar bounded optimisation, this part is central in our proof.
% % First, we remark that in Eq.~\eqref{fourth} the minimum is attained at:

% % \begin{equation}
% %     y^{\*}(s')=-\frac{\beta_{s,a}z(s')}{\normpbar{z}}
% % \end{equation}
% % with the value of vector $z(s')=(\Vpihat(s')-\mu(s')-w_q)\widehat{P}_{s, a}(s')$
% % with $w_q$ defined as the $q$-mean.
% % For the vector $z$, the importance is the sign of $(\Vhatpihat(s')-\mu(s')-w_q)$, we can multiply it by every scalar value as the vector is then normalised, here we choose $\widehat{P}_{s, a}(s')$.

% % Using complementary slackness in equation \ref{6th} arguments for the value of $s'$ such that $\mu(s')>0$ implies $ y^*(s')=-q(s') $ or equivalently :
% % \begin{equation*}
% %     y^*(s')=-q(s') \iff  \Vpihat(s')-\mu(s')=w_p +\normpbar{z}/\beta_{s,a}=\alpha 
% % \end{equation*}
% % with $\alpha $ a constant. Since the optimal value of the initial problem is at least $ \min _{s'} \Vpihat(s')$ and lower than  $ \max _{s'} \Vpihat(s')$ , we have $\max _{s'} \Vpihat(s')\geq \alpha\geq\min _{s'} \Vpihat(s')$. The value of  $\alpha$ is not known in practice but we can recognise that the optimal value of $\mu$ is :


% % $$\mu^*(s)= \begin{cases}\Vpihat(s)-\alpha, & \Vpihat(s) \geq \alpha \\ 0, & \text { otherwise }\end{cases}$$
% % Then the dual optimisation problem eq \ref{third} reduces to 

% % \begin{equation*}
% %     \max _{\mu\geq 0} \widehat{P}_{s, a}(\Vpihat-\mu) - \beta_{s,a}  \snormqbar{  \Vpihat-\mu }=  \max _{\alpha \in [V_{min}, V_{max}]} \widehat{P}_{s, a}[\Vpihat]_\alpha - \beta_{s,a} \snormqbar{ [\Vpihat]_\alpha  }  .
% % \end{equation*}

% % with $[V]_\alpha(s):= \begin{cases}\alpha, & \text { if } V(s)>\alpha \\ V(s), & \text { otherwise. }\end{cases}$

% % The crucial point is that $\snormqbar{ [V]_\alpha} $ does not depend on $\widehat{P}_{s, a}$, contrary to divergence constraints such as $\chi^2$.

% \end{proof}


% \begin{lemma} \label{trickalpha}
% For $s$ and $sa$ rectangular assumptions,
%    \begin{equation}
%        \left|\kappa_{\hat{\mathcal{P}}_{s, a}^{\mathrm{}}}(\Vpihat)-\kappa_{\mathcal{P}_{s, a}}(\Vpihat)\right|\leq \vert ( \widehat{P}_{} - P_{0}) \Vpihat\vert 
%    \end{equation} 
%       \begin{equation}
%        \left|\kappa_{\hat{\mathcal{P}}_{s, a}^{\mathrm{}}}(\Vs)-\kappa_{\mathcal{P}_{s, a}}(\Vs)\right|\leq \vert ( \widehat{P}_{} - P_{0}) \Vs\vert 
%    \end{equation} 
% \end{lemma}
% \begin{proof}
%     \begin{align*}
%          \left|\kappa_{\hat{\mathcal{P}}_{s, a}^{\mathrm{}}}(\Vpihat)-\kappa_{\mathcal{P}_{s, a}}(\Vpihat)\right|&\stackrel{(a)}{=} \max _{\alpha \in [V_{min}, V_{max}]} \widehat{P}_{}[\Vpihat]_\alpha - \beta_{s,a} \sdnorm{ [\Vpihat]_\alpha  }    \\ 
%          &-\max _{\alpha \in [V_{min}, V_{max}]} P_{0}[\Vpihat]_\alpha - \beta_{s,a} \sdnorm{ [\Vpihat]_\alpha  }    \vert \\
%          &\stackrel{(b)}{\leq } \max _{\alpha \in [V_{min}, V_{max}]} \vert (\widehat{P}_{} -P_{0})[\Vpihat]_\alpha\vert \\
%          &\stackrel{(c)}{\leq }   \vert (\widehat{P}_{} -P_{0})\Vpihat\vert \leq  \max _{s,a} \vert (\widehat{P}_{} -P_{0})\Vpihat\vert
%     \end{align*}
% \end{proof}
% where (a) is the previous lemma, (b) is the 1-Lipschitz property of the $\max$ operator, and (c) uses the triangle inequality and the fact that the maximum is attained at $\alpha=V_{max}$. For the $s$-rectangular case,

% \begin{align*}
%          \left|\kappa_{\hat{\mathcal{P}}_{s, a}^{\mathrm{}}}(\Vpihat)-\kappa_{\mathcal{P}_{s, a}}(\Vpihat)\right|&\stackrel{(a)}{=} \max _{\alpha \in [V_{min}, V_{max}]}  \sum_a \pihat(a\vert s) (r(s,a) + \gamma\Big(\hat{P}_{s}[\Vpihat]_\alpha - \beta_{s} \dnorm{\pi_s}\sdnorm{ [\Vpihat]_\alpha  } \Big)  \\ 
%          &-\max _{\alpha \in [V_{min}, V_{max}]}  \sum_a \pihat(a\vert s) (r(s,a) + \gamma\Big(P_{0}[\Vpihat]_\alpha - \beta_{s} \dnorm{\pi_s}\sdnorm{ [\Vpihat]_\alpha  } \Big) \vert  \\
%          &\stackrel{(b)}{\leq } \max _{\alpha \in [V_{min}, V_{max}]} \vert \sum_a \pihat(a\vert s) (\widehat{P}_{} -P_{0})[\Vpihat]_\alpha\vert \\
%          &\stackrel{}{\leq }  \max _{s,a} \vert (\widehat{P}_{} -P_{0})\Vpihat\vert
%     \end{align*}





% Now proof for the tight lemma of Laixi.
% \begin{lemma}
% \label{lemma_tight}
% For every uncertainty ball defined by an arbitrary norm, with $V^\pi$ the associated robust value function, we have in both the $s$- and $sa$-rectangular cases:
%     \begin{equation}
%         \snorminf{V^\pi}\leq \frac{1}{\gamma \max \{1-\gamma, \beta\}}
%     \end{equation}
% \end{lemma}
%  with $ \snorminf{V^\pi}=\frac{\max _s V^\pi(s)- \min _s V^\pi(s)}{2}$. 
% \begin{proof}
% First using Bellman recursion,
%     \begin{align*}
%         \max _s  V^\pi(s) =  \max _s \mathbb{E} _{a\sim \pi} [ r(s,a) + \gamma  \kappa_{\mathcal{P}_{s} }(V^\pi) ]  \leq \max _{s,a} 1+ \gamma \kappa_{\mathcal{P}_{s}(a,.) }(V^\pi) 
%     \end{align*}
   
%     where we use the fact that the reward function is bounded by 1. We give the proof for the $s$-rectangular case; it is the same for the $sa$-rectangular case, replacing $P_{s}(a,.)$ by $P_{s,a}(.)$.
% Let $P_{s}(a,.)$ denote the minimizer in the $s$-rectangular case. Then, for any $s \in \mathcal{S}$, there exists some $\widetilde{P}_{s} \in \mathbb{R}^{S\times A}$ constructed by reducing the values of some elements of $P_{s}$ to obey $ \forall a \in A , P_{s}(a,.) \geq \widetilde{P}_{s}(a,.) \geq 0$ and $ \norm{P_{s }\left(s^{\prime},a\right)-\widetilde{P}_{s}\left(s^{\prime},a\right)}=(\beta/2)$ where $\beta$ is the radius of the ball.


% %\pierre{je normalise par 2 mes normes a la TV? Non avec beta sur 2 ça marche}
% This implies $\widetilde{P}_{s}(a,.)+(\beta/2) e_{s_0}^{\top} \in \mathcal{P}_s$, where $e_{s_0}$ is the standard basis vector supported on $s_0$, since $\left\|\widetilde{P}_{s}(a,.)+(\beta/2) e_{s_0}^{\top}-P_{s}(a,.)\right\| \leq \left\|\widetilde{P}_{s}(a,.)-P_{s}(a,.)\right\|+\frac{\beta}{2}=\beta$. Consequently,
% $$
% \begin{aligned}
% \kappa_{\mathcal{P}_{s}(a,.) }(V^\pi) \leq\left(\widetilde{P}_{s}(a,.)+(\beta/2) e_{s_0}^{\top}\right) V^{\pi} & \leq\left\|\widetilde{P}_{s}(a,.)\right\|_1\left\|V^{\pi}\right\|_{\infty}+\beta/2 V^{\pi}\left(s_0\right) \\
% & \leq(1-\beta/2) \max _{s \in \mathcal{S}} V^{\pi }(s)+(\beta/2) \min _{s \in \mathcal{S}} V^{\pi}(s)
% \end{aligned}
% $$
% where the second inequality holds by 
% \begin{align*}
%     \left\|\widetilde{P}_{s}(a,.)\right\|_1&=\sum_{s^{\prime}} \widetilde{P}_{s}\left(s^{\prime},a\right)=-\sum_{s^{\prime}}\left(P_{s}\left(s^{\prime},a\right)-\widetilde{P}_{s}\left(s^{\prime},a\right)\right)+\sum_{s^{\prime}} P_{s}\left(s^{\prime},a\right)\\
%     &=1- \norm{P_s -\widetilde{P}_{s}}_1\leq 1- \norm{P_s -\widetilde{P}_{s}}=1-\beta/2
% \end{align*}
% where the last inequality comes from the fact that  for any norm on a finite dimensional vector space, $\norm{.}\leq \norm{.}_1$.
% Plugging this inequality in the first one, we obtain 
% \begin{align*}
%     \max _s V^\pi(s) \leq 1 + (1-\beta/2)\gamma \max _s V^\pi(s) + \gamma (\beta/2) \min _s V^\pi(s)
% \end{align*}
% Finally: 
%     \begin{align*}
%         \max _s  V^\pi(s) \leq& \frac{1+\gamma (\beta/2) \min _s  V^\pi(s)}{1-\gamma(1-\beta/2) }\leq \frac{2}{\gamma \max \{1-\gamma, \beta\}} +  \min _s  V^\pi(s) \\
%         \iff& \snorminf{V^\pi} \leq  \frac{1}{\gamma \max \{1-\gamma, \beta\}}.
%     \end{align*}
% \end{proof}

\newpage
\end{document}