% \documentclass{uai2024} % for initial submission
\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
\usepackage{microtype}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{booktabs} % for professional tables
\usepackage{hyperref}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage{algorithm}
\usepackage{algpseudocode}
% if you use cleveref..
\usepackage[capitalize,noabbrev]{cleveref}
%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% Theorem-like environments (amsthm).
% `theorem` owns the counter and is numbered within each section ([section]);
% every other environment below is declared with [theorem], so all of them
% share that single running counter.
% amsthm style `plain`: theorem, proposition, lemma, corollary.
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
% amsthm style `definition`: definition, assumption.
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
% amsthm style `remark`: remark.
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}
% alignb: an align*-style display that carries one equation number.
%   The `b` argument type grabs the whole environment body as #1. The body is
%   typeset inside align*, the `equation` counter is stepped once, and the
%   resulting number is attached with \tag.
%   \NewDocumentEnvironment takes FOUR arguments: name, argument spec,
%   begin code and end code. The end code is intentionally empty and must be
%   given explicitly; otherwise TeX absorbs the next token in the file as the
%   end code.
\NewDocumentEnvironment{alignb}{b}{
  \begin{align*}
  \refstepcounter{equation} #1 \tag{\theequation}
  \end{align*}
}{}
\allowdisplaybreaks
% \swap[sep]{a}{b}: typesets b, then sep (default `-`), then a.
\newcommand{\swap}[3][-]{#3#1#2} % just an example
% \myineq{rel}{label}: places "(label)" above the relation symbol rel.
\newcommand{\myineq}[2]{\overset{(#2)}{#1}}
% \Ptd: calligraphic T with superscript (\lambda).
\newcommand{\Ptd}{\mathcal{T}^{(\lambda)}}
% --- Auto-sized delimiter macros (all use \left ... \right). ---
% Macros whose name starts with `f` additionally wrap the result in
% square brackets.
% Euclidean norm, its square, and the bracketed variants.
\newcommand{\twonorm}[1]{ \left\| #1 \right\|_2 }
\newcommand{\ftwonorm}[1]{ \left [\left\| #1 \right\|_2\right] }
\newcommand{\twonormsq}[1]{ \left\| #1 \right\|_2^2 }
\newcommand{\ftwonormsq}[1]{\left [ \left\| #1 \right\|_2^2\right ] }
% Ceiling brackets.
\newcommand{\myceil}[1]{ \left\lceil #1 \right\rceil }
% Double-bar norms with subscript \mathcal{TV}, no subscript, and \infty.
\newcommand{\tvnorm}[1]{ \left\| #1 \right\|_{\mathcal{TV}} }
\newcommand{\mynorm}[1]{ \left\| #1 \right\| }
\newcommand{\norminf}[1]{ \left\| #1 \right\|_\infty }
% Single vertical bars. NOTE(review): \lbrac and \lnorm have identical
% definitions.
\newcommand{\lbrac}[1]{ \left| #1 \right| }
\newcommand{\lnorm}[1]{ \left| #1 \right| }
% Parenthesized argument squared, and its bracketed variant.
\newcommand{\lnormsq}[1]{ \left( #1 \right)^2 }
\newcommand{\flnormsq}[1]{\left [ \left( #1 \right)^2\right ] }
% Curly braces, angle brackets (plain and bracketed), parentheses,
% and square brackets.
\newcommand{\varbrac}[1]{ \left\{ #1 \right\} }
\newcommand{\vbrac}[1]{ \left\langle #1 \right\rangle }
\newcommand{\fvbrac}[1]{ \left[\left\langle #1 \right\rangle\right] }
\newcommand{\brac}[1]{ \left( #1 \right) }
\newcommand{\Fbrac}[1]{ \left [ #1 \right] }
% Blackboard-bold E and P.
\newcommand{\E}{\mathbb{E}}
\newcommand{\prob}{\mathbb{P}}
% \ssa: the product |S||A| of the calligraphic S and A symbols.
\newcommand{\ssa}{{|\mathcal{S}||\mathcal{A}|}}
% \eqenv{body}: shorthand that wraps body in the alignb environment.
\newcommand*{\eqenv}[1]{\begin{alignb} #1 \end{alignb} }

% --- Symbols carrying the \rho(\sigma) superscript. ---
% \hatt: wide-hatted calligraphic T with subscript N_{\max}.
\newcommand{\hatt}{\widehat {\mathcal{T}}_{N_{\max}}^{\rho(\sigma)}}
% \hatT: bold, barred calligraphic T with subscript N_{\max}.
% NOTE(review): despite the name, this uses \bar rather than a hat.
\newcommand{\hatT}{\boldsymbol{\bar {\mathcal{T}}}_{N_{\max}}^{\rho(\sigma)}}
% \uprho: the bare superscript ^{\rho(\sigma)}, to be appended to a symbol.
\newcommand{\uprho}{^{\rho(\sigma)}}
% \qhatstar: wide-hatted Q with superscript *\rho(\sigma).
\newcommand{\qhatstar}{\widehat Q^{*\rho(\sigma)}}
% \ddr, \ddq: \delta symbols indexed by (s,a,N_1) and (s,a,N_2); \ddr has an
% extra `r` in its superscript.
\newcommand{\ddr}{\delta^{r,\rho(\sigma)}_{s,a,N_1}}
\newcommand{\ddq}{\delta^{\rho(\sigma)}_{s,a,N_2}}
% \Varr{X}: upright "Var" followed by the argument in auto-sized parentheses.
\newcommand{\Varr}[1]{\text{Var}\brac{#1}}
\title{Model-Free Robust Reinforcement Learning with Sample Complexity Analysis}

% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors

\author[1]{Yudan Wang}
\author[1,2]{Shaofeng Zou}
\author[3]{Yue Wang}
% Add affiliations after the authors
\affil[1]{%
    Electrical Engineering\\
    University at Buffalo
}


\affil[2]{
Computer Science \& Engineering\\
University at Buffalo
}

\affil[3]{%
    Electrical and Computer Engineering\\
    University of Central Florida
}
\allowdisplaybreaks[2]
  
  \begin{document}
\maketitle

\begin{abstract}
 Distributionally Robust Reinforcement Learning (DR-RL) aims to derive a policy optimizing the worst-case performance within a predefined uncertainty set. Despite extensive research, previous DR-RL algorithms have predominantly favored model-based approaches, with limited availability of model-free methods offering convergence guarantees or sample complexities. This paper proposes a model-free DR-RL algorithm leveraging the Multi-level Monte Carlo (MLMC) technique to close this gap. Our innovative approach integrates a threshold mechanism that ensures finite sample requirements for algorithmic implementation, a significant improvement over previous model-free algorithms. We develop algorithms for uncertainty sets defined by total variation, Chi-square divergence, and KL divergence, and provide finite sample analyses under all three cases. Remarkably, our algorithms represent the first model-free DR-RL approach featuring finite sample complexity for total variation and Chi-square divergence uncertainty sets, while also offering an improved sample complexity and broader applicability compared to existing model-free DR-RL algorithms for the KL divergence model. The complexities of our method establish the tightest results for all three uncertainty models in model-free DR-RL, underscoring the effectiveness and efficiency of our algorithm, and highlighting its potential for practical applications.
\end{abstract}

\section{Introduction}
Reinforcement learning (RL)~\citep{sutton2018reinforcement} aims to find the optimal policy that maximizes cumulative rewards through interactions with the environment and has demonstrated success in real applications, including robotics~\citep{kober2013reinforcement}, finance, and computer vision. However, in more practical scenarios, direct interaction with the true environment is often infeasible due to concerns such as safety, resource constraints, and ethical considerations. Consequently, a policy is initially learned within a simulated environment and subsequently transferred to the real environment. Yet due to reasons including unexpected external perturbations and adversarial attacks, a model mismatch between the simulation and the real environment exists, meaning the simulation may not be identical to the real environment. This model mismatch further leads to a degradation in performance when attempting to directly apply the learned policy in the real environment \citep{zhao2020sim}.


One promising framework to address this issue is distributionally robust RL (DR-RL) \citep{iyengar2005robust,nilim2004robustness}. Unlike conventional RL which optimizes performance under a specific environment, DR-RL constructs an uncertainty set of environments and aims to optimize the worst-case performance within this set. If the uncertainty set is designed to encompass the true environment, DR-RL can learn a policy robust to the model uncertainty and provide an optimized lower bound on the true performance.

%However, in practice, the policy from the training environment often suffers from the model performance degradation due to the mismatch between the training environment and the real environment, i.e. external perturbations and adversarial attacks in real environment, and time-varying changes of the environment. Therefore, DR-RL and distributionally DR-RL (DR-RL) are developed to address these problems. Different from non-DR-RL problems, the goal of DR-RL is to find the optimal robust policy that performs well under a range of uncertain conditions. A general approach to finding the optimal robust policy is to optimize the worst-case performance over the uncertainty set, which describes the model mismatch between the training and real environments via a range of possible distributions for the transition kernel and/or the reward functions.  
% the potential mismatches between the training and real environments. It encompasses a range of possible distributions for both the transition kernel and the reward functions in the environment.
% , where the uncertainty set is the
% , where the training environment is the center of the set. 
% The policy is trained in training environment, 
% which is economic efficiently and easy to access.$\rho$-constrainedDistributionally robust reinforcement learning (DR-RL) aims to

%In general, this uncertainty set is constrained by statistical distance $\rho$ (e.g. total variation distance,  chi-square distance and KL divergence) centered around the \textit{nominal distribution}, where \textit{nominal distribution} corresponds to the transition kernel and reward distribution of the training environment. Therefore, the goal of finding the optimal robust policy can be achieved by solving a minimax problem over the $\rho$-constrained uncertainty set. 


% 
Numerous algorithms have been studied and proposed for DR-RL, which can be broadly categorized into two groups: model-based methods and model-free methods. Model-based approaches, e.g.,  \citep{shi2023curious,panaganti2022sample,yang2022toward,wang2023achieving}, involve the collection of samples from a simulation environment to estimate an empirical robust MDP. Subsequently, robust dynamic programming \citep{iyengar2005robust} is employed on the empirical MDP to derive the optimal policy. 
In contrast, model-free methods \citep{wang2021online,liu2022distributionally,wang2023model,liang2023single} directly learn the policy while collecting samples, bypassing the need for model estimation and storage. %These two categories offer distinct strategies within the DR-RL framework, catering to different scenarios and preferences in handling environmental uncertainties. 

While model-based methods generally require fewer samples to derive an optimal policy, storing the entire model becomes prohibitively expensive or impractical for large-scale problems. Conversely, model-free methods offer an efficient alternative that adapts without the need to store the model, facilitating more practical applications. Despite extensive research on model-based methods, the model-free DR-RL approaches remain relatively understudied. This is primarily attributed to the challenge of the distribution shift between the simulation that generates samples and the worst-case environment within the uncertainty set. The utilization of such biased samples introduces errors in each updating step and can accumulate deviations from the accurate values through the model-free bootstrapping algorithms, thus posing challenges in ensuring convergence and accurately quantifying algorithmic complexity.

%Due to the simple and straightforward formulation, model-based methods have been extensively studied, including deriving the sample complexity to learn an optimal policy. 




%In general, there are two primary methods to solve the DR-RL problems: model-based approach and model-free approach. 
%\citep{panaganti2022sample,yang2022toward,clavier2023towards,shi2023curious} adopt the model-based approach to find the optimal robust policy and provide the sample complexity analysis. However, these works require a large number of samples per iteration to estimate the nominal distribution. In practice, large sample size requires expensive memory storage and is computationally costly, which constrains the practical feasibility of the model-free approached algorithms.  In contrast to model-based approach, the model-free method boast computational efficiency and less storage overhead. 

To address the challenge of biased estimated updating, \citep{liu2022distributionally, wang2023model} propose a Multi-level Monte Carlo (MLMC) operator, renowned for its unbiased estimation of worst-case performance, leading to asymptotically convergent model-free algorithms. However, implementing the MLMC estimator in these works necessitates an infinite number of samples. Specifically, to construct the vanilla MLMC estimator, the learner first generates a random level number $N$ following a geometric distribution and then generates $2^{N+1}$ samples. To ensure algorithm convergence, the parameter of the geometric distribution is set to be less than $\frac{1}{2}$, resulting in an infinite expected total number of samples required. Subsequently, in \citep{wang2023finite}, a modified MLMC algorithm is introduced, requiring finite samples for implementation under the KL divergence uncertainty set. Nevertheless, their findings are constrained by a restrictive assumption, limiting their applicability. In this paper, we present a novel MLMC-based DR-RL algorithm by incorporating a threshold design, referred to as the threshold-MLMC (T-MLMC) algorithm. This design ensures our implementation demands only a finite number of samples for any general uncertainty set models. Furthermore, we provide complexity analysis for our T-MLMC algorithm under three uncertainty sets without relying on any restrictive assumptions. Our contributions are outlined as follows.








%Therefore, the model-free DR-RL algorithm and related sample complexity analysis that is feasible to different constrained uncertainty set and uncertainty level  is required. 
% The model-free MORL algorithms boast  and are applicable in real applications. 

\subsection{Major Contributions}
\textbf{We introduce a model-free T-MLMC algorithm for DR-RL with guaranteed implementation and convergence.} Unlike previous MLMC algorithms, which typically require an infinite number of samples for implementation, our approach incorporates a threshold design on the level number during the construction of our MLMC estimator. This design ensures that our estimator behaves similarly to the traditional MLMC estimator when the level number remains below the threshold. However, it adopts a simplified structure requiring fewer samples when the level number exceeds the threshold. By implementing this threshold design, we ensure that only a finite number of samples is necessary to construct the estimator, albeit with the trade-off of introducing bias. Nevertheless, we demonstrate that our algorithm converges to a close approximation of the optimal robust value function, where the approximation error exponentially diminishes as the threshold value increases. By setting a suitable threshold value, our algorithm represents the first model-free DR-RL algorithm applicable to general uncertainty sets, providing assurances of both sample finiteness and convergence. This characteristic renders our algorithm practical for implementation and highlights its potential for diverse applications.

%algorithm with threshold MLMC method, which balance the trade-off between the convergence guarantee and  expected total sample size. 
% Compared with previous model-free DR-RL works \citep{wang2023finite,wang2023model,liu2022distributionally,liang2023single}, our threshold algorithm h

\textbf{We establish that our algorithm achieves the tightest sample complexity across three distinct uncertainty sets among model-free methods.} Adapting our algorithm to accommodate three uncertainty set models---defined by total variation, Chi-square divergence, and KL divergence---we ascertain their respective sample complexities. By fine-tuning the threshold, we strike a balance between bias and sample complexity, demonstrating that our algorithms effectively identify the optimal robust policy with minimal samples. Specifically, for both total variation and Chi-square divergence uncertainty sets, our algorithms achieve $\epsilon$-optimality with $\widetilde{\mathcal{O }}\left(\frac{|\mathcal{S}||\mathcal{A}|}{(1-\gamma)^5\epsilon^2} \right)$ samples, where $|\mathcal{S}|$ and $|\mathcal{A}|$ denote the cardinality of the state and action space, respectively, and $\gamma$ represents the discount factor. For the KL divergence uncertainty set, our algorithm exhibits a sample complexity of $\widetilde{\mathcal{O }}\left(\frac{|\mathcal{S}||\mathcal{A}|}{(1-\gamma)^5\epsilon^2p_\wedge^2 } \right)$, where $p_\wedge$ signifies the minimal non-zero entry of the nominal transition kernel. Notably, all our results boast the most stringent parameter dependencies, marking the first model-free complexity results for the total variation and Chi-square divergence models, while significantly enhancing previous findings for the KL divergence model. Furthermore, our analysis requires no restrictive assumptions, underscoring the practical applicability of our model-free algorithms. A comprehensive comparison of our results with prior ones is presented in \Cref{table:11,table:2,table:3}.\footnote{Due to space limitations, we only list part of the complexity results from \citep{shi2023curious} for comparison. The complete results can be found in Table 1 therein.}
Evidently, across all three uncertainty sets, our outcomes achieve the most favorable sample complexity among model-free methods.


\begin{table}[!h]
\vskip 0.15in
\begin{center}
\begin{small}
\begin{sc}
\begin{tabular}{lc}
\toprule
Reference  $\qquad\qquad$Model-Free & Sample Size \\
\midrule
\textsc{\citep{panaganti2022sample}}   \hfill{ $\times$}& $\widetilde{\mathcal{O }}\brac{\frac{|\mathcal{S}|^2|\mathcal{A}|}{(1-\gamma)^4 \epsilon^2} }$  \\
\citep{yang2022toward}    \hfill{ $\times$}& $\widetilde{\mathcal{O }}\brac{\frac{|\mathcal{S}|^2|\mathcal{A}|}{ (1-\gamma)^4\epsilon^2} }$ \\
\citep{clavier2023towards}    \hfill{ $\times$}&$\widetilde{\mathcal{O }}\brac{\frac{|\mathcal{S}||\mathcal{A}|}{ (1-\gamma)^4\epsilon^2} }$ \\
\citep{shi2023curious} 
\hfill{ $\times$}& $\widetilde{\mathcal{O }}\brac{\frac{|\mathcal{S}||\mathcal{A}|}{ (1-\gamma)^3\epsilon^2} }$  \\
\citep{wang2023model}  \hfill{ $\surd$}& Asymptotic  \\
Our work    \hfill{ $\surd$}& $\widetilde{\mathcal{O }}\brac{\frac{|\mathcal{S}||\mathcal{A}|}{ (1-\gamma)^5\epsilon^2} }$ \\
\bottomrule
\end{tabular}
\end{sc}
\end{small}
\end{center}
\vskip -0.1in
\caption{Sample Complexity under TV Uncertainty Set}\label{table:11}
\end{table}


\begin{table}[!h]
\vskip -0.15in
\begin{center}
\begin{small}
\begin{sc}
\begin{tabular}{lc}
\toprule
Reference  $\qquad\qquad$Model-Free & Sample Size \\
\midrule
\citep{panaganti2022sample}    \hfill{ $\times$}& $\widetilde{\mathcal{O }}\brac{\frac{|\mathcal{S}|^2|\mathcal{A}|}{ (1-\gamma)^4\epsilon^2} }$  \\
\citep{yang2022toward}      \hfill{ $\times$}& $\widetilde{\mathcal{O }}\brac{\frac{|\mathcal{S}|^2|\mathcal{A}|}{(1-\gamma)^4 \epsilon^2} }$  \\
\citep{shi2023curious}      \hfill{ $\times$}& $\widetilde{\mathcal{O }}\brac{\frac{|\mathcal{S}||\mathcal{A}|}{ (1-\gamma)^4\epsilon^2} }$  \\
\citep{wang2023model}    \hfill{ $\surd$}& Asymptotic \\
Our work    \hfill{ $\surd$}& $\widetilde{\mathcal{O }}\brac{\frac{|\mathcal{S}||\mathcal{A}|}{ (1-\gamma)^5\epsilon^2} }$  \\
\bottomrule
\end{tabular}
\end{sc}
\end{small}
\end{center}
\vskip -0.1in
\caption{Sample Complexity under Chi-square Uncertainty Set}\label{table:2}
\end{table}


\begin{table}[!h]
\vskip -0.15in
\begin{center}
\begin{small}
\begin{sc}
\begin{tabular}{lc}
\toprule
Reference  $\qquad\qquad$Model-Free & Sample Size  \\
\midrule
\citep{panaganti2022sample}     \hfill{ $\times$}& $\widetilde{\mathcal{O }}\brac{\frac{|\mathcal{S}|^2|\mathcal{A}|e^{\frac{1}{1-\gamma}}}{ (1-\gamma)^4\epsilon^2} }$  \\
\citep{yang2022toward}     \hfill{ $\times$}&$\widetilde{\mathcal{O }}\brac{\frac{|\mathcal{S}|^2|\mathcal{A}|}{(1-\gamma)^4p_\wedge^2  \epsilon^2} }$ \\
\citep{wang2023model}    \hfill{ $\surd$}& Asymptotic  \\
\citep{liang2023single}   \hfill{ $\surd$}& Asymptotic  \\
\citep{liu2022distributionally}   \hfill{ $\surd$}& Asymptotic   \\
\citep{wang2023finite} \hfill{ $\surd$}& $\widetilde{\mathcal{O }}\brac{\frac{|\mathcal{S}||\mathcal{A}| }{p_\wedge^6 (1-\gamma)^5 \epsilon^2} }$ \\
\citep{wang2023sample} \hfill{ $\surd$}& $\widetilde{\mathcal{O }}\brac{\frac{|\mathcal{S}||\mathcal{A}| }{p_\wedge^3 (1-\gamma)^5 \epsilon^2} }$ \\
\citep{wang2023sample} (VR) \hfill{ $\surd$}& $\widetilde{\mathcal{O }}\brac{\frac{|\mathcal{S}||\mathcal{A}| }{p_\wedge^3 (1-\gamma)^4 \epsilon^2} }$ \\
Our work \hfill{ $\surd$}& $\widetilde{\mathcal{O }}\brac{\frac{|\mathcal{S}||\mathcal{A}|}{p_\wedge^2 (1-\gamma)^5 \epsilon^2} }$  \\
\bottomrule
\end{tabular}
\end{sc}
\end{small}
\end{center}
\vskip -0.1in
\caption{Sample Complexity under KL Uncertainty Set. VR denotes the result obtained with the variance reduction technique.}\label{table:3}
\end{table}



\subsection{Related Works}
\textbf{Model-based Methods for DR-RL}
When the environment is fully known by the learner, robust dynamic programming can be applied to obtain the optimal policy \citep{iyengar2005robust,nilim2004robustness}, which is shown to converge exponentially. When the environment is unknown, the learner can first use samples obtained to construct an empirical transition kernel and an empirical uncertainty set, and then apply robust dynamic programming on this empirical model, e.g., \citep{panaganti2022sample,yang2022toward,shi2023curious,clavier2023towards,zhou2021finite}. Although model-based methods generally are more data efficient, they require large memory space to store the data and model, becoming impractical for large-scale problems.


%Expect the literature shown in tables, some sample complexity analysis of offline DR-RL algorithms are provided in \citep{shi2022distributionally,zhou2021finite}. 


\textbf{Model-free Methods for DR-RL}
Model-free methods, which learn the optimal robust policy while gathering samples, have been investigated in the context of DR-RL. In \citep{wang2021online}, a model-free algorithm for a contamination uncertainty set is devised, subsequently extended to other uncertainty sets in \citep{liu2022distributionally,wang2023model} through the introduction and application of a multi-level Monte Carlo (MLMC) estimator. Despite exhibiting asymptotic convergence, these algorithms necessitate an infinite number of samples to construct the MLMC estimator, thus lacking a quantified sample complexity. In \citep{wang2023finite}, it is demonstrated that a finite sample complexity for the MLMC algorithm for the KL divergence uncertainty set can be attained under a restrictive assumption, limiting the applicability of their findings. Under a similar assumption, a variance reduction-based algorithm is proposed in \citep{wang2023sample} for the KL divergence model, and sample complexity is obtained.
On the other hand, \citep{liang2023single} introduces a stochastic approximation-based model-free algorithm, achieving asymptotic convergence without assurances on sample complexity. Despite all these works,  designing a model-free DR-RL algorithm with finite sample complexity under minimal assumptions remains an open question. In this paper, we present a model-free DR-RL algorithm, providing finite sample analysis under various uncertainty set models without imposing additional assumptions.

%The distributionally robust optimization (DRO) is explored richly under the supervised learning setting \citep{bertsimas2018data,blanchet2019quantifying,dai2020coindice,delage2010distributionally,duan2021risk,duchi2019variance,gao2023distributionally,levy2020large}. 




% \textbf{Model-Free DR-RL}
%For model-based DR-RL setting, \citep{liang2023single} proposal a single-trajectory model-free DR-RL algorithm with asymptotic convergence guarantee. However, this work is established based on some assumptions that have not yet been fully validated. \citep{liu2022distributionally} propose MLMC distributionally robust $Q$-learning algorithm with KL constrained uncertainty set, which expected sample complexity is infinite. \citep{wang2023model} provides the  asymptotic convergence guarantee for model-free MLMC DR-RL algorithm.   \citep{wang2023finite} provides the finite sample complexity analysis of the Model-free KL constrained DR-RL algorithm with a limitation uncertainty level, where the analysis is based on the assumption that uncertainty set is sufficiently small, i.e. radius is less than minimum support (minimum non-zero probability) of nominal distribution. Hence, there is a need for a model-free DR-RL algorithm and an associated sample complexity analysis that is adaptable to various constrained uncertainty sets and levels of uncertainty.




\section{Preliminaries and Problem Formulations}
%In this paper, we denote by $\Delta (\mathcal{S}) $ the probability simplex over the set $\mathcal{S}$. Denote by $\rho(p,q)$ the distance or divergence between any two distributions $p$ and $q$.  The total variation distance between two distributions $p$ and $q$ is denoted as $\rho_\text{TV}(p,q)$ (resp. $\chi^2$ divergence: $\rho_{\chi^2}(p,q)$; the Kullback-Leibler(KL) divergence:  $\rho_\text{KL}(p,q)$). 
% The $\chi^2$ distance between any two distributions $p$ and $q$ is denoted as ; The Kullback-Leibler divergence between any two distribution $p$ and $q$ is denoted as . 
\subsection{Markov Decision Processes}
A Markov decision process (MDP) is specified by $\mathcal{M}=(\mathcal{S}, \mathcal{A}, R,\gamma,  \mathbf{R}_0, \mathbf{P}_0)$, where $\mathcal{S}$
and $\mathcal{A}$ denote the state and action spaces, and $\gamma$ is the discount factor. $R\subset [0,r_{\max}]$ is a finite set of possible rewards; $\mathbf{P}_0=\{p_{s,a}\in \Delta(\mathcal{S}): (s,a)\in\mathcal{S}\times\mathcal{A} \}$ is the transition kernel,  where $p_{s,a}\in \Delta(\mathcal{S})$ and $\Delta(\mathcal{S})$ denotes the probability simplex over $\mathcal{S}$. $\mathbf R_0=\{\mu_{s,a}\in \Delta(R):(s,a)\in\mathcal{S}\times\mathcal{A}\} $ is the reward distribution. At each time step, the agent starts from state $s_t$ and takes an action $a_t$. The environment transitions to the next state $s_{t+1}$ according to the transition kernel $p_{s_t,a_t}$, and provides a reward signal $r(s_t,a_t)\sim \mu_{s_t,a_t}$ to the agent.  % We set $R$
%$ \min_{r}\varbrac{r\in R}\geq 0$ and  $ \max_{r}\varbrac{r\in R}\leq r_{\max}$. 
%  which is the $s,a$-rectangular

A policy $\pi: \mathcal{S}\to \Delta(\mathcal{A})$\footnote{When $\pi$ is a deterministic policy, i.e., $\pi(\cdot|s)$ is a 0-1 distribution for all $s$, we denote the deterministic action chosen at state $s$ by $\pi(s)$.} denotes the probability of taking actions under different states and represents the strategy of the agent. The value function of a policy $\pi$ is defined as the expected cumulative discounted reward the agent receives by following the policy starting from $s$:
\begin{align}
    V^\pi_{\mathbf{P}_0,\mathbf{R}_0}(s)= \mathbb E \Fbrac{\sum_{t=0}^\infty  \gamma^t r_t|s_0=s,{\mathbf{P}_0,\mathbf{R}_0}}. \nonumber
\end{align}

The $Q$-function is defined as the expected cumulative discounted reward starting from state $s$ and action $a$:
\begin{align}
    Q_{\mathbf{P}_0,\mathbf{R}_0}^\pi(s,a)=\mathbb E\Fbrac{\sum_{t=0}^\infty  \gamma^t r_t|s_0=s,a_0=a,{\mathbf{P}_0,\mathbf{R}_0}}. \nonumber
\end{align}
%The optimal value function $V^*$ is defined as:
%\begin{align}
%    V^\pi(s) := \arg\max_{\pi}\varbrac{V^\pi(s)}. \nonumber
%\end{align}
The optimal $Q$-function $Q^{*}$ is defined as
\begin{align}
    Q_{\mathbf{P}_0,\mathbf{R}_0}^*(s,a)=\max_{\pi} Q_{\mathbf{P}_0,\mathbf{R}_0}^\pi(s,a), 
\end{align}
and it satisfies the Bellman equation:
\begin{align}
    Q_{\mathbf{P}_0,\mathbf{R}_0}^*(s,a)= \mathbb E\Fbrac{r_{s,a}+ \gamma \max_{a'\in \mathcal{A}}Q_{\mathbf{P}_0,\mathbf{R}_0}^*(s',a')}. \nonumber
\end{align} 
Moreover, the optimal policy $\pi_{\mathbf{P}_0,\mathbf{R}_0}^*=\arg\max_\pi Q_{\mathbf{P}_0,\mathbf{R}_0}^\pi$ can be obtained from the optimal $Q$-function: $\pi_{\mathbf{P}_0,\mathbf{R}_0}^*(s)= \arg\max_{a\in\mathcal{A}}Q_{\mathbf{P}_0,\mathbf{R}_0}^*(s,a)$.
\subsection{Distributionally Robust MDPs}
In the formulation of distributionally robust MDPs, both transition kernel and reward distribution belong to $(s, a)$-rectangular uncertainty sets $\mathcal{P}^\rho(\sigma)=\bigotimes_{s,a}\mathcal{P}^\rho_{s,a}(\sigma)$ and $\mathcal{R}^\rho(\sigma)=\bigotimes_{s,a}\mathcal{R}^\rho_{s,a}(\sigma)$. Namely, a robust MDP can be specified as $(\mathcal{S},\mathcal{A},R,\gamma,\mathcal{R}^\rho(\sigma),\mathcal{P}^\rho(\sigma))$, where $\mathcal{P}^\rho_{s,a}(\sigma)=\{q\in\Delta(\mathcal{S}): \rho(q,p_{s,a})\leq \sigma\}$, and 
$\mathcal{R}^\rho_{s,a}(\sigma)=\{\nu\in\Delta(R): \rho(\nu,\mu_{s,a})\leq \sigma\}$. Here, $\rho$ denotes any distance or divergence between two distributions, and $\sigma$ denotes the uncertainty level. The centers of the uncertainty sets, $p_{s,a}$ and $\mu_{s,a}$, are called nominal distributions. 



%Given the statistical distance $\rho$, uncertainty level $\sigma $ and nominal distribution $p_{s,a}\in\mathbf{P}_0$ (resp. $\nu_{s,a}\in \mathbf{R}_0$), the uncertainty set is defined as $\mathcal{P}^\rho_{s,a}(\sigma):= \varbrac{q: \rho(q,p_{s,a})\leq \sigma} $ (resp. $\mathcal{R}^\rho_{s,a}(\sigma):=\varbrac{\mu: \rho(\mu,\nu_{s,a})\leq \sigma}$). The $(s,a)$-rectangular uncertainty set is defined as 
%$\mathcal{P}^\rho(\sigma)=\otimes_{s,a} \mathcal{P}^\rho_{s,a}(\sigma) \brac{\text{ resp. } \mathcal{R}^\rho(\sigma)= \otimes_{s,a}\mathcal{R}^\rho_{s,a}(\sigma)}$. Given the reward uncertainty set $ \mathcal{R}^\rho(\sigma)$ and transition kernel uncertainty set $\mathcal{P}^\rho(\sigma)$, we introduce the distributionally robust MDP $\mathcal{M}_{\text{rob}}=\brac{\mathcal{S},\mathcal{A}, R,  \gamma, \mathcal{R}^\rho(\sigma),\mathcal{P}^\rho(\sigma) }$. 

We consider three functions that can be used to define an uncertainty set: total variation, Chi-square divergence, and KL divergence. For two distributions $p,q$, the total variation between them is defined as
$\rho_{TV}(q,p):=\frac{1}{2} \mynorm{q-p}_1;$ 
the Chi-square divergence is defined as
$\rho_{\chi^2}(q,p)=\mathbb E_{p}\Fbrac{\brac{1-\frac{q(\cdot)}{p(\cdot)} }^2};$
and the KL divergence is defined as 
 $\rho_{KL}(q,p)=\mathbb E_{q}\Fbrac{\log \frac{q(\cdot)}{p(\cdot)} } . $


DR-RL aims to optimize the worst-case performance among the uncertainty sets, i.e., to optimize the robust value function:
\begin{align}
    \pi^{*,\rho(\sigma)}&=\arg\max_\pi V^{\pi, \rho(\sigma)}\nonumber\\
    &=\arg\max_\pi \inf_{q\in \mathcal{P}^\rho(\sigma), \nu\in \mathcal{R}^\rho(\sigma) } V^\pi_{q,\nu}.
\end{align}
It is also convenient to use notations of the robust state-action value functions:
\begin{align}
    Q^{\pi,\rho(\sigma)}(s,a)&=\inf_{q\in \mathcal{P}^\rho(\sigma), \nu\in \mathcal{R}^\rho(\sigma) } Q^\pi_{q,\nu}(s,a),
\end{align}
and the optimal robust policy can also be derived from it: $\pi^{*,\rho(\sigma)}=\arg\max_\pi Q^{\pi,\rho(\sigma)}$. 

The optimal robust state-action value function is hence denoted as
\begin{align}
    Q^{*,\rho(\sigma)}(s,a)=\sup_{\pi}Q^{\pi,\rho(\sigma)}(s,a),
\end{align}
and the optimal robust policy can be directly obtained from it: $\pi^{*,\rho(\sigma)}(s)=\arg\max_a  Q^{*,\rho(\sigma)}(s,a)$. 

It is further shown in \citep{iyengar2005robust} that the optimal robust $Q$-function satisfies the following robust Bellman equation:
\begin{align}\label{eq:4}
    Q^{*,\rho(\sigma)}(s,a)
    % \nonumber
    % \\&= \max_{a'\in\mathcal{A}}\inf_{p\in \mathcal{P}^\rho(\sigma), \mu\in \mathcal{R}^\rho(\sigma) }\varbrac{\mathbb E_{\mu }\Fbrac{r_{s,a}}+\gamma \mathbb E_{p}\Fbrac{Q^*(s',a') } } 
    = &\inf_{\nu\in \mathcal{R}^\rho(\sigma) }\mathbb E_{\nu }\Fbrac{r_{s,a}} \\&+\gamma\inf_{q\in \mathcal{P}^\rho(\sigma) }\mathbb E_{q}\Fbrac{\max_{a'\in \mathcal{A}}Q^{*,\rho(\sigma)}(s',a') } .\nonumber
\end{align}
%Here, we define \textit{robust reward Bellman operator} and \textit{robust transition Bellman operator} as follows:
%\begin{align}
%    &\mathcal{T}^{\rho(\sigma)} (Q)(s,a)= 
%    \nonumber\\&\inf_{\mu\in \mathcal{R}^\rho(\sigma) }\mathbb E_{\mu }\Fbrac{r_{s,a}}+ \gamma \inf_{p\in \mathcal{P}^\rho(\sigma) }\mathbb E_{p_{s,a}}\Fbrac{\max_{a'}Q(s',a') } .
%\end{align}
%The optimal robust value function and optimal robust $Q$-function satisfy that:
%\begin{align}
%    V^{*,\rho(\sigma)}(s)=\max_{a\in \mathcal{A}} Q^{*,\rho(\sigma)}(s,a).
%\end{align}
Hence DR-RL aims to find the optimal robust policy, or equivalently to solve the robust Bellman equation \cref{eq:4}. 




% where $\mathcal{TR} $ is the robust reward Bellman operator 
% \begin{align}
%     \mathcal{TR}
% \end{align}
% and $\mathcal{TP}$ is the robust transition Bellman operator 
% \begin{align}
%     \mathcal{TP}
% \end{align}





\subsection{Strong Duality}
% In general, the $\rho$-constrained uncertainty set includes infinite available distribution and can not be solved directly. Previous works \citep{iyengar2005robust,hu2013kullback} provide the duality form of minimax problems, which can be solved within finite steps and is widely used to solve DR-RL problems.
For a general uncertainty set $\mathcal{P}$, directly computing $\inf_{p\in\mathcal{P}} p^\top V$ for any vector $V$ is computationally expensive because the set contains an infinite number of feasible distributions. However, this optimization problem can be equivalently solved using its dual form, which is a convex optimization problem \citep{iyengar2005robust,hu2013kullback}. These results play a crucial role in our algorithm design; therefore, we introduce the dual forms corresponding to the three uncertainty sets as follows.

%Furthermore, combined with the duality form, we introduce the form of the worst-case distribution within the uncertainty set for different $\rho$-constrained uncertainty sets. This worst-case distribution (distribution minimize the DRO problem) plays an important role in further analysis and research, i.e. tighter bound analysis and efficient algorithm design in practice. 

%Here we define a general model to present the duality of DRO problems: random variable $x\in\mathcal{S}$ following the distribution $x\sim p$, a function $v(\cdot): \mathcal{S}\to \mathbb R^+$.($x$ resp. $s$ or $r_{s,a}$, $p$ resp. $\mu$, $v(\cdot)  :=\max_a Q(\cdot,a) $ or resp. identity function)


\begin{lemma}[Total variation distance]\citep{iyengar2005robust}
    The optimization problem: 
    \begin{align}\label{eq:v_alpha}
        & \text{minimize} \quad \mathbb E_q[v(x)]
        \nonumber\\&
        \text{subject to} \quad q\in \varbrac{\rho_{TV} 
        \brac{q,p}\leq \sigma, q\in \Delta(\mathcal{X}) },
        \end{align}
 is equivalent to
    \begin{align}\label{eq:tvu}
        &\max_{u\geq0}\bigg\{\mathbb E_{p} \Fbrac{v(x)-u(x)}
        -\frac{{\sigma}}{2}\text{Span}(v-u)\bigg\},
    \end{align} 
    where $\text{Span}(X)=\max_i X(i)-\min_i X(i)$. 
If, moreover, we set
\[ (v(x))_\alpha= \begin{cases} v(x), & v(x)\leq \alpha, \\
                     \alpha, &  v(x)>\alpha,
       \end{cases}\]
       then the optimization problem is also equivalent to
\begin{align}\label{eq:tva}
    \max_{\alpha\geq 0}\varbrac{ \mathbb E_{p}\Fbrac{(v(x))_\alpha}-\frac{{\sigma}}{2}\brac{\alpha-\min_x v(x)} }. 
\end{align} 
\label{lm:tv}
\end{lemma}

% \begin{proposition}[Lagrange Multiplier and Worst-case Distribution]
%     Set $\rho(\cdot,\cdot)=\rho_{TV}(\cdot,\cdot)$,  
% \end{proposition}

\begin{lemma}[Chi-square]\citep{iyengar2005robust}
The optimization problem:
     \begin{align}
        & \text{minimize} \quad \mathbb E_q[v(x)]
        \nonumber\\&
        \text{subject to} \quad q\in \varbrac{\rho_{\chi^2} 
        \brac{q,p}\leq \sigma, q\in \Delta(\mathcal{X}) },\nonumber
        \end{align}
 is equivalent to
    \begin{align}
        &\max_{u\geq 0}\varbrac{\mathbb E_{p}\Fbrac{v(x)-u(x) } -\sqrt{\sigma \textbf{Var}_{p}\Fbrac{ v(x)-u(x)} } }\nonumber\\
        &=\max_{\alpha\geq0} \varbrac{\mathbb E_{p}\Fbrac{(v(x))_\alpha}-\sqrt{\sigma \textbf{Var}_{p}\Fbrac{ (v(x))_\alpha}  } }. 
    \end{align}\label{lm:chi}
\end{lemma}

 

\begin{lemma}[KL divergence]\citep{iyengar2005robust}
    The optimization problem 
     \begin{align}
        & \text{minimize} \quad \mathbb E_q[v(x)]
        \nonumber\\&
        \text{subject to} \quad q\in \varbrac{\rho_{KL} 
        \brac{q,p}\leq \sigma, q\in \Delta(\mathcal{X}) },\nonumber
        \end{align} is equivalent to
    \begin{align}
        \max_{\alpha\geq0} \varbrac{-\alpha \log\brac{\mathbb E_{p} \Fbrac{\exp\brac{-\frac{v(x)}{\alpha}} } }-\alpha \sigma }. 
    \end{align}
\label{lm:kl}
\end{lemma}

\begin{remark}
For convenience, we denote the objective functions in the dual forms by $f^{\rho(\sigma)}(p, \alpha,v)$, i.e., 
    \begin{align}
        &f^{\rho_{TV}(\sigma)}(p, \alpha,v)=\mathbb E_{p}\Fbrac{(v(x))_\alpha}-\frac{{\sigma}}{2}\brac{\alpha-\min_x v(x)} ;\nonumber
        \\&f^{\rho_{\chi^2}(\sigma)}(p, \alpha,v)=\mathbb E_{p}\Fbrac{(v(x))_\alpha}-\sqrt{\sigma \textbf{Var}_{p}\Fbrac{ (v(x))_\alpha}  } ;
        \nonumber\\&
        f^{\rho_{KL}(\sigma)}(p, \alpha,v)=-\alpha \log\brac{\mathbb E_{p} \Fbrac{\exp\brac{-\frac{v(x)}{\alpha}} } }-\alpha \sigma . \nonumber
    \end{align}
We note that these objective functions correspond to the second term of \eqref{eq:4}; for the first term, we similarly denote the dual-form objective functions by $g^{\rho(\sigma)}(\mu,\alpha)$, whose specific definitions can be found in \Cref{sec:notation}.
    
    
\end{remark}

\section{Model-free Threshold-MLMC Algorithm}
In this section, we present our design of the T-MLMC algorithm. Our algorithm assumes a generative model, which can generate i.i.d. samples following the nominal kernels for an arbitrary state-action pair $(s,a)\in \mathcal{S}\times \mathcal{A}$:
\begin{align}
    r_{s,a}^i\overset{i.i.d}{\sim} \mu_{s,a}, s_{s,a}^i\overset{i.i.d}{\sim} p_{s,a}, i=1,...,N.
\end{align}





% \textbf{Model-Free algorithm: }
In robust dynamic programming, one needs to update the estimation of the robust value function by applying the robust Bellman operator:
\begin{align*}
    Q(s,a)&\leftarrow {\mathcal{T}}^{\rho(\sigma)}(Q) (s,a)\\
    &=\inf_{\nu\in \mathcal{R}^\rho(\sigma) }\mathbb E_{\nu }\Fbrac{r_{s,a}} +\gamma\inf_{q\in \mathcal{P}^\rho(\sigma) }\mathbb E_{q}\Fbrac{\max_{a'\in \mathcal{A}}Q(s',a') },\nonumber
\end{align*}
which is shown to converge to the optimal robust value function. In our setting, we need to estimate the two worst-case terms with the samples from the nominal distributions. However, due to the distribution shift between the nominal kernel and the worst-case kernel, estimating them is challenging. One potential approach is to first obtain an empirical nominal distribution $\hat{p}$, and construct an uncertainty set centered on it using the same function $\rho$ and uncertainty radius $\sigma$: $\hat{\mathcal{P}}=\{q: \rho(q,\hat{p})\leq \sigma \}$. However, unlike the non-robust case, where $\hat{p}^\top V$ is an unbiased estimator of the expectation $\mathbb{E}_p[V]$, the term $\min_{p\in\mathcal{P}}(p^\top V)$ is non-linear in the nominal kernel, resulting in $\min_{p\in\hat{\mathcal{P}}}(p^\top V)$ being a biased empirical estimator \citep{wang2023model}. 

To address this issue, a multi-level Monte Carlo approach is proposed in \citep{liu2022distributionally,wang2023model}, which is inspired by the MLMC method in statistical inference from, e.g., \citep{blanchet2015unbiased,blanchet2019unbiased,wang2022unbiased}.  Specifically, MLMC first randomly generates a level number $N$ following a geometric distribution $\text{GEO}(\psi)$, and then generates $2^{N+1}$ samples. Using these samples, an estimated operator $\widetilde{\mathcal{T}}_N$ of level $N$ is further constructed, and it is shown that $\mathbb{E}_{N}[\widetilde{\mathcal{T}}_N(V)]=\min_{p\in\mathcal{P}}(p^\top V)$, i.e., the estimator is unbiased. Hence by replacing the robust Bellman operator with the MLMC estimator, we obtain an unbiased updating rule and the algorithm is shown to converge to the optimal policy \citep{liu2022distributionally,wang2023model}. 

Although the MLMC algorithms are shown to asymptotically converge in these works, the parameter $\psi$ of the geometric distribution is set to be $\psi<\frac{1}{2}$, which results in an infinite expected number of samples required to construct the MLMC estimator ($\mathbb{E}_{N\sim\text{GEO}(\psi)}[2^{N+1}]=\infty$). To address this issue, we modify the MLMC by designing a threshold on the level number, to avoid excessive sample requirements when the level number is large. 

%ensure the number of samples required does not exceed the threshold and is thus finite. %Our construction and algorithm are presented as follows. 



%To find out the optimal robust policy, we introduce the Multi-level Monte Carlo distributionally robust value iteration algorithm. At the beginning, we need to choose the uncertainty level $\sigma$ and constrained statistical distance $\rho(\cdot,\cdot)$. Given an initialization $Q$-table  $ \widehat Q^{\rho(\sigma)}=0$, the value function and policy can be obtained via $Q$-table shown in Line $5$ and $6$ in \cref{alg:example}.

Specifically, we similarly set a fixed parameter $\psi$, and sample two level numbers $N_1,N_2\sim\text{GEO}(\psi)$. Instead of directly sampling $2^{N_i+1}$ samples, we add a threshold $N_{\max}$ when generating samples. If $N_i\leq N_{\max}$, then we generate $1+2^{N_i+1}$ i.i.d. samples; and if $N_i>N_{\max}$, we only generate $1$ sample instead. Our design ensures that the number of samples required at each time step is at most $1+2^{N_{\max}+1}$ and hence finite. Specifically, if $N_1\leq N_{\max}$, we independently draw $2^{N_1+1}+1$ samples $r_{s,a,i}\sim \mu_{s,a}, i=0,1,...,2^{N_1+1}$; and when $N_1>N_{\max}$, we draw one sample $r_{s,a,0}\sim \mu_{s,a}$. Similarly, if $N_2\leq N_{\max}$, we independently draw $2^{N_2+1}+1$ samples $s'_{s,a,i}\sim p_{s,a}, i=0,1,...,2^{N_2+1}$; and when $N_2>N_{\max}$, we only draw one sample $s'_{s,a,0}\sim p_{s,a}$. 


  
% Then, 
% Then, get the threshold indicator $\quad\xi_1=\mathbf{1}_{(N_1\leq N_{\max})} ; \quad \xi_2=\mathbf{1}_{(N_2\leq N_{\max})} $. 


% $N_1=\min\varbrac{N_1, (N_{\max}+1)\mathbb{I}_{(N_1\leq N_{\max})}-1},N_2=\min\varbrac{N_2, (N_{\max}+1)\mathbb{I}_{(N_1\leq N_{\max})}-1} $, which 

We then combine this scheme with the MLMC estimator to construct our estimation of the worst-case value as follows. 

For the worst-case reward term, we set
\begin{align}
    \widehat r^{\rho(\sigma)}(s,a):&= r_{s,a,0}+\frac{\delta^{r,\rho(\sigma)}_{s,a,N_1}}{P_{N_1}},\label{eq:hatr}
\end{align}
where $P_{N_1}=\psi(1-\psi)^{N_1}$ and
\begin{align}
    \delta^{r,\rho(\sigma)}_{s,a,N_1}:&=\sup_{\alpha\geq 0}\varbrac{g^{\rho(\sigma)}(\widehat \mu_{s,a,2^{N_1+1}},\alpha )} 
    \nonumber \\& -\frac{1}{2} \sup_{\alpha\geq 0}\varbrac{g^{\rho(\sigma)}(\widehat \mu^E_{s,a,2^{N_1}},\alpha)}
    \nonumber \\&-\frac{1}{2}\sup_{\alpha\geq 0}\varbrac{g^{\rho(\sigma)}(\widehat \mu^O_{s,a,2^{N_1}},\alpha)}\nonumber
\end{align}
when $N_1\leq N_{\max}$, and
 when $N_1>N_{\max} $, $\delta^{r,\rho(\sigma)}_{s,a,N_1}=0 $.
% where $\boldsymbol{id}(\cdot)$ is identity function. 
% $r_{s,a}$ is random variable i.e. $r_{s,a}\sim \mu_{s,a}$ and $\boldsymbol{id}(r_{s,a})=r_{s,a}$ is identity function of random variable.
Here, $\widehat \mu_{s,a,2^{N_1+1}} $ denotes the empirical reward distribution obtained from the $1+2^{N_1+1}$ samples $\{ r_{s,a,i}: i=0,1,...,2^{N_1+1}\}$, and $\widehat \mu^O_{s,a,2^{N_1}}$
 and $\widehat \mu_{s,a,2^{N_1}}^E$ denote the empirical reward distributions estimated from the samples with odd and even indexes, respectively.

Similarly, for the worst-case value function term, we set 
\begin{align}
     \widehat{v}^{\rho(\sigma)}(Q(s,a)) :&=V(s'_{s,a,0})+\frac{\delta^{\rho(\sigma)}_{s,a,N_2}(Q) }{P_{N_2}},\label{eq:hatq}
\end{align}
% $\delta^{Q,\rho(\sigma)}_{s,a,N_2} $ are defined as:
where $V(s)=\max_{a'}Q(s,a') $ and $P_{N_2}=\psi(1-\psi)^{N_2}$. 
When $N_2>N_{\max} $, set $\delta^{\rho(\sigma)}_{s,a,N_2}(Q)=0 $; Otherwise when $N_2\leq N_{\max}$, $\delta^{\rho(\sigma)}_{s,a,N_2}(Q) $ is defined as: 
\begin{align}\label{eq:delta}
     \delta^{\rho(\sigma)}_{s,a,N_2}(Q):&=\sup_{\alpha\geq 0}\varbrac{f^{\rho(\sigma)}(\widehat p_{s,a,2^{N_2+1}},\alpha,V)} 
    \nonumber \\& -\frac{1}{2} \sup_{\alpha\geq 0}\varbrac{f^{\rho(\sigma)}(\widehat p^E_{s,a,2^{N_2}},\alpha,V)}
    \nonumber \\&-\frac{1}{2}\sup_{\alpha\geq 0}\varbrac{f^{\rho(\sigma)}(\widehat p^O_{s,a,2^{N_2}},\alpha,V )},
\end{align}  
where we similarly denote the empirical transition kernel obtained from all samples $\{s'_{s,a,i}, i=0,1,...,2^{N_2+1}\}$, samples with odd indexes, and samples with even indexes by $\widehat p_{s,a,2^{N_2+1}} $, $\widehat p^O_{s,a,2^{N_2}}$ and $\widehat p^E_{s,a,2^{N_2}}$.   

 










% and $\xi_2 2^{N_2+1}+1$ samples 
% $s'_{s,a,j}\sim p_{s,a},j=0,1,...,2^{N_2+1}$.
Combining the two terms above, we obtain the estimated robust Bellman operator through our T-MLMC framework:
\[\widehat {\mathcal{T}}^{\rho(\sigma)}_{ N_{\max}} (Q)(s,a)=\widehat  r^{\rho(\sigma)}(s,a)+ \gamma \widehat{v}^{\rho(\sigma)}(Q(s,a)). \]


With this estimator, we propose our model-free T-MLMC algorithm as in \Cref{alg:example}.

%perform robust synchronous update of $Q$-table shown in Line 17 in \Cref{alg:example}. After $T$ iterations, output robust $Q$-table and robust policy $\pi_T(\cdot)$. 








% Denote by $\hat p_n$ the empirical distribution on $n$ samples following the nominal distribution $p$, i.e. 
% \begin{align}
%     \mathbb E_{\hat p_n} \Fbrac{v(x)}= \frac{1}{n}\sum_{i=0}^{n-1}v(x_i).
% \end{align}

% Furthermore, we denote by $\hat p^O_n $
 % and $\hat p^E_n$ the empirical distribution deduced by the odd and even samples in $\hat p_n$. 
 % Then, we define the estimated robust Bellman operator:
 % \begin{align}
 %     & \widehat {\mathcal{T}}^{\rho(\sigma)} (Q)(s,a)=
 %     \nonumber\\&
 %     \inf_{\mu_{s,a}\in \widehat{\mathcal{R}}^\rho(\sigma) }\mathbb E_{\mu_{s,a} }\Fbrac{r_{s,a}}+ \gamma \inf_{p_{s,a}\in \widehat{\mathcal{P}}^\rho(\sigma) }\mathbb E_{p_{s,a}}\Fbrac{\max_{a'}Q(s',a') },\nonumber
 % \end{align}
 
 

\begin{algorithm}[tb]
   \caption{T-MLMC Algorithm}
   \label{alg:example}
\begin{algorithmic}
   \State {\bfseries Input:} Parameter $\psi=\frac{1}{2}$, threshold $N_{\max}$, stepsizes $\beta_t$, iteration number $T$
   % \REPEAT
   \State {\bfseries Initialize: }$\widehat Q^{\rho(\sigma)}_{0}=0$ 
   % \State  $noChange = true$.
   \For{$t=0$ {\bfseries to} $T-1$}
   \For{ every $s\in \mathcal{S}$}
   \State Set $\widehat V^{\rho(\sigma)}_{t}(s)= \max_{a}\widehat 
   Q^{\rho(\sigma)}_{t}(s,a) $
   \State Set $\pi_t(s)= \arg\max_a \widehat 
   Q^{\rho(\sigma)}_{t}(s,a) $
   \EndFor
   \For{every $(s,a)\in \mathcal{S}\times\mathcal{A}$}
   \State Independently sample $N_1,N_2\sim \text{GEO}(\psi)$
   \State Compute total sample sizes:
   \State $\quad\mathcal{N}_1=1+ 2^{N_1+1}\mathbf{1}_{(N_1\leq N_{\max})} $
   \State $ \quad \mathcal{N}_2=1+ 2^{N_2+1}\mathbf{1}_{(N_2\leq N_{\max})} $
   \State Independently draw $\mathcal{N}_1$ samples  $ r_{s,a,i}\sim \mu_{s,a} $
   \State Compute $\widehat r^{\rho(\sigma)}(s,a) $ by \Cref{eq:hatr}
   \State Independently draw $\mathcal{N}_2$ samples  $s'_{s,a,i}\sim p_{s,a} $
   \State Compute $\widehat v^{\rho(\sigma)}(\widehat Q^{\rho(\sigma)}_{t}(s,a)) $ by \Cref{eq:hatq}
   \State Synchronously update the $Q$-table:
   $\widehat Q^{\rho(\sigma)}_{t+1}(s,a)= (1-\beta_t) \widehat Q^{\rho(\sigma)}_{t}(s,a)+\beta_t \widehat {\mathcal{T}}^{\rho(\sigma)}_{ N_{\max}}(\widehat Q^{\rho(\sigma)}_{t})(s,a) $
   \EndFor
   % \IF{$x_i > x_{i+1}$}
   % \ENDIF
   \EndFor
   % \UNTIL{$noChange$ is $true$}
   \State {\bfseries Output:} $\widehat Q^{\rho(\sigma)}_{T}(s,a) $
\end{algorithmic}
\end{algorithm}

Note that due to the threshold $N_{\max}$, the resulting T-MLMC estimator becomes biased. However, as we will show in the next section, the bias can be bounded and inversely depends on $N_{\max}$. That is,  with the increase of $N_{\max}$, the bias term tends to $0$, and we recover the MLMC estimator from the T-MLMC estimator, which however will result in increasing sample complexity. We show in the following section that by carefully designing the threshold $N_{\max}$ to balance the bias-complexity trade-off, our T-MLMC algorithm converges to the optimal robust policy with finite sample complexity. 



\section{Sample Complexity}
In this section, we present the sample complexity results of our T-MLMC algorithms under different uncertainty sets. 

As discussed above, the estimator $\widehat {\mathcal{T}}^{\rho(\sigma)}_{ N_{\max}}$ we constructed is a biased estimation of the robust Bellman operator. However, in the following result, we show that the operator reduces to the vanilla MLMC estimator and the bias diminishes if $N_{\max}\to\infty$, and hence can be controlled by setting a larger threshold. 
\begin{theorem}\label{thm:tv1}
    For any fixed $Q\in \mathbb R^{|\mathcal{S}||\mathcal{A}|}, s\in \mathcal{S}, a\in \mathcal{A} $, for three uncertainty sets we considered, the estimation bias can be bounded as:
    \begin{align}
        &\sup_{s,a}\lbrac{\mathbb E\Fbrac{\widehat {\mathcal{T}}_{N_{\max}}^{\rho(\sigma)} (Q)(s,a) } - {\mathcal{T}}^{\rho(\sigma)} (Q)(s,a)}\\
        &\leq \widetilde{\mathcal{O }}\brac{{N_{\max}}2^{-\frac{N_{\max}}{2}} };\nonumber
        % \brac{r_{\max}+\frac{r_{\max}}{1-\gamma}}
    \end{align}
        The variance can be bounded as:
    \begin{align}
        \text{Var}\brac{\widehat {\mathcal{T}}^{\rho(\sigma)}_{N_{\max}} (Q)(s,a)  }\leq \widetilde{\mathcal{O }}\brac{{N_{\max}}}.
    \end{align}
    % And the infinite norm of robust Bellman operator can be bounded as:
    % \begin{align}
    %     \E\Fbrac{\norminf{\widehat {\mathcal{T}}^{\rho(\sigma)}_{N_{\max}} (Q)(s,a) }^2 }\leq \widetilde{\mathcal{O }}\brac{{N_{\max}}}.
    % \end{align}
\end{theorem}

The result hence suggests setting a larger value of $N_{\max}$ to diminish the bias. However, we note that the number of samples required for constructing the estimator and the overall sample complexity increase as $N_{\max}\to\infty$. To balance the trade-off between the bias and sample complexity, we choose a suitable value of $N_{\max}$ and present our complexity results in the following sections. 

%We then derive the sample complexity for our T-MLMC algorithm, and 

%  Next, we first introduce some required proposition for further analysis. 

% \begin{proposition}\label{prop:constract}
%     Given statistical distance $\rho(\cdot,\cdot)$ and uncertainty level $\sigma$, robust Bellman operator $ \mathcal{T}^{\rho(\sigma)}$is $\gamma$-\textit{contraction} w.r.t. the infinity norm.
% \end{proposition}


% \begin{definition}
%     Given the empirical frequency with $n$ samples, i.e.  $x_i, i=0,1,...,n$, 
% \end{definition}

\subsection{Total Variation distance}
In this part, we provide the sample complexity analysis for the total variation uncertainty set.



Utilizing results in \cref{thm:tv1}, we obtain the following sample complexity of our T-MLMC algorithm under the TV uncertainty set. 
\begin{theorem}[Sample Complexity with TV Distance]\label{thm2:tv}
 Set $\psi=\frac{1}{2}$, $N_{\max}=\frac{2\log T}{\log 2}$ and set the stepsize as $\beta_t=\frac{2\log T}{(1-\gamma)T}. $
Then the output from \cref{alg:example} satisfies that:
 \begin{align}
    \mathbb E &\Fbrac{\mynorm{\widehat Q_T^{\rho_{TV}(\sigma)}-Q^{*\rho_{TV}(\sigma)}}_\infty^2}\nonumber
    %\\& \leq \frac{c_0}{(1-\gamma)^2}\brac{1-\frac{\alpha(1-\gamma)}{2}}^T+ c_1 \alpha\frac{\log (|\mathcal{S}||\mathcal{A}|)}{ (1-\gamma)^4}\nonumber
   \leq\widetilde{\mathcal{O }}\brac{\frac{1}{  (1-\gamma)^5 T}}.
\end{align}
To obtain $\epsilon$-optimality, i.e., 
\begin{align}
    \mathbb E \Fbrac{\mynorm{\widehat Q_T^{\rho_{TV}(\sigma)}-Q^{*,\rho_{TV}(\sigma)}}_\infty^2}\leq \epsilon^2,\nonumber
\end{align}
the expected sample complexity $N^{\rho_{TV}(\sigma)}(\epsilon)$ is
 \begin{align}
     N^{\rho_{TV}(\sigma)}(\epsilon) = |\mathcal{S}||\mathcal{A}|N_{\max} T\leq \widetilde{\mathcal{O }}\brac{\frac{|\mathcal{S}||\mathcal{A}|}{ (1-\gamma)^5 \epsilon^2 }}.\nonumber
 \end{align}
\end{theorem}
Our result presents the first finite sample complexity for the model-free DR-RL algorithm for the total variation uncertainty set, indicating the effectiveness and efficiency of our T-MLMC algorithm. Compared to model-based DR-RL algorithms \citep{yang2022toward, panaganti2022sample, shi2023curious,clavier2023towards}, our algorithm results in a sample complexity with a higher dependence on $(1-\gamma)$. This aligns with findings from the non-robust setting \citep{li2020sample} that the vanilla model-free algorithms (without techniques including variance reduction) generally have larger sample complexity. Our result is also expected to be improved to align with the model-based complexity through standard techniques like variance reduction.

% Our result presents the first finite sample complexity for the model-free DR-RL algorithm for the total variation uncertainty set, indicating the effectiveness and efficiency of our T-MLMC algorithm. Compared to model-based DR-RL algorithms \citep{yang2022toward, panaganti2022sample, shi2023curious,clavier2023towards}, our model-free algorithm has a larger sample complexity in terms of $(1-\gamma)$, but the same dependence on parameters, which is also the case in the non-robust setting \citep{li2020sample}. We also note that for model-free algorithms or general stochastic approximation algorithms \citep{li2020sample,li2021q}, the tightest dependence on $(1-\gamma)$ is also $\widetilde{\mathcal{O }}((1-\gamma)^{-5})$, which implies the tightness of our complexity result. 

%Generally, the model-free algorithm has larger sample size but exhibits superior performance with wider application range.

\subsection{Chi-square Divergence}
We then present our results for DR-RL with a Chi-square divergence uncertainty set. 
%In this part, we provide the statistical properties and sample complexity analysis in the case where the uncertainty set is constrained by $\chi^2$ distance. 

% \begin{proposition}[Lagrange Multiplier and Worst-case Distribution]
%     Set $\rho(\cdot,\cdot)=\rho_{\chi^2}(\cdot,\cdot)$, 
% \end{proposition}

\begin{theorem}[Sample Complexity with $\chi^2$ Distance]\label{thm:chi2}
 Set $N_{\max}=\frac{2\log T}{\log 2}$ and the stepsize as $\beta_t=\frac{2\log T}{(1-\gamma)T}$. Then the output of \cref{alg:example} satisfies that:
 \begin{align}
    \mathbb E &\Fbrac{\mynorm{\widehat Q_T^{\rho_{\chi^2}(\sigma)}-Q^{*\rho_{\chi^2}(\sigma)}}_\infty^2}\nonumber
   %\\& \leq \frac{c_0}{(1-\gamma)^2}\brac{1-\frac{\alpha(1-\gamma)}{2}}^T+ c_1 \alpha\frac{\log (|\mathcal{S}||\mathcal{A}|)}{ (1-\gamma)^4}\nonumber
     \leq\widetilde{\mathcal{O }}\brac{\frac{1}{  (1-\gamma)^5 T}}.
\end{align}
To ensure
\begin{align}
    \mathbb E \Fbrac{\mynorm{\widehat Q_T^{\rho_{\chi^2}(\sigma)}-Q^{*\rho_{\chi^2}(\sigma)}}_\infty^2}\leq \epsilon^2,\nonumber
\end{align}
the expected total sample complexity $N^{\rho_{\chi^2}(\sigma)}(\epsilon)$ is,
 \begin{align}
     N^{\rho_{\chi^2}(\sigma)}(\epsilon) = |\mathcal{S}||\mathcal{A}|N_{\max} T\leq \widetilde{\mathcal{O }}\brac{\frac{|\mathcal{S}||\mathcal{A}|}{  (1-\gamma)^5 \epsilon^2 }}.\nonumber
 \end{align}
\end{theorem}
Our result implies that our T-MLMC algorithm is the first model-free algorithm for DR-RL under the Chi-square divergence uncertainty set. Similarly, compared to the model-based methods, our complexity presents an additional $\mathcal{O}((1-\gamma)^{-1})$-order dependence. 
 %For model-free algorithms or general stochastic approximation algorithms \citep{li2020sample,li2021q}, the tightest dependence on $(1-\gamma)$ is also $\mathcal{O}((1-\gamma)^{-5})$, which implies the tightness of our complexity result. 
% Our result implies that our T-MLMC algorithm is the first model-free algorithm for DR-RL under the Chi-square divergence uncertainty set. Compared to the model-based methods, our complexity presents an additional $\mathcal{O}((1-\gamma)^{-1})$-order dependence, which is common in model-free algorithms. 

%When compared to model-based DR-RL algorithms cited in \citep{yang2022toward, panaganti2022sample, shi2023curious}, our model-free algorithm has a higher sample complexity, exceeding the optimal result with $\mathcal{O}((1-\gamma)^{-1})$. Generally, the model-free algorithm has larger sample size but exhibits superior performance with wider application range.
% Our work provide the sample complexity of model-free DR-RL algorithm firstly for $\chi^2$ uncertainty set (see details in \Cref{table:2}). Compared with the model-based DR-RL algorithms \citep{yang2022toward,panaganti2022sample,shi2023curious}, the sample complexity of our model-free algorithm is larger than the best result with $\mathcal{O}((1-\gamma)^{-1})$. In general, the model-free algorithm has larger sample size but better testing performance.
\subsection{KL Divergence}
We then present our results for the KL divergence uncertainty set in this section. 



\begin{theorem}[Sample Complexity with KL Distance]\label{thm:kl}
Set $\psi=\frac{1}{2}$, the threshold
\[ N_{\max}=\max\varbrac{\frac{2\log T}{\log 2},\frac{\log(1+p^2_\wedge\log(2|\mathcal{S}|)\log T )}{\log 2}} ,\]
 and the stepsize as $\beta_t=\frac{2\log T}{(1-\gamma)T}$. Then the output of \cref{alg:example} satisfies that:
 \begin{align}
    \mathbb E &\Fbrac{\mynorm{\widehat Q_T^{\rho_{KL}(\sigma)}-Q^{*\rho_{KL}(\sigma)}}_\infty^2}\nonumber
   % \\& \leq \frac{c_0}{(1-\gamma)^2}\brac{1-\frac{\alpha(1-\gamma)}{2}}^T+ c_1 \alpha\frac{\log (|\mathcal{S}||\mathcal{A}|)}{p_\wedge^2 (1-\gamma)^4}\nonumber
    \leq\widetilde{\mathcal{O }}\brac{\frac{1}{ p_\wedge^2 (1-\gamma)^5 T}}.
\end{align}
To ensure 
\begin{align}
    \mathbb E \Fbrac{\mynorm{\widehat Q_T^{\rho_{KL}(\sigma)}-Q^{*\rho_{KL}(\sigma)}}_\infty^2}\leq \epsilon^2,\nonumber
\end{align}
the expected total sample complexity $N^{\rho_{KL}(\sigma)}(\epsilon)$ is
 \begin{align}
     N^{\rho_{KL}(\sigma)}(\epsilon) = |\mathcal{S}||\mathcal{A}|N_{\max} T\leq \widetilde{\mathcal{O }}\brac{\frac{|\mathcal{S}||\mathcal{A}|}{ p_\wedge^2 (1-\gamma)^5 \epsilon^2 }}.\nonumber
 \end{align}
\end{theorem}
Our result implies that our T-MLMC algorithm also solves the DR-RL problem for KL divergence uncertainty sets effectively. Compared to other model-based methods \citep{shi2022distributionally}, our results are $\mathcal{O}((1-\gamma)^{-1})$-order larger. We also note that there are several previous works on the sample complexity of model-free DR-RL approaches for the KL divergence model  \citep{wang2023sample,wang2023finite}, and we provide a discussion on the comparison of their works with ours as follows. 


In both previous works, an assumption is made regarding the size of the uncertainty level $\sigma$, specifically, $p_\wedge\geq \mathcal{O}\left( 1-e^{-\sigma}\right)$ assuming the uncertainty set cannot be too large. This assumption significantly limits the applicability of their results, as in many scenarios, the uncertainty set must be designed relatively large to encompass a broader range of environments, particularly when the nominal environment is a low-fidelity model of the true environment. In contrast, our approach does not rely on such an assumption and can be applied to any uncertainty set.

On the other hand, our sample complexity result is less than those in \citep{wang2023finite} and the first complexity in \citep{wang2023sample}. In \citep{wang2023finite}, the sample complexity of the vanilla MLMC DR-RL algorithm is $\widetilde{\mathcal{O }}\left(\frac{|\mathcal{S}||\mathcal{A}|}{ p_\wedge^6 (1-\gamma)^5 \epsilon^2 }\right)$. Our result improves upon this by $\mathcal{O}(p_\wedge^{-4})$, and we attribute this improvement to the designing of the threshold. In \citep{wang2023sample}, a mini-batch type model-free DR-RL algorithm is introduced, with a demonstrated sample complexity of $\widetilde{\mathcal{O }}\left(\frac{|\mathcal{S}||\mathcal{A}| }{p_\wedge^3 (1-\gamma)^5 \epsilon^2} \right)$. Further enhancement is achieved through the use of variance reduction (VR) technique, bringing the complexity down to $\widetilde{\mathcal{O }}\left(\frac{|\mathcal{S}||\mathcal{A}| }{p_\wedge^3 (1-\gamma)^4 \epsilon^2} \right)$. Notably, our result outperforms their initial vanilla algorithm by an order of $\mathcal{O}(p_\wedge^{-1})$. While the complexity with VR technique in \citep{wang2023sample} exhibits a superior dependence on $1-\gamma$, it fares worse concerning $p_\wedge$. This enhancement in $1-\gamma$ can be attributed to the utilization of the VR technique, consistent with previous findings \citep{li2020sample}. We hence also anticipate further improvement in our complexity results through the application of VR technique, a direction left for future investigation. Consequently, our algorithm achieves superior sample complexity compared to previous vanilla model-free algorithms and is anticipated to surpass the results in \citep{wang2023sample} with the VR technique.





%Compared previous model-free algorithm, our algorithm can be applied without the limitation of uncertainty level.


%Our algorithm reach $\epsilon$-accurate optimal robust policy with sample complexity $\widetilde{\mathcal{O }}\brac{\frac{|\mathcal{S}||\mathcal{A}|}{ p_\wedge^2 (1-\gamma)^5 \epsilon^2 }}$. The sample complexity analysis of model-free DR-RL algorithm is provided in \citep{wang2023finite} with order $\widetilde{\mathcal{O }}\brac{\frac{|\mathcal{S}||\mathcal{A}|}{ p_\wedge^6 (1-\gamma)^5 \epsilon^2 }}$. However, this sample size is built on the assumption that   When compared to model-based DR-RL algorithms cited in \citep{yang2022toward, panaganti2022sample}, our model-free algorithm has a higher sample complexity, exceeding the optimal result with $\widetilde{\mathcal{O }}((1-\gamma)^{-1})$. Generally, the model-free algorithm has larger sample size but exhibits superior performance with wider application range.
\section{Proof Sketch}
In this section, we briefly discuss the proof sketch for our results under the TV uncertainty set. The proofs for the other two uncertainty sets can be similarly derived. For convenience, we only discuss the proof regarding the uncertainty set of the transition kernels; the proof regarding the reward uncertainty can be similarly obtained. 



Our proof can be divided into two main parts: We first conduct a sample complexity analysis to establish the convergence of \Cref{alg:example} to the fixed point of the T-MLMC estimator; %This involves determining the number of samples required for the algorithm to converge to a stable solution within an acceptable error margin.
then we characterize the disparity between this fixed point and the optimal robust value function. Combining the two parts, we quantify the sample complexity of our T-MLMC algorithm converging to a near approximation of the optimal robust value function. Specifically, we decompose the error as
\begin{align}\label{eqeq20}
    {\mynorm{\widehat Q_T^{\rho(\sigma)}-Q^{*\rho(\sigma)}}_\infty^2}&
  \leq 2 {\mynorm{\widehat Q_T^{\rho(\sigma)}-\widehat Q^{*\rho(\sigma)}}_\infty^2 }\nonumber\\&\quad+ 2{\mynorm{\widehat Q^{*\rho(\sigma)}-Q^{*\rho(\sigma)}}_\infty^2 },
\end{align}
where $\widehat Q^{*\rho(\sigma)}$ denotes the fixed point of the expected T-MLMC estimator: $\hatT(Q)=\mathbb{E}[\widehat {\mathcal{T}}^{\rho(\sigma)}_{ N_{\max}}(Q)]$. The two steps in our proof correspond to bounding the two terms in \eqref{eqeq20}. 


For the first term in \eqref{eqeq20}, we adapt the stochastic approximation scheme. From the definition, our T-MLMC operator is an unbiased estimator of $\hatT(Q)$, and it suffices to show the finite variance to ensure the asymptotic convergence to the fixed point $\widehat Q^{*\rho(\sigma)}$. Using the concrete construction of T-MLMC operator: 
\begin{align}
     \widehat{v}^{\rho(\sigma)}(Q(s,a)) &\coloneqq V(s'_{s,a,0})+\frac{\delta^{\rho(\sigma)}_{s,a,N_2}(Q) }{P_{N_2}}
\end{align}
and the definition of $\delta^{\rho(\sigma)}_{s,a,N_2}$, we directly calculate its variance. We show that the variance is finite and can be bounded by $\widetilde{\mathcal{O }}(N_{\max})$, as in \Cref{thm:tv1}. Hence, according to stochastic approximation theory \citep{borkar2009stochastic}, \Cref{alg:example} converges asymptotically to the fixed point $ \widehat Q^{*\rho(\sigma)}$. We then adapt the analysis in stochastic approximation \citep{chen2022finite} to obtain the finite-time error bound on the convergence of \Cref{alg:example} to $\widehat Q^{*\rho(\sigma)}$, i.e., the first term in \eqref{eqeq20}. 



For the second term in \eqref{eqeq20}, we show that the approximation error between $ \widehat Q^{*\rho(\sigma)}$ and the optimal robust value function can be bounded by considering the disparity between the robust Bellman operator and our T-MLMC operator, i.e., the bias:
\begin{align}
    &\left\|\widehat Q^{*\rho(\sigma)}- Q^{*\rho(\sigma)}\right\|_\infty\nonumber\\&\leq \frac{1}{1-\gamma} \left\|\hatT\brac{ Q^{*\rho(\sigma)}}-{\mathcal{T}}^{\rho(\sigma)}\brac{Q^{*\rho(\sigma)}}\right\|_\infty. 
\end{align} 
We note that when the threshold is not met, the T-MLMC operator is an unbiased estimator of ${\mathcal{T}}^{\rho(\sigma)}$, in which case we bound the error using concentration inequalities as in conventional MLMC approaches; when the threshold is met, although the error between the two operators is large, we can set the threshold $N_{\max}$ large enough that the probability of $\text{Geo}(\psi)> N_{\max}$ is small, so that this case contributes only a small error due to its low probability. Combining the two cases implies a tight bound on the bias, as stated in the first part of \Cref{thm:tv1}, and further quantifies the approximation error introduced by our T-MLMC design. 

Finally, combining the two parts, we derive the sample complexity for \Cref{alg:example} to converge to an approximation of the optimal robust value function with a quantified approximation error. By setting the value of the threshold, we hence obtain the final sample complexity results. 

 

\section{Conclusion}
In this paper, we introduce a novel model-free T-MLMC algorithm tailored for finding the optimal robust policy in the DR-RL problem. Our algorithm strikes a delicate balance between convergence guarantees and the expected total sample size, ensuring convergence within a finite sample size. We further conduct sample complexity analyses for our algorithm under three distinct uncertainty sets: total variation, Chi-square divergence, and KL divergence. Notably, our results mark the first complexity analyses for model-free DR-RL methods under the total variation and Chi-square divergence uncertainty sets, while also enhancing the complexity bounds and applicability of prior results for the KL divergence model. Our results achieve the tightest complexity bounds in the realm of model-free DR-RL methods, achieving state-of-the-art results under minimal assumptions.

\section*{Acknowledgement}
The work of Yudan Wang and Shaofeng Zou is supported by the National Science Foundation under Grants CCF-2007783, CCF-2106560 and ECCS-2337375 (CAREER). Yue Wang is supported by UCF start-up funding. 

This material is based upon work supported under the AI Research Institutes program by National Science Foundation and the Institute of Education Sciences, U.S. Department of Education through Award \# 2229873 - National AI Institute for Exceptional Education. Any opinions, findings and conclusions or recommendations expressed in this material are those of the author(s) and do not necessarily reflect the views of the National Science Foundation, the Institute of Education Sciences, or the U.S. Department of Education.  



% References
\bibliography{example_paper}

\newpage

\onecolumn

 

\appendix
% \section{Experiment}
% In this section, we conduct numerical experiments to validate the convergence of our T-MLMC algorithm.


% We adapt our algorithm under the Garnet problem $\mathcal{G}(15,20)$ \citep{archibald1995generation}. There are $15$ states and $20$ actions. The transition kernel $\mathbf{P}=\{ \mathbf{P}^a_s\}$ is randomly generated by a normal distribution: $\mathbf{P}^a_s \sim \mathcal{N}(\omega^a_s,\sigma^a_s)$ and then normalized, and the reward function $r(s,a)\sim \mathcal{N}(\nu^a_s,\psi^a_s)$, where $\omega^a_s,\sigma^a_s,\nu^a_s,\psi^a_s \sim \textbf{Uniform}[0,100]$. We implement our T-MLMC algorithm under three distinct uncertainty sets. In our experiment, the uncertainty level for each model are set to be 0.4, the step size are set to be $\beta=0.01$, and $N_{\max}=32$. 

% We run algorithm under each uncertainty set for 20 times, and at each time step, we evaluate the worst-case performance of the greedy policy derived from the algorithm. We plot the average robust value function across the 20 runs, along with the 5th and 95th percentiles of the 20 runs as an envelope of variability. To establish a baseline, we compute the optimal robust value functions using robust dynamic programming.

% As the results shown, our algorithm converge to the optimal robust value functions under all uncertainty sets,  indicating the algorithm's capacity to derive the optimal robust policy effectively. The experimental findings thus corroborate our theoretical assertions, affirming the convergence of our model-free T-MLMC algorithm.

% \begin{figure}[!htb]
%   \centering
%   \subfigure[]{
%   \includegraphics[width=0.32\linewidth]{TV_50.pdf}}
%   \subfigure[]{
%   \includegraphics[width=0.32\linewidth]{chi2_50.pdf}}
%   \subfigure[]{
%   \includegraphics[width=0.32\linewidth]{KL_50.pdf}}
%   \caption{(a)TV uncertainty set (b) $\chi^2$ uncertainty set (c) KL uncertainty set}
% \end{figure}



\section{Numerical Result}
In this section, we conduct numerical experiments to validate the convergence of our T-MLMC algorithm.


\subsection{Convergence and optimality of T-MLMC algorithm}
We apply our algorithm to the Garnet problem $\mathcal{G}(15,20)$ \citep{archibald1995generation}. There are $15$ states and $20$ actions. The transition kernel $\mathbf{P}=\{ \mathbf{P}^a_s\}$ is randomly generated by a normal distribution: $\mathbf{P}^a_s \sim \mathcal{N}(\omega^a_s,\sigma^a_s)$ and then normalized, and the reward function $r(s,a)\sim \mathcal{N}(\nu^a_s,\psi^a_s)$, where $\omega^a_s,\sigma^a_s,\nu^a_s,\psi^a_s \sim \textbf{Uniform}[0,100]$. We implement our T-MLMC algorithm under three distinct uncertainty sets. In our experiment, the uncertainty level for each model is set to be 0.4, the step size is set to be $\beta=0.01$, and $N_{\max}=32$. 

\vspace{-0.4cm}
\begin{figure}[!htb]
  \centering
  \subfigure[]{
  \includegraphics[width=0.32\linewidth]{num/Garnet_TV.pdf}}
  \subfigure[]{
  \includegraphics[width=0.32\linewidth]{num/Garnet_c2.pdf}}
  \subfigure[]{
  \includegraphics[width=0.32\linewidth]{num/Garnet_KL.pdf}}
  \vspace{-0.4cm}
  \caption{Garnet $\mathcal{G}(15,20)$: (a) TV, (b) $\chi^2$, (c) KL uncertainty set.}
  \label{fig:1}
\end{figure}
\vspace{-0.3cm}

We run the algorithm under each uncertainty set 20 times, and at each time step, we evaluate the worst-case performance of the greedy policy derived from the algorithm. We plot the average robust value function across the 20 runs, along with the 5th and 95th percentiles of the 20 runs as an envelope of variability. To establish a baseline, we compute the optimal robust value functions using robust dynamic programming.

\vspace{-0.4cm}
\begin{figure}[!htb]
  \centering
  \subfigure[]{
  \includegraphics[width=0.35\linewidth]{num/recycling_robot_TV.pdf}}
  \subfigure[]{
  \includegraphics[width=0.35\linewidth]{num/recycling_robot_chi_square.pdf}}
  % \subfigure[]{
  % \includegraphics[width=0.3\linewidth]{num/KL_50.pdf}}
  \vspace{-0.3cm}
  \caption{Recycling Robot: (a) TV, (b) $\chi^2$ uncertainty set.}
  \label{fig:2}
\end{figure}
\vspace{-0.3cm}

We further provide an experiment on a real-life problem: the recycling robot problem (Example 3.3, \citealp{sutton2018reinforcement,wang2023model}). A mobile robot running on a rechargeable battery aims to collect empty soda cans. It has 2 battery levels: low and high. The robot can either 1) search for empty cans; 2) remain stationary and wait for someone to bring it a can; 3) go back to its home base to recharge. Under low (high) battery level, the robot finds an empty can with probability $\alpha$ ($\beta$), and remains at the same battery level. If the robot goes out to search but finds nothing, it will run out of its battery and can only be carried back by a human. We introduce model uncertainty to the probabilities $\alpha,\beta$ of finding an empty can if the robot chooses the action `search'. We apply our algorithm to this problem.
 In our experiment, the uncertainty level for each model is set to be 0.2, the recycling system parameters are set to be $\alpha=0.5,\beta=0.5$, the learning rate is set to be $0.01$, and $N_{\max}=32$. We run the algorithm under each uncertainty set 20 times, and at each time step, we evaluate the worst-case performance of the greedy policy derived from the algorithm. We plot the average robust value function across the 20 runs, along with the 5th and 95th percentiles of the 20 runs as an envelope of variability. To establish a baseline, we compute the optimal robust value functions using robust dynamic programming.
% \vspace{-1cm}
% \vspace{-0.4cm}
\begin{figure}[!htb]
  \centering
  \subfigure[]{
  \includegraphics[width=0.3\linewidth]{num/FrozenLake_TV.pdf}}
  \subfigure[]{
  \includegraphics[width=0.3\linewidth]{num/FrozenLake_Chi_Square.pdf}}
  \subfigure[]{
  \includegraphics[width=0.3\linewidth]{num/FrozenLake_KL.pdf}}
  \vspace{-0.4cm}
  \caption{FrozenLake   (a)TV  (b) $\chi^2$  (c) KL uncertainty set}
\end{figure}
% \vspace{-0.3cm}
% \vspace{-1cm}

We further explore the theoretical FrozenLake environment. This simulation involves navigating from the starting point at [0,0] to the goal at [3,3] on a 4x4 grid of icy patches and holes. Players choose to move up, down, left, or right, but due to the ice's slipperiness, movement may not always follow the intended direction. We incorporate model uncertainty into movement probabilities to account for the ice's unpredictable nature, demanding strategic planning and robust algorithm implementation to safely reach the goal. In our experiment, the uncertainty level for each model is set to be 0.2, the learning rate is set to be $0.01$, and $N_{\max}=32$. We run the algorithm under each uncertainty set 20 times, and at each time step, we evaluate the worst-case performance of the greedy policy derived from the algorithm. We plot the average robust value function across the 20 runs, along with the 5th and 95th percentiles of the 20 runs as an envelope of variability. To establish a baseline, we compute the optimal robust value functions using robust dynamic programming.


%There are key actions in FrozenLake environment: 1). Move in Intended Direction: Players primarily move as intended but occasionally slide sideways on the ice. 
%2). Remain Stationary: Rarely used strategically, this action avoids potential slips. 
%3). Random Movement: Slipperiness may cause unintended, random moves.

\begin{figure}[!htb]
  \centering
  \subfigure[]{
  \includegraphics[width=0.3\linewidth]{num/Gambler_TV.pdf}}
  \subfigure[]{
  \includegraphics[width=0.3\linewidth]{num/Gambler_Chi_Square.pdf}}
  \subfigure[]{
  \includegraphics[width=0.3\linewidth]{num/Gambler_Kl.pdf}}
  \vspace{-0.4cm}
  \caption{Gambler (a)TV  (b) $\chi^2$  (c) KL uncertainty set}
\end{figure}

Besides, we validate our algorithm in the Gambler's Problem, featured in \citep{zhou2021finite,shi2022distributionally}. In this scenario, a gambler starts with an initial stake and bets on coin toss outcomes to reach a financial goal, such as turning $1$ into $100$. Each bet can lead to a gain or loss, dictated by probabilities $p$ and $1-p$, respectively. The gambler's challenge is to devise a strategy that maximizes the odds of reaching the target without going bankrupt, considering they can bet any amount up to the lesser of their current capital or the amount needed to reach the goal. This problem emphasizes the development of optimal betting policies and the application of value iteration techniques to achieve desired outcomes in a risk-laden environment. 
 In our experiment, the uncertainty level for each model is set to be 0.2, the parameter $p$ of the system is set to $p=0.6$,  the learning rate is set to be $0.01$, and $N_{\max}=32$. We run the algorithm under each uncertainty set 20 times, and at each time step, we evaluate the worst-case performance of the greedy policy derived from the algorithm. We plot the average robust value function across the 20 runs, along with the 5th and 95th percentiles of the 20 runs as an envelope of variability. To establish a baseline, we compute the optimal robust value functions using robust dynamic programming.



%Compared previous model-free algorithm, our algorithm can be applied without the limitation of uncertainty level.

As the results show, our algorithm converges to the optimal robust value functions under all uncertainty sets,  indicating the algorithm's capacity to derive the optimal robust policy effectively. The experimental findings thus corroborate our theoretical assertions, affirming the convergence of our model-free T-MLMC algorithm.



\subsection{Comparison with vanilla MLMC algorithm}
Furthermore, we compare our T-MLMC algorithm with the vanilla MLMC algorithm using the recycling robot problem as a test case. We run both algorithms with the same parameters, and plot the robust value functions of the learned policy versus the number of samples. Our algorithm learns the optimal policy with far fewer samples, demonstrating that our T-MLMC algorithm exhibits better sample complexity than the vanilla MLMC algorithm. 

\begin{figure}[!htb]
  \centering
  \subfigure[]{
  \includegraphics[width=0.35\linewidth]{num/mlmc_vs_T_mlmc_tv.pdf}}
  \subfigure[]{
  \includegraphics[width=0.35\linewidth]{num/mlmc_vs_T_mlmc_chi2.pdf}}
  % \subfigure[]{
  % \includegraphics[width=0.3\linewidth]{num/Gambler_Kl.pdf}}
  \vspace{-0.4cm}
  \caption{T-MLMC v.s. MLMC (a)TV  (b) $\chi^2$ uncertainty set}
\end{figure}


\section{Notations and Lemmas}\label{sec:notation}
In this section, we present the necessary notations and lemmas which are later used in the proofs. The proofs of these lemmas can be found in \Cref{sec:proof of lemma}. 

Recall that for the reward uncertainty set, we have
\begin{align}
        &g^{\rho_{TV}(\sigma)}(\mu,\alpha)=\E_\mu[(x)_\alpha]-\frac{\sigma}{2}\brac{\alpha-\min x};
        \nonumber\\& 
        g^{\rho_{\chi^2}(\sigma)}(\mu,\alpha)=\E_\mu[(x)_\alpha]-\sqrt{\sigma\text{Var}_\mu [(x)_\alpha]};
        \nonumber\\& 
        g^{\rho_{KL}(\sigma)}(\mu,\alpha)=-\alpha \log\brac{\E_{\mu} \Fbrac{\exp\brac{-\frac{x}{\alpha}} } }-\alpha \sigma .\nonumber
    \end{align}
Specifically, the definition is as follows 
\eqenv{
&g^{\rho_{TV}(\sigma)}(\mu_{s,a}, \alpha,r_{s,a})=\mathbb E_{\mu_{s,a}}\Fbrac{(r_{s,a})_\alpha}-\frac{{\sigma}}{2}\brac{\alpha-\min r_{s,a}} ;\nonumber
        \\&g^{\rho_{\chi^2}(\sigma)}(\mu_{s,a}, \alpha,r_{s,a})=\mathbb E_{\mu_{s,a}}\Fbrac{(r_{s,a})_\alpha}-\sqrt{\sigma \textbf{Var}_{\mu_{s,a}}\Fbrac{ (r_{s,a})_\alpha}  } ;
        \nonumber\\&
        g^{\rho_{KL}(\sigma)}(\mu_{s,a}, \alpha,r_{s,a})=-\alpha \log\brac{\mathbb E_{\mu_{s,a}} \Fbrac{\exp\brac{-\frac{r_{s,a}}{\alpha}} } }-\alpha \sigma .
}

For the transition  kernel uncertainty set, the detailed definition is as follows
\eqenv{
&f^{\rho_{TV}(\sigma)}(p_{s,a}, \alpha,V)=\mathbb E_{p_{s,a}}\Fbrac{(V(s'_{s,a}))_\alpha}-\frac{{\sigma}}{2}\brac{\alpha-\min_{s'_{s,a}} V(s'_{s,a})} ;\nonumber
        \\&f^{\rho_{\chi^2}(\sigma)}(p_{s,a}, \alpha,V)=\mathbb E_{p_{s,a}}\Fbrac{(V(s'_{s,a}))_\alpha}-\sqrt{\sigma \textbf{Var}_{p_{s,a}}\Fbrac{ (V(s'_{s,a}))_\alpha}  } ;
        \nonumber\\&
        f^{\rho_{KL}(\sigma)}(p_{s,a}, \alpha,V)=-\alpha \log\brac{\mathbb E_{p_{s,a}} \Fbrac{\exp\brac{-\frac{V(s'_{s,a})}{\alpha}} } }-\alpha \sigma .
}


% Here we recall the general model to present the duality of DRO problems: random variable $x\in\mathcal{S}$ following the distribution $x\sim p$, a function $v(\cdot): \mathcal{S}\to \mathbb R^+$.($x$ resp. $s$ or $r_{s,a}$, $p$ resp. $\mu$, $v(\cdot)  :=\max_a Q(\cdot,a) $ or resp. identity function). 

% % Then, let $\varbrac{v(x_k):0\leq k\leq |\mathcal{S}|-1}$ donate the values $\varbrac{v(x):x\in \mathcal{S}}$ arranged in increasing order. Recall that 
% \begin{align}
%     (v(x))_\alpha= \begin{cases} v(x) & v(x)\leq \alpha ,\\
%                      \alpha &  v(x)>\alpha.
%        \end{cases}
% \end{align}
% Then, we define $a_m=\sum_{i\leq m}p(x_i)v(x_i)$, $b_m=\sum_{i>m}p(x_i)$,$c_m=\sum_{i\leq m}p(x_i)v^2(x_i)$. 
% Thus, if $\alpha\in [v(x_n),v(x_{n+1}]$, we can get that:
% \eqenv{
% &\E_p[(v(x))_{\alpha^*}]=a_n+b_n\alpha,\\
%     &\text{Var}_p[(v(x))_{\alpha^*}]= c_n+b_n\alpha^2-\brac{a_n+b_n\alpha}^2. 
% }

% Given the statistical distance $\rho(\cdot,\cdot)$ and corresponding duality problems, we analyze the properties of Lagrange multiplier $\alpha$, which can decide the worst-case distribution.
% % For example, given uncertainty level $\sigma$, nominal distribution $p$ and function $v(x)$, i.e.$ x\sim p(\cdot)$. Define $q^*= \arg\min_q \mathbb E_q[v(x)] $, $s.t. $ $\rho_{TV}(q,p)\leq \sigma $.

% \begin{definition}[Lagrange Multiplier]\label{def:a.1}
%     Denote by $\boldsymbol{\alpha^*}^{\rho(\sigma)}$ the saddle point Lagrange multiplier, i.e.
%     \begin{align}
%         \boldsymbol{\alpha^*}^{\rho(\sigma)}(p, v(x))\in \arg\max_{\alpha} f^{\rho(\sigma)}(p, \alpha, v(x)),\nonumber
%     \end{align}
%     and $\boldsymbol{q^*}^{\rho(\sigma)}$(resp. $\boldsymbol{\nu^*}^{\rho(\sigma)}$) the worst-case distribution within uncertainty set,
%     \begin{align}
%         \boldsymbol{q^*}^{\rho(\sigma)}(p, v(x))\in \arg \max_q \mathbb E_q [v(x)], s.t. \rho(q,p )\leq \sigma.\nonumber
%     \end{align}
% \end{definition}
% \begin{remark}
%     The saddle point Lagrange multiplier $\boldsymbol{\alpha^*}^{\rho(\sigma)}$, nominal distribution $p$ and function $v(x)$ decide the worst-case distribution. 
% \end{remark}

% Then, we present the worst-case distribution corresponding to different uncertainty sets.

% % \begin{lemma}[TV-constrained uncertainty set]\label{lm:qtv}
% %     The value of the optimization problem: 
% %     \begin{align}
% %         & minimize \quad \mathbb E_q[v(x)]
%         \nonumber\\&
%         subject \quad to \quad q\in \varbrac{\rho_{TV} 
%         \brac{q,p}\leq \sigma, q\in \Delta(\mathcal{S}) },\nonumber
%         \end{align}
%     The worst-case distribution $q^*= \arg\min_{q}\mathbb E_q[v(x)]$, satisfies that:
%     \eqenv{
%     q^*(x)= 
%     \begin{cases} p(x)+\frac{\sigma}{2} & x_0 ,\\
%                      p(x) &  v(x)<\alpha^*,\\
%                       \sum_{v(x')\leq \alpha^*} p(x')-\frac{\sigma}{2} &  v(x)=\alpha^*,\\
%                      0 & v(x)>\alpha^*,
%        \end{cases}
%     }where $\alpha^*= \min_{i}\varbrac{v(x_i):\sum_{j>i} p(x_j)< \frac{\sigma}{2} }$. 
% % \end{lemma}


% \begin{lemma}[$\chi^2$-constrained uncertainty set]
%     The value of the optimization problem: 
%     \begin{align}
%         & minimize \quad \mathbb E_q[v(x)]
%         \nonumber\\&
%         subject \quad to \quad q\in \varbrac{\rho_{\chi^2} 
%         \brac{q,p}\leq \sigma, q\in \Delta(\mathcal{S}) },\nonumber
%         \end{align}
%     The worst-case distribution $q^*= \arg\min_{q}\mathbb E_q[v(x)]$, satisfies that:
%     \eqenv{
%     q^*(x)= p(x) \brac{1-\sqrt{\sigma}\frac{(v(x))_{\alpha^*}-\mathbb E \Fbrac{(v(x))_{\alpha^*}}}{\text{Var}_p[(v(x))_{\alpha^*}]}},
%     }where $\alpha^*\in  \varbrac{\alpha'=\frac{a_m}{1-b_m}+ \frac{1}{1-b_m}\sqrt{\frac{c_m(1-b_m)-a_m^2}{\sigma(1-b_m)-b_m } },\alpha'\in[v(x_m),v(x_{m+1})] }$. 
% \end{lemma}

% \begin{lemma}[KL constrained uncertainty set]
%     The value of the optimization problem: 
%     \begin{align}
%         & minimize \quad \mathbb E_q[v(x)]
%         \nonumber\\&
%         subject \quad to \quad q\in \varbrac{\rho_{KL} 
%         \brac{q,p}\leq \sigma, q\in \Delta(\mathcal{S}) },\nonumber
%         \end{align}
%     The worst-case distribution $q^*= \arg\min_{q}\mathbb E_q[v(x)]$, satisfies that:
%     \eqenv{
%     q^*(x)= \frac{p(x)\exp\brac{-\frac{v(x)}{\alpha^*} }}{\E_p \Fbrac{\exp\brac{-\frac{v(x)}{\alpha^*} } } }
%     }where $\alpha^*= \arg \max_{\alpha\geq0} \varbrac{\alpha \log\brac{\mathbb E_{p} \brac{exp\brac{\frac{v(x)}{\alpha}} } }-\alpha \sigma }$. 
% \end{lemma}
% Above all, we present the detailed  worst-case distribution for different uncertainty sets. 

We now present the analysis needed for the proofs of the propositions and theorems on the T-MLMC algorithm. To simplify the presentation, we only provide the analysis for the transition kernel uncertainty set, which extends easily to the reward uncertainty set.

Firstly, to define the surrogate $Q$-table $ \widehat Q^{*\rho(\sigma)}$, we define the expected biased estimation of dual value as follows:
\begin{definition}[Biased estimation]\label{def:3.1}
    Draw $n$ samples from the nominal distribution $s'_{s,a,i}\sim p_{s,a}, i=0,1,\dots, n-1$ and get the empirical distribution $\widehat p_{s,a,n} $. We define (resp. $\mu_{s,a},\hat \mu_{s,a,n}$)
    \begin{align}
        f^{*\rho(\sigma)}(\hat p_{s,a,n},V ):=\sup_{\alpha\geq 0} \varbrac{f^{\rho(\sigma)}(\hat p_{s,a,n}, \alpha, V) }.\nonumber
    \end{align} 
    % \begin{align}
    %     \boldsymbol{\bar f^*_n}^{\rho(\sigma)}\brac{p_{s,a},V(s'_{s,a})}:=\mathbb E_{\hat{p}_{s,a,n}}\Fbrac{f^{*\rho(\sigma)}(\hat p_{s,a,n},V(s'_{s,a}))}.\nonumber
    % \end{align}
\end{definition}
The estimation of the robust Bellman operator is biased, and the bias depends on the sample size $n$ of the empirical distribution; it is given by 
\begin{align}
    \lbrac{\E\Fbrac{f^{*\rho(\sigma)}(\hat p_{s,a,n},V)}- f^{*\rho(\sigma)}\brac{ p_{s,a}, V}}.\nonumber
\end{align}

  

We first show that when including the threshold $N_{\max}$ in our algorithm, the bias of the robust Bellman operator is equal to the bias when applying the model-based algorithm with sample size $2^{N_{\max}+1}$. 
Here, we describe the condition by the following proposition. 
\begin{proposition}[Threshold MLMC]\label{prop:mlmc}
We recall that $\widehat{\mathcal{T}}^{\rho(\sigma)}_{N_{\max}}(Q)(s,a)= \widehat r^{\rho(\sigma)}(s,a)+\gamma \widehat{v}^{\rho(\sigma)}(Q(s,a))$. The robust Bellman estimator $\widehat{v}^{\rho(\sigma)}(Q(s,a))$ (resp. $\widehat r^{\rho(\sigma)}(s,a) $) satisfies that 
\begin{align}
\E\Fbrac{\widehat{r}^{\rho(\sigma)}(s,a)}&=\mathbb E \Fbrac{r_{s,a,0}+\frac{\delta^{\rho(\sigma),r}_{s,a,N_1} }{P_{N_1}} \nonumber}=\E\Fbrac{g^{*\rho(\sigma)}(\hat \mu_{s,a,2^{N_{\max}+1}},r_{s,a})},\nonumber\\
    \E\Fbrac{\widehat{v}^{\rho(\sigma)}(Q(s,a)) }&=\mathbb E \Fbrac{V(s'_{s,a,0})+\frac{\delta^{\rho(\sigma)}_{s,a,N_2}(Q) }{P_{N_2}} \nonumber}=\E\Fbrac{f^{*\rho(\sigma)}(\hat p_{s,a,2^{N_{\max}+1}},V)}.
    % \boldsymbol{\bar f^*}_{2^{N_{\max}+1}}^{\rho(\sigma)}\brac{p_{s,a},V(s'_{s,a})  }.
\end{align}
Thus, the estimated robust Bellman operator $\widehat{\mathcal{T}}^{\rho(\sigma)}_{N_{\max}}$ satisfies 
\eqenv{
\E\Fbrac{\widehat{\mathcal{T}}^{\rho(\sigma)}_{N_{\max}}(Q)(s,a) }&= \E\Fbrac{ \widehat{r}^{\rho(\sigma)}(s,a)+\gamma 
\widehat{v}^{\rho(\sigma)}(Q(s,a))  }\\&=\E\Fbrac{g^{*\rho(\sigma)}(\hat \mu_{s,a,2^{N_{\max}+1}},r_{s,a})+ \gamma  f^{*\rho(\sigma)}(\hat p_{s,a,2^{N_{\max}+1}},V)  }.
}
\end{proposition}
% \begin{definition}
%     We define that
%     \eqenv{
%     \boldsymbol{}
%     }
% \end{definition}

\Cref{prop:mlmc} shows that the estimation bias is the same whether we draw $2^{N_{\max}+1}$ samples to estimate the dual value directly or use the MLMC estimator with threshold $N_{\max}$ to estimate the dual value. 

% When sampling $n$
Based on \cref{prop:mlmc}, for $\rho$ distance and uncertainty level $\sigma$, we define 
\begin{align}
    \E\Fbrac{ {\widehat {\mathcal{T}}}_{N_{\max}}^{\rho(\sigma)} (Q)(s,a) }&=\boldsymbol{\bar {\mathcal{T}}}_{N_{\max}}^{\rho(\sigma)} (Q)(s,a),
     \label{eq:fixp}
\end{align} where $\boldsymbol{\bar {\mathcal{T}}}_{N_{\max}}^{\rho(\sigma)} $ is the surrogate robust operator being the expectation of our T-MLMC estimator. 
\begin{proposition}\label{prop:contract}
    Given statistical distance $\rho$ and uncertainty level $\sigma$, the estimated robust Bellman operator $ \widehat{\mathcal{T}}^{\rho(\sigma)}$ is a $\gamma$-\textit{contraction} w.r.t.\ the infinity norm:
    \eqenv{
   {\mynorm{\hatT( Q)-\hatT( Q') }_\infty} \leq \gamma \mynorm{ Q- Q' }_\infty.
    }
\end{proposition}


We denote its unique fixed point as $\widehat Q^{*\rho(\sigma)}$, i.e.
\eqenv{
 \hatT\brac{\widehat Q^{*\rho(\sigma)}}=\widehat Q^{*\rho(\sigma)}.
}

 It hence holds that
\begin{align}\label{eqa20}
    {\mynorm{\widehat Q_T^{\rho(\sigma)}-Q^{*\rho(\sigma)}}_\infty^2}
  \leq 2 {\mynorm{\widehat Q_T^{\rho(\sigma)}-\widehat Q^{*\rho(\sigma)}}_\infty^2 }+ 2{\mynorm{\widehat Q^{*\rho(\sigma)}-Q^{*\rho(\sigma)}}_\infty^2 }.
\end{align}


\begin{lemma}\label{lm:a8}
    The gap between the estimated optimal robust $Q$-function and the optimal robust $Q$-function can be bounded as follows:
    \eqenv{
    \norminf{\widehat Q^{*\rho(\sigma)}- Q^{*\rho(\sigma)}}&\leq \frac{1}{1-\gamma} \norminf{\hatT\brac{ Q^{*\rho(\sigma)}}- {\mathcal{T}}^{\rho(\sigma)}\brac{Q^{*\rho(\sigma)}}}.
}
\end{lemma}

Combining \cref{prop:mlmc,lm:a8}, this term can be bounded specifically for different uncertainty sets (TV, $\chi^2$ and KL) in the following sections. 

We then aim to bound the first term in \cref{eqa20}, ${\mynorm{\widehat Q_T^{\rho(\sigma)}-\widehat Q^{*\rho(\sigma)}}_\infty^2 } $. This error between the iterate $\widehat Q_T^{\rho(\sigma)}$ of the algorithm and the surrogate $Q$-table $\widehat Q^{*\rho(\sigma)}$ can be bounded using the following lemma.
\begin{lemma}[\citep{chen2022finite} Theorem 2.1 \& Corollary 2.1.2]\label{lm:chen}
    For the following stochastic iteration,
    \eqenv{
    \theta_{k+1}= \theta_k+\beta_k \brac{\mathcal{H}(\theta_k)-\theta_k+ w_k},
    } where $\theta\in \mathbb R^d$, $\beta_k$ is the stepsize. The fixed point $\theta^*$ satisfies that
    $\theta^*=\mathcal{H}(\theta^*)$. 
    Define $\mathcal{F}_k=\varbrac{\theta_0, w_0,...,\theta_{k-1},w_{k-1},\theta_{k} }$. 
    When 
    \eqenv{
    \norminf{\mathcal{H}(\theta)-\mathcal{H}(\theta')}\leq \gamma \norminf{\theta-\theta'},
    }
    and
    \eqenv{
    (a). \E\Fbrac{w_k|\mathcal{F}_k }=0; \qquad (b). \E\Fbrac{\norminf{w_k}^2|\mathcal{F}_k }\leq A+ B \norminf{\theta_k}^2,
    }
and $ \beta_k\leq \frac{c_2}{c_3}$ for all $k$, we have 
\eqenv{
\E&\Fbrac{\norminf{\theta_k-\theta^*}^2}
\\&\leq c_1 \norminf{\theta_0-\theta^*}^2 \prod_{j=0}^{k-1} (1-c_2 \beta_j) +c_4\brac{A+2B\norminf{\theta^*}^2} \sum_{i=0}^{k-1} \beta_i^2 \prod_{j=i+1}^{k-1} (1-c_2 \beta_j),
}where $c_1=\frac{3}{2}$, $c_2= \frac{1-\gamma}{2}, c_3 =\frac{32 e (B+2)\log (d)}{1-\gamma}, c_4=\frac{16e\log(d)}{1-\gamma}.  $
\end{lemma}
We consider the stochastic iteration that
\eqenv{
\widehat Q^{\rho(\sigma)}_{t+1}=\widehat Q^{\rho(\sigma)}_{t}+ \beta_t\brac{\boldsymbol{\bar{\mathcal{T}}}^{\rho(\sigma)}_{N_{\max}}\brac{Q^{\rho(\sigma)}_{t}}- \widehat Q^{\rho(\sigma)}_{t}+ W_t  },
}
where we define the filtration $\mathcal{F}_t=\varbrac{Q^{\rho(\sigma)}_0, W_0,...,Q^{\rho(\sigma)}_{t-1},W_{t-1},Q^{\rho(\sigma)}_{t} } $.
There are three requirements when applying \cref{lm:chen}:

\textbf{   1). $\boldsymbol{\bar{\mathcal{T}}}^{\rho(\sigma)}_{N_{\max}}=\E\Fbrac{\widehat{\mathcal{T}}_{N_{\max}}^{\rho(\sigma)}(Q)}$ is a $\gamma$-contraction operator (\Cref{prop:contract}).}


\textbf{ 2). Unbiased estimation: }
% and 2) the estimation of robust Bellman operator can be bounded by \cref{prop:mlmc}.
\eqenv{
\E\Fbrac{{\widehat{\mathcal{T}}}^{\rho(\sigma)}_{N_{\max}}\brac{Q^{\rho(\sigma)}_{t}}-\boldsymbol{\bar{\mathcal{T}}}^{\rho(\sigma)}_{N_{\max}}\brac{Q^{\rho(\sigma)}_{t}} \bigg|\mathcal{F}_t }= 0.
}

\textbf{ 3). Bounded infinite norm expectation:  }
Here, a bound on the expected squared infinity norm of the T-MLMC estimator, $\E\Fbrac{\norminf{\widehat {\mathcal{T}}_{N_{\max}}^{\rho(\sigma)}(Q)}^2 }$, is required to ensure the convergence of the algorithm. Taking the expectation over $N\sim \text{Geo}(\psi)$, this quantity can be bounded by
\eqenv{
\E\Fbrac{\norminf{\widehat {\mathcal{T}}_{N_{\max}}^{\rho(\sigma)}(Q)}^2 }
& \myineq{\leq}{a} 4r^2_{\max}+4\gamma^2\frac{r^2_{\max}}{(1-\gamma)^2}+4
\E\Fbrac{\sum_{N_1=0}^{N_{\max}}\sup_{s,a} \frac{\brac{\delta^{r,\rho(\sigma)}_{s,a,N_1}
  }^2}{P_{N_1}}+\gamma^2\sum_{N_2=0}^{N_{\max}}\sup_{s,a}\frac{\brac{
\delta^{\rho(\sigma)}_{s,a,N_2}(Q)  }^2}{P_{N_2}}} 
\\&= 4r^2_{\max}+4\gamma^2\frac{r^2_{\max}}{(1-\gamma)^2}+4\E\Fbrac{\sum_{N_1=0}^{N_{\max}} \sup_{s,a}\frac{\brac{\delta^{r,\rho(\sigma)}_{s,a,N_1}
  }^2}{\psi(1-\psi)^{N_1}}+\gamma^2\sum_{N_2=0}^{N_{\max}}\sup_{s,a}\frac{\brac{
\delta^{\rho(\sigma)}_{s,a,N_2}(Q)  }^2}{\psi(1-\psi)^{N_2}} }, 
} where $(a)$ follows from the fact that $\sup_{s,a} r_{s,a}\leq r_{\max}$,  $\sup_{s,a} Q(s,a)\leq \frac{r_{\max}}{1-\gamma}$, and the two estimators are independent. 

Then, we decompose the term $\sup_{s,a}\lbrac{
\delta^{\rho(\sigma)}_{s,a,N_2}(Q)  }^2$ as follows:
\eqenv{\label{eqa24}
\sup_{s,a}\lbrac{\delta^{\rho(\sigma)}_{s,a,N}(Q)(s,a) }^2
     &\leq 3\sup_{s,a}\lbrac{\sup_{\alpha\geq 0}\varbrac{f^{\rho(\sigma)}(\widehat p_{s,a,2^{N+1}},\alpha,V)}- {\sup_{\alpha\geq 0}\varbrac{f^{\rho(\sigma)}( p_{s,a},\alpha,V)} }  }^2
    \\&\quad+  \frac{3}{4}\sup_{s,a}\lbrac{ \sup_{\alpha\geq 0}\varbrac{f^{\rho(\sigma)}(\widehat p^E_{s,a,2^{N}},\alpha,V)}-{\sup_{\alpha\geq 0}\varbrac{f^{\rho(\sigma)}( p_{s,a},\alpha,V)} } }^2
    \\&\quad+\frac{3}{4}\sup_{s,a}\lbrac{\sup_{\alpha\geq 0}\varbrac{f^{\rho(\sigma)}(\widehat p^O_{s,a,2^{N}},\alpha,V )}- {\sup_{\alpha\geq 0}\varbrac{f^{\rho(\sigma)}(  p_{s,a},\alpha,V)} }}^2.
}

Then the terms in \cref{eqa24} can be bounded specifically for different uncertainty sets (TV, $\chi^2$, and KL) and we can obtain the sample complexity. 

% Then the terms in \cref{eqa24} can be bounded specifically for different uncertainty sets (TV, $\chi^2$, and KL) in the following sections. Here, we present the result to show the trade-off of $\psi$:
% \eqenv{
% \E\Fbrac{\lbrac{\delta^{\rho(\sigma)}_{s,a,N}(Q)(s,a) }^2 }\leq \widetilde{\mathcal{O }}\brac{2^{-N} }.
% } Therefore, the variance with the order
% \eqenv{
% \widetilde{\mathcal{O }}\brac{\sum_{N=0}^{N_{\max}} \frac{2^{-N}}{(1-\psi)^N} }.
% } To get the bounded variance, the parameter $\psi$ should satisfy $\psi\leq\frac{1}{2}$.
% % by the similar way in model-based approach \citep{shi2023curious,yang2022toward}. 

% Next, we compute the total sample complexity of our algorithm. 
% Given the iteration $t$ and state-action pair $(s,a)$, we sample 
% $\E[\mathcal{N}_1]= 1+\sum_{N_1=0}^{N_{\max}} (2^{N_1+1} \psi(1-\psi)^{N_1} )= N_{\max}+2 $ and 
% $\E[\mathcal{N}_2]= 1+\sum_{N_2=0}^{N_{\max}} (2^{N_2+1} \psi(1-\psi)^{N_2} )= N_{\max}+2 $ samples to estimate the dual value. To achieve a $\widetilde{\mathcal{O }}(N_{\max})$ order sample size, the parameter $\psi\geq \frac{1}{2}$. 

% Hence, we set $\psi=\frac{1}{2}$ in algorithm. The total sample complexity is $|\mathcal{S}||\mathcal{A}|(N_{\max}+1)T $. 

% 
\section{Total Variation Uncertainty Set}
In this part, we present the proof of propositions and theorems specifically for the TV-constrained uncertainty set. 
\begin{theorem}[Restatement of \Cref{thm:tv1} specifically for TV distance]
    Consider the case of TV constraint uncertainty set with uncertainty level $\sigma$ i.e. $ \mathcal{P}^{TV}(\sigma)$ and $ \mathcal{R}^{TV}(\sigma)$, set $\psi=\frac{1}{2}$, for any $Q\in \mathbb R^{\mathcal{S}\times\mathcal{A}}, s\in \mathcal{S}, a\in \mathcal{A} $, the estimation bias can be bounded as:
    \begin{align}
        \sup_{s,a}\lbrac{\E\Fbrac{\widehat {\mathcal{T}}_{N_{\max}}^{\rho_{TV}(\sigma)} (Q)(s,a) } - {\mathcal{T}}^{\rho_{TV}(\sigma)} (Q)(s,a)}\leq  {\widetilde{\mathcal{O }}}\brac{N_{\max}2^{-\frac{N_{\max}}{2}} },\nonumber
        % \brac{r_{\max}+\frac{r_{\max}}{1-\gamma}}
    \end{align}
    The variance can be bounded as:
    \begin{align}
        \text{Var}\brac{\widehat {\mathcal{T}}^{\rho_{TV}(\sigma)}_{N_{\max}} (Q)(s,a)  }\leq \widetilde{\mathcal{O }}\brac{{N_{\max}}}.
    \end{align}
\end{theorem}
\begin{proof}
Firstly, we make error decomposition as follows:
    \eqenv{
    &\sup_{s,a}\lbrac{ \E\Fbrac{\widehat {\mathcal{T}}_{N_{\max}}^{\rho_{TV}(\sigma)} (Q)(s,a) } - {\mathcal{T}}^{\rho_{TV}(\sigma)} (Q)(s,a)}
   \\ &\myineq{=}{i} \sup_{s,a}\bigg|\E\Fbrac{g^{*\rho_{TV}(\sigma)}(\hat \mu_{s,a,2^{N_{\max}+1}},r_{s,a})}+\gamma\E\Fbrac{f^{*\rho_{TV}(\sigma)}(\hat p_{s,a,2^{N_{\max}+1}},V)}
   \\& \qquad- {g^{*\rho_{TV}(\sigma)}(\mu_{s,a},r_{s,a})}
- \gamma  {f^{*\rho_{TV}(\sigma)}(p_{s,a},V )} \bigg|
\\& \leq \sup_{s,a}\lbrac{\E\Fbrac{g^{*\rho_{TV}(\sigma)}(\hat \mu_{s,a,2^{N_{\max}+1}},r_{s,a})}- {g^{*\rho_{TV}(\sigma)}(\mu_{s,a},r_{s,a})} }\\&\qquad+\gamma \sup_{s,a}\lbrac{\E\Fbrac{f^{*\rho_{TV}(\sigma)}(\hat p_{s,a,2^{N_{\max}+1}},V )}-{f^{*\rho_{TV}(\sigma)}(p_{s,a},V )} }
\\& \leq \E\Fbrac{\sup_{s,a}\lbrac{ {g^{*\rho_{TV}(\sigma)}(\hat \mu_{s,a,2^{N_{\max}+1}},r_{s,a})}- {g^{*\rho_{TV}(\sigma)}(\mu_{s,a},r_{s,a})} } }\\&\qquad+\gamma \E\Fbrac{ \sup_{s,a}\lbrac{{f^{*\rho_{TV}(\sigma)}(\hat p_{s,a,2^{N_{\max}+1}},V )}-{f^{*\rho_{TV}(\sigma)}(p_{s,a},V )} }}
\label{eq:eq44}
    } where $(i)$ follows from \cref{prop:mlmc}. 

    For convenience, we only bound the second term in \cref{eq:eq44}. The first term can be bounded similarly.  By \cref{lm:tv}, 
    \eqenv{\label{eqeq42}
    &\lbrac{ {f^{*\rho_{TV}(\sigma)}(\hat p_{s,a,2^{N_{\max}+1}},V )}-{f^{*\rho_{TV}(\sigma)}(p_{s,a},V )} } 
    \\&= \Bigg|\max_{\alpha\geq 0}\varbrac{ \mathbb E_{p_{s,a}}\Fbrac{(V(s'_{s,a}))_\alpha}-\frac{{\sigma}}{2}\brac{\alpha-\min_{s'_{s,a}} V(s'_{s,a})} }\\&\qquad\qquad- {\max_{\alpha\geq 0}\varbrac{ \mathbb E_{\hat p_{s,a,2^{N_{\max}+1}}}\Fbrac{(V(s'_{s,a}))_\alpha}-\frac{{\sigma}}{2}\brac{\alpha-\min_{s'_{s,a}} V(s'_{s,a})} }}\Bigg|
    \\& \leq  {\max_{0\leq \alpha\leq\max_{s'_{s,a}} V(s'_{s,a}) }\lbrac{\mathbb E_{p_{s,a}}\Fbrac{(V(s'_{s,a}))_\alpha}-\mathbb E_{\hat p_{s,a,2^{N_{\max}+1}}}\Fbrac{(V(s'_{s,a}))_\alpha}}}.
    % \\& \leq  \E\Fbrac{\frac{r_{\max}}{1-\gamma}|\mathcal{S}| \max_{s'_{s,a}} \varbrac{\lbrac{p_{s,a}(s'_{s,a})-\hat p_{s,a,2^{N_{\max}+1}}(s'_{s,a}) } } }.
    }
    
% By Hoeffding inequality, with probability $1-2^{-{N_{\max}-1}} $, we have
% \eqenv{
% \max_{s'_{s,a}} \varbrac{\lbrac{p_{s,a}(s'_{s,a})-\hat p_{s,a,2^{N_{\max}+1}}(s'_{s,a}) } }\leq \sqrt{\frac{N_{\max}\log (2 |\mathcal{S}|)}{2^{N_{\max}+1}}  },
% }then, under this case, we can get that
%     \eqenv{
%     \E\Fbrac{\max_{\max_{s'_{s,a}} V(s'_{s,a}) \geq \alpha\geq 0}\lbrac{\mathbb E_{p_{s,a}}\Fbrac{(V(s'_{s,a}))_\alpha}-E_{\hat p_{s,a,2^{N_{\max}+1}}}\Fbrac{(V(s'_{s,a}))_\alpha}}}\leq \sqrt{\frac{r^2_{\max}|\mathcal{S}|^2 N_{\max}\log (2 |\mathcal{S}|)}{2^{N_{\max}+1}(1-\gamma)^2}  }
%     }

Similar to Lemma 9 of \citet{shi2023curious}, 
we have the following lemma.

\begin{lemma}\label{lm:tvlm}
    Consider the case of TV constraint uncertainty set $\mathcal{P}^{\text{TV}}(\sigma)$ with uncertainty level $\sigma$ , for any $\delta\in(0,1)$, one has with probability at least $1-\delta$, 
    \eqenv{
   \max_{0\leq \alpha\leq \max_{s'_{s,a}} V(s'_{s,a}) }& \lbrac{\mathbb E_{p_{s,a}}\Fbrac{(V(s'_{s,a}))_\alpha}-\E_{\hat p_{s,a,N}}\Fbrac{(V(s'_{s,a}))_\alpha}}
   \leq 3r_{\max}\sqrt{\frac{\log \brac{\frac{18  N}{\delta}}}{(1-\gamma)^2N}}.
    }
\end{lemma}
According to \Cref{lm:tvlm}, it can be shown that with probability at least $1-\frac{2^{-{N_{\max}-1}}}{|\mathcal{S}||\mathcal{A}|} $, we have 
    \eqenv{
   \max_{0\leq \alpha\leq \max_{s'_{s,a}} V(s'_{s,a}) }& \lbrac{\mathbb E_{p_{s,a}}\Fbrac{(V(s'_{s,a}))_\alpha}-\mathbb E_{\hat p_{s,a,2^{N_{\max}+1}}}\Fbrac{(V(s'_{s,a}))_\alpha}}
    \\&\leq 3 \sqrt{\frac{r_{\max}^2 \brac{\log \brac{18|\mathcal{S}||\mathcal{A}|} + 2(N_{\max}+1)\log 2} }{(1-\gamma)^2 2^{{N_{\max}+1}}}}.
    % \\& \leq 3 \sqrt{\frac{r_{\max}^2 \brac{2\log \brac{18|\mathcal{S}||\mathcal{A}|} + 2(N_{\max}+1)\log 2} }{(1-\gamma)^2 2^{{N_{\max}+1}}}}
    }
    
Then, by Bernoulli's inequality, we have that 
\eqenv{
\brac{1-\frac{2^{-N_{\max}-1}}{\ssa} }^\ssa\geq 1- 2^{-N_{\max}-1}. 
}

Therefore, with probability at least $1-2^{-N_{\max}-1} $, we have that 
\eqenv{\label{eqeq44}
\sup_{s,a}&\varbrac{\max_{0\leq \alpha\leq \max_{s'_{s,a}} V(s'_{s,a}) } \lbrac{\mathbb E_{p_{s,a}}\Fbrac{(V(s'_{s,a}))_\alpha}-E_{\hat p_{s,a,2^{N_{\max}+1}}}\Fbrac{(V(s'_{s,a}))_\alpha}} }
\leq \frac{ r_{\max} C_{TV}}{(1-\gamma)2^{\frac{N_{\max}+1}{2}}},
}
where we set $C_{TV}= 3\sqrt{{2(N_{\max}+1)\log2+} \log(18 |\mathcal{S}||\mathcal{A}|)} $. 
    
On the complementary event, which has probability at most $2^{-{N_{\max}-1}}$, we directly have that 
    \eqenv{\label{eqeq45}
    \sup_{s,a}\varbrac{\max_{0\leq \alpha\leq \max_{s'_{s,a}} V(s'_{s,a}) }\lbrac{\mathbb E_{p_{s,a}}\Fbrac{(V(s'_{s,a}))_\alpha}-E_{\hat p_{s,a,2^{N_{\max}+1}}}\Fbrac{(V(s'_{s,a}))_\alpha}}}\leq \sup_{s,a}{\max_{s'_{s,a}} V(s'_{s,a})}\leq \frac{r_{\max}}{1-\gamma}. 
    }
    
    Hence, combining both cases together with \cref{eqeq42}, we further have that
    \eqenv{
    \E&\Fbrac{\sup_{s,a}\lbrac{{f^{*\rho_{TV}(\sigma)}(\hat p_{s,a,2^{N_{\max}+1}},V )}-{f^{*\rho_{TV}(\sigma)}(p_{s,a},V )} }}
    \\ &\myineq{\leq}{i}  \frac{ r_{\max}C_{TV}}{(1-\gamma)}2^{-\frac{N_{\max}+1}{2}} + \frac{r_{\max}}{1-\gamma}2^{-\brac{N_{\max}+1}} 
    \\& {\leq} \frac{r_{\max}}{1-\gamma}2^{-\frac{N_{\max}+1}{2}}\brac{2^{-\frac{N_{\max}+1}{2}}+ C_{TV} }, 
    }where $(i)$ follows from that $1-2^{-\brac{N_{\max}+1}}\leq 1$. 

    Similarly, we can get the bound
    \eqenv{
   \E&\Fbrac{\sup_{s,a} \lbrac{{g^{*\rho_{TV}(\sigma)}(\hat \mu_{s,a,2^{N_{\max}+1}},r_{s,a})}- {g^{*\rho_{TV}(\sigma)}(\mu_{s,a},r_{s,a})} }}
    \\&\qquad\qquad\qquad\leq r_{\max}2^{-\frac{N_{\max}+1}{2}}\brac{2^{-\frac{N_{\max}+1}{2}}+ C_{TV} }.
    }

    Thus, we can get that
    \eqenv{
    &\sup_{s,a}\lbrac{ \E\Fbrac{\widehat {\mathcal{T}}_{N_{\max}}^{\rho_{TV}(\sigma)} (Q)(s,a) } - {\mathcal{T}}^{\rho_{TV}(\sigma)} (Q)(s,a)}
% \\& \leq \lbrac{\E\Fbrac{f^{*\rho_{TV}(\sigma)}(\hat \mu_{s,a,2^{N_{\max}+1}},r_{s,a})}- {f^{*\rho_{TV}(\sigma)}(\mu_{s,a},r_{s,a})} }\\&\qquad+\gamma\lbrac{\E\Fbrac{f^{*\rho_{TV}(\sigma)}(\hat p_{s,a,2^{N_{\max}+1}},V(s'_{s,a}))}-{f^{*\rho_{TV}(\sigma)}(\mu_{s,a},r_{s,a})} }
\\& \leq \E\Fbrac{\sup_{s,a}\lbrac{{g^{*\rho_{TV}(\sigma)}(\hat \mu_{s,a,2^{N_{\max}+1}},r_{s,a})}- {g^{*\rho_{TV}(\sigma)}(\mu_{s,a},r_{s,a})} }}
\\&\qquad+\gamma\E\Fbrac{\sup_{s,a}\lbrac{{f^{*\rho_{TV}(\sigma)}(\hat p_{s,a,2^{N_{\max}+1}},V )}-{f^{*\rho_{TV}(\sigma)}(p_{s,a},V )} }}
\\& \leq \brac{\frac{\gamma r_{\max}}{1-\gamma}+r_{\max} }2^{-\frac{N_{\max}+1}{2}}\brac{2^{-\frac{N_{\max}+1}{2}}+ C_{TV} }.
    }

We then consider the variance of the robust Bellman operator. Firstly, we make an error decomposition of the robust Bellman operator variance as 
\eqenv{
\text{Var}\brac{\widehat {\mathcal{T}}_{N_{\max}}^{\rho_{TV}(\sigma)} (Q)(s,a) }
&= \text{Var}\brac{\widehat  r^{\rho_{TV}(\sigma)}+ \gamma \widehat{v}^{\rho_{TV}(\sigma)}(Q)(s,a) }
\\&= \text{Var}\brac{\widehat  r^{\rho_{TV}(\sigma)}}
+\gamma^2 \Varr{\widehat{v}^{\rho_{TV}(\sigma)}(Q)(s,a)},
}
which holds because the two estimators are independent.

For convenience, we analyze the second term in the above equation. The first term can be bounded similarly. 
\eqenv{
\Varr{\widehat{v}^{\rho_{TV}(\sigma)}(Q)(s,a)}&= 
\E\Fbrac{\brac{\widehat{v}^{\rho_{TV}(\sigma)}(Q)(s,a)}^2  }-\brac{\E\Fbrac{\widehat{v}^{\rho_{TV}(\sigma)}(Q)(s,a) }}^2
\\&\leq \E\Fbrac{\brac{\widehat{v}^{\rho_{TV}(\sigma)}(Q)(s,a)}^2  }.
}
Next, according to the \cref{eq:delta,eq:hatq}, the term above can be explicitly computed: 
\eqenv{\label{eqeq1491}
\E\Fbrac{\brac{\widehat{v}^{\rho_{TV}(\sigma)}(Q)(s,a)}^2  }& = \E\Fbrac{\brac{V(s'_{s,a,0})+ \frac{\delta^{\rho_{TV}(\sigma)}_{s,a,N_2}(Q)(s,a) }{P_{N_2}} }^2}
\\& \leq 2 \E\Fbrac{V(s'_{s,a,0})^2 }+ 2\E \Fbrac{ \brac{\frac{\delta^{\rho_{TV}(\sigma)}_{s,a,N_2}(Q)(s,a) }{P_{N_2}} }^2 }
\\& \leq \frac{2r^2_{\max}}{(1-\gamma)^2}+ 2\sum_{N=0}^{N_{\max}} \E\Fbrac{\brac{\frac{\delta^{\rho_{TV}(\sigma)}_{s,a,N_2}(Q)(s,a) }{P_{N_2}} }^2\,\Big|\,N_2=N} P_N
\\& \leq \frac{2r^2_{\max}}{(1-\gamma)^2}+ 2\sum_{N=0}^{N_{\max}} \frac{\E\Fbrac{(\delta^{\rho_{TV}(\sigma)}_{s,a,N}(Q)(s,a))^2} }{P_{N}}.
}
Next, we bound the term $\lbrac{\delta^{\rho_{TV}(\sigma)}_{s,a,N}(Q)(s,a) }^2 $, where
\eqenv{
\lbrac{\delta^{\rho_{TV}(\sigma)}_{s,a,N}(Q)(s,a) }&=
\Bigg|\sup_{\alpha\geq 0}\varbrac{f^{\rho_{TV}(\sigma)}(\widehat p_{s,a,2^{N+1}},\alpha,V)} 
    \\&\quad  -\frac{1}{2} \sup_{\alpha\geq 0}\varbrac{f^{\rho_{TV}(\sigma)}(\widehat p^E_{s,a,2^{N}},\alpha,V)}
    -\frac{1}{2}\sup_{\alpha\geq 0}\varbrac{f^{\rho_{TV}(\sigma)}(\widehat p^O_{s,a,2^{N}},\alpha,V )}\Bigg|. 
}

We make an error decomposition as follows:
\eqenv{
\lbrac{\delta^{\rho_{TV}(\sigma)}_{s,a,N}(Q)(s,a) }^2&=
\Bigg|\sup_{\alpha\geq 0}\varbrac{f^{\rho_{TV}(\sigma)}(\widehat p_{s,a,2^{N+1}},\alpha,V)} 
        \\& \qquad-\frac{1}{2} \sup_{\alpha\geq 0}\varbrac{f^{\rho_{TV}(\sigma)}(\widehat p^E_{s,a,2^{N}},\alpha,V)}
    -\frac{1}{2}\sup_{\alpha\geq 0}\varbrac{f^{\rho_{TV}(\sigma)}(\widehat p^O_{s,a,2^{N}},\alpha,V )}\Bigg|^2
    \\& \leq  3\lbrac{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{TV}(\sigma)}(\widehat p_{s,a,2^{N+1}},\alpha,V)}-{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{TV}(\sigma)}(p_{s,a},\alpha,V)} }  }^2
    \\&\quad+  \frac{3}{4}\lbrac{ \sup_{\alpha\geq 0}\varbrac{f^{\rho_{TV}(\sigma)}( \widehat p^E_{s,a,2^N},\alpha,V)}-{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{TV}(\sigma)}(p_{s,a},\alpha,V)} } }^2
    \\&\quad+\frac{3}{4}\lbrac{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{TV}(\sigma)}(\widehat p^O_{s,a,2^{N}},\alpha,V )}-{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{TV}(\sigma)}( p_{s,a},\alpha,V)} }}^2
    .\label{eqeq1521}
    % \\& \myineq{}{}
}

Then, combined with the analysis in \cref{eqeq44,eqeq45} and the fact $\mathbb P(A\cap B\cap C)\geq 1- \mathbb P(\neg A)-\mathbb P(\neg B)-\mathbb P(\neg C)$, we can conclude that
 with probability at least $1-3*2^{-N} $
\eqenv{
\lbrac{\delta^{\rho_{TV}(\sigma)}_{s,a,N}(Q)(s,a) }^2
&\leq 3\brac{C_{TV}\frac{r_{\max}}{1-\gamma} 2^{-\frac{N+1}{2}}}^2 + \frac{3}{4}\brac{C_{TV}\frac{r_{\max}}{1-\gamma} 2^{-\frac{N}{2}} }^2+ \frac{3}{4}\brac{C_{TV}\frac{r_{\max}}{1-\gamma} 2^{-\frac{N}{2}} }^2
\\& \leq  3\frac{C^2_{TV} r^2_{\max} }{(1-\gamma)^2}2^{-(N+1)}, 
}
 Since $ 0 \leq \sup_{\alpha\geq 0}\varbrac{f^{\rho_{TV}(\sigma)}(q,\alpha,V)}\leq \frac{r_{\max}}{1-\gamma}$ for any distribution $q$, with probability at most $3*2^{-N} $ 
we have that
\eqenv{
 \lbrac{\delta^{\rho_{TV}(\sigma)}_{s,a,N}(Q)(s,a) }^2
 \leq \brac{ \frac{r_{\max}}{1-\gamma}}^2.
}

Above all, we can get that 
\eqenv{
\E\Fbrac{\lbrac{\delta^{\rho_{TV}(\sigma)}_{s,a,N}(Q)(s,a) }^2 }&\leq 3\frac{C^2_{TV} r^2_{\max} }{(1-\gamma)^2}2^{-(N+1)}+ \brac{ \frac{r_{\max}}{1-\gamma}}^23*2^{-N}
\\&\leq \brac{3 C^2_{TV}+ 6}\brac{ \frac{r_{\max}}{1-\gamma}}^2 2^{-N-1}. \label{eqeq1551}
}
Then, plugging \cref{eqeq1551} into \cref{eqeq1491}, we obtain the following bound on the variance of the robust Bellman operator:
\eqenv{
\Varr{\widehat{v}^{\rho_{TV}(\sigma)}(Q)(s,a)}&\leq \E\Fbrac{\brac{\widehat{v}^{\rho_{TV}(\sigma)}(Q)(s,a)}^2  }
\\&\leq \frac{2r^2_{\max}}{(1-\gamma)^2}+ 2\sum_{N=0}^{N_{\max}} \frac{\E\Fbrac{(\delta^{\rho_{TV}(\sigma)}_{s,a,N}(Q)(s,a))^2} }{P_{N}}
\\&\leq \frac{2r^2_{\max}}{(1-\gamma)^2}+ \frac{2r^2_{\max}}{(1-\gamma)^2}\brac{3 C^2_{TV}+ 6}\sum_{N=0}^{N_{\max}} \frac{2^{-N-1} }{P_{N}}
\\& \myineq{\leq}{i} \frac{2r^2_{\max}}{(1-\gamma)^2}\brac{1+\brac{3 C^2_{TV}+ 6}(N_{\max}+1) },
} where $(i)$ follows from $P_N=\psi(1-\psi)^{N}=(1/2)^{N+1}. $

Similarly, we can get the bound of the variance $\text{Var}\brac{\widehat  r^{\rho_{TV}(\sigma)}}$ as follows:
\eqenv{
\text{Var}\brac{\widehat  r^{\rho_{TV}(\sigma)}}\leq \E\Fbrac{\brac{\widehat  r^{\rho_{TV}(\sigma)}}^2 }\leq
 2r^2_{\max}(1+\brac{3 C^2_{TV}+ 6}(N_{\max}+1) ).
}

Hence, we can get the robust Bellman operator variance bound:
\eqenv{\label{eqeqeq62}
\text{Var}\brac{\widehat {\mathcal{T}}_{N_{\max}}^{\rho_{TV}(\sigma)} (Q)(s,a) }
&= \text{Var}\brac{\widehat  r^{\rho_{TV}(\sigma)}}
+\gamma^2 \Varr{\widehat{v}^{\rho_{TV}(\sigma)}(Q)(s,a)}
\\&\leq \brac{2r^2_{\max}+\gamma^2\frac{2r^2_{\max}}{(1-\gamma)^2} }(1+\brac{3 C^2_{TV}+ 6}(N_{\max}+1) ).
}
   This completes the proof.

\end{proof}

\begin{lemma}\label{lm:lminftv}
    For any fixed $Q\in \mathbb R^{|\mathcal{S}||\mathcal{A}|} $, the expected squared $\ell_\infty$-norm of the T-MLMC robust Bellman operator can be bounded as:
    \begin{align}
        \E\Fbrac{\norminf{\widehat {\mathcal{T}}^{\rho_{TV}(\sigma)}_{N_{\max}} (Q)}^2 }\leq \widetilde{\mathcal{O }}\brac{{N_{\max}}}.
        \end{align}
\end{lemma}

\begin{proof}[Proof of \Cref{lm:lminftv}]
We then consider the expectation of infinite norm of robust Bellman operator. Set $\psi=\frac{1}{2}$, from the construction of T-MLMC operator we directly have that
\eqenv{\label{eqeq149}
\E\Fbrac{\norminf{\widehat {\mathcal{T}}_{N_{\max}}^{\rho_{TV}(\sigma)}(Q)}^2 }\leq 
 4r^2_{\max}+4\gamma^2\frac{r^2_{\max}}{(1-\gamma)^2}+4\E\Fbrac{\sum_{N_1=0}^{N_{\max}} \sup_{s,a}\frac{\brac{\delta^{r,\rho(\sigma)}_{s,a,N_1}
  }^2}{2^{-N_1-1}}+\gamma^2\sum_{N_2=0}^{N_{\max}}\sup_{s,a}\frac{\brac{
\delta^{\rho_{TV}(\sigma)}_{s,a,N_2}(Q)  }^2}{2^{-N_2-1}} }. 
}

% We then consider the variance of the robust Bellman operator. Firstly, we make an error decomposition of the robust Bellman operator variance as 
% \eqenv{
% \text{Var}\brac{\widehat {\mathcal{T}}_{N_{\max}}^{\rho_{TV}(\sigma)} (Q)(s,a) }
% &= \text{Var}\brac{\widehat  r^{\rho_{TV}(\sigma)}+ \gamma \widehat{v}^{\rho_{TV}(\sigma)}(Q)(s,a) }
% \\&= \text{Var}\brac{\widehat  r^{\rho_{TV}(\sigma)}}
% +\gamma^2 \Varr{\widehat{v}^{\rho_{TV}(\sigma)}(Q)(s,a)},
% }
% which is due to the two estimators are independent.

For convenience, we analyze the last term in the above equation. 
% The first term can be bounded similarly. 
% \eqenv{
% \Varr{\widehat{v}^{\rho_{TV}(\sigma)}(Q)(s,a)}&= 
% \E\Fbrac{\brac{\widehat{v}^{\rho_{TV}(\sigma)}(Q)(s,a)}^2  }-\brac{\E\Fbrac{\widehat{v}^{\rho_{TV}(\sigma)}(Q)(s,a) }}^2
% \\&\leq \E\Fbrac{\brac{\widehat{v}^{\rho_{TV}(\sigma)}(Q)(s,a)}^2  }.
% }
% Next, according to the \cref{eq:delta,eq:hatq}, the term above can be explicitly computed: 
% \eqenv{\label{eqeq149}
% \E\Fbrac{\brac{\widehat{v}^{\rho_{TV}(\sigma)}(Q)(s,a)}^2  }& = \E\Fbrac{\brac{V(s'_{s,a,0})+ \frac{\delta^{\rho_{TV}(\sigma)}_{s,a,N_2}(Q)(s,a) }{P_{N_2}} }^2}
% \\& \leq 2 \E\Fbrac{V(s'_{s,a,0})^2 }+ 2\E \Fbrac{ \brac{\frac{\delta^{\rho_{TV}(\sigma)}_{s,a,N_2}(Q)(s,a) }{P_{N_2}} }^2 }
% \\& \leq \frac{2r^2_{\max}}{(1-\gamma)^2}+ 2\sum_{N=0}^{N_{\max}} \E\Fbrac{\brac{\frac{\delta^{\rho_{TV}(\sigma)}_{s,a,N_2}(Q)(s,a) }{P_{N_2}}|N_2=N }^2} P_N
% \\& \leq \frac{2r^2_{\max}}{(1-\gamma)^2}+ 2\sum_{N=0}^{N_{\max}} \frac{\E\Fbrac{(\delta^{\rho_{TV}(\sigma)}_{s,a,N}(Q)(s,a))^2} }{P_{N}}.
% }
Consider the term $\sup_{s,a}\lbrac{\delta^{\rho_{TV}(\sigma)}_{s,a,N}(Q)(s,a) }^2 $, 
% we can get that
% \eqenv{
% \sup_{s,a}\lbrac{\delta^{\rho_{TV}(\sigma)}_{s,a,N}(Q)(s,a) }&=
% \Bigg|\sup_{\alpha\geq 0}\varbrac{f^{\rho_{TV}(\sigma)}(\widehat p_{s,a,2^{N+1}},\alpha,V)} 
%     \\&\quad  -\frac{1}{2} \sup_{\alpha\geq 0}\varbrac{f^{\rho_{TV}(\sigma)}(\widehat p^E_{s,a,2^{N}},\alpha,V)}
%     -\frac{1}{2}\sup_{\alpha\geq 0}\varbrac{f^{\rho_{TV}(\sigma)}(\widehat p^O_{s,a,2^{N}},\alpha,V )}\Bigg|. 
% }
 we make an error decomposition as follows:
\eqenv{
\sup_{s,a}\lbrac{\delta^{\rho_{TV}(\sigma)}_{s,a,N}(Q)(s,a) }^2&=
\sup_{s,a}\Bigg|\sup_{\alpha\geq 0}\varbrac{f^{\rho_{TV}(\sigma)}(\widehat p_{s,a,2^{N+1}},\alpha,V)} 
        \\& \qquad-\frac{1}{2} \sup_{\alpha\geq 0}\varbrac{f^{\rho_{TV}(\sigma)}(\widehat p^E_{s,a,2^{N}},\alpha,V)}
    -\frac{1}{2}\sup_{\alpha\geq 0}\varbrac{f^{\rho_{TV}(\sigma)}(\widehat p^O_{s,a,2^{N}},\alpha,V )}\Bigg|^2
    \\& \leq  3\sup_{s,a}\lbrac{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{TV}(\sigma)}(\widehat p_{s,a,2^{N+1}},\alpha,V)}-{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{TV}(\sigma)}(p_{s,a},\alpha,V)} }  }^2
    \\&\quad+  \frac{3}{4}\sup_{s,a}\lbrac{ \sup_{\alpha\geq 0}\varbrac{f^{\rho_{TV}(\sigma)}( \widehat p^E_{s,a,2^N},\alpha,V)}-{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{TV}(\sigma)}(p_{s,a},\alpha,V)} } }^2
    \\&\quad+\frac{3}{4}\sup_{s,a}\lbrac{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{TV}(\sigma)}(\widehat p^O_{s,a,2^{N}},\alpha,V )}-{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{TV}(\sigma)}( p_{s,a},\alpha,V)} }}^2
    .\label{eqeq152}
    % \\& \myineq{}{}
}

Then, combined with the analysis in \cref{eqeq44,eqeq45} and the fact $\mathbb P(A\cap B\cap C)\geq 1- \mathbb P(\neg A)-\mathbb P(\neg B)-\mathbb P(\neg C)$, we can conclude that for any $N\geq 0$, 
 with probability at least $1-3*2^{-N} $
\eqenv{
\sup_{s,a}\lbrac{\delta^{\rho_{TV}(\sigma)}_{s,a,N}(Q)(s,a) }^2
&\leq 3\brac{C_{TV}\frac{r_{\max}}{1-\gamma} 2^{-\frac{N+1}{2}}}^2 + \frac{3}{4}\brac{C_{TV}\frac{r_{\max}}{1-\gamma} 2^{-\frac{N}{2}} }^2+ \frac{3}{4}\brac{C_{TV}\frac{r_{\max}}{1-\gamma} 2^{-\frac{N}{2}} }^2
\\& \leq  3\frac{C^2_{TV} r^2_{\max} }{(1-\gamma)^2}2^{-(N+1)}, 
}
 Since $ 0 \leq \sup_{\alpha\geq 0}\varbrac{f^{\rho_{TV}(\sigma)}(q,\alpha,V)}\leq \frac{r_{\max}}{1-\gamma}$ for any distribution $q$, with probability at most $3*2^{-N} $ 
we have that
\eqenv{
\sup_{s,a} \lbrac{\delta^{\rho_{TV}(\sigma)}_{s,a,N}(Q)(s,a) }^2
 \leq \brac{ \frac{r_{\max}}{1-\gamma}}^2.
}

Above all, we can get that 
\eqenv{
\E\Fbrac{\sup_{s,a}\lbrac{\delta^{\rho_{TV}(\sigma)}_{s,a,N}(Q)(s,a) }^2 }&\leq 3\frac{C^2_{TV} r^2_{\max} }{(1-\gamma)^2}2^{-(N+1)}+ \brac{ \frac{r_{\max}}{1-\gamma}}^23*2^{-N}
\\&\leq \brac{3 C^2_{TV}+ 6}\brac{ \frac{r_{\max}}{1-\gamma}}^2 2^{-N-1}. 
}
Thus we have that
\eqenv{\label{eqeq155}
\E\Fbrac{\sup_{s,a}\frac{\lbrac{\delta^{\rho_{TV}(\sigma)}_{s,a,N}(Q)(s,a) }^2}{P_N} }\leq \brac{3 C^2_{TV}+ 6}\brac{ \frac{r_{\max}}{1-\gamma}}^2.
}

Then, plug \cref{eqeq155} in \cref{eqeq149}, we can get the bound of expectation of infinite norm as follows:
\eqenv{
\E\Fbrac{\sum_{N_2=0}^{N_{\max}}\sup_{s,a}\frac{\brac{
\delta^{\rho_{TV}(\sigma)}_{s,a,N_2}(Q)  }^2}{2^{-N_2-1}} }
\leq \sum_{N_2=0}^{N_{\max}}\brac{3 C^2_{TV}+ 6}\brac{ \frac{r_{\max}}{1-\gamma}}^2=\brac{N_{\max}+1}\brac{3 C^2_{TV}+ 6}\brac{ \frac{r_{\max}}{1-\gamma}}^2. 
}


Similarly, we can get the bound as follows:
\eqenv{
\E\Fbrac{\sum_{N_1=0}^{N_{\max}} \sup_{s,a}\frac{\brac{\delta^{r,\rho(\sigma)}_{s,a,N_1}
  }^2}{2^{-N_1-1}}}\leq \sum_{N_1=0}^{N_{\max}}\brac{3 C^2_{TV}+ 6} r_{\max}^2=\brac{N_{\max}+1}\brac{3 C^2_{TV}+ 6}r^2_{\max}. 
}

Hence, combining \Cref{eqeq149} and the above equations, we can get the robust Bellman operator infinite norm bound:
\eqenv{\label{eqeq62}
\E\Fbrac{\norminf{\widehat {\mathcal{T}}_{N_{\max}}^{\rho_{TV}(\sigma)}(Q)}^2 }\leq 
4\brac{1+\brac{N_{\max}+1}\brac{3 C^2_{TV}+ 6}}\brac{r^2_{\max}+\gamma^2\brac{ \frac{r_{\max}}{1-\gamma}}^2}.
}
   This completes the proof.  
\end{proof}


Next, we present the proof of \cref{thm2:tv}.
% \begin{definition}[Threshold MLMC Robust Bellman Operator]\label{def:4.1}
%      We define the threshold MLMC robust Bellman operator $\boldsymbol{\bar {\mathcal{T}}}^{\rho_{TV}(\sigma)}_{N_{\max}} $ as follows
%      \eqenv{
%      \boldsymbol{\bar {\mathcal{T}}}^{\rho_{TV}(\sigma)}_{N_{\max}} (Q)(s,a)= \E\Fbrac{\widehat{\mathcal{T}}^{\rho_{TV}(\sigma)}_{N_{\max}}(Q)(s,a) }. 
%      }
%  \end{definition}
 
\begin{theorem}[Restatement of \cref{thm2:tv}]
 Set $\psi=\frac{1}{2}$, and set the stepsize as $$\beta_t=\beta=\frac{2\log T}{(1-\gamma)T}. $$
Then the output from \cref{alg:example} satisfies that:
 \begin{align}
    \mathbb E &\Fbrac{\mynorm{\widehat Q_T^{\rho_{TV}(\sigma)}-Q^{*\rho_{TV}(\sigma)}}_\infty^2}\nonumber
    %\\& \leq \frac{c_0}{(1-\gamma)^2}\brac{1-\frac{\alpha(1-\gamma)}{2}}^T+ c_1 \alpha\frac{\log (|\mathcal{S}||\mathcal{A}|)}{ (1-\gamma)^4}\nonumber
    \leq\widetilde{\mathcal{O }}\brac{\frac{1}{  (1-\gamma)^5 T}}.
\end{align}
To obtain an $\epsilon$-optimal policy, i.e., 
\begin{align}
    \mathbb E \Fbrac{\mynorm{\widehat Q_T^{\rho_{TV}(\sigma)}-Q^{*,\rho_{TV}(\sigma)}}_\infty^2}\leq \epsilon^2,\nonumber
\end{align}
the expected sample complexity $N^{\rho_{TV}(\sigma)}(\epsilon)$ is
 \begin{align}
     N^{\rho_{TV}(\sigma)}(\epsilon) = |\mathcal{S}||\mathcal{A}|N_{\max} T\geq \widetilde{\mathcal{O }}\brac{\frac{|\mathcal{S}||\mathcal{A}|}{ (1-\gamma)^5 \epsilon^2 }}.\nonumber
 \end{align}
\end{theorem}
\begin{proof}
The update of our algorithm can be equivalently written as 
\eqenv{
\widehat Q^{\rho_{TV}(\sigma)}_{t+1}=\widehat Q^{\rho_{TV}(\sigma)}_{t}+ \beta_t\brac{\boldsymbol{\bar{\mathcal{T}}}^{\rho_{TV}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{TV}(\sigma)}_{t}}- \widehat Q^{\rho_{TV}(\sigma)}_{t}+ W_t  },
}
where $W_t={\widehat{\mathcal{T}}}^{\rho_{TV}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{TV}(\sigma)}_{t}}-\boldsymbol{\bar{\mathcal{T}}}^{\rho_{TV}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{TV}(\sigma)}_{t}} $. 

Define the filtration $\mathcal{F}_t=\varbrac{Q^{\rho_{TV}(\sigma)}_0, W_0,...,Q^{\rho_{TV}(\sigma)}_{t-1},W_{t-1},Q^{\rho_{TV}(\sigma)}_{t} } $. 
Note that by definition we have that 
\eqenv{
\E\Fbrac{W_t|\mathcal{F}_t }=0,
}
and  by \cref{lm:lminftv}, we can get that 
\eqenv{
\E\Fbrac{\norminf{W_t}^2|\mathcal{F}_t }
&\leq {\E\Fbrac{\sup_{s,a}\lbrac{{\widehat{\mathcal{T}}}^{\rho_{TV}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{TV}(\sigma)}_{t}}(s,a)-\boldsymbol{\bar{\mathcal{T}}}^{\rho_{TV}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{TV}(\sigma)}_{t}}(s,a)}^2|\mathcal{F}_t }}
\\& \leq 2{\E\Fbrac{\sup_{s,a} \lbrac{{\widehat{\mathcal{T}}}^{\rho_{TV}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{TV}(\sigma)}_{t}}(s,a)}^2+\sup_{s,a}\lbrac{\boldsymbol{\bar{\mathcal{T}}}^{\rho_{TV}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{TV}(\sigma)}_{t}}(s,a)}^2|\mathcal{F}_t }}
\\&\myineq{\leq}{a} 4\E\Fbrac{\sup_{s,a}\lbrac{{\widehat{\mathcal{T}}}^{\rho_{TV}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{TV}(\sigma)}_{t}}(s,a)}^2|\mathcal{F}_t }
\\& \myineq{\leq}{b} 16\brac{1+\brac{N_{\max}+1}\brac{3 C^2_{TV}+ 6}}\brac{r^2_{\max}+\gamma^2\brac{ \frac{r_{\max}}{1-\gamma}}^2}, 
% \\& \leq \max_{s,a} \varbrac{\text{Var}\brac{\widehat {\mathcal{T}}^{\rho_{TV}(\sigma)}_{N_{\max}} (Q^{\rho_{TV}(\sigma)}_{t})(s,a)  } }
% \\& \leq \brac{2r^2_{\max}+2\gamma^2  \norminf{Q^{\rho_{TV}(\sigma)}_{t} }^2 }(1+\brac{9 C^2_{TV}+ 3}(N_{\max}+1) ),
% \\& \myineq{\leq}{i} \brac{2r^2_{\max}+\gamma^2\frac{2r^2_{\max}}{(1-\gamma)^2} }(1+\brac{9 C^2_{TV}+ 3}(N_{\max}+1) ),
} where $(a) $ follows from that
\eqenv{
\E\Fbrac{\sup_{s,a}\lbrac{\boldsymbol{\bar{\mathcal{T}}}^{\rho_{TV}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{TV}(\sigma)}_{t}}(s,a)}^2|\mathcal{F}_t }&= 
\E\Fbrac{\sup_{s,a}\lbrac{\E\Fbrac{\widehat{\mathcal{T}}^{\rho_{TV}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{TV}(\sigma)}_{t}}(s,a)}}^2\Big|\mathcal{F}_t }
\\& \leq \E\Fbrac{\sup_{s,a}\E\Fbrac{\lbrac{\widehat{\mathcal{T}}^{\rho_{TV}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{TV}(\sigma)}_{t}}(s,a)}^2}\Big|\mathcal{F}_t }
\\& \leq \E\Fbrac{\sup_{s,a} {\lbrac{\widehat{\mathcal{T}}^{\rho_{TV}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{TV}(\sigma)}_{t}}(s,a)}^2}\Big|\mathcal{F}_t }, 
}
and $(b)$ follows from \Cref{lm:lminftv}.

According to \cref{eq:fixp}, we have that
$$\widehat Q^{*\rho_{TV}(\sigma)}(s,a) = \hatT (\widehat Q^{*\rho_{TV}(\sigma)})(s,a)=\E\Fbrac{\hatt (\widehat Q^{*\rho_{TV}(\sigma)})(s,a)}.$$
Then, to apply \cref{lm:chen} \citep{chen2020finite}, we set the constant stepsize $\beta_t=\beta= \frac{2\log T}{(1-\gamma)T}$. We note that as long as $T\geq  {\mathcal{O }}\bigg(\frac{\log T}{(1-\gamma)^3} \bigg)$, it holds that  
$$ \beta={\frac{2\log T}{(1-\gamma)T}\leq\frac{(1-\gamma)^2}{128 e  \log(|\mathcal{S}||\mathcal{A}|) }},$$ and the conditions of \cref{lm:chen} are satisfied. We hence have that
\eqenv{\label{eqeq166}
&\E\Fbrac{\norminf{\widehat Q_{T}^{\rho_{TV}(\sigma)}-\widehat Q^{*\rho_{TV}(\sigma)} }^2}
\\ &\myineq{\leq}{i} \frac{3}{2} \norminf{\widehat Q_{0}^{\rho_{TV}(\sigma)}-\widehat Q^{*\rho_{TV}(\sigma)} }^2\prod_{j=0}^{T-1} \brac{1-\frac{1-\gamma}{2} \beta_j} +\frac{16 e \log (|\mathcal{S}||\mathcal{A}|)}{1-\gamma}16\brac{r^2_{\max}+\gamma^2\frac{r^2_{\max}}{(1-\gamma)^2} }\\& \quad\cdot\brac{1+\brac{N_{\max}+1}\brac{3 C^2_{TV}+ 6}}\sum_{i=0}^{T-1} \beta_i^2 \prod_{t=i+1}^{T-1} (1-
\frac{1-\gamma}{2}\beta_t)
\\& \myineq{\leq}{ii} \frac{3}{2}\frac{r^2_{\max}}{(1-\gamma)^2} \frac{1}{T} +\frac{16 e \log (|\mathcal{S}||\mathcal{A}|)}{1-\gamma}16\brac{r^2_{\max}+\gamma^2\frac{r^2_{\max}}{(1-\gamma)^2} }
\brac{1+\brac{N_{\max}+1}\brac{3 C^2_{TV}+ 6}}\frac{4 \log T }{(1-\gamma)^2 T}
,
} where $(i)$ follows from \cref{lm:chen}, and $(ii)$ follows from $(1-(1-\gamma)\beta/2)^T\leq \frac{1}{T}$. 

We set $N_{\max}=\frac{2\log T}{\log 2}$, then the bound of $\mathbb E \Fbrac{\mynorm{\widehat Q_T^{\rho_{TV}(\sigma)}-Q^{*\rho_{TV}(\sigma)}}_\infty^2}$ can be obtained as follows
 \eqenv{
 \mathbb E& \Fbrac{\mynorm{\widehat Q_T^{\rho_{TV}(\sigma)}-Q^{*\rho_{TV}(\sigma)}}_\infty^2}
\\& \leq 
 2\E\Fbrac{\mynorm{\widehat Q_T^{\rho_{TV}(\sigma)}-\widehat Q^{*\rho_{TV}(\sigma)}}_\infty^2 }+ 2\E\Fbrac{\mynorm{\widehat Q^{*\rho_{TV}(\sigma)}-Q^{*\rho_{TV}(\sigma)}}_\infty^2 }
 \\& \myineq{\leq}{i}
  \frac{32 e \log (|\mathcal{S}||\mathcal{A}|)}{1-\gamma}16\brac{r^2_{\max}+\gamma^2\frac{r^2_{\max}}{(1-\gamma)^2} }
\brac{1+\brac{N_{\max}+1}\brac{3 C^2_{TV}+ 6}}\frac{4 \log T }{(1-\gamma)^2 T}
\\& \qquad+\frac{3r^2_{\max}}{(1-\gamma)^2 T} +\frac{2}{1-\gamma}\brac{\brac{r_{\max}+\frac{r_{\max}}{1-\gamma}}2^{-\frac{N_{\max}+1}{2}}\brac{2^{-\frac{N_{\max}+1}{2}}+ C_{TV} }}^2
\\& \myineq{\leq}{ii}  \frac{32 e \log (|\mathcal{S}||\mathcal{A}|)}{1-\gamma}16\brac{r^2_{\max}+\gamma^2\frac{r^2_{\max}}{(1-\gamma)^2} }
\brac{1+\brac{N_{\max}+1}\brac{3 C^2_{TV}+ 6}}\frac{4 \log T }{(1-\gamma)^2 T}
\\& \qquad+\frac{3r^2_{\max}}{(1-\gamma)^2 T} +\frac{1}{1-\gamma}\brac{\brac{r_{\max}+\frac{r_{\max}}{1-\gamma}}\brac{1+ C_{TV} }}^2\frac{1}{T}
\\&= \widetilde{\mathcal{O }}\brac{\frac{1}{(1-\gamma)^5T}},
 }where $(i)$ follows from \cref{lm:a8,thm:tv1}, and $(ii)$ follows from $2^{-(N_{\max}+1)}=\frac{1}{2T^2}\leq \frac{1}{2T} $, which holds since $N_{\max}=\frac{2\log T}{\log 2}$. 

When $\mathbb E \Fbrac{\mynorm{\widehat Q_T^{\rho_{TV}(\sigma)}-Q^{*,\rho_{TV}(\sigma)}}_\infty^2}\leq \epsilon^2 $, the iteration $T\geq \widetilde{\mathcal{O }}\brac{(1-\gamma)^{-5}\epsilon^{-2}}$. When $\psi=\frac{1}{2}$, the expected sample size per iteration is 
$$1+\sum_{n=0}^{N_{\max}} 2^{n+1}P_n=1+ \sum_{n=0}^{N_{\max}} 2^{n+1} \frac{1}{2^{n+1}} =N_{\max}+2=\frac{2\log T}{\log 2}+2.$$
Above all, the total sample complexity is $\widetilde{\mathcal{O }}\brac{|\mathcal{S}||\mathcal{A}|(1-\gamma)^{-5}\epsilon^{-2} }$. 


This completes the proof.   
\end{proof}




\section{$\chi^2$ Divergence Uncertainty Set}
Similar to the proof in the TV part,  we can complete the proof. Here we show the detailed result.
\begin{theorem}[Restatement of \cref{thm:tv1} specifically for $\chi^2$ distance]
   Set $\psi=\frac{1}{2}$, then for any $Q\in \mathbb R^{\mathcal{S}\times\mathcal{A}}, s\in \mathcal{S}, a\in \mathcal{A} $, the estimation bias can be bounded as:
    \begin{align}
        \sup_{s,a}\lbrac{\mathbb E\Fbrac{\widehat {\mathcal{T}}_{N_{\max}}^{\rho_{\chi^2}(\sigma)} (Q)(s,a) } - {\mathcal{T}}^{\rho_{\chi^2}(\sigma)} (Q)(s,a)}
         \leq \widetilde{\mathcal{O }}\brac{{2^{-\frac{N_{\max}}{2}}} }. \nonumber
        % \brac{r_{\max}+\frac{r_{\max}}{1-\gamma}}
    \end{align}
    % where $C_{\chi^2}= 4\sqrt{{4(N_{\max}+1)(\sigma+1)} \log(24 |\mathcal{S}||\mathcal{A}|)} $; 
    The variance can be bounded as:
    \begin{align}
        \text{Var}\brac{\widehat {\mathcal{T}}_{N_{\max}}^{\rho_{\chi^2}(\sigma)} (Q)(s,a)  }
\leq \widetilde{\mathcal{O }}\brac{{N_{\max}}}.
    \end{align}
\end{theorem}

\begin{proof}
Firstly, we make error decomposition as follows:
    \eqenv{
    &\sup_{s,a}\lbrac{ \E\Fbrac{\widehat {\mathcal{T}}_{N_{\max}}^{\rho_{\chi^2}(\sigma)} (Q)(s,a) } - {\mathcal{T}}^{\rho_{\chi^2}(\sigma)} (Q)(s,a)}
   \\ &\myineq{=}{i} \sup_{s,a}\bigg|\E\Fbrac{g^{*\rho_{\chi^2}(\sigma)}(\hat \mu_{s,a,2^{N_{\max}+1}},r_{s,a})}+\gamma\E\Fbrac{f^{*\rho_{\chi^2}(\sigma)}(\hat p_{s,a,2^{N_{\max}+1}},V )}
   \\& \qquad- {g^{*\rho_{\chi^2}(\sigma)}(\mu_{s,a},r_{s,a})}
- \gamma  {f^{*\rho_{\chi^2}(\sigma)}(p_{s,a},V )} \bigg|
\\& \leq \sup_{s,a}\lbrac{\E\Fbrac{g^{*\rho_{\chi^2}(\sigma)}(\hat \mu_{s,a,2^{N_{\max}+1}},r_{s,a})}- {g^{*\rho_{\chi^2}(\sigma)}(\mu_{s,a},r_{s,a})} }
\\& \qquad+\gamma\sup_{s,a}\lbrac{\E\Fbrac{f^{*\rho_{\chi^2}(\sigma)}(\hat p_{s,a,2^{N_{\max}+1}},V )}-{f^{*\rho_{\chi^2}(\sigma)}(p_{s,a},V)} }\\& \leq \E\Fbrac{\sup_{s,a}\lbrac{ {g^{*\rho_{\chi^2}(\sigma)}(\hat \mu_{s,a,2^{N_{\max}+1}},r_{s,a})}- {g^{*\rho_{\chi^2}(\sigma)}(\mu_{s,a},r_{s,a})} } }\\& \qquad+\gamma \E\Fbrac{ \sup_{s,a}\lbrac{{f^{*\rho_{\chi^2}(\sigma)}(\hat p_{s,a,2^{N_{\max}+1}},V )}-{f^{*\rho_{\chi^2}(\sigma)}(p_{s,a},V)} }}, \label{eq:eqc44}
    }where $(i)$ follows from \cref{prop:mlmc}. 

    Then, for convenience, we bound the second term in \cref{eq:eqc44}. The first term can be bounded similarly.  By \cref{lm:tv}, 
    \eqenv{\label{eqeq142}
    &\lbrac{ {f^{*\rho_{\chi^2}(\sigma)}(\hat p_{s,a,2^{N_{\max}+1}},V )}-{f^{*\rho_{\chi^2}(\sigma)}(p_{s,a},V)} }
    \\&= \Bigg|\max_{\alpha\geq 0}\varbrac{ \mathbb E_{p_{s,a}}\Fbrac{(V(s'_{s,a}))_\alpha}-\sqrt{\sigma \text{Var}_{p_{s,a}}\Fbrac{(V(s'_{s,a}))_\alpha } } }\\&\qquad\qquad-\max_{\alpha\geq 0}\varbrac{ \mathbb E_{\hat p_{s,a,2^{N_{\max}+1}}}\Fbrac{(V(s'_{s,a}))_\alpha}-\sqrt{\sigma \text{Var}_{\hat p_{s,a,2^{N_{\max}+1}}}\Fbrac{(V(s'_{s,a}))_\alpha } }  }\Bigg|
    \\& \leq \max_{0\leq \alpha\leq \max_{s'_{s,a}} V(s'_{s,a}) }\Bigg|\mathbb E_{p_{s,a}}\Fbrac{(V(s'_{s,a}))_\alpha}-\mathbb E_{\hat p_{s,a,2^{N_{\max}+1}}}\Fbrac{(V(s'_{s,a}))_\alpha}
    \\& \qquad- \sqrt{\sigma \text{Var}_{p_{s,a}}\Fbrac{(V(s'_{s,a}))_\alpha } } +\sqrt{\sigma \text{Var}_{\hat p_{s,a,2^{N_{\max}+1}}}\Fbrac{(V(s'_{s,a}))_\alpha } }\Bigg|.
    }
    

    According to Lemma 15 and its proof in \citep{shi2023curious}, we have the following lemma.

\begin{lemma}\label{lm:chi2lm}
    Consider the case of $\chi^2$ constraint uncertainty set $\mathcal{P}^{ {\chi^2}}(\sigma)$ with uncertainty level $\sigma$, for any $\delta\in(0,1)$, one has with probability at least $1-\delta$, 
    \eqenv{
   \max_{0\leq \alpha\leq \max_{s'_{s,a}} V(s'_{s,a}) }& \Bigg|\mathbb E_{p_{s,a}}\Fbrac{(V(s'_{s,a}))_\alpha}-\sqrt{\sigma \text{Var}_{p_{s,a}}\Fbrac{(V(s'_{s,a}))_\alpha } } 
   \\&-\mathbb E_{\hat p_{s,a,N}}\Fbrac{(V(s'_{s,a}))_\alpha}+\sqrt{\sigma \text{Var}_{\hat p_{s,a,N}}\Fbrac{(V(s'_{s,a}))_\alpha } }\Bigg|
   \leq 4\sqrt{\frac{2 r^2_{\max} (1+\sigma)\log \brac{\frac{24 N}{\delta}}}{(1-\gamma)^2N}}.
    }
\end{lemma}
According to \Cref{lm:chi2lm}, we can get that with probability at least $1-\frac{2^{-{N_{\max}-1}}}{\ssa} $, we have 
    \eqenv{\label{eqeq2721}
    &\lbrac{ {f^{*\rho_{\chi^2}(\sigma)}(\hat p_{s,a,2^{N_{\max}+1}},V )}-{f^{*\rho_{\chi^2}(\sigma)}(p_{s,a},V)} }
    \\&\qquad\qquad\leq 4 r_{\max}\sqrt{\frac{2 (1+\sigma) \brac{\log \brac{24|\mathcal{S}||\mathcal{A}|}+ {{2(N_{\max}+1)}}\log 2}}{(1-\gamma)^2 2^{{N_{\max}+1}}}}= C_{\chi^2} \frac{r_{\max}}{1-\gamma}2^{-\frac{N_{\max}+1}{2}},
    } where $C_{\chi^2}=4\sqrt{2 (1+\sigma) \brac{\log \brac{24|\mathcal{S}||\mathcal{A}|}+ {{2(N_{\max}+1)}}\log 2}} $.

    Then, according to the Bernoulli's inequality, we have that 
\eqenv{
\brac{1-\frac{2^{-N_{\max}-1}}{\ssa} }^\ssa\geq 1- 2^{-N_{\max}-1}. 
}

Therefore, with probability at least $1-{2^{-{N_{\max}-1}}} $, we have
\eqenv{\label{eqc280}
\sup_{s,a} \lbrac{ {f^{*\rho_{\chi^2}(\sigma)}(\hat p_{s,a,2^{N_{\max}+1}},V )}-{f^{*\rho_{\chi^2}(\sigma)}(p_{s,a},V)} }\leq C_{\chi^2} \frac{r_{\max}}{1-\gamma}2^{-\frac{N_{\max}+1}{2}}.
}

     Otherwise, we have 
    \eqenv{\label{eqc281}
    \sup_{s,a}&\bigg|\max_{\max_{s'_{s,a}} V(s'_{s,a}) \geq \alpha\geq 0}\bigg\{\lbrac{\mathbb E_{p_{s,a}}\Fbrac{(V(s'_{s,a}))_\alpha}-E_{\hat p_{s,a,2^{N_{\max}+1}}}\Fbrac{(V(s'_{s,a}))_\alpha}}
    \\& \qquad+ \lbrac{\sqrt{\sigma \text{Var}_{p_{s,a}}\Fbrac{(V(s'_{s,a}))_\alpha } } -\sqrt{\sigma \text{Var}_{\hat p_{s,a,2^{N_{\max}+1}}}\Fbrac{(V'(s'_{s,a}))_\alpha } }}\bigg\}\bigg|
    \\ 
    &\leq \sup_{s,a}\max_{s'_{s,a}} V(s'_{s,a})+ \sup_{s,a}\lbrac{\max_{0\leq \alpha\leq \max_{s'_{s,a}} V(s'_{s,a}) }\lbrac{\sqrt{\sigma \text{Var}_{p_{s,a}}\Fbrac{(V(s'_{s,a}))_\alpha } } -\sqrt{\sigma \text{Var}_{\hat p_{s,a,2^{N_{\max}+1}}}\Fbrac{(V'(s'_{s,a}))_\alpha } }}}
    \\& \leq \frac{r_{\max}}{1-\gamma}+ \sup_{s,a}\sqrt{\sigma \brac{\max_{s'_{s,a}} V(s'_{s,a}) }^2}
    \\& \leq (1+\sqrt{\sigma})\frac{r_{\max}}{1-\gamma}. 
    }
% Set $C_{\chi^2}= 4\sqrt{{4(N_{\max}+1)(\sigma+1)} \log(24 |\mathcal{S}||\mathcal{A}|)} $.
Plugging the above equations to \cref{eqeq272}, we can conclude that
    \eqenv{
    \E\Fbrac{\sup_{s,a}\lbrac{{f^{*\rho_{\chi^2}(\sigma)}(\hat p_{s,a,2^{N_{\max}+1}},V)}-{f^{*\rho_{\chi^2}(\sigma)}(p_{s,a},V)} }}
     &\myineq{\leq}{i} C_{\chi^2} \frac{r_{\max}}{1-\gamma}2^{-\frac{N_{\max}+1}{2}} + (1+\sqrt{\sigma})\frac{r_{\max}}{1-\gamma}2^{-\brac{N_{\max}+1}} 
    \\& {\leq} \frac{r_{\max}}{1-\gamma}2^{-\frac{N_{\max}+1}{2}}\brac{(1+\sqrt{\sigma})2^{-\frac{N_{\max}+1}{2}}+ C_{\chi^2}},
    } where $(i)$ follows from the fact that $1-2^{-\brac{N_{\max}+1}}\leq 1$. 

    Similarly, we can get the bound
    \eqenv{
    \E\Fbrac{\sup_{s,a}\lbrac{{g^{*\rho_{\chi^2}(\sigma)}(\hat \mu_{s,a,2^{N_{\max}+1}},r_{s,a})}- {g^{*\rho_{\chi^2}(\sigma)}(\mu_{s,a},r_{s,a})} }}
    \leq r_{\max}2^{-\frac{N_{\max}+1}{2}}\brac{(1+\sqrt{\sigma})2^{-\frac{N_{\max}+1}{2}}+ 3C_{\chi^2} }.
    }

    Thus, we can get that
    \eqenv{
    \sup_{s,a}&\lbrac{ \E\Fbrac{\widehat {\mathcal{T}}_{N_{\max}}^{\rho_{\chi^2}(\sigma)} (Q)(s,a) } - {\mathcal{T}}^{\rho_{\chi^2}(\sigma)} (Q)(s,a)}
\\& \leq \E\Fbrac{\sup_{s,a}\lbrac{ {g^{*\rho_{\chi^2}(\sigma)}(\hat \mu_{s,a,2^{N_{\max}+1}},r_{s,a})}- {g^{*\rho_{\chi^2}(\sigma)}(\mu_{s,a},r_{s,a})} } }
\\& \qquad +\gamma \E\Fbrac{\sup_{s,a} \lbrac{{f^{*\rho_{\chi^2}(\sigma)}(\hat p_{s,a,2^{N_{\max}+1}},V )}-{f^{*\rho_{\chi^2}(\sigma)}(p_{s,a},V)} }}
\\& \leq \brac{\frac{\gamma r_{\max}}{1-\gamma}+r_{\max} }2^{-\frac{N_{\max}+1}{2}}\brac{(1+\sqrt{\sigma})2^{-\frac{N_{\max}+1}{2}}+ 3C_{\chi^2} }.
    }
        
    % $(i)$ follows from the dual representation of Wasserstein distance and the facts that 1). the function $ (v(x))_\alpha$ is a Lipschitz function on $[0,\infty)$ with bounded Lipschitz constant $1$. 2) $(v(x))_\alpha\leq \alpha \leq \max_{s'_{s,a}}V(s'_{s,a})\leq \max_{s,a}Q(s,a)\leq \frac{r_{\max}}{1-\gamma}  $.  $(ii)$ follows from  Wasserstein distance inequality in \citep{fournier2015rate}. 

    % Hence, we can get that
    % \eqenv{
    % \norminf{ {\widehat {\mathcal{T}}_{N_{\max}}^{\rho_{\chi^2}(\sigma)} (Q) } - {\mathcal{T}}^{\rho_{\chi^2}(\sigma)} (Q) } \leq \max_{s,a} \lbrac{ {\widehat {\mathcal{T}}_{N_{\max}}^{\rho_{\chi^2}(\sigma)} (Q)(s,a) } - {\mathcal{T}}^{\rho_{\chi^2}(\sigma)} (Q)(s,a) }
    % \\& \leq 
    % }

\textbf{Variance: }Next, we consider the variance of the robust Bellman operator. Firstly, we make error decomposition of the robust Bellman operator variance. 
\eqenv{
\text{Var}\brac{\widehat {\mathcal{T}}_{N_{\max}}^{\rho_{\chi^2}(\sigma)} (Q)(s,a) }
&= \text{Var}\brac{\widehat  r^{\rho_{\chi^2}(\sigma)}+ \gamma \widehat{v}^{\rho_{\chi^2}(\sigma)}(Q)(s,a) }
\\&= \text{Var}\brac{\widehat  r^{\rho_{\chi^2}(\sigma)}}
+\gamma^2 \Varr{\widehat{v}^{\rho_{\chi^2}(\sigma)}(Q)(s,a)}.
}
Here the second equality holds because the two estimators are independent. For convenience, we analyze the second term in the above equation. The first term can be bounded similarly. 
\eqenv{
\Varr{\widehat{v}^{\rho_{\chi^2}(\sigma)}(Q)(s,a)}= 
\E\Fbrac{\brac{\widehat{v}^{\rho_{\chi^2}(\sigma)}(Q)(s,a)}^2  }-\brac{\E\Fbrac{\widehat{v}^{\rho_{\chi^2}(\sigma)}(Q)(s,a) }}^2
\leq \E\Fbrac{\brac{\widehat{v}^{\rho_{\chi^2}(\sigma)}(Q)(s,a)}^2  }.
}
Next, according to \cref{eq:delta,eq:hatq}, we take the expectation over the random level $N_2$ and expand the second moment as follows:
\eqenv{\label{eqeq49}
\E\Fbrac{\brac{\widehat{v}^{\rho_{\chi^2}(\sigma)}(Q)(s,a)}^2  }& = \E\Fbrac{\brac{V(s'_{s,a,0})+ \frac{\delta^{\rho_{\chi^2}(\sigma)}_{s,a,N_2}(Q)(s,a) }{P_{N_2}} }^2}
\\& \leq 2 \E\Fbrac{V(s'_{s,a,0})^2 }+ 2\E \Fbrac{ \brac{\frac{\delta^{\rho_{\chi^2}(\sigma)}_{s,a,N_2}(Q)(s,a) }{P_{N_2}} }^2 }
\\& \leq \frac{2r^2_{\max}}{(1-\gamma)^2}+ 2\sum_{N=0}^{N_{\max}} \E\Fbrac{\brac{\frac{\delta^{\rho_{\chi^2}(\sigma)}_{s,a,N_2}(Q)(s,a) }{P_{N_2}} }^2\Big|N_2=N} P_N
\\& \leq \frac{2r^2_{\max}}{(1-\gamma)^2}+ 2\sum_{N=0}^{N_{\max}} \frac{\E\Fbrac{(\delta^{\rho_{\chi^2}(\sigma)}_{s,a,N}(Q)(s,a))^2} }{P_{N}}.
}
Next, we bound the term $\lbrac{\delta^{\rho_{\chi^2}(\sigma)}_{s,a,N}(Q)(s,a) } $,
\eqenv{
&\lbrac{\delta^{\rho_{\chi^2}(\sigma)}_{s,a,N}(Q)(s,a) }
\\&=
\lbrac{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{\chi^2}(\sigma)}(\widehat p_{s,a,2^{N+1}},\alpha,V)} 
     -\frac{1}{2} \sup_{\alpha\geq 0}\varbrac{f^{\rho_{\chi^2}(\sigma)}(\widehat p^E_{s,a,2^{N}},\alpha,V)}
    -\frac{1}{2}\sup_{\alpha\geq 0}\varbrac{f^{\rho_{\chi^2}(\sigma)}(\widehat p^O_{s,a,2^{N}},\alpha,V )}}. 
}

% Recall that
% $\boldsymbol{\alpha^*}^{\rho_{\chi^2}(\sigma)}(p, v(x))\in \arg\max_{\alpha} f^{\rho_{\chi^2}(\sigma)}(p, \alpha, v(x)) $. 

% According to \cref{lm:qtv}, when $$\boldsymbol{\alpha^*}^{\rho_{\chi^2}(\sigma)}(\widehat p^E_{2^{N}}, V)=\boldsymbol{\alpha^*}^{\rho_{\chi^2}(\sigma)}(\widehat p^O_{2^{N}}, V) ,\quad \widehat p^E_{2^{N}}(s')_{s'\in\min_s V(s)}\neq 0,\quad \widehat p^O_{2^{N}}(s')_{s'\in \min_s V(s)}\neq 0,$$ then
% \eqenv{
% \delta^{\rho_{\chi^2}(\sigma)}_{s,a,N}(Q)(s,a) =0. 
% }

% Otherwise,

We make an error decomposition as follows:
\eqenv{
&\lbrac{\delta^{\rho_{\chi^2}(\sigma)}_{s,a,N}(Q)(s,a) }^2\\&=
\lbrac{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{\chi^2}(\sigma)}(\widehat p_{s,a,2^{N+1}},\alpha,V)} 
     -\frac{1}{2} \sup_{\alpha\geq 0}\varbrac{f^{\rho_{\chi^2}(\sigma)}(\widehat p^E_{s,a,2^{N}},\alpha,V)}
    -\frac{1}{2}\sup_{\alpha\geq 0}\varbrac{f^{\rho_{\chi^2}(\sigma)}(\widehat p^O_{s,a,2^{N}},\alpha,V )}}^2
    \\& \leq 3\lbrac{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{\chi^2}(\sigma)}(\widehat p_{s,a,2^{N+1}},\alpha,V)}-{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{\chi^2}(\sigma)}( p_{s,a},\alpha,V)} }  }^2
    \\&\quad+  \frac{3}{4}\lbrac{ \sup_{\alpha\geq 0}\varbrac{f^{\rho_{\chi^2}(\sigma)}(\widehat p^E_{s,a,2^{N}},\alpha,V)}-{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{\chi^2}(\sigma)}( p_{s,a},\alpha,V)} } }^2
    \\&\quad+\frac{3}{4}\lbrac{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{\chi^2}(\sigma)}(\widehat p^O_{s,a,2^{N}},\alpha,V )}-{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{\chi^2}(\sigma)}( p_{s,a},\alpha,V)} }}^2.\label{eqeq52}
    % \\& \myineq{}{}
}

According to \Cref{lm:chi2lm}, we can get that with probability at least $1-2^{-{N_{\max}-1}} $, we have 
    \eqenv{\label{eqeq272}
    &\lbrac{ {f^{*\rho_{\chi^2}(\sigma)}(\hat p_{s,a,2^{N_{\max}+1}},V )}-{f^{*\rho_{\chi^2}(\sigma)}(p_{s,a},V)} }
    \\&\qquad\qquad\leq 4 \sqrt{\frac{2r_{\max}^2 (1+\sigma) \brac{\log \brac{24|\mathcal{S}||\mathcal{A}|}+ {{2(N_{\max}+1)}}\log 2}}{(1-\gamma)^2 2^{{N_{\max}+1}}}}= C_{\chi^2} \frac{r_{\max}}{1-\gamma}2^{-\frac{N_{\max}+1}{2}},
    } where $C_{\chi^2}=4\sqrt{2 (1+\sigma) \brac{\log \brac{24|\mathcal{S}||\mathcal{A}|}+ {{2(N_{\max}+1)}}\log 2}} $.
     Otherwise, we have 
    \eqenv{\label{eqeq273}
    \E&\bigg[\max_{\max_{s'_{s,a}} V(s'_{s,a}) \geq \alpha\geq 0}\bigg\{\lbrac{\mathbb E_{p_{s,a}}\Fbrac{(V(s'_{s,a}))_\alpha}-E_{\hat p_{s,a,2^{N_{\max}+1}}}\Fbrac{(V(s'_{s,a}))_\alpha}}
    \\& \qquad+ \lbrac{\sqrt{\sigma \text{Var}_{p_{s,a}}\Fbrac{(V(s'_{s,a}))_\alpha } } -\sqrt{\sigma \text{Var}_{\hat p_{s,a,2^{N_{\max}+1}}}\Fbrac{(V'(s'_{s,a}))_\alpha } }}\bigg\}\bigg]
    \\ %& \leq  \max_{0\leq \alpha\leq \max_{s'_{s,a}} V(s'_{s,a}) }\lbrac{\mathbb E_{p_{s,a}}\Fbrac{(V(s'_{s,a}))_\alpha}-E_{\hat p_{s,a,2^{N_{\max}+1}}}\Fbrac{(V(s'_{s,a}))_\alpha}}\\&
    &\leq \max_{s'_{s,a}} V(s'_{s,a})+ \max_{0\leq \alpha\leq \max_{s'_{s,a}} V(s'_{s,a}) }\lbrac{\sqrt{\sigma \text{Var}_{p_{s,a}}\Fbrac{(V(s'_{s,a}))_\alpha } } -\sqrt{\sigma \text{Var}_{\hat p_{s,a,2^{N_{\max}+1}}}\Fbrac{(V'(s'_{s,a}))_\alpha } }}
    \\& \leq \max_{s'_{s,a}} V(s'_{s,a})+ \sqrt{\sigma \brac{\max_{s'_{s,a}} V(s'_{s,a}) }^2}
    \\& \leq (1+\sqrt{\sigma})\frac{r_{\max}}{1-\gamma}. 
    }


Then, combined with the analysis in \cref{eqeq272,eqeq273} and the fact that for any events $A,B,C$, $\mathbb P(A\cap B\cap C)\geq 1- \mathbb P(\neg A)-\mathbb P(\neg B)-\mathbb P(\neg C)$, we can conclude that
 with probability at least $1-3\cdot 2^{-N} $,
% Plug \cref{eqeq52} in \cref{}
\eqenv{
\lbrac{\delta^{\rho_{\chi^2}(\sigma)}_{s,a,N}(Q)(s,a) }^2
&\leq 3\brac{C_{\chi^2}\frac{r_{\max}}{1-\gamma} 2^{-\frac{N+1}{2}}}^2 + \frac{3}{4}\brac{C_{\chi^2}\frac{r_{\max}}{1-\gamma} 2^{-\frac{N}{2}} }^2+ \frac{3}{4}\brac{C_{\chi^2}\frac{r_{\max}}{1-\gamma} 2^{-\frac{N}{2}} }^2
\\&  = 6\frac{C^2_{\chi^2} r^2_{\max} }{(1-\gamma)^2}2^{-(N+1)}.
}
 Since $ 0 \leq \sup_{\alpha\geq 0}\varbrac{f^{\rho_{\chi^2}(\sigma)}(q,\alpha,V)}\leq (1+\sqrt{\sigma})\frac{r_{\max}}{1-\gamma}$ for any distribution $q$, on the complementary event, which has probability at most $3\cdot 2^{-N} $, 
we have that
\eqenv{
 \lbrac{\delta^{\rho_{\chi^2}(\sigma)}_{s,a,N}(Q)(s,a) }^2
 \leq \brac{(1+\sqrt{\sigma}) \frac{r_{\max}}{1-\gamma}}^2.
}

Above all, we can get that 
\eqenv{
\E\Fbrac{\lbrac{\delta^{\rho_{\chi^2}(\sigma)}_{s,a,N}(Q)(s,a) }^2 }\leq 6\frac{C^2_{\chi^2} r^2_{\max} }{(1-\gamma)^2}2^{-(N+1)}+ \brac{(1+\sqrt{\sigma}) \frac{r_{\max}}{1-\gamma}}^2 3\cdot 2^{-N}
= \brac{6 C^2_{\chi^2}+ 6(1+\sqrt{\sigma})^2}\brac{ \frac{r_{\max}}{1-\gamma}}^2 2^{-N-1}. \label{eqeq55}
}
Then, plugging \cref{eqeq55} into \cref{eqeq49}, we obtain the following bound on the variance of the robust Bellman operator:
\eqenv{
\Varr{\widehat{v}^{\rho_{\chi^2}(\sigma)}(Q)(s,a)}&\leq \frac{2r^2_{\max}}{(1-\gamma)^2}+ 2\sum_{N=0}^{N_{\max}} \frac{\E\Fbrac{(\delta^{\rho_{\chi^2}(\sigma)}_{s,a,N}(Q)(s,a))^2} }{P_{N}}
\\&\leq \frac{2r^2_{\max}}{(1-\gamma)^2}+ \frac{2r^2_{\max}}{(1-\gamma)^2}\brac{6 C^2_{\chi^2}+ 6(1+\sqrt{\sigma})^2}\sum_{N=0}^{N_{\max}} \frac{2^{-N-1} }{P_{N}}
\\& \myineq{=}{a} \frac{2r^2_{\max}}{(1-\gamma)^2}\brac{1+\brac{6 C^2_{\chi^2}+ 6(1+\sqrt{\sigma})^2}(N_{\max}+1) }, 
} where $(a)$ follows from the fact that $P_N=\psi(1-\psi)^N=2^{-N-1}$. 

Similarly, we can get the bound of the variance $\text{Var}\brac{\widehat  r^{\rho_{\chi^2}(\sigma)}}$ as follows:
\eqenv{
\text{Var}\brac{\widehat  r^{\rho_{\chi^2}(\sigma)}}\leq 
 2r^2_{\max}\brac{1+\brac{6 C^2_{\chi^2}+ 6(1+\sqrt{\sigma})^2}(N_{\max}+1) }.
}

Hence, we can get the robust Bellman operator variance bound:
\eqenv{\label{eqeq162}
\text{Var}\brac{\widehat {\mathcal{T}}_{N_{\max}}^{\rho_{\chi^2}(\sigma)} (Q)(s,a) }
&= \text{Var}\brac{\widehat  r^{\rho_{\chi^2}(\sigma)}}
+\gamma^2 \Varr{\widehat{v}^{\rho_{\chi^2}(\sigma)}(Q)(s,a)}
\\&\leq \brac{2r^2_{\max}+\gamma^2\frac{2r^2_{\max}}{(1-\gamma)^2} }\brac{1+\brac{6 C^2_{\chi^2}+ 6(1+\sqrt{\sigma})^2}(N_{\max}+1) }.
}
   This completes the proof.  
\end{proof}

\begin{lemma}\label{lm:lminfchi2}
    For any fixed $Q\in \mathbb R^{|\mathcal{S}||\mathcal{A}|}$, the expected squared infinity norm of the estimated robust Bellman operator can be bounded as:
    \begin{align}
        \E\Fbrac{\norminf{\widehat {\mathcal{T}}^{\rho_{\chi^2}(\sigma)}_{N_{\max}} (Q) }^2 }\leq \widetilde{\mathcal{O }}\brac{{N_{\max}}}.
        \end{align}
\end{lemma}

\begin{proof}[Proof of \Cref{lm:lminfchi2}]
We then consider the expectation of infinite norm of robust Bellman operator. Set $\psi=\frac{1}{2}$, and then we make an error decomposition as follows
\eqenv{\label{eqceq149}
\E\Fbrac{\norminf{\widehat {\mathcal{T}}_{N_{\max}}^{\rho_{\chi^2}(\sigma)}(Q)}^2 }\leq 
 4r^2_{\max}+4\gamma^2\frac{r^2_{\max}}{(1-\gamma)^2}+4\E\Fbrac{\sum_{N_1=0}^{N_{\max}} \sup_{s,a}\frac{\brac{\delta^{r,\rho(\sigma)}_{s,a,N_1}
  }^2}{2^{-N_1-1}}+\gamma^2\sum_{N_2=0}^{N_{\max}}\sup_{s,a}\frac{\brac{
\delta^{\rho_{\chi^2}(\sigma)}_{s,a,N_2}(Q)  }^2}{2^{-N_2-1}} }. 
}

% We then consider the variance of the robust Bellman operator. Firstly, we make an error decomposition of the robust Bellman operator variance as 
% \eqenv{
% \text{Var}\brac{\widehat {\mathcal{T}}_{N_{\max}}^{\rho_{\chi^2}(\sigma)} (Q)(s,a) }
% &= \text{Var}\brac{\widehat  r^{\rho_{\chi^2}(\sigma)}+ \gamma \widehat{v}^{\rho_{\chi^2}(\sigma)}(Q)(s,a) }
% \\&= \text{Var}\brac{\widehat  r^{\rho_{\chi^2}(\sigma)}}
% +\gamma^2 \Varr{\widehat{v}^{\rho_{\chi^2}(\sigma)}(Q)(s,a)},
% }
% which is due to the two estimators are independent.

For convenience, we analyze the last term in the above equation. 
% The first term can be bounded similarly. 
% \eqenv{
% \Varr{\widehat{v}^{\rho_{\chi^2}(\sigma)}(Q)(s,a)}&= 
% \E\Fbrac{\brac{\widehat{v}^{\rho_{\chi^2}(\sigma)}(Q)(s,a)}^2  }-\brac{\E\Fbrac{\widehat{v}^{\rho_{\chi^2}(\sigma)}(Q)(s,a) }}^2
% \\&\leq \E\Fbrac{\brac{\widehat{v}^{\rho_{\chi^2}(\sigma)}(Q)(s,a)}^2  }.
% }
% Next, according to the \cref{eq:delta,eq:hatq}, the term above can be explicitly computed: 
% \eqenv{\label{eqeq149}
% \E\Fbrac{\brac{\widehat{v}^{\rho_{\chi^2}(\sigma)}(Q)(s,a)}^2  }& = \E\Fbrac{\brac{V(s'_{s,a,0})+ \frac{\delta^{\rho_{\chi^2}(\sigma)}_{s,a,N_2}(Q)(s,a) }{P_{N_2}} }^2}
% \\& \leq 2 \E\Fbrac{V(s'_{s,a,0})^2 }+ 2\E \Fbrac{ \brac{\frac{\delta^{\rho_{\chi^2}(\sigma)}_{s,a,N_2}(Q)(s,a) }{P_{N_2}} }^2 }
% \\& \leq \frac{2r^2_{\max}}{(1-\gamma)^2}+ 2\sum_{N=0}^{N_{\max}} \E\Fbrac{\brac{\frac{\delta^{\rho_{\chi^2}(\sigma)}_{s,a,N_2}(Q)(s,a) }{P_{N_2}}|N_2=N }^2} P_N
% \\& \leq \frac{2r^2_{\max}}{(1-\gamma)^2}+ 2\sum_{N=0}^{N_{\max}} \frac{\E\Fbrac{(\delta^{\rho_{\chi^2}(\sigma)}_{s,a,N}(Q)(s,a))^2} }{P_{N}}.
% }
Consider the term $\sup_{s,a}\lbrac{\delta^{\rho_{\chi^2}(\sigma)}_{s,a,N}(Q)(s,a) }^2 $, 
% we can get that
% \eqenv{
% \sup_{s,a}\lbrac{\delta^{\rho_{\chi^2}(\sigma)}_{s,a,N}(Q)(s,a) }&=
% \Bigg|\sup_{\alpha\geq 0}\varbrac{f^{\rho_{\chi^2}(\sigma)}(\widehat p_{s,a,2^{N+1}},\alpha,V)} 
%     \\&\quad  -\frac{1}{2} \sup_{\alpha\geq 0}\varbrac{f^{\rho_{\chi^2}(\sigma)}(\widehat p^E_{s,a,2^{N}},\alpha,V)}
%     -\frac{1}{2}\sup_{\alpha\geq 0}\varbrac{f^{\rho_{\chi^2}(\sigma)}(\widehat p^O_{s,a,2^{N}},\alpha,V )}\Bigg|. 
% }
 we make an error decomposition as follows:
\eqenv{
\sup_{s,a}\lbrac{\delta^{\rho_{\chi^2}(\sigma)}_{s,a,N}(Q)(s,a) }^2&=
\sup_{s,a}\Bigg|\sup_{\alpha\geq 0}\varbrac{f^{\rho_{\chi^2}(\sigma)}(\widehat p_{s,a,2^{N+1}},\alpha,V)} 
        \\& \qquad-\frac{1}{2} \sup_{\alpha\geq 0}\varbrac{f^{\rho_{\chi^2}(\sigma)}(\widehat p^E_{s,a,2^{N}},\alpha,V)}
    -\frac{1}{2}\sup_{\alpha\geq 0}\varbrac{f^{\rho_{\chi^2}(\sigma)}(\widehat p^O_{s,a,2^{N}},\alpha,V )}\Bigg|^2
    \\& \leq  3\sup_{s,a}\lbrac{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{\chi^2}(\sigma)}(\widehat p_{s,a,2^{N+1}},\alpha,V)}-{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{\chi^2}(\sigma)}(p_{s,a},\alpha,V)} }  }^2
    \\&\quad+  \frac{3}{4}\sup_{s,a}\lbrac{ \sup_{\alpha\geq 0}\varbrac{f^{\rho_{\chi^2}(\sigma)}(\widehat p^E_{s,a,2^{N}},\alpha,V)}-{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{\chi^2}(\sigma)}(p_{s,a},\alpha,V)} } }^2
    \\&\quad+\frac{3}{4}\sup_{s,a}\lbrac{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{\chi^2}(\sigma)}(\widehat p^O_{s,a,2^{N}},\alpha,V )}-{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{\chi^2}(\sigma)}( p_{s,a},\alpha,V)} }}^2
    .\label{eqceq152}
    % \\& \myineq{}{}
}

Then, combined with the analysis in \cref{eqc280,eqc281} and the fact $\mathbb P(A\cap B\cap C)\geq 1- \mathbb P(\neg A)-\mathbb P(\neg B)-\mathbb P(\neg C)$, we can conclude that for any $N\geq 0$, 
 with probability at least $1-3\cdot 2^{-N} $,
\eqenv{
\sup_{s,a}\lbrac{\delta^{\rho_{\chi^2}(\sigma)}_{s,a,N}(Q)(s,a) }^2
&\leq 3\brac{C_{\chi^2}\frac{r_{\max}}{1-\gamma} 2^{-\frac{N+1}{2}}}^2 + \frac{3}{4}\brac{C_{\chi^2}\frac{r_{\max}}{1-\gamma} 2^{-\frac{N}{2}} }^2+ \frac{3}{4}\brac{C_{\chi^2}\frac{r_{\max}}{1-\gamma} 2^{-\frac{N}{2}} }^2
\\& \leq  6\frac{C^2_{\chi^2} r^2_{\max} }{(1-\gamma)^2}2^{-(N+1)}, 
}
 Since $ 0 \leq \sup_{\alpha\geq 0}\varbrac{f^{\rho_{\chi^2}(\sigma)}(q,\alpha,V)}\leq \frac{r_{\max}}{1-\gamma}$ for any distribution $q$, on the complementary event, which has probability at most $3\cdot 2^{-N} $, 
we have that
\eqenv{
\sup_{s,a} \lbrac{\delta^{\rho_{\chi^2}(\sigma)}_{s,a,N}(Q)(s,a) }^2
 \leq \brac{ \frac{r_{\max}}{1-\gamma}}^2.
}

Above all, we can get that 
\eqenv{
\E\Fbrac{\sup_{s,a}\lbrac{\delta^{\rho_{\chi^2}(\sigma)}_{s,a,N}(Q)(s,a) }^2 }&\leq 6\frac{C^2_{\chi^2} r^2_{\max} }{(1-\gamma)^2}2^{-(N+1)}+ \brac{ \frac{r_{\max}}{1-\gamma}}^2 3\cdot 2^{-N}
\\&\leq \brac{6 C^2_{\chi^2}+ 6}\brac{ \frac{r_{\max}}{1-\gamma}}^2 2^{-N-1}. 
}

Besides, we can get 
\eqenv{\label{eqceq155}
\E\Fbrac{\sup_{s,a}\frac{\lbrac{\delta^{\rho_{\chi^2}(\sigma)}_{s,a,N}(Q)(s,a) }^2}{P_N} }\leq \brac{6 C^2_{\chi^2}+ 6}\brac{ \frac{r_{\max}}{1-\gamma}}^2.
}

Then, plugging \cref{eqceq155} into \cref{eqceq149}, we can bound the expectation of the infinity norm as follows:
\eqenv{
\E\Fbrac{\sum_{N_2=0}^{N_{\max}}\sup_{s,a}\frac{\brac{
\delta^{\rho_{\chi^2}(\sigma)}_{s,a,N_2}(Q)  }^2}{2^{-N_2-1}} }
\leq \sum_{N_2=0}^{N_{\max}}\brac{6 C^2_{\chi^2}+ 6}\brac{ \frac{r_{\max}}{1-\gamma}}^2=\brac{N_{\max}+1}\brac{6 C^2_{\chi^2}+ 6}\brac{ \frac{r_{\max}}{1-\gamma}}^2. 
}


Similarly, we can get the bound as follows:
\eqenv{
\E\Fbrac{\sum_{N_1=0}^{N_{\max}} \sup_{s,a}\frac{\brac{\delta^{r,\rho(\sigma)}_{s,a,N_1}
  }^2}{2^{-N_1-1}}}\leq \sum_{N_1=0}^{N_{\max}}\brac{6 C^2_{\chi^2}+ 6} r_{\max}^2=\brac{N_{\max}+1}\brac{6 C^2_{\chi^2}+ 6}r^2_{\max}. 
}

Hence, combining \Cref{eqceq149} and the above equations, we can get the robust Bellman operator infinite norm bound:
\eqenv{\label{eqceq62}
\E\Fbrac{\norminf{\widehat {\mathcal{T}}_{N_{\max}}^{\rho_{\chi^2}(\sigma)}(Q)}^2 }\leq 
4\brac{1+\brac{N_{\max}+1}\brac{6 C^2_{\chi^2}+ 6}}\brac{r^2_{\max}+\gamma^2\brac{ \frac{r_{\max}}{1-\gamma}}^2}.
}
   This completes the proof.  
\end{proof}



\begin{theorem}[Sample Complexity with $\chi^2$ Distance]
 Set $N_{\max}=\frac{2\log T}{\log 2}$ and the stepsize as $$\beta_t=\beta=\frac{2\log T}{(1-\gamma)T}. $$ Then the output of \cref{alg:example} satisfies that:
 \begin{align}
    \mathbb E \Fbrac{\mynorm{\widehat Q_T^{\rho_{\chi^2}(\sigma)}-Q^{*\rho_{\chi^2}(\sigma)}}_\infty^2}\nonumber
    & \leq \frac{\log T}{T^2}\frac{r^2_{\max}}{(1-\gamma)^2} 
+  \frac{r^2_{\max} \log T }{2(1-\gamma)^2 T}\brac{1+\frac{ \gamma^2}{(1-\gamma)^2} }\brac{1+\brac{6 C^2_{\chi^2}+ 6}(N_{\max}+1) }
\\&\qquad+r^2_{\max}\brac{\brac{1+\frac{\gamma}{1-\gamma}}\brac{1+ 3C_{\chi^2} }}^2\frac{1}{T}\nonumber
   %\\& \leq \frac{c_0}{(1-\gamma)^2}\brac{1-\frac{\alpha(1-\gamma)}{2}}^T+ c_1 \alpha\frac{\log (|\mathcal{S}||\mathcal{A}|)}{ (1-\gamma)^4}\nonumber
    \\& \leq\widetilde{\mathcal{O }}\brac{\frac{1}{  (1-\gamma)^5 T}}.
\end{align}
To ensure
\begin{align}
    \mathbb E \Fbrac{\mynorm{\widehat Q_T^{\rho_{\chi^2}(\sigma)}-Q^{*\rho_{\chi^2}(\sigma)}}_\infty^2}\leq \epsilon^2,\nonumber
\end{align}
the expected total sample complexity $N^{\rho_{\chi^2}(\sigma)}(\epsilon)$ is,
 \begin{align}
     N^{\rho_{\chi^2}(\sigma)}(\epsilon) = |\mathcal{S}||\mathcal{A}|N_{\max} T\geq \widetilde{\mathcal{O }}\brac{\frac{|\mathcal{S}||\mathcal{A}|}{  (1-\gamma)^5 \epsilon^2 }}.\nonumber
 \end{align}
\end{theorem}
\begin{proof}
We consider the stochastic iteration
\eqenv{
\widehat Q^{\rho_{\chi^2}(\sigma)}_{t+1}=\widehat Q^{\rho_{\chi^2}(\sigma)}_{t}+ \beta_t\brac{\boldsymbol{\bar{\mathcal{T}}}^{\rho_{\chi^2}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{\chi^2}(\sigma)}_{t}}- \widehat Q^{\rho_{\chi^2}(\sigma)}_{t}+ W_t  },
}
where $W_t={\widehat{\mathcal{T}}}^{\rho_{\chi^2}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{\chi^2}(\sigma)}_{t}}-\boldsymbol{\bar{\mathcal{T}}}^{\rho_{\chi^2}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{\chi^2}(\sigma)}_{t}} $. 

Define the filtration $\mathcal{F}_t=\varbrac{Q^{\rho_{\chi^2}(\sigma)}_0, W_0,...,Q^{\rho_{\chi^2}(\sigma)}_{t-1},W_{t-1},Q^{\rho_{\chi^2}(\sigma)}_{t} } $. 
Then, by \cref{thm:tv1}, we can get that 
\eqenv{
\E\Fbrac{W_t|\mathcal{F}_t }=0,
}

and  by \cref{lm:lminfchi2}, we can get that 
\eqenv{
\E\Fbrac{\norminf{W_t}^2|\mathcal{F}_t }
&\leq {\E\Fbrac{\sup_{s,a}\lbrac{{\widehat{\mathcal{T}}}^{\rho_{\chi^2}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{\chi^2}(\sigma)}_{t}}(s,a)-\boldsymbol{\bar{\mathcal{T}}}^{\rho_{\chi^2}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{\chi^2}(\sigma)}_{t}}(s,a)}^2|\mathcal{F}_t }}
\\& \leq 2{\E\Fbrac{\sup_{s,a} \lbrac{{\widehat{\mathcal{T}}}^{\rho_{\chi^2}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{\chi^2}(\sigma)}_{t}}(s,a)}^2+\sup_{s,a}\lbrac{\boldsymbol{\bar{\mathcal{T}}}^{\rho_{\chi^2}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{\chi^2}(\sigma)}_{t}}(s,a)}^2|\mathcal{F}_t }}
\\&\myineq{\leq}{a} 4\E\Fbrac{\sup_{s,a}\lbrac{{\widehat{\mathcal{T}}}^{\rho_{\chi^2}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{\chi^2}(\sigma)}_{t}}(s,a)}^2|\mathcal{F}_t }
\\& \myineq{\leq}{b} 16\brac{1+\brac{N_{\max}+1}\brac{6 C^2_{\chi^2}+ 6}}\brac{r^2_{\max}+\gamma^2\brac{ \frac{r_{\max}}{1-\gamma}}^2}, 
% \\& \leq \max_{s,a} \varbrac{\text{Var}\brac{\widehat {\mathcal{T}}^{\rho_{\chi^2}(\sigma)}_{N_{\max}} (Q^{\rho_{\chi^2}(\sigma)}_{t})(s,a)  } }
% \\& \leq \brac{2r^2_{\max}+2\gamma^2  \norminf{Q^{\rho_{\chi^2}(\sigma)}_{t} }^2 }(1+\brac{9 C^2_{\chi^2}+ 3}(N_{\max}+1) ),
% \\& \myineq{\leq}{i} \brac{2r^2_{\max}+\gamma^2\frac{2r^2_{\max}}{(1-\gamma)^2} }(1+\brac{9 C^2_{\chi^2}+ 3}(N_{\max}+1) ),
} where $(a)$ follows from the fact that
\eqenv{
\E\Fbrac{\sup_{s,a}\lbrac{\boldsymbol{\bar{\mathcal{T}}}^{\rho_{\chi^2}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{\chi^2}(\sigma)}_{t}}(s,a)}^2|\mathcal{F}_t }&= 
\E\Fbrac{\sup_{s,a}\lbrac{\E\Fbrac{\widehat{\mathcal{T}}^{\rho_{\chi^2}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{\chi^2}(\sigma)}_{t}}(s,a)}}^2\Big|\mathcal{F}_t }
\\& \leq \E\Fbrac{\sup_{s,a}\E\Fbrac{\lbrac{\widehat{\mathcal{T}}^{\rho_{\chi^2}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{\chi^2}(\sigma)}_{t}}(s,a)}^2}\Big|\mathcal{F}_t }
\\& \leq \E\Fbrac{\sup_{s,a} {\lbrac{\widehat{\mathcal{T}}^{\rho_{\chi^2}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{\chi^2}(\sigma)}_{t}}(s,a)}^2}\Big|\mathcal{F}_t }, 
}
and $(b)$ follows from \Cref{lm:lminfchi2}.

% and
% \eqenv{
% \E\Fbrac{\norminf{W_t}^2|\mathcal{F}_t }
% &\leq \max_{s,a}{\E\Fbrac{\lbrac{{\widehat{\mathcal{T}}}^{\rho_{\chi^2}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{\chi^2}(\sigma)}_{t}}(s,a)-\boldsymbol{\bar{\mathcal{T}}}^{\rho_{\chi^2}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{\chi^2}(\sigma)}_{t}}(s,a)}^2|\mathcal{F}_t }}
% \\& \leq \max_{s,a} \varbrac{\text{Var}\brac{\widehat {\mathcal{T}}_{N_{\max}}^{\rho_{\chi^2}(\sigma)} (Q^{\rho_{\chi^2}(\sigma)}_{t})(s,a)  } }
% \\& \leq \brac{2r^2_{\max}+2\gamma^2  \norminf{Q^{\rho_{\chi^2}(\sigma)}_{t} }^2 }\brac{1+\brac{3 C^2_{\chi^2}+ 6(1+\sqrt{\sigma})^2}(N_{\max}+1) },
% \\& \myineq{\leq}{i} \brac{2r^2_{\max}+\gamma^2\frac{2r^2_{\max}}{(1-\gamma)^2} }\brac{1+\brac{3 C^2_{\chi^2}+ 6(1+\sqrt{\sigma})^2}(N_{\max}+1) },
% } where $(i) $ follows from \cref{eqeq162}. 

According to \cref{eq:fixp}, we have that
$$\widehat Q^{*\rho_{\chi^2}(\sigma)}(s,a) = \hatT (\widehat Q^{*\rho_{\chi^2}(\sigma)})(s,a)=\E\Fbrac{\hatt (\widehat Q^{*\rho_{\chi^2}(\sigma)})(s,a)}.$$
Then, we apply \cref{lm:chen} \citep{chen2020finite} with the constant stepsize $\beta_t=\beta=\frac{2\log T}{(1-\gamma)T}$, where $T$ is large enough such that
$$\beta={\frac{2\log T}{(1-\gamma)T}\leq \frac{(1-\gamma)^2}{128 e  \log(|\mathcal{S}||\mathcal{A}|) }} .$$
We can conclude that
\eqenv{\label{eqceq66}
&\E\Fbrac{\norminf{\widehat Q_{T}^{\rho_{\chi^2}(\sigma)}-\widehat Q^{*\rho_{\chi^2}(\sigma)} }^2}\myineq{\leq}{i} \frac{3}{2} \norminf{\widehat Q_{0}^{\rho_{\chi^2}(\sigma)}-\widehat Q^{*\rho_{\chi^2}(\sigma)} }^2\prod_{j=0}^{T-1} \brac{1-\frac{1-\gamma}{2} \beta_j} \\&\quad +\frac{16 e \log (|\mathcal{S}||\mathcal{A}|)}{1-\gamma}16\brac{1+\brac{N_{\max}+1}\brac{6 C^2_{\chi^2}+ 6}}\brac{r^2_{\max}+\gamma^2\brac{ \frac{r_{\max}}{1-\gamma}}^2}\sum_{i=0}^{T-1} \beta_i^2 \prod_{t=i+1}^{T-1} (1-
\frac{1-\gamma}{2}\beta_t)
\\& \myineq{\leq}{ii} \frac{3}{2}\frac{r^2_{\max}}{(1-\gamma)^2} \frac{1}{T} +\frac{16 e \log (|\mathcal{S}||\mathcal{A}|)}{1-\gamma}16\brac{1+\brac{N_{\max}+1}\brac{6 C^2_{\chi^2}+ 6}}\brac{r^2_{\max}+\gamma^2\brac{ \frac{r_{\max}}{1-\gamma}}^2}\frac{4 \log T }{(1-\gamma)^2 T}
,
}where $(i)$ follows from the \cref{lm:chen}. $(ii)$ follows from $(1-(1-\gamma)\beta/2)^T\leq \frac{1}{T}$.

Set $N_{\max}=\frac{2\log T}{\log 2}$. Then, we make the decomposition and get the bound of $\mathbb E \Fbrac{\mynorm{\widehat Q_T^{\rho_{\chi^2}(\sigma)}-Q^{*\rho_{\chi^2}(\sigma)}}_\infty^2}$ as follows
 \eqenv{
 \mathbb E& \Fbrac{\mynorm{\widehat Q_T^{\rho_{\chi^2}(\sigma)}-Q^{*\rho_{\chi^2}(\sigma)}}_\infty^2}
 \leq 
 2\E\Fbrac{\mynorm{\widehat Q_T^{\rho_{\chi^2}(\sigma)}-\widehat Q^{*\rho_{\chi^2}(\sigma)}}_\infty^2 }+ 2\E\Fbrac{\mynorm{\widehat Q^{*\rho_{\chi^2}(\sigma)}-Q^{*\rho_{\chi^2}(\sigma)}}_\infty^2 }
 \\& \myineq{\leq}{i}
  \frac{3r^2_{\max}}{(1-\gamma)^2 T} +\frac{32 e \log (|\mathcal{S}||\mathcal{A}|)}{1-\gamma}16\brac{1+\brac{N_{\max}+1}\brac{6 C^2_{\chi^2}+ 6}}\brac{r^2_{\max}+\gamma^2\brac{ \frac{r_{\max}}{1-\gamma}}^2}\frac{4 \log T }{(1-\gamma)^2 T}
\\& \qquad+\frac{2}{1-\gamma}\brac{\brac{r_{\max}+\frac{r_{\max}}{1-\gamma}}2^{-\frac{N_{\max}+1}{2}}\brac{2^{-\frac{N_{\max}+1}{2}}+ 3C_{\chi^2} }}^2
\\& \myineq{\leq}{ii}  \frac{3r^2_{\max}}{(1-\gamma)^2 T} +\frac{32 e \log (|\mathcal{S}||\mathcal{A}|)}{1-\gamma}16\brac{1+\brac{N_{\max}+1}\brac{6 C^2_{\chi^2}+ 6}}\brac{r^2_{\max}+\gamma^2\brac{ \frac{r_{\max}}{1-\gamma}}^2}\frac{4 \log T }{(1-\gamma)^2 T}
\\& \qquad+2\frac{2}{1-\gamma}\brac{\brac{r_{\max}+\frac{r_{\max}}{1-\gamma}}\brac{1+ 3C_{\chi^2} }}^2\frac{1}{T}
\\&= \widetilde{\mathcal{O }}\brac{\frac{1}{(1-\gamma)^5T}},
 }where $(i)$ follows from \cref{lm:a8,thm:tv1}. 
 $(ii)$ follows from $2^{-\frac{N_{\max}+1}{2}}\leq 2^{-\frac{\log T}{\log 2}}= \frac{1}{T} $. 

When $\mathbb E \Fbrac{\mynorm{\widehat Q_T^{\rho_{\chi^2}(\sigma)}-Q^{*\rho_{\chi^2}(\sigma)}}_\infty^2}\leq \epsilon^2 $, the number of iterations satisfies $T\geq \widetilde{\mathcal{O }}\brac{(1-\gamma)^{-5}\epsilon^{-2}}$. When $\psi=\frac{1}{2}$, the expected sample size per iteration is $$1+\sum_{n=0}^{N_{\max}} 2^{n+1}P_n=1+ \sum_{n=0}^{N_{\max}} 2^{n+1} \frac{1}{2^{n+1}} =N_{\max}+2=\frac{2\log T}{\log 2}+2.$$ Above all, the total sample complexity is $\widetilde{\mathcal{O }}\brac{|\mathcal{S}||\mathcal{A}|(1-\gamma)^{-5}\epsilon^{-2} }$. 

This completes the proof.     
\end{proof}

\section{KL Divergence Uncertainty Set}
In this section, we provide the proof of \cref{thm:tv1,thm:kl} specifically for KL distance. 
\begin{theorem}[Restatement of \cref{thm:tv1} specifically for KL distance]
    Consider the case of KL constraint uncertainty set with uncertainty level $\sigma$ i.e. $ \mathcal{P}^{KL}(\sigma)$ and $ \mathcal{R}^{KL}(\sigma)$, set $\psi=\frac{1}{2}$, for any $Q\in \mathbb R^{\mathcal{S}\times\mathcal{A}}, s\in \mathcal{S}, a\in \mathcal{A} $, the estimation bias can be bounded as:
    \begin{align}
        \sup_{s,a}\lbrac{\mathbb E\Fbrac{\widehat {\mathcal{T}}_{N_{\max}}^{\rho_{KL}(\sigma)} (Q)(s,a) } - {\mathcal{T}}^{\rho_{KL}(\sigma)} (Q)(s,a)}\leq \widetilde{\mathcal{O }}\brac{2^{-\frac{N_{\max}}{2}} },\nonumber
        % \brac{r_{\max}+\frac{r_{\max}}{1-\gamma}}
    \end{align}
    and the variance can be bounded as:
    \begin{align}
        \text{Var}\brac{\widehat {\mathcal{T}}_{N_{\max}}^{\rho_{KL}(\sigma)} (Q)(s,a)  }\leq \widetilde{\mathcal{O }}\brac{N_{\max}}.
    \end{align}
\end{theorem}
\begin{proof}
    Firstly, we make error decomposition as follows:
    \eqenv{
    &\sup_{s,a}\lbrac{ \E\Fbrac{\widehat {\mathcal{T}}_{N_{\max}}^{\rho_{KL}(\sigma)} (Q)(s,a) } - {\mathcal{T}}^{\rho_{KL}(\sigma)} (Q)(s,a)}
    \\&\myineq{=}{i} \sup_{s,a}\bigg|\E\Fbrac{g^{*\rho_{KL}(\sigma)}(\hat \mu_{s,a,2^{N_{\max}+1}},r_{s,a})}+\gamma\E\Fbrac{f^{*\rho_{KL}(\sigma)}(\hat p_{s,a,2^{N_{\max}+1}},V)}
\\& \qquad   - {g^{*\rho_{KL}(\sigma)}(\mu_{s,a},r_{s,a})}
- \gamma  {f^{*\rho_{KL}(\sigma)}(p_{s,a},V )} \bigg|
\\& \leq \sup_{s,a}\lbrac{\E\Fbrac{g^{*\rho_{KL}(\sigma)}(\hat \mu_{s,a,2^{N_{\max}+1}},r_{s,a})}- {g^{*\rho_{KL}(\sigma)}(\mu_{s,a},r_{s,a})} }\\& \qquad +\gamma\sup_{s,a}\lbrac{\E\Fbrac{f^{*\rho_{KL}(\sigma)}(\hat p_{s,a,2^{N_{\max}+1}},V)}-{f^{*\rho_{KL}(\sigma)}(p_{s,a},V )} }
\\& \leq \E\Fbrac{\sup_{s,a}\lbrac{ {g^{*\rho_{KL}(\sigma)}(\hat \mu_{s,a,2^{N_{\max}+1}},r_{s,a})}- {g^{*\rho_{KL}(\sigma)}(\mu_{s,a},r_{s,a})} } }\\& \qquad +\gamma \E\Fbrac{ \sup_{s,a}\lbrac{{f^{*\rho_{KL}(\sigma)}(\hat p_{s,a,2^{N_{\max}+1}},V )}-{f^{*\rho_{KL}(\sigma)}(p_{s,a},V )} }}, \label{eq:eq70}
    }where $(i)$ follows from \cref{prop:mlmc}.
    
Then, for convenience, we bound the second term in \cref{eq:eq70}. The first term can be bounded similarly.  By \cref{lm:kl},
\eqenv{\label{eqeq71}
    &\lbrac{f^{*\rho_{KL}(\sigma)}\brac{ p_{s,a}, V }-{{f^*}^{\rho_{KL}(\sigma)}\brac{\hat p_{s,a,2^{N_{\max}+1}},   V }} }
    \\&= \Bigg|\max_{\alpha\geq0} \varbrac{-\alpha \log\brac{\mathbb E_{p_{s,a}} \Fbrac{exp\brac{-\frac{V(s'_{s,a})}{\alpha}} } }-\alpha \sigma }
    \\&\qquad-\max_{\alpha\geq0} \varbrac{-\alpha \log\brac{\mathbb E_{\hat p_{s,a,2^{N_{\max}+1}}} \Fbrac{exp\brac{-\frac{V(s'_{s,a})}{\alpha}} } }-\alpha \sigma }\Bigg |
    \\& \myineq{\leq}{i} \max_{0\leq\alpha\leq \frac{r_{\max}}{(1-\gamma)\sigma }} \Bigg|-\alpha \log\brac{\mathbb E_{p_{s,a}} \Fbrac{exp\brac{-\frac{V(s'_{s,a})}{\alpha}} } }\\& \qquad+\alpha \log\brac{\mathbb E_{\hat p_{s,a,2^{N_{\max}+1}}} \Fbrac{exp\brac{-\frac{V(s'_{s,a})}{\alpha}} } }  \Bigg|
    \\& {\leq} 
    \max_{0\leq\alpha\leq \frac{r_{\max}}{(1-\gamma)\sigma }}
    \lbrac{\alpha \log\brac{\frac{\mathbb E_{\hat p_{s,a,2^{N_{\max}+1}}} \Fbrac{exp\brac{-\frac{V(s'_{s,a})}{\alpha}} } }{\mathbb E_{p_{s,a}} \Fbrac{exp\brac{-\frac{V(s'_{s,a})}{\alpha}} }}  }  }
    \\& {\leq}
      \max_{0\leq\alpha\leq \frac{r_{\max}}{(1-\gamma)\sigma }} \lbrac{\alpha\log\brac{\frac{\mathbb E_{\hat p_{s,a,2^{N_{\max}+1}}} \Fbrac{exp\brac{-\frac{V(s'_{s,a})}{\alpha}} } -\mathbb E_{p_{s,a}} \Fbrac{exp\brac{-\frac{V(s'_{s,a})}{\alpha}} }}{\mathbb E_{p_{s,a}} \Fbrac{exp\brac{-\frac{V(s'_{s,a})}{\alpha}} }}  +1 }   },
}
where $(i)$ follows from $\boldsymbol{\alpha}^{*\rho_{KL}(\sigma)}(p, V)\leq \frac{\max_{s:p(s)\neq 0}V(s)}{\sigma}\leq \frac{\max_{s,a} Q(s,a)}{\sigma}\leq \frac{r_{\max}}{(1-\gamma)\sigma }$ by \citep{hu2013kullback}. 
% We recall that $\boldsymbol{\alpha^*}^{\rho_{KL}(\sigma)}(p, V)$ is defined in \cref{def:a.1}. 

Noting that $ \hat p_{s,a,2^{N_{\max}+1}}(s'_{s,a})$ is absolutely continuous with respect to $p_{s,a}(s'_{s,a}) $, by Hoeffding's inequality we have 
\eqenv{
\prob\brac{\max_{s'_{s,a}}\lbrac{\frac{\hat p_{s,a,2^{N_{\max}+1}}(s'_{s,a})-p_{s,a}(s'_{s,a})  }{p_{s,a}(s'_{s,a})} }\geq \sqrt{\frac{1}{2^{N_{\max}+1}p^2_\wedge} \log \frac{2|\mathcal{S}| }{\tau} } }\leq \tau.
}

Set $\tau=\frac{2^{-(N_{\max}+1)}}{\ssa}$. With probability at least $1-\frac{2^{-(N_{\max}+1)}}{\ssa} $, we have that
\eqenv{
&\lbrac{\frac{\mathbb E_{\hat p_{s,a,2^{N_{\max}+1}}} \Fbrac{exp\brac{-\frac{V(s'_{s,a})}{\alpha}} } -\mathbb E_{p_{s,a}} \Fbrac{exp\brac{-\frac{V(s'_{s,a})}{\alpha}} }}{\mathbb E_{p_{s,a}} \Fbrac{exp\brac{-\frac{V(s'_{s,a})}{\alpha}} }}  }
 \\&\qquad\qquad\qquad\qquad\myineq{\leq}{i}
\max_{s'_{s,a}}\lbrac{\frac{\hat p_{s,a,2^{N_{\max}+1}}(s'_{s,a})-p_{s,a}(s'_{s,a})  }{p_{s,a}(s'_{s,a})} }
\\&\qquad\qquad\qquad\qquad \leq \sqrt{\frac{N_{\max}}{2^{N_{\max}+1}p^2_\wedge} \log \brac{2|\mathcal{S}|^2|\mathcal{A}| }},
} where $(i)$ follows from the fact that
\eqenv{
\lbrac{\sum p_i x_i} = \lbrac{\sum \frac{p_i}{q_i} q_i x_i }\leq  \lbrac{\sum q_i x_i}\max_i\lbrac{\frac{p_i}{q_i}}.
}


Note that if we set $\frac{N_{\max}}{p^2_\wedge} \log \brac{2|\mathcal{S}|^2|\mathcal{A}| }\leq \frac{1}{4}2^{N_{\max}+1} $, it holds that $\sqrt{\frac{N_{\max}}{2^{N_{\max}+1}p^2_\wedge} \log \brac{2|\mathcal{S}|^2|\mathcal{A}| }}\leq \frac{1}{2} $. 
Then, combined with \cref{eqeq71}, we can conclude that
\eqenv{\label{eqeq74}
&\lbrac{{f^*}^{\rho_{KL}(\sigma)}\brac{ p_{s,a}, V(s'_{s,a})}-{{f^*}^{\rho_{KL}(\sigma)}\brac{\hat p_{s,a,2^{N_{\max}+1}},   V^*(s'_{s,a})}}}
\\& {\leq}
    \frac{r_{\max}}{(1-\gamma)\sigma}  \max_{0\leq\alpha\leq \frac{r_{\max}}{(1-\gamma)\sigma }} \lbrac{\log\brac{\frac{\mathbb E_{\hat p_{s,a,2^{N_{\max}+1}}} \Fbrac{\exp\brac{-\frac{V(s'_{s,a})}{\alpha}} } -\mathbb E_{p_{s,a}} \Fbrac{\exp\brac{-\frac{V(s'_{s,a})}{\alpha}} }}{\mathbb E_{p_{s,a}} \Fbrac{\exp\brac{-\frac{V(s'_{s,a})}{\alpha}} }}  +1 }   }
\\& \myineq{\leq}{i}
    \frac{r_{\max}}{(1-\gamma)\sigma} 
    \max_{0\leq\alpha\leq \frac{r_{\max}}{(1-\gamma)\sigma }}
    2\lbrac{\frac{\mathbb E_{\hat p_{s,a,2^{N_{\max}+1}}} \Fbrac{\exp\brac{-\frac{V(s'_{s,a})}{\alpha}} } -\mathbb E_{p_{s,a}} \Fbrac{\exp\brac{-\frac{V(s'_{s,a})}{\alpha}} }}{\mathbb E_{p_{s,a}} \Fbrac{\exp\brac{-\frac{V(s'_{s,a})}{\alpha}} }}  }
    \\& \leq \frac{2r_{\max}}{(1-\gamma)\sigma} 
    \max_{s'_{s,a}}\lbrac{\frac{\hat p_{s,a,2^{N_{\max}+1}}(s'_{s,a})-p_{s,a}(s'_{s,a})  }{p_{s,a}(s'_{s,a})} }\leq \frac{2r_{\max}}{(1-\gamma)\sigma}\sqrt{\frac{N_{\max}}{2^{N_{\max}+1}p^2_\wedge} \log \brac{2|\mathcal{S}|^2 |\mathcal{A}| }},
} where $(i)$ follows from the fact that $|\log(x+1)|\leq 2|x| $ for $|x|\leq \frac{1}{2}$.
Then, according to Bernoulli's inequality, we have that 
\eqenv{\label{eqkl122}
\brac{1-\frac{2^{-N_{\max}-1}}{\ssa} }^\ssa\geq 1- 2^{-N_{\max}-1}. 
}

Therefore, with probability at least $1-2^{-(N_{\max}+1)} $, we have
\eqenv{\label{eqkl123}
\sup_{s,a} \lbrac{ {f^{*\rho_{KL}(\sigma)}(\hat p_{s,a,2^{N_{\max}+1}},V )}-{f^{*\rho_{KL}(\sigma)}(p_{s,a},V)} }\leq \frac{2r_{\max}}{(1-\gamma)\sigma}\sqrt{\frac{N_{\max}}{2^{N_{\max}+1}p^2_\wedge} \log \brac{2|\mathcal{S}|^2 |\mathcal{A}| }}.
}



Otherwise, with probability at most $2^{-(N_{\max}+1)}$, we can conclude 
that
\eqenv{\label{eqeq75}
\sup_{s,a}\lbrac{{f^*}^{\rho_{KL}(\sigma)}\brac{ p_{s,a}, V }-{{f^*}^{\rho_{KL}(\sigma)}\brac{\hat p_{s,a,2^{N_{\max}+1}},   V^* }}}
    \leq  \max_{s'_{s,a}}V(s'_{s,a})\leq \frac{r_{\max}}{1-\gamma}.
}

Then, taking the expectation, we get
\eqenv{
   \E&\Fbrac{\sup_{s,a}\lbrac{{f^*}^{\rho_{KL}(\sigma)}\brac{ p_{s,a}, V }-{{f^*}^{\rho_{KL}(\sigma)}\brac{\hat p_{s,a,2^{N_{\max}+1}},   V^* }}}}
   % \\&= \E\Fbrac{{f^*}^{\rho_{KL}(\sigma)}\brac{ p_{s,a}, V }-{{f^*}^{\rho_{KL}(\sigma)}\brac{\hat p_{s,a,2^{N_{\max}+1}},   V^* }} }
   \\& \leq \frac{2r_{\max}}{(1-\gamma)\sigma}\sqrt{\frac{N_{\max}}{2^{N_{\max}+1}p^2_\wedge} \log \brac{2|\mathcal{S}|^2|\mathcal{A}| }}+ 2^{-(N_{\max}+1)}\frac{r_{\max}}{1-\gamma}
   \\&\leq \frac{2r_{\max}}{(1-\gamma)\sigma}{\sqrt{N_{\max}\log \brac{2|\mathcal{S}|^2|\mathcal{A}| } }  }\frac{ 1}{p_\wedge2^\frac{N_{\max}+1}{2}}+2^{-(N_{\max}+1)}\frac{r_{\max}}{1-\gamma} , 
}where we set $C_{KL}=2\sqrt{N_{\max}\log \brac{2|\mathcal{S}|^2|\mathcal{A}| } }  $, then
\eqenv{
\E&\Fbrac{\sup_{s,a}\lbrac{{f^*}^{\rho_{KL}(\sigma)}\brac{ p_{s,a}, V }-{f}^{*\rho_{KL}(\sigma)}\brac{\hat p_{s,a,2^{N_{\max}+1}},   V^* }}}
   \\&\qquad\leq \frac{r_{\max}}{(1-\gamma)\sigma} C_{KL}\frac{ 1}{p_\wedge2^\frac{N_{\max}+1}{2}}+\frac{r_{\max}}{1-\gamma}2^{-(N_{\max}+1)}.
}

Similarly, we can get the bound
    \eqenv{
    \E&\Fbrac{\sup_{s,a}\lbrac{{g^{*\rho_{KL}(\sigma)}(\hat \mu_{s,a,2^{N_{\max}+1}},r_{s,a})}- {g^{*\rho_{KL}(\sigma)}(\mu_{s,a},r_{s,a})} }}
    \\ \qquad\qquad&\leq \frac{r_{\max}C_{KL}}{\sigma p_\wedge} { 2^{-\frac{N_{\max}+1}{2}}}+{r_{\max}}2^{-(N_{\max}+1)}.
    }

Thus, we can get that
    \eqenv{
   \sup_{s,a}\big| \E\Fbrac{\widehat {\mathcal{T}}_{N_{\max}}^{\rho_{KL}(\sigma)} (Q)(s,a) }& - {\mathcal{T}}^{\rho_{KL}(\sigma)} (Q)(s,a)\big|
\\& \leq \E\Fbrac{\sup_{s,a}\lbrac{g^{*\rho_{KL}(\sigma)}(\hat \mu_{s,a,2^{N_{\max}+1}},r_{s,a})- {g^{*\rho_{KL}(\sigma)}(\mu_{s,a},r_{s,a})} }}
\\&\qquad+\gamma\E\Fbrac{\sup_{s,a}\lbrac{f^{*\rho_{KL}(\sigma)}(\hat p_{s,a,2^{N_{\max}+1}},V )-{f^{*\rho_{KL}(\sigma)}(p_{s,a},V )} }}
\\& \leq \brac{\frac{\gamma  r_{\max}}{1-\gamma}+r_{\max} }2^{-\frac{N_{\max}+1}{2}}\brac{2^{-\frac{N_{\max}+1}{2}}+ \frac{C_{KL}}{\sigma p_\wedge} }.
    }

% \end{proof}

\textbf{Variance: }Next, we consider the variance of the robust Bellman operator. First, since the reward estimator and the value estimator are independent, the variance decomposes as follows:
\eqenv{
\text{Var}\brac{\widehat {\mathcal{T}}_{N_{\max}}^{\rho_{KL}(\sigma)} (Q)(s,a) }
&= \text{Var}\brac{\widehat  r^{\rho_{KL}(\sigma)}+ \gamma \widehat{v}^{\rho_{KL}(\sigma)}(Q)(s,a) }
\\&= \text{Var}\brac{\widehat  r^{\rho_{KL}(\sigma)}}
+\gamma^2 \Varr{\widehat{v}^{\rho_{KL}(\sigma)}(Q)(s,a)}.
}
For convenience, we analyze the second term in the above equation. The first term can be bounded similarly. 
\eqenv{
\Varr{\widehat{v}^{\rho_{KL}(\sigma)}(Q)(s,a)}&= 
\E\Fbrac{\brac{\widehat{v}^{\rho_{KL}(\sigma)}(Q)(s,a)}^2  }-\brac{\E\Fbrac{\widehat{v}^{\rho_{KL}(\sigma)}(Q)(s,a) }}^2
\\&\leq \E\Fbrac{\brac{\widehat{v}^{\rho_{KL}(\sigma)}(Q)(s,a)}^2  }.
}
Next, according to \cref{eq:delta,eq:hatq}, we take the expectation over $N_2$ and expand the second moment as follows:
\eqenv{\label{eqeq80}
\E\Fbrac{\brac{\widehat{v}^{\rho_{KL}(\sigma)}(Q)(s,a)}^2  }& = \E\Fbrac{\brac{V(s'_{s,a,0})+ \frac{\delta^{\rho_{KL}(\sigma)}_{s,a,N_2}(Q)(s,a) }{P_{N_2}} }^2}
\\& \leq 2 \E\Fbrac{V(s'_{s,a,0})^2 }+ 2\E \Fbrac{ \brac{\frac{\delta^{\rho_{KL}(\sigma)}_{s,a,N_2}(Q)(s,a) }{P_{N_2}} }^2 }
\\& \leq \frac{2r^2_{\max}}{(1-\gamma)^2\sigma^2}+ 2\sum_{N=0}^{N_{\max}} \E\Fbrac{\brac{\frac{\delta^{\rho_{KL}(\sigma)}_{s,a,N_2}(Q)(s,a) }{P_{N_2}}|N_2=N }^2} P_N
\\& \leq \frac{2r^2_{\max}}{(1-\gamma)^2\sigma^2}+ 2\sum_{N=0}^{N_{\max}} \frac{\E\Fbrac{(\delta^{\rho_{KL}(\sigma)}_{s,a,N}(Q)(s,a))^2} }{P_{N}}.
}
Next, we bound the term $\lbrac{\delta^{\rho_{KL}(\sigma)}_{s,a,N}(Q)(s,a) } $,
\eqenv{
\lbrac{\delta^{\rho_{KL}(\sigma)}_{s,a,N}(Q)(s,a) }&=
\Bigg|\sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}(\widehat p_{s,a,2^{N+1}},\alpha,V)} 
     \\&-\frac{1}{2} \sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}(\widehat p^E_{s,a,2^{N}},\alpha,V)}
    -\frac{1}{2}\sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}(\widehat p^O_{s,a,2^{N}},\alpha,V )}\Bigg|. 
}

Then, we make an error decomposition as follows:
\eqenv{
\lbrac{\delta^{\rho_{KL}(\sigma)}_{s,a,N}(Q)(s,a) }^2&=
\Bigg|\sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}(\widehat p_{s,a,2^{N+1}},\alpha,V)} 
     \\&\quad-\frac{1}{2} \sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}(\widehat p^E_{s,a,2^{N}},\alpha,V)}
    -\frac{1}{2}\sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}(\widehat p^O_{s,a,2^{N}},\alpha,V )}\Bigg|^2
    \\& \leq 3\lbrac{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}(\widehat p_{s,a,2^{N+1}},\alpha,V)}-{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}( p_{s,a},\alpha,V)} }  }^2
    \\&\quad+  \frac{3}{4}\lbrac{ \sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}(\widehat p^E_{s,a,2^{N}},\alpha,V)}-{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}( p_{s,a},\alpha,V)} } }^2
    \\&\quad+\frac{3}{4}\lbrac{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}(\widehat p^O_{s,a,2^{N}},\alpha,V )}-{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}( p_{s,a},\alpha,V)} }}^2.\label{eqeq82}
    % \\& \myineq{}{}
}

\textbf{Case 1: }Combined with the analysis in \cref{eqeq74,eqeq75}, we can conclude that when $N\leq \frac{\log(1+p^2_\wedge\log(2|\mathcal{S}|^2|\mathcal{A}|)\log T )}{\log 2}$, we bound the term $\frac{\E\Fbrac{\lbrac{\delta^{\rho_{KL}(\sigma)}_{s,a,N}(Q)(s,a) }^2 }}{P_N}$ as follows,
\eqenv{
    \lbrac{\delta^{\rho_{KL}(\sigma)}_{s,a,N}(Q)(s,a) }^2\leq \brac{ \frac{r_{\max}}{1-\gamma}}^2,\qquad {\frac{1}{P_N}}= 2^N \leq 1+p^{-2}_\wedge\log(2|\mathcal{S}|^2|\mathcal{A}|)\log T.
}
Hence, we have that
\eqenv{
\frac{\E\Fbrac{\lbrac{\delta^{\rho_{KL}(\sigma)}_{s,a,N}(Q)(s,a) }^2 }}{P_N} \leq  \brac{\frac{r_{\max}}{1-\gamma}}^2\brac{1+p^{-2}_\wedge\log(2|\mathcal{S}|^2|\mathcal{A}|)\log T}.
}





\textbf{Case 2: }When $N>\frac{\log(1+p^2_\wedge\log(2|\mathcal{S}|^2|\mathcal{A}|)\log T )}{\log 2}$, consider the fact $\mathbb P(A\cap B\cap C)\geq 1- \mathbb P(\neg A)-\mathbb P(\neg B)-\mathbb P(\neg C)$, by \cref{eqeq74},
 with probability at least $1-3\cdot 2^{-N} $
% Plug \cref{eqeq52} in \cref{}
\eqenv{
\lbrac{\delta^{\rho_{KL}(\sigma)}_{s,a,N}(Q)(s,a) }^2
&\leq 3\brac{C_{KL}\frac{r_{\max}}{p_\wedge(1-\gamma)\sigma} 2^{-\frac{N+1}{2}}}^2
\\& \quad+ \frac{3}{4}\brac{C_{KL}\frac{r_{\max}}{p_\wedge(1-\gamma)\sigma}  2^{-\frac{N}{2}} }^2+ \frac{3}{4}\brac{C_{KL}\frac{r_{\max}}{p_\wedge(1-\gamma)\sigma}  2^{-\frac{N}{2}} }^2
\\& = 3\frac{C^2_{KL} r^2_{\max} }{p_\wedge^2(1-\gamma)^2\sigma^2}2^{-(N+1)}, 
}
 Since $ 0 \leq \sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}(q,\alpha,V)}\leq \frac{r_{\max}}{1-\gamma}$ for any distribution $q$, with probability at most $3\cdot 2^{-N} $ 
we have that
\eqenv{
 \lbrac{\delta^{\rho_{KL}(\sigma)}_{s,a,N}(Q)(s,a) }^2
 \leq \brac{ \frac{r_{\max}}{1-\gamma}}^2.
}

Combining the two events above, we get that 
\eqenv{
\E\Fbrac{\lbrac{\delta^{\rho_{KL}(\sigma)}_{s,a,N}(Q)(s,a) }^2 }\leq 3\frac{C^2_{KL} r^2_{\max} }{p_\wedge^2(1-\gamma)^2\sigma^2}2^{-(N+1)}+ 3\cdot 2^{-N}\brac{ \frac{r_{\max}}{1-\gamma}}^2.
% \leq \brac{9 C^2_{\chi^2}+ 3}\brac{ \frac{r_{\max}}{1-\gamma}}^2 2^{-N}. 
\label{eqeq86}
}

Combining \textbf{Case 1} and \textbf{Case 2}, and noting that $P_N=2^{-N-1}$ when $\psi=\frac{1}{2}$, we have that 
\eqenv{
\E&\Fbrac{\frac{\lbrac{\delta^{\rho_{KL}(\sigma)}_{s,a,N}(Q)(s,a) }^2 }{P_N}} \\&\leq  \brac{\frac{r_{\max}}{1-\gamma}}^2\brac{1+p^{-2}_\wedge\log(2|\mathcal{S}|^2|\mathcal{A}|)\log T}+\frac{3}{2}\frac{C^2_{KL} r^2_{\max} }{p_\wedge^2(1-\gamma)^2\sigma^2}+ 3\brac{ \frac{r_{\max}}{1-\gamma}}^2.\label{eq:110}
}

Then, by \cref{eq:110}, we obtain the following bound on the variance of the robust Bellman operator:
\eqenv{\label{eqeq83}
&\Varr{\widehat{v}^{\rho_{KL}(\sigma)}(Q)(s,a)}\\&\leq \frac{2r^2_{\max}}{(1-\gamma)^2\sigma^2}+ 2\sum_{N=0}^{N_{\max}} \frac{\E\Fbrac{(\delta^{\rho_{KL}(\sigma)}_{s,a,N}(Q)(s,a))^2} }{P_{N}}
\\&\leq \frac{2r^2_{\max}}{(1-\gamma)^2\sigma^2}+4\sum_{N=0}^{N_{\max}}  \brac{ \frac{r_{\max}}{1-\gamma}}^2\brac{4+p^{-2}_\wedge\log(2|\mathcal{S}|^2|\mathcal{A}|)\log T+
 \frac{3}{2}\frac{C^2_{KL}  }{p_\wedge^{2}\sigma^2}}
\\& \leq\brac{2 +4(N_{\max}+1)\brac{4+{\log(2|\mathcal{S}|^2|\mathcal{A}|\log T)}{(1-\gamma)} + \frac{3C^2_{KL}}{2} } }\frac{r^2_{\max}}{p_\wedge^2 (1-\gamma)^2 \sigma^2},
}

Set $C_{\text{var}}= 2 +4(N_{\max}+1)\brac{4+{\log(2|\mathcal{S}|^2|\mathcal{A}|\log T)}{(1-\gamma)} + \frac{3C^2_{KL}}{2} }  $, then 
$$\Varr{\widehat{v}^{\rho_{KL}(\sigma)}(Q)(s,a)}\leq C_{\text{var}}\frac{r^2_{\max}}{p_\wedge^2 (1-\gamma)^2 \sigma^2}. $$

Similarly, we can bound the variance $\text{Var}\brac{\widehat  r^{\rho_{KL}(\sigma)}}$ as follows:
\eqenv{
\text{Var}\brac{\widehat  r^{\rho_{KL}(\sigma)}}\leq 
C_{\text{var}}\frac{r^2_{\max}}{p_\wedge^2  \sigma^2}.
}

Hence, we can get the robust Bellman operator variance bound:
\eqenv{\label{eqeq88}
\text{Var}\brac{\widehat {\mathcal{T}}_{N_{\max}}^{\rho_{KL}(\sigma)} (Q)(s,a) }
&= \text{Var}\brac{\widehat  r^{\rho_{KL}(\sigma)}}
+\gamma^2 \Varr{\widehat{v}^{\rho_{KL}(\sigma)}(Q)(s,a)}
\nonumber\\&\leq \frac{ C_{\text{var}}}{p_\wedge^2  \sigma^2}\brac{r^2_{\max}+ \frac{\gamma^2 r^2_{\max}}{(1-\gamma)^2}}.
}
   This completes the proof.  
\end{proof}

\begin{lemma}\label{lm:lminfkl}
    For any fixed $Q\in \mathbb R^{|\mathcal{S}||\mathcal{A}|}, s\in \mathcal{S}, a\in \mathcal{A} $, the infinity norm of the robust Bellman operator can be bounded as:
    \begin{align}
        \E\Fbrac{\norminf{\widehat {\mathcal{T}}^{\rho_{KL}(\sigma)}_{N_{\max}} (Q)(s,a) }^2 }\leq \widetilde{\mathcal{O }}\brac{{N_{\max}}}.
        \end{align}
\end{lemma}

\begin{proof}[Proof of \Cref{lm:lminfkl}]
We consider the expected squared infinity norm of the robust Bellman operator. Set $\psi=\frac{1}{2}$; then we make an error decomposition as follows
\eqenv{\label{eqkl149}
\E\Fbrac{\norminf{\widehat {\mathcal{T}}_{N_{\max}}^{\rho_{KL}(\sigma)}(Q)}^2 }\leq 
 4r^2_{\max}+4\gamma^2\frac{r^2_{\max}}{(1-\gamma)^2}+4\E\Fbrac{\sum_{N_1=0}^{N_{\max}} \sup_{s,a}\frac{\brac{\delta^{r,\rho(\sigma)}_{s,a,N_1}
  }^2}{2^{-N_1-1}}+\gamma^2\sum_{N_2=0}^{N_{\max}}\sup_{s,a}\frac{\brac{
\delta^{\rho_{KL}(\sigma)}_{s,a,N_2}(Q)  }^2}{2^{-N_2-1}} }. 
}

% We then consider the variance of the robust Bellman operator. Firstly, we make an error decomposition of the robust Bellman operator variance as 
% \eqenv{
% \text{Var}\brac{\widehat {\mathcal{T}}_{N_{\max}}^{\rho_{KL}(\sigma)} (Q)(s,a) }
% &= \text{Var}\brac{\widehat  r^{\rho_{KL}(\sigma)}+ \gamma \widehat{v}^{\rho_{KL}(\sigma)}(Q)(s,a) }
% \\&= \text{Var}\brac{\widehat  r^{\rho_{KL}(\sigma)}}
% +\gamma^2 \Varr{\widehat{v}^{\rho_{KL}(\sigma)}(Q)(s,a)},
% }
% which is due to the two estimators are independent.

For convenience, we analyze the last term in the above equation. 
% The first term can be bounded similarly. 
% \eqenv{
% \Varr{\widehat{v}^{\rho_{KL}(\sigma)}(Q)(s,a)}&= 
% \E\Fbrac{\brac{\widehat{v}^{\rho_{KL}(\sigma)}(Q)(s,a)}^2  }-\brac{\E\Fbrac{\widehat{v}^{\rho_{KL}(\sigma)}(Q)(s,a) }}^2
% \\&\leq \E\Fbrac{\brac{\widehat{v}^{\rho_{KL}(\sigma)}(Q)(s,a)}^2  }.
% }
% Next, according to the \cref{eq:delta,eq:hatq}, the term above can be explicitly computed: 
% \eqenv{\label{eqeq149}
% \E\Fbrac{\brac{\widehat{v}^{\rho_{KL}(\sigma)}(Q)(s,a)}^2  }& = \E\Fbrac{\brac{V(s'_{s,a,0})+ \frac{\delta^{\rho_{KL}(\sigma)}_{s,a,N_2}(Q)(s,a) }{P_{N_2}} }^2}
% \\& \leq 2 \E\Fbrac{V(s'_{s,a,0})^2 }+ 2\E \Fbrac{ \brac{\frac{\delta^{\rho_{KL}(\sigma)}_{s,a,N_2}(Q)(s,a) }{P_{N_2}} }^2 }
% \\& \leq \frac{2r^2_{\max}}{(1-\gamma)^2}+ 2\sum_{N=0}^{N_{\max}} \E\Fbrac{\brac{\frac{\delta^{\rho_{KL}(\sigma)}_{s,a,N_2}(Q)(s,a) }{P_{N_2}}|N_2=N }^2} P_N
% \\& \leq \frac{2r^2_{\max}}{(1-\gamma)^2}+ 2\sum_{N=0}^{N_{\max}} \frac{\E\Fbrac{(\delta^{\rho_{KL}(\sigma)}_{s,a,N}(Q)(s,a))^2} }{P_{N}}.
% }
For the term $\sup_{s,a}\lbrac{\delta^{\rho_{KL}(\sigma)}_{s,a,N}(Q)(s,a) }^2 $, 
% we can get that
% \eqenv{
% \sup_{s,a}\lbrac{\delta^{\rho_{KL}(\sigma)}_{s,a,N}(Q)(s,a) }&=
% \Bigg|\sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}(\widehat p_{s,a,2^{N+1}},\alpha,V)} 
%     \\&\quad  -\frac{1}{2} \sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}(\widehat p^E_{s,a,2^{N}},\alpha,V)}
%     -\frac{1}{2}\sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}(\widehat p^O_{s,a,2^{N}},\alpha,V )}\Bigg|. 
% }
 we make an error decomposition as follows:
\eqenv{
\sup_{s,a}\lbrac{\delta^{\rho_{KL}(\sigma)}_{s,a,N}(Q)(s,a) }^2&=
\sup_{s,a}\Bigg|\sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}(\widehat p_{s,a,2^{N+1}},\alpha,V)} 
        \\& \qquad-\frac{1}{2} \sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}(\widehat p^E_{s,a,2^{N}},\alpha,V)}
    -\frac{1}{2}\sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}(\widehat p^O_{s,a,2^{N}},\alpha,V )}\Bigg|^2
    \\& \leq  3\sup_{s,a}\lbrac{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}(\widehat p_{s,a,2^{N+1}},\alpha,V)}-{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}(p_{s,a},\alpha,V)} }  }^2
    \\&\quad+  \frac{3}{4}\sup_{s,a}\lbrac{ \sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}(\widehat p^E_{s,a,2^N},\alpha,V)}-{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}(p_{s,a},\alpha,V)} } }^2
    \\&\quad+\frac{3}{4}\sup_{s,a}\lbrac{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}(\widehat p^O_{s,a,2^{N}},\alpha,V )}-{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}( p_{s,a},\alpha,V)} }}^2
    .\label{eqkl152}
    % \\& \myineq{}{}
}

\textbf{Case 1:} When $N \leq \frac{\log(1+p_\wedge^2\log(2|\mathcal{S}|^2|\mathcal{A}|)\log T)}{\log 2}$, we can get 
\eqenv{
\sup_{s,a}\lbrac{\delta^{\rho_{KL}(\sigma)}_{s,a,N}(Q)(s,a) }^2\leq \brac{\frac{r_{\max}}{1-\gamma}}^2, \frac{1}{P_N}=2^N \leq 1+ \frac{\log(2|\mathcal{S}|^2|\mathcal{A}|)\log T}{p^2_\wedge}. 
}

Therefore, we have that 
\eqenv{
\frac{\sup_{s,a}\lbrac{\delta^{\rho_{KL}(\sigma)}_{s,a,N}(Q)(s,a) }^2}{P_N}\leq \brac{\frac{r_{\max}}{1-\gamma}}^2 \brac{1+ \frac{\log(2|\mathcal{S}|^2|\mathcal{A}|)\log T}{p^2_\wedge}}. 
}

\textbf{Case 2:} When $N> \frac{\log(1+p_\wedge^2\log(2|\mathcal{S}|^2|\mathcal{A}|)\log T)}{\log 2}$,  combined with the analysis in \cref{eqkl122,eqkl123} and the fact $\mathbb P(A\cap B\cap C)\geq 1- \mathbb P(\neg A)-\mathbb P(\neg B)-\mathbb P(\neg C)$, we can conclude that for any $N\geq 0$, 
 with probability at least $1-3\cdot 2^{-N} $
\eqenv{
\sup_{s,a}\lbrac{\delta^{\rho_{KL}(\sigma)}_{s,a,N}(Q)(s,a) }^2
&\leq 3\brac{\frac{C_{KL}r_{\max}}{\sigma(1-\gamma)p_\wedge} 2^{-\frac{N+1}{2}}}^2 + \frac{3}{4}\brac{\frac{C_{KL}r_{\max}}{\sigma(1-\gamma)p_\wedge} 2^{-\frac{N}{2}} }^2+ \frac{3}{4}\brac{\frac{C_{KL}r_{\max}}{\sigma(1-\gamma)p_\wedge} 2^{-\frac{N}{2}} }^2
\\& \leq  6\frac{C^2_{KL} r^2_{\max} }{\sigma^2(1-\gamma)^2p_\wedge^2}2^{-(N+1)}, 
}
 Since $ 0 \leq \sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}(q,\alpha,V)}\leq \frac{r_{\max}}{1-\gamma}$ for any distribution $q$, with probability at most $3\cdot 2^{-N} $ 
we have that
\eqenv{
\sup_{s,a} \lbrac{\delta^{\rho_{KL}(\sigma)}_{s,a,N}(Q)(s,a) }^2
 \leq \brac{ \frac{r_{\max}}{1-\gamma}}^2.
}

Combining the two events above, we get that 
\eqenv{
\E\Fbrac{\sup_{s,a}\lbrac{\delta^{\rho_{KL}(\sigma)}_{s,a,N}(Q)(s,a) }^2 }&\leq 6\frac{C^2_{KL} r^2_{\max} }{\sigma^2(1-\gamma)^2p_\wedge^2}2^{-(N+1)}+ 3\cdot 2^{-N}\brac{ \frac{r_{\max}}{1-\gamma}}^2
\\&\leq 2\brac{3 \frac{C^2_{KL}}{\sigma^2 p_\wedge^2}+ 3}\brac{ \frac{r_{\max}}{1-\gamma}}^2 2^{-N}. 
}

Combined with \textbf{Case 1} and \textbf{Case 2}, we can get 
\eqenv{\label{eqkl155}
\E\Fbrac{\sup_{s,a}\frac{\lbrac{\delta^{\rho_{KL}(\sigma)}_{s,a,N}(Q)(s,a) }^2}{P_N} }\leq \brac{6 \frac{C^2_{KL}}{\sigma^2 p_\wedge^2}+ 7+ \frac{\log(2|\mathcal{S}|^2|\mathcal{A}|)\log T}{p^2_\wedge}}\brac{ \frac{r_{\max}}{1-\gamma}}^2.
}

Then, plugging \cref{eqkl155} into \cref{eqkl149}, we get the following bound on the expectation of the infinity norm:
\eqenv{
\E\Fbrac{\sum_{N_2=0}^{N_{\max}}\sup_{s,a}\frac{\brac{
\delta^{\rho_{KL}(\sigma)}_{s,a,N_2}(Q)  }^2}{2^{-N_2-1}} }
&\leq \sum_{N_2=0}^{N_{\max}}\brac{6 \frac{C^2_{KL}}{\sigma^2 p_\wedge^2}+7+ \frac{\log(2|\mathcal{S}|^2|\mathcal{A}|)\log T}{p^2_\wedge}}\brac{ \frac{r_{\max}}{1-\gamma}}^2\\&=\brac{N_{\max}+1}\brac{6 \frac{C^2_{KL}}{\sigma^2 p_\wedge^2}+7+ \frac{\log(2|\mathcal{S}|^2|\mathcal{A}|)\log T}{p^2_\wedge}}\brac{ \frac{r_{\max}}{1-\gamma}}^2. 
}


Similarly, we can get the bound as follows:
\eqenv{
\E\Fbrac{\sum_{N_1=0}^{N_{\max}} \sup_{s,a}\frac{\brac{\delta^{r,\rho(\sigma)}_{s,a,N_1}
  }^2}{2^{-N_1-1}}}&\leq \sum_{N_1=0}^{N_{\max}}\brac{6 \frac{C^2_{KL}}{\sigma^2 p_\wedge^2}+7+ \frac{\log(2|\mathcal{S}|^2|\mathcal{A}|)\log T}{p^2_\wedge}} r_{\max}^2\\&=\brac{N_{\max}+1}\brac{6 \frac{C^2_{KL}}{\sigma^2 p_\wedge^2}+7+ \frac{\log(2|\mathcal{S}|^2|\mathcal{A}|)\log T}{p^2_\wedge}}r^2_{\max}. 
}

Hence, combining \Cref{eqkl149} and the above equations, we get the infinity-norm bound for the robust Bellman operator:
\eqenv{\label{eqkl62}
\E\Fbrac{\norminf{\widehat {\mathcal{T}}_{N_{\max}}^{\rho_{KL}(\sigma)}(Q)}^2 }\leq 
4\brac{1+\brac{N_{\max}+1}\brac{6 \frac{C^2_{KL}}{\sigma^2 p_\wedge^2}+7+ \frac{\log(2|\mathcal{S}|^2|\mathcal{A}|)\log T}{p^2_\wedge}}}\brac{r^2_{\max}+\gamma^2\brac{ \frac{r_{\max}}{1-\gamma}}^2}.
}
   This completes the proof.  
\end{proof}






\begin{theorem}[Restatement of \cref{thm:kl}]
If we set $\psi=\frac{1}{2}$ and the stepsize as $$\beta_t=\beta=\frac{2\log T}{(1-\gamma)T}, $$ then the output of \cref{alg:example} satisfies that:
 \begin{align}
    \mathbb E &\Fbrac{\mynorm{\widehat Q_T^{\rho_{KL}(\sigma)}-Q^{*\rho_{KL}(\sigma)}}_\infty^2}\nonumber
  \leq\widetilde{\mathcal{O }}\brac{\frac{1}{ p_\wedge^2 (1-\gamma)^5 T}}.
\end{align}
To ensure 
\begin{align}
    \mathbb E \Fbrac{\mynorm{\widehat Q_T^{\rho_{KL}(\sigma)}-Q^{*\rho_{KL}(\sigma)}}_\infty^2}\leq \epsilon^2,\nonumber
\end{align}
the expected total sample complexity $N^{\rho_{KL}(\sigma)}(\epsilon)$ is
 \begin{align}
     N^{\rho_{KL}(\sigma)}(\epsilon) = |\mathcal{S}||\mathcal{A}|N_{\max} T\geq \widetilde{\mathcal{O }}\brac{\frac{|\mathcal{S}||\mathcal{A}|}{ p_\wedge^2 (1-\gamma)^5 \epsilon^2 }}.\nonumber
 \end{align}
\end{theorem}
\begin{proof}
We consider the stochastic iteration that
\eqenv{
\widehat Q^{\rho_{KL}(\sigma)}_{t+1}=\widehat Q^{\rho_{KL}(\sigma)}_{t}+ \beta_t\brac{\boldsymbol{\bar{\mathcal{T}}}^{\rho_{KL}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{KL}(\sigma)}_{t}}- \widehat Q^{\rho_{KL}(\sigma)}_{t}+ W_t  },
}
where $W_t={\widehat{\mathcal{T}}}^{\rho_{KL}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{KL}(\sigma)}_{t}}-\boldsymbol{\bar{\mathcal{T}}}^{\rho_{KL}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{KL}(\sigma)}_{t}} $. 

Define the filtration $\mathcal{F}_t=\varbrac{Q^{\rho_{KL}(\sigma)}_0, W_0,...,Q^{\rho_{KL}(\sigma)}_{t-1},W_{t-1},Q^{\rho_{KL}(\sigma)}_{t} } $. 
Then, by \cref{thm:kl}, we can get that 
\eqenv{
\E\Fbrac{W_t|\mathcal{F}_t }=0,
}
and  by \cref{lm:lminfkl}, we can get that 
\eqenv{
\E\Fbrac{\norminf{W_t}^2|\mathcal{F}_t }
&\leq {\E\Fbrac{\sup_{s,a}\lbrac{{\widehat{\mathcal{T}}}^{\rho_{KL}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{KL}(\sigma)}_{t}}(s,a)-\boldsymbol{\bar{\mathcal{T}}}^{\rho_{KL}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{KL}(\sigma)}_{t}}(s,a)}^2|\mathcal{F}_t }}
\\& \leq 2{\E\Fbrac{\sup_{s,a} \lbrac{{\widehat{\mathcal{T}}}^{\rho_{KL}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{KL}(\sigma)}_{t}}(s,a)}^2+\sup_{s,a}\lbrac{\boldsymbol{\bar{\mathcal{T}}}^{\rho_{KL}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{KL}(\sigma)}_{t}}(s,a)}^2|\mathcal{F}_t }}
\\&\myineq{\leq}{a} 4\E\Fbrac{\sup_{s,a}\lbrac{{\widehat{\mathcal{T}}}^{\rho_{KL}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{KL}(\sigma)}_{t}}(s,a)}^2|\mathcal{F}_t }
\\& \myineq{\leq}{b} 16\brac{1+\brac{N_{\max}+1}\brac{6 \frac{C^2_{KL}}{\sigma^2 p_\wedge^2}+7+ \frac{\log(2|\mathcal{S}|^2|\mathcal{A}|)\log T}{p^2_\wedge}}}\brac{r^2_{\max}+\gamma^2\brac{ \frac{r_{\max}}{1-\gamma}}^2}, 
} where $(a) $ follows from the fact that
\eqenv{
\E\Fbrac{\sup_{s,a}\lbrac{\boldsymbol{\bar{\mathcal{T}}}^{\rho_{KL}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{KL}(\sigma)}_{t}}(s,a)}^2|\mathcal{F}_t }&= 
\E\Fbrac{\sup_{s,a}\lbrac{\E\Fbrac{\widehat{\mathcal{T}}^{\rho_{KL}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{KL}(\sigma)}_{t}}(s,a)}}^2\Big|\mathcal{F}_t }
\\& \leq \E\Fbrac{\sup_{s,a}\E\Fbrac{\lbrac{\widehat{\mathcal{T}}^{\rho_{KL}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{KL}(\sigma)}_{t}}(s,a)}^2}\Big|\mathcal{F}_t }
\\& \leq \E\Fbrac{\sup_{s,a} {\lbrac{\widehat{\mathcal{T}}^{\rho_{KL}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{KL}(\sigma)}_{t}}(s,a)}^2}\Big|\mathcal{F}_t }, 
}
and $(b)$ follows from \Cref{lm:lminfkl}.


% and
% \eqenv{
% \E\Fbrac{\norminf{W_t}^2|\mathcal{F}_t }
% &\leq \max_{s,a}{\E\Fbrac{\lbrac{{\widehat{\mathcal{T}}}^{\rho_{KL}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{KL}(\sigma)}_{t}}(s,a)-\boldsymbol{\bar{\mathcal{T}}}^{\rho_{KL}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{KL}(\sigma)}_{t}}(s,a)}^2|\mathcal{F}_t }}
% \\& \leq \max_{s,a} \varbrac{\text{Var}\brac{\widehat {\mathcal{T}}^{\rho_{KL}(\sigma)}_{N_{\max}} (Q^{\rho_{KL}(\sigma)}_{t})(s,a)  } }
% \\& \myineq{\leq}{i} \frac{ C_{\text{var}}}{p_\wedge^2  \sigma^2}\brac{r^2_{\max}+ \gamma^2\norminf{Q^{\rho_{KL}(\sigma)}_{t} }^2 }
% \\& \leq\frac{r^2_{\max} C_{\text{var}}}{p_\wedge^2 (1-\gamma)^2 \sigma^2}\brac{1+\frac{\gamma^2}{(1-\gamma)^2}},
% } where $(i) $ follows from \cref{eqeq88}. 

According to \cref{eq:fixp}, we have that
$$\widehat Q^{*\rho_{KL}(\sigma)}(s,a) = \hatT (\widehat Q^{*\rho_{KL}(\sigma)})(s,a)=\E\Fbrac{\hatt (\widehat Q^{*\rho_{KL}(\sigma)})(s,a)}.$$
Then, we apply \cref{lm:chen} \citep{chen2020finite} with the constant stepsize $\beta_t=\beta= \frac{2\log T}{(1-\gamma)T} $ and $T$ large enough such that
$$\beta={\frac{2\log T}{(1-\gamma)T}\leq\frac{(1-\gamma)^2}{128 e  \log(|\mathcal{S}||\mathcal{A}|) }} ,$$
and conclude that
\eqenv{\label{eqeq661}
\E&\Fbrac{\norminf{\widehat Q_{T}^{\rho_{KL}(\sigma)}-\widehat Q^{*\rho_{KL}(\sigma)} }^2}\\&\myineq{\leq}{i} \frac{3}{2} \norminf{\widehat Q_{0}^{\rho_{KL}(\sigma)}-\widehat Q^{*\rho_{KL}(\sigma)} }^2\prod_{j=0}^{T-1} \brac{1-\frac{1-\gamma}{2} \beta_j}  +\frac{16 e \log (|\mathcal{S}||\mathcal{A}|)}{1-\gamma}16r^2_{\max}\brac{1+\frac{\gamma^2}{(1-\gamma)^2}} 
\\&\qquad\cdot\brac{1+\brac{N_{\max}+1}\brac{6 \frac{C^2_{KL}}{\sigma^2 p_\wedge^2}+7+ \frac{\log(2|\mathcal{S}|^2|\mathcal{A}|)\log T}{p^2_\wedge}}}
\sum_{i=0}^{T-1} \beta_i^2 \prod_{t=i+1}^{T-1} (1-
\frac{1-\gamma}{2}\beta_t)
\\& \myineq{\leq}{ii} \frac{3}{2}\frac{r^2_{\max}}{(1-\gamma)^2} \frac{1}{T} +\frac{16 e \log (|\mathcal{S}||\mathcal{A}|)}{1-\gamma}16 r^2_{\max}\brac{1+\brac{N_{\max}+1}\brac{6 \frac{C^2_{KL}}{\sigma^2 p_\wedge^2}+7+ \frac{\log(2|\mathcal{S}|^2|\mathcal{A}|)\log T}{p^2_\wedge}}}\\&\qquad\cdot\brac{1+\frac{\gamma^2}{(1-\gamma)^2}}\frac{4 \log T }{(1-\gamma)^2 T}
, 
} where $(i)$ follows from \cref{lm:chen}, and $(ii)$ follows from $(1-(1-\gamma)\beta/2)^T\leq \frac{1}{T}$.


Set $N_{\max}=\frac{2\log T}{\log 2}$. Then, we make the decomposition and get the bound of $\mathbb E \Fbrac{\mynorm{\widehat Q_T^{\rho_{KL}(\sigma)}-Q^{*\rho_{KL}(\sigma)}}_\infty^2}$ as follows
 \eqenv{
 \mathbb E& \Fbrac{\mynorm{\widehat Q_T^{\rho_{KL}(\sigma)}-Q^{*\rho_{KL}(\sigma)}}_\infty^2}
 \\&\leq 
 2\E\Fbrac{\mynorm{\widehat Q_T^{\rho_{KL}(\sigma)}-\widehat Q^{*\rho_{KL}(\sigma)}}_\infty^2 }+ 2\E\Fbrac{\mynorm{\widehat Q^{*\rho_{KL}(\sigma)}-Q^{*\rho_{KL}(\sigma)}}_\infty^2 }
 \\& \myineq{\leq}{i}
  \frac{2r^2_{\max}}{(1-\gamma)^2 T} +\frac{32 e \log (|\mathcal{S}||\mathcal{A}|)}{1-\gamma}16 r^2_{\max} \brac{1+\brac{N_{\max}+1}\brac{6 \frac{C^2_{KL}}{\sigma^2 p_\wedge^2}+7+ \frac{\log(2|\mathcal{S}|^2|\mathcal{A}|)\log T}{p^2_\wedge}}}\\& \qquad\cdot\brac{1+\frac{\gamma^2}{(1-\gamma)^2}}\frac{4 \log T }{(1-\gamma)^2 T}
+\frac{2}{1-\gamma}\brac{\brac{\frac{\gamma  r_{\max}}{1-\gamma}+r_{\max} }2^{-\frac{N_{\max}+1}{2}}\brac{2^{-\frac{N_{\max}+1}{2}}+ \frac{C_{KL}}{\sigma p_\wedge} }}^2
\\& \myineq{\leq}{ii}  \frac{2r^2_{\max}}{(1-\gamma)^2 T} +\frac{32 e \log (|\mathcal{S}||\mathcal{A}|)}{1-\gamma}16 r^2_{\max} \brac{1+\brac{N_{\max}+1}\brac{6 \frac{C^2_{KL}}{\sigma^2 p_\wedge^2}+7+ \frac{\log(2|\mathcal{S}|^2|\mathcal{A}|)\log T}{p^2_\wedge}}}\\&\qquad\cdot\brac{1+\frac{\gamma^2}{(1-\gamma)^2}}\frac{4 \log T }{(1-\gamma)^2 T}
 +\frac{2}{1-\gamma}\brac{\brac{\frac{\gamma  r_{\max}}{1-\gamma}+r_{\max} }\brac{\frac{1}{T}+ \frac{C_{KL}}{\sigma p_\wedge} }}^2\frac{1}{T}
\\&= \widetilde{\mathcal{O }}\brac{\frac{1}{(1-\gamma)^5p^2_\wedge\sigma^2 T}},
 }where $(i)$ follows from \cref{lm:a8,thm:tv1}, and $(ii)$ follows from $2^{-\frac{N_{\max}+1}{2}}\leq 2^{-\frac{\log T}{\log 2}}= \frac{1}{T} $. 

To ensure $\mathbb E \Fbrac{\mynorm{\widehat Q_T^{\rho_{KL}(\sigma)}-Q^{*\rho_{KL}(\sigma)}}_\infty^2}\leq \epsilon^2 $, the number of iterations must satisfy $T\geq \widetilde{\mathcal{O }}\brac{(1-\gamma)^{-5}\epsilon^{-2}p^{-2}_\wedge\sigma^{-2}}$. When $\psi=\frac{1}{2}$, the expected sample size per iteration is $N_{\max}+2$. Altogether, the total sample complexity is $\widetilde{\mathcal{O }}\brac{|\mathcal{S}||\mathcal{A}|(1-\gamma)^{-5}\epsilon^{-2} p^{-2}_\wedge\sigma^{-2}}$. 

This completes the proof.     
\end{proof}


\section{Proof of Lemmas and Propositions}\label{sec:proof of lemma}
\begin{proof}[Proof of \cref{lm:a8}]

\eqenv{
&\norminf{\widehat Q^{*\rho(\sigma)}- Q^{*\rho(\sigma)}}\\&= \norminf{\hatT\brac{\widehat Q^{*\rho(\sigma)}}-{\mathcal{T}}^{\rho(\sigma)}\brac{Q^{*\rho(\sigma)}}  }
\\& \leq \norminf{\hatT\brac{\widehat Q^{*\rho(\sigma)}}-\hatT\brac{Q^{*\rho(\sigma)}} }
+ \norminf{\hatT\brac{ Q^{*\rho(\sigma)}}-\mathcal{T}^{\rho(\sigma)}\brac{Q^{*\rho(\sigma)}}}
\\& \myineq{\leq}{i} \gamma\norminf{\widehat Q^{*\rho(\sigma)}-Q^{*\rho(\sigma)} }+ \norminf{\hatT\brac{ Q^{*\rho(\sigma)}}-{\mathcal{T}}^{\rho(\sigma)}\brac{Q^{*\rho(\sigma)}}},
} where $(i)$ follows from \cref{prop:contract}. 
\end{proof}

\begin{proof}[Proof of \cref{prop:mlmc}]
    Here we recall the definition of $\delta^{\rho(\sigma)}_{s,a,N_2}(Q)$ that
    \eqenv{
    \delta^{\rho(\sigma)}_{s,a,N_2}(Q):&=\sup_{\alpha\geq 0}\varbrac{f^{\rho(\sigma)}(\widehat p_{s,a,2^{N_2+1}},\alpha,V )} 
    \\&-\frac{1}{2} \sup_{\alpha\geq 0}\varbrac{f^{\rho(\sigma)}(\widehat p^E_{s,a,2^{N_2}},\alpha,V)}
    -\frac{1}{2}\sup_{\alpha\geq 0}\varbrac{f^{\rho(\sigma)}(\widehat p^O_{s,a,2^{N_2}},\alpha,V )}. 
    }
    Then, we recall that
    \begin{align}
        f^{*\rho(\sigma)}(\hat p_{s,a,n},V):=\sup_{\alpha\geq 0} \varbrac{f^{\rho(\sigma)}(\hat p_{s,a,n}, \alpha, V) }.
    \end{align}  
    % \begin{align}
    %     \boldsymbol{\bar f^*}^{\rho(\sigma)}\brac{\hat p_n, V(s'_{s,a})}:=\mathbb E_{\hat{p}_n}\Fbrac{f^{*\rho(\sigma)}(\hat p_n,V(s'_{s,a}))}.
    % \end{align}
    Thus, we can get that
    \eqenv{
    \E\Fbrac{\delta^{Q,\rho(\sigma)}_{s,a,N_2}|N_2}&= 
    \mathbb E\Fbrac{f^{*\rho(\sigma)}(\hat p_{s,a,2^{N_2+1}},V )|N_2}\\&\qquad-\frac{1}{2}\mathbb E\Fbrac{f^{*\rho(\sigma)}(\hat p^O_{s,a,2^{N_2}},V )|N_2}-\frac{1}{2}\mathbb E\Fbrac{f^{*\rho(\sigma)}(\hat p^E_{s,a,2^{N_2}},V )|N_2}
    \\&=\E\Fbrac{f^{*\rho(\sigma)}(\hat p_{s,a,2^{N_2+1}},V )|N_2}-\E\Fbrac{f^{*\rho(\sigma)}(\hat p_{s,a,2^{N_2}},V )|N_2}
    . 
    }
    Taking the expectation over the random variable $N_2\sim \text{Geo}(\psi)$,
    we obtain that
    \eqenv{
    \E&\Fbrac{\widehat{v}^{\rho(\sigma)}(Q(s,a)) }\\&=\E\Fbrac{V(s'_{s,a,0})+\frac{\delta^{Q,\rho(\sigma)}_{s,a,N_2} }{P_{N_2}}}
    \\&= \E[V(s'_{s,a,0})]+\E\Fbrac{ \frac{\delta^{Q,\rho(\sigma)}_{s,a,N_2} }{P_{N_2}} }
    \\&\myineq{=}{i} \E[V(s'_{s,a,0})] + \sum_{N=0}^{N_{\max}} \E\Fbrac{\frac{\delta^{Q,\rho(\sigma)}_{s,a,N_2} }{P_{N_2}}|N_2=N }\prob\brac{N}  + \sum_{N=N_{\max}+1}^\infty  \E\Fbrac{\frac{\delta^{Q,\rho(\sigma)}_{s,a,N_2} }{P_{N_2}}|N_2=N }\prob\brac{N}
    \\& \myineq{=}{ii}\E\Fbrac{f^{*\rho(\sigma)}(\hat p_{s,a,2^{0}},V )} + \sum_{N=0}^{N_{\max}} \E\Fbrac{\delta^{Q,\rho(\sigma)}_{s,a,N} }
    \\&  \myineq{=}{iii} \E\Fbrac{f^{*\rho(\sigma)}(\hat p_{s,a,2^{0}},V )} 
     + \sum_{N=0}^{N_{\max}} \brac{\E\Fbrac{f^{*\rho(\sigma)}(\hat p_{s,a,2^{N+1}},V )}-\E\Fbrac{f^{*\rho(\sigma)}(\hat p_{s,a,2^{N}},V )}}
    \\&= \E\Fbrac{f^{*\rho(\sigma)}(\hat p_{s,a,2^{N_{\max}+1}},V )},
    } where $(i)$ and $(ii)$ follow from \cref{eq:hatq}; $(iii)$ follows from \cref{def:3.1}. 

    This completes the proof. 
\end{proof}

\begin{proof}[Proof of \Cref{prop:contract}]
    For any $Q,Q'\in \mathbb R^{|\mathcal{S}||\mathcal{A}|} $, we have that
    \eqenv{\label{eq:eq39}
    \hatT(& Q)(s,a)-\hatT( Q')(s,a)\\&=  \E\Fbrac{g^{*\rho(\sigma)}(\hat \mu_{s,a,2^{N_{\max}+1}},r_{s,a})+ \gamma  f^{*\rho(\sigma)}(\hat p_{s,a,2^{N_{\max}+1}},V )  }
\\&\qquad - \E\Fbrac{g^{*\rho(\sigma)}(\hat \mu_{s,a,2^{N_{\max}+1}},r_{s,a})+ \gamma  f^{*\rho(\sigma)}(\hat p_{s,a,2^{N_{\max}+1}},V' )  }
\\& = \gamma\brac{  \E\Fbrac{f^{*\rho(\sigma)}(\hat p_{s,a,2^{N_{\max}+1}},V )}  -\E\Fbrac{  f^{*\rho(\sigma)}(\hat p_{s,a,2^{N_{\max}+1}},V' )  } }
\\& =\gamma\E_{\hat p_{s,a,2^{N_{\max}+1}}}\Fbrac{\inf_{\rho(q,\hat p_{s,a,2^{N_{\max}+1}})\leq \sigma } \E_{q}[V (s'_{s,a})]-\inf_{\rho(q,\hat p_{s,a,2^{N_{\max}+1}})\leq \sigma } \E_{q}[  V' (s'_{s,a})]  }
\\&=\gamma \E_{\hat p_{s,a,2^{N_{\max}+1}}}\Bigg[\inf_{\rho(q,\hat p_{s,a,2^{N_{\max}+1}})\leq \sigma } \E_{q}[  \max_{a'}Q (s'_{s,a},a')]
\\& \qquad\qquad-\inf_{\rho(q,\hat p_{s,a,2^{N_{\max}+1}})\leq \sigma } \E_{q}[  \max_{a'}Q' (s'_{s,a},a')]  \Bigg].
    }
    Hence, taking the infinity norm of both sides of \cref{eq:eq39}, we get 
    \eqenv{
    &\norminf{\hatT (Q)-\hatT(Q') }
    \\&\leq \max_{s,a}\lbrac{\hatT( Q)(s,a)-\hatT( Q')(s,a)  }
    \\& =\gamma\max_{s,a}\Bigg|\E_{\hat p_{s,a,2^{N_{\max}+1}}}\Bigg[\inf_{\rho(q,\hat p_{s,a,2^{N_{\max}+1}})\leq \sigma } \E_{q}[  \max_{a'}Q' (s'_{s,a},a')]\\& \qquad\qquad\qquad\qquad\qquad\qquad\qquad\qquad-\inf_{\rho(q,\hat p_{s,a,2^{N_{\max}+1}})\leq \sigma } \E_{q}[  \max_{a'}Q (s'_{s,a},a')]  \Bigg] \Bigg|
    \\& \leq \gamma\max_{s,a}\Bigg|\E_{\hat p_{s,a,2^{N_{\max}+1}}}\Bigg[\sup_{\rho(q,\hat p_{s,a,2^{N_{\max}+1}})\leq \sigma } \E_{q}[  \max_{a'}Q' (s'_{s,a},a')
    -\max_{a'}Q (s'_{s,a},a')] \Bigg] \Bigg |
    \\& \leq \gamma\max_{s,a}\max_{s'_{s,a}}\lbrac{\max_{a'}Q' (s'_{s,a},a')-\max_{a'}Q (s'_{s,a},a') }
    \\& \leq \gamma\max_{s'} \max_{a'}\lbrac{Q(s',a')-Q'(s',a')}
    \\&= \gamma \norminf{Q-Q'}.
    }


\end{proof}

\begin{proof}[Proof of \Cref{lm:tvlm}] \citep{shi2023curious}
    First, for a fixed $\alpha$, by Bernstein's inequality, we have that with probability at least $1-\delta$,
    \eqenv{\label{eq:eq128}
    \lbrac{\mathbb E_{p_{s,a}}\Fbrac{(V(s'_{s,a}))_\alpha}-\E_{\hat p_{s,a,N}}\Fbrac{(V(s'_{s,a}))_\alpha}}
    \leq \sqrt{\frac{2\log\brac{\frac{2}{\delta}}}{N}}\sqrt{\text{Var}_{p_{s,a}}((V(s'_{s,a}))_\alpha)}+\frac{2r_{\max}\log\brac{\frac{2}{\delta}}}{3N(1-\gamma)}. 
    }

    Then, the term $\lbrac{\mathbb E_{p_{s,a}}\Fbrac{(V(s'_{s,a}))_\alpha}-\E_{\hat p_{s,a,N}}\Fbrac{(V(s'_{s,a}))_\alpha}}$ is $1$-Lipschitz w.r.t.\ $\alpha$ for any $V$ obeying $\norminf{V}\leq \frac{r_{\max}}{1-\gamma}$.  In addition, we construct an $\epsilon_1$-net $N_{\epsilon_1}$ over $[0, \frac{r_{\max}}{1-\gamma}]$ whose size satisfies $|N_{\epsilon_1}|\leq \frac{3r_{\max}}{\epsilon_1(1-\gamma)}$ \citep{vershynin2018high}. By the union bound and \Cref{eq:eq128}, with probability at least $1-{\delta}$, we have that for all $\alpha\in N_{\epsilon_1}$,  
    \eqenv{
    \lbrac{\mathbb E_{p_{s,a}}\Fbrac{(V(s'_{s,a}))_\alpha}-\E_{\hat p_{s,a,N}}\Fbrac{(V(s'_{s,a}))_\alpha}}
    \leq \sqrt{\frac{2\log\brac{\frac{2|N_{\epsilon_1}|}{\delta}}}{N}}\sqrt{\text{Var}_{p_{s,a}}(V)}+\frac{2r_{\max}\log\brac{\frac{2|N_{\epsilon_1}|}{\delta}}}{3N(1-\gamma)}.
    }

    Then, we have that 
    \eqenv{
   \max_{0\leq \alpha\leq \max_{s'_{s,a}} V(s'_{s,a}) }& \lbrac{\mathbb E_{p_{s,a}}\Fbrac{(V(s'_{s,a}))_\alpha}-\E_{\hat p_{s,a,N}}\Fbrac{(V(s'_{s,a}))_\alpha}}
  \\& \myineq{\leq}{a} \epsilon_1+ \sup_{\alpha\in N_{\epsilon_1}} \lbrac{\mathbb E_{p_{s,a}}\Fbrac{(V(s'_{s,a}))_\alpha}-\E_{\hat p_{s,a,N}}\Fbrac{(V(s'_{s,a}))_\alpha}}
   \\& \myineq{\leq }{b} \epsilon_1+\sqrt{\frac{2\log\brac{\frac{2 |N_{\epsilon_1}|}{\delta}}}{N}}\sqrt{\text{Var}_{p_{s,a}}(V)}+\frac{2r_{\max}\log\brac{\frac{2 |N_{\epsilon_1}|}{\delta}}}{3N(1-\gamma)}
   \\& \myineq{\leq}{c} \sqrt{\frac{2\log\brac{\frac{2 |N_{\epsilon_1}|}{\delta}}}{N}}\sqrt{\text{Var}_{p_{s,a}}(V)}+\frac{2r_{\max}\log\brac{\frac{ |N_{\epsilon_1}|}{\delta}}}{N(1-\gamma)}
   \\& \myineq{\leq}{d} 2\sqrt{\frac{\log\brac{\frac{2 N}{\delta}}}{N}}\norminf{V}+\frac{2r_{\max}\log\brac{\frac{1}{\delta}}}{N(1-\gamma)}
   \\&\myineq{\leq}{e} 3r_{\max}\sqrt{\frac{\log\brac{\frac{2 N}{\delta}}}{(1-\gamma)^2 N} },
    } 
    where $(a)$ follows from the fact that the maximizer $\alpha^*= \arg\max_{\alpha}\lbrac{\mathbb E_{p_{s,a}}\Fbrac{(V(s'_{s,a}))_\alpha}-\E_{\hat p_{s,a,N}}\Fbrac{(V(s'_{s,a}))_\alpha} }$ falls into an $\epsilon_1$-ball centered at some point of $N_{\epsilon_1}$, together with the $1$-Lipschitz property; $(b)$ follows from \Cref{eq:eq128} and the union bound; $(c)$ follows from taking $\epsilon_1=\frac{r_{\max}\log\brac{\frac{2 |N_{\epsilon_1}|}{\delta}}}{3N(1-\gamma)}$; $(d)$ follows from $\lbrac{N_{\epsilon_1}}\leq \frac{3r_{\max}}{\epsilon_1(1-\gamma)}\leq 9N$; and $(e)$ follows from the fact that $\norminf{V}\leq\frac{r_{\max}}{1-\gamma}$ and $N\geq \log\brac{\frac{18  N}{\delta}}$. 
    This completes the proof.
\end{proof}


\begin{proof}[Proof of \Cref{lm:chi2lm}] \citep{shi2023curious}
    Firstly, we do error decomposition as follows
    \eqenv{
    % &\max_{0\leq \alpha\leq \max_{s'_{s,a}} V(s'_{s,a}) } \lbrac{\mathbb E_{p_{s,a}}\Fbrac{(V(s'_{s,a}))_\alpha}-E_{\hat p_{s,a,N}}\Fbrac{(V(s'_{s,a}))_\alpha}}
    &\max_{0\leq \alpha\leq \max_{s'_{s,a}} V(s'_{s,a}) }\lbrac{\E_{p_{s,a}}\Fbrac{(V(s'_{s,a}))_\alpha}-\sqrt{\sigma \text{Var}_{p_{s,a}}((V(s'_{s,a}))_\alpha) }-\E_{\hat p_{s,a,N}}\Fbrac{(V(s'_{s,a}))_\alpha}+\sqrt{\sigma \text{Var}_{\hat p_{s,a,N}}((V(s'_{s,a}))_\alpha) }   }
    \\& \leq \max_{0\leq \alpha\leq \max_{s'_{s,a}} V(s'_{s,a}) }\lbrac{\E_{p_{s,a}}\Fbrac{(V(s'_{s,a}))_\alpha}-\E_{\hat p_{s,a,N}}\Fbrac{(V(s'_{s,a}))_\alpha} }
    \\&\qquad+\max_{0\leq \alpha\leq \max_{s'_{s,a}} V(s'_{s,a}) }\sqrt{\sigma}\lbrac{\sqrt{ \text{Var}_{p_{s,a}}((V(s'_{s,a}))_\alpha) }-\sqrt{ \text{Var}_{\hat p_{s,a,N}}((V(s'_{s,a}))_\alpha) }}.\label{eq:chi2lm}
    }
    Then, consider the first term in \Cref{eq:chi2lm}. By Bernstein's inequality, for fixed $\alpha$, with probability at least $1-\delta$, we have that 
    \eqenv{
    \max_{0\leq \alpha\leq \max_{s'_{s,a}} V(s'_{s,a}) }\lbrac{\E_{p_{s,a}}\Fbrac{(V(s'_{s,a}))_\alpha}-\E_{\hat p_{s,a,N}}\Fbrac{(V(s'_{s,a}))_\alpha} }\leq 2r_{\max}\sqrt{\frac{\log \brac{\frac{2  N}{\delta}}}{(1-\gamma)^2 N}}. 
    }

Next, consider the second term in \Cref{eq:chi2lm}. By Lemma 6 of \citet{panaganti2022sample}, for any fixed $\alpha$, with probability at least $1-\delta$, we have that
\eqenv{
\lbrac{\sqrt{ \text{Var}_{p_{s,a}}((V(s'_{s,a}))_\alpha) }-\sqrt{ \text{Var}_{\hat p_{s,a,N}}((V(s'_{s,a}))_\alpha) }}\leq r_{\max}\sqrt{\frac{2\log \brac{\frac{2}{\delta}}}{(1-\gamma)^2 N}}.
}

Next, we prove the Lipschitz property of the above term. 
\eqenv{\label{eq:eq134}
&\lbrac{\sqrt{ \text{Var}_{p_{s,a}}((V(s'_{s,a}))_{\alpha_1}) }-\sqrt{ \text{Var}_{\hat p_{s,a,N}}((V(s'_{s,a}))_{\alpha_1}) }}-\lbrac{\sqrt{ \text{Var}_{p_{s,a}}((V(s'_{s,a}))_{\alpha_2}) }-\sqrt{ \text{Var}_{\hat p_{s,a,N}}((V(s'_{s,a}))_{\alpha_2}) }}
\\& \leq \lbrac{\sqrt{ \text{Var}_{p_{s,a}}((V(s'_{s,a}))_{\alpha_1}) }-\sqrt{ \text{Var}_{\hat p_{s,a,N}}((V(s'_{s,a}))_{\alpha_1}) }-\sqrt{ \text{Var}_{p_{s,a}}((V(s'_{s,a}))_{\alpha_2}) }+\sqrt{ \text{Var}_{\hat p_{s,a,N}}((V(s'_{s,a}))_{\alpha_2}) }}
\\& \leq \lbrac{\sqrt{ \text{Var}_{p_{s,a}}((V(s'_{s,a}))_{\alpha_1}) }-\sqrt{ \text{Var}_{ p_{s,a}}((V(s'_{s,a}))_{\alpha_2}) }}+\lbrac{\sqrt{ \text{Var}_{\hat p_{s,a,N}}((V(s'_{s,a}))_{\alpha_1}) }-\sqrt{ \text{Var}_{\hat p_{s,a,N}}((V(s'_{s,a}))_{\alpha_2}) }}
\\& \myineq{\leq}{a} \sqrt{\lbrac{ \text{Var}_{p_{s,a}}((V(s'_{s,a}))_{\alpha_1})  - \text{Var}_{ p_{s,a}}((V(s'_{s,a}))_{\alpha_2}) }}+\sqrt{\lbrac{ \text{Var}_{\hat p_{s,a,N}}((V(s'_{s,a}))_{\alpha_1}) - \text{Var}_{\hat p_{s,a,N}}((V(s'_{s,a}))_{\alpha_2}) }}
\\& \myineq{\leq}{b} 2\sqrt{2(\alpha_1+\alpha_2)\lbrac{\alpha_1-\alpha_2}}
\leq 4\sqrt{\frac{r_{\max}|\alpha_1-\alpha_2|}{1-\gamma}}, 
}where $(a)$ follows from $|\sqrt{x}-\sqrt{y}|\leq \sqrt{|x-y|}$ and $(b)$ follows from that 
\eqenv{
&\lbrac{ \text{Var}_{p_{s,a}}((V(s'_{s,a}))_{\alpha_1})  - \text{Var}_{ p_{s,a}}((V(s'_{s,a}))_{\alpha_2}) }
\\& = \lbrac{\E_{p_{s,a}}\Fbrac{ ((V(s'_{s,a}))_{\alpha_1})^2 }-
\brac{\E_{p_{s,a}}\Fbrac{ (V(s'_{s,a}))_{\alpha_1} } }^2-\E_{p_{s,a}}\Fbrac{ ((V(s'_{s,a}))_{\alpha_2})^2 } +\brac{\E_{p_{s,a}}\Fbrac{ (V(s'_{s,a}))_{\alpha_2}} }^2}
\\& \leq \lbrac{\E_{p_{s,a}}\Fbrac{ ((V(s'_{s,a}))_{\alpha_1})^2 - ((V(s'_{s,a}))_{\alpha_2})^2 }  }+\lbrac{\brac{\E_{p_{s,a}}\Fbrac{ (V(s'_{s,a}))_{\alpha_1} } }^2-\brac{\E_{p_{s,a}}\Fbrac{ (V(s'_{s,a}))_{\alpha_2}} }^2 }
\\& \myineq{\leq}{a} 2(\alpha_1+\alpha_2)|\alpha_1-\alpha_2|,
} where $(a)$ follows from $(V(s'_{s,a}))_{\alpha}\leq \alpha $.

To apply the union bound, we also construct an $\epsilon_2$-net $N_{\epsilon_2} $ over $\Fbrac{0, \frac{r_{\max}}{1-\gamma}}$ \citep{vershynin2018high}.  With probability at least $1-\delta$, we have that
\eqenv{
\max_{0\leq \alpha\leq \max_{s'_{s,a}} V(s'_{s,a}) }&\sqrt{\sigma}\lbrac{\sqrt{ \text{Var}_{p_{s,a}}((V(s'_{s,a}))_\alpha) }-\sqrt{ \text{Var}_{\hat p_{s,a,N}}((V(s'_{s,a}))_\alpha) }}
\\& \myineq{\leq}{a} 4\sqrt{\frac{r_{\max} \epsilon_2}{1-\gamma}}+\sup_{\alpha\in N_{\epsilon_2}} \lbrac{\sqrt{ \text{Var}_{p_{s,a}}((V(s'_{s,a}))_\alpha) }-\sqrt{ \text{Var}_{\hat p_{s,a,N}}((V(s'_{s,a}))_\alpha) }}
\\& \myineq{\leq}{b} 4\sqrt{\frac{r_{\max} \epsilon_2}{1-\gamma}}+
r_{\max}\sqrt{\frac{2\log\brac{\frac{2 |N_{\epsilon_2}|}{\delta}} }{(1-\gamma)^2 N}}
\\& \myineq{\leq}{c} 2r_{\max}\sqrt{\frac{2\log\brac{\frac{2 |N_{\epsilon_2}|}{\delta}} }{(1-\gamma)^2 N}}
\\& \myineq{\leq}{d} 2 r_{\max}\sqrt{\frac{2\log\brac{\frac{24 N}{\delta}} }{(1-\gamma)^2 N}},
}
where $(a)$ follows from the property of $N_{\epsilon_2}$ together with the Lipschitz bound in \Cref{eq:eq134}; $(b)$ follows from the concentration bound above and a union bound over $N_{\epsilon_2}$; $(c)$ follows from taking $\epsilon_2=\frac{r_{\max}\log\brac{\frac{2 |N_{\epsilon_2}|}{\delta} } }{8N(1-\gamma)} $; and $(d)$ follows from $\lbrac{N_{\epsilon_2}}\leq \frac{3r_{\max}}{\epsilon_2(1-\gamma)}\leq 24N$. 

This completes the proof.
\end{proof}

% \section{KL Divergence Uncertainty Set}
% In this section, we provide the proof of \cref{thm:tv1,thm:kl} specifically for KL distance. 
% \begin{theorem}[Restatement of \cref{thm:tv1} specifically for KL distance]
%     Consider the case of KL constraint uncertainty set with uncertainty level $\sigma$ i.e. $ \mathcal{P}^{KL}(\sigma)$ and $ \mathcal{R}^{KL}(\sigma)$, set $\psi=\frac{1}{2}$, for any $Q\in \mathbb R^{\mathcal{S}\times\mathcal{A}}, s\in \mathcal{S}, a\in \mathcal{A} $, the estimation bias can be bounded as:
%     \begin{align}
%         \lbrac{\mathbb E\Fbrac{\widehat {\mathcal{T}}_{N_{\max}}^{\rho_{KL}(\sigma)} (Q)(s,a) } - {\mathcal{T}}^{\rho_{KL}(\sigma)} (Q)(s,a)}\leq \widetilde{\mathcal{O }}\brac{2^{\frac{N_{\max}}{2}} },\nonumber
%         % \brac{r_{\max}+\frac{r_{\max}}{1-\gamma}}
%     \end{align}
%     and the variation can be bounded as:
%     \begin{align}
%         \text{Var}\brac{\widehat {\mathcal{T}}_{N_{\max}}^{\rho_{KL}(\sigma)} (Q)(s,a)  }\leq \widetilde{\mathcal{O }}\brac{N_{\max}}.
%     \end{align}
% \end{theorem}
% \begin{proof}
%     Then, we provide the proof of \cref{thm:kl}.
%     Firstly, we make error decomposition as follows:
%     \eqenv{
%     &\lbrac{ \E\Fbrac{\widehat {\mathcal{T}}_{N_{\max}}^{\rho_{KL}(\sigma)} (Q)(s,a) } - {\mathcal{T}}^{\rho_{KL}(\sigma)} (Q)(s,a)}
%     \\&\myineq{=}{i} \bigg|\E\Fbrac{g^{*\rho_{KL}(\sigma)}(\hat \mu_{s,a,2^{N_{\max}+1}},r_{s,a})}+\gamma\E\Fbrac{f^{*\rho_{KL}(\sigma)}(\hat p_{s,a,2^{N_{\max}+1}},V(s'_{s,a}))}
% \\& \qquad   - {g^{*\rho_{KL}(\sigma)}(\mu_{s,a},r_{s,a})}
% - \gamma  {f^{*\rho_{KL}(\sigma)}(p_{s,a},V(s'_{s,a}))} \bigg|
% \\& \leq \lbrac{\E\Fbrac{g^{*\rho_{KL}(\sigma)}(\hat \mu_{s,a,2^{N_{\max}+1}},r_{s,a})}- {g^{*\rho_{KL}(\sigma)}(\mu_{s,a},r_{s,a})} }\\& \qquad +\gamma\lbrac{\E\Fbrac{f^{*\rho_{KL}(\sigma)}(\hat p_{s,a,2^{N_{\max}+1}},V(s'_{s,a}))}-{f^{*\rho_{KL}(\sigma)}(\mu_{s,a},r_{s,a})} }
% \\& \leq \E\Fbrac{\lbrac{ {g^{*\rho_{KL}(\sigma)}(\hat \mu_{s,a,2^{N_{\max}+1}},r_{s,a})}- {g^{*\rho_{KL}(\sigma)}(\mu_{s,a},r_{s,a})} } }\\& \qquad +\gamma \E\Fbrac{ \lbrac{{f^{*\rho_{KL}(\sigma)}(\hat p_{s,a,2^{N_{\max}+1}},V(s'_{s,a}))}-{f^{*\rho_{KL}(\sigma)}(\mu_{s,a},r_{s,a})} }}, \label{eq:eq70}
%     }where $(i)$ follows from \cref{prop:mlmc}.
    
% Then, for convenience, we bound the second term in \cref{eq:eq70}. The first term can be bounded similarly.  By \cref{lm:kl},
% \eqenv{\label{eqeq71}
%     &\lbrac{f^{*\rho_{KL}(\sigma)}\brac{ p_{s,a}, V(s'_{s,a})}-{{f^*}^{\rho_{KL}(\sigma)}\brac{\hat p_{s,a,2^{N_{\max}+1}},   V^*(s'_{s,a})}} }
%     \\&= \lbrac{\max_{\alpha\geq0} \varbrac{-\alpha \log\brac{\mathbb E_{p_{s,a}} \Fbrac{exp\brac{-\frac{V(s'_{s,a})}{\alpha}} } }-\alpha \sigma }
%     -\max_{\alpha\geq0} \varbrac{-\alpha \log\brac{\mathbb E_{\hat p_{s,a,2^{N_{\max}+1}}} \Fbrac{exp\brac{-\frac{V(s'_{s,a})}{\alpha}} } }-\alpha \sigma }}
%     \\& \myineq{\leq}{i} \max_{0\leq\alpha\leq \frac{r_{\max}}{(1-\gamma)\sigma }} \lbrac{-\alpha \log\brac{\mathbb E_{p_{s,a}} \Fbrac{exp\brac{-\frac{V(s'_{s,a})}{\alpha}} } }+\alpha \log\brac{\mathbb E_{\hat p_{s,a,2^{N_{\max}+1}}} \Fbrac{exp\brac{-\frac{V(s'_{s,a})}{\alpha}} } }  }
%     \\& {\leq} 
%     \max_{0\leq\alpha\leq \frac{r_{\max}}{(1-\gamma)\sigma }}
%     \lbrac{\alpha \log\brac{\frac{\mathbb E_{\hat p_{s,a,2^{N_{\max}+1}}} \Fbrac{exp\brac{-\frac{V(s'_{s,a})}{\alpha}} } }{\mathbb E_{p_{s,a}} \Fbrac{exp\brac{-\frac{V(s'_{s,a})}{\alpha}} }}  }  }
%     \\& {\leq}
%     \frac{r_{\max}}{(1-\gamma)\sigma}  \max_{0\leq\alpha\leq \frac{r_{\max}}{(1-\gamma)\sigma }} \lbrac{\log\brac{\frac{\mathbb E_{\hat p_{s,a,2^{N_{\max}+1}}} \Fbrac{exp\brac{-\frac{V(s'_{s,a})}{\alpha}} } -\mathbb E_{p_{s,a}} \Fbrac{exp\brac{-\frac{V(s'_{s,a})}{\alpha}} }}{\mathbb E_{p_{s,a}} \Fbrac{exp\brac{-\frac{V(s'_{s,a})}{\alpha}} }}  +1 }   },
% }
% where $(i)$ follows from $\boldsymbol{\alpha^*}^{\rho_{KL}(\sigma)}(p, V)\leq \frac{\max_{s:p(s)\neq 0}V(s)}{\sigma}\leq \frac{\max_{s,a} Q(s,a)}{\sigma}\leq \frac{r_{\max}}{(1-\gamma)\sigma }$ by \citep{hu2013kullback}. 
% % We recall that $\boldsymbol{\alpha^*}^{\rho_{KL}(\sigma)}(p, V)$ is defined in \cref{def:a.1}. 

% Noting that $ \hat p_{s,a,2^{N_{\max}+1}}(s'_{s,a})$ is absolutely continuous on $p_{s,a}(s'_{s,a}) $, then by Hoeffding's inequality we have 
% \eqenv{
% \prob\brac{\max_{s'_{s,a}}\lbrac{\frac{\hat p_{s,a,2^{N_{\max}+1}}(s'_{s,a})-p_{s,a}(s'_{s,a})  }{p_{s,a}(s'_{s,a})} }\geq \sqrt{\frac{1}{2^{N_{\max}+1}p^2_\wedge} \log \frac{2|\mathcal{S}| }{\tau} } }\leq \tau.
% }

% Set $\tau=2^{-(N_{\max}+1)}$. With probability at least $1-2^{-(N_{\max}+1)} $, we have that
% \eqenv{
% \lbrac{\frac{\mathbb E_{\hat p_{s,a,2^{N_{\max}+1}}} \Fbrac{exp\brac{-\frac{V(s'_{s,a})}{\alpha}} } -\mathbb E_{p_{s,a}} \Fbrac{exp\brac{-\frac{V(s'_{s,a})}{\alpha}} }}{\mathbb E_{p_{s,a}} \Fbrac{exp\brac{-\frac{V(s'_{s,a})}{\alpha}} }}  }
%  &\myineq{\leq}{i}
% \max_{s'_{s,a}}\lbrac{\frac{\hat p_{s,a,2^{N_{\max}+1}}(s'_{s,a})-p_{s,a}(s'_{s,a})  }{p_{s,a}(s'_{s,a})} }
% \\& \leq \sqrt{\frac{N_{\max}}{2^{N_{\max}+1}p^2_\wedge} \log {2|\mathcal{S}| }},
% } where $(i)$ follows from the fact that
% \eqenv{
% \lbrac{\sum p_i x_i} = \lbrac{\sum \frac{p_i}{q_i} q_i x_i }\leq  \lbrac{\sum q_i x_i}\max_i\lbrac{\frac{p_i}{q_i}}.
% }


% Note that if we set $\frac{N_{\max}}{p^2_\wedge} \log {2|\mathcal{S}| }\leq \frac{1}{4}2^{N_{\max}+1} $ then $\sqrt{\frac{N_{\max}}{2^{N_{\max}+1}p^2_\wedge} \log {2|\mathcal{S}| }}\leq \frac{1}{2} $. 

% Then, combined with \cref{eqeq71}, we can conclude that
% \eqenv{\label{eqeq74}
% &\lbrac{{f^*}^{\rho_{KL}(\sigma)}\brac{ p_{s,a}, V(s'_{s,a})}-{{f^*}^{\rho_{KL}(\sigma)}\brac{\hat p_{s,a,2^{N_{\max}+1}},   V^*(s'_{s,a})}}}
% \\& {\leq}
%     \frac{r_{\max}}{(1-\gamma)\sigma}  \max_{0\leq\alpha\leq \frac{r_{\max}}{(1-\gamma)\sigma }} \lbrac{\log\brac{\frac{\mathbb E_{\hat p_{s,a,2^{N_{\max}+1}}} \Fbrac{exp\brac{-\frac{V(s'_{s,a})}{\alpha}} } -\mathbb E_{p_{s,a}} \Fbrac{exp\brac{-\frac{V(s'_{s,a})}{\alpha}} }}{\mathbb E_{p_{s,a}} \Fbrac{exp\brac{-\frac{V(s'_{s,a})}{\alpha}} }}  +1 }   }
% \\& \myineq{\leq}{i}
%     \frac{r_{\max}}{(1-\gamma)\sigma} 
%     \max_{0\leq\alpha\leq \frac{r_{\max}}{(1-\gamma)\sigma }}
%     2\lbrac{\frac{\mathbb E_{\hat p_{s,a,2^{N_{\max}+1}}} \Fbrac{exp\brac{-\frac{V(s'_{s,a})}{\alpha}} } -\mathbb E_{p_{s,a}} \Fbrac{exp\brac{-\frac{V(s'_{s,a})}{\alpha}} }}{\mathbb E_{p_{s,a}} \Fbrac{exp\brac{-\frac{V(s'_{s,a})}{\alpha}} }}  }
%     \\& \leq \frac{2r_{\max}}{(1-\gamma)\sigma} 
%     \max_{s'_{s,a}}\lbrac{\frac{\hat p_{s,a,2^{N_{\max}+1}}(s'_{s,a})-p_{s,a}(s'_{s,a})  }{p_{s,a}(s'_{s,a})} }\leq \frac{2r_{\max}}{(1-\gamma)\sigma}\sqrt{\frac{N_{\max}}{2^{N_{\max}+1}p^2_\wedge} \log {2|\mathcal{S}| }},
% } where $(i)$ follows from that $|\log(x+1)|\leq 2|x| $ for $|x|\leq \frac{1}{2}$.
% Otherwise, with probability at most $2^{-(N_{\max}+1)}$, we can conclude 
% that
% \eqenv{\label{eqeq75}
% \lbrac{{f^*}^{\rho_{KL}(\sigma)}\brac{ p_{s,a}, V(s'_{s,a})}-{{f^*}^{\rho_{KL}(\sigma)}\brac{\hat p_{s,a,2^{N_{\max}+1}},   V^*(s'_{s,a})}}}
%     \leq  \max_{s'_{s,a}}V(s'_{s,a})\leq \frac{r_{\max}}{1-\gamma}.
% }

% Then, consider the expectation, we can get
% \eqenv{
%    \E&\Fbrac{\lbrac{{f^*}^{\rho_{KL}(\sigma)}\brac{ p_{s,a}, V(s'_{s,a})}-{{f^*}^{\rho_{KL}(\sigma)}\brac{\hat p_{s,a,2^{N_{\max}+1}},   V^*(s'_{s,a})}}}}
%    \\&= \E\Fbrac{{f^*}^{\rho_{KL}(\sigma)}\brac{ p_{s,a}, V(s'_{s,a})}-{{f^*}^{\rho_{KL}(\sigma)}\brac{\hat p_{s,a,2^{N_{\max}+1}},   V^*(s'_{s,a})}} }
%    \\& \leq \frac{2r_{\max}}{(1-\gamma)\sigma}\sqrt{\frac{N_{\max}}{2^{N_{\max}+1}p^2_\wedge} \log {2|\mathcal{S}| }}+ 2^{-(N_{\max}+1)}\frac{r_{\max}}{1-\gamma}
%    \\&\leq \frac{2r_{\max}}{(1-\gamma)\sigma}{\sqrt{N_{\max}\log {2|\mathcal{S}| } }  }\frac{ 1}{p_\wedge2^\frac{N_{\max}+1}{2}}+2^{-(N_{\max}+1)}\frac{r_{\max}}{1-\gamma} , 
% }where we set $C_{KL}=2\sqrt{N_{\max}\log {2|\mathcal{S}| } }  $, then
% \eqenv{
% \E&\Fbrac{\lbrac{{f^*}^{\rho_{KL}(\sigma)}\brac{ p_{s,a}, V(s'_{s,a})}-{f}^{*\rho_{KL}(\sigma)}\brac{\hat p_{s,a,2^{N_{\max}+1}},   V^*(s'_{s,a})}}}
%    \leq \frac{r_{\max}}{(1-\gamma)\sigma} C_{KL}\frac{ 1}{p_\wedge2^\frac{N_{\max}+1}{2}}+\frac{r_{\max}}{1-\gamma}2^{-(N_{\max}+1)}.
% }

% Similarly, we can get the bound
%     \eqenv{
%     \E\Fbrac{\lbrac{{g^{*\rho_{KL}(\sigma)}(\hat \mu_{s,a,2^{N_{\max}+1}},r_{s,a})}- {g^{*\rho_{KL}(\sigma)}(\mu_{s,a},r_{s,a})} }}
%     \leq \frac{r_{\max}C_{KL}}{\sigma p_\wedge} { 2^{-\frac{N_{\max}+1}{2}}}+{r_{\max}}2^{-(N_{\max}+1)}.
%     }

% Thus, we can get that
%     \eqenv{
%     \big| \E\Fbrac{\widehat {\mathcal{T}}_{N_{\max}}^{\rho_{KL}(\sigma)} (Q)(s,a) }& - {\mathcal{T}}^{\rho_{KL}(\sigma)} (Q)(s,a)\big|
% \\& \leq \lbrac{\E\Fbrac{g^{*\rho_{KL}(\sigma)}(\hat \mu_{s,a,2^{N_{\max}+1}},r_{s,a})}- {g^{*\rho_{KL}(\sigma)}(\mu_{s,a},r_{s,a})} }
% \\&\qquad+\gamma\lbrac{\E\Fbrac{f^{*\rho_{KL}(\sigma)}(\hat p_{s,a,2^{N_{\max}+1}},V(s'_{s,a}))}-{f^{*\rho_{KL}(\sigma)}(\mu_{s,a},r_{s,a})} }
% \\& \leq \brac{\frac{\gamma  r_{\max}}{1-\gamma}+r_{\max} }2^{-\frac{N_{\max}+1}{2}}\brac{2^{-\frac{N_{\max}+1}{2}}+ \frac{C_{KL}}{\sigma p_\wedge} }.
%     }

% % \end{proof}

% \textbf{Variance: }Next, we consider the variance of the robust Bellman operator. Firstly, we make error decomposition of the robust Bellman operator variance. 
% \eqenv{
% \text{Var}\brac{\widehat {\mathcal{T}}_{N_{\max}}^{\rho_{KL}(\sigma)} (Q)(s,a) }
% &= \text{Var}\brac{\widehat  r^{\rho_{KL}(\sigma)}+ \gamma \widehat{v}^{\rho_{KL}(\sigma)}(Q)(s,a) }
% \\&= \text{Var}\brac{\widehat  r^{\rho_{KL}(\sigma)}}
% +\gamma^2 \Varr{\widehat{v}^{\rho_{KL}(\sigma)}(Q)(s,a)}.
% }
% For convenience, we analyze the second term in the above equation. The first term can be bounded similarly. 
% \eqenv{
% \Varr{\widehat{v}^{\rho_{KL}(\sigma)}(Q)(s,a)}= 
% \E\Fbrac{\brac{\widehat{v}^{\rho_{KL}(\sigma)}(Q)(s,a)}^2  }-\brac{\E\Fbrac{\widehat{v}^{\rho_{KL}(\sigma)}(Q)(s,a) }}^2
% \leq \E\Fbrac{\brac{\widehat{v}^{\rho_{KL}(\sigma)}(Q)(s,a)}^2  }.
% }
% Next, according to the \cref{eq:delta,eq:hatq}, now we compute the expectation of $N_2$ and write a detailed explanation of the variance as follows:
% \eqenv{\label{eqeq80}
% \E\Fbrac{\brac{\widehat{v}^{\rho_{KL}(\sigma)}(Q)(s,a)}^2  }& = \E\Fbrac{\brac{V(s'_{s,a,0})+ \frac{\delta^{\rho_{KL}(\sigma)}_{s,a,N_2}(Q)(s,a) }{P_{N_2}} }^2}
% \\& \leq 2 \E\Fbrac{V(s'_{s,a,0})^2 }+ 2\E \Fbrac{ \brac{\frac{\delta^{\rho_{KL}(\sigma)}_{s,a,N_2}(Q)(s,a) }{P_{N_2}} }^2 }
% \\& \leq \frac{2r^2_{\max}}{(1-\gamma)^2\sigma^2}+ 2\sum_{N=0}^{N_{\max}} \E\Fbrac{\brac{\frac{\delta^{\rho_{KL}(\sigma)}_{s,a,N_2}(Q)(s,a) }{P_{N_2}}|N_2=N }^2} P_N
% \\& \leq \frac{2r^2_{\max}}{(1-\gamma)^2\sigma^2}+ 2\sum_{N=0}^{N_{\max}} \frac{\E\Fbrac{(\delta^{\rho_{KL}(\sigma)}_{s,a,N}(Q)(s,a))^2} }{P_{N}}.
% }
% Next, we bound the term $\lbrac{\delta^{\rho_{KL}(\sigma)}_{s,a,N}(Q)(s,a) } $,
% \eqenv{
% \lbrac{\delta^{\rho_{KL}(\sigma)}_{s,a,N}(Q)(s,a) }=
% \lbrac{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}(\widehat p_{s,a,2^{N+1}},\alpha,V)} 
%      -\frac{1}{2} \sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}(\widehat p^E_{2^{N}},\alpha,V)}
%     -\frac{1}{2}\sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}(\widehat p^O_{2^{N}},\alpha,V )}}. 
% }

% Then, we make an error decomposition as follows:
% \eqenv{
% \lbrac{\delta^{\rho_{KL}(\sigma)}_{s,a,N}(Q)(s,a) }^2&=
% \lbrac{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}(\widehat p_{s,a,2^{N+1}},\alpha,V)} 
%      -\frac{1}{2} \sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}(\widehat p^E_{2^{N}},\alpha,V)}
%     -\frac{1}{2}\sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}(\widehat p^O_{2^{N}},\alpha,V )}}^2
%     \\& \leq 3\lbrac{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}(\widehat p_{s,a,2^{N+1}},\alpha,V)}-{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}( p_{s,a},\alpha,V)} }  }^2
%     \\&\quad+  \frac{3}{4}\lbrac{ \sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}(\widehat p^E_{s,a,2^{N}},\alpha,V)}-{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}(\widehat p_{s,a},\alpha,V)} } }^2
%     \\&\quad+\frac{3}{4}\lbrac{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}(\widehat p_{s,a,2^{N}},\alpha,V )}-{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}( p_{s,a},\alpha,V)} }}^2.\label{eqeq82}
%     % \\& \myineq{}{}
% }

% \textbf{Case 1: }Combined with the analysis in \cref{eqeq74,eqeq75}, we can conclude that when $N\leq \frac{\log(1+p^2_\wedge\log(2|\mathcal{S}|)\log T )}{\log 2}$, we bound the term $\frac{\E\Fbrac{\lbrac{\delta^{\rho_{KL}(\sigma)}_{s,a,N}(Q)(s,a) }^2 }}{P_N}$ as follows,
% \eqenv{
%     \lbrac{\delta^{\rho_{KL}(\sigma)}_{s,a,N}(Q)(s,a) }^2\leq \brac{ \frac{r_{\max}}{1-\gamma}}^2,\qquad {\frac{1}{P_N}}= 2^N \leq 1+p^{-2}_\wedge\log(2|\mathcal{S}|)\log T.
% }
% Hence, we have that
% \eqenv{
% \frac{\E\Fbrac{\lbrac{\delta^{\rho_{KL}(\sigma)}_{s,a,N}(Q)(s,a) }^2 }}{P_N} \leq  \brac{\frac{r_{\max}}{1-\gamma}}^2\brac{1+p^{-2}_\wedge\log(2|\mathcal{S}|)\log T}.
% }





% \textbf{Case 2: }When $N>\frac{\log(1+p^2_\wedge\log(2|\mathcal{S}|)\log T )}{\log 2}$, consider the fact  that for any events $A,B,C$, $\mathbb P(A\cap B\cap C)\geq 1- \mathbb P(\neg A)-\mathbb P(\neg B)-\mathbb P(\neg C)$, by \cref{eqeq74},
%  with probability at least $1-3*2^{-N} $
% % Plug \cref{eqeq52} in \cref{}
% \eqenv{
% \lbrac{\delta^{\rho_{KL}(\sigma)}_{s,a,N}(Q)(s,a) }^2
% &\leq 3\brac{C_{KL}\frac{r_{\max}}{p_\wedge(1-\gamma)\sigma} 2^{-\frac{N+1}{2}}}^2 + \frac{3}{4}\brac{C_{KL}\frac{r_{\max}}{p_\wedge(1-\gamma)\sigma}  2^{-\frac{N}{2}} }^2+ \frac{3}{4}\brac{C_{KL}\frac{r_{\max}}{p_\wedge(1-\gamma)\sigma}  2^{-\frac{N}{2}} }^2
% \\& = 3\frac{C^2_{KL} r^2_{\max} }{p_\wedge^2(1-\gamma)^2\sigma^2}2^{-(N+1)}, 
% }
%  Since $ 0 \leq \sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}(q,\alpha,V)}\leq \frac{r_{\max}}{1-\gamma}$ for any distribution $q$, with probability at most $3*2^{-N} $ 
% we have that
% \eqenv{
%  \lbrac{\delta^{\rho_{KL}(\sigma)}_{s,a,N}(Q)(s,a) }^2
%  \leq \brac{ \frac{r_{\max}}{1-\gamma}}^2.
% }

% Above all, we can get that 
% \eqenv{
% \E\Fbrac{\lbrac{\delta^{\rho_{KL}(\sigma)}_{s,a,N}(Q)(s,a) }^2 }\leq 3\frac{C^2_{KL} r^2_{\max} }{p_\wedge^2(1-\gamma)^2\sigma^2}2^{-(N+1)}+ \brac{ \frac{r_{\max}}{1-\gamma}}^23*2^{-N}.
% % \leq \brac{9 C^2_{KL}+ 3}\brac{ \frac{r_{\max}}{1-\gamma}}^2 2^{-N}. 
% \label{eqeq86}
% }

% Combined with \textbf{Case 1} and \textbf{Case 2}, when $\psi=\frac{1}{2}$, $P_N=2^{-N}$. Then, we have that 
% \eqenv{
% \E\Fbrac{\frac{\lbrac{\delta^{\rho_{KL}(\sigma)}_{s,a,N}(Q)(s,a) }^2 }{P_N}} \leq  \brac{\frac{r_{\max}}{1-\gamma}}^2\brac{1+p^{-2}_\wedge\log(2|\mathcal{S}|)\log T}+\frac{3}{2}\frac{C^2_{KL} r^2_{\max} }{p_\wedge^2(1-\gamma)^2\sigma^2}+ 3\brac{ \frac{r_{\max}}{1-\gamma}}^2.\label{eq:110}
% }

% Then, by  \cref{eq:110}, we can get the boundary of variance of the robust Bellman operator as follows:
% \eqenv{\label{eqeq83}
% \Varr{\widehat{v}^{\rho_{KL}(\sigma)}(Q)(s,a)}&\leq \frac{2r^2_{\max}}{(1-\gamma)^2\sigma^2}+ 2\sum_{N=0}^{N_{\max}} \frac{\E\Fbrac{(\delta^{\rho_{KL}(\sigma)}_{s,a,N}(Q)(s,a))^2} }{P_{N}}
% \\&\leq \frac{2r^2_{\max}}{(1-\gamma)^2\sigma^2}+\sum_{N=0}^{N_{\max}}  \brac{ \frac{r_{\max}}{1-\gamma}}^2\brac{4+p^{-2}_\wedge\log(2|\mathcal{S}|)\log T+
%  \frac{3}{2}\frac{C^2_{KL}  }{p_\wedge^{2}\sigma^2}}
% \\& \leq\brac{2 +(N_{\max}+1)\brac{4+{\log(2|\mathcal{S}|\log T)}{(1-\gamma)} + \frac{3C^2_{KL}}{2} } }\frac{r^2_{\max}}{p_\wedge^2 (1-\gamma)^2 \sigma^2},
% }

% Set $C_{\text{var}}= 2 +(N_{\max}+1)\brac{4+{\log(2|\mathcal{S}|\log T)}{(1-\gamma)} + \frac{3C^2_{KL}}{2} }  $, then 
% $$\Varr{\widehat{v}^{\rho_{KL}(\sigma)}(Q)(s,a)}\leq C_{\text{var}}\frac{r^2_{\max}}{p_\wedge^2 (1-\gamma)^2 \sigma^2}. $$

% Similarly, we can get the boundary of the variance $\text{Var}\brac{\widehat  r^{\rho_{KL}(\sigma)}}$ as follows:
% \eqenv{
% \text{Var}\brac{\widehat  r^{\rho_{KL}(\sigma)}}\leq 
% C_{\text{var}}\frac{r^2_{\max}}{p_\wedge^2  \sigma^2}.
% }

% Hence, we can get the robust Bellman operator variance bound:
% \eqenv{\label{eqeq88}
% \text{Var}\brac{\widehat {\mathcal{T}}_{N_{\max}}^{\rho_{KL}(\sigma)} (Q)(s,a) }
% &= \text{Var}\brac{\widehat  r^{\rho_{KL}(\sigma)}}
% +\gamma^2 \Varr{\widehat{v}^{\rho_{KL}(\sigma)}(Q)(s,a)}
% \leq \frac{ C_{\text{var}}}{p_\wedge^2  \sigma^2}\brac{r^2_{\max}+ \frac{\gamma^2 r^2_{\max}}{(1-\gamma)^2}}.
% }
%    This completes the proof.  
% \end{proof}
% \begin{theorem}[Restatement of \cref{thm:kl}]
% If we set $\psi=\frac{1}{2}$ and the stepsize as $$\beta_t=\beta=\frac{\log T}{(1-\gamma)T}. $$ Then the output of \cref{alg:example} satisfies that:
%  \begin{align}
%     \mathbb E &\Fbrac{\mynorm{\widehat Q_T^{\rho_{KL}(\sigma)}-Q^{*\rho_{KL}(\sigma)}}_\infty^2}\nonumber
%   \leq\widetilde{\mathcal{O }}\brac{\frac{1}{ p_\wedge^2 (1-\gamma)^5 T}}.
% \end{align}
% To ensure 
% \begin{align}
%     \mathbb E \Fbrac{\mynorm{\widehat Q_T^{\rho_{KL}(\sigma)}-Q^{*\rho_{KL}(\sigma)}}_\infty^2}\leq \epsilon^2,\nonumber
% \end{align}
% the expected total sample complexity $N^{\rho_{KL}(\sigma)}(\epsilon)$ is
%  \begin{align}
%      N^{\rho_{KL}(\sigma)}(\epsilon) = |\mathcal{S}||\mathcal{A}|N_{\max} T\geq \widetilde{\mathcal{O }}\brac{\frac{|\mathcal{S}||\mathcal{A}|}{ p_\wedge^2 (1-\gamma)^5 \epsilon^2 }}.\nonumber
%  \end{align}
% \end{theorem}
% \begin{proof}
% We consider the stochastic iteration that
% \eqenv{
% \widehat Q^{\rho_{KL}(\sigma)}_{t+1}=\widehat Q^{\rho_{KL}(\sigma)}_{t}+ \beta_t\brac{\boldsymbol{\bar{\mathcal{T}}}^{\rho_{KL}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{KL}(\sigma)}_{t}}- \widehat Q^{\rho_{KL}(\sigma)}_{t}+ W_t  },
% }
% where $W_t={\widehat{\mathcal{T}}}^{\rho_{KL}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{KL}(\sigma)}_{t}}-\boldsymbol{\bar{\mathcal{T}}}^{\rho_{KL}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{KL}(\sigma)}_{t}} $. 

% Define the filtration $\mathcal{F}_t=\varbrac{Q^{\rho_{KL}(\sigma)}_0, W_0,...,Q^{\rho_{KL}(\sigma)}_{t-1},W_{t-1},Q^{\rho_{KL}(\sigma)}_{t} } $. 
% Then, by \cref{thm:kl}, we can get that 
% \eqenv{
% \E\Fbrac{W_t|\mathcal{F}_t }=0,
% }
% and
% \eqenv{
% \E\Fbrac{\norminf{W_t}^2|\mathcal{F}_t }
% &\leq \max_{s,a}{\E\Fbrac{\twonorm{{\widehat{\mathcal{T}}}^{\rho_{KL}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{KL}(\sigma)}_{t}}(s,a)-\boldsymbol{\bar{\mathcal{T}}}^{\rho_{KL}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{KL}(\sigma)}_{t}}(s,a)}^2|\mathcal{F}_t }}
% \\& \leq \max_{s,a} \varbrac{\text{Var}\brac{\widehat {\mathcal{T}}^{\rho_{KL}(\sigma)}_{N_{\max}} (Q^{\rho_{KL}(\sigma)}_{t})(s,a)  } }
% \\& \myineq{\leq}{i} \frac{ C_{\text{var}}}{p_\wedge^2  \sigma^2}\brac{r^2_{\max}+ \gamma^2\norminf{Q^{\rho_{KL}(\sigma)}_{t} }^2 }
% \\& \leq\frac{r^2_{\max} C_{\text{var}}}{p_\wedge^2 (1-\gamma)^2 \sigma^2}\brac{1+\frac{\gamma^2}{(1-\gamma)^2}},
% } where $(i) $ follows from \cref{eqeq88}. 

% According to \cref{eq:fixp}, we have that
% $$\widehat Q^{*\rho_{KL}(\sigma)}(s,a) = \hatT (\widehat Q^{*\rho_{KL}(\sigma)})(s,a)=\E\Fbrac{\hatt (\widehat Q^{*\rho_{KL}(\sigma)})(s,a)}.$$
% Then, apply \cref{lm:chen} \citep{chen2020finite}, set the constant stepsize $\beta_t=\beta= \frac{2\log T}{(1-\gamma)T} $ and $T$ large enough s.t.
% $${\frac{2\log T}{(1-\gamma)T}\leq\frac{(1-\gamma)^2{p_\wedge^2  \sigma^2}}{32 e r^2_{\max} { C_{\text{var}}}\log(|\mathcal{S}||\mathcal{A}|) }} .$$
% We can conclude that
% \eqenv{\label{eqeq661}
% \E\Fbrac{\norminf{\widehat Q_{T}^{\rho_{KL}(\sigma)}-\widehat Q^{*\rho_{KL}(\sigma)} }^2}&\myineq{\leq}{i} \frac{3}{2} \norminf{\widehat Q_{0}^{\rho_{KL}(\sigma)}-\widehat Q^{*\rho_{KL}(\sigma)} }^2\prod_{j=0}^{T-1} \brac{1-\frac{1-\gamma}{2} \beta_t} \\& \qquad+\frac{16 e \log (|\mathcal{S}||\mathcal{A}|)}{1-\gamma}\frac{r^2_{\max} C_{\text{var}}}{p_\wedge^2 (1-\gamma)^2 \sigma^2}\brac{1+\frac{\gamma^2}{(1-\gamma)^2}}\sum_{i=0}^{T-1} \beta_i^2 \prod_{t=i+1}^{T-1} (1-
% \frac{1-\gamma}{2}\beta_t)
% \\& \myineq{\leq}{ii} \frac{3}{2}\frac{r^2_{\max}}{(1-\gamma)^2} \frac{1}{T} +\frac{16 e \log (|\mathcal{S}||\mathcal{A}|)}{1-\gamma}\frac{r^2_{\max} C_{\text{var}}}{p_\wedge^2 (1-\gamma)^2 \sigma^2}\brac{1+\frac{\gamma^2}{(1-\gamma)^2}}\frac{4 \log T }{(1-\gamma)^2 T}
% , 
% } where $(i)$ follows from the \cref{lm:chen}. $(ii)$ follows from $(1-(1-\gamma)\beta/2)^T\leq \frac{1}{T}$.


% Set $N_{\max}=\frac{2\log T}{\log 2}$. Then, we make the decomposition and get the bound of $\mathbb E \Fbrac{\mynorm{\widehat Q_T^{\rho_{KL}(\sigma)}-Q^{*\rho_{KL}(\sigma)}}_\infty^2}$ as follows
%  \eqenv{
%  \mathbb E \Fbrac{\mynorm{\widehat Q_T^{\rho_{KL}(\sigma)}-Q^{*\rho_{KL}(\sigma)}}_\infty^2}
%  &\leq 
%  2\E\Fbrac{\mynorm{\widehat Q_T^{\rho_{KL}(\sigma)}-\widehat Q^{*\rho_{KL}(\sigma)}}_\infty^2 }+ 2\E\Fbrac{\mynorm{\widehat Q^{*\rho_{KL}(\sigma)}-Q^{*\rho_{KL}(\sigma)}}_\infty^2 }
%  \\& \myineq{\leq}{i}
%   \frac{2r^2_{\max}}{(1-\gamma)^2 T} +\frac{32 e \log (|\mathcal{S}||\mathcal{A}|)}{1-\gamma}\frac{r^2_{\max} C_{\text{var}}}{p_\wedge^2 (1-\gamma)^2 \sigma^2}\brac{1+\frac{\gamma^2}{(1-\gamma)^2}}\frac{4 \log T }{(1-\gamma)^2 T}
% \\& \qquad+\frac{2}{1-\gamma}\brac{\brac{\frac{\gamma  r_{\max}}{1-\gamma}+r_{\max} }2^{-\frac{N_{\max}+1}{2}}\brac{2^{-\frac{N_{\max}+1}{2}}+ \frac{C_{KL}}{\sigma p_\wedge} }}^2
% \\& \myineq{\leq}{ii}  \frac{2r^2_{\max}}{(1-\gamma)^2 T} +\frac{32 e \log (|\mathcal{S}||\mathcal{A}|)}{1-\gamma}\frac{r^2_{\max} C_{\text{var}}}{p_\wedge^2 (1-\gamma)^2 \sigma^2}\brac{1+\frac{\gamma^2}{(1-\gamma)^2}}\frac{4 \log T }{(1-\gamma)^2 T}
% \\& \qquad+\frac{2}{1-\gamma}\brac{\brac{\frac{\gamma  r_{\max}}{1-\gamma}+r_{\max} }\brac{\frac{1}{T}+ \frac{C_{KL}}{\sigma p_\wedge} }}^2\frac{1}{T}
% \\&= \widetilde{\mathcal{O }}\brac{\frac{1}{(1-\gamma)^5p^2_\wedge\sigma^2 T}},
%  }where $(i)$ follows from \cref{lm:a8,thm:tv1}. $(ii)$ follows from $2^{\frac{\log T}{\log 2}}\leq \frac{1}{T} $. 

% When $\mathbb E \Fbrac{\mynorm{\widehat Q_T^{\rho_{KL}(\sigma)}-Q^{*,\rho_{KL}(\sigma)}}_\infty^2}\leq \epsilon^2 $, the iteration $T\geq \widetilde{\mathcal{O }}\brac{(1-\gamma)^{-5}\epsilon^{-2}}$. When $\psi=\frac{1}{2}$, the expected sample size per iteration with threshold $N_{\max}$ is $N_{\max}+2$. Above all, the total sample complexity is $\widetilde{\mathcal{O }}\brac{|\mathcal{S}||\mathcal{A}|(1-\gamma)^{-5}\epsilon^{-2} }$. 

% This completes the proof.     
% \end{proof}


% % \section{Proof of Lemmas and Propositions}\label{sec:proof of lemma}
% % \begin{proof}[Proof of \cref{lm:a8}]

% % \eqenv{
% % \norminf{\widehat Q^{*\rho(\sigma)}- Q^{*\rho(\sigma)}}&= \norminf{\hatT\brac{\widehat Q^{*\rho(\sigma)}}-{\mathcal{T}}^{\rho(\sigma)}\brac{Q^{*\rho(\sigma)}}  }
% % \\& \leq \norminf{\hatT\brac{\widehat Q^{*\rho(\sigma)}}-\boldsymbol{\bar{\mathcal{T}}}^{\rho(\sigma)}\brac{Q^{*\rho(\sigma)}} }
% % + \norminf{\hatT\brac{ Q^{*\rho(\sigma)}}-\mathcal{T}^{\rho(\sigma)}\brac{Q^{*\rho(\sigma)}}}
% % \\& \myineq{\leq}{i} \gamma\norminf{\widehat Q^{*\rho(\sigma)}-Q^{*\rho(\sigma)} }+ \norminf{\hatT\brac{ Q^{*\rho(\sigma)}}-{\mathcal{T}}^{\rho(\sigma)}\brac{Q^{*\rho(\sigma)}}},
% % % \\& \myineq{\leq}{ii}\gamma\norminf{\widehat Q^{*\rho(\sigma)}-Q^{*,\rho(\sigma)} }+\norminf{  \boldsymbol{\bar f^*}^{\rho(\sigma)}\brac{\hat \mu_{s,a,2^{N_{\max}+1}}, \boldsymbol{id}(r_{s,a}) }
% % % -\boldsymbol{\bar f^*}^{\rho(\sigma)}\brac{ \mu_{s,a}, \boldsymbol{id}(r_{s,a}) }
% % % } \\& \qquad+\gamma \norminf{ \boldsymbol{\bar f^*}^{\rho(\sigma)}\brac{\hat p_{s,a,2^{N_{\max}+1}},  V^*(s'_{s,a})}- \boldsymbol{\bar f^*}^{\rho(\sigma)}\brac{ p_{s,a}, V^*(s'_{s,a})} }.
% % } where $(i)$ follows from \cref{prop:contract}. 
% % \end{proof}

% % \begin{proof}[Proof of \cref{prop:mlmc}]
% %     Here we recall the definition of $\delta^{\rho(\sigma)}_{s,a,N_2}(Q)$ that
% %     \eqenv{
% %     \delta^{\rho(\sigma)}_{s,a,N_2}(Q):&=\sup_{\alpha\geq 0}\varbrac{f^{\rho(\sigma)}(\widehat p_{s,a,2^{N_2+1}},\alpha,V  )} 
% %     \\&\qquad -\frac{1}{2} \sup_{\alpha\geq 0}\varbrac{f^{\rho(\sigma)}(\widehat p^E_{2^{N_2}},\alpha,V )}
% %     -\frac{1}{2}\sup_{\alpha\geq 0}\varbrac{f^{\rho(\sigma)}(\widehat p^O_{2^{N_2}},\alpha,V  )}. 
% %     }
% %     Then, we recall that
% %     \begin{align}
% %         f^{*\rho(\sigma)}(\hat p_{s,a,n},V ):=\sup_{\alpha\geq 0} \varbrac{f^{\rho(\sigma)}(\hat p_{s,a,n}, \alpha, V)  }.
% %     \end{align}  
% %     % \begin{align}
% %     %     \boldsymbol{\bar f^*}^{\rho(\sigma)}\brac{\hat p_n, V(s'_{s,a})}:=\mathbb E_{\hat{p}_n}\Fbrac{f^{*\rho(\sigma)}(\hat p_n,V(s'_{s,a}))}.
% %     % \end{align}
% %     Thus, we can get that
% %     \eqenv{
% %     \E&\Fbrac{\delta^{Q,\rho(\sigma)}_{s,a,N_2}|N_2}\\&= 
% %     \mathbb E\Fbrac{f^{*\rho(\sigma)}(\hat p_{s,a,2^{N_2+1}},V )|N_2}-\frac{1}{2}\mathbb E\Fbrac{f^{*\rho(\sigma)}(\hat p^O_{2^{N_2}},V )|N_2}-\frac{1}{2}\mathbb E\Fbrac{f^{*\rho(\sigma)}(\hat p^E_{2^{N_2}},V )|N_2}
% %     \\&=\E\Fbrac{f^{*\rho(\sigma)}(\hat p_{s,a,2^{N_2+1}},V )|N_2}-\E\Fbrac{f^{*\rho(\sigma)}(\hat p_{s,a,2^{N_2}},V )|N_2}
% %     . 
% %     }
% %     Taking the expectation over the random variable $N_2\sim \text{Geo}(\psi)$,
% %     we obtain that
% %     \eqenv{
% %     \E\Fbrac{\widehat{v}^{\rho(\sigma)}(Q(s,a)) }&=\E\Fbrac{V(s'_{s,a,0})+\frac{\delta^{Q,\rho(\sigma)}_{s,a,N_2} }{P_{N_2}}}
% %     \\&= \E[V(s'_{s,a,0})]+\E\Fbrac{ \frac{\delta^{Q,\rho(\sigma)}_{s,a,N_2} }{P_{N_2}} }
% %     \\&\myineq{=}{i} \E[V(s'_{s,a,0})] + \sum_{N=0}^{N_{\max}} \E\Fbrac{\frac{\delta^{Q,\rho(\sigma)}_{s,a,N_2} }{P_{N_2}}|N_2=N }\prob\brac{N}  + \sum_{N=N_{\max}+1}^\infty  \E\Fbrac{\frac{\delta^{Q,\rho(\sigma)}_{s,a,N_2} }{P_{N_2}}|N_2=N }\prob\brac{N}
% %     \\& \myineq{=}{ii}\E\Fbrac{f^{*\rho(\sigma)}(\hat p_{s,a,2^{0}},V )} + \sum_{N=0}^{N_{\max}} \E\Fbrac{\delta^{Q,\rho(\sigma)}_N }
% %     \\&  \myineq{=}{iii} \E\Fbrac{f^{*\rho(\sigma)}(\hat p_{s,a,2^{0}},V )} + \sum_{N=0}^{N_{\max}} \brac{\E\Fbrac{f^{*\rho(\sigma)}(\hat p_{s,a,2^{N+1}},V )}-\E\Fbrac{f^{*\rho(\sigma)}(\hat p_{s,a,2^{N}},V )}}
% %     \\&= \E\Fbrac{f^{*\rho(\sigma)}(\hat p_{s,a,2^{N_{\max}+1}},V )},
% %     } where $(i)$ and $(ii)$ follow from \cref{eq:hatq}; $(iii)$ follows from \cref{def:3.1}. 

% %     This completes the proof. 
% % \end{proof}

% % \begin{proof}[Proof of \Cref{prop:contract}]
% %     For any $Q,Q'\in \mathbb R^{|\mathcal{S}||\mathcal{A}|} $, we have that
% %     \eqenv{\label{eq:eq39}
% %     \hatT(& Q)(s,a)-\hatT( Q')(s,a)\\&=  \E\Fbrac{g^{*\rho(\sigma)}(\hat \mu_{s,a,2^{N_{\max}+1}},r_{s,a})+ \gamma  f^{*\rho(\sigma)}(\hat p_{s,a,2^{N_{\max}+1}},V )  }
% % \\&\qquad - \E\Fbrac{g^{*\rho(\sigma)}(\hat \mu_{s,a,2^{N_{\max}+1}},r_{s,a})+ \gamma  f^{*\rho(\sigma)}(\hat p_{s,a,2^{N_{\max}+1}},V' )  }
% % \\& = \gamma\brac{  \E\Fbrac{f^{*\rho(\sigma)}(\hat p_{s,a,2^{N_{\max}+1}},V )}  -\E\Fbrac{  f^{*\rho(\sigma)}(\hat p_{s,a,2^{N_{\max}+1}},V' )  } }
% % \\& =\gamma\E_{\hat p_{s,a,2^{N_{\max}+1}}}\Fbrac{\inf_{\rho(q,\hat p_{s,a,2^{N_{\max}+1}})\leq \sigma } \E_{q}[V' (s'_{s,a})]-\inf_{\rho(q,\hat p_{s,a,2^{N_{\max}+1}})\leq \sigma } \E_{q}[  V (s'_{s,a})]  }
% % \\&=\gamma \E_{\hat p_{s,a,2^{N_{\max}+1}}}\Fbrac{\inf_{\rho(q,\hat p_{s,a,2^{N_{\max}+1}})\leq \sigma } \E_{q}[  \max_{a'}Q' (s'_{s,a},a')]-\inf_{\rho(q,\hat p_{s,a,2^{N_{\max}+1}})\leq \sigma } \E_{q}[  \max_{a'}Q (s'_{s,a},a')]  }
% %     }
% %     Hence, taking the infinity norm of both sides of \cref{eq:eq39}, we get 
% %     \eqenv{
% %     &\norminf{\hatT (Q)-\hatT(Q') }\\&\leq \max_{s,a}\lbrac{\hatT( Q)(s,a)-\hatT( Q')(s,a)  }
% %     \\& =\gamma\max_{s,a}\lbrac{\E_{\hat p_{s,a,2^{N_{\max}+1}}}\Fbrac{\inf_{\rho(q,\hat p_{s,a,2^{N_{\max}+1}})\leq \sigma } \E_{q}[  \max_{a'}Q' (s'_{s,a},a')]-\inf_{\rho(q,\hat p_{s,a,2^{N_{\max}+1}})\leq \sigma } \E_{q}[  \max_{a'}Q (s'_{s,a},a')]  } }
% %     \\& \leq \gamma\max_{s,a}\lbrac{\E_{\hat p_{s,a,2^{N_{\max}+1}}}\Fbrac{\sup_{\rho(q,\hat p_{s,a,2^{N_{\max}+1}})\leq \sigma } \E_{q}[  \max_{a'}Q' (s'_{s,a},a')-\max_{a'}Q (s'_{s,a},a')] } }
% %     \\& \leq \gamma\max_{s,a}\max_{s'_{s,a}}\lbrac{\max_{a'}Q' (s'_{s,a},a')-\max_{a'}Q (s'_{s,a},a') }
% %     \\& \leq \gamma\max_{s'} \max_{a'}\lbrac{Q(s',a')-Q'(s',a')}
% %     \\&= \gamma \norminf{Q-Q'}.
% %     }

% %     This completes the proof.
% % \end{proof}



\end{document}
