%%%%%%%% ICML 2024 EXAMPLE LATEX SUBMISSION FILE %%%%%%%%%%%%%%%%%

\documentclass{article}

% Recommended, but optional, packages for figures and better typesetting:
\usepackage{microtype}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{booktabs} % for professional tables
\usepackage{algorithm2e}

% hyperref makes hyperlinks in the resulting PDF.
% If your build breaks (sometimes temporarily if a hyperlink spans a page)
% please comment out the following usepackage line and replace
% \usepackage{icml2024} with \usepackage[nohyperref]{icml2024} above.
\usepackage{hyperref}

\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}

% if you use cleveref..
\usepackage[capitalize,noabbrev]{cleveref}
% Attempt to make hyperref and algorithmic work together better:
\newcommand{\theHalgorithm}{\arabic{algorithm}}

% Use the following line for the initial blind version submitted for review:
\usepackage{icml2024}

% If accepted, instead use the following line for the camera-ready submission:
% \usepackage[accepted]{icml2024}

% For theorems and such


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% THEOREMS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Theorem-like environments (amsthm).  `theorem' owns the counter and is
% numbered within each section; every other environment below passes
% [theorem] and therefore shares that single counter, so theorems, lemmas,
% definitions, etc. are numbered consecutively in one sequence.
% The active \theoremstyle applies to all \newtheorem declarations that follow
% it, until the next \theoremstyle.
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
% Definition-style environments.
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
% Remark-style environments.
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}

% Todonotes is useful during development; simply uncomment the next line
%    and comment out the line below the next line to turn off comments
%\usepackage[disable,textsize=tiny]{todonotes}
\usepackage[textsize=tiny]{todonotes}
% alignb: an align*-based display whose body is collected with the `b'
% argument type; the equation counter is stepped once and the resulting
% number is attached with \tag to the row on which the body ends.
%
% NOTE: \NewDocumentEnvironment takes FOUR arguments (name, argument spec,
% begin code, end code).  The empty end-code group below is required: without
% it, the next token in the file (\allowdisplaybreaks) is silently absorbed as
% the end code and is therefore never executed in the preamble.
% The trailing % signs keep the line ends inside the definition from
% becoming spurious space tokens.
\NewDocumentEnvironment{alignb}{b}{%
  \begin{align*}
  \refstepcounter{equation} #1 \tag{\theequation}
  \end{align*}%
}{}
% Allow page breaks inside multi-line displayed equations (amsmath).
\allowdisplaybreaks
% The \icmltitle you define below is probably too long as a header.
% Therefore, a short form for the running title is supplied here:
% \icmltitlerunning{Finite Sample Analysis of  DR-RL}

\begin{document}

\twocolumn[
\icmltitle{Model-Free Robust Reinforcement Learning with Finite Sample Complexity }

% It is OKAY to include author information, even for blind
% submissions: the style file will automatically remove it for you
% unless you've provided the [accepted] option to the icml2024
% package.

% List of affiliations: The first argument should be a (short)
% identifier you will use later to specify author affiliations
% Academic affiliations should list Department, University, City, Region, Country
% Industry affiliations should list Company, City, Region, Country

% You can specify symbols, otherwise they are numbered in order.
% Ideally, you should not use this facility. Affiliations will be numbered
% in order of appearance and this is the preferred way.
\icmlsetsymbol{equal}{*}

\begin{icmlauthorlist}
\icmlauthor{Firstname1 Lastname1}{equal,yyy}
\icmlauthor{Firstname2 Lastname2}{equal,yyy,comp}
\icmlauthor{Firstname3 Lastname3}{comp}
\icmlauthor{Firstname4 Lastname4}{sch}
\icmlauthor{Firstname5 Lastname5}{yyy}
\icmlauthor{Firstname6 Lastname6}{sch,yyy,comp}
\icmlauthor{Firstname7 Lastname7}{comp}
%\icmlauthor{}{sch}
\icmlauthor{Firstname8 Lastname8}{sch}
\icmlauthor{Firstname8 Lastname8}{yyy,comp}
%\icmlauthor{}{sch}
%\icmlauthor{}{sch}
\end{icmlauthorlist}

\icmlaffiliation{yyy}{Department of XXX, University of YYY, Location, Country}
\icmlaffiliation{comp}{Company Name, Location, Country}
\icmlaffiliation{sch}{School of ZZZ, Institute of WWW, Location, Country}

\icmlcorrespondingauthor{Firstname1 Lastname1}{first1.last1@xxx.edu}
\icmlcorrespondingauthor{Firstname2 Lastname2}{first2.last2@www.uk}

% You may provide any keywords that you
% find helpful for describing your paper; these are used to populate
% the "keywords" metadata in the PDF but will not be shown in the document
\icmlkeywords{Machine Learning, ICML}

\vskip 0.3in
]

% this must go after the closing bracket ] following \twocolumn[ ...

% This command actually creates the footnote in the first column
% listing the affiliations and the copyright notice.
% The command takes one argument, which is text to display at the start of the footnote.
% The \icmlEqualContribution command is standard text for equal contribution.
% Remove it (just {}) if you do not need this facility.

%\printAffiliationsAndNotice{}  % leave blank if no need to mention equal contribution
\printAffiliationsAndNotice{\icmlEqualContribution} % otherwise use the standard text.


\begin{abstract}
Distributionally Robust Reinforcement Learning (DR-RL) aims to identify an optimal policy resilient to the most adverse environments within a specified uncertainty set. Despite considerable research, prior DR-RL algorithms have primarily been either model-based or model-free with only asymptotic convergence guarantees. In this paper, we introduce a novel model-free DR-RL algorithm leveraging Multi-level Monte Carlo (MLMC) methods. Our approach incorporates a threshold mechanism, ensuring finite sample requirements for algorithm implementation, a departure from previous MLMC-based algorithms, and further resulting in a finite overall sample complexity. We adapt our algorithm to accommodate uncertainty sets defined by total variation, Chi-square divergence, and KL divergence in this paper. Our algorithms represent the first model-free DR-RL approach with finite sample complexity for total variation and Chi-square divergence uncertainty sets, and offer an improved sample complexity and broader applicability compared to existing model-free DR-RL algorithms for the KL divergence model. Our sample complexity analysis establishes the tightest results across all three uncertainty models in model-free complexity analysis for DR-RL, and suggests potential extensions of our algorithm to other uncertainty set models.



  % We propose a novel method to analyze the sample size of our algorithm based on the statistical property such as the worst-case distribution shape.  We first provide the sample complexity analysis of our algorithm when  the uncertainty set is constrained via total variation and chi-square divergence statistical distance. For TV-constrained uncertainty set, we can prove that the expected sample complexity of our algorithm to learn the $\epsilon$-accurate optimal robust policy can be bounded by $\mathcal{O}(|\mathcal{S}||\mathcal{A}|(1-\gamma)^{-5}\epsilon^{-2})$, where $\gamma$ is the discount factor. For $\chi^2$-constrained uncertainty set, the algorithm converge to $\epsilon$-accurate optimal robust policy with sample complexity $\mathcal{O}(|\mathcal{S}||\mathcal{A}|(1-\gamma)^{-5}\epsilon^{-2})$. For the KL-constrained uncertainty set, we provide the model-free sample complexity analysis without extra assumption of uncertainty level. 
% The sample complexity of our model-free algorithm all reach the best result of model-based algorithms for above uncertainty sets.
% This document provides a basic paper template and submission guidelines.
% Abstracts must be a single paragraph, ideally between 4--6 sentences long.
% Gross violations will trigger corrections at the camera-ready phase.
\end{abstract}
\section{Introduction}
Reinforcement learning (RL)~\cite{sutton2018reinforcement} has demonstrated success in real applications, including robotics~\cite{kober2013reinforcement}, finance, and computer vision.
RL aims to find the optimal policy that maximizes cumulative rewards through interactions with the environment. However, in practical scenarios, direct interaction with the true environment is often infeasible due to concerns such as safety, resource constraints, and ethical considerations. Consequently, a policy is initially learned within a simulated environment and subsequently transferred to the real environment. However, challenges arise from factors like unexpected external perturbations, adversarial attacks, and non-stationary environments, resulting in a model mismatch between the simulation and the real environment. The mismatch further leads to a degradation in performance when attempting to directly apply the learned policy in the real environment, known as the Sim-to-Real gap. 


One promising framework to address this issue and close the gap is the distributionally robust RL \cite{iyengar2005robust,nilim2004robustness}. Unlike conventional RL, which optimizes performance under a specific environment, DR-RL constructs an uncertainty set of transition dynamics and aims to optimize the worst-case performance within this set. By carefully designing the uncertainty set to encompass the characteristics of the true environment, DR-RL can provide an optimized lower bound on the true performance, and holds potential for addressing performance degradation resulting from model mismatch in RL applications.

%However, in practice, the policy from the training environment often suffers from the model performance degradation due to the mismatch between the training environment and the real environment, i.e. external perturbations and adversarial attacks in real environment, and time-varying changes of the environment. Therefore, robust RL and distributionally robust RL (DR-RL) are developed to address these problems. Different from non-robust RL problems, the goal of robust RL is to find the optimal robust policy that performs well under a range of uncertain conditions. A general approach to finding the optimal robust policy is to optimize the worst-case performance over the uncertainty set, which describes the model mismatch between the training and real environments via a range of possible distributions for the transition kernel and/or the reward functions.  
% the potential mismatches between the training and real environments. It encompasses a range of possible distributions for both the transition kernel and the reward functions in the environment.
% , where the uncertainty set is the
% , where the training environment is the center of the set. 
% The policy is trained in training environment, 
% which is economic efficiently and easy to access.$\rho$-constrainedDistributionally robust reinforcement learning (DR-RL) aims to

%In general, this uncertainty set is constrained by statistical distance $\rho$ (e.g. total variation distance,  chi-square distance and KL divergence) centered around the \textit{nominal distribution}, where \textit{nominal distribution} corresponds to the transition kernel and reward distribution of the training environment. Therefore, the goal of finding the optimal robust policy can be achieved by solving a minimax problem over the $\rho$-constrained uncertainty set. 


% 
Numerous algorithms have been studied and proposed for DR-RL to find the policy optimizing the worst-case performance, and can be broadly categorized into two groups: model-based methods and model-free methods. Model-based approaches, e.g.,  \cite{shi2023curious,panaganti2022sample,yang2022toward}, involve the collection of samples from a simulation environment to construct an empirical model. Subsequently, these methods employ robust dynamic programming techniques on the model to derive the optimal policy. In contrast, model-free methods \cite{wang2021online,liu2022distributionally,wang2023model,liang2023single} directly learn the policy while collecting samples, bypassing the need for model estimation and storage. These two categories offer distinct strategies within the DR-RL framework, catering to different scenarios and preferences in handling environmental uncertainties. 

While model-based methods generally require fewer samples to derive an optimal policy, storing the entire transition model becomes prohibitively expensive or impractical for large-scale problems. Conversely, model-free methods offer an efficient alternative that adapts without the need to store the model, facilitating more practical applications. Despite extensive research on model-based robust RL algorithms, the study and design of model-free DR-RL algorithms remain relatively understudied. This is primarily attributed to the challenge of the distribution shift between the simulation that generates samples and the worst-case environment within the uncertainty set, often resulting in an ``off-kernel'' setting. The utilization of such samples for estimated updating introduces bias and may deviate from the true updating value through bootstrapping algorithms, thus posing challenges in ensuring convergence and accurately quantifying algorithmic complexity.

%Due to the simple and straightforward formulation, model-based methods have been extensively studied, including deriving the sample complexity to learn an optimal policy. 




%In general, there are two primary methods to solve the DR-RL problems: model-based approach and model-free approach. 
%\cite{panaganti2022sample,yang2022toward,clavier2023towards,shi2023curious} adopt the model-based approach to find the optimal robust policy and provide the sample complexity analysis. However, these works require a large number of samples per iteration to estimate the nominal distribution. In practice, large sample size requires expensive memory storage and is computationally costly, which constrains the practical feasibility of the model-free approached algorithms.  In contrast to model-based approach, the model-free method boast computational efficiency and less storage overhead. 

To address the issue of biased estimated updating, \cite{liu2022distributionally, wang2023model} propose a model-free approach combined with a Multi-level Monte Carlo (MLMC) estimator, which offers convergence guarantees under various uncertainty sets. However, the MLMC estimator, while unbiased, typically demands an infinite number of samples for its construction. Subsequently, in \cite{wang2023finite}, finite sample complexity results are derived for the KL divergence uncertainty set. Nevertheless, these results are confined by restrictive assumptions, limiting their applicability. In this paper, we present a modified MLMC-based robust RL algorithm incorporating a threshold design, thereby ensuring that our implementation requires only a finite number of samples. Furthermore, we provide complexity analysis without relying on any restrictive assumptions. Our contributions are outlined as follows.



%Therefore, the model-free DR-RL algorithm and related sample complexity analysis that is feasible to different constrained uncertainty set and uncertainty level  is required. 
% The model-free MORL algorithms boast  and are applicable in real applications. 

\subsection{Major Contributions}
\textbf{We design a model-free algorithm for robust RL with guarantees on implementation and convergence.} Unlike previous Multi-level Monte Carlo (MLMC) algorithms that necessitate an infinite number of samples for implementation, our approach introduces a threshold design on the level number to construct an MLMC-based estimator. This estimator behaves identically to the traditional MLMC estimator when the level number remains below the threshold, but adopts a simplified structure that requires fewer samples when the level number exceeds it. The incorporation of this threshold design ensures that a finite number of samples is sufficient to construct the estimator, albeit with the trade-off of introducing bias. Nevertheless, through meticulous analysis of the asymptotic convergence of our algorithm, we illustrate that despite this bias, our approach converges to the optimal robust policy. As a result, our algorithm stands as the first model-free DR-RL algorithm applicable to general uncertainty sets, offering assurances of both sample finiteness and convergence. This renders our algorithm practical for implementation and underscores its potential for diverse applications.

%algorithm with threshold MLMC method, which balance the trade-off between the convergence guarantee and  expected total sample size. 
% Compared with previous model-free DR-RL works \cite{wang2023finite,wang2023model,liu2022distributionally,liang2023single}, our threshold algorithm h

\textbf{We establish the most precise bound on the sample complexity of our algorithm across three distinct uncertainty sets.} We adapt our algorithm to accommodate three uncertainty set models, defined by total variation, Chi-square divergence, and KL-divergence, and ascertain their respective sample complexities. By designing the threshold, we strike a balance between bias and sample complexity, demonstrating that our algorithms effectively identify the optimal robust policy with minimal samples. Specifically, for both total variation and Chi-square divergence uncertainty sets, our algorithms achieve $\epsilon$-optimality with $\mathcal{O}\left(\frac{|\mathcal{S}||\mathcal{A}|}{(1-\gamma)^5\epsilon^2} \right)$ samples, where $|\mathcal{S}|$ and $|\mathcal{A}|$ denote the cardinalities of the state and action spaces, respectively, and $\gamma$ represents the discount factor. For the KL-divergence uncertainty set, our algorithm exhibits a sample complexity of $\mathcal{O}\left(\frac{|\mathcal{S}||\mathcal{A}|}{(1-\gamma)^5\epsilon^2p_\wedge^2 } \right)$, where $p_\wedge$ signifies the minimal non-zero entry of the nominal transition kernel within the uncertainty set. Notably, all our results boast the tightest parameter dependencies, marking the first model-free complexity results for the total variation and Chi-square divergence models, while significantly enhancing previous findings for the KL-divergence model. Furthermore, our analysis requires no restrictive assumptions, underscoring the practical applicability of our model-free algorithms. A comprehensive comparison of our results with prior ones is presented in \cref{table:11,table:2,table:3}. Evidently, across all three uncertainty sets, our outcomes achieve the most favorable sample complexity among model-free methods.


%We first provide the sample complexity analysis of our algorithm when  the uncertainty set is constrained via total variation and chi-square divergence statistical distance. For TV-constrained uncertainty set, we can prove that the expected sample complexity of our algorithm to learn the $\epsilon$-accurate optimal robust policy can be bounded by $\mathcal{O}(|\mathcal{S}||\mathcal{A}|(1-\gamma)^{-5}\epsilon^{-2})$, where $\gamma$ is the discount factor. For $\chi^2$-constrained uncertainty set, the algorithm converge to $\epsilon$-accurate optimal robust policy with sample complexity $\mathcal{O}(|\mathcal{S}||\mathcal{A}|(1-\gamma)^{-5}\epsilon^{-2})$. For the KL-constrained uncertainty set, we provide the model-free sample complexity analysis without extra assumption of uncertainty level.  The sample complexity of our model-free algorithm all reach the best result of model-based algorithms for above uncertainty sets.

% \begin{table*}[t]
% \caption{Sample Complexity of for different Uncertainty Set: $H$ is the horizon factor and $H=\frac{1}{1-\gamma}$ for $\gamma$ discount infinite horizon}
% \label{table:1}
% \vskip 0.15in
% \begin{center}
% \begin{small}
% \begin{sc}
% \begin{tabular}{lcccr}
% \toprule
% Reference &Metric& Model-Free & Sample Size &Comments\\
% \midrule
% \textsc{\cite{panaganti2022sample}}    & TV& $\times$& $\mathcal{O}\brac{\frac{|\mathcal{S}|^2|\mathcal{A}|H^4}{ \epsilon^2} }$ & - \\
% \cite{yang2022toward}    &TV& $\times$& $\mathcal{O}\brac{\frac{|\mathcal{S}|^2|\mathcal{A}|H^4}{ \epsilon^2} }$ & - \\
% \cite{clavier2023towards}    & TV& $\times$& $\mathcal{O}\brac{\frac{|\mathcal{S}||\mathcal{A}|H^4}{ \epsilon^2} }$ & - \\
% \cite{shi2023curious}    & TV& $\times$& $\mathcal{O}\brac{\frac{|\mathcal{S}||\mathcal{A}|H^4}{ \epsilon^2} }$ & - \\
% \cite{wang2023model}   & TV & $\surd$& Asymptotic & - \\
% \cite{shi2023curious}    & TV& $\times$& $\mathcal{O}\brac{\frac{|\mathcal{S}||\mathcal{A}|H^4}{ \epsilon^2} }$ & - \\
% \cline{1-5}
% \cite{panaganti2022sample}    & $\chi^2$& $\times$& $\mathcal{O}\brac{\frac{|\mathcal{S}|^2|\mathcal{A}|H^4}{ \epsilon^2} }$ & - \\
% \cite{yang2022toward}    &$\chi^2$& $\times$& $\mathcal{O}\brac{\frac{|\mathcal{S}|^2|\mathcal{A}|H^4}{ \epsilon^2} }$ & - \\
% \cite{shi2023curious}    & $\chi^2$& $\times$& $\mathcal{O}\brac{\frac{|\mathcal{S}||\mathcal{A}|H^4}{ \epsilon^2} }$ & - \\
% \cite{wang2023model}   & $\chi^2$ & $\surd$& Asymptotic & - \\
% \cline{1-5}
% \cite{panaganti2022sample}    & KL& $\times$& $\mathcal{O}\brac{\frac{|\mathcal{S}|^2|\mathcal{A}| H^4\exp{(H)}}{ \epsilon^2} }$ & with $\exp{(H)}$ term \\
% \cite{yang2022toward}    &KL& $\times$& $\mathcal{O}\brac{\frac{|\mathcal{S}||\mathcal{A}|H^4}{p_\wedge^2  \epsilon^2} }$ & - \\
% \cite{wang2023model}   & KL & $\surd$& Asymptotic & - \\
% \cite{wang2023finite}   & KL & $\surd$& Asymptotic & Unverified assumptions \\
% \cite{liu2022distributionally}   & KL & $\surd$& Infinite & -  \\
% Our work  & KL & $\surd$& $\mathcal{O}\brac{\frac{|\mathcal{S}||\mathcal{A}|}{p_\wedge^2 (1-\gamma)^5 \epsilon^2} }$& small uncertainty set radius  \\

% \bottomrule
% \end{tabular}
% \end{sc}
% \end{small}
% \end{center}
% \vskip -0.1in
% \end{table*}

% We improve the MLMC method and adopt threshold MLMC method in our work, which avoids the limitation in 
% Sample-complexity comparison for the total variation (TV) uncertainty set.
% The "Model-Free" marks share the first column with the reference and are
% pushed to its right edge with \hfill.
% NOTE: \label must come after \caption; placed before it, the label picks up
% the enclosing (sub)section counter and \cref{table:11} prints a section
% number instead of the table number.
\begin{table}[!htb]
\vskip 0.15in
\begin{center}
\begin{small}
\begin{sc}
\begin{tabular}{lc}
\toprule
Reference  $\qquad\qquad$Model-Free & Sample Size \\
\midrule
\cite{panaganti2022sample}   \hfill{ $\times$}& $\mathcal{O}\brac{\frac{|\mathcal{S}|^2|\mathcal{A}|}{(1-\gamma)^4 \epsilon^2} }$  \\
\cite{yang2022toward}    \hfill{ $\times$}& $\mathcal{O}\brac{\frac{|\mathcal{S}|^2|\mathcal{A}|}{ (1-\gamma)^4\epsilon^2} }$ \\
\cite{clavier2023towards}    \hfill{ $\times$}&$\mathcal{O}\brac{\frac{|\mathcal{S}||\mathcal{A}|}{ (1-\gamma)^4\epsilon^2} }$ \\
\cite{shi2023curious}    \hfill{ $\times$}& $\mathcal{O}\brac{\frac{|\mathcal{S}||\mathcal{A}|}{ (1-\gamma)^4\epsilon^2} }$  \\
\cite{wang2023model}  \hfill{ $\surd$}& Asymptotic  \\
Our work    \hfill{ $\surd$}& $\mathcal{O}\brac{\frac{|\mathcal{S}||\mathcal{A}|}{ (1-\gamma)^5\epsilon^2} }$ \\
\bottomrule
\end{tabular}
\end{sc}
\end{small}
\end{center}
\vskip -0.1in
\caption{Sample Complexity of TV Uncertainty Set}
\label{table:11}
\end{table}


% Sample-complexity comparison for the Chi-square uncertainty set.
% NOTE: \label must come after \caption; placed before it, the label picks up
% the enclosing (sub)section counter and \cref{table:2} prints a section
% number instead of the table number.
\begin{table}[!htb]
\vskip -0.15in
\begin{center}
\begin{small}
\begin{sc}
\begin{tabular}{lc}
\toprule
Reference  $\qquad\qquad$Model-Free & Sample Size \\
\midrule
\cite{panaganti2022sample}    \hfill{ $\times$}& $\mathcal{O}\brac{\frac{|\mathcal{S}|^2|\mathcal{A}|}{ (1-\gamma)^4\epsilon^2} }$  \\
\cite{yang2022toward}      \hfill{ $\times$}& $\mathcal{O}\brac{\frac{|\mathcal{S}|^2|\mathcal{A}|}{(1-\gamma)^4 \epsilon^2} }$  \\
\cite{shi2023curious}      \hfill{ $\times$}& $\mathcal{O}\brac{\frac{|\mathcal{S}||\mathcal{A}|}{ (1-\gamma)^4\epsilon^2} }$  \\
\cite{wang2023model}    \hfill{ $\surd$}& Asymptotic \\
Our work    \hfill{ $\surd$}& $\mathcal{O}\brac{\frac{|\mathcal{S}||\mathcal{A}|}{ (1-\gamma)^5\epsilon^2} }$  \\
\bottomrule
\end{tabular}
\end{sc}
\end{small}
\end{center}
\vskip -0.1in
\caption{Sample Complexity of Chi-square Uncertainty Set}
\label{table:2}
\end{table}


% Sample-complexity comparison for the KL-divergence uncertainty set.
% NOTE: \label must come after \caption; placed before it, the label picks up
% the enclosing (sub)section counter and \cref{table:3} prints a section
% number instead of the table number.
\begin{table}[!htb]
\vskip -0.15in
\begin{center}
\begin{small}
\begin{sc}
\begin{tabular}{lc}
\toprule
Reference  $\qquad\qquad$Model-Free & Sample Size  \\
\midrule
\cite{panaganti2022sample}     \hfill{ $\times$}& $\mathcal{O}\brac{\frac{|\mathcal{S}|^2|\mathcal{A}|e^{\frac{1}{1-\gamma}}}{ (1-\gamma)^4\epsilon^2} }$  \\
\cite{yang2022toward}     \hfill{ $\times$}&$\mathcal{O}\brac{\frac{|\mathcal{S}||\mathcal{A}|}{(1-\gamma)^4p_\wedge^2  \epsilon^2} }$ \\
\cite{wang2023model}    \hfill{ $\surd$}& Asymptotic  \\
\cite{liang2023single}   \hfill{ $\surd$}& Asymptotic  \\
\cite{liu2022distributionally}   \hfill{ $\surd$}& Asymptotic   \\
\cite{wang2023finite} \hfill{ $\surd$}& $\mathcal{O}\brac{\frac{|\mathcal{S}||\mathcal{A}| }{p_\wedge^6 (1-\gamma)^5 \epsilon^2} }$ \\
\cite{wang2023sample} \hfill{ $\surd$}& $\mathcal{O}\brac{\frac{|\mathcal{S}||\mathcal{A}| }{p_\wedge^3 (1-\gamma)^5 \epsilon^2} }$ \\
\cite{wang2023sample} (VR) \hfill{ $\surd$}& $\mathcal{O}\brac{\frac{|\mathcal{S}||\mathcal{A}| }{p_\wedge^3 (1-\gamma)^4 \epsilon^2} }$ \\
Our work \hfill{ $\surd$}& $\mathcal{O}\brac{\frac{|\mathcal{S}||\mathcal{A}|}{p_\wedge^2 (1-\gamma)^5 \epsilon^2} }$  \\
\bottomrule
\end{tabular}
\end{sc}
\end{small}
\end{center}
\vskip -0.1in
\caption{Sample Complexity of KL Uncertainty Set.  VR: variance reduction method}
\label{table:3}
\end{table}



\subsection{Related Works}
\textbf{Model-based Methods for DR-RL}
When the environment is fully known by the learner, robust dynamic programming can be applied to obtain the optimal policy \cite{iyengar2005robust,nilim2004robustness}, which is shown to converge exponentially. When the environment is unknown, the learner can first use samples obtained to construct an empirical transition kernel and an empirical uncertainty set, and then apply robust dynamic programming on this empirical model, e.g., \cite{panaganti2022sample,yang2022toward,shi2023curious,clavier2023towards,zhou2021finite}. Although model-based methods generally are more data efficient, they require large memory space to store the data and model, becoming impractical for large-scale problems.


%Expect the literature shown in tables, some sample complexity analysis of offline DR-RL algorithms are provided in \cite{shi2022distributionally,zhou2021finite}. 


\textbf{Model-free Methods for DR-RL}
Model-free methods, which learn the optimal robust policy while gathering samples, have been investigated in the context of DR-RL. In \cite{wang2021online}, a model-free algorithm for a contamination uncertainty set is devised, subsequently extended to other uncertainty sets in \cite{liu2022distributionally,wang2023model} through the introduction and application of a multi-level Monte Carlo (MLMC) estimator. Despite exhibiting asymptotic convergence, these algorithms necessitate an infinite number of samples to construct the MLMC estimator, thus lacking a quantified sample complexity. In \cite{wang2023finite}, it is demonstrated that a finite sample complexity for the MLMC algorithm for the KL-divergence uncertainty set can be attained under a restrictive assumption, limiting the applicability of their findings. Under a similar assumption, a variance reduction-based algorithm is proposed in \cite{wang2023sample} for the KL-divergence model, and sample complexity is obtained. On the other hand, \cite{liang2023single} introduces a stochastic approximation-based model-free algorithm, achieving asymptotic convergence without assurances on sample complexity. Despite all these works,  designing a model-free robust RL algorithm with finite sample complexity under minimal assumptions remains an open question. In this paper, we present a model-free robust RL algorithm, providing finite sample analysis under various uncertainty set models without imposing additional assumptions.

%The distributionally robust optimization (DRO) is explored richly under the supervised learning setting \cite{bertsimas2018data,blanchet2019quantifying,dai2020coindice,delage2010distributionally,duan2021risk,duchi2019variance,gao2023distributionally,levy2020large}. 




% \textbf{Model-Free DR-RL}
%For model-based DR-RL setting, \cite{liang2023single} proposal a single-trajectory model-free DR-RL algorithm with asymptotic convergence guarantee. However, this work is established based on some assumptions that have not yet been fully validated. \cite{liu2022distributionally} propose MLMC distributionally robust $Q$-learning algorithm with KL-constrained uncertainty set, which expected sample complexity is infinite. \cite{wang2023model} provides the  asymptotic convergence guarantee for model-free MLMC DR-RL algorithm.   \cite{wang2023finite} provides the finite sample complexity analysis of the Model-free KL-constrained DR-RL algorithm with a limitation uncertainty level, where the analysis is based on the assumption that uncertainty set is sufficiently small, i.e. radius is less than minimum support (minimum non-zero probability) of nominal distribution. Hence, there is a need for a model-free DR-RL algorithm and an associated sample complexity analysis that is adaptable to various constrained uncertainty sets and levels of uncertainty.




\section{Preliminaries and Problem Formulations}
%In this paper, we denote by $\Delta (\mathcal{S}) $ the probability simplex over the set $\mathcal{S}$. Denote by $\rho(p,q)$ the distance or divergence between any two distributions $p$ and $q$.  The total variation distance between two distributions $p$ and $q$ is denoted as $\rho_\text{TV}(p,q)$ (resp. $\chi^2$ divergence: $\rho_{\chi^2}(p,q)$; the Kullback-Leibler(KL) divergence:  $\rho_\text{KL}(p,q)$). 
% The $\chi^2$ distance between any two distributions $p$ and $q$ is denoted as ; The Kullback-Leibler divergence between any two distribution $p$ and $q$ is denoted as . 
\subsection{Markov Decision Processes}
A Markov decision process (MDP) is specified by $\mathcal{M}=(\mathcal{S}, \mathcal{A}, R,\gamma,  \mathbf{R}_0, \mathbf{P}_0)$, where $\mathcal{S}$ and $\mathcal{A}$ denote the state space and action space. $R\subset [0,r_{\max}]$ is a finite set of possible rewards; $\mathbf{P}_0=\{p_{s,a}\in \Delta(\mathcal{S}): (s,a)\in\mathcal{S}\times\mathcal{A} \}$ is the transition kernel,  where $\Delta(\mathcal{S})$ denotes the probability simplex over $\mathcal{S}$. $\mathbf R_0=\{\nu_{s,a}\in \Delta(R):(s,a)\in\mathcal{S}\times\mathcal{A}\} $ is the reward distribution. At each time step, the agent starts from state $s_t$ and takes an action $a_t$. The environment transits to the next state $s_{t+1}$ according to the transition kernel $p_{s_t,a_t}$, and provides a reward signal $r(s_t,a_t)\sim \nu_{s_t,a_t}$ to the agent.  % We set $R$
%$ \min_{r}\varbrac{r\in R}\geq 0$ and  $ \max_{r}\varbrac{r\in R}\leq r_{\max}$. 
%  which is the $s,a$-rectangular

A policy $\pi: \mathcal{S}\to \Delta(\mathcal{A})$\footnote{When $\pi$ is a deterministic policy, i.e., $\pi(\cdot|s)$ is a 0-1 distribution for all $s$, we denote the deterministic action chosen at state $s$ by $\pi(s)$.} specifies the probability of taking each action in each state and represents the strategy of the agent. The value function of a policy $\pi$ is defined as the expected cumulative discounted reward the agent receives by following the policy starting from $s$:
\begin{align}
    V^\pi_{\mathbf{P}_0,\mathbf{R}_0}(s)= \mathbb E \Fbrac{\sum_{t=0}^\infty  \gamma^t r_t|s_0=s,{\mathbf{P}_0,\mathbf{R}_0}}. \nonumber
\end{align}

The $Q$-function is defined as the expected cumulative discounted reward starting from state $s$ and action $a$:
\begin{align}
    Q_{\mathbf{P}_0,\mathbf{R}_0}^\pi(s,a)=\mathbb E\Fbrac{\sum_{t=0}^\infty  \gamma^t r_t|s_0=s,a_0=a,{\mathbf{P}_0,\mathbf{R}_0}}. \nonumber
\end{align}
%The optimal value function $V^*$ is defined as:
%\begin{align}
%    V^\pi(s) := \arg\max_{\pi}\varbrac{V^\pi(s)}. \nonumber
%\end{align}
The optimal $Q$-function $Q^{*}$ is defined as
\begin{align}
    Q_{\mathbf{P}_0,\mathbf{R}_0}^*(s,a)=\max_{\pi} Q_{\mathbf{P}_0,\mathbf{R}_0}^\pi(s,a), 
\end{align}
and it satisfies the Bellman equation:
\begin{align}
    Q_{\mathbf{P}_0,\mathbf{R}_0}^*(s,a)= \mathbb E\Fbrac{r_{s,a}+ \gamma \max_{a'\in \mathcal{A}}Q_{\mathbf{P}_0,\mathbf{R}_0}^*(s',a')}. \nonumber
\end{align} 
Moreover, the optimal policy $\pi_{\mathbf{P}_0,\mathbf{R}_0}^*=\arg\max_\pi Q_{\mathbf{P}_0,\mathbf{R}_0}^\pi$ can be obtained from the optimal $Q$-function: $\pi_{\mathbf{P}_0,\mathbf{R}_0}^*(s)= \arg\max_{a\in\mathcal{A}}Q_{\mathbf{P}_0,\mathbf{R}_0}^*(s,a)$.
\subsection{Robust MDPs}
In the formulation of robust MDPs, both transition kernel and reward distribution belong to $(s, a)$-rectangular uncertainty sets $\mathcal{P}^\rho(\sigma)=\bigotimes_{s,a}\mathcal{P}^\rho_{s,a}(\sigma)$ and $\mathcal{R}^\rho(\sigma)=\bigotimes_{s,a}\mathcal{R}^\rho_{s,a}(\sigma)$. Namely, a robust MDP can be specified as $(\mathcal{S},\mathcal{A},R,\gamma,\mathcal{R}^\rho(\sigma),\mathcal{P}^\rho(\sigma))$, where $\mathcal{P}^\rho_{s,a}(\sigma)=\{q\in\Delta(\mathcal{S}): \rho(q,p_{s,a})\leq \sigma\}$, and 
$\mathcal{R}^\rho_{s,a}(\sigma)=\{\mu\in\Delta(R): \rho(\mu,\nu_{s,a})\leq \sigma\}$. Here, $\rho$ denotes any distance or divergence between two distributions, and $\sigma$ denotes the uncertainty level. 



%Given the statistical distance $\rho$, uncertainty level $\sigma $ and nominal distribution $p_{s,a}\in\mathbf{P}_0$ (resp. $\nu_{s,a}\in \mathbf{R}_0$), the uncertainty set is defined as $\mathcal{P}^\rho_{s,a}(\sigma):= \varbrac{q: \rho(q,p_{s,a})\leq \sigma} $ (resp. $\mathcal{R}^\rho_{s,a}(\sigma):=\varbrac{\mu: \rho(\mu,\nu_{s,a})\leq \sigma}$). The $(s,a)$-rectangular uncertainty set is defined as 
%$\mathcal{P}^\rho(\sigma)=\otimes_{s,a} \mathcal{P}^\rho_{s,a}(\sigma) \brac{\text{ resp. } \mathcal{R}^\rho(\sigma)= \otimes_{s,a}\mathcal{R}^\rho_{s,a}(\sigma)}$. Given the reward uncertainty set $ \mathcal{R}^\rho(\sigma)$ and transition kernel uncertainty set $\mathcal{P}^\rho(\sigma)$, we introduce the distributionally robust MDP $\mathcal{M}_{\text{rob}}=\brac{\mathcal{S},\mathcal{A}, R,  \gamma, \mathcal{R}^\rho(\sigma),\mathcal{P}^\rho(\sigma) }$. 

We consider three functions that can be used to define an uncertainty set: total variation, Chi-square divergence, and KL divergence. For two distributions $p,q$, the total variation between them is defined as
$$\rho_{TV}(q,p):=\frac{1}{2} \mynorm{q-p}_1;$$ 
The Chi-square divergence is defined as
$$\rho_{\chi^2}(q,p)=\mathbb E_{p}\Fbrac{\brac{1-\frac{q(\cdot)}{p(\cdot)} }^2};$$
And the KL divergence is defined as 
 $$\rho_{KL}(q,p)=\mathbb E_{q}\Fbrac{\log \frac{q(\cdot)}{p(\cdot)} } . $$


Robust MDPs aim to optimize the worst case performance among the uncertainty sets. Namely, for a policy $\pi$, the robust value function of $\pi$ is defined as 
\begin{align}
    V^{\pi, \rho(\sigma)}(s)=\inf_{q\in \mathcal{P}^\rho(\sigma), \nu\in \mathcal{R}^\rho(\sigma) } V^\pi_{q,\nu}(s),
\end{align}
and robust $Q$-function is defined as 
\begin{align}
    Q^{\pi,\rho(\sigma)}(s,a)=\inf_{q\in \mathcal{P}^\rho(\sigma), \nu\in \mathcal{R}^\rho(\sigma) } Q^\pi_{q,\nu}(s,a).
\end{align}
The optimal robust value function $V^{*,\rho(\sigma)}$ and robust $Q$-function are defined as the optimal values among all policies:
\begin{align}
    V^{*,\rho(\sigma)}(s)=\sup_{\pi}V^{\pi,\rho(\sigma)}(s),\\
    Q^{*,\rho(\sigma)}(s,a)=\sup_{\pi}Q^{\pi,\rho(\sigma)}(s,a),
\end{align}
and the optimal robust policy can be obtained from the optimal value functions. 

It is shown in \cite{iyengar2005robust} that the optimal robust value function satisfies the following optimal robust Bellman equation:
\begin{align}
    V&^{*,\rho(\sigma)}(s)\nonumber
    \\&=\max_{a\in\mathcal{A}}\inf_{q\in \mathcal{P}^\rho(\sigma), \nu\in \mathcal{R}^\rho(\sigma) } \mathbb E_{q,\nu}\Fbrac{r_{s,a}+\gamma V^{*,\rho(\sigma)}(s')} .\nonumber
\end{align}
Similarly, the optimal robust $Q$-function satisfies the robust Bellman equation:
\begin{align}\label{eq:4}
    &Q^{*,\rho(\sigma)}(s,a)
    % \nonumber
    % \\&= \max_{a'\in\mathcal{A}}\inf_{p\in \mathcal{P}^\rho(\sigma), \mu\in \mathcal{R}^\rho(\sigma) }\varbrac{\mathbb E_{\mu }\Fbrac{r_{s,a}}+\gamma \mathbb E_{p}\Fbrac{Q^*(s',a') } } 
    =\inf_{\nu\in \mathcal{R}^\rho(\sigma) }\mathbb E_{\nu }\Fbrac{r_{s,a}}\\& +\gamma\inf_{q\in \mathcal{P}^\rho(\sigma) }\mathbb E_{q}\Fbrac{\max_{a'\in \mathcal{A}}Q^{*,\rho(\sigma)}(s',a') }  .\nonumber
\end{align}
%Here, we define \textit{robust reward Bellman operator} and \textit{robust transition Bellman operator} as follows:
%\begin{align}
%    &\mathcal{T}^{\rho(\sigma)} (Q)(s,a)= 
%    \nonumber\\&\inf_{\mu\in \mathcal{R}^\rho(\sigma) }\mathbb E_{\mu }\Fbrac{r_{s,a}}+ \gamma \inf_{p\in \mathcal{P}^\rho(\sigma) }\mathbb E_{p_{s,a}}\Fbrac{\max_{a'}Q(s',a') } .
%\end{align}
%The optimal robust value function and optimal robust $Q$-function satisfy that:
%\begin{align}
%    V^{*,\rho(\sigma)}(s)=\max_{a\in \mathcal{A}} Q^{*,\rho(\sigma)}(s,a).
%\end{align}

The {goal} of robust RL is to find the optimal robust policy $\pi^{*,\rho(\sigma)}=\arg\max_\pi V^{\pi,\rho(\sigma)}$, or equivalently to solve the robust Bellman equation \cref{eq:4}. 




% where $\mathcal{TR} $ is the robust reward Bellman operator 
% \begin{align}
%     \mathcal{TR}
% \end{align}
% and $\mathcal{TP}$ is the robust transition Bellman operator 
% \begin{align}
%     \mathcal{TP}
% \end{align}





\subsection{Strong Duality}
% In general, the $\rho$-constrained uncertainty set includes infinite available distribution and can not be solved directly. Previous works \cite{iyengar2005robust,hu2013kullback} provide the duality form of minimax problems, which can be solved within finite steps and is widely used to solve DR-RL problems.
For a general uncertainty set $\mathcal{P}$, directly computing $\inf_{p\in\mathcal{P}} p^\top V$ for any vector $V$ is computationally expensive due to the set containing an infinite number of feasible distributions. However, this optimization problem can be equivalently solved using its dual form, which is a convex optimization \cite{iyengar2005robust,hu2013kullback}. These results play a crucial role in our algorithm design, therefore, we introduce the dual forms corresponding to the three uncertainty sets as follows.

%Furthermore, combined with the duality form, we introduce the form of the worst-case distribution within the uncertainty set for different $\rho$-constrained uncertainty sets. This worst-case distribution (distribution minimize the DRO problem) plays an important role in further analysis and research, i.e. tighter bound analysis and efficient algorithm design in practice. 

%Here we define a general model to present the duality of DRO problems: random variable $x\in\mathcal{X}$ following the distribution $x\sim p$, a function $v(\cdot): \mathcal{X}\to \mathbb R^+$.($x$ resp. $s$ or $r_{s,a}$, $p$ resp. $\mu$, $v(\cdot)  :=\max_a Q(\cdot,a) $ or resp. identity function)


\begin{lemma}[Total variation distance]\cite{iyengar2005robust}
    The optimization problem: 
    \begin{align}\label{eq:v_alpha}
        & \text{minimize} \quad \mathbb E_q[v(x)]
        \nonumber\\&
        \text{subject to} \quad q\in \varbrac{\rho_{TV} 
        \brac{q,p}\leq \sigma, q\in \Delta(\mathcal{X}) },
        \end{align}
 is equivalent to
    \begin{align}\label{eq:tvu}
        &\max_{u\geq0}\bigg\{\mathbb E_{p} \Fbrac{v(x)-u(x)}
        -\frac{{\sigma}}{2}\text{Span}(v-u)\bigg\},
    \end{align} 
    where $\text{Span}(X)=\max_i X(i)-\min_i X(i)$. 
Moreover, define \[ (v(x))_\alpha= \begin{cases} v(x) & v(x)\leq \alpha, \\
                     \alpha &  v(x)>\alpha.
       \end{cases}\]
       Then, the optimization problem is also equivalent to
\begin{align}\label{eq:tva}
    \max_{\alpha\geq 0}\varbrac{ \mathbb E_{p}\Fbrac{(v(x))_\alpha}-\frac{{\sigma}}{2}\brac{\alpha-\min_x v(x)} }. 
\end{align} 
\label{lm:tv}
\end{lemma}

% \begin{proposition}[Lagrange Multiplier and Worst-case Distribution]
%     Set $\rho(\cdot,\cdot)=\rho_{TV}(\cdot,\cdot)$,  
% \end{proposition}

\begin{lemma}[Chi-square]\cite{iyengar2005robust}
The optimization problem:
     \begin{align}
        & \text{minimize} \quad \mathbb E_q[v(x)]
        \nonumber\\&
        \text{subject to} \quad q\in \varbrac{\rho_{\chi^2} 
        \brac{q,p}\leq \sigma, q\in \Delta(\mathcal{X}) },\nonumber
        \end{align}
 is equivalent to
    \begin{align}
        &\max_{u\geq 0}\varbrac{\mathbb E_{p}\Fbrac{v(x)-u(x) } -\sqrt{\sigma \textbf{Var}_{p}\Fbrac{ v(x)-u(x)} } }\nonumber\\
        &=\max_{\alpha\geq0} \varbrac{\mathbb E_{p}\Fbrac{(v(x))_\alpha}-\sqrt{\sigma \textbf{Var}_{p}\Fbrac{ (v(x))_\alpha}  } }. 
    \end{align}\label{lm:chi}
\end{lemma}

 

\begin{lemma}[KL divergence]\cite{iyengar2005robust}
    The optimization problem 
     \begin{align}
        & \text{minimize} \quad \mathbb E_q[v(x)]
        \nonumber\\&
        \text{subject to} \quad q\in \varbrac{\rho_{KL} 
        \brac{q,p}\leq \sigma, q\in \Delta(\mathcal{X}) },\nonumber
        \end{align} is equivalent to
    \begin{align}
        \max_{\alpha\geq0} \varbrac{-\alpha \log\brac{\mathbb E_{p} \Fbrac{\exp\brac{-\frac{v(x)}{\alpha}} } }-\alpha \sigma }. 
    \end{align}
\label{lm:kl}
\end{lemma}

\begin{remark}
For convenience, we denote the objective functions in the dual forms by $f^{\rho(\sigma)}(p, \alpha,v)$, i.e., 
    \begin{align}
        &f^{\rho_{TV}(\sigma)}(p, \alpha,v)=\mathbb E_{p}\Fbrac{(v(x))_\alpha}-\frac{{\sigma}}{2}\brac{\alpha-\min_x v(x)} ;\nonumber
        \\&f^{\rho_{\chi^2}(\sigma)}(p, \alpha,v)=\mathbb E_{p}\Fbrac{(v(x))_\alpha}-\sqrt{\sigma \textbf{Var}_{p}\Fbrac{ (v(x))_\alpha}  } ;
        \nonumber\\&
        f^{\rho_{KL}(\sigma)}(p, \alpha,v)=-\alpha \log\brac{\mathbb E_{p} \Fbrac{\exp\brac{-\frac{v(x)}{\alpha}} } }-\alpha \sigma . \nonumber
    \end{align}%If $v$ is the identity function: $v(x)=x$, we ignore the third term, i.e. $f^{\rho(\sigma)}(p,\alpha) $. 
\end{remark}

\section{Model-free Threshold-MLMC Robust Q-learning Algorithm}
In this section, we present our multi-level Monte Carlo Q-learning algorithm with the threshold design, which is referred to as T-MLMC robust Q-learning.  Our algorithm assumes a generative model, which can generate i.i.d. samples from the nominal distributions for an arbitrary state-action pair $(s,a)\in \mathcal{S}\times \mathcal{A}$:
\begin{align}
    r_{s,a}^i\overset{i.i.d}{\sim} \mu_{s,a}, s_{s,a}^i\overset{i.i.d}{\sim} p_{s,a}, i=1,...,N.
\end{align}





% \textbf{Model-Free algorithm: }
In robust dynamic programming, one needs to update the estimation of the robust value function by applying the robust Bellman operator, i.e., computing the worst-case performance:
\begin{align*}
    Q(s,a)&\leftarrow {\mathcal{T}}^{\rho(\sigma)}(Q) (s,a)\\
    &=\inf_{\nu\in \mathcal{R}^\rho(\sigma) }\mathbb E_{\nu }\Fbrac{r_{s,a}} +\gamma\inf_{q\in \mathcal{P}^\rho(\sigma) }\mathbb E_{q}\Fbrac{\max_{a'\in\mathcal{A}}Q(s',a') },\nonumber
\end{align*}
and in the model-free setting, we need to estimate this worst-case performance using only samples from the nominal distributions. However, due to the distribution shift between the nominal kernel and the worst-case kernel, estimating it is highly challenging. One potential approach is to first obtain an empirical nominal distribution $\hat{p}$, and construct an uncertainty set centered on it using the same function $\rho$ and uncertainty radius $\sigma$: $\hat{\mathcal{P}}=\{q: \rho(q,\hat{p})\leq \sigma \}$. However, unlike the non-robust case, where $\hat{p}^\top V$ is an unbiased estimator of the expectation $\mathbb{E}_p[V]$, the term $\min_{p\in\mathcal{P}}(p^\top V)$ is non-linear in the nominal kernel, resulting in $\min_{p\in\hat{\mathcal{P}}}(p^\top V)$ being a biased empirical estimator \cite{wang2023model}. 

To address this issue, a multi-level Monte Carlo approach is proposed in \cite{liu2022distributionally,wang2023model}, which is inspired by the MLMC method in statistical inference from, e.g., \cite{blanchet2015unbiased,blanchet2019unbiased,wang2022unbiased}.  Specifically, MLMC first randomly generates a level number $N$ following a geometric distribution $\text{GEO}(g)$, and then generates $2^{N+1}$ samples. Using these samples, an estimated operator $\hat{T_N}$ of level $N$ is further constructed, and it is shown to be unbiased, i.e., $\mathbb{E}_{N}[\hat{T_N}(V)]=\min_{p\in\mathcal{P}}(p^\top V)$. Hence, by replacing the robust Bellman operator by the MLMC estimator, we obtain an unbiased updating rule and the algorithm is shown to converge to the optimal policy \cite{liu2022distributionally,wang2023model}. 

Although the MLMC algorithms are shown to asymptotically converge in these works, the parameter $g$ of the geometric distribution is set to be $g<\frac{1}{2}$, which results in an infinite expected number of samples required to construct the MLMC estimator. To address this issue, we modify the MLMC by designing a threshold on the number of samples generated, to ensure that the number of samples required does not exceed the threshold and is thus finite. Our construction and algorithm are presented as follows. 



%To find out the optimal robust policy, we introduce the Multi-level Monte Carlo distributionally robust value iteration algorithm. At the beginning, we need to choose the uncertainty level $\sigma$ and constrained statistical distance $\rho(\cdot,\cdot)$. Given an initialization $Q$-table  $ \widehat Q^{\rho(\sigma)}=0$, the value function and policy can be obtained via $Q$-table shown in Line $5$ and $6$ in \cref{alg:example}.


Similarly, for a fixed parameter $g$, we sample $N_1,N_2\sim\text{GEO}(g)$. We add a threshold $N_{\max}$ when generating samples from the generative model. If $N_i\leq N_{\max}$, then we generate $1+2^{N_i+1}$ i.i.d. samples; and if $N_i>N_{\max}$, we only generate $1$ sample instead. The number of samples required at each time step is then at most $1+2^{N_{\max}+1}$ and hence finite. Specifically, if $N_1$ is less than or equal to the threshold $N_{\max}$, we independently draw $2^{N_1+1}+1$ samples $r_{s,a,i}\sim \mu_{s,a}, i=0,1,...,2^{N_1+1}$; and when $N_1$ is larger than $N_{\max}$, we draw one sample $r_{s,a,0}\sim \mu_{s,a}$. Similarly, if $N_2$ is less than or equal to the threshold $N_{\max}$, we independently draw $2^{N_2+1}+1$ samples $s'_{s,a,i}\sim p_{s,a}, i=0,1,...,2^{N_2+1}$; and when $N_2$ is larger than $N_{\max}$, we only draw one sample $s'_{s,a,0}\sim p_{s,a}$. 


  
% Then, 
% Then, get the threshold indicator $\quad\xi_1=\mathbf{1}_{(N_1\leq N_{\max})} ; \quad \xi_2=\mathbf{1}_{(N_2\leq N_{\max})} $. 


% $N_1=\min\varbrac{N_1, (N_{\max}+1)\mathbb{I}_{(N_1\leq N_{\max})}-1},N_2=\min\varbrac{N_2, (N_{\max}+1)\mathbb{I}_{(N_1\leq N_{\max})}-1} $, which 

We then combine this scheme with the MLMC estimator to construct our estimation of the worst-case value as follows. 


When $N_i\leq N_{\max}$, we denote by $\widehat \mu_{s,a,2^{N_1+1}} $ and $\widehat p_{s,a,2^{N_2+1}} $ the empirical distributions, i.e., 
\begin{align}
    &\E_{\widehat\mu_{s,a,2^{N_1+1}}} [r_{s,a}]= \frac{1}{2^{N_1+1}} \sum_{n=1}^{2^{N_1+1}}r_{s,a,n};
\\& \E_{\widehat p_{s,a,2^{N_2+1}}} [V(s'_{s,a})]=\frac{1}{2^{N_2+1}} \sum_{n=1}^{2^{N_2+1}}V(s'_{s,a,n}).
\end{align}
Furthermore, we denote by $\widehat \mu^O_{s,a,2^{N_1}}$
 and $\widehat \mu_{s,a,2^{N_1}}^E$ the empirical reward distributions estimated from the samples with odd and even indexes in $\{r_{s,a,i}, i=1,...,2^{N_1+1}\}$, and by $\widehat p^O_{s,a,2^{N_2}}$ and $\widehat p^E_{s,a,2^{N_2}}$ the empirical transition kernels estimated from the samples with odd and even indexes in $\{s'_{s,a,i}, i=1,...,2^{N_2+1}\}$.

We then construct the following threshold MLMC estimator for the reward term:
\begin{align}
    \widehat r^{\rho(\sigma)}(s,a):&= r_{s,a,0}+\frac{\delta^{r,\rho(\sigma)}_{s,a,N_1}}{P_{N_1}},\label{eq:hatr}
\end{align}
where $P_{N_1}$ denotes the probability that a $\text{GEO}(g)$ random variable equals $N_1$. When $N_1>N_{\max} $, we set $\delta^{r,\rho(\sigma)}_{s,a,N_1}=0 $; and when $N_1\leq N_{\max}$, $\delta^{r,\rho(\sigma)}_{s,a,N_1} $ is set as
\begin{align}
    \delta^{r,\rho(\sigma)}_{s,a,N_1}:&=\sup_{\alpha\geq 0}\varbrac{f^{\rho(\sigma)}(\widehat \mu_{s,a,2^{N_1+1}},\alpha )} 
    \nonumber \\& -\frac{1}{2} \sup_{\alpha\geq 0}\varbrac{f^{\rho(\sigma)}(\widehat \mu^E_{s,a,2^{N_1}},\alpha)}
    \nonumber \\&-\frac{1}{2}\sup_{\alpha\geq 0}\varbrac{f^{\rho(\sigma)}(\widehat \mu^O_{s,a,2^{N_1}},\alpha)}. \nonumber
\end{align}
% where $\boldsymbol{id}(\cdot)$ is identity function. 
% $r_{s,a}$ is random variable i.e. $r_{s,a}\sim \mu_{s,a}$ and $\boldsymbol{id}(r_{s,a})=r_{s,a}$ is identity function of random variable.




The estimator for the  worst-case expectation of $Q$ can be similarly constructed  as follows:
\begin{align}
     \widehat{v}^{\rho(\sigma)}(Q(s,a)) :&=V(s'_{s,a,0})+\frac{\delta^{\rho(\sigma)}_{s,a,N_2}(Q) }{P_{N_2}}.\label{eq:hatq}
\end{align}

% $\delta^{Q,\rho(\sigma)}_{s,a,N_2} $ are defined as:
where $V(s)=\max_{a'}Q(s,a') $. 
If $N_2>N_{\max} $, set $\delta^{\rho(\sigma)}_{s,a,N_2}(Q)=0 $. Otherwise when $N_2\leq N_{\max}$, $\delta^{\rho(\sigma)}_{s,a,N_2}(Q) $ is defined as: 
\begin{align}\label{eq:delta}
     \delta^{\rho(\sigma)}_{s,a,N_2}(Q):&=\sup_{\alpha\geq 0}\varbrac{f^{\rho(\sigma)}(\widehat p_{s,a,2^{N_2+1}},\alpha,V)} 
    \nonumber \\& -\frac{1}{2} \sup_{\alpha\geq 0}\varbrac{f^{\rho(\sigma)}(\widehat p^E_{s,a,2^{N_2}},\alpha,V)}
    \nonumber \\&-\frac{1}{2}\sup_{\alpha\geq 0}\varbrac{f^{\rho(\sigma)}(\widehat p^O_{s,a,2^{N_2}},\alpha,V )}.
\end{align}  


% and $\xi_2 2^{N_2+1}+1$ samples 
% $s'_{s,a,j}\sim p_{s,a},j=0,1,...,2^{N_2+1}$.
Combining the two terms above, we obtain the estimated robust Bellman operator through our threshold MLMC framework:
$$\widehat {\mathcal{T}}^{\rho(\sigma)}_{ N_{\max}} (Q)(s,a)=\widehat  r^{\rho(\sigma)}(s,a)+ \gamma \widehat{v}^{\rho(\sigma)}(Q(s,a)). $$


By adapting this estimator to design a robust $Q$-learning algorithm,  we propose our model-free MLMC robust algorithm as in \Cref{alg:example}.

%perform robust synchronous update of $Q$-table shown in Line 17 in \Cref{alg:example}. After $T$ iterations, output robust $Q$-table and robust policy $\pi_T(\cdot)$. 








% Denote by $\hat p_n$ the empirical distribution on $n$ samples following the nominal distribution $p$, i.e. 
% \begin{align}
%     \mathbb E_{\hat p_n} \Fbrac{v(x)}= \frac{1}{n}\sum_{i=0}^{n-1}v(x_i).
% \end{align}

% Furthermore, we denote by $\hat p^O_n $
 % and $\hat p^E_n$ the empirical distribution deduced by the odd and even samples in $\hat p_n$. 
 % Then, we define the estimated robust Bellman operator:
 % \begin{align}
 %     & \widehat {\mathcal{T}}^{\rho(\sigma)} (Q)(s,a)=
 %     \nonumber\\&
 %     \inf_{\mu_{s,a}\in \widehat{\mathcal{R}}^\rho(\sigma) }\mathbb E_{\mu_{s,a} }\Fbrac{r_{s,a}}+ \gamma \inf_{p_{s,a}\in \widehat{\mathcal{P}}^\rho(\sigma) }\mathbb E_{p_{s,a}}\Fbrac{\max_{a'}Q(s',a') },\nonumber
 % \end{align}
 
 

\begin{algorithm}[tb]
   \caption{T-MLMC Robust Q-Learning}
   \label{alg:example}
\begin{algorithmic}
   \STATE {\bfseries Input:} Parameter $g\in(0,1)$, threshold $N_{\max}$, number of iterations $T$, stepsizes $\{\alpha_t\}$, uncertainty level $\sigma$, statistical distance $\rho(\cdot,\cdot)$
   % \REPEAT
   \STATE {\bfseries Initialize: }$\widehat Q^{\rho(\sigma)}_{0}$ 
   % \STATE  $noChange = true$.
   \FOR{$t=0$ {\bfseries to} $T-1$}
   \FOR{ every $s\in \mathcal{S}$}
   \STATE Set $\widehat V^{\rho(\sigma)}_{t}(s)= \max_{a}\widehat 
   Q^{\rho(\sigma)}_{t}(s,a) $
   \STATE Set $\pi_t(s)= \arg\max_a \widehat 
   Q^{\rho(\sigma)}_{t}(s,a) $
   \ENDFOR
   \FOR{every $(s,a)\in \mathcal{S}\times\mathcal{A}$}
   \STATE Independently sample $N_1,N_2\sim \text{GEO}(g)$
   \STATE Compute total sample sizes:
   \STATE $\quad\mathcal{N}_1=1+ 2^{N_1+1}\mathbf{1}_{(N_1\leq N_{\max})} $
   \STATE $ \quad \mathcal{N}_2=1+ 2^{N_2+1}\mathbf{1}_{(N_2\leq N_{\max})} $
   \STATE Independently draw $\mathcal{N}_1$ samples  $ r_{s,a,i}\sim \mu_{s,a} $
   \STATE Compute $\widehat r^{\rho(\sigma)}(s,a) $ by \Cref{eq:hatr}
   \STATE Independently draw $\mathcal{N}_2$ samples  $s'_{s,a,i}\sim p_{s,a} $
   \STATE Compute $\widehat v^{\rho(\sigma)}(\widehat Q^{\rho(\sigma)}_{t}(s,a)) $ by \Cref{eq:hatq}
   \STATE Update synchronous $Q$-table:
   $\widehat Q^{\rho(\sigma)}_{t+1}(s,a)= (1-\alpha_t) \widehat Q^{\rho(\sigma)}_{t}(s,a)+\alpha_t \widehat {\mathcal{T}}^{\rho(\sigma)}_{ N_{\max}}(\widehat Q^{\rho(\sigma)}_{t})(s,a) $
   \ENDFOR
   % \IF{$x_i > x_{i+1}$}
   % \ENDIF
   \ENDFOR
   % \UNTIL{$noChange$ is $true$}
   \STATE {\bfseries Output:} $\widehat Q^{\rho(\sigma)}_{T}$
\end{algorithmic}
\end{algorithm}

Note that due to the threshold $N_{\max}$, the resulting MLMC estimator becomes biased. However, as we will show in the next section, the bias can be bounded and inversely depends on $N_{\max}$. That is,  with the increase of $N_{\max}$, the bias term tends to $0$, and we recover the MLMC estimator from the threshold MLMC estimator. By carefully designing the threshold $N_{\max}$, our T-MLMC Q-learning converges to the optimal robust policy. 



\section{Sample Complexity}
In this section, we present the sample complexity results of our T-MLMC algorithms under different uncertainty sets. 

As discussed above, the estimator $\widehat {\mathcal{T}}^{\rho(\sigma)}_{ N_{\max}}$ we constructed is a biased estimation of the robust Bellman operator. However, we show that the operator reduces to vanilla MLMC estimator and the bias diminishes if $N_{\max}\to\infty$, and hence can be controlled by setting a larger threshold. On the other hand, we also show the sample complexity of our T-MLMC algorithm increases when $N_{\max}$ becomes larger. To balance the trade-off between the bias and sample complexity, we choose a suitable value of $N_{\max}$ and present our complexity results. 
We first bound the bias and variance of our T-MLMC estimator in the following results.  
\begin{theorem}\label{thm:tv1}
    For any fixed $Q\in \mathbb R^{|\mathcal{S}||\mathcal{A}|}, s\in \mathcal{S}, a\in \mathcal{A} $, for TV distance and $\chi^2$ distance with uncertainty level $\sigma$, the estimation bias can be bounded as:
    \begin{align}
        &\lbrac{\mathbb E\Fbrac{\widehat {\mathcal{T}}_{N_{\max}}^{\rho(\sigma)} (Q)(s,a) } - {\mathcal{T}}^{\rho(\sigma)} (Q)(s,a)}\\
        &\leq \mathcal{O}\brac{\frac{N_{\max}}{1-\gamma}2^{-\frac{N_{\max}}{2}} };\nonumber
        % \brac{r_{\max}+\frac{r_{\max}}{1-\gamma}}
    \end{align}
    And the variance can be bounded as:
    \begin{align}
        \text{Var}\brac{\widehat {\mathcal{T}}^{\rho(\sigma)}_{N_{\max}} (Q)(s,a)  }\leq \mathcal{O}\brac{\frac{N_{\max}}{1-\gamma}}.
    \end{align}
    For the KL divergence, we have 
    \eqenv{
    &\lbrac{\mathbb E\Fbrac{\widehat {\mathcal{T}}_{N_{\max}}^{\rho(\sigma)} (Q)(s,a) } - {\mathcal{T}}^{\rho(\sigma)} (Q)(s,a)}\\
        &\leq \mathcal{O}\brac{\frac{N_{\max}}{(1-\gamma)\sigma p_{\wedge}}2^{-\frac{N_{\max}}{2}} };
    } and 
    \eqenv{
    \text{Var}\brac{\widehat {\mathcal{T}}^{\rho(\sigma)}_{N_{\max}} (Q)(s,a)  }\leq \mathcal{O}\brac{\frac{N_{\max}}{(1-\gamma)\sigma^2 p^2_{\wedge}}}
    }
\end{theorem}

%We then derive the sample complexity for our T-MLMC algorithm, and 

%  Next, we first introduce some required proposition for further analysis. 

% \begin{proposition}\label{prop:constract}
%     Given statistical distance $\rho(\cdot,\cdot)$ and uncertainty level $\sigma$, robust Bellman operator $ \mathcal{T}^{\rho(\sigma)}$is $\gamma$-\textit{contraction} w.r.t. the infinity norm.
% \end{proposition}


% \begin{definition}
%     Given the empirical frequency with $n$ samples, i.e.  $x_i, i=0,1,...,n$, 
% \end{definition}

\subsection{Total Variation distance}
In this part, we provide the sample complexity analysis for the total variation uncertainty set.



Utilizing results in \cref{thm:tv1}, we obtain the following sample complexity of our MLMC algorithm under the TV uncertainty set. 
\begin{theorem}[Sample Complexity with TV Distance]\label{thm2:tv}
 Set $g=\frac{1}{2}$, $N_{\max}=\frac{\log T}{\log 2}$ and set the stepsize as $\alpha_t=\alpha=\frac{\log T}{(1-\gamma)T}. $
Then the output from \cref{alg:example} satisfies that:
 \begin{align}
    \mathbb E &\Fbrac{\mynorm{\widehat Q_T^{\rho_{TV}(\sigma)}-Q^{*\rho_{TV}(\sigma)}}_\infty^2}\nonumber
    %\\& \leq \frac{c_0}{(1-\gamma)^2}\brac{1-\frac{\alpha(1-\gamma)}{2}}^T+ c_1 \alpha\frac{\log (|\mathcal{S}||\mathcal{A}|)}{ (1-\gamma)^4}\nonumber
   \leq\mathcal{O}\brac{\frac{1}{  (1-\gamma)^5 T}}.
\end{align}
To obtain an $\epsilon$-optimal policy, i.e., 
\begin{align}
    \mathbb E \Fbrac{\mynorm{\widehat Q_T^{\rho_{TV}(\sigma)}-Q^{*,\rho_{TV}(\sigma)}}_\infty^2}\leq \epsilon^2,\nonumber
\end{align}
the expected sample complexity $N^{\rho_{TV}(\sigma)}(\epsilon)$ is
 \begin{align}
     N^{\rho_{TV}(\sigma)}(\epsilon) = |\mathcal{S}||\mathcal{A}|N_{\max} T\leq \mathcal{O}\brac{\frac{|\mathcal{S}||\mathcal{A}|}{ (1-\gamma)^5 \epsilon^2 }}.\nonumber
 \end{align}
\end{theorem}

Our result presents the first finite sample complexity for a model-free DR-RL algorithm under the total variation uncertainty set, indicating the effectiveness and efficiency of our T-MLMC algorithm. Compared to model-based DR-RL algorithms \cite{yang2022toward, panaganti2022sample, shi2023curious,clavier2023towards}, our model-free algorithm has a higher sample complexity of order $\mathcal{O}((1-\gamma)^{-1})$, which is also the case in the non-robust setting \cite{li2020sample}. We also note that for  model-free algorithms or general stochastic approximation algorithms \cite{li2020sample,li2021q}, the tightest dependence on $(1-\gamma)$ is also $\mathcal{O}((1-\gamma)^{-5})$, which implies the tightness of our complexity result. 

%Generally, the model-free algorithm has larger sample size but exhibits superior performance with wider application range.

\subsection{Chi-square Divergence}
We then present our results for robust RL with Chi-square divergence uncertainty set. 
%In this part, we provide the statistical properties and sample complexity analysis in the case where the uncertainty set is constrained by $\chi^2$ distance. 

% \begin{proposition}[Lagrange Multiplier and Worst-case Distribution]
%     Set $\rho(\cdot,\cdot)=\rho_{\chi^2}(\cdot,\cdot)$, 
% \end{proposition}

\begin{theorem}[Sample Complexity with $\chi^2$ Distance]\label{thm:chi2}
 Set $N_{\max}=\frac{2\log T}{\log 2}$ and the stepsize as $\alpha_t=\alpha=\frac{\log T}{(1-\gamma)T}$. Then the output of \cref{alg:example} satisfies that:
 \begin{align}
    \mathbb E &\Fbrac{\mynorm{\widehat Q_T^{\rho_{\chi^2}(\sigma)}-Q^{*\rho_{\chi^2}(\sigma)}}_\infty^2}\nonumber
   %\\& \leq \frac{c_0}{(1-\gamma)^2}\brac{1-\frac{\alpha(1-\gamma)}{2}}^T+ c_1 \alpha\frac{\log (|\mathcal{S}||\mathcal{A}|)}{ (1-\gamma)^4}\nonumber
     \leq\mathcal{O}\brac{\frac{1}{  (1-\gamma)^5 T}}.
\end{align}
To ensure
\begin{align}
    \mathbb E \Fbrac{\mynorm{\widehat Q_T^{\rho_{\chi^2}(\sigma)}-Q^{*\rho_{\chi^2}(\sigma)}}_\infty^2}\leq \epsilon^2,\nonumber
\end{align}
the expected total sample complexity $N^{\rho_{\chi^2}(\sigma)}(\epsilon)$ is,
 \begin{align}
     N^{\rho_{\chi^2}(\sigma)}(\epsilon) = |\mathcal{S}||\mathcal{A}|N_{\max} T\leq \mathcal{O}\brac{\frac{|\mathcal{S}||\mathcal{A}|}{  (1-\gamma)^5 \epsilon^2 }}.\nonumber
 \end{align}
\end{theorem}
Our result implies that our T-MLMC algorithm is the first model-free algorithm for DR-RL under the Chi-square divergence uncertainty set. Compared to the model-based methods, our complexity presents an additional $\mathcal{O}((1-\gamma)^{-1})$-order dependence, which is common in model-free algorithms. 

%When compared to model-based DR-RL algorithms cited in \cite{yang2022toward, panaganti2022sample, shi2023curious}, our model-free algorithm has a higher sample complexity, exceeding the optimal result with $\mathcal{O}((1-\gamma)^{-1})$. Generally, the model-free algorithm has larger sample size but exhibits superior performance with wider application range.
% Our work provide the sample complexity of model-free DR-RL algorithm firstly for $\chi^2$ uncertainty set (see details in \Cref{table:2}). Compared with the model-based DR-RL algorithms \cite{yang2022toward,panaganti2022sample,shi2023curious}, the sample complexity of our model-free algorithm is larger than the best result with $\mathcal{O}((1-\gamma)^{-1})$. In general, the model-free algorithm has larger sample size but better testing performance.
\subsection{KL Divergence}
We then present our results for KL divergence uncertainty set in this section. 
%In this part, we provide the statistical properties and sample complexity analysis in the case where the uncertainty set is constrained by KL distance. 

% \begin{proposition}[Lagrange Multiplier and Worst-case Distribution]
%     Set $\rho(\cdot,\cdot)=\rho_{KL}(\cdot,\cdot)$, 
% \end{proposition}


\begin{theorem}[Sample Complexity with KL Distance]\label{thm:kl}
If we set $g=\frac{1}{2}$, threshold
$$ N_{\max}=\max\varbrac{\frac{2\log T}{\log 2},\frac{\log(1+p^2_\wedge\log(2|\mathcal{S}|)\log T )}{\log 2}} ,$$
 and the stepsize as $\alpha_t=\alpha=\frac{\log T}{(1-\gamma)T}$, then the output of \cref{alg:example} satisfies that:
 \begin{align}
    \mathbb E &\Fbrac{\mynorm{\widehat Q_T^{\rho_{KL}(\sigma)}-Q^{*\rho_{KL}(\sigma)}}_\infty^2}\nonumber
   % \\& \leq \frac{c_0}{(1-\gamma)^2}\brac{1-\frac{\alpha(1-\gamma)}{2}}^T+ c_1 \alpha\frac{\log (|\mathcal{S}||\mathcal{A}|)}{p_\wedge^2 (1-\gamma)^4}\nonumber
    \leq\mathcal{O}\brac{\frac{1}{ p_\wedge^2 (1-\gamma)^5 T}}.
\end{align}
To ensure 
\begin{align}
    \mathbb E \Fbrac{\mynorm{\widehat Q_T^{\rho_{KL}(\sigma)}-Q^{*\rho_{KL}(\sigma)}}_\infty^2}\leq \epsilon^2,\nonumber
\end{align}
the expected total sample complexity $N^{\rho_{KL}(\sigma)}(\epsilon)$ is
 \begin{align}
     N^{\rho_{KL}(\sigma)}(\epsilon) = |\mathcal{S}||\mathcal{A}|N_{\max} T\leq \mathcal{O}\brac{\frac{|\mathcal{S}||\mathcal{A}|}{ p_\wedge^2 (1-\gamma)^5 \epsilon^2 }}.\nonumber
 \end{align}
\end{theorem}
Our result implies that our T-MLMC algorithm also solves the DR-RL problem for KL-divergence uncertainty sets, with a small sample complexity. Compared to other model-based methods, our result is $\mathcal{O}((1-\gamma)^{-1})$-order larger, but matches in other parameters. We also note that there are several previous works on the sample complexity of model-free DR-RL approaches for the KL-divergence model \cite{wang2023sample,wang2023finite}, and we provide a discussion on the comparison of their works with ours. 


In both previous works, an assumption is made regarding the size of the uncertainty level $\sigma$, specifically, $p_\wedge\geq \mathcal{O}\left( 1-e^{-\sigma}\right)$ assuming the uncertainty set cannot be too large. This assumption significantly limits the applicability of their results, as in many scenarios, the uncertainty set must be designed relatively large to encompass a broader range of environments, particularly when the nominal environment is a low-fidelity model of the true environment. In contrast, our approach does not rely on such an assumption and can be applied to any uncertainty set.

On the other hand, our sample complexity result surpasses those in \cite{wang2023finite} and the initial complexity in \cite{wang2023sample}. In \cite{wang2023finite}, the sample complexity of the vanilla MLMC DR-RL algorithm is $\mathcal{O}\left(\frac{|\mathcal{S}||\mathcal{A}|}{ p_\wedge^6 (1-\gamma)^5 \epsilon^2 }\right)$. Our result improves upon this by $\mathcal{O}(p_\wedge^{-4})$. In \cite{wang2023sample}, a mini-batch model-free DR-RL algorithm is introduced, with a demonstrated sample complexity of $\mathcal{O}\left(\frac{|\mathcal{S}||\mathcal{A}| }{p_\wedge^3 (1-\gamma)^5 \epsilon^2} \right)$. Further enhancement is achieved through the use of the variance reduction (VR) technique, bringing the complexity down to $\mathcal{O}\left(\frac{|\mathcal{S}||\mathcal{A}| }{p_\wedge^3 (1-\gamma)^4 \epsilon^2} \right)$. Notably, our result outperforms their initial vanilla algorithm by an order of $\mathcal{O}(p_\wedge^{-1})$. While the complexity with the VR technique in \cite{wang2023sample} exhibits a superior dependence on $1-\gamma$, it fares worse concerning $p_\wedge$. This enhancement in $1-\gamma$ can be attributed to the utilization of the VR technique, consistent with previous findings \cite{li2020sample}. We anticipate further improvement in our complexity results through the application of the VR technique, a direction left for future investigation. Consequently, our algorithm achieves superior sample complexity compared to previous vanilla model-free algorithms and is anticipated to surpass the results in \cite{wang2023sample} once combined with the VR technique.





%Compared previous model-free algorithm, our algorithm can be applied without the limitation of uncertainty level.


%Our algorithm reach $\epsilon$-accurate optimal robust policy with sample complexity $\mathcal{O}\brac{\frac{|\mathcal{S}||\mathcal{A}|}{ p_\wedge^2 (1-\gamma)^5 \epsilon^2 }}$. The sample complexity analysis of model-free DR-RL algorithm is provided in \cite{wang2023finite} with order $\mathcal{O}\brac{\frac{|\mathcal{S}||\mathcal{A}|}{ p_\wedge^6 (1-\gamma)^5 \epsilon^2 }}$. However, this sample size is built on the assumption that   When compared to model-based DR-RL algorithms cited in \cite{yang2022toward, panaganti2022sample}, our model-free algorithm has a higher sample complexity, exceeding the optimal result with $\mathcal{O}((1-\gamma)^{-1})$. Generally, the model-free algorithm has larger sample size but exhibits superior performance with wider application range.
\section{Proof Sketch}
In this section, we briefly discuss the proof sketch for our results under the TV uncertainty set. The proofs for the other two uncertainty sets can be similarly derived. For convenience, we only present the proof regarding the uncertainty set on the transition kernels; the part on the reward uncertainty can be obtained similarly. 



Our proof can be divided into two main parts. We first conduct a sample complexity analysis to establish the convergence of \Cref{alg:example} to the fixed point of the T-MLMC estimator; %This involves determining the number of samples required for the algorithm to converge to a stable solution within an acceptable error margin.
we then characterize the disparity between this fixed point and the optimal robust value function. Combining the two parts, we quantify the sample complexity of our T-MLMC algorithm converging to a close approximation of the optimal robust value function. Specifically, we decompose the error as
\begin{align}\label{eqeq20}
    {\mynorm{\widehat Q_T^{\rho(\sigma)}-Q^{*\rho(\sigma)}}_\infty^2}&
  \leq 2 {\mynorm{\widehat Q_T^{\rho(\sigma)}-\widehat Q^{*\rho(\sigma)}}_\infty^2 }\nonumber\\\quad+ 2&{\mynorm{\widehat Q^{*\rho(\sigma)}-Q^{*\rho(\sigma)}}_\infty^2 },
\end{align}
where $\widehat Q^{*\rho(\sigma)}$ denotes the fixed point of the expected T-MLMC estimator (whose existence is proved in the Appendix). The two steps correspond to the two terms in \eqref{eqeq20}. 

%And to balance the complexity and approximation error, we design a suitable threshold and derive the final complexity result. 

%This step entails quantifying the deviation between the solution obtained by our algorithm and the optimal robust value function, providing insights into the algorithm  effectiveness and accuracy.




For the first term in \eqref{eqeq20}, we study the mean and variance of the T-MLMC estimator we constructed. Using the concrete construction of T-MLMC: 
\begin{align}
     \widehat{v}^{\rho(\sigma)}(Q(s,a)) :&=V(s'_{s,a,0})+\frac{\delta^{\rho(\sigma)}_{s,a,N_2}(Q) }{P_{N_2}}
\end{align}
and the definition of $\delta^{\rho(\sigma)}_{s,a,N_2}$, we directly calculate its expectation and variance, and show that the bias of the T-MLMC estimator is of order $\mathcal{O}(\sqrt{2^{-N_{\max}}})$ and the variance is at most $\mathcal{O}(N_{\max})$, as in \Cref{thm:tv1}. According to the stochastic approximation method \cite{borkar2009stochastic}, \Cref{alg:example} converges to the fixed point $ \widehat Q^{*\rho(\sigma)}$ of the T-MLMC operator. We then adapt the analysis of stochastic approximation in \cite{chen2022finite} to obtain the sample complexity of the convergence of \Cref{alg:example}. 

For the second term in \eqref{eqeq20}, the approximation error between $ \widehat Q^{*\rho(\sigma)}$ and the optimal robust value function can be bounded by considering the disparity between the robust Bellman operator and our T-MLMC operator:
\begin{align}
    &\left\|\widehat Q^{*\rho(\sigma)}- Q^{*\rho(\sigma)}\right\|\nonumber\\&\leq \frac{1}{1-\gamma} \left\|\hatT_{N_{\max}}\brac{ Q^{*\rho(\sigma)}}-\boldsymbol{\mathcal{T}}^{\rho(\sigma)}\brac{Q^{*\rho(\sigma)}}\right\|, 
\end{align}
where $\hatT_{N_{\max}}$ denotes the expectation of the T-MLMC estimator: $\hatT_{N_{\max}}(Q)=\mathbb{E}[\widehat {\mathcal{T}}^{\rho(\sigma)}_{ N_{\max}}(Q)]$ w.r.t. the randomness in the samples. To bound the difference between the two operators, we note that when the threshold is not met, the T-MLMC operator is an unbiased estimator of $\boldsymbol{\mathcal{T}}^{\rho(\sigma)}$, in which case we bound the error using concentration inequalities. On the other hand, we can set the threshold $N_{\max}$ large enough that the probability of $\text{GEO}(g)> N_{\max}$ is small, resulting in a smaller error bound due to its low probability. Combining the two cases implies a tight bound on the difference between the two operators, and further quantifies the approximation error introduced by our T-MLMC design. 

Finally, combining the two parts together, we derive the sample complexity for \Cref{alg:example} to converge to a close approximation of the optimal robust value function. By setting the value of the threshold, we hence obtain the final sample complexity result. 

  

 

\section{Conclusion}
In this paper, we introduce a novel model-free threshold MLMC algorithm tailored for finding the optimal robust policy in the DR-RL problem. Our algorithm strikes a delicate balance between convergence guarantees and the expected total sample size, ensuring convergence within a finite sample size. We further conduct sample complexity analyses for our algorithm under three distinct uncertainty sets: total variation, Chi-square divergence, and KL-divergence. Notably, our results mark the first complexity analyses for model-free DR-RL methods under the total variation and Chi-square divergence uncertainty sets, while also enhancing the complexity bounds and applicability of prior results for the KL divergence model. Our results achieve the tightest complexity bounds in the realm of model-free DR-RL methods, achieving state-of-the-art results under minimal assumptions.

%We firstly provide the model-based sample complexity analysis specifically for the TV and $\chi^2$-constrained uncertainty set. We also provide the model-based sample complexity analysis specifically for the KL constrained uncertainty set without the extra assumption on uncertainty level. 





\nocite{langley00}

\bibliography{example_paper}
\bibliographystyle{icml2024}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% APPENDIX
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newpage
\appendix
\onecolumn




\section{Notations and Lemmas}
In this section, we present the necessary notations and lemmas which are used frequently throughout the proof process. 

Recall that for reward uncertainty set, we have
\eqenv{
&f^{\rho_{TV}(\sigma)}(\mu_{s,a}, \alpha,r_{s,a})=\mathbb E_{\mu_{s,a}}\Fbrac{(r_{s,a})_\alpha}-\frac{{\sigma}}{2}\brac{\alpha-\min r_{s,a}} ;\nonumber
        \\&f^{\rho_{\chi^2}(\sigma)}(\mu_{s,a}, \alpha,r_{s,a})=\mathbb E_{\mu_{s,a}}\Fbrac{(r_{s,a})_\alpha}-\sqrt{\sigma \textbf{Var}_{\mu_{s,a}}\Fbrac{ (r_{s,a})_\alpha}  } ;
        \nonumber\\&
        f^{\rho_{KL}(\sigma)}(\mu_{s,a}, \alpha,r_{s,a})=-\alpha \log\brac{\mathbb E_{\mu_{s,a}} \Fbrac{\exp\brac{-\frac{r_{s,a}}{\alpha}} } }-\alpha \sigma .
}
For transition  kernel uncertainty set, we have
\eqenv{
&f^{\rho_{TV}(\sigma)}(p_{s,a}, \alpha,V(s'_{s,a}))=\mathbb E_{p_{s,a}}\Fbrac{(V(s'_{s,a}))_\alpha}-\frac{{\sigma}}{2}\brac{\alpha-\min_{s'_{s,a}} V(s'_{s,a})} ;\nonumber
        \\&f^{\rho_{\chi^2}(\sigma)}(p_{s,a}, \alpha,V(s'_{s,a}))=\mathbb E_{p_{s,a}}\Fbrac{(V(s'_{s,a}))_\alpha}-\sqrt{\sigma \textbf{Var}_{p_{s,a}}\Fbrac{ (V(s'_{s,a}))_\alpha}  } ;
        \nonumber\\&
        f^{\rho_{KL}(\sigma)}(p_{s,a}, \alpha,V(s'_{s,a}))=-\alpha \log\brac{\mathbb E_{p_{s,a}} \Fbrac{\exp\brac{-\frac{V(s'_{s,a})}{\alpha}} } }-\alpha \sigma .
}

% Here we recall the general model to present the duality of DRO problems: random variable $x\in\mathcal{X}$ following the distribution $x\sim p$, a function $v(\cdot): \mathcal{X}\to \mathbb R^+$.($x$ resp. $s$ or $r_{s,a}$, $p$ resp. $\mu$, $v(\cdot)  :=\max_a Q(\cdot,a) $ or resp. identity function). 

% % Then, let $\varbrac{v(x_k):0\leq k\leq |\mathcal{X}|-1}$ donate the values $\varbrac{v(x):x\in \mathcal{X}}$ arranged in increasing order. Recall that 
% \begin{align}
%     (v(x))_\alpha= \begin{cases} v(x) & v(x)\leq \alpha ,\\
%                      \alpha &  v(x)>\alpha.
%        \end{cases}
% \end{align}
% Then, we define $a_m=\sum_{i\leq m}p(x_i)v(x_i)$, $b_m=\sum_{i>m}p(x_i)$,$c_m=\sum_{i\leq m}p(x_i)v^2(x_i)$. 
% Thus, if $\alpha\in [v(x_n),v(x_{n+1}]$, we can get that:
% \eqenv{
% &\E_p[(v(x))_{\alpha^*}]=a_n+b_n\alpha,\\
%     &\text{Var}_p[(v(x))_{\alpha^*}]= c_n+b_n\alpha^2-\brac{a_n+b_n\alpha}^2. 
% }

% Given the statistical distance $\rho(\cdot,\cdot)$ and corresponding duality problems, we analyze the properties of Lagrange multiplier $\alpha$, which can decide the worst-case distribution.
% % For example, given uncertainty level $\sigma$, nominal distribution $p$ and function $v(x)$, i.e.$ x\sim p(\cdot)$. Define $q^*= \arg\min_q \mathbb E_q[v(x)] $, $s.t. $ $\rho_{TV}(q,p)\leq \sigma $.

% \begin{definition}[Lagrange Multiplier]\label{def:a.1}
%     Denote by $\boldsymbol{\alpha^*}^{\rho(\sigma)}$ the saddle point Lagrange multiplier, i.e.
%     \begin{align}
%         \boldsymbol{\alpha^*}^{\rho(\sigma)}(p, v(x))\in \arg\max_{\alpha} f^{\rho(\sigma)}(p, \alpha, v(x)),\nonumber
%     \end{align}
%     and $\boldsymbol{q^*}^{\rho(\sigma)}$(resp. $\boldsymbol{\nu^*}^{\rho(\sigma)}$) the worst-case distribution within uncertainty set,
%     \begin{align}
%         \boldsymbol{q^*}^{\rho(\sigma)}(p, v(x))\in \arg \max_q \mathbb E_q [v(x)], s.t. \rho(q,p )\leq \sigma.\nonumber
%     \end{align}
% \end{definition}
% \begin{remark}
%     The saddle point Lagrange multiplier $\boldsymbol{\alpha^*}^{\rho(\sigma)}$, nominal distribution $p$ and function $v(x)$ decide the worst-case distribution. 
% \end{remark}

% Then, we present the worst-case distribution corresponding to different uncertainty sets.

% % \begin{lemma}[TV-constrained uncertainty set]\label{lm:qtv}
% %     The value of the optimization problem: 
% %     \begin{align}
% %         & minimize \quad \mathbb E_q[v(x)]
%         \nonumber\\&
%         subject \quad to \quad q\in \varbrac{\rho_{TV} 
%         \brac{q,p}\leq \sigma, q\in \Delta(\mathcal{X}) },\nonumber
%         \end{align}
%     The worst-case distribution $q^*= \arg\min_{q}\mathbb E_q[v(x)]$, satisfies that:
%     \eqenv{
%     q^*(x)= 
%     \begin{cases} p(x)+\frac{\sigma}{2} & x_0 ,\\
%                      p(x) &  v(x)<\alpha^*,\\
%                       \sum_{v(x')\leq \alpha^*} p(x')-\frac{\sigma}{2} &  v(x)=\alpha^*,\\
%                      0 & v(x)>\alpha^*,
%        \end{cases}
%     }where $\alpha^*= \min_{i}\varbrac{v(x_i):\sum_{j>i} p(x_j)< \frac{\sigma}{2} }$. 
% % \end{lemma}


% \begin{lemma}[$\chi^2$-constrained uncertainty set]
%     The value of the optimization problem: 
%     \begin{align}
%         & minimize \quad \mathbb E_q[v(x)]
%         \nonumber\\&
%         subject \quad to \quad q\in \varbrac{\rho_{\chi^2} 
%         \brac{q,p}\leq \sigma, q\in \Delta(\mathcal{X}) },\nonumber
%         \end{align}
%     The worst-case distribution $q^*= \arg\min_{q}\mathbb E_q[v(x)]$, satisfies that:
%     \eqenv{
%     q^*(x)= p(x) \brac{1-\sqrt{\sigma}\frac{(v(x))_{\alpha^*}-\mathbb E \Fbrac{(v(x))_{\alpha^*}}}{\text{Var}_p[(v(x))_{\alpha^*}]}},
%     }where $\alpha^*\in  \varbrac{\alpha'=\frac{a_m}{1-b_m}+ \frac{1}{1-b_m}\sqrt{\frac{c_m(1-b_m)-a_m^2}{\sigma(1-b_m)-b_m } },\alpha'\in[v(x_m),v(x_{m+1})] }$. 
% \end{lemma}

% \begin{lemma}[KL-constrained uncertainty set]
%     The value of the optimization problem: 
%     \begin{align}
%         & minimize \quad \mathbb E_q[v(x)]
%         \nonumber\\&
%         subject \quad to \quad q\in \varbrac{\rho_{KL} 
%         \brac{q,p}\leq \sigma, q\in \Delta(\mathcal{X}) },\nonumber
%         \end{align}
%     The worst-case distribution $q^*= \arg\min_{q}\mathbb E_q[v(x)]$, satisfies that:
%     \eqenv{
%     q^*(x)= \frac{p(x)\exp\brac{-\frac{v(x)}{\alpha^*} }}{\E_p \Fbrac{\exp\brac{-\frac{v(x)}{\alpha^*} } } }
%     }where $\alpha^*= \arg \max_{\alpha\geq0} \varbrac{\alpha \log\brac{\mathbb E_{p} \brac{exp\brac{\frac{v(x)}{\alpha}} } }-\alpha \sigma }$. 
% \end{lemma}
% Above all, we present the detailed  worst-case distribution for different uncertainty sets. 

We now present the proofs of the propositions and theorems for the threshold MLMC algorithm. To simplify the presentation, we only provide the analysis for the transition kernel uncertainty set, which extends easily to the reward uncertainty set.

Firstly, to define the surrogate $Q$-table $ \widehat Q^{*\rho(\sigma)}$, we define the expected biased estimation of dual value as follows:
\begin{definition}[Biased estimation]\label{def:3.1}
    Draw $n$ samples from the nominal distribution, $s'_{s,a,i}\sim p_{s,a}$, $i=0,1,\dots, n-1$, and get the empirical frequency $\widehat p_{s,a,n} $. We define (resp. $\mu_{s,a},\hat \mu_{s,a,n}$)
    \begin{align}
        f^{*\rho(\sigma)}(\hat p_{s,a,n},V(s'_{s,a})):=\sup_{\alpha\geq 0} \varbrac{f^{\rho(\sigma)}(\hat p_{s,a,n}, \alpha, V(s'_{s,a})) }.\nonumber
    \end{align} 
    % \begin{align}
    %     \boldsymbol{\bar f^*_n}^{\rho(\sigma)}\brac{p_{s,a},V(s'_{s,a})}:=\mathbb E_{\hat{p}_{s,a,n}}\Fbrac{f^{*\rho(\sigma)}(\hat p_{s,a,n},V(s'_{s,a}))}.\nonumber
    % \end{align}
\end{definition}
The estimation of the robust Bellman operator is biased, and the bias depends on the sample size $n$ of the empirical distribution. The bias is given by
\begin{align}
    \lbrac{\E\Fbrac{f^{*\rho(\sigma)}(\hat p_{s,a,n},V(s'_{s,a}))}- f^{*\rho(\sigma)}\brac{ p_{s,a}, V(s'_{s,a})}}.\nonumber
\end{align}

  

Under the model-free setting, the estimate of the robust Bellman operator is biased, and the bias depends on the sample size of the empirical distribution. 
We prove that, when the threshold $N_{\max}$ is applied in our algorithm, the bias of the robust Bellman estimator equals the bias of the model-based estimator with sample size $2^{N_{\max}+1}$. 
We state this formally in the following proposition. 
\begin{proposition}[Threshold MLMC]\label{prop:mlmc}
The robust Bellman estimator $\widehat{v}^{\rho(\sigma)}(Q(s,a))$ (resp. $\widehat r^{\rho(\sigma)}(s,a) $) satisfies that 
\begin{align}
    \E\Fbrac{\widehat{v}^{\rho(\sigma)}(Q(s,a)) }&=\mathbb E \Fbrac{V(s'_{s,a,0})+\frac{\delta^{\rho(\sigma)}_{s,a,N_2}(Q) }{P_{N_2}} \nonumber}\\&=\E\Fbrac{f^{*\rho(\sigma)}(\hat p_{s,a,2^{N_{\max}+1}},V(s'_{s,a}))}
    % \boldsymbol{\bar f^*}_{2^{N_{\max}+1}}^{\rho(\sigma)}\brac{p_{s,a},V(s'_{s,a})  }
    .\nonumber
\end{align}
\end{proposition}
\Cref{prop:mlmc} shows that the estimation bias of the $N_{\max}$-threshold MLMC estimator of the dual value equals the bias incurred when estimating the dual value directly from $2^{N_{\max}+1}$ samples. 

% When sampling $n$
Based on \cref{prop:mlmc}, for distance $\rho$ and uncertainty level $\sigma$, we define the surrogate $Q$-table $ \widehat Q^{*\rho(\sigma)}$ and the robust optimal $Q$-table $Q^{*\rho(\sigma)}$ as the solutions of
% estimation of estimated robust Bellman operator and robust Bellman operator following
\begin{align}
    \E\Fbrac{ {\widehat {\mathcal{T}}}_{N_{\max}}^{\rho(\sigma)} (\widehat Q^{*\rho(\sigma)})(s,a) }&= \widehat Q^{*\rho(\sigma)}(s,a),
    \nonumber
% \\& =\boldsymbol{\bar f^*}^{\rho(\sigma)}\brac{\hat \mu_{s,a,2^{N_{\max}+1}},\boldsymbol{id}(r_{s,a})  }\nonumber\\& \quad+\gamma\boldsymbol{\bar f^*}^{\rho(\sigma)}\brac{ \hat p_{s,a,2^{N_{\max}+1}}, V(s'_{s,a})},\nonumber
\\ { {\mathcal{T}}}^{\rho(\sigma)} (Q^{*\rho(\sigma)})(s,a)&
=Q^{*\rho(\sigma)}(s,a)
  % =\boldsymbol{\bar f^*}^{\rho(\sigma)}\brac{ \mu_{s,a},\boldsymbol{id}(r_{s,a})  }\nonumber\\& \quad+\gamma\boldsymbol{\bar f^*}^{\rho(\sigma)}\brac{  p_{s,a}, V(s'_{s,a})}
  .\label{eq:fixp}
\end{align}

We do error decomposition by the surrogate $Q$-table  $ \widehat Q^{*\rho(\sigma)}$: 
\begin{align}\label{eq:app-decomp}
    {\mynorm{\widehat Q_T^{\rho(\sigma)}-Q^{*\rho(\sigma)}}_\infty^2}&
  \leq 2 {\mynorm{\widehat Q_T^{\rho(\sigma)}-\widehat Q^{*\rho(\sigma)}}_\infty^2 }\nonumber\\\quad+ 2&{\mynorm{\widehat Q^{*\rho(\sigma)}-Q^{*\rho(\sigma)}}_\infty^2 }.
\end{align}

\textbf{Second term in \cref{eq:app-decomp}: }
The second term in \cref{eq:app-decomp} is the gap between the fixed points in \cref{eq:fixp}, 
% Given $Q(s,a)$, the gap between the robust Bellman operator $\boldsymbol{\widehat {\mathcal{T}}}^{\rho(\sigma)} (Q)(s,a)$ and estimated robust Bellman operator $\boldsymbol{ {\mathcal{T}}}^{\rho(\sigma)} (Q)(s,a)$ 
which can be bounded following the methods in model-based works \cite{shi2023curious,yang2022toward} combined with \cref{prop:mlmc}. 
% The surrogate biased estimated optimal $Q$-table  $ \widehat Q^{*\rho(\sigma)}$ satisfies the following equation:
% \eqenv{\label{eqeq23}
% \widehat Q^{*\rho(\sigma)}(s,a)=  \boldsymbol{\widehat {\mathcal{T}}}^{\rho(\sigma)} (\widehat Q^{*\rho(\sigma)})(s,a).
% }
% Combined with the robust Bellman equation, 
% \begin{align}
%       Q^{*\rho(\sigma)}(s,a)=  \boldsymbol{ {\mathcal{T}}}^{\rho(\sigma)} (  Q^{*\rho(\sigma)})(s,a),
% \end{align}
% we can make error decomposition as following 
% % $ {\mynorm{\widehat Q_T^{\rho(\sigma)}-\widehat Q^{*\rho(\sigma)}}_\infty^2 }$.
% \eqenv{
% &{\mynorm{\widehat Q^{*\rho(\sigma)}-Q^{*\rho(\sigma)}}_\infty^2 }\\& \leq
% \norminf{\hatT\brac{\widehat Q^{*\rho(\sigma)}}-\boldsymbol{\widehat{\mathcal{T}}}^{\rho(\sigma)}\brac{Q^{*\rho(\sigma)}} }
% \\& \quad+ \norminf{\hatT\brac{ Q^{*\rho(\sigma)}}-\mathcal{T}^{\rho(\sigma)}\brac{Q^{*\rho(\sigma)}}},
% } 

% $V(s'_{s,a})=\max_{a'}Q(s'_{s,a},a') $ 

% \begin{definition}[Biased estimation]\label{def:3.1}
%     Draw $n$ samples from nominal distribution $x_i\sim p, i=0,1,.., n-1$ and get the empirical frequency $\widehat p_n $. We define (resp. $\hat \mu_n$)
%     \begin{align}
%         f^{*\rho(\sigma)}(\hat p_n,V(s'_{s,a})):=\sup_{\alpha\geq 0} \varbrac{f^{\rho(\sigma)}(\hat p_n, \alpha, V(s'_{s,a})) },\nonumber
%     \end{align} and 
%     \begin{align}
%         \boldsymbol{\bar f^*}^{\rho(\sigma)}\brac{\hat p_n, V(s'_{s,a})}:=\mathbb E_{\hat{p}_n}\Fbrac{f^{*\rho(\sigma)}(\hat p_n,V(s'_{s,a}))}.\nonumber
%     \end{align}
% \end{definition}

% Firstly, we provide the proof of \cref{prop:estimate}. 
% \begin{proposition}[Restatement of \Cref{prop:estimate}]
% The robust Bellman operator satisfies that 
% \begin{align}
%     \mathbb E \Fbrac{r_{s,a,0}+ \frac{\delta^{r,\rho(\sigma)}_{s,a,N_1}}{P_{N_1}} }=& \boldsymbol{\bar f^*}^{\rho(\sigma)}\brac{\widehat \mu_{s,a,(2^{N_{\max}+1})},\boldsymbol{id}(r_{s,a})  },\nonumber
% \end{align}
% and 
% \begin{align}
%     \mathbb E \Fbrac{V(s'_{s,a})+\frac{\delta^{\rho(\sigma)}_{s,a,N_2} (Q)}{P_{N_2}} }= &\boldsymbol{\bar f^*}^{\rho(\sigma)}\brac{\widehat p_{s,a,(2^{N_{\max}+1})},V(s'_{s,a})  }.\nonumber
% \end{align}

%     % Sample $N$ from a geometric distribution $\text{Geo}(g)$,i.e. $\mathbb P(N=n)=p_n:= g(1-g)^n, n=0,1,...$ with threshold $N=\min\varbrac{N,N_{\max}}$. Then draw $2^{N+1}+1$ samples from nominal distribution $x_i\sim p, i=0,1,...,2^{N+1}$, 
% \end{proposition}
\begin{definition}[Biased estimation]
    Draw $n$ samples from the nominal distribution, $s'_{s,a,i}\sim p_{s,a}$, $i=0,1,\dots, n-1$, and get the empirical frequency $\widehat p_{s,a,n} $. We define (resp. $\mu_{s,a},\hat \mu_{s,a,n}$)
    \begin{align}
        f^{*\rho(\sigma)}(\hat p_{s,a,n},V(s'_{s,a})):=\sup_{\alpha\geq 0} \varbrac{f^{\rho(\sigma)}(\hat p_{s,a,n}, \alpha, V(s'_{s,a})) }.\nonumber
    \end{align} 
    % \begin{align}
    %     \boldsymbol{\bar f^*_n}^{\rho(\sigma)}\brac{p_{s,a},V(s'_{s,a})}:=\mathbb E_{\hat{p}_{s,a,n}}\Fbrac{f^{*\rho(\sigma)}(\hat p_{s,a,n},V(s'_{s,a}))}.\nonumber
    % \end{align}
\end{definition}

\begin{proposition}[Threshold MLMC]\label{prop:estimate}
The robust Bellman operator satisfies that 
\begin{align}
    \mathbb E \Fbrac{r_{s,a,0}+ \frac{\delta^{r,\rho(\sigma)}_{s,a,N_1}}{P_{N_1}} }=& \boldsymbol{\bar f^*}^{\rho(\sigma)}\brac{\widehat \mu_{s,a,2^{N_{\max}+1}},\boldsymbol{id}(r_{s,a})  },\nonumber
\end{align}
and 
\begin{align}
    \mathbb E &\Fbrac{V(s'_{s,a,0})+\frac{\delta^{\rho(\sigma)}_{s,a,N_2}(Q) }{P_{N_2}} }=\boldsymbol{\bar f^*}^{\rho(\sigma)}\brac{\widehat p_{s,a,2^{N_{\max}+1}},V(s'_{s,a})  }.\nonumber
\end{align}
    % Sample $N$ from a geometric distribution $\text{Geo}(g)$,i.e. $\mathbb P(N=n)=p_n:= g(1-g)^n, n=0,1,...$ with threshold $N=\min\varbrac{N,N_{\max}}$. Then draw $2^{N+1}+1$ samples from nominal distribution $x_i\sim p, i=0,1,...,2^{N+1}$, 
\end{proposition}
\begin{proof}
    Here we recall the definition of $\delta^{\rho(\sigma)}_{s,a,N_2}(Q)$ that
    \eqenv{
    \delta^{\rho(\sigma)}_{s,a,N_2}(Q):&=\sup_{\alpha\geq 0}\varbrac{f^{\rho(\sigma)}(\widehat p_{s,a,2^{N_2+1}},\alpha,V(s'_{s,a}) )} 
    \\&\qquad -\frac{1}{2} \sup_{\alpha\geq 0}\varbrac{f^{\rho(\sigma)}(\widehat p^E_{2^{N_2}},\alpha,V(s'_{s,a}))}
    -\frac{1}{2}\sup_{\alpha\geq 0}\varbrac{f^{\rho(\sigma)}(\widehat p^O_{2^{N_2}},\alpha,V(s'_{s,a}) )}. 
    }
    Then, we recall that
    \begin{align}
        f^{*\rho(\sigma)}(\hat p_n,V(s'_{s,a})):=\sup_{\alpha\geq 0} \varbrac{f^{\rho(\sigma)}(\hat p_n, \alpha, V(s'_{s,a})) }.
    \end{align} and 
    \begin{align}
        \boldsymbol{\bar f^*}^{\rho(\sigma)}\brac{\hat p_n, V(s'_{s,a})}:=\mathbb E_{\hat{p}_n}\Fbrac{f^{*\rho(\sigma)}(\hat p_n,V(s'_{s,a}))}.
    \end{align}
    Thus, we can get that
    \eqenv{
    \E [\delta^{Q,\rho(\sigma)}_{s,a,N_2}]&= 
    \mathbb E_{\hat{p}_n}\Fbrac{f^{*\rho(\sigma)}(\hat p_{s,a,2^{N_2+1}},V(s'_{s,a}))}-\frac{1}{2}\mathbb E_{\hat{p}_n}\Fbrac{f^{*\rho(\sigma)}(\hat p^O_{2^{N_2}},V(s'_{s,a}))}-\frac{1}{2}\mathbb E_{\hat{p}_n}\Fbrac{f^{*\rho(\sigma)}(\hat p^E_{2^{N_2}},V(s'_{s,a}))}
    \\&=\boldsymbol{\bar f^*}^{\rho(\sigma)}\brac{\hat p_{s,a,2^{N_2+1}}, V(s'_{s,a})}-\boldsymbol{\bar f^*}^{\rho(\sigma)}\brac{\hat p_{s,a,2^{N_2}}, V(s'_{s,a})}. 
    }
    Taking the expectation over the random variable $N_2\sim \text{Geo}(g)$,
    we obtain
    \eqenv{
    \E\Fbrac{\widehat{v}^{\rho(\sigma)}(Q(s,a)) }&=\E\Fbrac{V(s'_{s,a,0})+\frac{\delta^{Q,\rho(\sigma)}_{s,a,N_2} }{P_{N_2}}}
    \\&= \E[V(s'_{s,a,0})]+\E\Fbrac{ \frac{\delta^{Q,\rho(\sigma)}_{s,a,N_2} }{P_{N_2}} }
    \\&\myineq{=}{i} \E[V(s'_{s,a,0})] + \sum_{N=0}^{N_{\max}} \E\Fbrac{\frac{\delta^{Q,\rho(\sigma)}_{s,a,N_2} }{P_{N_2}}|N_2=N }\prob\brac{N}  + \sum_{N=N_{\max}+1}^\infty  \E\Fbrac{\frac{\delta^{Q,\rho(\sigma)}_{s,a,N_2} }{P_{N_2}}|N_2=N }\prob\brac{N}
    \\& \myineq{=}{ii}\boldsymbol{\bar f^*}^{\rho(\sigma)}\brac{\hat p_{s,a,1}, V(s'_{s,a})} + \sum_{N=0}^{N_{\max}} \E\Fbrac{\delta^{Q,\rho(\sigma)}_{s,a,N} }
    \\&  \myineq{=}{iii} \boldsymbol{\bar f^*}^{\rho(\sigma)}\brac{\hat p_{s,a,1}, V(s'_{s,a})} + \sum_{N=0}^{N_{\max}} \brac{\boldsymbol{\bar f^*}^{\rho(\sigma)}\brac{\hat p_{s,a,2^{N+1}}, V(s'_{s,a})}-\boldsymbol{\bar f^*}^{\rho(\sigma)}\brac{\hat p_{s,a,2^{N}}, V(s'_{s,a})} }
    \\&= \boldsymbol{\bar f^*}^{\rho(\sigma)}\brac{\hat p_{s,a,2^{N_{\max}+1}}, V(s'_{s,a})},
    } where $(i)$ and $(ii)$ follow from \cref{eq:hatq}, $(iii)$ follows from \cref{def:3.1}, and the last equality holds because the sum telescopes. 

    This completes the proof. 
\end{proof}
% \textbf{TV-constrained uncertainty set: }[Restatement of \cref{prop:constract}]

Next, we consider the convergence properties of our threshold MLMC algorithm. The optimal robust $Q$-function satisfies the following Bellman equation:
\eqenv{
Q^{*\rho(\sigma)}(s,a)=  \boldsymbol{ {\mathcal{T}}}^{\rho(\sigma)} (Q^{*\rho(\sigma)})(s,a)=\boldsymbol{\bar f^*}^{\rho(\sigma)}\brac{ \mu_{s,a },\boldsymbol{id}(r_{s,a})  }+\gamma\boldsymbol{\bar f^*}^{\rho(\sigma)}\brac{ p_{s,a }, V(s'_{s,a})}.
}
Then, we define the limit of the estimated optimal robust $Q$-function, $ \widehat Q^{*\rho(\sigma)}(s,a)$,
which satisfies the following equation:
\eqenv{\label{eqeq35}
\widehat Q^{*\rho(\sigma)}(s,a) &= \boldsymbol{\widehat {\mathcal{T}}}^{\rho(\sigma)} (\widehat Q^{*\rho(\sigma)})(s,a)
\\&=\boldsymbol{\bar f^*}^{\rho(\sigma)}\brac{\hat \mu_{s,a,2^{N_{\max}+1}}, \boldsymbol{id}(r_{s,a}) }
+ \gamma{ \boldsymbol{\bar f^*}^{\rho(\sigma)}\brac{\hat p_{s,a,2^{N_{\max}+1}},\widehat  V^*(s'_{s,a})}}, 
} where $\widehat V^*(s'_{s,a})= \max_{a'}\widehat Q^{*\rho(\sigma)}(s'_{s,a},a')$. 

\begin{proposition}\label{prop:contract}
    Given statistical distance $\rho$ and uncertainty level $\sigma$, the estimated robust Bellman operator $ \widehat{\mathcal{T}}^{\rho(\sigma)}$ is a $\gamma$-\emph{contraction} w.r.t. the infinity norm:
    \eqenv{
   {\mynorm{\hatT( Q)-\hatT( Q') }_\infty} \leq \gamma \mynorm{ Q- Q' }_\infty.
    }
\end{proposition}
\begin{proof}
    For any $Q,Q'\in \mathbb R^{|\mathcal{S}||\mathcal{A}|} $, we have that
    \eqenv{\label{eq:eq39}
    \hatT(& Q)(s,a)-\hatT( Q')(s,a)\\&=  \boldsymbol{\bar f^*}^{\rho(\sigma)}\brac{\hat \mu_{s,a,2^{N_{\max}+1}}, \boldsymbol{id}(r_{s,a}) }
+ \gamma{\boldsymbol{\bar f^*}^{\rho(\sigma)}\brac{\hat p_{s,a,2^{N_{\max}+1}},   V^*(s'_{s,a})}}
\\&\qquad -  \boldsymbol{\bar f^*}^{\rho(\sigma)}\brac{\hat \mu_{s,a,2^{N_{\max}+1}}, \boldsymbol{id}(r_{s,a}) }- \gamma{ \boldsymbol{\bar f^*}^{\rho(\sigma)}\brac{\hat p_{s,a,2^{N_{\max}+1}},   V'^*(s'_{s,a})}}
\\& = \gamma\brac{{ \boldsymbol{\bar f^*}^{\rho(\sigma)}\brac{\hat p_{s,a,2^{N_{\max}+1}},   V^*(s'_{s,a})}}-{ \boldsymbol{\bar f^*}^{\rho(\sigma)}\brac{\hat p_{s,a,2^{N_{\max}+1}},   V'^*(s'_{s,a})}} }
\\& =\gamma\E_{\hat p_{s,a,2^{N_{\max}+1}}}\Fbrac{\inf_{\rho(q,\hat p_{s,a,2^{N_{\max}+1}})\leq \sigma } \E_{q}[V (s'_{s,a})]-\inf_{\rho(q,\hat p_{s,a,2^{N_{\max}+1}})\leq \sigma } \E_{q}[  V' (s'_{s,a})]  }
\\&=\gamma \E_{\hat p_{s,a,2^{N_{\max}+1}}}\Fbrac{\inf_{\rho(q,\hat p_{s,a,2^{N_{\max}+1}})\leq \sigma } \E_{q}[  \max_{a'}Q (s'_{s,a},a')]-\inf_{\rho(q,\hat p_{s,a,2^{N_{\max}+1}})\leq \sigma } \E_{q}[  \max_{a'}Q' (s'_{s,a},a')]  }.
    }
    Hence, taking the infinity norm of both sides of \cref{eq:eq39}, we get 
    \eqenv{
    &\norminf{\hatT (Q)-\hatT(Q') }\\&\leq \max_{s,a}\lbrac{\hatT( Q)(s,a)-\hatT( Q')(s,a)  }
    \\& =\gamma\max_{s,a}\lbrac{\E_{\hat p_{s,a,2^{N_{\max}+1}}}\Fbrac{\inf_{\rho(q,\hat p_{s,a,2^{N_{\max}+1}})\leq \sigma } \E_{q}[  \max_{a'}Q' (s'_{s,a},a')]-\inf_{\rho(q,\hat p_{s,a,2^{N_{\max}+1}})\leq \sigma } \E_{q}[  \max_{a'}Q (s'_{s,a},a')]  } }
    \\& \leq \gamma\max_{s,a}\lbrac{\E_{\hat p_{s,a,2^{N_{\max}+1}}}\Fbrac{\sup_{\rho(q,\hat p_{s,a,2^{N_{\max}+1}})\leq \sigma } \E_{q}[  \max_{a'}Q' (s'_{s,a},a')-\max_{a'}Q (s'_{s,a},a')] } }
    \\& \leq \gamma\max_{s,a}\max_{s'_{s,a}}\lbrac{\max_{a'}Q' (s'_{s,a},a')-\max_{a'}Q (s'_{s,a},a') }
    \\& \leq \gamma\max_{s'} \max_{a'}\lbrac{Q(s',a')-Q'(s',a')}
    \\&= \gamma \norminf{Q-Q'}.
    }
\end{proof}

Next, we bound the gap $\norminf{\widehat Q^{*\rho(\sigma)}- Q^{*\rho(\sigma)}}$. 
\begin{lemma}\label{lm:a8}
    The gap between the optimal robust $Q$-function and the estimated optimal robust $Q$-function can be bounded as follows:
    \eqenv{
    \norminf{\widehat Q^{*\rho(\sigma)}- Q^{*\rho(\sigma)}}&\leq \frac{1}{1-\gamma} \norminf{\hatT\brac{ Q^{*\rho(\sigma)}}-\boldsymbol{\mathcal{T}}^{\rho(\sigma)}\brac{Q^{*\rho(\sigma)}}}.
%     \norminf{  \boldsymbol{\bar f^*}^{\rho(\sigma)}\brac{\hat \mu_{s,a,2^{N_{\max}+1}}, \boldsymbol{id}(r_{s,a}) }
% -\boldsymbol{\bar f^*}^{\rho(\sigma)}\brac{ \mu_{s,a}, \boldsymbol{id}(r_{s,a}) }
% } \\& \qquad+ \frac{\gamma}{1-\gamma} \norminf{ \boldsymbol{\bar f^*}^{\rho(\sigma)}\brac{\hat p_{s,a,2^{N_{\max}+1}},  V^*(s'_{s,a})}- \boldsymbol{\bar f^*}^{\rho(\sigma)}\brac{ p_{s,a}, V^*(s'_{s,a})} }.
    }
\end{lemma}
\begin{proof}

\eqenv{
\norminf{\widehat Q^{*\rho(\sigma)}- Q^{*\rho(\sigma)}}&= \norminf{\hatT\brac{\widehat Q^{*\rho(\sigma)}}-\boldsymbol{\mathcal{T}}^{\rho(\sigma)}\brac{Q^{*\rho(\sigma)}}  }
\\& \leq \norminf{\hatT\brac{\widehat Q^{*\rho(\sigma)}}-\boldsymbol{\widehat{\mathcal{T}}}^{\rho(\sigma)}\brac{Q^{*\rho(\sigma)}} }
+ \norminf{\hatT\brac{ Q^{*\rho(\sigma)}}-\boldsymbol{\mathcal{T}}^{\rho(\sigma)}\brac{Q^{*\rho(\sigma)}}}
\\& \myineq{\leq}{i} \gamma\norminf{\widehat Q^{*\rho(\sigma)}-Q^{*\rho(\sigma)} }+ \norminf{\hatT\brac{ Q^{*\rho(\sigma)}}-\boldsymbol{\mathcal{T}}^{\rho(\sigma)}\brac{Q^{*\rho(\sigma)}}},
% \\& \myineq{\leq}{ii}\gamma\norminf{\widehat Q^{*\rho(\sigma)}-Q^{*,\rho(\sigma)} }+\norminf{  \boldsymbol{\bar f^*}^{\rho(\sigma)}\brac{\hat \mu_{s,a,2^{N_{\max}+1}}, \boldsymbol{id}(r_{s,a}) }
% -\boldsymbol{\bar f^*}^{\rho(\sigma)}\brac{ \mu_{s,a}, \boldsymbol{id}(r_{s,a}) }
% } \\& \qquad+\gamma \norminf{ \boldsymbol{\bar f^*}^{\rho(\sigma)}\brac{\hat p_{s,a,2^{N_{\max}+1}},  V^*(s'_{s,a})}- \boldsymbol{\bar f^*}^{\rho(\sigma)}\brac{ p_{s,a}, V^*(s'_{s,a})} }.
} where $(i)$ follows from \cref{prop:contract}. Rearranging the terms and dividing both sides by $1-\gamma$ yields the claim. 
\end{proof}

\begin{lemma}[Hoeffding inequality \cite{boucheron2013concentration}]\label{lm:hoeffding}
    Let $x_1,x_2,...,x_n$ be independent random variables such that $x_i$ takes its values in $[a_i,b_i]$ almost surely for all $i\leq n$. Then, for any $\tau>0$, 
    \eqenv{
    \prob\brac{\lbrac{\frac{1}{n}\sum_{i=1}^n (x_i-\E[x_i])      }\geq \tau }\leq 2\exp\brac{-\frac{2 \tau^2 n^2}{\sum_{i=1}^n (a_i-b_i)^2 }  }.
    }
\end{lemma}
\begin{lemma}[Self-bounding variance inequality \cite{maurer2009empirical}]
    Let $x_1,x_2,...,x_n$ be independent and identically distributed random variables with finite variance, that is, $\Varr{x_1}< \infty$. Assume that $x_i\in[0,M]$ for every $i$ with $M>0$. Then, for any $\tau>0$, we have
    \eqenv{
    \prob\brac{\lbrac{\frac{1}{n}\sum_{i=1}^n x^2_i-\brac{\frac{1}{n}\sum_{i=1}^n x_i }^2 -\sqrt{\Varr{x_1}}}\geq \tau }\leq 2 \exp\brac{-\frac{\tau^2 n}{2 M^2} }.
    }
\end{lemma}

\begin{lemma}[\cite{chen2022finite} Theorem 2.1 \& Corollary 2.1.2]\label{lm:chen}
    Consider the stochastic iteration
    \eqenv{
    \theta_{k+1}= \theta_k+\beta_k \brac{\mathcal{H}(\theta_k)-\theta_k+ w_k},
    } where $\theta_k\in \mathbb R^d$ and $\beta_k$ is the stepsize. Let the fixed point $\theta^*$ satisfy
    $\theta^*=\mathcal{H}(\theta^*)$. 
    Define $\mathcal{F}_k=\varbrac{\theta_0, w_0,...,\theta_{k-1},w_{k-1},\theta_{k} }$. 
    When 
    \eqenv{
    \norminf{\mathcal{H}(\theta)-\mathcal{H}(\theta')}\leq \gamma \norminf{\theta-\theta'},
    }
    and
    \eqenv{
    (a). \E\Fbrac{w_k|\mathcal{F}_k }=0; \qquad (b). \E\Fbrac{\norminf{w_k}^2|\mathcal{F}_k }\leq A+ B \norminf{\theta_k}^2,
    }
we have 
\eqenv{
\E\Fbrac{\norminf{\theta_k-\theta^*}^2}\leq c_1 \norminf{\theta_0-\theta^*}^2 \prod_{j=0}^{k-1} (1-c_2 \beta_j) +c_4\brac{A+2B\norminf{\theta^*}^2} \sum_{i=0}^{k-1} \beta_i^2 \prod_{j=i+1}^{k-1} (1-c_2 \beta_j),
}where $c_1=\frac{3}{2}$, $c_2= \frac{1-\gamma}{2}, c_3 =\frac{32 e (B+2)\log (d)}{1-\gamma}, c_4=\frac{16e\log(d)}{1-\gamma}.  $
\end{lemma}


\section{Total Variation Propositions and Theorems Proof}
In this part, we present the proof of propositions and theorems specifically for the TV-constrained uncertainty set. 
\begin{theorem}[Restatement of \Cref{thm:tv1}]
    Consider the case of TV constraint uncertainty set with uncertainty level $\sigma$ i.e. $ \mathcal{P}^{TV}(\sigma)$ and $ \mathcal{R}^{TV}(\sigma)$, set $g=\frac{1}{2}$, for any $Q\in \mathbb R^{\mathcal{S}\times\mathcal{A}}, s\in \mathcal{S}, a\in \mathcal{A} $, the estimation bias can be bounded as:
    \begin{align}
        \lbrac{\E\Fbrac{\widehat {\mathcal{T}}^{\rho_{TV}(\sigma)} (Q)(s,a) } - {\mathcal{T}}^{\rho_{TV}(\sigma)} (Q)(s,a)}\leq \mathcal{O}\brac{2^{-\frac{N_{\max}}{2}} },\nonumber
        % \brac{r_{\max}+\frac{r_{\max}}{1-\gamma}}
    \end{align}
    and the variance can be bounded as:
    \begin{align}
        \text{Var}\brac{\widehat {\mathcal{T}}^{\rho_{TV}(\sigma)} (Q)(s,a)  }\leq \mathcal{O}\brac{N_{\max}}.
    \end{align}
\end{theorem}
\begin{proof}
Firstly, we make error decomposition as follows:
    \eqenv{
    &\lbrac{ \E\Fbrac{\widehat {\mathcal{T}}^{\rho_{TV}(\sigma)} (Q)(s,a) } - {\mathcal{T}}^{\rho_{TV}(\sigma)} (Q)(s,a)}
   \\ &= \bigg|\boldsymbol{\bar f^*}^{\rho_{TV}(\sigma)}\brac{\mu_{s,a},\boldsymbol{id}(r_{s,a})  }+\gamma\boldsymbol{\bar f^*}^{\rho_{TV}(\sigma)}\brac{ p_{s,a}, V(s'_{s,a})}
   \\&\qquad-\boldsymbol{\bar f^*}^{\rho_{TV}(\sigma)}\brac{\hat \mu_{s,a,2^{N_{\max}+1}}, \boldsymbol{id}(r_{s,a}) }
- \gamma{\boldsymbol{\bar f^*}^{\rho_{TV}(\sigma)}\brac{\hat p_{s,a,2^{N_{\max}+1}},   V(s'_{s,a})}} \bigg|
\\& \leq \lbrac{\boldsymbol{\bar f^*}^{\rho_{TV}(\sigma)}\brac{\mu_{s,a},\boldsymbol{id}(r_{s,a})  }
-\boldsymbol{\bar f^*}^{\rho_{TV}(\sigma)}\brac{\hat \mu_{s,a,2^{N_{\max}+1}}, \boldsymbol{id}(r_{s,a}) } }\\& \qquad+\gamma\lbrac{\boldsymbol{\bar f^*}^{\rho_{TV}(\sigma)}\brac{ p_{s,a}, V(s'_{s,a})}-{\boldsymbol{\bar f^*}^{\rho_{TV}(\sigma)}\brac{\hat p_{s,a,2^{N_{\max}+1}},   V(s'_{s,a})}}   }. \label{eq:eq44}
    }

    Then, for convenience, we bound the second term in \cref{eq:eq44}. The first term can be bounded similarly.  By \cref{lm:tv}, 
    \eqenv{\label{eqeq42}
    \boldsymbol{\bar f^*}^{\rho_{TV}(\sigma)}\brac{ p_{s,a}, V(s'_{s,a})}&-{\boldsymbol{\bar f^*}^{\rho_{TV}(\sigma)}\brac{\hat p_{s,a,2^{N_{\max}+1}},   V(s'_{s,a})}}  
    \\&= \Bigg|\max_{\alpha\geq 0}\varbrac{ \mathbb E_{p_{s,a}}\Fbrac{(V(s'_{s,a}))_\alpha}-\frac{{\sigma}}{2}\brac{\alpha-\min_{s'_{s,a}} V(s'_{s,a})} }\\&\qquad\qquad-\E\Fbrac{\max_{\alpha\geq 0}\varbrac{ \mathbb E_{\hat p_{s,a,2^{N_{\max}+1}}}\Fbrac{(V(s'_{s,a}))_\alpha}-\frac{{\sigma}}{2}\brac{\alpha-\min_{s'_{s,a}} V(s'_{s,a})} }}\Bigg|
    \\& \leq \E\Fbrac{\max_{\max_{s'_{s,a}} V(s'_{s,a}) \geq \alpha\geq 0}\lbrac{\mathbb E_{p_{s,a}}\Fbrac{(V(s'_{s,a}))_\alpha}-\mathbb E_{\hat p_{s,a,2^{N_{\max}+1}}}\Fbrac{(V(s'_{s,a}))_\alpha}}}.
    % \\& \leq  \E\Fbrac{\frac{r_{\max}}{1-\gamma}|\mathcal{S}| \max_{s'_{s,a}} \varbrac{\lbrac{p_{s,a}(s'_{s,a})-\hat p_{s,a,2^{N_{\max}+1}}(s'_{s,a}) } } }.
    }
    
% By Hoeffding inequality, with probability $1-2^{-{N_{\max}-1}} $, we have
% \eqenv{
% \max_{s'_{s,a}} \varbrac{\lbrac{p_{s,a}(s'_{s,a})-\hat p_{s,a,2^{N_{\max}+1}}(s'_{s,a}) } }\leq \sqrt{\frac{N_{\max}\log (2 |\mathcal{S}|)}{2^{N_{\max}+1}}  },
% }then, under this case, we can get that
%     \eqenv{
%     \E\Fbrac{\max_{\max_{s'_{s,a}} V(s'_{s,a}) \geq \alpha\geq 0}\lbrac{\mathbb E_{p_{s,a}}\Fbrac{(V(s'_{s,a}))_\alpha}-E_{\hat p_{s,a,2^{N_{\max}+1}}}\Fbrac{(V(s'_{s,a}))_\alpha}}}\leq \sqrt{\frac{r^2_{\max}|\mathcal{S}|^2 N_{\max}\log (2 |\mathcal{S}|)}{2^{N_{\max}+1}(1-\gamma)^2}  }
%     }

    According to Lemma 9 in~\cite{shi2023curious}, with probability at least $1-2^{-(N_{\max}+1)}$, we have
    \eqenv{\label{eqeq44}
    &{\max_{0\leq \alpha\leq \max_{s'_{s,a}} V(s'_{s,a}) }\lbrac{\mathbb E_{p_{s,a}}\Fbrac{(V(s'_{s,a}))_\alpha}-\mathbb E_{\hat p_{s,a,2^{N_{\max}+1}}}\Fbrac{(V(s'_{s,a}))_\alpha}}}
    \leq 3 \sqrt{\frac{r_{\max}^2 \brac{\log \brac{18|\mathcal{S}||\mathcal{A}|} + 2(N_{\max}+1)\log 2} }{(1-\gamma)^2 2^{{N_{\max}+1}}}}, 
    }
    otherwise, we have 
    \eqenv{\label{eqeq45}
    \max_{0\leq \alpha\leq \max_{s'_{s,a}} V(s'_{s,a}) }\lbrac{\mathbb E_{p_{s,a}}\Fbrac{(V(s'_{s,a}))_\alpha}-\mathbb E_{\hat p_{s,a,2^{N_{\max}+1}}}\Fbrac{(V(s'_{s,a}))_\alpha}}\leq \max_{s'_{s,a}} V(s'_{s,a})\leq \frac{r_{\max}}{1-\gamma}. 
    }
    Hence, we set $C_{TV}= 3\sqrt{{2(N_{\max}+1)} \log(18 |\mathcal{S}||\mathcal{A}|)} $. Plugging the above bounds into \cref{eqeq42}, we can conclude that
    \eqenv{
    &\boldsymbol{\bar f^*}^{\rho_{TV}(\sigma)}\brac{ p_{s,a}, V(s'_{s,a})}-{\boldsymbol{\bar f^*}^{\rho_{TV}(\sigma)}\brac{\hat p_{s,a,2^{N_{\max}+1}},   V(s'_{s,a})}}
     \myineq{\leq}{i} 3 \sqrt{\frac{r_{\max}^2 \log \brac{18|\mathcal{S}||\mathcal{A}| 2^{{2(N_{\max}+1)}} }}{(1-\gamma)^2 2^{{N_{\max}+1}}}} + \frac{r_{\max}}{1-\gamma}2^{-\brac{N_{\max}+1}} 
    \\& {\leq} \frac{r_{\max}}{1-\gamma}2^{-\frac{N_{\max}+1}{2}}\brac{2^{-\frac{N_{\max}+1}{2}}+ C_{TV} }, 
    }where $(i)$ follows from the fact that $1-2^{-\brac{N_{\max}+1}}\leq 1$. 

    Similarly, we can get the bound
    \eqenv{
    \lbrac{\boldsymbol{\bar f^*}^{\rho_{TV}(\sigma)}\brac{\mu_{s,a},\boldsymbol{id}(r_{s,a})  }-\boldsymbol{\bar f^*}^{\rho_{TV}(\sigma)}\brac{\hat \mu_{s,a,2^{N_{\max}+1}}, \boldsymbol{id}(r_{s,a}) } }
    \leq r_{\max}2^{-\frac{N_{\max}+1}{2}}\brac{2^{-\frac{N_{\max}+1}{2}}+ 3C_{TV} }.
    }

    Thus, we can get that
    \eqenv{
    &\lbrac{ \E\Fbrac{\widehat {\mathcal{T}}^{\rho_{TV}(\sigma)} (Q)(s,a) } - {\mathcal{T}}^{\rho_{TV}(\sigma)} (Q)(s,a)}
\\& \leq \lbrac{\boldsymbol{\bar f^*}^{\rho_{TV}(\sigma)}\brac{\mu_{s,a},\boldsymbol{id}(r_{s,a})  }-\boldsymbol{\bar f^*}^{\rho_{TV}(\sigma)}\brac{\hat \mu_{s,a,2^{N_{\max}+1}}, \boldsymbol{id}(r_{s,a}) } }\\& \qquad+\gamma\lbrac{\boldsymbol{\bar f^*}^{\rho_{TV}(\sigma)}\brac{ p_{s,a}, V(s'_{s,a})}-{\boldsymbol{\bar f^*}^{\rho_{TV}(\sigma)}\brac{\hat p_{s,a,2^{N_{\max}+1}},   V(s'_{s,a})}}   }
\\& \leq \brac{\frac{r_{\max}}{1-\gamma}+r_{\max} }2^{-\frac{N_{\max}+1}{2}}\brac{2^{-\frac{N_{\max}+1}{2}}+ 3C_{TV} }.
    }
        
    % $(i)$ follows from the dual representation of Wasserstein distance and the facts that 1). the function $ (v(x))_\alpha$ is a Lipschitz function on $[0,\infty)$ with bounded Lipschitz constant $1$. 2) $(v(x))_\alpha\leq \alpha \leq \max_{s'_{s,a}}V(s'_{s,a})\leq \max_{s,a}Q(s,a)\leq \frac{r_{\max}}{1-\gamma}  $.  $(ii)$ follows from  Wasserstein distance inequality in \cite{fournier2015rate}. 

    % Hence, we can get that
    % \eqenv{
    % \norminf{ {\widehat {\mathcal{T}}^{\rho_{TV}(\sigma)} (Q) } - {\mathcal{T}}^{\rho_{TV}(\sigma)} (Q) } \leq \max_{s,a} \lbrac{ {\widehat {\mathcal{T}}^{\rho_{TV}(\sigma)} (Q)(s,a) } - {\mathcal{T}}^{\rho_{TV}(\sigma)} (Q)(s,a) }
    % \\& \leq 
    % }

\textbf{Variance: }Next, we consider the variance of the robust Bellman operator. First, we decompose the variance of the estimated robust Bellman operator as follows. 
\eqenv{
\text{Var}\brac{\widehat {\mathcal{T}}^{\rho_{TV}(\sigma)} (Q)(s,a) }
&= \text{Var}\brac{\widehat  r^{\rho_{TV}(\sigma)}+ \gamma \widehat{v}^{\rho_{TV}(\sigma)}(Q)(s,a) }
\\&= \text{Var}\brac{\widehat  r^{\rho_{TV}(\sigma)}}
+\gamma^2 \Varr{\widehat{v}^{\rho_{TV}(\sigma)}(Q)(s,a)}.
}
For convenience, we analyze the second term in the above equation. The first term can be bounded similarly. 
\eqenv{
\Varr{\widehat{v}^{\rho_{TV}(\sigma)}(Q)(s,a)}= 
\E\Fbrac{\brac{\widehat{v}^{\rho_{TV}(\sigma)}(Q)(s,a)}^2  }-\brac{\E\Fbrac{\widehat{v}^{\rho_{TV}(\sigma)}(Q)(s,a) }}^2
\leq \E\Fbrac{\brac{\widehat{v}^{\rho_{TV}(\sigma)}(Q)(s,a)}^2  }.
}
Next, according to \cref{eq:delta,eq:hatq}, we take the expectation over $N_2$ and expand the second moment as follows:
\eqenv{\label{eqeq149}
\E\Fbrac{\brac{\widehat{v}^{\rho_{TV}(\sigma)}(Q)(s,a)}^2  }& = \E\Fbrac{\brac{V(s'_{s,a,0})+ \frac{\delta^{\rho_{TV}(\sigma)}_{s,a,N_2}(Q)(s,a) }{P_{N_2}} }^2}
\\& \leq 2 \E\Fbrac{V(s'_{s,a,0})^2 }+ 2\E \Fbrac{ \brac{\frac{\delta^{\rho_{TV}(\sigma)}_{s,a,N_2}(Q)(s,a) }{P_{N_2}} }^2 }
\\& \leq \frac{2r^2_{\max}}{(1-\gamma)^2}+ 2\sum_{N=0}^{N_{\max}} \E\Fbrac{\brac{\frac{\delta^{\rho_{TV}(\sigma)}_{s,a,N_2}(Q)(s,a) }{P_{N_2}} }^2 \,\Big|\, N_2=N} P_N
\\& \leq \frac{2r^2_{\max}}{(1-\gamma)^2}+ 2\sum_{N=0}^{N_{\max}} \frac{\E\Fbrac{(\delta^{\rho_{TV}(\sigma)}_{s,a,N}(Q)(s,a))^2} }{P_{N}}.
}
Next, we bound the term $\lbrac{\delta^{\rho_{TV}(\sigma)}_{s,a,N}(Q)(s,a) } $,
\eqenv{
\lbrac{\delta^{\rho_{TV}(\sigma)}_{s,a,N}(Q)(s,a) }=
\lbrac{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{TV}(\sigma)}(\widehat p_{s,a,2^{N+1}},\alpha,V)} 
     -\frac{1}{2} \sup_{\alpha\geq 0}\varbrac{f^{\rho_{TV}(\sigma)}(\widehat p^E_{2^{N}},\alpha,V)}
    -\frac{1}{2}\sup_{\alpha\geq 0}\varbrac{f^{\rho_{TV}(\sigma)}(\widehat p^O_{2^{N}},\alpha,V )}}. 
}

% Recall that
% $\boldsymbol{\alpha^*}^{\rho_{TV}(\sigma)}(p, v(x))\in \arg\max_{\alpha} f^{\rho_{TV}(\sigma)}(p, \alpha, v(x)) $. 

% According to \cref{lm:qtv}, when $$\boldsymbol{\alpha^*}^{\rho_{TV}(\sigma)}(\widehat p^E_{2^{N}}, V)=\boldsymbol{\alpha^*}^{\rho_{TV}(\sigma)}(\widehat p^O_{2^{N}}, V) ,\quad \widehat p^E_{2^{N}}(s')_{s'\in\min_s V(s)}\neq 0,\quad \widehat p^O_{2^{N}}(s')_{s'\in \min_s V(s)}\neq 0,$$ then
% \eqenv{
% \delta^{\rho_{TV}(\sigma)}_{s,a,N}(Q)(s,a) =0. 
% }

% Otherwise,

We make an error decomposition as follows:
\eqenv{
\lbrac{\delta^{\rho_{TV}(\sigma)}_{s,a,N}(Q)(s,a) }^2&=
\lbrac{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{TV}(\sigma)}(\widehat p_{s,a,2^{N+1}},\alpha,V)} 
     -\frac{1}{2} \sup_{\alpha\geq 0}\varbrac{f^{\rho_{TV}(\sigma)}(\widehat p^E_{2^{N}},\alpha,V)}
    -\frac{1}{2}\sup_{\alpha\geq 0}\varbrac{f^{\rho_{TV}(\sigma)}(\widehat p^O_{2^{N}},\alpha,V )}}^2
    \\& \leq 3\lbrac{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{TV}(\sigma)}(\widehat p_{s,a,2^{N+1}},\alpha,V)}-\boldsymbol{\bar f^*}^{\rho_{TV}(\sigma)}\brac{p_{s,a}, V(s'_{s,a})}  }^2
    \\&\qquad+  \frac{3}{2}\lbrac{ \sup_{\alpha\geq 0}\varbrac{f^{\rho_{TV}(\sigma)}(\widehat p^E_{2^{N}},\alpha,V)}-\boldsymbol{\bar f^*}^{\rho_{TV}(\sigma)}\brac{p_{s,a}, V(s'_{s,a})}}^2
    \\&\qquad+\frac{3}{2}\lbrac{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{TV}(\sigma)}(\widehat p^O_{2^{N}},\alpha,V )}-\boldsymbol{\bar f^*}^{\rho_{TV}(\sigma)}\brac{p_{s,a}, V(s'_{s,a})}}^2.\label{eqeq152}
    % \\& \myineq{}{}
}

Then, combining the analysis in \cref{eqeq44,eqeq45} with the union bound $\mathbb P(A\cap B\cap C)\geq 1- \mathbb P(\neg A)-\mathbb P(\neg B)-\mathbb P(\neg C)$, we can conclude that,
 with probability at least $1-3\cdot 2^{-N} $,
\eqenv{
\lbrac{\delta^{\rho_{TV}(\sigma)}_{s,a,N}(Q)(s,a) }^2
&\leq 3\brac{C_{TV}\frac{r_{\max}}{1-\gamma} 2^{-\frac{N+1}{2}}}^2 + \frac{3}{2}\brac{C_{TV}\frac{r_{\max}}{1-\gamma} 2^{-\frac{N}{2}} }^2+ \frac{3}{2}\brac{C_{TV}\frac{r_{\max}}{1-\gamma} 2^{-\frac{N}{2}} }^2
\\&  = \frac{9}{2}\frac{C^2_{TV} r^2_{\max} }{(1-\gamma)^2}2^{-N}. 
}
 Since $ 0 \leq \sup_{\alpha\geq 0}\varbrac{f^{\rho_{TV}(\sigma)}(q,\alpha,V)}\leq \frac{r_{\max}}{1-\gamma}$ for any distribution $q$, on the complementary event, which has probability at most $3\cdot 2^{-N} $,
we have that
\eqenv{
 \lbrac{\delta^{\rho_{TV}(\sigma)}_{s,a,N}(Q)(s,a) }^2
 \leq \brac{ \frac{r_{\max}}{1-\gamma}}^2.
}

Combining the two cases above, we get that 
\eqenv{
\E\Fbrac{\lbrac{\delta^{\rho_{TV}(\sigma)}_{s,a,N}(Q)(s,a) }^2 }\leq \frac{9}{2}\frac{C^2_{TV} r^2_{\max} }{(1-\gamma)^2}2^{-N}+ \brac{ \frac{r_{\max}}{1-\gamma}}^2\, 3\cdot 2^{-N}
\leq \brac{9 C^2_{TV}+ 3}\brac{ \frac{r_{\max}}{1-\gamma}}^2 2^{-N}. \label{eqeq155}
}
Then, plugging \cref{eqeq155} into \cref{eqeq149}, we obtain the following bound on the variance of the robust Bellman operator:
\eqenv{
\Varr{\widehat{v}^{\rho_{TV}(\sigma)}(Q)(s,a)}&\leq \frac{2r^2_{\max}}{(1-\gamma)^2}+ 2\sum_{N=0}^{N_{\max}} \frac{\E\Fbrac{(\delta^{\rho_{TV}(\sigma)}_{s,a,N}(Q)(s,a))^2} }{P_{N}}
\\&\leq \frac{2r^2_{\max}}{(1-\gamma)^2}+ \frac{2r^2_{\max}}{(1-\gamma)^2}\brac{9 C^2_{TV}+ 3}\sum_{N=0}^{N_{\max}} \frac{2^{-N} }{P_{N}}
\\& \leq \frac{2r^2_{\max}}{(1-\gamma)^2}(1+\brac{9 C^2_{TV}+ 3}(N_{\max}+1) ).
}

Similarly, we can bound the variance $\text{Var}\brac{\widehat  r^{\rho_{TV}(\sigma)}}$ as follows:
\eqenv{
\text{Var}\brac{\widehat  r^{\rho_{TV}(\sigma)}}\leq 
 2r^2_{\max}(1+\brac{9 C^2_{TV}+ 3}(N_{\max}+1) ).
}

Hence, we can get the robust Bellman operator variance bound:
\eqenv{\label{eqeq62}
\text{Var}\brac{\widehat {\mathcal{T}}^{\rho_{TV}(\sigma)} (Q)(s,a) }
&= \text{Var}\brac{\widehat  r^{\rho_{TV}(\sigma)}}
+\gamma^2 \Varr{\widehat{v}^{\rho_{TV}(\sigma)}(Q)(s,a)}
\\&\leq \brac{2r^2_{\max}+\gamma^2\frac{2r^2_{\max}}{(1-\gamma)^2} }(1+\brac{9 C^2_{TV}+ 3}(N_{\max}+1) ).
}
   This completes the proof.  
\end{proof}


Next, we present the proof of \cref{thm2:tv}.
\begin{definition}[Threshold MLMC Robust Bellman Operator]\label{def:4.1}
     We define the threshold MLMC robust Bellman operator $\boldsymbol{\bar {\mathcal{T}}}^{\rho(\sigma)}_{N_{\max}} $ as follows
     \eqenv{
     \boldsymbol{\bar {\mathcal{T}}}^{\rho(\sigma)}_{N_{\max}} (Q)(s,a)= \E\Fbrac{\widehat{\mathcal{T}}^{\rho(\sigma)}_{N_{\max}}(Q)(s,a) }. 
     }
 \end{definition}
 
\begin{theorem}[Restatement of \cref{thm2:tv}]
 Set $g=\frac{1}{2}$, and set the stepsize as $$\alpha_t=\alpha=\frac{\log T}{(1-\gamma)T}. $$
Then the output from \cref{alg:example} satisfies that:
 \begin{align}
    \mathbb E &\Fbrac{\mynorm{\widehat Q_T^{\rho_{TV}(\sigma)}-Q^{*\rho_{TV}(\sigma)}}_\infty^2}\nonumber
    %\\& \leq \frac{c_0}{(1-\gamma)^2}\brac{1-\frac{\alpha(1-\gamma)}{2}}^T+ c_1 \alpha\frac{\log (|\mathcal{S}||\mathcal{A}|)}{ (1-\gamma)^4}\nonumber
    \leq\mathcal{O}\brac{\frac{1}{  (1-\gamma)^5 T}}.
\end{align}
To obtain an $\epsilon$-optimal policy, i.e., 
\begin{align}
    \mathbb E \Fbrac{\mynorm{\widehat Q_T^{\rho_{TV}(\sigma)}-Q^{*\rho_{TV}(\sigma)}}_\infty^2}\leq \epsilon^2,\nonumber
\end{align}
the expected sample complexity $N^{\rho_{TV}(\sigma)}(\epsilon)$ is
 \begin{align}
     N^{\rho_{TV}(\sigma)}(\epsilon) = |\mathcal{S}||\mathcal{A}|N_{\max} T\geq \mathcal{O}\brac{\frac{|\mathcal{S}||\mathcal{A}|}{ (1-\gamma)^5 \epsilon^2 }}.\nonumber
 \end{align}
\end{theorem}
\begin{proof}
We consider the stochastic iteration that
\eqenv{
\widehat Q^{\rho_{TV}(\sigma)}_{t+1}=\widehat Q^{\rho_{TV}(\sigma)}_{t}+ \beta_t\brac{\boldsymbol{\widehat{\mathcal{T}}}^{\rho_{TV}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{TV}(\sigma)}_{t}}- \widehat Q^{\rho_{TV}(\sigma)}_{t}+ W_t  },
}
where $W_t={\widehat{\mathcal{T}}}^{\rho_{TV}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{TV}(\sigma)}_{t}}-\boldsymbol{\widehat{\mathcal{T}}}^{\rho_{TV}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{TV}(\sigma)}_{t}} $. 

Define the filtration $\mathcal{F}_t=\varbrac{Q^{\rho_{TV}(\sigma)}_0, W_0,...,Q^{\rho_{TV}(\sigma)}_{t-1},W_{t-1},Q^{\rho_{TV}(\sigma)}_{t} } $. 
Then, by \cref{def:4.1,thm:tv1}, we can get that 
\eqenv{
\E\Fbrac{W_t|\mathcal{F}_t }=0,
}
and
\eqenv{
\E\Fbrac{\norminf{W_t}^2|\mathcal{F}_t }
&\leq \max_{s,a}{\E\Fbrac{\twonorm{{\widehat{\mathcal{T}}}^{\rho_{TV}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{TV}(\sigma)}_{t}}(s,a)-\boldsymbol{\widehat{\mathcal{T}}}^{\rho_{TV}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{TV}(\sigma)}_{t}}(s,a)}^2|\mathcal{F}_t }}
\\& \leq \max_{s,a} \varbrac{\text{Var}\brac{\widehat {\mathcal{T}}^{\rho_{TV}(\sigma)}_{N_{\max}} (Q^{\rho_{TV}(\sigma)}_{t})(s,a)  } }
\\& \leq \brac{2r^2_{\max}+2\gamma^2  \norminf{Q^{\rho_{TV}(\sigma)}_{t} }^2 }(1+\brac{9 C^2_{TV}+ 3}(N_{\max}+1) ),
\\& \myineq{\leq}{i} \brac{2r^2_{\max}+\gamma^2\frac{2r^2_{\max}}{(1-\gamma)^2} }(1+\brac{9 C^2_{TV}+ 3}(N_{\max}+1) ),
} where $(i) $ follows from \cref{eqeq62}. 

According to \cref{eqeq35}, we have that
$$\widehat Q^{*\rho_{TV}(\sigma)}(s,a) = \hatT (\widehat Q^{*\rho_{TV}(\sigma)})(s,a)=\E\Fbrac{\hatt (\widehat Q^{*\rho_{TV}(\sigma)})(s,a)}.$$
Then, applying \cref{lm:chen}~\cite{chen2020finite} with the constant stepsize
\[\beta_t=\beta= \min \varbrac{\frac{2\log T}{(1-\gamma)T}, \frac{(1-\gamma)^2}{64 e r^2_{\max}(1+\brac{9 C^2_{TV}+ 3}(N_{\max}+1) ) \log(|\mathcal{S}||\mathcal{A}|) }} ,\]
we can conclude that
\eqenv{\label{eqeq166}
\E&\Fbrac{\norminf{\widehat Q_{T}^{\rho_{TV}(\sigma)}-\widehat Q^{*\rho_{TV}(\sigma)} }^2}\leq \frac{3}{2} \norminf{\widehat Q_{0}^{\rho_{TV}(\sigma)}-\widehat Q^{*\rho_{TV}(\sigma)} }^2\prod_{j=0}^{T-1} \brac{1-\frac{1-\gamma}{2} \beta_j} \\& \qquad+\frac{16 e \log (|\mathcal{S}||\mathcal{A}|)}{1-\gamma}\brac{2r^2_{\max}+\gamma^2\frac{2r^2_{\max}}{(1-\gamma)^2} }(1+\brac{9 C^2_{TV}+ 3}(N_{\max}+1) )\sum_{i=0}^{T-1} \beta_i^2 \prod_{t=i+1}^{T-1} (1-
\frac{1-\gamma}{2}\beta_t)
\\& \leq \frac{3}{2}\frac{r^2_{\max}}{(1-\gamma)^2} \frac{1}{T} +\frac{16 e \log (|\mathcal{S}||\mathcal{A}|)}{1-\gamma}\brac{2r^2_{\max}+\gamma^2\frac{2r^2_{\max}}{(1-\gamma)^2} }(1+\brac{9 C^2_{TV}+ 3}(N_{\max}+1) )\frac{4 \log T }{(1-\gamma)^2 T}
. 
}

% \eqenv{
% \mathbb E \Fbrac{\mynorm{\widehat Q_{t+1}^{\rho_{TV}(\sigma)}-\widehat Q^{*\rho_{TV}(\sigma)}}_\infty^2}\leq \brac{1-2\alpha(1-\gamma)}\mathbb E \Fbrac{\mynorm{\widehat Q_{t}^{\rho_{TV}(\sigma)}-\widehat Q^{*\rho_{TV}(\sigma)}}_\infty^2}
% }

% \eqenv{
% \E&\Fbrac{\norminf{\widehat Q_{t+1}^{\rho_{TV}(\sigma)}-\widehat Q^{*\rho_{TV}(\sigma)} }^2}
% \\& = \E\Fbrac{\norminf{(1-\alpha)\widehat Q_{t}^{\rho_{TV}(\sigma)} + \alpha \hatt \brac{\widehat Q_{t}^{\rho_{TV}(\sigma)}}-Q^{*\rho_{TV}(\sigma)} }^{23}}
% \\& \myineq{\leq}{i} (1-\alpha)^2 \E\Fbrac{\norminf{\widehat Q_{t}^{\rho_{TV}(\sigma)}-\widehat Q^{*\rho_{TV}(\sigma)}   }^2} + \alpha^2 
% \E\Fbrac{\norminf{\hatt\brac{\widehat Q_{t}^{\rho_{TV}(\sigma)}}-\hatt\brac{\widehat Q^{*\rho_{TV}(\sigma)}   }}^2}
% \\& \quad+ 2\alpha (1-\alpha) \E\Fbrac{\norminf{\widehat Q_{t}^{\rho_{TV}(\sigma)}-\widehat Q^{*\rho_{TV}(\sigma)}   }\norminf{\hatt\brac{\widehat Q_{t}^{\rho_{TV}(\sigma)}}-\hatt\brac{\widehat Q^{*\rho_{TV}(\sigma)}   }}}
% \\& \myineq{\leq }{ii}
% \brac{(1-\alpha)^2+ 2\alpha(1-\alpha)\gamma}\E\Fbrac{\norminf{\widehat Q_{t}^{\rho_{TV}(\sigma)}-\widehat Q^{*\rho_{TV}(\sigma)}   }^2} 
% + \alpha^2 
% \E\Fbrac{\norminf{\hatt\brac{\widehat Q_{t}^{\rho_{TV}(\sigma)}}-\hatt\brac{\widehat Q^{*\rho_{TV}(\sigma)}   }}^2}
% \\& \myineq{\leq}{iii}\brac{1-2(1-\gamma)\alpha}\E\Fbrac{\norminf{\widehat Q_{t}^{\rho_{TV}(\sigma)}-\widehat Q^{*\rho_{TV}(\sigma)}   }^2} 
% + \alpha^2 \max_{s,a}\Varr{\widehat {\mathcal{T}}^{\rho_{TV}(\sigma)} (Q)(s,a)}
% \\& \myineq{\leq}{iv}\brac{1-2(1-\gamma)\alpha}\E\Fbrac{\norminf{\widehat Q_{t}^{\rho_{TV}(\sigma)}-\widehat Q^{*\rho_{TV}(\sigma)}   }^2} 
% +  2\alpha^2\brac{r^2_{\max}+\gamma^2\frac{r^2_{\max}}{(1-\gamma)^2} }(1+\brac{9 C^2_{TV}+ 3}(N_{\max}+1) ),
% }

% We set the constant stepsize
% $\alpha_t=\alpha= \frac{\log T}{(1-\gamma)T} $. Consider above equation, we can conclude that:
% \eqenv{
% \E&\Fbrac{\norminf{\widehat Q_{t+1}^{\rho_{TV}(\sigma)}-\widehat Q^{*\rho_{TV}(\sigma)} }^2 }
%  \\&\leq\brac{1-2(1-\gamma)\alpha}^{T}\norminf{\widehat Q_{0}^{\rho_{TV}(\sigma)}-\widehat Q^{*\rho_{TV}(\sigma)}   }^2 
% +  \frac{\alpha^2}{2(1-\gamma)\alpha}\brac{r^2_{\max}+\gamma^2\frac{r^2_{\max}}{(1-\gamma)^2} }(1+\brac{9 C^2_{TV}+ 3}(N_{\max}+1) )
% \\& \leq \brac{1-2\frac{\log T}{T}}^{T}\frac{r^2_{\max}}{(1-\gamma)^2} 
% +  \frac{\log T}{2(1-\gamma)^2 T}\brac{r^2_{\max}+\gamma^2\frac{r^2_{\max}}{(1-\gamma)^2} }(1+\brac{9 C^2_{TV}+ 3}(N_{\max}+1) )
% \\& \leq  \frac{\log T}{T^2}\frac{r^2_{\max}}{(1-\gamma)^2} 
% +  \frac{\log T}{2(1-\gamma)^2 T}\brac{r^2_{\max}+\gamma^2\frac{r^2_{\max}}{(1-\gamma)^2} }(1+\brac{9 C^2_{TV}+ 3}(N_{\max}+1) )
% .
% }
Set $N_{\max}=\frac{2\log T}{\log 2}$. Then, we make the decomposition and get the bound of $\mathbb E \Fbrac{\mynorm{\widehat Q_T^{\rho_{TV}(\sigma)}-Q^{*\rho_{TV}(\sigma)}}_\infty^2}$ as follows
 \eqenv{
 \mathbb E& \Fbrac{\mynorm{\widehat Q_T^{\rho_{TV}(\sigma)}-Q^{*\rho_{TV}(\sigma)}}_\infty^2}
 \leq 
 2\E\Fbrac{\mynorm{\widehat Q_T^{\rho_{TV}(\sigma)}-\widehat Q^{*\rho_{TV}(\sigma)}}_\infty^2 }+ 2\E\Fbrac{\mynorm{\widehat Q^{*\rho_{TV}(\sigma)}-Q^{*\rho_{TV}(\sigma)}}_\infty^2 }
 \\& \myineq{\leq}{i}
  \frac{2r^2_{\max}}{(1-\gamma)^2 T} +\frac{32 e \log (|\mathcal{S}||\mathcal{A}|)}{1-\gamma}\brac{2r^2_{\max}+\gamma^2\frac{2r^2_{\max}}{(1-\gamma)^2} }(1+\brac{9 C^2_{TV}+ 3}(N_{\max}+1) )\frac{4 \log T }{(1-\gamma)^2 T}
\\& \qquad+\frac{2}{1-\gamma}\brac{\brac{r_{\max}+\frac{r_{\max}}{1-\gamma}}2^{-\frac{N_{\max}+1}{2}}\brac{2^{-\frac{N_{\max}+1}{2}}+ 3C_{TV} }}^2
\\& \leq  \frac{2r^2_{\max}}{(1-\gamma)^2 T} +\frac{32 e \log (|\mathcal{S}||\mathcal{A}|)}{1-\gamma}\brac{2r^2_{\max}+\gamma^2\frac{2r^2_{\max}}{(1-\gamma)^2} }(1+\brac{9 C^2_{TV}+ 3}(N_{\max}+1) )\frac{4 \log T }{(1-\gamma)^2 T}
\\& \qquad+2\frac{2}{1-\gamma}\brac{\brac{r_{\max}+\frac{r_{\max}}{1-\gamma}}\brac{1+ 3C_{TV} }}^2\frac{1}{T}
\\&= \mathcal{O}\brac{\frac{1}{(1-\gamma)^5T}},
 }where $(i)$ follows from \cref{eqeq166}.

This completes the proof.     
\end{proof}



\section{$\chi^2$ Propositions and Theorems Proof}
The proof is similar to the one for the TV case: applying Lemma 15 in~\cite{shi2023curious} in place of Lemma 9 completes the argument. We present the detailed results below.
\begin{theorem}
   Set $g=\frac{1}{2}$, then for any $Q\in \mathbb R^{\mathcal{S}\times\mathcal{A}}, s\in \mathcal{S}, a\in \mathcal{A} $, the estimation bias can be bounded as:
    \begin{align}
        \lbrac{\mathbb E\Fbrac{\widehat {\mathcal{T}}^{\rho_{\chi^2}(\sigma)} (Q)(s,a) } - {\mathcal{T}}^{\rho_{\chi^2}(\sigma)} (Q)(s,a)}
        & \leq \brac{\frac{\gamma r_{\max}}{1-\gamma}+r_{\max} }2^{-\frac{N_{\max}+1}{2}}\brac{2^{-\frac{N_{\max}+1}{2}}+ 3C_{\chi^2} }
         \leq \mathcal{O}\brac{\frac{2^{-\frac{N_{\max}}{2}}}{1-\gamma} },\nonumber
        % \brac{r_{\max}+\frac{r_{\max}}{1-\gamma}}
    \end{align}
    where $C_{\chi^2}= 4\sqrt{{4(N_{\max}+1)(\sigma+1)} \log(24 |\mathcal{S}||\mathcal{A}|)} $.
    The variance can be bounded as:
    \begin{align}
        \text{Var}\brac{\widehat {\mathcal{T}}^{\rho_{\chi^2}(\sigma)} (Q)(s,a)  }\leq \brac{2r^2_{\max}+\gamma^2\frac{2r^2_{\max}}{(1-\gamma)^2} }(1+\brac{9 C^2_{\chi^2}+ 3}(N_{\max}+1) )
\leq \mathcal{O}\brac{\frac{N_{\max}}{(1-\gamma)^2}}.
    \end{align}
\end{theorem}

\begin{proof}
First, we decompose the error as follows:
    \eqenv{
    &\lbrac{ \E\Fbrac{\widehat {\mathcal{T}}^{\rho_{\chi^2}(\sigma)} (Q)(s,a) } - {\mathcal{T}}^{\rho_{\chi^2}(\sigma)} (Q)(s,a)}
   \\ &= \bigg|\boldsymbol{\bar f^*}^{\rho_{\chi^2}(\sigma)}\brac{\mu_{s,a},\boldsymbol{id}(r_{s,a})  }+\gamma\boldsymbol{\bar f^*}^{\rho_{\chi^2}(\sigma)}\brac{ p_{s,a}, V(s'_{s,a})}
   \\&\qquad-\boldsymbol{\bar f^*}^{\rho_{\chi^2}(\sigma)}\brac{\hat \mu_{s,a,2^{N_{\max}+1}}, \boldsymbol{id}(r_{s,a}) }
- \gamma{\boldsymbol{\bar f^*}^{\rho_{\chi^2}(\sigma)}\brac{\hat p_{s,a,2^{N_{\max}+1}},   V(s'_{s,a})}} \bigg|
\\& \leq \lbrac{\boldsymbol{\bar f^*}^{\rho_{\chi^2}(\sigma)}\brac{\mu_{s,a},\boldsymbol{id}(r_{s,a})  }
-\boldsymbol{\bar f^*}^{\rho_{\chi^2}(\sigma)}\brac{\hat \mu_{s,a,2^{N_{\max}+1}}, \boldsymbol{id}(r_{s,a}) } }\\& \qquad+\gamma\lbrac{\boldsymbol{\bar f^*}^{\rho_{\chi^2}(\sigma)}\brac{ p_{s,a}, V(s'_{s,a})}-{\boldsymbol{\bar f^*}^{\rho_{\chi^2}(\sigma)}\brac{\hat p_{s,a,2^{N_{\max}+1}},   V(s'_{s,a})}}   }. \label{eq:eq144}
    }

    Then, for convenience, we bound the second term in \cref{eq:eq144}. The first term can be bounded similarly.  By \cref{lm:tv}, 
    \eqenv{\label{eqeq142}
    \boldsymbol{\bar f^*}^{\rho_{\chi^2}(\sigma)}\brac{ p_{s,a}, V(s'_{s,a})}&-{\boldsymbol{\bar f^*}^{\rho_{\chi^2}(\sigma)}\brac{\hat p_{s,a,2^{N_{\max}+1}},   V(s'_{s,a})}}  
    \\&= \Bigg|\max_{\alpha\geq 0}\varbrac{ \mathbb E_{p_{s,a}}\Fbrac{(V(s'_{s,a}))_\alpha}-\frac{{\sigma}}{2}\brac{\alpha-\min_{s'_{s,a}} V(s'_{s,a})} }\\&\qquad\qquad-\E\Fbrac{\max_{\alpha\geq 0}\varbrac{ \mathbb E_{\hat p_{s,a,2^{N_{\max}+1}}}\Fbrac{(V(s'_{s,a}))_\alpha}-\frac{{\sigma}}{2}\brac{\alpha-\min_{s'_{s,a}} V(s'_{s,a})} }}\Bigg|
    \\& \leq \E\Fbrac{\max_{\max_{s'_{s,a}} V(s'_{s,a}) \geq \alpha\geq 0}\lbrac{\mathbb E_{p_{s,a}}\Fbrac{(V(s'_{s,a}))_\alpha}-\mathbb E_{\hat p_{s,a,2^{N_{\max}+1}}}\Fbrac{(V(s'_{s,a}))_\alpha}}}.
    % \\& \leq  \E\Fbrac{\frac{r_{\max}}{1-\gamma}|\mathcal{S}| \max_{s'_{s,a}} \varbrac{\lbrac{p_{s,a}(s'_{s,a})-\hat p_{s,a,2^{N_{\max}+1}}(s'_{s,a}) } } }.
    }
    
% By Hoeffding inequality, with probability $1-2^{-{N_{\max}-1}} $, we have
% \eqenv{
% \max_{s'_{s,a}} \varbrac{\lbrac{p_{s,a}(s'_{s,a})-\hat p_{s,a,2^{N_{\max}+1}}(s'_{s,a}) } }\leq \sqrt{\frac{N_{\max}\log (2 |\mathcal{S}|)}{2^{N_{\max}+1}}  },
% }then, under this case, we can get that
%     \eqenv{
%     \E\Fbrac{\max_{\max_{s'_{s,a}} V(s'_{s,a}) \geq \alpha\geq 0}\lbrac{\mathbb E_{p_{s,a}}\Fbrac{(V(s'_{s,a}))_\alpha}-E_{\hat p_{s,a,2^{N_{\max}+1}}}\Fbrac{(V(s'_{s,a}))_\alpha}}}\leq \sqrt{\frac{r^2_{\max}|\mathcal{S}|^2 N_{\max}\log (2 |\mathcal{S}|)}{2^{N_{\max}+1}(1-\gamma)^2}  }
%     }

    According to Lemma 15 and its proof in~\cite{shi2023curious}, with probability at least $1-2^{-(N_{\max}+1)}$, we have
    \eqenv{\label{eqeq272}
    \max_{0\leq \alpha\leq \max_{s'_{s,a}} V(s'_{s,a}) }&\lbrac{\mathbb E_{p_{s,a}}\Fbrac{(V(s'_{s,a}))_\alpha}-\mathbb E_{\hat p_{s,a,2^{N_{\max}+1}}}\Fbrac{(V(s'_{s,a}))_\alpha}}
    \\&\qquad\qquad\leq 4 \sqrt{\frac{2r_{\max}^2 (1+\sigma) \brac{\log \brac{24|\mathcal{S}||\mathcal{A}|}+ {{2(N_{\max}+1)}}\log 2}}{(1-\gamma)^2 2^{{N_{\max}+1}}}}= C_{\chi^2} \frac{r_{\max}}{1-\gamma}2^{-\frac{N_{\max}+1}{2}},
    } where $C_{\chi^2}=4\sqrt{2 (1+\sigma) \brac{\log \brac{24|\mathcal{S}||\mathcal{A}|}+ {{2(N_{\max}+1)}}\log 2}} $.
     Otherwise, we have 
    \eqenv{\label{eqeq273}
    \max_{0\leq \alpha\leq \max_{s'_{s,a}} V(s'_{s,a}) }\lbrac{\mathbb E_{p_{s,a}}\Fbrac{(V(s'_{s,a}))_\alpha}-\mathbb E_{\hat p_{s,a,2^{N_{\max}+1}}}\Fbrac{(V(s'_{s,a}))_\alpha}}\leq \max_{s'_{s,a}} V(s'_{s,a})\leq \frac{r_{\max}}{1-\gamma}. 
    }
% Set $C_{\chi^2}= 4\sqrt{{4(N_{\max}+1)(\sigma+1)} \log(24 |\mathcal{S}||\mathcal{A}|)} $.
Plugging the above bounds into \cref{eqeq142}, we can conclude that
    \eqenv{
    &\boldsymbol{\bar f^*}^{\rho_{\chi^2}(\sigma)}\brac{ p_{s,a}, V(s'_{s,a})}-{\boldsymbol{\bar f^*}^{\rho_{\chi^2}(\sigma)}\brac{\hat p_{s,a,2^{N_{\max}+1}},   V(s'_{s,a})}}
     \myineq{\leq}{i} C_{\chi^2} \frac{r_{\max}}{1-\gamma}2^{-\frac{N_{\max}+1}{2}} + \frac{r_{\max}}{1-\gamma}2^{-\brac{N_{\max}+1}} 
    \\& {\leq} \frac{r_{\max}}{1-\gamma}2^{-\frac{N_{\max}+1}{2}}\brac{2^{-\frac{N_{\max}+1}{2}}+ C_{\chi^2}},
    }where $(i)$ follows from the fact that $1-2^{-\brac{N_{\max}+1}}\leq 1$. 

    Similarly, we can get the bound
    \eqenv{
    \lbrac{\boldsymbol{\bar f^*}^{\rho_{\chi^2}(\sigma)}\brac{\mu_{s,a},\boldsymbol{id}(r_{s,a})  }-\boldsymbol{\bar f^*}^{\rho_{\chi^2}(\sigma)}\brac{\hat \mu_{s,a,2^{N_{\max}+1}}, \boldsymbol{id}(r_{s,a}) } }
    \leq r_{\max}2^{-\frac{N_{\max}+1}{2}}\brac{2^{-\frac{N_{\max}+1}{2}}+ C_{\chi^2} }.
    }

    Thus, we can get that
    \eqenv{
    &\lbrac{ \E\Fbrac{\widehat {\mathcal{T}}^{\rho_{\chi^2}(\sigma)} (Q)(s,a) } - {\mathcal{T}}^{\rho_{\chi^2}(\sigma)} (Q)(s,a)}
\\& \leq \lbrac{\boldsymbol{\bar f^*}^{\rho_{\chi^2}(\sigma)}\brac{\mu_{s,a},\boldsymbol{id}(r_{s,a})  }-\boldsymbol{\bar f^*}^{\rho_{\chi^2}(\sigma)}\brac{\hat \mu_{s,a,2^{N_{\max}+1}}, \boldsymbol{id}(r_{s,a}) } }\\& \qquad+\gamma\lbrac{\boldsymbol{\bar f^*}^{\rho_{\chi^2}(\sigma)}\brac{ p_{s,a}, V(s'_{s,a})}-{\boldsymbol{\bar f^*}^{\rho_{\chi^2}(\sigma)}\brac{\hat p_{s,a,2^{N_{\max}+1}},   V(s'_{s,a})}}   }
\\& \leq \brac{\frac{r_{\max}}{1-\gamma}+r_{\max} }2^{-\frac{N_{\max}+1}{2}}\brac{2^{-\frac{N_{\max}+1}{2}}+ C_{\chi^2} }.
    }
        
    % $(i)$ follows from the dual representation of Wasserstein distance and the facts that 1). the function $ (v(x))_\alpha$ is a Lipschitz function on $[0,\infty)$ with bounded Lipschitz constant $1$. 2) $(v(x))_\alpha\leq \alpha \leq \max_{s'_{s,a}}V(s'_{s,a})\leq \max_{s,a}Q(s,a)\leq \frac{r_{\max}}{1-\gamma}  $.  $(ii)$ follows from  Wasserstein distance inequality in \cite{fournier2015rate}. 

    % Hence, we can get that
    % \eqenv{
    % \norminf{ {\widehat {\mathcal{T}}^{\rho_{\chi^2}(\sigma)} (Q) } - {\mathcal{T}}^{\rho_{\chi^2}(\sigma)} (Q) } \leq \max_{s,a} \lbrac{ {\widehat {\mathcal{T}}^{\rho_{\chi^2}(\sigma)} (Q)(s,a) } - {\mathcal{T}}^{\rho_{\chi^2}(\sigma)} (Q)(s,a) }
    % \\& \leq 
    % }

\textbf{Variance: }Next, we consider the variance of the robust Bellman operator. First, we decompose the variance of the estimated robust Bellman operator as follows. 
\eqenv{
\text{Var}\brac{\widehat {\mathcal{T}}^{\rho_{\chi^2}(\sigma)} (Q)(s,a) }
&= \text{Var}\brac{\widehat  r^{\rho_{\chi^2}(\sigma)}+ \gamma \widehat{v}^{\rho_{\chi^2}(\sigma)}(Q)(s,a) }
\\&= \text{Var}\brac{\widehat  r^{\rho_{\chi^2}(\sigma)}}
+\gamma^2 \Varr{\widehat{v}^{\rho_{\chi^2}(\sigma)}(Q)(s,a)}.
}
For convenience, we analyze the second term in the above equation. The first term can be bounded similarly. 
\eqenv{
\Varr{\widehat{v}^{\rho_{\chi^2}(\sigma)}(Q)(s,a)}= 
\E\Fbrac{\brac{\widehat{v}^{\rho_{\chi^2}(\sigma)}(Q)(s,a)}^2  }-\brac{\E\Fbrac{\widehat{v}^{\rho_{\chi^2}(\sigma)}(Q)(s,a) }}^2
\leq \E\Fbrac{\brac{\widehat{v}^{\rho_{\chi^2}(\sigma)}(Q)(s,a)}^2  }.
}
Next, according to \cref{eq:delta,eq:hatq}, we take the expectation over $N_2$ and expand the second moment as follows:
\eqenv{\label{eqeq49}
\E\Fbrac{\brac{\widehat{v}^{\rho_{\chi^2}(\sigma)}(Q)(s,a)}^2  }& = \E\Fbrac{\brac{V(s'_{s,a,0})+ \frac{\delta^{\rho_{\chi^2}(\sigma)}_{s,a,N_2}(Q)(s,a) }{P_{N_2}} }^2}
\\& \leq 2 \E\Fbrac{V(s'_{s,a,0})^2 }+ 2\E \Fbrac{ \brac{\frac{\delta^{\rho_{\chi^2}(\sigma)}_{s,a,N_2}(Q)(s,a) }{P_{N_2}} }^2 }
\\& \leq \frac{2r^2_{\max}}{(1-\gamma)^2}+ 2\sum_{N=0}^{N_{\max}} \E\Fbrac{\brac{\frac{\delta^{\rho_{\chi^2}(\sigma)}_{s,a,N_2}(Q)(s,a) }{P_{N_2}} }^2 \,\Big|\, N_2=N} P_N
\\& \leq \frac{2r^2_{\max}}{(1-\gamma)^2}+ 2\sum_{N=0}^{N_{\max}} \frac{\E\Fbrac{(\delta^{\rho_{\chi^2}(\sigma)}_{s,a,N}(Q)(s,a))^2} }{P_{N}}.
}
Next, we bound the term $\lbrac{\delta^{\rho_{\chi^2}(\sigma)}_{s,a,N}(Q)(s,a) } $,
\eqenv{
\lbrac{\delta^{\rho_{\chi^2}(\sigma)}_{s,a,N}(Q)(s,a) }=
\lbrac{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{\chi^2}(\sigma)}(\widehat p_{s,a,2^{N+1}},\alpha,V)} 
     -\frac{1}{2} \sup_{\alpha\geq 0}\varbrac{f^{\rho_{\chi^2}(\sigma)}(\widehat p^E_{2^{N}},\alpha,V)}
    -\frac{1}{2}\sup_{\alpha\geq 0}\varbrac{f^{\rho_{\chi^2}(\sigma)}(\widehat p^O_{2^{N}},\alpha,V )}}. 
}

% Recall that
% $\boldsymbol{\alpha^*}^{\rho_{\chi^2}(\sigma)}(p, v(x))\in \arg\max_{\alpha} f^{\rho_{\chi^2}(\sigma)}(p, \alpha, v(x)) $. 

% According to \cref{lm:qtv}, when $$\boldsymbol{\alpha^*}^{\rho_{\chi^2}(\sigma)}(\widehat p^E_{2^{N}}, V)=\boldsymbol{\alpha^*}^{\rho_{\chi^2}(\sigma)}(\widehat p^O_{2^{N}}, V) ,\quad \widehat p^E_{2^{N}}(s')_{s'\in\min_s V(s)}\neq 0,\quad \widehat p^O_{2^{N}}(s')_{s'\in \min_s V(s)}\neq 0,$$ then
% \eqenv{
% \delta^{\rho_{\chi^2}(\sigma)}_{s,a,N}(Q)(s,a) =0. 
% }

% Otherwise,

We make an error decomposition as follows:
\eqenv{
\lbrac{\delta^{\rho_{\chi^2}(\sigma)}_{s,a,N}(Q)(s,a) }^2&=
\lbrac{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{\chi^2}(\sigma)}(\widehat p_{s,a,2^{N+1}},\alpha,V)} 
     -\frac{1}{2} \sup_{\alpha\geq 0}\varbrac{f^{\rho_{\chi^2}(\sigma)}(\widehat p^E_{2^{N}},\alpha,V)}
    -\frac{1}{2}\sup_{\alpha\geq 0}\varbrac{f^{\rho_{\chi^2}(\sigma)}(\widehat p^O_{2^{N}},\alpha,V )}}^2
    \\& \leq 3\lbrac{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{\chi^2}(\sigma)}(\widehat p_{s,a,2^{N+1}},\alpha,V)}-\boldsymbol{\bar f^*}^{\rho_{\chi^2}(\sigma)}\brac{p_{s,a}, V(s'_{s,a})}  }^2
    \\&\qquad+  \frac{3}{2}\lbrac{ \sup_{\alpha\geq 0}\varbrac{f^{\rho_{\chi^2}(\sigma)}(\widehat p^E_{2^{N}},\alpha,V)}-\boldsymbol{\bar f^*}^{\rho_{\chi^2}(\sigma)}\brac{p_{s,a}, V(s'_{s,a})}}^2
    \\&\qquad+\frac{3}{2}\lbrac{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{\chi^2}(\sigma)}(\widehat p^O_{2^{N}},\alpha,V )}-\boldsymbol{\bar f^*}^{\rho_{\chi^2}(\sigma)}\brac{p_{s,a}, V(s'_{s,a})}}^2.\label{eqeq52}
    % \\& \myineq{}{}
}

Then, combining the analysis in \cref{eqeq272,eqeq273} with the union bound $\mathbb P(A\cap B\cap C)\geq 1- \mathbb P(\neg A)-\mathbb P(\neg B)-\mathbb P(\neg C)$, we can conclude that,
 with probability at least $1-3\cdot 2^{-N} $,
% Plug \cref{eqeq52} in \cref{}
\eqenv{
\lbrac{\delta^{\rho_{\chi^2}(\sigma)}_{s,a,N}(Q)(s,a) }^2
&\leq 3\brac{C_{\chi^2}\frac{r_{\max}}{1-\gamma} 2^{-\frac{N+1}{2}}}^2 + \frac{3}{2}\brac{C_{\chi^2}\frac{r_{\max}}{1-\gamma} 2^{-\frac{N}{2}} }^2+ \frac{3}{2}\brac{C_{\chi^2}\frac{r_{\max}}{1-\gamma} 2^{-\frac{N}{2}} }^2
\\&  = \frac{9}{2}\frac{C^2_{\chi^2} r^2_{\max} }{(1-\gamma)^2}2^{-N}. 
}
 Since $ 0 \leq \sup_{\alpha\geq 0}\varbrac{f^{\rho_{\chi^2}(\sigma)}(q,\alpha,V)}\leq \frac{r_{\max}}{1-\gamma}$ for any distribution $q$, on the complementary event, which has probability at most $3\cdot 2^{-N} $,
we have that
\eqenv{
 \lbrac{\delta^{\rho_{\chi^2}(\sigma)}_{s,a,N}(Q)(s,a) }^2
 \leq \brac{ \frac{r_{\max}}{1-\gamma}}^2.
}

Combining the two cases above, we get that 
\eqenv{
\E\Fbrac{\lbrac{\delta^{\rho_{\chi^2}(\sigma)}_{s,a,N}(Q)(s,a) }^2 }\leq \frac{9}{2}\frac{C^2_{\chi^2} r^2_{\max} }{(1-\gamma)^2}2^{-N}+ \brac{ \frac{r_{\max}}{1-\gamma}}^2\, 3\cdot 2^{-N}
\leq \brac{9 C^2_{\chi^2}+ 3}\brac{ \frac{r_{\max}}{1-\gamma}}^2 2^{-N}. \label{eqeq55}
}
Then, plugging \cref{eqeq55} into \cref{eqeq49}, we obtain the following bound on the variance of the robust Bellman operator:
\eqenv{
\Varr{\widehat{v}^{\rho_{\chi^2}(\sigma)}(Q)(s,a)}&\leq \frac{2r^2_{\max}}{(1-\gamma)^2}+ 2\sum_{N=0}^{N_{\max}} \frac{\E\Fbrac{(\delta^{\rho_{\chi^2}(\sigma)}_{s,a,N}(Q)(s,a))^2} }{P_{N}}
\\&\leq \frac{2r^2_{\max}}{(1-\gamma)^2}+ \frac{2r^2_{\max}}{(1-\gamma)^2}\brac{9 C^2_{\chi^2}+ 3}\sum_{N=0}^{N_{\max}} \frac{2^{-N} }{P_{N}}
\\& \leq \frac{2r^2_{\max}}{(1-\gamma)^2}(1+\brac{9 C^2_{\chi^2}+ 3}(N_{\max}+1) ).
}

Similarly, we can bound the variance $\text{Var}\brac{\widehat  r^{\rho_{\chi^2}(\sigma)}}$ as follows:
\eqenv{
\text{Var}\brac{\widehat  r^{\rho_{\chi^2}(\sigma)}}\leq 
 2r^2_{\max}(1+\brac{9 C^2_{\chi^2}+ 3}(N_{\max}+1) ).
}

Hence, we can get the robust Bellman operator variance bound:
\eqenv{\label{eqeq162}
\text{Var}\brac{\widehat {\mathcal{T}}^{\rho_{\chi^2}(\sigma)} (Q)(s,a) }
&= \text{Var}\brac{\widehat  r^{\rho_{\chi^2}(\sigma)}}
+\gamma^2 \Varr{\widehat{v}^{\rho_{\chi^2}(\sigma)}(Q)(s,a)}
\\&\leq \brac{2r^2_{\max}+\gamma^2\frac{2r^2_{\max}}{(1-\gamma)^2} }(1+\brac{9 C^2_{\chi^2}+ 3}(N_{\max}+1) ).
}
   This completes the proof.  
\end{proof}

\begin{theorem}[Sample Complexity with $\chi^2$ Distance]
 Set $N_{\max}=\frac{\log T}{\log 2}$ and the stepsize as $$\alpha_t=\alpha=\frac{\log T}{(1-\gamma)T}. $$ Then the output of \cref{alg:example} satisfies that:
 \begin{align}
    \mathbb E \Fbrac{\mynorm{\widehat Q_T^{\rho_{\chi^2}(\sigma)}-Q^{*\rho_{\chi^2}(\sigma)}}_\infty^2}\nonumber
    & \leq \frac{\log T}{T^2}\frac{r^2_{\max}}{(1-\gamma)^2} 
+  \frac{r^2_{\max} \log T }{2(1-\gamma)^2 T}\brac{1+\frac{ \gamma^2}{(1-\gamma)^2} }(1+\brac{9 C^2_{\chi^2}+ 3}(N_{\max}+1) )
\\&\qquad+r^2_{\max}\brac{\brac{1+\frac{\gamma}{1-\gamma}}\brac{1+ 3C_{\chi^2} }}^2\frac{1}{T}
   %\\& \leq \frac{c_0}{(1-\gamma)^2}\brac{1-\frac{\alpha(1-\gamma)}{2}}^T+ c_1 \alpha\frac{\log (|\mathcal{S}||\mathcal{A}|)}{ (1-\gamma)^4}\nonumber
    \\& \leq\mathcal{O}\brac{\frac{1}{  (1-\gamma)^5 T}}.
\end{align}
To ensure
\begin{align}
    \mathbb E \Fbrac{\mynorm{\widehat Q_T^{\rho_{\chi^2}(\sigma)}-Q^{*\rho_{\chi^2}(\sigma)}}_\infty^2}\leq \epsilon^2,\nonumber
\end{align}
the expected total sample complexity $N^{\rho_{\chi^2}(\sigma)}(\epsilon)$ is,
 \begin{align}
     N^{\rho_{\chi^2}(\sigma)}(\epsilon) = |\mathcal{S}||\mathcal{A}|N_{\max} T\geq \mathcal{O}\brac{\frac{|\mathcal{S}||\mathcal{A}|}{  (1-\gamma)^5 \epsilon^2 }}.\nonumber
 \end{align}
\end{theorem}
\begin{proof}
We consider the stochastic iteration that
\eqenv{
\widehat Q^{\rho_{\chi^2}(\sigma)}_{t+1}=\widehat Q^{\rho_{\chi^2}(\sigma)}_{t}+ \beta_t\brac{\boldsymbol{\widehat{\mathcal{T}}}^{\rho_{\chi^2}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{\chi^2}(\sigma)}_{t}}- \widehat Q^{\rho_{\chi^2}(\sigma)}_{t}+ W_t  },
}
where $W_t={\widehat{\mathcal{T}}}^{\rho_{\chi^2}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{\chi^2}(\sigma)}_{t}}-\boldsymbol{\widehat{\mathcal{T}}}^{\rho_{\chi^2}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{\chi^2}(\sigma)}_{t}} $. 

Define the filtration $\mathcal{F}_t=\varbrac{Q^{\rho_{\chi^2}(\sigma)}_0, W_0,...,Q^{\rho_{\chi^2}(\sigma)}_{t-1},W_{t-1},Q^{\rho_{\chi^2}(\sigma)}_{t} } $. 
Then, by \cref{def:4.1,thm:tv1}, we can get that 
\eqenv{
\E\Fbrac{W_t|\mathcal{F}_t }=0,
}
and
\eqenv{
\E\Fbrac{\norminf{W_t}^2|\mathcal{F}_t }
&\leq \max_{s,a}{\E\Fbrac{\twonorm{{\widehat{\mathcal{T}}}^{\rho_{\chi^2}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{\chi^2}(\sigma)}_{t}}(s,a)-\boldsymbol{\widehat{\mathcal{T}}}^{\rho_{\chi^2}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{\chi^2}(\sigma)}_{t}}(s,a)}^2|\mathcal{F}_t }}
\\& \leq \max_{s,a} \varbrac{\text{Var}\brac{\widehat {\mathcal{T}}^{\rho_{\chi^2}(\sigma)}_{N_{\max}} (Q^{\rho_{\chi^2}(\sigma)}_{t})(s,a)  } }
\\& \leq \brac{2r^2_{\max}+2\gamma^2  \norminf{Q^{\rho_{\chi^2}(\sigma)}_{t} }^2 }(1+\brac{9 C^2_{\chi^2}+ 3}(N_{\max}+1) ),
\\& \myineq{\leq}{i} \brac{2r^2_{\max}+\gamma^2\frac{2r^2_{\max}}{(1-\gamma)^2} }(1+\brac{9 C^2_{\chi^2}+ 3}(N_{\max}+1) ),
} where $(i) $ follows from \cref{eqeq162}. 

According to \cref{eqeq35}, we have that
$$\widehat Q^{*\rho_{\chi^2}(\sigma)}(s,a) = \hatT (\widehat Q^{*\rho_{\chi^2}(\sigma)})(s,a)=\E\Fbrac{\hatt (\widehat Q^{*\rho_{\chi^2}(\sigma)})(s,a)}.$$
Then, to apply \cref{lm:chen}~\cite{chen2020finite}, we set the constant stepsize
$$\beta_t=\beta= \min \varbrac{\frac{2\log T}{(1-\gamma)T}, \frac{(1-\gamma)^2}{64 e r^2_{\max}(1+\brac{9 C^2_{\chi^2}+ 3}(N_{\max}+1) ) \log(|\mathcal{S}||\mathcal{A}|) }} .$$
We can conclude that
\eqenv{\label{eqeq66}
\E&\Fbrac{\norminf{\widehat Q_{T}^{\rho_{\chi^2}(\sigma)}-\widehat Q^{*\rho_{\chi^2}(\sigma)} }^2}\leq \frac{3}{2} \norminf{\widehat Q_{0}^{\rho_{\chi^2}(\sigma)}-\widehat Q^{*\rho_{\chi^2}(\sigma)} }^2\prod_{j=0}^{T-1} \brac{1-\frac{1-\gamma}{2} \beta_j} \\& \qquad+\frac{16 e \log (|\mathcal{S}||\mathcal{A}|)}{1-\gamma}\brac{2r^2_{\max}+\gamma^2\frac{2r^2_{\max}}{(1-\gamma)^2} }(1+\brac{9 C^2_{\chi^2}+ 3}(N_{\max}+1) )\sum_{i=0}^{T-1} \beta_i^2 \prod_{t=i+1}^{T-1} (1-
\frac{1-\gamma}{2}\beta_t)
\\& \leq \frac{3}{2}\frac{r^2_{\max}}{(1-\gamma)^2} \frac{1}{T} +\frac{16 e \log (|\mathcal{S}||\mathcal{A}|)}{1-\gamma}\brac{2r^2_{\max}+\gamma^2\frac{2r^2_{\max}}{(1-\gamma)^2} }(1+\brac{9 C^2_{\chi^2}+ 3}(N_{\max}+1) )\frac{4 \log T }{(1-\gamma)^2 T}
. 
}

Set $N_{\max}=\frac{2\log T}{\log 2}$. Then, we make the decomposition and get the bound of $\mathbb E \Fbrac{\mynorm{\widehat Q_T^{\rho_{\chi^2}(\sigma)}-Q^{*\rho_{\chi^2}(\sigma)}}_\infty^2}$ as follows
 \eqenv{
 \mathbb E& \Fbrac{\mynorm{\widehat Q_T^{\rho_{\chi^2}(\sigma)}-Q^{*\rho_{\chi^2}(\sigma)}}_\infty^2}
 \leq 
 2\E\Fbrac{\mynorm{\widehat Q_T^{\rho_{\chi^2}(\sigma)}-\widehat Q^{*\rho_{\chi^2}(\sigma)}}_\infty^2 }+ 2\E\Fbrac{\mynorm{\widehat Q^{*\rho_{\chi^2}(\sigma)}-Q^{*\rho_{\chi^2}(\sigma)}}_\infty^2 }
 \\& \myineq{\leq}{i}
  \frac{2r^2_{\max}}{(1-\gamma)^2 T} +\frac{32 e \log (|\mathcal{S}||\mathcal{A}|)}{1-\gamma}\brac{2r^2_{\max}+\gamma^2\frac{2r^2_{\max}}{(1-\gamma)^2} }(1+\brac{9 C^2_{\chi^2}+ 3}(N_{\max}+1) )\frac{4 \log T }{(1-\gamma)^2 T}
\\& \qquad+\frac{2}{1-\gamma}\brac{\brac{r_{\max}+\frac{r_{\max}}{1-\gamma}}2^{-\frac{N_{\max}+1}{2}}\brac{2^{-\frac{N_{\max}+1}{2}}+ 3C_{\chi^2} }}^2
\\& \leq  \frac{2r^2_{\max}}{(1-\gamma)^2 T} +\frac{32 e \log (|\mathcal{S}||\mathcal{A}|)}{1-\gamma}\brac{2r^2_{\max}+\gamma^2\frac{2r^2_{\max}}{(1-\gamma)^2} }(1+\brac{9 C^2_{\chi^2}+ 3}(N_{\max}+1) )\frac{4 \log T }{(1-\gamma)^2 T}
\\& \qquad+2\frac{2}{1-\gamma}\brac{\brac{r_{\max}+\frac{r_{\max}}{1-\gamma}}\brac{1+ 3C_{\chi^2} }}^2\frac{1}{T}
\\&= \mathcal{O}\brac{\frac{1}{(1-\gamma)^5T}},
 }where $(i)$ follows from \cref{eqeq66}.

This completes the proof.     
\end{proof}

\section{KL Propositions and Theorems Proof}
In this section, we provide the proof of \cref{thm:kl1,thm:kl}. 
\begin{theorem}
    Consider the case of KL constraint uncertainty set with uncertainty level $\sigma$ i.e. $ \mathcal{P}^{KL}(\sigma)$ and $ \mathcal{R}^{KL}(\sigma)$, set $g=\frac{1}{2}$, for any $Q\in \mathbb R^{\mathcal{S}\times\mathcal{A}}, s\in \mathcal{S}, a\in \mathcal{A} $, the estimation bias can be bounded as:
    \begin{align}
        \lbrac{\mathbb E\Fbrac{\widehat {\mathcal{T}}^{\rho_{KL}(\sigma)} (Q)(s,a) } - {\mathcal{T}}^{\rho_{KL}(\sigma)} (Q)(s,a)}\leq \mathcal{O}\brac{2^{-\frac{N_{\max}}{2}} },\nonumber
        % \brac{r_{\max}+\frac{r_{\max}}{1-\gamma}}
    \end{align}
    and the variance can be bounded as:
    \begin{align}
        \text{Var}\brac{\widehat {\mathcal{T}}^{\rho_{KL}(\sigma)} (Q)(s,a)  }\leq \mathcal{O}\brac{N_{\max}}.
    \end{align}
\end{theorem}
\begin{proof}
    We now provide the proof of \cref{thm:kl1}.
    Firstly, we make error decomposition as follows:
    \eqenv{
    \lbrac{ \E\Fbrac{\widehat {\mathcal{T}}^{\rho_{KL}(\sigma)} (Q)(s,a) } - {\mathcal{T}}^{\rho_{KL}(\sigma)} (Q)(s,a)}
    &= \bigg|\boldsymbol{\bar f^*}^{\rho_{KL}(\sigma)}\brac{\mu_{s,a},\boldsymbol{id}(r_{s,a})  }+\gamma\boldsymbol{\bar f^*}^{\rho_{KL}(\sigma)}\brac{ p_{s,a}, V(s'_{s,a})}\\ -\boldsymbol{\bar f^*}&^{\rho_{KL}(\sigma)}\brac{\hat \mu_{s,a,2^{N_{\max}+1}}, \boldsymbol{id}(r_{s,a}) }
- \gamma{\boldsymbol{\bar f^*}^{\rho_{KL}(\sigma)}\brac{\hat p_{s,a,2^{N_{\max}+1}},   V(s'_{s,a})}} \bigg|
\\& \leq \lbrac{\boldsymbol{\bar f^*}^{\rho_{KL}(\sigma)}\brac{\mu_{s,a},\boldsymbol{id}(r_{s,a})  }-\boldsymbol{\bar f^*}^{\rho_{KL}(\sigma)}\brac{\hat \mu_{s,a,2^{N_{\max}+1}}, \boldsymbol{id}(r_{s,a}) } }\\ +\gamma&\lbrac{\boldsymbol{\bar f^*}^{\rho_{KL}(\sigma)}\brac{ p_{s,a}, V(s'_{s,a})}-{\boldsymbol{\bar f^*}^{\rho_{KL}(\sigma)}\brac{\hat p_{s,a,2^{N_{\max}+1}},   V(s'_{s,a})}}   }. \label{eq:eq70}
    }
Then, for convenience, we bound the second term in \cref{eq:eq70}. The first term can be bounded similarly.  By \cref{lm:kl},
\eqenv{\label{eqeq71}
    {f^*}&^{\rho_{KL}(\sigma)}\brac{ p_{s,a}, V(s'_{s,a})}-{{f^*}^{\rho_{KL}(\sigma)}\brac{\hat p_{s,a,2^{N_{\max}+1}},   V^*(s'_{s,a})}} 
    \\&= \max_{\alpha\geq0} \varbrac{-\alpha \log\brac{\mathbb E_{p_{s,a}} \Fbrac{\exp\brac{-\frac{V(s'_{s,a})}{\alpha}} } }-\alpha \sigma }
    -\max_{\alpha\geq0} \varbrac{-\alpha \log\brac{\mathbb E_{\hat p_{s,a,2^{N_{\max}+1}}} \Fbrac{\exp\brac{-\frac{V(s'_{s,a})}{\alpha}} } }-\alpha \sigma }
    \\& \myineq{\leq}{i} \max_{0\leq\alpha\leq \frac{r_{\max}}{(1-\gamma)\sigma }} \lbrac{-\alpha \log\brac{\mathbb E_{p_{s,a}} \Fbrac{\exp\brac{-\frac{V(s'_{s,a})}{\alpha}} } }+\alpha \log\brac{\mathbb E_{\hat p_{s,a,2^{N_{\max}+1}}} \Fbrac{\exp\brac{-\frac{V(s'_{s,a})}{\alpha}} } }  }
    \\& {\leq} 
    \max_{0\leq\alpha\leq \frac{r_{\max}}{(1-\gamma)\sigma }}
    \lbrac{\alpha \log\brac{\frac{\mathbb E_{\hat p_{s,a,2^{N_{\max}+1}}} \Fbrac{\exp\brac{-\frac{V(s'_{s,a})}{\alpha}} } }{\mathbb E_{p_{s,a}} \Fbrac{\exp\brac{-\frac{V(s'_{s,a})}{\alpha}} }}  }  }
    \\& {\leq}
    \frac{r_{\max}}{(1-\gamma)\sigma}  \max_{0\leq\alpha\leq \frac{r_{\max}}{(1-\gamma)\sigma }} \lbrac{\log\brac{\frac{\mathbb E_{\hat p_{s,a,2^{N_{\max}+1}}} \Fbrac{\exp\brac{-\frac{V(s'_{s,a})}{\alpha}} } -\mathbb E_{p_{s,a}} \Fbrac{\exp\brac{-\frac{V(s'_{s,a})}{\alpha}} }}{\mathbb E_{p_{s,a}} \Fbrac{\exp\brac{-\frac{V(s'_{s,a})}{\alpha}} }}  +1 }   },
}
where $(i)$ follows from $\boldsymbol{\alpha^*}^{\rho_{KL}(\sigma)}(p, V)\leq \frac{\max_{s:p(s)\neq 0}V(s)}{\sigma}\leq \frac{\max_{s,a} Q(s,a)}{\sigma}\leq \frac{r_{\max}}{(1-\gamma)\sigma }$ by \cite{hu2013kullback}. We recall that $\boldsymbol{\alpha^*}^{\rho_{KL}(\sigma)}(p, V)$ is defined in \cref{def:a.1}. 

Noting that $ \hat p_{s,a,2^{N_{\max}+1}}(s'_{s,a})$ is absolutely continuous with respect to $p_{s,a}(s'_{s,a}) $, by Hoeffding's inequality we have 
\eqenv{
\prob\brac{\max_{s'_{s,a}}\lbrac{\frac{\hat p_{s,a,2^{N_{\max}+1}}(s'_{s,a})-p_{s,a}(s'_{s,a})  }{p_{s,a}(s'_{s,a})} }\geq \sqrt{\frac{1}{2^{N_{\max}+1}p^2_\wedge} \log \frac{2|\mathcal{S}| }{\tau} } }\leq \tau.
}

Set $\tau=2^{-(N_{\max}+1)}$. With probability at least $1-2^{-(N_{\max}+1)} $, we have that
\eqenv{
\lbrac{\frac{\mathbb E_{\hat p_{s,a,2^{N_{\max}+1}}} \Fbrac{\exp\brac{-\frac{V(s'_{s,a})}{\alpha}} } -\mathbb E_{p_{s,a}} \Fbrac{\exp\brac{-\frac{V(s'_{s,a})}{\alpha}} }}{\mathbb E_{p_{s,a}} \Fbrac{\exp\brac{-\frac{V(s'_{s,a})}{\alpha}} }}  }
 &\leq
\max_{s'_{s,a}}\lbrac{\frac{\hat p_{s,a,2^{N_{\max}+1}}(s'_{s,a})-p_{s,a}(s'_{s,a})  }{p_{s,a}(s'_{s,a})} }
\\& \leq \sqrt{\frac{N_{\max}}{2^{N_{\max}+1}p^2_\wedge} \log {2|\mathcal{S}| }}.
}
Then, we set $\frac{N_{\max}}{p^2_\wedge} \log {2|\mathcal{S}| }\leq \frac{1}{4}2^{N_{\max}+1} $, s.t. $\sqrt{\frac{N_{\max}}{2^{N_{\max}+1}p^2_\wedge} \log {2|\mathcal{S}| }}\leq \frac{1}{2} $. 

Then, combined with \cref{eqeq71}, we can conclude that
\eqenv{\label{eqeq74}
\boldsymbol{\bar f^*}&^{\rho_{KL}(\sigma)}\brac{ p_{s,a}, V(s'_{s,a})}-{\boldsymbol{\bar f^*}^{\rho_{KL}(\sigma)}\brac{\hat p_{s,a,2^{N_{\max}+1}},   V^*(s'_{s,a})}}
\\& {\leq}
    \frac{r_{\max}}{(1-\gamma)\sigma}  \max_{0\leq\alpha\leq \frac{r_{\max}}{(1-\gamma)\sigma }} \lbrac{\log\brac{\frac{\mathbb E_{\hat p_{s,a,2^{N_{\max}+1}}} \Fbrac{\exp\brac{-\frac{V(s'_{s,a})}{\alpha}} } -\mathbb E_{p_{s,a}} \Fbrac{\exp\brac{-\frac{V(s'_{s,a})}{\alpha}} }}{\mathbb E_{p_{s,a}} \Fbrac{\exp\brac{-\frac{V(s'_{s,a})}{\alpha}} }}  +1 }   }
\\& \myineq{\leq}{i}
    \frac{r_{\max}}{(1-\gamma)\sigma} 
    \max_{0\leq\alpha\leq \frac{r_{\max}}{(1-\gamma)\sigma }}
    2\lbrac{\frac{\mathbb E_{\hat p_{s,a,2^{N_{\max}+1}}} \Fbrac{\exp\brac{-\frac{V(s'_{s,a})}{\alpha}} } -\mathbb E_{p_{s,a}} \Fbrac{\exp\brac{-\frac{V(s'_{s,a})}{\alpha}} }}{\mathbb E_{p_{s,a}} \Fbrac{\exp\brac{-\frac{V(s'_{s,a})}{\alpha}} }}  }
    \\& \leq \frac{2r_{\max}}{(1-\gamma)\sigma} 
    \max_{s'_{s,a}}\lbrac{\frac{\hat p_{s,a,2^{N_{\max}+1}}(s'_{s,a})-p_{s,a}(s'_{s,a})  }{p_{s,a}(s'_{s,a})} }\leq \frac{2r_{\max}}{(1-\gamma)\sigma}\sqrt{\frac{N_{\max}}{2^{N_{\max}+1}p^2_\wedge} \log {2|\mathcal{S}| }},
} where $(i)$ follows from the fact that $|\log(x+1)|\leq 2|x| $ for $|x|\leq \frac{1}{2}$.
Otherwise, with probability at most $2^{-(N_{\max}+1)}$, we can conclude 
that
\eqenv{\label{eqeq75}
\boldsymbol{\bar f^*}&^{\rho_{KL}(\sigma)}\brac{ p_{s,a}, V(s'_{s,a})}-{\boldsymbol{\bar f^*}^{\rho_{KL}(\sigma)}\brac{\hat p_{s,a,2^{N_{\max}+1}},   V^*(s'_{s,a})}} 
    \leq  \max_{s'_{s,a}}V(s'_{s,a})\leq \frac{r_{\max}}{1-\gamma}.
}

Then, taking the expectation, we get
\eqenv{
   \boldsymbol{\bar f^*}^{\rho_{KL}(\sigma)}\brac{ p_{s,a}, V(s'_{s,a})}-&{\boldsymbol{\bar f^*}^{\rho_{KL}(\sigma)}\brac{\hat p_{s,a,2^{N_{\max}+1}},   V^*(s'_{s,a})}} 
   \\&= \E\Fbrac{{f^*}^{\rho_{KL}(\sigma)}\brac{ p_{s,a}, V(s'_{s,a})}-{{f^*}^{\rho_{KL}(\sigma)}\brac{\hat p_{s,a,2^{N_{\max}+1}},   V^*(s'_{s,a})}} }
   \\& \leq \frac{2r_{\max}}{(1-\gamma)\sigma}\sqrt{\frac{N_{\max}}{2^{N_{\max}+1}p^2_\wedge} \log {2|\mathcal{S}| }}+ 2^{-(N_{\max}+1)}\frac{r_{\max}}{1-\gamma}
   \\&\leq \frac{2r_{\max}}{(1-\gamma)\sigma}{\sqrt{N_{\max}\log {2|\mathcal{S}| } }  }\frac{ 1}{p_\wedge2^\frac{N_{\max}+1}{2}}+2^{-(N_{\max}+1)}\frac{r_{\max}}{1-\gamma} , 
}where we set $C_{KL}=2\sqrt{N_{\max}\log {2|\mathcal{S}| } }  $, then
\eqenv{
\boldsymbol{\bar f^*}&^{\rho_{KL}(\sigma)}\brac{ p_{s,a}, V(s'_{s,a})}-{\boldsymbol{\bar f^*}^{\rho_{KL}(\sigma)}\brac{\hat p_{s,a,2^{N_{\max}+1}},   V^*(s'_{s,a})}} 
    % \leq \frac{2r_{\max}}{(1-\gamma)\sigma}\sqrt{\frac{N_{\max}}{2^{N_{\max}+1}p^2_\wedge} \log {2|\mathcal{S}| }}+ 2^{-(N_{\max}+1)}\frac{r_{\max}}{1-\gamma}
   \leq \frac{r_{\max}}{(1-\gamma)\sigma} C_{KL}\frac{ 1}{p_\wedge2^\frac{N_{\max}+1}{2}}+2^{-(N_{\max}+1)}\frac{r_{\max}}{1-\gamma}.
}

% \end{proof}

\textbf{Variance: }Next, we consider the variance of the robust Bellman operator. Firstly, we make error decomposition of the robust Bellman operator variance. 
\eqenv{
\text{Var}\brac{\widehat {\mathcal{T}}^{\rho_{KL}(\sigma)} (Q)(s,a) }
&= \text{Var}\brac{\widehat  r^{\rho_{KL}(\sigma)}+ \gamma \widehat{v}^{\rho_{KL}(\sigma)}(Q)(s,a) }
\\&= \text{Var}\brac{\widehat  r^{\rho_{KL}(\sigma)}}
+\gamma^2 \Varr{\widehat{v}^{\rho_{KL}(\sigma)}(Q)(s,a)}.
}
For convenience, we analyze the second term in the above equation. The first term can be bounded similarly. 
\eqenv{
\Varr{\widehat{v}^{\rho_{KL}(\sigma)}(Q)(s,a)}= 
\E\Fbrac{\brac{\widehat{v}^{\rho_{KL}(\sigma)}(Q)(s,a)}^2  }-\brac{\E\Fbrac{\widehat{v}^{\rho_{KL}(\sigma)}(Q)(s,a) }}^2
\leq \E\Fbrac{\brac{\widehat{v}^{\rho_{KL}(\sigma)}(Q)(s,a)}^2  }.
}
Next, according to \cref{eq:delta,eq:hatq}, we take the expectation over $N_2$ and expand the second moment as follows:
\eqenv{\label{eqeq80}
\E\Fbrac{\brac{\widehat{v}^{\rho_{KL}(\sigma)}(Q)(s,a)}^2  }& = \E\Fbrac{\brac{V(s'_{s,a,0})+ \frac{\delta^{\rho_{KL}(\sigma)}_{s,a,N_2}(Q)(s,a) }{P_{N_2}} }^2}
\\& \leq 2 \E\Fbrac{V(s'_{s,a,0})^2 }+ 2\E \Fbrac{ \brac{\frac{\delta^{\rho_{KL}(\sigma)}_{s,a,N_2}(Q)(s,a) }{P_{N_2}} }^2 }
\\& \leq \frac{2r^2_{\max}}{(1-\gamma)^2\sigma^2}+ 2\sum_{N=0}^{N_{\max}} \E\Fbrac{\brac{\frac{\delta^{\rho_{KL}(\sigma)}_{s,a,N_2}(Q)(s,a) }{P_{N_2}} }^2 \,\Big|\, N_2=N} P_N
\\& \leq \frac{2r^2_{\max}}{(1-\gamma)^2\sigma^2}+ 2\sum_{N=0}^{N_{\max}} \frac{\E\Fbrac{(\delta^{\rho_{KL}(\sigma)}_{s,a,N}(Q)(s,a))^2} }{P_{N}}.
}
Next, we bound the term $\lbrac{\delta^{\rho_{KL}(\sigma)}_{s,a,N}(Q)(s,a) } $,
\eqenv{
\lbrac{\delta^{\rho_{KL}(\sigma)}_{s,a,N}(Q)(s,a) }=
\lbrac{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}(\widehat p_{s,a,2^{N+1}},\alpha,V)} 
     -\frac{1}{2} \sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}(\widehat p^E_{2^{N}},\alpha,V)}
    -\frac{1}{2}\sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}(\widehat p^O_{2^{N}},\alpha,V )}}. 
}

Then, we make an error decomposition as follows:
\eqenv{
\lbrac{\delta^{\rho_{KL}(\sigma)}_{s,a,N}(Q)(s,a) }^2&=
\lbrac{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}(\widehat p_{s,a,2^{N+1}},\alpha,V)} 
     -\frac{1}{2} \sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}(\widehat p^E_{2^{N}},\alpha,V)}
    -\frac{1}{2}\sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}(\widehat p^O_{2^{N}},\alpha,V )}}^2
    \\& \leq 3\lbrac{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}(\widehat p_{s,a,2^{N+1}},\alpha,V)}-\boldsymbol{\bar f^*}^{\rho_{KL}(\sigma)}\brac{\hat p_n, V(s'_{s,a})}  }^2
   \\ &\qquad +  \frac{3}{2}\lbrac{ \sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}(\widehat p^E_{2^{N}},\alpha,V)}-\boldsymbol{\bar f^*}^{\rho_{KL}(\sigma)}\brac{\hat p_n, V(s'_{s,a})}}^2
    \\&\qquad+\frac{3}{2}\lbrac{\sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}(\widehat p^O_{2^{N}},\alpha,V )}-\boldsymbol{\bar f^*}^{\rho_{KL}(\sigma)}\brac{\hat p_n, V(s'_{s,a})}}^2.\label{eqeq82}
    % \\& \myineq{}{}
}

Then, combined with the analysis in \cref{eqeq74,eqeq75}, we can conclude that when $N\leq \frac{\log(1+p^{-2}_\wedge\log(2|\mathcal{S}|)\log T )}{\log 2}$, we have
\eqenv{
    \lbrac{\delta^{\rho_{KL}(\sigma)}_{s,a,N}(Q)(s,a) }^2\leq \brac{ \frac{r_{\max}}{1-\gamma}}^2,\qquad {\frac{1}{P_N}}= 2^N \leq 1+p^{-2}_\wedge\log(2|\mathcal{S}|)\log T.
}





When $N>\frac{\log(1+p^{-2}_\wedge\log(2|\mathcal{S}|)\log T )}{\log 2}$, consider the fact $\mathbb P(A\cap B\cap C)\geq 1- \mathbb P(\neg A)-\mathbb P(\neg B)-\mathbb P(\neg C)$, by \cref{eqeq74},
 with probability at least $1-3*2^{-N} $
% Plug \cref{eqeq52} in \cref{}
\eqenv{
\lbrac{\delta^{\rho_{KL}(\sigma)}_{s,a,N}(Q)(s,a) }^2
&\leq 3\brac{C_{KL}\frac{r_{\max}}{p_\wedge(1-\gamma)\sigma} 2^{-\frac{N+1}{2}}}^2 + \frac{3}{2}\brac{C_{KL}\frac{r_{\max}}{p_\wedge(1-\gamma)\sigma}  2^{-\frac{N}{2}} }^2+ \frac{3}{2}\brac{C_{KL}\frac{r_{\max}}{p_\wedge(1-\gamma)\sigma}  2^{-\frac{N}{2}} }^2
\\& = \frac{9}{2}\frac{C^2_{KL} r^2_{\max} }{p_\wedge^2(1-\gamma)^2\sigma^2}2^{-(N+1)}, 
}
 Since $ 0 \leq \sup_{\alpha\geq 0}\varbrac{f^{\rho_{KL}(\sigma)}(q,\alpha,V)}\leq \frac{r_{\max}}{1-\gamma}$ for any distribution $q$, on the complementary event, which has probability at most $3\cdot 2^{-N}$,
we still have that
\eqenv{
 \lbrac{\delta^{\rho_{KL}(\sigma)}_{s,a,N}(Q)(s,a) }^2
 \leq \brac{ \frac{r_{\max}}{1-\gamma}}^2.
}

Combining the two cases, we get that
\eqenv{
\E\Fbrac{\lbrac{\delta^{\rho_{KL}(\sigma)}_{s,a,N}(Q)(s,a) }^2 }\leq \frac{9}{2}\frac{C^2_{KL} r^2_{\max} }{p_\wedge^2(1-\gamma)^2\sigma^2}2^{-(N+1)}+ \brac{ \frac{r_{\max}}{1-\gamma}}^23*2^{-N}.
% \leq \brac{9 C^2_{\chi^2}+ 3}\brac{ \frac{r_{\max}}{1-\gamma}}^2 2^{-N}. 
\label{eqeq86}
}
Then, combining \cref{eqeq80,eqeq86}, we obtain the following bound on the variance of the robust Bellman operator:
\eqenv{\label{eqeq83}
\Varr{\widehat{v}^{\rho_{KL}(\sigma)}(Q)(s,a)}&\leq \frac{2r^2_{\max}}{(1-\gamma)^2\sigma^2}+ 2\sum_{N=0}^{N_{\max}} \frac{\E\Fbrac{(\delta^{\rho_{KL}(\sigma)}_{s,a,N}(Q)(s,a))^2} }{P_{N}}
\\&\leq \frac{2r^2_{\max}}{(1-\gamma)^2\sigma^2}+\sum_{N=0}^{N_{\max}} \brac{ \brac{ \frac{r_{\max}}{1-\gamma}}^2+
1+p^{-2}_\wedge\log(2|\mathcal{S}|)\log T+ \frac{9}{2}\frac{C^2_{KL} r^2_{\max} }{p_\wedge^{2}(1-\gamma)^{2}\sigma^2}+3 \brac{ \frac{r_{\max}}{1-\gamma}}^2}
\\& \leq\brac{2 +(N_{\max}+1)\brac{5+\log(2|\mathcal{S}|\log T)(1-\gamma) + \frac{9C^2_{KL}}{2} } }\frac{r^2_{\max}}{p_\wedge^2 (1-\gamma)^2 \sigma^2}.
}

Set $C_{\text{var}}= 2 +(N_{\max}+1)\brac{5+\log(2|\mathcal{S}|\log T)(1-\gamma) + \frac{9C^2_{KL}}{2} }  $, then 
$$\Varr{\widehat{v}^{\rho_{KL}(\sigma)}(Q)(s,a)}\leq C_{\text{var}}\frac{r^2_{\max}}{p_\wedge^2 (1-\gamma)^2 \sigma^2}. $$

Similarly, we can bound the variance $\text{Var}\brac{\widehat  r^{\rho_{KL}(\sigma)}}$ as follows:
\eqenv{
\text{Var}\brac{\widehat  r^{\rho_{KL}(\sigma)}}\leq 
C_{\text{var}}\frac{r^2_{\max}}{p_\wedge^2  \sigma^2}.
}

Hence, we can get the robust Bellman operator variance bound:
\eqenv{\label{eqeq88}
\text{Var}\brac{\widehat {\mathcal{T}}^{\rho_{KL}(\sigma)} (Q)(s,a) }
&= \text{Var}\brac{\widehat  r^{\rho_{KL}(\sigma)}}
+\gamma^2 \Varr{\widehat{v}^{\rho_{KL}(\sigma)}(Q)(s,a)}
\leq \frac{ C_{\text{var}}}{p_\wedge^2  \sigma^2}\brac{r^2_{\max}+ \frac{\gamma^2 r^2_{\max}}{(1-\gamma)^2}}.
}
   This completes the proof.  
\end{proof}
\begin{theorem}[Restatement of \cref{thm:kl}]
If we set $g=\frac{1}{2}$ and the stepsize as $$\alpha_t=\alpha=\frac{\log T}{(1-\gamma)T}. $$ Then the output of \cref{alg:example} satisfies that:
 \begin{align}
    \mathbb E &\Fbrac{\mynorm{\widehat Q_T^{\rho_{KL}(\sigma)}-Q^{*\rho_{KL}(\sigma)}}_\infty^2}\nonumber
  \leq\mathcal{O}\brac{\frac{1}{ p_\wedge^2 (1-\gamma)^5 T}}.
\end{align}
To ensure 
\begin{align}
    \mathbb E \Fbrac{\mynorm{\widehat Q_T^{\rho_{KL}(\sigma)}-Q^{*\rho_{KL}(\sigma)}}_\infty^2}\leq \epsilon^2,\nonumber
\end{align}
the expected total sample complexity $N^{\rho_{KL}(\sigma)}(\epsilon)$ is
 \begin{align}
     N^{\rho_{KL}(\sigma)}(\epsilon) = |\mathcal{S}||\mathcal{A}|N_{\max} T\geq \mathcal{O}\brac{\frac{|\mathcal{S}||\mathcal{A}|}{ p_\wedge^2 (1-\gamma)^5 \epsilon^2 }}.\nonumber
 \end{align}
\end{theorem}
\begin{proof}
We consider the stochastic iteration that
\eqenv{
\widehat Q^{\rho_{KL}(\sigma)}_{t+1}=\widehat Q^{\rho_{KL}(\sigma)}_{t}+ \beta_t\brac{\boldsymbol{\widehat{\mathcal{T}}}^{\rho_{KL}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{KL}(\sigma)}_{t}}- \widehat Q^{\rho_{KL}(\sigma)}_{t}+ W_t  },
}
where $W_t={\widehat{\mathcal{T}}}^{\rho_{KL}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{KL}(\sigma)}_{t}}-\boldsymbol{\widehat{\mathcal{T}}}^{\rho_{KL}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{KL}(\sigma)}_{t}} $. 

Define the filtration $\mathcal{F}_t=\varbrac{Q^{\rho_{KL}(\sigma)}_0, W_0,...,Q^{\rho_{KL}(\sigma)}_{t-1},W_{t-1},Q^{\rho_{KL}(\sigma)}_{t} } $. 
Then, by \cref{def:4.1,thm:kl}, we can get that 
\eqenv{
\E\Fbrac{W_t|\mathcal{F}_t }=0,
}
and
\eqenv{
\E\Fbrac{\norminf{W_t}^2|\mathcal{F}_t }
&\leq \max_{s,a}{\E\Fbrac{\twonorm{{\widehat{\mathcal{T}}}^{\rho_{KL}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{KL}(\sigma)}_{t}}(s,a)-\boldsymbol{\widehat{\mathcal{T}}}^{\rho_{KL}(\sigma)}_{N_{\max}}\brac{Q^{\rho_{KL}(\sigma)}_{t}}(s,a)}^2|\mathcal{F}_t }}
\\& \leq \max_{s,a} \varbrac{\text{Var}\brac{\widehat {\mathcal{T}}^{\rho_{KL}(\sigma)}_{N_{\max}} (Q^{\rho_{KL}(\sigma)}_{t})(s,a)  } }
\\& \myineq{\leq}{i} \frac{ C_{\text{var}}}{p_\wedge^2  \sigma^2}\brac{r^2_{\max}+ \gamma^2\norminf{Q^{\rho_{KL}(\sigma)}_{t} }^2 }
\\& \leq\frac{r^2_{\max} C_{\text{var}}}{p_\wedge^2 (1-\gamma)^2 \sigma^2}\brac{1+\frac{\gamma^2}{(1-\gamma)^2}},
} where $(i) $ follows from \cref{eqeq88}. 

According to \cref{eqeq35}, we have that
$$\widehat Q^{*\rho_{KL}(\sigma)}(s,a) = \hatT (\widehat Q^{*\rho_{KL}(\sigma)})(s,a)=\E\Fbrac{\hatt (\widehat Q^{*\rho_{KL}(\sigma)})(s,a)}.$$
Then, to apply \cref{lm:chen}~\cite{chen2020finite}, we set the constant stepsize
$$\beta_t=\beta= \min \varbrac{\frac{2\log T}{(1-\gamma)T}, \frac{(1-\gamma)^2{p_\wedge^2  \sigma^2}}{32 e r^2_{\max} { C_{\text{var}}}\log(|\mathcal{S}||\mathcal{A}|) }} .$$
We can conclude that
\eqenv{\label{eqeq661}
\E\Fbrac{\norminf{\widehat Q_{T}^{\rho_{KL}(\sigma)}-\widehat Q^{*\rho_{KL}(\sigma)} }^2}&\leq \frac{3}{2} \norminf{\widehat Q_{0}^{\rho_{KL}(\sigma)}-\widehat Q^{*\rho_{KL}(\sigma)} }^2\prod_{j=0}^{T-1} \brac{1-\frac{1-\gamma}{2} \beta_j} \\& \qquad+\frac{16 e \log (|\mathcal{S}||\mathcal{A}|)}{1-\gamma}\frac{r^2_{\max} C_{\text{var}}}{p_\wedge^2 (1-\gamma)^2 \sigma^2}\brac{1+\frac{\gamma^2}{(1-\gamma)^2}}\sum_{i=0}^{T-1} \beta_i^2 \prod_{t=i+1}^{T-1} (1-
\frac{1-\gamma}{2}\beta_t)
\\& \leq \frac{3}{2}\frac{r^2_{\max}}{(1-\gamma)^2} \frac{1}{T} +\frac{16 e \log (|\mathcal{S}||\mathcal{A}|)}{1-\gamma}\frac{r^2_{\max} C_{\text{var}}}{p_\wedge^2 (1-\gamma)^2 \sigma^2}\brac{1+\frac{\gamma^2}{(1-\gamma)^2}}\frac{4 \log T }{(1-\gamma)^2 T}
. 
}


Set $N_{\max}=\frac{2\log T}{\log 2}$. Then, we make the decomposition and get the bound of $\mathbb E \Fbrac{\mynorm{\widehat Q_T^{\rho_{KL}(\sigma)}-Q^{*\rho_{KL}(\sigma)}}_\infty^2}$ as follows
 \eqenv{
 \mathbb E& \Fbrac{\mynorm{\widehat Q_T^{\rho_{KL}(\sigma)}-Q^{*\rho_{KL}(\sigma)}}_\infty^2}
 \leq 
 2\E\Fbrac{\mynorm{\widehat Q_T^{\rho_{KL}(\sigma)}-\widehat Q^{*\rho_{KL}(\sigma)}}_\infty^2 }+ 2\E\Fbrac{\mynorm{\widehat Q^{*\rho_{KL}(\sigma)}-Q^{*\rho_{KL}(\sigma)}}_\infty^2 }
 \\& \myineq{\leq}{i}
  \frac{2r^2_{\max}}{(1-\gamma)^2 T} +\frac{32 e \log (|\mathcal{S}||\mathcal{A}|)}{1-\gamma}\frac{r^2_{\max} C_{\text{var}}}{p_\wedge^2 (1-\gamma)^2 \sigma^2}\brac{1+\frac{\gamma^2}{(1-\gamma)^2}}\frac{4 \log T }{(1-\gamma)^2 T}
\\& \qquad+\frac{2}{1-\gamma}\brac{\frac{r_{\max}}{(1-\gamma)\sigma} C_{KL}\frac{ 1}{p_\wedge2^\frac{N_{\max}+1}{2}}+2^{-(N_{\max}+1)}\frac{r_{\max}}{1-\gamma}}^2
\\& \leq  \frac{2r^2_{\max}}{(1-\gamma)^2 T} +\frac{32 e \log (|\mathcal{S}||\mathcal{A}|)}{1-\gamma}\frac{r^2_{\max} C_{\text{var}}}{p_\wedge^2 (1-\gamma)^2 \sigma^2}\brac{1+\frac{\gamma^2}{(1-\gamma)^2}}\frac{4 \log T }{(1-\gamma)^2 T}
+\frac{2}{1-\gamma}\brac{\frac{r_{\max}C_{KL}}{(1-\gamma)p_\wedge\sigma}+\frac{r_{\max}}{1-\gamma}}^2\frac{1}{T}
\\&= \mathcal{O}\brac{\frac{1}{(1-\gamma)^5p^2_\wedge\sigma^2 T}},
 }where $(i)$ follows from \cref{eqeq661} and \cref{thm:kl1}.

This completes the proof.     
\end{proof}

% \begin{proof}
%     According to \cref{eqeq35}, we have that
% $$\widehat Q^{*\rho_{KL}(\sigma)}(s,a) = \hatT (\widehat Q^{*\rho_{KL}(\sigma)})(s,a)=\E\Fbrac{\hatt (\widehat Q^{*\rho_{K}(\sigma)})(s,a)}.$$
% By \cref{thm:kl1}, we can get that
% \eqenv{
% 2 C_{\text{var}}\frac{1}{p_\wedge^2 (1-\gamma)^2 \sigma^2}.
% }

% By the similar way as the proof of \cref{thm2:tv}, we can get that
% \eqenv{
% \mathbb E& \Fbrac{\mynorm{\widehat Q_T^{\rho_{KL}(\sigma)}-Q^{*\rho_{KL}(\sigma)}}_\infty^2}
%  \\& \leq
% \frac{\log T}{T^2}\frac{r^2_{\max}}{(1-\gamma)^2} 
% +  \frac{2\log T}{(1-\gamma)^2 T}\brac{ \frac{2C_{\text{var}}}{p_\wedge^2 (1-\gamma)^2 \sigma^2}}
% +\frac{2}{1-\gamma}\brac{\frac{r_{\max}}{(1-\gamma)\sigma} C_{KL}\frac{ 1}{p_\wedge}+\frac{r_{\max}}{1-\gamma}}^2\frac{1}{T}
% \\& \leq \mathcal{O}\brac{\frac{1}{p^2_\wedge(1-\gamma)^2\sigma^2}}.
% }
% This completes the proof. 
% \end{proof}
% You can have as much text here as you want. The main body must be at most $8$ pages long.
% For the final version, one more page can be added.
% If you want, you can use an appendix like this one.  

% The $\mathtt{\backslash onecolumn}$ command above can be kept in place if you prefer a one-column appendix, or can be removed if you prefer a two-column appendix.  Apart from this possible change, the style (font size, spacing, margins, page numbering, etc.) should be kept the same as the main body.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


\end{document}


% This document was modified from the file originally made available by
% Pat Langley and Andrea Danyluk for ICML-2K. This version was created
% by Iain Murray in 2018, and modified by Alexandre Bouchard in
% 2019 and 2021 and by Csaba Szepesvari, Gang Niu and Sivan Sabato in 2022.
% Modified again in 2023 and 2024 by Sivan Sabato and Jonathan Scarlett.
% Previous contributors include Dan Roy, Lise Getoor and Tobias
% Scheffer, which was slightly modified from the 2010 version by
% Thorsten Joachims & Johannes Fuernkranz, slightly modified from the
% 2009 version by Kiri Wagstaff and Sam Roweis's 2008 version, which is
% slightly modified from Prasad Tadepalli's 2007 version which is a
% lightly changed version of the previous year's version by Andrew
% Moore, which was in turn edited from those of Kristian Kersting and
% Codrina Lauth. Alex Smola contributed to the algorithmic style files.
