\documentclass[accepted]{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
\usepackage{url}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>} typesets <b><sep><a>: the two mandatory arguments in
% reverse order around the optional separator <sep> (default: -).
\newcommand{\swap}[3][-]{#3#1#2} % just an example
%\newcommand{\dbtilde}[1]{\accentset{\approx}{#1}}

%\title{Optimistic }

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1,2]{Bingshan Hu}
\author[3]{Tianyue H. Zhang}
\author[1,4]{Nidhi Hegde}
\author[3,4]{Mark Schmidt}
%\author[1]{Further~Coauthor}
%\author[3]{Further~Coauthor}
%\author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
    Department of Computing Science\\
    University of Alberta\\
    Edmonton, AB, Canada
}
\affil[2]{%
    Alberta Machine Intelligence Institute (Amii)\\
    Edmonton, AB, Canada
}
\affil[3]{%
    Department of Computer Science, University of British Columbia\\
    Vancouver, BC, Canada\\
  }

  \affil[4]{%
    Canada CIFAR AI Chair\\Alberta Machine Intelligence Institute (Amii), Canada
    
  }

  \usepackage{amsmath}
\usepackage{amssymb}
\usepackage{graphicx}
\usepackage{amsthm}


\usepackage{xcolor}

\usepackage{hyperref}
\hypersetup{
    colorlinks=true,
    linkcolor=blue,
    filecolor=blue,      
    urlcolor=blue,
    citecolor=blue,
}

\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{caption}
%\usepackage{algcompatible}
%\usepackage{algpseudocode}


% Theorem-like environments. Every environment below is declared with the
% optional [theorem] argument, so they all share the single `theorem' counter
% (e.g. a lemma following Theorem 1 is numbered Lemma 2).
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{remark}[theorem]{Remark}
\newtheorem{fact}[theorem]{Fact}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{corollary}[theorem]{Corollary}
%\newtheorem{definition}[theorem]{Definition}



%\usepackage[english]{babel}
\usepackage{bm}

% Sign operator for math mode. Declared as a math operator (amsmath, loaded
% above via mathtools) so that it is always typeset upright with operator
% spacing; \text{sign} would inherit the surrounding text font and turn
% italic inside theorem-like environments.
\DeclareMathOperator{\sign}{sign}

% \usepackage{xr}
% \makeatletter

% \newcommand*{\addFileDependency}[1]{% argument=file name and extension
% \typeout{(#1)}% latexmk will find this if $recorder=0
% % however, in that case, it will ignore #1 if it is a .aux or 
% % .pdf file etc and it exists! If it doesn't exist, it will appear 
% % in the list of dependents regardless)
% %
% % Write the following if you want it to appear in \listfiles 
% % --- although not really necessary and latexmk doesn't use this
% %
% \@addtofilelist{#1}
% %
% % latexmk will find this message if #1 doesn't exist (yet)
% \IfFileExists{#1}{}{\typeout{No file #1.}}
% }\makeatother

% \newcommand*{\myexternaldocument}[1]{%
% \externaldocument{#1}%
% \addFileDependency{#1.tex}%
% \addFileDependency{#1.aux}%
% }
% %------------End of helper code--------------

% % put all the external documents here!
% \myexternaldocument{hu_579}

\title{Optimistic Thompson Sampling-based Algorithms for Episodic Reinforcement Learning (Supplementary material)}


  \begin{document}
\maketitle
\begin{abstract}

%{\color{blue}Bingshan has  not revised the abstract yet...}


We propose two  Thompson Sampling-like, model-based learning algorithms  for episodic Markov decision processes (MDPs) with a finite time horizon.
Our proposed algorithms are inspired by Optimistic Thompson Sampling (O-TS), empirically studied in \citet{chapelle2011empirical,may2012optimistic} for stochastic multi-armed bandits. The key idea for the original O-TS is to clip the posterior distribution in an optimistic way  to ensure that the sampled models are always better than the empirical models. Both of our proposed algorithms are easy to implement and only need one posterior sample to construct an episode-dependent model. Our first  algorithm, Optimistic Thompson Sampling for MDPs (O-TS-MDP), achieves a $\widetilde{O} \left(\sqrt{AS^2H^4T} \right)$ regret bound, where $S$ is the size of the state space, $A$ is the size of the action space, $H$ is the number of time-steps per episode and $T$ is the number of episodes. Our second algorithm, Optimistic Thompson Sampling plus for MDPs (O-TS-MDP$^+$),  achieves the (near)-optimal 
 $\widetilde{O} \left(\sqrt{ASH^3T} \right)$ regret bound by taking a more aggressive clipping strategy.  Since O-TS was only empirically studied previously, we derive regret bounds for O-TS for stochastic bandits. In addition, we propose O-TS-Bandit$^+$, a randomized version of UCB1 \citep{auer2002finite}, for stochastic bandits. Both O-TS and O-TS-Bandit$^+$ achieve the optimal $O\left(\frac{A\ln(T)}{\Delta} \right)$ problem-dependent regret bound, where $\Delta$ denotes the sub-optimality gap.


%O-TS-MDP is a model-based algorithm with randomized value functions and only needs one posterior sample. %However, O-TS-MDP does not rely on the principle of\emph{optimism in the face of uncertainty (OFU)}, i.e., O-TS-MDP is not an optimistic learning algorithm. 
%There are two key ingredients in O-TS-MDP. The first one is the usage of the boosted variance of the posterior distribution to drive exploration. The second one is the usage of Optimistic Thompson Sampling \citep{chapelle2011empirical,may2012optimistic} to clip the posterior sample to the mean of the posterior distribution. These two ideas together contribute to yielding a simple and elegant model-based algorithm with $\widetilde{O} \left(\sqrt{AS^2H^4T} \right)$ regret bound. % among {\color{red}all the non-OFU-based algorithms with randomized value functions.}

 
 %O-TS-MDP$^+$ takes a more aggressive way to do the clipping, i.e.,  O-TS-MDP$^+$ clips the posterior sample to the upper confidence bound.
 


%{\color{red}The key ingredient is to introduce Optimistic Thompson Sampling in designing learning algorithms}


% The regret bound for O-TS-MDP matches the regret bound in \citep{pacchiano2021towards}.

% We also use the same proof technique to show a problem-independent regret bound stochastic bandits. 

% Two ingredients in the O-TS-MDP: (a) widen the posterior distribution that models the mean reward to favor exploration ; (b) clip the left side of posterior distribution ; (a) and (b) together reshapes the posterior distribution. The clipping simplify the analysis of RVSI... the fundamental reason why it works is the reshaped 


 

\end{abstract}

\section{Introduction}\label{sec:intro}
 
%The key features distinguishing reinforcement learning tasks from the classical supervised learning tasks are the \emph{lack of data before learning and limited information acquired during with  learning}. We can view RL as a two-player game, where a \emph{learning agent} interacts with the \emph{environment} actively and sequentially. The learning agent interacts with the environment sequentially. 

 Reinforcement learning (RL) algorithms have been widely implemented  in real-world applications such as  autonomous driving, image processing, natural language processing, financial modeling, gaming, etc. Typically, an RL task can be formulated as a Markov decision process (MDP) with a state space, an action space, and a state transition function. In each time-step, 
 the learning agent visits a state,  plays an action, and transitions to the next state. %. After playing an action, the learning agent obtains a reward associated with the played action and transits to the next state. The rewards associated with the remaining states and actions remain hidden. 
 In this paper, we consider the learning problem of episodic, non-stationary MDPs with $S$ states, $A$ actions, and a finite time horizon $H$. %An episodic MDP can be viewed as a sequential two-player game between a learning agent and an unknown environment.
 In each round $t=1,2,\dotsc, H$ belonging to episode $k =1,2,\dotsc, T$, the learning agent visits a state and plays an action according to an action-sampling strategy. Then, the learning agent receives a random reward drawn from a fixed but unknown reward distribution and transitions to the next state sampled from a fixed but unknown transition probability distribution associated with the played action. The goal of the learning agent is to take actions wisely to maximize the cumulative reward over $T$ episodes. 
  The learning agent faces an exploitation-vs-exploration dilemma. In a single round, the learning agent can either choose an action that empirically performs the best so far to maximize the cumulative reward (exploitation) or choose an action that has not been played too often to learn the parameters of the associated unknown distributions (exploration), but it cannot do both. %{\color{red}There are multiple types of algorithms that are successful in achieving the trade-off between exploration and exploitation. }
 
Upper Confidence Bound (UCB)-based algorithms, inspired by the philosophy of optimism in the face of uncertainty (OFU), can achieve a balance between exploitation and exploration. The high-level idea behind this class of algorithms is to construct upper confidence bounds by adding an extra term to the empirical estimates. The additive  term encourages the learning agent to play actions that have not been played too often. %is  inversely proportional to the square root of the  number of times that this action has been taken. That is also to say, for the actions that have   not been played enough, they will have higher bonus to boost the chance they will be played. 
%Usually, the UCB-based algorithms are also optimistic algorithms, i.e., the parameters in the constructed model is lower bounded by the true parameter. 
Many existing episodic MDP learning algorithms \citep{azar2017minimax,dann2017unifying,zanette2019tighter,dann2019policy,zhang2020almost,tiapkin2022dirichlet} are UCB-based. Notably, two model-based algorithms, UCBVI  \citep{azar2017minimax} and Bayes-UCBVI \citep{tiapkin2022dirichlet}, enjoy the (near)-optimal $\widetilde{O} \left(\sqrt{ASH^3T} \right)$ regret bound.\footnote{The  $\widetilde{O} (\cdot) $ notation  only hides poly $\log(ASHT)$ factors.}
 %The key feature of OFU-inspired algorithm is they are optimistic, i.e., the constructed model is better than the true model...
UCB-based algorithms usually do not randomize the obtained data, and the exploration is driven by the additive terms.
 
%To tackle the exploitation-vs-exploration dilemma, 
Another class of algorithms perturbs the obtained data in a certain way to encourage the learning agent to visit states and actions that have been less explored. 
 %Based on whether the value functions are  randomized or deterministic, we have the deterministic or the randomized algorithms.\cite{azar2017minimax,tiapkin2022dirichlet,dann2017unifying} 
The extra randomness can be achieved by injecting noise into the  data.  As shown in \cite{osband2019deep}, learning algorithms with random noise efficiently drive exploration.  Many  prominent state-of-the-art algorithms \citep{russo2019worst,pacchiano2021towards,xiong2021near,agrawal2021improved} for episodic MDPs are developed by adding calibrated Gaussian noise to the   empirical  estimates of the rewards. Although  the  transition probability distribution is unknown, the aforementioned learning algorithms do not add any noise to the empirical estimates of the transition probability distributions.
Among them, a model-based algorithm, NARL-UCBVI \citep{pacchiano2021towards}, and a model-free algorithm, C-RLSVI \citep{agrawal2021improved}, achieve the same $\widetilde{O} \left(\sqrt{AS^2H^4T} \right)$ regret bound. Very recently,  SSR-Bernstein \citep{xiong2021near}, also a model-free learning algorithm, tightens the regret bound to  $\widetilde{O} \left(\sqrt{ASH^3T} \right)$.

Different from adding noise directly to the data, extra randomness can also be achieved by drawing random samples from well-constructed data-dependent distributions. 
Interestingly, if the data-dependent distribution is designed to be the posterior distribution that models an unknown parameter, it coincides with the description of Thompson Sampling \citep{thompson1933likelihood,chapelle2011empirical}, one of the oldest randomized algorithms. For example,   Gaussian distributions can be used to model the means of the reward distributions and Dirichlet distributions can be used to model the transition probability distributions in MDPs.



%Interestingly, if the data-dependent distribution is designed to be the posterior distribution that models unknown parameters, for example, Gaussian distributions to model the means of the reward distributions and Dirichlet distributions to model the transition probability distributions,  it coincides with the description of Thompson Sampling \citep{chapelle2011empirical}, one of the oldest randomized learning algorithms.   




 %Thompson Sampling \citep{thompson1933likelihood} is a Bayes-inspired learning algorithm. 
 
Thompson Sampling was originally invented for stochastic  bandits, which can be viewed as a simple MDP with only one state, $A$ actions, and one round per episode. %In other words, we only have one state with $A$ actions. 
Since there are no transitions between states, the only unknown parameters in a stochastic bandit problem are the means of the reward distributions. Conceptually,
Thompson Sampling plays an action according to the posterior probability distribution of the optimal action. 
Empirically, it is not required to compute the exact posterior probability distribution of the optimal action. Instead, Thompson Sampling  can draw a random sample from the posterior distribution associated with each action and then play the action with the highest posterior sample. 
%i.e., {\color{red}the learning agent is completely safe to behave in a greedy way in the sampled model.}  
The  practical performance of Thompson Sampling  has been studied  extensively by \citet{chapelle2011empirical}. The theoretical performance of Thompson Sampling relies on the choice of the prior distributions. Thompson Sampling with Beta priors is asymptotically optimal \citep{agrawalnear,kaufmann2012thompson}, while Thompson Sampling with Gaussian priors is problem-dependent optimal \citep{agrawalnear}.%, i.e.,  satisfying an $O\left(\frac{A\ln(T)}{\Delta} \right)$ regret bound, where $A$ is the number of arms and $\Delta$ is the sub-optimality gap.}

%Later, the optimal $O \left( \frac{A\ln(T)\Delta}{\text{KL}\left(\mu_* -\Delta , \mu_*\right)}\right)$ problem-dependent regret bounds are presented in \citep{kaufmann2012thompson,agrawalnear}, where $\text{KL}(a,b)$ denotes the KL-divergence between two Bernoulli distributions with parameters $a,b \in (0,1)$ and $\Delta > 0$ denotes the sub-optimal gap from the optimal value $\mu_*$.

In reality, it is not necessary to restrict the data-dependent distribution to the exact posterior distribution.
%There is no advantage of restricting the data-dependent distribution to be the exact posterior distribution. 
 If the data-dependent distribution is designed to be the posterior distribution with some parameters modified, we refer to algorithms of this type as \emph{Thompson Sampling-like algorithms with posterior distribution reshaping}.
%\paragraph{Thompson Sampling with posterior distribution reshaping.}
Optimistic Thompson Sampling (O-TS), a Thompson Sampling-like algorithm with posterior distribution reshaping, was first introduced and empirically evaluated  by 
\cite{chapelle2011empirical,may2012optimistic}.\footnote{Originally, this learning algorithm was called Optimistic Bayesian Sampling (OBS) \citep{may2012optimistic}.} %Then, its empirical performance for stochastic bandits was investigated.  
 The key idea of O-TS is to boost the random posterior sample to the mean of the posterior distribution (Gaussian distribution) if the  random sample is smaller than the mean. In other words, O-TS reshapes the posterior distribution from a Gaussian distribution to a one-sided Gaussian distribution with the left side being clipped.
%To reshape a posterior distribution, both the mean and the variance parameters of the posterior distribution can be modified.
%Optimistic Thompson Sampling can be viewed as {\color{red}a Thompson Sampling-type learning algorithm with posterior distribution reshaping.  } 
There are other Thompson Sampling-like algorithms with reshaped posterior distributions. For stochastic bandits, \cite{jin2021mots} devise MOTS, the first Thompson Sampling-like algorithm achieving minimax optimality. MOTS reshapes the posterior distribution by both clipping the upper tail and boosting the variance of the posterior distribution.  

%Later, {\color{red}\cite{hu2022near} reshape the posterior distribution via adding a bonus term to the mean of the posterior distribution to devise an optimal Thompson Sampling-like algorithm for differentially private stochastic bandits.} 

Different from stochastic bandits where the learning agent only needs to learn  the means of the reward distributions, in episodic MDPs, the  transition probability distributions are also unknown. 
Two Thompson Sampling-like, model-based algorithms, SOS-OPS-RL \citep{agrawal2017posterior} and OPSRL  \citep{tiapkin2022optimistic}, use Dirichlet distributions to model the  transition probability distributions. To drive exploration, they also boost the variance of the Dirichlet distributions.
SOS-OPS-RL  achieves a $\widetilde{O} \left(\sqrt{AS^2H^4T} \right)$ regret bound while OPSRL achieves the (near)-optimal $\widetilde{O} \left(\sqrt{ASH^3T} \right)$ regret bound. %More detailed discussions about SOS-OPS-RL and OPSRL will be presented in Section~\ref{sec: O-TS-MDP}. }%{\color{red} (Bingshan will add more stuff)}



%\paragraph{Model-based MDP algorithms with randomized value functions}
%\cite{pacchiano2021towards,tiapkin2022optimistic,agrawal2017posterior}, O-TS-MDP, O-TS-MDP$^+$.

%\paragraph{Value-based MDP algorithms with randomized value functions}\cite{russo2019worst,xiong2021near,agrawal2021improved}

 %Only very recently, SSR of \cite{ xiong2021near}, an RLSVI-inspired, value-based algorithm, improves the regret bound to $\widetilde{O} \left( \sqrt{SAH^3T}\right)$.
%\paragraph{Model-based MDP algorithms with deterministic value functions}

% \citet{may2012optimistic} first proposes Optimistic Bayesian Sampling (OBS) for contextual bandits. They 

% \footnote{In \citep{chapelle2011empirical}, the name is Optimistic Thompson Sampling; in \citep{may2012optimistic}, the name is Optimistic Bayesian Sampling (OBS).}



%\citep{kaufmann2012thompson}
Now, we list our key contributions in this paper.



(1) We propose O-TS-MDP, a computationally efficient and theoretically elegant model-based learning algorithm with randomized value functions, for episodic MDPs. O-TS-MDP only draws one random sample and enjoys a $\widetilde{O} \left(\sqrt{AS^2H^4T} \right)$ regret bound. There are two key ingredients in O-TS-MDP. The first one is the usage of the boosted variance of the posterior distribution (a Gaussian distribution) to drive exploration. The second one is the usage of O-TS  to clip the left side of the posterior distribution to simplify the theoretical analysis. Although the regret bound of O-TS-MDP is not as tight as those of OPSRL  \citep{tiapkin2022optimistic} and SSR-Bernstein \citep{xiong2021near}, OPSRL needs to draw $\widetilde{O}(1)$ random samples while SSR-Bernstein is a model-free algorithm.  In our regret analysis of O-TS-MDP, we avoid upper bounding the absolute value of the estimation error, thus simplifying the theoretical analysis as compared to the analysis of RLSVI-based algorithms \citep{russo2019worst,agrawal2021improved,xiong2021near}.




 (2) We propose $\text{O-TS-MDP}^+$, a model-based, OFU-inspired optimistic algorithm with randomized value functions.\footnote{We say a learning algorithm is  OFU-inspired and optimistic if the Optimism Decomposition \citep{pacchiano2021towards} can be used to decompose the regret.} %An optimistic learning algorithm guarantees that the parameters in the constructed model  are greater than the parameters in the true model. Meanwhile, the estimation errors are maintained to be low.} %\footnote{\color{red}We say a learning algorithm is OFU-inspired and optimistic if the regret can be decomposed in a certain way. More specifically, if the Optimism Decomposition \citep{pacchiano2021towards} can be used to decompose the regret. The Optimism Decomposition requires the parameters in the constructed model to be greater than the parameters in the true model. Meanwhile, the estimation errors should be maintained low enough.}
$\text{O-TS-MDP}^+$ achieves the (near)-optimal $\widetilde{O} \left(\sqrt{ASH^3T} \right)$ regret bound. The key idea in $\text{O-TS-MDP}^+$ is a more aggressive clipping strategy of the posterior distribution. O-TS-MDP$^+$ boosts the value of the random sample to the upper confidence bound if the random sample is smaller than the upper confidence bound.
    The aggressive clipping contributes to reducing the variance of the reshaped posterior distribution as compared to O-TS-MDP. Consequently, the regret bound is tightened to be (near)-optimal.  O-TS-MDP$^+$ can be viewed as a randomized version of UCBVI \citep{azar2017minimax}. %It is important to note that even though the regret bound of O-TS-MDP$^+$ is (near)-optimal, the learning algorithm itself does not involve a Bernstein type bonus. }

    (3) Although \cite{chapelle2011empirical,may2012optimistic} have demonstrated the empirical performance of  O-TS for stochastic bandits, there is no theoretical analysis for it. We derive regret bounds for O-TS for bandits. In addition, we propose O-TS-Bandit$^+$, an OFU-inspired learning algorithm, for stochastic bandits. 
     Both O-TS and O-TS-Bandit$^+$ achieve the (order)-optimal $O\left(\frac{A\ln(T)}{\Delta} \right)$ problem-dependent regret bound, where $\Delta$ is the sub-optimality gap. 
    

% \begin{enumerate}
%     \item  %O-TS-MDP is not an optimistic algorithm, does not rely on the principle of\emph{optimism in the face of uncertainty (OFU)}, i.e., O-TS-MDP is not an optimistic learning algorithm. 
%  %These two ideas together contribute to yielding a simple and elegant model-based algorithm with $\widetilde{O} \left(\sqrt{AS^2H^4T} \right)$ regret bound. % among {\color{red}all the non-OFU-based algorithms with randomized value functions.}
    
    
%     %It is important to note O-TS-MDP is not an optimistic learning algorithm, i.e.,  O-TS-MDP does not rely on the principle of optimism in the face of uncertainty to construct an optimistic MDP.    %The boosted variance of posterior distribution is to drive exploration; The usage of optimistic TS is to simplify the analysis. 
    
%     \item 
%     %It matches the $\Omega \left(\sqrt{H^3SAT} \right)$ regret lower bound up to logarithmic factors. 
    
%    \item 
% %    \item {\color{red}We conduct experiments to compare the empirical performance among XXXXX.}

%     % and the near-optimal $O(\sqrt{AT\ln(T)})$\footnote{The minimax optimal regret bound is $O \left(\sqrt{AT} \right)$.} problem-independent regret bound, where $A$ is the number of arms and $\Delta$ denotes the sub-optimality gap.

%    % \item We conduct experiments to compare the practical performance among XXXX.
    
% \end{enumerate}



\section{Learning Problem}\label{sec:learning problem}

We consider an episodic non-stationary MDP problem which can be specified by $M=\{\mathcal{S}, \mathcal{A}, H,\bm{P}, \bm{\mu}, p_0 \}$, where $\mathcal{S}$ is a finite state-space with size $S$, $\mathcal{A}$ is a finite action-space with size $A$, $H$ is the finite number of rounds in each episode  and $p_0$ is the deterministic initial state distribution. Let $\bm{P}$ and $\bm{\mu}$ denote the transition function and reward function, respectively. 
The learning agent interacts with the environment in an episodic way with the following learning protocol.
In each round $t \in [H]$ belonging to episode $k$, the learning agent observes a state $s_t^{k}$ and plays an action $a_t^{k}$. Then, the learning agent receives a random reward $X_{s_t^k, a_t^k,t}^{k} \in [0,1]$  that is drawn from  a fixed  reward distribution with mean $\mu_{s_t^{k}, a_t^{k},t} = \bm{\mu}(s_t^{k}, a_t^{k},t)$ and transitions to the next state $s_{t+1}^{k}$ that is sampled from  a fixed transition probability distribution   $P_{s_t^{k}, a_t^{k},t}   =\bm{P}(s_t^{k}, a_t^{k},t)$. The initial state $s_1^k$ is sampled from $ p_0$. The goal of the learning agent is to accumulate as much reward as possible over a finite number of $T$ episodes, i.e., $HT$ rounds in total.

A deterministic policy $\pi = \left(\pi(\cdot, 1), \pi(\cdot, 2), \dotsc, \pi(\cdot, H)\right)$ is a sequence of functions, where each 
 $\pi(\cdot, t) : \mathcal{S} \rightarrow \mathcal{A}$ takes a state  as input and outputs an action that will be played if the learning agent visits that state. Let $\Pi$ collect all such policies.  The value function $V_t^{\pi}(s)$ for state $s$ of  policy $\pi$ in round $t$ is defined as
$V_t^{\pi}(s)  :=  \mu_{s, \pi(s, t),t} + P_{s, \pi(s,t),t}^{\intercal} {V}^{\pi}_{t+1}$, with the convention $V_{H+1}^{\pi} := \Vec{0}$. 
If we knew $\bm{P}$ and $\bm{\mu}$, we could use backwards induction to compute the optimal policy $\pi_*$.  Define $V^{\pi_*}_{H+1} = \Vec{0}$. Then, for each round $t = H, H-1, \dotsc, 1$, for each state $s \in \mathcal{S}$, we compute 
\begin{equation*}
\begin{array}{lll}
\pi_*(s,t) &=& \mathop{\arg\max}\limits_{a \in \mathcal{A}} \left\{\mu_{s,a,t} + P_{s,a,t}^{\intercal}V^{\pi_*}_{t+1} \right\}\quad,\\
    V_{t}^{\pi_*} (s)&= &\mu_{s,\pi_*(s,t),t} + P_{s,\pi_*(s,t),t}^{\intercal}V^{\pi_*}_{t+1} \quad.
    \end{array}
\end{equation*}
%The quality of a  policy $\pi$ is measured by the expected reward  $\mathbb{E}_{s \sim p_0} V_1^{\pi}(s)$. 
The expected regret $ \mathcal{R} ( T)$  over $T$ episodes is defined as
\begin{equation}
  \label{regret: RL}
\begin{array}{l}
     \mathcal{R} ( T)
      = \sum\limits_{k=1}^{T} \mathbb{E} \left[ \left(V_1^{\pi_*}(s_1^k) - V_1^{\pi_k}(s_1^k) \right)\right] \quad,
      % = &\sum\limits_{k=1}^{T} \mathbb{E}_{s \sim p_0}  \mathbb{E} \left[ \left(V_1^{\pi_*}(s) - V_1^{\pi_k}(s) \right) \mid s_1 = s \right],
    \end{array}
\end{equation}
where $\pi_k$ is random and the initial state $s_1^k \sim p_0$. If there exists $(s,t)$ such that $\pi_k(s,t) \ne \pi_*(s,t)$, then regret may occur. 
Define $\mathcal{F}_{k} = \left\{s_t^{q}, a_t^{q}, X_{s_t^{q}, \ a_t^{q}, t}^{q}, t \in [H], q \in [k] \right\}$
as the history trajectory  by the end of episode $k$ following policies $\pi_1, \dotsc, \pi_q, \dotsc, \pi_k$ with the initial state in each episode  independently drawn from $p_0$. Define $\mathcal{F}_0 = \{\}$. %{\color{blue}For all the theoretical analysis in this paper, we fix $s$ as the initial state.}

%\section{Related Work}\label{sec:literature}

% %\subsection{UCB-based algorithms with deterministic value functions}

% Our work is most related to model-based learning algorithms. The key features of model-based learning algorithms is we have the planning phase and running phase. And compared model-free based learning algorithms, it is.... (talk about the disadvantage of model-free based learning algorithm...)

% For model-based learning algorithm, conditioned on history, based on whether the constructed model is deterministic or randomized, 

% One type of algorithms is based on the philosophy of optimism in the face of uncertainty. We construct an MDP instance that is optimistic and then we find the best policy for the constructed MDP in the planning phase. Then, we run this policy in the true MDP. Majority of exsiting learning algorithms here have deterministic value functions.  \citep{azar2017minimax,dann2017unifying,zanette2019tighter,tiapkin2022dirichlet}

% {\color{red} Key features are strong optimism and exploration is driven by the added bonus}




% %Model-based algorithms with deterministic value functions... 

% Model-based algorithms with randomized value functions...
% %Model-free algorithms...\citep{osband2016generalization,xiong2021randomized}

% %\subsection{Algorithms with randomized value functions}

% Injecting noise...\citep{russo2019worst, kveton2019garbage, pacchiano2021towards}

% Thompson sampling ...\citep{agrawal2017posterior,tiapkin2022optimistic}
% However, for these learning algorithms, essentially, they also use the philosophy of optimism in the face of uncertainty. Although the constructed MDP is randomized, they are also optimistic by drawing multiple number of i.i.d. samples from the same posterior distribution. They also boost the variance of the posterior distribution to drive exploration and achieve optimism.

\section{O-TS-MDP and O-TS-MDP$^+$}\label{sec: O-TS-MDP}
% Algorithm float for O-TS-MDP. In each episode k the algorithm
%   (1) plans by backward induction over rounds t = H, ..., 1, using a
%       clipped Gaussian sample for every reward mean and the empirical
%       transition estimates, and
%   (2) runs the resulting greedy policy pi_k and updates the statistics.
% The labels `planning start' / `planning end' mark the first and last
% numbered lines of the planning phase.
\begin{algorithm}[!ht]
	\caption{O-TS-MDP} 
	\label{Optimistic Thompson Sampling}
	\begin{algorithmic}[1]
	\STATE \textbf{Input:} MDP instance $M$, number of episodes $T$
	
% Visit counts, empirical transitions and empirical means all start at zero.
\STATE \textbf{Initialization:} 

Set $\widehat{O}_{s,a,t} \leftarrow 0$, $\widehat{P}_{s,a,t} \leftarrow \Vec{0}$, $\widehat{\mu}_{s,a,t} \leftarrow 0, \forall (s,a,t)$   

\FOR {episode $k = 1, 2, \dotsc,T$} 
% Terminal condition of the backward induction.
\STATE Set $\widetilde{V'}_{H+1}^{\pi_k} = \Vec{0}$%, $\pi_k 
\label{planning start}

\FOR {$t = H,H-1, \dotsc, 1$}
\FOR {$s \in \mathcal{S}$}
\FOR {$a \in \mathcal{A}$}


% One Gaussian sample per (s,a,t); the variance is scaled by a factor SH.
\STATE Draw $\widetilde{\mu}_{s,a,t} \sim \mathcal{N} \left( \widehat{\mu}_{s,a,t}, {\color{red}\left(\sqrt{SH}  \sigma_{s,a,t}^k \right)^2}\right) $

 
% Optimistic clipping: the sample is never allowed below the empirical mean.
{\color{red}Set $\widetilde{\mu}'_{s,a,t} \leftarrow  \max\left\{\widetilde{\mu}_{s,a,t}, \widehat{\mu}_{s,a,t} \right\}$}

 
% Q-value: clipped reward sample plus empirical-transition expectation of the next-round value.
Set $\widetilde{Q}_{s,a,t} \leftarrow \widetilde{\mu}'_{s,a,t} + \widehat{P}_{s,a,t}^{\intercal} \widetilde{V'}^{\pi_k}_{t+1}  $ 
\ENDFOR 

% Greedy policy and value for state s in round t.
\STATE  

Set $\pi_k(s, t) \leftarrow \mathop{\arg\max}_{a \in \mathcal{A}} \widetilde{Q}_{s,a,t}$

Set $\widetilde{V'}^{\pi_k}_{t}(s) \leftarrow \widetilde{Q}_{s,\pi_k(s,t),t}$
\ENDFOR 
\ENDFOR  \label{planning end}

% Execute pi_k and update the statistics of the visited (state, action, round) triples.
\STATE Sample $s_1^k \sim p_0$, run $\pi_k$, and %obtain  a trajectory, %$s_1^k, a_1^k,  \dotsc, s_t^k, a_t^k, \dotsc, s_H^k, a_H^k$
update  $\widehat{\mu}_{s_t^k, \pi_k(s_t^k,t), t}$, $\widehat{O}_{s_t^k, \pi_k(s_t^k,t), t}$, and $\widehat{P}_{s_t^k, \pi_k(s_t^k,t), t}$ for all $t \in [H]$. %Note that $s_{t+1}^k \sim P_{s_t^k, \pi_k(s_t^k,t),t}$
\ENDFOR	
	\end{algorithmic}
\end{algorithm}
We first present some notations used by our proposed algorithms O-TS-MDP and O-TS-MDP$^+$.
Let $\widehat{O}_{s,a,t}^k = \sum_{q=1}^{k} \bm{1} \left\{\left(s_t^q, a_t^q\right) = (s, a) \right\}$ denote the number of times that $(s,a)$ has been visited in round $t$ by the end of episode $k$.
Let $\widehat{\mu}_{s,a,t}^{k} = \frac{1}{\widehat{O}_{s,a,t}^k }\sum_{q=1}^{k} \bm{1} \left\{\left(s_t^q, a_t^q\right) = (s, a)  \right\} X_{s,a,t}^{q}$ denote the empirical mean of $(s,a,t)$ by the end of episode $k$. 
Let $\widehat{P}_{s,a,t}^k(s') = \frac{1}{\widehat{O}_{s,a,t}^k }\sum_{q=1}^{k} \bm{1} \left\{\left(s_t^q, a_t^q\right) = (s, a) , s_{t+1}^{q} = s' \right\}$ denote the empirical transition probability distribution. Let $\sigma_{s,a,t}^{k} := 5\sqrt{ H^2 \log^2\left(\frac{H}{\delta}\right)/\widehat{O}_{s,a,t}^{k-1}}$, where $ \delta = \frac{1}{ASH^2T^2}$. For the case when $(s,a,t)$ has not been visited yet by the end of episode $k-1$, i.e., $\widehat{O}_{s,a,t}^{k-1} = 0$, our algorithms set $\sigma_{s,a,t}^{k}$ to a large constant. 

\subsection{O-TS-MDP}
O-TS-MDP is presented in Algorithm~\ref{Optimistic Thompson Sampling}. The key ingredients in O-TS-MDP are the  boosted variance of the posterior distribution (a Gaussian distribution) to drive exploration and  the usage of O-TS \citep{may2012optimistic,chapelle2011empirical} to clip a Gaussian distribution to a one-sided Gaussian distribution with the left side being truncated. The reshaping of the posterior distribution plays a crucial role in simplifying the algorithm and the theoretical analysis. 

Like other model-based algorithms, in episode $k$, O-TS-MDP  constructs an episode-dependent model $\tilde{M}_k'$ to simulate the true model. %Then, O-TS-MDP finds the optimal policy $\pi_k$ for $\tilde{M}_k'$. 
To construct $\tilde{M}_k'$ with randomized value functions, O-TS-MDP draws a random sample  $\widetilde{\mu}_{s,a,t}^{k} \sim \mathcal{N}\left(\widehat{\mu}_{s,a,t}^{k-1}, SH\left( \sigma_{s,a,t}^{k} \right)^2  \right)$ for each $(s,a,t)$.
If $\widetilde{\mu}_{s,a,t}^{k} < \widehat{\mu}_{s,a,t}^{k-1}$, O-TS-MDP  boosts it to $\widehat{\mu}_{s,a,t}^{k-1}$.  
Let $\widetilde{\mu'}_{s,a,t}^{k}  := \max \left\{\widehat{\mu}_{s,a,t}^{k-1}, \widetilde{\mu}_{s,a,t}^{k} \right\}$. Note that  $\widetilde{\mu'}_{s,a,t}^{k}$ can be viewed as a random variable drawn from distribution $\mathcal{N'}_{s,a,t}^{k}$ with the probability density function (PDF) $f'(x)=$ 
\begin{equation*}
 \left\{ \begin{array}{ll}
        0, &\text{$x < \widehat{\mu}_{s,a,t}^{k-1}$},\\
        \phi \left(x ; \widehat{\mu}_{s,a,t}^{k-1}, SH\left(\sigma_{s,a,t}^{k} \right)^2  \right) + \frac{\delta\left(x-\widehat{\mu}_{s,a,t}^{k-1}\right)}{2}  ,&\text{$x \ge \widehat{\mu}_{s,a,t}^{k-1}$},\end{array} \right.
\end{equation*}
where $\phi(x; \mu, \sigma^2)$ denotes the PDF of $\mathcal{N}\left(\mu, \sigma^2 \right)$ and $\delta(\cdot)$ denotes the Dirac delta function.\footnote{Note that from Algorithm~\ref{Optimistic Thompson Sampling} we can see that to implement O-TS-MDP, it is not required to compute $f'(x)$. We simply draw a random sample from a Gaussian distribution and then compare the sample value with the mean of the Gaussian distribution.} With $\widetilde{\mu'}_{s,a,t}^{k}$ for all $(s,a,t)$ in hand, we construct $ \tilde{M}'_k = \left\{\mathcal{S}, \mathcal{A}, H, \widehat{P}^{k-1} , \widetilde{\mu'}^{k} , p_0 \right\}$
, where $\widehat{P}^{k-1} = \left\{\widehat{P}_{s,a,t}^{k-1}\right\}$  collects all the empirical transition probability distributions by the end of episode $k-1$ and $\widetilde{\mu'}^{k} = \left\{ \widetilde{\mu'}_{s,a,t}^{k}\right\}$ collects all the random samples after the boosting. After constructing $\tilde{M}'_k$, O-TS-MDP uses backwards induction to find the optimal policy $\pi_k$ for  $\tilde{M}'_k$ (shown in Line~\ref{planning start} to Line~\ref{planning end} in Algorithm~\ref{Optimistic Thompson Sampling}). Let $\widetilde{V'}_{t}^{\pi}$  denote the value functions of a fixed policy $\pi$ for  $\tilde{M}'_k$ in round $t$.  

 Now, we present a regret bound for Algorithm~\ref{Optimistic Thompson Sampling}.
\begin{theorem}
\label{regret theorem 1}
The regret of Algorithm~\ref{Optimistic Thompson Sampling} is $\widetilde{O} \left(\sqrt{AS^2H^4T} \right)$.
\end{theorem}
O-TS-MDP is a \emph{computationally efficient and space efficient}  algorithm which only needs one random sample for each $(s,a,t)$ to construct the episode-dependent model. Per episode, the time complexity  is $O(AS^2H)$ and the space complexity is $O(AS^2H)$.
In contrast, OPSRL \citep{tiapkin2022optimistic} and SOS-OPS-RL \citep{agrawal2017posterior} need  multiple posterior samples to construct a model. The improvement of O-TS-MDP comes from the usage of  $\widehat{P}^{k-1}$ to construct the model. Instead, OPSRL and SOS-OPS-RL use Dirichlet random variables to construct the model.
%{\color{red}(here, should talk about the amazing work they have done...)}.
Although O-TS-MDP, OPSRL, and SOS-OPS-RL are all model-based algorithms with randomized value functions, O-TS-MDP is not an optimistic learning algorithm while OPSRL and SOS-OPS-RL are both optimistic algorithms. In other words,  in O-TS-MDP, the value functions $\widetilde{V'}_1^{\pi_*}(s)$ are not guaranteed to be greater than $V_1^{\pi_*}(s)$ with high probability. As shown in the regret analysis, O-TS-MDP only achieves weak optimism. That is, each  $\widetilde{V'}_1^{\pi_*}(s)$ is only guaranteed to be greater than $V_1^{\pi_*}(s)$ with a small constant  probability. %In contrast, . %The value functions of the optimal policy $\pi_*$ for the constructed episode-dependent model are guaranteed to be greater than $V_1^{\pi_*}(s)$ with high probability. 
However, the strong optimism guarantee in OPSRL and SOS-OPS-RL is at the cost of drawing multiple posterior samples. We believe that the regret bound of O-TS-MDP can also be tightened to $\widetilde{O} \left(\sqrt{ASH^3T} \right)$ if drawing $\widetilde{O}(1)$ random samples.

The key idea behind SSR-Bernstein \citep{xiong2021near} to achieve the optimal $\widetilde{O}\left(\sqrt{ASH^3T} \right)$ regret bound is to limit the amount of randomness within the learning algorithm. More specifically, in SSR-Bernstein, in each episode, all tuples $(s, a, t)$ use the same random seed, which is a Gaussian random variable. In other words, for the entire learning algorithm across $T$ episodes, the number of independent Gaussian random variables in SSR-Bernstein is exactly $T$. In contrast, in O-TS-MDP, each tuple $(s,a,t)$ has its own random seed, meaning that the total number of independent Gaussian random variables is $O(ASHT)$. As compared to O-TS-MDP, SSR-Bernstein does not fully randomize its obtained data.


Although O-TS-MDP and the RLSVI-based algorithms \citep{russo2019worst,agrawal2021improved,xiong2021near} share in common  an exploration mechanism,  the introduction of O-TS to clip the left side of the posterior distribution \emph{simplifies the theoretical analysis in two ways}. First, upper bounding the \emph{absolute value of the estimation error} is not needed in the theoretical analysis of O-TS-MDP (Proof of Lemma~\ref{temp 95} in the appendix presents more details). 
All the aforementioned RLSVI-based algorithms  upper bound the absolute value of the estimation error,  which is more complicated than upper bounding the one-sided error, as stated in \cite{xiong2021near}. 
Second, as compared to C-RLSVI of \cite{agrawal2021improved} and SSR of \citet{xiong2021near}, O-TS-MDP does not clip the randomized value functions to  values in $[0,H]$. Consequently, the analysis of O-TS-MDP can reuse the value difference lemma (Lemma~\ref{value difference lemma}) directly. %{\color{red}ADD SOMETHING HERE}. %, which also simplifies the analysis.


 
 %O-TS-MDP is a model-based algorithm while RLSVI is a model-free algorithm.  
 
 %In Section~\ref{sec: O-TS-MDP for bandits}, more details will be presented.

 %{\color{red} When comparing to C-RLSVL, O-TS-MDP does not need to clip the value functions to a value in $[0,H]$. This ensures the analysis of O-TS-MDP can reuse the value difference lemma shown in Lemma~\ref{value difference lemma} and simplify the analysis.  by introducing Optimistic Thompson Sampling.  }
 

 %The regret bound of O-TS-MDP is $\sqrt{SH}$ worse than the regret bounds for OPSRL of \cite{tiapkin2022optimistic} and SSR with Bernstein-type noise of \cite{xiong2021near}. $\Omega\left(\sqrt{ASH^3T}\right)$ lower bound. 

% \item A good place to discuss "optimistic learning algorithm"


 Recall that $\sigma_{s,a,t}^k = 5 \sqrt{H^2 \log^2(H/\delta)/\widehat{O}_{s,a,t}^{k-1}}$. To sketch the regret analysis, we first construct the empirical MDP 
$ \hat{M}_k = \left\{\mathcal{S}, \mathcal{A}, H, \widehat{P}^{k-1} , \widehat{\mu}^{k-1} , p_0 \right\}$, where $\widehat{\mu}^{k-1} = \left\{\widehat{\mu}_{s,a,t}^{k-1} \right\}$ collects all the empirical means.
 Let $\widehat{V}_{t}^{\pi}$ denote the value functions of a fixed policy $\pi$ for $\hat{M}_k$. 
 To decompose the regret, we define two high-probability events: 
 \begin{equation}
  \label{apple}
\begin{array}{lll}
\mathcal{E}^k  & = & \left\{\left| \left(\widehat{P}_{s,a,t}^{k-1} - P_{s,a,t} \right)^{\intercal} V_{t+1}^{\pi_k} \right| \le \sigma_{s,a,t}^{k}, \right.\\
 && \left.\left| \left(\widehat{\mu}_{s,a,t}^{k-1} - \mu_{s,a,t} \right) \right| \le \sigma_{s,a,t}^{k}, \forall (s,a,t)  \right\},\\
&& \\
 \mathcal{E}_{\pi_*}^k   & = & \left\{\left|\left(P_{s,a,t} - \widehat{P}^{k-1}_{s,a,t}\right)^{\intercal}V^{\pi_*}_{t+1} \right|\le \sigma_{s,a,t}^{k}, \right.\\
 && \left. \left|\widehat{\mu}_{s,a,t}^{k-1} - \mu_{s,a,t} \right| \le  \sigma_{s,a,t}^{k}, \forall (s,a,t)  \right\}.
 \end{array}
 \end{equation}
We prepare three lemmas  to prove  Theorem~\ref{regret theorem 1}. Recall $V_1^{\pi_*}(s)$ is the value function of the optimal policy $\pi_*$ for the true MDP $M$ and $\widetilde{V'}_{1}^{\pi_k}(s)$ is the value function of the optimal policy $\pi_k$ for the episode-dependent MDP $\tilde{M}'_k$. Our technical Lemma~\ref{temp 95} upper bounds the expected performance gap between the optimal policy $\pi_*$ over $M$ and the optimal policy $\pi_k$ over $\tilde{M}'_k$. %{\color{red}The proof for Lemma~\ref{temp 95} is similar to the proof of Lemma~6 of \citet{russo2019worst}......}
\begin{lemma}
 \label{temp 95}
(Optimism). In episode $k$,  we have
\begin{equation}
\label{temp 505}
    \begin{array}{ll}
      &  \mathbb{E} \left[\left(V_1^{\pi_*}( s_1^k) - \widetilde{V'}_{1}^{\pi_k}(s_1^k) \right) \bm{1} \left\{\mathcal{E}_{\pi_*}^k \right\}        \right] \\
           \le  & \sqrt{SH}   \sum\limits_{t=1}^{H} \mathbb{E} \left[O \left({\sigma}^k_{s_t^k, \pi_k(s_t^k,t),t} \right) \right].
    \end{array}
\end{equation}
\end{lemma}
Recall that $\widehat{V}_1^{\pi_k}(s)$ is the value function of policy $\pi_k$ over the empirical MDP $\hat{M}_k$.  Lemma~\ref{temp 78} upper bounds the expected performance gap  for a single policy $\pi_k$ over MDPs $\tilde{M}'_k$ and $\hat{M}_k$ and Lemma~\ref{temp 91} upper bounds the expected performance gap for policy $\pi_k$ over MDPs $\hat{M}_k$ and $M$. Note that MDPs $\tilde{M}'_k$ and $\hat{M}_k$ are constructed based on the same $\widehat{P}^{k-1}$ which is determined by $\mathcal{F}_{k-1}$.
\begin{lemma}
\label{temp 78}
(Posterior deviation). In episode $k$, we have
\begin{equation}
    \begin{array}{l}
          \mathbb{E} \left[  \widetilde{V'}_{1}^{\pi_k}(s_1^k) - \widehat{V}_1^{\pi_k}(s_1^k)         \right] 
     \le \sqrt{SH}  \sum\limits_{t=1}^{H} \mathbb{E} \left[{\sigma}^k_{s_t^k, \pi_k(s_t^k,t),t} \right].
     \end{array}
          \end{equation}
\end{lemma}
\begin{lemma}
    \label{temp 91}
(Empirical deviation).    In episode $k$, we have
    \begin{equation}
    \begin{array}{ll}
        & \mathbb{E} \left[ \left(\widehat{V}_{1}^{\pi_k}(s_1^k) - {V}_{1}^{\pi_k}(s_1^k) \right) \bm{1} \left\{ \mathcal{E}^k \right\} \right] \\
        & \le 2\sum\limits_{t=1}^{H}\mathbb{E} \left[\sigma_{s_t^k, \pi_k(s_t^k,t),t}^{k}\right].
         \end{array}
    \end{equation}
\end{lemma}

%After these preparations, we now prove Theorem~\ref{regret theorem 1}.
\begin{proof}[Proof of Theorem~\ref{regret theorem 1}] 
We have
\begin{equation}
\begin{array}{ll}
  &\sum\limits_{k=1}^{T}  \mathbb{E} \left[ \left(V_1^{\pi_*}(s_1^k) - V_1^{\pi_k}(s_1^k) \right)\right] \\
\le   &  \sum\limits_{k=1}^{T}  \mathbb{E}_{}\left[ \underbrace{ \left(V_1^{\pi_*}(s_1^k) - \widetilde{V'}_1^{\pi_k}(s_1^k) \right)  \bm{1} \left\{ \mathcal{E}_{\pi_*}^{k} \right\}}_{ \text{Lemma~\ref{temp 95}}} \right]  \\
+ & \sum\limits_{k=1}^{T}\mathbb{E}\left[\underbrace{ \left(\widetilde{V'}_1^{\pi_k}(s_1^k)-\widehat{V}_1^{\pi_k}(s_1^k) \right)  }_{\text{Lemma}~\ref{temp 78}}\right] \\
+ &  \sum\limits_{k=1}^{T}  \mathbb{E}_{}\left[ \underbrace{ \left(\widehat{V}_1^{\pi_k}(s_1^k) -V_1^{\pi_k}(s_1^k)  \right)  \bm{1} \left\{ \mathcal{E}^{k} \right\} }_{\text{Lemma~\ref{temp 91}}}\right]  \\

+ & H \sum\limits_{k=1}^{T} \underbrace{ \mathbb{P}\left\{\overline{\mathcal{E}^k} \right\} +\mathbb{P}\left\{\overline{\mathcal{E}_{\pi_*}^k} \right\} }_{\text{Lemma~\ref{temp 90}}}\\
\le & \sqrt{SH} \cdot \mathbb{E} \left[\sum\limits_{k=1}^{T}\sum\limits_{t=1}^{H}O \left( {\sigma}^k_{s_t^k, \pi_k(s_t^k,t),t} \right) \right]+ O(1) \\
\le & \widetilde{O} \left(\sqrt{AS^2H^4T} \right)\quad,
    \end{array}
\end{equation}
where the last inequality uses $\sum\limits_{k=1}^{T}\sum\limits_{t=1}^{H}  O \left( {\sigma}^k_{s_t^k, \pi_k(s_t^k,t),t} \right) \le \widetilde{O} \left(\sqrt{ASH^3T} \right)$, a well-known result in the MDP literature \citep{russo2019worst,agrawal2021improved}. 
\end{proof}


\subsection{$\text{O-TS-MDP}^{+}$}

Different from O-TS-MDP where only weak optimism is guaranteed, $\text{O-TS-MDP}^{+}$ is an OFU-inspired optimistic algorithm with randomized value functions.
$\text{O-TS-MDP}^{+}$ takes a more aggressive clipping strategy  to achieve strong optimism. $\text{O-TS-MDP}^{+}$ boosts the random sample to the upper confidence bound if it is smaller than the upper confidence bound. This aggressive clipping strategy contributes to reducing the variance of the posterior distribution as compared to O-TS-MDP, which, consequently, leads to tightening the regret bound to $\widetilde{O}\left(\sqrt{ASH^3T} \right)$. 


Similar to Algorithm~\ref{Optimistic Thompson Sampling}, in each episode, O-TS-MDP$^+$ also constructs a model 
$ \tilde{M}'_k = \left\{\mathcal{S}, \mathcal{A}, H, \widehat{P}^{k-1} , \widetilde{\mu'}^{k}, p_0  \right\}$.
%and then finds the best policy $\pi_k$ for the constructed  model $\tilde{M}'_k$. 
Now, we present how to construct $\widetilde{\mu'}^{k}$.
Let $\overline{\mu}_{s,a,t}^{k} := \widehat{\mu}_{s,a,t}^{k-1} + 2 \sigma_{s,a,t}^{k}$ be the upper confidence bound for $(s,a,t)$ which is determined by $\mathcal{F}_{k-1}$.
At the beginning of episode $k$, for each $(s,a,t)$, $\text{O-TS-MDP}^{+}$ draws a random sample $\widetilde{\mu}_{s,a,t}^{k} \sim \mathcal{N}\left(\widehat{\mu}_{s,a,t}^{k-1}, \left( \sigma_{s,a,t}^{k} \right)^2 \right)$. 
Then, $\text{O-TS-MDP}^{+}$ boosts it to $\overline{\mu}_{s,a,t}^{k}$ if $\widetilde{\mu}_{s,a,t}^{k} < \overline{\mu}_{s,a,t}^{k}$. Let $\widetilde{\mu'}^{k}_{s,a,t} =  \max\left\{\widetilde{\mu}^k_{s,a,t}, \overline{\mu}_{s,a,t}^{k} \right\}$ denote the sample value after the boosting. The PDF for the distribution of $\widetilde{\mu'}^{k}_{s,a,t}$ can be  defined as $f'(x) = 0$ if $x < \overline{\mu}_{s,a,t}^{k}$. Otherwise, $f'(x) = \phi \left(x; \widehat{\mu}_{s,a,t}^{k-1}, \left(\sigma_{s,a,t}^{k}\right)^2 \right) +  \Phi \left(\overline{\mu}_{s,a,t}^{k}; \widehat{\mu}_{s,a,t}^{k-1}, \left(\sigma_{s,a,t}^{k}\right)^2  \right) \delta(x-\overline{\mu}_{s,a,t}^{k})$, 
where $\Phi(x; \mu, \sigma^2)$ denotes the cumulative distribution function (CDF) of $\mathcal{N}\left(\mu, \sigma^2 \right)$. Let $\widetilde{\mu'}^k = \left\{\widetilde{\mu'}_{s,a,t}^k \right\}$ collect all the samples after the boosting. After constructing $\tilde{M}'_k$, $\text{O-TS-MDP}^{+}$  computes the optimal policy $\pi_k$ for $\tilde{M}'_k$ by using backwards induction.  Algorithm~\ref{O-TS-MDP Plus} presents the pseudo-code of $\text{O-TS-MDP}^{+}$. The differences between O-TS-MDP and $\text{O-TS-MDP}^{+}$ are highlighted in Algorithm~\ref{Optimistic Thompson Sampling} and Algorithm~\ref{O-TS-MDP Plus}, respectively.

% {\color{red}(shown in Line~\ref{?} to Line~\ref{?})}. 


\begin{algorithm}[!ht]
	\caption{$\text{O-TS-MDP}^+$} 
	\label{O-TS-MDP Plus}
	\begin{algorithmic}[1]
	\STATE {\bf{Input:}} MDP instance $M$, number of episodes $T$
	
\STATE {\bf{Initialization}:} 

Set $\widehat{O}_{s,a,t} \leftarrow 0$, $\widehat{P}_{s,a,t} \leftarrow \Vec{0}$, $\widehat{\mu}_{s,a,t} \leftarrow 0, \forall (s,a,t)$  

\FOR {episode $k = 1, 2, \dotsc,T$} 
\STATE Set $\widetilde{V'}_{H+1}^{\pi_k} = \Vec{0}$%, $\pi_k \leftarrow \bm{0}_{S \times H}$  %{\color{blue} \text{$\% \ $$\widetilde{V}$ is a $S$-dimensional vector}}

\FOR {$t = H,H-1, \dotsc, 1$}
\FOR {$s \in \mathcal{S}$}
\FOR {$a \in \mathcal{A}$}
\STATE 
Draw $\widetilde{\mu}_{s,a,t} \sim \mathcal{N} \left( \widehat{\mu}_{s,a,t}, {\color{blue}\left(\sigma_{s,a,t}^{k} \right)^2}\right) $

{\color{blue}Set $\overline{\mu}_{s,a,t} \leftarrow \widehat{\mu}_{s,a,t} + 2\sigma_{s,a,t}^{k}$

Set $\widetilde{\mu}'_{s,a,t} \leftarrow  \max\left\{\widetilde{\mu}_{s,a,t}, \overline{\mu}_{s,a,t} \right\}$}

Set $\widetilde{Q}_{s,a,t} \leftarrow \widetilde{\mu}'_{s,a,t} + \widehat{P}_{s,a,t}^{\intercal} \widetilde{V'}^{\pi_k}_{t+1}  $ 
\ENDFOR 

\STATE  Set $\pi_k(s, t) \leftarrow \mathop{\arg\max}_{a \in \mathcal{A}} \widetilde{Q}_{s,a,t}$

Set $\widetilde{V'}^{\pi_k}_{t}(s) \leftarrow \widetilde{Q}_{s,\pi_k(s,t),t}$
\ENDFOR 
\ENDFOR  \label{planning end plus}
 \STATE Sample $s_1^k \sim p_0$, 
 run $\pi_k$, and update  $\widehat{\mu}_{s_t^k, \pi_k(s_t^k,t), t}$, $\widehat{O}_{s_t^k, \pi_k(s_t^k,t), t}$ and $\widehat{P}_{s_t^k, \pi_k(s_t^k,t), t}$ for all $t \in [H]$.
\ENDFOR	
	\end{algorithmic}
\end{algorithm}
Now, we present a regret bound for Algorithm~\ref{O-TS-MDP Plus}.
\begin{theorem}
\label{regret theorem 2}
The regret of Algorithm~\ref{O-TS-MDP Plus} is $\widetilde{O} \left(\sqrt{ASH^3T} \right)$.
\end{theorem}
  
 %The regret bound of  O-TS-MDP$^+$  matches the $\Omega \left(\sqrt{ASH^3T} \right)$ regret lower bound up to  logarithmic factors. %$\text{O-TS-MDP}^{+}$, OPSRL of \cite{tiapkin2022optimistic}, and SOS-PS-RL of \citep{agrawal2017posterior} share in common that they are all model-based, OFU-inspired, optimistic learning algorithms with random value functions.
    O-TS-MDP$^+$ achieves the same (near)-optimal regret bound as OPSRL of \cite{tiapkin2022optimistic} and SSR-Bernstein of \cite{xiong2021near}. %However, O-TS-MDP$^+$ only needs to draw one random sample for each $(s,a,t)$ to construct the episode-dependent model as compared to OPSRL of \cite{tiapkin2022optimistic}.
O-TS-MDP$^+$ can be viewed as a randomized version of UCB-VI  \citep{azar2017minimax}. It is important to note that the O-TS-MDP$^+$ learning algorithm itself does not need a Bernstein-type bonus. However, the analysis of Theorem~\ref{regret theorem 2} relies on a   concentration bound that is derived based on Bernstein's inequality (see Lemma~\ref{temp 89} for more details).  



Now, we sketch the regret analysis. Recall that $\overline{\mu}_{s,a,t}^{k} = \widehat{\mu}_{s,a,t}^{k-1} +2\sigma_{s,a,t}^k$. We first construct a new MDP $\bar{M}_k = \left\{\mathcal{S}, \mathcal{A}, H, \widehat{P}^{k-1}, \overline{\mu}^{k}, p_0 \right\}$,
where $\overline{\mu}^{k} = \left\{ \overline{\mu}_{s,a,t}^{k}\right\}$ collects all the upper confidence bounds. Let $\overline{V}_t^{\pi}$ be the value functions of a fixed policy $\pi$ for $\bar{M}_k $.   Note that conditioned on history $\mathcal{F}_{k-1}$, although the constructed $\bar{M}_k$ is deterministic, $\overline{V}_1^{\pi_k}(s)$ is still random as $\pi_k$  is random. Recall that  $\pi_k$ is the optimal policy for $\tilde{M}'_k$. We still use $\mathcal{E}^k$ and $\mathcal{E}_{\pi_*}^k$, the events that have been defined in (\ref{apple}), to decompose the regret.
We prepare three lemmas for Theorem~\ref{regret theorem 2}.
 \begin{lemma}
 \label{opt lemma 2}
 (Optimism).
In episode $k$, we have
\begin{equation}
\mathbb{E}_{}\left[  \left(V_1^{\pi_*}(s_1^k) - \widetilde{V'}_1^{\pi_k}(s_1^k) \right) \bm{1} \left\{\mathcal{E}_{\pi_*}^k \right\}\right] \le 0. 
\end{equation}
\end{lemma}
\begin{lemma}\label{post devi lemma}(Posterior deviation).
In episode $k$, 
we have
\begin{equation}
\begin{array}{l}
\mathbb{E}_{}\left[  \left(\widetilde{V'}_1^{\pi_k}(s_1^k)-\overline{V}_1^{\pi_k}(s_1^k) \right)  \right] 
\le  
   \sum\limits_{t=1}^{H} \mathbb{E} \left[{\sigma}^k_{s_t^k, \pi_k(s_t^k,t),t}  \right].
    \end{array}
\end{equation}
\end{lemma}
\begin{lemma}
    \label{temp 88}(UCB-like).
    In episode $k$, we have
    \begin{equation}
    \begin{array}{ll}
          & \mathbb{E}_{}\left[  \left(\overline{V}_1^{\pi_k}(s_1^k) -V_1^{\pi_k}(s_1^k) \right)  \bm{1} \left\{ \mathcal{E}^k \right\} \right] \\
          
         \le & 2\sum\limits_{t=1}^{H} \mathbb{E} \left[{\sigma}^k_{s_t^k, \pi_k(s_t^k,t),t}\right].
        \end{array}
    \end{equation}
\end{lemma}
\begin{proof}[Proof of Theorem~\ref{regret theorem 2}] We have
\begin{equation}
\begin{array}{ll}
     & \sum\limits_{k=1}^{T}  \mathbb{E} \left[ \left(V_1^{\pi_*}(s_1^k) - V_1^{\pi_k}(s_1^k) \right)\right] \\
\le & \sum\limits_{k=1}^{T} \mathbb{E}_{}\left[ \underbrace{ \left(V_1^{\pi_*}(s_1^k) - \widetilde{V'}_1^{\pi_k}(s_1^k) \right) \bm{1} \left\{\mathcal{E}_{\pi_*}^k \right\} }_{\text{Optimism, Lemma~\ref{opt lemma 2}}}\right]\\
+ &
   \sum\limits_{k=1}^{T} \mathbb{E}_{}\left[ \underbrace{ \left(\widetilde{V'}_1^{\pi_k}(s_1^k)-\overline{V}_1^{\pi_k}(s_1^k) \right)   }_{\text{Posterior deviation, Lemma~\ref{post devi lemma}}}\right] \\
   + &  \sum\limits_{k=1}^{T}     \mathbb{E}_{}\left[ \underbrace{ \left(\overline{V}_1^{\pi_k}(s_1^k) -V_1^{\pi_k}(s_1^k) \right)   \bm{1} \left\{ \mathcal{E}^k \right\} }_{\text{UCB-like, Lemma~\ref{temp 88} }}\right]  \\
  
   + & H \sum\limits_{k=1}^{T} \underbrace{ \mathbb{P}\left\{\overline{\mathcal{E}^k} \right\} +\mathbb{P}\left\{\overline{\mathcal{E}_{\pi_*}^k} \right\} }_{\text{Lemma~\ref{temp 90}}}\\
   \le &  \mathbb{E} \left[\sum\limits_{k=1}^{T} \sum\limits_{t=1}^{H}O \left({\sigma}^k_{s_t^k, \pi_k(s_t^k,t),t} \right) \right] + O(1) \\
   \le & \widetilde{O} \left(\sqrt{ ASH^3T} \right).
   \end{array}
\end{equation}\end{proof}



\section{O-TS and O-TS-Bandit$^+$}
Since a stochastic bandit problem can be viewed as a special MDP with $S = 1, H=1$ and \cite{chapelle2011empirical} have already demonstrated the empirical performance of O-TS for stochastic bandits, to fill a gap in the stochastic bandit literature we  present regret bounds of O-TS for stochastic bandits.  In addition, we propose O-TS-Bandit$^+$, an OFU-inspired, optimistic learning algorithm, for stochastic bandits. Note that O-TS-Bandit$^+$ can be viewed as a randomized version of UCB1 \citep{auer2002finite}. 


%O-TS-MDP and O-TS-MDP$^+$ for stochastic bandits.  In this section, we present both problem-dependent and problem-independent regret bounds for O-TS-MDP and O-TS-MDP$^+$ over stochastic bandits.

%Conceptually,Thompson Sampling plays an action according to the posterior probability distribution of being the optimal action. 
%However, to implement it, it is  unnecessary to compute the exact posterior  distribution of being optimal. Instead, Thompson Sampling  draws a random sample from the posterior distribution associated with each action and then plays the action with the highest posterior sample. %i.e., {\color{red}the learning agent is completely safe to behave in a greedy way in the sampled model.}  


Now, we present the learning problem of stochastic bandits with bounded rewards formally. 
In a stochastic bandit problem, we have an arm set $\mathcal{A}$ with size $A$. At the beginning of each round $t$, the environment generates a reward vector $X(t) = \left(  X_1(t),X_2(t), \dotsc, X_A(t) \right)$, where each $X_{j}(t) \in [0, 1]$ is drawn i.i.d.\ over time from a fixed but unknown probability distribution with mean $\mu_j$.
Simultaneously, the learning agent pulls an arm  $J_t \in \mathcal{A}$.  At the end of round $t$, the learning agent observes and obtains $X_{J_t}(t)$, the reward associated with the pulled arm. The goal of the learning agent is to pull arms sequentially to accumulate as much reward as possible over a finite number of $T$ rounds. Without loss of generality, we assume that the first arm is the unique optimal arm. In other words, we assume $\mu_1 > \mu_j$ for all $j \ne 1$. Let $\Delta_j := \mu_1 - \mu_j$ denote the mean reward gap. 

We use regret to measure the performance of the learning agent's decisions. Similar to (\ref{regret: RL}),  the regret is defined as
\begin{equation}
\label{regret def bandit}
\begin{array}{lll}
\mathcal{R}(T) &= & 
T \cdot \mu_1 - \mathbb{E} \left[\sum\limits_{t=1}^{T} \mu_{J_t}  \right]  \quad,
\end{array}
\end{equation}
where the expectation is taken over $J_t$. Different from episodic MDPs, where only the worst-case regret bounds are analyzed, for stochastic bandits we are interested in both  problem-dependent regret bounds and  problem-independent regret bounds.
The difference between problem-dependent regret bounds and problem-independent regret bounds is that the former depend on the mean reward parameters $\mu_1, \mu_2, \dotsc, \mu_A$
while the latter hold for all possible choices of the mean reward parameters.

We introduce additional notation specific to stochastic bandits. Let $O_j(t-1)$ denote the number of pulls of arm $j$ by the end of round $t-1$ and $\widehat{\mu}_{j, O_j(t-1)}$ denote the empirical mean of arm $j$ by the end of round $t-1$.

\subsection{O-TS} \label{sec: O-TS-MDP for bandits}
The learning algorithm of O-TS is presented in Algorithm~\ref{O-TS-MDP for bandits}. Similar to Algorithm~\ref{Optimistic Thompson Sampling}, at the beginning of each round $t$, for each arm $j \in \mathcal{A}$, a random sample $\widetilde{\mu}_j(t)$ is drawn from $\mathcal{N} \left(\widehat{\mu}_{j, O_j(t-1)}, 1/O_j(t-1) \right)$. If $\widetilde{\mu}_j(t)$ is smaller than  $\widehat{\mu}_{j, O_j(t-1)}$, it will be boosted to $\widehat{\mu}_{j, O_j(t-1)}$. Let $\widetilde{\mu}_j'(t) = \max \left\{\widetilde{\mu}_j(t), \widehat{\mu}_{j, O_j(t-1)}\right\}$. With all $\widetilde{\mu}_j'(t)$ in hand, the learning agent pulls arm $J_t = \mathop{\arg\max}_{j \in \mathcal{A}}\widetilde{\mu}_j'(t)$.



\begin{algorithm}[!ht]
	\caption{O-TS (Optimistic Thompson Sampling \citep{chapelle2011empirical})} 
	\label{O-TS-MDP for bandits}
	\begin{algorithmic}[1]
	\STATE {\bf{Input:}} an arm set $\mathcal{A}$
	
\STATE Pull each arm once to initialize  $O_j, \widehat{\mu}_{j, O_j}$

\FOR {round $t = A+1, A+2, \dotsc$} 
\FOR {$j \in \mathcal{A}$}
\STATE 
Draw $\widetilde{\mu}_{j}(t) \sim \mathcal{N} \left( \widehat{\mu}_{j, O_j}, 1/O_j \right) $

{\color{red}Set $\widetilde{\mu}'_{j}(t) \leftarrow  \max\left\{\widetilde{\mu}_{j}(t), \widehat{\mu}_{j, O_j} \right\}$}
\ENDFOR 

\STATE  Pull arm $J_t \leftarrow \mathop{\arg\max}_{j \in \mathcal{A}} \widetilde{\mu}'_{j}(t)$

 \STATE Update  $O_{J_t}$ and $ \widehat{\mu}_{J_t, O_{J_t}}$\quad.
\ENDFOR	
	\end{algorithmic}
\end{algorithm}
Now, we present regret bounds for Algorithm~\ref{O-TS-MDP for bandits}.
\begin{theorem}
\label{theorem: bandit, O-TS-MDP, dependent}
    The problem-dependent regret bound of Algorithm~\ref{O-TS-MDP for bandits} is $\sum_{j \in \mathcal{A}: \Delta_j >0}O \left( \frac{\ln(T)}{\Delta_j}\right)$.
\end{theorem}
\begin{theorem}
      \label{theorem: bandit, O-TS-MDP, independent}
   The problem-independent regret bound of Algorithm~\ref{O-TS-MDP for bandits} is $O \left( \sqrt{AT\ln(A)}\right)$.
\end{theorem}
O-TS achieves the same problem-dependent and problem-independent regret bounds as Thompson Sampling with Gaussian priors (TS-Gaussian) (Algorithm~2 in \citet{agrawalnear}). The key difference between O-TS and TS-Gaussian is that TS-Gaussian uses normal distributions while O-TS uses one-sided Gaussian distributions with the left side being clipped. %{\color{red} Although the practical performance gain of O-TS over TS-Gaussian is marginal in stochastic bandits \citep{chapelle2011empirical}, the idea of O-TS is quite useful in episodic MDPs.}
%    \item The distribution for $\widetilde{\mu}'_j$ first-order stochastic dominates the distribution for $\widetilde{\mu}_j(t)$. {\color{red}It is related to the proof...}
 %MOTS of \cite{jin2021mots} also uses the clipped Gaussian distributions. However, the clipping rules are completely different between O-TS-Bandit and MOTS. In MOTS, the posterior sample will be clipped to be the upper confidence bound if it is greater than the upper confidence bound, i.e., the value of the posterior sample after the clipping can still be negative but is upper bounded by the upper confidence bound. The clipping of the upper tail of the posterior distribution is to control the overestimation of the sub-optimal arm. In addition to clipping the upper tail, MOTS also boosts the variance of the posterior distribution to drive exploration. The boosting of the variance plays a crucial role to  achieve minimax optimality. In contrast, in O-TS-Bandit, the value of the posterior sample after the clipping  cannot be negative as it is lower bounded by the empirical mean. Also, O-TS-Bandit does not boost the variance of the posterior distribution. 
The problem-independent regret bound of O-TS is minimax optimal up to a $\sqrt{\ln(A)}$ factor. Note that  O-TS and TS-Gaussian are not optimistic learning algorithms. %A more refined analysis shown in \cite{agrawalnear} can improve the problem-independent regret bound to $O \left( \sqrt{AT\ln(A)}\right)$. However, the improvement is  marginal.
 

% \begin{proof}[Proof sketch of Theorem~\ref{theorem: bandit, O-TS-MDP, dependent}]The regret analysis of O-TS is very similar to that of TS-Gaussian \citep{agrawalnear}. For each sub-optimal arm $j$, we upper bound $\mathbb{E} \left[O_j(t) \right]$. Let $L_j = 108\ln(T)/\Delta_j^2$ and $y_j = \mu_j + 0.5\Delta_j$. Then, we have
% \begin{equation}
% \begin{array}{ll}
% &\mathbb{E} \left[O_j(T) \right] \\
% \le & L_j + \sum\limits_{t=1}^{T}  \underbrace{\mathbb{P}  \left\{J_t = j, \widetilde{\mu}'_j(t) \le y_j \right\}}_{\omega_1}   \\
%  + & \sum\limits_{t=1}^{T}  \underbrace{\mathbb{P}  \left\{J_t = j, \widetilde{\mu}'_j(t) > y_j ,O_j(t-1) > L_j\right\}}_{\omega_2}.
%  \end{array}
% \end{equation}
% Recall that $\widetilde{\mu}_j(t) \sim \mathcal{N}\left(\widehat{\mu}_{j, O_j(t-1)},1/O_j(t-1) \right)$.
% Term $\omega_1$ is upper bounded by $\mathbb{P}  \left\{J_t = j, \widetilde{\mu}_j(t) \le y_j \right\} $ as $\widetilde{\mu}'_j(t) = \max\left\{ \widetilde{\mu}_j(t) , \widehat{\mu}_{j, O_j(t-1)}\right\}$. The proof now is reduced to the analysis of TS-Gaussian.   When $O_j(t-1) > L_j$, with high probability, we have $y_j >\widehat{\mu}_{j, O_j(t-1)} + \sqrt{6\ln(t)/O_j(t-1)}$, which implies $\mathbb{P}\left\{ \widetilde{\mu}'_j(t) > y_j\right\} = \mathbb{P}\left\{ \widetilde{\mu}_j(t) > y_j \right\}$ given the history. Now, we use concentration bounds of Gaussian distributions to complete the proof.
% \end{proof}

\subsection{O-TS-Bandit$^+$}
An OFU-inspired optimistic learning algorithm, O-TS-Bandit$^+$, is presented in Algorithm~\ref{O-TS-MDP+ for bandits main}.  
Similar to Algorithm~\ref{O-TS-MDP Plus}, O-TS-Bandit$^+$  does the clipping %clips the left side of the posterior distribution 
aggressively to boost optimism and can be viewed as a randomized version of UCB1 \citep{auer2002finite}. %$\widetilde{\mu}_j(t) $ that is drawn from $\mathcal{N} \left(\widehat{\mu}_{j, O_j(t-1)}, 1/O_j(t-1) \right)$. 
%However, it takes a more aggressive way to do the clipping. 
Let
 $\overline{\mu}_j(t) = \widehat{\mu}_{j, O_j(t-1)} + \sqrt{1.5\ln(t)/O_j(t-1)}$ be the upper confidence bound. O-TS-Bandit$^+$  boosts $\widetilde{\mu}_j(t) $ to $\overline{\mu}_j(t)$ if it is smaller than $\overline{\mu}_j(t)$. Let $\widetilde{\mu}_j'(t) = \max \left\{\widetilde{\mu}_j(t), \overline{\mu}_j(t)\right\}$ denote the value after the boosting. Then, O-TS-Bandit$^+$ pulls arm $J_t = \mathop{\arg\max}_{j \in \mathcal{A}}\widetilde{\mu}_j'(t)$. The differences between O-TS and $\text{O-TS-Bandit}^{+}$ are highlighted in Algorithm~\ref{O-TS-MDP for bandits} and Algorithm~\ref{O-TS-MDP+ for bandits main}, respectively.


\begin{algorithm}[!ht]
	\caption{O-TS-Bandit$^+$} 
	\label{O-TS-MDP+ for bandits main}
	\begin{algorithmic}[1]
	\STATE {\bf{Input:}} an arm set $\mathcal{A}$
	\STATE Pull each arm once to initialize  $O_j, \widehat{\mu}_{j, O_j}$
\FOR {round $t = A+1, A+2, \dotsc$} 
\FOR {$j \in \mathcal{A}$}
\STATE Draw $\widetilde{\mu}_{j}(t) \sim \mathcal{N} \left( \widehat{\mu}_{j, O_j}, 1/O_j \right) $

{\color{blue}Set $\overline{\mu}_{j}(t) \leftarrow \widehat{\mu}_{j, O_j} + \sqrt{1.5 \ln (t)/O_j}$

Set $\widetilde{\mu}'_{j}(t) \leftarrow  \max\left\{\widetilde{\mu}_{j}(t), \overline{\mu}_{j}(t) \right\}$}
\ENDFOR 
\STATE  Pull arm $J_t \leftarrow \mathop{\arg\max}_{j \in \mathcal{A}} \widetilde{\mu}'_{j}(t)$
 \STATE Update  $O_{J_t}$ and $ \widehat{\mu}_{J_t, O_{J_t}}$ \quad.
\ENDFOR	
	\end{algorithmic}
\end{algorithm}
Now, we present regret bounds for Algorithm~\ref{O-TS-MDP+ for bandits main}.
\begin{theorem}
\label{O-TS-MDP dependent regret}
The  problem-dependent regret bound of Algorithm~\ref{O-TS-MDP+ for bandits main} is $\sum_{j \in \mathcal{A}: \Delta_j >0}O \left( \frac{\ln(T)}{\Delta_j}\right)$.
\end{theorem}
\begin{theorem}
      \label{theorem: bandit, O-TS-MDP+, independent}
    The problem-independent regret bound of Algorithm~\ref{O-TS-MDP+ for bandits main} is $O \left( \sqrt{AT\ln(T)}\right)$.
\end{theorem}
O-TS and O-TS-Bandit$^+$ have the same problem-dependent regret bound. For the problem-independent regret bound, O-TS-Bandit$^+$ is worse than O-TS. %The key difference between O-TS and O-TS-Bandit$^+$ is O-TS is not an OFU-inspired optimistic algorithm while  
Note that O-TS-Bandit$^+$ is an optimistic algorithm. 
 O-TS-Bandit$^+$ and MOTS \citep{jin2021mots} share in common that they both clip the Gaussian distributions based on the upper confidence bounds. The key difference lies in that MOTS clips the upper tail of Gaussian distributions to control the overestimation of the sub-optimal arms while O-TS-Bandit$^+$ keeps the upper tail of the Gaussian distributions to preserve optimism. 

%MOTS clips the posterior sample to the upper confidence bound if it is greater than the upper confidence bound. The clipping of the upper tail of the posterior distribution is to  Also, MOTS boosts variance of the posterior distribution to drive exploration, which plays a crucial role to {\color{blue} achieve minimax optimality.} }
%{\color{red}Therefore, the regret analysis of O-TS-Bandit$+$ can be similar to UCB1 of \cite{auer2002finite}.}
% \begin{proof}[Proof sketch of Theorem~\ref{O-TS-MDP dependent regret}]
% Since O-TS-Bandit$^+$ is an optimistic algorithm and can be viewed as a randomized version of UCB1 \citep{auer2002finite}, the regret decomposition in the analysis of O-TS-Bandit$^+$ can be very similar to UCB1.
%  For each sub-optimal arm $j$, we upper bound $\mathbb{E} \left[O_j(T) \right]$. Let $L_j = 25\ln(T)/\Delta_j^2$. 
%  We have 
%   $\mathbb{E} \left[O_j(T) \right] 
%          \le  L_j + \sum_{j =1}^{T} \mathbb{E} \left[ \bm{1} \left\{J_t =j, O_j(t-1) > L_j \right\} \right] $. 
         
%           If event $\left\{J_t =j, O_j(t-1) > L_j \right\}$
%          happens, by using contradiction, it implies at least one of the following is true:
%          \begin{equation*}
%              \begin{array}{lll}
%             \omega_1 & =  & \left\{\widetilde{\mu}'_1(t) \le \mu_1 \right\}, \\
%             \omega_2 & =   & \left\{\widetilde{\mu}'_j(t) \ge \mu_j + \sqrt{\frac{24\ln(t)}{O_j(t-1)}}, O_j(t-1) \ge L_j\right\},  \\
%                 \omega_3 & =  
                   
%                     & \left\{ \sqrt{\frac{24\ln(t)}{O_j(t-1)}} > \Delta_j, O_j(t-1) \ge L_j \right\}.
%              \end{array}
%          \end{equation*}
%           Event $\omega_1$ denotes  the underestimation of the optimal arm and event $\omega_2$ denotes the overestimation of the sub-optimal arm. Event $\omega_3$ cannot happen due to the tuning of $L_j$. From concentration bounds, we know $\mathbb{P}\left\{\omega_1 \right\} \le O(1/t^2)$ and $\mathbb{P}\left\{\omega_2 \right\}  \le O(1/t^3)$, which concludes the proof sketch. \end{proof}

\section{Experimental Results} \label{sec: expe}
In this section, we evaluate the empirical performance of our proposed algorithms O-TS-MDP and O-TS-MDP$^+$  for MDPs with $S \in \{5, 20, 50\}$, $A=3$, and $H = 10$. For a fair  performance comparison, our experimental set-up is fully adopted from \citet{dann2017unifying}, where the empirical performance of several UCB-based algorithms was studied.  %with a slight difference in reward generation 
 %We generate randomized MDPs with $S =[5, 10,50]$, $A=3$ and $H = 10$. 
% {\color{blue}(The following description seems contradicted. The mean reward can either be $\mu_{s,a,t}$ or 0 with 0.85 probability.)
% For a specific $(s,a,t)$, the random reward $X_{s, a,t}$ for each episode is a Bernoulli random variable with mean $\mu_{s, a,t}$. The mean reward is set to 0 with $p = 0.85$ to ensure sparsity, and otherwise sampled uniformly at random in $[0,1]$. }
For a specific $(s,a,t)$, the random reward $X_{s, a,t}$ in each episode is drawn from a Bernoulli distribution with parameter $\mu_{s, a,t}$. To ensure the sparsity of the random rewards, we set $\mu_{s, a,t} = 0$ with probability $0.85$; with the remaining probability $0.15$, the value of $\mu_{s, a,t}$ is drawn uniformly at random from $[0,1]$.  
%for each episode is a Bernoulli random variable with mean $\mu_{s, a,t}$ and support $[0,1]$. We ensure the sparsity of the reward by setting the mean $\mu_{s, a,t}$ to 0 with $p = 0.85$. Otherwise, with $p = 0.15$, $\mu_{s, a,t}$ is sampled uniformly from $[0,1]$ to incorporate randomness. 
% {\color{green}Similarly, the transition probability distribution $P_{s,a,t}$ is sampled from a Dirichlet distribution with parameters $(0.1, 0.1, \dotsc, 0.1)$, meaning that with a high chance the transition probability distribution is concentrated on a single state.} 
The sparsity design is to control the occurrence that sub-optimal policies can obtain rewards by chance. % when taking an action, but nonetheless randomized. The learning agent has access to only the empirical observations. 
We compare O-TS-MDP, O-TS-MDP$^+$, SSR-Bernstein \citep{xiong2021near}, and TS-MDP, a Thompson Sampling-based learning algorithm without clipping the posterior distributions, i.e., constructing the episode-dependent model as $\tilde{M}_k = \left\{\mathcal{S}, \mathcal{A}, H, \widehat{P}^{k-1}, \widetilde{\mu}^k, p_0 \right\}$.
We set $T = 10^7$ and compare the cumulative average rewards of each episode. %Results are consistent across different runs of randomized initialization.  
\begin{figure}[h]
\centering
\includegraphics[scale=0.8]{s5.pdf}
\caption{Empirical performance for 5 states}
\label{s5}
\end{figure}

\begin{figure}[h]
\centering
\includegraphics[scale=0.8]{s20.pdf}
\caption{Empirical performance for 20 states}
\label{s20}
\end{figure}

\begin{figure}[h]
\centering
\includegraphics[scale=0.8]{s50.pdf}
\caption{Empirical performance for 50 states}
\label{s50}
\end{figure}

As shown in Figure~\ref{s5}, the  rewards for all algorithms steadily increase as the learning agent gains a better estimation of the parameters of the true MDP  over time. When the number of states is small ($S=5$), O-TS-MDP$^+$ performs slightly better than O-TS-MDP. TS-MDP demonstrates a similar trend to O-TS-MDP since they are both Thompson Sampling-based algorithms. Despite the lack of  theoretical analysis, TS-MDP does achieve better empirical performance. The  gap between  O-TS-MDP and  TS-MDP comes from the fact that  clipping  the left side of the posterior distributions increases the chance to visit a sub-optimal $(s,a,t)$,  just as implied in the design of MOTS \citep{jin2021mots}. It is not surprising that SSR-Bernstein   outperforms the remaining algorithms as it is theoretically optimal. Figures~\ref{s20} and~\ref{s50} show similar trends for Thompson Sampling-based methods in larger state spaces. SSR-Bernstein still performs the best.  % and may enjoy a smaller variance as compared to O-TS-MDP and O-TS-MDP$^+$ given the same amount of observations for a specific $(s,a,t)$.

 It is important to note that SSR-Bernstein uses a single random seed for all $(s,a,t)$ in each episode. In contrast, in our proposed algorithms, each $(s,a,t)$ has its own randomness within an episode. In other words, our proposed algorithms  inject  more randomness than SSR-Bernstein. Additionally, SSR-Bernstein needs to construct confidence intervals to tune the magnitude of the variance, whereas our algorithms are simpler and easier to implement and enjoy good regret bounds. We also implement UCB-VI \citep{azar2017minimax} and 
more experimental results can be found in Appendix~\ref{app: exp}.%In comparison, our algorithm is simpler and easier to implement and enjoys a state-of-the-art theoretical bound. 


\section{Conclusion and Future Work} \label{sec: conc}
In this work, we have presented two  Optimistic Thompson Sampling-based learning algorithms, O-TS-MDP and O-TS-MDP$^+$, for episodic MDPs. The key feature that distinguishes our proposed learning algorithms from the existing RLSVI-based algorithms \citep{russo2019worst,xiong2021near,agrawal2021improved} is the introduction of O-TS to avoid upper bounding the absolute value of the estimation error, thus  simplifying the regret analysis.
 This work leaves two interesting open questions.  As pointed out in \cite{abeille2017linear,pacchiano2021towards,agrawal2021improved}, removing the extra $\sqrt{SH}$ factor is challenging if the learning algorithms are Thompson Sampling-based.  The first open question is whether the ideas in SSR of \cite{xiong2021near}, i.e., controlling the amount of randomness within the learning algorithm, can be used to tighten the regret bound of O-TS-MDP to  $\widetilde{O} \left(\sqrt{ASH^3T} \right)$. Our thought is that by reducing the number of posterior random samples, a better regret bound for O-TS-MDP may be possible.
 The analyses of O-TS-MDP and RLSVI-based algorithms all rely on the property that the sum of multiple independent normal random variables is still normally distributed, and normal distributions have nice anti-concentration bounds. Although \cite{tiapkin2022optimistic} have proved a sharp anti-concentration bound for Dirichlet distributions, the distribution of the sum of multiple independent Dirichlet random variables is still not well understood. The lack of understanding of Dirichlet distributions results in the need for multiple posterior samples in OPSRL of \cite{tiapkin2022optimistic}.
The second interesting open question is whether we can reshape the Dirichlet posterior distribution in an optimistic way to reduce the number of Dirichlet random variables in  OPSRL to one.

\section*{ACKNOWLEDGEMENTS}
We thank Alan Milligan for helping with the experiments and providing comments on an early version of the manuscript. This work was partially supported by the Alberta Machine Intelligence Institute (Amii), the Canada CIFAR AI Program and the Natural Sciences and Engineering Research Council of Canada (NSERC) Discovery Grant RGPIN-2022-03669.


%The key reason why OPSRL of \cite{tiapkin2022optimistic} needs to draw $\widetilde{O}\left(1 \right)$ random sample is due to the fact that the fundamental properties of Dirichlet random variables are  less understood. 
  


%\begin{enumerate}
    %\item  The ideas for O-TS-MDP can also be applied to other episodic MDP learning problems  like model-based algorithms  with linear function approximation.

    %\item {\color{red}O-TS-MDP also achieves the same regret bound as O-TS-MDP$^+$ if drawing $\widetilde{O}(1)$ samples.}

    %\item The analysis of RLSVI of \cite{russo2019worst} can be simplified if introducing the clipping.  The analysis will be different from the one shown in \cite{agrawal2021improved}.
% \item The principle of O-TS-MDP to Dirichlet distribution.  To see whether it will work. 

% \item  More importantly,  
%  The fundamental reason why it is safe to clip a standard Gaussian distribution to a one-sided Gaussian is the standard Gaussian distribution is first-order stochastically dominated by a  Gaussian distribution with the left-tail being clipped.  As \cite{tiapkin2022dirichlet} has shown that ...

% \item use ideas shown in \cite{xiong2021near} to improve the regret bound of O-TS-MDP...

% \item By modifying O-TS-MDP$+$, we can have a randomized version of Bayes-UCBVI of \cite{tiapkin2022optimistic}.


    
%\end{enumerate}
 







% Discuss the reasons why need to drawn multiple posterior samples in \citep{tiapkin2022optimistic}.
% References
%\bibliographystyle{plain}

\bibliography{hu_579}
\onecolumn
\section*{Appendix}
The appendix is organized as follows.
\begin{enumerate}
    \item Appendix~\ref{proofs for O-TS-MDP} presents proofs for Theorem~\ref{regret theorem 1};
    \item Appendix~\ref{app: regret proof 2} presents proofs for Theorem~\ref{regret theorem 2};
    \item Appendix~\ref{app: other} presents other technical lemmas used in MDPs;
    \item Appendix~\ref{app: bandit 1} presents proofs for Theorem~\ref{theorem: bandit, O-TS-MDP, dependent};
      \item Appendix~\ref{app: O-TS independent new} presents proofs for Theorem~\ref{theorem: bandit, O-TS-MDP, independent};
    \item Appendix~\ref{app: bandit 2} presents proofs for Theorem~\ref{O-TS-MDP dependent regret};
    \item Appendix~\ref{app: independent} presents proofs for Theorem~\ref{theorem: bandit, O-TS-MDP+, independent};
    \item Appendix~\ref{app: exp} presents additional experimental results.
\end{enumerate}

\renewcommand{\thesubsection}{\Alph{subsection}}
Since $ \mathbb{E} \left[ \left(V_1^{\pi_*}(s_1^k) - V_1^{\pi_k}(s_1^k) \right)\right] = \mathbb{E} \left[ \underbrace{\mathbb{E} \left[ \left(V_1^{\pi_*}(s) - V_1^{\pi_k}(s) \right)\mid s_1^k = s \right] } \right]$,
for the  analysis in Appendix~\ref{proofs for O-TS-MDP} and Appendix~\ref{app: regret proof 2}, we upper bound the expected value difference conditioned on $s_1^k = s$. For ease of presentation, we drop the conditioning.

%{\color{blue} Add the explanation of optimistic learning algorithm: Optimimism Regret Decomposition...}

\subsection{Proofs for Theorem~\ref{regret theorem 1}} \label{proofs for O-TS-MDP}

To prove Theorem~\ref{regret theorem 1}, in episode $k$, we also construct another MDP 
    $ \tilde{M}_k  = \left\{\mathcal{S}, \mathcal{A}, H, \widehat{P}^{k-1} , \widetilde{\mu}^{k} , p_0 \right\}$, where   $\widetilde{\mu}^{k} = \left\{\widetilde{\mu}_{s,a,t}^{k} \right\}$ collects all the random samples that are initially drawn from  Gaussian distributions (before doing the clippings). Note that $ \tilde{M}_k$ can be viewed as the ``parental'' MDP of $\tilde{M}'_k$. Let  $\widetilde{V}_{t}^{\pi}$  denote the value functions of a fixed policy $\pi$ for  $\tilde{M}_k$ in round $t$. We have the following facts.

    \begin{enumerate}
        
        \item The MDPs of $ \tilde{M}_k$ and $ \tilde{M}'_k$ use the same $\widehat{P}^{k-1}$ to construct the models.
        \item The learning algorithm of O-TS-MDP guarantees that $\widetilde{\mu'}_{s,a,t}^{k} \ge \widetilde{\mu}_{s,a,t}^{k} $ holds for all $(s,a,t)$ simultaneously.
         \item The learning algorithm of O-TS-MDP guarantees that $\widetilde{\mu'}_{s,a,t}^{k} \ge \widehat{\mu}_{s,a,t}^{k-1} $ holds for all $(s,a,t)$ simultaneously.
        \item The empirical estimates of $\widehat{\mu}^{k-1} $ and $\widehat{P}^{k-1}$,  the distributions for $\widetilde{\mu}^{k} $ and $\widetilde{\mu'}^{k} $, and whether event $\mathcal{E}_{\pi_*}^k$ is true or not are all determined by $\mathcal{F}_{k-1}$. 
    \end{enumerate}
       
     
     


Let $c_0 =  \frac{2e^{-2}}{5\sqrt{2\pi} }$ be a universal constant.% and $\mathbb{P}_{k-1} \left\{ \cdot \right\} = \mathbb{P}\left\{ \cdot \mid \mathcal{F}_{k-1} = F_{k-1}  \right\} $.  

\begin{lemma}(Weak optimism lemma).
For any instantiation $F_{k-1}$ of $\mathcal{F}_{k-1}$ such that event $\mathcal{E}_{\pi_*}^k$ is true, we have
\begin{equation}
    \begin{array}{l}
        \mathbb{P}\left\{\bm{1} \left\{\mathcal{E}_{\pi_*}^k \right\}  \left(\widetilde{V}_{1}^{\pi_*}(s)    - V_1^{\pi_*}(s) \right) \ge 0 \mid \mathcal{F}_{k-1} = F_{k-1}  \right\} \ge c_0.
    \end{array}
    \label{temp 55}
\end{equation}
\label{temp 92}
\end{lemma}

The proof of Lemma~\ref{temp 95} uses the result stated in Lemma~\ref{temp 92}.
\begin{proof}[Proof of Lemma~\ref{temp 95} (Optimism lemma)]We use Markov's inequality and Lemma~\ref{temp 92} to complete the proof. Some parts of the analysis use ideas presented in the proof of Lemma~6 in \citet{russo2019worst}. We would like to highlight that the introduction of O-TS simplifies the analysis by avoiding the need to upper bound the absolute value of the estimation error.

We first rewrite the LHS in (\ref{temp 505}) as
\begin{equation}
    \begin{array}{lll}
         \text{LHS of } (\ref{temp 505}) & = &  \mathbb{E} \left[\left(V_1^{\pi_*}(s) - \widetilde{V'}_{1}^{\pi_k}(s) \right)  \bm{1} \left\{\mathcal{E}_{\pi_*}^k \right\}          \right] \\
      &   =   & \mathbb{E} \left[ \mathbb{E}\left[    \bm{1} \left\{\mathcal{E}_{\pi_*}^k \right\}   \left(V_1^{\pi_*}(s) -  \widetilde{V'}_1^{\pi_k}(s) \right) \mid \mathcal{F}_{k-1} \right]\right] \\
      &  =^{(a)}   & \mathbb{E} \left[\underbrace{ \mathbb{E}\left[    \bm{1} \left\{\mathcal{E}_{\pi_*}^k \right\}   \left(V_1^{\pi_*}(s) -  \mathop{\max}\limits_{\pi \in \Pi}\widetilde{V'}_1^{\pi}(s) \right) \mid \mathcal{F}_{k-1} \right]}_{ \lambda}\right] \quad,
        %&   =   & \mathbb{E} \left[\mathbb{E} \left[ \lambda \mid \mathcal{F}_{k-1}\right]\right] \quad,
          \label{temp 93}
    \end{array}
\end{equation}
where  equality (a) uses the fact that policy $\pi_k$ is the optimal one for $\tilde{M}_k'$, i.e., $\widetilde{V'}_1^{\pi_k}(s) = \mathop{\max}_{\pi \in \Pi}\widetilde{V'}_1^{\pi}(s)$.
   Since  $\lambda$ is  determined by $\mathcal{F}_{k-1}$, we only need to consider all  the  instantiations $F_{k-1}$ of $\mathcal{F}_{k-1}$  such that  $\lambda > 0$. Let $\mathbb{E}_{k-1} \left[ \cdot \right] = \mathbb{E}\left[ \cdot \mid \mathcal{F}_{k-1} = F_{k-1} \right] $ and $\mathbb{P}_{k-1} \left\{ \cdot \right\} = \mathbb{P}\left\{ \cdot \mid \mathcal{F}_{k-1} = F_{k-1}  \right\} $. Conditioned on such $F_{k-1}$, 
we use  Markov's inequality and have
\begin{equation}
\begin{array}{lll}
 \lambda &
\le & \frac{\mathbb{E}_{k-1}\left[ \underbrace{ \max \left\{0,   \bm{1} \left\{\mathcal{E}_{\pi_*}^k \right\}   \left(\widetilde{V'}_{1}^{\pi_k}(s)   - \mathbb{E}_{k-1} \left[ \mathop{\max}\limits_{\pi \in \Pi}\widetilde{V'}_1^{\pi}(s)    \right]\right)\right\}}_{\text{r.v.}}   \right] }{\mathbb{P}_{k-1} \left\{\underbrace{\max \left\{0,   \bm{1} \left\{\mathcal{E}_{\pi_*}^k \right\}   \left(\widetilde{V'}_{1}^{\pi_k}(s)   - \mathbb{E}_{k-1} \left[\mathop{\max}\limits_{\pi \in \Pi}\widetilde{V'}_1^{\pi}(s)    \right]\right) \right\} }_{\text{r.v.}} \ge \lambda \right\}} \\
&& \\
 &\le & 
\frac{\mathbb{E}_{k-1}\left[ \max \left\{0,  \left(\widetilde{V'}_{1}^{\pi_k}(s)   - \mathbb{E}_{k-1} \left[ \mathop{\max}\limits_{\pi \in \Pi}\widetilde{V'}_1^{\pi}(s)    \right]\right)\right\}   \right] }{\mathbb{P}_{k-1} \left\{ \bm{1} \left\{\mathcal{E}_{\pi_*}^k \right\}   \left(\widetilde{V'}_{1}^{\pi_k}(s)   - \mathbb{E}_{k-1} \left[\mathop{\max}\limits_{\pi \in \Pi}\widetilde{V'}_1^{\pi}(s)    \right]\right)  \ge \lambda \right\}} \quad.
    \end{array}
    \label{temp 100}
\end{equation}
\paragraph{Construct a lower bound.} Now, we construct a lower bound for the denominator in the last step of (\ref{temp 100}) by using Lemma~\ref{temp 92}. We have
\begin{equation}
    \begin{array}{ll}
         & \mathbb{P}_{k-1} \left\{\bm{1} \left\{\mathcal{E}_{\pi_*}^k \right\}   \left(\widetilde{V'}_{1}^{\pi_k}(s)   - \mathbb{E}_{k-1} \left[\mathop{\max}\limits_{\pi \in \Pi}\widetilde{V'}_1^{\pi}(s)    \right]\right) \ge \lambda \right\} \\
         = & \mathbb{P}_{k-1} \left\{\bm{1} \left\{\mathcal{E}_{\pi_*}^k \right\}   \left(\widetilde{V'}_{1}^{\pi_k}(s)   - \mathbb{E}_{k-1} \left[\mathop{\max}\limits_{\pi \in \Pi} \widetilde{V'}_1^{\pi}(s)    \right]\right) \ge  \mathbb{E}_{k-1} \left[    \bm{1} \left\{\mathcal{E}_{\pi_*}^k \right\}   \left(V_1^{\pi_*}(s) -  \mathop{\max}\limits_{\pi \in \Pi}\widetilde{V'}_1^{\pi}(s) \right) \right] \right\}\\
         = & \mathbb{P}_{k-1} \left\{\bm{1} \left\{\mathcal{E}_{\pi_*}^k \right\}  \left(\widetilde{V'}_{1}^{\pi_k}(s)    - V_1^{\pi_*}(s) \right) \ge 0  \right\}\\
          \ge^{(a)} & \mathbb{P}_{k-1} \left\{\bm{1} \left\{\mathcal{E}_{\pi_*}^k \right\}  \left(\widetilde{V'}_{1}^{\pi_*}(s)    - V_1^{\pi_*}(s) \right) \ge 0  \right\}\\
           \ge^{(b)} & \mathbb{P}_{k-1} \left\{\bm{1} \left\{\mathcal{E}_{\pi_*}^k \right\}  \left(\widetilde{V}_{1}^{\pi_*}(s)    - V_1^{\pi_*}(s) \right) \ge 0  \right\}\\
         \ge^{(c)} & c_0 \quad,
    \end{array}
    \label{temp 99}
\end{equation}
where inequality (a) uses the fact that   $\widetilde{V'}_{1}^{\pi_k}(s) \ge \widetilde{V'}_{1}^{\pi_*}(s)$ as policy $\pi_k$ is the optimal one for $\tilde{M}'_k$, and inequality (b) uses the fact that $\widetilde{V'}_{1}^{\pi_*}(s) \ge \widetilde{V}_{1}^{\pi_*}(s)$ as MDPs $\tilde{M}'_k$ and $\tilde{M}_k$ are constructed based on the same $\widehat{P}^{k-1}$ and O-TS-MDP guarantees that   $\widetilde{\mu'}_{s,a,t}^k \ge \widetilde{\mu}_{s,a,t}^k$ holds for all $(s,a,t)$ simultaneously. Inequality (c) uses  Lemma~\ref{temp 92}.

\paragraph{Construct an upper bound.}
To construct an upper bound for the numerator in the last step of (\ref{temp 100}), we introduce   a new MDP $ \tilde{\tilde{M}}'_k = \left\{\mathcal{S}, \mathcal{A}, H, \widehat{P}^{k-1}, \widetilde{\widetilde{\mu'}}^{k}, p_0 \right\}$,
where $\widetilde{\widetilde{\mu'}}_{s,a,t}^{k} \sim \mathcal{N'}_{s,a,t}^{k}$. Note   $\widetilde{\mu'}_{s,a,t}^{k}$ and $\widetilde{\widetilde{\mu'}}_{s,a,t}^{k} $ are i.i.d. according to $\mathcal{N'}_{s,a,t}^{k}$, a distribution determined by $\mathcal{F}_{k-1}$.
 Let $\widetilde{\widetilde{V'}}_t^{\pi}$ denote the value functions of  a fixed policy $\pi$ for   $\tilde{\tilde{M}}_k'$. Then, we 
have 
\begin{equation}
\begin{array}{l}
\mathbb{E}_{k-1} \left[\widetilde{\widetilde{V'}}_{1}^{\pi_k}(s) \mid \pi_k\right]  
\le  \mathbb{E}_{k-1} \left[ \mathop{\max}\limits_{\pi \in \Pi}\widetilde{\widetilde{V'}}_{1}^{\pi}(s) \right]
  = \mathbb{E}_{k-1} \left[ \mathop{\max}\limits_{\pi \in \Pi}\widetilde{V'}_{1}^{\pi}(s) \right] \quad.
 % & = & \mathbb{E}_{k-1} \left[ \widetilde{V'}_{1}^{\pi_k}(s) \right] \quad.
   \end{array}
   \label{temp 503}
   \end{equation}
Now, we come back to constructing the upper bound for the  numerator in the last step of  (\ref{temp 100}).  We have
\begin{equation}
\begin{array}{ll}
& \mathbb{E}_{k-1} \left[\max \left\{0, \widetilde{V'}_{1}^{\pi_k}(s)   - \mathbb{E}_{k-1} \left[\mathop{\max}\limits_{\pi \in \Pi}\widetilde{V'}_1^{\pi}(s) \right] \right\}\right]  \\
\le^{(a)} & \mathbb{E}_{k-1} \left[\max \left\{0, \widetilde{V'}_{1}^{\pi_k}(s)   - \mathbb{E}_{k-1} \left[\widetilde{\widetilde{V'}}_1^{\pi_k}(s) \mid \pi_k \right]\right\}\right]  \\
\le & \mathbb{E}_{k-1} \left[ \left| \widetilde{V'}_{1}^{\pi_k}(s)   - \mathbb{E}_{k-1} \left[\widetilde{\widetilde{V'}}_1^{\pi_k}(s) \mid \pi_k \right] \right| \right]  \\

=  & \mathbb{E}_{k-1} \left[\left|\widetilde{V'}_{1}^{\pi_k}(s)   - \mathbb{E}_{k-1} \left[\widetilde{\widetilde{V'}}_1^{\pi_k}(s)    \mid \pi_k, \tilde{M}'_k \right]\right| \right]  \\
=  & \mathbb{E}_{k-1} \left[\left|\mathbb{E}_{k-1} \left[\left(\widetilde{V'}_{1}^{\pi_k}(s)   - \widetilde{\widetilde{V'}}_1^{\pi_k}(s)    \right) \mid \pi_k, \tilde{M}'_k \right]\right|\right]  \\
\le  & \mathbb{E}_{k-1} \left[\mathbb{E}_{k-1} \left[\left|\left(\widetilde{V'}_{1}^{\pi_k}(s)   - \widetilde{\widetilde{V'}}_1^{\pi_k}(s)    \right) \right| \mid \pi_k, \tilde{M}'_k \right]\right]  \\
=  & \mathbb{E}_{k-1} \left[\left| \widetilde{V'}_{1}^{\pi_k}(s)   - \widetilde{\widetilde{V'}}_1^{\pi_k}(s)     \right|\right]  \\
=  & \mathbb{E}_{k-1} \left[\left| \widetilde{V'}_{1}^{\pi_k}(s)   -  \widehat{V}_{1}^{\pi_k}(s) + \widehat{V}_{1}^{\pi_k}(s)- \widetilde{\widetilde{V'}}_1^{\pi_k}(s)      \right|\right]  \\
\le  & \mathbb{E}_{k-1} \left[\left| \widetilde{V'}_{1}^{\pi_k}(s)   -  \widehat{V}_{1}^{\pi_k}(s)     \right| \right] +\mathbb{E}_{k-1} \left[\left|  \widehat{V}_{1}^{\pi_k}(s)- \widetilde{\widetilde{V'}}_1^{\pi_k}(s)       \right|\right]  \\
=  &  \mathbb{E}_{k-1} \left[ \mathbb{E}_{k-1} \left[  \left|\underbrace{ \widetilde{V'}_{1}^{\pi_k}(s)   -  \widehat{V}_{1}^{\pi_k}(s)  }_{I_1 \ge 0}   \right| \mid \pi_k \right] \right]  + \mathbb{E}_{k-1} \left[ \mathbb{E}_{k-1} \left[\left| \underbrace{  \widehat{V}_{1}^{\pi_k}(s)- \widetilde{\widetilde{V'}}_1^{\pi_k}(s) }_{I_2 \le 0}      \right| \mid \pi_k \right] \right] \\
=^{(b)}  &  \mathbb{E}_{k-1} \left[ \mathbb{E}_{k-1} \left[  \underbrace{ \widetilde{V'}_{1}^{\pi_k}(s)   -  \widehat{V}_{1}^{\pi_k}(s)  }_{I_1}    \mid \pi_k \right] \right]  + \mathbb{E}_{k-1} \left[ \mathbb{E}_{k-1} \left[ \underbrace{ \widetilde{\widetilde{V'}}_1^{\pi_k}(s) - \widehat{V}_{1}^{\pi_k}(s) }_{-I_2}      \mid \pi_k \right] \right] \\
=^{(c)}  & \mathbb{E}_{k-1} \left[\widetilde{V'}_{1}^{\pi_k}(s)   -  \widehat{V}_{1}^{\pi_k}(s)  \right]  +   \mathbb{E}_{k-1} \left[ \widetilde{\widetilde{V'}}_1^{\pi_k}(s) - \widehat{V}_{1}^{\pi_k}(s)    \right]\quad,
\end{array}
\label{temp 98}
\end{equation}
where inequality (a) uses (\ref{temp 503}) and equality (b) uses the fact that 
conditioned on $\mathcal{F}_{k-1} = F_{k-1}$ and $\pi_k$, we have $I_1 \ge 0$ since MDPs $\tilde{M}'_k$ and $\hat{M}_k$ are constructed based on the same $\widehat{P}^{k-1} $ and O-TS-MDP guarantees that $\widetilde{\mu'}_{s,a,t}^k \ge \widehat{\mu}_{s,a,t}^{k-1}$ holds for all $(s,a,t)$ simultaneously. Similarly, we have $I_2 \le 0$. Equality (c) uses the law of total expectation.


\paragraph{Upper bound $\lambda$. }
By plugging (\ref{temp 99}) and (\ref{temp 98}) into (\ref{temp 100}), we have
\begin{equation}
\begin{array}{lll}
   \lambda &\le & \frac{1}{c_0} \cdot \left(\mathbb{E}_{k-1} \left[\widetilde{V'}_{1}^{\pi_k}(s)   -  \widehat{V}_{1}^{\pi_k}(s)  \right]  + \mathbb{E}_{k-1} \left[ \widetilde{\widetilde{V'}}_1^{\pi_k}(s) - \widehat{V}_{1}^{\pi_k}(s)    \right] \right) \\
   & = & O \left(\mathbb{E}_{k-1} \left[\widetilde{V'}_{1}^{\pi_k}(s)   -  \widehat{V}_{1}^{\pi_k}(s) \right]    + \mathbb{E}_{k-1} \left[ \widetilde{\widetilde{V'}}_1^{\pi_k}(s) - \widehat{V}_{1}^{\pi_k}(s)    \right] \right)\quad.
    \end{array}
\end{equation}
By plugging the upper bound of $\lambda$ into (\ref{temp 93}), we have
\begin{equation}
    \begin{array}{lll}
         \mathbb{E} \left[ \bm{1} \left\{\mathcal{E}_{\pi_*}^k \right\} \cdot \left(V_1^{\pi_*}(s) - \widetilde{V'}_{1}^{\pi_k}(s)        \right) \right] 
       & \le & O \left(\mathbb{E}\left[\widetilde{V'}_{1}^{\pi_k}(s)   -  \widehat{V}_{1}^{\pi_k}(s) \right]    + \mathbb{E} \left[ \widetilde{\widetilde{V'}}_1^{\pi_k}(s) - \widehat{V}_{1}^{\pi_k}(s)    \right] \right)\quad.
          \label{temp 97}
    \end{array}
\end{equation}
Now, we use Lemma~\ref{temp 78} to upper bound  (\ref{temp 97}). We have
\begin{equation}
    \begin{array}{l}
         \mathbb{E} \left[\widetilde{V'}_{1}^{\pi_k}(s)   -  \widehat{V}_{1}^{\pi_k}(s)  \right] \le  \sqrt{SH}\sum\limits_{t=1}^{H}  \mathbb{E} \left[ \sigma_{s_t, \pi_k(s_t,t),t}^k \right]\quad.
        \end{array}
        \end{equation}
        Similarly, we have
\begin{equation}
    \begin{array}{l}
         \mathbb{E} \left[\widetilde{\widetilde{V'}}_{1}^{\pi_k}(s)   -  \widehat{V}_{1}^{\pi_k}(s) \right] 
           \le \sqrt{SH}  \sum\limits_{t=1}^{H}  \mathbb{E} \left[\sigma_{s_t, \pi_k(s_t,t),t}^k \right]\quad,
                   \end{array}
\end{equation}
which concludes the proof. 
\end{proof}
Before presenting the proof of Lemma~\ref{temp 78}, we present Lemma~\ref{stochastic dominance 1} first.

\begin{lemma} 
\label{stochastic dominance 1}
Let $\widehat{\mu}$ be a constant in $\mathbb{R}$ and $\sigma > 0$ be a positive  constant. Let    $\widetilde{\mu}'$ be a random variable that is drawn from a distribution  with  probability density function $f'(x)$ defined as the following.
\begin{equation}
    f'(x) = \left\{ \begin{array}{ll}
         0, & \mbox{if $x < \widehat{\mu}$};\\
        \phi \left(x; \widehat{\mu}, \sigma^2 \right) +0.5 \cdot  \delta(x-\widehat{\mu}) ,& \mbox{if $x \ge \widehat{\mu}$},\end{array} \right. 
\end{equation}
where $\phi \left(x; \widehat{\mu}, \sigma^2 \right)$  denotes the PDF of $\mathcal{N}\left(\widehat{\mu}, \sigma^2 \right)$ and $\delta(x) $ denotes the Dirac delta function.
Then, we have 
$\mathbb{E}\left[\widetilde{\mu}' \right] - \widehat{\mu} \le \sigma$.
\end{lemma}
\begin{proof}[Proof of Lemma~\ref{stochastic dominance 1}]
We  use the definition of expectation to complete the proof. We have
\begin{equation}
\begin{array}{lll}  \mathbb{E}\left[\widetilde{\mu}' \right]  
  &= & \int_{- \infty}^{+ \infty} x f'(x)dx \\
  & = & \int_{\widehat{\mu}}^{+ \infty} x f'(x)dx \\
   &= & \int_{\widehat{\mu}}^{+ \infty} x \cdot \left(\phi \left(x; \widehat{\mu}, \sigma^2 \right) +0.5 \cdot  \delta(x-\widehat{\mu}) \right)dx \\
   &= &  \int_{\widehat{\mu}}^{+ \infty}  (x-\widehat{\mu}) \frac{1}{\sigma \sqrt{2\pi}} e^{- \frac{(x-\widehat{\mu})^2}{2 \sigma^2}} dx +  \int_{\widehat{\mu}}^{+ \infty}  \widehat{\mu} \frac{1}{\sigma \sqrt{2\pi}} e^{- \frac{(x-\widehat{\mu})^2}{2 \sigma^2}} dx  +  0.5 \widehat{\mu} \\
   &\le & \sigma + \widehat{\mu}\quad.
    \end{array}
\end{equation}\end{proof}
\begin{proof}[Proof of Lemma~\ref{temp 78} (Posterior deviation lemma)] 
We have
\begin{equation}
    \begin{array}{lll}
       \mathbb{E} \left[  \widetilde{V'}_{1}^{\pi_k}(s) - \widehat{V}_1^{\pi_k}(s)           \right]  &  =
         & \mathbb{E} \left[ \mathbb{E} \left[   \underbrace{\widetilde{V'}_{1}^{\pi_k}(s) - \widehat{V}_1^{\pi_k}(s) }_{\text{LHS in Lemma~\ref{value difference lemma}}}   \mid  \mathcal{F}_{k-1}  , \tilde{M}'_k \right] \right] \\
         & = & \mathbb{E} \left[ \mathbb{E} \left[ \underbrace{ \mathbb{E}_{s_2, \dotsc, s_H} \left[\sum\limits_{t=1}^{H} \left(\widetilde{\mu'}_{s_t, \pi_k(s_t, t),t}^{k} - \widehat{\mu}_{s_t, \pi_k(s_t, t),t}^{k-1} \right) \mid  \mathcal{F}_{k-1}, \tilde{M}'_k\right]  }_{\text{RHS in Lemma~\ref{value difference lemma}}}\mid  \mathcal{F}_{k-1}, \tilde{M}'_k  \right] \right] \\
           & = & \mathbb{E} \left[\mathbb{E}_{s_2, \dotsc, s_H} \left[\sum\limits_{t=1}^{H} \left(\widetilde{\mu'}_{s_t, \pi_k(s_t, t),t}^{k} - \widehat{\mu}_{s_t, \pi_k(s_t, t),t}^{k-1} \right) \mid  \mathcal{F}_{k-1}, \tilde{M}'_k \right]\right] \\
           & = & \mathbb{E} \left[\sum\limits_{t=1}^{H} \left(\widetilde{\mu'}_{s_t, \pi_k(s_t, t),t}^{k} - \widehat{\mu}_{s_t, \pi_k(s_t, t),t}^{k-1} \right) \right] \\
           & = & \sum\limits_{t=1}^{H} \mathbb{E} \left[ \underbrace{\mathbb{E}\left[  \left(\widetilde{\mu'}_{s_t, \pi_k(s_t, t),t}^{k} - \widehat{\mu}_{s_t, \pi_k(s_t, t),t}^{k-1} \right) \mid \mathcal{F}_{k-1}, s_t, \pi_k \right]}_{\text{Lemma~\ref{stochastic dominance 1}}} \right] \\
           & \le & \sum\limits_{t=1}^{H}\mathbb{E}_{}\left[  \sqrt{SH} \cdot \sigma_{s_t, \pi_k(s_t,t),t}^{k} 
   \right]\quad,
    \end{array}
\end{equation}
where the second equality uses the fact that conditioned on $\mathcal{F}_{k-1}$ and $\tilde{M}'_k$, the policy $\pi_k$ is determined.
\end{proof}


\begin{proof}[Proof of Lemma~\ref{temp 91} (Empirical deviation lemma)]
We have 
 \begin{equation}
    \begin{array}{ll}
      &  \mathbb{E} \left[ \left(\widehat{V}_{1}^{\pi_k}(s) - {V}_{1}^{\pi_k}(s) \right) \cdot \bm{1} \left\{ \mathcal{E}^k \right\} \right]\\
      =  & \mathbb{E} \left[ \mathbb{E} \left[ \underbrace{\left(\widehat{V}_{1}^{\pi_k}(s) - {V}_{1}^{\pi_k}(s) \right) }_{\text{LHS in Lemma~\ref{value difference lemma}}}\cdot \bm{1} \left\{ \mathcal{E}^k \right\}  \mid \pi_k , \mathcal{F}_{k-1}\right]\right]\\
         % =  & \mathbb{E} \left[ \mathbb{E} \left[ \left(\widehat{V}_{1}^{\pi_k} - {V}_{1}^{\pi_k} \right)  \mid \mathcal{F}_{k-1}, \pi_k \right]\right]\\
          = & \mathbb{E} \left[ \mathbb{E} \left[ \bm{1} \left\{ \mathcal{E}^k \right\} \underbrace{  \mathbb{E}_{s_2, \dotsc, s_H} \left[\sum\limits_{t=1}^{H} \left(\widehat{\mu}_{s_t, \pi_k(s_t,t),t}^{k-1} - \mu_{s_t, \pi_k(s_t,t),t} \right) + \left(\widehat{P}_{s_t, \pi_k(s_t,t),t}^{k-1} - P_{s_t, \pi_k(s_t,t),t} \right)^{\intercal} V_{t+1}^{\pi_k} \mid  \pi_k, \mathcal{F}_{k-1}\right] }_{\text{RHS in Lemma~\ref{value difference lemma}}}\mid  \pi_k, \mathcal{F}_{k-1} \right]\right]\\
                \le  & \mathbb{E} \left[  \mathbb{E} \left[  \bm{1} \left\{ \mathcal{E}^k \right\}  \mathbb{E}_{s_2, \dotsc, s_H} \left[\sum\limits_{t=1}^{H}\underbrace{ \left|\widehat{\mu}_{s_t, \pi_k(s_t,t),t}^{k-1} - \mu_{s_t, \pi_k(s_t,t),t} \right| }_{\le \sigma_{s_t, \pi_k(s_t,t),t}^{k}}+ \underbrace{\left| \left(\widehat{P}_{s_t, \pi_k(s_t,t),t}^{k-1} - P_{s_t, \pi_k(s_t,t),t} \right)^{\intercal} V_{t+1}^{\pi_k} \right|}_{\le \sigma_{s_t, \pi_k(s_t,t),t}^k }\mid  \pi_k, \mathcal{F}_{k-1}\right] \mid  \pi_k, \mathcal{F}_{k-1} \right]\right]\\
                  \le  & \mathbb{E} \left[  \mathbb{E} \left[  \mathbb{E}_{s_2, \dotsc, s_H} \left[\sum\limits_{t=1}^{H}2\sigma_{s_t, \pi_k(s_t,t),t}^{k}\mid  \pi_k, \mathcal{F}_{k-1}\right] \mid  \pi_k , \mathcal{F}_{k-1}\right]\right]\\
                  =  & \mathbb{E} \left[  \mathbb{E}_{s_2, \dotsc, s_H} \left[\sum\limits_{t=1}^{H}2\sigma_{s_t, \pi_k(s_t,t),t}^{k}\mid  \pi_k, \mathcal{F}_{k-1}\right]\right]\\
                  =  & \sum\limits_{t=1}^{H}\mathbb{E} \left[2\sigma_{s_t, \pi_k(s_t,t),t}^{k}\right]\quad.
         \end{array}
    \end{equation}
   The last inequality uses the fact that if event $\mathcal{E}^k$ is true, we have $
 \left| \left(\widehat{\mu}_{s,a,t}^{k-1} - \mu_{s,a,t} \right) \right| \le \sigma_{s,a,t}^{k}$ and $
 \left| \left(\widehat{P}_{s,a,t}^{k-1} - P_{s,a,t} \right)^{\intercal} V_{t+1}^{\pi_k} \right| \le \sigma_{s,a,t}^{k}$ for all $(s,a,t)$ hold   simultaneously. \end{proof}


    \begin{proof}[Proof of Lemma~\ref{temp 92}]
    The proof is very similar to the proof of Lemma~5 in \citet{russo2019worst}. We use anti-concentration bounds of Gaussian distributions shown in Lemma~\ref{Gaussians}. Let $\mathbb{P}_{k-1} \left\{\cdot \right\}= \mathbb{P}\left\{\cdot \mid \mathcal{F}_{k-1} = F_{k-1}\right\}$ and $\mathbb{E}_{k-1} \left[\cdot \right]= \mathbb{E}\left[\cdot \mid \mathcal{F}_{k-1} = F_{k-1}\right]$.
    
    We first rewrite the LHS in (\ref{temp 55}) as
\begin{equation}
    \begin{array}{l}
          \mathbb{P}_{k-1} \left\{  \bm{1} \left\{\mathcal{E}_{\pi_*}^k \right\}  \left(\widetilde{V}_{1}^{\pi_*}(s)    - V_1^{\pi_*}(s) \right) \ge 0  \right\} 
         =   \mathbb{P}_{k-1} \left\{ \bm{1} \left\{\mathcal{E}_{\pi_*}^k \right\} \cdot \underbrace{ \left( \widetilde{V}_{1}^{\pi_*}(s) - \widehat{V}_{1}^{\pi_*}(s)   \right) }_{I_2} \ge  \bm{1} \left\{\mathcal{E}_{\pi_*}^k \right\} \cdot \underbrace{\left(V_1^{\pi_*}(s) -  \widehat{V}_{1}^{\pi_*}(s) \right)}_{I_1}  \right\}. 
                         \end{array}
                         \label{temp 56}
\end{equation}
Conditioned on $\mathcal{F}_{k-1} = F_{k-1}$ such that event $\mathcal{E}_{\pi_*}^k $ is true, we construct an upper bound on $I_1$. We have
\begin{equation}
    \begin{array}{lll}
         I_1 & = & 
         
         
         \underbrace{  V_1^{\pi_*}(s) -  \widehat{V}_{1}^{\pi_*}(s) }_{\text{LHS in Lemma~\ref{value difference lemma}}}  \\
         & = & \underbrace{ \mathbb{E}_{s_2, \dotsc, s_H} \left[\sum\limits_{t=1}^{H} \left(\mu_{s_t, \pi_*(s_t,t),t}- \widehat{\mu}_{s_t, \pi_*(s_t,t),t}^{k-1}   \right) + \left(P_{s_t, \pi_*(s_t,t),t} - \widehat{P}_{s_t, \pi_*(s_t,t),t}^{k-1}   \right)^{\intercal} V_{t+1}^{\pi_*} \mid \mathcal{F}_{k-1} = F_{k-1} \right]}_{\text{RHS in Lemma~\ref{value difference lemma}}} \\
         & \le & \mathbb{E}_{s_2, \dotsc, s_H} \left[\sum\limits_{t=1}^{H} \left|\widehat{\mu}_{s_t, \pi_*(s_t,t),t}^{k-1} - \mu_{s_t, \pi_*(s_t,t),t} \right| + \left|\left(\widehat{P}_{s_t, \pi_*(s_t,t),t}^{k-1} - P_{s_t, \pi_*(s_t,t),t} \right)^{\intercal} V_{t+1}^{\pi_*} \right| \mid \mathcal{F}_{k-1} = F_{k-1 }\right] \\
          & \le & \mathbb{E}_{s_2, \dotsc, s_H} \left[\sum\limits_{t=1}^{H} 2 \sigma_{s_t, \pi_*(s_t,t),t}^k \mid \mathcal{F}_{k-1} = F_{k-1 }\right] \quad,
 \end{array}
\end{equation}
where the expectation is taken over the sampled trajectory $s_2, \dotsc, s_H$ drawn following $\pi_*$ and  $ \widehat{P}^{k-1}$ given $s$ as the initial state. The last inequality uses the fact that if event 
$\mathcal{E}_{\pi_*}^k$ is true, we have $
\left|\mu_{s,a,t} -\widehat{\mu}_{s,a,t}^{k-1}\right| \le  \sigma_{s,a,t}^{k}$ and $ \left|\left(P_{s,a,t} - \widehat{P}^{k-1}_{s,a,t}\right)^{\intercal}V^{\pi_*}_{t+1} \right|\le \sigma_{s,a,t}^{k}$ for all $(s,a,t)$ hold simultaneously.

By plugging the upper bound on $I_1$ in (\ref{temp 56}), we have
\begin{equation}
    \begin{array}{lll}
     (\ref{temp 56}) & =    &  \mathbb{P}_{k-1} \left\{ \bm{1} \left\{\mathcal{E}_{\pi_*}^k \right\} \cdot \underbrace{ \left( \widetilde{V}_{1}^{\pi_*}(s) - \widehat{V}_{1}^{\pi_*}(s)   \right) }_{I_2} \ge  \bm{1} \left\{\mathcal{E}_{\pi_*}^k \right\} \cdot \underbrace{\left(V_1^{\pi_*}(s) -  \widehat{V}_{1}^{\pi_*}(s) \right)}_{I_1}  \right\} \\
      &  \ge &  \mathbb{P}_{k-1} \left\{ \bm{1} \left\{\mathcal{E}_{\pi_*}^k \right\} \cdot  \left( \widetilde{V}_{1}^{\pi_*}(s) - \widehat{V}_{1}^{\pi_*}(s)   \right)  \ge  \bm{1} \left\{\mathcal{E}_{\pi_*}^k \right\} \cdot \underbrace{\left(\mathbb{E}_{s_2, \dotsc, s_H} \left[\sum\limits_{t=1}^{H} 2 \sigma_{s_t, \pi_*(s_t,t),t}^k \mid \mathcal{F}_{k-1} = F_{k-1 }\right] \right)}_{\text{an upper bound on }I_1}  \right\} \\
        &   = &  \mathbb{P}_{k-1} \left\{  \left( \widetilde{V}_{1}^{\pi_*}(s) - \widehat{V}_{1}^{\pi_*}(s)   \right) \ge \mathbb{E}_{s_2, \dotsc, s_H} \left[\sum\limits_{t=1}^{H} 2 \sigma_{s_t, \pi_*(s_t,t),t}^k \mid \mathcal{F}_{k-1} = F_{k-1 }\right]   \right\} \\
%              = &  \mathbb{P}_{k-1} \left\{  \underbrace{ \left( \widetilde{V}_{1}^{\pi_*}(s) - \widehat{V}_{1}^{\pi_*}(s)   \right) }_{I_2} \ge \mathbb{E}_{s_2, \dotsc, s_H} \left[\sum\limits_{t=1}^{H} 2 \sigma_{s_t, \pi_*(s_t,t),t}^k \mid  \tilde{M}_k, \mathcal{F}_{k-1} = F_{k-1 }\right]   \right\} \\
              &    = &  \mathbb{E}_{k-1} \left[ \bm{1} \left\{   \left( \widetilde{V}_{1}^{\pi_*}(s) - \widehat{V}_{1}^{\pi_*}(s)   \right) \ge \mathbb{E}_{s_2, \dotsc, s_H} \left[\sum\limits_{t=1}^{H} 2 \sigma_{s_t, \pi_*(s_t,t),t}^k \mid   \mathcal{F}_{k-1} = F_{k-1 }\right]   \right\}  \right] \\
              &        = &  \mathbb{E}_{k-1} \left[ \mathbb{E}_{k-1} \left[ \bm{1} \left\{  \underbrace{ \left( \widetilde{V}_{1}^{\pi_*}(s) - \widehat{V}_{1}^{\pi_*}(s)   \right) }_{I_2} \ge \mathbb{E}_{s_2, \dotsc, s_H} \left[\sum\limits_{t=1}^{H} 2 \sigma_{s_t, \pi_*(s_t,t),t}^k \mid   \mathcal{F}_{k-1} = F_{k-1 }\right]   \right\} \mid \tilde{M}_k \right] \right] \quad.
        \end{array}
        \label{temp 702}
        \end{equation}
        Now, conditioned on $\mathcal{F}_{k-1} = F_{k-1}$ and $\tilde{M}_k$, we rewrite $I_2$ as
        \begin{equation}
            \begin{array}{lllll}
                 I_2 & = &  \widetilde{V}_{1}^{\pi_*}(s) - \widehat{V}_{1}^{\pi_*}(s)   & = & \mathbb{E}_{s_2, \dotsc, s_H} \left[\sum\limits_{t=1}^{H} \left(\widetilde{\mu}_{s_t, \pi_*(s_t,t),t}^{k} - \widehat{\mu}_{s_t, \pi_*(s_t,t),t}^{k-1}  \right)  \mid \mathcal{F}_{k-1} = F_{k-1}, \tilde{M}_k\right]  \quad,
            \end{array}
        \end{equation}
        where the expectation is still taken over the sampled trajectory $s_2, \dotsc, s_H$ drawn following $\pi_*$ and  $ \widehat{P}^{k-1} $ given $s$ as the initial state. 
Then, we have
\begin{equation}
    \begin{array}{lll}
       (\ref{temp 702})  & = & \mathbb{E}_{k-1} \left[ \mathbb{E}_{k-1} \left[ \bm{1} \left\{  \underbrace{ \left( \widetilde{V}_{1}^{\pi_*}(s) - \widehat{V}_{1}^{\pi_*}(s)   \right) }_{I_2} \ge \mathbb{E}_{s_2, \dotsc, s_H} \left[\sum\limits_{t=1}^{H} 2 \sigma_{s_t, \pi_*(s_t,t),t}^k \mid   \mathcal{F}_{k-1} = F_{k-1 }\right]   \right\} \mid \tilde{M}_k \right] \right]  \\
        & = & \mathbb{E}_{k-1} \left[ \mathbb{E}_{k-1} \left[ \bm{1} \left\{ \mathbb{E}_{s_2, \dotsc, s_H} \left[\sum\limits_{t=1}^{H} \left(\widetilde{\mu}_{s_t, \pi_*(s_t,t),t}^{k} - \widehat{\mu}_{s_t, \pi_*(s_t,t),t}^{k-1}  \right)  \mid \mathcal{F}_{k-1} = F_{k-1}, \tilde{M}_k\right]  \ge \right.\right.\right. \\
         & & \quad\quad\quad\quad\quad\quad\quad \left.\left.\left. \mathbb{E}_{s_2, \dotsc, s_H} \left[\sum\limits_{t=1}^{H} 2 \sigma_{s_t, \pi_*(s_t,t),t}^k \mid   \mathcal{F}_{k-1} = F_{k-1 }\right]   \right\} \mid \tilde{M}_k \right] \right] \\
         & = & \mathbb{E}_{k-1} \left[ \mathbb{E}_{k-1} \left[ \bm{1} \left\{ \mathbb{E}_{s_2, \dotsc, s_H} \left[\sum\limits_{t=1}^{H} \left(\widetilde{\mu}_{s_t, \pi_*(s_t,t),t}^{k} - \widehat{\mu}_{s_t, \pi_*(s_t,t),t}^{k-1}  \right)  \mid \mathcal{F}_{k-1} = F_{k-1}, \tilde{M}_k\right]  \ge \right.\right.\right. \\
         & & \quad\quad\quad\quad\quad\quad\quad \left.\left.\left. \mathbb{E}_{s_2, \dotsc, s_H} \left[\sum\limits_{t=1}^{H} 2 \sigma_{s_t, \pi_*(s_t,t),t}^k \mid   \mathcal{F}_{k-1} = F_{k-1 }, \tilde{M}_k \right]   \right\} \mid \tilde{M}_k \right] \right] \\
         & = & \mathbb{E}_{k-1} \left[ \mathbb{E}_{k-1} \left[ \bm{1} \left\{ \underbrace{\mathbb{E}_{s_2, \dotsc, s_H} \left[\sum\limits_{t=1}^{H} \left(\widetilde{\mu}_{s_t, \pi_*(s_t,t),t}^{k} - \widehat{\mu}_{s_t, \pi_*(s_t,t),t}^{k-1} -2 \sigma_{s_t, \pi_*(s_t,t),t}^k \right)  \mid \mathcal{F}_{k-1} = F_{k-1}, \tilde{M}_k\right]}_{I_3}  \ge 0 \right\} \mid \tilde{M}_k \right] \right].
         \end{array}
         \label{temp 703}
         \end{equation}
For each $(s,a,t)$, let $n_{s,a,t}^k := \widetilde{\mu}_{s,a,t}^k - \widehat{\mu}_{s,a,t}^{k-1}$ and  $\omega_{s,a,t}$ be the probability that $(s,a)$ is visited in round $t$ when following policy $\pi_*$ and $\widehat{P}^{k-1}$ given $s$ as the initial state. Now, we express $I_3$ as
\begin{equation}
    \begin{array}{lll}
   I_3 & = & 
\underbrace{ \sum\limits_{t=1}^{H} \sum\limits_{ s \in \mathcal{S}} \omega_{s, \pi_*(s,t),t} \cdot n^k_{s, \pi_*(s,t),t} }_{=:X} - \underbrace{\sum\limits_{t=1}^{H} \sum\limits_{ s \in \mathcal{S}} \omega_{s, \pi_*(s,t),t} \cdot 2 \sigma_{s, \pi_*(s,t),t}^k }_{=:z}  \quad.
    \end{array}
\end{equation}
Since $\widetilde{\mu}_{s,a,t}^k \sim \mathcal{N} \left( \widehat{\mu}_{s,a,t}^{k-1}, SH \left(\sigma_{s,a,t}^k\right)^2 \right)$,  we know $n_{s,a,t}^k \sim \mathcal{N} \left(0,  SH \left(\sigma_{s,a,t}^k\right)^2 \right)$, which further implies   $X \sim \mathcal{N} \left(0,  \sum\limits_{t=1}^{H} \sum\limits_{s \in \mathcal{S}} SH \cdot \left(\omega_{s, \pi_*(s,t),t} \right)^2 \cdot \left(\sigma^k_{s,\pi_*(s,t),t}\right)^2 \right)$.
 Now, we construct an upper bound for $z$ using the Cauchy--Schwarz inequality. We have 
 \begin{equation}
     \begin{array}{lll}
     z &     
     \le & \sqrt{\sum\limits_{t=1}^{H} \sum\limits_{s \in \mathcal{S}}  2^2}  \sqrt{\sum\limits_{t=1}^{H} \sum\limits_{s \in \mathcal{S}} \left(\omega_{s, \pi_*(s,t),t} \right)^2  \left(\sigma_{s, \pi_*(s,t),t}^{k}\right)^2  }
     \le  2\sqrt{SH}  \sqrt{\sum\limits_{t=1}^{H} \sum\limits_{s \in \mathcal{S}} \left(\omega_{s, \pi_*(s,t),t}\right)^2  \left(\sigma_{s, \pi_*(s,t),t}^{k}\right)^2  }.     
     \end{array}
 \end{equation}
 
     
Now, we come back to  (\ref{temp 703})  and have
\begin{equation}
    \begin{array}{lll}
         (\ref{temp 703}) & =  & \mathbb{E}_{k-1} \left[ \mathbb{E}_{k-1} \left[ \bm{1} \left\{ \underbrace{ \sum\limits_{t=1}^{H} \sum\limits_{ s \in \mathcal{S}} \omega_{s, \pi_*(s,t),t} \cdot n^k_{s, \pi_*(s,t),t} - \sum\limits_{t=1}^{H} \sum\limits_{ s \in \mathcal{S}} \omega_{s, \pi_*(s,t),t} \cdot 2 \sigma_{s, \pi_*(s,t),t}^k }_{= I_3}  \ge 0 \right\} \mid \tilde{M}_k \right] \right] \\
         
         & =  & \mathbb{E}_{k-1} \left[ \bm{1} \left\{\underbrace{  \sum\limits_{t=1}^{H} \sum\limits_{ s \in \mathcal{S}} \omega_{s, \pi_*(s,t),t} \cdot n^k_{s, \pi_*(s,t),t} }_{X}- \underbrace{ \sum\limits_{t=1}^{H} \sum\limits_{ s \in \mathcal{S}} \omega_{s, \pi_*(s,t),t} \cdot 2 \sigma_{s, \pi_*(s,t),t}^k }_{z}  \ge 0 \right\}  \right] \\
         & =  & \mathbb{P}_{k-1} \left\{\sum\limits_{t=1}^{H} \sum\limits_{s \in \mathcal{S}}\omega_{s, \pi_*(s,t),t} \cdot n_{s, \pi_*(s,t),t}^{k}      - \sum\limits_{t=1}^{H} \sum\limits_{s \in \mathcal{S}}\omega_{s, \pi_*(s,t),t} \cdot 2 \sigma_{s, \pi_*(s,t),t}^{k} \ge 0  \right\} \\
         &  \ge &  \mathbb{P}_{k-1} \left\{\underbrace{\sum\limits_{t=1}^{H} \sum\limits_{s \in \mathcal{S}}\omega_{s, \pi_*(s,t),t} \cdot n_{s, \pi_*(s,t),t}^{k}   }_{ X}    \ge  2 \cdot \sqrt{SH\sum\limits_{t=1}^{H} \sum\limits_{s \in \mathcal{S}} \left( \omega_{s, \pi_*(s,t),t} \right)^2 \cdot \left(\sigma_{s, \pi_*(s,t),t}^{k}\right)^2  } \right\} \\
               &   \ge & c_0\quad,
    \end{array}
    \label{temp 704}
\end{equation}
where the last inequality uses anti-concentration bounds of Gaussian distributions shown in Lemma~\ref{Gaussians}. \end{proof}









\subsection{Proofs for Theorem~\ref{regret theorem 2}} \label{app: regret proof 2}

\begin{proof}[Proof of Lemma~\ref{opt lemma 2} (Optimism lemma)]
As O-TS-MDP$^+$ guarantees $\overline{V}_{1}^{\pi_*}(s)  \le \widetilde{V'}_{1}^{\pi_*}(s) \le \widetilde{V'}_{1}^{\pi_k}(s) $,
we only need to show that if event $\mathcal{E}_{\pi_*}^k$ is true, we have $V_{1}^{\pi_*}(s) \le \overline{V}_{1}^{\pi_*}(s) $ to complete the proof.   We use backwards induction to prove this claim. Recall  event 
$\mathcal{E}_{\pi_*}^k =
\left\{\left|\mu_{s,a,t} -\widehat{\mu}_{s,a,t}^{k-1} \right| \le  \sigma_{s,a,t}^{k},
\left|\left(P_{s,a,t} - \widehat{P}^{k-1}_{s,a,t}\right)^{\intercal}V^{\pi_*}_{t+1} \right| \le \sigma_{s,a,t}^{k}, \forall s,a,t \right\}$. Define $\overline{V}_{H+1}^{\pi_*} =V_{H+1}^{\pi_*}=\Vec{0}$.  If event $\mathcal{E}_{\pi_*}^k$ is true, we have the following.
    
    When $t = H$, for any $s$, we have 
    \begin{equation}
    \begin{array}{lll}
        V_{H}^{\pi_*}(s) &=& \mu_{s,\pi_*(s,H),H} + \underbrace{P_{s,\pi_*(s,H),H}^{\intercal}V_{H+1}^{\pi_*}}_{=0} \\
        &\le& \widehat{\mu}^{k-1}_{s,\pi_*(s,H),H} + \sigma_{s,\pi_*(s,H),H}^{k} \\
        &< & \widehat{\mu}^{k-1}_{s,\pi_*(s,H),H} + 2\sigma_{s,\pi_*(s,H),H}^{k}  \\
        &= & \overline{\mu}_{s,\pi_*(s,H),H}^k + \underbrace{\left\langle \widehat{P}_{s,\pi_*(s,H),H}^{k-1}, \overline{V}_{H+1}^{\pi_*} \right\rangle }_{=0} \\
        &= &\overline{V}_{H}^{\pi_*}(s)\quad.
            \end{array}
    \end{equation}
  When $t = H-1$, for any $s$, we have
    \begin{equation}
        \begin{array}{lll}
           V_{H-1}^{\pi_*}(s) &
           =&\mu_{s,\pi_*(s,H-1),H-1} + P_{s,\pi_*(s,H-1),H-1}^{\intercal} V_{H}^{\pi_*} \\
        &   \le& \widehat{\mu}^{k-1}_{s,\pi_*(s,H-1),H-1} + \sigma_{s,\pi_*(s,H-1),H-1}^{k}  + \left\langle \widehat{P}_{s,\pi_*(s,H-1),H-1}^{k-1} ,V_{H}^{\pi_*} \right\rangle + \sigma_{s,\pi_*(s,H-1),H-1}^{k} \\
         %&  = &\overline{\mu}_{s, \pi_*(s, H-1), H-1} + \left\langle \widehat{P}_{s,\pi_*(s,H-1),H-1}^{k-1} ,\overline{V}_{H}^{\pi_*} \right\rangle \\
         &  \le &\overline{\mu}_{s, \pi_*(s, H-1), H-1}^{k} + \left\langle \widehat{P}_{s,\pi_*(s,H-1),H-1}^{k-1} ,\overline{V}_{H}^{\pi_*} \right\rangle \\
          & = &\overline{V}_{H-1}^{\pi_*}(s)\quad.
        \end{array}
    \end{equation}
\vdots

When $t = 1$, for any $s$, we have
\begin{equation}
    \begin{array}{lll}
         V_{1}^{\pi_*}(s) 
      &  =&\mu_{s,\pi_*(s,1),1} + P_{s,\pi_*(s,1),1}^{\intercal} V_{2}^{\pi_*} \\
       & \le &\widehat{\mu}^{k-1}_{s,\pi_*(s,1),1} + \sigma_{s,\pi_*(s,1),1}^{k}  + \left\langle \widehat{P}_{s,\pi_*(s,1),1}^{k-1} ,V_{2}^{\pi_*} \right\rangle + \sigma_{s,\pi_*(s,1),1}^{k}  \\
        & \le &\overline{\mu}_{s, \pi_*(s, 1), 1}^{k} + \left\langle \widehat{P}_{s,\pi_*(s,1),1}^{k-1} ,\overline{V}_{2}^{\pi_*} \right\rangle \\
        &=& \overline{V}_{1}^{\pi_*}(s)\quad,
    \end{array}
\end{equation}
which concludes the proof.
\end{proof}
\begin{proof}[Proof of Lemma~\ref{post devi lemma} (Posterior deviation lemma)] 
We have
\begin{equation}
    \begin{array}{ll}
         &  \mathbb{E}_{}\left[  \widetilde{V'}_1^{\pi_k}(s)-\overline{V}_1^{\pi_k}(s)   \right]  \\
          = &  \mathbb{E}_{}\left[ \mathbb{E}_{} \left[ \underbrace{\widetilde{V'}_1^{\pi_k}(s)-\overline{V}_1^{\pi_k}(s) }_{\text{LHS in Lemma}~\ref{value difference lemma}}  \mid \mathcal{F}_{k-1},  \tilde{M}'_k  \right] \right]\\
        = &  \mathbb{E}_{}\left[ {\color{black}\mathbb{E}_{} \left[ {\color{black} \underbrace{\mathbb{E}_{s_2, \dotsc, s_H} \left[\sum\limits_{t=1}^{H}\left(\widetilde{\mu'}_{s_t, \pi_k(s_t,t),t}^{k}-\overline{\mu}_{s_t, \pi_k(s_t,t),t}^{k} \right) \mid  \mathcal{F}_{k-1},  \tilde{M}'_k \right]}_{\text{RHS in Lemma}~\ref{value difference lemma}} } \mid  \mathcal{F}_{k-1},  \tilde{M}'_k \right] }\right]\\
         = & \mathbb{E}_{}\left[{\color{black} \mathbb{E}_{s_2, \dotsc, s_H} \left[\sum\limits_{t=1}^{H}\left(\widetilde{\mu'}_{s_t, \pi_k(s_t,t),t}^{k}-\overline{\mu}_{s_t, \pi_k(s_t,t),t}^{k} \right) \mid  \mathcal{F}_{k-1} ,  \tilde{M}'_k   \right] } \right]\\
          = & \sum\limits_{t=1}^{H}\mathbb{E}_{}\left[ \widetilde{\mu'}_{s_t, \pi_k(s_t,t),t}^{k}-\overline{\mu}_{s_t, \pi_k(s_t,t),t}^{k} \right]\\
   
        = & \sum\limits_{t=1}^{H}\mathbb{E}_{}\left[\underbrace{\mathbb{E}_{} \left[   \left( \widetilde{\mu'}_{s_t, \pi_k(s_t,t),t}^{k}-\overline{\mu}_{s_t, \pi_k(s_t,t),t}^{k}\right)
    \mid \pi_k, \mathcal{F}_{k-1}, s_t \right] }_{\text{ Lemma}~\ref{stochastic dominance}} \right] \\
        
        \le & \sum\limits_{t=1}^{H}\mathbb{E}_{}\left[  \sigma_{s_t, \pi_k(s_t,t),t}^{k} 
    \right]\quad,
    \end{array}
\end{equation}
where the first equality uses the fact that conditioned on $\mathcal{F}_{k-1}$ and $\tilde{M}'_k$, MDP $\bar{M}_k$ and policy $\pi_k$ are determined. Note that $\tilde{M}'_k$ and $\bar{M}_k$ are constructed based on the same $\widehat{P}^{k-1}$.
\end{proof}

\begin{proof}[Proof of Lemma~\ref{temp 88} (UCB-like lemma)] 
The proof is very similar to the proof of Lemma~\ref{temp 91} and uses the result of Lemma~\ref{temp 91}. Note that $\pi_k$ may not be the optimal policy for $\bar{M}_k$. We first do the following decomposition. We have
    \begin{equation}
    \begin{array}{lll}
          \mathbb{E}\left[  \left(\overline{V}_1^{\pi_k}(s) -V_1^{\pi_k}(s) \right)  \bm{1} \left\{ \mathcal{E}^k \right\} \right] 
    &   \le &  \mathbb{E}\left[  \left(\overline{V}_1^{\pi_k}(s) -\widehat{V}_1^{\pi_k}(s) \right)  \right] + \underbrace{\mathbb{E}\left[  \left( \widehat{V}_1^{\pi_k}(s) - {V}_1^{\pi_k}(s) \right)  \bm{1} \left\{ \mathcal{E}^k \right\} \right] }_{\le \sum\limits_{t=1}^{H} 
 \mathbb{E}\left[2 \sigma_{s_t, \pi_k(s_t, t),t}^{k}  
  \right], \ \text{Lemma~\ref{temp 91}}}\quad.
        \end{array}
        \label{temp 550}
    \end{equation}

For the first term above, we have
    \begin{equation}
        \begin{array}{ll}
             & \mathbb{E}\left[  \left(\overline{V}_1^{\pi_k}(s) -\widehat{V}_1^{\pi_k}(s) \right) \right]  \\
           = & \mathbb{E}\left[ \mathbb{E} \left[\left(\overline{V}_1^{\pi_k}(s) -\widehat{V}_1^{\pi_k}(s) \right)   \mid \pi_k, \mathcal{F}_{k-1}\right] \right] \\
           = &  \mathbb{E}\left[ \mathbb{E} \left[     \mathbb{E}_{s_2, \dotsc, s_H} \left[\sum\limits_{t=1}^{H} \left(\overline{\mu}_{s_t, \pi_k(s_t, t),t}^k - \widehat{\mu}_{s_t, \pi_k(s_t, t),t}^{k-1} \right)   \mid \pi_k, \mathcal{F}_{k-1}\right]  \mid \pi_k, \mathcal{F}_{k-1}\right] \right] \\
            = &  \mathbb{E}\left[ \mathbb{E}_{s_2, \dotsc, s_H} \left[\sum\limits_{t=1}^{H} \left(\overline{\mu}_{s_t, \pi_k(s_t, t),t}^k - \widehat{\mu}_{s_t, \pi_k(s_t, t),t}^{k-1} \right)   \mid \pi_k, \mathcal{F}_{k-1}\right] 
  \right] \\
  = &  \sum\limits_{t=1}^{H} 
 \mathbb{E}\left[ \left(\overline{\mu}_{s_t, \pi_k(s_t, t),t}^k - \widehat{\mu}_{s_t, \pi_k(s_t, t),t}^{k-1} \right)  
  \right] \\
   = &  \sum\limits_{t=1}^{H} 
 \mathbb{E}\left[2 \sigma_{s_t, \pi_k(s_t, t),t}^{k}  
  \right] \quad.
        \end{array}
    \end{equation}

    
    
    Then, we have 
    \begin{equation}
    \begin{array}{lll}
            (\ref{temp 550}) & \le & \sum\limits_{t=1}^{H} 
 \mathbb{E}\left[O \left( \sigma_{s_t, \pi_k(s_t, t),t}^{k}  
  \right) \right]\quad, 
    \end{array}
    \end{equation}
    which concludes the proof.\end{proof}

\begin{lemma}
Let $\widehat{\mu}$ be a constant in $\mathbb{R}$ and $\sigma > 0$ be a positive  constant.  Let $\overline{\mu} = \widehat{\mu} + z\sigma$, where constant $z >0$.
Let   $\widetilde{\mu}'$ be a random variable that is drawn from a distribution  with  probability density function $f'(x)$ defined as follows.
\begin{equation}
    f'(x) = \left\{ \begin{array}{ll}
        0, & \mbox{if $x < \overline{\mu}$};\\
        \phi \left(x; \widehat{\mu}, \sigma^2 \right) +  \Phi \left(\overline{\mu}; \widehat{\mu}, \sigma^2 \right) \delta(x-\overline{\mu}) ,& \mbox{if $x \ge \overline{\mu}$},\end{array} \right. 
\end{equation}
where $\phi \left(x; \widehat{\mu}, \sigma^2 \right)$ and $\Phi \left(x; \widehat{\mu}, \sigma^2 \right)$ denote the PDF and CDF of $\mathcal{N}\left(\widehat{\mu}, \sigma^2 \right)$, and $\delta(x)$ denotes the Dirac delta function. 
Then, we have $\mathbb{E}\left[\widetilde{\mu}' \right] - \overline{\mu} \le \sigma$.
\label{stochastic dominance}
\end{lemma}


\begin{proof}[Proof of Lemma~\ref{stochastic dominance}]
We use the definition of expectation and   have
\begin{equation}
\begin{array}{ll}
  &   \mathbb{E}\left[\widetilde{\mu}' \right]  \\
  = & \int_{- \infty}^{+ \infty} x f'(x)dx \\
   = &\int_{\overline{\mu}}^{+ \infty} x f'(x)dx   \\


   = & \int_{\overline{\mu}}^{+ \infty} x  \cdot \left( \phi \left(x; \widehat{\mu}, \sigma^2 \right) +  \Phi \left(\overline{\mu}; \widehat{\mu}, \sigma^2 \right) \delta(x-\overline{\mu})\right)dx \\
  =  & \int_{\overline{\mu}}^{+ \infty} (x-\widehat{\mu}) \frac{1}{\sigma \sqrt{2\pi}} e^{- \frac{(x-\widehat{\mu})^2}{2 \sigma^2}} dx  +\int_{\overline{\mu}}^{+ \infty} \widehat{\mu} \frac{1}{\sigma \sqrt{2\pi}} e^{- \frac{(x-\widehat{\mu})^2}{2 \sigma^2}} dx + \overline{\mu} \cdot \Phi \left(\overline{\mu}; \widehat{\mu}, \sigma^2 \right) \\
  = & \int_{\overline{\mu}}^{+ \infty} (x-\widehat{\mu}) \frac{1}{\sigma \sqrt{2\pi}} e^{- \frac{(x-\widehat{\mu})^2}{2 \sigma^2}} dx + \widehat{\mu} \cdot \left(1-\int^{\overline{\mu}}_{- \infty}  \frac{1}{\sigma \sqrt{2\pi}} e^{- \frac{(x-\widehat{\mu})^2}{2 \sigma^2}} dx \right) + \overline{\mu} \cdot \Phi \left(\overline{\mu}; \widehat{\mu}, \sigma^2 \right) \\
    \le &\int_{\widehat{\mu}}^{+ \infty} (x-\widehat{\mu}) \frac{1}{\sigma \sqrt{2\pi}} e^{- \frac{(x-\widehat{\mu})^2}{2 \sigma^2}} dx  +  \widehat{\mu} \cdot \left(1-\Phi \left(\overline{\mu}; \widehat{\mu}, \sigma^2 \right) \right)+ \overline{\mu} \cdot \Phi \left(\overline{\mu}; \widehat{\mu}, \sigma^2 \right)  \\
    \le & \sigma + \widehat{\mu} + z\sigma \cdot \Phi \left(\overline{\mu}; \widehat{\mu}, \sigma^2 \right) \\
    \le & \sigma + \widehat{\mu} + z\sigma  \\
    = & \sigma + \overline{\mu}\quad.
    \end{array}
\end{equation}
\end{proof}
\subsection{Other Lemmas used for MDPs}\label{app: other}
\begin{lemma}
(Value difference lemma, Lemma~3 in \citet{russo2019worst}, Lemma~E.15 in \citet{dann2017unifying}). Consider any fixed policy $\pi$ and two MDPs $M^{(1)} = \left(\mathcal{S}, \mathcal{A}, H, {P}^{(1)}, \mu^{(1)}, p_0 \right)$  and $M^{(2)} = \left(\mathcal{S}, \mathcal{A}, H, {P}^{(2)}, \mu^{(2)}, p_0 \right)$. Let $V_t^{\pi,(1)}$ and $V_t^{\pi,(2)}$ denote the respective value functions of $\pi$ under $M^{(1)}$ and   $M^{(2)}$. Then, for any $s$, we have
\begin{equation}
\begin{array}{ll}
  &  V_1^{\pi,(1)}(s) - V_1^{\pi,(2)}(s) \\
%  = &\mathbb{E}_{\pi,{P}^{(1)}  }  \left[ \sum\limits_{t=1}^{H}\left(\mu^{(1)}_{s_t, \pi(s_t,t),t} - \mu^{(2)}_{s_t,\pi(s_t,t),t} \right) + \left( P^{(1)}_{s_t,\pi(s_t,t),t} - P^{(2)}_{s_t,\pi(s_t,t),t}\right)^{\intercal} V^{\pi,(2)}_{t+1} {\color{red}\mid s_1 = s} \right] \\
    = &\mathbb{E}_{s_2, \dotsc, s_H }  \left[ \sum\limits_{t=1}^{H}\left(\mu^{(1)}_{s_t, \pi(s_t,t),t} - \mu^{(2)}_{s_t,\pi(s_t,t),t} \right) + \left( P^{(1)}_{s_t,\pi(s_t,t),t} - P^{(2)}_{s_t,\pi(s_t,t),t}\right)^{\intercal} V^{\pi,(2)}_{t+1}  \mid s_1 = s  \right] 
   % = &\mathbb{E}_{s_2, \dotsc, s_H }  \left[ \sum\limits_{t=1}^{H}\left(\mu^{(1)}_{s_t, \pi(s_t,t),t} - \mu^{(2)}_{s_t,\pi(s_t,t),t} \right) + \left( P^{(1)}_{s_t,\pi(s_t,t),t} - P^{(2)}_{s_t,\pi(s_t,t),t}\right)^{\intercal} V^{\pi,(2)}_{t+1} \mid s_1 = s \right] \\
\quad,
\end{array}
\end{equation}
where the expectation is over the sampled state trajectory $ s_2, \dotsc, s_H$ drawn following $\pi$ in $M^{(1)}$ given $ s$ as the initial state.
\label{value difference lemma}
\end{lemma}




\begin{lemma}
    (Proposition 5.2 in \citet{agrawal2017posterior}). For any fixed vector $h \in [0, H]^S$, let $\widehat{p} \in \Delta^S$ be the average of $n$ independent multinoulli trials with parameter $p \in \Delta^S$. Then, for any $\delta \in (0,1)$, we have
    \begin{equation}
      \mathbb{P} \left\{  \left| \left(\widehat{p} - p \right)^{\intercal} h\right| > 5  \sqrt{\frac{H^2\log^2(T/\delta)}{n}} \right\} \le \delta \quad.
    \end{equation}
    \label{temp 89}
\end{lemma}
\begin{proof}[Proof of Lemma~\ref{temp 89}]
     Proposition 5.2 in  \cite{agrawal2017posterior} is derived based on Bernstein's inequality already. By setting $c_i = H$ and using the fact that ${\gamma}_i \le p_i$ in Proposition 5.2, 
    we have $2 \sqrt{\log(n/\delta) \sum\limits_{i < S}\frac{\gamma_i c_i^2}{n}} < 2 \sqrt{ \frac{H^2\log^2(T/\delta)}{n}}$ and the fast rate term $3H \frac{\log(2/\delta)}{n} \le 3H \frac{\log(T/\delta)}{n} \le 3H \sqrt{\frac{\log^2(T/\delta)}{n} } $. Combining both terms, we have the stated concentration bound.
\end{proof}


\begin{lemma}
    In any episode $k$, we have 
$ \mathbb{P} \left\{\overline{\mathcal{E}^k} \right\} \le O(SAHT \delta)$ and
$ \mathbb{P} \left\{\overline{\mathcal{E}_{\pi_*}^{k}} \right\} \le O(SAHT \delta)$.

    \label{temp 90}
\end{lemma}
\begin{proof}[Proof of Lemma~\ref{temp 90}] 
For each $(s,a,t)$, we 
 let $\widehat{P}_{s,a,t}^{(n_{s,a,t})}$ denote the average of $n_{s,a,t}$ independent multinoulli trials with parameter $P_{s,a,t}$ and $\widehat{\mu}_{s,a,t}^{(n_{s,a,t})}$ denote the average of $n_{s,a,t}$ independent Bernoulli trials with parameter $\mu_{s,a,t}$.



\paragraph{Proofs for the first claim.} We have
 \begin{equation}
 \begin{array}{lll}
        \mathbb{P} \left\{\overline{\mathcal{E}^k} \right\} &
        \le & \mathbb{P} \left\{\exists s,a,t: 
 \left| \left(\widehat{P}_{s,a,t}^{k-1} - P_{s,a,t} \right)^{\intercal} V_{t+1}^{\pi_k} \right| > \sigma_{s,a,t}^{k}  \right\} 
  +  \mathbb{P} \left\{\exists s,a,t: 
 \left| \left(\widehat{\mu}_{s,a,t}^{k-1} - {\mu}_{s,a,t} \right) \right| > \sigma_{s,a,t}^{k}  \right\}.
 \end{array}
 \end{equation}
For the first term above, we use Lemma~\ref{temp 89}. For the second term above, we use Hoeffding's inequality.  
We have
  \begin{equation}
    \begin{array}{ll}
    & \mathbb{P} \left\{\exists s,a,t: 
 \left| \left(\widehat{P}_{s,a,t}^{k-1} - P_{s,a,t} \right)^{\intercal} V_{t+1}^{\pi_k} \right| > \sigma_{s,a,t}^{k}  \right\}\\
  \le &  \sum\limits_{s,a } \sum\limits_{t=1}^{H} \mathbb{P} \left\{
 \left| \left(\widehat{P}_{s,a,t}^{k-1} - P_{s,a,t} \right)^{\intercal} V_{t+1}^{\pi_k} \right| > \sigma_{s,a,t}^{k}  \right\} \\
  \le &  \sum\limits_{s,a } \sum\limits_{t=1}^{H}\sum\limits_{n_{s,a,t}=1}^{k-1} \mathbb{P} \left\{
 \left| \left(\widehat{P}_{s,a,t}^{(n_{s,a,t})} - P_{s,a,t} \right)^{\intercal} V_{t+1}^{\pi_k} \right| > \sigma_{s,a,t}^{k} \right\} \\
 % = &  \sum\limits_{s,a } \sum\limits_{t=1}^{H}\sum\limits_{n_{s,a,t}=1}^{k-1}  \mathbb{E} \left[\bm{1} \left\{ \left| \left(\widehat{P}_{s,a,t}^{(n_{s,a,t})} - P_{s,a,t} \right)^{\intercal} V_{t+1}^{\pi_k} \right| > O \left(\sqrt{\frac{H^2\log (1/\delta)}{n_{s,a,t}}} \right)\right\} \right]\\
%   = & \sum\limits_{s,a } \sum\limits_{t=1}^{H}\sum\limits_{n_{s,a,t}=1}^{k-1}  \mathbb{E} \left[ \underbrace{\mathbb{E} \left[ \bm{1} \left\{ \left| \left(\widehat{P}_{s,a,t}^{(n_{s,a,t})} - P_{s,a,t} \right)^{\intercal} V_{t+1}^{\pi_k} \right| > O \left(\sqrt{\frac{H^2\log (1/\delta)}{n_{s,a,t}}} \right)\right\} \mid \pi_k \right]}_{}\right]\\
  = &  \sum\limits_{s,a } \sum\limits_{t=1}^{H}\sum\limits_{n_{s,a,t}=1}^{k-1}  \mathbb{E} \left[\underbrace{ \mathbb{P}  \left\{
 \left| \left(\widehat{P}_{s,a,t}^{(n_{s,a,t})} - P_{s,a,t} \right)^{\intercal} V_{t+1}^{\pi_k} \right| > \sigma_{s,a,t}^{k} \mid \pi_k  \right\}}_{\le \delta, \ \text{Lemma~\ref{temp 89}}} \right]\\
  \le & O(SAHT\delta) \quad,
        \end{array}
    \end{equation}
where the last inequality uses Lemma~\ref{temp 89}. Note that $\pi_k$ is random as it is the optimal policy for $\tilde{M}'_k$. Conditioned on $\pi_k$, the value functions $V_{t+1}^{\pi_k} \in [0,H]^S$ are determined.

Similarly, we have
\begin{equation}
    \begin{array}{ll}
         & \mathbb{P} \left\{\exists s,a,t: 
 \left| \left(\widehat{\mu}_{s,a,t}^{k-1} - {\mu}_{s,a,t} \right) \right| >\sigma_{s,a,t}^{k}  \right\} \\
         \le & \sum\limits_{s,a } \sum\limits_{t=1}^{H}\sum\limits_{n_{s,a,t}=1}^{k-1} \underbrace{ \mathbb{P} \left\{
 \left| \left(\widehat{\mu}_{s,a,t}^{(n_{s,a,t})} - \mu_{s,a,t} \right) \right| > \sigma_{s,a,t}^{k}  \right\}}_{\text{Hoeffding's inequality}} \\
 \le & O(SAHT\delta)\quad,
    \end{array}
    \label{temp 77}
\end{equation}
which concludes the proof for the first claim.




\paragraph{Proofs for the second claim.} We have
\begin{equation}
    \begin{array}{lll}
          \mathbb{P} \left\{\overline{\mathcal{E}_{\pi_*}^{k}} \right\} 
       & \le  & \mathbb{P} \left\{\exists (s,a,t): \left|\left(P_{s,a,t} - \widehat{P}^{k-1}_{s,a,t}\right)^{\intercal}V^{\pi_*}_{t+1} \right| > \sigma_{s,a,t}^{k} \right\} +  \mathbb{P} \left\{\exists (s,a,t): \left|\mu_{s,a,t} -\widehat{\mu}_{s,a,t}^{k-1}\right| > \sigma_{s,a,t}^{k} \right\}\quad.
    \end{array}
\end{equation}
We only need to upper bound  the first term above as the second term  is exactly the same as (\ref{temp 77}).
We have
  \begin{equation}
    \begin{array}{ll}
    & \mathbb{P} \left\{\exists (s,a,t): \left|\left(P_{s,a,t} - \widehat{P}^{k-1}_{s,a,t}\right)^{\intercal}V^{\pi_*}_{t+1} \right| > \sigma_{s,a,t}^{k}  \right\}\\
  \le &  \sum\limits_{s,a } \sum\limits_{t=1}^{H} \mathbb{P} \left\{
 \left| \left(\widehat{P}_{s,a,t}^{k-1} - P_{s,a,t} \right)^{\intercal} V_{t+1}^{\pi_*} \right| > \sigma_{s,a,t}^{k}  \right\} \\
  \le &  \sum\limits_{s,a } \sum\limits_{t=1}^{H}\sum\limits_{n_{s,a,t}=1}^{k-1} \underbrace{\mathbb{P} \left\{
 \left| \left(\widehat{P}_{s,a,t}^{(n_{s,a,t})} - P_{s,a,t} \right)^{\intercal} V_{t+1}^{\pi_*} \right| > \sigma_{s,a,t}^{k} \right\} }_{\le \delta, \ \text{Lemma~\ref{temp 89}}} \\
 % = &  \sum\limits_{s,a } \sum\limits_{t=1}^{H}\sum\limits_{n_{s,a,t}=1}^{k-1}  \mathbb{E} \left[\bm{1} \left\{ \left| \left(\widehat{P}_{s,a,t}^{(n_{s,a,t})} - P_{s,a,t} \right)^{\intercal} V_{t+1}^{\pi_k} \right| > O \left(\sqrt{\frac{H^2\log (1/\delta)}{n_{s,a,t}}} \right)\right\} \right]\\
%   = & \sum\limits_{s,a } \sum\limits_{t=1}^{H}\sum\limits_{n_{s,a,t}=1}^{k-1}  \mathbb{E} \left[ \underbrace{\mathbb{E} \left[ \bm{1} \left\{ \left| \left(\widehat{P}_{s,a,t}^{(n_{s,a,t})} - P_{s,a,t} \right)^{\intercal} V_{t+1}^{\pi_k} \right| > O \left(\sqrt{\frac{H^2\log (1/\delta)}{n_{s,a,t}}} \right)\right\} \mid \pi_k \right]}_{}\right]\\
%  = &  \sum\limits_{s,a } \sum\limits_{t=1}^{H}\sum\limits_{n_{s,a,t}=1}^{k-1}  \mathbb{E} \left[\underbrace{ \mathbb{P}  \left\{\left| \left(\widehat{P}_{s,a,t}^{(n_{s,a,t})} - P_{s,a,t} \right)^{\intercal} V_{t+1}^{\pi_k} \right| > \sigma_{s,a,t}^{k} \mid \pi_k  \right\}}_{\le \delta, \text{Lemma~\ref{temp 89}}} \right]\\
  \le & O(SAHT\delta) \quad,
        \end{array}
    \end{equation}
    which concludes the proof of the second claim.
    
For the case where $(s,a,t)$ has not been visited by the end of episode $k-1$, i.e., $\widehat{O}^{k-1}_{s,a,t} = 0$, based on the learning algorithm, we set $\widehat{P}_{s,a,t}^{(k-1)} = \Vec{0}$ and $\widehat{\mu}_{s,a,t}^{k-1} = 0$. Since $\widehat{O}^{k-1}_{s,a,t} = 0$, 
 the learning algorithm can set $\sigma_{s,a,t}^{k} = \widetilde{O} \left(\sqrt{\frac{H^2}{\widehat{O}_{s,a,t}^{k-1}}} \right)$ to an extremely large value. Then, we know events $
 \left| \left(\widehat{P}_{s,a,t}^{(k-1)} - P_{s,a,t} \right)^{\intercal} V_{t+1}^{\pi_k} \right| > \sigma_{s,a,t}^{k}$, $
 \left| \left(\widehat{P}_{s,a,t}^{(k-1)} - P_{s,a,t} \right)^{\intercal} V_{t+1}^{\pi_*} \right| > \sigma_{s,a,t}^{k}$, and $
 \left|\widehat{\mu}_{s,a,t}^{(k-1)} - \mu_{s,a,t}  \right| > \sigma_{s,a,t}^{k}$ cannot happen.   \end{proof}
    
 %    Then, we have
 %    $\mathbb{P} \left\{
 % \left| \left(\widehat{P}_{s,a,t}^{(k-1)} - P_{s,a,t} \right)^{\intercal} V_{t+1}^{\pi_k} \right| > \sigma_{s,a,t}^{k} \right\} = 0$, $\mathbb{P} \left\{
 % \left| \left(\widehat{P}_{s,a,t}^{(k-1)} - P_{s,a,t} \right)^{\intercal} V_{t+1}^{\pi_*} \right| > \sigma_{s,a,t}^{k} \right\} = 0$, and $\mathbb{P} \left\{
 % \left|\widehat{\mu}_{s,a,t}^{(k-1)} - \mu_{s,a,t}  \right| > \sigma_{s,a,t}^{k} \right\} = 0$.
 

    


    
    % \begin{equation}
    % \begin{array}{ll}
    % & \widehat{V}_{1}^{\pi_k} - {V}_{1}^{\pi_k} \\
    % = & \mathbb{E}_{s_2, \dotsc, s_H} \left[\sum\limits_{t=1}^{H} \left(\widehat{\mu}_{s_t, \pi_k(s_t,t),t}^{k-1} - \mu_{s_t, \pi_k(s_t,t),t} \right) + \left(\widehat{P}_{s_t, \pi_k(s_t,t),t}^{k-1} - P_{s_t, \pi_k(s_t,t),t} \right)^{\intercal} V_{t+1}^{\pi_k} \mid \mathcal{F}_{k-1}, \pi_k\right] \\
      
    %      \end{array}
    % \end{equation}




\begin{lemma}
\label{Gaussians}
(Concentration and anti-concentration bounds of Gaussian distributions). For a Gaussian distributed random variable $Z$ with mean $\mu$ and variance $\sigma^2$, for $z > 0$, we have
\begin{equation}
    \mathbb{P} \left\{Z > \mu + z \sigma \right\} \le \frac{1}{2}e^{- \frac{z^2}{2}} \quad, \quad \mathbb{P} \left\{Z < \mu - z \sigma \right\} \le \frac{1}{2}e^{- \frac{z^2}{2}}\quad,
\end{equation}
and 
\begin{equation}
     \mathbb{P} \left\{Z > \mu + z \sigma \right\} \ge \frac{1}{\sqrt{2\pi}} \frac{z}{z^2+1} e^{- \frac{z^2}{2}} \quad.
\end{equation}
\end{lemma}



%Recall $\widetilde{M}_k = \left\{\mathcal{S}, \mathcal{A}, H, \left\{\widehat{P}^{k-1} \right\}, \left\{\widetilde{\mu'}^{k} \right\}, p_0  \right\}$ and $\overline{M}_k = \left\{\mathcal{S}, \mathcal{A}, H, \left\{\widehat{P}^{k-1} \right\}, \left\{\overline{\mu}^{k} \right\}, p_0  \right\}$.


%From Lemma~\ref{value difference lemma}, we have $\overline{V}_{1}^{\pi_*}(s) - \widetilde{V}_{1}^{\pi_*}(s) \le 0$. Note that the usage of clipping guarantees that $\overline{\mu}_{s,a,t}^{k} \le \widetilde{\mu'}^{k}_{s,a,t}$ for all $(s,a,t)$. Since $\pi_k$ is the optimal policy for $\widetilde{M}_k$, we know $\widetilde{V}_{1}^{\pi_*}(s) \le \widetilde{V}_{1}^{\pi_k}$.

%= \overline{\mu}^{k}_{s,\pi_*(s,t),t} + \left\langle \widehat{P}_{s,\pi_*(s,t),t}^{k-1} ,V_{t+1}^{\pi_*} \right\rangle = \overline{V}_{t}^{\pi_*}(s)$.








% \begin{lemma}
% For a fixed policy $\pi$, 
% let $\widetilde{V'}_1^{\pi}$ be the value functions of policy $\pi$ for MDP $\tilde{M}' := \left\{\mathcal{S}, \mathcal{A}, H, P, \left\{ \widetilde{\mu}'\right\} , p_0\right\}$, where each $ \widetilde{\mu'}_{s,a,t} \sim XXXX$ and $\widetilde{V}_1^{\pi}$ be the value functions of policy $\pi$ for MDP $\tilde{M} := \left\{\mathcal{S}, \mathcal{A}, H, P, \left\{ \widetilde{\mu} \right\} , p_0\right\}$, where each $\widetilde{\mu}_{s,a,t} \sim XXX$. Then, if each $ \widetilde{\mu'}_{s,a,t}$ first-order stochastic dominates $ \widetilde{\mu}_{s,a,t}$, for any constant $b \in \mathbb{R}$, we have
% \begin{equation}
%     \mathbb{P}_{\widetilde{\mu}'} \left\{\widetilde{V'}_1^{\pi}(s)  \le b  \right\} \le  \mathbb{P}_{\widetilde{\mu}} \left\{\widetilde{V}_1^{\pi}(s) \le b  \right\} \quad.
%     \label{temp 1}
% \end{equation}
% \label{To-be-continued}
% \end{lemma}
% \begin{proof}[Proof of Lemma~\ref{To-be-continued}]
% We first introduce a new MDP $\widehat{M} := \left\{ \mathcal{S}, \mathcal{A}, H, P, \left\{ \widehat{\mu}\right\} , p_0 \right\}$. We have
% \begin{equation}
%     \begin{array}{ll}
%          & \mathbb{P}_{\widetilde{\mu}'} \left\{\widetilde{V'}_1^{\pi}(s)   \le b  \right\} \le  \mathbb{P}_{\widetilde{\mu}} \left\{\widetilde{V}_1^{\pi}(s) \le b  \right\} \\
%         \Leftrightarrow & \mathbb{P}_{\widetilde{\mu}'} \left\{\widetilde{V'}_1^{\pi}(s) - \widehat{V}_1^{\pi}(s)  \le b  -\widehat{V}_1^{\pi}(s)\right\} \le  \mathbb{P}_{\widetilde{\mu}} \left\{\widetilde{V}_1^{\pi}(s) - \widehat{V}_1^{\pi}(s) \le b - \widehat{V}_1^{\pi}(s) \right\} \\
%         \Leftrightarrow & \mathbb{P}_{\widetilde{\mu}'} \left\{\mathbb{E}_{\pi, P} \left[\sum\limits_{t=1}^{H} \left( \widetilde{\mu'}_{s_t, \pi(s_t, t),t} - \widehat{\mu}_{s_t, \pi(s_t, t),t} \right) \mid s_1 = s \right] \le b  -\widehat{V}_1^{\pi}(s)\right\} \\
%         & \le  \mathbb{P}_{\widetilde{\mu}} \left\{\mathbb{E}_{\pi, P} \left[\sum\limits_{t=1}^{H} \left( \widetilde{\mu}_{s_t, \pi(s_t, t),t} - \widehat{\mu}_{s_t, \pi(s_t, t),t} \right) \mid s_1 = s \right] \le b - \widehat{V}_1^{\pi}(s) \right\} \\
%         \Leftrightarrow & \mathbb{P}_{\widetilde{\mu}'} \left\{\sum\limits_{t=1}^{H} \sum\limits_{(s,a)}^{} \omega_{s,a,t} \left( \widetilde{\mu'}_{s, a,t} - \widehat{\mu}_{s, a,t} \right) \le b  -\widehat{V}_1^{\pi}(s)\right\} 
%          \le  \mathbb{P}_{\widetilde{\mu}} \left\{\sum\limits_{t=1}^{H} \sum\limits_{(s,a)}^{} \omega_{s,a,t} \left( \widetilde{\mu}_{s, a,t} - \widehat{\mu}_{s, a,t} \right) \le b - \widehat{V}_1^{\pi}(s) \right\} \\
%          \Leftrightarrow & \mathbb{P}_{\widetilde{\mu}'} \left\{\sum\limits_{t=1}^{H} \sum\limits_{(s,a)}^{} \omega_{s,a,t} \left( \widetilde{\mu'}_{s, a,t}  \right) \le \sum\limits_{t=1}^{H} \sum\limits_{(s,a)}^{} \omega_{s,a,t} \widehat{\mu}_{s, a,t} + b  -\widehat{V}_1^{\pi}(s)\right\} \\
         
%         &  \le  \mathbb{P}_{\widetilde{\mu}} \left\{\sum\limits_{t=1}^{H} \sum\limits_{(s,a)}^{} \omega_{s,a,t} \left( \widetilde{\mu}_{s, a,t}  \right) \le \sum\limits_{t=1}^{H} \sum\limits_{(s,a)}^{} \omega_{s,a,t} \widehat{\mu}_{s, a,t} + b  -\widehat{V}_1^{\pi}(s) \right\} \\
%     \end{array}
% \end{equation}



% For each $s,a,t$, let $b_{s,a,t} \in \mathbb{R}$ be a constant. From Lemma~\ref{stochastic dominance 1}, we know
% \begin{equation}
% \begin{array}{ll}
%    & \mathbb{P}_{\widetilde{\mu}'} \left\{\widetilde{\mu}'_{s,a,t} \le b_{s,a,t} \right\} \le \mathbb{P}_{\widetilde{\mu}} \left\{\widetilde{\mu}_{s,a,t} \le b_{s,a,t} \right\} \\
%    \Leftrightarrow & \mathbb{P}_{\widetilde{\mu}'} \left\{ \omega_{s,a,t}\widetilde{\mu}'_{s,a,t} \le \omega_{s,a,t} \cdot b_{s,a,t} \right\} \le \mathbb{P}_{\widetilde{\mu}} \left\{\omega_{s,a,t}\widetilde{\mu}_{s,a,t} \le \omega_{s,a,t} \cdot b_{s,a,t} \right\} \\
%     \Leftrightarrow & \mathop{\prod}\limits_{s,a,t}\mathbb{P}_{\widetilde{\mu}'} \left\{ \omega_{s,a,t}\widetilde{\mu}'_{s,a,t} \le \omega_{s,a,t} \cdot b_{s,a,t} \right\} \le \mathop{\prod}\limits_{s,a,t} \mathbb{P}_{\widetilde{\mu}} \left\{\omega_{s,a,t}\widetilde{\mu}_{s,a,t} \le \omega_{s,a,t} \cdot b_{s,a,t} \right\} \\
%         \Leftrightarrow & \mathbb{P}_{\widetilde{\mu}'} \left\{ \omega_{s,a,t}\widetilde{\mu}'_{s,a,t} \le \omega_{s,a,t} \cdot b_{s,a,t}, \forall s,a,t \right\} \le \mathbb{P}_{\widetilde{\mu}} \left\{\omega_{s,a,t}\widetilde{\mu}_{s,a,t} \le \omega_{s,a,t} \cdot b_{s,a,t} , \forall s,a,t\right\} \\
%     \end{array}
% \end{equation}

% Since all $s,a,t$ are independent, we have
% \begin{equation}
% \begin{array}{ll}
%   &  \mathbb{P}_{\widetilde{\mu}'} \left\{\mu'_{s,a,t} \le b_{s,a,t} , \forall s,a,t \right\} \le \mathbb{P}_{\widetilde{\mu}} \left\{\mu_{s,a,t} \le b_{s,a,t} , \forall s,a,t\right\} \\
%   \Leftrightarrow & \mathbb{P}_{\widetilde{\mu}'} \left\{\mu'_{s,a,t} - \widehat{\mu}_{s,a,t} \le b_{s,a,t} - \widehat{\mu}_{s,a,t} , \forall s,a,t \right\} \le \mathbb{P}_{\widetilde{\mu}} \left\{\mu_{s,a,t}-\widehat{\mu}_{s,a,t} \le b_{s,a,t}- \widehat{\mu}_{s,a,t} , \forall s,a,t\right\} \\
%   \Rightarrow & \mathbb{P}_{\widetilde{\mu}'} \left\{\mu'_{s,a,t} - \widehat{\mu}_{s,a,t} + P_{s,a,t} - P_{s,a,t} \le b_{s,a,t} - \widehat{\mu}_{s,a,t} + P_{s,a,t} - P_{s,a,t}, \forall s,a,t \right\} 
%     \end{array}
% \end{equation}

% \end{proof}

%Define a confidence set around $M$, let us say ${M}_k$ collects all the MDPs that are very similar to the true MDP $M$ at the beginning of episode $k$.

%Define $\mathcal{F}_{k-1}$ as the collected history information by the end of episode $k-1$.

%Now we show optimism.

% \begin{lemma}
% If $\widehat{M}_k \in \mathcal{M}_k$, for any $s \in \mathcal{S}$, we have
% \begin{equation}
%     \mathbb{P} \left\{\widetilde{V}_{1}^{\pi_k}(s) \ge V_{1}^{\pi_*}(s) \mid \mathcal{F}_{k-1} \right\} \ge c_0 \quad.
% \end{equation}

% \end{lemma}

% We want to show 
% \begin{equation}
%     \mathbb{P} \left\{\widetilde{V}_{1}^{\pi_*}(s) \ge V_{1}^{\pi_*}(s) \mid \mathcal{F}_{k-1} \right\} \ge c_0 \quad.
% \end{equation}


% %\section{UCB Algorithm}
% \begin{algorithm}[!ht]
% 	\caption{UCB Algorithm for tabular MDP} 
% 	\label{UCB MDP Algorithm}
% 	\begin{algorithmic}[1]
% 	\STATE {\bf{Input:}} MDP instance $M$, number of episodes $T$
	
% \STATE {\bf{Initialization}:} 

% For each $(s,a,t) \in \mathcal{S} \times \mathcal{A} \times [H]:$ Set $\widehat{O}_{s,a,t} \leftarrow 0$, $\widehat{P}_{s,a,t} \leftarrow \Vec{0}$, $\widehat{\mu}_{s,a,t} \leftarrow 0$  

% \FOR {episode $k = 1, 2, \dotsc,T$} 



% \STATE Set $\overline{V}_{H+1} = \Vec{0}$, $\pi_k := 0_{S \times H}$ ; %{\color{blue} \text{$\% \ $$\widetilde{V}$ is a $S$-dimensional vector}}

% \FOR {$t = H,H-1, \dotsc, 1$}

% \FOR {$s \in \mathcal{S}$}

% \FOR {$a \in \mathcal{A}$}


% \STATE 



% Set $\overline{\mu}_{s,a,t} \leftarrow  \widehat{\mu}_{s,a,t} + O \left( \sqrt{\frac{H^2 \log(1/\delta)}{\widehat{O}_{s,a,t}}} \right)$

% Set $\overline{Q}_{s,a,t} \leftarrow \overline{\mu}_{s,a,t}+ \widehat{P}_{s,a,t}^{\intercal} \overline{V}_{t+1}  $ 
% \ENDFOR 
% %\STATE Set $\pi_k (s, t) \leftarrow  \mathop{\arg\max}\limits_{a \in \mathcal{A}}   \left(\widetilde{\mu}_{s,a,t}^{k} + \left\langle \widetilde{V}_{t+1}^{\pi_k}, \widetilde{P}_{s,a,t}^{k} \right\rangle  \right)$ 
% \STATE  Set $\overline{V}^{}_{t}(s) \leftarrow \ \max\limits_{a \in \mathcal{A}} \left\{ \overline{Q}_{s,a,t} \right\}$ 
% \STATE Set $\pi_k(s, t) \leftarrow \mathop{\arg\max}\limits_{a \in \mathcal{A}} \overline{Q}_{s,a,t}$
% \ENDFOR 
% \ENDFOR  \label{planning end}
% \STATE Sample $s_1^k \sim p_0$ 

% \FOR {$t = 1,2, \dotsc , H$}
% \STATE Run policy $\pi_k$ and update the statistics of $\widehat{\mu}_{s_t^k, \pi_k(s_t^k,t), t}$, $\widehat{O}_{s_t^k, \pi_k(s_t^k,t), t}$ and $\widehat{P}_{s_t^k, \pi_k(s_t^k,t), t}$. Note that $s_{t+1}^k \sim P_{s_t^k, \pi_k(s_t^k,t),t}$
% %\STATE Play $a_t^{k} \leftarrow \mathop{\arg\max}\limits_{a \in \mathcal{A}} \left\{ \widetilde{Q}_{s_t^k,a,t}^{k} \right\}$, where $\widetilde{Q}_{s_t^k,a,t}^{k} \sim \mathcal{N} \left(\frac{\overline{Q}_{s_t^k,a,t}}{H-t+1}, \frac{\square}{\widehat{O}_{s_t^k,a,t}} \right), \forall a \in \mathcal{A}$
% %\STATE Transition to $s_{t+1}^{k} \sim P_{s_t^k, a_t^k, t}$

% \ENDFOR
% %\label{phase 3 start}
%  %$s_1^{k},a_1^{k},X_{s_1^k,a_1^{k},1}^k,s_2^{k},a_2^{k},X_{s_2^k,a_2^{k},2}^k, \dotsc,s_H^k, a_H^{k},X_{s_H^k,a_H^{k},H}^k$
% \ENDFOR	
% 	\end{algorithmic}
% \end{algorithm}
% First you can to try to prove Algorithm~\ref{UCB MDP Algorithm} and the conjectured regret bound will be $\widetilde{O} \left(\sqrt{H^4 SAT} \right)$. The analysis will be very similar to UBEV. Let us fix the initial state as $s$. 

% With high probability, we have
% \begin{equation}
%   {V}_1^{\pi_*}(s) \le  \overline{V}_1^{\pi_*}(s) \le  \overline{V}_1^{\pi_k}(s)
% \end{equation}

% Then we will have with high probability the regret occurs in episode $k$ is at most
% \begin{equation}
% \begin{array}{ll}
%     & 
%    {V}_1^{\pi_*}(s)  - {V}_1^{\pi_k}(s)  \\
%    \le  & \overline{V}_1^{\pi_k}(s) -  {V}_1^{\pi_k}(s) \\
%    = & \overline{V}_1^{\pi_k}(s) -  {V}_1^{\pi_k}(s) + \widehat{V}_1^{\pi_k}(s) - \widehat{V}_1^{\pi_k}(s) \\
%       = & \underbrace{\overline{V}_1^{\pi_k}(s) - \widehat{V}_1^{\pi_k}(s)}_{\text{Lemma~3 in Daniel Russo's paper}} +  \underbrace{\widehat{V}_1^{\pi_k}(s) - {V}_1^{\pi_k}(s) }_{\text{Use Lemma~2 in Daniel Russo's paper}}   \\
%    \end{array}
% \end{equation}

% To upper bound $\sum\limits_{k=1}^{T}\mathbb{E}\left[ \overline{V}_1^{\pi_k}(s) - \widehat{V}_1^{\pi_k}(s)\right]$ and $\sum\limits_{k=1}^{T}\mathbb{E}\left[  \widehat{V}_1^{\pi_k}(s) - V_1^{\pi_k}(s)\right]$, you need to use pigeon-hole argument. I did this thing last year in the old overleaf project. You can refer to the analysis of UBEV or  Appendix~B in the following paper

% "(More) Efficient Reinforcement Learning via
% Posterior Sampling"



%\paragraph{Idea 1} You can show $\left(\widetilde{V}_1^{\pi_k}(s) - \widehat{V}_1^{\pi_k}(s)   \right)$ is a Gaussian random variable with the parameters determined by the history information $\mathcal{F}_{k-1}$ using the Lemma~3 in Daniel Russo's paper. 

% The black terms use value difference lemma and posterior concentration bound. We have
% \begin{equation}
%     \begin{array}{ll}
% \sum\limits_{k=1}^{T} \mathbb{E}_{s \sim p_0} \left[\mathbb{E}_{\mathcal{F}_{k-1}} \left[\mathbb{E}_{\sim} \left[\widetilde{V}_1^{\pi_k} (s) - \widehat{V}_1^{\pi_k}(s) \mid \mathcal{F}_{k-1}\right]\right] \right] \\
    
%     \end{array}
% \end{equation}

%The blue terms use value difference lemma and empirical concentration bound.








%Conditioned on history information $\mathcal{F}_{k-1}$, the performance gap between the best policy $\pi_*$ in the true MDP $M$ and the expected performance of the best policy $\pi_k$ in the sampled MDP $\tilde{M}'_k$ can be expressed as
% \begin{equation}
%     V_1^{\pi_*}(s) - \mathbb{E} \left[\widetilde{V'}_1^{\pi_k}(s) \mid \mathcal{F}_{k-1} \right]\quad.
% \end{equation}
%where the expectation is taken over the randomness of the learning algorithm. Note that $\pi_k$ is drawn from a distribution determined by the sampled MDP.

%Let $\sigma_{\pi_k,  k} := $.







%\newpage

%Let $\Phi_r^{(1)} := \left\{\pi \in \Phi_r: \pi_*(s,t) \ne \pi(s,t), \forall (s,t) \right\}$.

%Let $\Phi_r^{(2)} := \left\{\pi \in \Phi_r: \exists (s,t)  \ s.t. \ \pi_*(s,t)= \pi(s,t) \right\}$.

% Regret can be expressed as
% \begin{equation}
%     \begin{array}{lll}
%          \mathcal{R} & = & \sum\limits_{k=1}^{K}\mathbb{E}_{s \sim p_0} \mathbb{E}  \left[V_1^{\pi_*}(s) - V_1^{\pi_k}(s) \right] \\
%          & \le & \mathbb{E}_{s \sim p_0} \sum\limits_{k=1}^{K} \sum\limits_{r = 1}^{\log (\cdot)} \mathbb{E} \left[ \bm{1} \left\{\pi_k \in \Phi_r \right\} \right] \cdot 2^{r-1} \\
%          & \le & \mathbb{E}_{s \sim p_0} \sum\limits_{k=1}^{K} \sum\limits_{r = 1}^{\log (\cdot)} 2^{r-1} \underbrace{\mathbb{E} \left[ \bm{1} \left\{\pi_k \in \Phi_r , \mathcal{E}_{ \widetilde{\mu}}(k) \right\} \right] }_{\Gamma_1}  
%           +  \mathbb{E}_{s \sim p_0} \sum\limits_{k=1}^{K} \sum\limits_{r = 1}^{\log (\cdot)} \mathbb{E} \left[ \bm{1} \left\{\pi_k \in \Phi_r , \overline{\mathcal{E}_{ \widetilde{\mu}}(k)} \right\} \right] \cdot 2^{r-1} 
%           %+  \mathbb{E}_{s \sim p_0} \sum\limits_{k=1}^{K} \sum\limits_{r = 1}^{\log (\cdot)} \mathbb{E} \left[ \bm{1} \left\{\pi_k \in \Phi_r , \overline{\mathcal{E}_{\pi_*, \widehat{\mu}}(k)} \right\} \right] \cdot 2^{r-1} \\
         
% %          & \le & \mathbb{E}_{s \sim p_0} \sum\limits_{r = 1}^{\log (\cdot)} \sum\limits_{k=1}^{K}  \mathbb{E} \left[ \bm{1} \left\{\pi_k \in \Phi_r^{(1)}  \right\} \right] \cdot 2^{r-1} + \mathbb{E}_{s \sim p_0} \sum\limits_{r = 1}^{\log (\cdot)} \underbrace{\sum\limits_{k=1}^{K} \mathbb{E} \left[ \bm{1} \left\{\pi_k \in \Phi_r^{(2)} \right\} \right] \cdot 2^{r-1}}_{I_2} \\
%     \end{array}
% \end{equation}

% Now, we fix the initial state as $s$ and decompose $\Gamma_1$. We have
% \begin{equation}
%     \begin{array}{lll}
%        \Gamma_1 & = & \mathbb{E} \left[ \bm{1} \left\{\pi_k \in \Phi_r , \mathcal{E}_{ \widetilde{\mu}}(k) \right\} \right]\\
%        & = & \mathbb{E} \left[\mathbb{E} \left[ \bm{1} \left\{\pi_k \in \Phi_r , \mathcal{E}_{ \widetilde{\mu}}(k) \right\} \mid \mathcal{F}_{k-1}\right]\right]\\
%        & = & \underbrace{\mathbb{E} \left[\mathbb{E} \left[ \bm{1} \left\{\pi_k \in \Phi_r , \mathcal{E}_{ \widetilde{\mu}}(k) , \widetilde{V}_{1,k}^{\pi_k}(s) \le y_r \right\} \mid \mathcal{F}_{k-1}\right]\right] }_{\Lambda_1}
%        + \mathbb{E} \left[\mathbb{E} \left[ \bm{1} \left\{\pi_k \in \Phi_r , \mathcal{E}_{ \widetilde{\mu}}(k), \widetilde{V}_{1,k}^{\pi_k}(s) > y_r \right\} \mid \mathcal{F}_{k-1}\right]\right]\\
%        %& = & \mathbb{E} \left[ \mathbb{P} \left\{\pi_k \in \Phi_r , \mathcal{E}_{\pi_*, \widetilde{\mu}}(k),  \mathcal{E}_{\pi_*, \widehat{\mu}}(k) , \widetilde{V}_{1,k}^{\pi_k}(s) \le y_r  \mid \mathcal{F}_{k-1}\right\} \right] \\
%       % &+ &\mathbb{E} \left[\mathbb{E} \left[ \bm{1} \left\{\pi_k \in \Phi_r , \mathcal{E}_{\pi_*, \widetilde{\mu}}(k),  \mathcal{E}_{\pi_*, \widehat{\mu}}(k) , \widetilde{V}_{1,k}^{\pi_k}(s) > y_r \right\} \mid \mathcal{F}_{k-1}\right]\right]\\
%       % & \le  & \sum\limits_{k=1}^{K}  \mathbb{E} \left[ \bm{1} \left\{\pi_k \in \Phi_r^{(2)} ,\exists (s,t)  \ s.t. \ \pi_*(s,t)= \pi_k(s,t) \right\} \right] \cdot 2^{r-1} \\
% %&        \le & \sum\limits_{s \in \mathcal{S} }\sum\limits_{t= 1}^{H}  \sum\limits_{k=1}^{K}  \mathbb{E} \left[ \bm{1} \left\{\pi_k \in \Phi_r^{(2)}, \pi_k(s,t) = \pi_*(s,t), \pi_k(s',t') \ne \pi_*(s',t'), \forall s' \in \mathcal{S} \backslash {s}, \forall t \in [H] \backslash t' \right\} \right] \cdot 2^{r-1} \\
%      %   & \le & \sum\limits_{a \in \mathcal{A}}\sum\limits_{k=1}^{K}  \mathbb{E} \left[ \bm{1} \left\{\pi_k \in \Phi_r^{(2)}, (s_t^k, a_t^k, t) = (s,a,t), \pi_k(s,t) = \pi_*(s,t) \right\} \right] \cdot 2^{r-1} \\
       
%     \end{array}
% \end{equation}

% \begin{equation}
%     \begin{array}{lll}
%     \Lambda_1 &= &\mathbb{E} \left[\mathbb{E} \left[ \bm{1} \left\{\pi_k \in \Phi_r , \mathcal{E}_{ \widetilde{\mu}}(k) , \widetilde{V}_{1,k}^{\pi_k}(s) \le y_r \right\} \mid \mathcal{F}_{k-1}\right]\right] \\
%         & = &  \mathbb{E} \left[\mathbb{E} \left[ \bm{1} \left\{\pi_k \in \Phi_r , \mathcal{E}_{ \widetilde{\mu}}(k),  \widetilde{V}_{1,k}^{\pi_k}(s) \le y_r \right\} \cdot \bm{1} \left\{ \overline{V}_{1,k}^{\pi_*}(s) > y_r \right\}  \mid \mathcal{F}_{k-1}\right]\right] \\
%         & + & \mathbb{E} \left[\mathbb{E} \left[ \bm{1} \left\{\pi_k \in \Phi_r , \mathcal{E}_{ \widetilde{\mu}}(k),  \widetilde{V}_{1,k}^{\pi_k}(s) \le y_r \right\} \cdot \bm{1} \left\{ \overline{V}_{1,k}^{\pi_*}(s) \le y_r \right\}  \mid \mathcal{F}_{k-1}\right]\right] \\
%         & \le &  \mathbb{E} \left[\mathbb{E} \left[ \bm{1} \left\{\pi_k \in \Phi_r , \mathcal{E}_{ \widetilde{\mu}}(k),  \widetilde{V}_{1,k}^{\pi_k}(s) \le y_r \right\} \cdot \bm{1} \left\{ \overline{V}_{1,k}^{\pi_*}(s) > y_r \right\}  \mid \mathcal{F}_{k-1}\right]\right] \\
%         & + & \mathbb{E} \left[\mathbb{E} \left[ \bm{1} \left\{\pi_k \in \Phi_r , \mathcal{E}_{ \widetilde{\mu}}(k),  \widetilde{V}_{1,k}^{\pi_k}(s) \le y_r \right\} \cdot \bm{1} \left\{ \overline{V}_{1,k}^{\pi_*}(s) \le V_1^{\pi_*}(s) \right\}  \mid \mathcal{F}_{k-1}\right]\right] \\
%           & \le &  \mathbb{E} \left[\mathbb{E} \left[ \bm{1} \left\{\pi_k \in \Phi_r , \mathcal{E}_{ \widetilde{\mu}}(k),  \widetilde{V}_{1,k}^{\pi_k}(s) \le y_r \right\} \cdot \bm{1} \left\{ \overline{V}_{1,k}^{\pi_*}(s) > y_r \right\}  \mid \mathcal{F}_{k-1}\right]\right] 
%          +  \mathbb{E} \left[\mathbb{E} \left[  \bm{1} \left\{ \overline{V}_{1,k}^{\pi_*}(s) \le V_1^{\pi_*}(s) \right\}  \mid \mathcal{F}_{k-1}\right]\right] \\
%             & = &  \mathbb{E} \left[\bm{1} \left\{ \overline{V}_{1,k}^{\pi_*}(s) > y_r \right\}  \cdot \mathbb{E} \left[ \bm{1} \left\{\pi_k \in \Phi_r , \mathcal{E}_{ \widetilde{\mu}}(k),  \widetilde{V}_{1,k}^{\pi_k}(s) \le y_r \right\}  \mid \mathcal{F}_{k-1}\right]\right] 
%          +  \mathbb{E} \left[\mathbb{E} \left[  \bm{1} \left\{ \overline{V}_{1,k}^{\pi_*}(s) \le V_1^{\pi_*}(s) \right\}  \mid \mathcal{F}_{k-1}\right]\right] \\
           
%            & \le &  \mathbb{E} \left[\mathbb{P} \left\{\pi_k = \pi_*, \mathcal{E}_{ \widetilde{\mu}}(k),   \widetilde{V}_{1,k}^{\pi_k}(s) \le y_r  \mid \mathcal{F}_{k-1}\right\} \bm{1} \left\{ \underline{V}_{1,k}^{\pi_*}(s) \le y_r \right\}  \right] 
%          +  \mathbb{E} \left[\mathbb{E} \left[  \bm{1} \left\{ \overline{V}_{1,k}^{\pi_*}(s) \le V_1^{\pi_*}(s) \right\}  \mid \mathcal{F}_{k-1}\right]\right] \\
%         % = & \mathbb{E} \left[ \mathbb{P} \left\{\pi_k \in \Phi_r , \mathcal{E}_{\pi_*, \widetilde{\mu}}(k),  \widetilde{V}_{1,k}^{\pi_k}(s) \le y_r  \mid \mathcal{F}_{k-1}\right\} \right] \\
%          %  = & \mathbb{E} \left[ \mathbb{P} \left\{\pi_k \in \Phi_r , \mathcal{E}_{\pi_*, \widetilde{\mu}}(k),  \widetilde{V}_{1,k}^{\pi_k}(s) \le y_r  \mid \mathcal{F}_{k-1}\right\} \cdot \frac{ \bm{1} \left\{ \overline{V}_{1,k}^{\pi_*}(s) - y_r \right\} }{\bm{1} \left\{ \overline{V}_{1,k}^{\pi_*}(s) - y_r\right\}}  \cdot \frac{1}{\bm{1} \left\{ \overline{V}_{1,k}^{\pi_*}(s) - y_r \right\}}\right] \\
%     \end{array}
% \end{equation}

% \begin{lemma}
% For any instantiation $F_{k-1}$ of $\mathcal{F}_{k-1}$, we have
% \begin{equation}
% \begin{array}{ll}
%     & \mathbb{P} \left\{\pi_k \in \Phi_r , \mathcal{E}_{ \widetilde{\mu}}(k),   \widetilde{V}_{1,k}^{\pi_k}(s) \le y_r  \mid \mathcal{F}_{k-1}\right\} \bm{1} \left\{ \underline{V}_{1,k}^{\pi_*}(s) > y_r \right\} \\
%     \le & \mathbb{P} \left\{\pi_k = \pi_*, \mathcal{E}_{ \widetilde{\mu}}(k),   \widetilde{V}_{1,k}^{\pi_k}(s) \le y_r  \mid \mathcal{F}_{k-1}\right\} \bm{1} \left\{ \underline{V}_{1,k}^{\pi_*}(s) \le y_r \right\} \quad.
%     \end{array}
%     \label{hard 1}
% \end{equation}
% \label{succeed 1}

% \begin{equation}
% \begin{array}{ll}
%     & \mathbb{P} \left\{\pi_k \in \Phi_r , \mathcal{E}_{\pi_*, \widetilde{\mu}}(k),   \widetilde{V}_{1,k}^{\pi_k}(s) \le y_r  \mid \mathcal{F}_{k-1}\right\} \bm{1} \left\{ \overline{V}_{1,k}^{\pi_*}(s) \le y_r \right\} \\
%     \le & \mathbb{P} \left\{\pi_k = \pi_*, \mathcal{E}_{\pi_*, \widetilde{\mu}}(k),   \widetilde{V}_{1,k}^{\pi_k}(s) \le y_r  \mid \mathcal{F}_{k-1}\right\} \bm{1} \left\{ \underline{V}_{1,k}^{\pi_*}(s) \le y_r \right\} \quad.
%     \end{array}
%     \label{hard 1}
% \end{equation}
% \label{succeed 1}

% \end{lemma}
% \begin{proof}[Proof of Lemma~\ref{succeed 1}]
% For any instantiation $F_{k-1}$ of $\mathcal{F}_{k-1}$ such that 
% $\mathbb{E} \left[\bm{1} \left\{\mathcal{E}_{ \widetilde{\mu}}(k), \mathop{\max}\limits_{\pi \ne \pi_*}\widetilde{V}_{1,k}^{\pi}(s) \le y_r \right\}  \mid \mathcal{F}_{k-1} = F_{k-1} \right] =  0$, it is trivial to prove since both sides in 
% (\ref{hard 1}) are 0.

% For any instantiation $F_{k-1}$ of $\mathcal{F}_{k-1}$ such that 
% $\mathbb{E} \left[\bm{1} \left\{\mathcal{E}_{ \widetilde{\mu}}(k) , \mathop{\max}\limits_{\pi \ne \pi_*}\widetilde{V}_{1,k}^{\pi}(s) \le y_r \right\}  \mid \mathcal{F}_{k-1} = F_{k-1} \right] >  0$, we have
% \begin{equation}
%     \begin{array}{ll}
%          & \mathbb{P} \left\{\pi_k \in \Phi_r , \mathcal{E}_{ \widetilde{\mu}}(k),  \widetilde{V}_{1,k}^{\pi_k}(s) \le y_r  \mid \mathcal{F}_{k-1} = F_{k-1}\right\} \\
%          \le & \mathbb{P} \left\{\pi_k \in \Phi_r , \mathcal{E}_{ \widetilde{\mu}}(k),  \mathop{\max}\limits_{\pi \ne \pi_*}\widetilde{V}_{1,k}^{\pi}(s) \le y_r , \widetilde{V}_{1,k}^{\pi_*}(s) \le y_r  \mid \mathcal{F}_{k-1} = F_{k-1}\right\} \\
%          \le & \mathbb{P} \left\{\mathcal{E}_{ \widetilde{\mu}}(k),  \mathop{\max}\limits_{\pi \ne \pi_*}\widetilde{V}_{1,k}^{\pi}(s) \le y_r , \underline{V}_{1,k}^{\pi_*}(s) \le y_r  \mid \mathcal{F}_{k-1}\right\} \\
%          = & \mathbb{E} \left[ \bm{1} \left\{\mathcal{E}_{ \widetilde{\mu}}(k),  \mathop{\max}\limits_{\pi \ne \pi_*}\widetilde{V}_{1,k}^{\pi}(s) \le y_r , \underline{V}_{1,k}^{\pi_*}(s) \le y_r  \right\} \mid \mathcal{F}_{k-1} = F_{k-1}\right]  \\
% %          = & \mathbb{P} \left\{\mathcal{E}_{\pi_*, \widetilde{\mu}}(k),  \mathop{\max}\limits_{\pi \ne \pi_*}\widetilde{V}_{1,k}^{\pi}(s)  \le y_r , \underline{V}_{1,k}^{\pi_*}(s)-\mathcal{V}_{1,k}^{\pi_*}(s)  \le y_r-\mathcal{V}_{1,k}^{\pi_*}(s)  \mid \mathcal{F}_{k-1}\right\} \\
%          = & \bm{1} \left\{\underline{V}_{1,k}^{\pi_*}(s)\le y_r   \right\}\mathbb{P} \left\{\mathcal{E}_{ \widetilde{\mu}}(k),  \mathop{\max}\limits_{\pi \ne \pi_*}\widetilde{V}_{1,k}^{\pi}(s) \le y_r   \mid \mathcal{F}_{k-1}= F_{k-1}\right\}\quad,
%     \end{array}
%     \label{hard 2}
% \end{equation}
% and
% \begin{equation}
%     \begin{array}{ll}
%           & \mathbb{P} \left\{\pi_k = \pi_*, \mathcal{E}_{ \widetilde{\mu}}(k),   \widetilde{V}_{1,k}^{\pi_k}(s) \le y_r  \mid \mathcal{F}_{k-1}= F_{k-1} \right\} \\
%           \ge & \mathbb{P} \left\{\widetilde{V}_{1,k}^{\pi_*}(s) > y_r \ge \mathop{\max}\limits_{\pi \ne \pi_*}\widetilde{V}_{1,k}^{\pi}(s), \mathcal{E}_{ \widetilde{\mu}}(k),    \mid \mathcal{F}_{k-1}= F_{k-1}\right\} \\
%           \ge & \mathbb{P} \left\{\underline{V}_{1,k}^{\pi_*}(s) > y_r \ge \mathop{\max}\limits_{\pi \ne \pi_*}\widetilde{V}_{1,k}^{\pi}(s), \mathcal{E}_{ \widetilde{\mu}}(k)   \mid \mathcal{F}_{k-1}= F_{k-1}\right\} \\
%           =& \mathbb{E} \left[\bm{1} \left\{\underline{V}_{1,k}^{\pi_*}(s) > y_r \ge \mathop{\max}\limits_{\pi \ne \pi_*}\widetilde{V}_{1,k}^{\pi}(s), \mathcal{E}_{ \widetilde{\mu}}(k)   \right\} \mid \mathcal{F}_{k-1}= F_{k-1} \right] \\
%            = & \bm{1} \left\{\underline{V}_{1,k}^{\pi_*}(s) > y_r  \right\}\mathbb{P} \left\{\mathcal{E}_{ \widetilde{\mu}}(k),  \mathop{\max}\limits_{\pi \ne \pi_*}\widetilde{V}_{1,k}^{\pi}(s) \le y_r   \mid \mathcal{F}_{k-1}=F_{k-1}\right\} \quad,
%     \end{array}
% \end{equation}
% which means 
% \begin{equation}
%    \bm{1} \left\{\underline{V}_{1,k}^{\pi_*}(s) > y_r\right\} \le \frac{\mathbb{P} \left\{\pi_k = \pi_*, \mathcal{E}_{ \widetilde{\mu}}(k),  \widetilde{V}_{1,k}^{\pi_k}(s) \le y_r  \mid \mathcal{F}_{k-1} = F_{k-1}\right\}}{\mathbb{P} \left\{\mathcal{E}_{\pi_*, \widetilde{\mu}}(k),   \mathop{\max}\limits_{\pi \ne \pi_*}\widetilde{V}_{1,k}^{\pi}(s) \le y_r   \mid \mathcal{F}_{k-1}=F_{k-1}\right\}} \quad.
%     \label{hard 3}
% \end{equation}

% From (\ref{hard 2}) and (\ref{hard 3}), we conclude the proof.
% \end{proof}
%{\color{red}It seems we can use the argument of ghost sampling in statistical learning theory.}
% \begin{lemma}
% Let us fix the initial state as $s$. For any instantiation $F_{k-1}$ of $\mathcal{F}_{k-1}$, we have
% \begin{equation} 
% \begin{array}{ll}
%    & \mathbb{P}\left\{\pi_k  \in \Phi_r, \widetilde{V}_1^{\pi_k, k}(s) \le y_r \mid \mathcal{F}_{k-1} = F_{k-1}  \right\} \\
%    \le & \frac{1-\mathbb{P}\left\{ \widetilde{V}_1^{\pi_*,k}(s) > y_r \mid \mathcal{F}_{k-1} = F_{k-1}   \right\}}{\mathbb{P}\left\{\widetilde{V}_1^{\pi_*,k}(s) > y_r \mid \mathcal{F}_{k-1} = F_{k-1}  \right\}} \cdot \mathbb{P}\left\{ \pi_k = \pi_* , \widetilde{V}_1^{\pi_k, k}(s) \le y_r \mid \mathcal{F}_{k-1} = F_{k-1}   \right\} \quad.
%     \end{array}
% \end{equation}
% \label{band 1}
% \end{lemma}
% \begin{proof}[Proof of Lemma~\ref{band 1}]



% We have
% \begin{equation}
%     \begin{array}{ll}
%     &  \mathbb{P}\left\{\pi_k  \in \Phi_r, \widetilde{V}_1^{\pi_k, k}(s) \le y_r \mid \mathcal{F}_{k-1} = F_{k-1}  \right\} \\
%     \le &  \mathbb{P}\left\{\pi_k  \in \Phi_r, \widetilde{V}_1^{\pi, k}(s) \le y_r, \forall \pi \mid \mathcal{F}_{k-1} = F_{k-1}  \right\} \\
%     = &  \mathbb{P}\left\{\pi_k  \in \Phi_r, \widetilde{V}_1^{\pi, k}(s) - \widehat{V}_1^{\pi, k}(s) \le y_r - \widehat{V}_1^{\pi, k}(s), \forall \pi \mid \mathcal{F}_{k-1} = F_{k-1}  \right\} \\
%     = & \mathbb{P}\left\{\pi_k  \in \Phi_r, \underbrace{\widetilde{V}_1^{\pi, k}(s) - \widehat{V}_1^{\pi, k}(s)}_{=: Z_{\pi, k}} \le y_r - \widehat{V}_1^{\pi, k}(s), \forall \pi \mid \mathcal{F}_{k-1} = F_{k-1}  \right\} \\
%     \le & \mathbb{P}\left\{ Z_{\pi, k} \le y_r - \widehat{V}_1^{\pi, k}(s), \forall \pi \mid \mathcal{F}_{k-1} = F_{k-1}  \right\} \\
%     = & \mathbb{P}\left\{ Z_{\pi_*, k} \le y_r - \widehat{V}_1^{\pi_*, k}(s) \mid \mathcal{F}_{k-1} = F_{k-1}  \right\} \cdot \mathbb{P}\left\{ Z_{\pi, k} \le y_r - \widehat{V}_1^{\pi, k}(s), \forall \pi \ne \pi_* \mid \mathcal{F}_{k-1} = F_{k-1}  \right\} \\
   
%     \end{array}
% \end{equation}

% We also have
% \begin{equation}
%     \begin{array}{ll}
%     & \mathbb{P}\left\{ \pi_k = \pi_* , \widetilde{V}_1^{\pi_k, k}(s) \le y_r \mid \mathcal{F}_{k-1} = F_{k-1}   \right\} \\
%     \ge & \mathbb{P}\left\{\widetilde{V}_1^{\pi_*, k}(s) >  y_r \ge \widetilde{V}_1^{\pi, k}(s) , \forall \pi \ne \pi_* \mid \mathcal{F}_{k-1} = F_{k-1}   \right\} \\
%     =  & \mathbb{P}\left\{\widetilde{V}_1^{\pi_*, k}(s) >  y_r, y_r \ge \widetilde{V}_1^{\pi, k}(s) , \forall \pi \ne \pi_* \mid \mathcal{F}_{k-1} = F_{k-1}   \right\} \\
%      =  & \mathbb{P}\left\{\widetilde{V}_1^{\pi_*, k}(s) - \widehat{V}_1^{\pi_*, k}(s)>  y_r -  \widehat{V}_1^{\pi_*, k}(s), y_r  - \widehat{V}_1^{\pi, k}(s) \ge \widetilde{V}_1^{\pi, k}(s) - \widehat{V}_1^{\pi, k}(s), \forall \pi \ne \pi_* \mid \mathcal{F}_{k-1} = F_{k-1}   \right\} \\
%         =  & \mathbb{P}\left\{Z_{\pi_*, k}>  y_r - \widehat{V}_1^{\pi_*, k}(s), y_r  - \widehat{V}_1^{\pi, k}(s) \ge Z_{\pi,k}, \forall \pi \ne \pi_* \mid \mathcal{F}_{k-1} = F_{k-1}   \right\} \\
%          =  & \mathbb{P}\left\{Z_{\pi_*, k}>  y_r - \widehat{V}_1^{\pi_*, k}(s) \mid \mathcal{F}_{k-1} = F_{k-1}   \right\} \cdot   \mathbb{P}\left\{ y_r  - \widehat{V}_1^{\pi, k}(s) \ge Z_{\pi,k}, \forall \pi \ne \pi_* \mid \mathcal{F}_{k-1} = F_{k-1}   \right\} \\
%     \end{array}
% \end{equation}


% %$\mathbb{P}\left\{   \mathbb{E}_{s \sim p_0}\widetilde{V}_1^{\pi_*, k}(s) - \mathbb{E}_{s \sim p_0}\widehat{V}_1^{\pi_*, k}(s)  \le y_r - \mathbb{E}_{s \sim p_0}\widehat{V}_1^{\pi_*, k}(s) \right\} $

% \end{proof}

%\subsection{Key lemmas}
% \begin{lemma}
% Let $\tau_i$ be the episode when the $i$-th play of policy $\pi_*$ 
% occurs. Then, we have
% \begin{equation}
%     \mathbb{E} \left[\frac{1}{\mathbb{P}\left\{\mathbb{E}_{s \sim p_0}\widetilde{V}_1^{\pi_*, \tau_i}(s) > y_r \mid \mathcal{F}_{\tau_i -1}  \right\}} -1 \right] = \left\{ \begin{array}{ll}
%          c_1, &  \forall i, \\
%         O \left(\frac{\delta}{\epsilon_r^2}\right) ,& i \ge O \left(\frac{\log (1/\delta)}{\epsilon_r^2} \right).
%         \end{array} 
%         \right. 
% \end{equation}
% \label{Key lemma}
% \end{lemma}

% \begin{proof}[Proof of Lemma~\ref{Key lemma}]

% Given $\mathcal{F}_{\tau_i -1}$, let $Z_{\pi_*, i} \sim \mathcal{N} \left(0, \ \underbrace{\sum\limits_{t=1}^{H}\sum\limits_{s,a}\left(\omega_{s,a,t}^{\pi_*, \widehat{P}_{\tau_i-1}}\right)^2 \cdot \frac{\square SH^3 \log(1/\delta)}{\widehat{O}_{s,a,t}^{\tau_i-1}} }_{=: \sigma^2_{\pi_*, \tau_i -1 }}\right)$. Let $Y_i$ be a random variable denoting the number of consecutive independent trials until $Z_{\pi_*, i}$ is greater than $V_1^{\pi_*}(s) -\widehat{V}_1^{\pi_*, \tau_i}(s)- \epsilon_r$, where $\epsilon_r := \mathbb{E}_{s \sim p_0}V_1^{\pi_*}(s)-y_r$. {\color{red}Need to explain $\omega_{s,a,t}^{\pi_*, \widehat{P}_{\tau_i - 1}}$}
% Now, we have
% \begin{equation}
% \begin{array}{ll}
%    & \mathbb{E}  \left[\frac{1}{\mathbb{P}_{}\left\{\mathbb{E}_{s \sim p_0}\widetilde{V}_{1}^{\pi_*, \tau_i}(s) > y_r \mid \mathcal{F}_{\tau_i-1} \right\}} -1 \right] \\
%    = & \mathbb{E}\left[  \mathbb{E} \left[\frac{1}{\mathbb{P}_{}\left\{\mathbb{E}_{s \sim p_0}\widetilde{V}_{1}^{\pi_*, \tau_i}(s) > y_r \mid \mathcal{F}_{\tau_i-1} \right\}} -1 \mid \mathcal{F}_{\tau_i - 1}\right] \right] \\
%    = & \mathbb{E}\left[  \mathbb{E} \left[\frac{1}{\mathbb{P}_{}\left\{\mathbb{E}_{s \sim p_0}\widetilde{V}_{1}^{\pi_*, \tau_i}(s) - \mathbb{E}_{s \sim p_0}\widehat{V}_{1}^{\pi_*, \tau_i}(s) > y_r - \mathbb{E}_{s \sim p_0}\widehat{V}_{1}^{\pi_*, \tau_i}(s) \mid \mathcal{F}_{\tau_i-1} \right\}} -1 \mid \mathcal{F}_{\tau_i - 1}\right] \right] \\
 
% = & \mathbb{E}\left[  \mathbb{E} \left[\frac{1}{\mathbb{E} \left[\bm{1}\left\{\mathbb{E}_{s \sim p_0}\widetilde{V}_{1}^{\pi_*, \tau_i}(s) - \mathbb{E}_{s \sim p_0}\widehat{V}_{1}^{\pi_*, \tau_i}(s) > y_r - \mathbb{E}_{s \sim p_0}\widehat{V}_{1}^{\pi_*, \tau_i}(s)  \right\} \mid \mathcal{F}_{\tau_i-1} \right]} -1 \mid \mathcal{F}_{\tau_i - 1}\right] \right] \\
% = & \mathbb{E}\left[  \mathbb{E} \left[\frac{1}{\mathbb{E} \left[\bm{1}\left\{Z_{\pi_*, i} > y_r - \mathbb{E}_{s \sim p_0}\widehat{V}_{1}^{\pi_*, \tau_i}(s)  \right\} \mid \mathcal{F}_{\tau_i-1} \right]} -1 \mid \mathcal{F}_{\tau_i - 1}\right] \right] \\
% = & \mathbb{E}\left[  \mathbb{E} \left[\frac{1}{\mathbb{P}_{}\left\{Z_{\pi_*, i} > y_r - \mathbb{E}_{s \sim p_0}\widehat{V}_{1}^{\pi_*, \tau_i}(s) \mid \mathcal{F}_{\tau_i-1} \right\}} -1 \mid \mathcal{F}_{\tau_i - 1}\right] \right] \\
%    = & \mathbb{E}  \left[ \mathbb{E} \left[Y_i \mid \mathcal{F}_{\tau_i -1 }\right]\right]
% \\
% = & \mathbb{E}  \left[Y_i \right]
% \quad.

% \end{array}
% \end{equation}

% Now, we will show the following two results. 
% \paragraph{We have $\mathbb{E}  \left[Y_i \right] \le c_1$ for all $i$.}  
% \

% Consider any integer $\phi \ge 1$. Let $z= \sqrt{\ln(\phi)}$. Let $\textbf{MAX}_{\phi}$ be the maximum of $\phi$ i.i.d. samples that are drawn from $\mathcal{N} \left(0, \ \sigma^2_{\pi_*, \tau_i -1} \right)$. Then, for any integer $\phi \ge 1$, we have
% \begin{equation}
%     \begin{array}{ll}
%          & \mathbb{P}\left\{Y_i \le \phi \right\} \ge \mathbb{P}\left\{\textbf{MAX}_{\phi} \ge y_r - \mathbb{E}_{s \sim p_0}\widehat{V}_{1}^{\pi_*, \tau_i}(s)  \right\} \\
%        % \ge &  \mathbb{P}\left\{\mathop{\max}\limits_{h \in [\phi]} \widetilde{V}_{1,i}^{\pi_*, h}(s) \ge V_1^{\pi_*}(s) - \epsilon_r \right\} \\
%         \ge & \mathbb{P}\left\{\textbf{MAX}_{\phi} >  z \cdot \sigma_{\pi_*, \tau_i - 1} \ge y_r - \mathbb{E}_{s \sim p_0}\widehat{V}_{1}^{\pi_*, \tau_i}(s)\right\} \\
%         = & \mathbb{E}\left[\bm{1} \left\{\textbf{MAX}_{\phi} > z \cdot \sigma_{\pi_*, \tau_i - 1} \ge y_r - \mathbb{E}_{s \sim p_0}\widehat{V}_{1}^{\pi_*, \tau_i}(s) \right\} \right] \\
% %         = & \mathbb{E}\left[\mathbb{E}\left[\bm{1} \left\{\mathop{\max}\limits_{h \in [\phi]} \widetilde{V}_{1,i}^{\pi_*, h}(s) > \widehat{V}_1^{\pi_*}(s) + \sqrt{\sigma_{\pi_*, i}}  \ge V_1^{\pi_*}(s) - \epsilon_r \right\} \mid \mathcal{F}_{\tau_i -1}\right] \right] \\
%           = & \mathbb{E}\left[\mathbb{E}\left[\bm{1} \left\{\textbf{MAX}_{\phi} >  z \cdot \sigma_{\pi_*, \tau_i - 1}  \right\} \cdot \bm{1} \left\{  z \cdot \sigma_{\pi_*, \tau_i - 1} \ge y_r - \mathbb{E}_{s \sim p_0}\widehat{V}_{1}^{\pi_*, \tau_i}(s) \right\} \mid \mathcal{F}_{\tau_i -1}\right] \right] \\
          
%            = & \mathbb{E}\left[\mathbb{E}\left[\bm{1} \left\{\textbf{MAX}_{\phi} >  z \cdot \sigma_{\pi_*, \tau_i - 1}  \right\} \cdot \bm{1} \left\{  z \cdot \sigma_{\pi_*, \tau_i - 1} \ge  \mathbb{E}_{s \sim p_0} V_1^{\pi_*}(s)- \epsilon_r - \mathbb{E}_{s \sim p_0}\widehat{V}_{1}^{\pi_*, \tau_i}(s) \right\} \mid \mathcal{F}_{\tau_i -1}\right] \right] \\
%             \ge & \mathbb{E}\left[\mathbb{E}\left[\bm{1} \left\{\textbf{MAX}_{\phi} >  z \cdot \sigma_{\pi_*, \tau_i - 1}  \right\} \cdot \bm{1} \left\{  z \cdot \sigma_{\pi_*, \tau_i - 1} \ge  \mathbb{E}_{s \sim p_0} V_1^{\pi_*}(s) - \mathbb{E}_{s \sim p_0}\widehat{V}_{1}^{\pi_*, \tau_i}(s) \right\} \mid \mathcal{F}_{\tau_i -1}\right] \right] \\
%               = & \mathbb{E}\left[\bm{1} \left\{  z \cdot \sigma_{\pi_*, \tau_i - 1} \ge  \mathbb{E}_{s \sim p_0} V_1^{\pi_*}(s) - \mathbb{E}_{s \sim p_0}\widehat{V}_{1}^{\pi_*, \tau_i}(s) \right\}\mathbb{E}\left[\bm{1} \left\{\textbf{MAX}_{\phi} >  z \cdot \sigma_{\pi_*, \tau_i - 1}  \right\}   \mid \mathcal{F}_{\tau_i -1}\right] \right] \\
%               = & \mathbb{E}\left[\bm{1} \left\{  z \cdot \sigma_{\pi_*, \tau_i - 1} \ge  \mathbb{E}_{s \sim p_0} V_1^{\pi_*}(s) - \mathbb{E}_{s \sim p_0}\widehat{V}_{1}^{\pi_*, \tau_i}(s) \right\} \cdot \mathbb{P} \left\{\textbf{MAX}_{\phi} >  z \cdot \sigma_{\pi_*, \tau_i - 1} \mid \mathcal{F}_{\tau_i -1} \right\}   \right]  \\
%                = & \mathbb{E}\left[\bm{1} \left\{  z \cdot \sigma_{\pi_*, \tau_i - 1} \ge  \mathbb{E}_{s \sim p_0} V_1^{\pi_*}(s) - \mathbb{E}_{s \sim p_0}\widehat{V}_{1}^{\pi_*, \tau_i}(s) \right\} \cdot \mathbb{P} \left\{\textbf{MAX}_{\phi} >  \sqrt{\ln(\phi)} \cdot \sigma_{\pi_*, \tau_i - 1} \mid \mathcal{F}_{\tau_i -1} \right\}   \right]  \\
%           %\ge & \mathbb{E}\left[\mathbb{E}\left[\bm{1} \left\{\mathop{\max}\limits_{h \in [\phi]} \widetilde{V}_{1,i}^{\pi_*, h}(s) > \widehat{V}_1^{\pi_*}(s) + \sqrt{\sigma_{\pi_*, i}}  \right\} \cdot \bm{1} \left\{ \widehat{V}_1^{\pi_*}(s) + \sqrt{\sigma_{\pi_*, i}}  \ge V_1^{\pi_*}(s)  \right\} \mid \mathcal{F}_{\tau_i -1}\right] \right] \\
%             % = & \mathbb{E}\left[\bm{1} \left\{ \widehat{V}_1^{\pi_*}(s) + \sqrt{\sigma_{\pi_*, i}}  \ge V_1^{\pi_*}(s)  \right\}\mathbb{E}\left[\bm{1} \left\{\mathop{\max}\limits_{h \in [\phi]} \widetilde{V}_{1,i}^{\pi_*, h}(s) > \widehat{V}_1^{\pi_*}(s) + \sqrt{\sigma_{\pi_*, i}}  \right\}   \mid \mathcal{F}_{\tau_i -1}\right] \right] \\
%             % = & \mathbb{E}\left[\bm{1} \left\{ \widehat{V}_1^{\pi_*}(s) + \sqrt{\sigma_{\pi_*, i}}  \ge V_1^{\pi_*}(s)  \right\}\mathbb{P} \left\{\mathop{\max}\limits_{h \in [\phi]} \widetilde{V}_{1,i}^{\pi_*, h}(s) > \widehat{V}_1^{\pi_*}(s) + \sqrt{\sigma_{\pi_*, i}} \mid \mathcal{F}_{\tau_i -1} \right\}   \right]  \\
%     \end{array}
% \end{equation}

% $Z_{\pi_*, i} \sim \mathcal{N} \left(0, \ \underbrace{\sum\limits_{t=1}^{H}\sum\limits_{s,a}\left(\omega_{s,a,t}^{\pi_*, \widehat{P}_{\tau_i-1}}\right)^2 \cdot \frac{\square SH^3 \log(1/\delta)}{\widehat{O}_{s,a,t}^{\tau_i-1}} }_{=: \sigma^2_{\pi_*, \tau_i -1 }}\right)$


% We have
% \begin{equation}
%     \begin{array}{ll}
%          & \mathbb{P} \left\{\textbf{MAX}_{\phi} >  \sqrt{\ln(\phi)} \cdot \sigma_{\pi_*, \tau_i - 1} \mid \mathcal{F}_{\tau_i -1} \right\} \\
%          = & 
%     \end{array}
% \end{equation}

% \end{proof}
% Now, we want to upper bound
% \begin{equation}
%     \begin{array}{ll}
%         & \sum\limits_{k=1}^{T} \mathbb{E} \left[\frac{1}{\mathbb{P}_{k-1}\left\{\widetilde{V}_1^{\pi_*}(s) > y_r  \right\}} -1 \right] \\
%         \le & \sum\limits_{i=1}^{T} \mathbb{E} \left[\frac{1}{\mathbb{P}_{\tau_i-1}\left\{\widetilde{V}_{1, i}^{\pi_*}(s) > y_r  \right\}} -1 \right] \quad.
%     \end{array}
% \end{equation}


 %It is important to note that $\widetilde{V}_1^{\pi_*}(s) - \widehat{V}_1^{\pi_*}(s)$ is drawn from a distribution determined by the history. Conditioned on history, we do not need to compute the complex distribution of $\widetilde{V}_1^{\pi_*}(s)$. We have




% We want to have
% \begin{equation}
%     V_1^{\pi_*}(s) \le \widehat{V}_1^{\pi_*}(s) + \sqrt{\sigma_{\pi_*, i}}
% \end{equation}

% Optimism: %for a fixed sub-optimal policy $\pi \in \Phi_r$, we have
% \begin{equation}
% \begin{array}{ll}
%     & \mathbb{P}\left\{\widehat{V}_1^{\pi_*}(s) - \widetilde{V}_1^{\pi_*}(s) > \square \mid \mathcal{F}_{i} = F_i \right\} \\
%     =&  \mathbb{P}\left\{\mathbb{E}_{\pi_*, \widehat{}} \left[\sum\limits_{t=1}^{H} \left(\widehat{\mu}_{s_t, \pi_*(s_t, t),t} - \widetilde{\mu}_{s_t, \pi_*(s_t, t),t} \right) \right] > \square  \mid \mathcal{F}_{i} = F_i  \right\}\\
%     =&  \mathbb{P}\left\{\mathbb{E}_{\pi_*, \widehat{}} \left[\sum\limits_{t=1}^{H} n_{s_t, \pi_*(s_t, t),t}  \right] > \square  \mid \mathcal{F}_{i} = F_i  \right\}\\
%     =&  \mathbb{P}\left\{ \sum\limits_{t=1}^{H} \sum\limits_{s,a} \omega_{s,a,t}^{\pi_*, \widehat{P}} \cdot n_{s, a,t}   > \square  \mid \mathcal{F}_{i} = F_i  \right\}\\
%    % &=& \mathbb{E}_{\pi, \widehat{}} \left[\sum\limits_{t=1}^{H} \left(\widehat{\mu}_{s_t, \pi(s_t, t),t} - \widetilde{\mu}_{s_t, \pi(s_t, t),t} \right) \right] \\

%     \end{array}
%     \end{equation}


    % Let $\sigma_{\pi_*, \widehat{P}} :=  \sum\limits_{t=1}^{H} \sum\limits_{s,a} \omega_{s,a,t}^{\pi_*, k} \cdot n_{s, a,t}$. Then we know $Z_{\pi_*, k} \sim \mathcal{N} \left(0, \sum\limits_{t=1}^{H}\sum\limits_{s,a}\left(\omega_{s,a,t}^{\pi_*, \widehat{P}_{k-1}}\right)^2 \cdot \frac{\square SH^3 \log(1/\delta)}{\widehat{O}_{s,a,t}^{k-1}} \right)$


    


%Define event $\mathcal{E}_k^{\theta, r} := \left\{\mathop{\max}\limits_{\pi \in \Phi_r} \widetilde{V}^{\pi, k}_{1}(s) \le V_1^{\pi_*}(s) - \epsilon_r \right\}$.

% Regret can be decomposed as
% \begin{equation}
%     \begin{array}{lll}
%    \mathcal{R}(T) & \le & \sum\limits_{r=1}^{\square}\sum\limits_{k=1}^{T} \mathbb{E} \left[\bm{1} \left\{\pi_k \in \Phi_r \right\} \right] \cdot O(\epsilon_r) \\& \le & \sum\limits_{r=1}^{\square}\sum\limits_{k=1}^{T} \mathbb{E} \left[\bm{1} \left\{\pi_k \in \Phi_r, \mathcal{E}_k^{\mu, r}, \mathcal{E}_k^{\theta, r} \right\} \right] \cdot O(\epsilon_r) + \sum\limits_{r=1}^{\square}\sum\limits_{k=1}^{T} \mathbb{E} \left[\bm{1} \left\{\pi_k \in \Phi_r, \mathcal{E}_k^{\mu, r}, \overline{\mathcal{E}_k^{\theta, r}} \right\} \right] \cdot O(\epsilon_r) \\
%    & + & \sum\limits_{r=1}^{\square}\sum\limits_{k=1}^{T} \mathbb{E} \left[\bm{1} \left\{\pi_k \in \Phi_r, \overline{\mathcal{E}_k^{\mu, r}} \right\} \right] \cdot O(\epsilon_r)
%     \end{array}
% \end{equation}

% \begin{lemma}
% We have
% \begin{equation}
% \begin{array}{ll}
%    &  \sum\limits_{r=1}^{\square}\sum\limits_{k=1}^{T} \mathbb{E} \left[\bm{1} \left\{\pi_k \in \Phi_r, \mathcal{E}_k^{\mu, r}, \overline{\mathcal{E}_k^{\theta, r}} \right\} \right] \cdot O(\epsilon_r) \\
%    = &  \sum\limits_{r=1}^{\square}\sum\limits_{k=1}^{T} \mathbb{E} \left[\bm{1} \left\{\pi_k \in \Phi_r, \mathcal{E}_k^{\mu, r}, \mathop{\max}\limits_{\pi \in \Phi_r} \widetilde{V}^{\pi, k}_{1}(s) > V_1^{\pi_*}(s) -  \epsilon_r \right\} \right] \cdot O(\epsilon_r) \\
%    \le &  \sum\limits_{r=1}^{\square} \sum\limits_{\pi \in \Phi_r} \underbrace{\sum\limits_{k=1}^{T} \mathbb{E} \left[\bm{1} \left\{\pi_k = \pi, \mathcal{E}_k^{\mu, r}, \widetilde{V}^{\pi, k}_{1}(s) > V_1^{\pi_*}(s) - \epsilon_r \right\} \right]}_{I_1} \cdot O(\epsilon_r) \\
%     \end{array}
% \end{equation}
% \end{lemma}

% If policy $\pi$ has been observed $L_{\pi, r} := O \left(\frac{\square}{0.5^{2r}} \right)$ times, we have
% \begin{equation}
%     \begin{array}{lll}
% I_1 & = & \sum\limits_{k=1}^{T} \mathbb{E} \left[\bm{1} \left\{\pi_k = \pi, \mathcal{E}_k^{\mu, r}, \widetilde{V}^{\pi, k}_{1}(s) > V_1^{\pi_*}(s) - \epsilon_r \right\} \right] \\
% & \le & L_{\pi, r} + \sum\limits_{k=1}^{T} \mathbb{E} \left[\bm{1} \left\{\pi_k = \pi, \mathcal{E}_k^{\mu, r}, T_{\pi} > L_{\pi, r},  \widetilde{V}^{\pi, k}_{1}(s) > V_1^{\pi_*}(s) - \epsilon_r \right\} \right] \\
% & = & L_{\pi, r} + \sum\limits_{k=1}^{T} \mathbb{E}\left[ \mathbb{E}  \left[\bm{1} \left\{\pi_k = \pi, \mathcal{E}_k^{\mu, r}, T_{\pi} > L_{\pi, r},  \widetilde{V}^{\pi, k}_{1}(s) > V_1^{\pi_*}(s) - \epsilon_r \right\} \mid \mathcal{F}_{k-1} \right] \right] \\
% & \le & L_{\pi, r} + \sum\limits_{k=1}^{T} \mathbb{E}\left[ \mathbb{E}  \left[\bm{1} \left\{\mathcal{E}_k^{\mu, r}, T_{\pi} > L_{\pi, r} \right\} \cdot \bm{1} \left\{\pi_k = \pi, \widetilde{V}^{\pi, k}_{1}(s) > V_1^{\pi_*}(s) - \epsilon_r \right\} \mid \mathcal{F}_{k-1} \right] \right] \\
% & = & L_{\pi, r} + \sum\limits_{k=1}^{T} \mathbb{E}\left[\bm{1} \left\{\mathcal{E}_k^{\mu, r}, T_{\pi} > L_{\pi, r} \right\} \mathbb{E}  \left[  \bm{1} \left\{\pi_k = \pi, \widetilde{V}^{\pi, k}_{1}(s) > V_1^{\pi_*}(s) - \epsilon_r \right\} \mid \mathcal{F}_{k-1} \right] \right] \\
% & = & L_{\pi, r} + \sum\limits_{k=1}^{T} \mathbb{E}\left[\bm{1} \left\{\mathcal{E}_k^{\mu, r}, T_{\pi} > L_{\pi, r} \right\} \mathbb{P}  \left\{\pi_k = \pi, \widetilde{V}^{\pi, k}_{1}(s) > V_1^{\pi_*}(s) - \epsilon_r \mid \mathcal{F}_{k-1} \right\}  \right] \\
% & = & L_{\pi, r} + \sum\limits_{k=1}^{T} \mathbb{E}\left[\bm{1} \left\{\mathcal{E}_k^{\mu, r}, T_{\pi} > L_{\pi, r} \right\} \mathbb{P}  \left\{ \widetilde{V}^{\pi, k}_{1}(s) > V_1^{\pi_*}(s) - \epsilon_r \mid \mathcal{F}_{k-1} \right\}  \right] \\
%     \end{array}
% \end{equation}

% For the instantiation $F_{k-1}$ of $\mathcal{F}_{k-1}$ such that the indicator function returns true, we have 
% \begin{equation}
%     \begin{array}{ll}
% & \mathbb{P}  \left\{ \widetilde{V}^{\pi, k}_{1}(s) > V_1^{\pi_*}(s) - \epsilon_r \mid \mathcal{F}_{k-1} = F_{k-1}\right\} \\
%  = & \mathbb{P}  \left\{ \widetilde{V}^{\pi, k}_{1}(s)-\widehat{V}^{\pi, k}_{1}(s) > V_1^{\pi_*}(s) - \widehat{V}^{\pi, k}_{1}(s) - \epsilon_r \mid \mathcal{F}_{k-1} = F_{k-1}\right\} \\
%  = & \mathbb{P}  \left\{ \widetilde{V}^{\pi, k}_{1}(s)-\widehat{V}^{\pi, k}_{1}(s) > V_1^{\pi}(s) - \widehat{V}^{\pi, k}_{1}(s) + 2 \epsilon_r \mid \mathcal{F}_{k-1} = F_{k-1}\right\} \\
%    \le  & \mathbb{P}  \left\{ \widetilde{V}^{\pi, k}_{1}(s)-\widehat{V}^{\pi, k}_{1}(s) > V_1^{\pi}(s) - \widehat{V}^{\pi, k}_{1}(s) + 2 \sqrt{\sigma_{\pi, L_{\pi,r}}} \mid \mathcal{F}_{k-1} = F_{k-1}\right\} \\  \le  & \mathbb{P}  \left\{ \widetilde{V}^{\pi, k}_{1}(s)-\widehat{V}^{\pi, k}_{1}(s) > - \sqrt{\sigma_{\pi, L_{\pi,r}}} + 2 \sqrt{\sigma_{\pi, L_{\pi,r}}} \mid \mathcal{F}_{k-1} = F_{k-1}\right\} \\
%    \le  &     \mathbb{P}  \left\{ \widetilde{V}^{\pi, k}_{1}(s)-\widehat{V}^{\pi, k}_{1}(s) >  \sqrt{\sigma_{\pi, L_{\pi,r}}}  \mid \mathcal{F}_{k-1} = F_{k-1}\right\} \\
%     \end{array}
% \end{equation}

% We need to find a lower bound for
% \begin{equation}
%     \begin{array}{ll}
%         & \mathbb{P}  \left\{ \widetilde{V}^{\pi, k}_{1}(s)-\widehat{V}^{\pi, k}_{1}(s) \le V_1^{\pi_*}(s) - \widehat{V}^{\pi, k}_{1}(s) - \epsilon_r \mid \mathcal{F}_{k-1} = F_{k-1}\right\} \\
%         \ge & 
%     \end{array}
% \end{equation}

% \begin{lemma}
% We have
% \begin{equation}
%     \sum\limits_{r=1}^{\square}\sum\limits_{k=1}^{T} \mathbb{E} \left[\bm{1} \left\{\pi_k \in \Phi_r, \overline{\mathcal{E}_k^{\mu, r}} \right\} \right] \cdot O(\epsilon_r)
% \end{equation}
% \end{lemma}



%\subsection*{Definitions and notations}

%First we need to define a list of high probability events. We skip this part now. It will be similar to UBEV paper.


% For each sub-optimal $(s,a,t)$, define $\Delta_{s,a,t}^* := \frac{ Q_{s,*,t}^* - Q_{s,a,t}^*}{H-t+1} \in (0,1)$. This value is computed based on the optimal policy $\pi_*$. So I put a star here. 
%  All the sub-optimal $(s,a,t)$ can be grouped based on $\Delta_{s,a,t}^*$. If $\Delta_{s,a,t}^* \in (0.5^r, 0.5^{r-1} ]$, we put it into $\Phi_r^{s,t}$. For all $(s,a,t) \in \Phi_r^{s,t}$, we have
%  \begin{equation}
%   Q_{s,*,t}^* + (H-t+1) \cdot 0.5^{r} \le   Q_{s,a,t}^* \le Q_{s,*,t}^* + (H-t+1) \cdot 0.5^{r-1}
%  \end{equation}

 %Now, for each sub-optimal $(s,a,t)$, we define $Q_{s,a,t}^* < x^*_{s,a,t} < y^*_{s,a,t} < Q_{s,*,t}^*$. Note that $x^*_{s,a,t}$ can be tuned as the upper confidence bound of $\overline{Q}_{s,a,t}^{k}$.

 % Define
 % \begin{equation}
 %     \varepsilon_{r,t} := (H-t+1) \cdot \frac{1}{3} 0.5^r 
 % \end{equation}
 
%  \footnote{We need to assume the mean reward in $(0,1)$.} 

% Now, in each episode $k$, for each  $(s,a,t)$, let $\overline{Q}_{s,a,t}^{k}$ be the estimated version or the optimistic version of ${Q}_{s,a,t}^{*}$. It is important to note that ${Q}_{s,a,t}^{*}$ and 
% $\overline{Q}_{s,a,t}^{k}$ are computed based on different ``policies''.


% Regret can be decomposed as
% \begin{equation}
% \begin{array}{lll}
%      \mathcal{R} (\mathcal{M}, T, \textbf{ALG}) &:= & \mathbb{E}_{\textbf{ALG}} \left[ \sum\limits_{k=1}^{T} \left(V_1^{*}(s_1^k) - V_1^{\pi_k}(s_1^k) \right)\right]\\
%    %  &= & \mathbb{E}_{\textbf{ALG}} \left[ \sum\limits_{k=1}^{T} \sum\limits_{t=1}^{H} \left(Q_{s_t^k,*,t}^{*} - Q_{s_t^k, a_t^k,t}^{*}\right)\right] \\
%    %  &= & \mathbb{E}_{\textbf{ALG}} \left[ \sum\limits_{k=1}^{T} \sum\limits_{t=1}^{H} \left(Q_{s_t^k,*,t}^{*} - Q_{s_t^k, \pi_k(s_t^k,t),t}^{*}\right)\right] \\
%     %  &= & \mathbb{E}_{\textbf{ALG}} \left[ \sum\limits_{k=1}^{T} \sum\limits_{t=1}^{H} \Delta_{s_t^k, \pi_k(s_t^k,t),t}^*\right] \\
%       & \le & \mathbb{E}_{\textbf{ALG}} \left[ \sum\limits_{k=1}^{T} \sum\limits_{t=1}^{H} \sum\limits_{s \in \mathcal{S}} \sum\limits_{a \in \mathcal{A}: \Delta_{s,a,t}^* > 0} \bm{1} \left\{ \left( s_t^k, \pi_k(s_t^k,t),t \right) = \left(s,a,t\right)\right\} \cdot \Delta_{s,a,t}^*\right] \\
%        & \le  & \mathbb{E}_{\textbf{ALG}} \left[ \sum\limits_{k=1}^{T} \sum\limits_{t=1}^{H} \sum\limits_{s \in \mathcal{S}} \sum\limits_{a \in \Phi_r^{s,t}} \bm{1} \left\{ s_t^k = s,  \pi_k(s,t) = a  \right\} \cdot (H-t+1) 0.5^{r-1}\right] \\
%     \end{array}
% \end{equation}



%\section{An analysis of TS for bandits}


% \begin{algorithm}[!ht]
% 	\caption{Optimistic Thompson Sampling for Bandits} 
% 	\label{TS Bandits Algorithm}
% 	\begin{algorithmic}[1]
% 	\STATE {\bf{Input:}} Bandits instance and number of arms $K$, number of episodes $T$
	
% \STATE {\bf{Initialization}:}
% For each $j \in [K]:$ Set $\widehat{O}_j{(0)} \leftarrow 0$, $\widehat{\mu}_{i, O_j(0)} \leftarrow 0$

% \FOR {round $t = 1, 2, \dotsc,K$} 

% \STATE Play arm $J_t$ and receive reward $\widetilde{\mu}_j(t)$

%  Set $\widehat{O}_j{(t)} \leftarrow 1$

%  Set $\widehat{\mu}_{i, O_j(t)} \leftarrow X_J(t)$
% \ENDFOR

% \FOR {round $t = K+1, \dotsc, T$} 
% \STATE Set $\overline{\mu}_j(t) := \widehat{\mu}_{i, O_j(t-1)} + \sqrt{\frac{3 \ln (t)}{O_j(t-1)}}$

% For each arm $j$, draw $\widetilde{\mu}_j(t) \sim \max(\overline{\mu}_j(t), \mathcal{N} (\widehat{\mu}_j(t), \sigma^2))$

% Play arm $j = \mathrm{argmax}_j \widetilde{\mu}_j(t)$  and receive reward $X_j(t)$

% Set $\widehat{O}_j{(t)} \leftarrow \widehat{O}_j{(t-1)} + 1$

% Set $\widehat{\mu}_{i, O_j(t)} = \frac{1}{O_j(t)}\sum_1^{t} X_j(t)$
% \ENDFOR
% 	\end{algorithmic}
% \end{algorithm}





% Now, for each sub-optimal arm $j$, we upper bound the expected times of pulls. We first decompose the regret as

% \begin{equation}
%     \begin{array}{ll}
%           & \sum\limits_{t=1}^{T} \mathbb{E} \left[\bm{1} \left\{J_t = j\right\} \right] \cdot \Delta_j \\
%           \le & \underbrace{\sum\limits_{t=1}^{T} \mathbb{E} \left[\bm{1} \left\{J_t = j, \theta_j(t) \le y_j \right\}\right]}_{I_1} + \sum\limits_{t=1}^{T} \mathbb{E} \left[\bm{1} \left\{J_t = j, \theta_j(t) > y_j \right\}\right]\quad.
%     \end{array}
% \end{equation}


% This is the same as the UCB analysis: 
% \begin{enumerate}
%     \item sub-optimal arm $j$ is overestimated: $\widehat{\mu_j} \geq \mu_j + \sqrt{\frac{3\ln(t)}{O_j(t-1)}}$ (bound by Hoeffding's)
%     \item optimal arm $1$ is underestimated: $\widehat{\mu_1} \leq \mu_1 - \sqrt{\frac{3\ln(t)}{O_1(t-1)}}$ (bound by Hoeffding's)
%     \item the two arms' mean rewards are similar: $\mu_1 -\mu_j \leq 2\sqrt{\frac{3\ln(t)}{O_j(t-1)}}$ (set $L_j$ such that this cannot happen)
% \end{enumerate}



%\section{Bingshan's analysis on problem-dependent regret bound}


% For $I_1$, we further decompose it as
% \begin{equation}
%     \begin{array}{lll}
%          I_1 & = & \sum\limits_{t=1}^{T} \mathbb{E} \left[\bm{1} \left\{J_t = j, \theta_j(t) \le y_j \right\}\right] \\
%          & \le & \sum\limits_{t=1}^{T} \mathbb{E} \left[\bm{1} \left\{J_t = j, \theta_j(t) \le y_j, \underline{\mu_1}(t) \le \theta_1(t) \le \overline{\mu_1}(t) \right\}\right]\\
%          &+& \sum\limits_{t=1}^{T} \mathbb{E} \left[\bm{1} \left\{J_t = j, \theta_j(t) \le y_j, \underline{\mu_1}(t) > \theta_1(t) \right\} \right] + \sum\limits_{t=1}^{T} \mathbb{E} \left[\bm{1} \left\{J_t = j, \theta_j(t) \le y_j, \overline{\mu_1}(t) < \theta_1(t) \right\} \right]\\
%          & \le & \underbrace{\sum\limits_{t=1}^{T} \mathbb{E} \left[\bm{1} \left\{J_t = j, \theta_j(t) \le y_j, \underline{\mu_1}(t) \le \theta_1(t) \le \overline{\mu_1}(t) \right\} \right]}_{\omega_1}
%          +\underbrace{\sum\limits_{t=1}^{T} \mathbb{E} \left[\bm{1} \left\{ \underline{\mu_1}(t) > \theta_1(t) \right\} \right] + \sum\limits_{t=1}^{T} \mathbb{E} \left[ \bm{1} \left\{\overline{\mu_1}(t) < \theta_1(t) \right\}  \right]}_{\omega_2}\\
         
%     \end{array}
% \end{equation}

% We now upper bound $\omega_1$. We have
% \begin{equation}
%     \begin{array}{lll}
%          \omega_1 & = & \sum\limits_{t=1}^{T} \mathbb{E} \left[\bm{1} \left\{J_t = j, \theta_j(t) \le y_j, \underline{\mu_1}(t) \le \theta_1(t) \le \overline{\mu_1}(t) \right\}\right] \\
%          & = & \sum\limits_{t=1}^{T} \mathbb{E} \left[ \mathbb{E} \left[\bm{1} \left\{J_t = j, \theta_j(t) \le y_j, \underline{\mu_1}(t) \le \theta_1(t) \le \overline{\mu_1}(t) \right\} \mid \mathcal{F}_{t-1}\right]\right] \\
%          & = & \sum\limits_{t=1}^{T} \mathbb{E} \left[ \mathbb{E} \left[\bm{1} \left\{J_t = j, \theta_j(t) \le y_j, \underline{\mu_1}(t) \le \theta_1(t) \le \overline{\mu_1}(t) , \overline{\mu_1}(t) > y_j\right\} \mid \mathcal{F}_{t-1}\right]\right] \\
%          & + & \sum\limits_{t=1}^{T} \mathbb{E} \left[ \mathbb{E} \left[\bm{1} \left\{J_t = j, \theta_j(t) \le y_j, \underline{\mu_1}(t) \le \theta_1(t) \le \overline{\mu_1}(t),  \overline{\mu_1}(t) \le y_j \right\} \mid \mathcal{F}_{t-1}\right]\right] \\
%          & = & \sum\limits_{t=1}^{T} \mathbb{E} \left[\bm{1} \left\{\overline{\mu_1}(t) > y_j \right\} \cdot \mathbb{E} \left[\bm{1} \left\{J_t = j, \theta_j(t) \le y_j , \underline{\mu_1}(t) \le \theta_1(t) \le \overline{\mu_1}(t) \right\} \mid \mathcal{F}_{t-1}\right]\right] \\
%          & + & \sum\limits_{t=1}^{T} \mathbb{E} \left[\bm{1} \left\{\overline{\mu_1}(t) \le y_j \right\} \cdot \mathbb{E} \left[\bm{1} \left\{J_t = j, \theta_j(t) \le y_j , \underline{\mu_1}(t) \le \theta_1(t) \le \overline{\mu_1}(t) \right\} \mid \mathcal{F}_{t-1}\right]\right] \\
%          & \le & \sum\limits_{t=1}^{T} \mathbb{E} \left[\bm{1} \left\{\overline{\mu_1}(t) > y_j \right\} \cdot \mathbb{E} \left[\bm{1} \left\{J_t = j, \theta_j(t) \le y_j , \underline{\mu_1}(t) \le \theta_1(t) \le \overline{\mu_1}(t) \right\} \mid \mathcal{F}_{t-1}\right]\right] + \sum\limits_{t=1}^{T} \mathbb{E} \left[ \bm{1} \left\{\overline{\mu_1}(t) \le y_j \right\}\right]
%     \end{array}
% \end{equation}

% We now show the following key lemmas. 
% \begin{lemma}
% For any instantiation $F_{t-1}$ of $\mathcal{F}_{t-1}$, we have
% \begin{equation}
% \begin{array}{ll}
%    &  \mathbb{E} \left[\bm{1} \left\{J_t = j, \theta_j(t) \le y_j , \underline{\mu_1}(t) \le \theta_1(t) \le \overline{\mu_1}(t) \right\} \mid \mathcal{F}_{t-1} = F_{t-1}\right] \\
%    \le & \square \cdot \mathbb{E} \left[\bm{1} \left\{J_t = 1, \theta_j(t) \le y_j , \underline{\mu_1}(t) \le \theta_1(t) \le \overline{\mu_1}(t) \right\} \mid \mathcal{F}_{t-1} = F_{t-1}\right]
%     \end{array}
% \end{equation}
% \label{novel lemma}
% \end{lemma}
% \begin{proof}[Proof of Lemma~\ref{novel lemma}]
% For the LHS, we have
% \begin{equation}
%     \begin{array}{ll}
%          & \bm{1} \left\{\overline{\mu_1}(t) > y_j \right\} \cdot \mathbb{E} \left[\bm{1} \left\{J_t = j, \theta_j(t) \le y_j , \underline{\mu_1}(t) \le \theta_1(t) \le \overline{\mu_1}(t) \right\} \mid \mathcal{F}_{t-1} = F_{t-1}\right] \\
%          = & \bm{1} \left\{\overline{\mu_1}(t) > y_j \right\} \cdot \mathbb{P}  \left\{J_t = j, \theta_j(t) \le y_j , \underline{\mu_1}(t) \le \theta_1(t) \le \overline{\mu_1}(t) \mid \mathcal{F}_{t-1} = F_{t-1} \right\}  \\ 
%          \le & \bm{1} \left\{\overline{\mu_1}(t) > y_j \right\} \cdot \mathbb{P}  \left\{J_t = j, \mathop{\max}\limits_{i \in [K]}\theta_i(t) \le y_j , \underline{\mu_1}(t) \le \theta_1(t) \le \overline{\mu_1}(t) \mid \mathcal{F}_{t-1} = F_{t-1} \right\}  \\ 
%             \le  & \bm{1} \left\{\overline{\mu_1}(t) > y_j \right\} \cdot \mathbb{P}  \left\{J_t = j, \mathop{\max}\limits_{i \in [K] \backslash \{1\}}\theta_i(t) \le y_j , \theta_1(t) \le y_j, \underline{\mu_1}(t) \le \theta_1(t) \le \overline{\mu_1}(t) \mid \mathcal{F}_{t-1} = F_{t-1} \right\}  \\ \le  & \bm{1} \left\{\overline{\mu_1}(t) > y_j \right\} \cdot \mathbb{P}  \left\{ \mathop{\max}\limits_{i \in [K] \backslash \{1\}}\theta_i(t) \le y_j , \theta_1(t) \le y_j, \underline{\mu_1}(t) \le \theta_1(t) \le \overline{\mu_1}(t) \mid \mathcal{F}_{t-1} = F_{t-1} \right\}  \\ 
%              \le  & \bm{1} \left\{\overline{\mu_1}(t) > y_j \right\} \cdot \mathbb{P}  \left\{J_t = j, \mathop{\max}\limits_{i \in [K] \backslash \{1\}}\theta_i(t) \le y_j , \theta_1(t) \le y_j, \underline{\mu_1}(t) \le \theta_1(t) \le \overline{\mu_1}(t) \mid \mathcal{F}_{t-1} = F_{t-1} \right\}  \\ 
%              \le  & \bm{1} \left\{\overline{\mu_1}(t) > y_j \right\} \cdot \mathbb{P}  \left\{ \mathop{\max}\limits_{i \in [K] \backslash \{1\}}\theta_i(t) \le y_j , \underline{\mu_1}(t)  \le y_j, \underline{\mu_1}(t) \le \theta_1(t) \le \overline{\mu_1}(t) \mid \mathcal{F}_{t-1} = F_{t-1} \right\}  \\ 
%              =  & \bm{1} \left\{\overline{\mu_1}(t) > y_j \right\} \cdot \mathbb{E} \left[  \bm{1}\left\{ \mathop{\max}\limits_{i \in [K] \backslash \{1\}}\theta_i(t) \le y_j , \underline{\mu_1}(t)  \le y_j, \underline{\mu_1}(t) \le \theta_1(t) \le \overline{\mu_1}(t)  \right\} \mid \mathcal{F}_{t-1} = F_{t-1}  \right]  \\ 
%                 =  & \bm{1} \left\{\overline{\mu_1}(t) > y_j \right\} \cdot \mathbb{E} \left[\bm{1} \left\{ \underline{\mu_1}(t)  \le y_j \right\}  \bm{1}\left\{ \mathop{\max}\limits_{i \in [K] \backslash \{1\}}\theta_i(t) \le y_j , \underline{\mu_1}(t) \le \theta_1(t) \le \overline{\mu_1}(t)  \right\} \mid \mathcal{F}_{t-1} = F_{t-1}  \right]  \\ 
%                   =  & \bm{1} \left\{\overline{\mu_1}(t) > y_j \right\} \cdot \bm{1} \left\{ \underline{\mu_1}(t)  \le y_j \right\}  \cdot \mathbb{E} \left[ \bm{1}\left\{ \mathop{\max}\limits_{i \in [K] \backslash \{1\}}\theta_i(t) \le y_j , \underline{\mu_1}(t) \le \theta_1(t) \le \overline{\mu_1}(t)  \right\} \mid \mathcal{F}_{t-1} = F_{t-1}  \right]  \\ 
%     \end{array}
% \end{equation}

% Now, we deal with the RHS. We have
% \begin{equation}
%     \begin{array}{ll}
%          & \bm{1} \left\{\underline{\mu_1}(t) \le y_j \right\} \cdot \mathbb{E} \left[\bm{1} \left\{J_t = 1, \theta_j(t) \le y_j , \underline{\mu_1}(t) \le \theta_1(t) \le \overline{\mu_1}(t) \right\} \mid \mathcal{F}_{t-1} = F_{t-1}\right]
%      \\ 
%      = & \bm{1} \left\{\underline{\mu_1}(t) \le y_j \right\} \cdot \mathbb{P}  \left\{J_t = 1, \theta_j(t) \le y_j , \underline{\mu_1}(t) \le \theta_1(t) \le \overline{\mu_1}(t) \mid \mathcal{F}_{t-1} = F_{t-1} \right\} 
%      \\ 
%      \ge & \bm{1} \left\{\underline{\mu_1}(t) \le y_j \right\} \cdot \mathbb{P}  \left\{\theta_1(t) > y_j \ge \mathop{\max}\limits_{i \in [K] \backslash 1} \theta_i(t), \theta_j(t) \le y_j , \underline{\mu_1}(t) \le \theta_1(t) \le \overline{\mu_1}(t) \mid \mathcal{F}_{t-1} = F_{t-1} \right\} \\
%       \ge & \bm{1} \left\{\underline{\mu_1}(t) \le y_j \right\} \cdot \bm{1} \left\{\underline{\mu_1}(t) > y_j \right\} \cdot \mathbb{P}  \left\{ y_j \ge \mathop{\max}\limits_{i \in [K] \backslash 1} \theta_i(t), \theta_j(t) \le y_j , \underline{\mu_1}(t) \le \theta_1(t) \le \overline{\mu_1}(t) \mid \mathcal{F}_{t-1} = F_{t-1} \right\} 
%      \\ 
%     \end{array}
% \end{equation}

% \end{proof}

% We upper bound $\omega_2$ by using concentration inequality of Gaussians. We have
% \begin{equation}
%     \begin{array}{ll}
%        & \sum\limits_{t=1}^{T} \mathbb{E} \left[\bm{1} \left\{ \underline{\mu_1}(t) > \theta_1(t) \right\} \right] \\
%        = & \sum\limits_{t=1}^{T} \mathbb{E} \left[\bm{1} \left\{ \widehat{\mu}_{1, O_1(t-1)} - \sqrt{\frac{6 \ln (t)}{O_1(t-1)}}> \theta_1(t) \right\}  \right] \\
%        = & \sum\limits_{t=1}^{T} \mathbb{E} \left[ \mathbb{E} \left[\bm{1} \left\{\widehat{\mu}_{1, O_1(t-1)} - \sqrt{\frac{6 \ln (t)}{O_1(t-1)}}> \theta_1(t) \right\} \mid \mathcal{F}_{t-1} \right] \right] \\
%        = & \sum\limits_{t=1}^{T} \mathbb{E} \left[ \underbrace{\mathbb{P}  \left\{\widehat{\mu}_{1, O_1(t-1)} - \sqrt{6 \ln(t)} \cdot \sqrt{\frac{1}{O_1(t-1)}}> \theta_1(t) \mid \mathcal{F}_{t-1} \right\}}_{\le 0.5 e^{-3 \ln(t)}}  \right]  \\
%        \le & \sum\limits_{t=1}^{T} 0.5 \frac{1}{t^3}\\
%     \le & O(1) \quad.
%     \end{array}
% \end{equation}








%\subsection{Proofs for Theorem~\ref{theorem: bandit, O-TS-MDP, independent}}
\newpage
\subsection{Proofs for Theorem~\ref{theorem: bandit, O-TS-MDP, dependent}}\label{app: bandit 1}
\begin{proof}[Proof of Theorem~\ref{theorem: bandit, O-TS-MDP, dependent}]
Regret can be expressed as
\begin{equation}
    \begin{array}{l}
         \mathcal{R}(T) =       
         \sum\limits_{t=1}^{T}  \mathbb{E} \left[\mu_1 - \mu_{J_t}  \right]    
        =   \sum\limits_{j \in \mathcal{A}: \Delta_j > 0} \underbrace{\sum\limits_{t=1}^{T}  \mathbb{E} \left[ \bm{1} \left\{J_t = j \right\}  \right] }_{\mathbb{E} \left[O_j(T) \right]} \cdot \Delta_j  \quad.
    \end{array}
\end{equation}
We first define some events to decompose the regret. Let  $C_j(t-1) = \left\{\left|\widehat{\mu}_{j, O_j(t-1)} - \mu_j \right| \le \sqrt{\frac{0.5\ln \left(T {\Delta_j^2}\right)}{O_j(t-1)}} \right\}$ be the event that the confidence interval of the empirical mean holds for an arm $j \in \mathcal{A}$. Let $\widetilde{\mu}_j(t)$ be the value of the random posterior sample of arm $j \in \mathcal{A}$ before the boosting, i.e., $\widetilde{\mu}_j(t) \sim \mathcal{N} \left(\widehat{\mu}_{j, O_j(t-1)}, \frac{1}{O_j(t-1)} \right)$.
For any sub-optimal arm $j$, let $y_j = \mu_j + \frac{1}{2} \Delta_j$. Let $\mathcal{E}'_j(t) = \left\{ \widetilde{\mu}'_j(t) \le y_j\right\}$ be the event that the random sample $\widetilde{\mu}'_j(t)$ of the sub-optimal arm $j$ after the boosting does not overestimate the true mean $\mu_j$ by more than $\frac{1}{2} \Delta_j$.
Let $\mathcal{F}_{t} = \left\{ J_{\tau}, X_{J_{\tau}}(\tau), \tau = 1,2, \dotsc, t\right\}$ collect all the history information by the end of round $t$. 



Now, for a fixed sub-optimal arm $j$, we upper bound its expected number of pulls by the end of round $T$. Let $L_j = \frac{36\ln(T\Delta_j^2)}{\Delta_j^2}$. We have
\begin{equation}
    \begin{array}{ll}
          & \mathbb{E} \left[O_j(T) \right] \\
          = & \sum\limits_{t=1}^{T}  \mathbb{E} \left[ \bm{1} \left\{J_t = j \right\}  \right]  \\
         \le & L_j + \sum\limits_{t=1}^{T}  \mathbb{E} \left[ \bm{1} \left\{J_t = j , O_j(t-1) > L_j\right\}  \right]  \\
     \le & L_j  
     +   \underbrace{ \sum\limits_{t=1}^{T}  \mathbb{E} \left[ \bm{1} \left\{J_t = j, \overline{C_j(t-1)}\right\}  \right] }_{\omega_3}  +  \underbrace{ \sum\limits_{t=1}^{T}  \mathbb{E} \left[ \bm{1} \left\{J_t = j, \overline{\mathcal{E}'_j(t)}, C_j(t-1), O_j(t-1) > L_j\right\}  \right]}_{\omega_2} + \underbrace{\sum\limits_{t=1}^{T}  \mathbb{E} \left[ \bm{1} \left\{J_t = j, \mathcal{E}'_j(t)\right\}  \right] }_{ \omega_1}. \end{array}
    \label{temp 102}
\end{equation}

\paragraph{Upper bound $\omega_3$.} We only need to use Hoeffding's inequality here. Let $\tau_s$ be the round when the $s$-th pull of arm $j$ occurs, and define $\tau_0 = 0$. By the definition of $\tau_s$, arm $j$ is not pulled strictly between two consecutive pulls, i.e., $\bm{1} \left\{J_t = j \right\}=0$ for all  $t \in \left\{ \tau_s+1, \dotsc, \tau_{s+1}-1 \right\}$. We have
\begin{equation}
    \begin{array}{lll}
       \omega_3 & = &  \sum\limits_{t=1}^{T}  \mathbb{E} \left[ \bm{1} \left\{ J_t = j, \overline{C_j(t-1)}\right\}  \right] \\
       & \le & \sum\limits_{s=0}^{T}   \mathbb{E} \left[\sum\limits_{t = \tau_s + 1}^{\tau_{s+1}} \bm{1} \left\{ J_t = j, \overline{C_j(t-1)}\right\}  \right] \\
       & \le & \sum\limits_{s=0}^{T}   \mathbb{E} \left[ \bm{1} \left\{ J_{\tau_{s+1}} = j, \overline{C_j(\tau_{s+1}-1)}\right\}  \right] \\
              & \le & \sum\limits_{s=0}^{T}   \mathbb{E} \left[ \bm{1} \left\{ \overline{C_j(\tau_{s+1}-1)}\right\}  \right] \\
                & = & \sum\limits_{s=0}^{T}   \mathbb{P} \left\{ \overline{C_j(\tau_{s+1}-1)}\right\} \\
%       &  = & \sum\limits_{t=1}^{T}  \mathbb{P}  \left\{\left|\widehat{\mu}_{j, O_j(t-1)} - \mu_j \right| > \sqrt{\color{red}\frac{0.5\ln(T{\Delta_j^2})}{O_j(t-1)}}\right\}   \\
%
       & = & \sum\limits_{s=0}^{T}  \underbrace{\mathbb{P}  \left\{\left|\widehat{\mu}_{j, s} - \mu_j \right| > \sqrt{\frac{0.5\ln(T\Delta_j^2)}{s}}\right\} }_{\text{Hoeffding's inequality}} \\
       & \le & \sum\limits_{s=0}^{T}  O \left(\frac{1}{T \Delta_j^2} \right) \\
       & \le & O \left(\frac{1}{\Delta_j^2} \right) \quad.
            \end{array}
\end{equation}

\paragraph{Upper bound $\omega_2$ (posterior deviation bound).} 
We have
    \begin{equation}
        \begin{array}{lll}
             \omega_2 &  = & \sum\limits_{t=1}^{T} \mathbb{E} \left[ \bm{1} \left\{J_t = j, \overline{\mathcal{E}'_j(t)}, C_j(t-1), O_j(t-1) > L_j\right\}  \right]  \\
        &  =   & \sum\limits_{t=1}^{T} \mathbb{E} \left[ \bm{1} \left\{ C_j(t-1), O_j(t-1) > L_j\right\} \cdot \mathbb{E} \left[ \bm{1} \left\{J_t = j, \overline{\mathcal{E}'_j(t)} \right\} \mid \mathcal{F}_{t-1}\right] \right]  \\
         &  \le   & \sum\limits_{t=1}^{T} \mathbb{E} \left[ \bm{1} \left\{ C_j(t-1), O_j(t-1) > L_j\right\} \cdot \mathbb{P}  \left\{ \overline{\mathcal{E}'_j(t)} \mid \mathcal{F}_{t-1} \right\} \right]  \\
        & =   &  \sum\limits_{t=1}^{T} \mathbb{E} \left[\underbrace{ \underbrace{\bm{1} \left\{ C_j(t-1), O_j(t-1) > L_j\right\}}_{I_1} \cdot \underbrace{\mathbb{P}  \left\{ \widetilde{\mu}'_j(t) > y_j \mid \mathcal{F}_{t-1} \right\}}_{I_2} }_{I} \right]  \quad.
        \end{array}
    \end{equation}

Note that the value of $I_1$ is determined by $\mathcal{F}_{t-1}$. Now, we categorize all the possible instantiations $F_{t-1}$ of $\mathcal{F}_{t-1}$ into two types. For a particular instantiation $F_{t-1}$ such that $I_1 = 0$, we have $I = 0$.
For any instantiation $F_{t-1}$ such that $I_1 = 1$, i.e., $\bm{1} \left\{ C_j(t-1), O_j(t-1) > L_j\right\} = 1$, we first construct a lower bound on $y_j$.
From $O_j(t-1) > L_j = \frac{36 \ln \left(T \Delta_j^2 \right)}{\Delta_j^2}$, we know $\frac{\Delta_j}{2} >3\sqrt{\frac{ \ln \left(T \Delta_j^2 \right)}{O_j(t-1)}}$. Then, 
we have $y_j = \mu_j + \frac{1}{2} \Delta_j \ge \widehat{\mu}_{j, O_j(t-1)} - \sqrt{\frac{0.5\ln(T \Delta_j^2)}{O_j(t-1)}} +  \frac{1}{2} \Delta_j \ge \widehat{\mu}_{j, O_j(t-1)} - \sqrt{\frac{0.5\ln(T \Delta_j^2)}{O_j(t-1)}} +  3\sqrt{\frac{\ln(T \Delta_j^2)}{O_j(t-1)}}  > \widehat{\mu}_{j, O_j(t-1)} + \sqrt{\frac{2\ln(T \Delta_j^2)}{O_j(t-1)}}$.
Then, we have
\begin{equation}
    \begin{array}{ll}
         &  \mathbb{P}  \left\{ \widetilde{\mu}'_j(t) > y_j \mid \mathcal{F}_{t-1} = F_{t-1} \right\} \\
       =  & 1- \mathbb{P}  \left\{ \widetilde{\mu}'_j(t) \le y_j \mid \mathcal{F}_{t-1} = F_{t-1} \right\} \\
        =  & 1- \left( \int_{\widehat{\mu}_{j, O_j(t-1)}}^{y_j} \phi \left(x; \widehat{\mu}_{j, O_j(t-1)}, \frac{1}{O_j(t-1)}  \right)dx + 0.5\right) \\
        =  & 1- \left( 0.5 - \int_{y_j}^{+ \infty} \phi \left(x; \widehat{\mu}_{j, O_j(t-1)}, \frac{1}{O_j(t-1)}  \right)dx + 0.5\right) \\
        = & \int_{y_j}^{+ \infty} \phi \left(x; \widehat{\mu}_{j, O_j(t-1)}, \frac{1}{O_j(t-1)}  \right)dx \\
        = & \mathbb{P}  \left\{ \widetilde{\mu}_j(t) > y_j \mid \mathcal{F}_{t-1} = F_{t-1} \right\} \\
           = & \mathbb{P}  \left\{ \widetilde{\mu}_j(t) - \widehat{\mu}_{j, O_j(t-1)} > y_j -  \widehat{\mu}_{j, O_j(t-1)} \mid \mathcal{F}_{t-1} = F_{t-1} \right\} \\
                  \le & \underbrace{\mathbb{P}  \left\{ \widetilde{\mu}_j(t) - \widehat{\mu}_{j, O_j(t-1)} > \sqrt{\frac{2\ln(T \Delta_j^2)}{O_j(t-1)}} \mid \mathcal{F}_{t-1} = F_{t-1} \right\}}_{\text{Gaussian concentration inequality}} \\
                  \le & O\left(
                  \frac{1}{T \Delta_j^2}\right) \quad,
    \end{array}
\end{equation}
which gives us $I \le O\left(
                  \frac{1}{T \Delta_j^2}\right)$. The last inequality uses the concentration bound for Gaussian distributions shown in Lemma~\ref{Gaussians}.
Then, we have 
$  \omega_2  \le \sum\limits_{t=1}^{T} O\left(
                  \frac{1}{T \Delta_j^2}\right) 
          \le  O\left(
                  \frac{1}{ \Delta_j^2}\right) $.

\paragraph{Upper bound $\omega_1$.} The proof is very similar to the proof for Lemma~2.14 in \citet{agrawalnear}. Let $L_{1,j} = \left\lceil \frac{288 \ln \left(T \Delta_j^2 + e^{32} \right)}{\Delta_j^2} \right\rceil$. Let  $\tau_s$ be the round when the $s$-th pull of arm $1$ occurs. The definition of $\tau_s$ means $\bm{1} \left\{J_t = 1 \right\}=0$ for all  $t \in \left\{ \tau_s+1, \dotsc, \tau_{s+1}-1 \right\}$. Set $\tau_0 = 0$.
We have
\begin{equation}
    \begin{array}{lll}
       \omega_1 & =  & \sum\limits_{t=1}^{T}  \mathbb{E} \left[ \bm{1} \left\{J_t = j, \mathcal{E}'_j(t)\right\}  \right]  \\
%         & = &  \sum\limits_{t=1}^{T}  \mathbb{E} \left[ \mathbb{E} \left[\bm{1} \left\{J_t = j, \mathcal{E}'_j(t)\right\} \mid \mathcal{F}_{t-1} \right] \right]  \\
         & = &  \sum\limits_{t=1}^{T}  \mathbb{E} \left[ \underbrace{\mathbb{P} \left\{J_t = j, \mathcal{E}'_j(t) \mid \mathcal{F}_{t-1} \right\} }_{\text{LHS in Lemma~\ref{temp 66}}} \right]  \\
          & \le &  \sum\limits_{t=1}^{T}  \mathbb{E} \left[ \underbrace{ \frac{\mathbb{P} \left\{ \widetilde{\mu}_1(t) \le y_j \mid \mathcal{F}_{t-1}  \right\}}{\mathbb{P} \left\{ \widetilde{\mu}_1(t)  > y_j \mid \mathcal{F}_{t-1}   \right\}}\mathbb{P} \left\{J_t = 1, \mathcal{E}'_j(t) \mid \mathcal{F}_{t-1}   \right\} }_{\text{RHS in Lemma~\ref{temp 66}}} \right]  \\
%            & = &  \sum\limits_{t=1}^{T}  \mathbb{E} \left[ \mathbb{E} \left[ \frac{\mathbb{P} \left\{ \widetilde{\mu}_1(t) \le y_j \mid \mathcal{F}_{t-1}  \right\}}{\mathbb{P} \left\{ \widetilde{\mu}_1(t)  > y_j \mid \mathcal{F}_{t-1}   \right\}} \bm{1}\left\{J_t = 1, \mathcal{E}'_j(t)   \right\} \mid \mathcal{F}_{t-1}  \right] \right]  \\
            & = &  \sum\limits_{t=1}^{T}  \mathbb{E} \left[  \frac{\mathbb{P} \left\{ \widetilde{\mu}_1(t) \le y_j \mid \mathcal{F}_{t-1}  \right\}}{\mathbb{P} \left\{ \widetilde{\mu}_1(t)  > y_j \mid \mathcal{F}_{t-1}   \right\}} \bm{1}\left\{J_t = 1, \mathcal{E}'_j(t)   \right\} \right]  \\
             & \le &  \sum\limits_{s=1}^{T}  \mathbb{E} \left[ \sum\limits_{t = \tau_s +1}^{\tau_{s+1}} \frac{\mathbb{P} \left\{ \widetilde{\mu}_1(t) \le y_j \mid \mathcal{F}_{t-1}  \right\}}{\mathbb{P} \left\{ \widetilde{\mu}_1(t)  > y_j \mid \mathcal{F}_{t-1}   \right\}} \bm{1}\left\{J_t = 1, \mathcal{E}'_j(t)   \right\} \right]  \\
               & = & \underbrace{ \sum\limits_{s=1}^{T}  \mathbb{E} \left[ \sum\limits_{t = \tau_s +1}^{\tau_{s+1} -1} \frac{\mathbb{P} \left\{ \widetilde{\mu}_1(t) \le y_j \mid \mathcal{F}_{t-1}  \right\}}{\mathbb{P} \left\{ \widetilde{\mu}_1(t)  > y_j \mid \mathcal{F}_{t-1}   \right\}} \bm{1}\left\{J_t = 1, \mathcal{E}'_j(t)   \right\} \right] }_{=0}+ \sum\limits_{s=1}^{T}  \mathbb{E} \left[  \frac{\mathbb{P} \left\{ \widetilde{\mu}_1(\tau_{s+1}) \le y_j \mid \mathcal{F}_{\tau_{s+1}-1}  \right\}}{\mathbb{P} \left\{ \widetilde{\mu}_1(\tau_{s+1})  > y_j \mid \mathcal{F}_{\tau_{s+1}-1}   \right\}} \bm{1} \left\{J_{\tau_{s+1}} = 1, \mathcal{E}'_j(\tau_{s+1})   \right\} \right]   \\
                & = &   \sum\limits_{s=1}^{T}  \mathbb{E} \left[  \frac{\mathbb{P} \left\{ \widetilde{\mu}_1(\tau_{s+1}) \le y_j \mid \mathcal{F}_{\tau_{s+1}-1}  \right\}}{\mathbb{P} \left\{ \widetilde{\mu}_1(\tau_{s+1})  > y_j \mid \mathcal{F}_{\tau_{s+1}-1}   \right\}} \bm{1} \left\{J_{\tau_{s+1}} = 1, \mathcal{E}'_j(\tau_{s+1})   \right\} \right]   \\
                     & \le &   \sum\limits_{s=1}^{T}  \mathbb{E} \left[  \frac{\mathbb{P} \left\{ \widetilde{\mu}_1(\tau_{s+1}) \le y_j \mid \mathcal{F}_{\tau_{s+1}-1}  \right\}}{\mathbb{P} \left\{ \widetilde{\mu}_1(\tau_{s+1})  > y_j \mid \mathcal{F}_{\tau_{s+1}-1}   \right\}}\right]   \\
                      & = &   \sum\limits_{s=1}^{L_{1,j}}  \underbrace{\mathbb{E} \left[  \frac{\mathbb{P} \left\{ \widetilde{\mu}_1(\tau_{s+1}) \le y_j \mid \mathcal{F}_{\tau_{s+1}-1}  \right\}}{\mathbb{P} \left\{ \widetilde{\mu}_1(\tau_{s+1})  > y_j \mid \mathcal{F}_{\tau_{s+1}-1}   \right\}}\right] }_{\text{Constant}}+ \sum\limits_{s=L_{1,j} + 1}^{T}  \underbrace{\mathbb{E} \left[  \frac{\mathbb{P} \left\{ \widetilde{\mu}_1(\tau_{s+1}) \le y_j \mid \mathcal{F}_{\tau_{s+1}-1}  \right\}}{\mathbb{P} \left\{ \widetilde{\mu}_1(\tau_{s+1})  > y_j \mid \mathcal{F}_{\tau_{s+1}-1}   \right\}}\right] }_{\le O \left(\frac{1}{T  \Delta_j^2} \right)}  \\
                      & \le & O \left(L_{1,j} \right) + O \left( \frac{1}{\Delta_j^2}\right) \\
                      & \le & O \left( \frac{\ln \left(T\Delta_j^2 + e^{32} \right)}{\Delta_j^2}\right) \quad,        \end{array}
\end{equation}
where the second-to-last inequality uses Lemma~\ref{temp 101}, and the last inequality follows from the definition of $L_{1,j}$.

Now, we plug the upper bounds on $\omega_1, \omega_2$ and $\omega_3$ into (\ref{temp 102}). We have
\begin{equation}
    \mathbb{E}\left[O_j(T) \right] \le O \left(\frac{\ln(T\Delta_j^2)}{\Delta_j^2} \right) + O \left( \frac{\ln \left(T\Delta_j^2 + e^{32} \right)}{\Delta_j^2}\right) + O \left( \frac{1}{\Delta_j^2} \right) + O \left( \frac{1}{\Delta_j^2} \right) \le O \left( \frac{\ln \left(T\Delta_j^2 + e^{32} \right)}{\Delta_j^2}\right) \quad,
    \label{Hoodie 1}
\end{equation}
which gives
\begin{equation}
\begin{array}{lll}
      \mathcal{R}(T) &\le& \sum\limits_{j \in \mathcal{A}: \Delta_j >0} O \left( \frac{\ln \left(T\Delta_j^2 + e^{32} \right)}{\Delta_j}\right)\\
      &\le& \sum\limits_{j \in \mathcal{A}: \Delta_j >0} O \left( \frac{\ln \left(T + e^{32} \right)}{\Delta_j}\right)\\
        &\le& \sum\limits_{j \in \mathcal{A}: \Delta_j >0} O \left( \frac{\ln \left(2T  e^{32} \right)}{\Delta_j}\right)\\
        &=& \sum\limits_{j \in \mathcal{A}: \Delta_j >0} \left( O \left( \frac{\ln \left(T   \right)}{\Delta_j}\right) + O \left( \frac{\ln \left(2 e^{32}  \right)}{\Delta_j}\right) \right) \\
         &=& \sum\limits_{j \in \mathcal{A}: \Delta_j >0} O \left( \frac{\ln \left(T   \right)}{\Delta_j}\right) \quad,
      \end{array}
\end{equation}
which concludes the proof of Theorem~\ref{theorem: bandit, O-TS-MDP, dependent}.
\end{proof}






\begin{lemma}
(Lemma~2.13 in \citet{agrawalnear}). We have
\begin{equation}
    \mathbb{E} \left[  \frac{\mathbb{P} \left\{ \widetilde{\mu}_1(\tau_{s+1}) \le y_j \mid \mathcal{F}_{\tau_{s+1}-1}  \right\}}{\mathbb{P} \left\{ \widetilde{\mu}_1(\tau_{s+1})  > y_j \mid \mathcal{F}_{\tau_{s+1}-1}   \right\}}\right]  \le \left\{ \begin{array}{ll}
         e^{64}+5,  & \forall s \\
         \frac{5}{T \Delta_j^2}, & s > L_{1,j} \quad.
    \end{array}
     \right.
\end{equation}
\label{temp 101}
\end{lemma}










% \begin{lemma}
%     (First-order stochastic dominance). In any round $t$, for any $a \in \mathbb{R}$, we have $\mathbb{P} \left\{\widetilde{\mu}_1'(t) \le a \mid \mathcal{F}_{t-1} = F_{t-1}\right\} \le 
%     \mathbb{P} \left\{\widetilde{\mu}_1(t) \le a \mid \mathcal{F}_{t-1} = F_{t-1} \right\}$, where $\widetilde{\mu}_1(t) \sim \mathcal{N} \left(\widehat{\mu}_{1, O_1(t-1)}, \frac{3\ln(t)}{O_{1}(t-1)} \right)$.


    

%     Based on the definition of first-order stochastic dominance, we just need to show 
% \begin{equation}
%     \mathbb{P} \left\{\widetilde{\mu}' \le a \right\} \le 
%     \mathbb{P} \left\{\widetilde{\mu} \le a \right\} , \forall a \in \mathbb{R} \quad,
% \end{equation}
% and $\exists a \in \mathbb{R}$ such that
% \begin{equation}
%     \mathbb{P} \left\{\widetilde{\mu}' \le a \right\} < 
%     \mathbb{P} \left\{\widetilde{\mu} \le a \right\} \quad.
% \end{equation}


%     \item When $a < \overline{\mu}$, we have
%     \begin{equation}
%         \mathbb{P} \left\{\widetilde{\mu}' \le a \right\} =0 < \mathbb{P} \left\{\widetilde{\mu} \le a \right\} \quad;
%     \end{equation}

%     \item When $a \ge \overline{\mu}$, we have
%        \begin{equation}
%         \mathbb{P} \left\{\widetilde{\mu}' \le a \right\} = 1- \mathbb{P} \left\{\widetilde{\mu}' > a \right\} =1- \mathbb{P} \left\{\widetilde{\mu} > a \right\} = \mathbb{P} \left\{\widetilde{\mu} \le a \right\} \quad,
%     \end{equation}
% which concludes the proof of the first claim.
% \end{lemma}


\begin{lemma}
(Lemma~2.8 in \citet{agrawalnear}).
    For all $t$ and all instantiations $F_{t-1}$ of $\mathcal{F}_{t-1}$, we have
    \begin{equation}
    \begin{array}{lll}

        \mathbb{P} \left\{J_t = j, \mathcal{E}'_j(t) \mid \mathcal{F}_{t-1} = F_{t-1}  \right\} & \le & \frac{\mathbb{P} \left\{ \widetilde{\mu}'_1(t) \le y_j \mid \mathcal{F}_{t-1} = F_{t-1}  \right\}}{\mathbb{P} \left\{ \widetilde{\mu}'_1(t)  > y_j \mid \mathcal{F}_{t-1} = F_{t-1}  \right\}}\mathbb{P} \left\{J_t = 1, \mathcal{E}'_j(t) \mid \mathcal{F}_{t-1} = F_{t-1}  \right\}\quad.
            \end{array}
    \end{equation}

 \label{temp 65}    
 
\end{lemma}




\begin{lemma}
For all $t$ and all instantiations $F_{t-1}$ of $\mathcal{F}_{t-1}$, we have
    \begin{equation}
    \begin{array}{lll}

        \mathbb{P} \left\{J_t = j, \mathcal{E}'_j(t) \mid \mathcal{F}_{t-1} = F_{t-1}  \right\} & \le & \frac{\mathbb{P} \left\{ \widetilde{\mu}_1(t) \le y_j \mid \mathcal{F}_{t-1} = F_{t-1}  \right\}}{\mathbb{P} \left\{ \widetilde{\mu}_1(t)  > y_j \mid \mathcal{F}_{t-1} = F_{t-1}  \right\}}\mathbb{P} \left\{J_t = 1, \mathcal{E}'_j(t) \mid \mathcal{F}_{t-1} = F_{t-1}  \right\}\quad.
            \end{array}
    \end{equation}
\label{temp 66} 
\end{lemma}

\begin{proof}[Proof of Lemma~\ref{temp 66}]

From Lemma~\ref{temp 65}, we have
        \begin{equation}
    \begin{array}{lll}

        \mathbb{P} \left\{J_t = j, \mathcal{E}'_j(t) \mid \mathcal{F}_{t-1} = F_{t-1}  \right\} & \le & \frac{\mathbb{P} \left\{ \widetilde{\mu}'_1(t) \le y_j \mid \mathcal{F}_{t-1} = F_{t-1}  \right\}}{\mathbb{P} \left\{ \widetilde{\mu}'_1(t)  > y_j \mid \mathcal{F}_{t-1} = F_{t-1}  \right\}}\mathbb{P} \left\{J_t = 1, \mathcal{E}'_j(t) \mid \mathcal{F}_{t-1} = F_{t-1}  \right\}\quad.
            \end{array}
    \end{equation}
    Now, we construct a lower bound for $\mathbb{P} \left\{ \widetilde{\mu}'_1(t)  > y_j \mid \mathcal{F}_{t-1} = F_{t-1}  \right\}$ to complete the proof. We have
    \begin{equation}
        \begin{array}{ll}
            & \mathbb{P} \left\{ \widetilde{\mu}'_1(t)  > y_j \mid \mathcal{F}_{t-1} = F_{t-1}  \right\} \\ 
            = & \mathbb{P} \left\{ \max \left\{\widetilde{\mu}_1(t), \widehat{\mu}_{1, O_1(t-1)} \right\}   > y_j \mid \mathcal{F}_{t-1} = F_{t-1}  \right\} \\
              \ge & \mathbb{P} \left\{ \widetilde{\mu}_1(t)   > y_j \mid \mathcal{F}_{t-1} = F_{t-1}  \right\} \quad,
        \end{array}
    \end{equation}
    which implies 
    \begin{equation}
        \begin{array}{lll}
             \frac{\mathbb{P} \left\{ \widetilde{\mu}'_1(t) \le y_j \mid \mathcal{F}_{t-1} = F_{t-1}  \right\}}{\mathbb{P} \left\{ \widetilde{\mu}'_1(t)  > y_j \mid \mathcal{F}_{t-1} = F_{t-1}  \right\}} & \le & \frac{\mathbb{P} \left\{ \widetilde{\mu}_1(t) \le y_j \mid \mathcal{F}_{t-1} = F_{t-1}  \right\}}{\mathbb{P} \left\{ \widetilde{\mu}_1(t)  > y_j \mid \mathcal{F}_{t-1} = F_{t-1}  \right\}}\quad. 
        \end{array}
    \end{equation}\end{proof}

\newpage
\subsection{Proofs for Theorem~\ref{theorem: bandit, O-TS-MDP, independent}}\label{app: O-TS independent new}

\begin{proof}[Proof of Theorem~\ref{theorem: bandit, O-TS-MDP, independent}]

Set $\Delta = \sqrt{\frac{A \ln(A)}{T}}$. Let the set $\Phi = \left\{j \in \mathcal{A}: 0 < \Delta_j < \Delta \right\}$ collect all the arms whose mean reward gap is smaller than $\Delta$. Let $\overline{\Phi} = \left\{j \in \mathcal{A}: \Delta_j \ge \Delta \right\}$.
Then, we have
\begin{equation}
    \begin{array}{lll}
         \mathcal{R}(T) & = &  \sum\limits_{t=1}^{T}  \mathbb{E} \left[\mu_1 - \mu_{J_t}  \right]  \\ \\
         & =  &  \sum\limits_{t=1}^{T}  \mathbb{E} \left[\left(\mu_1 - \mu_{J_t} \right) \bm{1} \left\{J_t \in \Phi \right\} \right] + \sum\limits_{t=1}^{T}  \mathbb{E} \left[\left(\mu_1 - \mu_{J_t} \right) \bm{1} \left\{J_t \in \overline{\Phi} \right\} \right]    
         \\
         & \le & T\cdot \Delta + \sum\limits_{j \in \mathcal{A}: \Delta_j \ge \Delta} \underbrace{\sum\limits_{t=1}^{T}  \mathbb{E} \left[ \bm{1} \left\{J_t = j \right\}  \right] }_{\mathbb{E} \left[O_j(T) \right]} \cdot \Delta_j  \\
         & \le^{(a)} & T\cdot \Delta +\sum\limits_{j \in \mathcal{A}: \Delta_j \ge \Delta} O \left( \frac{\ln \left(T\Delta_j^2 + e^{32} \right)}{\Delta_j}\right) \\
          & \le^{(b)} & T\cdot \Delta +\sum\limits_{j \in \mathcal{A}: \Delta_j \ge \Delta} O \left( \frac{\ln \left(2 T\Delta_j^2 \cdot e^{32} \right)}{\Delta_j}\right) \\
          & = & T\cdot \Delta +\sum\limits_{j \in \mathcal{A}: \Delta_j \ge \Delta} O \left( \frac{\ln \left( T\Delta_j^2 \cdot e^2  \right)}{\Delta_j}\right) +\sum\limits_{j \in \mathcal{A}: \Delta_j \ge \Delta} O \left( \frac{\ln \left(2  \cdot e^{30} \right)}{\Delta_j}\right) \\
           & \le^{(c)} & T\cdot \Delta +\sum\limits_{j \in \mathcal{A}: \Delta_j \ge \Delta} O \left( \frac{\ln \left( T \Delta^2 e^2  \right)}{\Delta}\right) +\sum\limits_{j \in \mathcal{A}: \Delta_j \ge \Delta} O \left( \frac{\ln \left(2  \cdot e^{30} \right)}{\Delta}\right)  \\
               & \le & T\cdot \Delta +A \cdot O \left( \frac{\ln \left( T \Delta^2 e^2  \right)}{\Delta}\right) +A \cdot  O \left( \frac{\ln \left(2  \cdot e^{30} \right)}{\Delta}\right)  \\
                & = & T\cdot \Delta +A \cdot  O \left( \frac{\ln \left( A\ln(A) e^2  \right)}{\sqrt{\frac{A\ln(A)}{T}}}\right) +A \cdot O \left( \frac{\ln \left(2  \cdot e^{30} \right)}{\sqrt{\frac{A\ln(A)}{T}}}\right) \\
             & \le^{(d)} & T\cdot \Delta +A \cdot O \left( \frac{\ln \left( A^2 \cdot e^2  \right)}{\sqrt{\frac{A\ln(A)}{T}}}\right) +O \left(\sqrt{AT} \right)  \\
                & = & T\cdot \Delta +A \cdot O \left( \frac{\ln \left( A^2  \right)}{\sqrt{\frac{A\ln(A)}{T}}}\right) +A \cdot O \left( \frac{\ln \left(  e^2  \right)}{\sqrt{\frac{A\ln(A)}{T}}}\right) +O \left(\sqrt{AT} \right)  \\
         & \le & \sqrt{AT\ln(A)} + O \left( \sqrt{AT\ln(A)}\right) + O \left( \sqrt{AT}\right) + O \left( \sqrt{AT}\right) \\
         & = & O \left( \sqrt{AT\ln(A)}\right)\quad,
    \end{array}
\end{equation}
which concludes the proof.

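Inequality (a) uses (\ref{Hoodie 1}). Inequality (b) uses the fact that $x+y \le 2xy$ when $x, y > 1$, applied with $x = T\Delta_j^2$ and $y = e^{32}$; here $T\Delta_j^2 \ge T\Delta^2 = A\ln(A) > 1$ for $A \ge 2$. Inequality (c) uses the fact that $f(x) = \frac{\ln(Tx^2e^2)}{x}$ is a decreasing function when $x > \sqrt{\frac{1}{T}}$, together with $\Delta_j \ge \Delta > \sqrt{\frac{1}{T}}$. Inequality (d) uses the fact that $
\ln(x) \le x$ when $x > 1$.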
\end{proof}

\newpage
\subsection{Proofs for Theorem~\ref{O-TS-MDP dependent regret}}\label{app: bandit 2}

\begin{proof}[Proof of Theorem~\ref{O-TS-MDP dependent regret}]
%We first present the analysis of the problem-dependent regret bound. 
Since $\text{O-TS-Bandit}^+$ is an optimistic learning algorithm, the regret analysis closely follows the proof for the UCB1 policy of \citet{auer2002finite}. The regret can be expressed as
\begin{equation}
    \begin{array}{l}
         \mathcal{R}(T)  = 
         
         \sum\limits_{t=1}^{T}  \mathbb{E} \left[\mu_1 - \mu_{J_t}  \right]  
         
        =  \sum\limits_{j \in \mathcal{A}: \Delta_j > 0} \underbrace{\sum\limits_{t=1}^{T}  \mathbb{E} \left[ \bm{1} \left\{J_t = j \right\}  \right] }_{\mathbb{E} \left[O_j(T) \right]} \cdot \Delta_j  \quad.
    \end{array}
\end{equation}
Now, for a fixed sub-optimal arm $j$, we let $L_j = \frac{25\ln(T)}{\Delta_j^2}$. Then, we do the following decomposition. We have
\begin{equation}
    \begin{array}{lll}
         \mathbb{E} \left[O_j(T) \right] & =  & \sum\limits_{t=1}^{T}  \mathbb{E} \left[ \bm{1} \left\{J_t = j \right\}  \right] \\
         & \le & L_j + \sum\limits_{t=1}^{T}  \mathbb{E} \left[ \bm{1} \left\{J_t = j, O_j(t-1) \ge L_j \right\}  \right] \\
         & \le & L_j + \sum\limits_{t=1}^{T}  \mathbb{E} \left[ \bm{1} \left\{\widetilde{\mu}'_j(t) \ge \widetilde{\mu}'_1(t) , O_j(t-1) \ge L_j\right\}  \right] \\
         & \le & L_j + \underbrace{\sum\limits_{t=1}^{T}  \mathbb{E} \left[ \bm{1} \left\{\widetilde{\mu}'_1(t) \le \mu_1 \right\}  \right]}_{\omega_1} 
 + \underbrace{ \sum\limits_{t=1}^{T}  \mathbb{E} \left[ \bm{1} \left\{\widetilde{\mu}'_j(t) \ge \mu_j + \sqrt{\frac{24\ln(t)}{O_j(t-1)}}, O_j(t-1) \ge L_j \right\}  \right] }_{\omega_2}\\
         &+ &
           \underbrace{ \sum\limits_{t=1}^{T}  \mathbb{E} \left[ \bm{1} \left\{ \sqrt{\frac{24\ln(t)}{O_j(t-1)}} > \Delta_j, O_j(t-1) \ge L_j \right\}  \right]}_{=0}\quad.
    \end{array}
\end{equation}
\paragraph{Under-estimation of the optimal arm.} We have
\begin{equation}
    \begin{array}{lll}
       \omega_1  & = & \sum\limits_{t=1}^{T}  \mathbb{E} \left[ \bm{1} \left\{\widetilde{\mu}'_1(t) \le \mu_1 \right\}  \right]  \\
         & \le & \sum\limits_{t=1}^{T}  \mathbb{E} \left[ \bm{1} \left\{\overline{\mu}_1(t) \le \mu_1\right\}  \right]  \\
           & = & \sum\limits_{t=1}^{T}  \mathbb{E} \left[ \bm{1} \left\{\widehat{\mu}_{1, O_1(t-1)} + \sqrt{\frac{1.5\ln(t)}{O_1(t-1)}} \le \mu_1\right\}  \right]  \\
             & \le & \sum\limits_{t=1}^{T} \sum\limits_{s=1}^{t-1} \mathbb{E} \left[ \bm{1} \left\{\widehat{\mu}_{1, s} + \sqrt{\frac{1.5\ln(t)}{s}} \le \mu_1\right\}  \right]  \\
             & = & \sum\limits_{t=1}^{T} \sum\limits_{s=1}^{t-1} \mathbb{P} \left\{\widehat{\mu}_{1, s} + \sqrt{\frac{1.5\ln(t)}{s}} \le \mu_1\right\}   \\
             & \le & \sum\limits_{t=1}^{T} \sum\limits_{s=1}^{t-1} O \left( \frac{1}{t^3}\right) \\
             & \le & O(1) \quad.
    \end{array}
\end{equation}

\paragraph{Over-estimation of the sub-optimal arm.} Define event $\mathcal{E}_j(t) := \left\{\left|\widehat{\mu}_{j, O_j(t-1)} - \mu_j \right| \le \sqrt{\frac{1.5\ln(t)}{O_j(t-1)}} \right\}$. Let $\mathcal{F}_{t-1}$ collect all the history information by the end of round $t-1$. Then, we have
\begin{equation}
    \begin{array}{lll}
        \omega_2 & = & \sum\limits_{t=1}^{T}  \mathbb{E} \left[ \bm{1} \left\{\widetilde{\mu}'_j(t) \ge \mu_j + \sqrt{\frac{24\ln(t)}{O_j(t-1)}}, O_j(t-1) \ge L_j \right\}  \right]  \\
         & = & \sum\limits_{t=1}^{T}  \mathbb{E} \left[ \bm{1} \left\{\widetilde{\mu}'_j(t) - \overline{\mu}_{j}(t) \ge \mu_j + \sqrt{\frac{24\ln(t)}{O_j(t-1)}}  - \overline{\mu}_{j}(t), O_j(t-1) \ge L_j \right\}  \right]  \\
          & \le & \sum\limits_{t=1}^{T}  \mathbb{E} \left[ \bm{1} \left\{\widetilde{\mu}'_j(t) - \overline{\mu}_{j}(t) \ge \mu_j + \sqrt{\frac{24\ln(t)}{O_j(t-1)}}  - \overline{\mu}_{j}(t), O_j(t-1) \ge L_j , \mathcal{E}_j(t)\right\}  \right] + \sum\limits_{t=1}^{T}  \mathbb{E} \left[ \bm{1} \left\{\overline{ \mathcal{E}_j(t)}\right\}  \right]  \\
             & = & \sum\limits_{t=1}^{T}  \mathbb{E} \left[ \bm{1} \left\{\widetilde{\mu}'_j(t) - \overline{\mu}_{j}(t) \ge \mu_j + \sqrt{\frac{24\ln(t)}{O_j(t-1)}} 
 - \left( \sqrt{\frac{1.5\ln(t)}{O_j(t-1)}} + \widehat{\mu}_{j, O_j(t-1)} \right), O_j(t-1) \ge L_j , \mathcal{E}_j(t)\right\}  \right] + \sum\limits_{t=1}^{T}  \mathbb{E} \left[ \bm{1} \left\{\overline{ \mathcal{E}_j(t)}\right\}  \right]  \\
 & \le & \sum\limits_{t=1}^{T}  \mathbb{E} \left[ \bm{1} \left\{\widetilde{\mu}'_j(t) - \overline{\mu}_{j}(t) \ge \sqrt{\frac{24\ln(t)}{O_j(t-1)}} 
 -  2 \sqrt{\frac{1.5\ln(t)}{O_j(t-1)}}  , O_j(t-1) \ge L_j \right\}  \right] + \sum\limits_{t=1}^{T}  \mathbb{E} \left[ \bm{1} \left\{\overline{ \mathcal{E}_j(t)}\right\}  \right]  \\
  & \le & \sum\limits_{t=1}^{T}  \mathbb{E} \left[ \bm{1} \left\{\widetilde{\mu}'_j(t) - \overline{\mu}_{j}(t) \ge \sqrt{\frac{6\ln(t)}{O_j(t-1)}} \right\}  \right] + \sum\limits_{t=1}^{T}  \mathbb{E} \left[ \bm{1} \left\{\overline{ \mathcal{E}_j(t)}\right\}  \right]  \\
  & = & \sum\limits_{t=1}^{T}  \mathbb{E} \left[  \mathbb{P} \left\{\widetilde{\mu}'_j(t) - \overline{\mu}_{j}(t) \ge \sqrt{\frac{6\ln(t)}{O_j(t-1)}} \mid \mathcal{F}_{t-1}\right\}  \right] + \underbrace{\sum\limits_{t=1}^{T}  \mathbb{E} \left[ \bm{1} \left\{\overline{ \mathcal{E}_j(t)}\right\}  \right] }_{= O(1)} \\
  & = & \sum\limits_{t=1}^{T}  \mathbb{E} \left[  \mathbb{P} \left\{\widetilde{\mu}_j(t) \ge  \overline{\mu}_{j}(t) + \sqrt{\frac{6\ln(t)}{O_j(t-1)}} \mid \mathcal{F}_{t-1}\right\}  \right] + O(1)\\
  & \le &  \sum\limits_{t=1}^{T}   O \left(e^{-3\ln(t)} \right) + O(1)\\
  & \le & O(1) \quad,
  % & = & \sum\limits_{t=1}^{T}  \mathbb{E} \left[ 1- \mathbb{P} \left\{\widetilde{\mu}'_j(t) < \overline{\mu}_{j}(t) + \sqrt{\frac{6\ln(t)}{O_j(t-1)}} \mid \mathcal{F}_{t-1}\right\}  \right] + \sum\limits_{t=1}^{T}  \mathbb{E} \left[ \bm{1} \left\{\overline{ \mathcal{E}_j(t)}\right\}  \right]  \\
    \end{array}
\end{equation}
where the second-to-last inequality uses the concentration bound for Gaussian distributions, and
\begin{equation*}
\begin{array}{l}
\sum\limits_{t=1}^{T}  \mathbb{E} \left[ \bm{1} \left\{\overline{ \mathcal{E}_j(t)}\right\}  \right]    
        =   \sum\limits_{t=1}^{T}  \mathbb{P}\left\{\left|\widehat{\mu}_{j, O_j(t-1)} - \mu_j \right| > \sqrt{\frac{1.5\ln(t)}{O_j(t-1)}} \right\} 
        \le  \sum\limits_{t=1}^{T} \sum\limits_{s=1}^{t-1}  \mathbb{P}\left\{\left|\widehat{\mu}_{j, s} - \mu_j \right| > \sqrt{\frac{1.5\ln(t)}{s}} \right\} 
        \le O(1).
        \end{array}
\end{equation*}

Then, we have
\begin{equation}
    \begin{array}{lll}
         \mathcal{R}(T) & = &
         
 \sum\limits_{j \in \mathcal{A}: \Delta_j > 0} \underbrace{\sum\limits_{t=1}^{T}  \mathbb{E} \left[ \bm{1} \left\{J_t = j \right\}  \right] }_{\mathbb{E} \left[O_j(T) \right]} \cdot \Delta_j  \\
       & \le &  \sum\limits_{j \in \mathcal{A}: \Delta_j > 0} \left( L_j + O(1) \right) \cdot \Delta_j \\
        & \le &  \sum\limits_{j \in \mathcal{A}: \Delta_j > 0} O \left(\frac{\ln(T)}{\Delta_j} \right) \quad,
    \end{array}
\end{equation}
which concludes the proof. 

% We have
% \begin{equation}
%     \begin{array}{ll}
%        & \int_{\overline{\mu} +\sqrt{\frac{6\ln(t)}{O_j(t-1)}}}^{+\infty}   \left(\phi \left(x; \widehat{\mu}, \sigma^2 \right) +  \Phi \left(\overline{\mu}; \widehat{\mu}, \sigma^2 \right) \delta(x-\overline{\mu})  \right) dx \\
%     = &    \int_{\overline{\mu} +\sqrt{\frac{6\ln(t)}{O_j(t-1)}}}^{+\infty}  \phi \left(x; \widehat{\mu}, \sigma^2 \right)  dx \\
%     = & 
%     \end{array}
% \end{equation}
\end{proof}

\subsection{Proofs for Theorem~\ref{theorem: bandit, O-TS-MDP+, independent}}\label{app: independent}
%As Theorem~\ref{theorem: bandit, O-TS-MDP, independent} and Theorem~\ref{theorem: bandit, O-TS-MDP+, independent} have the same problem-independent regret bound and can use the same way to do the proof, we only present the proof for one of them.
\begin{proof}[Proof of  
 Theorem~\ref{theorem: bandit, O-TS-MDP+, independent}]
    Set $\Delta = \sqrt{\frac{A \ln(T)}{T}}$. Let the set $\Phi = \left\{j \in \mathcal{A}: 0 < \Delta_j < \Delta \right\}$ collect all the arms whose mean reward gap is smaller than $\Delta$. Let $\overline{\Phi} = \left\{j \in \mathcal{A}: \Delta_j \ge \Delta \right\}$.
Then, we have
\begin{equation}
    \begin{array}{lll}
         \mathcal{R}(T) & = &  \sum\limits_{t=1}^{T}  \mathbb{E} \left[\mu_1 - \mu_{J_t}  \right]  \\ \\
         & =  &  \sum\limits_{t=1}^{T}  \mathbb{E} \left[\left(\mu_1 - \mu_{J_t} \right) \bm{1} \left\{J_t \in \Phi \right\} \right] + \sum\limits_{t=1}^{T}  \mathbb{E} \left[\left(\mu_1 - \mu_{J_t} \right) \bm{1} \left\{J_t \in \overline{\Phi} \right\} \right]    
         \\
         & \le & T\cdot \Delta + \sum\limits_{j \in \mathcal{A}: \Delta_j \ge \Delta} \underbrace{\sum\limits_{t=1}^{T}  \mathbb{E} \left[ \bm{1} \left\{J_t = j \right\}  \right] }_{\mathbb{E} \left[O_j(T) \right]} \cdot \Delta_j  \\
         & \le & T\cdot \Delta +\sum\limits_{j \in \mathcal{A}: \Delta_j \ge \Delta} O \left(\frac{\ln(T)}{\Delta_j} \right) \\
         & \le & \sqrt{AT\ln(T)} + O \left(\frac{A\ln(T)}{\Delta} \right) \\
         & = & O \left( \sqrt{AT\ln(T)}\right)\quad,
    \end{array}
\end{equation}
which concludes the proof.
\end{proof}
 \subsection{Additional Experimental Results} \label{app: exp} 
 We run additional experiments with different random seeds, state sizes $S \in \{5, 20, 50\}$, action size $A = 3$, and $H=10$. We generate the underlying parameters of the MDPs in the same way as described in Section~\ref{sec: expe}. All experimental results share similar trends. We also include the performance of the UCB-based method UCB-VI here. From Figures~\ref{s5_new}, \ref{s20_new}, and~\ref{s50_new}, we can see that SSR-Bernstein still performs the best. All three Thompson Sampling-like algorithms perform similarly and better than UCB-VI.  %The experimental results are consistent across multiple runs with different random initialization. 

% \begin{figure}[h]
% \centering
% \includegraphics[scale=0.8]{s20.pdf}
% \caption{Empirical performance for 20 States}
% \label{s20}
% \end{figure}

% \begin{figure}[h]
% \centering
% \includegraphics[scale=0.8]{s50.pdf}
% \caption{Empirical performance for 50 States}
% \label{s50}
% \end{figure}

\begin{figure}[h]
\centering
\includegraphics[scale=0.8]{s5_new.pdf}
\caption{Empirical performance for 5 states, including UCB-VI}
\label{s5_new}
\end{figure}

\begin{figure}[h]
\centering
\includegraphics[scale=0.8]{s20_new.pdf}
\caption{Empirical performance for 20 states, including UCB-VI}
\label{s20_new}
\end{figure}

\begin{figure}[h]
\centering
\includegraphics[scale=0.8]{s50_new.pdf}
\caption{Empirical performance for 50 states, including UCB-VI}
\label{s50_new}
\end{figure}
\end{document}
