\section{Regret Analysis}\label{app:regret}
\textbf{Regret decomposition:} 
Recall the regret~\eqref{def:regret} decomposition of $\algo$,
\begin{align}
    \cR(T;\algo) &= T J\ust_{\cM} - \sum_{k=1}^{K(T)}{\sum_{t = \tau_k}^{\tau_{k+1}-1}{r(s_t,a_t)}} \notag\\
    &= \underbrace{\sum_{k=1}^{K(T)}{H_k \br{J\ust_{\cM} - J_{\cM}(\phi_k)}}}_{(a)} + \underbrace{\sum_{k=1}^{K(T)}{\br{H_k~ J_{\cM}(\phi_k) - \sum_{t=\tau_k}^{\tau_{k+1}-1}{r(s_t,\phi_k(s_t))}}}}_{(b)}.\label{eq:decompregret}
\end{align}

The term (a) captures the regret arising due to the gap between the optimal value of the average reward and the average reward of the policies $\{\phi_k\}$ that are actually played in different episodes, while (b) captures the sub-optimality arising since the distribution of the induced Markov chain does not reach the stationary distribution in finite time.~The terms (a) and (b) are bounded separately.

\textbf{Bounding} (a): 
This term can be further decomposed into the sum of the regrets arising due to playing policies from the sets $\Phi\uc{2^{-i}}$, for $i = 1, 2, \ldots, \ceil{\log{\br{1/\eps}}}$, and the regret arising from playing all $\eps$-optimal policies.~To bound the regret arising due to policies from $\Phi\uc{2^{-i}}$, we count the number of timesteps in which policies from $\Phi\uc{2^{-i}}$ are played, and then multiply it by $2^{-i+1}$.~We then add these regret terms from $i=1$ to $\ceil{\log{\br{1/\eps}}}$. Note that the cumulative regret arising from playing the set of $\eps$-optimal policies is upper-bounded by $\eps T$.~Recall that at the beginning of the $k$-th episode, \algo~solves $\cM^+_{\tau_k}$ with the accuracy parameter set equal to $\frac{1}{\sqrt{T}}$.~This ``loss of accuracy'' as compared to the case where \algo~could have solved $\cM^+_{\tau_k}$ accurately at the beginning of every episode, leads to an additional term in the upper-bound of (a).~From Lemma~\ref{lem:conv_evi}, the difference between the two solutions is at most $\frac{1}{\sqrt{T}}$ for each episode, hence this term can be upper-bounded as $\sqrt{T}$.~Hence, we bound (a) by firstly considering that~\algo~solves $\cM^+_{\tau_k}$ for the optimal policy (with complete accuracy), and then add $\sqrt{T}$ to obtain the upper-bound of term (a).

The regret arising due to playing policies from the set $\Phi\uc{2^{-i}}$ is bounded as follows.~Lemma~\ref{lem:keycell} proves the existence of a key cell in every episode on the set $\cG_1$.~Its proof relies crucially on Lemma~\ref{lem:gap_phi} and on the properties of the index of policies that are derived in Section~\ref{app:prop_emdp}.~Lemma~\ref{lem:lb_num_visit} gives a lower-bound of the number of plays of a key cell in any episode by \algo~using Lemma~\ref{lem:keycell}, Corollary~\ref{cor:G_2}, and Lemma~\ref{lem:bdd_epi_tool}.~Next, Lemma~\ref{lem:bdd_Phi_play} establishes an upper-bound on the number of timesteps when policies from $\Phi\uc{2^{-i}}$ are played. This upper-bound multiplied by $2^{-i+1}$, is the regret arising from playing policies from $\Phi\uc{2^{-i}}$.~Next, we derive an important property of the policy $\phi \in \Phi_{SD}$ that is played in the $k$-th episode. This is used to upper-bound the number of plays of sub-optimal policies.
\begin{lemma}\label{lem:keycell}
    Consider a sample path from the set $\cG_1$~\eqref{def:G_1}.~For each $k=1,2,\ldots$, there exists at least one $s \in \cS$ (where $s$ could vary with $k$, and here we are suppressing dependence upon $k$) such that 
    \begin{align*}
        &\diamc{q\inv_{\tau_k}(s,\phi_k(s))} \geq \frac{1}{3 C_{ub}} \max \left\{\gap{s,\phi_k(s)}, C_{ub}~ \diam{\tau_k}{\phi_k}\right\}, \\
    \mbox{ and }&\mu\uc{\infty}_{\phi_k,p}(\pi_\cS(q\inv_{\tau_k}(s,\phi_k(s)))) \geq (\diam{\tau_k}{\phi_k} / 3)^{d_\cS + 1}.
    \end{align*}
    Such a $q\inv_{\tau_k}(s,\phi_k(s))$ is called a key cell for the $k$-th episode.
\end{lemma}
\begin{proof}
     Let us fix $k \in \bN$ and a policy $\phi \in \Phi_{\tau_k}$. Let $\bar{\phi}$ be the unique continuous extension of $\phi$ as defined in \eqref{def:pol_ext}.~We will first show that if
    \begin{align}\label{cond:noplayphi_1}
        \diam{\tau_k}{\bar{\phi}} \leq \Delta(\bar{\phi})/C_{ub},
    \end{align}
    then $\bar{\phi}$ will not be played from episode $k$ onwards.~From Lemma~\ref{lem:optimism} we have that on the set $\cG_1$, $J\ust_{\cM^+_{\tau_k}} = J_{\cM^+_{\tau_k}}(\tilde{\phi}_k) \geq J\ust_{\cM}$.~Hence, if $J_{\cM^+_{\tau_k}}(\phi) < J\ust_{\cM}$, then the algorithm will not play $\bar{\phi}$.~From Lemma~\ref{lem:ub_opt} we have that on the set $\cG_1$, $J_{\cM^+_{\tau_k}}(\phi) \leq J_{\cM}(\bar{\phi}) + C_{ub}~ \diam{\tau_k}{\bar{\phi}}$.~Thus, on $\cG_1$, $\bar{\phi}$ will never be played from the $k$-th episode onwards if
    \begin{align*}
        J_{\cM}(\bar{\phi}) + C_{ub}~ \diam{\tau_k}{\bar{\phi}} \leq J\ust_{\cM},
    \end{align*}
    or, if $\diam{\tau_k}{\bar{\phi}} \leq \Delta(\bar{\phi})/ C_{ub}$.~In other words, on the set $\cG_1$,
    \begin{align}
        \diam{\tau_k}{\phi_k} > \Delta(\phi_k)/ C_{ub}.\label{cond:phi_k}
    \end{align}
    We will prove the result by contradiction.~Let us assume that for all $s \in \cS$ that satisfy $\mu\uc{\infty}_{\phi_k,p}(\pi_\cS(q\inv_{\tau_k}(s,\phi_k(s)))) \geq (\diam{\tau_k}{\phi_k} / 3)^{d_\cS + 1}$, the following is true:
    \begin{align}
        \diamc{q\inv_{\tau_k}(s,\phi_k(s))} \leq \frac{1}{3 C_{ub}} \max{\{\gap{s,\phi_k(s)}, C_{ub} \diam{\tau_k}{\phi_k}\}}. \label{assum:contra}
    \end{align}
    Define the following sets of $\cS$-cells:
    \begin{align*}
        \cQ\uc{1} &:= \{\xi \in \cQ_{\tau_k} \mid \mu\uc{\infty}_{\phi_k,p}(\xi) < (\diam{\tau_k}{\phi_k} /3)^{d_\cS + 1},~\diamc{q\inv_{\tau_k}(q(\xi),\phi_k(q(\xi)))} \geq \diam{\tau_k}{\phi_k}/ 3 \},\\
        \cQ\uc{2} &:= \{\xi \in \cQ_{\tau_k} \mid \diamc{q\inv_{\tau_k}(q(\xi),\phi_k(q(\xi)))} < \diam{\tau_k}{\phi_k}/ 3 \},\\
        \cQ\uc{3} &:= \{\xi \in \cQ_{\tau_k} \mid \mu\uc{\infty}_{\phi_k,p}(\xi) \geq (\diam{\tau_k}{\phi_k} /3)^{d_\cS + 1},~\diamc{q\inv_{\tau_k}(q(\xi),\phi_k(q(\xi)))} \geq \diam{\tau_k}{\phi_k}/ 3 \}.
    \end{align*}
    We observe that $\cQ_{\tau_k}$ is partitioned by $\cQ\uc{1}$, $\cQ\uc{2}$ and $\cQ\uc{3}$.~Note that $\abs{\cQ\uc{1}} \leq (\diam{\tau_k}{\phi_k}/3)^{-d_\cS}$.~Also, note that by the necessary condition for $\phi_k$ to be played and by our assumption, for every $\xi \in \cQ\uc{3}$, $\frac{1}{3} \diam{\tau_k}{\phi_k} \leq \diamc{q\inv_{\tau_k}(q(\xi),\phi_k(q(\xi)))} \leq \frac{1}{3 C_{ub}}\min_{s\in \xi}\{\gap{s,\phi_k(s)}\}$. Then,
    \begin{align*}
        \diam{\tau_k}{\phi_k} &= \int_{\cS}{\diamc{q\inv_{\tau_k}(s,\phi_k(s))} \mu\uc{\infty}_{\phi_k,p}(s) ~ds}\\
        &= \sum_{\xi \in \cQ_{\tau_k}}{\diamc{q\inv_{\tau_k}(q(\xi),\phi_k(q(\xi)))} \mu\uc{\infty}_{\phi_k,p}(\xi)} \\
        &= \sum_{\xi \in \cQ\uc{1}}{\diamc{q\inv_{\tau_k}(q(\xi),\phi_k(q(\xi)))} \mu\uc{\infty}_{\phi_k,p}(\xi)} + \sum_{\xi \in \cQ\uc{2}}{\diamc{q\inv_{\tau_k}(q(\xi),\phi_k(q(\xi)))} \mu\uc{\infty}_{\phi_k,p}(\xi)}\\
        &\quad + \sum_{\xi \in \cQ\uc{3}}{\diamc{q\inv_{\tau_k}(q(\xi),\phi_k(q(\xi)))} \mu\uc{\infty}_{\phi_k,p}(\xi)}\\
        &\leq \frac{\diam{\tau_k}{\phi_k}}{3} + \frac{\diam{\tau_k}{\phi_k}}{3} + \frac{1}{3 C_{ub}}\int_{\cS}{\gap{s,\phi_k(s)} \mu\uc{\infty}_{\phi_k,p}(s)~ ds} \\
        &= \frac{\diam{\tau_k}{\phi_k}}{3} + \frac{\diam{\tau_k}{\phi_k}}{3} + \frac{\Delta(\phi_k)}{3~ C_{ub}} \\
        &< \diam{\tau_k}{\phi_k},
    \end{align*}
which yields us a contradiction.~Hence, we conclude that our assumption~\eqref{assum:contra} was wrong.~This concludes the proof.
\end{proof}

\mycomment{When a policy $\phi$ is played by the algorithm in the $k$-th episode, there exists at least one $\cS$-cell $\xi \in \cQ_{\tau_k}$ such that $\mu\uc{\infty}_{\phi,p}(\xi) \geq \frac{1}{3} \diam{\tau_k}{\phi}$ and $\diamc{q\inv_{\tau_k}(q(\xi), \phi(q(\xi)))} \geq \frac{2}{3} \diam{\tau_k}{\phi}$. Let us denote $\cQ\up = \{\xi \in \cQ_{\tau_k} \mid \mu\uc{\infty}_{\phi,p}(\xi) \geq \frac{1}{3} \diam{\tau_k}{\phi}^{d_\cS + 1}\}$, and see that,
\begin{align*}
    \diam{\tau_k}{\phi} &= \sum_{\xi \in \cQ_{\tau_k}}{\diamc{q\inv_{\tau_k}(q(\xi) \phi(q(\xi)))} \mu\uc{\infty}_{\phi,p}(\xi)}\\
    &\leq \sum_{\xi \in \cQ\up}{\diamc{q\inv_{\tau_k}(q(\xi) \phi(q(\xi)))} \mu\uc{\infty}_{\phi,p}(\xi)} + \frac{1}{3} \diam{\tau_k}{\phi}.
\end{align*}
This implies that $\frac{2}{3}\diam{\tau_k}{\phi} \leq \sum_{\xi \in \cQ\up}{\diamc{q\inv_{\tau_k}(q(\xi) \phi(q(\xi)))} \mu\uc{\infty}_{\phi,p}(\xi)} \leq \diam{\tau_k}{\phi}$. This establishes our claim.}

Define, 
\al{
\eps(T) := T^{-\frac{1}{2d_\cS + d_z + 3}},~~\teps(T) := T^{-\frac{1}{2d_\cS + d + 3}}
}
Note that $\eps(T) \geq \teps(T)$ since $d_z \leq d$.~Also, note that $t\ust(\eps(T)) \leq t\ust(\teps(T))$, where $t\ust(\cdot)$ is defined in~\eqref{def:t_star_2}.


\textbf{Choosing $C_H$:} We choose the constant associated with the episode duration~\eqref{def:epi_dur} of \algo~as,
\begin{align}
    C_H \geq 16~ t\ust(\teps(T))~ \br{\frac{3(1 + C_{ub})}{1-\gamma}}^{2(d_\cS + 1)} \frac{\log{\br{\frac{12 T^2 d^\frac{d}{2}}{t\ust(\eps(T)) \teps(T)^d \delta}}} + 1}{\log(T/\delta)}. \label{def:CH}
\end{align}
\begin{lemma}\label{lem:lb_num_visit}
    Pick a sample path from the set $\cG_1 \cap \cG_{2,\eps}$, where $\cG_1$ and $\cG_{2,\eps}$ are as in \eqref{def:G_1} and \eqref{def:G2}, respectively.~Let $\zeta$ be a key cell in episode $k$ (such key cells have been shown to exist in Lemma~\ref{lem:keycell}), i.e., for some $\xi \subseteq \pi_\cS(\zeta)$ such that $\xi \in \cQ_{\tau_k}$, and for some $s \in \xi$, the following holds,
    \begin{align*}
        &\diamc{\zeta} > \frac{1}{3 C_{ub}} \max{\{\gap{s,\phi_k(s)}, C_{ub}~ \diam{\tau_k}{\phi_k}\}}, \mbox{ and},\\
        &\mu\uc{\infty}_{\phi_k,p}(\xi) \geq (\diam{\tau_k}{\phi_k} / 3)^{d_\cS + 1}.
    \end{align*}
    Then, if~$\Delta(\phi_k) \geq \eps(T) C_{ub}$, the number of visits to $\zeta$ during the $k$-th episode can be lower-bounded as follows,
    \al{
    n_k(\zeta) \geq \frac{4 t\ust(\teps(T))}{t\ust(\eps(T))} \br{\log{\br{\frac{12 T^2 d^\frac{d}{2}}{t\ust(\eps(T)) \teps(T)^d \delta}}} + 1} \diamc{\zeta}^{-(d_\cS + 1)}.
    }
\end{lemma}

\begin{proof}
    Recall that on $\cG_1$ we have $\diam{\tau_k}{\phi_k} > \frac{\Delta(\phi_k)}{C_{ub}}$~\eqref{cond:phi_k}. Hence, $\diam{\tau_k}{\phi_k} > \eps(T)$ and $\mu\uc{\infty}_{\phi_k,p}(\xi) \geq (\eps(T) / 3)^{d_\cS + 1}$.~So, upon using Corollary~\ref{cor:G_2} we obtain,
    \begin{align*}
        n_k(\zeta) \geq \frac{H_k~\mu^{(\infty)}_{\phi_k,p}(\xi)}{2 t\ust(\eps(T))} - \sqrt{\frac{H_k}{t\ust(\eps(T))} \log{\br{\frac{8 T^2 d^\frac{d}{2}}{t\ust(\eps(T)) \eps(T)^d \delta}}}} - 1.
    \end{align*}
    Next, we note that the duration of the $k$-th episode $H_k$ can be lower-bounded as follows,
    \begin{align}
        H_k &\geq \frac{C_H (1 - \gamma)^{2(d_\cS + 1)} \log{\br{T/\delta}}}{\pdiam{\tau_k}{\phi_k}^{2(d_\cS + 1)}} \notag\\
        &\geq \frac{C_H (1 - \gamma)^{2(d_\cS + 1)} \log{\br{T/\delta}}}{(3(1 + C_{ub}))^{2(d_\cS + 1)}} \br{\frac{3}{\diam{\tau_k}{\phi_k}}}^{2(d_\cS + 1)} \notag\\
        &\geq \frac{16 t\ust(\eps(T))}{\mu\uc{\infty}_{\phi_k,p}(\xi)^2} \br{\log{\br{\frac{8 T^2 d^\frac{d}{2}}{t\ust(\eps(T)) \eps(T)^d \delta}}} + 1}, \label{lb:Hk}
    \end{align}
    where the first inequality follows from the lower-bound of $H_k$~\eqref{bdd:hk}, the second inequality follows since from Corollary~\ref{cor:ub_pdiam} we have $\pdiam{\tau_k}{\phi_k} \leq (1 + C_{ub}) \diam{\tau_k}{\phi_k}$.~The third inequality follows from the fact that $\mu\uc{\infty}_{\phi_k,p}(\xi) \geq (\diam{\tau_k}{\phi_k} / 3)^{d_\cS + 1}$.~Lemma~\ref{lem:bdd_epi_tool} when combined with \eqref{lb:Hk} yields
    \begin{align*}
        n_k(\zeta) &\geq \frac{H_k~ \mu^{(\infty)}_{\phi_k,p}(\xi)}{2 t\ust(\eps(T))} - \sqrt{\frac{H_k}{t\ust(\eps(T))} \log{\br{\frac{8 T^2 d^\frac{d}{2}}{t\ust(\eps(T)) \eps(T)^d \delta}}}} - 1 \\
        &\geq \frac{H_k~ \mu^{(\infty)}_{\phi_k,p}(\xi)}{4 t\ust(\eps(T))},
    \end{align*}
    or,
    \begin{align*}
        n_k(\zeta) &\geq \frac{C_H (1 - \gamma)^{2(d_\cS + 1)} \log{\br{\frac{T}{\delta}}}}{4~t\ust(\eps(T))}~\pdiam{\tau_k}{\phi_k}^{-2(d_\cS + 1)} \times (\diam{\tau_k}{\phi_k} / 3)^{d_\cS + 1}\\
        &\geq \frac{C_H (1 - \gamma)^{2(d_\cS + 1)} \log{\br{\frac{T}{\delta}}}}{4~t\ust(\eps(T))~(3(1 + C_{ub})^2)^{d_\cS + 1}}~\diam{\tau_k}{\phi_k}^{-(d_\cS + 1)}\\
        &\geq \frac{C_H (1 - \gamma)^{2(d_\cS + 1)} \log{\br{\frac{T}{\delta}}}}{4~t\ust(\eps(T))~(3(1 + C_{ub}))^{2(d_\cS + 1)}}~\diamc{\zeta}^{-(d_\cS + 1)}\\
        &\geq \frac{4 t\ust(\teps(T))}{t\ust(\eps(T))} \br{\log{\br{\frac{12 T^2 d^\frac{d}{2}}{t\ust(\eps(T)) \teps(T)^d \delta}}} + 1} \diamc{\zeta}^{-(d_\cS + 1)},
    \end{align*}
    where the first inequality follows from the lower-bound of $H_k$~\eqref{bdd:hk} and from the fact that $\mu^{(\infty)}_{\phi_k,p}(\xi) \geq (\diam{\tau_k}{\phi_k} / 3)^{d_\cS + 1}$. The second and the third inequality follow from the fact that $\pdiam{\tau_k}{\phi_k} \leq (1+C_{ub})\diam{\tau_k}{\phi_k}$, and $\diam{\tau_k}{\phi_k} < 3~\diamc{\zeta}$, respectively. The fourth inequality follows from \eqref{def:CH}. This concludes the proof.
\end{proof}

\begin{lemma}\label{lem:bdd_Phi_play}
    Consider the set of policies $\Phi\uc{2^{-i}} = \{\phi \in \Phi_{SD} \mid \Delta(\phi) \in (2^{-i}, 2^{-i+1}]\}$, where $i \in \bN$.~On the set $\cG_1$,~\algo~can play policies from the set $\Phi\uc{2^{-i}}$ for a maximum of $\cO(\log{\br{\frac{T}{\delta}} 2^{i (2d_\cS + d_z + 3)}})$ time steps.
\end{lemma}
\begin{proof}
    We prove this lemma in the following three steps: First, we derive the number of episodes in which a cell can serve as a key cell while policies from $\Phi\uc{2^{-i}},~i \in \bN$ are being played. Secondly, we derive an upper-bound on the episode duration when policies from $\Phi\uc{2^{-i}}$ are played. Thirdly, we multiply upper-bounds on the number of episodes with the upper-bound on the duration of the episodes and then sum it over all possible key cells corresponding to policies in $\Phi\uc{2^{-i}}$, and this yields the desired upper-bound on cumulative plays from $\Phi\uc{2^{-i}}$. 
    
    Before proceeding with proving these three properties, we begin with some preliminary results.~Recall that for $\beta>0$, the set $\cZ_\beta \subseteq \cS \times \cA$ consists of those state-action pairs $(s, a)$ for which $\gap{s,a} \leq \beta$.~Let us denote the smallest subset of $\cP_t$ that covers $\cZ_\beta$, as the active covering of $\cZ_\beta$ at time $t$.~From Lemma~\ref{lem:keycell}, we obtain that if for all $j = 0, 1, \ldots, i$, the active covering of $\cZ_{2^{-j}}$ at time $\tau_k$ does not contain a cell $\zeta$ that satisfies the following conditions, 
    \begin{enumerate}
        \item $\diamc{\zeta} \geq \frac{\sqrt{d}}{3 C_{ub}}~2^{-j}$,  and 
        \item $\mu\uc{\infty}_{\phi,p}(\xi) \geq \br{\Delta(\phi)/3 C_{ub}}^{d_\cS + 1}$ for all $\xi$ which satisfy $\xi \in \cQ_{\tau_k}$ and $\xi \subseteq \pi_\cS(\zeta)$,
    \end{enumerate}
    then there is no cell that qualifies to be a key cell for a policy from the set $\Phi\uc{2^{-i}}$.~Thus, under the above condition,~\algo~will not play a policy from $\Phi\uc{2^{-i}}$ from the $k$-th episode onwards.~Let $\cY_j$ be the covering of $\cZ_{2^{-j}}$ by cells of diameter $\frac{\sqrt{d}}{3 C_{ub}}~2^{-j}$.~We make the following observation: If every cell in $\cY_j$ for $j = 1, 2, \ldots, i$ is split, then no cell in the active covers of $\cZ_{2^{-j}}$ for $j = 1, 2, \ldots, i$ can serve as the key cell while playing policies from $\Phi\uc{2^{-i}}$. This is a sufficient condition for any policy from $\Phi\uc{2^{-i}}$ to be not played by \algo.
    
    \texttt{Step 1:} First, we bound the number of episodes when a cell $\zeta \in \cY_j$ or any of its ancestors has served as a key cell.~From the cell activation rule~\eqref{def:activationrule}, we have that $\zeta$ would be split when the number of visits to $\zeta$ exceeds $c_a 2^{d_\cS+2} \log{\br{\frac{T}{\delta}}} \diamc{\zeta}^{-(d_\cS+2)}$. In Lemma~\ref{lem:lb_num_visit}, we derived the lower-bound on the number of visits to a key cell. Invoking that lower-bound, we obtain that $\zeta$ can be played in at most
    \begin{align*}
        \frac{c_a t\ust(\eps(T)) 2^{d_\cS+2} \log{\br{\frac{T}{\delta}}}}{4 t\ust(\teps(T)) \br{\log{\br{\frac{12 T^2 d^\frac{d}{2}}{t\ust(\eps(T)) \teps(T)^d \delta}}} + 1}}  \diamc{\zeta}^{-1}
    \end{align*}
    episodes as a key cell when the corresponding episode plays a policy from $\Phi\uc{2^{-i}}$. Replacing $\diamc{\zeta}$ with $\frac{\sqrt{d}}{3 C_{ub}}~2^{-j}$, we obtain that $\zeta$ can be played in at most
    \begin{align*}
        \frac{3 c_a t\ust(\eps(T)) C_{ub} 2^{d_\cS+2} \log{\br{\frac{T}{\delta}}}}{4 t\ust(\teps(T)) \sqrt{d} \br{\log{\br{\frac{12 T^2 d^\frac{d}{2}}{t\ust(\eps(T)) \teps(T)^d \delta}}} + 1}} ~2^{j}
    \end{align*}
    episodes as a key cell when the corresponding episode plays a policy from $\Phi\uc{2^{-i}}$.
    
    \texttt{Step 2:} Now, we produce an upper-bound on the length of the episodes while playing policies from $\Phi\uc{2^{-i}}$. See that
    \begin{align*}
        H_k &\leq \frac{C_H (1+\gamma)^{2(d_\cS + 1)} \log{\br{\frac{T}{\delta}}}}{\pdiam{\tau_k}{\phi_k}^{2(d_\cS + 1)}} \\
        &\leq \frac{C_H (1+\gamma)^{2(d_\cS + 1)} \log{\br{\frac{T}{\delta}}}}{\diam{\tau_k}{\phi_k}^{2(d_\cS + 1)}} \\
        &\leq \frac{C_H ((1+\gamma) C_{ub})^{2(d_\cS + 1)} \log{\br{\frac{T}{\delta}}}}{2^{-i 2(d_\cS + 1)}},
    \end{align*}
    where the first inequality follows from the upper-bound on $H_k$~\eqref{bdd:hk}, the second inequality follows from Corollary~\ref{cor:opt_pdiam}, and the third inequality follows from the definition of $\Phi\uc{2^{-i}}$.
    
    \texttt{Step 3:} First, we note that the cardinality of $\cY_j$ is at most $c_z 2^{j d_z}$ for every $j \in \bN$, where the scaling constant of the zooming dimension,
    \al{
        c_s := \frac{\sqrt{d}}{3C_{ub}}. \label{def:cs}
    }
    This follows from the definition of the zooming dimension~\eqref{def:zoomingdim}.~Multiplying the bounds from step $1$ and step $2$, we obtain an upper-bound on the number of plays of a cell $\zeta \in \cY_j$ as a key cell while playing policies from $\Phi\uc{2^{-i}}$. Summing this upper-bound for all cells in $\cY_j$ and then summing those terms over $j = 1, 2, \ldots, i$, we obtain that the total number of time steps in which policies from $\Phi\uc{2^{-i}}$ are played, can be bounded above by 
    \begin{align*}
        &\sum_{j = 1}^{i}{\sum_{\zeta \in \cY_j}{\br{\frac{3 c_a t\ust(\eps(T)) C_{ub} 2^{d_\cS+2} \log{\br{\frac{T}{\delta}}}}{4 t\ust(\teps(T)) \sqrt{d} \br{\log{\br{\frac{12 T^2 d^\frac{d}{2}}{t\ust(\eps(T)) \teps(T)^d \delta}}} + 1}} ~2^{j}} \times \br{\frac{C_H ((1+\gamma)C_{ub})^{2(d_\cS + 1)} \log{\br{\frac{T}{\delta}}}}{2^{-i 2(d_\cS + 1)}}}}}\\
        &=\frac{3 c_a c_z t\ust(\eps(T)) C_H C_{ub}^{2d_\cS + 3} (1+\gamma)^{2(d_\cS + 1)} 2^{d_\cS+2} \br{\log\br{\frac{T}{\delta}}}^2}{4 t\ust(\teps(T)) \sqrt{d} \br{\log{\br{\frac{12 T^2 d^\frac{d}{2}}{t\ust(\eps(T)) \teps(T)^d \delta}}} + 1}} 2^{i 2(d_\cS + 1)} \sum_{j = 1}^{i}{2^{j (d_z + 1)}} \\
        &\leq \frac{3 c_a c_z t\ust(\eps(T)) C_H C_{ub}^{2d_\cS + 3} (1+\gamma)^{2(d_\cS + 1)} 2^{d_\cS+1} \br{\log\br{\frac{T}{\delta}}}^2}{ t\ust(\teps(T)) \sqrt{d} \br{\log{\br{\frac{12 T^2 d^\frac{d}{2}}{t\ust(\eps(T)) \teps(T)^d \delta}}} + 1}} 2^{i (2d_\cS + d_z + 3)}.
    \end{align*}
    This concludes the proof.
\end{proof}
Let us denote
\begin{align}
    C\up := \frac{3 c_a c_z t\ust(\eps(T)) C_H C_{ub}^{2d_\cS + 3} (1+\gamma)^{2(d_\cS + 1)} 2^{d_\cS+1} \br{\log\br{\frac{T}{\delta}}}^2}{ t\ust(\teps(T)) \sqrt{d} \br{\log{\br{\frac{12 T^2 d^\frac{d}{2}}{t\ust(\eps(T)) \teps(T)^d \delta}}} + 1}}. \label{def:Cup}
\end{align}
As has been discussed earlier at the beginning of this section, we derive an upper-bound on (a) of \eqref{eq:decompregret} by summing the three terms: the regret due to playing policies from the set $\Phi\uc{2^{-i}},~i = 1, 2, \ldots, \ceil{\log{1/\eps(T)}}$, the regret due to playing other policies, and the suboptimality that arises due to the inaccuracy in the solution of the extended MDPs at the beginning of every episode, which can be bounded by $\sqrt{T}$. The first term is bounded using the bound obtained on the number of plays of policies from $\Phi\uc{2^{-i}}$ in Lemma~\ref{lem:bdd_Phi_play}.~The regret arising from playing policies that are not in $\cup_{i=1}^{\ceil{\log{1/\eps(T)}}}{\Phi\uc{2^{-i}}}$ is at most $\eps(T) T$. Hence,
\begin{align}
    \sum_{k=1}^{K(T)}{H_k \br{J\ust_\cM - J_\cM(\phi_k)}} &\leq C\up \sum_{i=1}^{i\ust}{2^{i(2d_\cS + d_z + 3)} \times 2^{-i+1}} + \eps(T) T + \sqrt{T}\notag\\
    & \leq 2 C\up~ 2^{i\ust(2d_\cS + d_z + 2)} + T^\frac{2d_\cS + d_z + 2}{2d_\cS + d_z + 3} + \sqrt{T} \notag\\
    & \leq (2 C\up + 1)~ T^\frac{2d_\cS + d_z + 2}{2d_\cS + d_z + 3} + \sqrt{T},\label{bdda}
\end{align}
where $i\ust := \ceil{\log{\br{1/\eps(T)}}}$, and the first inequality follows from Lemma~\ref{lem:bdd_Phi_play}.

\textbf{Bounding} (b): We now provide an upper-bound on the term $(b)$ of~\eqref{eq:decompregret}. This proof relies on the uniform ergodicity property~(Assumption~\ref{assum:unif_ergodic}) of the underlying MDP $\cM$ and a trick that converts Markovian noise to martingale noise using the Poisson equation~\eqref{eq:pois} \citep{metivier1984applications}.
\begin{prop}\label{prop:bddb}
Define
\al{
\cG_3 := \left\{ \omega:~\eqref{bdd:fluctuation} \mbox{ holds } \right\},
}
    \begin{align}
        \sum_{k=1}^{K(T)}{\sum_{t=\tau_k}^{\tau_{k+1}-1}{J_\cM(\phi_k) - r(s_t,\phi_k(s_t))}} \leq \frac{m\ust}{1-\alpha} \sqrt{\frac{T}{2} \log{\br{\frac{3}{\delta}}}} + \frac{m\ust}{1 - \alpha} (1 + K(T)), \label{bdd:fluctuation}
    \end{align}
    where $K(T)$ denotes the total number of episodes until time $T$, and $m\ust = \ceil{\log_{\frac{1}{\alpha}}\br{C}} + 1$. 
Then, we have,
\al{
\bP\br{\cG_3} \geq 1 - \frac{\delta}{3},~\delta \in (0,1). \label{bdd:fluc}
}
\end{prop}
\begin{proof}
    Let us denote the episode index at time $t$ by $k(t)$.~We begin by converting the Markovian noise to a martingale difference sequence, i.e.,
    \begingroup
        \allowdisplaybreaks
        \begin{align}
            &\sum_{t=0}^{T-1}{J_\cM(\phi_{k(t)}) - r(s_t,\phi_{k(t)}(s_t))} \notag\\
            &= \sum_{t=0}^{T-1}{\int_{\cS}{h^{\phi_{k(t)}}_\cM(s) p(s_t,\phi_{k(t)}(s_t), ds)} - h^{\phi_{k(t)}}_\cM(s_t)} \notag\\
            &= \sum_{t=1}^{T-1}{\int_{\cS}{h^{\phi_{k(t)}}_\cM(s) p(s_{t-1},\phi_{k(t-1)}(s_{t-1}), ds)} - h^{\phi_{k(t)}}_\cM(s_t)} \notag\\
            &\quad + \sum_{t=1}^{T-1}{\int_{\cS}{ h^{\phi_{k(t)}}_\cM(s) p(s_t,\phi_{k(t)}(s_t), ds)} - \int_{\cS}{ h^{\phi_{k(t)}}_\cM(s) p(s_{t-1},\phi_{k(t-1)}(s_{t-1}), ds)}} \notag\\
            &\quad + \int_{\cS}{h^{\phi_1}_\cM(s) p(s_0,\phi_1(s_0), ds)} - h^{\phi_1}_\cM(s_0) \notag\\
            &= \sum_{t=1}^{T-1}{\int_{\cS}{h^{\phi_{k(t)}}_\cM(s) p(s_{t-1},\phi_{k(t-1)}(s_{t-1}), ds)} - h^{\phi_{k(t)}}_\cM(s_t)} \notag\\
            &\quad + \sum_{t=1}^{T-1}{\int_{\cS}{\br{h^{\phi_{k(t)}}_\cM(s) -  h^{\phi_{k(t-1)}}_\cM(s)} p(s_{t-1},\phi_{k(t-1)}(s_{t-1}), ds)}} \notag\\
            &\quad + \int_{\cS}{h^{\phi_{k(T-1)}}_\cM(s) p(s_{T-1},\phi_{k(T-1)}(s_{T-1}), ds)} - h^{\phi_1}_\cM(s_0). \label{ineq:abs_dif_2}
        \end{align}
    \endgroup    
    Now consider the first summation term in the r.h.s. of \eqref{ineq:abs_dif_2}.~Denote $m_t = \int_{\cS}{h^{\phi_{k(t)}}_\cM(s) p(s_{t-1},\phi_{k(t-1)}(s_{t-1}), ds)} - h^{\phi_{k(t)}}_\cM(s_t)$.~Noting that $\phi_k$ is $\cF_{\tau_k-1}$-measurable, we obtain the following:
    \begin{align*}
        \bE\sqbr{m_t \mid \cF_{t-1}} &= \bE\sqbr{\int_{\cS}{h^{\phi_{k(t)}}_\cM(s) p(s_{t-1},\phi_{k(t-1)}(s_{t-1}), ds)} - h^{\phi_{k(t)}}_\cM(s_t) \mid \cF_{t-1}} \\
        &= \int_{\cS}{h^{\phi_{k(t)}}_\cM(s) p(s_{t-1},\phi_{k(t-1)}(s_{t-1}), ds)} - \int_{\cS}{h^{\phi_{k(t)}}_\cM(s) p(s_{t-1},\phi_{k(t-1)}(s_{t-1}), ds)}\\
        &=0.
    \end{align*}
    Hence, $\flbr{m_t}$ is a martingale difference sequence.~Also, from the bound on the span of $h^\phi_\cM$ that was derived in Lemma~\ref{lem:bdd_rvf_spn}, we have that $m_t \in \sqbr{-\frac{m\ust}{1-\alpha}, \frac{m\ust}{1-\alpha}}$.~An application of Azuma-Hoeffding inequality~(Lemma~\ref{lem:ah_ineq}), yields the following: for each $\delta \in (0,1)$, with probability at least $1 - \frac{\delta}{3}$ we have,
    \begin{align}
        \sum_{t=1}^{T-1}{\int_{\cS}{h^{\phi_{k(t)}}_\cM(s) p(s_{t-1},\phi_{k(t-1)}(s_{t-1}), ds)} - h^{\phi_{k(t)}}_\cM(s_t)} \leq \frac{m\ust}{1-\alpha} \sqrt{\frac{T}{2} \log{\br{\frac{3}{\delta}}}}. \label{bd:1}
    \end{align}
    Now, consider the second summation term in the r.h.s. of \eqref{ineq:abs_dif_2}. The $t$-th element in this summation can assume a non-zero value only when a new episode starts at time $t$.~Hence, upon using Lemma~\ref{lem:bdd_rvf_spn}, we conclude that this summation can be upper-bounded as
    \begin{align}
        \sum_{t=1}^{T-1}{\int_{\cS}{\br{h^{\phi_{k(t)}}_\cM(s) -  h^{\phi_{k(t-1)}}_\cM(s)} p(s_{t-1},\phi_{k(t-1)}(s_{t-1}), ds)}} \leq \frac{m\ust}{1 - \alpha} K(T), \label{bd:2}
    \end{align}
    where $K(T)$ denotes the number of episodes that have been started until time $T$ by the learning algorithm.~Again by using Lemma~\ref{lem:bdd_rvf_spn}, the third term can be bounded as,
    \begin{align}
        \int_{\cS}{h^{\phi_{k(T-1)}}_\cM(s) p(s_{T-1},\phi_{k(T-1)}(s_{T-1}), ds)} - h^{\phi_1}_\cM(s_0)\leq \frac{m\ust}{1 - \alpha}. \label{bd:3}
    \end{align}
    Putting all the individual bounds from \eqref{bd:1}, \eqref{bd:2} and \eqref{bd:3} together, and noting that only \eqref{bd:1} is probabilistic, we have that for any $\delta \in (0,1)$ with probability at least $1 - \frac{\delta}{3}$,
    \begin{align}
        \sum_{t=0}^{T-1}{J_\cM(\phi_{k(t)}) - r(s_t,\phi_{k(t)}(s_t))} \leq \frac{m\ust}{1-\alpha} \sqrt{\frac{T}{2} \log{\br{\frac{3}{\delta}}}} + \frac{m\ust}{1 - \alpha} (1 + K(T)).\label{ub:b}
    \end{align}
    This concludes the proof.
\end{proof}
Upon combining the upper-bounds on all the terms of the regret decomposition, we obtain the upper-bound on the regret. This is done in the next section.

\subsection{Proof of Theorem~\ref{thm:regupperbound}}
\begin{proof}
    We first derive an upper-bound on $K(T)$, which is the total number of episodes.~The number of episodes of length greater than $T^{\frac{2 d_\cS + 2}{2 d_\cS + d_z + 3}}$ is trivially bounded above by $T^{\frac{d_z + 1}{2 d_\cS + d_z + 3}}$. Now let us bound the number of episodes of length less than $T^{\frac{2 d_\cS + 2}{2 d_\cS + d_z + 3}}$. If the length of the $k$-th episode is less than $T^{\frac{2 d_\cS + 2}{2 d_\cS + d_z + 3}}$, then from the rule of setting episode duration~\eqref{def:epi_dur}, we have
    \begin{align*}
        \frac{C_H \log{\br{\frac{T}{\delta}}}}{\pdiam{\tau_k}{\phi_k}^{2(d_\cS + 1)}} \leq T^{\frac{2 d_\cS + 2}{2 d_\cS + d_z + 3}},
    \end{align*}
    or
    \begin{align*}
        \pdiam{\tau_k}{\phi_k} \geq \br{C_H \log{\br{\frac{T}{\delta}}}}^{\frac{1}{2(d_\cS+1)}} T^{-\frac{1}{2 d_\cS + d_z + 3}}.
    \end{align*}
    From Corollary~\ref{cor:opt_pdiam} and Corollary~\ref{cor:ub_pdiam}, we obtain that
    \begin{align*}
        \frac{1}{3(C_{ub}+1)}\pdiam{\tau_k}{\phi_k} \leq \frac{1}{3} \diam{\tau_k}{\phi_k}.
    \end{align*}
    Also, from the condition of a cell $\zeta$ to be a key cell in the $k$-th episode, we have that
    \begin{align*}
        \diamc{\zeta} \geq \frac{1}{3} \diam{\tau_k}{\phi_k}.
    \end{align*}
    Combining the above three relations, we obtain that if the length of the $k$-th episode is less than $T^{\frac{2 d_\cS + 2}{2 d_\cS + d_z + 3}}$, then the diameter of the corresponding key cell is greater than 
    \begin{align*}
        \frac{\br{C_H \log{\br{\frac{T}{\delta}}}}^{\frac{1}{2(d_\cS+1)}}}{3(C_{ub}+1)} T^{-\frac{1}{2 d_\cS + d_z + 3}}.
    \end{align*}
    From the definition of the zooming dimension~\eqref{def:zoomingdim}, it follows that there can at most be $\cO\br{T^{\frac{d_z}{2 d_\cS + d_z + 3}}}$ such key cells activated by \algo, and each key cell of level $\ell$ becomes deactivated when it has been played in $\cO(2^\ell)$ episodes.~Hence there can be at most $\cO\br{T^{\frac{d_z + 1}{2 d_\cS + d_z + 3}}}$ episodes of length less than $T^{\frac{2 d_\cS + 2}{2 d_\cS + d_z + 3}}$. Hence,
    \begin{align*}
        K(T) \leq C_K T^{\frac{d_z + 1}{2 d_\cS + d_z + 3}},
    \end{align*}
    where $C_K$ is a constant.
    
    We now add all the upper-bounds of various regret components from \eqref{bdda} and \eqref{ub:b}, and use the upper-bound on $K(T)$ derived above. This yields,
    \begin{align*}
        \cR(T;\algo) & \leq (2 C\up + 1)~ T^\frac{2d_\cS + d_z + 2}{2d_\cS + d_z + 3} + \sqrt{T} + \frac{m\ust}{1-\alpha} \sqrt{\frac{T}{2} \log{\br{\frac{3}{\delta}}}} + \frac{m\ust}{1 - \alpha} (1 + K(T)) \\
        & \leq (2 C\up + 1)~ T^\frac{2d_\cS + d_z + 2}{2d_\cS + d_z + 3} +  \br{1 + \frac{m\ust}{1-\alpha} \sqrt{\frac{1}{2} \log{\br{\frac{3}{\delta}}}}} \sqrt{T} + \frac{m\ust}{1 - \alpha} \br{1 + C_K T^{\frac{d_z + 1}{2 d_\cS + d_z + 3}}} \\
        &= \ctO\br{T^{\frac{2 d_\cS + d_z + 2}{2 d_\cS + d_z + 3}}}.
    \end{align*}
    Note that $\bP(\cG_1 \cap \cG_{2,\eps} \cap \cG_3) \geq 1 - \delta$.~Thus, we have the desired regret upper-bound with probability at least $1 - \delta$.
\end{proof}