\section{Concentration Inequality}\label{app:conc_ineq}
In this section, we will show that the discretized MDP kernel belongs to a confidence ball around its estimate.~First, let us introduce some notation.~Let $\tilde{\cZ} \subseteq \cS \times \cA$, and let $\tilde{\cQ}$ be a partition of $\cS$ that is made of $\cS$-cells.~Let $\tilde{\cS}$ be the set of representative points of the $\cS$-cells in $\tilde{\cQ}$.~Recall the discretization of $p$ given $\tilde{\cZ}$ and $\tilde{\cS}$, $\wp_{\tilde{\cZ} \to \tilde{\cS},p}$~\eqref{def:disc_p}.~Denote the continuous extension of $\wp_{\tilde{\cZ} \to \tilde{\cS},p}$ by $\bar{\wp}_{\tilde{\cZ} \to \tilde{\cS},p}$, i.e.,
\begin{align*}
    \bar{\wp}_{\tilde{\cZ} \to \tilde{\cS},p}(z,B) := \sum_{\xi \in \tilde{\cQ}}{\frac{\lambda(B \cap \xi)}{\lambda(\xi)} \wp_{\tilde{\cZ} \to \tilde{\cS},p}(z,q(\xi))},
\end{align*}
for every $z \in \cZ, B \in \cB_\cS$.~Define the set,
\begin{align}\label{def:G_1}
    \cG_1 := \cap_{t=0}^{T-1}{\flbr{\norm{\wp_{\cS \times \cA \to \cS_t, p}(z\up, \cdot) - \wp_{\cZ_t \to \cS_t, \hat{p}_t}(z,\cdot)}_1 \leq \eta_t(\zeta) \mbox{ for every } z \in \cZ_t, z\up \in q\inv(z)}}.
\end{align}
We show that $\cG_1$ holds with high probability.
\begin{lemma}\label{lem:conc_ineq}
    $\bP(\cG_1) \geq 1 - \frac{\delta}{3}$, where $\cG_1$ is as in~\eqref{def:G_1}.
\end{lemma}

\begin{proof}
    Fix $t$, and consider a point $z \in \cZ_t$.~Within this proof, we denote $q\inv_t(z)$ by $\zeta$.~Let $\zeta$ be of level $\ell$, and note that $\zeta$ is active at time $t$.~Let $z\up$ be an arbitrary point in $\zeta$.~We want to get a high probability bound on $\norm{\wp_{\cZ_t \to \cS_t,\hat{p}_t}(z,\cdot) - \wp_{\cS \times \cA \to \cS_t,p}(z\up,\cdot)}_1$.~We have,
    \begin{align}
        &\norm{\wp_{\cZ_t \to \cS_t, \hat{p}_t}(z,\cdot) - \wp_{\cS \times \cA \to \cS_t,p}(z\up,\cdot)}_1 \notag\\
        = & \norm{\hat{p}_t(z,\cdot) - \bar{\wp}_{\cS \times \cA \to \cS_t,p}(z\up,\cdot)}_{TV} \notag\\
        \leq & \norm{\hat{p}_t(z,\cdot) - \bar{\wp}_{\cS \times \cA \to \cS\uc{\ell},p}(z\up,\cdot)}_{TV} + \norm{\bar{\wp}_{\cS \times \cA \to \cS\uc{\ell},p}(z\up,\cdot) - \bar{\wp}_{\cS \times \cA \to \cS_t,p}(z\up,\cdot)}_{TV} \notag\\
        \leq & \norm{\hat{p}\uc{d}_t(z,\cdot) - \wp_{\cS \times \cA \to \cS\uc{\ell},p}(z\up,\cdot)}_{1} + \norm{\bar{\wp}_{\cS \times \cA \to \cS\uc{\ell},p}(z\up,\cdot) - \bar{\wp}_{\cS \times \cA \to \cS_t,p}(z\up,\cdot)}_{TV}. \label{eq:con_ineq_decomp}
    \end{align}
    By definition, $\cQ_t$ is a finer partition of $\cS$ than $\cQ\uc{\ell}$.~Hence, from Lemma~\ref{lem:disc_dist}, we have that \nal{\norm{\bar{\wp}_{\cS \times \cA \to \cS\uc{\ell},p}(z\up,\cdot) - \bar{\wp}_{\cS \times \cA \to \cS_t,p}(z\up,\cdot)}_{TV} \leq C_p~\diamc{\zeta}.}
    
    Next, we will provide a high probability upper bound on the first term on the r.h.s.\ of~\eqref{eq:con_ineq_decomp}.~We will denote $\wp_{\cS \times \cA \to \cS\uc{\ell},p}(z\up,\cdot)$ by $p\uc{d}_t(z\up,\cdot)$ in order to simplify the notation.~Note that both $\hat{p}\uc{d}_t(z,\cdot)$ and $p\uc{d}_t(z\up,\cdot)$ have the support $\tilde{\cS}_t(z)$, where $|\tilde{\cS}_t(z)| \leq d^{\frac{d_\cS}{2}} \diamc{\zeta}^{-d_\cS}$.~Let $\tilde{\cS}^+_t(z)$ denote the set of those points $s \in \cS_t$ for which $\hat{p}\uc{d}_t(z,s) - p\uc{d}_t(z\up,s) > 0$.~So, we can write the following:
    \begin{align}
        \bP\br{\norm{\hat{p}\uc{d}_t(z,\cdot) - p\uc{d}_t(z\up,\cdot)}_1 \geq \iota} &= \bP\br{\max_{\cS\up \subset \tilde{\cS}^+_t(z)}{\sum_{s \in \cS\up}{\hat{p}\uc{d}_t(z,s) - p\uc{d}_t(z\up,s)}} \geq \frac{\iota}{2}} \notag\\
        &= \bP\br{\cup_{\cS\up \subset \tilde{\cS}^+_t(z)}{\flbr{\sum_{s \in \cS\up}{\hat{p}\uc{d}_t(z,s) - p\uc{d}_t(z\up,s) \geq \frac{\iota}{2}}}}}.\label{cineq:union}
    \end{align}
    Note that if $\cS\up \subset \tilde{\cS}^+_t(z)$, then $\tilde{\cS}_t(z) \setminus \cS\up \not\subset \tilde{\cS}^+_t(z)$. Hence the number of subsets of $\tilde{\cS}^+_t(z)$ is at most $2^{|\tilde{\cS}_t(z)|-1}$.~If $\bP\br{\sum_{s \in \cS\up}{\hat{p}\uc{d}_t(z,s) - p\uc{d}_t(z\up,s) \geq \frac{\iota}{2}}} \leq b_\iota,~\forall \cS\up \subset \tilde{\cS}^+_t(z)$, then by an application of union bound in~\eqref{cineq:union}, we obtain that the following must hold,
    \begin{align}
        \bP\br{\norm{\hat{p}\uc{d}_t(z,\cdot) - p\uc{d}_t(z\up,\cdot)}_1 \geq \iota} \leq 2^{|\tilde{\cS}_t(z)|-1} b_\iota.\label{ineq:l1byunion}
    \end{align}
    Consider a fixed $\xi \subseteq \cS$.~Define the following random processes,
    \begin{align}
        v_i(z) &:= \ind{(s_i, a_i) \in \zeta_i},\\
        v_i(z,\xi) &:= \ind{(s_i, a_i, s_{i+1}) \in \zeta_i \times \xi},\\
        w_i(z,\xi) &:= v_i(z,\xi) - p(s_i,a_i,\xi) v_i(z),
    \end{align}
    where $i = 0, 1, \ldots, T-1$.~Let $\cS\up \subset \tilde{\cS}^+_t(z)$ and $\xi = \cup_{s \in \cS\up}{q\inv(s)}$. Then we have,
    \begin{align}
        \sum_{s \in \cS\up}{\hat{p}\uc{d}_t(z,s) - p\uc{d}_t(z\up,s)} &= \frac{N_t\br{\zeta, \xi}}{N_t\br{\zeta}} - p(z\up,\xi) \notag\\
        &= \frac{N_t\br{\zeta, \xi} - p(z\up,\xi) N_t\br{\zeta}}{N_t\br{\zeta}} \notag\\
        &\leq \frac{1}{N_t\br{\zeta}}\br{\sum_{i = 0}^{t - 1}{w_i(z,\xi)}} + \frac{L_p}{2 N_t\br{\zeta}} \sum_{i=0}^{N_t(\zeta)}{\diamc{\zeta_{t_i}}}\notag\\
        &\leq \frac{1}{N_t\br{\zeta}}\br{\sum_{i = 0}^{t - 1}{w_i(z,\xi)}} + 1.5 L_p~ \diamc{\zeta}, \label{ineq:determ}
    \end{align}
    where the last step follows from Lemma~\ref{lem:avg_diam}.~Note that $\flbr{w_i(z,\xi)}_{i \in [T-1]}$ is a martingale difference sequence w.r.t.\ $\flbr{\cF_i}_{i \in [T-1]}$.~Moreover, $\abs{w_i(z,\xi)} \leq 1$. Hence, from Lemma~\ref{lem:ah_ineq}, we have,
    \begin{align*}
        \bP\br{\flbr{\frac{\sum_{i=0}^{t-1}{w_i(z,\xi)}}{N_t\br{\zeta}} \geq \sqrt{\frac{2}{N_t(\zeta)} \log{\br{\frac{3}{\delta}}}}, N_t(\zeta) = N}} \leq \frac{\delta}{3}.
    \end{align*}
    Upon combining this with~\eqref{ineq:determ} we get,
    \begin{align*}
        \bP\br{\flbr{\sum_{s \in \cS\up}{\hat{p}\uc{d}_t(z,s) - p\uc{d}_t(z\up,s)} \geq \sqrt{\frac{2}{N_t(\zeta)} \log{\br{\frac{3}{\delta}}}} + 1.5 L_p~ \diamc{\zeta}, N_t(\zeta) = N}}  \leq \frac{\delta}{3}.
    \end{align*}
    Upon using~\eqref{ineq:l1byunion} in the above, and taking a union bound over all possible values of $N$, we obtain,
    \begin{align*}
        \bP\br{\flbr{\norm{\hat{p}\uc{d}_t(z,\cdot) - p\uc{d}_t(z\up,\cdot)}_1 \geq \sqrt{\frac{2 |\tilde{\cS}_t(z)|}{N_t(\zeta)} \log{\br{\frac{3T}{\delta}}}} + 3 L_p~ \diamc{\zeta}, N_t(\zeta) = N}} \leq \frac{\delta}{3}.
    \end{align*}
    Note that we do not have to take a union over all possible values of $\tilde{\cS}_t(z)$ because of the one-to-one correspondence between $N_t(\zeta)$ and $\tilde{\cS}_t(z)$.~Replacing $|\tilde{\cS}_t(z)|$ by its upper-bound $d^{\frac{d_\cS}{2}} \diamc{\zeta}^{-d_\cS}$, we have,
    \begin{align}
        \bP\br{\norm{\hat{p}\uc{d}_t(z,\cdot) - p\uc{d}_t(z\up,\cdot)}_1  \geq \diamc{\zeta}^{-\frac{d_\cS}{2}} \sqrt{\frac{2~d^{\frac{d_\cS}{2}} \log{\br{\frac{3 T}{\delta}}}}{N_t(\zeta)}} + 3 L_p~ \diamc{\zeta}} \leq \frac{\delta}{3}.
    \end{align}
    Let $\cN_1 := 2 d^\frac{d}{2} \br{\frac{T}{c_a \log{\br{T/\delta}}}}^\frac{d}{d_\cS + 2}$, which is the number of cells the \algo~can activate under all sample paths.~Upon taking a union bound over all the cells that could possibly be activated in all possible sample paths at some $t$ and using the fact that $N_t(\zeta) \geq N_{\min}(\zeta)$, the above inequality yields that with probability at least $1 - \frac{\delta}{3}$, the following holds,
    \begin{align}
        \norm{\hat{p}\uc{d}_t(z,\cdot) - p\uc{d}_t(z\up,\cdot)}_1 &\leq 3~\br{\frac{c_a \log{\br{\frac{T}{\delta}}}}{N_t(\zeta)}}^\frac{1}{d_\cS + 2} + 3 L_p~ \diamc{\zeta},\label{ineq:mu_z}
    \end{align}
    for every $z \in \zeta$, $\zeta \in \cP_t$, and $t \in \{0,1, \ldots, T-1\}$, where $c_a$ is a constant that satisfies
    \al{
        d^{\frac{d_\cS}{2}} \log{\br{\frac{3 T \cN_1}{\delta}}} \leq 4.5 c_a \log{\br{\frac{T}{\delta}}}. \label{def:ca}
    }
    After some algebraic manipulation, we obtain that it suffices to have,
    \nal{
        c_a = \frac{2 d^{\frac{d_\cS}{2}}}{9} \frac{\log{\br{6 d^\frac{d}{2}}}}{\log{\br{\frac{T}{\delta}}}} + \frac{d}{d_\cS+2} + 1.
    }
    The proof follows upon combining the upper-bounds of the first and the second terms of \eqref{eq:con_ineq_decomp}.
\end{proof}

\begin{remark}\label{rem:conc_ineq}
    Observe that $\cG_1 \subseteq \cap_{t=0}^{T-1}\{\wp_{\cS_t \times \cA_t \to \cS_t, p}(\cdot, \cdot) \in \cC_t\}$, where $\cC_t$ is as defined in~\eqref{def:confball}. Hence, by Lemma~\ref{lem:conc_ineq},
    \nal{
        \bP\br{\cap_{t=0}^{T-1}\{\wp_{\cS_t \times \cA_t \to \cS_t, p} \in \cC_t\}} \geq 1 - \frac{\delta}{3}.
    }
\end{remark}