\section{Auxiliary Results}\label{app:aux_res}

In this section, we derive some useful properties of the algorithm that are used in the proof of the regret upper bound.~The first lemma shows that for any active cell $\zeta$ at time $t$, the quantity $\frac{1}{N_t(\zeta)}{\sum_{i=1}^{N_t(\zeta)}{\diamc{\zeta_{t_i}}}}$ is bounded above by $3~\diamc{\zeta}$.~We use this in the concentration inequality for the transition kernel estimate.

\begin{lemma}\label{lem:avg_diam}
    For all $t \in [T-1]$ and $\zeta \in \cP_t$, let $t_i$ denote the time instance when $\zeta$ or any of its ancestors was visited by \algo~for the $i$-th time. Then
    \begin{align*}
        \frac{1}{N_t(\zeta)} \sum_{i=1}^{N_t(\zeta)}{\diamc{\zeta_{t_i}}} \leq 3~ \diamc{\zeta}.
    \end{align*}
\end{lemma}
\begin{proof}
    By the activation rule~\eqref{def:activationrule}, a cell $\zeta\up$ can be played at most $N_{\max}(\zeta\up) - N_{\min}(\zeta\up) = \tilde{c}_a 2^{2\ell(\zeta\up)} + \frac{\tilde{c}_a}{3} \ind{\zeta\up = \cS \times \cA}$ times while being active, where $\tilde{c}_a = 3 c_a d\inv \log{\br{\frac{T}{\eps \delta}}}~ \eps^{-d_\cS}$. We can write,
    \begin{align*}
        \frac{1}{N_t(\zeta)} \sum_{i=1}^{N_t(\zeta)}{\diamc{\zeta_{t_i}}} &= \frac{1}{N_t(\zeta)} \sum_{i=1}^{N_{\min}(\zeta)}{\diamc{\zeta_{t_i}}} + \frac{1}{N_t(\zeta)} \sum_{i=N_{\min}(\zeta)+1}^{N_t(\zeta)}{\diamc{\zeta_{t_i}}}\\
        &= \frac{\tilde{c}_a \sqrt{d}}{3 N_t(\zeta)} + \frac{\tilde{c}_a \sqrt{d}}{N_t(\zeta)} \sum_{\ell = 0}^{\ell(\zeta) - 1}{2^\ell} +  \frac{N_t(\zeta) - N_{\min}(\zeta) - 1}{N_t(\zeta)} \diamc{\zeta} \\
        &< \frac{\tilde{c}_a \sqrt{d}}{N_t(\zeta)} 2^{\ell(\zeta)} + \frac{N_t(\zeta) - N_{\min}(\zeta) - 1}{N_t(\zeta)} \diamc{\zeta} \\
        &= \frac{3 N_{\min}(\zeta)}{N_t(\zeta)} \diamc{\zeta} + \frac{N_t(\zeta) - N_{\min}(\zeta) - 1}{N_t(\zeta)} \diamc{\zeta}\\
        &=\frac{(N_t(\zeta) + 2 N_{\min}(\zeta) - 1)~ \diamc{\zeta}}{N_t(\zeta)}\\
        &\leq 3~ \diamc{\zeta},
    \end{align*}
    where the last step is due to the fact that $N_{\min}(\zeta) \leq N_t(\zeta)$.
\end{proof}
Next, we show that under Assumption~\ref{assum:bdd_der}, the total variation distance between $\bar{\wp}_{\cS \times \cA \to \cS_t,p}(z,\cdot)$ and $\bar{\wp}_{\cS \times \cA \to \cS\uc{\ell},p}(z,\cdot)$ is bounded above by a constant multiple of the discretization width of the partition $\cQ\uc{\ell}$.~We use this result in Lemma~\ref{lem:conc_ineq}.
\begin{lemma}\label{lem:disc_dist}
    Fix any state-action pair $z$ and time $t$, and let $\ell = \ell(q_t\inv(z))$. Recall the distributions $\bar{\wp}_{\cS \times \cA \to \cS_t,p}(z,\cdot)$ and $\bar{\wp}_{\cS \times \cA \to \cS\uc{\ell},p}(z,\cdot)$ from Lemma~\ref{lem:conc_ineq}. Under Assumption~\ref{assum:bdd_der}, we have that
    \begin{align*}
        \norm{\bar{\wp}_{\cS \times \cA \to \cS\uc{\ell},p}(z,\cdot) - \bar{\wp}_{\cS \times \cA \to \cS_t,p}(z,\cdot)}_{TV} \leq C_p \sqrt{d}~ 2^{-\ell}
    \end{align*}
    for every $z \in \cS \times \cA$.
\end{lemma}
\begin{proof}
    Recall that $\cS_t$ is the set of representative points of $\cQ_t$ and that $\cQ\uc{\ell}$ is a coarser partition of $\cS$ than $\cQ_t$. Let us fix $\xi \in \cQ\uc{\ell}$, and let us denote the Radon--Nikodym derivative of the distribution $p(z, \cdot)$ with respect to $\lambda$ by $f$. Let $\bar{f} = p(z,\xi)/\lambda(\xi)$.~We have,
    \begin{align*}
        \sup_{B \subseteq \xi}{\abs{\bar{\wp}_{\cS \times \cA \to \cS\uc{\ell},p}(z, B) - p(z, B)}} &\leq \int_{\xi}{(f - \bar{f}) \ind{f \geq \bar{f}} d\lambda}\\
        &\leq \int_{\xi}{(\bar{f} + C_p \sqrt{d} \eps) \ind{f \geq \bar{f}} d\lambda} - \int_{\xi}{\bar{f} \ind{f \geq \bar{f}} d\lambda}\\
        &\leq C_p \sqrt{d} \eps \times \eps^{d_\cS},
    \end{align*}
    where $\eps = 2^{-\ell}$. Hence, by Assumption~\ref{assum:bdd_der}, we have that for every $z \in \cS \times \cA$ and for every $\xi \in \cQ\uc{\ell}$,
    \begin{align*}
        \sup_{B \subseteq \xi}{\abs{\bar{\wp}_{\cS \times \cA \to \cS\uc{\ell},p}(z,B) - \bar{\wp}_{\cS \times \cA \to \cS_t,p}(z, B)}} &\leq \sup_{B \subseteq \xi}{\abs{\bar{\wp}_{\cS \times \cA \to \cS\uc{\ell},p}(z,B) - p(z, B)}}\\
        &\leq C_p \sqrt{d} \eps \times \eps^{d_\cS}.
    \end{align*}
   As $\cQ\uc{\ell}$ is coarser than $\cQ_t$, it follows that
   \begin{align*}
        \norm{\bar{\wp}_{\cS \times \cA \to \cS\uc{\ell},p}(z,\cdot) - \bar{\wp}_{\cS \times \cA \to \cS_t,p}(z,\cdot)}_{TV} &\leq \sum_{\xi \in \cQ\uc{\ell}}{\sup_{B \subseteq \xi}{\abs{\bar{\wp}_{\cS \times \cA \to \cS\uc{\ell},p}(z,B) - p(z, B)}}}\\
        &\leq C_p \sqrt{d} \eps \times \eps^{d_\cS} \times \eps^{-d_\cS}\\
        &= C_p \sqrt{d} \eps.
    \end{align*}
    Hence, we have proven the claim.
\end{proof}