\section{Guarantee on Number of Visits to Cells}\label{app:visits}
Recall that $\mu^{(t)}_{\phi,p,s}$ denotes the distribution of $s_t$ when policy $\phi$ is applied to the MDP that has the transition kernel $p$ and the initial state is $s$, and $\mu^{(\infty)}_{\phi,p}$ denotes the unique invariant distribution of the Markov chain induced by the policy $\phi$ on the MDP with transition kernel $p$.~Consider an $\cS$-cell $\xi$ for which the diameter is greater than $\eps$, and $\mu^{(\infty)}_{\phi,p}(\xi) \geq (\eps/3)^{d_\cS + 1}$ for all stationary deterministic policies $\phi$, where $\eps>0$.~Later we will choose an appropriate value for $\eps$.~From Assumption~\ref{assum:unif_ergodic} we get that for all $\phi \in \Phi_{SD}$ and for every initial state $s \in \cS$ we have,
\begin{align*}
    \mu^{(t)}_{\phi,p,s}(\xi) \geq \mu^{(\infty)}_{\phi,p}(\xi) - \frac{C}{2} \alpha^t.
\end{align*}
Since $\mu^{(\infty)}_{\phi,p}(\xi) \geq (\eps/3)^{d_\cS + 1}$, we have
\al{
    \mu^{(t)}_{\phi,p,s}(\xi) \geq \frac{1}{2} \mu^{(\infty)}_{\phi,p}(\xi),~\forall t \geq t\ust(\eps),\label{def:t_star_1}
}
where,
\al{
t\ust(\eps) := \ceil{\log_{\frac{1}{\alpha}}{\br{C\br{\frac{3}{\eps}}^{d_\cS + 1}}}}. \label{def:t_star_2}
}


\begin{lemma}\label{lem:lb_n_epi}
    Fix $k \in \bN$ and consider an $\cS$-cell $\xi \in \cQ_{\tau_k}$ such that $\mu^{(\infty)}_{\phi,p}(\xi) \geq (\eps/3)^{d_\cS + 1}$. Let $\zeta \in \cP_{\tau_k}$ denote the active cell that contains $\left\{(s,\phi_k(s))\right\}_{s \in \xi}$.~Let $n_k(\zeta)$ be the number of visits to $\zeta$ in the $k$-th episode, and $H_k$ be the duration of the $k$-th episode. Then, with probability at least $1 - \frac{\delta}{3}$, we have,
    \begin{align*}
        n_k(\zeta) \geq \frac{H_k~ \mu^{(\infty)}_{\phi,p}(\xi)}{2 t\ust(\eps)} - \sqrt{\frac{H_k}{t\ust(\eps)} \log{\br{\frac{6 T}{t\ust(\eps) \delta}}}} - 1.
    \end{align*}
\end{lemma}
\begin{proof}
    Denote $m := \floor{H_k / t\ust(\eps)}$ and $t_i := \tau_k + i ~t\ust(\eps)$. Let $i\ust \in \{0\}\cup\bN$ be such that $t_{i\ust} \leq T < t_{i\ust+1}$. Define the following martingale difference sequence $\{b_i\}_i$ w.r.t. the filtration $\{\cF_{t_i}\}_i$,
    \begin{align*}
        b_i := \ind{s_{t_i} \in \xi} - \bE\sqbr{\ind{s_{t_i} \in \xi} \mid \cF_{t_{i-1}}},~i=1,2,\ldots,i\ust.
    \end{align*}
    Also, define
    \begin{align*}
        g_i := \ind{(i-1) t\ust(\eps) \leq H_k},~i=1,2,\ldots,i\ust,
    \end{align*}
    and note that it is a $\{\cF_{t_i}\}_i$-predictable sequence. It can be shown that $b_i$'s are conditionally $\frac{1}{2}$ sub-Gaussian, i.e., $\bE[\exp(\beta~ b_i)\mid \cF_{t_{i-1}}] \leq \exp(\beta^2/8)$~\citep{raginsky2013concentration}. Also, note that $\{g_i\}_i$ is a $\{0,1\}$-valued, $\{\cF_{t_i}\}$-predictable stochastic process. Hence, we can use Corollary~\ref{cor:self_norm_vec} and obtain,
    \begin{align}\label{ineq:7}
        \bP\br{\sum_{i=1}^{m+1}{\ind{s_{t_i} \in \xi}} \leq \sum_{i=1}^{m+1}{\bE\sqbr{\ind{s_{t_i} \in \xi} \mid \cF_{t_{i-1}}}} - \sqrt{\frac{m+2}{2} \log{\br{\frac{3(m+2)}{\delta}}}}} \leq \frac{\delta}{3}.
    \end{align}
    From~\eqref{def:t_star_1},~\eqref{def:t_star_2} we have that
    \al{
        \bE\sqbr{\ind{s_{t_i} \in \xi} \mid \cF_{t_{i-1}}} \geq \frac{1}{2} \mu^{(\infty)}_{\phi,p}(\xi).\label{ineq:8}
        }
    Also, observe that $m + 1 > \frac{H_k}{t\ust(\eps)}$ and $m \leq \frac{H_k}{t\ust(\eps)}$. Since under \algo~algorithm we have $H_k \geq 2 t\ust(\eps)$, we get $m+2 \leq 2m$. Upon using~\eqref{ineq:8} and $m+2 \leq 2m$ in~\eqref{ineq:7}, we obtain,
    \begin{align*}
        \bP\br{\sum_{i=1}^{m}{\ind{s_{t_i} \in \xi}} \leq \frac{H_k~ \mu^{(\infty)}_{\phi,p}(\xi)}{2 t\ust(\eps)} - \sqrt{\frac{H_k}{t\ust(\eps)} \log{\br{\frac{6 H_k}{t\ust(\eps) \delta}}}}- 1} \leq \frac{\delta}{3}.
    \end{align*}
    The claim then follows since $H_k \leq T$, and $\sum_{i=1}^{m}{\ind{s_{t_i} \in \xi}} \leq n_k(\zeta)$.
\end{proof}

\begin{cor}\label{cor:G_2}
    Fix an $\eps > 0$. Consider the triplet $(k, \xi, \zeta)$ such that $k \in \{0\}\cup\bN$, $\xi \in \cQ_{\tau_k}$, $\diamc{\xi} \geq \eps$, $\mu\uc{\infty}_{\phi,p}(\xi) \geq (\eps/3)^{d_\cS + 1}$, $\zeta \in \cP_{\tau_k}$, and for every $s \in \xi$, $(s,\phi_k(s)) \in \zeta$.~Define the event,
    \begin{align}
        \cG_{2,\eps}:= \flbr{n_k(\zeta) \geq \frac{H_k~ \mu^{(\infty)}_{\phi,p}(\xi)}{2 t\ust(\eps)} - \sqrt{\frac{H_k}{t\ust(\eps)} \log{\br{\frac{12 T^2 d^{\frac{d}{2}}}{t\ust(\eps) \eps^d \delta}}}} - 1,~ \forall (k,\xi,\zeta) \mbox{ that satisfy the above conditions}}, \label{def:G2}
    \end{align}
    where $t\ust(\eps) = \ceil{\log_{\frac{1}{\alpha}}{\br{C\br{\frac{3}{\eps}}^{d_\cS + 1}}}}$. We have, $\bP(\cG_{2,\eps}) \geq 1 - \frac{\delta}{3}$.
\end{cor}
\begin{proof}
    Since $k$ denotes the episode number, it cannot exceed $T$. By definition of $\cP_{\tau_k}$ and $\cQ_{\tau_k}$, $\diamc{\zeta} \geq \diamc{\xi}$. Also, the number of cells that have a diameter greater than $\eps$ is less than $(\sqrt{d}/\eps)^d$. So, the total number of possible combinations of $(k, \xi, \zeta)$ that satisfy the given conditions is at most $T (\sqrt{d}/\eps)^d$.~The proof then follows from Lemma~\ref{lem:lb_n_epi} by taking a union bound over all $(k, \xi, \zeta)$ and by the fact that $H_k \leq T$.
\end{proof}