\section{Properties of Extended Value Iteration~(EVI) and Extended Policy Evaluation~(EPE)}\label{app:prop_evi_epe}
We recall the definition of the extended MDP at time $t$ that was discussed in Section~\ref{sec:algo},
\begin{align*}
    \cM^+_t = \{(\cS_t, \cA_t,\tilde{p}, \tilde{r}_t) : \tilde{p} \in \cC_t\}, 
\end{align*}
where $\cS_t$ and $\cA_t$ are the discretized state and action spaces, respectively, at time $t$, while $\tilde{r}_t$ is the discretized reward function with an additional bonus term, and $\cC_t$ is a set of plausible discrete transition kernels.~Note that \algo~calls the \evi~subroutine~(Algorithm~\ref{algo:evi}) with a parameter $\gamma$ that specifies the desired accuracy; when called with accuracy parameter $\gamma$, \evi~returns a policy that is $\gamma$-optimal for the extended MDP.~We begin by introducing some notation. For $\phi \in \Phi_t$, $J_{\cM^+_t}(\phi)$ denotes the value of the policy $\phi$ evaluated on the extended MDP $\cM^+_t$. To be precise, this is the optimal average reward when the control action for the extended MDP is chosen according to the policy $\phi$, and the kernel is chosen so as to maximize the average reward.~The next result is similar in spirit to~\citet[Theorem 7]{jaksch2010near}.

\begin{lemma}\label{lem:conv_evi}
    Fix a time $t \in \bN$.~Consider the extended MDP $\cM^+_t$ and the corresponding \evi~iterates:
    \begin{align}
        v_0(s) &= 0, \notag\\
        v_{n+1}(s) &= \max_{\substack{a \in \cA_t(s)\\ \te \in \cC_t}} \flbr{\tilde{r}_t(s,a) + \sum_{s\up \in \cS_t}{\te(s,a,s\up) v_n(s\up)}},~\forall s \in \cS_t, n \in \bN. \label{iter:evi}
    \end{align}
    Then,
    \begin{align*}
        \lim_{n \to \infty}{\br{v_{n+1}(s) - v_n(s)}} = J\ust_{\cM^{+}_t}.
    \end{align*}
    Moreover, whenever $\spn{v_{n+1} - v_{n}} \leq \gamma$, the policy that chooses actions greedily w.r.t.\ $v_n$ is $\gamma$-optimal.
\end{lemma}
\begin{proof}
    Consider the $n$-th step of the \evi~iteration, and let the action $a_n(s)$ and the kernel $\te_n$ maximize the r.h.s. of \eqref{iter:evi}, i.e.,
    \nal{
        (a_n(s), \te_n) \in \underset{\substack{a \in \cA_t(s)\\ \te \in \cC_t}}{\arg\max} \flbr{\tilde{r}_t(s,a) + \sum_{s\up \in \cS_t}{\te(s,a,s\up) v_n(s\up)}}, \mbox{ for every } s \in \cS_t.
    }
    Let $s\ust \in \arg\max_{s \in \cS_t}{v_n}(s)$. Then, $\te_n(s,a_n(s),\cdot)$ has to be chosen from the set $\cC_t$ in such a manner that one assigns the maximum possible probability to the state $s\ust$. Thus, we must have $\te_n(s, a_n(s), s\ust) \geq \min{\flbr{1, \frac{1}{2}\eta_t(q_t\inv(s,a_n(s)))}}$, where $q_t\inv(s,a_n(s))$ is the active cell at time $t$ that contains $(s,a_n(s))$. Since $\eta_t(q_t\inv(s,a_n(s))) > 0$ for all $s \in \cS_t$, it follows that $\te_n(s\ust, a_n(s\ust), s\ust) > 0$. Since $s\ust$ is reachable from every state in a single step and has a self-loop, the associated Markov chain is aperiodic. The first claim then follows from \citet[Theorem $9.4.4$]{puterman2014markov}. The second claim follows from \citet[Theorem 8.5.6]{puterman2014markov}.
    %~Now, construct the stochastic matrix $\Te = \{\Te_{s,s\up}\}_{s,s\up\in \cS_t}$ by letting $\Te_{s,s\up} = \te_i(s, s\up)$. 
\end{proof}

The next result follows from the previous one. It establishes the convergence of the \epe~algorithm~(Algorithm~\ref{algo:epe}), and also derives the gap between the true value of a policy and that returned by \epe.
\begin{cor}\label{cor:conv_epe}
    Fix a time $t \in \bN$.~Recall the extended MDP $\cM^{d,+}_{t} = \{(\cS_t, \cA_t, \tilde{p}, d_t) : \tilde{p} \in \cC_t\}$, where
    \begin{align*}
        d_t(s,a) = \diamc{q\inv_t(s,a)},~\forall (s,a) \in \cS_t \times \cA_t,
    \end{align*}
    and consider a policy $\phi \in \Phi_t$ and the corresponding \epe~iterates:
    \begin{align}
        g^\phi_0(s) &= 0, \notag\\
        g^\phi_{n+1}(s) &= \max_{\te \in \cC_t} \flbr{d_t(s,\phi(s)) + \sum_{s\up \in \cS_t}{\te(s,\phi(s),s\up) g^\phi_n(s\up)}},~\forall s \in \cS_t, n \in \bN. \label{iter:epe}
    \end{align}
    Then
    \begin{align*}
        \lim_{n \to \infty}{\br{g^\phi_{n+1}(s) - g^\phi_n(s)}} = \pdiam{t}{\phi}.
    \end{align*}
    Moreover, when $\spn{g^\phi_{n+1} - g^\phi_{n}} \leq \gamma (g^\phi_{n+1}(s\lst) - g^\phi_{n}(s\lst))$, i.e., the stopping criterion is met, then $(g^\phi_{n+1}(s\lst) - g^\phi_{n}(s\lst))$ satisfies the following:
    \begin{align}
        \frac{\pdiam{t}{\phi}}{1 + \gamma} \leq (g^\phi_{n+1}(s\lst) - g^\phi_{n}(s\lst)) \leq \frac{\pdiam{t}{\phi}}{1 - \gamma}. \label{bd:err_epe}
    \end{align}
\end{cor}

\begin{proof}
    Similar to the proof of Lemma~\ref{lem:conv_evi}, one can show that the transition kernels which maximize the r.h.s.\ in every iteration of \epe~\eqref{iter:epe} are aperiodic. The convergence of \epe~then follows from~\citet[Theorem $9.4.4$]{puterman2014markov}.~From \citet[Theorem 8.5.6]{puterman2014markov}, it follows that
    \begin{align*}
        \abs{(g^\phi_{n+1}(s\lst) - g^\phi_{n}(s\lst)) - \pdiam{t}{\phi}} \leq \gamma~ (g^\phi_{n+1}(s\lst) - g^\phi_{n}(s\lst)),
    \end{align*}
    or
    \begin{align*}
        g^\phi_{n+1}(s\lst) - g^\phi_{n}(s\lst) \leq \frac{\pdiam{t}{\phi}}{1 - \gamma}, \mbox{ and }
        g^\phi_{n+1}(s\lst) - g^\phi_{n}(s\lst) \geq \frac{\pdiam{t}{\phi}}{1 + \gamma}.
    \end{align*}
    This concludes the proof.
\end{proof}

\begin{remark}[Upper and lower bounds on episode duration]\label{rem:bdd_hk}
Let $d_k = \epe(\cM^{d,+}_{\tau_k}, \tilde{\phi}_k, \gamma, s\lst)$ be the value of the policy $\tilde{\phi}_k$ evaluated on $\cM^{d,+}_{\tau_k}$.~From Corollary~\ref{cor:conv_epe} we have,
    \begin{align*}
        \frac{\pdiam{\tau_k}{\tilde{\phi}_k}}{1 + \gamma} \leq d_k \leq \frac{\pdiam{\tau_k}{\tilde{\phi}_k}}{1 - \gamma}.
    \end{align*}
    As $H_k = \frac{C_H \log{\br{\frac{T}{\delta}}}}{d_k^{2(d_\cS + 1)}}$, we conclude that
    \begin{align}
        \frac{C_H (1 - \gamma)^{2(d_\cS + 1)} \log{\br{\frac{T}{\delta}}}}{\pdiam{\tau_k}{\tilde{\phi}_k}^{2(d_\cS + 1)}} \leq H_k \leq \frac{C_H (1 + \gamma)^{2(d_\cS + 1)} \log{\br{\frac{T}{\delta}}}}{\pdiam{\tau_k}{\tilde{\phi}_k}^{2(d_\cS + 1)}}.\label{bdd:hk}
    \end{align}
\end{remark}