\section{Properties of Extended MDP}\label{app:prop_emdp}
We present three results in this section.~We begin by showing that extended MDPs constructed by \algo~are optimistic, i.e., on the set $\cG_1$~\eqref{def:G_1}, the optimal average reward of the extended MDP $\cM^+_t$ is greater than or equal to the optimal average reward of the true MDP for all $t \in \{0,1,\ldots,T-1\}$.~Next, we show that the span of the \epe~iterates~\eqref{iter:v_epe} for the extended MDP $\cM^+_t$ and any $\phi \in \Phi_t$ is bounded for all $t \in \{0,1,\ldots,T-1\}$.~Lastly, we derive an upper-bound on the average reward of policy $\phi \in \Phi_t$ evaluated on MDP $\cM^+_t$ for every $t \in \{0,1,\ldots,T-1\}$.
\begin{lemma}[Optimism]\label{lem:optimism}
    On the set $\cG_1$, we have,
    \al{
        J\ust_{\cM^+_t} \geq J\ust_\cM, \mbox{ for every } t \in \{0,1,\ldots, T-1\},\label{ineq:optimism}
    }
    where $J\ust_{\cM^+_t}$ is the optimal average reward of the extended MDP~$\cM^+_t$, and $J\ust_\cM$ is the optimal average reward of the MDP $\cM$.
\end{lemma}
\begin{proof}
    Consider the value iteration algorithm applied to the MDP $\cM$. For every $s \in \cS$,
    \begin{align}
        V_0(s) &= 0, \notag\\
        V_{n+1}(s) &=  \max_{a \in \cA}\Big\{r(s,a) + \int_{\cS}{p(s, a, s\up) V_n(s\up) d s\up}\Big\},~\forall n \in \bN. \label{iter:vi}
    \end{align}
    We assumed that $\cM$ is uniformly ergodic in Assumption~\ref{assum:unif_ergodic}, and hence the above value iteration algorithm converges, i.e., $\lim_{n \to \infty}{\br{V_{n+1}(s) - V_n(s)}} = J\ust_\cM$ for every $s \in \cS$.~Also, it follows from~\citep{hernandez2012adaptive} that $\lim_{n \to \infty} |V_n(s) - (n J\ust_\cM + h_\cM(s)) |=0$ for every $s \in \cS$.~Since we have shown in Lemma~\ref{lem:bdd_rvf_spn} that $h_\cM$ is bounded, it then follows that 
    \al{
    \lim_{n \to \infty}{\frac{1}{n}V_n(s)} = J\ust_\cM,~\forall s\in \cS. \label{eq:V/n=j}
    }
    
    We will prove that $V_n(s\up) \leq v_n(s)$ for every $n \in \bN$, $s \in \cS_t$ and $s\up \in q\inv(s)$. We prove this via induction. The base case, i.e. $n=0$ is seen to hold trivially. Next, assume that the following hold for all $i \in [n]$, where $n \in \bN$,
    \begin{align}\label{ineq:opt_ind_hyp}
        v_i(s) &\geq V_i(s\up),~\forall s \in \cS_t,~\forall s\up \in q\inv(s).
    \end{align}
    Consider a state-action pair $(s,a) \in \cS \times \cA$ and let $\ts \in \cS_t$ be such that $s \in q\inv(\ts)$.~Then,
    \begingroup
    \allowdisplaybreaks
    \begin{align}
        r(s,a) + \int_{\cS}{p(s,a,s\up) V_n(s\up) ds\up} &\leq r(s,a) + \sum_{s\up \in \cS_t}{\wp_{\cS \times \cA \to \cS_t,p}(s,a,s\up) v_n(s\up)} \notag\\
        &\leq r(q(\zeta)) + L_r \diamc{\zeta} + \sum_{s\up \in \cS_t}{\wp_{\cS \times \cA \to \cS_t,p}(s,a,s\up) v_n(s\up)} \notag\\
        & \leq \max_{\substack{\ta \in A_t(\ts)\\ \te \in \cC_t}}{\flbr{\tilde{r}_t(\ts,\ta) + \sum_{s\up \in \cS_t}{\te(\ts, \ta, s\up) v_n(s\up)}}} \notag\\
        &= v_{n+1}(\ts),\label{ineq:opt1}
    \end{align}
    \endgroup
    where the first inequality follows from~\eqref{ineq:opt_ind_hyp}, the second inequality follows from Assumption~\ref{assum:lip}~(i), while the third inequality follows from the definition of the set $\cG_1$.~Since we have shown the above inequality for an arbitrary action $a$, we get,
    \begin{align}
        V_{n+1}(s) &= \max_{a \in \cA}{\flbr{r(s,a) + \int_{\cS}{p(s,a,s\up) V_n(s\up) ds\up}}} \notag\\
        &\leq v_{n+1}(\ts).
    \end{align}
    This completes the induction argument. The proof is then completed by dividing both sides of this inequality by $n$ and then taking limit $n \to \infty$.
\end{proof}

\begin{lemma}\label{lem:bd_span_epe}
    Let $t \in \{0,1,\ldots,T-1\}$.~Consider the extended MDP $\cM^{+}_t$, a policy $\phi \in \Phi_t$ and the corresponding \epe~\eqref{algo:epe}~iterates:
    \begin{align}
        v^{\phi,t}_0(s) &= 0, \notag\\
        v^{\phi,t}_{n+1}(s) &= \max_{\te \in \cC_t} \flbr{\tilde{r}_t(s,\phi(s)) + \sum_{s\up \in \cS_t}{\te(s,\phi(s),s\up) v^{\phi,t}_n(s\up)}},~\forall s \in \cS_t, n \in \bN. \label{iter:v_epe}
    \end{align}
    On the set $\cG_1$, we have 
    \nal{
    \spn{v^{\phi,t}_n} \leq C_v,~\forall n \in \bN, t \in \bN,
    }
    where,
    \begin{align}
        C_v &:= \max{\flbr{\frac{\ovl{m} (\ovl{m} + 5)}{2} + \frac{3}{C \alpha^{\ovl{m}+1}} + \frac{4 \tilde{m}}{1 - \alpha}, \frac{\ceil{\log_{\br{\frac{1}{\alpha}}^{\tilde{m}\inv}}{\br{\frac{2}{\alpha}}} + 1}}{1 - \alpha^{\tilde{m}\inv}}}},\label{def:Cv}\\
        \ovl{m} &:= \ceil{\log_{\frac{1}{\alpha}}{\br{\frac{2C}{\kappa} \br{\frac{C_\eta \tilde{m} \sqrt{d}}{1-\alpha}}^{d_\cS}}}},\label{def:m_bar} \mbox{ and } \\
        \tilde{m} &:= \ceil{\log_{\frac{1}{\alpha}}\br{\frac{2C}{3\alpha - 1}}}. \label{def:tm}
    \end{align}
    $C$ and $\alpha$ are as in Assumption~\ref{assum:unif_ergodic}.
\end{lemma}
\begin{proof}
    We first note that $v^{\phi,t}_n(s)$ is the optimal value of the expected reward for the extended MDP $\cM_t^+$ that is accumulated during the first $n$ steps when the process starts in state $s$. The first component of the extended action of the extended MDP is taken to be policy $\phi$ and doesn't need to be optimized, while the second component is the transition kernel that maximizes the r.h.s. of \eqref{iter:v_epe} in every step $i \in \{0,1,\ldots,n-1\}$.~We consider the following two cases separately. 

    \textbf{Case 1:} When,
    \al{
    \max_{s \in \cS_t}\diamc{q_t\inv(s,\phi(s))}\geq \frac{1 - \alpha}{2 (3(1 + L_p) + C_p) \br{\tilde{m}+1}}. \label{cond:1}
    }
    Let $\zeta$ be the cell with the largest diameter from the set $\{q_t\inv(s,\phi(s)):~s\in \cS_t\}$.~We first show that $\{s_i\}_{i=0}^{\infty}$, the CMP induced by the transition kernel $p$ under the application of policy $\phi$, hits $\pi_\cS(\zeta)$ within 
    \begin{align*}
        \frac{\ovl{m} (\ovl{m} + 5)}{2} + \frac{3}{C \alpha^{\ovl{m}+1}}
    \end{align*}
    steps in expectation, where $\ovl{m}$ is as defined in~\eqref{def:m_bar}.~From Assumption~\ref{assum:unif_ergodic}, Assumption~\ref{assum:statn_dist} and~\eqref{cond:1}, we have that for any $s\up \in \cS$,
    \begin{align*}
        \mu\uc{i}_{\phi,p,s\up}(\pi_\cS(\zeta)) \geq \frac{1}{2}\mu\uc{\infty}_{\phi,p}(\pi_\cS(\zeta)), \mbox{ and } \mu\uc{i}_{\phi,p,s\up}(\pi_\cS(\zeta)) \leq \frac{3}{2}\mu\uc{\infty}_{\phi,p}(\pi_\cS(\zeta))~\forall i \geq \ovl{m}.
    \end{align*}
    Now, consider another process $\{x_i\}_{i=0}^{\infty}$ that is independent across time; $x_i$ assumes the value $1$ with a probability $\mu\uc{i}_{\phi,p,s\up}(\pi_\cS(\zeta))$, and $0$ with a probability $1 - \mu\uc{i}_{\phi,p,s\up}(\pi_\cS(\zeta))$.~Define the following random variables $T\uc{x}_{\{1\}}$ and $T\uc{s}_{\pi_\cS(\zeta),s\up}$,
    \begin{align*}
        T\uc{x}_{\{1\}} &:= \inf{\{i\geq 0 \mid x_i = 1\}}, \mbox{ and}\\
        T\uc{s}_{\pi_\cS(\zeta),s\up} &:= \inf{\{i\geq 0 \mid s_i \in \pi_\cS(\zeta), s_0 = s\up\}}.
    \end{align*}
    We note that the distributions of $T\uc{x}_{\{1\}}$ and $T\uc{s}_{\pi_\cS(\zeta),s\up}$ are identical, so that $\bE\sqbr{T\uc{x}_{\{1\}}} = \bE\sqbr{T\uc{s}_{\pi_\cS(\zeta),s\up}}$.~We derive an upper-bound on $\bE\sqbr{T\uc{x}_{\{1\}}}$, and this would also serve as the upper-bound on $\bE\sqbr{T\uc{s}_{\pi_\cS(\zeta),s\up}}$.~We have,
    \begin{align*}
        \bE\sqbr{T\uc{x}_{\{1\}}} &= \sum_{i=0}^{\infty}{i \cdot \mu\uc{i}_{\phi,p,s\up}(\pi_\cS(\zeta)) \prod_{j=0}^{i-1}{\br{1 - \mu\uc{j}_{\phi,p,s\up}(\pi_\cS(\zeta))}}} \\
        &\leq \frac{\ovl{m}(\ovl{m} -1)}{2} + \sum_{i=\ovl{m}}^{\infty}{\frac{3i}{2} \mu\uc{\infty}_{\phi,p}(\pi_\cS(\zeta)) \prod_{j=\ovl{m}}^{i-1}{\br{1 - \frac{1}{2}\mu\uc{\infty}_{\phi,p}(\pi_\cS(\zeta))}}} \\
        &\leq \frac{\ovl{m}(\ovl{m} -1)}{2} + \frac{3}{2}\mu\uc{\infty}_{\phi,p}(\pi_\cS(\zeta))\sum_{i=0}^{\infty}{i \br{1 - \frac{1}{2}\mu\uc{\infty}_{\phi,p}(\pi_\cS(\zeta))}^i} + \frac{3 \ovl{m}}{2}\mu\uc{\infty}_{\phi,p}(\pi_\cS(\zeta))\sum_{i=0}^{\infty}{\br{1 - \frac{1}{2}\mu\uc{\infty}_{\phi,p}(\pi_\cS(\zeta))}^i} \\
        &\leq \frac{\ovl{m} (\ovl{m} + 5)}{2} + \frac{6}{\mu\uc{\infty}_{\phi,p}(\pi_\cS(\zeta))}.
    \end{align*}
    Furthermore, from Assumption~\ref{assum:statn_dist}, and since $\bE\sqbr{T\uc{x}_{\{1\}}}=\bE\sqbr{T\uc{s}_{\pi_\cS(\zeta),s\up}}$, we get, 
    \begin{align*}
        \bE\sqbr{T\uc{s}_{\pi_\cS(\zeta),s\up}} \leq \frac{\ovl{m} (\ovl{m} + 5)}{2} + \frac{6}{\kappa} \br{\frac{\sqrt{d}}{\diamc{\zeta}}}^{d_\cS}.
    \end{align*}
    From \eqref{cond:1} we can write,
    \begin{align*}
        \bE\sqbr{T\uc{s}_{\pi_\cS(\zeta),s\up}} &\leq \frac{\ovl{m} (\ovl{m} + 5)}{2} + \frac{6}{\kappa} \br{\frac{(3(1 + L_p) + C_p) \sqrt{d} (\tilde{m} + 1)}{1 - \alpha}}^{d_\cS}  \\
        &\leq \frac{\ovl{m} (\ovl{m} + 5)}{2} + \frac{3}{C \alpha^{\ovl{m}+1}}.
    \end{align*}
    Next, consider two states $\ovl{s} \in \cS_t$, and $\tilde{s} \in q\inv(\ovl{s})$.~We note that on the set $\cG_1$, for the extended MDP $\cM_t^+$ whenever the state is $\ovl{s}$, there is an extended action such that the next state transition distribution is $p(\tilde{s},\phi(\tilde{s}),\cdot)$.~Hence, on the set $\cG_1$, there is a sequence of extended actions such that starting from any state, in expectation, within $\frac{\ovl{m} (\ovl{m} + 5)}{2} + \frac{3}{C \alpha^{\ovl{m}+1}}$ steps the process hits $q(\pi_\cS(\zeta))$ where $\pi_\cS(\zeta)$ is the $\cS$-projection of $\zeta$, the largest cell in $\{q_t\inv(s,\phi(s)):~s\in \cS_t\}$.
    
    Now, consider the process $\{s_t\}$ associated with the extended MDP, in which the initial state is $s \in \cS_t$.~We claim that for any state $s\up$, there exists a sequence of extended actions where the first components of the extended actions are chosen by $\phi$ such that $s\up$ can be reached in $\frac{2}{(3(1 + L_p) + C_p) ~\diamc{q_t\inv(s,\phi(s))}}$ steps in expectation.~This is true because there is a transition kernel in $\cC_t$ that assigns at least $\frac{3(1 + L_p) + C_p}{2} \diamc{q_t\inv(s,\phi(s))}$ transition probability to $s\up$ when the current state is $s$.~To summarize, starting from any state, using a sequence of actions, the state process can reach $q(\zeta)$ in $\frac{\ovl{m} (\ovl{m} + 5)}{2} + \frac{3}{C \alpha^{\ovl{m}+1}}$ steps in expectation, and from $q(\zeta)$, it can again reach any other state using a sequence of actions in $\frac{2}{(3(1 + L_p) + C_p) \diamc{\zeta}}$ steps in expectation.~Therefore, there cannot be a state $s\up$ such that 
    \nal{
    \max_{s \in \cS_t}{v^{\phi,t}_n(s)} > v^{\phi,t}_n(s\up) + \frac{\ovl{m} (\ovl{m} + 5)}{2} + \frac{3}{C \alpha^{\ovl{m}+1}} + \frac{2}{(3(1 + L_p) + C_p) \diamc{\zeta}}.
    }
    Now, from the lower-bound on $\diamc{\zeta}$~\eqref{cond:1}, we obtain that
    \begin{align}
        \spn{v^{\phi,t}_n} \leq \frac{\ovl{m} (\ovl{m} + 5)}{2} + \frac{3}{C \alpha^{\ovl{m}+1}} + \frac{4 \tilde{m}}{1 - \alpha}. \label{ub:case_1}
    \end{align}

    \textbf{Case 2:}~In this case, we have that
    \begin{align}
        \max{\{\diamc{q_t\inv(s,\phi(s))} : s \in \cS_t\}} < \frac{1 - \alpha}{2 (3(1 + L_p) + C_p) \br{\tilde{m}+1}}. \label{cond:2}
    \end{align}
    Let $\bar{\phi} \in \Phi_{SD}$ be the extension of policy $\phi \in \Phi_t$ such that
    \begin{align*}
        \bar{\phi}(s) = \phi(q(\pi_\cS(\zeta))), \mbox{ for every } s \in \pi_\cS(\zeta), \mbox{ for every } \pi_\cS(\zeta) \in \cQ_t.
    \end{align*}
    Claim:~We claim that there is a sequence of extended actions for the extended MDP $\cM^+_t$ such that the first components of the extended actions are governed by $\phi$ and on the set $\cG_1$, the $m$-step state transition kernel prescribed by the sequence of extended actions is the same as the discretization of the $m$-step composition of true transition kernel induced under application of policy $\bar{\phi}$. Let the state process of the extended MDP be denoted by $\{\tilde{s}_i\}$ and let the state process of the true MDP under policy $\bar{\phi}$ be denoted by $\{s_i\}$. Then, mathematically, our claim says that there exists a sequence of probability kernels $\{\tilde{p}_i \in \cC_t: i \in \{1,2,\ldots\}\}$ such that
    \begin{align*}
        \bP(\tilde{s}_i = s\up \mid \tilde{s}_0 = s, \tilde{p}, \phi) = \bP(s_i \in q_t\inv(s\up)\mid s_0 = s, \bar{\phi}),~\forall s, s\up \in \cS_t,
    \end{align*}
    where $\bP$ denotes the joint probability distribution of the processes $\{\tilde{s}_i\}$ and $\{s_i\}$, and conditioning on $\tilde{p}$ and $\phi$ means that the extended actions are governed by $\tilde{p}$ and $\phi$.~Similarly, conditioning on $\bar{\phi}$ means that the actions are governed by $\bar{\phi}$.~We show this using mathematical induction. The base cases follow from Lemma~\ref{lem:conc_ineq}.~Let us assume that for every $s, s\up \in \cS_t$ and for every $j \in \{1, 2, \ldots, i\}$,
    \begin{align*}
        \bP(\tilde{s}_j = s\up \mid \tilde{s}_0 = s, \tilde{p}, \phi) = \bP(s_j \in q_t\inv(s\up)\mid s_0 = s, \bar{\phi}).
    \end{align*}
    See that
    \begin{align*}
        \bP(\tilde{s}_{i+1} = s\up \mid \tilde{s}_0 = s, \tilde{p}, \phi) &=  \sum_{\ts \in \cS_t}{\bP(\tilde{s}_{i+1} = s\up \mid \tilde{s}_i = \ts, \tilde{p}, \phi) \bP(\tilde{s}_i = \ts \mid \tilde{s}_0 = s, \tilde{p}, \phi)} \\
        &= \sum_{\ts \in \cS_t}{\tilde{p}_{i+1}(\ts,\phi(\ts),s\up) \bP(s_i \in q\inv_t(\ts) \mid s_0 = s, \bar{\phi})}.
    \end{align*}
    Here, we note that for every $s \in \cS$, there is a kernel $\te_s \in \cC_t$ such that $\te_s(q\inv_t(s, \phi(s)), s\up) = p(s,\bar{\phi}(s), q\inv_t(s\up))$ for every $s\up \in \cS_t$. As the set $\cC_t$ is convex, for any probability measure $\nu$ on $(\cS,\cB_\cS)$,
    \begin{align*}
        \int_{\cS}{\te_s(\ts,\phi(\ts), s\up) d\nu(s)} \in \cC_t.
    \end{align*}
    Taking $\nu$ to be a measure that satisfies $\nu(B) = \bP(s_i \in B \mid s_i \in q\inv_t(\ts))$ for every $B \in \cB_\cS$, we get that
    \begin{align*}
        \int_{\cS}{\te_s(\ts,\phi(\ts), s\up) d\nu(s)} = \bP(s_{i+1} \in q\inv_t(s\up) \mid s_i \in q\inv_t(\ts)).
    \end{align*}
    Taking $\tilde{p}_{i+1}(\ts, \phi(\ts), \cdot) = \int_{\cS}{\te_s(\ts,\phi(\ts), \cdot) d\nu(s)}$, we get that
    \begin{align*}
        \bP(\tilde{s}_{i+1} = s\up \mid \tilde{s}_0 = s, \tilde{p}, \phi) &= \sum_{\ts \in \cS_t}{\tilde{p}_{i+1}(\ts,\phi(\ts),s\up) \bP(s_i \in q\inv_t(\ts) \mid s_0 = s, \bar{\phi})}\\
        &= \sum_{\ts \in \cS_t}{\bP(s_{i+1} \in q\inv_t(s\up) \mid s_i \in q\inv_t(\ts)) \bP(s_i \in q\inv_t(\ts) \mid s_0 = s, \bar{\phi})} \\
        &= \bP(s_{i+1} \in q\inv_t(s\up) \mid s_0 = s, \bar{\phi}).
    \end{align*}
    This completes the proof of our claim.

    From \eqref{cond:2}, we have that for any $\te \in \cC_t$,
    \begin{align*}
        \max_{s\in \cS_t}{\norm{\te(s,\phi(s),\cdot) - \tilde{p}_i(s,\phi(s),\cdot)}_1} \leq \frac{1 - \alpha}{2\tilde{m}},~\forall s \in \cS_t, s\up \in q_t\inv(s).
    \end{align*}
    Define the discretization of the $m$-step transition kernel under the application of policy $\bar{\phi}$ as follows:
    \begin{align*}
        \wp_{t,\phi}\uc{m}(s,s\up) := p_\phi\uc{m}(s,q\inv_t(s\up)),~\forall s \in \cS, s\up \in \cS_t.
    \end{align*}
    Let $\te_\phi\uc{m}$ denote the $m$-step transition kernel of the CMP induced by $\te$ under application of policy $\phi$.~From the previous claim and Lemma~\ref{lem:diff_kern_comp}, we have that
    \begin{align}
        \norm{\wp_{t,\phi}\uc{\tilde{m}}(s,\cdot) - \te_\phi\uc{\tilde{m}}(s,\cdot)}_1 \leq \frac{1 - \alpha}{2}, \label{diff:te_p}
    \end{align}
    where $p_\phi\uc{m}$ is defined in~\eqref{def:p_tstage}.~Also, observe that
    \begin{align}
        \max_{s,s\up\in \cS_t}{\norm{\wp\uc{\tilde{m}}_{t,\phi}(s,\cdot) - \wp\uc{\tilde{m}}_{t,\phi}(s\up,\cdot)}_1} \leq 3\alpha - 1. \label{diff:pmpm}
    \end{align}
    Hence, combining \eqref{diff:te_p} and \eqref{diff:pmpm}, we have that for any $\te \in \cC_t$,
    \begin{align*}
        \max_{s,s\up\in \cS_t}{\norm{\te\uc{\tilde{m}}_\phi(s,\cdot) - \te\uc{\tilde{m}}_\phi(s\up,\cdot)}_1} &\leq \max_{s,s\up\in \cS_t}\bigg\{\norm{\te\uc{\tilde{m}}_\phi(s,\cdot) - \wp\uc{\tilde{m}}_{t,\phi}(s,\cdot)}_1 + \norm{\wp\uc{\tilde{m}}_{t,\phi}(s,\cdot) - \wp\uc{\tilde{m}}_{t,\phi}(s\up,\cdot)}_1 \\
        &\quad + \norm{\wp\uc{\tilde{m}}_{t,\phi}(s\up,\cdot) - \te\uc{\tilde{m}}_\phi(s\up,\cdot)}_1 \bigg\}\\
        &\leq \frac{1 - \alpha}{2}+ 3\alpha - 1 + \frac{1 - \alpha}{2} \\
        &= 2\alpha.
    \end{align*}
    Now, from Lemma~\ref{lem:pn_contra}, we have that the Markov chain induced by the transition kernel $\te$ under the application of policy $\phi$ is uniformly ergodic with constants $\frac{2}{\alpha}$ and $\alpha^{\tilde{m}\inv}$, i.e.,
    \begin{align*}
        \norm{\mu\uc{i}_{\phi,\te,s} - \mu\uc{\infty}_{\phi,\te}}_1 \leq \frac{2}{\alpha} \cdot \br{\alpha^{\tilde{m}\inv}}^i,~\forall i \in \bN.
    \end{align*}
    Hence, from Lemma~\ref{lem:bdd_pval_spn}, we conclude that
    \begin{align}
        \spn{v^{\phi,t}_n} \leq \frac{\ceil{\log_{\br{\frac{1}{\alpha}}^{\tilde{m}\inv}}{\br{\frac{2}{\alpha}}}} + 1}{1 - \alpha^{\tilde{m}\inv}}. \label{ub:case_2}
    \end{align}
    Combining the upper-bounds from \eqref{ub:case_1} and \eqref{ub:case_2}, we obtain the desired upper-bound.
\end{proof}

In the next lemma, we establish that the optimism injected by \algo~is not huge. 


\begin{lemma}\label{lem:ub_opt}
    Consider time $t \in \bN$ and a policy $\phi \in \Phi_t$.~Let $\bar{\phi} \in \Phi_{SD}$ be the extension of $\phi$ as follows:
    \begin{align*}
        \bar{\phi}(s) = \phi(q(\xi)), \mbox{ for every } s \in \xi, \mbox{ for every } \xi \in \cQ_t.
    \end{align*}
    Then, we have that on the set $\cG_1$,
    \begin{align}\label{eq:lb_index}
        J_{\cM^+_t}(\phi) \leq J_\cM(\bar{\phi}) + C_{ub}~ \diam{t}{\bar{\phi}},~\forall t \in \bN, \phi \in \Phi_t,
    \end{align}
    where 
    $J_{\cM^+_t}(\phi)$ is the optimal value of $\cM^{+}_{t}$ when the control input component of the extended action is chosen according to the policy $\phi$, and the transition kernel is chosen so as to maximize the average reward, $\diam{t}{\bar{\phi}}$ is as defined in \eqref{def:diam_pol}, and
    \al{
    C_{ub} := 2 L_r + (3(1 + L_p) + C_p) C_v. \label{def:Cub}
    }
    $L_r, L_p$ are as stated in Assumption~\ref{assum:lip}, $C_p$ is as stated in Assumption~\ref{assum:statn_dist}, and $C_v$ is as defined in~\eqref{def:Cv}.
\end{lemma}
\begin{proof}
    Consider the iteration \eqref{iter:v_epe}. From Corollary~\ref{cor:conv_epe} it follows that
    \begin{align*}
        \lim_{n \to \infty}{\br{v^\phi_{n+1}(s) - v^\phi_n(s)}} = J_{\cM^+_t}(\phi),~\mbox{ for every } s \in \cS_t.
    \end{align*}
    As the sequence of Cesaro means converges to the same limit, we can write
    \begin{align*}
        \lim_{n \to \infty}{\frac{1}{n}v^\phi_n(s)} = J_{\cM^+_t}(\phi).
    \end{align*}
    Similarly, from the policy evaluation iteration for the true MDP~\eqref{def:epe_true}, we have that
    \begin{align*}
        \lim_{n \to \infty}{\frac{1}{n}V^{\bar{\phi}}_n(s)} = J_{\cM}(\bar{\phi}).
    \end{align*}
    In order to prove the lemma, we will show that on the set $\cG_1$, for every $n \in \bN$, for every $s \in \cS_t$ and for every $s\up \in q\inv(s)$, the following holds,
    \begin{align}\label{eq:lb_Vindex}
        v^\phi_n(s) \leq V^{\bar{\phi}}_n(s\up) + C_{ub}~ \bE_{p,\bar{\phi}}\sqbr{\sum_{i=0}^{n-1}{\diamc{q_t\inv(s_i, \bar{\phi}(s_i))}} \middle| s_0 = s\up},
    \end{align}
    where $\bE_{p,\phi}$ denotes that the expectation is taken with respect to the measure induced by $\phi$ when it is applied to MDP with transition kernel $p$.~We prove this using induction. The base case~$(n=0)$ is seen to hold trivially. Next, we assume that the following holds for $i \in \{0,1,\ldots,n\}$, where $n \in \bN$,
    \begin{align}\label{indhyp}
        v^\phi_i(s) \leq V^{\bar{\phi}}_i(s\up) + C_{ub}~ \bE_{p,\bar{\phi}}\sqbr{\sum_{j=0}^{i-1}{\diamc{q_t\inv(s_j, \bar{\phi}(s_j))}} \middle| s_0 = s\up},
    \end{align}
    for every $s \in \cS_t$ and for every $s\up \in q\inv(s)$.~Let us fix $s \in \cS_t$ and $s\up \in q\inv(s)$ arbitrarily, then from~\eqref{iter:v_epe} we obtain the following,
    \begingroup
    \allowdisplaybreaks
    \begin{align*}
        v^\phi_{n+1}(s) &= r(q(q\inv_t(s,\phi(s)))) + \max_{\te \in \cC_t}{\sum_{s\upp \in \cS_t}{\te(q(q\inv_t(s,\phi(s))), s\upp) v^\phi_n(s\upp)}} + L_r~ \diamc{q\inv_t(s,\phi(s))}\\
        &= r(q(q\inv_t(s,\phi(s)))) + \sum_{s\upp \in \cS_t}{\te_n(q(q\inv_t(s,\phi(s))), s\upp) v^\phi_n(s\upp)} + L_r~ \diamc{q\inv_t(s,\phi(s))}\\
        &\leq r(s\up, \phi(s\up)) + \sum_{s\upp \in \cS_t}{\wp(s\up, \phi(s\up), s\upp; \cS_t \times A_t, \cQ_t) ~v^\phi_n(s\upp)} + \eta_t(q\inv_t(s,\phi(s))) \spn{v^\phi_n} + 2L_r~ \diamc{q\inv_t(s,\phi(s))} \\
        &\leq r(s\up, \phi(s\up)) + \int_\cS{p(s\up, \phi(s\up), s\upp) V^\phi_n(s\upp) ds\upp} + C_{ub}~ \bE_{p,\phi}\sqbr{\sum_{i=1}^{n}{\diamc{q_t\inv(s_i, \phi(s_i))}} \middle| s_0 = s\up} \\
        &\quad + \br{2L_r + (3(1 + L_p) + C_p) C_v} \diamc{q\inv_t(s,\phi(s))} \\
        &\leq r(s\up, \phi(s\up)) + \int_\cS{p(s\up, \phi(s\up), s\upp) V^\phi_n(s\upp) ds\upp} + C_{ub}~ \bE_{p,\phi}\sqbr{\sum_{i=1}^{n}{\diamc{q_t\inv(s_i, \phi(s_i))}} \middle| s_0 = s\up}\\
        &\quad +  \br{2L_r + (3(1 + L_p) + C_p) C_v} \diamc{q\inv_t(s,\phi(s))}\\
        &= V^\phi_{n+1}(s\up) + C_{ub}~ \bE_{p,\phi}\sqbr{\sum_{i=0}^{n}{\diamc{q_t\inv(s_i, \phi(s_i))}} \middle| s_0 = s\up},
    \end{align*}
    \endgroup
    where $\te_n$ is a transition kernel belonging to the set $\cC_t$ that maximizes the expression in the r.h.s. of the first equality.~The first inequality follows from Lipschitz continuity of the reward function, the definition of event $\cG_1$ and from Lemma~\ref{lem:bdd_dotdifLv}.~The second inequality is obtained by invoking the induction hypothesis~\eqref{indhyp}, and by using the upper-bound on $\spn{v^\phi_n}$ from Lemma~\ref{lem:bd_span_epe}.~This concludes the induction argument, and proves~\eqref{eq:lb_Vindex}.~The proof of the claim follows by dividing both sides of~\eqref{eq:lb_Vindex} by $n$ and taking limit $n \to \infty$.
\end{proof}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Properties of Proxy Diameter}\label{app:prop_pdiam}
In this section, we present three results as the corollaries of the results obtained in the previous section.
\begin{cor}\label{cor:opt_pdiam}
    Fix a time $t$. Let $\phi \in \Phi_t$ and $\bar{\phi} \in \Phi_{SD}$ be the unique extension of $\phi$ such that 
    \al{
    \bar{\phi}(s\up) = \phi(s), \mbox{ for every } s \in \cS_t \mbox{ and } s\up \in q\inv(s).\label{def:pol_ext}
    }
    On the set $\cG_1$, we have,
    \al{
        \pdiam{t}{\phi} \geq \diam{t}{\bar{\phi}},~\forall t \in \{0,1,\ldots, T-1\}, \phi \in \Phi_t,\label{ineq:opt_pdiam}
    }
    where $\pdiam{t}{\phi}$ is the average reward of policy $\phi$ evaluated on the extended MDP~$\cM^{d,+}_t$ and $\diam{t}{\bar{\phi}} = \int_{\cS}{\diamc{q_t\inv(s,\bar{\phi}(s))} \mu\uc{\infty}_{\bar{\phi},p}(s) ds}$.
\end{cor}
\begin{proof}
    Define the MDP, $\cM^d_t := (\cS, \cA, p, \tilde{d})$ where
    \begin{align*}
        \tilde{d}(s,a) = \diamc{q\inv_t(s,a)},\mbox{ for every } (s,a) \in \cS \times \cA.
    \end{align*}
    As $p$ satisfies Assumption~\ref{assum:unif_ergodic},
    \begin{align*}
        J_{\cM^d_t}(\bar{\phi}) = \diam{t}{\bar{\phi}},\mbox{ for every } \bar{\phi} \in \Phi_{SD}.
    \end{align*}
    Note that the extended policy evaluation~\eqref{iter:v_epe} and policy evaluation~\eqref{def:epe_true} algorithms are equivalent to extended value iteration~\eqref{iter:evi} and value iteration~\eqref{iter:vi} algorithms, respectively, except that the control inputs have to be chosen from singleton sets.~Then the proof follows from Lemma~\ref{lem:optimism}.
\end{proof}

\begin{cor}\label{cor:bd_span_epe}
    Let $t \in \{0,1,\ldots,T-1\}$.~Consider the extended MDP $\cM^{d,+}_t$, a policy $\phi \in \Phi_t$ and the corresponding \epe~\eqref{algo:epe}~iterates:
    \begin{align*}
        g^{\phi,t}_0(s) &= 0, \notag\\
        g^{\phi,t}_{n+1}(s) &= \max_{\te \in \cC_t} \flbr{d_t(s,\phi(s)) + \sum_{s\up \in \cS_t}{\te(s,\phi(s),s\up) g^{\phi,t}_n(s\up)}},~\forall s \in \cS_t, n \in \bN.
    \end{align*}
    On the set $\cG_1$, we have 
    \nal{
        \spn{g^{\phi,t}_n} \leq C_v,~\forall n \in \bN, t \in \bN,
    }
    where, $C_v$, $\ovl{m}$ and $\tilde{m}$ are defined in \eqref{def:Cv}, \eqref{def:m_bar} and \eqref{def:tm}, respectively.
\end{cor}
\begin{proof}
    Follows from Lemma~\ref{lem:bd_span_epe}.
\end{proof}

\begin{cor}\label{cor:ub_pdiam}
    Consider time $t \in \bN$ and a policy $\phi \in \Phi_t$.~Let $\bar{\phi} \in \Phi_{SD}$ be the extension of $\phi$ as defined in \eqref{def:pol_ext}.~Then, we have that on the set $\cG_1$,
    \begin{align*}
        \pdiam{t}{\phi} \leq  (C_{ub} + 1)~ \diam{t}{\bar{\phi}},~\forall t \in \bN, \phi \in \Phi_t,
    \end{align*}
    where $C_{ub}$ is as defined in \eqref{def:Cub}.
\end{cor}
\begin{proof}
    Noting that $J_{\cM^d_t}(\bar{\phi}) = \diam{t}{\bar{\phi}}$ and $J_{\cM^{d,+}_t}(\phi) = \pdiam{t}{\phi}$, the claim follows from Lemma~\ref{lem:ub_opt} and Corollary~\ref{cor:bd_span_epe}.
\end{proof}
