\section{General Results for MDPs}\label{app:gen_res}
Consider an MDP $\cM = (\cS, \cA, p, r)$ and a policy $\phi \in \Phi_{SD}$ that maps states in $\cS$ to actions in $\cA$.~We assume that the transition kernel $p$ satisfies Assumption~\ref{assum:unif_ergodic}.~Hence, there exists a unique invariant distribution $\mu\uc{\infty}_{\phi,p}$ for the controlled Markov process~(CMP) induced by the transition kernel $p$ under the application of policy $\phi$.~Under Assumption~\ref{assum:unif_ergodic}, there exists a solution to the following Poisson equation~\citep{hernandez2012further}:
\begin{align}
    J + h(s) = r(s,\phi(s)) + \int_{\cS}{h(s\up) p(s,\phi(s),ds\up)},~\forall s \in \cS.\label{eq:pois}
\end{align}
Specifically, $(J_\cM(\phi), h^\phi_\cM) \in \bR \times \bR^\cS$ satisfies \eqref{eq:pois}, where
\begin{align}
    J_\cM(\phi) &= \underset{T\to\infty}{\lim\inf}{\frac{1}{T} \bE\sqbr{\sum_{t=0}^{T-1}{r(s_t,\phi(s_t))} \mid s_0=s}} = \int_{\cS}{r(s,\phi(s)) \mu\uc{\infty}_{\phi,p}(ds)}, \label{def:Jphi}\\
   \mbox{ and } h^\phi_\cM(s\up) &= \sum_{t=0}^{\infty}{\int_{\cS}{r(s,\phi(s)) (\mu\uc{\infty}_{\phi,p} - \mu\uc{t}_{\phi,p,s\up})(ds)}},~\forall s\up \in \cS. \label{def:hphi}
\end{align}
Recall that $\mu\uc{t}_{\phi,p,s}$ denotes the distribution of $s_t$ when the initial state is $s_0 = s$, where $\{s_t\}_t$ is the CMP induced by the transition kernel $p$ under the application of $\phi$.~$h^\phi_\cM$ is called the relative value function of $\phi$.

The following is popularly known as the average reward optimality equation~(AROE),
\begin{align*}
    &J + h(s) = \max_{a \in \cA}{\flbr{r(s,a) + \int_{\cS}{h(s\up) p(s, a, s\up) ds\up}}}, \mbox{ and}\\
    &h(s\lst) = 0,
\end{align*}
where $s\lst \in \cS$ is a designated state.~\citet{hernandez2012adaptive} shows that under Assumption~\ref{assum:unif_ergodic}, AROE has a solution.~A policy $\phi\ust$ is optimal if it satisfies the following,
\begin{align}
    \phi\ust(s) \in \underset{a \in \cA}{\arg\max}{\flbr{r(s,a) + \int_{\cS}{h^{\phi\ust}_\cM(s\up)p(s,a,s\up) d s\up}}},~\forall s \in \cS.
\end{align}
In that case, $J\ust_\cM = J_\cM(\phi\ust)$ and $h_\cM = h^{\phi\ust}_\cM$ solve AROE.

Denote the $t$-stage transition kernel under the application of policy $\phi$ by $p\uc{t}_\phi$, i.e.,
\begin{align}
    p\uc{t}_\phi(s,B) = \bP(s_{\tau+t} \in B \mid s_\tau = s, a_{t\up} = \phi(s_{t\up}), t\up = \tau, \tau + 1, \ldots, \tau+t-1),~t\in\bN,s \in \cS, B \in \cB_\cS,\tau \in \bN. \label{def:p_tstage}
\end{align}
Our next result shows that when $t$ is sufficiently large, then Assumption~\ref{assum:unif_ergodic} is equivalent to saying that $p\uc{t}_\phi$ has the ``contractive property,''~\eqref{def:contractive}. 

\begin{lemma}\label{lem:pn_contra}
    Consider an MDP $\cM = (\cS, \cA, p, r)$ such that $p$ satisfies Assumption~\ref{assum:unif_ergodic}. Then, for every policy $\phi \in \Phi_{SD}$ we have,
    \begin{align}
        \norm{p\uc{i}_\phi(s, \cdot) - p\uc{i}_\phi(s\up,\cdot)}_{TV} \leq 2 \alpha,~\forall s, s\up \in \cS, i \geq m\ust, \label{def:contractive}
    \end{align}
    where $p\uc{i}_\phi$ is the $i$-stage transition probability of the CMP induced by the transition kernel $p$ under the application of policy $\phi$ as defined in~\eqref{def:p_tstage}, and 
    \begin{align}
        m\ust := \ceil{\log_\frac{1}{\alpha}{(C)}} + 1. \label{def:mstar}
    \end{align}
    Conversely, if
    \begin{align*}
        \norm{p\uc{m}_\phi(s, \cdot) - p\uc{m}_\phi(s\up,\cdot)}_{TV} \leq 2 \alpha\up,~\forall s, s\up \in \cS,
    \end{align*}
    for some $m \in \bN$, then Assumption~\ref{assum:unif_ergodic} holds with $C = \frac{2}{\alpha\up}$ and $\alpha = {\alpha\up}^{\frac{1}{m}}$.
\end{lemma}
\begin{proof}
    We first note that $p\uc{i}_\phi(s, \cdot) = \mu\uc{i}_{\phi,p,s}$ for every $s \in \cS$.~Hence, for any $s, s\up \in \cS$,
    \begin{align*}
        \norm{p\uc{i}_\phi(s, \cdot) - p\uc{i}_\phi(s\up, \cdot)}_{TV} \leq \norm{\mu\uc{i}_{\phi,p,s} - \mu\uc{\infty}_{\phi,p}}_{TV} + \norm{\mu\uc{i}_{\phi,p,s\up} - \mu\uc{\infty}_{\phi,p}}_{TV}.
    \end{align*}
    Also, $C \alpha^i \leq \alpha$ for $i \geq \log_\frac{1}{\alpha}{(C)} + 1$.~Now, using Assumption~\ref{assum:unif_ergodic}, we have that when $i \geq m\ust$, then the following holds,
    \begin{align*}
        \norm{p\uc{i}_\phi(s, \cdot) - p\uc{i}_\phi(s\up, \cdot)}_{TV} &\leq \norm{\mu\uc{i}_{\phi,p,s} - \mu\uc{\infty}_{\phi,p}}_{TV} + \norm{\mu\uc{i}_{\phi,p,s\up} - \mu\uc{\infty}_{\phi,p}}_{TV}\\
        &\leq 2 \alpha.
    \end{align*}
    This concludes the proof of the first claim.
    
    Now, we prove the second claim. Consider the CMP that is described by the transition kernel $p$ and evolves under the application of the policy $\phi$. Consider two copies of this CMP, where these copies differ in the distribution of the initial state. Denote these distributions by $\mu\uc{0}_1$ and $\mu\uc{0}_2$. Denote the distributions of $s_i$ in the corresponding processes by $\mu\uc{i}_1$ and $\mu\uc{i}_2$, respectively. We show the following:
    \begin{align}\label{ineq:geom_close}
        \norm{\mu\uc{i}_{1} - \mu\uc{i}_{2}}_{TV} \leq \tilde{C} \cdot \tilde{\alpha}^i \norm{\mu\uc{0}_{1} - \mu\uc{0}_{2}}_{TV},~\forall i \in \bN,
    \end{align}
    where $\tilde{C} = \frac{1}{\alpha\up}$ and $\tilde{\alpha} = {\alpha\up}^{\frac{1}{m}}$.~The claim then follows by letting $\mu\uc{0}_{1} = \delta_s$ and $\mu\uc{0}_{2} = \mu\uc{\infty}_{\phi,p}$.~Note that,
    \begin{align}
        \norm{\mu\uc{m}_{1} - \mu\uc{m}_{2}}_{TV} &= 2~ \sup_{A \subseteq \cS}{\flbr{(\mu\uc{m}_{1} - \mu\uc{m}_{2})(A)}} \notag\\
        &= 2~\sup_{A \subseteq \cS}{\flbr{\int_{\cS}{p\uc{m}_\phi(s,A) ~d(\mu\uc{0}_{1} - \mu\uc{0}_{2})(s)}}} \notag\\
        &\leq \sup_{\substack{A \subseteq \cS \\ s,s\up \in \cS}}{\flbr{p\uc{m}_\phi(s,A) - p\uc{m}_\phi(s\up,A)}} \norm{\mu\uc{0}_{1} - \mu\uc{0}_{2}}_{TV} \notag\\
        &\leq \alpha\up \norm{\mu\uc{0}_{1} - \mu\uc{0}_{2}}_{TV}. \label{bdd:mu1m-mu2m}
    \end{align}
    Also, note that for any $i \in \bN$,
    \begin{align}
        \norm{\mu\uc{i}_{1} - \mu\uc{i}_{2}}_{TV} &= 2~ \sup_{A \subseteq \cS}{\flbr{(\mu\uc{i}_{1} - \mu\uc{i}_{2})(A)}} \notag\\
        &= 2~\sup_{A \subseteq \cS}{\flbr{\int_{\cS}{p(s,\phi(s),A) ~d(\mu\uc{i-1}_{1} - \mu\uc{i-1}_{2})(s)}}} \notag\\
        &\leq \sup_{\substack{A \subseteq \cS \\ s,s\up \in \cS}}{\flbr{p(s,\phi(s),A) - p(s\up,\phi(s\up),A)}} \norm{\mu\uc{i-1}_{1} - \mu\uc{i-1}_{2}}_{TV} \notag\\
        &\leq \norm{\mu\uc{i-1}_{1} - \mu\uc{i-1}_{2}}_{TV}, \label{bdd:mu1t-mu2t}
    \end{align}
    where the first step follows from the definition of the total variation norm, while the third step follows from Lemma~\ref{lem:bdd_dotdifLv}.~Combining \eqref{bdd:mu1m-mu2m} and \eqref{bdd:mu1t-mu2t}, we can write
    \begin{align*}
        \norm{\mu\uc{i}_{1} - \mu\uc{i}_{2}}_{TV} &\leq {\alpha\up}^{\floor{\frac{i}{m}}} \norm{\mu\uc{0}_{1} - \mu\uc{0}_{2}}_{TV}\\
        &\leq \frac{1}{\alpha\up}\br{{\alpha\up}^{\frac{1}{m}}}^i \norm{\mu\uc{0}_{1} - \mu\uc{0}_{2}}_{TV},~\forall i \in \bN.
    \end{align*}
    This concludes the proof of the lemma.
\end{proof}

Consider two CMPs $\{s_{1,i}\}$ and $\{s_{2,i}\}$, both of which are induced by $\phi$ operating on the MDP $\cM$ that has transition kernel $p$. Their initial state distributions are $\mu\uc{0}_1$ and $\mu\uc{0}_2$ respectively. Next, we derive an upper-bound on the cumulative sum of distances of the distributions of $s_{1,i}$ and $s_{2,i}$.
\begin{lemma}\label{lem:sum_tv_dist}
    Consider an MDP $\cM = (\cS, \cA, p, r)$ that satisfies Assumption~\ref{assum:unif_ergodic}, and a policy $\phi \in \Phi_{SD}$.~Let $\{s_{1,i}\}$ and $\{s_{2,i}\}$ be two CMPs induced by $\phi$ when it is applied to $\cM$. Let $\mu\uc{i}_1$ and $\mu\uc{i}_2$ denote the distributions of $s_{1,i}$ and $s_{2,i}$, respectively.~Then,
    \begin{align*}
        \sum_{i=0}^{\infty}{\norm{\mu\uc{i}_1 - \mu\uc{i}_2}_{TV}} \leq \frac{m\ust}{1 - \alpha} \norm{\mu\uc{0}_{1} - \mu\uc{0}_{2}}_{TV},
    \end{align*}
    where $m\ust$ is as defined in~\eqref{def:mstar}.
\end{lemma}
\begin{proof}
    From Lemma~\ref{lem:pn_contra}, we have that,
    \begin{align}\label{ineq:pn_contra}
        \norm{\mu\uc{i}_{1} - \mu\uc{i}_{2}}_{TV} \leq \alpha \norm{\mu\uc{0}_{1} - \mu\uc{0}_{2}}_{TV}, \mbox{ for } i \geq m\ust.
    \end{align}
    Also, for any $i \in \bN$ we have,
    \begin{align*}
        \norm{\mu\uc{i}_{1} - \mu\uc{i}_{2}}_{TV} &= 2~ \sup_{A \subseteq \cS}{\flbr{(\mu\uc{i}_{1} - \mu\uc{i}_{2})(A)}} \\
        &= 2~\sup_{A \subseteq \cS}{\flbr{\int_{\cS}{p(s,\phi(s),A) ~d(\mu\uc{i-1}_{1} - \mu\uc{i-1}_{2})(s)}}}\\
        &\leq \sup_{\substack{A \subseteq \cS \\ s,s\up \in \cS}}{\flbr{p(s,\phi(s),A) - p(s\up,\phi(s\up),A)}} \norm{\mu\uc{i-1}_{1} - \mu\uc{i-1}_{2}}_{TV} \\
        &\leq \norm{\mu\uc{i-1}_{1} - \mu\uc{i-1}_{2}}_{TV},
    \end{align*}
    where the first step follows from the definition of the total variation norm, and the third step follows from Lemma~\ref{lem:bdd_dotdifLv}.~Hence, 
    \begin{align}\label{ineq:non_expan}
        \norm{\mu\uc{i}_{1} - \mu\uc{i}_{2}}_{TV} \leq \norm{\mu\uc{0}_{1} - \mu\uc{0}_{2}}_{TV},~\forall i \in \bN.
    \end{align}
    Using \eqref{ineq:pn_contra} iteratively, and~\eqref{ineq:non_expan}, we can write,
    \begin{align*}
        \sum_{i=0}^{\infty}{\norm{\mu\uc{i}_1 - \mu\uc{i}_2}_{TV}} &= \sum_{j=0}^{m\ust-1}{\sum_{k=0}^{\infty}{\norm{\mu\uc{j+k\cdot m\ust}_{1} - \mu\uc{j+k\cdot m\ust}_{2}}_{TV}}}\\
        &\leq \frac{m\ust}{1 - \alpha} \norm{\mu\uc{0}_{1} - \mu\uc{0}_{2}}_{TV},
    \end{align*}
    where $m\ust = \ceil{\log_\frac{1}{\alpha}{(C)}} + 1$.~This concludes the proof.
\end{proof}
We now derive an upper-bound on the span of the relative value function $h^\phi_\cM$~\eqref{def:hphi} associated with a policy $\phi\in\Phi_{SD}$.
\begin{lemma}[Bound on the span of relative value function]\label{lem:bdd_rvf_spn}
    Consider an MDP $\cM = (\cS, \cA, p, r)$ such that $p$ satisfies Assumption~\ref{assum:unif_ergodic}. For any policy $\phi \in \Phi_{SD}$, the span of the corresponding relative value function $h^\phi_\cM$~\eqref{def:hphi} can be bounded as,
    \al{
    \spn{h^\phi_\cM} \le \frac{m\ust\spn{r}}{1 - \alpha},
    }
    where $m\ust$ is as defined in~\eqref{def:mstar}.
\end{lemma}
\begin{proof}
    From the definition of $h^\phi_\cM$~\eqref{def:hphi} we obtain,
    \begin{align}
        \spn{h^\phi_\cM} &= \spn{\sum_{t=0}^{\infty}{\int_{\cS}{r(s,\phi(s))\br{\mu\uc{\infty}_{\phi,p} - \mu\uc{t}_{\phi,p,\cdot}}(ds)}}} \notag\\
        & \leq \sum_{t=0}^{\infty}{\spn{\int_{\cS}{r(s,\phi(s))\br{\mu\uc{\infty}_{\phi,p} - \mu\uc{t}_{\phi,p,\cdot}}(ds)}}} \notag\\
        & \leq \frac{1}{2} \sum_{t=0}^{\infty}{\max_{s}\norm{\mu\uc{\infty}_{\phi,p} - \mu\uc{t}_{\phi,p,s}}_{TV}} \spn{r}, \label{lem:bd_spn:ineq:1}
    \end{align}
    where the first inequality follows since span is a seminorm~\citep{puterman2014markov}, while the second inequality follows from Lemma~\ref{lem:bdd_dotdifLv}.~In Lemma~\ref{lem:sum_tv_dist} we let $\mu\uc{0}_1 = \mu\uc{\infty}_{\phi,p}$ and $\mu\uc{0}_2 = \delta_s$, where $\delta_s$ is the Dirac measure on $(\cS,\cB_\cS)$ centered at $s$, and get the following, 
    \begin{align*}
        \frac{1}{2} \sum_{t=0}^{\infty}{\max_{s}\norm{\mu\uc{\infty}_{\phi,p} - \mu\uc{t}_{\phi,p,s}}_{TV}} \spn{r} &\leq \frac{m\ust\spn{r}}{1 - \alpha}.
    \end{align*}
    This concludes the proof.
\end{proof}

\begin{lemma}[Bound on the span of policy evaluation iterates]\label{lem:bdd_pval_spn}
    Consider an MDP $\cM = (\cS, \cA, p, r)$ such that $p$ satisfies Assumption~\ref{assum:unif_ergodic}, and consider the policy evaluation algorithm applied to obtain the average reward of a policy $\phi \in \Phi_{SD}$ on $\cM$, i.e.,
    \begin{align}
        V^\phi_0(s) &= 0,\notag\\
        V^\phi_{i+1}(s) &= r(s,\phi(s)) + \int_{\cS}{p(s,\phi(s),s\up) V^\phi_i(s\up) ds\up},~i=0,1,2,\ldots.\label{def:epe_true}
    \end{align}
    We have,
    \al{
    \spn{V^\phi_i} \leq \frac{m\ust + 1}{1 - \alpha},
    }
where $m\ust = \ceil{\log_{\frac{1}{\alpha}}(C)} + 1$.
\end{lemma}
\begin{proof}
    Since Assumption~\ref{assum:unif_ergodic} holds, Lemma~\ref{lem:pn_contra} gives us the following,
    \begin{align*}
        \norm{p\uc{m\ust}_\phi(s, \cdot) - p\uc{m\ust}_\phi(s\up,\cdot)}_{TV} \leq 2 \alpha,~\forall s, s\up \in \cS,
    \end{align*}
    where $p\uc{m}_\phi$~\eqref{def:p_tstage} is the $m$-step transition kernel of the CMP induced by the transition kernel $p$ under the application of policy $\phi$.~Also, note that
    \begin{align*}
        V^\phi_{i+m\ust}(s) = \sum_{j=0}^{m\ust-1}{\bE\sqbr{r(s_{i+j}, \phi(s_{i+j})) \mid s_i = s}} + \int_{\cS}{p\uc{m\ust}_\phi(s,s\up) V^\phi_i(s\up) ds\up}.
    \end{align*}
    Hence,
    \begin{align*}
        \spn{V^\phi_{i+m\ust}} &\leq \spn{\sum_{j=0}^{m\ust-1}{\bE\sqbr{r(s_{i+j}, \phi(s_{i+j})) \mid s_i = s}}} + \spn{\int_{\cS}{p\uc{m\ust}_\phi(s,s\up) V^\phi_i(s\up) ds\up}}\\
        &\leq m\ust + 1 + \frac{1}{2} \spn{V^\phi_i}  \norm{p\uc{m\ust}_\phi(s, \cdot) - p\uc{m\ust}_\phi(s\up,\cdot)}_{TV} \\
        &\leq m\ust + 1 + \alpha \spn{V^\phi_i},
    \end{align*}
    where the second inequality follows from Lemma~\ref{lem:bdd_dotdifLv}.~Using the above inequality, we have that for every $k \leq m\ust$,
    \begin{align*}
        \spn{V^\phi_{i\cdot m\ust + k}} &\leq (m\ust + 1) \sum_{j=0}^{i-1}{\alpha^j} + \alpha^i \spn{V^\phi_{k}} \\
        &\leq (m\ust + 1) \sum_{j=0}^{i-1}{\alpha^j} + m\ust \alpha^i \\
        &\leq \frac{m\ust + 1}{1 - \alpha}.
    \end{align*}
    This concludes the proof.
\end{proof}

\subsection{Proof of Lemma~\ref{lem:gap_phi}}
\begin{proof}
    Using the definition of $\gap{s,\phi(s)}$~\eqref{def:subgap}, we obtain that,
    \al{
        \int_{\cS}{\gap{s,\phi(s)}~ \mu\uc{\infty}_{\phi,p}(s)~ ds} &= \int_{\cS}{\br{J\ust_{\cM} + h_{\cM}(s) - r(s,\phi(s)) - \int_{\cS}{h_{\cM}(s\up)~p(s,\phi(s),s\up)~ ds\up}} \mu\uc{\infty}_{\phi,p}(s)~ ds} \notag\\
        &= J\ust_{\cM}\int_{\cS}{\mu\uc{\infty}_{\phi,p}(s)~ ds} + \int_{\cS}{h_{\cM}(s) \mu\uc{\infty}_{\phi,p}(s)~ ds} - \int_{\cS}{r(s,\phi(s)) \mu\uc{\infty}_{\phi,p}(s)~ ds} \notag\\
        &- \int_{\cS}{\br{\int_{\cS}{h_{\cM}(s\up)~p(s,\phi(s),s\up)~ ds\up}} \mu\uc{\infty}_{\phi,p}(s)~ ds} \notag\\
        &= J\ust_{\cM} + \int_{\cS}{h_{\cM}(s) \mu\uc{\infty}_{\phi,p}(s)~ ds} - J_\cM(\phi) \notag\\
        &- \int_{\cS}{h_{\cM}(s\up) \br{\int_{\cS}{p(s,\phi(s),s\up) \mu\uc{\infty}_{\phi,p}(s)~ds}}ds\up} \notag\\
        &= J\ust_{\cM} - J_\cM(\phi) + \int_{\cS}{h_{\cM}(s) \mu\uc{\infty}_{\phi,p}(s)~ ds} - \int_{\cS}{h_{\cM}(s) \mu\uc{\infty}_{\phi,p}(s)~ ds}\notag\\
        &= \Delta(\phi),
    }
    where the third equality follows from \eqref{def:Jphi} and the fourth equality follows from the property of the stationary distribution. This concludes the proof.
\end{proof}