\section{Problem Setup}\label{sec:prelim}
\textbf{Notation.} The set of natural numbers is denoted by $\bN$.~We denote the span of a $\bR$-valued function $f \in \bR^X$ by $\spn{f}$, i.e., $\spn{f} = \max_{x \in X}{f(x)} - \min_{x \in X}{f(x)}$.~We abbreviate ``with high probability'' as ``w.h.p.''~For a $\sigma$-algebra $\cF$ and a measure $\mu: \cF \to \bR$, we let $\norm{\mu}_{TV}$ denote its total variation norm~\citep{folland2013real}, i.e., $\norm{\mu}_{TV} := \sup \flbr{|\mu(B)|: B \in \cF}$.~$a \vee b$ denotes the maximum, and $a \wedge b$ denotes the minimum of two real numbers $a$ and $b$.~$\ceil{a}$ denotes the smallest integer that is greater than or equal to $a$.~At certain places, we use a single variable~($z$) to denote state-action pairs.

Let $\cM = (\cS, \cA, p, r)$ be an MDP, where the state-space $\cS$ and action-space $\cA$ are compact sets of dimension $d_\cS$ and $d_\cA$, respectively.~Let $\cS$ be endowed with Borel $\sigma$-algebra $\cB_\cS$. To simplify exposition, we assume that $\cS = [0,1]^{d_\cS}$ and $\cA = [0,1]^{d_\cA}$ without loss of generality.~We denote the system state and action taken at time $t$ by $s_t,a_t$ respectively.~The state $s_t$ evolves as follows,

\begin{align*}
     \bP\left(s_{t+1}\in B| s_t=s, a_t=a\right) = p(s,a,B), \mbox{ a.s.},&\notag\\
     \forall (s,a,B) \in \cS \times \cA \times \cB_\cS,~t \in \{0\}\cup\bN,&
\end{align*}

where $p: \cS \times \cA \times \cB_\cS \to [0,1]$ is the transition kernel that is not known by the agent.~The agent earns a reward $r(s_t,a_t)$ at time $t$, where the reward function $r: \cS \times \cA \to [0,1]$ is a measurable map.~The goal of the agent is to maximize the infinite horizon average reward.~The spaces $\cS, \cA$ are endowed with metrics $\rho_\cS$ and $\rho_\cA$, respectively. The space $\cS \times \cA$ is endowed with a metric $\rho$ that is sub-additive, i.e., we have,
\begin{align*}
    \rho\br{(s,a),(s\up,a\up)} \leq \rho_\cS (s, s\up) + \rho_\cA (a, a\up),
\end{align*}
for all $(s, a), (s\up, a\up) \in \cS \times \cA$.~For $\cZ \subseteq \cS \times \cA$, $\diamc{\cZ} := \sup_{z_1,z_2 \in \cZ}{\rho(z_1,z_2)}$.~A stationary deterministic policy is a measurable map $\phi: \cS \to \cA$ that implements the action $\phi(s)$ when the system state is $s$. Let $\Phi_{SD}$ be the set of all such policies.~The infinite horizon average reward of a policy $\phi$ when it acts on an MDP $\cM$ is denoted by $J_\cM(\phi)$, and is defined as,
\begin{align*}
    J_\cM(\phi) := \liminf_{T \to \infty}{\frac{1}{T} \bE_{\cM,\phi}\sqbr{\sum_{t=0}^{T-1}{r(s_t,a_t)}}},
\end{align*}
where $\bE_{\cM,\phi}$ denotes the expectation taken when policy $\phi$ is used to select actions throughout on the MDP $\cM$.~The optimal average reward of the MDP $\cM$ is defined as $J\ust_\cM := \sup_{\phi \in \Phi_{SD}}{J_\cM(\phi)}$.~The regret~\citep{lattimore2020bandit} of a learning algorithm $\psi$ until time $T$ is defined as,
\begin{align}
    \cR(T;\psi) &:= T J\ust_\cM -  \sum_{t=0}^{T-1} r(s_t,a_t).\label{def:regret}
\end{align}
The goal of this work is to design a learning algorithm with a tight regret upper bound for Lipschitz MDPs.%~We now introduce the class of Lipschitz MDPs.
~An MDP is Lipschitz if it satisfies the assumption below.
\begin{assum}[Lipschitz continuity]\label{assum:lip}
    \begin{enumerate}
        \item[(i)] \label{assum:lip_r} The reward function $r$ is $L_r$-Lipschitz, i.e., $\forall ~s, s\up \in \cS, a, a\up \in \cA$,
        \begin{align*}
            | r(s,a) - r(s\up,a\up) | \le L_r \rho\left((s,a),(s\up,a\up)\right).
        \end{align*}
        \item[(ii)] \label{assum:lip_p} 
        The transition kernel $p$ is $L_p$-Lipschitz, i.e., $\forall~ s, s\up \in \cS, a, a\up \in \cA$,
        \begin{align*}
            \norm{p(s,a,\cdot) - p(s\up, a\up, \cdot)}_{TV} \le L_p \rho\br{(s,a),(s\up,a\up)}.
        \end{align*}
    \end{enumerate}
\end{assum}
The following assumption ensures that the underlying MDP is ergodic and is typically required for the average reward setup~\citep{ortner2020regret,wei2021learning,hao2021adaptive}.
\begin{assum}[Uniform ergodicity]\label{assum:unif_ergodic}
    We assume that $\{s_t\}$, the controlled Markov process~(CMP) induced by the transition kernel $p$ under the application of any stationary deterministic policy, is uniformly ergodic~\citep{douc2018markov}; that is, for every $\phi \in \Phi_{SD}$, there exist a unique distribution $\mu\uc{\infty}_{\phi,p}$ and two constants $C \in (0,\infty)$ and $\alpha \in (0,1)$ such that
    \begin{align*}
        \norm{\mu\uc{t}_{\phi,p,s} - \mu\uc{\infty}_{\phi,p}}_{TV} \le C \alpha^t, ~\forall s \in \cS, t \in \{0\} \cup \bN,
    \end{align*}
    where $\mu\uc{t}_{\phi,p,s}$ denotes the distribution of $s_t$ under the application of policy $\phi$ given $s_0=s$.
\end{assum}
We note that even when $\cM$ is known, Assumption~\ref{assum:unif_ergodic} is the weakest known sufficient condition that ensures a computationally efficient way to obtain an optimal policy~\citep{arapostathis1993discrete}.~Consider the Average Reward Optimality Equation~(AROE) corresponding to the MDP $\cM$, $J + h(s) = \max_{a \in \cA}{\flbr{r(s,a) + \int_{\cS}{h(s\up)~ p(s,a,s\up)~ ds\up}}}$.~It can be shown that under Assumption~\ref{assum:unif_ergodic}, there exists a function $h_\cM: \cS \to \bR$ such that $(J\ust_\cM, h_\cM)$ satisfies the AROE~\citep{hernandez2012adaptive}, where $h_{\cM}$ is the relative value function.~Imposing an additional condition $h(s\lst) = 0$ results in a unique solution to the AROE, where $s\lst$ is a designated state.~Also, there exists a stationary deterministic policy $\phi\ust$ that is optimal, i.e., $J\ust_\cM = J_\cM(\phi\ust)$.~Similarly, for a policy $\phi \in \Phi_{SD}$ there is a function $h^\phi_\cM:\cS \to \bR$ such that $(J_\cM(\phi), h^\phi_\cM)$ is the solution of $J + h(s) = r(s,\phi(s)) + \int_{\cS}{h(s\up)~ p(s,\phi(s),s\up)~ ds\up}$.~See Appendix~\ref{app:gen_res} for more details on properties of average reward MDPs.~The suboptimality gap~\citep{burnetas1997optimal} of a state-action pair is defined as follows:
\begin{align}
    \gap{s,a} := &J\ust_{\cM} + h_{\cM}(s) - r(s,a) \notag\\
    &- \int_{\cS}{h_{\cM}(s\up)~ p(s,a,s\up)~ ds\up}.\label{def:subgap}
\end{align}

\textbf{Zooming dimension.} Let us denote the set of state-action pairs $(s,a)$ such that $\gap{s,a} \leq \beta$ by $\cZ_\beta$.~We define the zooming dimension as
\begin{align}\label{def:zoomingdim}
    d_z := \inf{\flbr{d\up > 0 ~|~ \cN_{c_s \beta}\br{\cZ_\beta} \leq c_z \beta^{-d\up},~\forall  \beta > 0}},
\end{align}
where $\cN_{c_s \beta}\br{\cZ_\beta}$ denotes the $c_s \beta$-covering number~\citep{cao2020provably} of $\cZ_\beta$, and $c_s$~\eqref{def:cs} and $c_z$ are problem-dependent constants.~Note that $d_z$ is the growth exponent of the covering number of a subset of $\cS \times \cA$, hence $d_z \leq d$.