\documentclass[accepted]{uai2025}

\usepackage[american]{babel}

\usepackage{natbib}
\bibliographystyle{plainnat}
\renewcommand{\bibsection}{\subsubsection*{References}}

\usepackage{mathtools}
\usepackage{booktabs}
\usepackage{tikz}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{amsthm}
\usepackage{amsfonts}
\usepackage{color}
\usepackage{mathtools}
\usepackage{mathrsfs}
\usepackage{hyperref}
\usepackage{booktabs}
\usepackage{enumitem}
\usepackage{nicefrac}
\usepackage{microtype}
\usepackage{xcolor}
\usepackage{comment}
\usepackage{bbm}
\usepackage{svg}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{enumitem}
\usepackage{times}
\usepackage{dsfont}
\usepackage{caption,color}
\usepackage{subcaption}
\usepackage{ulem}
\usepackage{units}


% Calligraphic shorthands: \cX typesets \mathcal{X}.
% \ctO is the odd one out: \mathcal{O} with a tilde accent on top.
% Starred \newcommand is used because none of these take arguments.
\newcommand*{\cA}{\mathcal{A}}
\newcommand*{\cB}{\mathcal{B}}
\newcommand*{\cC}{\mathcal{C}}
\newcommand*{\cD}{\mathcal{D}}
\newcommand*{\cE}{\mathcal{E}}
\newcommand*{\cF}{\mathcal{F}}
\newcommand*{\cG}{\mathcal{G}}
\newcommand*{\cH}{\mathcal{H}}
\newcommand*{\cI}{\mathcal{I}}
\newcommand*{\cK}{\mathcal{K}}
\newcommand*{\cL}{\mathcal{L}}
\newcommand*{\cM}{\mathcal{M}}
\newcommand*{\cN}{\mathcal{N}}
\newcommand*{\cO}{\mathcal{O}}
\newcommand*{\ctO}{\tilde{\mathcal{O}}}
\newcommand*{\cP}{\mathcal{P}}
\newcommand*{\cQ}{\mathcal{Q}}
\newcommand*{\cR}{\mathcal{R}}
\newcommand*{\cS}{\mathcal{S}}
\newcommand*{\cT}{\mathcal{T}}
\newcommand*{\cX}{\mathcal{X}}
\newcommand*{\cY}{\mathcal{Y}}
\newcommand*{\cZ}{\mathcal{Z}}
% Blackboard-bold shorthands: \bX typesets \mathbb{X}.
\newcommand*{\bA}{\mathbb{A}}
\newcommand*{\bC}{\mathbb{C}}
\newcommand*{\bE}{\mathbb{E}}
\newcommand*{\bN}{\mathbb{N}}
\newcommand*{\bP}{\mathbb{P}}
\newcommand*{\bQ}{\mathbb{Q}}
\newcommand*{\bR}{\mathbb{R}}
\newcommand*{\bZ}{\mathbb{Z}}
% --- Bold / double-struck helpers ---
% \mbf is a bare alias for \mathbf, so it is used as \mbf{x}.
\newcommand{\mbf}{\mathbf}
\newcommand{\bv}{\mathbf{v}}
% \mathds comes from the dsfont package loaded in this preamble.
\newcommand{\dE}{\mathds{E}}

% --- Super-/subscript decorations; each must directly follow a base symbol in math mode ---
\newcommand{\ui}{^{(\mathcal{I})}}
\newcommand{\ust}{^{\star}}
\newcommand{\lst}{_{\star}}
% \uc{k} gives a parenthesised superscript ^{(k)}.
\newcommand{\uc}[1]{^{(#1)}}
\newcommand{\up}{^{\prime}}
\newcommand{\upp}{^{\prime\prime}}

% --- Greek-letter abbreviations ---
\newcommand{\Te}{\Theta}
\newcommand{\te}{\theta}
\newcommand{\om}{\omega}
\newcommand{\Om}{\Omega}
\newcommand{\ap}{\alpha}
\newcommand{\gm}{\gamma}
\newcommand{\eps}{\epsilon}
\newcommand{\teps}{\tilde{\epsilon}}
\newcommand{\lm}{\lambda}
% NOTE(review): \bm is normally provided by the bm package, which this preamble
% does not load explicitly; presumably the document class or another package
% supplies it -- confirm before using \blamb.
\newcommand{\blamb}{\bm{\lambda}}

% --- Tilde-accented variables ---
\newcommand{\ts}{\tilde{s}}
\newcommand{\ta}{\tilde{a}}
\newcommand{\txi}{\tilde{\xi}}

% Ordinal suffix as an italic superscript "th".
% The script argument is braced so the macro does not rely on how \textit
% happens to expand right after ^.
\newcommand{\uth}{^{\textit{th}}}

% --- Algorithm names ---
\newcommand{\algo}{\texttt{ZoRL}}
\newcommand{\scopt}{\texttt{ScOpt}}
\newcommand{\ucrlc}{\textit{UCRL-C}}
\newcommand{\ucrl}{\textit{UCRL-2}}

% Big-O with auto-sized parentheses. \paren is defined further down in the
% preamble; that is fine because it is only expanded when \bigo is used.
\newcommand{\bigo}[1]{\cO\paren{#1}}
% Symbol d with the upright text subscript "eff.".
\newcommand{\deff}{d_{\text{eff.}}}

% --- Auto-sized paired delimiters (all built on \left ... \right) ---
\newcommand{\norm}[1]{\left\lVert#1\right\rVert}
\newcommand{\abs}[1]{\left|#1\right|}
% Italic "Vol" followed by its parenthesised argument.
\newcommand{\vol}[1]{\textit{Vol}\left(#1\right)}
% \paren and \br have identical definitions (round parentheses).
\newcommand{\paren}[1]{\left(#1\right)}
\newcommand{\br}[1]{\left(#1\right)}
\newcommand{\flbr}[1]{\left\{#1\right\}}
\newcommand{\sqbr}[1]{\left[#1\right]}
\newcommand{\ceil}[1]{\left\lceil#1\right\rceil}
\newcommand{\floor}[1]{\left\lfloor#1\right\rfloor}
\newcommand{\angl}[1]{\left\langle#1\right\rangle}
% --- Over-/underline accents ---
\newcommand{\ol}[1]{\bar{#1}}
\newcommand{\ubar}[1]{\underline{#1}}
% \overline with the bar shortened by 1.5mu on each side; the outer \mkern
% pair restores the overall width.
\newcommand{\ovl}[1]{\mkern 1.5mu\overline{\mkern-1.5mu#1\mkern-1.5mu}\mkern 1.5mu}
% Named operators "proj" and "diam".
% The labels are set with \text (amsmath) rather than \mbox so that they shrink
% correctly inside sub/superscripts and fractions; this also matches how \gap
% and \deff set their labels.
% \pj{x}: "proj" applied to x, with auto-sized parentheses.
\newcommand{\pj}[1]{\text{proj}\paren{#1}}
% \diam{a}{b}: "diam" with subscript a, applied to b (fixed-size parentheses).
\newcommand{\diam}[2]{\text{diam}_{#1}(#2)}
% \pdiam{a}{b}: same as \diam but with a wide tilde over "diam".
\newcommand{\pdiam}[2]{\widetilde{\text{diam}}_{#1}(#2)}
% \diamc{x}: "diam" without subscript, with auto-sized parentheses.
\newcommand{\diamc}[1]{\text{diam}\br{#1}}
% --- Named operators and miscellaneous helpers ---
% Italic "sp" applied to the argument; the leading ~ inserts a space before it.
\newcommand{\spn}[1]{~\textit{sp}\paren{#1}}
% Double-struck 1 (\mathbbm, from bbm) with the argument in braces as a subscript.
\newcommand{\ind}[1]{\mathbbm{1}_{\flbr{#1}}}
% \rel{a}{b}: "Rel" with subscript a applied to b; \relp is the unsubscripted form.
\newcommand{\rel}[2]{\textit{Rel}_{#1}{\br{#2}}}
\newcommand{\relp}[1]{\textit{Rel}{\br{#1}}}
\newcommand{\dom}[2]{\textit{Dom}_{#1}{\paren{#2}}}
\newcommand{\gap}[1]{\text{gap}\br{#1}}
% Typewriter names for the EVI and EPE routines.
\newcommand{\evi}{\texttt{EVI}}
\newcommand{\epe}{\texttt{EPE}}
\newcommand{\kl}[1]{\textit{KL}\paren{#1}}
% \clip{a}{b}: "clip[a | b]"; \middle| scales with the \left[ ... \right] that
% \sqbr puts around the argument.
\newcommand{\clip}[2]{\textit{clip}\sqbr{#1\middle|#2}}
% Superscript -1; must follow a base symbol in math mode.
\newcommand{\inv}{^{-1}}
% Swallows its argument and produces nothing (inline way to comment text out).
\newcommand{\mycomment}[1]{}

% --- Coloured inline annotations ---
% NOTE(review): presumably per-author review marks (macro names look like
% initials) -- confirm, and check none remain in the camera-ready text.
% \ak and \rs both typeset their argument in red.
\newcommand{\ak}[1]{\textcolor{red}{#1}}
% \akb wraps its argument in parentheses and typesets it in cyan.
\newcommand{\akb}[1]{\textcolor{cyan}{(#1)}}
\newcommand{\rs}[1]{\textcolor{red}{#1}}
\newcommand{\id}[1]{\textcolor{purple}{#1}}

% Command-form wrappers around amsmath's align environments:
% \al{...} expands to a numbered align, \nal{...} to an unnumbered align*.
\newcommand{\al}[1]{\begin{align}#1\end{align}}
\newcommand{\nal}[1]{\begin{align*}#1\end{align*}}

% --- Theorem-like environments (amsthm) ---
% `assumption` has its own counter, independent of the section-based numbering below.
% NOTE(review): a second environment, `assum`, is defined further down and also
% prints "Assumption" but shares the `thm` counter; check which one the body
% actually uses so assumptions are numbered consistently.
\newtheorem{assumption}{\textbf{Assumption}}

% `thm` is numbered within sections; the environments below share its counter.
\newtheorem{thm}{Theorem}[section]
\newtheorem{lemma}[thm]{Lemma}
\newtheorem{prop}[thm]{Proposition}
\newtheorem{cor}[thm]{Corollary}
\newtheorem{clm}[thm]{Claim}

\newtheorem{defn}[thm]{Definition}
\newtheorem{conj}[thm]{Conjecture}
\newtheorem{exmp}[thm]{Example}
\newtheorem{assum}[thm]{Assumption}
\newtheorem{exerc}[thm]{Exercise}
% Unnumbered remark environment.
\newtheorem*{remark}{Remark}


% Title of the main paper, typeset by the first \maketitle.
\title{Provably Adaptive Average Reward Reinforcement Learning for Metric Spaces}

% Authors. The optional [1] matches the \affil[1] entry below. Each name is a
% mailto: link with a pre-filled subject line. The address is written bare,
% without surrounding <...>, because angle brackets are not valid unencoded
% characters in a mailto: URI.
\author[1]{\href{mailto:avikkar@iisc.ac.in?Subject=Your UAI 2025 paper}{Avik Kar}}
\author[1]{\href{mailto:rahulsingh@iisc.ac.in?Subject=Your UAI 2025 paper}{Rahul Singh}}

% Shared affiliation for both authors.
\affil[1]{
    Department of Electrical Communication Engineering\\
    Indian Institute of Science\\
    Bengaluru
}
  
\begin{document}
\maketitle

\begin{abstract}
    We study infinite-horizon average-reward reinforcement learning (RL) for Lipschitz MDPs, a broad class that subsumes several important classes such as linear MDPs, RKHS MDPs, and function approximation frameworks. We develop an adaptive algorithm \algo~with regret bounded as $\mathcal{O}\big(T^{1 - d_{\text{eff.}}^{-1}}\big)$, where $d_{\text{eff.}}= 2d_{\mathcal{S}} + d_z + 3$, $d_{\mathcal{S}}$ is the dimension of the state space and $d_z$ is the zooming dimension. In contrast, algorithms with fixed discretization yield $d_{\text{eff.}} = 2(d_{\mathcal{S}} + d_{\mathcal{A}}) + 2$, $d_{\mathcal{A}}$ being the dimension of the action space. \algo~achieves this by discretizing the state-action space adaptively and zooming into ``promising regions'' of the state-action space. The zooming dimension $d_z$, a problem-dependent quantity bounded by the state-action space's dimension, allows us to conclude that if an MDP is benign, then the regret of \algo~will be small. The zooming dimension and \algo~are truly adaptive, i.e., the current work shows how to capture adaptivity gains for infinite-horizon average-reward RL. \algo~outperforms other state-of-the-art algorithms in experiments, thereby demonstrating the gains arising from adaptivity.
\end{abstract}

% Main-paper sections, one file each under tex_files/, in reading order.
\input{tex_files/introduction}
\input{tex_files/preliminaries}
\input{tex_files/algorithm}
\input{tex_files/regret_analysis}
\input{tex_files/simulations}
\input{tex_files/conclusion}

\subsubsection*{Acknowledgements}
This work is partially supported by the SERB Grant SRG/2021/002308. The authors acknowledge the Prime Minister's Research Fellowship to Avik Kar.


%\bibsection
\bibliography{refs}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newpage

\onecolumn

\title{Provably Adaptive Average Reward Reinforcement Learning for Metric Spaces\\(Supplementary Material)}
\maketitle

\appendix
\textbf{Organization of the Appendix.} Some properties of the MDPs that satisfy Assumption~\ref{assum:unif_ergodic} are discussed in Appendix~\ref{app:gen_res}. It also includes the proof of Lemma~\ref{lem:gap_phi}. Some important properties of extended MDPs can be found in Appendix~\ref{app:prop_emdp}. We use these properties while analyzing the regret of \algo. Next, in Appendix~\ref{app:prop_pdiam}, we show certain properties of the proxy diameters of policies. Results obtained in Appendix~\ref{app:prop_emdp} play a crucial role in deriving those properties. A high-probability lower bound on the number of visits to the key cells in each episode is derived in Appendix~\ref{app:visits}. In Appendix~\ref{app:regret}, we derive the desired regret bound. Appendix~\ref{app:conc_ineq} covers the concentration results for estimates of the discretized model. In Appendix~\ref{app:prop_evi_epe}, we derive bounds on the inaccuracy that \evi~and \epe~inject into \algo~due to finite computation power. Details of the experiments, the associated environments, and additional simulation results are reported in Appendix~\ref{app:sim}. Appendix~\ref{app:aux_res} derives some key results that are used in the proof of Lemma~\ref{lem:conc_ineq}. Appendix~\ref{app:use_res} contains some known results that are used in this paper.

% Appendix sections, one file each under appendix/, in the order they appear
% in the supplementary material.
\input{appendix/app_gen_results}
\input{appendix/app_emdp_prop}
\input{appendix/app_visits}
\input{appendix/app_regret_analysis}
\input{appendix/app_conc_ineq}
\input{appendix/app_evi_prop}
\input{appendix/app_sim}
\input{appendix/app_aux_results}
\input{appendix/app_useful_results}

\end{document}
