% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
\usepackage{bm}
\usepackage{algorithm}
\usepackage{algorithmicx}
\usepackage{algpseudocode}  
\usepackage{mathtools} % amsmath with fixes and additions
\usepackage{amsmath,amssymb,amsthm,amsfonts,mathrsfs}
\usepackage{subfigure}
\usepackage{enumitem}


\newtheorem{definition}{Definition}
\newtheorem{theorem}{Theorem}
\newtheorem{assumption}{Assumption}
\newtheorem{lemma}{Lemma}
\newtheorem{corollary}{Corollary}
\newtheorem{proposition}{Proposition}
\newtheorem{remark}{Remark}
\newtheorem{problem}{Problem}
\newtheorem{example}{Example}
\newtheorem{claim}{Claim}
\newtheorem{observation}{Observation}


%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\NAM}{\textsc{CMDP-PSRL}}
\newcommand{\bbP}{\mathbb{P}}



\title{Regret Guarantees for Model-Based Reinforcement Learning with Long-Term Average Constraints}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:Mridul Agarwal <agarw180@purdue.edu>}{Mridul Agarwal}{}} % Lead author
\author[1]{Qinbo Bai}
\author[2,1]{Vaneet Aggarwal}
% Add affiliations after the authors
\affil[1]{%
    School of Electrical and Computer Engineering.\\
    Purdue University\\
    West Lafayette, Indiana, USA
}
\affil[2]{%
    School of Industrial Engineering.\\
    Purdue University\\
    West Lafayette, Indiana, USA
}

%\newcommand{\citep}[1]{\cite{#1}}
% The \author macro works with any number of authors. There are two commands
% used to separate the names and addresses of multiple authors: \And and \AND.
%
% Using \And between authors leaves it to LaTeX to determine where to break the
% lines. Using \AND forces a line break at that point. So, if LaTeX puts 3 of 4
% authors names on the first line, and the last on the second line, try using
% \AND instead of \And before the third author name.


\begin{document}
	\maketitle
\begin{abstract}
    We consider the problem of constrained Markov Decision Processes (CMDPs) where an agent interacts with an ergodic Markov Decision Process. At every interaction, the agent obtains a reward and incurs $K$ costs. The agent aims to maximize the long-term average reward while simultaneously keeping the $K$ long-term average costs lower than a certain threshold. In this paper, we propose \NAM, a posterior sampling based algorithm using which the agent can learn optimal policies to interact with the CMDP. We show that with the assumption of slackness, characterized by $\kappa$, the optimization problem is feasible for the sampled MDPs. Further, for an MDP with $S$ states, $A$ actions, and mixing time $T_M$, we prove that following the \NAM\ algorithm, the agent can bound the regret of not accumulating rewards from an optimal policy by $\Tilde{O}(T_MS\sqrt{AT})$. Further, we show that the violations for any of the $K$ constraints are also bounded by $\Tilde{O}(T_MS\sqrt{AT})$. To the best of our knowledge, this is the first work that obtains $\Tilde{O}(\sqrt{T})$ regret bounds for ergodic MDPs with long-term average constraints using a posterior sampling method.
\end{abstract}

\section{Introduction}
Consider a wireless sensor network where the devices aim to update a server with sensor values. At time $t$, the device can choose to send a packet to obtain a reward of $1$ unit or to queue the packet and obtain $0$ reward. However, communicating a packet results in $p_t$ power consumption. At time $t$, if the wireless channel condition, $s_t$, is weak and the device chooses to send a packet, the resulting instantaneous power consumption, $p_t$, is high. The goal is to send as many packets as possible while keeping the average power consumption, $\sum_{t=1}^Tp_t/T$, within some limit, say $C$. This environment has state $(s_t, q_t)$ as the channel condition and queue length at time $t$. To limit the power consumption, the agent may choose to send packets when the channel condition is good or when the queue length grows beyond a certain threshold. The agent aims to learn the policies in an \textit{online manner} which requires efficiently balancing exploration of state-space and exploitation of the estimated system dynamics \citep{singh2020learning}.

Similar to the example above, many applications require keeping some costs low while simultaneously maximizing the rewards \citep{altman1999constrained}. Owing to the importance of this problem, in this paper, we consider the problem of constrained Markov Decision Processes (CMDP). We aim to develop a reinforcement learning algorithm following which an agent can bound the constraint violations and the regret in obtaining the optimal reward to $o(T)$.

The problem setup, where the system dynamics are known, is extensively studied \citep{altman1999constrained}. For a constrained setup, the optimal policy is possibly stochastic \citep{altman1999constrained,puterman2014markov}. In the domain where the agent learns the system dynamics and aims to learn good policies online, there has been work showing asymptotic convergence to optimal policies \citep{gattami2021reinforcement}, or even providing regret guarantees when the MDP is episodic \citep{zheng2020constrained,ding2021provably}. Recently, \citet{singh2020learning} considered the problem of online optimization of infinite-horizon communicating Markov Decision Processes with long-term average constraints. They provide an optimism based algorithm where confidence bounds on each transition probability $p(s'|s,a)$ are constructed. Using this, they obtain a regret bound of $\Tilde{O}\left(\sqrt{SAT} + T_MT^{2/3}\right)$\footnote{$\Tilde{O}(\cdot)$ hides the logarithmic terms}. Additionally, finding the optimistic policy is a computationally intensive task as the number of optimization variables becomes $S^2\times A$ for an MDP with $S$ states and $A$ actions.

In this paper, we consider reinforcement learning in an infinite-horizon ergodic MDP \citep{tarbouriech2019active,gattami2021reinforcement} with long-term average constraints.
We use $\ell_1$ deviation bounds \citep{jaksch2010near} and use a Bellman error analysis to bound the reward regret of the MDP as $\Tilde{O}(T_MS\sqrt{AT})$. Additionally, we also bound the constraint violations as $\Tilde{O}(T_MS\sqrt{AT})$. We propose a posterior sampling based algorithm where we sample the transition dynamics using a Dirichlet distribution \citep{osband2013more}, which achieves this regret bound.

Unlike in optimistic algorithms, the sampled MDP may be infeasible for the constrained optimization. Hence, we consider slackness characterized by Slater's parameter \citep{ding2020natural}, which allows us to prove that the optimization problem is feasible even with the sampled MDPs. Posterior sampling also helps to reduce the number of optimization variables: finding the optimal policy for the sampled MDP requires only $S\times A$ variables. Finally, we provide numerical examples where the algorithm converges to the calculated optimal policies. To the best of our knowledge, this is the first work to obtain $O(\sqrt{T})$ regret guarantees for the infinite horizon long-term average constraint setup with posterior sampling.


\section{Related Work}
Stochastic Optimization using Markov Decision Processes has very rich roots \citep{howard:dp}. There has been work on understanding the convergence of algorithms that find optimal policies for known MDPs \citep{BertsekasTsitsiklis96,altman1999constrained}. Also, when the MDP is not known, there are algorithms with asymptotic guarantees for learning the optimal policies \citep{Watkins1992} which maximize an objective without any constraints. Recent algorithms even achieve finite time near-optimal regret bounds with respect to the number of interactions with the environment \citep{jaksch2010near,osband2013more,agrawal2017optimistic,jin2018provably}. \citet{jaksch2010near} use the optimism principle for minimizing regret for weakly communicating infinite horizon MDPs with bounded diameter. \citet{osband2013more} extended the analysis of \citet{jaksch2010near} to posterior sampling for episodic MDPs and bounded the Bayesian regret, and the regret bounds were further improved \citep{osband2017posterior}. \citet{agrawal2017optimistic} use a posterior sampling based approach and obtain a frequentist regret bound for infinite horizon MDPs with bounded diameter.

In many reinforcement learning settings, the agent not only wants to maximize the rewards but also satisfy certain cost constraints \citep{altman1999constrained}. Early works in this area were pioneered by \citet{altman1991constrained}. They provided an algorithm which combined forced explorations and following policies optimized on empirical estimates to obtain asymptotic convergence. \citet{BORKAR2005207} studied the constrained RL problem using actor-critic and a two time-scale framework \citep{BORKAR1997291} to obtain asymptotic performance guarantees. Very recently, \citet{gattami2021reinforcement} analyzed the asymptotic performance for Lagrangian based algorithms for infinite-horizon long-term average constraints.

Inspired by the finite-time performance analysis of reinforcement learning algorithms for unconstrained problems, there has been a significant thrust in understanding the finite-time performances of constrained MDP algorithms. \citet{zheng2020constrained} considered an episodic CMDP and used an optimism based algorithm to bound the constraint violation as $\Tilde{O}(\sqrt{T^{1.5}})$ with high probability. \citet{kalagarla2020sample} also considered the episodic setup to obtain a PAC-style bound for an optimism based algorithm. \citet{ding2021provably} considered the setup of $H$-episode length episodic CMDPs with $d$-dimensional linear function approximation to bound the constraint violations as $\Tilde{O}(d\sqrt{H^5T})$ by mixing the optimal policy with an exploration policy. \citet{efroni2020exploration} proposed a linear-programming and primal-dual policy optimization algorithm to bound the regret as $O(S\sqrt{H^3T})$. \citet{qiu2020cmdp} proposed an algorithm which obtains a regret bound of $\Tilde{O}(S\sqrt{AH^2T})$ for the problem of adversarial stochastic shortest path. Compared to these works, we focus on the setting with infinite horizon long-term average constraints.

After developing a better understanding of the policy gradient algorithms \citep{Alekh2020}, there has been theoretical work in the area of model-free policy gradient algorithms for constrained MDP and safe reinforcement learning as well. \citet{xu2020primal} consider an infinite horizon discounted setup with constraints and obtain global convergence using policy gradient algorithms. \citet{ding2020natural} also consider an infinite horizon discounted setup. They use a natural policy gradient to update the primal variable and sub-gradient descent to update the dual variable.

Recently, \citet{singh2020learning} considered the setup of infinite-horizon CMDPs with long-term average constraints and obtained a regret bound of $\Tilde{O}(T^{2/3})$ using an optimism based algorithm and forced explorations. We consider a similar setting with an ergodic CMDP and propose a posterior sampling based algorithm to bound the regret as $\Tilde{O}(\mathrm{poly}(DSA)\sqrt{T})$ using explorations assisted by the ergodicity of the MDP.


\section{Problem Formulation} \label{sec:system_model}
We consider an infinite horizon Markov decision process (MDP) $\mathcal{M}$, defined by the tuple $\left(\mathcal{S}, \mathcal{A}, P, r, c^1, \cdots, c^K\right)$.
$\mathcal{S}$ denotes a finite set of state space with $|\mathcal{S}| = S$, and $\mathcal{A}$ denotes a finite set of actions with $|\mathcal{A}| = A$. 
$P:\mathcal{S}\times\mathcal{A}\to\Delta(\mathcal{S})$ denotes the probability $P(s'|s,a)$ of transitioning to state $s'$ from state $s$ after taking action $a$. 
$r:\mathcal{S}\times\mathcal{A}\to[0,1]$ denotes the average reward in state $s$ after taking action $a$. $c^k:\mathcal{S}\times\mathcal{A}\to[0,1]$ denotes average cost incurred by the agent for constraint $k\in[K] = \{1, 2, \cdots, K\}$ after taking action $a$ in state $s$. We use a stochastic policy $\pi : \mathcal{S} \to \Delta(\mathcal{A})$, such that given state $s$, $\pi(a|s)$ is the probability of selecting action $a$.

Note that a policy $\pi$ induces a Markov chain over the state space of the MDP.
Pertaining to the Markov chains generated by the policies for $\mathcal{M}$, we now define the mixing time of MDP.
 
\begin{definition}[Mixing Time]
Consider the Markov Chain induced by the policy $\pi$ on the MDP $\mathcal{M}$. Let $T_{s\to s'}^\pi$ be a random variable that denotes the first time step when this Markov Chain enters state $s'$ starting from state $s$. Then, the mixing time of the MDP $\mathcal{M}$ is defined as:
\begin{align}
    T_M = \max_{s'\neq s}\max_{\pi}\mathbb{E}\left[T_{s\to s'}^\pi\right]
\end{align}
\end{definition}
Similar to \citet{singh2020learning}, let $P_{\pi}^t(s)$ be the $t$ step state distribution starting from state $s$ following policy $\pi$ and $P_\pi$ be the steady-state state distribution generated by policy $\pi$.

Our first assumption on the MDP allows any policy to reach any state $s'$ starting from any state $s$, and to converge to a steady state. We formalize the result in the following assumption:

\begin{assumption}
The MDP $\mathcal{M}$ is ergodic, or $\Vert P^t_{\pi}(s) - P_{\pi}\Vert_{TV} \le C\rho^t$ with $P_\pi$ being the long-term steady state distribution induced by policy $\pi$, and $C > 0$ and $\rho < 1$ are problem specific constants. And, the mixing time of the MDP $\mathcal{M}$ is finite or $T_M < \infty$.
\end{assumption}

After discussing the transition dynamics of the system, we move to the rewards and costs of the MDP $\mathcal{M}$. 
\begin{assumption} \label{known_rewards}
The reward function $r(s,a)$ and the costs $c^k(s,a), k\in[K]$ are known to the agent.
\end{assumption}
We note that in most of the problems, rewards are engineered. Hence, Assumption \ref{known_rewards} is justified in many setups. However, the system dynamics are stochastic and typically not known. 

Following a policy $\pi$, the expected long-term average costs are given by $\zeta_{\pi}^{P,k}$ for $k\in[K]$. Also, we denote the expected long-term average reward using $\lambda_{\pi}^{P,r}$. Formally, we have:
\begin{align}
    \zeta_{\pi}^{P,k} &= \mathbb{E}_{s_0, a_0, s_1, a_1,\cdots}\left[\lim_{\tau\to\infty}\frac{1}{\tau}\sum_{t=0}^{\tau} c^k\left(s_t,a_t\right)\right]\label{eq:average_cost}\\
    \lambda_{\pi}^{P, r} &= \mathbb{E}_{s_0, a_0, s_1, a_1,\cdots}\left[\lim_{\tau\to\infty}\frac{1}{\tau}\sum_{t=0}^{\tau} r\left(s_t,a_t\right)\right]\label{eq:average_reward}\\
    s_0&\sim \rho_0(s_0),\ a_t\sim \pi(a_t|s_t),\ s_{t+1}\sim P(s_{t+1}|s_t, a_t)\nonumber
\end{align}
For brevity, in the rest of the paper, $\mathbb{E}_{s_t, a_t, s_{t+1}; t\geq 0}[\cdot]$ will be denoted as $\mathbb{E}_{\rho, \pi, P}[\cdot]$, where $s_0\sim \rho_0(s_0),\ a_t\sim \pi(a_t|s_t),\ s_{t+1}\sim P(s_{t+1}|s_t, a_t)$. Both $\zeta_{\pi}^{P,k}$ and $\lambda_{\pi}^{P,r}$ satisfy the following form of Bellman equation:
\begin{align}
    \lambda_{\pi}^{P,r} + h_\pi^{P,r}(s) &= \sum_{a}\pi(a|s)r(s,a)\nonumber\\
    &~+ \sum_{s'} \sum_{a}\pi(a|s) P(s'|s,a)h_\pi^{P,r}(s')
\end{align}
\begin{align}
    \zeta_{\pi}^{P,k} + h_\pi^{P,k}(s) &= \sum_a\pi(a|s)c^k(s,a)\nonumber\\
    &~+ \sum_{s'} \sum_a\pi(a|s) P(s'|s,a)h_\pi^{P,k}(s')
\end{align}
where $h_\pi^{P,r}(s)$ is the bias for reward and $h_\pi^{P,k}$ is the bias for cost $k\in[K]$.

The objective is to find a policy $\pi^*$ which is the solution of the following optimization problem.
\begin{align}
    \max_\pi &\ \lambda_\pi^{P,r}\text{ \ \ s.t.}\\
    \zeta_{\pi}^{P,k} &\leq C_k~~~\forall~k\in[K]
\end{align}
where $C_k~\forall~k\in[K]$ are the bounds on the average costs which the agent needs to satisfy. 

After formulating the optimization problem, we now state our next assumption characterizing the slackness.
\begin{assumption} \label{ch_2_slaters_conditon}
There exists a policy $\pi$, and one constant $\kappa \ge 2ST_M\sqrt{14A\log(AT)/\sqrt{T}} + CST_M/((1-\rho)\sqrt{T})$ such that 
\begin{align}
    \zeta_{\pi}^{P,k} \le C_k - \kappa\label{eq:slater_formulation}
\end{align}
\end{assumption}
The slackness assumption is mild because, in various applications, some a priori knowledge about a strictly feasible policy is available. Hence, this assumption is a standard assumption in the constrained RL literature \citep{efroni2020exploration,ding2021provably,ding2020natural}. $\kappa$ is referred to as Slater's constant. \citet{ding2021provably} assume that the Slater's constant $\kappa$ is known.

Any online algorithm starting with no prior knowledge will need to obtain estimates of transition probabilities $P$ and obtain reward $r$ and costs $c^k, \forall\ k\in[K]$ for each state action pair. Initially, when the algorithm does not have good estimates of the model, it accumulates regret and violates constraints as it does not know the optimal policy. We define reward regret $R(T)$ as the difference between the expected rewards from running the optimal policy $\pi^*$ for $T$ steps and the cumulative reward obtained, or
\begin{align}
    R(T)& = T\lambda_{\pi^*}^{P,r} - \sum_{t=1}^Tr(s_t,a_t) \label{eq:regret_rewards}
\end{align}
Additionally, we define constraint regret $R^k(T)$ for each constraint $k\in[K]$ as the gap between the cumulative cost incurred $c_t^k, k\in[K]$ and constraint bounds, or
\begin{align}
    R^k(T)& = \left(\sum_{t=1}^Tc^k(s_t,a_t) - TC_k\right)_+\label{eq:regret_costs},
\end{align}
where $(x)_+ = \max(0, x)$.

In the following section, we present a model-based algorithm to obtain this policy $\pi^*$, and reward regret and the constraint regret accumulated by the algorithm. 

\section{The CMDP-PSRL Algorithm} \label{sec:cmdp_psrl}
For infinite horizon optimization problems (or $\tau\to\infty$), we can use steady state distribution of the state to obtain expected long-term rewards or costs \citep{puterman2014markov}. We use
    \begin{align}
        \zeta_{\pi}^{P,k} &= \sum_{s\in \mathcal{S}}\sum_{a\in \mathcal{A}}c^k(s, a)d_{\pi}^{P}(s,a),\ \ \forall\ k\in[K]\\
        \lambda_{\pi}^{P,r} &= \sum_{s\in \mathcal{S}}\sum_{a\in \mathcal{A}}r(s, a)d_{\pi}^{P}(s,a)
    \end{align}
where $d_{\pi}^{P}(s,a)$ is the steady state joint distribution of the state and actions under policy $\pi$. 

Based on the above formulation, we can solve the joint optimization problem of following form
    \begin{align}
        \max_{d(s,a)} \sum_{s\in \mathcal{S}}\sum_{a\in \mathcal{A}}r(s, a)d(s,a) \label{eq:optimization_equation}
    \end{align}
with the following set of constraints,
\begin{align}
    \sum_{a\in\mathcal{A}}d(s',a) &= \sum_{s\in\mathcal{S}, a\in\mathcal{A}}P(s'|s, a)d(s,a)\label{eq:transition_constraint}\\
    \sum_{s\in\mathcal{S}, a\in\mathcal{A}} d(s,a) = 1,&\ \  d(s,a) \geq 0 \label{eq:valid_prob_constrant}\\
    \sum_{s\in \mathcal{S}}\sum_{a\in \mathcal{A}}c^k(s, a)d(s,a)&\leq C_k\ \ \forall\ k\in[K]\label{eq:cmdp_constraints}
\end{align}
for all $ s'\in\mathcal{S},~\forall~s\in\mathcal{S},$ and $\forall~a\in\mathcal{A}$.
Equation \eqref{eq:transition_constraint} denotes the constraint on the transition structure for the underlying Markov Process. Equation \eqref{eq:valid_prob_constrant} ensures that the solution is a valid probability distribution. Finally, Equation \eqref{eq:cmdp_constraints} are the constraints for the constrained MDP setup which the policy must satisfy.

Note that the objective in Equation \eqref{eq:optimization_equation} is linear in $d(s,a)$, and the constraints in Equations \eqref{eq:transition_constraint}--\eqref{eq:cmdp_constraints} are also linear; hence, this is a linear programming problem. Since linear programs can be solved in polynomial time \citep{potra2000interior}, we can use standard approaches to solve Equation \eqref{eq:optimization_equation}. After solving the optimization problem, we obtain the optimal policy from the obtained steady state distribution $d^*(s,a)$ as,
\begin{align}
    \pi^*(a|s) = \frac{\bbP(s,a)}{\bbP(s)} = \frac{d^*(s,a)}{\sum_{b\in\mathcal{A}}d^*(s, b)}\ \ \ \forall\ s\in\mathcal{S}\label{eq:optimal_policy}
\end{align}

Since we assumed that the CMDP is ergodic, the Markov Chain induced by the policy $\pi^*$ is ergodic. Hence, every state is reachable following the policy $\pi^*$, so we have $\bbP(s) > 0$ and Equation \eqref{eq:optimal_policy} is well defined for all states $s\in\mathcal{S}$.

Further, since we assumed that the induced Markov Chain is irreducible for all stationary policies, we assume Dirichlet distribution as prior for the state transition probability $P(s'|s, a)$. Dirichlet distribution is also used as a standard prior in literature \citep{agrawal2017optimistic,osband2013more}. Further, there exists a steady state distribution when the transition probability is sampled from a Dirichlet distribution \cite[{Proposition 1}]{agarwal2022multi}.

The complete constrained posterior sampling based algorithm, which we name CMDP-PSRL, is described in Algorithm \ref{alg:model_based_algo}. The algorithm proceeds in epochs, and a new epoch is started whenever the visitation count in epoch $e$, $\nu_e(s,a)$, is at least the total visitations before epoch $e$, $N_e(s,a)$, for any state action pair (Line 8). In Line 9, we sample transition probabilities $\Tilde{P}_e$ using the updated posterior and in Line 10, we update the policy using the optimization problem specified in Equations \eqref{eq:optimization_equation}--\eqref{eq:cmdp_constraints} for $P = \Tilde{P}_e$. Further, if the sampled MDP does not satisfy the cost constraint in Equation \eqref{eq:cmdp_constraints}, we ignore that constraint\footnote{We will show in the analysis that cumulative constraint violations are still bounded.} for that epoch.


\begin{algorithm}[thbp]
    \begin{small}
	\caption{CMDP-PSRL} \label{alg:model_based_algo}
    \begin{algorithmic}[1]
        % \Procedure{Model Based Online}
            \State \textbf{Input: }{$\mathcal{S}, \mathcal{A}, r, c_1, \cdots, c_K$}
            \State Initialize $N(s, a, s') = 1\ \forall (s,a,s')\in \mathcal{S}\times\mathcal{A}\times\mathcal{S},\ \pi_e(a|s) = \frac{1}{|\mathcal{A}|}\ \forall\ (a, s)\in\mathcal{A}\times\mathcal{S},~e = 0,~\nu_e(s,a) = N_e(s,a) = 0~\forall(s,a)\in\mathcal{S}\times\mathcal{A}$
    	        \For{time index $t = 1, 2, \cdots $}
    	            \State Observe state $s$
    	            \State Play action $a\sim\pi_e(\cdot|s)$
    	            \State Observe rewards $\{r^k\}$ and next state $s'$
    	            \State $\nu_e(s,a) +=1,~N(s, a, s') += 1$
    	            \If {$\nu_e(s,a) \geq \max(1, N_e(s,a))$ for any $s,a$} 
    	                \State $\Tilde{P}_e(s'| a, s) \sim Dir(N(s, a, s'))\ \forall\ (s, a, s')$
    	                \State Solve steady state distribution $d(s, a)$ as the  solution of the optimization problem in Equations (\ref{eq:optimization_equation}-\ref{eq:cmdp_constraints}) for $\Tilde{P}_e$.
    	                \State Obtain optimal policy for next epoch, $e+1$, $\pi_{e+1}$ as 
    	               $$\pi_{e+1}(a|s) = \frac{d(s, a)}{\sum_{b\in\mathcal{A}}d(s, b)}$$
                        \State $e = e+1$
                        \State $t_e = t$
                        \State $\nu_e(s,a) = 0, N_e(s,a) = \sum_{e'=0}^{e-1} \nu_{e'}(s,a)~ \forall(s,a)$
    	           \EndIf
    	        \EndFor
        % \EndProcedure
    \end{algorithmic}
    \end{small}
\end{algorithm}

\section{Analysis} \label{sec:regret}
We first establish the feasibility of the optimization problem in Equations \eqref{eq:optimization_equation}--\eqref{eq:cmdp_constraints} for the sampled MDP. We note that we assumed slackness in the true MDP with transition probabilities $P$. Hence, if the sampled MDP is close to the true MDP, the deviation in the cost will be small and there will be a policy which satisfies the constraint in Equation \eqref{eq:cmdp_constraints}. We formalize this intuition in the following result.

\begin{lemma}\label{lem:feasibility_of_sampled_MDP}
Following Algorithm \ref{alg:model_based_algo}, if $t_{e+1}-t_e\ge \sqrt{T}$ and $\Vert\Tilde{P}_e(\cdot\vert s,a) - P(\cdot\vert s,a)\Vert_1\le \sqrt{\frac{14S\log(2At)}{N_e(s,a)}}~\forall~s,a$, then there exists a policy $\pi$ which satisfies,
\begin{align}
    \zeta_{\pi}^{\tilde{P}_e,k} \le C_k~\forall~k\in[K],
\end{align}
and the optimization problem in Equation \eqref{eq:optimization_equation}-\eqref{eq:cmdp_constraints} is feasible, where $t_e$ is the start time of epoch $e$.
\end{lemma}
\begin{proof}[Proof Outline]
We consider the policy $\pi$ which satisfies the Slater's condition in Equation \eqref{eq:slater_formulation}. We then consider the Bellman error of taking one step in MDP with transition probabilities $\tilde{P}_e$ and then following policy $\pi$ on the MDP with transition probabilities $P$. Now, using \cite[{Lemma 1}]{agarwal2022multi} relating the average costs following policy $\pi$ with $P$ and $\tilde{P}_e$ ($\zeta_{\pi}^{P,k}$, and $\zeta_{\pi}^{\tilde{P}_e,k}$ for all $k\in[K]$ respectively) with the Bellman error gives the required result. The complete proof is provided in the supplementary material.
\end{proof}

After obtaining a feasible policy $\pi_e$ maximizing rewards for the sampled MDP, we now quantify its regret. We note that when optimizing for long-term average rewards and long-term average constraints, we want to simultaneously minimize the reward regret and the constraint regrets. Further, even if we know the optimal policy $\pi^*$ beforehand, the deviations resulting from the stochasticity of the process can still result in some constraint violations. Also, since we sample an MDP, the policy which is feasible for the sampled MDP may violate constraints on the true MDP. We want to bound this gap between the $K$ costs for the two MDPs as well.

We aim to quantify the regret from \textbf{(R.1)} deviation of long-term average rewards of the optimal policy because of incorrect knowledge of the MDP ($\lambda_{\pi^*}^{P,r} - \lambda_{\pi_e}^{\tilde{P}_e,r}$),  \textbf{(R.2)} deviation of the long-term average rewards generated by the optimal policy for the sampled MDP on the sampled MDP and the long-term average rewards generated by the optimal policy for the sampled MDP on the true MDP ($\lambda_{\pi_e}^{\tilde{P}_e,r} - \lambda_{\pi_e}^{P,r}$), and \textbf{(R.3)} deviation of the expected rewards from following the optimal policy of the sampled MDP ($\lambda_{\pi_e}^{P,r} - r(s_t,a_t)$).

Similarly, the constraint violations for each $k\in[K]$ are incurred from \textbf{(C.1)} deviation of long-term average costs of the optimal policy because of incorrect knowledge of the MDP ($C_k - \zeta_{\pi_e}^{\tilde{P}_e,k}$),  \textbf{(C.2)} deviation of the long-term average costs generated by the optimal policy for the sampled MDP on the sampled MDP and the long-term average costs generated by the optimal policy for the sampled MDP on the true MDP ($\zeta_{\pi_e}^{\tilde{P}_e,k} - \zeta_{\pi_e}^{P,k}$), and \textbf{(C.3)} deviation of the expected costs from following the optimal policy of the sampled MDP ($\zeta_{\pi_e}^{P,k} - c^k(s_t,a_t)$).

We now prove the regret bounds for Algorithm \ref{alg:model_based_algo}. We first give the high level ideas used in obtaining the bounds on regret. We divide the regret into regret incurred in each epoch $e$. Then, we use the posterior sampling lemma \cite[Lemma 1]{osband2013more} to obtain the equivalence between the long-term average rewards of the true MDP $\mathcal{M}$ and the long-term average rewards for the optimal value of the sampled MDP $\widehat{\mathcal{M}}$. This step allows us to deal with the regret from \textbf{(R.1)}. Then we use the Bellman error formulation to relate average rewards for the  policy $\pi_e$ on $P$ and $\tilde{P}_e$ \citep{agarwal2022multi}. Combining this with Azuma's concentration inequality for Martingales allows us to bound the regret from \textbf{(R.2)} and \textbf{(R.3)}.

Bounding constraint violations requires similar considerations for \textbf{(C.2)} and \textbf{(C.3)}. Further, \textbf{(C.1)} becomes zero if Equation \eqref{eq:cmdp_constraints} is feasible for the sampled MDP. However, if Equation \eqref{eq:cmdp_constraints} is not feasible, the cost may be as high as $1$ ($c^k(s,a) \le 1~\forall~k\in[K]$). We bound the violations by bounding the time-steps for which the optimal policy for unconstrained optimization runs.

To obtain bounds on the regret, we first note that the total number of epochs, $E$, for which Algorithm \ref{alg:model_based_algo} runs is bounded by $O(1 + 2SA + SA\log(T))$ from \cite[{Proposition 1}]{jaksch2010near}.

We formally state the regret bounds and constraint violation bounds in Theorem \ref{thm:regret_bound} which we prove rigorously in the supplementary material.

\begin{theorem}\label{thm:regret_bound}
The expected reward regret $\mathbb{E}\left[R(T)\right]$, and the expected constraint regret $\mathbb{E}\left[R^k(T)\right]~\forall\ k\in[K]$ of Algorithm \ref{alg:model_based_algo} are bounded as
\begin{align}
    \mathbb{E}\left[R(T)\right] &\leq O\left(T_MS\sqrt{AT\log(AT)} + \frac{CS^2A\log T}{1-\rho}\right)\nonumber\\
    \mathbb{E}\left[R^k(T)\right] &\leq O\left(T_MS\sqrt{AT\log(AT)} + \frac{CS^2A\log T}{1-\rho}\right)\nonumber 
\end{align}
\end{theorem}
\begin{proof}[Proof Outline] We break the cumulative regret into the regret incurred in each epoch $e$. This gives us:
\begin{align}
\mathbb{E}\left[R(T)\right] &= \mathbb{E}\left[\sum_{e=1}^E\sum_{t=t_e}^{t_{e+1}-1} \left(\lambda_{\pi^*}^{P,r} - r(s_t, a_t) \right)\right]\\
    &= \sum_{e=1}^E\mathbb{E}\left[\sum_{t=t_e}^{t_{e+1}-1} \left(\lambda_{\pi^*}^{P,r} - r(s_t, a_t) \right)\right]\label{eq:break_into_epochs}\\
    &= \sum_{e=1}^E\mathbb{E}\left[\sum_{t=t_e}^{t_{e+1}-1} \left(\lambda_{\pi_e}^{\Tilde{P}_e,r} - r(s_t, a_t) \right)\right]\label{eq:use_posterior_sampling_lemma}\\
    &= \sum_{e=1}^E\mathbb{E}\left[\sum_{t=t_e}^{t_{e+1}-1}\left(\lambda_{\pi_e}^{\Tilde{P}_e,r} - \lambda_{\pi_e}^{P,r}+ \lambda_{\pi_e}^{P,r} - r(s_t, a_t)\right)\right]\nonumber\\
    &= \sum_{e=1}^E\mathbb{E}\left[\sum_{t=t_e}^{t_{e+1}-1}\left(\lambda_{\pi_e}^{\Tilde{P}_e,r} - \lambda_{\pi_e}^{P,r}\right)\right]\nonumber\\
    &~~+ \mathbb{E}\left[\sum_{e=1}^E\sum_{t=t_e}^{t_{e+1}-1}\left(\lambda_{\pi_e}^{P,r} - r(s_t, a_t)\right)\right] \label{eq:regret_breakdown}
\end{align}
Equation \eqref{eq:use_posterior_sampling_lemma} follows from \cite[{Lemma 1}]{osband2013more} applied to the regret in each epoch of Equation \eqref{eq:break_into_epochs}. Proceeding from Equation \eqref{eq:regret_breakdown} requires additional consideration. Typical proof techniques to bound regret require a bounded bias-span ($\max_{s,s'}(h_{\pi}^{\tilde{P}_e,r}(s)-h_{\pi}^{\tilde{P}_e,r}(s'))$), which may be large for the sampled MDP. For this, we consider an MDP whose transition probability $P_e^r$ satisfies 
\begin{align}
    \lambda_{\pi_e}^{P_e^r,r}&\ge \max_{P'\in \mathcal{P}_{t_e}}\lambda_{\pi_e}^{P',r},\text{where}\\
    \mathcal{P}_{t_e} &= \Big\{P':\|P'(\cdot|s,a) - \Bar{P}_{t_e}(\cdot|s,a)\|_1 \nonumber\\
    &~~~~~~~~\le \sqrt{\frac{14S\log(AT)}{N_e(s,a)}} \Big\}~\forall~s,a\nonumber
\end{align}
where $\Bar{P}_{t_e}(\cdot|s,a)$ is the estimated transition probability given $s,a$ at time $t_e$. We now have,
\begin{align}
R(T) &\le \sum_{e=1}^E\mathbb{E}\left[\sum_{t=t_e}^{t_{e+1}-1}\left(\lambda_{\pi_e}^{P_e^r,r} - \lambda_{\pi_e}^{P,r}\right)\right]\nonumber\\
    &~~+ \sum_{e=1}^E\mathbb{E}\left[\sum_{t=t_e}^{t_{e+1}-1}\left(\lambda_{\pi_e}^{P,r} - r(s_t, a_t)\right)\right] \label{eq:regret_breakdown_optimal_MDP}
\end{align}
The first term of Equation \eqref{eq:regret_breakdown_optimal_MDP} is bounded by bounding the expected Bellman error. The second term is converted to a Martingale sequence by conditioning it on the state $s_{t_e}$ and is bounded using the ergodicity of the MDP $\mathcal{M}$ and Azuma's concentration inequality. The complete proof on bounding the regret is provided in the supplementary material.

Regarding the constraint violations, for each $k\in[K]$, we want to bound,
\begin{align}
    \mathbb{E}\left[R^k(T)\right] &= \mathbb{E}\left[\left(\sum_{t=1}^Tc_k(s_t, a_t) - TC_k\right)_+\right]
\end{align}

We divide the constraint violation regret into regret over epochs as well. Now, for each epoch, we know that the constraint is satisfied by the policy for the sampled MDP. This allows us to obtain:
\begin{align}
    \mathbb{E}\left[R^k(T)\right]&=\mathbb{E}\left[\left(\sum_{e}\sum_{t=t_e}^{t_{e+1}-1}\left(c_k(s_t, a_t) - C_k\right)\right)_+\right] \\
%%%%%%%%%%%%%%%%%    
    &=\mathbb{E}\Big[\Big(\sum_{e}\sum_{t=t_e}^{t_{e+1}-1}\Big(\left(c_k(s_t, a_t) - \zeta_{\pi_e}^{P,k}\right)\nonumber\\
    &~~~+ \left(\zeta_{\pi_e}^{P,k} - \zeta_{\pi_e}^{\Tilde{P}_e,k}\right) + \left(\zeta_{\pi_e}^{\Tilde{P}_e,k} - C_k\right)\Big)\Big)_+\Big]\label{eq:constraint_regret_breakdown}\\
%%%%%%%%%%%%%%%%%    
    &\le\mathbb{E}\Big[\left(\sum_{e}\sum_{t=t_e}^{t_{e+1}-1}c_k(s_t, a_t) - \zeta_{\pi_e}^{P,k}\right)_+\nonumber\\
    &~~~+ \left(\sum_{e}\sum_{t=t_e}^{t_{e+1}-1}\zeta_{\pi_e}^{P,k} - \zeta_{\pi_e}^{\Tilde{P}_e,k}\right)_+\nonumber
\end{align}
\begin{align}
    &~~~+ \left(\sum_{e}\sum_{t=t_e}^{t_{e+1}-1}\zeta_{\pi_e}^{\Tilde{P}_e,k} - C_k\right)_+\Big]\label{eq:max_zero_sum_break}\\
%%%%%%%%%%%%%%%%%    
    &\le\mathbb{E}\Big[\Big|\sum_{e}\sum_{t=t_e}^{t_{e+1}-1}\left(c_k(s_t, a_t) - \zeta_{\pi_e}^{P,k}\right)\Big|\nonumber\\
    &~~~+ \left|\sum_{e}\sum_{t=t_e}^{t_{e+1}-1}\zeta_{\pi_e}^{P,k} - \zeta_{\pi_e}^{\Tilde{P}_e,k}\right|\nonumber\\
    &~~~+ \left(\sum_{e}\sum_{t=t_e}^{t_{e+1}-1}\zeta_{\pi_e}^{\Tilde{P}_e,k} - C_k\right)_+\Big]\label{eq:convert_pos_to_mod}
\end{align}

The first term in Equation \eqref{eq:constraint_regret_breakdown} denotes the difference between the incurred costs and the expected costs from following policy $\pi_e$. The second term denotes the difference between the expected costs from policy $\pi_e$ on the true MDP and on the sampled MDP. The third term denotes the violations of the policy $\pi_e$, which would be zero if the policy $\pi_e$ satisfies the constraint in Equation \eqref{eq:cmdp_constraints} for the sampled MDP.
Equation \eqref{eq:max_zero_sum_break} is obtained from the fact $\max(0, x+y) \le \max(0,x) + \max(0,y)$ and Equation \eqref{eq:convert_pos_to_mod} is obtained from the fact $\max(0,x) \le \vert x\vert$.

The first and second terms in Equation \eqref{eq:constraint_regret_breakdown} are bounded similarly to Equation \eqref{eq:regret_breakdown}, and we focus our attention on the third term. If the optimization problem in Equation \eqref{eq:optimization_equation}-\eqref{eq:cmdp_constraints} is feasible, the term $(\zeta_{\pi_e}^{\tilde{P}_e,k} - C_k) \le 0$ and if the optimization problem is infeasible, the term is upper bounded by $1$ as $C_k \ge 0$ and $\zeta_{\pi_e}^{\tilde{P}_e,k}\le 1$. Hence, we get:
\begin{align}
    &\left(\sum_{e}\sum_{t=t_e}^{t_{e+1}-1}\left(\zeta_{\pi_e}^{\Tilde{P}_e,k} - C_k\right)\right)_+ \nonumber\\
    &\le \sum_{e}\left(\sum_{t=t_e}^{t_{e+1}-1}\zeta_{\pi_e}^{\Tilde{P}_e,k} - C_k\right)_+\label{eq:break_violations_per_epoch}\\
%%%%%%%%%%%%%%%%%            
    &= \sum_{e}\left(\sum_{t=t_e}^{t_{e+1}-1}\zeta_{\pi_e}^{\Tilde{P}_e,k} - C_k\right)_+\bm{1}\left\{t_{e+1}-t_e > \sqrt{T}\right\}\nonumber\\
    &~~~+ \sum_{e}\left(\sum_{t=t_e}^{t_{e+1}-1}\zeta_{\pi_e}^{\Tilde{P}_e,k} - C_k\right)_+\bm{1}\left\{t_{e+1}-t_e \le \sqrt{T}\right\}\\
%%%%%%%%%%%%%%%%%            
    &\le \sum_{e}\sum_{t=t_e}^{t_{e+1}-1}\bm{1}\left\{t_{e+1}-t_e \le \sqrt{T}\right\}\label{eq:constraint_bounded_by_zero}\\
%%%%%%%%%%%%%%%%%            
    &\le \sum_{e}\sqrt{T} = E\sqrt{T}\\
%%%%%%%%%%%%%%%%%            
    &\le (1 + 2SA + SA\log_2(T/SA))\sqrt{T}\label{eq:bound_episodes}
\end{align}
where Equation \eqref{eq:break_violations_per_epoch} follows from the fact that the total violation is no larger than the sum of the violations computed separately for each epoch.
Equation \eqref{eq:constraint_bounded_by_zero} follows from Lemma \ref{lem:feasibility_of_sampled_MDP} as $\left(\zeta_{\pi_e}^{\Tilde{P}_e,k} - C_k\right)\le 0$ when $t_e > \sqrt{T}$ and Equation \eqref{eq:bound_episodes} comes from \cite[{Proposition 1}]{jaksch2010near}.
\end{proof}

We note that for the fundamental setup of unconstrained optimization ($K=0$), the bound is loose compared to that of the UCRL2 algorithm \cite{jaksch2010near}. This is because we use a stochastic policy instead of a deterministic policy. Recall that the optimal policy for the CMDP setup is possibly stochastic \cite{altman1999constrained}. 


\section{Evaluation of the Proposed Algorithm}

\begin{figure}[t]
    \centering
    \subfigure[{Reward growth \textit{w.r.t.} time}]{
        \includegraphics[width=0.45\textwidth]{linear_model-based_rewards_zero_CV.png}
        \label{fig:linear_rewards}
    }
    \subfigure[Regret \textit{w.r.t.} time]{
        \includegraphics[width=0.45\textwidth]{linear_model-based_regrets_zero_CV.png}
        \label{fig:linear_regret}
    }
    \caption{Reward and regret performance of the proposed CMDP-PSRL algorithm on a flow and service control problem for a single queue. The algorithm is compared against the optimistic algorithm from Singh et al. \cite{singh2020learning}, compared to which our algorithm performs extremely well.}
    \label{fig:reward_CMDP-PSRL}
\end{figure}

\begin{figure}[t]
    \centering
    \subfigure[Service constraints \textit{w.r.t.} time]{
        \includegraphics[width=0.45\textwidth]{linear_model-based_service_zero_CV.png}
        \label{fig:linear_service}
    }    
    \subfigure[Flow constraints \textit{w.r.t.} time]{
        \includegraphics[width=0.45\textwidth]{linear_model-based_flow_zero_CV.png}
        \label{fig:linear_flow}
    }        
    \caption{Constraint violation performance of the proposed CMDP-PSRL algorithm on a flow and service control problem for a single queue. The average constraint violations become zero as the algorithm proceeds, however, it never crosses zero to increase the reward further.}
    \label{fig:regret_CMDP-PSRL}
\end{figure}

To validate the performance of the proposed CMDP-PSRL algorithm and the understanding of our analysis, we run the simulation on the flow and service control in a single-server queue, which is introduced in \citep{altman1991constrained}. A discrete-time single-server queue with a buffer of finite size $L$ is considered in this case. The number of customers waiting in the queue is considered as the state in this problem and thus $\vert S\vert=L+1$. Two kinds of actions, service and flow, are considered in the problem and together control the number of customers. The action space for service is a finite subset $A$ of $[a_{min},a_{max}]$, where $0<a_{min}\leq a_{max}<1$. Given a specific service action $a$, the service of a customer is successfully finished with probability $a$. If the service is successful, the length of the queue reduces by $1$. Similarly, the action space for flow is a finite subset $B$ of $[b_{min}, b_{max}]$. In contrast to the service action, the flow action increases the queue length by $1$ with probability $b$ if the specific flow action $b$ is given. Also, we assume that no customer arrives when the queue is full. The overall action space is the Cartesian product of $A$ and $B$. According to the service and flow probabilities, the transition probability can be computed and is given in Table \ref{table:transition}.

\begin{table*}[ht]   
		\caption{Transition probability of the queue system}  
		\label{table:transition}
		\begin{center}  
			\begin{tabular}{|c|c|c|c|}  
				\hline  
				Current State & $P(x_{t+1}=x_t-1)$ & $P(x_{t+1}=x_t)$ & $P(x_{t+1}=x_t+1)$ \\ \hline
				$1\leq x_t\leq L-1$ & $a(1-b)$ & $ab+(1-a)(1-b)$ & $(1-a)b$ \\ \hline
				$x_t=L$ & $a$ & $1-a$ & $0$ \\ \hline
				$x_t=0$ & $0$ & $1-b(1-a)$ & $b(1-a)$ \\ 
				\hline  
			\end{tabular}  
		\end{center}  
\end{table*}

For the reward function as $r(s,a, b)$ and the constraints for service and flow as $c^1(s,a, b)$ and $c^2(s,a, b)$, respectively, and stationary policies for service and flow as $\pi_a$ and $\pi_b$, respectively, the problem can be defined as
\begin{equation}
    \begin{split}
        \max_{\pi_a,\pi_b} &\quad \lim\limits_{T\rightarrow\infty}\frac{1}{T}\sum_{t=1}^{T}r(s_t,\pi_a(s_t),\pi_b(s_t))\\
        s.t. &\quad \lim\limits_{T\rightarrow\infty}\frac{1}{T}\sum_{t=1}^{T}c^1(s_t,\pi_a(s_t),\pi_b(s_t))\geq 0\\
        &\quad \lim\limits_{T\rightarrow\infty}\frac{1}{T}\sum_{t=1}^{T}c^2(s_t,\pi_a(s_t),\pi_b(s_t))\geq 0
    \end{split}
\end{equation}

According to the discussion in \citep{altman1991constrained}, we define the reward function as $r(s,a,b)=5 - s$, which is a decreasing function dependent only on the state. It is reasonable to give a higher reward when the number of customers waiting in the queue is small. For the constraint functions, we define $c^1(s,a,b)=-10a + 6$ and $c^2(s,a,b) = -8(1-b)^2+2$, which are dependent only on the service and flow actions, respectively. A higher constraint value is given if the probabilities for the service and flow are low and high, respectively.

In the simulation, the length of the buffer is set as $L=5$. The service action space is set as $[0.2,0.4,0.6,0.8]$ and the flow action space is set as $[0.4,0.5,0.6,0.7]$. We use the length of horizon $T=50000$ and run $50$ independent simulations of the proposed CMDP-PSRL algorithm. We also plot the standard deviation around the mean value as a shaded region to show the random error. In order to compare this result to the optimal, we assume that the full information of the transition dynamics is known and then use Linear Programming to solve the problem. The optimal cumulative reward from LP is shown to be $4.47$. The reward performance of the \NAM\ algorithm is shown in Figure \ref{fig:reward_CMDP-PSRL} where we observe that the reward converges towards the optimal value. We also plot the constraint violations in Figure \ref{fig:regret_CMDP-PSRL}. The service and flow constraints converge to 0 as expected. We note that the reward of the proposed CMDP-PSRL algorithm becomes closer to the optimal reward as the algorithm proceeds, and it does not violate the constraints to further increase the reward.

We also compared our algorithm against the optimistic algorithm of \cite{singh2020learning}. We note that their algorithm performs significantly worse compared to our algorithm. We attribute this poor performance to two factors. First, an optimistic algorithm does not find a policy for transition probabilities close to $P$ for a significantly long time. Second, they consider a confidence interval for each $P(s'|s,a)$. This also shows in their analysis and hence they obtain an $O(T^{2/3})$ regret bound. Further, the optimization problem takes significantly more time to solve for the optimistic setup. However, the variance of their optimistic algorithm is significantly lower compared to the variance of our \NAM\ algorithm.

\section{Conclusion} \label{conclusion}
This paper considers the setup of reinforcement learning in ergodic infinite-horizon constrained Markov Decision Processes with $K$ long-term average constraints. We propose a posterior sampling based algorithm, \NAM, which proceeds in epochs. At every epoch, we sample a new CMDP and generate a solution for the constrained optimization problem. A major advantage of the posterior sampling based algorithm over an optimistic approach is that it reduces the complexity of solving for the optimal solution of the constrained problem. We also study the proposed \NAM\ algorithm from a regret perspective. We bound the regret of the reward collected by the \NAM\ algorithm as $\Tilde{O}(T_MS\sqrt{AT} + CS^2A/(1-\rho))$. Further, we bound the gap between the long-term average costs of the sampled MDP and the true MDP to bound the $K$ constraint violations as $\Tilde{O}(T_MS\sqrt{AT} + CS^2A/(1-\rho))$. Finally, we evaluate the proposed \NAM\ algorithm on a flow control problem for a single queue and show that the proposed algorithm performs well empirically. This paper is the first work which obtains an $\Tilde{O}(\sqrt{T})$ regret bound for ergodic MDPs with long-term average constraints using a posterior sampling algorithm. A model-free algorithm that obtains similar regret bounds for infinite horizon long-term average constraints remains an open problem. 

	\bibliography{refs}
	%\newpage
	\onecolumn
	%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\appendix
%\newpage
\section{Proof for Regret Bounds}\label{app:regret_bound_proof}
We now complete the proof of Theorem \ref{thm:regret_bound} here. 

\subsection{Variable Definitions}
We first define some important variables required for the proof.

We define value $V_{\gamma,\pi}^{P,r}, V_{\gamma,\pi}^{P,k}$ function for rewards $r$ and cost $c^k$ as:
\begin{align}
    V_{\gamma,\pi}^{P,r}(s) &= \mathbb{E}\left[\sum_{t=0}^\infty \gamma^tr(s_t,a_t)|s_0 = s\right]\\
    V_{\gamma,\pi}^{P,k}(s) &= \mathbb{E}\left[\sum_{t=0}^\infty \gamma^tc^k(s_t,a_t)|s_0 = s\right]
\end{align}

We also define Q-value $Q_{\gamma,\pi}^{P,r}, Q_{\gamma,\pi}^{P,k}$ function for rewards $r$ and cost $c^k$ as:
\begin{align}
    Q_{\gamma,\pi}^{P,r}(s,a) &= \mathbb{E}\left[\sum_{t=0}^\infty \gamma^tr(s_t,a_t)|s_0 = s, a_0 = a\right]\\
    Q_{\gamma,\pi}^{P,k}(s,a) &= \mathbb{E}\left[\sum_{t=0}^\infty \gamma^tc^k(s_t,a_t)|s_0 = s, a_0 = a\right]
\end{align} 

Based on this, we define Bellman error $B_{\pi}^{P',r}, B_{\pi}^{P',k}$ function for rewards $r$ and cost $c^k$ as:

\begin{align}
    B_{\pi}^{P',r}(s,a) &= \lim_{\gamma\to 1}\left(Q_{\gamma,\pi}^{P',r}(s,a) - \left(r(s,a) + \gamma\sum_{s'}P(s'|s,a)V_{\gamma,\pi}^{P',r}(s')\right)\right)\label{eq:Bellman_def}\\
    B_{\pi}^{P',k}(s,a) &= \lim_{\gamma\to 1}\left(Q_{\gamma,\pi}^{P',k}(s,a) - \left(c^k(s,a) + \gamma\sum_{s'}P(s'|s,a)V_{\gamma,\pi}^{P',k}(s')\right)\right)
\end{align}

\newpage
\subsection{Auxiliary Lemmas}

We now state and prove various lemmas required to complete the proof of Theorem \ref{thm:regret_bound}.

The first lemma obtains concentration bounds for the sampled MDP. We have:

\begin{lemma}\label{lem:ell_1_conc}
The probability that the event 
\begin{align}
    \mathcal{E}_t = \left\{\Vert \Bar{P}_t(\cdot\vert s, a) - P(\cdot\vert s, a)\Vert_1 \leq \sqrt{\frac{14S\log(2AT)}{\max\{1, n_t(s,a)\}}}\forall (s,a)\in\mathcal{S}\times\mathcal{A}\right\}
\end{align}
fails to occur for any $t\le T$ is bounded by $\frac{1}{T^5}$.
\end{lemma}
\begin{proof}[Proof Outline]
From the result of \cite{weissman2003inequalities}, the $\ell_1$ distance of a probability distribution over $S$ events with $n$ samples is bounded as:
\begin{align}
    \mathbb{P}\left(\Vert  P(\cdot\vert s,a) - \Bar{P}_t(\cdot\vert s,a)\Vert_1\geq \epsilon\right)&\leq (2^S-2)\exp{\left(-\frac{n(s,a)\epsilon^2}{2}\right)}\nonumber\\
    &\le (2^S)\exp{\left(-\frac{n(s,a)\epsilon^2}{2}\right)}
\end{align}

Thus, for $\epsilon = \sqrt{\frac{2}{n(s,a)}\log(2^S20 SAT^7)}\leq \sqrt{\frac{14S}{n(s,a)}\log(2AT)}$, we have
\begin{align}
    \mathbb{P}\left(\Vert  P(\cdot\vert s,a) - \Bar{P}_t(\cdot\vert s,a)\Vert_1\geq \sqrt{\frac{14S}{n(s,a)}\log(2AT)}\right)&\leq (2^S)\exp{\left(-\frac{n(s,a)}{2}\frac{2}{n(s,a)}\log(2^S20 SAT^7)\right)}\\
    &= 2^S \frac{1}{2^S 20 SAT^7}\\
    &= \frac{1}{20AST^7}
\end{align}

We sum over all the possible values of $n(s,a)$ up to time-step $t$ to bound the probability that the event $\mathcal{E}_t$ does not occur as:
\begin{align}
    \sum_{n(s,a)=1}^t \frac{1}{20SAT^7} \leq \frac{1}{20SAT^6}
\end{align}

Finally, summing over all the $s,a$, we get
\begin{align}
    \mathbb{P}\left(\Vert  P(\cdot\vert s,a) -\Bar{P}_t(\cdot\vert s,a) \Vert_1\geq \sqrt{\frac{14S}{n(s,a)}\log(2AT)}~\forall s,a\right) \leq \frac{1}{20T^6}
\end{align}

Further, using union bounds and summing over all the $t\le T$, we get
\begin{align}
    \mathbb{P}\left(\Vert  P(\cdot\vert s,a) -\Bar{P}_t(\cdot\vert s,a) \Vert_1\geq \sqrt{\frac{14S}{n(s,a)}\log(2AT)}~\forall s,a~\forall~t\le T \right) &\leq \sum_{t=1}^T\frac{1}{20T^6}\\
    &\le \frac{1}{T^5} 
\end{align}
\end{proof}


The next lemma relates the difference between the average per-step reward $\lambda_{\pi}^{P,r}$ (or cost $\lambda_{\pi}^{P,k}$) for following policy $\pi$ on the true MDP with transition probabilities $P$ and the average per-step reward $\lambda_\pi^{\Tilde{P},r}$ for following policy $\pi$ on the MDP with transition probabilities $\tilde{P}$ to the Bellman error $B_\pi^{\tilde{P},r}(s,a)$ as:

\begin{lemma}\label{lem:bound_average_by_bellman}
The difference between the long-term average rewards for running the policy $\pi$ on the MDP with transition probabilities $\Tilde{P}$, $\lambda_{\pi}^{\Tilde{P},r}$, and the long-term average rewards for running the policy $\pi$ on the true MDP, $\lambda_{\pi}^{P,r}$, is the long-term average Bellman error as
\begin{align}
    \lambda_{\pi}^{\Tilde{P},r} - \lambda_{\pi}^{P,r} = \sum_{s,a}d_{\pi}(s,a) B_{\pi}^{\Tilde{P},r}(s,a) = \mathbb{E}_{\pi, P}\left[B_{\pi}^{\Tilde{P},r}(s,a)\right].
\end{align}
\end{lemma}

\begin{proof}
Note that for all $s\in\mathcal{S}$, we have:
\begin{align}
    V_{\gamma,\pi}^{\Tilde{P},r}(s) &= \mathbb{E}_{a\sim\pi}\left[Q_{\gamma,\pi}^{\Tilde{P},r}(s,a)\right]\\
    &= \mathbb{E}_{a\sim\pi}\left[B_{\gamma,\pi}^{\Tilde{P},r}(s,a) + r(s,a) + \gamma\sum_{s'\in\mathcal{S}}P(s'\vert s,a)V_{\gamma,\pi}^{\Tilde{P},r}(s')\right]\label{eq:optimistic_MDP_lambda}
\end{align}
where Equation \eqref{eq:optimistic_MDP_lambda} follows from the definition of the Bellman error for state action pair $s,a$.

Similarly, for the true MDP, we have,
\begin{align}
    V_{\gamma,\pi}^{P,r}(s) &= \mathbb{E}_{a\sim\pi}\left[Q_{\gamma,\pi}^{P,r}(s,a)\right]\\
    &= \mathbb{E}_{a\sim\pi}\left[r(s,a)+ \gamma\sum_{s'\in\mathcal{S}}P(s'\vert s,a)V_{\gamma,\pi}^{P,r}(s')\right] \label{eq:true_MDP_lambda}
\end{align}

Subtracting Equation \eqref{eq:true_MDP_lambda} from Equation \eqref{eq:optimistic_MDP_lambda}, we get:
\begin{align}
V_{\gamma,\pi}^{\Tilde{P},r}(s) - V_{\gamma,\pi}^{P,r}(s) &= \mathbb{E}_{a\sim\pi}\left[B_{\gamma,\pi}^{\Tilde{P},r}(s,a) + \gamma\sum_{s'\in\mathcal{S}}P(s'\vert s,a)\left(V_{\gamma,\pi}^{\Tilde{P},r} - V_{\gamma,\pi}^{P,r}\right)(s')\right]\\
&= \mathbb{E}_{a\sim\pi}\left[B_{\gamma,\pi}^{\Tilde{P},r}(s,a)\right] + \gamma\sum_{s'\in\mathcal{S}}P_{\pi}(s'\vert s)\left(V_{\gamma,\pi}^{\Tilde{P},r} - V_{\gamma,\pi}^{P,r}\right)(s')
\end{align}
Using the vector format for the value functions, we have,
\begin{align}
    \Bar{V}_{\gamma,\pi}^{\Tilde{P},r} - \Bar{V}_{\gamma,\pi}^{P,r} &= \left(I-\gamma P_{\pi}\right)^{-1}{B}_{\gamma,\pi}^{\Tilde{P},r}%\mathbb{E}_{a\sim\pi}\left[(\cdot,a)\right]
\end{align}
Now, converting the value function to average per-step reward we have,
\begin{align}
    \lambda_{\pi}^{\Tilde{P},r}\bm{1}_S - \lambda_{\pi}^{P,r}\bm{1}_S &= \lim_{\gamma\to1}(1-\gamma)\left(\Bar{V}_{\gamma,\pi}^{\Tilde{P},r} - \Bar{V}_{\gamma,\pi}^{P,r}\right)\\
    &= \lim_{\gamma\to1}(1-\gamma)\left(I-\gamma P_{\pi}\right)^{-1}{B}_{\gamma,\pi}^{\Tilde{P},r}\\
    &= \left(\sum_{s,a}d_{\pi}^P(s,a) B_{\pi}^{\Tilde{P},r}(s,a)\right)\bm{1}_S
\end{align}
where the last equation follows from the definition of occupancy measures by \cite{puterman2014markov}, and the existence of the limit $\lim_{\gamma\to1}B_{\gamma,\pi}^{\tilde{P},r}$ in Equation \eqref{eq:value_to_bias}.
\end{proof}

After relating the gap between the long-term average rewards of policy $\pi_e$ on the two MDPs, we now want to bound the sum of Bellman error over an epoch. For this, we first bound the Bellman error for a particular state action pair $s,a$ in the form of following lemma. We have,
\begin{lemma}\label{lem:bound_bellman_s_a_main}
For an MDP with rewards $r(s,a)$ and transition probability $\Tilde{P}(s'\vert s,a)$ such that $\Vert \Tilde{P}(\cdot\vert s,a) - P(\cdot\vert s,a)\Vert_1 \le \epsilon_{s,a}$, the Bellman error $B_{\pi_e}^{\Tilde{P},r}(s,a)$ for state-action pair $s,a$ is upper bounded as
\begin{align}
B_{\pi}^{\Tilde{P},r}(s,a) \le \big\Vert \Tilde{P}(\cdot\vert s,a) - P(\cdot\vert s,a)\big\Vert_1\Vert h_{\pi}^{\Tilde{P},r}(\cdot)\Vert _\infty
\end{align}
where $\Vert h_{\pi}^{\Tilde{P},r}(\cdot)\Vert _\infty$ is the bias-span of the MDP with transition probability $\Tilde{P}$.
\end{lemma}
\begin{proof}
Starting with the definition of Bellman error in Equation \eqref{eq:Bellman_def}, we get
\begin{align}
%%%%%%%%%%%%%%%%%
B_{\pi}^{\Tilde{P},r}(s,a) &= \lim_{\gamma\to1}B_{\gamma,\pi}^{\Tilde{P},r}(s,a)\\
%%%%%%%%%%%%%%%%%
&= \lim_{\gamma\to1}\left(Q_{\gamma,\pi}^{\Tilde{P},r}(s,a) - \left(r(s,a) +\gamma \sum_{s'\in\mathcal{S}}P(s'\vert s,a)V_{\gamma,\pi}^{\Tilde{P},r}(s') \right)\right)\\
%%%%%%%%%%%%%%%%%
&=\lim_{\gamma\to1}\left(\left(r(s,a) + \gamma\sum_{s'\in\mathcal{S}} \Tilde{P}(s'\vert s,a)V_{\gamma,\pi}^{\Tilde{P},r}(s')\right) - \left(r(s,a) +\gamma \sum_{s'\in\mathcal{S}}P(s'\vert s,a)V_{\gamma,\pi}^{\Tilde{P},r}(s') \right)\right)\\
%%%%%%%%%%%%%%%%%
&= \lim_{\gamma\to1}\gamma\sum_{s'\in\mathcal{S}}\left( \Tilde{P}(s'\vert s,a) - P(s'\vert s,a)\right)V_{\gamma,\pi}^{\Tilde{P},r}(s')\label{eq:rewards_known}\\
%%%%%%%%%%%%%%%%%
&= \lim_{\gamma\to1}\gamma\left(\sum_{s'\in\mathcal{S}}\left( \Tilde{P}(s'\vert s,a) - P(s'\vert s,a)\right)V_{\gamma,\pi}^{\Tilde{P},r}(s') + V_{\gamma,\pi}^{\Tilde{P},r}(s) - V_{\gamma,\pi}^{\Tilde{P},r}(s)\right)\\
%%%%%%%%%%%%%%%%%
&= \lim_{\gamma\to1}\gamma\Big(\sum_{s'\in\mathcal{S}}\left( \Tilde{P}(s'\vert s,a) - P(s'\vert s,a)\right)V_{\gamma,\pi}^{\Tilde{P},r}(s') - \sum_{s'\in\mathcal{S}} \Tilde{P}(s'\vert s,a)V_{\gamma,\pi}^{\Tilde{P},r}(s)\nonumber\\
&~~~~~+ \sum_{s'\in\mathcal{S}} P(s'\vert s,a)V_{\gamma,\pi}^{\Tilde{P},r}(s)\Big)\\
%%%%%%%%%%%%%%%%%
&= \lim_{\gamma\to1}\gamma\left(\sum_{s'\in\mathcal{S}}\left( \Tilde{P}(s'\vert s,a) - P(s'\vert s,a)\right)\left(V_{\gamma,\pi}^{\Tilde{P},r}(s') - V_{\gamma,\pi}^{\Tilde{P},r}(s)\right)\right)\\
%%%%%%%%%%%%%%%%%
&= \left(\sum_{s'\in\mathcal{S}}\left( \Tilde{P}(s'\vert s,a) - P(s'\vert s,a)\right)\lim_{\gamma\to1}\gamma\left(V_{\gamma,\pi}^{\Tilde{P},r}(s') - V_{\gamma,\pi}^{\Tilde{P},r}(s)\right)\right)\label{eq:interchange_limit_and_expectation}\\
%%%%%%%%%%%%%%%%%
&= \left(\sum_{s'\in\mathcal{S}}\left( \Tilde{P}(s'\vert s,a) - P(s'\vert s,a)\right)h_{\pi}^{\Tilde{P},r}(s')\right)\label{eq:value_to_bias}\\
%%%%%%%%%%%%%%%%%
&\le \Big\Vert \Tilde{P}(\cdot\vert s,a) - P(\cdot\vert s,a)\Big\Vert_1\Vert h_{\pi}^{\Tilde{P},r}(\cdot)\Vert_\infty\label{eq:reward_holders}\\
%%%%%%%%%%%%%%%%%
&\le \epsilon_{s,a}\Tilde{T}_M \label{eq:bound_bias}
\end{align}
where Equation \eqref{eq:rewards_known} comes from the assumption that the rewards are known to the agent. Equation \eqref{eq:interchange_limit_and_expectation} follows from the fact that the difference between value function at two states is bounded. Equation \eqref{eq:value_to_bias} comes from the definition of bias term \cite{puterman2014markov} where $h$ is the bias of the policy $\pi$ when run on the sampled MDP. Equation \eqref{eq:reward_holders} follows from H\"{o}lder's inequality. In Equation \eqref{eq:bound_bias}, the $\ell_1$ norm of probability vector difference is bounded from the definition.

Additionally, note that the $\ell_1$ norm in Equation \eqref{eq:reward_holders} is bounded by $2$. Thus the Bellman error is loosely upper bounded by $2\Vert h_{\pi}^{\Tilde{P},r}(\cdot)\Vert_\infty$ for all state-action pairs.
\end{proof}

\begin{lemma}[Bounded Span of optimal MDP in confidence interval]\label{lem:bounded_v_span_of_optimal_MDP}
For an MDP with rewards $r(s,a)$ and transition probabilities $P_e^r=\arg\max_{P_e\in\mathcal{P}_{t_e}}\lambda_{\pi_e}^{P_e,r}$, for policy $\pi_e$, the difference between the biases of any two states $s$ and $s'$ is upper bounded by the mixing time $T_M$ of the true MDP as:
\begin{align}
    h_{\pi_e}^{P_e^r,r}(s) - h_{\pi_e}^{P_e^r,r}(s') \leq T_M~ \forall~s,s'\in \mathcal{S}
\end{align}
\end{lemma}

\begin{proof}
Note that $\lambda_{\pi_e}^{P_e^r,r} \ge \lambda_{\pi_e}^{P',r}$ for all $P'\in\mathcal{P}_{t_e}$. Now, consider the following Bellman equation:
\begin{align}
    h_{\pi_e}^{P_e^r,r}(s) &= r_{\pi_e}(s) - \lambda_{\pi_e}^{P_e^r,r} + \langle P_{\pi_e,e}^r(\cdot\vert s), h_{\pi_e}^{P_e^r,r}\rangle\nonumber\\
    &= Th_{\pi_e}^{P_e^r,r}(s)
\end{align}
where $r_{\pi_e}(s) = \sum_{a}\pi_e(a\vert s)r(s,a)$ and $P_{\pi_e,e}^r(s'\vert s) = \sum_{a}\pi_e(a\vert s)P_e^r(s'\vert s,a)$.

Consider two states $s, s'\in \mathcal{S}$. Also, let $\tau$ be a random variable defined as:
\begin{align}
    \tau = \min\{t\geq 1: s_t = s', s_1 = s\}
\end{align}

We also define another operator, 
\begin{align}
\bar{T}h(s)=
\begin{cases}
\min_{s,a}r(s,a) - \lambda_{\pi_e}^{P_e^r,r} + \langle P_{\pi_e}(\cdot\vert s), h\rangle, &s\neq s'\\
h_{\pi_e}^{P_e^r,r}(s'), &s=s'
\end{cases}
\end{align}
where $P_{\pi_e}(s'\vert s) = \sum_{a}\pi_e(a\vert s)P(s'\vert s,a)$.

Now, note that 
\begin{align}
    h(s) &= Th(s)\\
    &=\max_{P'\in\mathcal{P}_{t_e}}\left(r_{\pi_e}^r(s) -\lambda_{\pi_e}^{P_e^r,r} + \langle P_{\pi_e}', h\rangle\right)\\
    &\ge r_{\pi_e}^r(s) -\lambda_{\pi_e}^{P_e^r,r} + \langle P_{\pi_e}, h\rangle\\
    &\ge \min_{s,a}r(s,a) -\lambda_{\pi_e}^{P_e^r,r} + \langle P_{\pi_e}, h\rangle\\
    &= \bar{T}h(s)
\end{align}
Further, for any two vectors $u, w$, where all the elements of $u$ are not smaller than those of $w$, we have $\bar{T}u \ge \bar{T}w$. Hence, we have $\bar{T}^nh_{\pi}^{P_e^r,r}(s) \le h_{\pi}^{P_e^r,r}(s)$ for all $s$. Unrolling the recurrence, we have
\begin{align}
    h_{\pi}^{P_e^r,r}(s) \ge \bar{T}^nh_{\pi}^{P_e^r,r}(s) = \mathbb{E}\left[-(\lambda_\pi^{P_e^r,r} - \min_{s,a}r(s,a))(n\wedge\tau) + h_{\pi}^{P_e^r,r}(s_{n\wedge\tau})\right]
\end{align}
For $\lim n\to \infty$, we have $h_{\pi}^{P_e^r,r}(s) \ge h_{\pi}^{P_e^r,r}(s') - T_M$, completing the proof.
\end{proof}

\subsection{Proof of results from main text}
After stating the necessary lemmas, we can now prove Lemma \ref{lem:feasibility_of_sampled_MDP} and Theorem \ref{thm:regret_bound}.

\begin{proof}[Proof of Theorem \ref{thm:regret_bound}]
We continue our proof from Equation \eqref{eq:regret_breakdown_optimal_MDP}. We had:
\begin{align}
    R(T) &\le \sum_{e=1}^E\mathbb{E}\left[\sum_{t=t_e}^{t_{e+1}-1}\left(\lambda_{\pi_e}^{P_e^r,r} - \lambda_{\pi_e}^{P,r}\right)\right] + \sum_{e=1}^E\mathbb{E}\left[\sum_{t=t_e}^{t_{e+1}-1}\left(\lambda_{\pi_e}^{P,r} - r(s_t, a_t)\right)\right]\\
    &= R_1(T) + R_2(T)\label{eq:regret_breakdown_optimal_MDP_app}
\end{align}
where $R_1(T)$ and $R_2(T)$ are:
\begin{align}
    R_1(T) &= \sum_{e=1}^E\mathbb{E}\left[\sum_{t=t_e}^{t_{e+1}-1}\left(\lambda_{\pi_e}^{P_e^r,r} - \lambda_{\pi_e}^{P,r}\right)\right]\\
    R_2(T) &= \sum_{e=1}^E\mathbb{E}\left[\sum_{t=t_e}^{t_{e+1}-1}\left(\lambda_{\pi_e}^{P,r} - r(s_t, a_t)\right)\right]
\end{align}

We first consider the $R_2(T)$ term. We start by defining the filtration $\mathcal{H}_t = \{s_0,a_0, \cdots, s_t, a_t\}$ as the set of observed states and played actions. Further, we have $\lambda_{\pi_e}^{P,r}$ as
\begin{align}
    \lambda_{\pi_e}^{P,r} = \mathbb{E}_{(s,a)\sim\pi_e,P}[r(s,a)]
\end{align}
We have,
\begin{align}
    \mathbb{E}_{(s,a)\sim\pi_e, P}[r(s,a)] &= \mathbb{E}_{(s,a)\sim\pi_e, P}[r(s,a)] \pm \mathbb{E}_{(s_t,a_t)\sim\pi_e, P}[r(s_t,a_t)\vert \mathcal{H}_{t_e-1}]\label{eq:expectation_to_conditional_expectation}\\
    &=\mathbb{E}_{(s_t,a_t)\sim\pi_e, P}[r(s_t,a_t)\vert \mathcal{H}_{t_e-1}] + \left(\mathbb{E}_{(s,a)\sim\pi_e, P}[r(s,a)]- \mathbb{E}_{(s_t,a_t)\sim\pi_e, P}[r(s_t,a_t)\vert \mathcal{H}_{t_e-1}]\right)\\
    &\le\mathbb{E}_{(s_t,a_t)\sim\pi_e, P}[r(s_t,a_t)\vert \mathcal{H}_{t_e-1}] + 2\left(\Vert \pi_e(a\vert s)d_{\pi_e}(s) - \pi_e(a\vert s)P_{\pi,s_{t_e-1}}^{t-t_e+1}(s)\Vert _{TV}\right)\label{eq:change_expectation_to_diff_prob}\\
    &\le \mathbb{E}_{(s_t,a_t)\sim\pi_e, P}[r(s_t,a_t)\vert \mathcal{H}_{t_e-1}] + 2CS\rho^{t-t_e}\label{eq:TV_bounded_by_l1}
\end{align}
Hence, we have,
\begin{align}
    \sum_{t=t_e}^{t_{e+1}-1}\left(\lambda_{\pi_e}^{P,r} - r(s_t,a_t)\right) &= \sum_{t=t_e}^{t_{e+1}-1}\left(\mathbb{E}_{(s,a)\sim\pi_e,P}[r(s,a)] - r(s_t,a_t)\right)\\
    &\le \sum_{t=t_e}^{t_{e+1}-1}\left(\mathbb{E}_{(s_t,a_t)\sim\pi_e, P}[r(s_t,a_t)\vert \mathcal{H}_{t_e-1}] + 2CS\rho^{t-t_e} - r(s_t,a_t)\right)\\
    &\le \sum_{t=t_e}^{t_{e+1}-1}\left(\mathbb{E}_{(s_t,a_t)\sim\pi_e, P}[r(s_t,a_t)\vert \mathcal{H}_{t_e-1}] - r(s_t,a_t)\right) + \sum_{t=t_e}^\infty 2CS\rho^{t-t_e}\\
    &\le \sum_{t=t_e}^{t_{e+1}-1}\left(\mathbb{E}_{(s_t,a_t)\sim\pi_e, P}[r(s_t,a_t)\vert \mathcal{H}_{t_e-1}] - r(s_t,a_t)\right) + \frac{2CS}{1-\rho}
\end{align}
Using Azuma-Hoeffding's inequality, we get,
\begin{align}
    \sum_{t=t_e}^{t_{e+1}-1}\left(\mathbb{E}_{(s_t,a_t)\sim\pi_e, P}[r(s_t,a_t)\vert \mathcal{H}_{t_e-1}] - r(s_t,a_t)\right)&\le 2\sqrt{(t_{e+1}-t_e)\log(2T)}
\end{align}
with probability at least $1-1/T$. Summing over all the epochs and using Cauchy-Schwarz inequality, we get:
\begin{align}
    \sum_{e=1}^E\sum_{t=t_e}^{t_{e+1}-1}\left(\lambda_{\pi_e}^{P,r} - r(s_t,a_t)\right) &\le \sum_{e=1}^E\left(\sum_{t=t_e}^{t_{e+1}-1}\left(\mathbb{E}_{(s_t,a_t)\sim\pi_e, P}[r(s_t,a_t)\vert \mathcal{H}_{t_e-1}] - r(s_t,a_t)\right) + \frac{2CS}{1-\rho}\right)\\
    &\le \sum_{e=1}^E2\sqrt{(t_{e+1}-t_e)\log(2T)} + \frac{2CSE}{1-\rho}\\
    &\le 2\sqrt{E\sum_{e=1}^E(t_{e+1}-t_e)\log(2T)} + \frac{2CSE}{1-\rho}\\
    &= 2\sqrt{ET\log(2T)} + \frac{2CSE}{1-\rho}
\end{align}

% \begin{align}
%     \sum_{e=1}^E + \sum_{e=1}^E\frac{2CS}{1-\rho}\\
%     &\le 4\sqrt{T\log(2T)} + \frac{2CSE}{1-\rho}
% \end{align}
with probability at least $1-E/T$. Further, the maximum value of the sum is bounded by $T$, and the complementary event occurs with probability at most $E/T$, which gives,
\begin{align}
    \mathbb{E}\left[R_2(T)\right] &=  \sum_{e=1}^E\mathbb{E}\left[\sum_{t=t_e}^{t_{e+1}-1}\left(\lambda_{\pi_e}^{P,r} - r(s_t, a_t)\right)\right]\\
    &\le 4\sqrt{ET\log(2T)} + \frac{2CSE}{1-\rho} + \frac{E}{T}T\\
    &= E + 4\sqrt{ET\log(2T)} + \frac{2CSE}{1-\rho}
\end{align}

We can now focus on the $R_1(T)$ term. We have:
\begin{align}
    R_1(T) &= \sum_{e=1}^E\mathbb{E}\left[\sum_{t=t_e}^{t_{e+1}-1}(\lambda_{\pi_e}^{P_e^r,r} - \lambda_{\pi_e}^{P,r})\right]\\
    &= \sum_{e=1}^E\mathbb{E}\left[\sum_{t=t_e}^{t_{e+1}-1}\mathbb{E}_{s,a\sim\pi,P}\left[B_{\pi_e}^{P_e^r,r}(s,a)\right]\right]
\end{align}

Similar to Equations \eqref{eq:expectation_to_conditional_expectation}-\eqref{eq:TV_bounded_by_l1}, we have:
\begin{align}
    \sum_{e=1}^E\sum_{t=t_e}^{t_{e+1}-1}\mathbb{E}_{s,a\sim\pi,P}\left[B_{\pi_e}^{P_e^r,r}(s,a)\right] &\le \sum_{e=1}^E\sum_{t=t_e}^{t_{e+1}-1}\mathbb{E}_{s,a\sim\pi,P}\left[B_{\pi_e}^{P_e^r,r}(s,a)|\mathcal{H}_{t_e-1}\right] + \sum_{e=1}^E\sum_{t=t_e}^{t_{e+1}-1}2CT_MS\rho^{t-t_e}\label{eq:total_bellman_sum}
\end{align}
Again, using Azuma-Hoeffding's inequality, with probability at least $1-1/T$ we have:
\begin{align}
    \sum_{t=t_e}^{t_{e+1}-1}\mathbb{E}_{s,a\sim\pi,P}\left[B_{\pi_e}^{P_e^r,r}(s,a)|\mathcal{H}_{t_e-1}\right] \le \sum_{t=t_e}^{t_{e+1}-1}B_{\pi_e}^{P_e^r,r}(s_t,a_t) + 2T_M\sqrt{(t_{e+1}-t_e)\log(2T)}
\end{align}
Summing over all the epochs, we get, with probability at least $1-E/T$:
\begin{align}
    \sum_{e=1}^E\sum_{t=t_e}^{t_{e+1}-1}\mathbb{E}_{s,a\sim\pi,P}\left[B_{\pi_e}^{P_e^r,r}(s,a)|\mathcal{H}_{t_e-1}\right] &\le \sum_{e=1}^E\sum_{t=t_e}^{t_{e+1}-1}B_{\pi_e}^{P_e^r,r}(s_t,a_t) + \sum_{e=1}^E 2T_M\sqrt{(t_{e+1}-t_e)\log(2T)}\\
    &\le \sum_{e=1}^E\sum_{t=t_e}^{t_{e+1}-1}B_{\pi_e}^{P_e^r,r}(s_t,a_t) + 2T_M\sqrt{E\sum_{e=1}^E(t_{e+1}-t_e)\log(2T)}\\
    &=  \sum_{e=1}^E\sum_{t=t_e}^{t_{e+1}-1}B_{\pi_e}^{P_e^r,r}(s_t,a_t) + 2T_M\sqrt{ET\log(2T)}\\
    &\le \sum_{e=1}^E\sum_{t=t_e}^{t_{e+1}-1}\Big\Vert \Tilde{P}(\cdot\vert s_t,a_t) - P(\cdot\vert s_t,a_t)\Big\Vert_1\Vert h_{\pi}^{\Tilde{P},r}(\cdot)\Vert_\infty + 2T_M\sqrt{ET\log(2T)}\label{eq:use_Lemma_bound_on_Bellman}\\
    &\le \sum_{e=1}^E\sum_{s,a}\nu_e(s,a)2\sqrt{\frac{14S\log(2AT)}{N_e(s,a)}}\Vert h_{\pi}^{\Tilde{P},r}(\cdot)\Vert_\infty + 2T_M\sqrt{ET\log(2T)}\label{eq:use_ell_1_bound}\\
    &\le 2T_M\sqrt{14S\log(2AT)}\sum_{s,a}\sum_{e=1}^E\frac{\nu_e(s,a)}{\sqrt{N_e(s,a)}} + 2T_M\sqrt{ET\log(2T)}\label{eq:use_lemma_bias_span_bound}\\
    &\le 2(\sqrt{2}+1)T_M\sqrt{14S\log(2AT)}\sum_{s,a}\sqrt{N(s,a)} + 2T_M\sqrt{ET\log(2T)}\label{eq:jaksch_sum_lemma}\\
    &\le 2(\sqrt{2}+1)T_M\sqrt{14S\log(2AT)}\sqrt{SAT} + 2T_M\sqrt{ET\log(2T)}\label{eq:use_cauchy_schwarz}
\end{align}
where Equation \eqref{eq:use_Lemma_bound_on_Bellman} follows from Lemma \ref{lem:bound_bellman_s_a_main}. Equation \eqref{eq:use_ell_1_bound} follows from Lemma \ref{lem:ell_1_conc} with probability $1-1/T^{5}$. Equation \eqref{eq:use_lemma_bias_span_bound} comes from Lemma \ref{lem:bounded_v_span_of_optimal_MDP}. Equation \eqref{eq:jaksch_sum_lemma} follows from \cite[{Lemma 19}]{jaksch2010near} and Equation \eqref{eq:use_cauchy_schwarz} follows from Cauchy-Schwarz inequality.

Together with Equation \eqref{eq:total_bellman_sum}, we get:
\begin{align}
    R_1(T) \le 2(\sqrt{2}+1)T_MS\sqrt{14AT\log(2AT)} + 2T_M\sqrt{ET\log(2T)} + \frac{2CT_MSE}{1-\rho} + E + \sqrt{T}
\end{align}

Combining $R_1(T)$ and $R_2(T)$ we get the required bound on regret.
The bound on constraint violations follows similarly.
\end{proof}

\begin{proof}[Proof of Lemma \ref{lem:feasibility_of_sampled_MDP}]
We begin by considering the policy $\pi$ in Assumption \ref{ch_2_slaters_conditon}. We prove the result for a single $k\in[K]$; the same argument then applies to every $k\in[K]$. We consider an MDP with transition dynamics $P_e^k$ which maximizes $\zeta_{\pi}^{P',k}$ over all $P'$ satisfying $\|P'(\cdot\vert s,a)-P(\cdot\vert s,a)\|_1\le \sqrt{\frac{14S\log(2At)}{N_e(s,a)}}$ for all $s,a$.
Consider the difference between the average cost $k$ incurred from following policy $\pi$ on the MDP with true transition probabilities $P$ and the average cost $k$ incurred from following policy $\pi$ on the MDP with transition probabilities $P_e^k$. Using Lemma \ref{lem:bound_average_by_bellman}, we have:
\begin{align}
    \zeta_{\pi}^{\tilde{P}_e,k} - \zeta_{\pi}^{P,k} &\le \zeta_{\pi}^{P_e^k,k} - \zeta_{\pi}^{P,k}\\
    &= \sum_{s,a}d_{\pi}^P(s,a)B_{\pi}^{P_e^k,k}(s,a)\\
    &= \mathbb{E}\left[B_{\pi}^{P_e^k,k}(s,a)\right]\label{eq:cost_difference_feasibility}
\end{align}
where the Bellman error $B_{\pi}^{P_e^k,k}(s,a)$ is of the following form,
\begin{align*}
    B_{\pi}^{\tilde{P}_e,k}(s,a) = \lim_{\gamma\to1}\left(Q_{\gamma, \pi}^{\Tilde{P},k}(s,a) - c^k(s,a) -  \gamma\sum\nolimits_{s'\in\mathcal{S}}P(s'\vert s,a)V_{\gamma, \pi}^{\Tilde{P}, k}(s')\right),
\end{align*}
and the value function $V_{\gamma, \pi}^{\Tilde{P},k}(s)$ and the $Q$-value function $Q_{\gamma, \pi}^{\Tilde{P},k}(s,a)$ are given by:
\begin{align*}
    V_{\gamma, \pi}^{\Tilde{P},k}(s) &= \sum_{t=1}^\infty \gamma^{t-1}\mathbb{E}_{a_t\sim \pi, s_{t+1}\sim \Tilde{P}}\left[c^k(s_t, a_t)\vert s_1 = s\right],\\
    Q_{\gamma, \pi}^{\Tilde{P},k}(s,a) &= \sum_{t=1}^\infty \gamma^{t-1}\mathbb{E}_{a_t\sim \pi, s_{t+1}\sim \Tilde{P}}\left[c^k(s_t, a_t)\vert s_1 = s, a_1 = a\right].
\end{align*}

We bound the expectation using Azuma-Hoeffding's inequality as follows:
\begin{align}
    \mathbb{E} \left[ B_{\pi}^{P_e^k,k}(s,a)\right] &\le \mathbb{E} \left[ B_{\pi}^{P_e^k,k}(s_t,a_t)\vert \mathcal{H}_{t_e-1}\right] + C\Vert h_{\pi}^{P_e,k}(\cdot)\Vert_\infty\rho^{t-t_e}\\
    &\le \frac{1}{t_{e+1}-t_e}\sum_{t=t_e}^{t_{e+1}-1}\left(\mathbb{E} \left[ B_{\pi}^{P_e^k,k}(s_t,a_t)\vert \mathcal{H}_{t_e-1}\right] + C\Vert h_{\pi}^{P_e,k}(\cdot)\Vert_\infty\rho^{t-t_e}\right)\label{eq:sum_both_sides}\\
    &\le \frac{1}{t_{e+1}-t_e}\sum_{t=t_e}^{t_{e+1}-1}\left(\mathbb{E} \left[ B_{\pi}^{P_e^k,k}(s_t,a_t)\vert \mathcal{H}_{t_e-1}\right]\right) + \frac{CS\Vert h_{\pi}^{P_e,k}(\cdot)\Vert_\infty}{(1-\rho)(t_{e+1}-t_e)}\label{eq:gp_sum_bellman}\\
    &\le \frac{1}{t_{e+1}-t_e}\left(T_M\sqrt{14S\log AT}\sum_{s,a}\frac{\nu_e(s,a)}{\sqrt{N_e(s,a)}} + 4T_M\sqrt{7(t_{e+1}-t_e)\log(t_{e+1}-t_e)}\right)\nonumber\\
    &~~+ \frac{CST_M}{(1-\rho)(t_{e+1}-t_e)}\label{eq:bellman_expectation}\\
    &\le \frac{1}{t_{e+1}-t_e}\left(T_M\sqrt{14S\log AT}\sum_{s,a}\sqrt{\nu_e(s,a)} + 4T_M\sqrt{7(t_{e+1}-t_e)\log(t_{e+1}-t_e)}\right)\nonumber\\
    &~~+ \frac{CST_M}{(1-\rho)(t_{e+1}-t_e)}\label{eq:lower_bound_N_e}\\
    &\le \frac{1}{t_{e+1}-t_e}\left(T_M S\sqrt{14A\log AT}\sqrt{\sum_{s,a}\nu_e(s,a)} + 4T_M\sqrt{7(t_{e+1}-t_e)\log(t_{e+1}-t_e)}\right)\nonumber\\
    &~~+ \frac{CST_M}{(1-\rho)(t_{e+1}-t_e)}\label{eq:cauchy_schwarz_1}\\
    &\le \frac{1}{t_{e+1}-t_e}\left(T_M S\sqrt{14A\log AT}\sqrt{(t_{e+1}-t_e)} + 4T_M\sqrt{7(t_{e+1}-t_e)\log(t_{e+1}-t_e)}\right)\nonumber\\
    &~~+ \frac{CST_M}{(1-\rho)(t_{e+1}-t_e)}\label{eq:sum_N_e_is_epoch_length}\\
    &\le \left(T_M S\sqrt{\frac{14A\log AT}{(t_{e+1}-t_e)}} + 4T_M\sqrt{\frac{7\log(t_{e+1}-t_e)}{(t_{e+1}-t_e)}}\right)+ \frac{CST_M}{(1-\rho)(t_{e+1}-t_e)}\label{eq:reduce_error_terms}
\end{align}
where Equation \eqref{eq:sum_both_sides} is obtained by averaging both sides over $t = t_e$ to $t= t_{e+1}-1$ (the left-hand side does not depend on $t$). Equation \eqref{eq:gp_sum_bellman} is obtained by summing the geometric series with ratio $\rho$. Equation \eqref{eq:bellman_expectation} comes from the analysis used in the proof of Theorem \ref{thm:regret_bound}. Equation \eqref{eq:lower_bound_N_e} comes from the fact that $N_e(s,a) \ge \nu_e(s,a)$ for all $s,a$, so that $\nu_e(s,a)/\sqrt{N_e(s,a)} \le \sqrt{\nu_e(s,a)}$. Equation \eqref{eq:cauchy_schwarz_1} follows from the Cauchy--Schwarz inequality. Equation \eqref{eq:sum_N_e_is_epoch_length} follows from the fact that the epoch length $t_{e+1}-t_e$ is the same as the total number of visits to all state-action pairs in the epoch.

Combining Equation \eqref{eq:reduce_error_terms} with Equation \eqref{eq:cost_difference_feasibility}, we obtain the required result as follows:
\begin{align}
\zeta_{\pi}^{\tilde{P}_e,k} &\le \zeta_{\pi}^{P_e^k,k} - \zeta_{\pi}^{P,k} + \zeta_{\pi}^{P,k}\\
&\le \left(T_M S\sqrt{\frac{14A\log AT}{(t_{e+1}-t_e)}} + 4T_M\sqrt{\frac{7\log(t_{e+1}-t_e)}{(t_{e+1}-t_e)}}\right)+ \frac{CST_M}{(1-\rho)(t_{e+1}-t_e)} + \zeta_{\pi}^{P,k}\\
&\le \left(T_M S\sqrt{\frac{14A\log AT}{\sqrt{T}}} + 4T_M\sqrt{\frac{7\log(\sqrt{T})}{\sqrt{T}}}\right)+ \frac{CST_M}{(1-\rho)\sqrt{T}} + \zeta_{\pi}^{P,k}\label{eq:replace_eplen_with_fT}\\
&\le \left(T_M S\sqrt{\frac{14A\log AT}{\sqrt{T}}} + 4T_M\sqrt{\frac{7\log(\sqrt{T})}{\sqrt{T}}}\right)+ \frac{CST_M}{(1-\rho)\sqrt{T}} + C_k -\kappa\label{eq:use_assumption_slater}\\
&\le C_k\label{eq:use_slater_value}
\end{align}
where Equation \eqref{eq:replace_eplen_with_fT} comes from the fact that we consider epoch length $t_{e+1}-t_e\ge\sqrt{T}$, Equation \eqref{eq:use_assumption_slater} comes from Assumption \ref{ch_2_slaters_conditon}, and Equation \eqref{eq:use_slater_value} comes from the value of Slater's constant $\kappa$ in Assumption \ref{ch_2_slaters_conditon}. Replicating the analysis for all $k\in[K]$ shows that, for the policy $\pi$, $\zeta_{\pi}^{\tilde{P}_e,k}$ satisfies the constraint for every $k\in[K]$; hence, the optimization problem in Equations \eqref{eq:optimization_equation}--\eqref{eq:cmdp_constraints} is feasible.
\end{proof}
	
\end{document}