% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
% version; also before submission to
% see how the non-anonymous paper would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
% ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced automatically for papers to be published. Do not make any other change above this note for an accepted version.
%!TEX root =  main.tex
%------------------------
\usepackage{lmodern}
\usepackage[english]{babel}
\usepackage{latexsym}
\usepackage{amsmath}
\usepackage{mathrsfs}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{bm}
\usepackage{datetime}
\usepackage[table,xcdraw]{xcolor}
\usepackage{accents}
\usepackage{tikz}
\usepackage{listings}
\usepackage{mdframed}
\usepackage{pgfplots}
\usepackage{pgfplotstable}
\usepackage[algoruled, linesnumbered]{algorithm2e} 
\usepackage{xr}

\usepackage{dsfont}
\usepackage{color}
\usepackage{colortbl}
\usepackage{pifont}
\usepackage{caption}
\usepackage{microtype} % improved spacing between words for easier reading
\usepackage{float}
\usepackage{xfrac} % sfrac
\usepackage{xspace}
\usepackage{booktabs}
\usepackage{blkarray}
\usepackage{graphicx}
\usepackage{subcaption}


\usepackage[textsize=tiny,
disable
]{todonotes}
% Margin note attributed to Csaba, drawn on a light-orange background.
% todonotes is loaded with the `disable` option above, so these notes are currently suppressed.
\newcommand{\todoc}[1]{\todo[color=orange!20!white]{Csaba: #1}}


% Inline colour helpers: typeset the argument in red / blue / teal via \textcolor.
\newcommand{\red}[1]{\textcolor{red}{#1}}
\newcommand{\blue}[1]{\textcolor{blue}{#1}}
\newcommand{\teal}[1]{\textcolor{teal}{#1}}

%%%%%%%%-- adding this new command here----------
\usepackage{zref-xr}
\usepackage{nameref}
\usepackage{hyperref}
%---------------------------------


% \usepackage{hyperref}
% PDF viewer behaviour, document metadata and link colours.
\hypersetup{
    % bookmarks=true,         % show bookmarks bar?
    unicode=false,          % non-Latin characters in Acrobat's bookmarks
    pdftoolbar=true,        % show Acrobat's toolbar?
    pdfmenubar=true,        % show Acrobat's menu?
    pdffitwindow=false,     % window fit to page when opened
    pdfstartview={FitH},    % fits the width of the page to the window
    pdftitle={Towards Painless Policy Optimization for Constrained MDPs},    % title (same text as \title)
    pdfauthor={XXX},     % author; placeholder kept so that an anonymised build does not expose names
    pdfsubject={Bandits, Reinforcement Learning},   % subject of the document
    pdfcreator={Creator},   % creator of the document
    pdfproducer={Producer}, % producer of the document
    pdfkeywords={bandits} {reinforcement learning} {policy gradient}, % list of keywords
    pdfnewwindow=true,      % links in new window
    colorlinks=true,       % false: boxed links; true: colored links
    linkcolor=blue,          % color of internal links (change box color with linkbordercolor)
    citecolor=blue,        % color of links to bibliography
    filecolor=magenta,      % color of file links
    urlcolor=cyan           % color of external links
}
\usepackage{amsthm}
\usepackage{times}
\usepackage{nicefrac}
\usepackage{wrapfig}
\usepackage{pgfplots}
% \usepackage[capitalize]{cleveref}
\usepackage{thm-restate}


%----------------------
\usepackage[capitalize,noabbrev]{cleveref}
\zxrsetup{toltxlabel=true,tozreflabel=false,verbose}
%---------------------------------------------
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% THEOREMS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% "shaded" theorem style: bold upright head, medium-weight note in parentheses,
% upright body, wrapped in an mdframed box with a light-grey background
% (shadecolor, 90% grey) and no frame lines.
% It is not applied by any declaration in the visible preamble: the
% \declaretheorem line that would use it is commented out just below.
\definecolor{shadecolor}{gray}{0.90}
\declaretheoremstyle[
headfont=\normalfont\bfseries,
notefont=\mdseries, notebraces={(}{)},
bodyfont=\normalfont,
postheadspace=0.5em,
spaceabove=5pt,
mdframed={
  skipabove=3pt,
  skipbelow=3pt,
  hidealllines=true,
  backgroundcolor={shadecolor},
  innerleftmargin=2pt,
  innerrightmargin=2pt}
]{shaded}
% \declaretheorem[style=shaded]{theorem}

% Theorem-like environments. `theorem` is numbered within sections; every other
% environment below shares the `theorem` counter.
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
% The next three environments are declared under the `definition` style.
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\newtheorem{remark}[theorem]{Remark}
% NOTE(review): no environment is declared under `remark` before the style is
% switched again further down; `remark` itself was declared under `definition`.
\theoremstyle{remark}

%% Some suggested packages, as needed:
\usepackage[round]{natbib} % has a nice set of citation styles and commands
\bibliographystyle{plainnat}
% Print the bibliography heading as an unnumbered \subsubsection titled "References".
\renewcommand{\bibsection}{\subsubsection*{References}}

% Redefine the internal hook \thm@space@setup so that the skips before and
% after theorem-like environments are both 0pt.
% \makeatletter / \makeatother are needed because the names contain `@`.
% NOTE(review): presumably this is the amsthm hook run at the start of each
% theorem environment -- confirm against the amsthm version in use.
\makeatletter
\def\thm@space@setup{\thm@preskip=0pt
\thm@postskip=0pt}
\makeatother

% Set the four display-math skips (above/below, normal and "short" variants) to 0pt.
% NOTE(review): these registers are normally re-assigned by \normalsize, which
% LaTeX executes at \begin{document}; if so, the four assignments below have no
% effect when made in the preamble -- confirm before relying on them.
\setlength{\abovedisplayskip}{0pt}
\setlength{\belowdisplayskip}{0pt}
\setlength{\abovedisplayshortskip}{0pt}
\setlength{\belowdisplayshortskip}{0pt}

% Space between top/bottom floats and the text: 10pt, may stretch by 1pt or shrink by 2pt.
\setlength{\textfloatsep}{10pt plus 1.0pt minus 2.0pt}

% Frame style for boxed statements: no outer left/right margin, 0.75em of
% padding above the content and 0.5em of padding on the left and right.
\mdfdefinestyle{mdframedthmbox}{%
  leftmargin=0pt,%
  rightmargin=0pt,%
  innertopmargin=0.75em,%
  innerleftmargin=0.5em,%
  innerrightmargin=0.5em,%
}

% thmbox: an mdframed environment that uses the style defined above.
\newenvironment{thmbox}{\begin{mdframed}[style=mdframedthmbox]}{\end{mdframed}}


% \myquote{text}:  forces a line break (\null~\\) and then sets `text` in a
%                  top-aligned minipage 0.90\textwidth wide, shifted right by 0.05\textwidth.
% \myquoten{text}: the same indented minipage without the preceding line break.
\newcommand{\myquote}[1]{\null~\\{\null\hspace{.05\textwidth}\begin{minipage}[t]{.90\textwidth} #1 \end{minipage}}}
\newcommand{\myquoten}[1]{{\null\hspace{.05\textwidth}\begin{minipage}[t]{.90\textwidth} #1 \end{minipage}}}

% cleveref names for references of type `algline` (presumably algorithm line
% numbers -- confirm against where the labels are set).
% The original repeated the same \crefname line twice and used "Line" as the
% plural; give the lower-case (\cref) and sentence-initial (\Cref) forms once
% each, with a real plural so multi-references read "Lines 3 and 4".
\crefname{algline}{Line}{Lines}
\Crefname{algline}{Line}{Lines}


%!TEX root =  main.tex
% Expectation / probability shorthands. \EE and \Prob use fixed-size delimiters;
% \EEg wraps its argument in auto-sized \left[ ... \right].
\newcommand{\E}{\mathbb{E}}
\newcommand{\cZ}{\mathcal{Z}}
\newcommand{\EE}[1]{\E[#1]}
\newcommand{\EEg}[1]{\E\left[#1\right]}
\newcommand{\Prob}[1]{\mathbb{P}(#1)}
\newcommand{\PP}{\mathbb{P}}
% \one{E} prints \mathbb{I}\{E\} (indicator-style notation).
\newcommand{\one}[1]{\mathbb{I}\{#1\}}
\newcommand{\Supp}{\operatorname{supp}}
% Angle brackets around the argument: \ip is fixed-size, \bip auto-sized.
\newcommand{\ip}[1]{\langle #1 \rangle}
\newcommand{\bip}[1]{\left\langle #1 \right\rangle}
% Norms: \norm and \norminf auto-size their bars (\norminf adds an \infty subscript);
% \indnorm{x}{p} puts #2 as a subscript and \normsq squares the norm, both with fixed-size bars.
\newcommand{\norm}[1]{\left\|#1\right\|}
\newcommand{\norminf}[1]{\left\|#1\right\|_{\infty}}
\newcommand{\indnorm}[2]{\|#1\|_{#2}}
\newcommand{\normsq}[1]{\|#1\|^2}
\newcommand{\R}{\mathbb{R}}

% \N is blackboard-bold N; \cX is shorthand for \mathcal{X} for the letters listed.
\newcommand{\N}{\mathbb{N}}
\newcommand{\cA}{\mathcal{A}}
\newcommand{\cB}{\mathcal{B}}
\newcommand{\cC}{\mathcal{C}}
\newcommand{\cD}{\mathcal{D}}
\newcommand{\cE}{\mathcal{E}}
\newcommand{\cF}{\mathcal{F}}
\newcommand{\cG}{\mathcal{G}}
\newcommand{\cH}{\mathcal{H}}
\newcommand{\cK}{\mathcal{K}}
\newcommand{\cL}{\mathcal{L}}
\newcommand{\cN}{\mathcal{N}}
\newcommand{\cM}{\mathcal{M}}
\newcommand{\cP}{\mathcal{P}}
\newcommand{\cQ}{\mathcal{Q}}
\newcommand{\cR}{\mathcal{R}}
\newcommand{\cS}{\mathcal{S}}
\newcommand{\cT}{\mathcal{T}}
\newcommand{\cV}{\mathcal{V}}
% \sA is not defined here. The former \newcommand{\sA}{\mathscr A} at this spot
% never took effect: the "Sets" family further down defines \sA with \def as
% {\mathbb{A}}, silently replacing it before any use.

% Named aliases: \epsapp expands to \epsilon and \epssub to \delta.
\newcommand{\epsapp}{\epsilon}
\newcommand{\epssub}{\delta}

\DeclareMathOperator{\Range}{range}
\newcommand{\rows}{\operatorname{rows}}

% From here on \epsilon prints as \varepsilon. \epsapp expands to \epsilon at
% use time, so it follows this redefinition as well.
\renewcommand{\epsilon}{\varepsilon}
\newcommand{\tvarepsilon}{\tilde{\varepsilon}}



% Auto-sized ceiling and floor brackets.
\newcommand{\ceil}[1]{\left\lceil {#1} \right\rceil}
\newcommand{\floor}[1]{\left\lfloor {#1} \right\rfloor}
% Bold all-ones / all-zeros symbols.
\newcommand{\ones}{\mathbf{1}}
\newcommand{\zeros}{\mathbf{0}}
% arg min / arg max as limit-taking operators (starred form: subscripts go
% underneath in display style). A thin space (\,) separates the two words
% instead of a full interword space (\ ), which set them too far apart.
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\argmax}{arg\,max}

% `example` is declared under the plain style and shares the `theorem` counter.
\theoremstyle{plain}
\newtheorem{example}[theorem]{Example}

% Bold 0 and 1, each wrapped in an extra brace group.
\def\rvzero{{\mathbf{0}}}
\def\rvone{{\mathbf{1}}}

% NOTE(review): the macro name is spelled \identiymatrix (no second "t"); it is
% left unchanged here so that existing uses keep compiling.
\def\identiymatrix{\mathbf{Id}}

% Upright "softmax", and D with an upright KL subscript.
\newcommand{\softmax}{\mathrm{softmax}}
\newcommand{\KL}{D_{\mathrm{KL}}}

% Graph
% \gA ... \gZ: each \gX expands to {\mathcal{X}}.
% These use \def, so an existing definition of the same name would be
% overwritten without any warning.
\def\gA{{\mathcal{A}}}
\def\gB{{\mathcal{B}}}
\def\gC{{\mathcal{C}}}
\def\gD{{\mathcal{D}}}
\def\gE{{\mathcal{E}}}
\def\gF{{\mathcal{F}}}
\def\gG{{\mathcal{G}}}
\def\gH{{\mathcal{H}}}
\def\gI{{\mathcal{I}}}
\def\gJ{{\mathcal{J}}}
\def\gK{{\mathcal{K}}}
\def\gL{{\mathcal{L}}}
\def\gM{{\mathcal{M}}}
\def\gN{{\mathcal{N}}}
\def\gO{{\mathcal{O}}}
\def\gP{{\mathcal{P}}}
\def\gQ{{\mathcal{Q}}}
\def\gR{{\mathcal{R}}}
\def\gS{{\mathcal{S}}}
\def\gT{{\mathcal{T}}}
\def\gU{{\mathcal{U}}}
\def\gV{{\mathcal{V}}}
\def\gW{{\mathcal{W}}}
\def\gX{{\mathcal{X}}}
\def\gY{{\mathcal{Y}}}
\def\gZ{{\mathcal{Z}}}

% Sets
% \sA ... \sZ: each \sX expands to {\mathbb{X}}.
% These use \def, so any earlier definition of the same name is silently
% replaced by the blackboard-bold version given here.
\def\sA{{\mathbb{A}}}
\def\sB{{\mathbb{B}}}
\def\sC{{\mathbb{C}}}
\def\sD{{\mathbb{D}}}
% Don't use a set called E, because this would be the same as our symbol
% for expectation.
\def\sE{{\mathbb{E}}}
\def\sF{{\mathbb{F}}}
\def\sG{{\mathbb{G}}}
\def\sH{{\mathbb{H}}}
\def\sI{{\mathbb{I}}}
\def\sJ{{\mathbb{J}}}
\def\sK{{\mathbb{K}}}
\def\sL{{\mathbb{L}}}
\def\sM{{\mathbb{M}}}
\def\sN{{\mathbb{N}}}
\def\sO{{\mathbb{O}}}
\def\sP{{\mathbb{P}}}
\def\sQ{{\mathbb{Q}}}
\def\sR{{\mathbb{R}}}
\def\sS{{\mathbb{S}}}
\def\sT{{\mathbb{T}}}
\def\sU{{\mathbb{U}}}
\def\sV{{\mathbb{V}}}
\def\sW{{\mathbb{W}}}
\def\sX{{\mathbb{X}}}
\def\sY{{\mathbb{Y}}}
\def\sZ{{\mathbb{Z}}}

% Upright "dim" with a calligraphic E subscript, and the operators diam and Ber.
\newcommand{\dimE}{\mathrm{dim}_{\mathcal{E}}}
\DeclareMathOperator{\diam}{diam}
% \Alg is not defined here. The former \newcommand{\Alg}{\mathcal{A}} at this
% spot never took effect: further down, \def\Alg/{{CBP}} redefines \Alg as the
% algorithm name (invoked as \Alg/), silently replacing it before any use.
\DeclareMathOperator{\Ber}{Ber}

% Value-function notation. The argument becomes the superscript (e.g. a policy
% or an iteration index). Macros without a trailing "s" append (\rho); the
% variants ending in "s" omit it; "hat" variants put a hat on V.
\newcommand{\val}[1]{V^{#1}(\rho)}
\newcommand{\vals}[1]{V^{#1}}

% Subscript c (the "const" family).
\newcommand{\const}[1]{V_c^{#1}(\rho)}
\newcommand{\consthat}[1]{\hat{V}_c^{#1}(\rho)}
\newcommand{\optconst}{V_c^{*}(\rho)}

\newcommand{\consts}[1]{V_c^{#1}}
\newcommand{\consthats}[1]{\hat{V}_c^{#1}}
\newcommand{\optconsts}{V_c^{*}}

% Subscript r (the "reward" family).
\newcommand{\reward}[1]{V_r^{#1}(\rho)}
\newcommand{\rewardhat}[1]{\hat{V}_r^{#1}(\rho)}
\newcommand{\optreward}{V_r^{*}(\rho)}

\newcommand{\rewards}[1]{V_r^{#1}}
\newcommand{\rewardhats}[1]{\hat{V}_r^{#1}}
\newcommand{\optrewards}{V_r^{*}}


% Subscript l (the "lag" family). \lag and \laghat take two arguments that are
% joined by a comma in the superscript; \optlag takes a single superscript.
\newcommand{\lag}[2]{V_{l}^{#1,#2}(\rho)}
\newcommand{\laghat}[2]{\hat{V}_{l}^{#1,#2}(\rho)}
\newcommand{\optlag}[1]{V_l^{#1}(\rho)}

% Subscript d (the "dual" family), evaluated at \rho.
\newcommand{\dual}[1]{V_d^{#1}(\rho)}
\newcommand{\dualhat}[1]{\hat{V}_d^{#1}(\rho)}
\newcommand{\optdual}{V_d^{*}(\rho)}

% Q and A symbols with subscript l; plain, hatted and (for A) tilde variants.
\newcommand{\lagq}[1]{Q_{l}^{#1}}
\newcommand{\lagqhat}[1]{\hat{Q}_{l}^{#1}}

\newcommand{\laga}[1]{A_{l}^{#1}}
\newcommand{\lagahat}[1]{\hat{A}_{l}^{#1}}
\newcommand{\lagatilde}[1]{\tilde{A}_{l}^{#1}}

\newcommand{\cI}{\mathcal{I}}



% Q symbols with subscript r (reward) and c (constraint). The argument is the
% superscript; "hat" variants put a hat on Q; variants ending in "s" append (s,\cdot).
\newcommand{\rewardq}[1]{Q_{r}^{#1}}
\newcommand{\rewardqhat}[1]{\hat{Q}_{r}^{#1}}

\newcommand{\rewardqs}[1]{Q_{r}^{#1}(s,\cdot)}
\newcommand{\rewardqhats}[1]{\hat{Q}_{r}^{#1}(s,\cdot)}

\newcommand{\constq}[1]{Q_{c}^{#1}}
\newcommand{\constqhat}[1]{\hat{Q}_{c}^{#1}}

\newcommand{\constqs}[1]{Q_{c}^{#1}(s,\cdot)}
\newcommand{\constqhats}[1]{\hat{Q}_{c}^{#1}(s,\cdot)}

% Policy and multiplier shorthands: starred, indexed by t or t+1, hatted,
% tilde and primed variants of \pi, plus \lambda_t, \lambda_{t+1} and \tilde{\tau}.
\newcommand{\piopt}{\pi^*}
\newcommand{\pit}{\pi_{t}}
\newcommand{\pihat}{\hat{\pi}}
\newcommand{\pitil}{\tilde{\pi}}
\newcommand{\tautil}{\tilde{\tau}}
\newcommand{\pip}{\pi^{\prime}}
\newcommand{\pitt}{\pi_{t+1}}
\newcommand{\lambdat}{\lambda_{t}}
\newcommand{\lambdatt}{\lambda_{t+1}}
% Superscript "T" and "-1" in sans-serif at scriptscript size, used as x\transpose / x\inv.
% The superscript argument is braced so that ^ always receives one explicit
% group rather than depending on how \mathsf happens to expand.
\newcommand{\transpose}{^{\mathsf{\scriptscriptstyle T}}}
\newcommand{\inv}{^{\mathsf{\scriptscriptstyle -1}}}


% \varepsilon with a tiny text subscript "b" / "s".
\newcommand{\epsb}{\varepsilon_{\text{\tiny{b}}}}
\newcommand{\epss}{\varepsilon_{\text{\tiny{s}}}}

% Algorithm name. The macro is delimited by a slash: it must be written \Alg/
% (the "/" is consumed), which avoids the usual space-gobbling after a control word.
\def\Alg/{{CBP}}
\newcommand{\Tau}{\mathrm{T}}
\newcommand{\thetahat}{\hat{\theta}}

% Colour-coded algorithm names, also slash-delimited: CBP in red, GDA in blue, CRPO in teal.
\def\cAlg/{\red{CBP}}
\def\cGDA/{\blue{GDA}}
\def\cCRPO/{\teal{CRPO}}

% %--------------------------
% \externaldocument[supp:]{jain_326-supp}
% \usepackage{nameref}
% \usepackage{zref-xr,zref-user}
% \zxrsetup{toltxlabel}
% %----------------------


%-----------------------------------
\zexternaldocument*{jain_326-supp}

\title{Towards Painless Policy Optimization for Constrained MDPs}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1$^*$]{Arushi Jain}
\author[2\thanks{The first two authors contributed equally. Email: arushi.jain@mail.mcgill.ca, vaswani.sharan@gmail.com.}]{Sharan Vaswani}
\author[3]{Reza Babanezhad}
\author[4,5]{Csaba Szepesv\'ari}
\author[1,5]{Doina Precup}

% Add affiliations after the authors
\affil[1]{%
    Mila, McGill University
}
\affil[2]{%
   Simon Fraser University
}
\affil[3]{%
   SAIT AI Lab, Montreal
  }
\affil[4]{%
   Amii, University of Alberta 
   }
\affil[5]{%
DeepMind
} 
  
  
\begin{document}
\maketitle

%!TEX root =  main.tex
\begin{abstract}
We study policy optimization in an infinite horizon, $\gamma$-discounted constrained Markov decision process (CMDP). Our objective is to return a policy that achieves large expected reward with a small constraint violation. We consider the online setting with linear function approximation and assume global access to the corresponding features. We propose a generic primal-dual framework that allows us to bound the reward sub-optimality and constraint violation for arbitrary algorithms in terms of their primal and dual regret on online linear optimization problems. We instantiate this framework to use coin-betting algorithms and propose the \textbf{Coin Betting Politex (CBP)} algorithm. Assuming that the action-value functions are $\varepsilon_{\text{\tiny{b}}}$-close to the span of the $d$-dimensional state-action features and no sampling errors, we prove that $T$ iterations of CBP result in an $O\left(\frac{1}{(1 - \gamma)^3 \sqrt{T}} + \frac{\varepsilon_{\text{\tiny{b}}} \sqrt{d}}{(1 - \gamma)^2} \right)$ reward sub-optimality and an $O\left(\frac{1}{(1 - \gamma)^2 \sqrt{T}} + \frac{\varepsilon_{\text{\tiny{b}}} \sqrt{d}}{1 - \gamma} \right)$ constraint violation. Importantly, unlike gradient descent-ascent and other recent methods, CBP does not require extensive hyperparameter tuning. Via experiments on synthetic and Cartpole environments, we demonstrate the effectiveness and robustness of CBP.
\end{abstract}
% Unlike policy gradient methods, \Alg/ does not incur a dependence on the concentrability coefficient, [Move to Intro + some subtlety]




% For a CMDP, the agent aims to return a policy that maximizes the expected discounted reward while satisfying a constraint on the expected utility. 

% OLD Abstract
% We study the planning problem in an infinite-horizon, discounted constrained Markov decision process (CMDP). We consider the online learning setting with linear function approximation and assume global access to the corresponding features. In order to return a good policy that achieves large expected reward but satisfies the constraints, we propose a generic primal-dual framework. Our framework allows us to bound the sub-optimality gap and constraint violation in terms of the primal and dual regret for arbitrary algorithms. We instantiate this framework to use coin-betting algorithms from online linear optimization to control both the primal and dual regret and refer to the resulting algorithm as Coin Betting Politex (\Alg/). Assuming that the action-value functions are $\varepsilon$-close to the span of $d$-dimensional state-action features and $\gamma$ is the discount factor, we prove that $T$ iterations of \Alg/ result in an $O\left(\frac{1}{(1 - \gamma)^3 \sqrt{T}} + \frac{\varepsilon \sqrt{d}}{1 - \gamma} \right)$ bound on the reward sub-optimality and a constraint violation of $O\left(\frac{1}{(1 - \gamma)^2 \sqrt{T}} + \varepsilon \sqrt{d} \right)$. Unlike gradient descent-ascent and other recent methods, \Alg/ is robust to the choice of hyper-parameters. Via experiments on tabular and OpenAI gym tasks, we demonstrate the superior performance and robustness of \Alg/.
%!TEX root =  main.tex
\section{Introduction}
\label{sec:Introduction}
% why is it important
Popular reinforcement learning (RL) algorithms focus on optimizing an unconstrained objective, and have found applications in games such as Atari~\citep{mnih2015human} or Go~\citep{silver2016mastering}, robot manipulation tasks~\citep{tan2018sim,zeng2020tossingbot} or clinical trials~\citep{schaefer2005modeling}. However, many applications require the planning agent to satisfy constraints -- for example, in wireless sensor networks~\citep{buratti2009overview, julian2002qos} there is a constraint on the average power consumption of a deployed policy. Similarly, in safe RL, the policy is constrained to only visit certain states while exploring in physical systems~\citep{moldovan2012safe,ono2015chance,fisac2018general}. The constrained Markov decision process (CMDP)~\citep{altman1999constrained} is a natural framework to model long-term constraints that need to be satisfied by a policy. The typical objective for CMDPs is to maximize the cumulative reward (similar to unconstrained MDPs), while (approximately) satisfying the constraint.

We focus on a well-studied problem in CMDPs -- return an approximately feasible policy (that is allowed to violate the constraints by a small amount), while (approximately) maximizing the cumulative reward. The past literature on this topic considered two approaches. The first approach is \emph{primal-only algorithms}, where constraints are (approximately) enforced without directly relying on introducing a Lagrangian formulation~\citep{achiam2017constrained, chow2018lyapunov,dalal2018safe,liu2020ipo,xu2021crpo}.
Of these methods, only the recent work of~\citet{xu2021crpo} guarantees global convergence to the optimal feasible policy in both the tabular and function approximation settings. 

The second approach in CMDPs is to form the Lagrangian, and solve the resulting saddle-point problem using \emph{primal-dual algorithms}~\citep{altman1999constrained,borkar2005actor,bhatnagar2012online,borkar2014risk,tessler2018reward,liang2018accelerated, paternain2019constrained, yu2019convergent, ding2021provably, ding2020natural, stooke2020responsive}. Such approaches update both the policy parameters (primal variables) and the Lagrange multipliers (dual variables). Of these methods,~\citet{tessler2018reward} prove a local convergence guarantee, while~\citet{paternain2019constrained} prove that their proposed algorithm will converge to a neighbourhood of the optimal policy. More recently, \citet{ding2020natural} proposed to use natural policy gradient updates~\citep{kakade2001natural} for changing the policy parameters
while using gradient descent to update the dual variables. They prove that this primal-dual algorithm converges to the optimal policy in both the tabular and the function approximation settings.   

Although there is no lack of algorithms designed for CMDPs, \emph{these algorithms are often highly sensitive to the choice of their hyperparameters}. For example,~\cref{fig:sensitivity-intro} demonstrates the effect of varying the hyperparameters for two provably efficient algorithms, the primal-dual natural-policy ascent, gradient descent method (in short, GDA) of \citet{ding2020natural} and the primal-only CRPO method of~\citet{xu2021crpo} on a synthetic tabular environment.
\begin{figure}[t]
		\begin{subfigure}[b]{0.49\linewidth}
			\centering
			\captionsetup{justification=centering}
			\includegraphics[width=0.98\textwidth]{figs/OG_eps_0.25.png}
			\caption[]{\small{Optimality gap\\ (OG)}}
		\end{subfigure}
		\begin{subfigure}[b]{0.49\linewidth}
			\centering
			\captionsetup{justification=centering}
			\includegraphics[width=\textwidth]{figs/CV_eps_0.25.png}
			\caption[]{\small{Constraint violation (CV)}}
		\end{subfigure}
		\caption[]
        {\textbf{Hyperparameter sensitivity:} Optimality gap and constraint violation (averaged across $5$ runs) for different hyperparameters for \textbf{\cGDA/}~\citep{ding2020natural}, \textbf{\cCRPO/}~\citep{xu2021crpo}
        and the proposed algorithm \textbf{\cAlg/} on a gridworld environment with access to the true CMDP. The dark lines show the performance of the best hyperparameter, while the lighter-shade lines represent results while using other hyperparameters. Both GDA and CRPO exhibit large variations 
        in their performance, while \Alg/ is more robust. See~\cref{sec:Experiments} for details.}          
        \label{fig:sensitivity-intro}
\end{figure}
While one can find hyperparameters that control the worst-case performance of either GDA or CRPO, 
such choices result in a poor empirical performance on individual instances, a feature that GDA and CRPO share with
%the hyperparameter that allow for this make these algorithms too conservative in that 
% price paid on indivi
%In fact, for GDA and CRPO it is hard to find a single hyperparameter that makes them perform reasonably well across a range of environments: The penalty for choosing a hyper
%We note that such sensitivity to hyperparameter tuning is also prevalent in 
unconstrained MDP policy optimization algorithms, such as Politex~\citep{abbasi2019politex}, 
or natural policy gradient~\citep{kakade2001natural}.
%Furthermore, hyperparameters recommended according to the theoretical worst-case analysis have poor empirical performance (see~\cref{fig:CB_GDA_best_param_MB} in~\cref{app:experiments} for example). Hence in order to obtain reasonably good performance on a new environment, the algorithm hyperparameters need to be tuned from scratch, incurring a significant computational overhead. 

\paragraph{CONTRIBUTIONS:} \textit{Designing robust policy optimization algorithms that require minimal hyperparameter tuning is our main motivation, and towards this, we make the following contributions.}
 % \todoc{Is this the same type of sensitivity? I mean, at least for Politex, you can find a single parameter value which controls worst-case performance. A worse form of parameter sensitivity is when no such single parameter exist. 
% }

\textbf{Generic Primal-Dual Framework}: In~\cref{sec:framework}, we cast the problem of planning in discounted infinite horizon CMDPs to a generic primal-dual framework. In particular, we prove that any algorithm that can control (i) the primal and dual regret for specific online linear optimization problems and (ii) the errors due to function approximation and sampling, will (approximately) maximize the cumulative discounted reward while (approximately) minimizing the constraint violation (\cref{thm:generic-bound}). Importantly, this result holds for any CMDP and is independent of how the policies or value functions are represented.

\textbf{Instantiating the Framework}: In~\cref{sec:framework-instantiation}, we instantiate the framework using two algorithms from the online linear optimization literature -- Gradient Descent Ascent (GDA) (\cref{sec:gda}) and Coin-Betting (CB) (\cref{sec:cb}). While GDA requires setting the hyperparameters to specific problem-dependent constants, CB is more robust to hyperparameter tuning (see \cref{fig:sensitivity-intro}). In the simpler tabular setting, the approximation errors can be easily controlled and we use~\cref{thm:generic-bound} in conjunction with existing regret bounds to prove that the average optimality gap (difference in the cumulative reward of achieved policy and the optimal policy) and the average constraint violation decrease at an $O\left(\nicefrac{1}{\sqrt{T}}\right)$ rate (\cref{cor:gda,cor:cb}). 

\textbf{Handling Linear Function Approximation}: In~\cref{sec:putting-together}, we assume global access to a $d$-dimensional feature map $\Phi : \cS \times \cA \rightarrow \R^d$, and that the action-value functions for any policy are $\epsb$-close to the span of these features. With this assumption, we prove that it is possible to control the approximation errors for each state-action pair. Subsequently, in~\cref{sec:lfa-algorithm}, we use the robust coin-betting algorithms to instantiate the primal-dual framework in the linear function approximation setting and propose the \emph{Coin-Betting Politex} (\Alg/) algorithm. Ignoring sampling errors, in~\cref{sec:linear-bound}, we prove that the average optimality gap for \Alg/ scales as $O \left(\frac{1}{(1 - \gamma)^3 \, \sqrt{T}} + \frac{\epsb \sqrt{d}}{(1 - \gamma)^2} \right)$, while the average constraint violation is $O \left(\frac{1}{(1 - \gamma)^2 \, \sqrt{T}} + \frac{\epsb \sqrt{d}}{(1 - \gamma)} \right)$. With linear function approximation, the average constraint violation for the algorithm of~\citet{ding2020natural} decreases at a worse $O\left(\nicefrac{1}{T^{1/4}} \right)$ rate. On the other hand, the CRPO algorithm of~\citet{xu2021crpo} results in an $O\left(\nicefrac{1}{\sqrt{T}} \right)$ bound for both the average suboptimality and constraint violation. However, both algorithms can amplify the function approximation errors to large, potentially unbounded values. Importantly, both algorithms require knowledge of typically unknown quantities, which impedes their practical use.

\textbf{Experimental Evaluation}: In~\cref{sec:Experiments}, we first describe some practical considerations when implementing \Alg/. We then evaluate \Alg/ and compare its empirical performance to the algorithms of~\citet{ding2020natural,xu2021crpo}. Our experiments on a synthetic tabular environment and the Cartpole environment with linear function approximation demonstrate the consistent effectiveness and robustness of \Alg/.

% DUMP
%  in requires knowing the Slater constant for the CMDP, while~\citet{xu2021crpo} requires knowledge of $\text{KL}(\pi_0 || \piopt)$ where $\piopt$ and $\pi_0$ are respectively the optimal and initial policies. Both these quantities are 

% of $O\left(\nicefrac{1}{T^{1/4}} + \frac{\epsb}{(1 - \gamma)^2} \norminf{\frac{d^\pi{^*}}{\rho}} \right)$. Here, $d^{\piopt}$ is the distribution over states induced by the optimal policy, $\rho$ is the initial state distribution, and hence the $\norminf{\frac{d^\pi{^*}}{\rho}}$ term can be potentially very large (even infinite).  from the dependency on $\norminf{\frac{d^\pi{^*}}{\rho}}$. 
%!TEX root =  main.tex
\section{Problem Formulation}
\label{sec:problem-formulation}
We consider an infinite-horizon discounted constrained Markov decision process (CMDP)~\citep{altman1999constrained} defined by the tuple $\langle \cS, \cA, \cP, r, c, b, \rho, \gamma \rangle$ where $\cS$ is the countable set of states, $\cA$ is the countable action set, $\cP : \cS \times \cA \rightarrow \Delta_\cS$ is the transition probability function, $\Delta_\cS$ is the $\cS$-dimensional probability simplex, $\rho \in \Delta_{\cS}$ is the initial distribution of states and $\gamma \in [0, 1)$ is the discount factor. The primary reward to be maximized is denoted by $r : \cS \times \cA \rightarrow [0,1]$. For each state $s$, we define the reward value function w.r.t.\ the policy $\pi : \cS \rightarrow \Delta_{\cA}$ as $V_r^\pi(s)=\mathbb{E}_{\pi,\cP} \Big[\sum_{t=0}^\infty \gamma^t r(s_t, a_t)| s_0=s\Big]$ where $a_t\sim\pi(\cdot| s_t)$, $s_{t+1}\sim\cP(\cdot | s_t, a_t)$, and $\Delta_{\cA}$ is the $\cA$-dimensional probability simplex. The expected discounted return or \emph{reward value} of a policy $\pi$ is defined as $\reward{\pi} = \mathbb{E}_{s_0\sim \rho} \Big[V_r^\pi(s_0)\Big]$. Similarly, the constraint reward is denoted by $c: \cS \times \cA \rightarrow [0,1]$ and the \emph{constraint reward value} for $\pi$ by $\const{\pi}$.
For each $(s,a)$ under policy $\pi$, the reward action-value function is defined as $\rewardq{\pi}: \cS \times \cA \rightarrow \R$ s.t. $\rewardq{\pi}(s,a) = r(s,a)+\gamma\E_{s'\sim\cP(\cdot|s,a)}[V_{r}^{\pi}(s')]$ and satisfies the relation: $V_r^\pi(s) = \langle \pi(\cdot | s), \rewardq{\pi}(s,\cdot) \rangle=\E_{a\sim \pi(\cdot|s)}[\rewardq{\pi}(s,a)]$. 
% \todoc{there is something wrong with $\reward{\pi}(s)$. I guess macros.. But, also,
% $V_r^\pi(s)$ is not defined. And is the above relationship uniquely defining $Q_r^\pi$?
% And the sentence is weird because it seems that the main thing is that $Q_r^\pi$ is any function mapping 
% state-action functions: ``the reward action-value function is defined as $\rewardq{\pi}: \cS \times \cA \rightarrow \R$''.
% }
We define $\constq{\pi}$ analogously. The agent's objective is to return a policy $\pi$ that maximizes $\reward{\pi}$, while ensuring that $\const{\pi} \geq b$. Formally,  
\begin{align}
\max_{\pi} \reward{\pi} \quad \text{s.t.} \quad  \const{\pi} \geq b.
\label{eq:objective}
\end{align}
Throughout, we will assume the existence of a feasible policy (i.e., one with $ \const{\pi} \geq b$), and denote the optimal feasible policy by $\piopt$. 
Due to sampling and other errors, we will aim to find a policy $\pi$ such that, for some $\epsilon>0$,
%In this work, we will consider an easier problem with a relaxed feasibility requirement. In particular, given a target error $\epsilon$, we aim to return a policy $\pi$ such that,
\begin{align}
\reward{\pi} & \geq \reward{\piopt} - \epsilon  \quad \text{s.t.} \quad  \const{\pi} \geq b - \epsilon.
\label{eq:relaxed-objective}
\end{align}
In the next section, we specify a generic primal-dual framework solving the problem in~\cref{eq:relaxed-objective}. 


% We assume that there exists a feasible policy $\pitil$ for~\cref{eq:objective}, and define the Slater constant $\zeta := \const{\pitil} - b$. The Slater constant determines the complexity of solving~\cref{eq:objective}~\citep{ding2020natural}. 
%!TEX root =  main.tex
\section{Primal-Dual Framework}
\label{sec:framework}
By Lagrangian duality, $\pi^*$ is a solution to \cref{eq:objective} if and only if for some $\lambda^*\ge 0$,
$(\pi^*,\lambda^*)$ solves the saddle-point problem  
\vspace{-2ex}
\begin{align}
\max_{\pi} \min_{\lambda \geq 0} \reward{\pi} + \lambda [\const{\pi} - b]\,.  
\label{eq:objective-saddle}
\end{align}
Here, $\lambda \in \R$ is the Lagrange multiplier for the constraint.

We will solve the above primal-dual saddle-point problem iteratively, by alternately updating the policy (primal variable) and the Lagrange multiplier (dual variable). If $T$ is the total number of iterations, we define $\pit$ and $\lambdat$ to be the primal and dual iterates for $t \in [T]:=\{1,\dots,T\}$. Updating the $(\pit, \lambdat)$ variables will require estimating the action-value functions. We define $\rewardqhat{t} := \rewardqhat{\pit}$ and $\constqhat{t} := \constqhat{\pit}$ as the \emph{estimated} action-value functions corresponding to the policy $\pit$. We also define $\hat{V}_c^{\pi}(s) := \langle \pi(\cdot|s),\constqhat{t}(s,\cdot)\rangle$, $\reward{t} := \reward{\pit}$ and $\const{t}:=\const{\pit}$.
In this section, we assume that  $\norm{\rewardq{t} - \rewardqhat{t}}_{\infty} \leq \tvarepsilon$ and $\norm{\constq{t} - \constqhat{t}}_{\infty} \leq \tvarepsilon$. 

Given a generic primal-dual algorithm, our task is to characterize its performance in terms of its cumulative reward and constraint violation. Specifically, for a sequence of policies $\{\pi_0, \pi_1, \ldots, \pi_{T-1}\}$ and Lagrange multipliers $\{\lambda_0, \lambda_1, \ldots, \lambda_{T-1}\}$ generated by an algorithm, we define the \emph{average optimality gap} (\texttt{OG}) and the \emph{average constraint violation} (\texttt{CV}) as,
\vspace{-2ex}
\begin{align*}
&\text{Avg. optimality gap (\texttt{OG})} := \frac{1}{T} \sum_{t = 0}^{T - 1} [\reward{\piopt} - \reward{t}] \\
&\text{Avg. constraint violation (\texttt{CV})} := \frac{1}{T} \left[\sum_{t = 0}^{T-1} \left(b - \const{t}\right) \right]_{+}
\end{align*}
where $[x]_{+} = \max\{x,0\}$. For this algorithm, we define the \emph{primal regret} and \emph{dual regret} as follows:
\vspace{-2ex}
\begin{align}
\cR^{p}(\pi^*, T)  &:= \sum_{t = 0}^{T-1}  \big\langle \piopt(\cdot|s) - \pit(\cdot|s),  \\ & \quad\quad\quad \rewardqhat{t}(s,\cdot) + \lambdat \constqhat{t}(s,\cdot) \big\rangle_{s\sim \nu_{\rho,\piopt}}, \nonumber \\
 \cR^{d}(\lambda, T) &:= \sum_{t = 0}^{T-1} ( \lambdat - \lambda)\, ( \consthat{t} - b)\,.\label{eq:regret}   
\end{align}
Here, $\langle f,g \rangle_{s\sim \nu_{\rho,\piopt}} = \mathbb{E}_{s\sim \nu_{\rho,\piopt}} [f(s)g(s) ]$ 
and $\nu_{\rho,\piopt}$ is the discounted occupation measure induced by following $\piopt$ from $\rho$ normalized so that it becomes a probability measure.
Observe that the above quantities correspond to the regret for online linear optimization algorithms that can independently update the primal and dual variables.\footnote{Note: We are computing the primal regret with respect to the optimal policy $\pi^*$ that satisfies the constraints. $\cR^{p}(\pi^*, T)$ can be negative since the current policy $\pi_t$ can violate the constraints and obtain higher rewards.} Our main result in this section (proved in~\cref{app:proofs}) characterizes the performance of a generic algorithm in terms of its primal and dual regret.
\begin{restatable}{theorem}{generic}
Assuming that $\norm{\rewardq{t} - \rewardqhat{t}}_{\infty} \leq \tvarepsilon$ and $\norm{\constq{t} - \constqhat{t}}_{\infty} \leq \tvarepsilon$, for a generic algorithm producing a sequence of policies $\{\pi_0, \pi_1, \ldots, \pi_{T-1}\}$ and dual variables $\{\lambda_0, \lambda_1, \ldots, \lambda_{T-1}\}$ such that for all $t$, $\lambdat$ is constrained to lie in the interval $[0, U]$ where $U > \lambda^*$, \texttt{OG} and \texttt{CV} can be bounded as:
\begin{align*}
\texttt{\text{OG}} & \leq \frac{\cR^{p}(\pi^*, T) + (1 - \gamma) \cR^{d}(0, T)}{(1 - \gamma) T} + \tvarepsilon \, g(U),
\end{align*}
\begin{align*}
\texttt{\text{CV}} &\leq \frac{\cR^{p}(\pi^*, T) + (1 - \gamma) \cR^{d}(U, T)}{(U - \lambda^*) (1 - \gamma) T} + \frac{\tvarepsilon \, g(U)}{(U - \lambda^*)} ,
\end{align*}
where $g(U) := \left[\frac{1 + U}{1 - \gamma} + U \right]$.
\label{thm:generic-bound}
\end{restatable}
We note that such a general primal-dual regret decomposition for convex MDPs (including CMDPs) was recently done by~\citet{zahavy2021reward}. However, they handle the tabular setting where the primal variables correspond to state-action occupancy measures, whereas, the above result defines the primal variables to be the policy parameters. More importantly, our result does not require any assumption about the underlying CMDP. In the unconstrained setting, reducing the policy optimization problem to that of online linear optimization has been previously explored in the \textit{Politex} algorithm~\citep{abbasi2019politex}, and we build upon this work. Politex is an iterative policy optimization algorithm where the policy at each timestep is proportional to the softmax over the sum of all the action-value functions seen in the past. This algorithm bounds the optimality gap in terms of approximation error in the action-value functions and a regret term similar to online linear optimization.

In order to bound the average reward optimality gap and the average constraint violation, we need to (i) project the dual variables onto the $[0, U]$ interval and ensure that $U > \lambda^*$, (ii) update the primal and dual variables to control the respective regret in~\cref{eq:regret}, and (iii) control the approximation error $\tvarepsilon$. Next, we use this recipe to design algorithms with provable guarantees.

\section{Instantiating the framework}
\label{sec:framework-instantiation}
In this section, we will instantiate the primal-dual framework by using the above technique -- specifying the value of $U$ in~\cref{sec:u-specification} and describing algorithms that control the primal and dual regret in~\cref{sec:algorithms}.

\subsection{Upper-bound for dual variables}
\label{sec:u-specification}
In~\cref{app:proofs}, we prove the following upper-bound on the optimal dual variable:

\begin{restatable}{lemma}{sd}
The objective~\cref{eq:objective} satisfies strong duality, and the optimal dual variables are bounded as $\lambda^* \leq \frac{1}{\zeta (1 - \gamma)}$, where $\zeta := \max_{\pi} \const{\pi} - b > 0$. 
\label{lemma:sd}
\end{restatable}
Unlike~\citet{ding2020natural,ding2021provably} who bound the dual variables in terms of the unknown Slater constant, the upper-bound from~\cref{lemma:sd} can be computed by maximizing the constraint value function as an unconstrained problem. Throughout, we will set $U = \frac{2}{\zeta \, (1- \gamma)}$, which satisfies the requirement $U>\lambda^*$, and project the dual variables onto the $[0, U]$ range (see~\cref{sec:framework}). 

\subsection{Controlling the primal and dual regret}
\label{sec:algorithms}
In this section, we specify two algorithms to update the primal and dual variables, and control the primal and dual regret respectively. In particular, in~\cref{sec:gda}, we will use mirror ascent to update the primal variables, and gradient descent to update the dual variables. Inspired by the literature on online linear optimization~\citep{orabona2016coin}, we will use robust, parameter-free algorithms to update the primal and dual variables in~\cref{sec:cb}.  

% \vspace{-2ex}
\subsubsection{Gradient descent ascent}
\label{sec:gda}
At iteration $t \in [T]$, if the primal and dual iterates are $\pit$ and $\lambdat$ respectively, given $\rewardqhat{t}$ and $\constqhat{t}$, the gradient descent ascent (GDA) update \todoc{a bit misleading name; more like mirror ascent, gradient descent} can be written as follows: if $\lagqhat{t}(s,a) = \rewardqhat{t}(s,a) + \lambda_{t} \, \constqhat{t}(s,a)$ and $\consthat{t} = \sum_{s \in \cS} \rho(s) \sum_{a \in \cA} \pit(a | s) \constqhat{t}(s,a)$, then, 
\begin{align}
\pitt(a|s) & = \frac{\pit(a|s) \exp\left(\eta_1 \lagqhat{t}(s,a) \right)}{\sum_{a'}\pit(a'|s)\exp\left(\eta_1 \lagqhat{t}(s,a') \right)} \label{eq:gda-primal} \\
\lambdatt & = \PP_{\left[0, U \right]}[\lambdat - \eta_2 \, (\consthat{t} - b)] \label{eq:gda-dual}.
\end{align}
Here $\PP_{[a,b]}$ is a projection onto the $[a,b]$ interval, and $\eta_1$ and $\eta_2$ are the step-size parameters for the primal and dual updates respectively. In the tabular setting, the resulting algorithm is the same as that analyzed by~\citet{ding2020natural}. 

Analyzing the primal and dual regret for the above updates is fairly standard in online linear optimization. Using results from the paper of~\citet[Theorem 6.8]{orabona2019modern}, by setting $\eta_1 = \sqrt{\frac{2 \log |\cA|}{t}} \, \frac{1 - \gamma}{1 + U}$, $\eta_2 = \frac{U (1 - \gamma)}{\sqrt{t}}$ and $U = \frac{2}{\zeta \, (1- \gamma)}$,
we get
 $\cR^{p}(\pi^*,T) \leq  \frac{1 + U}{1 - \gamma} \sqrt{2 \log |\cA|} \sqrt{T}$ 
 and $\cR^{d}(\lambda,T) \leq \frac{U}{1 - \gamma} \sqrt{T}$. Observe that both the primal and dual regret scale as $O(\sqrt{T})$, and using~\cref{thm:generic-bound}, both the average optimality gap and constraint violation will decrease at an $O(\nicefrac{1}{\sqrt{T}})$ rate. 

We also note that obtaining the above bounds requires setting the two step-sizes ($\eta_1$ and $\eta_2$) to specific values that depend on problem-dependent parameters. \todoc{this does not seem to be the case.. at least the above values are instance independent.}
In~\cref{fig:sensitivity-intro}, we have seen that GDA is quite sensitive to the values of $\eta_1$ and $\eta_2$, even in the simple tabular setting. In order to alleviate this, we use the recent progress in online linear optimization, and propose robust algorithms in the next section. 

\subsubsection{Coin-betting}
\label{sec:cb}
\citet{orabona2016coin} and \citet{orabona2017training} propose \emph{coin-betting} algorithms that reduce the online linear optimization problems in~\cref{eq:regret} to online betting. Unlike adaptive gradient methods like AdaGrad~\citep{duchi2011adaptive} or Adam~\citep{kingma2014adam} that require setting the initial step-size, \textit{coin-betting algorithms are completely parameter-free}. 
In this work, we will directly instantiate the regret-minimization algorithms from these works. We first provide some intuition for the coin-betting algorithms. 

\textbf{Coin-betting algorithms}:~\citet{orabona2016coin} shows that the online linear optimization can be viewed as a problem of placing repeated bets (denoted by $x_t$) in round $t$ on the outcomes of unknown adversarial coin flips (denoted by $c_t$). The outcomes of the coin are either heads or tails meaning that $c_t \in \{-1, +1\}$. With our bet $x_t$, we earn an amount $x_t c_t$ in round $t$. Starting with an initial wealth of $\epsilon_0$, at round $t$ we place bets with a fraction (denoted by $\beta_t$) of the remaining wealth on either heads or tails where $x_t$ becomes $
x_t = \beta_t \, \left(\epsilon_0 + \sum_{i=1}^{t-1} x_i c_i\right)$. Our goal is to maximize the wealth generated from this process. The coin-betting strategy uses the KT estimator~\citep{krichevsky1981performance}, which bets a $\beta_t = \frac{\sum_{i=1}^{t-1}c_i}{t}$ fraction of the current wealth on the most common outcome observed until time $t$.~\citet{orabona2016coin} connects the problem of maximizing wealth to the problem of minimizing the regret in the online linear optimization setting. In particular, the authors view the outcome of the coin ($c_t$) as the negative of the subgradient of the losses (denoted by $g_t$) on the current prediction and $x_t$ as our response at round $t$. Using this reduction, we get a parameter-free 1-d online linear optimization algorithm where we predict 
\[
x_t = \left(-\frac{\sum_{i=1}^{t-1}g_i}{t}\right) \, \left(\epsilon_0 - \sum_{i=1}^{t-1} x_i g_i\right)
\] 
in round $t$. This approach can be further extended to the $d$-dimensional and Learning with Expert Advice (simplex) settings~\citep{orabona2016coin}. For the problem at hand, the iterates $x_t$ correspond to either the policy $\pit$ for the primal problem or the Lagrange multipliers $\lambdat$ for the dual problem.

We now instantiate the algorithm of~\citet{orabona2016coin} for updating the policy (primal variables) in the CMDP setting. In order to do this, we define additional variables $w_{t}$ for each $(s,a)$ pair and iteration $t$. These variables will be computed recursively, and used to compute the policy $\pitt$ at iteration $t$. In particular, for $t \geq 1$,
\vspace{-3ex}
\begin{align}
& w_{t+1}(s,a) = \frac{\sum_{i=0}^{t} \lagatilde{i}(s,a)}{(t+1) + T/2} \left(1 + \sum_{i=0}^{t} \lagatilde{i}(s,a) \, w_{i}(s,a) \right)  \nonumber \\
& \pitt(a|s) = 
\begin{cases}
    \pi_0(a|s), \, \text{if } \sum_{a}{\pi_0(a|s) \, [w_{t+1}(s,a)]_{+}} = 0 \\
    \frac{\pi_0(a|s) \, [w_{t+1}(s,a)]_{+}}{\sum_{a'}{\pi_0(a'|s) \, [w_{t+1}(s,a')]_{+}}}, \, \text{otherwise} 
    \label{eq:cb-primal}
\end{cases}
\end{align}
where, given $\pit$, $\lagatilde{t}(s,a)$ is equal to \[\lagahat{t}(s,a) \, \cI\{w_t(s,a) > 0\} + [\lagahat{t}(s,a)]_{+} \, \cI\{w_t(s,a) \leq 0\}\] and $\lagahat{t}(s,a) = \frac{1 - \gamma}{1 + U} \, \left[\lagqhat{t}(s,a) - \left \langle \lagqhat{t}(s,\cdot), \pit(\cdot|s) \right \rangle \right]$. Here, $\cI\{\omega\}$ is the indicator function, which takes the value $1$ when the condition $\omega$ is satisfied and $0$ otherwise. For the above calculation, we use the normalized (by $\frac{1 + U}{1 - \gamma}$) action-value functions that are guaranteed to lie in the $[0,1]$ range. The quantity $\lagahat{t}(s,a)$ can be interpreted as the (normalized) advantage function for policy $\pit$ in the unconstrained MDP with rewards equal to $r(s,a) + \lambdat \, c(s,a)$. Observe that the above update does not have any tunable hyperparameters. 

Similarly, we use the coin-betting algorithm of~\citet{orabona2017training} to update the Lagrange multipliers, instantiating it in the CMDP setting: for $t \geq 1$, if $\sigma(x) := \frac{1}{1 + \exp(-x)}$, then, 
\vspace{-2ex}
\begin{align}
\lambdatt &= \lambda_0 - \beta_{t} \left[\frac{1}{1 - \gamma} - \sum_{i = 0}^{t} \left(\lambda_i - \lambda_0 \right) \, (\consthat{\pi_i} - b) \right],  \nonumber \\
\beta_{t} & = (1 - \gamma) \, \left( 
2 \sigma \left(\frac{2 \sum_{i = 0}^{t} (\consthat{\pi_i} - b)}{\frac{1}{1 - \gamma} + \sum_{i = 0}^{t} \vert \consthat{\pi_i} - b \vert} \right) - 1 \right). \label{eq:cb-dual}
\end{align}
Similar to the primal update, the dual update uses normalized (by $\nicefrac{1}{1 - \gamma}$) value functions that lie in the $[-1,1]$ range, and does not have any tunable parameter. Importantly, these updates result in \emph{no-regret} algorithms meaning that both the primal and dual regret scale as $o(T)$. Specifically, for the primal updates in~\cref{eq:cb-primal} and the dual updates in~\cref{eq:cb-dual}, the results of~\citet{orabona2017training} imply that
\small{
\begin{align*}
\cR^{p}(\pi^*,T) & \leq \frac{3 (1 + U)}{1 - \gamma} \sqrt{T} \sqrt{1 + \text{KL}(\pi_0 || \pi^*)}, \\
\cR^{d}(\lambda,T) & \leq \frac{1}{1 - \gamma} + \norm{\lambda-\lambda_0} \sqrt{\left(\frac{1}{(1 - \gamma)^2}+\frac{G_T}{1 - \gamma}\right)
\Gamma_T},
\end{align*}
}
\normalsize
where $\text{KL}(\pi_0||\pi^*) =\mathbb{E}_{s\sim \nu_{\rho,\piopt}} \text{KL}(\pi_0(\cdot|s)||\piopt(\cdot|s))$,
$\Gamma_T=\log\left(1 + (G_T \, (1-\gamma)+1)^2\norm{\lambda-\lambda_0}^2 \right)$ and $G_T = \sum_{i = 0}^{T} \vert \consthat{\pi_i} - b \vert = O(T)$. Since both regrets scale as $O(\sqrt{T})$ in the worst case, using the coin-betting updates will also result in an $O(\nicefrac{1}{\sqrt{T}})$ decrease in both the average optimality gap and constraint violation. Unlike the updates in~\cref{sec:gda}, the coin-betting updates do not require tuning a hyperparameter.

If we can control the approximation errors, we can use the above algorithms to completely instantiate the primal-dual framework. In~\cref{app:tabular}, we do this for the simpler tabular setting, and consider the linear function approximation setting in the next section. 

\section{Putting everything together}
\label{sec:putting-together}
In this section, we will bound the approximation errors in the linear function approximation setting and instantiate the above framework. 

In order to scale to large state-action spaces, we consider the special case of linear function approximation and assume \emph{global} access to a $d$-dimensional feature map $\Phi : \cS \times \cA \rightarrow \R^d$. Given $\Phi$, we make the following (approximate) realizability assumption on action-value functions~\citep{abbasi2019politex}.  
\begin{assumption}[Linear function approximation]
With global access to the feature map $\Phi$, the action-value functions for each memoryless policy $\pi$ are $\epsb$-close to the span of the state-action features i.e. 
\begin{align*}
\inf_{\theta \in \R^d} \max_{(s,a)} \vert \rewardq{\pi}(s,a) - \langle \theta, \phi(s,a) \rangle \vert & \leq \epsb,
\\
\inf_{\theta \in \R^d} \max_{(s,a)} \vert \constq{\pi}(s,a) - \langle \theta, \phi(s,a) \rangle \vert & \leq \epsb.
\end{align*}
\label{assum:linear-realizability}
\end{assumption}
\vspace{-3ex}
This setting subsumes the tabular case which can be recovered (with $\epsb = 0$) when $d = |\cS | \, |\cA |$, and the feature map consists of one-hot vectors for each state-action pair. Given a good estimate of $\theta_r^\pi := \argmin_{\theta \in \R^d} \left[\max_{(s,a)} \vert \rewardq{\pi}(s,a) - \langle \theta, \phi(s,a) \rangle \vert \right]$, we can easily estimate the action-value functions for every $(s,a)$ pair as $\rewardq{\pi}(s,a) \approx \langle \theta_r^\pi, \phi(s,a) \rangle$. A naive way to estimate $\theta_r^\pi$ is to form a subset $\cC \subseteq \cS \times \cA$ of $(s,a)$ pairs, and roll out $m$ independent trajectories using policy $\pi$, starting from each $(s,a) \in \cC$. The average (across trajectories) cumulative discounted return is an unbiased estimate $Q_r(s,a)$ of the action-value function. Let $Q_r$ be the $|\cC|$-dimensional vector of estimated action-value functions. For a fixed set of weights $\omega$ s.t. $\omega(s,a) \geq 0$ and $\sum_{(s,a) \in \cC} \omega(s,a) = 1$, we use the weighted least-squares estimate with $z := (s,a)$,
\begin{align}
\thetahat^\pi_r &= \argmin_{\theta} \sum_{z \in \cC} \omega(z) \left[ \langle \theta, \phi(z) \rangle - Q_r(z) \right]^2.
\label{eq:lspg}
\end{align}
For the $(s,a) \in \cC$, the sampling error is $O(\nicefrac{1}{\sqrt{m}})$ by using Hoeffding's inequality. For the $(s,a) \notin \cC$, we can then use the resulting $\thetahat^\pi_r$ to estimate $\rewardqhat{\pi}$ as $\rewardqhat{\pi}(s,a) = \langle \thetahat^\pi_r, \phi(s,a) \rangle$. In~\cref{app:proofs}, we prove the following result to bound the extrapolation errors for all $(s,a)$.  
\begin{restatable}{lemma}{faextr}
For policy $\pi$, any distribution $\omega$ and subset $\cC$, if we use $m$ trajectories to estimate the action-value function for each $(s,a) \in \cC$, and solve~\cref{eq:lspg} to compute $\thetahat_r^\pi$, then for any $(s,a) \in (\cS \times \cA)$ pair, the error $\vert \langle \phi(s,a), \thetahat_r^\pi \rangle - \rewardq{\pi}(s,a) \vert$ can be upper-bounded by   
\begin{align*}
\epsb (1+ \indnorm{\phi(s,a)}{G_{\omega}^{\dagger}}) \nonumber + \frac{\indnorm{\phi(s,a)}{G_{\omega}^{\dagger}}}{1 - \gamma} \, \sqrt{\frac{\log(2 |\cC| /\delta)}{2 m}},
\end{align*}
where $G_{\omega} = \sum_{(s,a) \in \cC} \omega(s,a) \phi(s,a) \phi(s,a)^{\transpose}$ and $A^\dagger$ is the pseudoinverse of $A$.
\label{lemma:lspe-extrapolation}
\end{restatable}
Hence, the extrapolation errors can be upper-bounded by choosing $\cC$ and $\omega$ to control the $\indnorm{\phi(s,a)}{G_{\omega}^{\dagger}}$ term for each $(s,a)$ pair. Moreover, to ensure scalability, we want the size of $\cC$ to be independent of $|\cS||\cA|$. Fortunately, the Kiefer-Wolfowitz theorem~\citep{kiefer1960equivalence} guarantees the existence of a \emph{coreset} $\cC$ s.t. $|\cC| \leq \frac{d (d+1)}{2}$ and a distribution $\omega$ that ensure $\sup_{(s,a)} \indnorm{\phi(s,a)}{G_{\omega}^{\dagger}} \leq \sqrt{d}$. If we can find such a $\cC$ and distribution $\omega$, then the error satisfies $\tvarepsilon \leq \epsb (1+\sqrt{d}) + \frac{\sqrt{d}}{1 - \gamma} \sqrt{\frac{\log(2 d (d+1) /\delta)}{2 m}}$. Here, the first term in the error is due to the approximation error ($\epsb$) and the second term is the result of the sampling error (dependent on $m$ trajectories). For our theoretical results, we assume that a coreset $\cC$ and distribution $\omega$ are provided, and in~\cref{app:exp-details}, we describe the G-experimental design procedure to compute them. 

Now that we have control over $\tvarepsilon$, we instantiate the primal-dual framework with coin-betting algorithms. 

% \vspace{-3ex}
\subsection{\Alg/ Algorithm}
\label{sec:lfa-algorithm}
In this section, we use the coin-betting algorithms (\cref{sec:cb}) with linear function approximation to completely specify the Coin-Betting Politex (\Alg/) algorithm (\cref{alg:cbp}). In~\cref{alg:cbp},~Line~$2$ computes the coreset $\cC$ and distribution $\omega$ offline (see~\cref{app:g-experimental-algo} for details). In order to set $U$, the upper-bound on the dual variables, we need to estimate $\zeta$ and this is achieved by solving the unconstrained problem maximizing $\consthat{\pi}$ in~Line~$3$. While this can be done by any algorithm that can solve MDPs with linear function approximation (for example, NPG~\citep{kakade2001natural} or Politex~\citep{abbasi2019politex}), we will use the updates in~\cref{eq:cb-primal} (see~\cref{sec:Experiments}). After Monte-Carlo sampling $\forall (s,a) \in \cC$ (Line~$5$) and estimating $\thetahat_r^{\pit}$ and $\thetahat_c^{\pit}$ according to~\cref{eq:lspg} (Line~$6$), these vectors are used to calculate $\rewardqhat{\pit}$ and $\constqhat{\pit}$ for states encountered in a trajectory generated by policy $\pit$ (Line~$8$). These action-value functions are then used to update the policy at these states. While this can be achieved by any algorithm controlling the primal regret, \Alg/ uses the parameter-free coin-betting updates (Line~$9$). At the end of iteration $t$, in Line~$11$, the dual variables are updated using the coin-betting algorithm.

In the next section, we bound the average optimality gap and constraint violation for \Alg/.
% \vspace{-2ex}
\subsubsection{Theoretical Guarantee}
\label{sec:linear-bound}
We now use~\cref{thm:generic-bound} to bound the average optimality gap and constraint violation for~\cref{alg:cbp}. We note that recent work~\citep{liu2021parameter} uses parameter-free coin-betting algorithms for convex-concave min-max optimization. 
Since the function to be maximized in~\cref{eq:objective-saddle} is non-concave in $\pi$, this work is not directly applicable to our setting.
%Unlike~\cref{eq:objective-saddle}, our proof technique is specific to RL, and 
% \todoc{well, fine for now, but this is yet another thing I don't get when in some representation a problem is not convex and the reader is just told this.. magic?! no... hahaa, fair enough} 
%  \todoc{I am surprised $\epsb $ is not multiplied by $(1+\sqrt{d})$ in the theorem below.}

\begin{restatable}{corollary}{cblfa}
Under~\cref{assum:linear-realizability}, \texttt{OG} and \texttt{CV} of \Alg/ can be bounded as:
\begin{align*}
\texttt{\text{OG}} & \leq \frac{
\left(\frac{3 (1 + U) \, \sqrt{1 + \text{KL}(\pi_0 || \pi^*)}}{1 - \gamma} +  \Psi \right)}{(1 - \gamma) \sqrt{T}}+ \frac{\tvarepsilon (1 + 2 U)}{1 - \gamma}, \\
\texttt{\text{CV}} &\leq  \frac{\zeta \left(\frac{3 (1 + U) \, \sqrt{1 + \text{KL}(\pi_0 || \pi^*)}}{1 - \gamma} +  \Psi \right)}{\sqrt{T}} + \zeta \, \tvarepsilon (1 + 2 U),
\end{align*}
where $U = \frac{2}{\zeta (1 - \gamma)}$, $\tvarepsilon = \epsb (1+\sqrt{d}) + \frac{\sqrt{d}}{1 - \gamma} \sqrt{\frac{\log(2 d (d+1) /\delta)}{2 m}}$ and $\Psi= 4U\sqrt{\log((T+1)U)} + 1$.
\label{cor:cb-lfa}
\end{restatable}  
Since $U = O(\nicefrac{1}{1-\gamma})$, the average optimality gap for \Alg/ is $O \left(\frac{1}{(1 - \gamma)^3 \, \sqrt{T}} + \frac{\tvarepsilon}{(1 - \gamma)^2} \right)$, while the average constraint violation scales as $O \left(\frac{1}{(1 - \gamma)^2 \, \sqrt{T}} + \frac{\tvarepsilon}{1 - \gamma} \right)$. In the function approximation case, ignoring sampling errors,~\citet{ding2020natural} obtain an $O \left(\frac{1}{(1 - \gamma)^3 \sqrt{T}} + \left[\frac{\epsb}{(1 - \gamma)^3} \, \norminf{\frac{d^{\piopt}}{\rho}} \right]^{1/2} \right)$ average optimality gap, and an $O \left(\frac{1}{(1 - \gamma)^2 T^{1/4}} + \left[\frac{\epsb}{(1 - \gamma)^3} \, \norminf{\frac{d^{\piopt}}{\rho}} \right]^{1/4} \right)$ average constraint violation. Here, $d^{\piopt}$ is the distribution over states induced by the optimal policy, and $\rho$ is the initial state distribution. Compared to~\cref{cor:cb-lfa}, the \texttt{CV} decreases at a slower $O\left(\nicefrac{1}{T^{1/4}}\right)$ rate. Comparing the error terms, the bound for~\citet{ding2020natural} depends on the potentially large (even infinite) $\norminf{\frac{d^{\piopt}}{\rho}}$ factor, while forming the coreset ensures that the errors are well controlled for~\cref{cor:cb-lfa}. Furthermore,~\citet{ding2020natural} require knowledge of the typically unknown Slater constant for the CMDP. 

On the other hand,~\citet{xu2021crpo} use a neural function approximation (with 1 hidden layer) where only the first layer is trained. In order to compare to~\cref{cor:cb-lfa}, we set the width of the second layer to $1$ in~Theorem~2 of \citet{xu2021crpo}, making the function approximation equal to a linear mapping with a ReLU non-linearity. In this setting,~\citet[Theorem 4]{xu2021crpo} prove that both the average optimality gap and constraint violation scale as $O \left( \frac{1}{(1 - \gamma) \sqrt{T}} + \frac{\epsb}{(1 - \gamma)^{2.5}} \, \norminf{\frac{d^{\piopt}}{\rho}} \right)$. 
% \todoc{Which theorem is this? Add theorem number here. Also, the last term does not disappear with $T\to\infty$. Is this correct?}
Observe that although both \texttt{OG} and \texttt{CV} decrease at an $O\left(\nicefrac{1}{\sqrt{T}}\right)$ rate, the error amplification also depends on $\norminf{\frac{d^{\piopt}}{\rho}}$. Furthermore, this result requires setting the hyperparameters according to the typically unknown $\text{KL}(\piopt || \pi_0)$ quantity. These problems make the theoretical results of~\citet{ding2020natural} and \citet{xu2021crpo} potentially vacuous, and the algorithms difficult to use.

\begin{algorithm}[!t]
\LinesNumbered
\caption{Coin-Betting Politex}
\label{alg:cbp}
    \textbf{Input}: $\pi_0$ (arbitrary policy initialization), $\lambda_0 \in [0, U]$ (dual variable initialization), $m$ (Number of trajectories), $T$ (Number of iterations), Feature map $\Phi$. \vspace{1ex} 
    
    Compute coreset $\cC$ and distribution $\omega$ \vspace{1ex} 
    
    Solve the unconstrained problem $\max_{\pi} \consthat{\pi}$ to estimate $\zeta$ in~\cref{lemma:sd} and set $U = \frac{2}{\zeta (1 - \gamma)}$.  \vspace{1ex}
    
    \For{$t \leftarrow 0$  \KwTo  $T-1$}{
    For every $(s,a) \in \cC$, use $m$ trajectories starting from $(s,a)$ using policy $\pit$ and estimate the action-value functions $Q_r(s,a)$ and $Q_c(s,a)$. \vspace{1ex}
    
    Compute and store $\thetahat_r^{\pit}$ and $\thetahat_c^{\pit}$ using~\cref{eq:lspg}.  \\

    \For{every $s$ encountered in the trajectory generated by $\pit$, and for every $a$}{\vspace{1ex}
    Compute \label{algline:extrapolation}
    $\rewardqhat{t}(s,a)  = \langle \thetahat^{\pit}_{r}, \phi(s,a) \rangle$; $\constqhat{t}(s,a) = \langle \thetahat^{\pit}_{c}, \phi(s,a) \rangle$ and $\lagqhat{t}(s,a)  = \rewardqhat{t}(s,a) + \lambda_{t} \, \constqhat{t}(s,a)$. \vspace{1ex}
    
    Update $\pitt(a|s)$ using~\cref{eq:cb-primal}. 
    }
    
    Compute $\consthat{\pit}$, update $\lambdatt$ using~\cref{eq:cb-dual}. 
    }
\end{algorithm}


% DUMP
% Their result assumes that the underlying Markov chain is ergodic, and there is no exploration issue (recall that we handle this issue using the Kiefer-Wolfowitz theorem). 

% , while \Alg/ only requires the easily computable $\zeta$ quantity (see~\cref{sec:practical}) for projecting the dual variables. 


%!TEX root =  main.tex
\vspace{-1ex}
\section{Experiments}
\label{sec:Experiments}
In this section, we first describe some practical considerations for implementing \red{\Alg/} and compare with baselines \blue{GDA} and \teal{CRPO} on a synthetic tabular environment and the Cartpole environment with linear function approximation. For the experiments below, we initialized $\pi_0$ to a random policy and $\lambda_0 =1$ in \cref{alg:cbp}. The parameter $m$ affects the error $\tvarepsilon$ and the performance. The code can be found at \url{https://github.com/arushijain94/CoinBettingPolitex}.
\vspace{-2ex}
\subsection{Practical considerations}
\label{sec:practical}
% \vspace{-2ex}
\textbf{Checking feasibility and Estimating $\zeta$:} We use the updates in~\cref{eq:cb-primal} to solve the unconstrained problem maximizing $\consthat{\pi}$, and return policy $\pitil$. If $\consthat{\pitil} < b$, we declare the problem infeasible, whereas, if $\consthat{\pitil} > b$, we estimate $\zeta = \consthat{\pitil} - b$.\footnote{If $\consthat{\pitil} = b$, then we return policy $\pitil$ as the optimal feasible policy in the CMDP.} It is important to note that~\cref{lemma:sd} does not require the exact maximization of $\consthat{\pi}$ to upper-bound $\lambda^*$. Any feasible policy for which $\const{\pi} > b$ can be used to estimate $\zeta$ and upper-bound $\lambda^*$, though the tightest upper-bound is obtained for $\max_{\pi} \const{\pi}$ (see the proof of~\cref{lemma:sd} in~\cref{app:proofs}).   

\textbf{Gradient normalization and practical coin-betting:} Recall that the coin-betting algorithms in~\cref{sec:cb} require normalizing the gradients by $\nicefrac{1 + U}{1 - \gamma}$. Unfortunately, this upper-bound on the gradient norms is quite loose in practice, and directly using the updates~\cref{eq:cb-primal,eq:cb-dual} results in poor empirical performance. Since coin-betting algorithms do not have a step-size that can be scaled to counteract the normalization, this issue needs to be handled differently. In particular, we continue to directly use the updates in~\cref{eq:cb-primal} with the normalization, but use a heuristic, Algorithm 2 of \citealt{orabona2017training}, for updating the dual variable. This heuristic is a way to adaptively normalize the dual gradients (depending on the previously observed values).
For the details, see~\cref{alg:cbp-practical} in \cref{app:practical-cbp-algo}. 
While this heuristic introduces a hyperparameter in the dual updates, our empirical results suggest 
that the resulting coin-betting algorithm is quite robust to the choice of this parameter and so we use this method in our subsequent experiments.
% \vspace{-2ex}
\begin{figure}[h]
		\begin{subfigure}[b]{0.49\linewidth}
			\centering
			\captionsetup{justification=centering}
			\includegraphics[width=.99\textwidth]{plot/tabular/model_based/DiscountFactor/CB_GDA_CRPO/OG.png}
			\caption[]{\small{OG}}
		\end{subfigure}
		\begin{subfigure}[b]{0.49\linewidth}
			\centering
			\captionsetup{justification=centering}
			\includegraphics[width=\textwidth]{plot/tabular/model_based/DiscountFactor/CB_GDA_CRPO/CV.png}
			\caption[]{\small{CV}}
		\end{subfigure}
		\caption[]
        {\textbf{Environment Misspecification in model-based tabular setting with varying $\gamma$:} 
        Assuming access to the true CMDP, we vary the discount factor $\gamma \in \{0.7, 0.8\}$. We use the hyperparameters for the original CMDP with $\gamma=0.9$. \cAlg/ converges faster with a smaller variance as compared to \cGDA/ and \cCRPO/.}
        \label{fig:CB_GDA_discount_MB}
\end{figure}
\subsection{Tabular Setting}
\label{sec:tabular-exps}
We consider a synthetic gridworld environment similar to~\citet[Example 3.5]{sutton2018reinforcement} (see~\cref{app:des-tabular-env} for details) and set the discount factor $\gamma = 0.9$. We first consider a \textbf{model-based setting}, where we have complete knowledge of the CMDP. In~\cref{fig:sensitivity-intro} (in~\cref{sec:Introduction}), we compared the performance of the three algorithms. For each algorithm, the hyperparameter range is described in~\cref{app:experiments} and the \textit{best hyperparameter} corresponds to the least \texttt{OG} while satisfying \texttt{CV} $\in [-0.25,0]$. The key observation is that \textit{CBP is robust to its hyperparameter values}, while GDA and CRPO are sensitive to their hyperparameter values. In~\cref{fig:CB_GDA_best_param_MB} (\cref{app:tab-exp}), we show best performing variants for all methods. In addition, we demonstrate the poor performance of GDA when used with the theoretical step-sizes suggested in~\cref{cor:gda}. Next, we measure the robustness of the algorithms with respect to \textit{environment misspecification} where we vary $\gamma$. In~\cref{fig:CB_GDA_discount_MB}, we observe that \Alg/ has consistently faster convergence with a lower variation in the performance.   

In the \textbf{model-free setting}, from \cref{fig:CB_GDA_MF_TDSampling} we observe the effect of increasing the number of samples in approximating the $Q$ value function on the performance. \Alg/ consistently converges faster than its counterparts in sampling based approaches. In~\cref{app:tab-exp}, we demonstrate the robustness of \Alg/ to hyperparameters (\cref{fig:MF_sampling_1_hot_hyperparam_sensitivity}) and environment misspecification (\cref{fig:MF_sampling_1_hot_gamma}).


% In~\cref{app:tab-exp}, we also consider the \textbf{model-free setting} and approximate the $Q$ functions with sampling. In this case, we demonstrate the consistent superiority of \Alg/ (\cref{fig:CB_GDA_MF_TDSampling}) and its robustness to hyperparameters (\cref{fig:MF_sampling_1_hot_hyperparam_sensitivity}) and environment misspecification (\cref{fig:MF_sampling_1_hot_gamma}).   


\subsection{Linear Setting}\label{sec:linear-exps}
In the following experiments, all the algorithms require $O(d)$ memory to construct $Q$ value functions and have a similar handle on the policy $\pi$. 

\paragraph{Gridworld environment:}We start with linear function approximation (LFA) on the gridworld environment. We use tile coding~\citep{sutton2018reinforcement} to construct a $d$-dimensional feature space (see~\cref{app:lfa-exp} for details). We used LSTDQ~\citep{lagoudakis2003least} \todoc{why?} to estimate $Q$ functions  with $300$ samples for all $(s,a)$ pairs. In~\cref{fig:LFA_grid}, we show the performance of the best hyperparameter (see~\cref{tab:hyper_lfa_sampling_tc} for specific values) for each algorithm. We observe that the \texttt{OG} of \Alg/ converges consistently faster across different feature dimensions. Again, we observe a good hyperparameter robustness of \Alg/ in~\cref{fig:lfa-gridworld-hyperparmeter-sensitivity} (\cref{app:experiments}).~\cref{fig:g-experimental} in~\cref{app:lfa-exp} shows that we can obtain similar performance by using G-experimental design, but at a much lower computational cost. 
\begin{figure}[h]
\centering
		\begin{subfigure}[b]{0.49\linewidth}
			\centering
			\captionsetup{justification=centering}
			\includegraphics[width=.97\textwidth]{plot/tabular/model_free/TD_Sampling/FinalResults/OG.png}
			\caption[]{\small{OG}}
		\end{subfigure}
		\begin{subfigure}[b]{0.49\linewidth}
			\centering
			\captionsetup{justification=centering}
			\includegraphics[width=\textwidth]{plot/tabular/model_free/TD_Sampling/FinalResults/CV.png}
			\caption[]{\small{CV}}
		\end{subfigure}
		\caption[]
        {\textbf{Effect of sampling in model-free tabular setting:} We vary the number of samples used for $Q$ value estimation over $\{1000, 2000, 3000\}$ and observe the change in performance (averaged over $5$ runs). The performance improves as the number of samples increases, and \cAlg/ converges faster than the baselines \cGDA/ and \cCRPO/.}
        \label{fig:CB_GDA_MF_TDSampling}
\end{figure}
\begin{figure}[h]
		\begin{subfigure}[b]{0.49\linewidth}
			\centering
			\captionsetup{justification=centering}
			\includegraphics[width=\textwidth]{plot/tabular/model_free/TileCodingFeatures/FinalResults_TC/VaryFeatures/OG_eps_0.25.png}
			\caption[]{\small{OG}}
		\end{subfigure}
		\begin{subfigure}[b]{0.49\linewidth}
			\centering
			\captionsetup{justification=centering}
			\includegraphics[width=\textwidth]{plot/tabular/model_free/TileCodingFeatures/FinalResults_TC/VaryFeatures/CV_eps_0.25.png}
			\caption[]{\small{CV}}
		\end{subfigure}
		\caption[]
        {\textbf{LFA in gridworld environment:} For varying feature dimension $d$, \texttt{OG} for \cAlg/ consistently converges faster than the baselines \cGDA/ and \cCRPO/.}
        \label{fig:LFA_grid}
\end{figure}
% \vspace{-3ex}
\paragraph{Cartpole environment with exploration:}We use the Cartpole environment from the OpenAI gym~\citep{brockman2016openai}, and modify it to include multiple constraints. The agent is rewarded for keeping the pole upright, whereas it receives a constraint reward if (1) the cart enters certain areas (x-axis position), or (2) the angle of the pole is smaller than a certain threshold (see~\cref{app:lfa-exp} for details). We used tile coding to construct the feature space, and LSTDQ to estimate the $Q$ functions for both reward and constraint reward.

In~\cref{fig:cartpole}, we show the cumulative discounted reward and the constraint violation (\texttt{CV 1}, \texttt{CV 2}) for the two constraints mentioned above. The dark lines correspond to the best hyperparameter that achieves the maximum return while satisfying \texttt{CV }$\in [-6,0]$ for both constraints, and the lighter lines correspond to the other hyperparameters. All the algorithms satisfy the constraints and achieve comparable reward, but \Alg/ has considerably less variance in performance for different values of the hyperparameters. In~\cref{fig:cartpole_entropy_sensitivity} (\cref{app:lfa-exp}), we added entropy regularization~\citep{geist2019theory,haarnoja2018soft} and observed a similar robustness for CBP.
\begin{figure}[!t]
    \centering
		\begin{subfigure}[b]{0.48\linewidth}
			\centering
			\captionsetup{justification=centering}
			\includegraphics[width=\textwidth]{plot/Cartpole/5Runs/BCV1_eps_6_e0.0.png}
			\caption[]{\small{CV 1}}
		\end{subfigure}
		\begin{subfigure}[b]{0.48\linewidth}
			\centering
			\captionsetup{justification=centering}
			\includegraphics[width=\textwidth]{plot/Cartpole/5Runs/BCV2_eps_6_e0.0.png}
			\caption[]{\small{CV 2}}
		\end{subfigure}
		\begin{subfigure}[b]{0.48\linewidth}
			\centering
			\captionsetup{justification=centering}
			\includegraphics[width=\textwidth]{plot/Cartpole/5Runs/BReturn_eps_6_e0.0.png}
			\caption[]{\small{Return}}
		\end{subfigure}
		\caption[]
        {\textbf{Cartpole environment:} Performance of \cAlg/, \cGDA/ and \cCRPO/ with two constraints (averaged across $5$ runs). The dark lines depict performance with the best hyperparameters. Light lines correspond to performance with other hyperparameter settings. CBP exhibits robustness to the choice of hyperparameters.}
        \label{fig:cartpole}
\end{figure}
% DUMP
% Using a large number of trajectories $m$, $\consthat{\pitil} \approx \constq{\pitil}$ up to the irreducible misspecification error $\epsb$. If the number of iterations of the coin-betting algorithm is large, $\pitil \approx \argmax_{\pi} \consthat{\pi}$, and hence we declare the problem empirically infeasible if $\consthat{\pitil} < b$. 




% using the normalized gradients . For GDA, this normalization issue can be overcome by simply scaling up the step-sizes, and doing a grid-search over a large range of step-sizes to find the correct scale (see~\cref{sec:tabular-exps,sec:linear-exps}) leads to good empirical convergence. 

%!TEX root =  main.tex
\vspace{-2ex}
\section{Conclusion}
\label{sec:Conclusion}
\vspace{-1ex}
In this paper, we proposed a general primal-dual framework to solve CMDPs in the tabular and linear function approximation settings. The main motivation of this work was to reduce the hyperparameter sensitivity in the policy optimization setting.
% We instantiated this framework using coin-betting algorithms from online linear optimization, and proposed the \Alg/ algorithm.
We empirically showed that the existing algorithms suffer from high hyperparameter sensitivity (\cref{fig:sensitivity-intro}). Furthermore, they can even lead to uncontrolled errors in the function approximation setting. To alleviate the above-mentioned problems, we proposed a theoretically sound \Alg/ algorithm which leverages the coin-betting technique from online linear optimization. In addition, we also use an experimental design procedure to control the errors.
%An important problem for future work is to reduce the compute cost of \Alg. 

\citet{orabona2017training} have shown that coin-betting algorithms scale to neural networks. Similarly, in the future, we plan to scale \Alg/ to non-linear function approximation. We aim to use the recent advances in online linear optimization to design ``painless'' parameter-free policy optimization algorithms. We believe that this is important for reproducibility in RL and hope our work will encourage future research in this area.
%A specific issue to be addressed is the compute cost of the algorithm. In relation to this, one hopes that since the various quantities computed by the algorithm change relatively slowly, significant speedups are possible without paying much extra price in terms of performance.

\begin{acknowledgements}
We would like to thank Tor Lattimore for feedback on the paper. Csaba Szepesv\'ari acknowledges funding from the Natural Sciences and Engineering Research Council (NSERC) of Canada and the ``Design.R AI-assisted CPS Design'' (DARPA) project. Doina Precup and Csaba Szepesv\'ari both acknowledge funding from the Canada CIFAR AI Chairs Program for Mila and Amii, respectively.
\end{acknowledgements}
\bibliography{jain_326}
\end{document}
