% %!TEX root = main.tex
% \newcommand{\appendixTitle}{%
% \vbox{
%     \centering
% 	\hrule height 4pt
% 	\vskip 0.2in
% 	{\LARGE \bf Towards Painless Policy Optimization for Constrained MDPs: Supplementary material}
% 	\vskip 0.2in
% 	\hrule height 1pt 

% \textbf{Arushi Jain, Sharan Vaswani, Reza Babanezhad, Csaba Szepesv\'ari, Doina Precup}
% }}

% \appendixTitle

% %------------------------
\documentclass[accepted]{uai2022} % after acceptance, for a revised

% version; also before submission to
% see how the non-anonymous paper would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
% ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced automatically for papers to be published. Do not make any other change above this note for an accepted version.
%!TEX root =  main.tex
%----------------------
\usepackage{lmodern}
\usepackage[english]{babel}
\usepackage{latexsym}
\usepackage{amsmath}
\usepackage{mathrsfs}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{bm}
\usepackage{datetime}
\usepackage[table,xcdraw]{xcolor}
\usepackage{accents}
\usepackage{tikz}
\usepackage{listings}
\usepackage{mdframed}
\usepackage{pgfplots}
\usepackage{pgfplotstable}
\usepackage[algoruled, linesnumbered]{algorithm2e} 
\usepackage{xr}

\usepackage{dsfont}
\usepackage{color}
\usepackage{colortbl}
\usepackage{pifont}
\usepackage{caption}
\usepackage{microtype} % improved spacing between words for easier reading
\usepackage{float}
\usepackage{xfrac} % sfrac
\usepackage{xspace}
\usepackage{booktabs}
\usepackage{blkarray}
\usepackage{graphicx}
\usepackage{subcaption}


\usepackage[textsize=tiny,
disable
]{todonotes}
% Margin note attributed to Csaba, drawn on a light-orange background.
% todonotes is loaded with the `disable` option above, so these notes are
% currently suppressed in the compiled output.
\newcommand{\todoc}[1]{\todo[color=orange!20!white]{Csaba: #1}}


% Text-colouring helpers: typeset the argument in the named colour.
\newcommand{\red}[1]{\textcolor{red}{#1}}
\newcommand{\blue}[1]{\textcolor{blue}{#1}}
\newcommand{\teal}[1]{\textcolor{teal}{#1}}

%%%%%%%%-- adding this new command here----------
\usepackage{zref-xr}
\usepackage{nameref}
\usepackage{hyperref}
%---------------------------------


% \usepackage{hyperref}
% PDF metadata and link appearance (hyperref).
% NOTE(review): pdftitle/pdfauthor still hold placeholder values (XXXXX / XXX).
\hypersetup{
    % bookmarks=true,         % show bookmarks bar?
    unicode=false,          % non-Latin characters in Acrobat's bookmarks
    pdftoolbar=true,        % show Acrobat's toolbar?
    pdfmenubar=true,        % show Acrobat's menu?
    pdffitwindow=false,     % window fit to page when opened
    pdfstartview={FitH},    % fits the width of the page to the window
    pdftitle={XXXXX},    % title
    pdfauthor={XXX},     % author
    pdfsubject={Bandits, Reinforcement Learning},   % subject of the document
    pdfcreator={Creator},   % creator of the document
    pdfproducer={Producer}, % producer of the document
    pdfkeywords={bandits} {reinforcement learning} {policy gradient}, % list of keywords
    pdfnewwindow=true,      % links in new window
    colorlinks=true,       % false: boxed links; true: colored links
    linkcolor=blue,          % color of internal links (change box color with linkbordercolor)
    citecolor=blue,        % color of links to bibliography
    filecolor=magenta,      % color of file links
    urlcolor=cyan           % color of external links
}
\usepackage{amsthm}
\usepackage{times}
\usepackage{nicefrac}
\usepackage{wrapfig}
\usepackage{pgfplots}
% \usepackage[capitalize]{cleveref}
\usepackage{thm-restate}


%----------------------
\usepackage[capitalize,noabbrev]{cleveref}
\zxrsetup{toltxlabel=true,tozreflabel=false,verbose}
%---------------------------------------------
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% THEOREMS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Light-grey background colour and a thmtools theorem style "shaded" that
% frames the theorem body with mdframed on that background.
% The only declaration that would use this style (\declaretheorem just below)
% is commented out, so no theorem visible here is typeset with it.
\definecolor{shadecolor}{gray}{0.90}
\declaretheoremstyle[
headfont=\normalfont\bfseries,
notefont=\mdseries, notebraces={(}{)},
bodyfont=\normalfont,
postheadspace=0.5em,
spaceabove=5pt,
mdframed={
  skipabove=3pt,
  skipbelow=3pt,
  hidealllines=true,
  backgroundcolor={shadecolor},
  innerleftmargin=2pt,
  innerrightmargin=2pt}
]{shaded}
% \declaretheorem[style=shaded]{theorem}

% Theorem-like environments. All of them share the `theorem` counter, which is
% numbered within sections.
% Declared under the default style: theorem, proposition, lemma, corollary.
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
% Declared under the `definition` style: definition, assumption, remark.
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\newtheorem{remark}[theorem]{Remark}
% NOTE(review): no \newtheorem follows before the style is switched back to
% `plain` further below, so this line currently has no effect.
\theoremstyle{remark}

%% Some suggested packages, as needed:
\usepackage[round]{natbib} % has a nice set of citation styles and commands
\bibliographystyle{plainnat}
\renewcommand{\bibsection}{\subsubsection*{References}}

% Override amsthm's internal \thm@space@setup so that the skips before and
% after theorem environments (\thm@preskip, \thm@postskip) are both zero.
\makeatletter
\def\thm@space@setup{\thm@preskip=0pt
\thm@postskip=0pt}
\makeatother

% Remove the vertical space around displayed equations.
% NOTE(review): font-size commands such as \normalsize usually re-set these
% four lengths, so values assigned in the preamble may be overwritten at
% \begin{document} -- confirm they take effect with this class.
\setlength{\abovedisplayskip}{0pt}
\setlength{\belowdisplayskip}{0pt}
\setlength{\abovedisplayshortskip}{0pt}
\setlength{\belowdisplayshortskip}{0pt}

% Space between top/bottom floats and the text.
\setlength{\textfloatsep}{10pt plus 1.0pt minus 2.0pt}

% Frame style for boxed statements: the frame spans the full text width
% (zero outer margins) with a small inner padding around the contents.
\mdfdefinestyle{mdframedthmbox}{%
  innertopmargin=0.75em,
  innerleftmargin=.5em,
  innerrightmargin=.5em,
  leftmargin=0pt,
  rightmargin=0pt,%
}

% thmbox: typesets its body inside an mdframed box using the style above.
\newenvironment{thmbox}%
  {\begin{mdframed}[style=mdframedthmbox]}%
  {\end{mdframed}}


% \myquoten{text}: typeset text in a top-aligned minipage of 90% of the text
% width, indented by 5% of the text width.
\newcommand{\myquoten}[1]{{\null\hspace{.05\textwidth}\begin{minipage}[t]{.90\textwidth} #1 \end{minipage}}}
% \myquote{text}: same as \myquoten, but started on a new line.
\newcommand{\myquote}[1]{\null~\\\myquoten{#1}}

% cleveref names for references of type `algline`.
% The original repeated the same \crefname line twice; the second declaration
% is now the sentence-initial (\Cref) form, and both get a proper plural so
% that multi-references read "Lines 3 and 4".
\crefname{algline}{Line}{Lines}
\Crefname{algline}{Line}{Lines}


%!TEX root =  main.tex
% --- Number sets and blackboard symbols ---
\newcommand*{\R}{\mathbb{R}}
\newcommand*{\E}{\mathbb{E}}
\newcommand*{\PP}{\mathbb{P}}
\newcommand*{\cZ}{\mathcal{Z}}

% --- Expectation, probability, indicator, support ---
% \EE uses fixed-size brackets, \EEg brackets that grow with the argument.
\newcommand*{\EE}[1]{\E[#1]}
\newcommand*{\EEg}[1]{\E\left[#1\right]}
\newcommand*{\Prob}[1]{\mathbb{P}(#1)}
\newcommand*{\one}[1]{\mathbb{I}\{#1\}}
\newcommand*{\Supp}{\operatorname{supp}}

% --- Inner products (\bip scales its angle brackets) ---
\newcommand*{\ip}[1]{\langle #1 \rangle}
\newcommand*{\bip}[1]{\left\langle #1 \right\rangle}

% --- Norms: auto-sized, sup-norm, subscripted (#2 is the index), squared ---
\newcommand*{\norm}[1]{\left\|#1\right\|}
\newcommand*{\norminf}[1]{\left\|#1\right\|_{\infty}}
\newcommand*{\indnorm}[2]{\|#1\|_{#2}}
\newcommand*{\normsq}[1]{\|#1\|^2}

% Natural numbers.
\newcommand*{\N}{\mathbb{N}}

% Calligraphic capitals, \c<Letter> = \mathcal{<Letter>}.
\newcommand*{\cA}{\mathcal{A}}
\newcommand*{\cB}{\mathcal{B}}
\newcommand*{\cC}{\mathcal{C}}
\newcommand*{\cD}{\mathcal{D}}
\newcommand*{\cE}{\mathcal{E}}
\newcommand*{\cF}{\mathcal{F}}
\newcommand*{\cG}{\mathcal{G}}
\newcommand*{\cH}{\mathcal{H}}
\newcommand*{\cK}{\mathcal{K}}
\newcommand*{\cL}{\mathcal{L}}
\newcommand*{\cM}{\mathcal{M}}
\newcommand*{\cN}{\mathcal{N}}
\newcommand*{\cP}{\mathcal{P}}
\newcommand*{\cQ}{\mathcal{Q}}
\newcommand*{\cR}{\mathcal{R}}
\newcommand*{\cS}{\mathcal{S}}
\newcommand*{\cT}{\mathcal{T}}
\newcommand*{\cV}{\mathcal{V}}

% Script A. Note: \sA is defined again with \def in the "Sets" list further
% below, and that later definition is the one in force in the document body.
\newcommand*{\sA}{\mathscr A}

% Typeset every \epsilon as \varepsilon.
\renewcommand*{\epsilon}{\varepsilon}

% Named error symbols. \epsapp expands to \epsilon at use time, so it also
% picks up the \varepsilon redefinition above.
\newcommand*{\epsapp}{\epsilon}
\newcommand*{\epssub}{\delta}
\newcommand*{\tvarepsilon}{\tilde{\varepsilon}}

% Upright operator names.
\DeclareMathOperator{\Range}{range}
\newcommand*{\rows}{\operatorname{rows}}



% Auto-sized ceiling and floor brackets.
\newcommand{\ceil}[1]{\left\lceil {#1} \right\rceil}
\newcommand{\floor}[1]{\left\lfloor {#1} \right\rfloor}
% Bold all-ones / all-zeros vectors.
\newcommand{\ones}{\mathbf{1}}
\newcommand{\zeros}{\mathbf{0}}
% arg min / arg max with limits set underneath in display style.
% A thin space (\,) separates the two words, as recommended in the amsmath
% documentation; the previous "\ " produced a full inter-word space.
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\argmax}{arg\,max}

% Upright names: softmax, and the KL divergence written as D_KL.
\newcommand*{\softmax}{\mathrm{softmax}}
\newcommand*{\KL}{D_{\mathrm{KL}}}

% Bold zero / one vectors and the identity matrix.
\def\rvzero{{\mathbf{0}}}
\def\rvone{{\mathbf{1}}}
\def\identiymatrix{\mathbf{Id}}

% `example` environment in the plain style, sharing the theorem counter.
\theoremstyle{plain}
\newtheorem{example}[theorem]{Example}

% Graph
% Defines \gA ... \gZ, each expanding to {\mathcal{<letter>}}.
% The loop runs inside a group so its scratch macro does not leak; \xdef makes
% the generated definitions global.
\begingroup
\makeatletter
\@tfor\gr@ltr:=ABCDEFGHIJKLMNOPQRSTUVWXYZ\do{%
  \expandafter\xdef\csname g\gr@ltr\endcsname{{\noexpand\mathcal{\gr@ltr}}}%
}
\endgroup

% Sets
% Defines \sA ... \sZ, each expanding to {\mathbb{<letter>}}.
% Don't use a set called E, because this would be the same as our symbol
% for expectation (\sE is still defined, for completeness).
% The loop runs inside a group so its scratch macro does not leak; \xdef makes
% the generated definitions global.
\begingroup
\makeatletter
\@tfor\gr@ltr:=ABCDEFGHIJKLMNOPQRSTUVWXYZ\do{%
  \expandafter\xdef\csname s\gr@ltr\endcsname{{\noexpand\mathbb{\gr@ltr}}}%
}
\endgroup

% Upright operators: diameter and Bernoulli distribution.
\DeclareMathOperator{\diam}{diam}
\DeclareMathOperator{\Ber}{Ber}
% dim with a calligraphic-E subscript.
\newcommand*{\dimE}{\mathrm{dim}_{\mathcal{E}}}
% Calligraphic A. Note: \Alg is redefined further below as the delimited
% macro \Alg/, which replaces this definition.
\newcommand*{\Alg}{\mathcal{A}}

% Value-function notation. In every family, #1 is the superscript (typically
% a policy); names ending in "s" omit the trailing (\rho) argument, "hat"
% variants carry a hat, and "opt" variants use the superscript *.

% Generic value function.
\newcommand*{\val}[1]{V^{#1}(\rho)}
\newcommand*{\vals}[1]{V^{#1}}

% Reward value function (subscript r).
\newcommand*{\reward}[1]{V_r^{#1}(\rho)}
\newcommand*{\rewardhat}[1]{\hat{V}_r^{#1}(\rho)}
\newcommand*{\optreward}{V_r^{*}(\rho)}
\newcommand*{\rewards}[1]{V_r^{#1}}
\newcommand*{\rewardhats}[1]{\hat{V}_r^{#1}}
\newcommand*{\optrewards}{V_r^{*}}

% Constraint value function (subscript c).
\newcommand*{\const}[1]{V_c^{#1}(\rho)}
\newcommand*{\consthat}[1]{\hat{V}_c^{#1}(\rho)}
\newcommand*{\optconst}{V_c^{*}(\rho)}
\newcommand*{\consts}[1]{V_c^{#1}}
\newcommand*{\consthats}[1]{\hat{V}_c^{#1}}
\newcommand*{\optconsts}{V_c^{*}}

% Lagrangian (subscript l); \lag and \laghat take two superscript entries.
\newcommand*{\lag}[2]{V_{l}^{#1,#2}(\rho)}
\newcommand*{\laghat}[2]{\hat{V}_{l}^{#1,#2}(\rho)}
\newcommand*{\optlag}[1]{V_l^{#1}(\rho)}

% Dual function (subscript d).
\newcommand*{\dual}[1]{V_d^{#1}(\rho)}
\newcommand*{\dualhat}[1]{\hat{V}_d^{#1}(\rho)}
\newcommand*{\optdual}{V_d^{*}(\rho)}

% Lagrangian action-value (Q) and advantage (A) symbols.
\newcommand*{\lagq}[1]{Q_{l}^{#1}}
\newcommand*{\lagqhat}[1]{\hat{Q}_{l}^{#1}}
\newcommand*{\laga}[1]{A_{l}^{#1}}
\newcommand*{\lagahat}[1]{\hat{A}_{l}^{#1}}
\newcommand*{\lagatilde}[1]{\tilde{A}_{l}^{#1}}

% Calligraphic I.
\newcommand*{\cI}{\mathcal{I}}

% Action-value symbols for reward (r) and constraint (c); #1 is the
% superscript. Names ending in "s" append the argument (s,\cdot).
\newcommand*{\rewardq}[1]{Q_{r}^{#1}}
\newcommand*{\rewardqhat}[1]{\hat{Q}_{r}^{#1}}
\newcommand*{\rewardqs}[1]{Q_{r}^{#1}(s,\cdot)}
\newcommand*{\rewardqhats}[1]{\hat{Q}_{r}^{#1}(s,\cdot)}
\newcommand*{\constq}[1]{Q_{c}^{#1}}
\newcommand*{\constqhat}[1]{\hat{Q}_{c}^{#1}}
\newcommand*{\constqs}[1]{Q_{c}^{#1}(s,\cdot)}
\newcommand*{\constqhats}[1]{\hat{Q}_{c}^{#1}(s,\cdot)}

% Policy shorthands.
\newcommand*{\piopt}{\pi^*}
\newcommand*{\pip}{\pi^{\prime}}
\newcommand*{\pihat}{\hat{\pi}}
\newcommand*{\pitil}{\tilde{\pi}}
\newcommand*{\pit}{\pi_{t}}
\newcommand*{\pitt}{\pi_{t+1}}

% Dual-variable iterates and tilde-tau.
\newcommand*{\lambdat}{\lambda_{t}}
\newcommand*{\lambdatt}{\lambda_{t+1}}
\newcommand*{\tautil}{\tilde{\tau}}

% Superscript transpose and inverse marks (used directly after a symbol).
\newcommand*{\transpose}{^\mathsf{\scriptscriptstyle T}}
\newcommand*{\inv}{^\mathsf{\scriptscriptstyle -1}}


% Error symbols: \varepsilon with a tiny text subscript "b" or "s".
\newcommand{\epsb}{\varepsilon_{\text{\tiny{b}}}}
\newcommand{\epss}{\varepsilon_{\text{\tiny{s}}}}

% Algorithm name as a delimited macro: it must be written \Alg/ (with the
% trailing slash). This \def replaces the earlier \newcommand{\Alg}.
\def\Alg/{{CBP}}
\newcommand{\Tau}{\mathrm{T}}
\newcommand{\thetahat}{\hat{\theta}}

% Coloured algorithm names, also delimited by a trailing slash
% (\cAlg/, \cGDA/, \cCRPO/).
\def\cAlg/{\red{CBP}}
\def\cGDA/{\blue{GDA}}
\def\cCRPO/{\teal{CRPO}}

% %--------------------------
% \externaldocument[supp:]{jain_326-supp}
% \usepackage{nameref}
% \usepackage{zref-xr,zref-user}
% \zxrsetup{toltxlabel}
% %----------------------


%------------------------
\zexternaldocument*{jain_326}


\title{Towards Painless Policy Optimization for Constrained MDPs: Supplementary material}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1$^*$]{Arushi Jain}
\author[2\thanks{The first two authors contributed equally. Email: arushi.jain@mail.mcgill.ca, vaswani.sharan@gmail.com.}]{Sharan Vaswani}
\author[3]{Reza Babanezhad}
\author[4,5]{Csaba Szepesv\'ari}
\author[1,5]{Doina Precup}

% Add affiliations after the authors
\affil[1]{%
    Mila, McGill University
}
\affil[2]{%
  Simon Fraser University
}
\affil[3]{%
  SAIT AI Lab, Montreal
  }
\affil[4]{%
  Amii, University of Alberta 
  }
\affil[5]{%
DeepMind
} 
  
\begin{document}
\onecolumn
\maketitle
\appendix
% Number equations per appendix section (A.1, A.2, ..., B.1, ...).
% \numberwithin (amsmath) prefixes the section label and also resets the
% equation counter at every \section; the previous \renewcommand only changed
% the printed form, so numbers kept running across sections.
\numberwithin{equation}{section}
\setcounter{equation}{0}
\setcounter{figure}{5}




\section*{Organization of the Appendix}
\begin{itemize}

  \item[\ref{app:tabular}] \nameref{app:tabular}
    
  \item[\ref{app:proofs}] \nameref{app:proofs}
 
  \item[\ref{app:exp-details}] \nameref{app:exp-details}
  
  \item[\ref{app:experiments}] \nameref{app:experiments}
 
\end{itemize}
\section{Theoretical Guarantees in the Tabular Setting}
\label{app:tabular}
In the tabular setting, we use $m$ independent trajectories for \emph{each} $(s,a)$ pair. By Hoeffding's inequality and union bound across all states and actions, the sampling error can be bounded by $\frac{1}{1 - \gamma} \sqrt{\frac{\log(2 S A/\delta)}{2 m}}$ (similar to the proof of~\cref{lemma:lspe-extrapolation}). Since all action-value functions can be represented in the tabular setting, the bias error term $\epsb = 0$, and hence $\tvarepsilon= \frac{1}{1 - \gamma} \sqrt{\frac{\log(2 S A/\delta)}{2 m}}$. Compared to the linear function approximation setting in~\cref{sec:putting-together} that has a computational complexity proportional to $O(d^2)$, the computational cost in the tabular setting is $O(SA)$. However, the approximation error is smaller than that in~\cref{lemma:lspe-extrapolation}. 

Now that we have bounded the approximation errors in the tabular setting, we instantiate~\cref{thm:generic-bound} for GDA in~\cref{sec:gda}. Plugging in the value of $U$, the primal and dual regret from~\cref{eq:regret} and $\tvarepsilon$, we obtain the following corollary. 
\begin{thmbox}
\begin{restatable}{corollary}{gda}
For the gradient descent ascent updates in~\cref{eq:gda-primal,eq:gda-dual} with the specified step-sizes, $U = \frac{2}{\zeta \, (1 - \gamma)}$, using $m$ trajectories, the average optimality gap (\texttt{OG}) and constraint violation (\texttt{CV}) can be bounded as:
\begin{align*}
\texttt{\text{OG}} & \leq \frac{
\left(\frac{(1 + U) \, \sqrt{2 \log |A|}}{1 - \gamma} + U \right)}{(1 - \gamma) \sqrt{T}} + \frac{\epss (1 + 2 U)}{1 - \gamma}, \\
\texttt{\text{CV}} &\leq  \frac{\zeta \left(\frac{(1 + U) \, \sqrt{2 \log |A|}}{1 - \gamma} + U \right)}{\sqrt{T}} + \zeta \, \epss (1 + 2 U), 
\end{align*}
where $\epss = \frac{1}{1 - \gamma} \sqrt{\frac{\log(2 S A/\delta)}{2 m}}$. 
\label{cor:gda}
\end{restatable}
\end{thmbox}
\begin{proof}
To obtain the result, we substitute the primal and dual regret bounds of GDA~\citep[Theorem 6.8]{orabona2019modern} into~\cref{thm:generic-bound}. Specifically, we use
\[
\cR^{p}(\pi^*,T) \leq  \frac{1 + U}{1 - \gamma} \sqrt{2 \log |A|} \sqrt{T},
\]
and
\[
\cR^{d}(0,T), \cR^{d}(U,T)  \leq \frac{U}{1 - \gamma} \sqrt{T}.
\]

\end{proof}
Hence, the average optimality gap for GDA is $O \left(\frac{1}{(1 - \gamma)^3 \, \sqrt{T}} + \frac{\epss}{(1 - \gamma)^2} \right)$, while the average constraint violation scales as $O \left(\frac{1}{(1 - \gamma)^2 \, \sqrt{T}} + \frac{\epss}{1 - \gamma} \right)$. Compared to the tabular result in~\citet{ding2020natural}, the above bound on the optimality gap is worse by a factor of $O(\nicefrac{1}{1 - \gamma})$ and matches their bound on the constraint violation. On the other hand, in the tabular setting without sampling error (when $\epss = 0$),~\citet[Theorem 3]{xu2021crpo} obtain an $O \left(\frac{1}{(1 - \gamma)^{1.5} \, \sqrt{T}} \right)$ bound on both the optimality gap and constraint violation. However, in order to achieve this bound, they require knowledge of $\text{KL}(\piopt || \pi_0 )$ to set the algorithm hyper-parameters. This information is not available in practice, making it difficult to implement their algorithm. 

Now, we instantiate~\cref{thm:generic-bound} for the coin-betting algorithms in~\cref{sec:cb}. Plugging in the value of $U$, the primal and dual regret and $\tvarepsilon$, we obtain the following corollary. 
\begin{thmbox}
\begin{restatable}{corollary}{cb}
Using the primal updates in~\cref{eq:cb-primal}, and the dual updates in~\cref{eq:cb-dual}, with $U = \frac{2}{\zeta \, (1 - \gamma)}$, using $m$ trajectories, the average optimality gap (\texttt{OG}) and constraint violation (\texttt{CV}) for CBP can be bounded as:
\begin{align*}
\texttt{\text{OG}} & \leq \frac{
\left(\frac{3 (1 + U) \, \sqrt{1 + \text{KL}(\pi_0 || \pi^*)}}{1 - \gamma} + \Psi \right)}{(1 - \gamma) \sqrt{T}} + \frac{\epss (1 + 2 U)}{1 - \gamma}, \\
\texttt{\text{CV}} &\leq  \frac{\zeta \left(\frac{3 (1 + U) \, \sqrt{1 + \text{KL}(\pi_0 || \pi^*)}}{1 - \gamma} + \Psi \right)}{\sqrt{T}} + \zeta \, \epss (1 + 2 U), 
\end{align*}
where $\epss = \frac{1}{1 - \gamma} \sqrt{\frac{\log(2 S A/\delta)}{2 m}}$ and $ \Psi= 4U\sqrt{\log((T+1)U)} + 1$.
\label{cor:cb}
\end{restatable}
\end{thmbox}
\begin{proof}
To obtain the result, we substitute the primal and dual regret bounds of CB into~\cref{thm:generic-bound}. Specifically, from~\citep[Corollary 6]{orabona2016coin} and~\citep[Theorem 8]{orabona2017training}, we get the following upper bounds on the primal and dual regret:
\[
\cR^{p}(\pi^*,T) \leq \frac{3 (1 + U)}{1 - \gamma} \sqrt{T} \sqrt{1 + \text{KL}(\pi_0 || \pi^*)},
\]
and
\[
\cR^{d}(\lambda,T) \leq \frac{1}{1 - \gamma} + \norm{\lambda-\lambda^0} \sqrt{\left(\frac{1}{(1 - \gamma)^2}+\frac{G_T}{1 - \gamma} \right)
\Gamma_T}
\] 
where $\Gamma_T=\log\left(1 + (G_T \, (1-\gamma)+1)^2\norm{\lambda-\lambda^0}^2 \right)$ and $G_T = \sum_{i = 0}^{T} \vert \consthat{\pi_i} - b \vert$. Since $\vert \consthat{\pi_i} - b \vert \leq \frac{1}{1-\gamma}$, we have $G_T \leq \frac{T+1}{1-\gamma}$, and $\norm{\lambda - \lambda^0} \leq 2U$ for all $\lambda$. Substituting these upper bounds into the bound on $\cR^{d}(\lambda,T)$, we get: 
\[
\cR^{d}(\lambda,T) \leq \frac{4U\sqrt{(T+1)\log((T+1)U)} + 1}{1-\gamma}
\]
\end{proof}
\clearpage
%!TEX root =  main.tex
\section{Main Proofs}
\label{app:proofs}

The following well known result will be useful:
\begin{thmbox}
\begin{lemma}[Value difference lemma]
For any value function $\vals{\pi}$ (reward or cost), and any two memoryless policies $\pi$ and $\pip$,
\begin{align*}
\vals{\pip} - \vals{\pi} & = \left(I - \gamma P_{\pip} \right)^{-1} \left[T_{\pip} \vals{\pi} - \vals{\pi} \right] 
\end{align*}
where $T_{\pip} \vals{\pi} = [r_{\pip} + \gamma P_{\pip} \vals{\pi}]$ is the Bellman operator for policy $\pip$. 
\label{lemma:val-diff}
\end{lemma}
\end{thmbox}
\begin{proof}
As is well known, $\vals{\pip} = \left(I - \gamma P_{\pip} \right)^{-1} r_{\pip}$. Hence,
\begin{align*}
\vals{\pip} - \vals{\pi} & =  \left(I - \gamma P_{\pip} \right)^{-1} r_{\pip} - \vals{\pi}  \\
& =  \left(I - \gamma P_{\pip} \right)^{-1} \left(r_{\pip} - \left(I - \gamma P_{\pip} \right) \vals{\pi} \right) \\
& = \left(I - \gamma P_{\pip} \right)^{-1} \left(r_{\pip} + \gamma P_{\pip} \vals{\pi} - \vals{\pi} \right) \\
& = \left(I - \gamma P_{\pip} \right)^{-1} \left[T_{\pip} \vals{\pi} - \vals{\pi} \right] 
\end{align*}
\end{proof}

Let us now turn to the proof of \cref{thm:generic-bound}:

\begin{restatable}{theorem}{}
Assuming that $\norm{\rewardq{t} - \rewardqhat{t}}_{\infty} \leq \tvarepsilon$ and $\norm{\constq{t} - \constqhat{t}}_{\infty} \leq \tvarepsilon$, for a generic algorithm producing a sequence of policies $\{\pi_0, \pi_1, \ldots, \pi_{T-1}\}$ and dual variables $\{\lambda_0, \lambda_1, \ldots, \lambda_{T-1}\}$ such that for all $t$, $\lambdat$ is constrained to lie in the interval $[0, U]$ where $U > \lambda^*$, \texttt{OG} and \texttt{CV} can be bounded as:
\begin{align*}
\texttt{\text{OG}} & \leq \frac{\cR^{p}(\pi^*, T) + (1 - \gamma) \cR^{d}(0, T)}{(1 - \gamma) T} + \tvarepsilon \, g(U), \\
\texttt{\text{CV}} &\leq \frac{\cR^{p}(\pi^*, T) + (1 - \gamma) \cR^{d}(U, T)}{(U - \lambda^*) (1 - \gamma) T} + \frac{\tvarepsilon \, g(U)}{(U - \lambda^*)} ,
\end{align*}
where $g(U) := \left[\frac{1 + U}{1 - \gamma} + U \right]$.
\label{thm:generic-bound-app}
\end{restatable}
\begin{proof}
We will begin with bounding the value differences in the Lagrangian using~\cref{lemma:val-diff}. Let $T_{\piopt}^{r}$ and $T_{\piopt}^{c}$ be the Bellman operators of the optimal policy for the reward and cost respectively. Then, 
\begin{align*}
[\rewards{\piopt} - \rewards{\pit}] + \lambdat \,[\consts{\piopt} - \consts{\pit}] 
&= \left(I - \gamma P_{\piopt} \right)^{-1} \big[ \left[T^{r}_{\piopt} \rewards{\pit} - \rewards{\pit} \right] + \lambdat \left[T^{c}_{\piopt} \consts{\pit} - \consts{\pit} \right] \big] 
\end{align*}
Let $M_{\pi}$ be the state-action operator applied to $Q$ functions such that $M_{\pi}(Q)(s) = \sum_{a} \pi(a|s) Q(s,a)$. Observe that $T^{r}_{\piopt} \rewards{\pit} = M_{\piopt} \rewardq{\pit}$ and $\rewards{\pit} = M_{\pit} \rewardq{\pit}$. The expressions for the constraint rewards are analogous. Rewriting the above expression, 
\begin{align*}
[\rewards{\piopt} - \rewards{\pit}] + \lambdat \,[\consts{\piopt} - \consts{\pit}] &= \left(I - \gamma P_{\piopt} \right)^{-1} \bigg[ \left[M_{\piopt} \rewardq{\pit} - M_{\pit} \rewardq{\pit} \right] + \lambdat \left[M_{\piopt} \constq{\pit} - M_{\pit} \constq{\pit} \right] \bigg] \\
&= \left(I - \gamma P_{\piopt} \right)^{-1} \bigg[ [M_{\piopt} - M_{\pit}] \, [\rewardq{\pit} + \lambda_t \constq{\pit}] \bigg] \\
& = \left(I - \gamma P_{\piopt} \right)^{-1} \bigg[ [M_{\piopt} - M_{\pit}] \, [\rewardqhat{\pit} + \lambda_t \constqhat{\pit}] \bigg] \\ 
& + \underbrace{\left(I - \gamma P_{\piopt} \right)^{-1} \bigg[ [M_{\piopt} - M_{\pit}] \, [\rewardq{\pit} - \rewardqhat{\pit} + \lambda_t (\constq{\pit} - \constqhat{\pit})] \bigg]}_{\text{Error}} \\
\end{align*}
Let us first bound the maximum norm of the ``Error'' term, 
\begin{align*}
\norm{\text{Error}}_\infty 
& =
\norm{
 \left(I - \gamma P_{\piopt} \right)^{-1} \bigg[ [M_{\piopt} - M_{\pit}] \, [\rewardq{\pit} - \rewardqhat{\pit} + \lambda_t (\constq{\pit} - \constqhat{\pit})] \bigg]}_\infty\\
 &\leq \frac{1}{1 - \gamma} \norm{[\rewardq{\pit} - \rewardqhat{\pit} + \lambda_t (\constq{\pit} - \constqhat{\pit})]}_{\infty} \\
& \leq \frac{1}{1 - \gamma} \left( \norm{\rewardq{\pit} - \rewardqhat{\pit}}_{\infty} + \lambda_t \norm{\constq{\pit} - \constqhat{\pit}}_{\infty} \right) \\
\intertext{By assumption, $\norm{\rewardq{\pit} - \rewardqhat{\pit}}_{\infty} \le \epsilon$ and $\norm{\constq{\pit} - \constqhat{\pit}}_{\infty}\le \epsilon$.}
\implies \norm{\text{Error}}_\infty \leq \frac{\epsilon}{1 - \gamma} (1 +  \lambdat) &
\intertext{Since the dual variables are projected onto the $[0,U]$ interval, $\lambda_t \leq U$, implying that}
\norm{\text{Error} }_\infty \leq \frac{\epsilon}{1 - \gamma} \left(1 + U \right) & 
\end{align*}
Substituting in this bound on the error, using the convention that left-multiplication by a measure means integration with respect to it,
\begin{align*}
[\reward{\piopt} - \reward{\pit}] + \lambdat \,[\const{\piopt} - \const{\pit}] 
& \leq 
\rho \left(I - \gamma P_{\piopt} \right)^{-1} \bigg[ [M_{\piopt} - M_{\pit}] \, [\rewardqhat{\pit} + \lambda_t \constqhat{\pit}] \bigg] + \frac{\epsilon}{1 - \gamma} \left(1 + U \right) \\
& \leq \frac{1}{1 - \gamma} \nu_{\rho,\pi^*} \bigg[ [M_{\piopt} - M_{\pit}] \, [\rewardqhat{\pit} + \lambda_t \constqhat{\pit}] \bigg] + \frac{\epsilon}{1 - \gamma} \left(1 + U \right)\,, \\
\intertext{
where $\nu_{\rho,\pi^*} = (1-\gamma)\rho \left(I-\gamma P_{\piopt} \right)^{-1}$ is the
discounted probability measure over the states obtained when starting from $\rho$ and following $\piopt$.
Summing from $t = 0$ to $T-1$ and dividing by $T$.}
\frac{1}{T} \sum_{t = 0}^{T-1} \left[ [\reward{\piopt} - \reward{\pit}] + \lambdat \,[\const{\piopt} - \const{\pit}] \right] &\leq 
\frac{\nu_{\rho,\pi^*}}{(1 - \gamma) T} \sum_{t = 0}^{T-1} \left[ [M_{\piopt} - M_{\pit}] [\rewardqhat{\pit} + \lambdat \constqhat{\pit}] \right] + \frac{\epsilon}{1 - \gamma} \left(1 + U \right) \\
\end{align*}
Now, observe that 
\begin{align*}
\nu_{\rho,\pi^*} \sum_{t = 0}^{T-1} \left[ 
[M_{\piopt} - M_{\pit}] [\rewardqhat{\pit} + \lambdat \constqhat{\pit}] \right] &= \sum_{t = 0}^{T-1} \langle \piopt(\cdot|s) - \pit(\cdot|s), \rewardqhats{\pit} + \lambdat \constqhats{\pit} \rangle_{s\sim \nu_{\rho,\pi^*} } = \cR^{p}(\pi^*, T)     
\end{align*}
Putting everything together, 
\begin{align}
\frac{1}{T} \sum_{t = 0}^{T - 1} [\reward{\piopt} - \reward{\pit}] + \frac{1}{T} \sum_{t = 0}^{T - 1} \lambdat \, [\const{\piopt} - \const{\pit}] & \leq \frac{\cR^{p}(\pi^*, T)}{(1 - \gamma) T} + \frac{\epsilon}{1 - \gamma} \left(1 + U \right).
\label{eq:lagrangian-bound}
\end{align}
The above result bounds the sub-optimality in the Lagrangian. Next, we will see how this result implies a bound on the sub-optimality in the objective and the constraint violation. To bound the reward sub-optimality, we will upper bound the negative of the second term on the left-hand side in the above equation, i.e., we upper bound $\frac{1}{T} \sum_{t = 0}^{T - 1} \lambdat \, [\const{\pit} - \const{\piopt}]$. We have,
\begin{align}
\frac{1}{T} \sum_{t = 0}^{T - 1} \lambdat \, [\const{\pit} - \const{\piopt}] & \leq \frac{1}{T} \sum_{t = 0}^{T - 1} \lambdat \, [\const{\pit} - b] & \tag{since $\const{\piopt} \geq b$} \\
& = \frac{1}{T} \sum_{t = 0}^{T - 1} \lambdat \, [\const{\pit} - \consthat{\pit}] + \frac{1}{T} \sum_{t = 0}^{T - 1}  \lambdat [\consthat{\pit} - b] \nonumber \\
& = \frac{1}{T} \sum_{t = 0}^{T - 1} \lambdat \, [\const{\pit} - \consthat{\pit}] + \frac{\cR^{d}(0, T)}{T}  \nonumber \\
& \leq U \epsilon + \frac{\cR^{d}(0, T)}{T}\,. & \label{eq:cost-bound}
\end{align}
Using~\cref{eq:lagrangian-bound,eq:cost-bound},
\begin{align}
\texttt{OG} = \frac{1}{T} \sum_{t = 0}^{T - 1} [\reward{\piopt} - \reward{\pit}] & \leq
\frac{\cR^{p}(\pi^*, T) + (1 - \gamma) \cR^{d}(0, T)}{(1 - \gamma) T} + \frac{\epsilon}{1 - \gamma} \left(1 + U \right) + U \epsilon
\label{eq:reward-result}
\end{align}
This proves the first part of the theorem. We now bound the constraint violation. For an arbitrary $\lambda$, 
\begin{align}
\frac{1}{T} \sum_{t = 0}^{T - 1} \left[(\lambdat - \lambda) (\const{\pit} - b) \right]  & = \frac{1}{T} \sum_{t = 0}^{T - 1} \left[(\lambdat - \lambda) (\const{\pit} - \consthat{\pit}) \right] + \frac{1}{T} \sum_{t = 0}^{T - 1} \left[(\lambdat - \lambda) (\consthat{\pit} - b) \right] \nonumber \\
& = \frac{1}{T} \sum_{t = 0}^{T - 1} \left[(\lambdat - \lambda) (\const{\pit} - \consthat{\pit}) \right] + \frac{\cR^{d}(\lambda, T)}{T}\,, \nonumber \\
\intertext{implying}
\frac{1}{T} \sum_{t = 0}^{T - 1} \left[(\lambdat - \lambda) (\const{\pit} - b) \right] & \leq U \epsilon + \frac{\cR^{d}(\lambda, T)}{T}\,. & \label{eq:cost-bound-2}
\end{align}
Adding~\cref{eq:cost-bound-2} and~\cref{eq:lagrangian-bound} and reordering the terms gives
\begin{align*}
& \frac{1}{T}  \sum_{t = 0}^{T-1} (\reward{\piopt} - \reward{\pit}) +  \frac{\lambda}{T} \sum_{t = 0}^{T-1} (b - \const{\pit}) \\ 
& \leq \frac{1}{T} \sum_{t = 0}^{T-1} \underbrace{\lambdat (b - \const{\piopt})}_{\leq 0 \text{ since $\const{\piopt} \geq b$.}} + \underbrace{\frac{\cR^{p}(\pi^*, T) + (1 - \gamma) \cR^{d}(\lambda, T)}{(1 - \gamma) T} + \frac{\epsilon}{1 - \gamma} \left(1 + U \right) + U \epsilon}_{h(\lambda)} \\
\implies & \frac{1}{T} \sum_{t = 0}^{T-1} (\reward{\piopt} - \reward{\pit}) +  \frac{\lambda}{T} \sum_{t = 0}^{T-1} (b - \const{\pit}) \leq h(\lambda)
\end{align*}
We consider two cases: (i) if $\sum_{t = 0}^{T-1} (b - \const{\pit}) \geq 0$, we set $\lambda = U$, else, if (ii) $\sum_{t = 0}^{T-1} (b - \const{\pit}) < 0$, we set $\lambda = 0$. Using these choices, and since $\cR^{d}(\lambda,T)$ is linearly increasing in $\lambda$,
\begin{align*}
\frac{1}{T}  \sum_{t = 0}^{T-1} (\reward{\piopt} - \reward{\pit}) + \frac{ U}{T}
\left[ \sum_{t = 0}^{T-1} (b - \const{\pit})\right]_{+} & \leq  h(U) 
\end{align*}
Now take the policy $\pip$ such that $\reward{\piopt} - \reward{\pip} = \frac{1}{T}  \sum_{t = 0}^{T-1} (\reward{\piopt} - \reward{\pit})$ and $b - \const{\pip} = \frac{1}{T}  \sum_{t = 0}^{T-1} (b - \const{\pit})$. Then,
\begin{align*}
[\reward{\piopt} - \reward{\pip}] + U\left[b - \const{\pip}\right]_{+} & \leq h(U)\,.
\end{align*}
Using~\cref{lemma:lag-constraint} with $C = U > \lambda^*$ and $\beta = h(U)$, we get
\begin{align*}
\texttt{CV} &= \frac{1}{T} \left[\sum_{t = 0}^{T-1} (b - \const{\pit})\right]_{+} 
=
\left[b - \const{\pip}\right]_{+} \\
& \leq \frac{h(U)}{U - \lambda^*} 
= \frac{\cR^{p}(\pi^*, T) + (1 - \gamma) \cR^{d}(U, T)}{(U - \lambda^*) (1 - \gamma) T} + \frac{1}{(U - \lambda^*) } \left[\frac{\epsilon}{(1 - \gamma)} \left(1 + U \right) + U \epsilon \right] \,,
\end{align*}
which completes the proof subject to proving \cref{lemma:lag-constraint}.
\end{proof}

%For the proof of \cref{lemma:lag-constraint}, the following lemma is useful:

%\subsection{Auxiliary Lemmas}
\begin{thmbox}
\begin{lemma}[Constraint violation bound]
For any $C > \lambda^*$  and any $\pitil$ s.t. $\reward{\piopt} - \reward{\pitil} + C [b - \const{\pitil}]_{+} \leq \beta$, we have $[b - \const{\pitil}]_{+} \leq \frac{\beta}{C - \lambda^*}$. 
\label{lemma:lag-constraint}
\end{lemma}
\end{thmbox}
\begin{proof}
Define $\nu(\tau) = \max_{\pi} \{\reward{\pi} \mid \const{\pi} \geq b + \tau \}$ and note that by definition, $\nu(0) = \reward{\piopt}$ and that $\nu$ is a decreasing function of its argument.

Let $\lag{\pi}{\lambda} = \reward{\pi}+\lambda(\const{\pi}-b)$. Then,
for any policy $\pi$ s.t. $\const{\pi} \geq b + \tau$, we have
\begin{align}
\lag{\pi}{\lambda^*} & \leq \max_{\pi'} \lag{\pi'}{\lambda^*} \nonumber \\%= \optdual &\tag{by definition} \\
%&
&= \reward{\piopt} &\tag{by strong duality} \\ % from~\cref{lemma:sd}} \\
& = \nu(0) & \tag{from above relation} \\
\implies \nu(0) - \tau \lambda^* & \geq \lag{\pi}{\lambda^*} - \tau \lambda^* = \reward{\pi} + \lambda^* \underbrace{(\const{\pi} - b - \tau)}_{\text{Positive}} \nonumber \\
\implies \nu(0) - \tau \lambda^* & \geq \max_{\pi} \{\reward{\pi} \mid \const{\pi} \geq b + \tau \} = \nu(\tau) \,.\nonumber \\
\implies \tau \lambda^* & \leq \nu(0) - \nu(\tau)\,. \label{eq:inter-1}
\end{align}

%By strong duality, $\nu(\tau)$ is a concave function. 
Now we choose $\tautil = -(b - \const{\pitil})_{+}$.
\begin{align*}
(C - \lambda^*) |\tautil| &= \lambda^* \tautil + C |\tautil| & \tag{since $\tautil \leq 0$} \\
& \leq \nu(0) - \nu(\tautil) + C |\tautil| & \tag{\cref{eq:inter-1}} \\
& = \reward{\piopt} - \reward{\pitil} + C |\tautil| + \reward{\pitil} - \nu(\tautil) & \tag{definition of $\nu(0)$} \\
& = \reward{\piopt} - \reward{\pitil} + C (b - \const{\pitil})_{+} + \reward{\pitil} - \nu(\tautil) \\
%(C - \lambda^*) |\tautil| 
& \leq \beta + \reward{\pitil} - \nu(\tautil)\,.
\intertext{Now let us bound $\nu(\tautil)$:} 
\nu(\tautil) & = \max_{\pi} \{\reward{\pi} \mid \const{\pi} \geq b - (b - \const{\pitil})_{+} \}  \\
& \geq \max_{\pi} \{\reward{\pi} \mid \const{\pi} \geq \const{\pitil} \} & \tag{tightening the constraint} \\
\nu(\tautil) & \geq \reward{\pitil} 
\implies (C - \lambda^*) |\tautil| 
 \leq \beta \implies (b - \const{\pitil})_{+} \leq \frac{\beta}{C - \lambda^*} 
\end{align*}
\end{proof}

\subsection{Proof of Lemma~\ref{lemma:sd}}

\begin{restatable}{lemma}{}
The objective~\cref{eq:objective} satisfies strong duality, and the optimal dual variables are bounded as $\lambda^* \leq \frac{1}{\zeta (1 - \gamma)}$, where $\zeta := \max_{\pi} \const{\pi} - b > 0$. 
\label{lemma:sd-app}
\end{restatable}
\begin{proof}
Starting from the Lagrangian form in~\cref{eq:objective-saddle},
\begin{align*}
\reward{*} & := \max_{\pi} \min_{\lambda \geq 0} \reward{\pi} + \lambda [\const{\pi} - b] \\
\intertext{Using the linear programming formulation of CMDPs in terms of the state-occupancy measures $\mu$, we know that both the objective and the constraint are linear functions of $\mu$, and strong duality holds w.r.t.\ $\mu$. Since $\mu$ and $\pi$ have a one-to-one mapping, we can switch the min and the max~\citep{paternain2019constrained}, implying,} 
\reward{*}& = \min_{\lambda \geq 0} \max_{\pi} \reward{\pi} + \lambda [\const{\pi} - b] \\
& = \max_{\pi} \reward{\pi} + \lambda^* [\const{\pi} - b]\,. \\
\intertext{Define $\pitil := \argmax_{\pi} \const{\pi}$. Then,}
\reward{*}& \geq \reward{\pitil} + \lambda^* [\const{\pitil} -b] \\
\implies \lambda^* & \leq \frac{\reward{*} - \reward{\pitil}}{[\const{\pitil} -b]} \leq \frac{1}{(1 - \gamma) \zeta}\,.
\end{align*}
\end{proof}



\subsection{Proofs for Section~\ref{sec:putting-together}}

\begin{restatable}{lemma}{lspe-extrapolation}
For policy $\pi$, any distribution $\omega$ and subset $\cC$, if we use $m$ trajectories to estimate the action-value function for each $(s,a) \in \cC$, and solve~\cref{eq:lspg} to compute $\thetahat_r^\pi$, then for any $(s,a) \in (\cS \times \cA)$ pair, the error $\vert \langle \phi(s,a), \thetahat_r^\pi \rangle - \rewardq{\pi} \vert$ can be upper-bounded by   
\begin{align*}
\epsb (1+ \indnorm{\phi(s,a)}{G_{\omega}^{\dagger}}) \nonumber + \frac{\indnorm{\phi(s,a)}{G_{\omega}^{\dagger}}}{1 - \gamma} \, \sqrt{\frac{\log(2 |\cC| /\delta)}{2 m}},
\end{align*}
where $G_{\omega} = \sum_{(s,a) \in \cC} \omega(s,a) \phi(s,a) \phi(s,a)^{\transpose}$ and $A^\dagger$ is pseudoinverse of $A$.
\label{lemma:lspe-extrapolation}
\end{restatable}
\begin{proof}
By solving $\thetahat^\pi_r = \argmin_{\theta} \sum_{z \in \cC} \omega(z) \left[ \langle \theta, \phi(z) \rangle - q_r(z) \right]^2$ (~\cref{eq:lspg}), we get that
\[
    \theta_r^\pi = G_{\omega}^{\dagger} \sum_{(s,a) \in \cC} \omega(s,a) \phi(s,a) \, q_r^\pi(s,a)
\]
and let us denote $z=(s,a)$, $\phi(z)=\phi(s,a)$ and $\epsilon(z)=q_r(z)-\rewardq{\pi}(z)+\rewardq{\pi}(z)- \langle \phi(z), \theta_r^* \rangle$, where $\theta^*_r := \argmin_{\theta} \max_{(s,a)} \norm{Q_r^\pi(s,a) - \langle \theta, \phi(s,a) \rangle}$ is the optimal parameter for the given policy $\pi$. 

Therefore we can write $q_r(z)= \epsilon(z)+\langle \phi(z), \theta_r^* \rangle$ 
\begin{align*}
\vert \langle \phi(z), \theta_r^\pi \rangle - \rewardq{\pi}(z) \vert & =\vert \langle \phi(z), \theta_r^\pi \rangle-\langle \phi(z), \theta_r^* \rangle+\langle \phi(z), \theta_r^* \rangle - \rewardq{\pi}(z) \vert \\
& \leq \vert \langle \phi(z), \theta_r^\pi \rangle - \langle \phi(z), \theta_r^* \rangle \vert + \epsilon_b \quad \text{[$\epsilon_b$ from \cref{assum:linear-realizability}]}
\end{align*}
Now we need to bound the first term of above inequality. Based on the definition of $\epsilon(z)$, we can write $\theta_r^\pi = G_{\omega}^{\dagger} \sum_{z' \in \cC} (\langle \phi(z'), \theta_r^* \rangle + \epsilon(z')) \, \omega(z') \phi(z')$. Using this equality we can get easily that: 
\begin{align*}
\vert \langle \phi(z), \theta_r^\pi\rangle - \langle \phi(z), \theta_r^* \rangle \vert& =\vert  \sum_{z' \in \cC} \epsilon(z') \omega(z') \phi(z)^T G_{\omega}^{\dagger} \phi(z')\vert\\ 
& \leq \sum_{z' \in \cC} \vert \epsilon(z') \vert \vert \omega(z') \phi(z)^T G_{\omega}^{\dagger} \phi(z')\vert  \\
&\leq \max_{z' \in \cC} \vert \epsilon(z') \vert \sum_{z' \in \cC} \vert \omega(z') \phi(z)^T G_{\omega}^{\dagger} \phi(z') \vert
\end{align*}
To bound the sum term, we use Jensen's inequality:
\begin{align*}
    \left (\sum_{z' \in \cC} \omega(z') \vert \phi(z)^T G_{\omega}^{\dagger} \phi(z') \vert\right)^2 & \leq \sum_{z' \in \cC} \omega(z') \left ( \vert\phi(z)^T G_{\omega}^{\dagger} \phi(z') \vert\right)^2 & \tag{Jensen's inequality} \\
    &= \phi(z)^T G_{\omega}^{\dagger} \left(  \sum_{z' \in \cC}  \left[\omega(z') \phi(z') \phi(z')^T\right]  \right) G_{\omega}^{\dagger} \phi(z)\\
    & = \|\phi(z)\|^2_{G_{\omega}^{\dagger}}
\end{align*}

To finish the proof we need to bound $\max_{z' \in \cC} \vert \epsilon(z') \vert$. Based on the definition of $\epsilon(z')$ we have
\begin{align*}
\vert \epsilon(z') \vert & \leq \vert q_r(z')-\rewardq{\pi}(z') \vert + \vert \rewardq{\pi}(z')- \langle \phi(z'), \theta_r^* \rangle \vert \\ 
& \leq \vert q_r(z')-\rewardq{\pi}(z') \vert + \epsilon_b \\
& \leq \frac{1}{1 - \gamma} \,   \sqrt{\frac{\log(2  /\delta)}{2 m}} + \epsilon_b
\end{align*}
where the second inequality is due to function approximation error (\cref{assum:linear-realizability}) and the last inequality comes from Hoeffding’s inequality. Specifically, since the $m$ trajectories are independent, and the action-value functions lie in the $[0, \nicefrac{1}{1- \gamma}]$ range, we use Hoeffding's inequality to conclude that the sampling error for each $z \in \cC$ can be upper-bounded by $\frac{1}{1 - \gamma} \sqrt{\frac{\log(2/\delta)}{2 m}}$. Since we desire uniform control over all states and actions in $\cC$, by union bound, with probability $1 - \delta$, $|q_r(z) - \rewardq{\pi}(z)| \leq \frac{1}{1 - \gamma} \sqrt{\frac{\log(2 |\cC|/\delta)}{2 m}}$. Putting everything together we get the result. 
\end{proof}


\begin{restatable}{corollary}{}
Under~\cref{assum:linear-realizability}, \texttt{OG} and \texttt{CV} of \Alg/ can be bounded as:
\begin{align*}
\texttt{\text{OG}} & \leq \frac{
\left(\frac{3 (1 + U) \, \sqrt{1 + \text{KL}(\pi_0 || \pi^*)}}{1 - \gamma} +  \Psi \right)}{(1 - \gamma) \sqrt{T}}+ \frac{\tvarepsilon (1 + 2 U)}{1 - \gamma}, \\
\texttt{\text{CV}} &\leq  \frac{\zeta \left(\frac{3 (1 + U) \, \sqrt{1 + \text{KL}(\pi_0 || \pi^*)}}{1 - \gamma} +  \Psi \right)}{\sqrt{T}} + \zeta \, \tvarepsilon (1 + 2 U),
\end{align*}
where $U = \frac{2}{\zeta (1 - \gamma)}$, $\tvarepsilon = \epsb (1+\sqrt{d}) + \frac{\sqrt{d}}{1 - \gamma} \sqrt{\frac{\log(2 d (d+1) /\delta)}{2 m}}$ and $\Psi= 4U\sqrt{\log((T+1)U)} + 1$.
\label{cor:cb-lfa}
\end{restatable}  
\begin{proof}
The proof is similar to the proof of~\cref{cor:cb} but with a different $\tvarepsilon$.  
\end{proof}



\clearpage
%!TEX root =  main.tex
\section{Additional Implementation Details}
\label{app:exp-details}
In~\cref{app:practical-cbp-algo}, we describe a more practical variant of \Alg/ and in~\cref{app:g-experimental-algo}, we describe the offline G-experimental design procedure required to form the coreset $\cC$ for \Alg/. Details about the synthetic tabular environment are presented in~\cref{app:des-tabular-env} whereas~\cref{app:hyperparameters} details the hyperparameters used across the different experiments. 

\subsection{Practical Coin-Betting Politex algorithm}
\label{app:practical-cbp-algo}
We present the practical version of CBP which uses a parameter $\alpha_\lambda$ in \cref{alg:cbp-practical}.
\begin{algorithm}[!ht]
\LinesNumbered
\caption{Practical Coin-Betting Politex}
\label{alg:cbp-practical}
    \textbf{Input}: $\alpha_\lambda>0$ (parameter), $\pi_0$ (policy initialization), $\lambda_0$ (dual variable initialization), $m$ (Number of trajectories), $T$ (Number of iterations), Feature map $\Phi$. \vspace{1ex} \\
    
    \textbf{Initialize}: $L_{-1}=0$\\
    
    Compute coreset $\cC$ and distribution $\omega$  \vspace{1ex} \\
    
    Solve the unconstrained problem $\max_{\pi} \consthat{\pi}$ to estimate $\zeta$ in~\cref{lemma:sd} and set $U = \frac{2}{\zeta (1 - \gamma)}$.  \vspace{1ex}
    
    \For{$t \leftarrow 0$  \KwTo  $T-1$}{
    For every $(s,a) \in \cC$, use $m$ trajectories starting from $(s,a)$ using policy $\pit$ and estimate the action-value functions $q_r(s,a)$ and $q_c(s,a)$.  \vspace{1ex}
    
    Compute and store $\thetahat_r^{\pit}$ and $\thetahat_c^{\pit}$ using~\cref{eq:lspg}.  \\

    \For{every $s$ encountered in the trajectory generated by $\pit$, and for every $a$}{\vspace{1ex}
    Compute 
    $\rewardqhat{t}(s,a)  = \langle \thetahat^{\pit}_{r}, \phi(s,a) \rangle$; $\constqhat{t}(s,a) = \langle \thetahat^{\pit}_{c}, \phi(s,a) \rangle$ and $\lagqhat{t}(s,a)  = \rewardqhat{t}(s,a) + \lambda_{t} \, \constqhat{t}(s,a)$. \vspace{1ex}
    
    Update policy,
    \begin{align*}
    \lagahat{t}(s,a) &= \frac{1 - \gamma}{1 + U} \, \left[\lagqhat{t}(s,a) - \left \langle \lagqhat{t}(s,\cdot), \pit(\cdot|s) \right \rangle \right]\\
    \lagatilde{t}(s,a) &= \lagahat{t}(s,a) \, \cI\{w_t(s,a) > 0\} + [\lagahat{t}(s,a)]_{+} \, \cI\{w_t(s,a) \leq 0\}\\
    w_{t+1}(s,a) &= \frac{\sum_{i=0}^{t} \lagatilde{i}(s,a)}{(t+1) + T/2} \left(1 + \sum_{i=0}^{t} \lagatilde{i}(s,a) \, w_{i}(s,a) \right)\\
    \pitt(a|s) &= 
    \begin{cases}
        \pi_0(a|s), \quad \text{if } \sum_{a}{\pi_0(a|s) \, [w_{t+1}(s,a)]_{+}} = 0 \\
        \frac{\pi_0(a|s) \, [w_{t+1}(s,a)]_{+}}{\sum_{a}{\pi_0(a|s) \, [w_{t+1}(s,a)]_{+}}}, \quad \text{otherwise}.
    \end{cases}    
    \end{align*}
    }
    Update dual variable,
    \begin{align*}
    \hat V_c^t(\rho)& = \langle \rho(\cdot), \langle \constqhat{t}(s,\cdot), \pit(\cdot|s) \rangle\rangle\\
    g_t &= b - \hat V_c^t(\rho)\\
    L_t &= \max(L_{t-1}, |g_t|)\\
    \lambda_{t+1} &= \lambda_0 + \frac{\sum_{i=0}^{t} g_i}{L_t \max(\sum_{i=0}^t |g_i| + L_t, \alpha_\lambda L_t)}\Big( L_t + \sum_{i=0}^t [(\lambda_i - \lambda_0)g_i]_{+} \Big)
    \end{align*}
    }
    % \Return mixture policy $\bar{\pi}_{T} := \frac{\sum_{t = 0}^{T-1} \pit}{T}$. 
\end{algorithm}

\subsection{Offline G-Experimental Design to build coreset $\cC$}
\label{app:g-experimental-algo}
We use offline G-experimental design to form the coreset in Line~$2$ of~\cref{alg:cbp}. In particular, we use the greedy iterative algorithm in~\cref{alg:g-design-coreset-algo} to build $\cC$: in iteration $\tau$, go through all the states and actions adding the $(s,a)$ pair (to $\cC$) with the highest marginal gain computed as $\textsl{g}_{\tau}(s,a) := \indnorm{\phi(s,a)}{G_{\tau}^{\dagger}}$. Here $G_{\tau}$ is the Gram matrix formed by the features of the $(s,a)$ pairs present in $\cC$ at iteration $\tau$. For a specified input $\epsilon' > 0$, the algorithm terminates at iteration $\Tau$ when $\max_{(s,a)} \textsl{g}_{\Tau}(s,a) \leq \epsilon'$. Hence, the algorithm directly controls $\sup_{(s,a)} \indnorm{\phi(s,a)}{G_{\omega}^{\dagger}} \leq \epsilon'$ in~\cref{lemma:lspe-extrapolation}, and hence controls $\epss$ in practice. However, this procedure does not have a guarantee on how large $\vert \cC \vert$ can be. In practice, we set $\epsilon'$ such that $\vert \cC \vert = O(d)$. Although we only consider forming the coreset in an offline manner that involves iterating through all $SA$ state-action pairs, efficient online variants forming the coreset while running the algorithm have been developed recently~\citep{li2021sample}. Such techniques are beyond the scope of this paper and we plan to explore them in future work.

\begin{algorithm}[!h]
\LinesNumbered
\caption{Coreset $\cC$ formation using G-experimental design}
\label{alg:g-design-coreset-algo}
    \textbf{Input}: $\Phi$ (Feature map), $\epsilon'>0$ (tolerance parameter), $\nu = 1$ (default value). \vspace{1ex} \\
    
    \textbf{Initialize}: $G^{\dagger} = \frac{1}{\nu} \cI_{d}$, $\cC = \emptyset$, $g_{max} = \infty$ (maximum marginal gain).\\
    \While{$g_{max} > \epsilon'$}{
        $g_{max} = 0$\\
        \For{ $\forall (s,a) \in (\cS \times \cA)$}{
            Compute $g(s,a) = \sqrt{\phi(s,a)^{\transpose} G^{\dagger} \phi(s,a)}$ \quad [marginal gain]\\
            \If{$g_{max} < g(s,a)$}{
            $(s^*, a^*) = (s,a)$\\
            $g_{max} = g(s,a)$
            }
        }
        $\cC = \cC \cup \{(s^*, a^*)\}$ \\
        $G^{\dagger} = G^{\dagger} - \frac{G^{\dagger} \phi(s^*, a^*) \phi(s^*, a^*)^{\transpose} G^{\dagger}}{1 + \phi(s^*, a^*)^{\transpose} G^{\dagger} \phi(s^*, a^*)}$ \quad [Sherman-Morrison to compute $\left(G + \phi(s^*, a^*) \,\phi(s^*, a^*)^{\transpose}\right)^{\dagger}$]
        
    }
\end{algorithm}

\clearpage
\subsection{Synthetic Tabular Environment}
\label{app:des-tabular-env}
In \cref{fig:gridworld}, we show the synthetic tabular environment which is modified from Example 3.5 \citep{sutton2018reinforcement} to add the constraint rewards.
\begin{figure}[!ht]
    \centering
    \includegraphics[scale=0.3]{plot/Grid/5x5Grid.png}
    \caption{\textbf{Tabular environment:} A $5 \times 5$ gridworld environment where all actions result in reward ($r$) and constraint reward ($c$) of $0$, except in the special states denoted by $A$ and $B$. All four actions in states $(A,B)$ transition the agent to states $(A', B')$ and result in rewards of $(1, 0.5)$ and constraint rewards of $(0.1, 1)$ respectively. The remaining transitions incur zero reward and zero constraint reward.}
    % \caption{\textbf{Tabular environment:} A $5X5$ gridworld environment where all actions results in reward($r$) and constraint reward($c$) as $0$, except special states denoted by $A$ and $B$. All four actions in states $(A,B)$ transitions the agent to states $(A', B')$ and results in reward as $(1, 0.5)$ and constraint reward as $(0.1, 1)$ respectively. Remaining states for all the cardinal actions receive zero rewards ($r, c$).}
    \label{fig:gridworld}
\end{figure}

\subsection{Hyper-parameters}
\label{app:hyperparameters}
\begin{table}[!h]
\caption[]{\textbf{Hyperparameters for tabular setting:} Shows the hyperparameters for different algorithms CBP, GDA and CRPO for experiments in~\cref{app:expaddtab}.}
\label{tab:hyperparameter-tabular}
\centering
\begin{tabular}{|c|c|c|c|}
\hline
Experiments          & \textbf{CBP}       & \textbf{GDA}                                                                     & \textbf{CRPO}     \\ \hline
\textit{Model-based} & $\alpha_\lambda=8$ & \begin{tabular}[c]{@{}c@{}}$\alpha_\pi=1.0$,\\ $\alpha_\lambda=0.1$\end{tabular} & $\alpha_\pi=0.75,\eta=0.0$ \\ \hline
\textit{Model-free}  & $\alpha_\lambda=8$ & \begin{tabular}[c]{@{}c@{}}$\alpha_\pi=1.0$,\\ $\alpha_\lambda=0.1$\end{tabular} & $\alpha_\pi=0.75,\eta=0.0$ \\ \hline
\end{tabular}
\end{table}
%%%%%
\begin{table}[!h]
\caption[]{\textbf{Hyperparameters for LFA setting with sampling:} Shows the hyperparameters used for different feature dimensions $d$ for CBP, GDA and CRPO with a fixed number of samples for $\hat Q$ approximations (for gridworld experiments in \cref{app:lfa-exp}).}
\centering
\begin{tabular}{|l|l|l|l|}
\hline
\textbf{Algorithms} & $d=40$                  & $d=56$                  & $d=80$                 \\ \hline
\textbf{CBP}        & $\alpha_\lambda=0.25$ & $\alpha_\lambda=0.25$ & $\alpha_\lambda=0.1$ \\ \hline
\textbf{GDA} &
  \begin{tabular}[c]{@{}l@{}}$\alpha_{\pi}=1.0$,\\ $\alpha_{\lambda}=0.1$\end{tabular} &
  \begin{tabular}[c]{@{}l@{}}$\alpha_\pi=1.0$,\\ $\alpha_\lambda=1.0$\end{tabular} &
  \begin{tabular}[c]{@{}l@{}}$\alpha_\pi=1.0$,\\ $\alpha_\lambda=0.1$\end{tabular} \\ \hline
\textbf{CRPO}       & $\alpha_\pi=0.75$     & $\alpha_\pi=0.75$     & $\alpha_\pi=0.75$    \\ \hline
\end{tabular}
\label{tab:hyper_lfa_sampling_tc}
\end{table}

\begin{table}[!h]
\caption[]{\textbf{Hyperparameters for Cartpole environment}: Shows the best hyperparameter for different values of entropy regularization coefficient $\nu$ and different algorithms namely CBP, GDA and CRPO (for experiments in \cref{app:lfa-exp}).}
\centering
\begin{tabular}{|c|c|c|c|c|}
\hline
\textit{Algorithms} & $\boldsymbol{\nu=0}$     & $\boldsymbol{\nu=0.1}$   & $\boldsymbol{\nu=0.01}$  & $\boldsymbol{\nu=0.001}$          \\ \hline
CBP                 & $\alpha_\lambda=0.1$ & $\alpha_\lambda=0.1$ & $\alpha_\lambda=5.0$ & $\alpha_\lambda=0.5$ \\ \hline
GDA &
  \begin{tabular}[c]{@{}c@{}}$\alpha_\pi=0.01$,\\ $\alpha_\lambda=0.001$\end{tabular} &
  \begin{tabular}[c]{@{}c@{}}$\alpha_\pi=0.001$,\\ $\alpha_\lambda=0.001$\end{tabular} &
  \begin{tabular}[c]{@{}c@{}}$\alpha_\pi=0.1$,\\ $\alpha_\lambda=0.1$\end{tabular} &
  \begin{tabular}[c]{@{}c@{}}$\alpha_\pi=0.01$,\\ $\alpha_\lambda=0.0001$\end{tabular} \\ \hline
\textit{CRPO} &
  \begin{tabular}[c]{@{}c@{}}$\alpha_\pi=0.1$,\\ $\eta=10$\end{tabular} &
  \begin{tabular}[c]{@{}c@{}}$\alpha_\pi=0.1$,\\ $\eta=0$\end{tabular} &
  \begin{tabular}[c]{@{}c@{}}$\alpha_\pi=0.5$,\\ $\eta=10$\end{tabular} &
  \begin{tabular}[c]{@{}c@{}}$\alpha_\pi=0.5$,\\ $\eta=0$\end{tabular} \\ \hline
\end{tabular}
\label{tab:hyper_lfa_cartpole}
\end{table}
\clearpage
%!TEX root =  main.tex
\section{Additional Experimental Results}
\label{app:experiments}

\subsection{Tabular Setting}
\label{app:expaddtab}
\paragraph{Model-based setting:} In \cref{fig:CB_GDA_best_param_MB}, we demonstrate the performance -- optimality gap (OG) and constraint violation (CV) -- with the best hyperparameters for three algorithms, namely CBP, GDA and CRPO. In addition, we show the performance of GDA with the theoretical learning rates for $\pi$ and $\lambda$ to highlight the importance of tuning GDA's hyperparameters in practice. We observe that the OG converges to zero more quickly for our CBP than for GDA and CRPO, while satisfying the constraint (i.e., $CV\leq0$). Ideally, both OG and CV converge to $0$. Refer to \cref{tab:hyperparameter-tabular} for the best hyperparameter values. We used the following ranges of hyperparameters. For CBP, $\alpha_\lambda=\{1,2,5,8,15,50,100,300,500\}$. The hyperparameters of GDA were varied as $\alpha_\pi = \{0.001, 0.01, 0.1, 1.0\}$ (learning rate for the policy) and $\alpha_\lambda=\{0.0001, 0.001, 0.01, 0.1, 1.0\}$ (learning rate for the dual variable). For CRPO, the learning rate of the policy was varied as $\alpha_\pi=\{0.001, 0.01, 0.05, 0.1, 0.5, 0.75\}$ and the tolerance parameter as $\eta = \{0, 0.25\}$. 
\label{app:tab-exp}
\begin{figure}[!ht]
        \centering
		\begin{subfigure}[b]{0.25\linewidth}
			\centering
			\captionsetup{justification=centering}
			\includegraphics[width=0.9\textwidth]{plot/tabular/model_based/best_hyperparam/OG_with_theory_LR.png}
			\caption[]{\small{OG}}
		\end{subfigure}
		\begin{subfigure}[b]{0.25\linewidth}
			\centering
			\captionsetup{justification=centering}
			\includegraphics[width=0.9\textwidth]{plot/tabular/model_based/best_hyperparam/CV_with_theory_LR.png}
			\caption[]{\small{CV}}
		\end{subfigure}
		\caption[]
        {\textbf{Model-based in tabular case:} OG and CV for CB, GDA and CRPO with best hyperparameters. The results are averaged over $5$ runs with $95\%$ confidence interval. We assume that we have access to true CMDP. The best hyperparameter has the least OG and satisfies the condition $CV \in[-0.25,0]$. We also show the performance of baseline GDA with theoretical $\alpha_\pi=\sqrt{\frac{2 \log |A|}{T}}\frac{1-\gamma}{1+U}$ and $\alpha_\lambda = \frac{U(1-\gamma)}{\sqrt{T}}$. Here, $U=\frac{2}{\zeta(1-\gamma)}$. This is shown in blue dashed line.}
        \label{fig:CB_GDA_best_param_MB}
\end{figure}

\paragraph{Model-free setting:} Here, we test the performance of algorithms in the model-free setting (don't have access to true CMDP model). We use TD(0) based sampling approach \citep{sutton1988learning} to estimate the $Q$ action-value function. We sample data for all $(s,a) \in \cS \times \cA$. In \cref{fig:CB_GDA_MF_TDSampling}, we observe the effect on performance by varying the number of samples for $Q$ action-value estimation. Here, we consider one-hot encoded features (no overlapping of features). We observe that CBP consistently converges faster than its counterpart in the sampling-based approach. Further, it also matches the expectation that the performance improves with the increase in number of samples. In~\cref{fig:MF_sampling_1_hot_hyperparam_sensitivity,fig:MF_sampling_1_hot_gamma}, we show the robustness of CBP with hyperparameter sensitivity and environment misspecification respectively. The hyperparameters used for these experiments are presented in \cref{tab:hyperparameter-tabular} in \cref{app:hyperparameters}.
% \begin{figure}[!ht]
% \centering
% 		\begin{subfigure}[b]{0.25\linewidth}
% 			\centering
% 			\captionsetup{justification=centering}
% 			\includegraphics[width=0.9\textwidth]{plot/tabular/model_free/TD_Sampling/FinalResults/OG.png}
% 			\caption[]{\small{OG}}
% 		\end{subfigure}
% 		\begin{subfigure}[b]{0.25\linewidth}
% 			\centering
% 			\captionsetup{justification=centering}
% 			\includegraphics[width=0.9\textwidth]{plot/tabular/model_free/TD_Sampling/FinalResults/CV.png}
% 			\caption[]{\small{CV}}
% 		\end{subfigure}
% 		\caption[]
%         {\textbf{Model-free tabular case:} We don't have access to true CMDP model here. We vary the $\text{number of samples}=\{1000, 2000, 3000\}$ for $Q$ value estimation to observe the effect on the performance. The results are averaged over $5$ runs. The performance improves with increase in samples. CBP consistently performs better than the baselines GDA and CRPO.}
%         \label{fig:CB_GDA_MF_TDSampling}
% \end{figure}


\begin{figure}[!ht]
        \centering
		\begin{subfigure}[b]{0.25\linewidth}
			\centering
			\captionsetup{justification=centering}
			\includegraphics[width=0.9\textwidth]{plot/tabular/model_free/TD_Sampling/FinalResults/HyperParameter/OG.png}
			\caption[]{\small{OG}}
		\end{subfigure}
		\begin{subfigure}[b]{0.25\linewidth}
			\centering
			\captionsetup{justification=centering}
			\includegraphics[width=0.9\textwidth]{plot/tabular/model_free/TD_Sampling/FinalResults/HyperParameter/CV.png}
			\caption[]{\small{CV}}
		\end{subfigure}
		\caption[]
        {\textbf{Sensitivity to hyperparameters in model-free gridworld environment:} Performance with different hyperparameters for CB, GDA and CRPO. The results are averaged over $5$ runs with $95\%$ confidence interval. We used $2000$ samples for $\forall (s,a) \in \cS \times \cA$ to estimate $Q$ values for both reward and cost. The results are demonstrated on one-hot features (with no feature overlap). The hyperparameters for CB are $\alpha_\lambda=\{1, 2, 5, 8, 15, 50, 100, 300, 500\}$. The hyperparameter for GDA are $\alpha_\pi = \{0.001, 0.01, 0.1, 1.0\}$ (learning rates for policy) and $\alpha_\lambda=\{0.0001, 0.001, 0.01, 0.1, 1.0\}$ (learning rate for dual variable). The hyperparameter for CRPO are $\alpha_\pi= \{0.001, 0.01, 0.05, 0.1, 0.5, 0.75\}$. We use $\eta=0.0$ for CRPO. The key observation is that CBP is robust against the variations in hyperparameters with a smaller variance in performance against multiple runs.}
        \label{fig:MF_sampling_1_hot_hyperparam_sensitivity}
\end{figure}

\begin{figure}[!ht]
        \centering
		\begin{subfigure}[b]{0.25\linewidth}
			\centering
			\captionsetup{justification=centering}
			\includegraphics[width=0.9\textwidth]{plot/tabular/model_free/TD_Sampling/FinalResults/DiscountFactor/OG_param_sens.png}
			\caption[]{\small{OG}}
		\end{subfigure}
		\begin{subfigure}[b]{0.25\linewidth}
			\centering
			\captionsetup{justification=centering}
			\includegraphics[width=0.9\textwidth]{plot/tabular/model_free/TD_Sampling/FinalResults/DiscountFactor/CV_param_sens.png}
			\caption[]{\small{CV}}
		\end{subfigure}
		\caption[]
        {\textbf{Environment misspecification in model-free gridworld by varying discount factor $\gamma$:}  We only introduce sampling error by estimating the $Q$ function with $2000$ samples for all $(s,a)$ pairs for all three algorithms (no feature overlap). We vary the discount factor $\gamma=\{0.7, 0.8\}$ to observe the effects of environment misspecification on CBP, GDA and CRPO. We keep the hyperparameters fixed for all the algorithms, similar to the ones for the original CMDP with $\gamma=0.9$. The results are averaged over $5$ runs with $95\%$ confidence interval. The hyperparameters used are reported in \cref{tab:hyperparameter-tabular}. We observe that CRPO does not even satisfy the constraint when $\gamma=0.8$ ($CV>0$). Further, our CBP converges consistently faster than the baselines.}
        \label{fig:MF_sampling_1_hot_gamma}
\end{figure}


\subsection{Linear setting}
\label{app:lfa-exp}
\paragraph{Gridworld environment:} We use the $5\times5$ gridworld environment as shown in \cref{fig:gridworld}. Tile coding is used to learn the feature representation for every $(s,a)$ pair in the environment. We use a single tiling and vary the tiling size to change the dimension of the features (feature overlap for multiple $(s,a)$ pairs). In \cref{fig:lfa-gridworld-hyperparmeter-sensitivity} we show the sensitivity of the performance to hyperparameters for all three algorithms with different feature dimensions $d$. The values of all other parameters were kept fixed. A similar observation holds here: CBP is robust to varying values of hyperparameters. The range of hyperparameters is similar to the one in \cref{fig:MF_sampling_1_hot_hyperparam_sensitivity}.
\begin{figure}[!ht]
        \centering
		\begin{subfigure}[b]{0.25\linewidth}
			\centering
			\captionsetup{justification=centering}
			\includegraphics[width=0.9\textwidth]{plot/tabular/model_free/TileCodingFeatures/FinalResults_TC/hyperparam_search/OG_tile10.png}
			\caption[]{\small{OG ($d=40$)}}
		\end{subfigure}
		\begin{subfigure}[b]{0.25\linewidth}
			\centering
			\captionsetup{justification=centering}
			\includegraphics[width=0.9\textwidth]{plot/tabular/model_free/TileCodingFeatures/FinalResults_TC/hyperparam_search/CV_tile10.png}
			\caption[]{\small{CV ($d=40$)}}
		\end{subfigure}
		
		\begin{subfigure}[b]{0.25\linewidth}
			\centering
			\captionsetup{justification=centering}
			\includegraphics[width=0.9\textwidth]{plot/tabular/model_free/TileCodingFeatures/FinalResults_TC/hyperparam_search/OG_tile14.png}
			\caption[]{\small{OG ($d=56$)}}
		\end{subfigure}
		\begin{subfigure}[b]{0.25\linewidth}
			\centering
			\captionsetup{justification=centering}
			\includegraphics[width=0.9\textwidth]{plot/tabular/model_free/TileCodingFeatures/FinalResults_TC/hyperparam_search/CV_tile14.png}
			\caption[]{\small{CV ($d=56$)}}
		\end{subfigure}
		
		\begin{subfigure}[b]{0.25\linewidth}
			\centering
			\captionsetup{justification=centering}
			\includegraphics[width=0.9\textwidth]{plot/tabular/model_free/TileCodingFeatures/FinalResults_TC/hyperparam_search/OG_tile20.png}
			\caption[]{\small{OG ($d=80$) }}
		\end{subfigure}
		\begin{subfigure}[b]{0.25\linewidth}
			\centering
			\captionsetup{justification=centering}
			\includegraphics[width=0.9\textwidth]{plot/tabular/model_free/TileCodingFeatures/FinalResults_TC/hyperparam_search/CV_tile20.png}
			\caption[]{\small{CV ($d=80$)}}
		\end{subfigure}
		\caption[]
        {\textbf{Linear function approximation in gridworld environment:} We approximate the $Q$ value using LSTDQ with $300$ samples for each $(s,a)$ pair. The dimension of the features (denoted by $d$) is varied to observe the sensitivity to a range of hyperparameter values. We kept all the other parameters fixed. We use (a,b) $d=40$, (c,d) $d=56$, (e,f) $d=80$ dimension features respectively. \cAlg/ is consistently robust to variation in the hyperparameters for different dimensions as compared to baselines \cGDA/ and \cCRPO/.}
        \label{fig:lfa-gridworld-hyperparmeter-sensitivity}
\end{figure}
% \todoc{``The dimension of the features (denoted by $d$) are varied to observe the sensitivity to hyperparameters''. Which hyperparameters? Were all the other ``parameters'' kept the same?}

\paragraph{G-experimental design for gridworld environment:} In~\cref{fig:g-experimental} we show the performance with G-experimental design (\cref{app:g-experimental-algo}). Here subset of $(s,a)\in \cC$ pairs are chosen from a \textit{coreset}. 
\begin{figure}[!ht]
\centering
		\begin{subfigure}[b]{0.25\linewidth}
			\centering
			\captionsetup{justification=centering}
			\includegraphics[width=0.9\textwidth]{plot/tabular/model_free/TileCodingFeatures/FinalResults_TC/KW/OG.png}
			\caption[]{\small{OG}}
		\end{subfigure}
		\begin{subfigure}[b]{0.25\linewidth}
			\centering
			\captionsetup{justification=centering}
			\includegraphics[width=0.9\textwidth]{plot/tabular/model_free/TileCodingFeatures/FinalResults_TC/KW/CV.png}
			\caption[]{\small{CV}}
		\end{subfigure}
		\caption[]
        {\textbf{G-experimental design:} We show the performance with G-experimental design, where subset of $(s,a)\in \cC$ are chosen to learn the  weight vectors of $Q$ function. Here, we are in model-free setting with $d=56$ dimensional features in LFA. We used $300$ samples for $c \in \cC$ to learn the estimate of $Q$ for all three algorithms.}
        \label{fig:g-experimental}
\end{figure}


 \paragraph{Exploration in continuous state-spaces:} We used G-experimental design for the discrete state-action environment in the previous section. However, such a procedure is difficult to implement for the continuous state-action spaces we consider in this section. In order to achieve enough exploration in practice, similar to~\citet{xu2021crpo}, we use entropy regularization~\citep{geist2019theory,cen2021fast} for the policy updates. Specifically, for a specified regularization parameter $\nu$, our task is to find a sequence of policies $\{\pi_0, \pi_1, \ldots, \pi_{T-1}\}$ that minimize the regularized primal regret,
\begin{align*}
\cR^{p}_{\nu}(\pi^*, T) & := \sum_{t = 0}^{T-1} \sum_{s \in \cS} \big[\langle \piopt(\cdot|s) - \pit(\cdot|s), \rewardqhat{t} + \lambdat \constqhat{t} \rangle + \nu \, d^{\pit}(s) \sum_{a \in \cA} \pit(a|s) \log(\pit(a|s)) \big]. 
\end{align*}
It can be easily seen~\citep{geist2019theory} that the form of the algorithm updates remain the same, but the action-value functions for policy $\pi$ need to be redefined to depend on the ``effective reward'' equal to $r(s,a) - \nu \log \pi(a|s)$. Therefore, the new $\hat Q_l^t$ with exploration is equal to $\hat Q_l^t(s,a) = \hat Q_r^t(s,a) +\lambdat \hat Q_c^t(s,a) - \nu \log \pi_t(a|s)$. 

\paragraph{Cartpole environment:} We added two constraint rewards ($c_1, c_2$) to the classic OpenAI gym Cartpole environment. (1) The cart receives a constraint reward of $c_1=0$ when it enters the areas $[-2.4, -2.2], [-1.3, -1.1], [1.1, 1.3], [2.2, 2.4]$, and receives $c_1=+1$ otherwise. (2) When the angle of the cart is less than $4$ degrees, it receives $c_2=+1$, and $c_2=0$ otherwise. Each episode is no longer than $200$ steps.

We used tile coding \citep{sutton2018reinforcement} to discretize the continuous state space of the environment. The dimension of the features is $2^{12}$. We used $8$ tilings, each with a grid size of $4\times 4$. To study the effect of adding exploration on the performance, we incorporated the entropy coefficient \citep{haarnoja2018soft,geist2019theory}. We varied the entropy regularizer $\nu = \{0, 0.1, 0.01, 0.001\}$. Refer to \cref{fig:cartpole_entropy_sensitivity} for the experiment with the $\nu$ coefficient.

We conducted the experiments with following $\alpha_\lambda$ parameter value of CBP $\{0.1, 0.5, 5, 50, 250, 500, 750, 1000\}$. For GDA, we varied the learning rate of policy $\alpha_\pi = \{0.1, 0.01, 0.001, 0.0001\}$ and learning rate of dual variable $\alpha_\lambda= \{0.1, 0.01, 0.001, 0.0001\}$. For CRPO baseline, the following values of learning rate of policy $\alpha_\pi=\{0.001, 0.005, 0.01, 0.05, 0.1, 0.5\}$ are experimented with. We kept the tolerance parameter $\eta$ of CRPO as $\{0, 10\}$. The best hyperparameters are summarized in \cref{tab:hyper_lfa_cartpole} for the different values of entropy regularizer $\nu$.
\begin{figure}[!ht]
        \centering
		\begin{subfigure}[b]{0.25\linewidth}
			\centering
			\captionsetup{justification=centering}
			\includegraphics[width=0.9\textwidth]{plot/Cartpole/5Runs/BReturn_eps_6_e0.0.png}
			\caption[]{\small{Return ($\nu=0$)}}
		\end{subfigure}
		\begin{subfigure}[b]{0.25\linewidth}
			\centering
			\captionsetup{justification=centering}
			\includegraphics[width=0.9\textwidth]{plot/Cartpole/5Runs/BCV1_eps_6_e0.0.png}
			\caption[]{\small{CV 1 ($\nu=0$)}}
		\end{subfigure}
		\begin{subfigure}[b]{0.25\linewidth}
			\centering
			\captionsetup{justification=centering}
			\includegraphics[width=0.9\textwidth]{plot/Cartpole/5Runs/BCV2_eps_6_e0.0.png}
			\caption[]{\small{CV 2 ($\nu=0$)}}
		\end{subfigure}
		
		\begin{subfigure}[b]{0.25\linewidth}
			\centering
			\captionsetup{justification=centering}
			\includegraphics[width=0.9\textwidth]{plot/Cartpole/5Runs/BReturn_eps_6_e0.1.png}
			\caption[]{\small{Return ($\nu=0.1$)}}
		\end{subfigure}
		\begin{subfigure}[b]{0.25\linewidth}
			\centering
			\captionsetup{justification=centering}
			\includegraphics[width=0.9\textwidth]{plot/Cartpole/5Runs/BCV1_eps_6_e0.1.png}
			\caption[]{\small{CV 1 ($\nu=0.1$)}}
		\end{subfigure}
		\begin{subfigure}[b]{0.25\linewidth}
			\centering
			\captionsetup{justification=centering}
			\includegraphics[width=0.9\textwidth]{plot/Cartpole/5Runs/BCV2_eps_6_e0.1.png}
			\caption[]{\small{CV 2 ($\nu=0.1$)}}
		\end{subfigure}
		
		\begin{subfigure}[b]{0.25\linewidth}
			\centering
			\captionsetup{justification=centering}
			\includegraphics[width=0.9\textwidth]{plot/Cartpole/5Runs/BReturn_eps_6_e0.01.png}
			\caption[]{\small{Return ($\nu=0.01$)}}
		\end{subfigure}
		\begin{subfigure}[b]{0.25\linewidth}
			\centering
			\captionsetup{justification=centering}
			\includegraphics[width=0.9\textwidth]{plot/Cartpole/5Runs/BCV1_eps_6_e0.01.png}
			\caption[]{\small{CV 1 ($\nu=0.01$)}}
		\end{subfigure}
		\begin{subfigure}[b]{0.25\linewidth}
			\centering
			\captionsetup{justification=centering}
			\includegraphics[width=0.9\textwidth]{plot/Cartpole/5Runs/BCV2_eps_6_e0.01.png}
			\caption[]{\small{CV 2 ($\nu=0.01$)}}
		\end{subfigure}
		
		\begin{subfigure}[b]{0.25\linewidth}
			\centering
			\captionsetup{justification=centering}
			\includegraphics[width=0.9\textwidth]{plot/Cartpole/5Runs/BReturn_eps_6_e0.001.png}
			\caption[]{\small{Return ($\nu=0.001$)}}
		\end{subfigure}
		\begin{subfigure}[b]{0.25\linewidth}
			\centering
			\captionsetup{justification=centering}
			\includegraphics[width=0.9\textwidth]{plot/Cartpole/5Runs/BCV1_eps_6_e0.001.png}
			\caption[]{\small{CV 1 ($\nu=0.001$)}}
		\end{subfigure}
		\begin{subfigure}[b]{0.25\linewidth}
			\centering
			\captionsetup{justification=centering}
			\includegraphics[width=0.9\textwidth]{plot/Cartpole/5Runs/BCV2_eps_6_e0.001.png}
			\caption[]{\small{CV 2 ($\nu=0.001$)}}
		\end{subfigure}
		\caption[]
        {\textbf{Cartpole environment:} We show the sensitivity to the entropy regularization coefficient $\nu$ for all three algorithms: CBP, GDA and CRPO. The performance is averaged over $5$ runs and shown with $95\%$ confidence intervals. Different rows correspond to different values of $\nu \in \{0, 0.1, 0.01, 0.001\}$. Darker lines show the performance with the best hyperparameters. Lighter lines show the performance with other hyperparameter values. The range of the hyperparameter for \Alg/ is $\alpha_\lambda \in \{0.1, 0.5, 5, 50, 250, 500, 750, 1000\}$. For GDA, we vary the learning rates of both the policy and the dual variable over $\{0.1, 0.01, 0.001, 0.0001\}$. For CRPO, we vary $\alpha_\pi \in \{0.001, 0.005, 0.01, 0.05, 0.1, 0.5\}$ and the tolerance hyperparameter $\eta \in \{0, 10\}$.}
        \label{fig:cartpole_entropy_sensitivity}
\end{figure}
\clearpage
\bibliography{jain_326}
\end{document}

% \nobibliography{jain_326-ref}
% \end{document}
