%\documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
% version; also before submission to
% see how the non-anonymous paper
% would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
% ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
\bibliographystyle{plainnat}
\renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage{amsmath,amssymb,amsfonts,amsthm}
\usepackage{algorithm,algorithmic}
\usepackage{mathtools, bbm}
\usepackage[mathscr]{euscript}
\usepackage{dsfont}
\usepackage{booktabs,multirow}
\usepackage{nicefrac}
\usepackage{subfig}
\newtheorem{definition}{Definition}
\newtheorem{proposition}{Proposition}
\newtheorem{lemma}{Lemma}
\newtheorem{assumption}{}
\renewcommand\theassumption{(A\arabic{assumption})}
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}{Corollary}
\newtheorem{remark}{Remark}
\newcommand{\R}{\mathbb{R}}
\newcommand{\E}{\mathbb{E}}
\newcommand{\N}{\mathbb{N}}
\newcommand{\p}{\mathbb{P}}
\newcommand{\1}{\mathds{1}}

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL.
% In the main paper, hard code any cross-reference to the supplementary material.
\usepackage{xr}
% \externaldocument{uai2023-template}
\externaldocument{vijayan_677}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
%\newcommand{\swap}[3][-]{#3#1#2} % just an example
\allowdisplaybreaks
\setcounter{lemma}{6}
\setcounter{equation}{36}

\title{A policy gradient approach for optimization of smooth risk measures\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
%\author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2023 paper}{Jane~J.~von~O'L\'opez}{}}
\author[1]{Nithia Vijayan}
\author[1]{Prashanth L.A.}
% Add affiliations after the authors
\affil[1]{%
    Department of Computer Science and Engineering,
    Indian Institute of Technology Madras, India.
}
\begin{document}
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle
\appendix

\section{Results for the policy gradient template}
\label{sec:SF_proofs}
\subsection{Results with true objective function $\rho(\cdot)$}
The following lemmas establish some results related to the SF-based gradient estimate.
\begin{lemma}
    \label{lm:del_rho_mu_2}
    $\mathbb{E}\left[\widehat{\nabla}_{\mu,n}\rho(\theta)\mid\theta\right] = \nabla\rho_{\mu}(\theta)$.
\end{lemma}
\begin{proof}
    We follow the technique from \cite{shamir}. Since $v_{1:n}$ are i.i.d.\ random variables whose distribution is symmetric about the origin, we obtain
    \begin {align*}
    \mathbb{E}\left[\widehat{\nabla}_{\mu,n}\rho(\theta)\mid\theta\right]
    &=\mathbb{E}_{v_{1:n}}\left[\widehat{\nabla}_{\mu,n}\rho(\theta)\right]
    =\frac{d}{2\mu n} \sum_{i=1}^{n}\mathbb{E}_{v}\left[\left(\rho({\theta+\mu v})- \rho({\theta-\mu v})\right)v\right] \\
    &=\frac{d}{2\mu} \left(\mathbb{E}_{v}\left[\rho({\theta+\mu v})v\right] + \mathbb{E}_{v}\left[\rho({\theta+\mu (- v)})(- v)\right]\right)
    =\frac{d}{\mu}\mathbb{E}_{v}\left[\rho({\theta+\mu v})v\right] =\nabla \rho_{\mu}(\theta),
    \end {align*}
    where the last equality follows from \cite[Lemma 2.1]{flaxman}.
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{lemma}
    \label{lm:bias_rho}
    Suppose $\forall \theta_1, \theta_2 \in \R^d$, $\left\lVert \nabla \rho (\theta_1)-\nabla \rho (\theta_2)\right\rVert \leq L_{\rho'} \left\lVert \theta_1 - \theta_2 \right\rVert$. Then $\left\lVert \nabla \rho_{\mu}(\theta) - \nabla \rho (\theta)\right\rVert \leq
    \frac{\mu d L_{\rho'}}{2}$.
\end{lemma}
\begin{proof}
    The result follows from \cite[Proposition 7.5]{gao18}.
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{lemma}
    \label{lm:var_rho}
    Suppose $\forall \theta \in \R^d$, $\rho(\theta)$ is bounded and $\forall \theta_1, \theta_2 \in \R^d$, $\left\lvert \rho(\theta_1)-\rho(\theta_2)\right\rvert \leq L_{\rho} \left\lVert \theta_1 - \theta_2 \right\rVert$. Then $\E\left[\left\lVert \widehat{\nabla}_{\mu,n} \rho(\theta) \right\rVert^2 \right]\leq \frac{d^2L_{\rho}^2}{n}$.
\end{lemma}
\begin{proof}
    Since $\forall v \in \mathbb{S}^{d-1},\;\lVert v \rVert = 1$, from  \eqref{eq:hat_nabla_rho_0}, we have
    \begin {align*}
    \E_{v_{1:n}}\left[\left\lVert\widehat{\nabla}_{\mu,n}\rho(\theta)\right\rVert^2 \right]
    &\stackrel{(a)}{\leq}\frac{d^2}{4\mu^2n^2}\sum_{i=1}^{n}\E_{v}\left[\left\lVert\left(\rho({\theta+\mu v}) - \rho({\theta-\mu v})\right) v\right\rVert^2\right]\\
    &\leq\frac{d^2}{4\mu^2n}\E_{v}\left[\left\lVert\left(\rho({\theta+\mu v}) - \rho({\theta-\mu v})\right)\right\rVert^2\lVert v \rVert^2\right]\\
    &\leq \frac{d^2L_{\rho}^2}{n}\left\lVert v\right\rVert^4 = \frac{d^2L_{\rho}^2}{n},
    \end {align*}
    where \((a)\) follows from the fact that $v_{1:n}$ are i.i.d mean zero r.v.s, and $\rho(\cdot)$ is bounded.
    Finally,
    \begin{align*}
        \E\left[\left\lVert \widehat{\nabla}_{\mu,n} \rho(\theta) \right\rVert^2 \right]=\E\left[\mathbb{E}_{v_{1:n}}\left[\left\lVert\widehat{\nabla}_{\mu,n}\rho(\theta)\right\rVert^2 \right]\right]\leq \frac{d^2L_{\rho}^2}{n}.
    \end{align*}
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Results with approximate objective function $\hat{\rho}_m(\cdot)$}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The following lemmas establish bounds for the bias and variance of the gradient estimate in \eqref{eq:hat_nabla_hat_rho}.
\begin{lemma}
    \label{lm:est_bound_SF}
    Suppose $\forall \theta \in \R^d$, $\rho(\theta)$ and $\hat{\rho}_m(\theta)$ are bounded, and $\E\left[\left\lvert \rho(\theta)- \hat{\rho}_m(\theta)\right\rvert^2\right]\leq\frac{C_1}{m}$. Then
    \begin{align*}
        \mathbb{E}\left[\left\lVert \frac{d}{n}\sum\limits_{i=1}^{n}\frac{\hat{\rho}_m({\theta\pm\mu v_i}) - \rho({\theta\pm\mu v_i})}{2\mu}v_i \right\rVert^2\right]\leq \frac{d^2C_1}{4\mu^2mn}.
    \end{align*}
\end{lemma}
\begin{proof}
    Notice that
    \begin {align*}
    &\mathbb{E}\left[\left\lVert  \frac{d}{n}\sum_{i=1}^{n}\frac{\hat{\rho}_m({\theta\pm\mu v_i}) - \rho({\theta\pm\mu v_i})}{2\mu}v_i \right\rVert^2\right]
    \leq \frac{d^2}{4n^2\mu^2}\mathbb{E}\left[\mathbb{E}_{v_{1:n}}\left\lVert \sum_{i=1}^{n}\left (\hat{\rho}_m({\theta\pm\mu v_i}) - \rho({\theta\pm\mu v_i})\right )v_i \right\rVert^2 \right] \\
    &\stackrel{(a)}{\leq} \frac{d^2n}{4\mu^2n^2}\mathbb{E}\left[\mathbb{E}_{v}\left[\left\lVert\left(\hat{\rho}_m({\theta\pm\mu v}) - \rho({\theta\pm\mu v})\right)v \right\rVert^2\right]\right]\\
    &\stackrel{(b)}{\leq} \frac{d^2}{4\mu^2n}\mathbb{E}\left[\mathbb{E}_{v}\left[\left(\hat{\rho}_m({\theta\pm\mu v}) - \rho({\theta\pm\mu v})\right)^2\right]\right]\\
    &= \frac{d^2}{4\mu^2n}\mathbb{E}\left[\left(\hat{\rho}_m({\theta\pm\mu v}) - \rho({\theta\pm\mu v})\right)^2\right]
    \leq\frac{d^2C_1}{4\mu^2mn},
    \end {align*}
    where \((a)\) follows from the fact that $v_{1:n}$ are i.i.d mean zero r.v.s, and $\hat{\rho}_m(\cdot)$ and $\rho(\cdot)$ are bounded, and \((b)\) follows since $\lVert v \rVert = 1$.
\end{proof}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{lemma}
    \label{lm:bias_SF}
    $\mathbb{E}\left[\left\lVert\widehat{\nabla}_{\mu,n} \hat{\rho}_m({\theta}) -\nabla\rho(\theta)\right\rVert^2\right]
    \leq  \frac{4d^2L_{\rho}^2}{n}+ \mu^2 d^2 L_{\rho'}^2 + \frac{d^2C_1}{\mu^2mn}$.
\end{lemma}
\begin{proof}
    Notice that
    \begin {align}
    &\widehat{\nabla}_{\mu,n} \hat{\rho}_m({\theta})
    = \frac{d}{n}\sum_{i=1}^{n}\frac{\hat{\rho}_m({\theta+\mu v_i}) - \hat{\rho}_m({\theta - \mu v_i})}{2\mu}v_i \nonumber\\
    &= \frac{d}{n}\sum_{i=1}^{n}\frac{\rho({\theta+\mu v_i}) - \rho({\theta - \mu v_i})}{2\mu}v_i
    + \frac{d}{n}\sum_{i=1}^{n}\frac{\hat{\rho}_m({\theta+\mu v_i}) - \rho({\theta+\mu v_i})}{2\mu}v_i
    + \frac{d}{n}\sum_{i=1}^{n}\frac{\rho({\theta-\mu v_i})-\hat{\rho}_m({\theta-\mu v_i})  }{2\mu}v_i \nonumber\\
    &= \widehat{\nabla}_{\mu,n} \rho({\theta})
    + \frac{d}{n}\sum_{i=1}^{n}\frac{\hat{\rho}_m({\theta+\mu v_i}) - \rho({\theta+\mu v_i})}{2\mu}v_i
    + \frac{d}{n}\sum_{i=1}^{n}\frac{\rho({\theta-\mu v_i})-\hat{\rho}_m({\theta-\mu v_i})}{2\mu}v_i.
    \label{eq:bias_1}
    \end {align}
    From \eqref{eq:bias_1} and Lemma \ref{lm:est_bound_SF}, we obtain
    \begin {align*}
    &\mathbb{E}\left[\left\lVert \widehat{\nabla}_{\mu,n} \hat{\rho}_m({\theta}) -\nabla\rho(\theta) \right\rVert^2\right]
    \leq 2\mathbb{E}\left[\left\lVert \widehat{\nabla}_{\mu,n} \rho({\theta}) - \nabla\rho(\theta) \right\rVert^2\right]+ \frac{d^2C_1}{\mu^2mn}\\
    &\stackrel{(a)}{\leq} 4 \mathbb{E}\left[\left\lVert \widehat{\nabla}_{\mu,n} \rho({\theta}) - \mathbb{E}\left[\widehat{\nabla}_{\mu,n}\rho(\theta)\mid\theta\right] \right\rVert^2\right]
    + 4 \mathbb{E}\left[\left\lVert \nabla\rho_{\mu}(\theta) - \nabla\rho(\theta) \right\rVert^2\right] + \frac{d^2C_1}{\mu^2mn} \\
    &\stackrel{(b)}{\leq} 4 \mathbb{E}\left[\left\lVert \widehat{\nabla}_{\mu,n} \rho({\theta}) \right\rVert^2\right]+ \mu^2 d^2 L_{\rho'}^2+ \frac{d^2C_1}{\mu^2mn}\\
    &\stackrel{(c)}{\leq} \frac{4d^2L_{\rho}^2}{n}+ \mu^2 d^2 L_{\rho'}^2 + \frac{d^2C_1}{\mu^2mn},
    \end {align*}
    where $(a)$ follows from Lemma \ref{lm:del_rho_mu_2}, $(b)$ follows from Lemma \ref{lm:bias_rho} and since $\mathbb{E}[\lVert X-\mathbb{E}[X\mid Y]\rVert^2] \leq \mathbb{E}[\lVert X \rVert^2]$, and \((c)\) follows from Lemma \ref{lm:var_rho}.
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{lemma}
    \label{lm:var_SF}
    $\mathbb{E}\left[\left\lVert \widehat{\nabla}_{\mu,n} \hat{\rho}_m({\theta}) \right\rVert^2\right]
    \leq \frac{2d^2L_{\rho}^2}{n}+  \frac{d^2C_1}{\mu^2mn}$.
\end{lemma}
\begin{proof}
    Using \eqref{eq:bias_1} and Lemma \ref{lm:est_bound_SF}, we obtain
    \begin {align*}
    \mathbb{E}\left[\left\lVert \widehat{\nabla}_{\mu,n} \hat{\rho}_m({\theta}) \right\rVert^2\right]
    &\leq 2\mathbb{E}\left[\left\lVert \widehat{\nabla}_{\mu,n} \rho({\theta}) \right\rVert^2\right]+ \frac{d^2C_1}{\mu^2mn}
    \leq \frac{2d^2L_{\rho}^2}{n}+  \frac{d^2C_1}{\mu^2mn},
    \end {align*}
    where the last inequality follows from Lemma \ref{lm:var_rho}.
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Proof of Proposition \ref{pr:non_asym_sf}}
Using the fundamental theorem of calculus, we obtain
\begin {align}
& \rho(\theta_k) - \rho(\theta_{k+1})
=\langle \nabla \rho(\theta_k), \theta_k - \theta_{k+1} \rangle
+ \int_0^1 \left\langle \nabla \rho(\theta_{k+1}+\tau(\theta_k-\theta_{k+1}))-\nabla \rho(\theta_k), \theta_k - \theta_{k+1} \right\rangle d\tau\nonumber\\
&\leq\langle \nabla \rho(\theta_k), \theta_k - \theta_{k+1} \rangle
+\int_0^1 \left\lVert\nabla\rho(\theta_{k+1}+\tau(\theta_k - \theta_{k+1})) - \nabla\rho(\theta_k) \right\rVert \left\lVert \theta_k - \theta_{k+1} \right\rVert d\tau\nonumber\\
&\stackrel{(a)}{\leq} \left \langle \nabla \rho(\theta_k), \theta_k - \theta_{k+1} \right \rangle
+ L_{\rho'}\left\lVert \theta_k - \theta_{k+1} \right\rVert^2  \int_0^1 (1-\tau) d\tau \nonumber\\
&= \left \langle \nabla \rho(\theta_k), \theta_k - \theta_{k+1} \right \rangle + \frac{L_{\rho'}}{2}\left\lVert \theta_k - \theta_{k+1} \right\rVert^2 \nonumber\\
&= \alpha\left\langle \nabla \rho(\theta_k),-\widehat{\nabla}_{\mu,n}\hat{\rho}_m(\theta_k) \right\rangle
+ \frac{L_{\rho'}}{2}\alpha^2 \left\lVert\widehat{\nabla}_{\mu,n}\hat{\rho}_m(\theta_k)\right\rVert^2\nonumber\\
&= \alpha\left \langle \nabla \rho(\theta_k), \nabla \rho(\theta_k)-\widehat{\nabla}_{\mu,n}\hat{\rho}_m(\theta_k) \right \rangle
-\alpha\left \lVert \nabla \rho(\theta_k)\right \rVert^2
+ \frac{L_{\rho'}}{2}\alpha^2\left\lVert \widehat{\nabla}_{\mu,n}\hat{\rho}_m(\theta_k) \right\rVert^2 \nonumber\\
&\stackrel{(b)}{\leq} \frac{\alpha}{2}\left \lVert \nabla \rho(\theta_k) \right \rVert^2 + \frac{\alpha}{2}\left \lVert \nabla \rho(\theta_k) -\widehat{\nabla}_{\mu,n}\hat{\rho}_m(\theta_k) \right \rVert^2
-\alpha\left \lVert \nabla \rho(\theta_k)\right \rVert^2+ \frac{L_{\rho'}}{2}\alpha^2\left\lVert \widehat{\nabla}_{\mu,n}\hat{\rho}_m(\theta_k) \right\rVert^2\nonumber\\
&= \frac{\alpha}{2}\left \lVert \nabla \rho(\theta_k) -\widehat{\nabla}_{\mu,n}\hat{\rho}_m(\theta_k) \right \rVert^2
-\frac{\alpha}{2}\left \lVert \nabla \rho(\theta_k)\right \rVert^2
+ \frac{L_{\rho'}}{2}\alpha^2\left\lVert \widehat{\nabla}_{\mu,n}\hat{\rho}_m(\theta_k) \right\rVert^2.
\label{eq:sf_1}
\end {align}
In the above, step $(a)$ follows since $\rho(\cdot)$ is smooth, and step $(b)$ follows from $2\langle a, b \rangle \leq \lVert a \rVert^2+ \lVert b \rVert^2$.
Rearranging and taking expectations on both sides of \eqref{eq:sf_1}, we obtain
\begin {align}
&\alpha\mathbb{E}\left[\left \lVert \nabla \rho(\theta_k)\right \rVert^2\right]\nonumber\\
&\leq  2\mathbb{E}\left[\rho(\theta_{k+1}) - \rho(\theta_{k})\right] + L_{\rho'}\alpha^2 \mathbb{E}\left[\left\lVert \widehat{\nabla}_{\mu,n}\hat{\rho}_m(\theta_k) \right\rVert^2\right]
+  \alpha \mathbb{E}\left[\left \lVert \nabla \rho(\theta_k) -\widehat{\nabla}_{\mu,n}\hat{\rho}_m(\theta_k) \right \rVert^2 \right]\nonumber\\
&\leq  2\mathbb{E}\left[\rho(\theta_{k+1}) - \rho(\theta_{k})\right] + L_{\rho'}\alpha^2 \left (\frac{2d^2L_{\rho}^2}{n}+  \frac{d^2C_1}{\mu^2mn}\right )
+  \alpha\left( \frac{4d^2L_{\rho}^2}{n}+ \mu^2 d^2 L_{\rho'}^2 + \frac{d^2C_1}{\mu^2mn} \right)
\label{eq:sf_2}
\end {align}
where the last inequality follows from Lemmas \ref{lm:bias_SF}--\ref{lm:var_SF}.

Summing \eqref{eq:sf_2} over $k=0,\ldots,N-1$, we obtain
\begin {align*}
&\alpha\sum\limits_{k=0}^{N-1}\mathbb{E}\left[\left \lVert \nabla \rho(\theta_k)\right \rVert^2\right]
\leq  2 \mathbb{E}\left[\rho(\theta_{N}) - \rho(\theta_{0})\right]
+N L_{\rho'} \alpha^2 \left(\frac{2d^2 L_{\rho}^2}{n}+  \frac{d^2C_1}{\mu^2mn}\right)
+ N \alpha \left( \frac{4d^2L_{\rho}^2}{n}+ \mu^2 d^2 L_{\rho'}^2 + \frac{d^2C_1}{\mu^2mn} \right).
\end {align*}
Since $\theta_R$ is chosen uniformly at random from the policy iterates $\{\theta_0,\cdots,\theta_{N-1}\}$, we obtain
\begin {align*}
\mathbb{E}\left[\left\lVert \nabla \rho(\theta_R)\right\rVert^2\right]
&= \frac{1}{N}\sum\limits_{k=0}^{N-1}\mathbb{E}\left[\left\lVert \nabla \rho(\theta_k)\right \rVert^2\right]\\
&\leq \frac{2 \left(\rho^* - \rho(\theta_{0})\right)}{N \alpha}
+L_{\rho'} \alpha \left(\frac{2d^2 L_{\rho}^2}{n}+  \frac{d^2C_1}{\mu^2mn}\right)
+ \frac{4d^2L_\rho^2}{n}+ \mu^2 d^2 L_{\rho'}^2 + \frac{d^2C_1}{\mu^2mn}.
\end {align*}
\hfill\qed
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{DRM}
\label{sec:drm_proofs}
\subsection{Estimating DRM using Order statistics}
\label{subsec:est_drm}
The following lemma estimates the DRM in an on-policy RL setting.
\begin{lemma}
    \label{lm:hat_rho_G}
    $\hat{\rho}_g^G(\theta)=  \sum\limits_{i=1}^{m} {R^\theta_{(i)}} \left(g\left(1- \frac{i-1}{m}\right) - g\left(1- \frac{i}{m}\right)\right)$.
\end{lemma}
\begin{proof}
    Our proof follows the technique from \cite{kim_2010}. We rewrite \eqref{eq:G} as
    \begin {align}
    \label{eq:G2}
    G^m_{R^{\theta}}(x) =
    \begin{cases}
        0,&\textrm{if } x < R^\theta_{(1)}\\
        \frac{i}{m},&\textrm{if } R^\theta_{(i)} \leq x < R^\theta_{(i+1)}, i\in\{1,\!\cdots\!,m-1\}\\
        1,&\textrm{if } x \geq R^\theta_{(m)},\\
    \end{cases}
    \end {align}
    where $R^\theta_{(i)}$ is the $i$th smallest order statistic of the samples $R^\theta_1,\ldots,R^\theta_m$.

    We assume without loss of generality that $R^\theta_{(j)} < 0 < R^\theta_{(j+1)}$, and obtain,
    \begin {align*}
    &\hat{\rho}_g^G(\theta)=\int\limits_{-M_r}^{0}(g(1-G^m_{R^{\theta}}(x))-1) dx + \int\limits_{0}^{M_r}g(1-G^m_{R^{\theta}}(x))dx\\
    &=\int\limits_{-M_r}^{R^\theta_{(1)}} (g(1-G^m_{R^{\theta}}(x))-1)dx
    + \sum_{i=2}^j \int\limits_{R^\theta_{(i-1)}}^{R^\theta_{(i)}} (g(1-G^m_{R^{\theta}}(x))-1)dx
    + \int\limits_{R^\theta_{(j)}}^{0}(g(1-G^m_{R^{\theta}}(x))-1)dx\\
    &\quad+ \int\limits_{0}^{R^\theta_{(j+1)}} g(1-G^m_{R^{\theta}}(x))dx
    + \sum_{i=j+1}^{m-1}\int\limits_{R^\theta_{(i)}}^{R^\theta_{(i+1)}} g(1-G^m_{R^{\theta}}(x))dx+ \int\limits_{R^\theta_{(m)}}^{M_r}g(1-G^m_{R^{\theta}}(x))dx\\
    &= \sum_{i=2}^j \int\limits_{R^\theta_{(i-1)}}^{R^\theta_{(i)}} \left(g\left(1- \frac{i-1}{m}\right)-1\right)dx
    + \int\limits_{R^\theta_{(j)}}^{0}\left(g\left(1- \frac{j}{m}\right)-1\right)dx+ \int\limits_{0}^{R^\theta_{(j+1)}} g\left(1- \frac{j}{m}\right)dx\\
    &\quad + \sum_{i=j+1}^{m-1}\int\limits_{R^\theta_{(i)}}^{R^\theta_{(i+1)}} g\left( 1-\frac{i}{m}\right)dx\\
    &= \sum_{i=2}^j \left({R^\theta_{(i)}}-{R^\theta_{(i-1)}}\right) \left(g\left(1- \frac{i-1}{m}\right)-1\right)
    - {R^\theta_{(j)}}\left(g\left(1- \frac{j}{m}\right)-1\right)+ {R^\theta_{(j+1)}} g\left(1- \frac{j}{m}\right)\\
    &\quad+ \sum_{i=j+1}^{m-1} \left({R^\theta_{(i+1)}}-{R^\theta_{(i)}}\right) g\left( 1\!-\!\frac{i}{m}\right)\\
    &= \sum_{i=2}^j \left({R^\theta_{(i)}}-{R^\theta_{(i-1)}}\right) g\left(1- \frac{i-1}{m}\right)+ {R^\theta_{(1)}}
    + \sum_{i=j}^{m-1} \left({R^\theta_{(i+1)}}-{R^\theta_{(i)}}\right) g\left( 1-\frac{i}{m}\right) \\
    &= \sum_{i=1}^{m} {R^\theta_{(i)}} g\left(1- \frac{i-1}{m}\right) - \sum_{i=1}^{m-1}{R^\theta_{(i)}} g\left(1- \frac{i}{m}\right)\\
    &= \sum_{i=1}^{m} {R^\theta_{(i)}} \left(g\left(1- \frac{i-1}{m}\right) - g\left(1- \frac{i}{m}\right)\right).
    \end {align*}
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The following lemma estimates the DRM in an off-policy RL setting.
\begin{lemma}
    \label{lm:hat_rho_H}
    $\hat{\rho}_g^H(\theta)= R^b_{(1)}+\sum\limits_{i=2}^{m} {R^b_{(i)}} g\left(1- \min\left\{1,\frac{1}{m}\sum\limits_{k=1}^{i-1}\psi^\theta_{(k)}\right\}\right)
    - \sum\limits_{i=1}^{m-1}{R^b_{(i)}} g\left(1- \min\left\{1,\frac{1}{m}\sum\limits_{k=1}^{i}\psi^\theta_{(k)}\right\}\right)$.
\end{lemma}
\begin{proof}
    We rewrite \eqref{eq:H} as
    \begin {align}
    \label{eq:H2}
    H^m_{R^{\theta}}(x) =
    \begin{cases}
        0,&\textrm{if } x < R^b_{(1)}\\
        \min\left\{1,\frac{1}{m}\sum\limits_{j=1}^{i}\psi^\theta_{(j)}\right\},&\textrm{if } R^b_{(i)} \leq x < R^b_{(i+1)},i\in\{1,\!\cdots\!,m-1\}\\
        1,&\textrm{if } x \geq R^b_{(m)},\\
    \end{cases}
    \end {align}
    where $R^b_{(i)}$ is the $i$th smallest order statistic of the samples $R^b_1,\ldots,R^b_m$, and $\psi^\theta_{(i)}$ is the importance sampling ratio of $R^b_{(i)}$.

    We assume without loss of generality that $R^b_{(j)} < 0 < R^b_{(j+1)}$, and obtain,
    \begin {align*}
    &\hat{\rho}_g^H(\theta)=\int\limits_{-M_r}^{0}(g(1-H^m_{R^{\theta}}(x))-1) dx + \int\limits_{0}^{M_r}g(1-H^m_{R^{\theta}}(x))dx\\
    &=\int\limits_{-M_r}^{R^b_{(1)}} (g(1-H^m_{R^{\theta}}(x))-1)dx
    + \sum_{i=2}^j \int\limits_{R^b_{(i-1)}}^{R^b_{(i)}} (g(1-H^m_{R^{\theta}}(x))-1)dx
    + \int\limits_{R^b_{(j)}}^{0}(g(1-H^m_{R^{\theta}}(x))-1)dx\\
    &\quad+ \int\limits_{0}^{R^b_{(j+1)}} g(1-H^m_{R^{\theta}}(x))dx
    + \sum_{i=j+1}^{m-1}\int\limits_{R^b_{(i)}}^{R^b_{(i+1)}} g(1-H^m_{R^{\theta}}(x))dx+ \int\limits_{R^b_{(m)}}^{M_r}g(1-H^m_{R^{\theta}}(x))dx\\
    &= \sum_{i=2}^j \int\limits_{R^b_{(i-1)}}^{R^b_{(i)}} \left(g\left(1- \min\left\{1,\frac{1}{m}\sum_{k=1}^{i-1}\psi^\theta_{(k)}\right\}\right)-1\right)dx
    + \int\limits_{R^b_{(j)}}^{0}\left(g\left(1- \min\left\{1,\frac{1}{m}\sum_{k=1}^{j}\psi^\theta_{(k)}\right\}\right)-1\right)dx\\
    &\quad+ \int\limits_{0}^{R^b_{(j+1)}} g\left(1- \min\left\{1,\frac{1}{m}\sum_{k=1}^{j}\psi^\theta_{(k)}\right\}\right)dx
    + \sum_{i=j+1}^{m-1}\int\limits_{R^b_{(i)}}^{R^b_{(i+1)}} g\left( 1-\min\left\{1,\frac{1}{m}\sum_{k=1}^{i}\psi^\theta_{(k)}\right\}\right)dx\\
    &= \sum_{i=2}^j \left({R^b_{(i)}}-{R^b_{(i-1)}}\right) \left(g\left(1- \min\left\{1,\frac{1}{m}\sum_{k=1}^{i-1}\psi^\theta_{(k)}\right\}\right)-1\right)
    - {R^b_{(j)}}\left(g\left(1- \min\left\{1,\frac{1}{m}\sum_{k=1}^{j}\psi^\theta_{(k)}\right\}\right)-1\right)\\
    &\quad + {R^b_{(j+1)}} g\left(1- \min\left\{1,\frac{1}{m}\sum_{k=1}^{j}\psi^\theta_{(k)}\right\}\right)
    + \sum_{i=j+1}^{m-1} \left({R^b_{(i+1)}}-{R^b_{(i)}}\right) g\left( 1-\min\left\{1,\frac{1}{m}\sum_{k=1}^{i}\psi^\theta_{(k)}\right\}\right)\\
    &= \sum_{i=2}^j \left({R^b_{(i)}}-{R^b_{(i-1)}}\right) g\left(1-\min\left\{1, \frac{1}{m}\sum_{k=1}^{i-1}\psi^\theta_{(k)}\right\}\right)+ {R^b_{(1)}}\\
    &\quad+ \sum_{i=j}^{m-1} \left({R^b_{(i+1)}}-{R^b_{(i)}}\right) g\left( 1-\min\left\{1,\frac{1}{m}\sum_{k=1}^{i}\psi^\theta_{(k)}\right\}\right) \\
    &= R^b_{(1)}+\sum_{i=2}^{m} {R^b_{(i)}} g\left(1- \min\left\{1,\frac{1}{m}\sum_{k=1}^{i-1}\psi^\theta_{(k)}\right\}\right)
    - \sum_{i=1}^{m-1}{R^b_{(i)}} g\left(1- \min\left\{1,\frac{1}{m}\sum_{k=1}^{i}\psi^\theta_{(k)}\right\}\right).
    %&= \sum_{i=1}^{m} {R^\theta_{(i)}} \left(g\left(1\!-\! \min\left\{1,\frac{1}{m}\sum_{k=1}^{i-1}\psi^\theta_{(k)}\right\}\right)
    %- g\left(1\!-\! \min\left\{1,\frac{1}{m}\sum_{k=1}^{i}\psi^\theta_{(k)}\right\}\right)\right)
    \end {align*}
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{The estimation error of the DRM}
In the following lemma, we bound the estimation error of the DRM in an on-policy RL setting.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% \begin{lemma}
    %     \label{lm:est_error_G}
    %     $\E\left[\left\lvert \rho_g(\theta)- \hat{\rho}_g^G(\theta)\right\rvert^2\right]\leq\frac{16M_r^2M_{g'}^2}{m}$.
    % \end{lemma}
\begin{proof}(\textbf{Lemma \ref{lm:est_error_G}})
    Since $\forall x\in(-M_r,M_r),\left\lvert\1\{R^{\theta}\leq x\}\right\rvert  \leq  1$ a.s., using Hoeffding's inequality, we obtain $\forall x\in(-M_r,M_r)$,
    \begin {align}
    &\p\left(\left\lvert G^m_{R^{\theta}}(x) - F_{R^{\theta}}(x) \right\rvert > \epsilon\right) \leq 2\exp\left(-\frac{m\epsilon^2}{2}\right), \textrm{ and} \nonumber\\
    &\E\left[\left\lvert G^m_{R^{\theta}}(x) - F_{R^{\theta}}(x)\right\rvert^2 \right]
    =\int_{0}^{\infty}\p\left(\left\lvert G^m_{R^{\theta}}(x) -F_{R^{\theta}}(x)\right\rvert > \sqrt{\epsilon}\right)d\epsilon
    \leq \int_{0}^{\infty} 2\exp\left(-\frac{m\epsilon}{2}\right) d\epsilon = \frac{4 }{m}.\label{eq:G_err}
    \end {align}
    Now,
    \begin {align}
    &\E\left[\left\lvert \rho_g(\theta)- \hat{\rho}_g^G(\theta)\right\rvert^2\right]
    =\E\left[\left\lvert\int_{-M_r}^{M_r}(g(1-F_{R^{\theta}}(x))- g(1-G^m_{R^{\theta}}(x))) dx\right\rvert^2 \right]\nonumber\\
    &\stackrel{(a)}{\leq}2M_r\E\left[\int_{-M_r}^{M_r}\left\lvert(g(1-F_{R^{\theta}}(x))- g(1-G^m_{R^{\theta}}(x))) \right\rvert^2 dx\right]
    \nonumber\\
    &\stackrel{(b)}{\leq}2M_r\int_{-M_r}^{M_r}\E\left[\left\lvert(g(1-F_{R^{\theta}}(x))- g(1-G^m_{R^{\theta}}(x))) \right\rvert^2 \right]dx
   \nonumber\\
    &\stackrel{(c)}{\leq}2M_rM_{g'}^2\!\int_{-M_r}^{M_r}\E\left[\left\lvert G^m_{R^{\theta}}(x) - F_{R^{\theta}}(x) \right\rvert^2 \right]dx\nonumber\\
    &\stackrel{(d)}{\leq}2M_rM_{g'}^2\int_{-M_r}^{M_r}\frac{4 }{m}dx
    =\frac{16M_r^2M_{g'}^2}{m},\label{eq:est_error_G}
    \end {align}
    where \((a)\) follows from the Cauchy-Schwarz inequality, \((b)\) follows from Fubini's theorem, \((c)\) follows from Lemma \ref{lm:g_lip}, and \((d)\) follows from \eqref{eq:G_err}.
\end{proof}
In the following lemma, we bound the estimation error of the DRM in an off-policy RL setting.
% \begin{lemma}
    %     \label{lm:est_error_H}
    %     $\E\left[\left\lvert \rho_g(\theta)- \hat{\rho}_g^H(\theta)\right\rvert^2\right]\leq\frac{16M_r^2M_{g'}^2M_s^2}{m}$.
    % \end{lemma}
\begin{proof}(\textbf{Lemma \ref{lm:est_error_H}})
    We use parallel arguments to the proof of Lemma \ref{lm:est_error_G}.

    From \eqref{eq:is_ratio}, we obtain $\forall x \in(-M_r,M_r)$, $\left\lvert\1\{R^{\theta}\leq x\}\psi^\theta\right\rvert  \leq  M_s$ a.s.
    From Hoeffding's inequality, we obtain $\forall x \in(-M_r,M_r)$,
    \begin {align}
    \label{eq:hatH_prob}
    \p\left(\left\lvert \hat{H}^m_{R^{\theta}}(x) - F_{R^{\theta}}(x) \right\rvert > \epsilon\right) \leq 2\exp\left(-\frac{m\epsilon^2}{2M_s^2}\right).
    \end {align}
    From \eqref{eq:H} and \eqref{eq:hatH}, we observe that $\p\left(\left\lvert H^m_{R^{\theta}}(x) - F_{R^{\theta}}(x) \right\rvert > \epsilon\right) \leq \p\left(\left\lvert \hat{H}^m_{R^{\theta}}(x) - F_{R^{\theta}}(x) \right\rvert > \epsilon\right)$. Hence, we obtain $\forall x \in (-M_r,M_r)$,
    \begin {align}
    \label{eq:H_prob}
    \p\left(\left\lvert H^m_{R^{\theta}}(x) - F_{R^{\theta}}(x) \right\rvert > \epsilon\right) \leq 2\exp\left(-\frac{m\epsilon^2}{2M_s^2}\right).
    \end {align}
    Using similar arguments as in \eqref{eq:G_err} along with \eqref{eq:H_prob}, we obtain $\forall x \in(-M_r,M_r)$,
    \begin {align}
    \label{eq:H_err}
    \E\left[\left\lvert H^m_{R^{\theta}}(x) - F_{R^{\theta}}(x)\right\rvert^2 \right]\leq\frac{4 M_s^2}{m}.
    \end {align}
    Using similar arguments as in \eqref{eq:est_error_G} along with \eqref{eq:H_err}, we obtain
    \begin {align*}
    \E\left[\left\lvert \rho_g(\theta)- \hat{\rho}_g^H(\theta)\right\rvert^2\right]\leq\frac{16M_r^2M_{g'}^2M_s^2}{m}.
    \end {align*}
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Lipschitz properties of the DRM and its gradient}
\label{subsec:lip_drm}
\subsubsection{Results related to the distortion function}
The following lemma establishes the Lipschitzness of $g(\cdot)$ and $g'(\cdot)$. We require this result to establish the smoothness of the DRM.
\begin{lemma}
    \label{lm:g_lip}
    $\forall t,t'\in(0,1)$,$\left\lvert g(t) -g(t')\right\rvert \leq M_{g'} \left\lvert t - t'\right\rvert$, and
    $\left\lvert g'(t) -g'(t')\right\rvert \leq  M_{g''}\left\lvert t - t'\right\rvert$.
\end{lemma}
\begin{proof}
    Using the mean value theorem, we obtain $ g(t) -g(t') = g'(\tilde{t}) (t - t')$, where $\tilde{t}$ lies between $t$ and $t'$. From \ref{as:g'_bound}, we obtain  $\left\lvert g'(\tilde{t}) \right\rvert \leq M_{g'}, \forall \tilde{t} \in(0,1)$. Hence,
    $\left\lvert g(t) -g(t')\right\rvert \leq M_{g'} \left\lvert t - t'\right\rvert\;\forall t,t'\in(0,1)$.

    Similarly, we have $ g'(t)-g'(t') = g''(\tilde{t}) (t - t')$, where $\tilde{t}$ lies between $t$ and $t'$. From \ref{as:g'_bound}, we obtain  $\left\lvert g''(\tilde{t}) \right\rvert \leq M_{g''}, \forall \tilde{t} \in(0,1)$. Hence,
    $\left\lvert g'(t) -g'(t')\right\rvert \leq M_{g''} \left\lvert t - t'\right\rvert\;\forall t,t'\in(0,1)$.
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Lipschitz properties of the CDF}
The following two lemmas establish an upper bound for the gradient and the Hessian of the CDF. These lemmas are similar to lemmas in \cite{nv2021}. For the sake of completeness, we provide the detailed proof.
\begin{lemma}
    \label{lm:nablaFG}
    $\forall x \in(-M_r,M_r)$,
    \begin{align*}
        &\nabla F_{R^{\theta}}(x) =\E\left[\1\{R^{\theta} \leq x\}\sum\limits_{t=0}^{T-1}\nabla\log\pi_\theta(A_t | S_t)\right], \textrm{ and}\\
        &\nabla^2 F_{R^{\theta}}(x) =\E\left[\1\{R^{\theta}\leq x\}\left(\sum_{t=0}^{T-1}\nabla^2\log\pi_\theta(A_t | S_t)+
        \left[\sum_{t=0}^{T-1}\nabla\log\pi_\theta(A_t | S_t)\right] \left[\sum_{t=0}^{T-1}\nabla\log\pi_\theta(A_t | S_t)\right]^T\right)\right].
    \end{align*}
\end{lemma}
\begin{proof}
    Let $\Omega$ denote the set of all sample episodes. For any episode $\omega\in\Omega$, we denote by $T(\omega)$, its length, and $S_t(\omega)$ and $A_t(\omega)$, the state and action at time $t\in\{0,1,2,\cdots\}$ respectively. \\
    Let $R(\omega)=\sum\limits_{t=0}^{T(\omega)-1}\gamma^t r(S_t(\omega),A_t(\omega),S_{t+1}(\omega))$ be the cumulative discounted reward of the episode $\omega$, and let\\ $\p_\theta(\omega) =\prod\limits_{t=0}^{T(\omega)-1}\pi_\theta(A_t(\omega)|S_t(\omega))p(S_{t+1}(\omega),S_t(\omega),A_t(\omega))$. \\
    From $\frac{\nabla \p_\theta(\omega)}{\p_\theta(\omega)} =\sum\limits_{t=0}^{T(\omega)-1}\nabla\log\pi_\theta(A_t(\omega) | S_t(\omega))$,
    we obtain
    \begin{align*}
        \nabla F_{R^{\theta}}(x)
        &=\nabla\E\!\left[\1\{R^{\theta}\leq x\}\right] =\nabla \sum_{\omega\in\Omega} \1\{R(\omega)\leq x\}\p_\theta(\omega)\\
        &\stackrel{(a)}{=}\sum_{\omega\in\Omega}\nabla\left( \1\{R(\omega)\leq x\}\p_\theta(\omega)\right)\\
        &\stackrel{(b)}{=}\sum_{\omega\in\Omega} \1\{R(\omega)\leq x\}\nabla \p_\theta(\omega)\\
        &=\sum_{\omega\in\Omega} \1\{R(\omega)\leq x\} \frac{\nabla \p_\theta(\omega)}{\p_\theta(\omega)}\p_\theta(\omega)\\
        &=\sum_{\omega\in\Omega} \1\{R(\omega)\leq x\} \sum_{t=0}^{T(\omega)-1}\nabla\log\pi_\theta(A_t(\omega)|S_t(\omega))\p_\theta(\omega)\\
        &=\E\left[\1\{R^{\theta}\leq x\}\sum_{t=0}^{T-1}\nabla\log\pi_\theta(A_t|S_t)\right].
    \end{align*}
    In the above, the equality in \((a)\) follows by an application of the dominated convergence theorem to interchange the differentiation and the expectation operation. The aforementioned application is allowed since  (i) $\Omega$ is finite and the underlying measure is bounded, as we consider an MDP where the state and action spaces are finite, and the policies are proper, (ii) $\nabla\log\pi_\theta(A_t|S_t)$ is bounded from \ref{as:nabla_logpi}. The equality in \((b)\) follows, since for a given episode $\omega$, the cumulative reward $R(\omega)$ does not depend on $\theta$.

    Similarly, \\
    from $\frac{\nabla^2 \p_\theta(\omega)}{\p_\theta(\omega)} =\sum\limits_{t=0}^{T(\omega)-1}\nabla^2\log\pi_\theta(A_t(\omega) | S_t(\omega))+
    \left[\sum\limits_{t=0}^{T(\omega)-1}\nabla\log\pi_\theta(A_t(\omega) | S_t(\omega))\right] \left[\sum\limits_{t=0}^{T(\omega)-1}\nabla\log\pi_\theta(A_t(\omega) | S_t(\omega))\right]^T$, we obtain
    \begin{align*}
        &\nabla^2 F_{R^{\theta}}(x)
        =\E\left[\1\{R^{\theta}\leq x\}\left(\sum_{t=0}^{T-1}\!\nabla^2\log\pi_\theta(A_t | S_t)+
        \left[\sum_{t=0}^{T-1}\nabla\log\pi_\theta(A_t | S_t)\right] \left[\sum_{t=0}^{T-1}\nabla\log\pi_\theta(A_t | S_t)\right]^T\right)\right].
    \end{align*}
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{lemma}
    \label{lm:nablaF_bound}
    $\forall x \in (-M_r,M_r),\left\lVert \nabla F_{R^{\theta}}(x) \right\rVert \leq M_e M_d$, and
    $\left\lVert \nabla^2 F_{R^{\theta}}(x) \right\rVert\leq M_e M_h +M_e^2M_d^2$.
\end{lemma}
\begin{proof}
    From \ref{as:nabla_logpi} and \eqref{eq:M_pi}, for any $x\in(-M_r,M_r)$, we have
    \begin{align}
        \label{eq:nabla_G_bound}
        \left\lVert\1\{R^{\theta}\leq x\}\sum_{t=0}^{T-1}\!\nabla\log \pi_{\theta}(A_t\mid S_t)\right\rVert \leq  M_e M_d \textrm{ a.s.},
    \end{align}
    and
    \begin{align}
        \label{eq:nabla_G_bound1}
        \left\lVert\1\{R^{\theta}\leq x\}\left(\sum_{t=0}^{T-1}\nabla^2\log\pi_\theta(A_t | S_t)+
        \left[\sum_{t=0}^{T-1}\nabla\log\pi_\theta(A_t | S_t)\right] \left[\sum_{t=0}^{T-1}\nabla\log\pi_\theta(A_t | S_t)\right]^T\right)\right\rVert
        \leq  M_e M_h +M_e^2M_d^2\textrm{ a.s}.
    \end{align}

    From Lemma \ref{lm:nablaFG}, for any $x\in(-M_r,M_r)$, we have
    \begin{align}
        \left\lVert \nabla F_{R^{\theta}}(x) \right\rVert
        &\leq \E\left[\left\lVert \1\{R^{\theta}\leq x\} \sum_{t=0}^{T-1} \nabla\log \pi_{\theta}(A_t| S_t)\right\rVert\right]
        \leq M_e M_d, \label{eq:nabla_F_R_bound}
    \end{align}
    and
    \begin{align}
        \left\lVert \nabla^2 F_{R^{\theta}}(x) \right\rVert
        &\leq \E\left[\left\lVert\1\{R^{\theta}\leq x\}\left(\sum_{t=0}^{T-1}\nabla^2\log\pi_\theta(A_t | S_t)+
        \left[\sum_{t=0}^{T-1}\nabla\log\pi_\theta(A_t | S_t)\right] \left[\sum_{t=0}^{T-1}\nabla\log\pi_\theta(A_t | S_t)\right]^T\right)\right\rVert\right]\nonumber\\
        &\leq M_e M_h +M_e^2M_d^2, \label{eq:nabla2_F_R_bound}
    \end{align}
    where these inequalities follow from \eqref{eq:nabla_G_bound}, \eqref{eq:nabla_G_bound1}, and the assumption that the state and action spaces are finite.
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The following lemma establishes Lipschitzness of the CDF and its gradient.
\begin{lemma}
    \label{lm:F_lip}
    $\forall x \in(-M_r,M_r)$,
    \begin{align*}
        &\left\lvert F_{R^{\theta_1}}(x) - F_{R^{\theta_2}}(x) \right\rvert \leq M_eM_d \left\lVert \theta_1 - \theta_2 \right\rVert, \textrm{ and}\\
        &\left\lVert \nabla F_{R^{\theta_1}}(x) - \nabla F_{R^{\theta_2}}(x) \right\rVert \leq (M_eM_h+M_e^2M_d^2) \left\lVert \theta_1 - \theta_2 \right\rVert.
    \end{align*}
\end{lemma}
\begin{proof}
    The result follows by Lemma \ref{lm:nablaF_bound} and Lemma~1.2.2 in \cite{nesterov_book}.
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Gradient of the DRM}
The following lemma derives an expression for the gradient of the DRM. This lemma is similar to Theorem 1 in \cite{nv2021}. For the sake of completeness, we provide a detailed proof.
\begin{lemma}
    \label{lm:nabla_rho_g}
    $\nabla \rho_g(\theta)\!=\!-\int_{-M_r}^{M_r} g'(1-F_{R^{\theta}}(x)) \nabla F_{R^{\theta}}(x)dx$.
\end{lemma}
\begin{proof}
    Notice that
    \begin{align*}
        \nabla \rho_g(\theta)
        &=\nabla\int_{-M_r}^{0}\left(g(1-F_{R^{\theta}}(x))-1\right)dx  + \nabla\int_{0}^{M_r} g(1-F_{R^{\theta}}(x))dx \nonumber\\
        &\stackrel{(a)}{=}\int_{-M_r}^{0}\nabla \left(g(1-F_{R^{\theta}}(x))-1\right) dx + \int_{0}^{M_r}\nabla g(1-F_{R^{\theta}}(x)) dx \\%\label{eq:B}\\
        &=-\int_{-M_r}^{M_r} g'(1-F_{R^{\theta}}(x)) \nabla F_{R^{\theta}}(x)dx.\nonumber
    \end{align*}
    In the above, \((a)\) follows by an application of the dominated convergence theorem to interchange the differentiation and the integration operation. The aforementioned application is allowed since
    (i) $\rho_g(\theta)$ is finite for any $\theta \in \R^d$; (ii)  $\lvert g'(\cdot) \rvert \leq M_{g'}$ from \ref{as:g'_bound}, and $\nabla F_{R^{\theta}}(\cdot)$ is bounded from \eqref{eq:nabla_F_R_bound}. The bounds on $g'$ and $\nabla F_{R^{\theta}}$ imply \\$\int_{-M_r}^{M_r} \left\lVert g'(1-F_{R^{\theta}}(x)) \nabla F_{R^{\theta}}(x)\right\rVert dx \leq 2 M_rM_{g'} M_eM_d$.
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Lipschitz properties of the DRM and its gradient}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The following two lemmas establish the Lipschitzness of the DRM and its gradient.
% \begin{lemma}
    %     \label{lm:rho_lip}
    %     $\forall \theta_1,\theta_2 \in \mathbb{R}^d$, $\left\lvert \rho_g(\theta_1)\!-\! \rho_g(\theta_2)\right\rvert
    %     \leq L_\rho \left\lVert \theta_1 \!-\! \theta_2 \right\rVert$, where $L_\rho=2M_rM_{g'}M_eM_d$.
    % \end{lemma}
\begin{proof}(\textbf{Lemma \ref{lm:rho_lip}})
    \begin {align*}
    \left\lvert \rho_g(\theta_1)- \rho_g(\theta_2)\right\rvert
    & \leq  \int_{-M_r}^{M_r}\left\lvert g(1-F_{R^{\theta_1}}(x))-g(1-F_{R^{\theta_2}}(x)) \right\rvert dx \\
    & \stackrel{(a)}{\leq} M_{g'} \int_{-M_r}^{M_r}\left\lvert F_{R^{\theta_1}}(x)-F_{R^{\theta_2}}(x)\right\rvert dx\\
    &\stackrel{(b)}{\leq} 2M_rM_{g'}M_eM_d \left\lVert \theta_1 - \theta_2 \right\rVert,
    \end {align*}
    where \((a)\) follows from Lemma \ref{lm:g_lip} and \((b)\) follows from Lemma \ref{lm:F_lip}. The result follows since $L_\rho=2M_rM_{g'}M_eM_d$.
    %\end{proof}
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    % \begin{lemma}
        %     \label{lm:nabla_rho_lip}
        %     $\forall \theta_1,\theta_2 \in \mathbb{R}^d$,$\left\lVert \nabla\rho_g(\theta_1) - \nabla \rho_g(\theta_2) \right\rVert \leq
        %      L_{\rho'} \left\lVert  \theta_1  \!-\! \theta_2\right\rVert$,\\
        %      \hspace*{\fill}where $L_{\rho'}=2M_r M_e \left( M_h M_{g'}+M_eM_d^2 (M_{g'}+ M_{g''})\right)$.
        % \end{lemma}
    %\begin{proof}(\textbf{Lemma \ref{lm:nabla_rho_lip}})

    We now turn to the Lipschitz property of the gradient $\nabla\rho_g$. From Lemma~\ref{lm:nabla_rho_g}, we obtain
    \begin {align*}
    &\left\lVert \nabla\rho_g(\theta_1) - \nabla \rho_g(\theta_2) \right\rVert \\
    &\leq \int_{-M_r}^{M_r}\left\lVert  g'(1-F_{R^{\theta_1}}(x)) \nabla F_{R^{\theta_1}}(x)
    - g'(1-F_{R^{\theta_2}}(x)) \nabla F_{R^{\theta_2}}(x) \right\rVert dx\\
    &= \int_{-M_r}^{M_r}\left\lVert  g'(1-F_{R^{\theta_1}}(x)) \nabla F_{R^{\theta_1}}(x)\right.
    -g'(1-F_{R^{\theta_1}}(x)) \nabla F_{R^{\theta_2}}(x) + g'(1-F_{R^{\theta_1}}(x)) \nabla F_{R^{\theta_2}}(x) \\
    &\quad \left.-g'(1-F_{R^{\theta_2}}(x)) \nabla F_{R^{\theta_2}}(x) \right\rVert dx   \\
    &\leq \int_{-M_r}^{M_r}  \left\lvert g'(1-F_{R^{\theta_1}}(x)) \right\rvert  \left\lVert \nabla F_{R^{\theta_1}}(x) - \nabla F_{R^{\theta_2}}(x)\right\rVert
    +\left\lVert\nabla F_{R^{\theta_2}}(x)\right\rVert \left\lvert  g'(1-F_{R^{\theta_1}}(x)) - g'(1-F_{R^{\theta_2}}(x))\right\rvert dx   \\
    %        &\leq \int_{-M_r}^{M_r} M_{g'} \left\lVert \nabla F_{R^{\theta_1}}(x) -  \nabla F_{R^{\theta_2}}(x)\right\rVert \\
    %        &\hfill+M_eM_d \left\lvert  g'(1-F_{R^{\theta_1}}(x))  - g'(1-F_{R^{\theta_2}}(x))\right\rvert dx  \\
    %        &\hfill \text{(from lemmas \ref{lm:g_bound} and \ref{lm:nablaF_bound})} \\
    &\stackrel{(a)}{\leq} \int_{-M_r}^{M_r} M_{g'} \left\lVert \nabla F_{R^{\theta_1}}(x) -  \nabla F_{R^{\theta_2}}(x)\right\rVert
    +M_eM_d M_{g''}\left\lvert F_{R^{\theta_1}}(x) - F_{R^{\theta_2}}(x)\right\rvert dx \\
    &\stackrel{(b)}{\leq} \int_{-M_r}^{M_r} M_{g'} (M_eM_h+M_e^2M_d^2)\left\lVert  \theta_1  - \theta_2\right\rVert
    + M_e^2M_d^2 M_{g''}\left\lVert  \theta_1  - \theta_2\right\rVert dx \\
    &\leq 2M_r M_e \left( M_h M_{g'}+M_eM_d^2 (M_{g'}+ M_{g''})\right) \left\lVert  \theta_1  - \theta_2\right\rVert,
    \end {align*}
    where \((a)\) follows from \ref{as:g'_bound}, and Lemmas \ref{lm:g_lip}, \ref{lm:nablaF_bound}, and \((b)\) follows from Lemma \ref{lm:F_lip}. The result follows since $L_{\rho'}=2M_r M_e \left( M_h M_{g'}+M_eM_d^2 (M_{g'}+ M_{g''})\right)$.
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Mean-variance risk measure}
\label{sec:mvrm_proofs}
\subsection{The estimation error of the MVRM}
In the following lemma, we bound the estimation error of the MVRM in an on-policy RL setting.
% \begin{lemma}
    % \label{lm:est_error_mvrm_pi}
    % $\E\left[\left\lvert \hat{\rho}_\lambda^{\pi}(\theta)- \rho_\lambda(\theta) \right\rvert^2\right]
    % \leq \frac{8M_r^2\left(1+\frac{4\lambda^2M_r^2}{m-1}\right)}{m}$
    % \end{lemma}
\begin{proof}(\textbf{Lemma \ref{lm:est_error_mvrm_pi}})
    From \eqref{eq:rho_lambda} and \eqref{eq:hat_rho_lambda_pi}, we obtain
    \begin{align}
        \label{eq:mse_hat_rho_lambda_pi}
        \E\left[\left\lvert \hat{\rho}_\lambda^{\pi}(\theta)- \rho_\lambda(\theta) \right\rvert^2\right]
        &\leq2\E\left[\left\lvert \hat{J}_m^{\pi}(\theta)-J(\theta)\right\rvert^2\right]+2\lambda^2\E\left[\left\lvert V(\theta)-\widehat{V}_m^{\pi}(\theta)
        \right\rvert^2\right]\nonumber\\
        &\leq \frac{8M_r^2}{m}+\frac{32\lambda^2M_r^4}{m}=\frac{8M_r^2+32\lambda^2M_r^4}{m},
    \end{align}
    where the last inequality follows from Theorem 2-3 \cite[Chapter VI]{mood74} in conjunction with the fact that $\left\lvert R^\theta\right\rvert\leq M_r$ and $m>2$.
\end{proof}
In the following lemma, we bound the estimation error of the MVRM in an off-policy RL setting.
% \begin{lemma}
    % \label{lm:est_error_mvrm_b}
    % $\E\left[\left\lvert \hat{\rho}_\lambda^{b}(\theta)- \rho_\lambda(\theta) \right\rvert^2\right]
    % \leq \frac{8M_r^2\left(1+\frac{4\lambda^2M_r^2}{m-1}\right)}{m}$
    % \end{lemma}
\begin{proof}(\textbf{Lemma \ref{lm:est_error_mvrm_b}})
    From \eqref{eq:rho_lambda} and \eqref{eq:hat_rho_lambda_b}, we obtain
    \begin{align}
        \label{eq:mse_hat_rho_lambda_b}
        \E\left[\left\lvert \hat{\rho}_\lambda^{b}(\theta)- \rho_\lambda(\theta) \right\rvert^2\right]
        &\leq2\E\left[\left\lvert \hat{J}_m^{b}(\theta)-J(\theta)\right\rvert^2\right]+2\lambda^2\E\left[\left\lvert V(\theta)-\widehat{V}_m^{b}(\theta)
        \right\rvert^2\right]\nonumber\\
        &\leq \frac{8M_r^2M_s^2}{m}+\frac{32\lambda^2M_r^4M_s^4}{m}=\frac{8M_r^2M_s^2+32\lambda^2M_r^4M_s^4}{m},
    \end{align}
    where the last inequality follows from Theorem 2-3 \cite[Chapter VI]{mood74} in conjunction with the fact that $\left\lvert R^b\psi_\theta\right\rvert\leq M_rM_s$ and $m>2$.
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Lipschitz properties of the MVRM and its gradient}
\label{subsec:lip_mvrm}
% \begin{lemma}
    % \label{lm:lip_rho_lambda}
    % $\forall \theta_1,\theta_2 \in \R^d$,
    %  \begin{align*}
        % & \left\lvert \rho_\lambda(\theta_1)-\rho_\lambda(\theta_1)\right\rvert
        % \leq L_{\rho} \left\lVert \theta_1 - \theta_2 \right\rVert;\;L_{\rho}=M_rM_eM_d+ 3\lambda M_r^2 M_e M_d\\
        % &\left\lVert \nabla \rho_\lambda(\theta_1)-\nabla \rho_\lambda(\theta_1)\right\rVert
        % \leq  L_{\rho'} \left\lVert \theta_1 - \theta_2 \right\rVert;\;L_{\rho'} = M_rM_e\left(M_h+M_eM_d^2\right)+\lambda M_r^2M_e\left(3M_h+5 M_eM_d^2\right)
        % \end{align*}
    % \end{lemma}
\begin{proof}(\textbf{Lemma \ref{lm:lip_rho_lambda}})
    Let $\Omega$ denote the set of all sample episodes. For any episode $\omega\in\Omega$, we denote by $T(\omega)$ its length, and by $S_t(\omega)$ and $A_t(\omega)$ the state and action at time $t\in\{0,1,2,\ldots\}$, respectively.

    Let $R(\omega)=\sum\limits_{t=0}^{T(\omega)-1}\gamma^t r(S_t(\omega),A_t(\omega),S_{t+1}(\omega))$ be the cumulative discounted reward of the episode $\omega$, and let \\
    %Let $\omega= (S_0(\omega), A_0(\omega),\ldots,A_{T(\omega)-1}(\omega),S_{T(\omega)}(\omega))$ denote a sample trajectory. Letting
    $\p_\theta(\omega) = \prod\limits_{t=0}^{T(\omega)-1}\pi_\theta(A_t(\omega)|S_t(\omega))p(S_{t+1}(\omega),S_t(\omega),A_t(\omega))$. \\
    From $\frac{\nabla \p_\theta(\omega)}{\p_\theta(\omega)} =\sum\limits_{t=0}^{T(\omega)-1}\!\nabla\log\pi_\theta(A_t(\omega) | S_t(\omega))$,
    we obtain
    \begin{align}
        \label{eq:nabla_J}
        \nabla J(\theta)
        &=\nabla\E\left[R^{\theta}\right] =\nabla \sum_{\omega\in\Omega} R(\omega)\p_\theta(\omega)\nonumber\\
        &\stackrel{(a)}{=}\sum_{\omega\in\Omega}\nabla\left( R(\omega)\p_\theta(\omega)\right)\nonumber\\
        &\stackrel{(b)}{=}\sum_{\omega\in\Omega} R(\omega)\nabla \p_\theta(\omega)\nonumber\\
        &=\sum_{\omega\in\Omega} R(\omega) \frac{\nabla \p_\theta(\omega)}{\p_\theta(\omega)}\p_\theta(\omega)\nonumber\\
        &=\sum_{\omega\in\Omega} R(\omega) \sum_{t=0}^{T(\omega)-1}\nabla\log\pi_\theta(A_t(\omega)|S_t(\omega))\p_\theta(\omega)\nonumber\\
        &=\E\left[R^{\theta}\sum_{t=0}^{T-1}\nabla\log\pi_\theta(A_t|S_t)\right].
    \end{align}
    In the above, \((a)\) follows by an application of the dominated convergence theorem to interchange the differentiation and the expectation operation. The aforementioned application is allowed since (i) $\Omega$ is finite and the underlying measure is bounded, as we consider an MDP where the state and action spaces are finite, and the policies are proper, and (ii) $\nabla\log\pi_\theta(A_t|S_t)$ is bounded from \ref{as:nabla_logpi}. The step \((b)\) follows, since for a given episode $\omega$, the cumulative reward $R(\omega)$ does not depend on $\theta$.

    Similarly, \\
    from $\frac{\nabla^2 \p_\theta(\omega)}{\p_\theta(\omega)} =\sum\limits_{t=0}^{T(\omega)-1}\nabla^2\log\pi_\theta(A_t(\omega) | S_t(\omega))+
    \left[\sum\limits_{t=0}^{T(\omega)-1}\nabla\log\pi_\theta(A_t(\omega) | S_t(\omega))\right] \left[\sum\limits_{t=0}^{T(\omega)-1}\nabla\log\pi_\theta(A_t(\omega) | S_t(\omega))\right]^T$, we obtain
    \begin{align}
        \label{eq:nabla2_J}
        &\nabla^2 J(\theta)
        =\E\left[R^{\theta}\left(\sum_{t=0}^{T-1}\nabla^2\log\pi_\theta(A_t | S_t)+
        \left[\sum_{t=0}^{T-1}\nabla\log\pi_\theta(A_t | S_t)\right] \left[\sum_{t=0}^{T-1}\nabla\log\pi_\theta(A_t | S_t)\right]^T\right)\right].
    \end{align}

    Similarly,
    \begin{align}
        \label{eq:nabla_E_R_theta_sq}
        \nabla\E\left[\left(R^{\theta}\right)^2\right]
        & =\nabla \sum_{\omega\in\Omega} R(\omega)^2\p_\theta(\omega)\nonumber\\
        &=\sum_{\omega\in\Omega}\nabla\left( R(\omega)^2\p_\theta(\omega)\right)\nonumber\\
        &=\sum_{\omega\in\Omega} R(\omega)^2\nabla \p_\theta(\omega)\nonumber\\
        &=\sum_{\omega\in\Omega} R(\omega)^2 \frac{\nabla \p_\theta(\omega)}{\p_\theta(\omega)}\p_\theta(\omega)\nonumber\\
        &=\sum_{\omega\in\Omega} R(\omega)^2 \sum_{t=0}^{T(\omega)-1}\nabla\log\pi_\theta(A_t(\omega)|S_t(\omega))\p_\theta(\omega)\nonumber\\
        &=\E\left[\left(R^{\theta}\right)^2\sum_{t=0}^{T-1}\nabla\log\pi_\theta(A_t|S_t)\right],
    \end{align}
    and
    \begin{align}
        \label{eq:nabla2_E_R_theta_sq}
        &\nabla^2 \E\left[\left(R^{\theta}\right)^2\right]
        =\E\left[\left(R^{\theta}\right)^2\left(\sum_{t=0}^{T-1}\!\nabla^2\log\pi_\theta(A_t | S_t)+
        \left[\sum_{t=0}^{T-1}\!\nabla\log\pi_\theta(A_t | S_t)\right] \left[\sum_{t=0}^{T-1}\!\nabla\log\pi_\theta(A_t | S_t)\right]^T\right)\right].
    \end{align}
    From \eqref{eq:nabla_J}-\eqref{eq:nabla2_J}, we obtain
    \begin{align}
        \label{eq:norm_nabla_J}
        \left\lVert\nabla J(\theta)\right\rVert
        &\leq\E\left[ \left\lVert R^{\theta}\sum_{t=0}^{T-1}\nabla\log\pi_\theta(A_t|S_t)\right\rVert\right]
        \leq M_r \E\left[ \left\lVert \sum_{t=0}^{T-1}\nabla\log\pi_\theta(A_t|S_t)\right\rVert\right]
        \leq M_r M_eM_d,
    \end{align}
    and
    \begin{align}
        \label{eq:norm_nabla2_J}
        \left\lVert\nabla^2 J(\theta)\right\rVert
        &\leq\E\left[\left\lVert R^{\theta}\left(\sum_{t=0}^{T-1}\nabla^2\log\pi_\theta(A_t | S_t)+
        \left[\sum_{t=0}^{T-1}\nabla\log\pi_\theta(A_t | S_t)\right] \left[\sum_{t=0}^{T-1}\nabla\log\pi_\theta(A_t | S_t)\right]^T\right)\right\rVert\right]\nonumber\\
        &\leq M_r\E\left[\left\lVert\sum_{t=0}^{T-1}\nabla^2\log\pi_\theta(A_t | S_t)\right\rVert+
        \left\lVert\sum_{t=0}^{T-1}\nabla\log\pi_\theta(A_t | S_t)\right\rVert^2\right]\nonumber\\
        &\leq M_r\left(M_eM_h+M_e^2M_d^2\right).
    \end{align}
    Hence from \eqref{eq:norm_nabla2_J} and Lemma~1.2.2 in \cite{nesterov_book}, we obtain
    \begin{align}
        \label{eq:lip_nabla2_J}
        \left\lVert \nabla J(\theta_1)-\nabla J(\theta_2)\right\rVert
        &\leq M_r\left(M_eM_h+M_e^2M_d^2\right) \left\lVert \theta_1 - \theta_2 \right\rVert.
    \end{align}
    Similarly, from \eqref{eq:nabla_E_R_theta_sq}-\eqref{eq:nabla2_E_R_theta_sq}, we obtain
    \begin{align}
        \label{eq:norm_nabla_E_R_theta_sq}
        \left\lVert \nabla\E\left[\left(R^{\theta}\right)^2\right] \right\rVert
        &\leq M_r^2\E\left[\left\lVert\sum_{t=0}^{T-1}\nabla\log\pi_\theta(A_t|S_t)\right\rVert\right]
        \leq M_r^2M_eM_d,
    \end{align}
    and
    \begin{align}
        \label{eq:norm_nabla2_E_R_theta_sq}
        &\left\lVert\nabla^2 \E\left[\left(R^{\theta}\right)^2\right]\right\rVert
        \leq M_r^2\E\left[\left\lVert\sum_{t=0}^{T-1}\nabla^2\log\pi_\theta(A_t | S_t)\right\rVert+
        \left\lVert\sum_{t=0}^{T-1}\nabla\log\pi_\theta(A_t | S_t)\right\rVert^2\right]
        \leq M_r^2\left(M_eM_h+M_e^2M_d^2\right).
    \end{align}
    Now,
    \begin{align}
        \label{eq:norm_nabla2_V}
        \left\lVert \nabla^2 V(\theta)\right\rVert
        &=\left\lVert \nabla^2 \left(\E\left[\left(R^{\theta}\right)^2\right]-J(\theta)^2\right)\right\rVert\nonumber\\
        &=\left\lVert \nabla^2 \E\left[\left(R^{\theta}\right)^2\right]-2J(\theta) \nabla^2 J(\theta)-2\nabla J(\theta) \nabla J(\theta)^\top\right\rVert\nonumber\\
        &\leq\left\lVert \nabla^2 \E\left[\left(R^{\theta}\right)^2\right]\right\rVert+2\left\lvert J(\theta)\right\rvert \left\lVert\nabla^2 J(\theta)\right\rVert+2\left\lVert\nabla J(\theta)\right\rVert^2\nonumber\\
        &\leq 3M_r^2M_eM_h+5 M_r^2M_e^2M_d^2.
    \end{align}
    Hence, from \eqref{eq:norm_nabla2_V} and Lemma~1.2.2 in \cite{nesterov_book}, we obtain
    \begin{align}
        \label{eq:lip_nabla_V}
        \left\lVert \nabla V(\theta_1)-\nabla V(\theta_2)\right\rVert
        &\leq \left(3M_r^2M_eM_h+5 M_r^2M_e^2M_d^2\right) \left\lVert \theta_1 - \theta_2 \right\rVert.
    \end{align}
    % \begin{align}
        % \label{eq:nabla_rho_lambda}
        % &\nabla \rho_\lambda(\theta)=
        % \nabla J(\theta)-\lambda\nabla V(\theta)\nonumber\\
        % &=\nabla J(\theta)-\lambda\nabla \left(\E\left[\left(R^{\theta}\right)^2\right]-J(\theta)^2\right)\nonumber\\
        % &=\nabla J(\theta)-\lambda\nabla \E\left[\left(R^{\theta}\right)^2\right]+2\lambda J(\theta) \nabla J(\theta)\nonumber\\
        % &=\left( 1+2\lambda J(\theta)\right)\nabla J(\theta)-\lambda\nabla \E\left[\left(R^{\theta}\right)^2\right]\nonumber\\
        % &=\left(1+2\lambda \E\left[R^{\theta}\right]\right)\E\left[R^{\theta}\sum_{t=0}^{T-1}\nabla\log\pi_\theta(A_t|S_t)\right]-\lambda \E\left[\left(R^{\theta}\right)^2\sum_{t=0}^{T-1}\nabla\log\pi_\theta(A_t|S_t)\right]
        % \end{align}
    Now,
    \begin{align}
        \label{eq:norm_nabla_rho_lambda}
        \left\lVert \nabla \rho_\lambda(\theta)\right\rVert&=
        \left\lVert \nabla J(\theta)-\lambda\nabla V(\theta)\right\rVert\nonumber\\
        &\leq\left\lVert\nabla J(\theta)\right\rVert+\lambda\left\lVert\nabla \E\left[\left(R^{\theta}\right)^2\right]\right\rVert+2\lambda \left\lvert J(\theta) \right\rvert \left\lVert\nabla J(\theta)\right\rVert\nonumber\\
        &\leq M_rM_eM_d+ 3\lambda M_r^2 M_e M_d.
    \end{align}
    Hence, from \eqref{eq:norm_nabla_rho_lambda} and Lemma~1.2.2 in \cite{nesterov_book}, we obtain
    \begin{align}
        \label{eq:lip_rho_lambda}
        \left\lvert \rho_\lambda(\theta_1)-\rho_\lambda(\theta_2)\right\rvert
        &\leq \left(M_rM_eM_d+ 3\lambda M_r^2 M_e M_d\right)  \left\lVert \theta_1 - \theta_2 \right\rVert.
    \end{align}
    From \eqref{eq:lip_nabla2_J} and \eqref{eq:lip_nabla_V}, we obtain
    \begin{align}
        \label{eq:lip_nabla_rho_lambda}
        \left\lVert \nabla \rho_\lambda(\theta_1)-\nabla \rho_\lambda(\theta_2)\right\rVert &\leq
        \left\lVert \nabla J(\theta_1)-\nabla J(\theta_2) \right\rVert+ \lambda\left\lVert \nabla V(\theta_2)-\nabla V(\theta_1)\right\rVert\nonumber\\
        &\leq \left( M_rM_e\left(M_h+M_eM_d^2\right)+\lambda M_r^2M_e\left(3M_h+5 M_eM_d^2\right)\right) \left\lVert \theta_1 - \theta_2 \right\rVert.
    \end{align}
    %\begin{align}
    %\label{eq:nabla2_V}
    %&\nabla^2 V(\theta)\nonumber\\
    %&=\nabla^2 \left(\E\left[\left(R^{\theta}\right)^2\right]-J(\theta)^2\right)\nonumber\\
    %&=\nabla^2 \E\left[\left(R^{\theta}\right)^2\right]-2J(\theta) \nabla^2 J(\theta)-2\nabla J(\theta) \nabla J(\theta)^\top\nonumber\\
    %&=\left(1+2\lambda J(\theta)\right)\nabla^2 J(\theta)+2\lambda\nabla J(\theta) \nabla J(\theta)^\top-\lambda\nabla^2 \E\left[\left(R^{\theta}\right)^2\right]\nonumber\\
    %&=\E\left[\left(R^{\theta}\right)^2\left(\sum_{t=0}^{T-1}\!\nabla^2\log\pi_\theta(A_t | S_t)+\left[\sum_{t=0}^{T-1}\!\nabla\log\pi_\theta(A_t | S_t)\right] \left[\sum_{t=0}^{T-1}\!\nabla\log\pi_\theta(A_t | S_t)\right]^T\right)\right]\nonumber\\
    %&-2\E\left[R^{\theta}\right]\E\left[R^{\theta}\left(\sum_{t=0}^{T-1}\!\nabla^2\log\pi_\theta(A_t | S_t)+\left[\sum_{t=0}^{T-1}\!\nabla\log\pi_\theta(A_t | S_t)\right] \left[\sum_{t=0}^{T-1}\!\nabla\log\pi_\theta(A_t | S_t)\right]^T\right)\right]\nonumber\\
    %&-2\E\left[R^{\theta}\sum_{t=0}^{T-1}\nabla\log\pi_\theta(A_t|S_t)\right] \E\left[R^{\theta}\sum_{t=0}^{T-1}\nabla\log\pi_\theta(A_t|S_t)\right]^\top\nonumber\\
    %\end{align}

    %\begin{align}
    %\label{eq:nabla2_rho_lambda}
    %&\nabla^2 \rho_\lambda(\theta)=\nabla^2 J(\theta)-\lambda\nabla^2 V(\theta)\nonumber\\
    %&=\nabla^2 J(\theta)-\lambda\nabla^2 \left(\E\left[\left(R^{\theta}\right)^2\right]-J(\theta)^2\right)\nonumber\\
    %&=\nabla^2 J(\theta)-\lambda\nabla^2 \E\left[\left(R^{\theta}\right)^2\right]+2\lambda J(\theta) \nabla^2 J(\theta)+2\lambda\nabla J(\theta) \nabla J(\theta)^\top\nonumber\\
    %&=\left(1+2\lambda J(\theta)\right)\nabla^2 J(\theta)+2\lambda\nabla J(\theta) \nabla J(\theta)^\top-\lambda\nabla^2 \E\left[\left(R^{\theta}\right)^2\right]\nonumber\\
    %&=\left(1+2\lambda \E\left[R^{\theta}\right]\right)\E\left[R^{\theta}\left(\sum_{t=0}^{T-1}\!\nabla^2\log\pi_\theta(A_t | S_t)+\left[\sum_{t=0}^{T-1}\!\nabla\log\pi_\theta(A_t | S_t)\right] \left[\sum_{t=0}^{T-1}\!\nabla\log\pi_\theta(A_t | S_t)\right]^T\right)\right]\nonumber\\
    %&+2\lambda\E\left[R^{\theta}\sum_{t=0}^{T-1}\nabla\log\pi_\theta(A_t|S_t)\right] \E\left[R^{\theta}\sum_{t=0}^{T-1}\nabla\log\pi_\theta(A_t|S_t)\right]^\top\nonumber\\
    %&-\lambda \E\left[\left(R^{\theta}\right)^2\left(\sum_{t=0}^{T-1}\!\nabla^2\log\pi_\theta(A_t | S_t)+\left[\sum_{t=0}^{T-1}\!\nabla\log\pi_\theta(A_t | S_t)\right] \left[\sum_{t=0}^{T-1}\!\nabla\log\pi_\theta(A_t | S_t)\right]^T\right)\right]
    %\end{align}
\end{proof}



\bibliography{vijayan_677}

\end{document}
