\documentclass[accepted]{uai2022} % for initial submission
% \documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
%\usepackage[american]{babel}
\usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%\input{macros}
%%%%%%%%%%% macros.tex %%%%%%%%%

%\usepackage{microtype}
\usepackage{graphicx}
%\usepackage{subfig}
%\usepackage{booktabs} % for professional tables
%\usepackage{mathrsfs}
%\usepackage{enumitem}
%\usepackage{color}
%\usepackage{hyperref}
%\usepackage{dsfont}
%\usepackage{enumitem}
%\usepackage{bbm}
\usepackage{algorithm}
\usepackage{algorithmic}
%\usepackage{todonotes}
%\usepackage{pdfpages}
%\usepackage{amsmath,array}
\usepackage{amssymb}


%% Math operators and blackboard-bold shorthands.
% \argmax, \argmin: upright operator names declared with the starred
% \DeclareMathOperator (amsmath, loaded through mathtools), so subscripts are
% set underneath in display style.  Replaces the obsolete {\rm ...} font switch.
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
% \defn: "defined as" relation symbol.
\newcommand{\defn}{\triangleq}
% \Reals: blackboard-bold R; \ensuremath makes it usable outside math mode.
\newcommand{\Reals}{\ensuremath{\mathbb{R}}}
% \E, \V, \Pr: blackboard-bold E, V and P wrapped as operators whose
% sub/superscripts stay at the side (\nolimits).  The \mbox wrapper is kept
% deliberately so the operator nucleus is a box rather than a single symbol.
\newcommand{\E}{\mathop{\mbox{\ensuremath{\mathbb{E}}}}\nolimits}
\newcommand{\V}{\mathop{\mbox{\ensuremath{\mathbb{V}}}}\nolimits}
\renewcommand{\Pr}{\mathop{\mbox{\ensuremath{\mathbb{P}}}}\nolimits}
%\newtheorem{lemma}{Lemma}
%% Symbol shorthands: each macro expands to a single math symbol, so the
%% notation can be changed in one place.
\newcommand{\pol}{\pi}               % lower-case pi
\newcommand{\Pol}{\Pi}               % upper-case Pi
\newcommand{\bel}{\beta}             % beta
\newcommand{\mdp}{\mu}               % mu
\newcommand{\MDP}{\mathcal{M}}       % calligraphic M
\newcommand{\return}{R}              % R
\newcommand{\utility}{U}             % U
\newcommand{\GP}{\mathcal{GP}}       % calligraphic GP
\newcommand{\param}{\theta}          % unknown parameter
\newcommand{\Params}{\Theta}         % set of unknown parameters (presumably; the original comment repeated "unknown parameter")



%% MDP-related symbols.
\newcommand{\disc}{\gamma}           % discount factor
\newcommand{\MDPs}{\mathcal{M}}      % the MDP
\newcommand{\Pols}{\Pi}              % the policy
% \VC{a}{b}{c}: V with subscript "a,b" and superscript "c".
\newcommand{\VC}[3]{V_{#1,#2}^{#3}}
% \VS{a}{b}: V with subscript "a,b" and a star superscript.
\newcommand{\VS}[2]{V_{#1,#2}^{*}}
\newcommand{\CS}{\mathcal{S}}        % the state space
\newcommand{\CA}{\mathcal{A}}        % the action space
\newcommand{\CV}{\mathcal{V}}        % the value function estimate
\newcommand{\trans}{\mathcal{T}}     % the transition kernel
\newcommand{\rew}{\rho}              % the reward function
\newcommand{\abel}{\hat{\xi}}        % approximate belief
\newcommand{\mbel}{\psi}             % belief about MDPs

% \p: shorthand for the partial-derivative symbol.
\newcommand{\p}{\partial}

%% Operator, value and horizon symbols.
% NOTE(review): \mathscr is normally provided by mathrsfs, whose \usepackage
% line is commented out in this file -- confirm another package or the class
% supplies it before using \Bellman or \PBellman.
\newcommand{\Bellman}{\mathscr{L}}
\newcommand{\PBellman}{\mathscr{P}}
\newcommand{\util}{U}
% NOTE(review): \vectorsym is not defined anywhere in the visible part of the
% file; \val will raise an "undefined control sequence" error when used unless
% it is defined elsewhere.
\newcommand{\val}{\vectorsym{v}}
\newcommand{\Vals}{\mathcal{V}}
\newcommand{\discount}{\gamma}
\newcommand{\horizon}{T}

%\newcommand \StateSet {{\CQ}}


%% Commands

% \DeclareMathOperator{\st}{s.t.\,}
% \DeclareMathOperator{\trace}{tr}

%% Norms with automatically sized double-bar delimiters.
% \onenorm{x}:  ||x||_1
\newcommand{\onenorm}[1]{\left\|#1\right\|_1}
% \pnorm{x}{p}: ||x||_p
\newcommand{\pnorm}[2]{\left\|#1\right\|_{#2}}
% \inftynorm{x}: ||x||_infinity.  The delimiters were previously written as
% "\left\right\|#1\|", which does not form a valid \left...\right pair.
\newcommand{\inftynorm}[1]{\left\|#1\right\|_\infty}
% \norm{x}: ||x|| without a subscript.
\newcommand{\norm}[1]{\left\|#1\right\|}


% \dd: upright differential "d" preceded by a thin space.
\newcommand{\dd}{\, \mathrm{d}}
% \Pr: redefined as a plain blackboard-bold P, replacing the earlier
% definition of \Pr in this preamble.
\renewcommand{\Pr}{\mathbb{P}}
% \UE, \UA: U with subscript alpha_2 / alpha_1 and superscript E / A.
\newcommand{\UE}{U_{\alpha_2}^E}
\newcommand{\UA}{U_{\alpha_1}^A}

%% Sets, indicator and divergence notation.
% \cset{x}{cond}: set-builder notation { x | cond } with a bar that scales
% with the braces.
\newcommand{\cset}[2]{\left\{#1 ~\middle|~ #2\right\}}
% \set{x}: { x } with automatically sized braces.
\newcommand{\set}[1]{\left\{#1\right\}}
% \ind{A}: double-struck 1 followed by { A }.
% NOTE(review): \mathds normally comes from dsfont, whose \usepackage line is
% commented out in this file -- confirm it is available before using \ind.
\newcommand{\ind}[1]{\mathds{1}\left\{#1\right\}}

% \KL{P}{Q}: D( P || Q ) with a scaling double bar.
\newcommand{\KL}[2]{D\left( #1 ~\middle\|~ #2\right)}

%% Names of distributions and functions, set in the \mathpzc math alphabet
%% (font family pzc in OT1 encoding, medium weight, italic shape).
\DeclareMathAlphabet{\mathpzc}{OT1}{pzc}{m}{it}

\newcommand{\Softmax}{{\mathpzc{Softmax}}}
\newcommand{\GammaDist}{{\mathpzc{Gamma}}}
\newcommand{\Dirichlet}{{\mathpzc{Dir}}}
\newcommand{\Uniform}{{\mathpzc{Unif}}}
\newcommand{\Bernoulli}{{\mathpzc{Bern}}}
\newcommand{\Binomial}{{\mathpzc{Binom}}}
\newcommand{\Beta}{{\mathpzc{Beta}}}
\newcommand{\Geometric}{{\mathpzc{Geom}}}
\newcommand{\Normal}{{\mathpzc{N}}}
\newcommand{\Multinomial}{{\mathpzc{Mult}}}
\newcommand{\Wishart}{{\mathpzc{Wish}}}


%% Draft-annotation macros (currently disabled).
% The guard used to be written as "\if 1" at the end of a line.  \if compares
% the next two tokens, here the character 1 and the space token produced by
% the line end, so the test was always false and only the \else branch ran.
% \iffalse states that outcome explicitly; change it to \iftrue to enable the
% full set of annotation macros.
% NOTE(review): \del uses \sout, which is presumably expected from the ulem
% package (not loaded in the visible preamble) -- confirm before enabling.
\iffalse
\newcommand \note[2][blue] {{\color{#1} \texttt{[#2]}}}
\newcommand \cor[2][red] {{\color{#1}#2}}
\newcommand \ins[2][magenta] {{\color{#1}#2}}
\newcommand \del[2][red] {{\color{#1}\sout{#2}}}
\newcommand \grm[2][green!50!black] {{\color{#1}#2}}
\else

% \ins[colour]{text}: typeset text in the given colour (default magenta).
\newcommand \ins[2][magenta] {{\color{#1}#2}}
\fi

%%% other notes
% Per-author inline comments: each prints a coloured, bracketed note tagged
% with the author's initials (HE, CD, DB, EJ).
\newcommand{\hecomment}[1]{{\color{red}{[HE: \texttt{#1}]}}}
\newcommand{\cdcomment}[1]{{{\color{blue}\texttt{[CD: #1]}}}}
\newcommand{\dbcomment}[1]{{\color{magenta}{[DB: \texttt{#1}]}}}
\newcommand{\ejcomment}[1]{{{\color{orange}\texttt{[EJ: #1]}}}}
% mycomments{colour}: environment whose body is typeset in the given colour;
% \leavevmode starts a paragraph first and \ignorespaces drops the space
% after \begin{mycomments}{...}.
\newenvironment{mycomments}[1]%
  {\leavevmode\color{#1}\ignorespaces}%
  {}%

%\newcommand \mdp {\ensuremath{\mathcal{M}}}
%\newcommand \pol {\ensuremath{\pi}}
%% Second set of symbol shorthands.  Those wrapped in \ensuremath can be used
%% in text as well as in math mode.
\newcommand{\states}{\ensuremath{\mathcal{S}}}
\newcommand{\actions}{\ensuremath{\mathcal{A}}}
\newcommand{\transitions}{\ensuremath{\mathcal{T}}}
\newcommand{\rewards}{\ensuremath{\mathcal{R}}}
\newcommand{\agent}{\ensuremath{\theta}}
\newcommand{\adversary}{\ensuremath{\omega}}
\newcommand{\paramadversary}{\ensuremath{\psi}}
\newcommand{\paramagent}{\ensuremath{\phi}}
\newcommand{\real}{\ensuremath{\mathbb{R}}}
% \expect, \var: blackboard-bold E and V as operators with side-set scripts.
\newcommand{\expect}{\mathop{\mbox{\ensuremath{\mathbb{E}}}}\nolimits}
\newcommand{\var}{\mathop{\mbox{\ensuremath{\mathbb{V}}}}\nolimits}
% \cvar{X}: upright "CVaR" followed by [X] with scaling brackets.
\newcommand{\cvar}[1]{\ensuremath{\mathrm{CVaR}\left[#1\right]}}
% \prob{A}: blackboard-bold P followed by [A] with scaling brackets.
\newcommand{\prob}[1]{\mathbb{P}\left[#1\right]}
% \returns: Z with subscript pi, evaluated at s.
\newcommand{\returns}{\ensuremath{Z_{\pi}(s)}}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\usepackage{caption,subcaption}
\usepackage{amsmath,array, amssymb, amsthm}
\usepackage{tikz}
% \dataset: calligraphic D.  Uses \mathcal instead of the obsolete {\cal ...}
% font switch.
\newcommand{\dataset}{\mathcal{D}}
% \fracpartial{f}{x}: fraction with "partial f" over "partial x".
\newcommand{\fracpartial}[2]{\frac{\partial #1}{\partial  #2}}
% \ifsinglecol: boolean switch, initialised to false.
\newif\ifsinglecol
\singlecolfalse

%\newenvironment{proof}{\par\noindent{\bf Proof\ }}{\hfill\BlackBox\\[2mm]}
%% Theorem-like environments.
% "example" and "theorem" each have their own counter.
\newtheorem{example}{Example}
\newtheorem{theorem}{Theorem}
% The remaining environments are numbered on the "theorem" counter.
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{remark}[theorem]{Remark}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{conjecture}[theorem]{Conjecture}
\newtheorem{axiom}[theorem]{Axiom}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[sep]{a}{b}: prints the two mandatory arguments in reverse order,
% joined by the optional separator (default "-"), i.e. "b sep a".
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{SENTINEL: Taming Uncertainty\\ with Ensemble based Distributional Reinforcement Learning}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1, 2]{\href{mailto:<hannese@chalmers.se>?Subject=SENTINEL: Taming Uncertainty with Ensemble based Distributional Reinforcement Learning}{Hannes Eriksson}{}}
\author[3, 4]{Debabrota Basu}
\author[1]{Mina Alibeigi}
\author[2,5]{Christos Dimitrakakis}
% Add affiliations after the authors
\affil[1]{%
    Zenseact AB\\
    Gothenburg, Sweden
}
\affil[2]{%
    Chalmers University of Technology\\
    Gothenburg, Sweden
}
\affil[3]{%
    Scool\\
    INRIA Lille-Nord Europe\\
    Lille, France
  }
  
\affil[4]{%
    CRIStAL,
    CNRS,
    Lille, France
  }
  
\affil[5]{%
    University of Neuchatel, Switzerland
    and
    University of Oslo, Norway
  }
  
\begin{document}
\maketitle

\begin{abstract}
    In this paper, we consider risk-sensitive sequential decision-making in Reinforcement Learning (RL). 
    Our contributions are two-fold. First, we introduce a novel and coherent quantification of risk, namely composite risk, which quantifies the joint effect of aleatory and epistemic risk during the learning process.
    Existing works considered either aleatory or epistemic risk individually, or as an additive combination.
    We prove that the additive formulation is a particular case of the composite risk when the epistemic risk measure is replaced with expectation.
    Thus, the composite risk is more sensitive to both aleatory and epistemic uncertainty than the individual and additive formulations.
    We also propose an algorithm, SENTINEL-K, based on ensemble bootstrapping and distributional RL for representing epistemic and aleatory uncertainty respectively. The ensemble of K learners uses Follow The Regularised Leader (FTRL) to aggregate the return distributions and obtain the composite risk.
    We experimentally verify that SENTINEL-K estimates the return distribution better, and, when used with composite risk estimates, demonstrates higher risk-sensitive performance than state-of-the-art risk-sensitive and distributional RL algorithms.
\end{abstract}

%\input{introduction}
%%%%%%%%%%%% introduction.tex %%%%%%%%%%%

\section{Introduction}\label{sec:intro}
Reinforcement Learning (RL) algorithms, with their recent success in games and simulated environments~\citep{mnih2015human}, have drawn interest for real-world and industrial applications~\citep{pan2017virtual,mahmood2018benchmarking}. 
In addition, since in RL the environment is by definition unknown to the agent, exploring it so as to improve performance and eventually obtain the optimal policy entails risks.
Although the risk is not an issue in simulation, it is important to consider risks when interacting in the real world~\citep{pinto17a,garcia2015comprehensive,DBLP:journals/corr/abs-1810-09126}.
In this paper, we employ a model-free approach that enables us both to be efficient in terms of the amount of data needed, and to be flexible with respect to the risk metric the agent should consider when making decisions.
\begin{figure*}[t!]
	\centering
	\includegraphics[width=0.79\textwidth]{blockdiagram}
	\caption{SENTINEL-K with FTRL-driven composite risk estimator and K CDQNs as return distribution estimators.}% {Hannes: I drew it in tikz, see blockdiagram.tex, let me know if you want me to change it a bit more} {Deb: have a few more comments. lets talk.}}
	\label{fig:blockdiagram}
\end{figure*}

Risk sensitivity in reinforcement learning and Markov Decision Processes (MDPs) has sometimes been considered under a minimax formulation over plausible MDPs~\citep{satia:uncertain,HEGER1994105,tamar2014scaling}.
Alternative approaches include maximising a risk-sensitive statistic instead of the expected return~\citep{chow2014algorithms,tamar2015optimizing,clements2019estimating}.
In this paper, we focus on the second approach due to its flexibility.
Either approach requires estimating the uncertainty associated with the decision-making procedure.
This uncertainty includes both the inherent randomness in the model and the uncertainty due to imperfect information about the true model.
These two types of uncertainty are called \textit{aleatory} and \textit{epistemic} uncertainty, respectively~\citep{der2009aleatory}.

In recent literature, researchers have either quantified epistemic and aleatory risks separately~\citep{mihatsch2002risk,eriksson2019epistemic} or considered an additive risk formulation where their weighted sum is minimised by an RL algorithm~\citep{clements2019estimating}. 

In this work, we propose a \textit{composite risk} formulation in order to accurately capture the combined effect of aleatory and epistemic uncertainty for decision-making in RL (Section~\ref{sec:risk}). Our composition of risks relies on \emph{coherent} risk measures, for which we show that their composition remains coherent. Our choice of focusing on coherent risk measures is also motivated by their extensive use and corresponding benefits in control theory~\citep{majumdar2017risk}, decision theory~\citep{pflug2016time}, and reinforcement learning theory~\citep[and references therein]{tamar2016sequential,ruszczynski2010risk}.

%Using Theorem~\ref{thm:comp_geq_add} and a reductive experiment (Figure~\ref{fig:gauss-mix-cvar}), we show that using an additive risk, which is the sum of separately computed epistemic and aleatory Conditional Value at Risk (CVaR)\footnote{$CVaR_{\alpha}$ is the expected value of $\alpha\%$ of events in the left tail.},  strictly underestimates the total CVaR~\citep{rockafellar2000optimization}, and the deviation is significant as CVaR focuses more on less probable events. In contrast, the composite risk takes into consideration the combined effect of two types of uncertainty, and better reflects the underlying risk. Finally, we show that additive risk is essentially a special case of composite risk.

We incorporate composite risk measures within the Distributional RL (DRL) framework~\citep{bellemare2017distributional,tang2018exploration,rowland2019statistics}.
The DRL framework aims to model the distribution of returns of a policy for a given environment (Section~\ref{sec:drl}).
This highly expressive distributional representation allows us to both estimate appropriate risk measures and to incorporate them in final decision-making.
However, DRL approaches are typically limited to modelling aleatory uncertainty, with epistemic uncertainty due to partial information not being explicitly modelled in terms of the return distribution.
We use a bootstrapping~\citep{efron1985bootstrap} framework to represent epistemic uncertainty. Our framework, which we call SENTINEL-K, is illustrated in Figure~\ref{fig:blockdiagram}. At a high level, we use Categorical Deep Q Network (CDQN)~\citep{bellemare2017distributional} to model aleatory uncertainty and a bootstrapped ensemble for epistemic uncertainty. These can be used with any coherent measures and ensemble algorithm.

We discuss related work in Section~\ref{sec:related}. This is followed by some background on risk
measures, Markov decision processes, and DRL in
Section~\ref{sec:background}. SENTINEL-K is flexible enough to use any combination of
coherent risk measures for aleatory and epistemic risks, as we explain in Section~\ref{sec:risk}.  The algorithm is described in detail in Section~\ref{sec:algo}, with Sections~\ref{sec:ensemble} and~\ref{sec:ftrl} showing how the ensemble is created and how its members are weighted, respectively.


 Section~\ref{sec:experiments} examines the performance of SENTINEL-K with a composite CVaR metric on a highway environment with $10$ cars. Our results show that our approach leads to fewer crashes than competing algorithms: Variational DQN (VDQN)~\citep{tang2018exploration}, CDQN~\citep{bellemare2017distributional}, total variance decomposition Uncertainty Aware-DQN (UA-DQN)~\citep{clements2019estimating}, as well as SENTINEL-K with additive CVaR estimate, which we used as an ablation test to showcase the importance of using a coherent composite risk.
The supplementary material includes further experiments, showing that SENTINEL-K features significantly improved estimates of return distributions, and that using FTRL for weighing the ensemble members measurably improves performance.
  

\section{Related Work}\label{sec:related}
For RL applications in the real world, such as for autonomous driving and robotics, \textit{risk-sensitive} RL approaches can avoid the negative consequences of excessive exploration that may lead to unsafe decisions in real-life.
This has initiated a spate of research efforts~\citep{howard1972risk,satia:uncertain, coraluppi1999risk, marcus1997risk, mihatsch2002risk,DBLP:journals/corr/abs-1810-09126} spanning five decades. But the majority of risk-sensitive RL papers~\citep{howard1972risk, coraluppi1999risk, marcus1997risk} focused on discrete state-space MDPs and either aleatory or epistemic risk.
We are interested in designing a general risk-sensitive framework applicable to any type of state space and risk.

Both \textit{aleatory} and \textit{epistemic} uncertainties are important for risk-sensitive RL.
The former expresses the \emph{randomness} inherent to the problem and the latter a \emph{lack of knowledge} about the problem. Aleatory risk-sensitivity in MDPs was first considered by~\citet{howard1972risk}, who introduced the idea of exponential utilities for the return.\footnote{Here, we use return to mean the total discounted reward.} Epistemic uncertainty in MDPs was investigated by~\citet{satia:uncertain}, who provided game theoretic and Bayesian solution methods. Later works~\citep{coraluppi1999risk, marcus1997risk, mihatsch2002risk} extend risk-neutral methods to the risk-sensitive setting by using a non-linear utility~\citep{garcia2015comprehensive}. They consider aleatory risk-sensitive RL with exponential utility on the return~\citep{mihatsch2002risk}.
Follow-up works~\citep{chow2014algorithms, chow2015riskconstrained} focus on scaling up these approaches. Other work on risk-sensitive RL focuses on CVaR~\citep{chow2014algorithms, tamar2015optimizing, chow2015risk}.
%\cite{cvar_conc} shows that CVaR of a distribution can be accurately estimated using i.i.d. samples.
There have been recent works considering epistemic risk~\citep{eriksson2019epistemic}, wherein problem uncertainty is expressed in a Bayesian framework as a distribution over MDPs.
\citet{depeweg2018decomposition, clements2019estimating} intuitively incorporate both of these risks in decision making.
\citet{depeweg2018decomposition} consider the risk in the per-step rewards obtained in an MDP while
\citet{clements2019estimating} propose to use the additive formulation of epistemic and aleatory risks. Both of them use variance, which is not a coherent measure~\citep{coherent_risk}. Unlike previous work, our methodology of composite risk also allows us to apply any pair of coherent risk measures\footnote{For example, CVaR, Wang risk measure~\citep{Wang2002ARM}, Standard Deviation (SD).} to aleatory and epistemic uncertainty.

We instead define a generalised composite risk measure that takes into account both epistemic and aleatory uncertainty, and their entangled effect. Coherence is important, as we show that for any two coherent risk measures the composite risk retains coherence. This gives a principled approach for combining different application-appropriate risk measures for epistemic and aleatory uncertainties. 


%In~\citep{bellemare2017distributional} the authors propose a distributional reinforcement learning framework where instead of learning the expected return, they learn a categorical representation of the return. 
%Follow up papers~\citep{dabney2018distributional, rowland2019statistics} improve on the original framework by considering quantiles and expectiles, respectively.  
%In~\citep{tang2018exploration} the authors propose VDQN, a distributional Q-learner and demonstrate results for a Gaussian.

To express aleatory uncertainty, we rely on a distributional RL method called CDQN, which incorporates highly expressive approximators to model continuous and multimodal return distributions. In addition, we leverage ensemble methods to express epistemic uncertainty. Ensemble methods were first used in risk-neutral RL for representing epistemic uncertainty in order to improve exploration~\citep{dimitrakakis2006nearly,dimitrakakis2007ensembles}. This approach was later applied to MDPs by~\citet{osband2016deep}. On the other hand, \citet{wiering2008ensemble} used ensembles to combine policies instead. Ensembles have also been used to represent aleatory uncertainty~\citep{fausser2015neural,  pacchiano2020optimism}. Recently, \citet{depeweg2018decomposition,clements2019estimating} also used multiple Bayesian Neural Networks (BNNs) to estimate epistemic uncertainty. To the best of our knowledge, we are the first to use bootstrapped CDQNs for quantifying epistemic risk, which gives us freedom to model distributions on plausible MDPs without any structural assumptions, e.g.\ Gaussian distribution on parameters of Bayesian NNs or Gaussian distribution on state transitions~\citep{clements2019estimating}. An additional difference with prior work is that we use a follow the regularised leader (FTRL) algorithm to weigh the ensemble members in order to improve our uncertainty estimates.
\iffalse
%use as motivation of model selection with ftrl later
In~\citep{pacchiano2020optimism}, the authors show that model selection can give some advantages over model averaging. In that work the authors only consider models inside the sphere $\mdp_M \pm \epsilon_M$, which thereby excludes models that are overly pessimistic or optimistic.
\fi
%\paragraph{Our Contributions.}
%In this paper, we propose two main contributions. 
%The first is the \textit{composite risk} formulation, which is more holistic than considering aleatory and epistemic risk individually. It also estimates the total risk more accurately than the additive risk formulation, which underestimated the total risk.  
%We also reintroduce FTRL from bandit literature as a means of model selection, by weighting each estimator differently depending on how far away they are from the average estimator. 
%We demonstrate better sample complexity over naive model averaging empirically.
%We experimentally demonstrate superior performance of the proposed composite risk-driven FTRL-based distributional RL over the risk-neutral and additive risk-based distributional RL in terms of (i) uncertainty estimation, (ii) performance, and (iii) theoretical properties.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


%\input{background}
%%%%%%%%%% background.tex %%%%%%%%%%%%

\section{Background}\label{sec:background}
%In this section, we introduce the notion of risk measures, coherent risk measures, the risk-sensitive Markov decision process formulation, and the distributional RL framework.

%, and the Risk-Sensitive Distributional (RSDRL) formulation.
\subsection{Risk Measures: Coherence}
%\cdcomment{Note that what we call 'utility' is a special case of a risk measure, I guess}
The idea of quantifying risk in decision making is long-studied in decision theory and has found multiple applications in finance and actuarial science.
A \textit{risk measure} maps a real-valued distribution to a real number, and quantifies the probability of occurrence of an event away from the expectation~\citep{szego2002measures}.
Some well-known risk measures are variance, Value at Risk (VaR) and Conditional Value at Risk (CVaR). \textit{Coherent} risk measures obey a set of axioms~\citep{coherent_risk}: normalisation, monotonicity, sub-additivity, homogeneity, and translation invariance.
Not all risk measures are coherent: CVaR is coherent, but variance and VaR do not satisfy homogeneity and subadditivity, respectively~\citep{coherent_risk}.

If a coherent risk measure also satisfies comonotonic subadditivity~\citep[Axiom 4]{SONG2009459}, it can be expressed as an expectation over a distorted distribution function, for a concave \textit{distortion function} $U_\alpha:[0,1] \to [0,1]$.  Specifically (see~\citep[Theorem 2]{WANG1997173}) a random variable $Z$ with associated probability measure $P$ and cumulative distribution function $F_Z$ satisfies: 
\begin{align}
    &\mathrm{Risk}_{U_\alpha}(Z) \triangleq \int_{\mathcal{Z}} Z \dd(U_{\alpha}\circ P)\notag\\ 
    &= \int_{\mathcal{Z}} U_{\alpha}(1-F_Z(z))\dd z = \int_0^1 U_{\alpha}(t)\dd q(1-t),
\end{align}
where $(U_{\alpha}\circ P)(A) \defn U_{\alpha}[P(A)]$ for any $A\subseteq \mathcal{Z}$.
The last equality is obtained by substitution of variables~\citep{wirch2001distortion}.
Here, $q$ is the quantile function, i.e.\ $q(1-t) = \inf\{z\geq 0 \mid F_Z(z) \geq 1-t \} = F_Z^{-1}(1-t)$, and the distortion function satisfies $U_{\alpha}(0)=0$ and $U_{\alpha}(1)=1$.
%, and $g'(t)$ is derivative of $g(t)$ with respect to $t$.
%The distortion function allows us to treat different samples with different risk-sensitive weights unlike standard expectation where $U(t) = t$. 
Since in this paper we use the risk measures for decision making, we represent a coherent risk measure through its corresponding \emph{distortion function} $U_{\alpha}$.
% cd: I don't like calling it distortion utility function, as it distorts only the probability, rather than the utility.

In this paper we focus on the \textit{CVaR}~\citep{rockafellar2000optimization} risk measure. It is extensively used in risk-sensitive RL as it is coherent, applies to general $L_p$ spaces, and captures the heaviness of the tail of a distribution. It is the expectation of the worst $\alpha$-quantile of a probability distribution, with $\alpha \in [0,1]$:
\begin{align}\hspace*{-1em}
\mathrm{CVaR}_\alpha(Z) &\triangleq \mathbb{E}[Z \mid Z \leq \nu_\alpha \wedge \Pr(Z \geq \nu_\alpha) = 1-\alpha].
\end{align}
For CVaR, $U_{\alpha}(t) = \min \{ \frac{t}{1-\alpha},1\}$.
For $\alpha = 1$, CVaR reduces to the expected value, and thus risk-neutrality.

Due to the generality of our methodology and the composite risk formulation, we are able to incorporate other coherent risk measures such as the Wang risk measure~\citep{Wang2002ARM}, and standard deviation~\citep{pcirillo} (Fig.~\ref{fig:risk}).%, and entropic VaR~\citep{ahmadi2012entropic}.

\iffalse
\begin{equation}
e_\tau(L) = \underset{\ell \in \mathbb{R}}{\arg \min} \, \mathbb{E}[\tau \max(L - \ell, 0)^2+(1-\tau)\max(\ell-L, 0)^2]
\end{equation}

~\cite{emmer2014vwhat} shows that expectiles as a risk measures are not comonotically additive, while they are coherent risk measures. \cite{rowland2019statistics} uses expectiles instead of categorical or quantiles for DRL, and is the successor work of the QDRL paper.

\begin{theorem}
	A risk measure $\rho$ is \textbf{comonotonically additive} if for any comonotic random variables $L_1$ and $L_2$ it holds that
	\begin{equation}
	\rho(L_1 + L_2) = \rho(L_1) + \rho(L_2).
	\end{equation}
\end{theorem}

\begin{theorem}
	A risk measure $\rho$ is \textbf{subadditive} if it holds that
	\begin{equation}
	\rho\Big(\sum_{i=1}^N L_i\Big) \leq \sum_{i=1}^N \rho(L_i).
	\end{equation}
\end{theorem}

\begin{align}
F(U_E, U_A, \beta) &= \int_\Theta U_E \Big(\int_\mathcal{Z}U_A(z)d\Pr(z \, | \, \theta)\Big)d\beta(\theta)\\
&= \int_\Theta U_E \Big(\mdp_i + \sqrt{-2\ln \alpha}\Big)d\beta(\theta)\\
&= \hat{\mdp} + \sqrt{-2 \ln \alpha} - \sqrt{\frac{1}{N}\Sigma_{i=1}^N(\mdp_i^2 - \hat{\mdp}^2)}\sqrt{-2 \ln \alpha}
\end{align}
\fi

\subsection{RL: MDP and Distributional RL}\label{sec:drl}
\noindent\textbf{MDPs.} We consider problems that can be modelled by a Markov Decision Process (MDP)~\citep{sutton2018reinforcement}. An MDP is a tuple $\mdp \triangleq (\mathcal{S}, \mathcal{A}, \mathcal{R}, \mathcal{T}, \gamma)$. Here, $\mathcal{S} \subseteq \mathbb{R}^d$ is a state space of dimension $d$. $\mathcal{A}$ is the set of admissible actions. $\mathcal{T}$ is a transition kernel that determines the probability of successor states $s'$ given the present state $s$ and action $a$. The reward function $\mathcal{R}$ quantifies the goodness of taking action $a$ in state $s$. %In general we consider problems where some of the parameters of the MDP are unknown, such as the transition and reward functions. 
%This sequential decision-making problem can be viewed as a Reinforcement Learning (RL) problem~\cite{sutton2018reinforcement}.
In the risk-neutral setup, the goal of the agent is to find a policy $\pi: \mathcal{S} \rightarrow \mathcal{A}$ to maximise the expected value of cumulative rewards given a time horizon $T$: 
$V^\pi(s,a) = \mathbb{E}\left[\sum_{t=0}^T \gamma^{t}R(s_t, a_t)\right]$.
Here, $s_t \sim \mathcal{T}(\cdot \mid s_{t-1},a_{t-1})$, $a_t = \pi(s_t)$, $s_0=s$, $a_0=a$, and the discount factor $\gamma \in (0,1)$.
% When the utility function $U$ is the identity, $U^\pi(s,a)$ reduces the expected long-term discounted reward, i.e. the Q-function. If  $U$ is a coherent risk measure, such as CVaR, it leads to a risk-sensitive formulation of MDP~\citep{mihatsch2002risk,DBLP:journals/corr/abs-1810-09126}. \cdcomment{I don't know how this works. The distortion function you define is $[0,1]\to[0,1]$. However, the utility defined here is $\Reals \to \Reals$. What is the correspondence between U and g? They sem to be very different things. Do we need this part? Is it left over from the previous version?}\dbcomment{No, we dont}
%\subsection{RL: MDP and Distributional RL}\label{sec:drl}
\noindent\textbf{Distributional RL.}
The variable at the core of both risk-neutral and risk-sensitive RL is usually the accumulated discounted reward $Z^\pi(s,a) \triangleq \sum_{t=0}^T \gamma^{t}R(s_t, a_t)$.
$Z^\pi(s,a)$ is called the return of a policy $\pi$.
In distributional RL, the goal is to learn the return distribution $Z^{\pol}(s, a)$ obtained by following policy $\pol$ from state $s$ and action $a$ under the given MDP.

%Different methods are proposed to parametrise the return distribution.
%\cite{bellemare2017distributional} propose \emph{CDQN}, a categorical distribution with $N$ atoms and, with support in $[V_{min}, V_{max}]$. 
%The mass of the atom $z_i$ is given by $\frac{e^{\theta_i(s,a)}}{\sum_j e^{\theta_j(s,a)}}$. 
%\cite{tang2018exploration}, \cite{Dabney2018DistributionalRL}, and~\cite{rowland2019statistics} use unimodal Gaussians, quantiles, and expectiles to model the return distribution respectively.
%In this work, we choose to extend CDQN, as it permits richer representations of distributions, and flexibility to compute different statistics.


In this work, we choose to extend CDQN by~\citet{bellemare2017distributional}, as it permits richer representations of distributions, and flexibility to compute different statistics. The intuition of using this distributional framework for risk-sensitive RL is its flexibility to model multimodal and asymmetrical distributions, which is important for an accurate estimate of risk.

\iffalse
Get $\Pr(z \, | \, s, a, \theta_n)$. (Just forward pass of each independent network).
Compute $\frac{1}{N}\sum_{n=0}^N \mathbb{E}[z \, | \, s, a, \theta_n]$. Select the maximising action $a^{*}$. For conciseness lets just say it is the following: $a^{*} = \underset{a \in \mathcal{A}}{\arg\max} \, \mathbb{E}_{\Theta, \mathcal{Z}}[z \, | \, s, a, \theta]$. Now given this optimal action $a^{*}$ we want to update the mass of each atom in $\Pr(z \, | \, s, a, \theta_n)$. So first, do a Bellman projection $[r_t + \gamma z]_{V_{\text{MIN}}}^{V_{\text{MAX}}}$. Then, distribute its mass to its nearest neighbours. Do this for all atoms

So the full pseudo-code is the following:

\begin{enumerate}
	\item (Given a sample transition $\tau = (s, a, r, x')$
	\item Obtain $a^{*} = \underset{a \in \mathcal{A}}{\arg\max} \, \mathbb{E}_{\Theta, \mathcal{Z}}[z \, | \, s, a, \theta]$
	\item  $(\Phi \hat{\mathcal{T}}Z_{\theta_n}(s, a))_i = \sum_{j=0}^{N-1}\Big[1-\frac{|\lfloor \hat{\mathcal{T}}z_j\rfloor_{V_{\text{MIN}}}^{V_{\text{MAX}}} -z_i|}{\Delta z}\Big]_0^1 \, p_j(x', a^{*})$
	\item Minimise cross-entropy loss $D_{\text{KL}}(\Phi\hat{\mathcal{T}}Z_{\tilde{\theta}_n}(s, a) \, || \, Z_{\theta_n}(s, a))$
\end{enumerate}

What do we do differently? Well, $Q(x_{t+1}, a) := \sum_i z_i p_i(x_{t+1}, a)$ is simply $\int_\mathcal{Z}z\dd\Pr(z \, | \, \theta)$. So for aleatory DRL, we could simply change this to:

\begin{equation}
Q(x_{t+1}, a) := \sum_i U(z_i) p_i(x_{t+1}, a) = \mathbb{E}[z_i \, | z_i \leq \nu_\alpha].
\end{equation}



\section{Problem Formulation}
In this work we are considering decision making problems that can be modelled by a Markov Decision Process (MDP). An MDP is a tuple $(\mathcal{X}, \mathcal{A}, \mathcal{R}, \mathcal{T}, \gamma)$, where $\mathcal{X} \in \mathbb{R}^d$ is our state representation of dimension $d$. $\mathcal{A}$ is the set of actions admissible in this MDP, $\mathcal{T}$ is a transition kernel $\Pr(x' \, | \, s, a)$ which determines the probability of successor states, given that the agent is in state $x$ and commits to action $a$. The reward function $\mathcal{R}$ similarly considers the goodness of taking action $a$ in state $x$, and is typically a scalar. In general we consider problems where some of the parameters of the MDP are unknown, such as the transition and reward functions. 
This sequential decision making problem can be viewed as a Reinforcement Learning (RL) problem~\citep{sutton2018reinforcement}.

The goal of the agent is generally to maximise some \textit{utility function}~\citep{friedman1948uac} over the reward sequence $r_t, \ldots, r_{T}$, where $T$ is the \textit{problem horizon}. A common approach is to maximise the following: $R_t = \sum_{k=t}^{T} \gamma^{k-t} r_k$. In this paper, we consider alternative concave (for risk-aversion), and convex (for risk-taking) utility functions. A common approach in recent times is conditional value-at-risk (CVaR), which typically only considers performance in parts of the return distribution. 

For analysis, it is imperative to consider two additional functions, namely the \textit{value function} $V$ and the policy $\pol$. The value function $V_\mdp^\pol : \mathbb{R}^d \rightarrow \mathbb{R}$ assesses the goodness of following policy $\pol$ in MDP $\mdp$. 

\fi

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%



%\input{risk_measures}
%%%%%%%%%%%%%%% risk_measures.tex %%%%%%%%%%%%%

\section{Quantifying Composite Risk}
\label{sec:risk}
In risk-sensitive RL, we encounter two types of uncertainties: \textit{aleatory} and \textit{epistemic}.
Aleatory uncertainty is engendered by the stochasticity of the MDP model $\mdp$ and the policy $\pol$. 
Epistemic uncertainty exists due to the fact that the MDP model $\mdp$ is unknown. In the Bayesian setting, this is represented as a belief distribution $\beta$ over a set of plausible MDPs $\Theta$. Hence, risk measures can also be defined with respect to the MDP distribution.
Consequently, as an agent learns more about the underlying MDP, the epistemic risk vanishes.
The aleatory risk is inherent to the MDP  $\mdp$ and policy $\pol$, and thus persists even after correctly estimating the model $\mdp$. Let us now define risk measures for aleatory and epistemic uncertainties, and then combine them into a composite risk measure.

\textbf{Aleatory Risk.}
Given a coherent risk measure with distortion function $U^A_{\alpha}$, the aleatory risk is quantified as the deviation of total risk of individual models from the risk of the average model.
\begin{align*}
    A(U^A_{\alpha}, \beta) &\triangleq \int_{\Theta} \int_{\mathcal{Z}} Z \dd(U^A_{\alpha} \circ \Pr)(Z|\theta) \dd\beta(\theta) \\&\quad- \int_{\mathcal{Z}} \hat{Z} \dd(U^A_{\alpha} \circ \Pr)(\hat{Z}).
\end{align*}
%\iffalse
\ifsinglecol
\begin{align*}
%A(U^A_{\alpha}, \beta) &\triangleq \mathbb{E}_\beta[\mathbb{E}_{\Pr(.|\theta)}[U^A_{\alpha}(Z)]-U^A_{\alpha}(\mathbb{E}_{\Pr(.|\theta)}[Z])]\\
%= \int_\Theta \int_{\mathcal{Z}}(U_A(z) -U_A(\mu_z))\dd\Pr(z \, | \, \theta) \dd\bel(\theta),\\
A(U^A_{\alpha}, \beta) &\triangleq \mathbb{E}_{\theta \sim \beta}[\sup_{Q\in Q^{\theta}_{\alpha}}\mathbb{E}[Z_\theta] - \sup_{Q\in Q^{\hat{\theta}}_{\alpha}}\mathbb{E}[Z_{\hat{\theta}}]] = \mathbb{E}_{\theta \sim \beta}[\sup_{Q\in Q^{\theta}_{\alpha}}\mathbb{E}[Z_\theta]] - \sup_{Q\in Q^{\hat{\theta}}_{\alpha}}\int_{\Theta}\int_{\mathcal{Z}}z \dd Q(z|\theta)\dd\beta(\theta).
%\\&=\int_{\Theta} \int_0^1 (U^A_{\alpha}(u)-U^A_{\alpha}(\mu_Z))\dd q_{Z|\theta}(1-u) \dd\bel(\theta) = \E_{\theta\sim \beta}\left[\int_0^1 U^A_{\alpha}(u - \mu_Z)\dd q_{Z|\theta}(1-u)\right]%\dd q_{\beta}(1-v) 
\end{align*}
\else
\iffalse
\begin{align*}
A(U_A, \beta) &\triangleq \mathbb{E}_\beta[\mathbb{E}_{\Pr(.|\theta)}[U_A(Z)]-U_A(\mathbb{E}_{\Pr(.|\theta)}[Z])]\\
&= \int_\Theta \int_{\mathcal{Z}}(U_A(z) -U_A(\mu_z))\dd\Pr(z \, | \, \theta) \dd\bel(\theta),
\end{align*}
\fi
\fi
%Here, $q_{Z|\theta}$ is the quantile function of the return distribution for a given model $\theta$. $U^A_{\alpha}(\mdp_z) \triangleq U^A_{\alpha}\Big(\int_\Theta z\Pr(z \, | \, \theta)\dd\bel(\theta)\Big)$, i.e. the distortion of the average model given a belief distribution $\beta$ over the plausible set of models $\Theta$. 
\iffalse
Here, $Q^{\theta}_\alpha$ is a set of absolutely continuous distributions defined around $\Pr(Z|\theta)$ constrained by $\alpha$ and $Q^{\hat{\theta}}_\alpha$ is a set of absolutely continuous distributions defined around $\int_\Theta \Pr(Z|\theta) \dd \beta(\theta)$ constrained by $\alpha$.
\fi
Here, $\Pr(\hat{Z}) = \int_{\Theta} \Pr(Z|\theta)\dd\beta(\theta)$, i.e. the return distribution of the average model. The centered definition of aleatory risk is necessary to show that additive risk is a special case of composite risk.

\textbf{Epistemic Risk.}
Given a coherent risk measure with distortion function $U^E_{\alpha}$, the epistemic risk quantifies the uncertainty invoked by not knowing the true model. Thus, the risk can be computed over any statistics of the models, such as expectation.
\begin{align*}
    E(U^E_{\alpha}, \beta) &\triangleq \int_{\Theta} \int_{\mathcal{Z}} Z \dd\Pr(Z|\theta) \dd(U^E_{\alpha}\circ\beta)(\theta)
\end{align*}
\ifsinglecol
\begin{align*}
E(U^E_{\alpha}, \beta) &\triangleq \sup_{\beta' \in \Beta_{\alpha}} \mathbb{E}_{\theta \sim \beta'}[\mathbb{E}_{Z\sim \Pr(.|\theta)}[Z]].
%\int_\Theta U^E_{\alpha}\left(1-F_{\beta}\left(\int_{\mathcal{Z}} z\Pr(z \, | \, \theta)\right)\right) \dd\theta =\int_\Theta U^E_{\alpha}(1-F_{\beta}(\overline{Z}_{\theta})) \dd\theta
%=  \int_0^1 U^E_{\alpha}(v) \dd q_{\overline{Z}_{\theta}}(1-v)
\end{align*}
\else
\iffalse
\begin{align*}
E(U_E, \beta) &\triangleq \mathbb{E}_\beta[U_E(\mathbb{E}_{\Pr(.|\theta)}[Z])] \\
&= \int_\Theta U_E\left(\int_{\mathcal{Z}} z \dd\Pr(z \, | \, \theta)\right)\dd\bel(\theta)
\end{align*}
\fi
\fi
\iffalse
Here, $\Beta_\alpha$ is a set of absolutely continuous distributions defined around $\beta(\theta)$ constrained by $\alpha$.
\fi

\textbf{Composite Risk under Model and Inherent Uncertainty.}
In typical risk-sensitive RL settings, the true MDP model is both unknown and inherently stochastic. Thus, the overall uncertainty is a composition of aleatory and epistemic uncertainties. For that reason, we quantify it using what we call the \textit{composite risk}.
\begin{definition}[Composite Risk]\label{def:composite}
For two coherent risk measures with distortion functions $U^A_{\alpha_1}$ and $U^E_{\alpha_2}$, belief distribution $\beta$ on model parameters $\theta \in \Theta$, and a random variable $Z \in \mathcal{Z}$, the composite risk of epistemic and aleatory uncertainties is defined as
\begin{align}\label{eq:composite}
    &F^C(U^A_{\alpha_1}, U^E_{\alpha_2}, \beta) \triangleq \mathrm{Risk}_{\UE}(\mathrm{Risk}_{\UA}(Z|\theta)|\beta)\notag\\
    &= \int_{\Theta} \int_{\mathcal{Z}} Z \dd(U^A_{\alpha_1} \circ \Pr)(Z|\theta) \dd(U^E_{\alpha_2} \circ \beta)(\theta)\notag\\
    &= \int_0^1 \int_0^1 U^E_{\alpha_2}(v) U^A_{\alpha_1}(u)\dd q_{Z|\theta}(1-u) \dd q_{\beta}(1-v) 
\end{align}
\iffalse
\begin{align}\label{eq:composite}
F^C(U^A_{\alpha_1}, U^E_{\alpha_2}, \beta) %&\triangleq \int_\Theta U_E\Bigg(\int_{\mathcal{Z}}U_A(z) \dd\Pr(z \, | \, \theta) \Bigg) \dd\bel(\theta).\\
&\triangleq \sup_{\beta' \in \Beta_{\alpha_2}} \mathbb{E}_{\theta \sim \beta'}\left[\sup_{Q\in Q^{\theta}_{\alpha_1}} \mathbb{E}_{Z\sim Q(.|\theta)}[Z]\right].
%\int_0^1 U^E_{\alpha_2}\left(\int_0^1 U^A_{\alpha_1}(u)\dd q_{Z|\theta}(1-u)\right) \dd q_{\beta}(1-v) 
%&= U_E(U_A(Z|\Pr(Z|\theta))|\beta(\theta)) 
\end{align}
\fi
\end{definition}%\cdcomment{isn't $\theta$ also random?}added it
Here, $q_{Z|\theta}$ and $q_{\beta}$ are quantile functions of $Z$ conditioned on $\theta$ and that of $\theta$ respectively. For brevity, we also denote $F^C(U^A_{\alpha_1}, U^E_{\alpha_2}, \beta)$ as $\mathrm{Risk}_{\UE}\circ\mathrm{Risk}_{\UA}$ (e.g. $\mathrm{CVaR} \circ \mathrm{CVaR}$), whenever it is clear from the context.
%The composite risk is flexible to use two different risk measures for quantifying epistemic and aleatory uncertainties. We demonstrate such experiments in Figure~\ref{fig:risk}.
\begin{theorem}[Coherence]\label{thm:coherence}
	If $U^A_{\alpha_1}$ and $U^E_{\alpha_2}$ are distortion functions for two coherent risk measures, the composite risk measure $F^C(U^A_{\alpha_1}, U^E_{\alpha_2}, \beta)$ is also coherent.
\end{theorem}
%Appendix~\ref{sec:proof}
The proof of Theorem~\ref{thm:coherence} is available in Supplementary material. The generic nature of our composite risk definition allows us to use different risk measures compatible with epistemic and aleatory risks. This is demonstrated in experiments (Figure~\ref{fig:risk}) using different combinations of CVaR, Wang risk, and standard deviation for quantifying epistemic and aleatory uncertainties. This flexibility was absent in previous risk-sensitive RL literature~\citep{eriksson2019epistemic,depeweg2018decomposition,clements2019estimating}. 
%But the definition of composite risk is flexible enough to incorporate also non-coherent risk measures.

\textbf{Comparison with Additive Risk Formulations.} \cite{clements2019estimating, depeweg2018decomposition} use a weighted sum of epistemic and aleatory variances as their risk measure. This formulation has mainly two problems. First, variance is not a coherent risk measure as it does not follow the homogeneity and subadditivity properties, as shown in~\citep{pcirillo}. Secondly, we show that even if we replace the variance with a coherent risk measure, the additive formulation is equivalent to considering $U^E_{\alpha}$ as an identity function. Thus, it is less sensitive to the effect of epistemic uncertainty than composite risk. More formally:
% \cdcomment{Do we need to talk about this here? Also, we use sup-A to refer to Aleatory, so A as additive is confusing.}
% Often the additive risk measure or weighted sum of the epistemic and aleatory uncertainty is used in risk-sensitive RL literature~\citep{clements2019estimating}. \dbcomment{We should talk about the other papers used variance both depeweg and clements and they are not coherent but additive. They assumed these two sources to be independent and could use the law of total variance to decompose aleatory and epistemic but due to subadditivity of coherent risk measures they wont be correct if they are not independent.}
%for estimating \cdcomment{estimating??} the total uncertainty~\citep{clements2019estimating}. 

\begin{theorem}\label{thm:comp_geq_add}
	We are given two sources of aleatory and epistemic uncertainties $\xi_1$ and $\xi_2$. If $U^A_{\alpha_1}$ and $U^E_{\alpha_2}$ are distortion measures for two coherent risk measures quantifying aleatory and epistemic risks respectively, then, i) $F^A(U^A_{\alpha_1}, \beta) = F^C(U^A_{\alpha_1}, I, \beta)$, where $I$ is the identity function, and ii) $F^C(U^A_{\alpha_1}, U^E_{\alpha_2}, \beta) \geq F^A(U^A_{\alpha_1}, \beta)$, if $\alpha_2 \neq 1$.
	%The second claim follows from Remark~\ref{remark:dual}.
	%\cdcomment{This is an ambiguous statement. What do you mean "risk" and "underestimate"?}
\end{theorem}

\begin{figure}[t!]
	\centering
	\includegraphics[width=0.4\textwidth]{img/experiment6/distributions_dir}
	\caption{Estimation of total $CVaR_{\alpha}$ from a mixture of 100 Gaussians sampled from a posterior distribution. Total $CVaR_{\alpha}[Data]$ is based on the marginal distribution of $r$ as in Example~\ref{ex:gauss-mix-cvar}. We compare this with composite and additive estimates and illustrate results over $100$ runs. Here, lower value of CVaR indicates higher mass on the left tail of the distribution and higher risk of obtaining low returns.}\label{fig:gauss-mix-cvar}% Each run samples its own parameters for the mixtures. \cdcomment{The y axis is the expected utility. Why is this estimating uncertainty? Is the uncertainty the error bars?} }
\end{figure}
\begin{example}[A Reductive Empirical Evaluation of Composite and Additive Risks]\label{ex:gauss-mix-cvar}
    We consider a mixture of $100$ Gaussians: $p(r) = \sum_{i=1}^{100} \phi_i\mathcal{N}(\mu_i, \sigma_i^2)$, where $(\phi_1, \ldots, \phi_{100}) \sim \mathrm{Dir}([0.5]^{100})$, $\mu_i \sim \mathcal{N}(0, 1)$, and $\sigma_i^2 \sim \Gamma^{-1}(2, 0, 1)$.
	We compute $CVaR_{\alpha}[r]$ using the data generated from this mixture over 100 runs. We further estimate composite risk with $U_E, U_A = CVaR_{\alpha}$ and additive risk with $U_A= CVaR_{\alpha}$. The results illustrated in Figure~\ref{fig:gauss-mix-cvar} show that the additive CVaR risk strictly underestimates the total CVaR risk computed from the data, whereas the composite risk is closer to the one computed from data. In particular, for lower values of $\alpha$ ($\alpha \leq 0.5$), i.e. towards the extreme end of the left tail where events occur with low probability, the additive CVaR risk deviates significantly from data whereas the composite measure yields a closer estimate. Such values of $\alpha$ are typically interesting for risk-sensitive applications.
\end{example}


This means that for given sources of aleatory and epistemic uncertainties the additive risk which only considers expectation over epistemic uncertainty will always underestimate the composite effect of epistemic risk. 
Thus, we observe that additive risk leads to worse risk-sensitive performance than composite risk in RL problems (Table~\ref{tab:highway10} and Figure~\ref{fig:highway}). 
\iffalse
\subsection{Additive Risk under Uncertainty} 
Define the expected utility under full uncertainty ($F(U, \beta)$), epistemic uncertainty ($E(U, \beta)$), and aleatory uncertainty ($A(U, \beta)$).
\begin{align*}
F(U, \beta) &\triangleq \mathbb{E}_\beta[\mathbb{E}_{\Pr(.|\theta)}[U(Z)]] = \int_\Theta \int_{\mathcal{Z}}U(z) \dd\Pr(z \, | \, \theta) \dd\bel(\theta)\\
E(U, \beta) &\triangleq \mathbb{E}_\beta[U(\mathbb{E}_{\Pr(.|\theta)}[Z])] = \int_\Theta U\left(\int_{\mathcal{Z}} z \dd\Pr(z \, | \, \theta)\right)\dd\bel(\theta)\\
A(U, \beta) &\triangleq \mathbb{E}_\beta[\mathbb{E}_{\Pr(.|\theta)}[U(Z)]-U(\mathbb{E}_{\Pr(.|\theta)}[Z])] = \int_\Theta \int_{\mathcal{Z}}(U(z) -U(\mu_z))\dd\Pr(z \, | \, \theta) \dd\bel(\theta)
\end{align*}
$$ F(U, \beta) = A(U, \beta) + E(U, \beta).$$

Motivate the example with a three state deterministic MDP with the terminating states having reward distributions $\mathcal{N}(0,1)$ and $\mathcal{N}(\epsilon,1)$.

Check properties and sanities like: 
\begin{itemize}
	\item[1.] For convex utilities, $E(U, \beta) \leq F(U, \beta)$ where the equality holds for a deterministic environment. 
	\item[2.] If we replace the expected utility with a coherent risk measure, due to sub-additivity, we get $ F(U, \beta) \leq A(U, \beta) + E(U, \beta)$.
	\item[3.] Thus, learning the total risk of a policy is equivalent to minimising the sum of aleatory and epistemic uncertainty which is equivalent to minimising the variance plus bias in a learning problem.
\end{itemize}
This is the observation that evokes use of bootstrap or ensemble-based methods to estimate uncertainty.

\subsection{Composite Risk under Uncertainty} 
Define the expected utility under full uncertainty ($F(U, \beta)$), epistemic uncertainty ($E(U, \beta)$), and aleatory uncertainty ($A(U, \beta)$).
\begin{align*}
F(U, \beta) &\triangleq \mathbb{E}_\beta[\mathbb{E}_{\Pr(.|\theta)}[U(Z)]] = \int_\Theta U_E\Bigg(\int_{\mathcal{Z}}U_A(z) \dd\Pr(z \, | \, \theta) \Bigg) \dd\bel(\theta)\\
E(U, \beta) &\triangleq \mathbb{E}_\beta[U(\mathbb{E}_{\Pr(.|\theta)}[Z])] = \int_\Theta U\left(\int_{\mathcal{Z}} z \dd\Pr(z \, | \, \theta)\right)\dd\bel(\theta)\\
A(U, \mdp) &\triangleq \mathbb{E}_{\Pr(.|\mdp)}[U(Z)] = \int_{\mathcal{Z}}U(z)\dd\Pr(z \, | \, \mdp)
\end{align*}

The additive formulation is a special case of the composite formulation with mgf

Is this correct, we need $U_E$ to be additive

we might want to prove coherence and convexity of the composition

we might want to prove cvar optimality and convergence of bootstrapping
\fi

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


%\input{algorithms}
%%%%%%%%%%%%%%%%%%% algorithms.tex %%%%%%%%%

\section{Algorithm: SENTINEL-K}\label{sec:algo}
Now, we outline the algorithmic details of SENTINEL-K, which estimates composite risk over returns using an ensemble of $K$ distributional RL estimators, namely CDQNs, in tandem with an adaptation of FTRL for estimator selection, and leverages the estimates for decision making.
%We further evaluate the composite risk using return distribution used for decision making.

\textbf{Sketch of the Algorithm.}
Pseudocode of SENTINEL-K with composite risk is given in Algorithm~\ref{alg:composite}.
 It has two main blocks: obtaining $K$ estimates of the return distribution with a distributional RL framework (Lines~\ref{line:block1_1}--\ref{line:block1_2}), and using them to compute the composite risk for each action (Lines~\ref{line:block2_1}--\ref{line:block2_2}). Finally, following the mechanism of Q-learning~\citep{watkins1992q}, it chooses the action with maximal composite risk in the decision making step (Line~\ref{line:action}).
 
 In the first block (Lines~\ref{line:block1_1}--\ref{line:block1_2}), we specifically use an ensemble of $K$ CDQNs. Each CDQN uses target and value networks for estimating the return distribution. We set a schedule $\Gamma_1$ for updating the target networks and a more frequent one ($\Gamma_1 \cup \Gamma_2$) for the value networks (Section~\ref{sec:ensemble}). 
 
 The second block (Lines~\ref{line:block2_1}--\ref{line:block2_2}) is used for decision-making and is iterated at every time step. It adapts the FTRL algorithm (Section~\ref{sec:ftrl}) to aggregate the $K$ estimated return distributions, composes the aleatory risks $Q^A_i(s_t, a)$ of the estimators into a final estimate of the composite risk $Q^C(s_t, a)$ for each action, and then selects the action with the highest $Q^C(s_t, a)$.

\subsection{Ensembling and Bootstrapping $K$-Estimators}\label{sec:ensemble}
The ensemble of SENTINEL-K consists of $K$ distribution estimators. Each estimator gets its own dataset $\lbrace D_i\rbrace_{i=1}^K \subseteq \mathcal{D}$, value network $\lbrace \theta_{i}\rbrace_{i=1}^K$ and target network $\lbrace \theta_{i}^-\rbrace_{i=1}^K$. 
The $K$ datasets are created from the original dataset $\mathcal{D}$ by \textit{data masking} (Line~\ref{line:datamask}). 
For each transition $(s_t, a_t, r_t, s_{t+1})$, a fixed mask vector $\mathbf{u}_t \in \{0, 1\}^K$ is generated such that $u^i_t \sim \mathrm{Ber}(\frac{1}{3})$, where $u^i_t = 1$ indicates that the transition is included in $D_i$. 
Thus, on average, each estimator $i$ has access to $\frac{1}{3}$ of the dataset. Details about data masking are in Supplementary material.
% Appendix~\ref{sec:datamask}

After preparing the datasets for the estimators, the target and value networks of the CDQN have to be updated and optimised.
For the $i$-th estimator, the update begins by sampling a mini batch of data $\tau$ from the respective dataset $D_i$ (Line~\ref{line:minibatch}).
Then, this mini batch is used to compute the composite risk for all actions $a \in \mathcal{A}$ and to obtain $a^{*}$ (Lines~\ref{line:f_estimate}--\ref{line:astar}). 
Obtaining the composite risk first involves estimating the aleatory risk with $Q_i^A(s_t, a)=\int_{\mathcal{Z}} Z \dd(\UA \circ \Pr)(Z|\theta_i)$ for a particular estimator $i$. This quantity can be obtained by considering each of the estimators separately; however, when we turn to computing the epistemic risk, the estimators contribute to it jointly.
Then, we compose the aleatory risk of all the estimators to compute $Q^C(s_t, a) = \mathrm{Risk}_{\UE}(\{Q_i^A(s_t, a)\}_{i=1}^K)$. 
Here, $\mathrm{Risk}_{\UE}$ is the risk measure corresponding to the distortion $\UE$. Finally, the optimal action $a^{*} = \underset{a}{\arg\max} \, Q^C(s_t, a)$, and the risk estimates $Q^C(s_t, a)$ are used to update the value and target network parameters $\lbrace \theta_{i}\rbrace_{i=1}^K$ and $\lbrace \theta^-_{i}\rbrace_{i=1}^K$ (Lines~\ref{line:valuenet}--\ref{line:targetnet}) by minimising the cross-entropy loss of the current parameters and the projected Bellman update as described by~\citet{bellemare2017distributional}.

Ensembles of estimators have been shown to outperform individual estimators~\citep{wiering2008ensemble, fausser2015neural, osband2016deep, pacchiano2020optimism}.
Further, incorporating multiple estimators introduces uncertainty over the estimators. 
Because the estimators have separate datasets, each of them learns different parts of the MDP.
Thus, uncertainty over estimators acts as a quantifier of the model uncertainty. In Section~\ref{sec:experiments}, we show that this ensemble-based approach leads SENTINEL-K to achieving superior performance.

\subsection{Weighing Estimates with FTRL}\label{sec:ftrl}
%Follow the Regularised Leader (FTRL)}
Now, the question is how to adaptively and accurately aggregate the $K$ estimated return distributions.
\cite{pacchiano2020optimism} show that adaptive model selection can boost performance in comparison to model averaging.
The rationale for this can be given by seeing that some estimators might be overly optimistic or pessimistic.
By weighing these less, you can effectively have a more robust ensemble. Further discussion of this issue is given in Supplementary material.
%Appendix~\ref{sec:appendix_ftrl}
%By weighing these less, you can effectively have a more robust ensemble. Further discussion of this issue is given in Appendix D.2.

We adapt the Follow The Regularised Leader (FTRL) algorithm~\citep{cesa2006prediction} studied in bandits and online learning for adaptively weighing the estimators. FTRL puts exponentially more weight on an estimator depending on its accuracy of estimating the return distribution. Since we do not know the `true' return distribution, we use the KL-divergence from the posterior of a single estimator $i$, $\Pr(Z \, | \, \theta_i)$, to the posterior marginalised over $\bel(\theta)$, i.e. $l(\theta_i, \beta) \triangleq
D_{\mathrm{KL}}\Big(\Pr(\hat{Z})\, || \, \Pr(Z \, | \, \theta_i) \Big)$, %D_{\mathrm{KL}}\Big(\int_\Theta \int_\mathcal{Z}z \dd\Pr(z \, | \, \theta)\dd\bel(\theta)\, || \, \int_\mathcal{Z}z \dd\Pr(z \, | \, \theta_i) \Big)$, 
as proxy of estimation loss of estimator $i$.
FTRL selects estimator $i$ with weight
\begin{equation}\label{eq:weights}
w_i = \dfrac{e^{-\lambda l(\theta_i, \beta)}}{ \sum_j e^{-\lambda l(\theta_j, \beta)}}, \quad\lambda\in [0,\infty).
\end{equation}
Using FTRL weights for aggregating the $K$ return distributions is analogous to using an exponentially weighted average forecaster~\citep{cesa2006prediction} on the $K$ learners to create a final estimate of the return distribution and corresponding composite risk. This leads to a better aggregation of individual estimates than equally weighted average or a greedy selection of the best estimate~\citep[Theorem 2.2]{cesa2006prediction}.
%
%where $\mathbb{P}_\theta(z) = \int_\Theta \mathbb{P}(z \, | \, \theta) \dd\bel(\theta)$ and $D_{KL}\big(\mathbb{P}_\theta(z) \, || \, \mathbb{P}(z \, | \, \theta_i)\big) = -\sum_{\mathcal{Z}} \mathbb{P}_\theta(z) \log\Big( \frac{\mathbb{P}(z \, | \, \theta_i)}{\mathbb{P}_\theta(z)}\Big)$.
Having computed the weights $\mathbf{w}$ (Line~\ref{line:ftrl}), we compute the weighted composite risk measure by first computing the aleatory risk of  each of the estimators, %$Q^A_i(s_t, a) = \int_\mathcal{Z}\UA(z)\dd\Pr(z \, | \, \theta_i)$ (Line~\ref{line:aleatory}), and then the composite risk is computed by $Q^C(s_t, a) = \UE(\mathbf{w} \cdot \mathbf{Q}^A(s_t, a))$ (Line~\ref{line:composite}).
$Q^A_i(s_t, a) = \int_{\mathcal{Z}} Z \dd(\UA \circ \Pr)(Z|\theta_i)$ (Line~\ref{line:aleatory}), and then the composite risk is computed by $Q^C(s_t, a) = \mathrm{Risk}_{\UE}(\{w_i Q_i^A(s_t, a)\}_{i=1}^K)$ (Line~\ref{line:composite}).
%Here,  $\cdot : \mathbb{R}^K \times \mathbb{R}^K \rightarrow \mathbb{R}^K$ is the pointwise product.
Here, $\lambda \in [0, \infty)$ is a regularising parameter that determines to what extent estimators far away from the marginal estimator should be penalised.
If $\lambda \rightarrow 0$, we obtain standard model averaging. If $\lambda \rightarrow \infty$, it reduces to greedy selection. We experimentally show that performing FTRL with a reasonable $\lambda$ value, namely 1, leads to better performance.

\textbf{Action Selection.} The algorithm always selects the action with the highest composite risk $Q^C$. Its behaviour depends on the choice of risk measures or distortion utility functions $\UA$ and $\UE$. SENTINEL-K reduces to a risk-neutral algorithm if we choose both $\UA, \UE$ as identity functions, and to an additive risk-sensitive algorithm if we choose $\UE$ as identity. Designing it to accommodate composite risk gives us the flexibility to be risk-sensitive or risk-neutral, and to treat epistemic and aleatory risk with different metrics. 
%We use risk-neutral SENTINEL-K to validate its efficiency to estimate return distributions, and the one with composite CVaR risk to perform risk-sensitive tasks. 

\iffalse
\paragraph{Bootstrap regret}

If we generate data using a stationary behaviour policy $\pol_0$ and then partition the data with overlaps to $K$ estimators (experts) then after $T$ batches of data of batch size $B$ then we should be able to achieve a regret of $D\sqrt{\frac{T}{B} \log{K}}$ where $D$ is the diameter of $Z$ the data space.

\paragraph{Risk-sensitive bootstrap regret}

In the normal bootstrap regret we are trying to minimise $\mathbb{E}[\log{p(z \, | \, \theta_i)}]$ but here we want to minimise the $CVaR[p(z \, | \, \theta_i)]$ (aleatory) and $CVaR[p(\theta, z)]$ for composite if it is convex we are happy. $CVaR_\theta( \mathbb{E}_z[z \, | \, \theta])$
TODO: figure out how to do this
\fi

\iffalse
\[
w_i = \exp(-\lambda\Big(D_{KL}\big(\Pr_\theta(z) \, || \, p(z \, | \, \theta_i)\big)\Big))
\]
\fi

%\STATE{\textbf{for} $i$ \textbf{in} $[1, K]$}
%\begin{ALC@g}
%    \STATE Sample minibatch $\tau \sim D_i$
%    \STATE Obtain $a^{*} = \argmax_a F(Z(s_t,a)|U_A, U_B, \beta)$ using Def~\ref{def:composite}, $\tau$ and
%    \STATE Fit target network $\theta^{-}_i$ using $\tau, a^{*}$
%\end{ALC@g}
%\STATE{\textbf{for} $a$ \textbf{in} $\mathcal{A}$:}
%\begin{ALC@g}
%    \STATE Compute weights $\mathbf{w} = w_1, ..., w_K$ from Eq.~\ref{eq:weights}.
%    \STATE{\textbf{for} $i$ \textbf{in} $K$:}
%    \begin{ALC@g}
%    \STATE Compute aleatory risks $Q^A_i(s_t,a)$ from $\int_\mathcal{Z}U_A(z)\dd\mathbb{P}(z \, | \, \theta_i^{-})$.
%    %\STATE Let $f_{\mathbf{Q}^A}(q) = \mathbf{w}$ and compute composite risk $Q^C(s_t, a) = \int_{\mathbb{R}} %U_E(q)f_{\mathbf{Q}^A}(q)\dd q$.
%    \end{ALC@g}
%    \STATE $Q^C(s_t, a) = \sum_{i=1}^K U_E\Big(w_i Q_i^A(s_t, a)\Big)$
%\end{ALC@g}
%\STATE{\textbf{return: }$\underset{a}{\arg\max} \, Q^C(s_t, a)$}
%


\iffalse
\paragraph{Ensembling and bootstrapping}
There are a few ways of masking the data. Initially, we generated weights $w \sim Uniform(0, 1)$ on the data, one for each model, and used these to do sampled weighting. Another approach is to set flags whether to use the data or not, so $w \sim Ber(\frac{1}{3})$ is one idea Christos had. In the Osband paper they use $Poi(1)$ and $Exp(1)$.

\textbf{Bootstrapped DQN~\citep{osband2016deep}}

\begin{equation}
\theta_{t+1} \leftarrow \theta_t + \alpha(y_t^Q - Q(s_t, a_t, \theta_t))\nabla_\theta Q(s_t, a_t, \theta_t)
\end{equation}
\begin{equation}
y_t^Q \leftarrow r_t + \gamma \, \underset{a\in\mathcal{A}}{\max} \, Q(s_{t+1}, \underset{a\in\mathcal{A}}{\arg\max} \, Q(s_{t+1}, a,\theta_t),\theta^{-})
\end{equation}

They have $K \in \mathbb{N}$ bootstrapped estimates $Q_k(s, a, \theta^{-})$. They then sample one $k \in \{1,..., K\}$ uniformly, and follow $Q_k$ for that episode.
\fi
\iffalse 
In~\citep{fausser2015neural, wiering2008ensemble}, they consider $M$ independent agents, each with their set of parameters $\theta_m$. They use a Boltzmann policy:

\begin{equation}
\pi_t(s_t, a) = \frac{p_t(s_t, a)^{\frac{1}{\tau}}}{\sum_b p_t(s_t, b)^{\frac{1}{\tau}}}
\end{equation}
\fi

\begin{algorithm}[t!]
	\caption{SENTINEL-K with Composite Risk}
	\label{alg:composite}
	\begin{algorithmic}[1]
		\STATE{\textbf{Input:} } Initial state $s_0$, action set $\mathcal{A}$, distortion measures $\UA, \UE$, hyperparameter $\lambda$, target networks
		$[\theta_1^{-}, ..., \theta_K^{-}]$, value networks
		$[\theta_1, ..., \theta_K]$, update schedule $\Gamma_1, \Gamma_2$.
		\FOR{$t=1,2,\ldots$}
		\STATE //* Update $K$-value and target networks for estimating return distributions *//
		\FOR{$t' \in \Gamma_1 \cup \Gamma_2$ }\label{line:block1_1}
		\STATE  Generate $\{D_1, ..., D_K\} \gets \mathrm{DataMask}(\mathcal{D}^{t'})$\label{line:datamask}
		\FOR{$i = 1, \ldots, K$}
		\STATE Sample mini batch $\tau \sim D_i$\label{line:minibatch}
		\STATE Estimate \eqref{eq:composite} $F^C(Z(s_t,a)|\UA, \UE, \beta)$ using $\tau$ and $K$-target networks $\lbrace \theta_{i}^-\rbrace_{i=1}^K$. \label{line:f_estimate}
		\STATE Get $a^{*} = \argmax_a F^C(Z(s_t,a)|\UA, \UE, \beta)$\label{line:astar}
		\STATE  Update value network $\theta_i$ using $\tau, a^{*}$\label{line:valuenet}
		\STATE  Update target network $\theta^{-}_i$ using $\tau, a^{*}$ if $t' \in \Gamma_1$\label{line:targetnet}
		%\STATE Sample mini batch, compute optimal action, and update $K$ target and value networks
		\ENDFOR
		\ENDFOR \label{line:block1_2}
		\STATE //* Estimate the composite risk of each action using the estimated return distributions *//
		\FOR{$a \in \mathcal{A}$}\label{line:block2_1}
		\STATE Compute weights $\mathbf{w} = (w_1, \ldots, w_K)$ from Eq.~\eqref{eq:weights}.\label{line:ftrl}
		\FOR{$i = 1, \ldots, K$}
		\STATE Compute aleatory risks $Q^A_i(s_t, a)$ from $\int_{\mathcal{Z}} Z \dd(\UA \circ \Pr)(Z|\theta_i)$ \label{line:aleatory}
		%\STATE Compute aleatory risks $Q^A_i(s_t,a)$ from $\int_\mathcal{Z}\UA(z)\dd\Pr(z \, | \, \theta_i)$.\label{line:aleatory}
		\ENDFOR
		\STATE Compute composite risk over weighted aleatory estimates $Q^C(s_t, a) = \mathrm{Risk}_{\UE}(\{w_i Q_i^A(s_t, a)\}_{i=1}^K)$\label{line:composite}
		\ENDFOR \label{line:block2_2}
		\STATE //* Action selection *//
		\STATE{Take action $a_t = \argmax_a Q^C(s_t, a)$}\label{line:action}
		\STATE 	Observe $s_{t}$ and update the dataset $\mathcal{D}^{t} \gets \mathcal{D}^{t-1} \cup \{s_t, a_{t-1}, s_{t-1}, r_{t-1}\}$\label{line:update}
		\ENDFOR 
	\end{algorithmic}
\end{algorithm}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


%\input{experiments}
%%%%%%%%%%%%% experiments.tex %%%%%%%%%%%%%

\section{Experimental Evaluation}\label{sec:experiments}
We test the risk-sensitive performance of SENTINEL-K with composite CVaR risk in two environments with continuous state spaces. We also display the flexibility of our composite risk formulation by evaluating heterogeneous risks with SENTINEL-K.\footnote{\noindent Ablation studies for risk-neutral SENTINEL are in Supplementary material.}
Settings for each of these experiments and results are elaborated in corresponding subsections.
In all the experiments, we use $4$ CDQNs in the ensemble and call it SENTINEL-4. We justify this choice of $K=4$ in Supplementary material. For each experiment, we report the mean and standard error of the mean over 20 runs for $10^5$ steps.
%Appendix~\ref{sec:ensemble_size}
%\subsection{Validation experiments}
\begin{table*}[t!]
	\centering
	\caption{Performance of risk-neutral (VDQN, CDQN, SENTINEL-K), aleatory risk-sensitive VDQN-CVaR, UA-DQN and risk-sensitive (SENTINEL-4 with additive and composite CVaRs) for highway-v1 with 10 vehicles. Results are reported over 20 runs. SENTINEL-4 with composite CVaR performs better.}\label{tab:highway10}
	\resizebox{0.8\textwidth}{!}{	\begin{tabular}{c|c|c|c}
		Agent & Value $\pm \sigma$ & Aleatory metric $\pm \sigma$ & $\#$ crashes $\pm \sigma$ \\
		\hline
		VDQN$_{RN}$~\cite{tang2018exploration}& $23.30\pm0.36$ & $14.29\pm0.80$ & $1252.33\pm170.35$ \\
		CDQN$_{RN}$~\cite{bellemare2017distributional}& $25.96\pm0.51$ & $19.50\pm1.44$ & $839.53\pm150.20$\\
		SENTINEL-4$_{RN}$& $26.56\pm0.32$ & $20.88\pm 1.25$& $617.11\pm 100.15$\\
		VDQN-CVaR$_A$~\cite{tang2018exploration} & $24.39\pm0.50$ & $16.64\pm1.25$ & $871.33\pm171.23$\\
		UA-DQN$_{E+A}$~\cite{clements2019estimating}& $24.46\pm0.29$ & $16.9\pm0.44$ & $1060.65\pm13.94$\\
		SENTINEL-4$_{E+A}$& $26.82\pm0.42$ & $21.54\pm1.40$ & $645.55\pm127.59$\\
		SENTINEL-4$_{E\circ A}$& $\mathbf{27.43\pm0.13}$ & $\mathbf{24.16\pm0.54}$ & $\mathbf{341.18\pm43.86}$\\
	\end{tabular}
}
\end{table*}
%\begin{tabular}{c|c|c|c}
%		Agent & Value $\pm \sigma$ & Aleatory metric $\pm \sigma$ & $\#$ crashes $\pm \sigma$ \\
%		\hline
%		VDQN$_{RN}$~\cite{tang2018exploration}& $23.30\pm1.59$ & $14.29\pm3.60$ & $1252.33\pm761.85$ \\
%		CDQN$_{RN}$~\cite{bellemare2017distributional}& $25.96\pm2.27$ & $19.50\pm6.43$ & $839.53\pm671.70$\\
%		SENTINEL-4$_{RN}$& $26.56\pm1.45$ & $20.88\pm 5.58$& $617.11\pm 447.89$\\
%		VDQN-CVaR$_A$~\cite{tang2018exploration} & $24.39\pm2.24$ & $16.64\pm5.58$ & $871.33\pm765.76$\\
%		UA-DQN$_{E+A}$~\cite{clements2019estimating}& $24.46\pm1.28$ & $16.9\pm1.97$ & $1060.65\pm62.34$\\
%		SENTINEL-4$_{E+A}$& $26.82\pm1.87$ & $21.54\pm6.24$ & $645.55\pm570.58$\\
%		SENTINEL-4$_{E\circ A}$& $\mathbf{27.43\pm0.60}$ & $\mathbf{24.16\pm2.40}$ & $\mathbf{341.18\pm196.13}$\\
%	\end{tabular}
\iffalse % with 25 runs
\paragraph{Risk-sensitive Performance.}
	\begin{table*}[t!]
		\centering
		\begin{tabular}{c|c|c|c}
			Agent & Value $\pm \sigma$ & Aleatory metric $\pm \sigma$ & $\#$ crashes $\pm \sigma$ \\
			\hline
			VDQN risk-neutral& $582.56\pm39.70$ & $357.27\pm89.96$ & $1252.33\pm761.85$ \\
			CDQN risk-neutral& $648.92\pm56.81$ & $487.60\pm160.68$ & $839.53\pm671.70$\\
			SENTINEL-4 additive& $670.54\pm46.71$ & $538.61\pm155.96$ & $645.55\pm570.58$\\
			SENTINEL-4 composite& $\mathbf{685.80\pm15.09}$ & $\mathbf{604.08\pm59.93}$ & $\mathbf{341.18\pm196.13}$\\
		\end{tabular}
		\caption{Performance of risk-neutral (VDQN, CDQN), and risk-sensitive (SENTINEL-4 with additive and composite CVaRs) for highway environment with 10 vehicles. Results are reported over 20 runs. SENTINEL-4 with composite CVaR performs better.}\label{tab:highway10}
	\end{table*}
\fi

\textbf{Risk-sensitive Performance.} In order to demonstrate performance in a larger domain, we opt to evaluate SENTINEL-4 in the \textit{highway}~\citep{highway-env} environment.
Highway is an environment developed to test RL for autonomous driving. 
We use a version of the \textit{highway-v1} domain with five lanes, and ten vehicles in addition to the ego vehicle.
In this environment, the episode is terminated if any of the vehicles crash or if the time elapsed is greater than $40$ time steps. 
The reward function is a combination of multiple factors, including staying in the right lane, the ego vehicle speed, and the speed of the other vehicles.

We test the risk-neutral CDQN and VDQN algorithms, an aleatory risk-sensitive VDQN and the total variance decomposition algorithm UA-DQN along with SENTINEL-4 with both additive and composite CVaRs.
The typical performance metric for this scenario is the expected discounted return $\mathbb{E}_\mdp^{\pol}[R]$. 
In order to test the risk-sensitive performance, we use two metrics.
In order to measure aleatory risk $\UA[R \, | \, \pol, \mdp]$, we use CVaR as $\UA$ with threshold $\alpha = 0.25$.
The CVaR metric is a statistic of the left-tail of the return distribution and higher values would mean better performance in the $25\%$ worst-cases of performance. 
Finally, as a proxy for the epistemic risk, we use the number of crashes (lower is better).

Experimental results are illustrated in Table~\ref{tab:highway10} and Figure~\ref{fig:highway}.
From Table~\ref{tab:highway10}, we observe that our algorithm with composite risk achieves a higher value, a higher estimate of aleatory risk, and fewer crashes.
Thus, SENTINEL-4 with composite CVaR outperforms the competing algorithms in terms of all three metrics. The simultaneous improvement in both the value function and \#crashes is due to the fact that \textit{highway} is designed to have a reward function that penalises unsafe driving.
Additionally, we observe that the variance of performance metrics over 20 runs is the least for our algorithm with composite CVaR measure. 
This shows the stability of our algorithm which is another demonstration of good risk-sensitive performance.
Figure~\ref{fig:highway} resonates with these observations in terms of the total number of crashes.

\begin{figure*}[t!]
\begin{minipage}[c]{0.32\textwidth}
	\centering
	\includegraphics[width=1\textwidth]{img/experiment5/medium-highway_100k_only_crash}
	\caption{The total number of crashes in the \textit{highway} environment with $10$ vehicles over $20$ runs and horizon $10^5$. Fewer \#crashes indicate better risk-sensitive performance.} %\cdcomment{I guess this misses cdqn/vdqn that are not risk-neutral.}}
	\label{fig:highway}
\end{minipage}\hfill
\begin{minipage}[c]{0.32\textwidth}
	\centering
	\includegraphics[width=\textwidth]{img/risk_measure_experiment/wasserstein.png}%
	\caption{Performance and convergence of SENTINEL-4 using different risk measures. 
	%By $R_1 \circ R_2$ we mean risk measure $R_1$ composed with risk measure $R_2$. 
	We show the number of falls in the \textit{CartPole} environment over $20$ runs with different initialisation.}\label{fig:risk}%Shaded region represents $\mu_t \pm \sigma_t$.
\end{minipage}\hfill
\begin{minipage}[c]{0.32\textwidth}
	\centering
	\includegraphics[width=\textwidth]{img/experiment4/wasserstein}%
	% DON'T change to PDF, it will make scrolling lag on older browsers! %
	\caption{Performance and convergence of SENTINEL-4 (risk-neutral) for different values of $\lambda$. We show the number of falls in \textit{CartPole} environment over $20$ runs with different initialisation.}\label{fig:cartpole}
\end{minipage}
\end{figure*}
\textbf{Heterogeneous Risk Measures.}
In order to demonstrate the flexibility of the composite risk framework estimated with SENTINEL, we investigate performance using heterogeneous coherent risk measures that compose different coherent risk measures for aleatory and epistemic risk. The chosen risk measures are aleatory and epistemic CVaR, aleatory and epistemic Wang risk, aleatory CVaR with epistemic standard deviation, and aleatory standard deviation with epistemic CVaR. Note that any combination of coherent risk measures is possible. 
We evaluate SENTINEL-4 in the \textit{CartPole-v0} environment~\citep{openAI16}. 
This environment is a popular test-bed for continuous state-space RL tasks. 
In the environment, a reward of $1$ is attained for every time step the pole is kept upright. 
If the pole falls to either of the sides or if the number of time steps reaches $200$, the episode is terminated. 
This means that the undiscounted return attained per episode is in $[0, 200]$. Thus, we choose $V_{min} = 0, V_{max} = \frac{1-\gamma^{200}}{1-\gamma}$ as the histogram support of CDQN. The results are shown in Figure~\ref{fig:risk}, which demonstrates that SENTINEL-4 performs flexibly and comparably for these composite risks.

\textbf{FTRL vs. Average vs. Greedy.} We choose $[0.01, 0.1, 1.0, \ln 100]$ as the different values of the regularising hyperparameter $\lambda$ and test the performance of SENTINEL-4 for \textit{CartPole-v0}. As $\lambda \rightarrow 0$, we perform standard model averaging which is sensitive to outliers.
As $\lambda \rightarrow \infty$, model selection gets greedily biased towards the best average estimator while not providing other estimators a chance to improve. %In fact, we expect performance to be poor when $\lambda$ is too high since it is putting almost all weight on one single estimator while not providing other estimators a chance to improve. 
A sound value of $\lambda$ would be one that excludes outlier estimators while still involving most of the other estimators. 
%We run each of the experiments for $10^5$ steps and average the results over $20$ runs.
Figure~\ref{fig:cartpole} shows performance in terms of cumulative $\#$ Falls (lower is better) for the $\lambda$ values with $CVaR_{0.25}\circ CVaR_{0.25}$.
We observe that FTRL with a reasonable $\lambda=1.0$ shows better performance, i.e.\ fewer falls, than with the large $\lambda = \ln 100 \approx 4.6$ or the small values $\lambda = 0.01$ and $\lambda = 0.1$.
We also observe that for $\lambda=1$ the variance of $\#$Falls is significantly lower than for the other values, which indicates more stable performance.

\textbf{Summary of Results.} 
%In Figure~\ref{fig:distributions}, we can see that our composite risk measure with $U_A, U_E = CvaR_{\alpha}$ more accurately estimate the full uncertainty over the additive formulation with $U_A = CVaR_{\alpha}$. 
%In particular, the additive risk measure deviates from the original estimate in the far left tail, i.e. $\alpha \rightarrow 0$, where the less probable events happen. But this events are typically interesting for risk-sensitive applications.
Table~\ref{tab:highway10} and Fig.~\ref{fig:highway} show the risk-sensitive performance of VDQN, CDQN, aleatory CVaR, total variance decomposition UA-DQN, and SENTINEL-4 with additive and composite CVaR risks on a large continuous-state environment. SENTINEL-4 with composite risk outperforms competing algorithms in terms of the achieved value function and estimated aleatory risk. It also causes the fewest crashes among the competing algorithms.
Fig.~\ref{fig:risk} demonstrates the ability to choose any coherent risk measure for SENTINEL-K, including different risk measures for both epistemic and aleatory risk.
Fig.~\ref{fig:cartpole} shows that selecting $\lambda$ is important in bootstrapped RL, and tuning it yields better performance over model averaging ($\lambda \rightarrow 0$) and greedy selection ($\lambda \rightarrow \infty$).
We defer the results on the choice of $K$ in the ensemble, convergence in return distribution, and improved efficiency in estimating multi-modal return distributions, to the Appendix. %In Fig.~\ref{fig:toy_example1} and~\ref{fig:toy_example2}, we show that SENTINEL-K framework estimates multi-modal return distributions more efficiently than the classical DRL algorithms.%, such as VDQN. 

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


%\input{discussion}
%%%%%%%%%%%%%%%%% discussion.tex %%%%%%%%%%%%

\section{Discussion}\label{sec:discussion}
In this paper, we study the problem of risk-sensitive RL. We propose two main contributions. 
The first is the \textit{composite risk} formulation that quantifies the holistic effect of aleatory and epistemic risk involved in learning. With a reductive experiment, we show that composite risk estimates the total risk involved in a problem more accurately than existing additive formulations.
The second one is \textit{SENTINEL-K} which ensembles $K$ distributional RL estimators, namely CDQNs, to provide an accurate estimate of the return distribution.  
We adopt FTRL from bandit literature as a means of model selection. %FTRL weighs each estimator differently depending on how far away they are from the average estimator. 
%This leads to a better estimate of the composite risk over return. 
FTRL weighs each estimator adaptively and leads to better experimental performance than greedy selection and model averaging.
Experiments show that SENTINEL-K achieves superior risk-sensitive performance when used with a composite CVaR estimate, and can operate on compositions of different risk measures, unlike existing works.

Motivated by the experimental success, we aim to investigate theoretical properties of FTRL-driven bootstrapped distributional RL with and without composite risk estimates. %Since real-world deployment of RL in industries is an ongoing endeavour of our times, we expect this work to contribute in safe and risk-sensitive use of RL.
%We experimentally demonstrate superior performance of the proposed composite risk-driven FTRL-based distributional RL over the risk-neutral and additive risk-based distributional RL in terms of (i) uncertainty estimation, (ii) performance, and (iii) theoretical properties.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%\begin{contributions} % will be removed in pdf for initial submission,
                      % so you can already fill it to test with the
                      % ‘accepted’ class option

%\end{contributions}

\begin{acknowledgements} % will be removed in pdf for initial submission,
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
We would like to thank Dapeng Liu for fruitful discussions in the beginning of the project. This work was partially supported by the Wallenberg AI, Autonomous Systems and Software Program (WASP) funded by the Knut and Alice Wallenberg Foundation. The computations were enabled by resources provided by the Swedish National Infrastructure for Computing (SNIC) at C3SE, partially funded by the Swedish Research Council through grant agreement no.~2018-05973.
\end{acknowledgements}

\bibliography{references}

%\newpage
%\appendix
%\onecolumn
%\input{additional_theory}
%\input{additional_results}

\end{document}
