% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
% \documentclass[underreview]{uai2023} % after submission
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}
% \usepackage{balance}

%% Some suggested packages, as needed:
\usepackage[numbers]{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams


% NEW
\usepackage{subcaption}
% \usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{xcolor}
\usepackage{xspace}
\usepackage{nicefrac}
% Label enumerate items as "1)", "2)", ...
% NOTE(review): \setlist is an enumitem command and enumitem is not loaded in
% this file -- presumably the document class loads it; confirm.
\setlist[enumerate]{label={\arabic*)}}

% (Re)define the named color "gray" as 80% black in the CMYK model; it is the
% color used for algorithm comments below.
\definecolor{gray}{cmyk}{0,0,0,0.8}
% Typeset algorithmic \COMMENT text pushed to the right, in small gray
% italics, prefixed by a triangle.
  \renewcommand{\algorithmiccomment}[1]{\hfill \small\textcolor{gray}{\textit{$\triangleright$ #1}}}
\usepackage[capitalise,noabbrev,nameinlink]{cleveref}
% Name that cleveref prints for labels of type ALC@unique (presumably the
% per-line counter of the algorithmic package -- confirm), so that \cref on a
% labelled algorithm line reads "Line n".
\Crefname{ALC@unique}{Line}{Lines}
% myalg is stepped at the start of every algorithmic environment, and
% ALC@unique is reset each time myalg is stepped.
\newcounter{myalg}
\AtBeginEnvironment{algorithmic}{\refstepcounter{myalg}}
% \@addtoreset contains @ in its name, hence \makeatletter/\makeatother.
\makeatletter
\@addtoreset{ALC@unique}{myalg}
\makeatother

\usepackage{xr}
\makeatletter
% \addFileDependency{<file>}: record <file> as a dependency of this document.
%   - writes "(<file>)" to the terminal/log (presumably so that build tools
%     scanning the log pick the file up as an input -- confirm),
%   - appends <file> to LaTeX's file list via \@addtofilelist,
%   - writes "No file <file>." to the terminal/log if <file> does not exist.
% Every line of the body ends in % so that the macro contributes no stray
% space tokens when it is expanded.
\newcommand*{\addFileDependency}[1]{%
  \typeout{(#1)}%
  \@addtofilelist{#1}%
  \IfFileExists{#1}{}{\typeout{No file #1.}}%
}
\makeatother

% \includeexternaldocument[<prefix>]{<file>}: import the labels of <file>
% through xr's \externaldocument, with <prefix> prepended to every imported
% label name, and register <file>.tex and <file>.aux as file dependencies.
% If the optional argument is omitted, the literal string "prefix" is used.
\newcommand*{\includeexternaldocument}[2][prefix]{%
\externaldocument[#1]{#2}%
\addFileDependency{#2.tex}%
\addFileDependency{#2.aux}%
}
% Labels from koprulu_731-supp (presumably the supplementary material) are
% referenced in this document as supp-<label>.
\includeexternaldocument[supp-]{koprulu_731-supp}
% \externaldocument[supp-]{koprulu_731-supp}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>} typesets <b><sep><a>; <sep> defaults to "-".
\newcommand{\swap}[3][-]{#3#1#2} % just an example

% Operators declared with the starred form, so that subscripts are placed
% underneath the operator in display math.
% "arg max" / "arg min" are separated by a thin space (\,), the spacing used
% in the amsmath documentation, instead of a full interword space (~).
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
% "s.t." with a non-breaking space on either side.
\DeclareMathOperator*{\st}{~s.t.~}


\title{Risk-Aware Curriculum Generation for Heavy-Tailed Task Distributions}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{
\href{mailto:<cevahir.koprulu@utexas.edu>?Subject=Your UAI 2023 paper}{Cevahir~Koprulu}
% Cevahir~Koprulu
}
\author[2]{Thiago~D.~Simão}
\author[2]{Nils~Jansen}
\author[1]{Ufuk~Topcu}
% Add affiliations after the authors
\affil[1]{%
    % Computer Science Dept.\\
    % Cranberry University\\
    % Pittsburgh, Pennsylvania, USA
    University of Texas at Austin
}
\affil[2]{%
Radboud University, Nijmegen
    % Second Affiliation\\
    % Address\\
    % …
}

  
% \input{nomenclature.tex}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%% NOMENCLATURE %%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Environments
% Numbered theorem-like environment "Definition" (used for the CMDP definition).
\newtheorem{definition}{Definition}

% General
% Generic math notation. Macros wrapped in \ensuremath work in both text and
% math mode; all others must be used inside math mode.
\newcommand{\Reals}{\ensuremath{\mathbb{R}}}
\newcommand{\PositiveIntegers}{\ensuremath{\mathbb{Z}^+}}
% \newcommand{\Expectation}{\ensuremath{\mathbb{E}}}
% Declared as a starred operator so that subscripts are set underneath it in
% display math.
\DeclareMathOperator*{\Expectation}{\mathbb{E}}
\newcommand{\Probability}{\mathbb{P}}
\newcommand{\CDF}{F}
\newcommand{\Indicator}{\mathbf{1}}
% Normal distribution symbol and the symbols for its mean and standard deviation.
\newcommand{\NormalDistribution}{\mathcal{N}}
\newcommand{\NormalDistributionMean}{\mu}
\newcommand{\NormalDistributionSTD}{\sigma}
% Cauchy distribution: density f, location l, scale s, random variable X.
\newcommand{\CauchyDistributionPDF}{f}
\newcommand{\CauchyDistributionLocation}{l}
\newcommand{\CauchyDistributionScale}{s}
\newcommand{\CauchyDistributionRandomVariable}{X}
% \norm{x} typesets x between double vertical bars (mathtools paired
% delimiter; the starred form \norm*{x} sizes the bars automatically).
\DeclarePairedDelimiter{\norm}{\lVert}{\rVert}
\newcommand{\ProbSimplex}{\Delta}
\newcommand{\RandomVariable}{X}
\newcommand{\RandomVariableSample}{x}

% Contextual MDP
% All macros in this group are wrapped in \ensuremath, so they can be used in
% running text as well as in math mode. In text, follow a macro with {} when a
% space must be kept after it (e.g. \CmdpDiscount{} are ...), because TeX
% skips spaces after a control word.
\newcommand{\context}{\ensuremath{\mathbf{c}}}
\newcommand{\Cmdp}{\ensuremath{\mathcal{M}}}
\newcommand{\CmdpStateSpace}{\ensuremath{\mathcal{S}}}
\newcommand{\CmdpActionSpace}{\ensuremath{\mathcal{A}}}
\newcommand{\CmdpContextSpace}{\ensuremath{\mathcal{C}}}
\newcommand{\CmdpMapping}{\ensuremath{\mathsf{M}}}
\newcommand{\CmdpState}{\ensuremath{\mathbf{s}}}
\newcommand{\CmdpAction}{\ensuremath{\mathbf{a}}}
\newcommand{\CmdpReward}{\ensuremath{r}}
% Transition function and initial state distribution carry the context
% \context as a fixed subscript.
\newcommand{\CmdpTransitionFunction}{\ensuremath{p_{\context}}}
% \CmdpRewardFunction[<ctx>] typesets r with subscript <ctx>; the subscript
% defaults to \context.
\newcommand{\CmdpRewardFunction}[1][\context]{\ensuremath{r_{#1}}}
\newcommand{\CmdpInitialDistribution}{\ensuremath{p_{0,\context}}}
\newcommand{\CmdpPolicy}{\ensuremath{\pi}}
\newcommand{\CmdpDiscount}{\ensuremath{\gamma}}


% Contextual RL
\newcommand{\ValueFunction}{\ensuremath{V_{\CmdpPolicy}}}
\newcommand{\TargetContextDistribution}{\ensuremath{\varphi}}
% \CRLExpectation[<dist>] typesets J(\CmdpPolicy,<dist>), the expected
% discounted return under context distribution <dist>; <dist> defaults to the
% target context distribution.
\newcommand{\CRLExpectation}[1][\TargetContextDistribution]{\ensuremath{J(\CmdpPolicy,#1)}}
\newcommand{\Trajectory}{\ensuremath{\boldsymbol{\tau}}}
\newcommand{\Return}{\ensuremath{G}}
% Math-mode only (no \ensuremath).
\newcommand{\ReturnSample}{g}

% Self-paced RL
% Symbols for the self-paced RL curriculum; math-mode only (no \ensuremath).
\newcommand{\ContextDistribution}{\varrho}
\newcommand{\ContextDistributionParameter}{\nu}
\newcommand{\KLDivergenceBound}{\epsilon}
\newcommand{\NumberOfIterations}{K}
\newcommand{\NumberOfRollouts}{M}
\newcommand{\KLDivergence}{D_{\text{KL}}}
\newcommand{\PerformanceConstraint}{\delta}
\newcommand{\TrajectorySet}{\mathcal{D}}
\newcommand{\SPRLIteration}{k}

% Risk
\newcommand{\CVaRCoeff}{\alpha}
% \CVaR[<level>] and \AlphaQuantile[<level>] typesets CVaR / q with subscript
% <level>; the subscript defaults to \CVaRCoeff.
\newcommand{\CVaR}[1][\CVaRCoeff]{\text{CVaR}_{#1}}
\newcommand{\AlphaQuantile}[1][\CVaRCoeff]{q_{#1}}
\newcommand{\FilterCVaR}{\ensuremath{\mathfrak{F}}}

% CEM
\newcommand{\CEMNumberOfIteration}{I}
% Context distribution symbol with a tilde accent.
\newcommand{\ContextDistributionCEM}{\Tilde{\ContextDistribution}}
\newcommand{\ImportanceWeight}{\omega}
\newcommand{\BatchSize}{N}
\newcommand{\SmoothenedCVaRCoeff}{\beta}
% Calligraphic version of the return symbol \Return.
\newcommand{\SetOfReturns}{\mathcal{\Return}}

% RaCGEN
% Algorithm name in small caps; \xspace inserts a following space when the
% macro is used in running text and is not followed by punctuation.
\newcommand{\RACGEN}{\textsc{RaCGEN}\xspace}
% RaCGEN reuses the self-paced RL iteration index.
\newcommand{\RACGENIteration}{\SPRLIteration}
% Quantities of the primary and auxiliary curricula are the base symbols with
% an italic "pri" / "aux" superscript. Math-mode only.
\newcommand{\PrimaryTrajectorySet}{\mathcal{D}^{\textit{pri}}}
\newcommand{\AuxiliaryTrajectorySet}{\mathcal{D}^{\textit{aux}}}
\newcommand{\PrimaryContext}{\context^{\textit{pri}}}
\newcommand{\AuxiliaryContext}{\context^{\textit{aux}}}
\newcommand{\NumberOfPrimaryRollouts}{\NumberOfRollouts^{\textit{pri}}}
\newcommand{\NumberOfAuxiliaryRollouts}{\NumberOfRollouts^{\textit{aux}}}
\newcommand{\PrimaryImportanceWeight}{\ImportanceWeight^{\textit{pri}}}
\newcommand{\AuxiliaryImportanceWeight}{\ImportanceWeight^{\textit{aux}}}
\newcommand{\ImportanceWeightSet}{\Omega}
% \EstimatedAlphaQuantile[<level>] typesets q-hat with subscript <level>; the
% subscript defaults to \CVaRCoeff.
\newcommand{\EstimatedAlphaQuantile}[1][\CVaRCoeff]{\hat{q}_{#1}}
\newcommand{\Quantile}{q}
% \newcommand{\SmoothenedCVaRCoeff}{\beta}
\newcommand{\RiskLevelSchedule}{\rho}

% SOTA and Baselines
% Method names in small caps; \xspace inserts a following space when a name
% is used in running text and is not followed by punctuation.
\newcommand{\GOALGAN}{\textsc{GoalGAN}\xspace}
\newcommand{\ALPGMM}{\textsc{ALP-GMM}\xspace}
\newcommand{\PLR}{\textsc{PLR}\xspace}
\newcommand{\VDS}{\textsc{VDS}\xspace}
\newcommand{\SPDL}{\textsc{SPDL}\xspace}
\newcommand{\CURROT}{\textsc{CURROT}\xspace}
\newcommand{\DEFAULT}{\textsc{Default}\xspace}
\newcommand{\DEFAULTCEM}{\textsc{Default-CEM}\xspace}
\newcommand{\SPDLN}{\textsc{SPDL-N}\xspace}
\newcommand{\RACGENN}{\textsc{RaCGEN-N}\xspace}
% Distribution names in upright roman text.
\newcommand{\Normal}{\textrm{Normal}\xspace}
\newcommand{\Cauchy}{\textrm{Cauchy}\xspace}
% Hyperparameter symbols of the baselines (math-mode only).
% NOTE(review): \CURROTPerf, \PLRStale and \PLRTemp expand to the same glyphs
% as \PerformanceConstraint (\delta), \RiskLevelSchedule (\rho) and
% \SmoothenedCVaRCoeff (\beta), respectively -- confirm the overlap is intended.
\newcommand{\CURROTPerf}{\delta}
\newcommand{\CURROTWass}{\epsilon_{\text{Wass}}}
\newcommand{\PLRStale}{\rho}
\newcommand{\PLRTemp}{\beta}
\newcommand{\PLRReplay}{p}
\newcommand{\VDSLR}{\text{LR}}
\newcommand{\VDSEpoch}{n_{\text{ep}}}
\newcommand{\VDSBatch}{n_{\text{batch}}}
\newcommand{\GOALGANNoise}{\delta_{\text{noise}}}
\newcommand{\GOALGANRollout}{n_{\text{rollout}}^{\text{GG}}}
\newcommand{\GOALGANSuccess}{p_{\text{success}}}
\newcommand{\ALPGMMRandom}{p_{\text{rand}}}
\newcommand{\ALPGMMRollout}{n_{\text{rollout}}^{\text{AG}}}
\newcommand{\ALPGMMBuffer}{s_{\text{buffer}}}
\newcommand{\PPO}{\textsc{PPO}\xspace}

% \new{<text>} typesets <text> in blue; the color change is confined to the
% argument by the inner group.
\newcommand{\new}[1]{{\color{blue}#1}}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%% NOMENCLATURE %%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{document}
\maketitle

\begin{abstract}
Automated curriculum generation for reinforcement learning (RL) aims to speed up learning by designing a sequence of tasks of increasing difficulty.
Such tasks are usually drawn from probability distributions with exponentially bounded tails, such as uniform or Gaussian distributions.
However, existing approaches overlook \emph{heavy-tailed} distributions.
Under such distributions, current methods may fail to learn optimal policies in \emph{rare} and \emph{risky} tasks, which fall under the tails and yield the lowest returns, respectively.
We address this challenge by proposing a risk-aware curriculum generation algorithm that simultaneously creates two curricula:
\begin{enumerate*}
    \item a \emph{primary curriculum} that aims to maximize the expected discounted return with respect to a distribution over target tasks, and 
    \item an \emph{auxiliary curriculum} that identifies and over-samples rare and risky tasks observed in the primary curriculum.
\end{enumerate*}
Our empirical results evidence that the proposed algorithm achieves significantly higher returns in frequent as well as rare tasks compared to the state-of-the-art methods.
\end{abstract}

\section{Introduction}

The design of task sequences, i.e., curricula, improves the performance of reinforcement learning (RL) agents and speeds up convergence in complex tasks \citep{narvekar2020curriculum}.
A curriculum typically begins with easy tasks and gradually increases difficulty toward target tasks.
A common approach is to tailor the curricula using human input to identify easy and hard tasks~\citep{Asada1996,narvekar2020curriculum}.
% Common approaches manually tailor curricula to identify easy and hard tasks~\cite{Asada1996}, which requires human input \citep{narvekar2020curriculum}.
Recent studies facilitate this process by automating the curriculum generation~\citep{pmlr-v78-florensa17a,portelas2020teacher}.
In particular, \emph{self-paced} RL uses a \emph{context} to parameterize the dynamics and rewards of the environment, which implicitly defines a task~\citep{klink2020self_SPCRL,NEURIPS2020_68a97503,klink2021probabilistic,klink2022curriculum}.
These methods assume the contexts are drawn from distributions known a priori.
Considering problems described by a \emph{target context distribution}, self-paced RL automatically generates a curriculum, represented by a sequence of context distributions, to speed up learning.

In general, curriculum generation methods overlook \emph{heavy-tailed} distributions and focus on target context distributions with exponentially bounded tails, e.g., a normal or uniform distribution.
However, heavy-tailed distributions commonly appear in the real world:
    words in natural language~\citep{zipf2013psycho} and relationships in social networks~\citep{albert2002statistical} follow a power law distribution,
    while Cauchy distributions appear in risk analysis~\citep{osu2011financial} and rainfall models~\citep{langat2019identification}.
Rare events are more likely to occur under heavy-tailed distributions, as the area under the extreme regions is larger than the area under the tails of an exponentially-bounded probability distribution \citep{wilcox2003applying}.
A possible explanation for such disregard is that simulated RL environments typically have uniform variations \citep{bellemare2013arcade,cobbe2019quantifying,1606.01540}, and, even when a target distribution is non-uniform, it does not reflect the heavy-tailed nature of the real world \citep{vinyals2017starcraft}.

% Exposing an RL agent to a heavy-tailed context distribution poses challenges, as 
Existing RL algorithms may underperform in a context drawn from the tails of a heavy-tailed distribution \citep{zhang2021learning,chan2022zipfian}.
% The reason behind such failure is that off-the-shelf algorithms overlook rare contexts among frequent tasks.
This occurs because an off-the-shelf algorithm would be \emph{underexposed} to rare contexts, i.e., it does not encounter such contexts sufficiently often to learn a good policy for them.
Curriculum generation methods face the same problem since they do not explicitly address rare contexts, either.
These approaches may yield policies that are sub-optimal in rare contexts drawn from exponentially-bounded distributions, as rare contexts have a low impact on the average performance. However, in heavy-tailed distributions, these rare contexts together are more frequent and exacerbate the performance loss. % problem as rare contexts are more likely to occur overall.
Furthermore, we observe that rare contexts correlate with \textit{risky} contexts, where the agent's return is among the lowest (see \cref{section:pitfalls}).
As a result, curriculum learning methods fail to be robust in rare and risky contexts.

We address the challenges faced under heavy-tailed task distributions by developing a \emph{risk-aware} curriculum generation algorithm (\RACGEN).
To improve the policy in the tails, \RACGEN simultaneously creates two curricula:
\begin{enumerate*}
    \item a \textit{primary curriculum} that speeds up the learning of the target context distribution via self-paced RL~\citep{klink2021probabilistic}; and 
    \item an \textit{auxiliary curriculum} that targets risky and rare contexts under the primary curriculum.
\end{enumerate*}
The auxiliary curriculum is inspired by a cross entropy method~(CEM), which estimates probabilities of rare events~\citep{de2005tutorial}.
Similar to \citeauthor{greenberg2022efficient}'s work, which does not focus on curriculum learning, we employ CEM to generate a distribution over contexts where the agent's return is below the conditional value at risk (CVaR) of the distribution over returns. In comparison, via CEM, \RACGEN generates a sequence of auxiliary context distributions
that identify rare and risky contexts under primary context distributions produced by the primary curriculum.

\textbf{Contribution.}
Our contribution is three-fold. We:
\begin{enumerate*}
    \item identify shortcomings of existing automated curriculum methods under heavy-tailed target context distributions;
    \item propose \RACGEN, which combines self-paced RL with CEM to simultaneously speed up learning and improve the performance in rare contexts; %robustness;
    and
    \item demonstrate empirically that, compared to state-of-the-art automated curriculum methods, \RACGEN achieves significantly higher returns, with $p {<} 0.001$, in frequent as well as rare and risky contexts.
\end{enumerate*}


\section{Related Work}

We discuss the connections between RL and four subjects related to our work: generalization, curriculum learning, heavy-tailed task distributions, and
risk optimization.


\paragraph{Generalization in RL.}
We investigate the setting where an RL agent trains on a set of tasks and is deployed to tasks unseen during training.
% We investigate a multi-task RL 
This problem is formulated via contextual Markov decision processes~(CMDPs).
In this setting, a singleton task refers to an MDP instance described by a context that parameterizes the reward and transition functions \citep{hallak2015contextual}.
The objective % of an RL agent in a CMDP 
is to maximize the expected discounted return in the MDPs corresponding to the contexts drawn from a probability distribution over the context space of the CMDP.
The contexts that an RL agent sees in training and test time are sampled from the same distribution.
Therefore, from a generalization perspective, we consider an \emph{interpolation} problem as contexts in test time can be interpolated from contexts seen during training \citep{kirk2023survey}.
Under the interpolation subarea, we particularly focus on contextual MDPs where contexts are drawn from a heavy-tailed probability distribution defined over context spaces.


\textbf{Curriculum learning for RL.} Automatically generating curricula in RL aims to accelerate convergence to optimal policies by modifying the configuration of the environment. 
Numerous works consider curricula as sequences of distributions over such configurations.
\citet{pmlr-v78-florensa17a} focus on distributions over initial states by starting in the neighborhood of the goal state and reversely working towards a target distribution. 
Other studies propose generating distributions over goal states by optimizing with respect to intrinsic motivation \citep{baranes2010intrinsically,portelas2020teacher}, intermediate goal difficulty~\citep{florensa2018automatic}, value disagreement \citep{NEURIPS2020_566f0ea4}, and feasibility and coverage of goal states~\citep{racaniere2019automated}. 
Another line of work takes the perspective of generating distributions of levels, i.e., environment instances, that prioritize higher learning potential~\citep{jiang2021prioritized,jiang2021replay}.
Our work falls under self-paced RL, a curriculum learning approach adopted from supervised learning where training samples are automatically ordered in increasing complexity~\citep{NIPS2010_e57c6b95,jiang2015self}. 
\citet{ren2018self} consider curricula as a sequence of environment interactions and propose a self-paced mechanism that minimizes coverage penalty.
\citeauthor{eimer2021self}'s work generates a sequence of contexts, not distributions, with respect to their capacity of value improvement~\citep{eimer2021self}. \citet{klink2020self_SPCRL,NEURIPS2020_68a97503,klink2021probabilistic,klink2022curriculum,koprulu2023reward} formulate the generation of curricula as interpolations between distributions over contexts. \citet{chen2021variational} also study interpolations between task distributions, but not under the self-paced RL framework.
Although they do not consider risk as a safety metric, \citet{turchetta2020safe} propose an approach for generating curricula in safety-critical applications. When the student behaves dangerously, the teacher intervenes by activating reset controllers that take the student to a safe state.

\textbf{RL under heavy-tailed task distributions.}
Some supervised learning approaches have considered learning under heavy-tailed distributions, such as in computer vision with the \emph{Long-tailed ImageNet} benchmark \citep{liu2019large}, but only a few works particularly concentrate on rare events or heavy-tailed task distributions in RL.
\citet{frank2008reinforcement} devise an importance sampling approach to alter probabilities of rare events in simulation data for a tabular setting.
\citet{chan2022zipfian} is the first work that investigates the shortcomings of Deep RL algorithms in rare events, sampled from Zipfian distributions, which are heavy-tailed and fall under the family of power law distributions.
In addition, \citet{zhuang2021no} propose no-regret RL algorithms for settings with rewards that follow heavy-tailed distributions.
To our knowledge, our work is the first work that proposes an automated curriculum learning method to address heavy-tailed task distributions.

\textbf{Risk optimization in RL.} 
Minimizing risk in RL aims to learn policies that maximize performance while satisfying safety requirements during training and test time \citep{garcia2015comprehensive}. To this aim, \citet{tamar2015policy} propose a policy gradient algorithm for general coherent risk measures, among which CVaR is very popular \citep{tamar2015optimizing,rajeswaran2017epopt,DBLP:journals/ml/YangSTS23}.
\citet{greenberg2022efficient} focus on CVaR optimization in a multi-task setting and present a risk-averse RL algorithm that combines risk-optimizing policy gradient methods with CEM that identifies and samples risky tasks.
Although we take inspiration from \citet{greenberg2022efficient}, we do not optimize risk in RL. Instead, we utilize CEM in curriculum generation to sample rare and risky contexts, namely, tasks, under context distributions generated by primary curricula.

\section{Contextual MDP}
We formalize our problem of interest as a contextual RL problem, which uses contextual Markov decision processes~(CMDPs) to model a multi-task setting given a distribution over target contexts.
Upon introducing these concepts, we continue laying the foundations for self-paced RL and cross-entropy methods, which we adopt to generate primary and auxiliary curricula, respectively.

\begin{definition}
A \emph{contextual Markov decision process (CMDP)} $\Cmdp=\langle \CmdpStateSpace, \CmdpActionSpace, \CmdpContextSpace, \CmdpMapping, \CmdpDiscount \rangle$ is defined by a state space \CmdpStateSpace, an action space \CmdpActionSpace, a context space $\CmdpContextSpace \subseteq \Reals^n$ for $n\in\PositiveIntegers$, a mapping from context space to Markov decision process parameters \CmdpMapping, and a discount factor \CmdpDiscount.
\end{definition}
A CMDP $\Cmdp$ represents a family of MDPs parameterized by its contexts \CmdpContextSpace.
Given a context $\context \in \CmdpContextSpace$, we obtain an MDP $\CmdpMapping(\context)=\langle \CmdpStateSpace, \CmdpActionSpace, \CmdpTransitionFunction, \CmdpRewardFunction, \CmdpInitialDistribution, \CmdpDiscount \rangle$, where \CmdpStateSpace, \CmdpActionSpace, and \CmdpDiscount{} are the same state space, action space, and discount factor as in $\Cmdp$, but its probabilistic transition function \CmdpTransitionFunction, reward function \CmdpRewardFunction, and initial state distribution~$\CmdpInitialDistribution$ depend on its context \context.
A policy~$\CmdpPolicy:\CmdpStateSpace\times\CmdpContextSpace\to\ProbSimplex(\CmdpActionSpace)$, which defines the behavior of an agent in a CMDP $\Cmdp$, outputs a probability distribution over the action space $\CmdpActionSpace$ given state $\CmdpState\in\CmdpStateSpace$ and context $\context\in\CmdpContextSpace$. Note that the agent observes the context $\context$.
Following policy $\CmdpPolicy$, an agent collects a trajectory $\Trajectory=\{(\CmdpState_t, \context, \CmdpAction_t,\CmdpReward_t)\}_{t=0}^T$ of length~$T$ with an initial state~$\CmdpState_0\sim\CmdpInitialDistribution$, states $\CmdpState_{t+1}\sim\CmdpTransitionFunction(\cdot|\CmdpState_t,\CmdpAction_t)$, actions~$\CmdpAction_t\sim\CmdpPolicy(\cdot|\CmdpState_t,\context)$, and rewards $\CmdpReward_t=\CmdpRewardFunction(\CmdpState_t,\CmdpAction_t)$ for time steps $t\in[T]$.


% \subsection{Contextual Reinforcement Learning}
Given a CMDP $\Cmdp$ and a target context distribution~$\TargetContextDistribution \in \ProbSimplex(\CmdpContextSpace)$, i.e., an element of the probability simplex over context space~$\CmdpContextSpace$,
\emph{contextual RL} aims to learn a policy that maximizes the expected discounted return in contexts $\context$ drawn from $\TargetContextDistribution$:
%
\begin{equation}
    \max_{\CmdpPolicy}\CRLExpectation=\max_{\CmdpPolicy}\Expectation_{\Probability_{\context}^{\CmdpPolicy}(\Trajectory),\TargetContextDistribution(\context)}[\Return(\Trajectory)],
    \label{eq:crl}
\end{equation}
%
where $\Return(\Trajectory)=\sum_{t=0}^T\CmdpDiscount^t\CmdpRewardFunction(\CmdpState_t,\CmdpAction_t)$ is the discounted return for trajectory $\Trajectory$, and $\Probability_{\context}^{\CmdpPolicy}(\Trajectory)$ is the probability distribution of trajectory $\Trajectory$ induced by policy $\CmdpPolicy$ in context $\context$ as:
%
$
    \Probability_{\context}^{\CmdpPolicy}(\Trajectory)
    =
    \CmdpInitialDistribution(\CmdpState_0)\prod_{t=0}^{T} \CmdpPolicy(\CmdpAction_t|\CmdpState_t,\context) \CmdpTransitionFunction(\CmdpState_{t+1}|\CmdpState_t,\CmdpAction_t).
$

Contextual RL formulates an optimal decision-making problem that we attempt to solve in this paper.
Particularly, we focus on heavy-tailed target context distributions under which rare and risky contexts pose a challenge: An agent requires more samples in a risky context, as it is non-trivial to acquire an optimal behavior. When this context falls under the tails of the target context distribution $\TargetContextDistribution$, simply using the target distribution $\TargetContextDistribution$ prevents the agent from obtaining sufficiently many samples in a sample-efficient manner. In addition, a learning algorithm can get stuck in local optima while maximizing the expected discounted return $\CRLExpectation$ by overlooking rare contexts. The literature on automated curriculum generation fails to address this phenomenon by solely focusing on exponentially-bounded target context distributions where contexts are either equally likely or not so spread out, e.g., under a uniform or a normal distribution, respectively (see \cref{section:pitfalls} for a detailed discussion).

\paragraph{Problem statement.}
% In a multi-task setting,
Given a CMDP~$\Cmdp$ to describe the parameterization of a set of tasks via contexts, and a \emph{heavy-tailed} target context distribution $\TargetContextDistribution$ to specify their probability of occurrence, \emph{sample-efficiently} learn a policy $\CmdpPolicy$ that maximizes the expected discounted return $\CRLExpectation$ in $\Cmdp$.


\Cref{fig:pointmass1D} shows an example domain, called the \emph{point-mass} environment, where \emph{a context specifies the position of the door}.
The agent must reach the goal position by passing through the door.
An episode terminates when the agent hits the wall or reaches the goal.
The state space, i.e., all possible positions of the agent, and the action space, i.e., forces applied to the point mass along two axes, are independent of the context.
However, the context affects the transitions, e.g., whether the agent ends up in the wall, and the rewards, e.g., if the agent receives a reward for approaching the goal position without hitting the wall. 

\begin{figure}[tbp]
    \centering
    \includegraphics[width=.35\textwidth]{figures/point_mass_1d.pdf}
    \caption{Point-mass environment with 1D context space: Context $\context$ determines the position of the door.}
    \label{fig:pointmass1D}
\end{figure}

For the point-mass environment, an example target context distribution is a univariate normal distribution $\NormalDistribution(\NormalDistributionMean,\NormalDistributionSTD^2)$, not heavy-tailed, with mean $\NormalDistributionMean$ and standard deviation $\NormalDistributionSTD$ over context values, i.e., door positions, in context space $\CmdpContextSpace$.

\section{Contextual RL}
% This section reviews methods to solve CMDPs.
In this section, we review two methods that aim to solve CMDPs from different perspectives.


\subsection{Self-Paced RL}

\begin{algorithm}[tb]
    \caption{Self-paced RL \protect\citep{klink2021probabilistic}}
    \label{alg:sprl}
        \textbf{Input}: Target and initial context distributions $\TargetContextDistribution, \ContextDistribution_{0}$\\
    \textbf{Parameters}: Performance constraint $\PerformanceConstraint$, KL divergence bound~$\KLDivergenceBound$, number of curriculum iterations $\NumberOfIterations$, number of rollouts per policy update $\NumberOfRollouts$\\
    \textbf{Output}: Policy $\CmdpPolicy$
    \begin{algorithmic}[1] %[1] enables line numbers
        \STATE Initialize policy $\CmdpPolicy$
        \FOR{$k=1$ \textbf{to} $\NumberOfIterations$}
        \STATE $\context_i\sim\ContextDistribution_{k-1}$, $i\in[\NumberOfRollouts]$ \COMMENT{sample contexts} \label{l:sprl_sample_contexts}
        % \STATE $\Trajectory_i\sim\Probability_{\context_i}^{\CmdpPolicy}(\Trajectory)$, $i\in[\NumberOfRollouts]$
        \STATE $\TrajectorySet_k=\{(\context_i,\Trajectory_i) | \Trajectory_i\sim\Probability_{\context_i}^{\CmdpPolicy}(\Trajectory) \}_{i=1}^{\NumberOfRollouts}$
        \COMMENT{collect trajectories} \label{l:sprl_collect_trajectories}
        \STATE $\CmdpPolicy \leftarrow \Psi(\TrajectorySet_k,\CmdpPolicy )$ \COMMENT{update policy with RL algorithm $\Psi$}
        % \STATE Update policy $\CmdpPolicy$ via RL algorithm of choice using rollouts $\TrajectorySet_k=\{(\context_i,\Trajectory_i)\}_{i=1}^{\NumberOfRollouts}$
        \STATE $\ContextDistribution_{\SPRLIteration} \leftarrow\Phi_{\TargetContextDistribution}(\CmdpPolicy,\TrajectorySet_{\SPRLIteration},\ContextDistribution_{\SPRLIteration-1}) $
        \COMMENT{new context distribution (\ref{eq:sprl})}
        % Obtain the next context distribution $\ContextDistribution_{k}$ by optimizing \cref{eq:sprl} using $\TrajectorySet_k$
        \ENDFOR
        \STATE \textbf{return} $\CmdpPolicy$
    \end{algorithmic}
\end{algorithm}

Self-paced RL \citep{klink2021probabilistic} is an automated curriculum generation approach that creates a sequence $\{\ContextDistribution_{\SPRLIteration}\}_{\SPRLIteration=1}^\NumberOfIterations$ of context distributions $\ContextDistribution_{\SPRLIteration}$ to learn a policy $\CmdpPolicy$ that maximizes $\CRLExpectation$ given a CMDP $\Cmdp$ with a target context distribution~$\TargetContextDistribution$. 
The algorithm starts with an initial context distribution $\ContextDistribution_0$, under which \emph{easy} contexts are more likely to occur. Looking at the point-mass environment, an easy context has a door positioned in the middle of the room and thus yields the highest return under an optimal policy.

\Cref{alg:sprl} provides more details about the method.
At iteration $\SPRLIteration$, first, the algorithm samples contexts $\{\context_i\}_{i=1}^{\NumberOfRollouts}$ from the current context distribution $\ContextDistribution_{\SPRLIteration - 1}$ (\cref{l:sprl_sample_contexts}), and rolls out policy $\CmdpPolicy$ to collect a set of trajectories $\TrajectorySet_{\SPRLIteration}$ (\cref{l:sprl_collect_trajectories}).
Then, using $\TrajectorySet_{\SPRLIteration}$, policy $\CmdpPolicy$ is updated via an RL algorithm of choice. Finally, the algorithm generates the next context distribution~$\ContextDistribution_{\SPRLIteration}$, which minimizes the KL divergence to the target context distribution $\TargetContextDistribution$:
\begin{align}
    \Phi_{\TargetContextDistribution}(\CmdpPolicy,\TrajectorySet_{\SPRLIteration},\ContextDistribution_{\SPRLIteration-1}) = \argmin_{\ContextDistribution_{\SPRLIteration}}~&~\KLDivergence(\ContextDistribution_{\SPRLIteration}||\TargetContextDistribution) \nonumber\\
    \text{s.t.}~&~\CRLExpectation[\ContextDistribution_{\SPRLIteration}]\geq\PerformanceConstraint, \label{eq:sprl}  \\
    &~\KLDivergence(\ContextDistribution_{\SPRLIteration-1}||\ContextDistribution_{\SPRLIteration}) \leq \KLDivergenceBound, \nonumber
\end{align}
where there are two constraints: 
\begin{enumerate*}
    \item the expected discounted return $\CRLExpectation[\ContextDistribution_{\SPRLIteration}]$ under the next context distribution $\ContextDistribution_{\SPRLIteration}$ should be equal to or greater than the desired level of performance~$\PerformanceConstraint$, and
    \item the KL divergence between the current context distribution $\ContextDistribution_{\SPRLIteration-1}$ and the next context distribution~$\ContextDistribution_{\SPRLIteration}$ should be less than or equal to the divergence bound $\KLDivergenceBound$.
\end{enumerate*}
The performance constraint guarantees that the agent collects sufficiently large returns.
In parallel, the KL divergence constraint prevents the curriculum from diverging too much from the previous context distribution, which could result in performance loss as past experience becomes less valuable.
To estimate $\CRLExpectation[\ContextDistribution_{\SPRLIteration}]$, self-paced RL uses the following unbiased sample average given $\NumberOfRollouts$ trajectories:
\begin{equation*}
    \CRLExpectation[\ContextDistribution_{\SPRLIteration}]
    =
    \frac{1}{\NumberOfRollouts}\sum_{i=1}^{\NumberOfRollouts}\frac{\ContextDistribution_{\SPRLIteration}(\context_{i})}{\ContextDistribution_{\SPRLIteration-1}(\context_{i})}\sum_{t=0}^{T_i}\CmdpDiscount^t\CmdpRewardFunction[\context_{i}](\CmdpState_t,\CmdpAction_t),
\end{equation*}
where $T_i$ is the length of the $i$-th trajectory.
\cref{eq:sprl} can be solved via any constrained optimization algorithm, such as trust-region, as adapted by \citet{klink2020self_SPCRL,NEURIPS2020_68a97503,klink2021probabilistic}. 

\paragraph{Self-paced RL fails under heavy-tailed target context distributions.}
The existing literature on self-paced RL merely focuses on exponentially-bounded target context distributions and generates a curriculum by taking the expected discounted return $\CRLExpectation[\ContextDistribution_{\SPRLIteration}]$ into account. Therefore, they do not address the challenges caused by risky and rare contexts that appear under heavy-tailed target context distributions.
We propose a risk-aware curriculum generation method that tackles these challenges by integrating the cross entropy method, which we explain next, into self-paced RL.

\subsection{Cross Entropy Method}
\begin{algorithm}[tb]
    \caption{CEM variant \protect\citep{greenberg2022efficient}}
    \label{alg:cem}
        \textbf{Input}: Context distribution $\ContextDistribution$, % return function $\Return$, 
    risk level $\AlphaQuantile$, policy $\CmdpPolicy$\\
    \textbf{Parameters}: Number of iterations $\CEMNumberOfIteration$, batch size $\BatchSize$, risk level $\CVaRCoeff$, smoothing risk level $\SmoothenedCVaRCoeff$\\
    \textbf{Output}: Auxiliary context distribution $\ContextDistributionCEM{}$

    \begin{algorithmic}[1] %[1] enables line numbers
        \STATE $\ContextDistributionCEM\leftarrow\ContextDistribution$
        \COMMENT{Initialize auxiliary context distribution}
        \FOR{$i=1$ \textbf{to} $\CEMNumberOfIteration$}
        \STATE $(\context_n,\Trajectory_n)\sim\Probability_{\ContextDistributionCEM}^{\CmdpPolicy}(\context_n,\Trajectory_n)$, $n\in[\BatchSize]$
        \COMMENT{collect trajectories}
        \STATE $\SetOfReturns\leftarrow\{\Return(\Trajectory_n)\}_{n=1}^{\BatchSize}$ 
        \COMMENT{compute returns}
        \STATE $\ImportanceWeight_n\leftarrow\ContextDistribution(\context_n)/\ContextDistributionCEM(\context_n)$, $n\in[\BatchSize]$
        \COMMENT{compute IS weights}
        \STATE $\AlphaQuantile[]\leftarrow\max{\{\AlphaQuantile(\SetOfReturns), \AlphaQuantile[\SmoothenedCVaRCoeff](\SetOfReturns)\}}$%
        \COMMENT{estimate quantile} \label{l:cem_estimate_quantile}
        \STATE $\ContextDistributionCEM\leftarrow\arg\max_{\ContextDistributionCEM'}\sum_{n=1}^{\BatchSize}\ImportanceWeight_n\Indicator_{\Return(\Trajectory_n)\leq\AlphaQuantile[]}\log \ContextDistributionCEM'(\context_n)$
        \COMMENT{new auxiliary context distribution}
        \ENDFOR
        \STATE \textbf{return} $\ContextDistributionCEM$
    \end{algorithmic}
\end{algorithm}
The cross entropy method (CEM) is a generic approach to rare event simulation and optimization~\citep{de2005tutorial}.
We use CEM to identify and sample risky contexts from the primary context distribution; thus, CEM does not aim to learn a policy.

We call a context $\context$ \emph{risky} if the discounted return $\Return(\Trajectory)$ of trajectory $\Trajectory$ in $\context$ is below the CVaR of the return distribution.
CVaR is a popular risk measure defined as $\CVaR(\RandomVariable)=\Expectation[\RandomVariable|\RandomVariable\leq\AlphaQuantile(\RandomVariable)]$, where $\AlphaQuantile(\RandomVariable)=\min\{\RandomVariableSample|\CDF_{\RandomVariable}(\RandomVariableSample)\geq\CVaRCoeff\}$ is the $\CVaRCoeff$-quantile of a random variable $\RandomVariable$ with cumulative distribution function $\CDF_{\RandomVariable}$.
To adapt CVaR to our setting, we take inspiration from \citet{greenberg2022efficient} and define CVaR as $\CVaR(\Return|\CmdpPolicy,\ContextDistribution)=\Expectation[\Return|\Return\leq\AlphaQuantile(\Return|\CmdpPolicy,\ContextDistribution)]$, where $\AlphaQuantile(\Return|\CmdpPolicy, \ContextDistribution)=\min\{g|\CDF_{\Return|\CmdpPolicy,\ContextDistribution}(g)\geq\CVaRCoeff\}$.
In other words, CVaR is the expectation of the lowest $\CVaRCoeff$-fraction of returns obtained by policy~$\CmdpPolicy$ in contexts drawn from context distribution~$\ContextDistribution$.

Our goal is to sample context-trajectory pairs $(\context,\Trajectory)$ from the distribution
%
$
    \Probability_{\ContextDistribution,\CVaRCoeff}^{\CmdpPolicy}(\context,\Trajectory)
    =
    % \frac{1}{\CVaRCoeff}
    \CVaRCoeff^{-1}
    \FilterCVaR(\Trajectory, \CmdpPolicy, \CVaRCoeff)
    \Probability_{\ContextDistribution}^{\CmdpPolicy}(\context,\Trajectory),
$
%
where 
%
$
    \FilterCVaR(\Trajectory, \CmdpPolicy, \CVaRCoeff)
    =
    \Indicator[{\Return(\Trajectory)\leq\AlphaQuantile(\Return|\CmdpPolicy,\ContextDistribution)}]
$
%
and
%
$
    \Probability_{\ContextDistribution}^{\CmdpPolicy}(\context,\Trajectory)
    =
    \ContextDistribution(\context)\Probability_{\context}^{\CmdpPolicy}(\Trajectory)
$
%
is the probability of drawing context $\context$ from context distribution $\ContextDistribution$ and collecting trajectory $\Trajectory$ via policy $\CmdpPolicy$.
In short, given risk-level $\CVaRCoeff$, we want to find the distribution $\Probability_{\ContextDistribution,\CVaRCoeff}^{\CmdpPolicy}$, that is the closest distribution to the tail of the distribution $\Probability_{\ContextDistribution}^{\CmdpPolicy}$.
We employ CEM to find a context distribution $\ContextDistributionCEM$ for which the distribution $\Probability_{\ContextDistributionCEM}^{\CmdpPolicy}$ is similar to $\Probability_{\ContextDistribution,\CVaRCoeff}^{\CmdpPolicy}$. To this end, CEM solves the following KL divergence minimization problem:
\begin{align}
    \ContextDistributionCEM&  \in
        \argmin_{\ContextDistributionCEM'}\KLDivergence(\Probability_{\ContextDistribution,\CVaRCoeff}^{\CmdpPolicy}||\Probability_{\ContextDistributionCEM'}^{\CmdpPolicy}) \nonumber \\
        &=\argmax_{\ContextDistributionCEM'}\!\!\Expectation_{(\context,\Trajectory)\sim\Probability_{\ContextDistribution}^{\CmdpPolicy}}\left[ \frac{\FilterCVaR(\Trajectory, \CmdpPolicy, \CVaRCoeff)\log\ContextDistributionCEM'(\context)}{\CVaRCoeff}\right] \label{eq:CEM}\\
        &=\argmax_{\ContextDistributionCEM'}\!\!\Expectation_{(\context,\Trajectory)\sim\Probability_{\ContextDistributionCEM'}^{\CmdpPolicy}}\left[\frac{\ImportanceWeight(\context,\Trajectory)\FilterCVaR(\Trajectory, \CmdpPolicy, \CVaRCoeff)\log\ContextDistributionCEM'(\context)}{\CVaRCoeff}\right],\nonumber
\end{align}
where $\ImportanceWeight(\context,\Trajectory)=\nicefrac{\Probability_{\ContextDistribution}^{\CmdpPolicy}(\context,\Trajectory)}{\Probability_{\ContextDistributionCEM'}^{\CmdpPolicy}(\context,\Trajectory)}=\nicefrac{\ContextDistribution(\context)}{\ContextDistributionCEM'(\context)}$ is the importance sampling (IS) weight for the context-trajectory pair~$(\context,\Trajectory)$. As the distribution over which the expectation is computed changes from $\Probability_{\ContextDistribution}^{\CmdpPolicy}$ to $\Probability_{\ContextDistributionCEM'}^{\CmdpPolicy}$ in \cref{eq:CEM}, an IS weight is necessary. We provide the pseudocode of the CEM variant of \citet{greenberg2022efficient} for sampling risky contexts in \cref{alg:cem}.
Given a smoothing risk level $\SmoothenedCVaRCoeff>\CVaRCoeff$, \cref{l:cem_estimate_quantile} enables smooth updates of context distribution $\ContextDistributionCEM$.

\paragraph{Integrating CEM into curriculum generation to sample risky contexts.}
\citet{greenberg2022efficient} use CEM to identify and sample risky contexts under the target context distribution $\TargetContextDistribution$, which can be achieved in \cref{alg:cem} by replacing input $\ContextDistribution$ with $\TargetContextDistribution$.
This method does not focus on curriculum generation, hence it does not benefit from performance and convergence advantages that come with curriculum learning in RL \citep{narvekar2020curriculum}.
In the next section, we address this gap by proposing a risk-aware curriculum generation method, where CEM takes the context distribution $\ContextDistribution_{\SPRLIteration}$ from curriculum iteration $\SPRLIteration$ to generate an auxiliary distribution $\ContextDistributionCEM_{\SPRLIteration}$ identifying the risky contexts under $\ContextDistribution_{\SPRLIteration}$.

\section{Risk-aware Curriculum Generation}
Building upon our problem of interest, contextual RL, we first discuss the challenges that emerge with heavy-tailed context distributions. Then, we present a risk-aware curriculum generation algorithm that adopts self-paced RL and CEM to address these challenges.
\subsection{Pitfalls of Heavy-tailed Task Distributions}
\label{section:pitfalls}

A probability distribution is \emph{heavy-tailed} if its tails are not exponentially-bounded; intuitively, they are heavier than the tails of the exponential probability distribution \citep{Asmussen2003}.
Extreme events or outliers are more likely to occur under heavy-tailed distributions, as the area under the extreme regions of the distribution is larger than the area under the tails of an exponentially-bounded probability distribution \citep{wilcox2003applying}. 
%A distribution, with one-dimensional support, can have one heavy tail, such as log-normal distribution and Pareto distribution, or two, such as Cauchy distribution and T-distribution. 
In addition, some moments of a heavy-tailed distribution do not exist.
For instance, a Cauchy distribution has no finite moments of order 1 or higher, which causes its mean and variance to be undefined.
Therefore, a Cauchy distribution is described by its median $\CauchyDistributionLocation$, i.e., location parameter, and its median absolute deviation $\CauchyDistributionScale$, i.e., scale parameter.
We focus on Cauchy distributions since they look similar to normal distributions; at the same time, they are considered pathological due to their heavy-tailed nature.
For a random variable~$\CauchyDistributionRandomVariable$ drawn from a Cauchy distribution with location~$\CauchyDistributionLocation$ and scale~$\CauchyDistributionScale$, the corresponding probability density function is $\CauchyDistributionPDF(x|\CauchyDistributionLocation,\CauchyDistributionScale)=\nicefrac{1}{\pi\CauchyDistributionScale}[1+(\nicefrac{x-\CauchyDistributionLocation}{\CauchyDistributionScale})^2]^{-1}$.

\begin{figure}[tb]
    \centering
    \begin{subfigure}[b]{\columnwidth}
        \centering
        \includegraphics[width=\textwidth]{figures/target_cvar.pdf}
        \caption{Normal and Cauchy target context distributions}
        \label{fig:target_cvar}
    \end{subfigure}
    % \\
    \begin{subfigure}[b]{\columnwidth}
        \centering
        \includegraphics[width=\textwidth]{figures/return_dist.pdf}
        \caption{Distribution of returns under Normal and Cauchy target}
        \label{fig:return_cvar}
    \end{subfigure}
    \caption{Pitfalls of heavy-tailed target context distributions in the point-mass environment: (\subref{fig:target_cvar}) Non-risky and risky contexts for $\CVaRCoeff=0.2$; and (\subref{fig:return_cvar}) distributions of returns for an optimal policy  under $\TargetContextDistribution_{\Normal}$ and $\TargetContextDistribution_{\Cauchy}$.}
    \label{fig:cauchy_vs_gaussian}
\end{figure}


Consider the point-mass environment from \cref{fig:pointmass1D} with one-dimensional context space $\CmdpContextSpace=[-7,7]$, corresponding to possible door positions. 
In \cref{fig:cauchy_vs_gaussian}, we analyze two target context distributions:
a normal distribution $\TargetContextDistribution_{\Normal}$ with mean $\NormalDistributionMean=3.5$ and standard deviation $\NormalDistributionSTD=0.7$ (\cref{fig:target_cvar} left), and
a Cauchy distribution $\TargetContextDistribution_{\Cauchy}$ with location $\CauchyDistributionLocation=3.5$ and scale $\CauchyDistributionScale=0.7$ (\cref{fig:target_cvar} right).



\begin{algorithm*}[tbp]
    \caption{Risk-aware curriculum generation (\textbf{$\RACGEN$})}
    \label{alg:racgen}
\textbf{Input}: Target context distribution $\TargetContextDistribution$\\
\textbf{Parameters}: Initial context distribution $\ContextDistribution_{0}$, performance constraint $\PerformanceConstraint$, KL divergence bound $\KLDivergenceBound$, number of curriculum iterations $\NumberOfIterations$, number of rollouts per policy update $\NumberOfRollouts$, smoothing risk level $\SmoothenedCVaRCoeff$, final risk level $\CVaRCoeff$, initial risk level $\CVaRCoeff_0$, risk level scheduling factor $\RiskLevelSchedule$\\
\textbf{Output}: Policy $\CmdpPolicy$
\begin{algorithmic}[1] %[1] enables line numbers
    \STATE Initialize policy $\CmdpPolicy$.
    \STATE $\ContextDistributionCEM_{0}\leftarrow\ContextDistribution_{0}$
    \COMMENT{initialize auxiliary context distribution}
    \FOR{$\RACGENIteration=1$ \textbf{to} $\NumberOfIterations$}
        \STATE $\{\PrimaryContext_i|\PrimaryContext_i\sim\ContextDistribution_{\RACGENIteration-1}\}_{i=1}^{\NumberOfPrimaryRollouts}$ 
        \COMMENT{sample primary contexts} \label{l:sample_primary_context}
        \STATE $\{\AuxiliaryContext_j|\AuxiliaryContext_j\sim\ContextDistributionCEM_{\RACGENIteration-1}\}_{j=1}^{\NumberOfAuxiliaryRollouts}$
        \COMMENT{sample auxiliary contexts} \label{l:sample_aux_context}
        \STATE $\PrimaryTrajectorySet_{\RACGENIteration}\leftarrow\{(\PrimaryContext_i,\Trajectory_i)|\Trajectory_i\sim\Probability_{\PrimaryContext_i}^{\CmdpPolicy}\}_{i=1}^{\NumberOfPrimaryRollouts}$
        \COMMENT{collect primary trajectories} \label{l:collect_primary_trajectories}
        \STATE $\AuxiliaryTrajectorySet_{\RACGENIteration}\leftarrow\{(\AuxiliaryContext_j,\Trajectory_j)|\Trajectory_j\sim\Probability_{\AuxiliaryContext_j}^{\CmdpPolicy}\}_{j=1}^{\NumberOfAuxiliaryRollouts}$
        \COMMENT{collect auxiliary trajectories} \label{l:collect_aux_trajectories}
        \STATE $\CmdpPolicy \leftarrow \Psi(\TrajectorySet_{\RACGENIteration},\CmdpPolicy )$, for $\TrajectorySet_{\RACGENIteration}=\PrimaryTrajectorySet_{\RACGENIteration}\cup\AuxiliaryTrajectorySet_{\RACGENIteration}$
        \COMMENT{update policy with RL algorithm $\Psi$} \label{l:update_policy}
        \STATE $\ContextDistribution_{\RACGENIteration} \leftarrow\Phi_{\TargetContextDistribution}(\CmdpPolicy,\PrimaryTrajectorySet_{\RACGENIteration},\ContextDistribution_{\RACGENIteration-1}) $
        \COMMENT{new context distribution (\ref{eq:sprl})} \label{l:new_context}
        \STATE $\Quantile\leftarrow\max\{\EstimatedAlphaQuantile[\CVaRCoeff_{\RACGENIteration-1}](\{\Return(\Trajectory)|(\context,\Trajectory)\in\PrimaryTrajectorySet_{\RACGENIteration}\}),\EstimatedAlphaQuantile[\SmoothenedCVaRCoeff](\{\Return(\Trajectory)|(\context,\Trajectory)\in\TrajectorySet_{\RACGENIteration}\})\}$
        \COMMENT{estimate quantile} \label{l:estimate_quantile}
        \STATE $\ImportanceWeightSet\leftarrow\{(\ImportanceWeight,\context,\Trajectory) |\ImportanceWeight=\nicefrac{\ContextDistribution_{\RACGENIteration}(\context)}{\ContextDistributionCEM_{\RACGENIteration}(\context)}, (\context,\Trajectory) \in \TrajectorySet_{\RACGENIteration}\}$
        \COMMENT{compute IS weights} \label{l:compute_is}
        \STATE
        $\ContextDistributionCEM_{\RACGENIteration}\leftarrow\arg\max_{\ContextDistributionCEM'}\sum_{(\ImportanceWeight,\context,\Trajectory)\in\ImportanceWeightSet}\ImportanceWeight\Indicator_{\Return(\Trajectory)\leq\AlphaQuantile[]}\log \ContextDistributionCEM'(\context)$
        \COMMENT{new auxiliary context distribution} \label{l:new_aux_context}
        \STATE $\CVaRCoeff_{\RACGENIteration}\leftarrow\max\{\CVaRCoeff,1-\nicefrac{(1-\CVaRCoeff)\RACGENIteration}{(\RiskLevelSchedule\NumberOfIterations)}\}$
        \COMMENT{apply soft risk scheduling} \label{l:soft_risk_scheduling}
    \ENDFOR
    % \STATE \textbf{return} $\CmdpPolicy$
\end{algorithmic}
\end{algorithm*}


Let us assume the reward is the negative of the exponential distance to the goal position $\CmdpRewardFunction(\CmdpState_t,\CmdpAction_t) {=} {-}\exp{\norm{\CmdpState_t {-} \mathbf{g}}_2}$, where $\mathbf{g}$ is the position of the goal.
\cref{fig:return_cvar} shows the distributions of discounted returns of the optimal policy with $\CmdpDiscount=0.95$, i.e., $\Probability_{\TargetContextDistribution,\CVaRCoeff}^{\CmdpPolicy^*}$, under normal (left) and Cauchy (right) target context distributions, respectively.
The distribution of discounted returns under $\TargetContextDistribution_{\Cauchy}$ is more spread than its counterpart under $\TargetContextDistribution_{\Normal}$.
Similarly, the expectation $\Expectation[\Return|\CmdpPolicy,\TargetContextDistribution]$ and conditional value-at-risk $\CVaR[\CVaRCoeff=0.2](\Return|\CmdpPolicy,\TargetContextDistribution)$ are further apart under the Cauchy distribution. 
\cref{fig:target_cvar} supports this observation by illustrating risky contexts, namely, contexts with returns lower than $\CVaR(\Return)$, in red and non-risky contexts in green. Risky contexts under $\TargetContextDistribution_{\Cauchy}$ pile up on the borders as we clip every sample with respect to the boundaries of the context space. In comparison, $\TargetContextDistribution_{\Normal}$ has risky contexts only under its right tail, closer to its mean.


In a multi-task setting, generalization from one task to another becomes challenging as the environment configuration changes drastically. Similarly, in the point-mass environment, generalizing the behavior learned from one context to another requires the policy to learn how the reward function and the transition function change with respect to the context.
If the contexts are further apart in the context space, the generalization will be poorer in comparison to transferring behavior to a context that is similar to the source~\citep{zhang2021learning}.

\Cref{fig:target_cvar} highlights that risky contexts can cause challenges in generalization under a Cauchy distribution, as the likely contexts and contexts under tails are quite different.
Generalization is less critical under a normal distribution, where $99.617\%$ of the samples occur in the interval between three standard deviations from the mean, i.e.,
%
$
    \mathcal{I}
    =
    [\NormalDistributionMean - 3\NormalDistributionSTD, 
    \NormalDistributionMean+3\NormalDistributionSTD].
$
%
In contrast, in a Cauchy distribution with $\CauchyDistributionLocation = 3.5$ and $\CauchyDistributionScale = 0.7$, only $35.0828\%$ of the samples fall into the interval
%
$
    \mathcal{I}
    =
    [\CauchyDistributionLocation-3\CauchyDistributionScale, \CauchyDistributionLocation+3\CauchyDistributionScale].
$

Therefore, we argue that to improve generalization under heavy-tailed context distributions, an automated curriculum learning algorithm should identify and oversample risky and rare contexts.

\subsection{Risk-aware Curriculum Generation}

We propose a risk-aware curriculum generation algorithm, $\RACGEN$, that simultaneously creates two curricula: 
\begin{enumerate*}
    \item a \emph{primary} curriculum, i.e., a sequence $\{\ContextDistribution_k\}_{k=0}^\NumberOfIterations$ of context distributions, via a self-paced RL algorithm, and \item an \emph{auxiliary} curriculum that identifies risky and rare contexts in the primary curriculum via a variant of CEM.
\end{enumerate*}

\textbf{Primary curriculum.} Given a target context distribution~$\TargetContextDistribution$, a self-paced RL algorithm \citep{klink2021probabilistic} generates a sequence of context distributions $\{\ContextDistribution_k\}_{k=0}^\NumberOfIterations$  by optimizing \cref{eq:sprl}.

\textbf{Auxiliary curriculum.} Upon generating the next primary context distribution $\ContextDistribution_{\RACGENIteration}$ at iteration $\RACGENIteration$ of the primary curriculum, the auxiliary curriculum outputs the next auxiliary context distribution $\ContextDistributionCEM_{\RACGENIteration}$. We propose a CEM variant that achieves this by solving \cref{eq:CEM} given the current risk-level $\CVaRCoeff_{\RACGENIteration}$ and the primary context distribution $\ContextDistribution_{\RACGENIteration}$, which corresponds to the reference context distribution in \cref{alg:cem}.
\begin{figure*}[tbp]
    \centering
    \hfill 
    \begin{subfigure}{0.47\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures/pm2d_ht_w_racgen_pri_contexts.pdf}
        \caption{$\RACGEN$'s primary context distributions $\ContextDistribution_{\RACGENIteration}$.}
        \label{fig:pm2d_ht_w_racgen_pri}
    \end{subfigure}\hfill
    \begin{subfigure}{0.47\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures/pm2d_ht_w_racgen_aux_contexts.pdf}
        \caption{$\RACGEN$'s auxiliary context distributions $\ContextDistributionCEM_{\RACGENIteration}$.}
        \label{fig:pm2d_ht_w_racgen_aux}
    \end{subfigure}
    \hfill~
    \caption{Point-mass environments from contexts sampled at iterations $\RACGENIteration\in\{0, 20, 75, 195\}$, which determine the position and the width of the door.
    Figures \ref{fig:pm2d_ht_w_racgen_pri} and \ref{fig:pm2d_ht_w_racgen_aux} demonstrate how primary and auxiliary contexts (green and red dots, respectively) evolve during the training.
    The shade of a dot indicates the curriculum iteration, with darker shades corresponding to later iterations.}
\end{figure*}

\cref{alg:racgen} presents the pseudocode for the $\RACGEN$ method. In summary, at each iteration, the algorithm generates trajectories based on contexts sampled from the primary and auxiliary curricula and updates the policy and the two curricula.
More specifically, at iteration $\RACGENIteration$, $\RACGEN$ samples $M^{pri}$-many primary $\PrimaryContext_i$ and $M^{aux}$-many auxiliary $\AuxiliaryContext_j$ contexts from the current primary $\ContextDistribution_{\RACGENIteration-1}$ and auxiliary $\ContextDistributionCEM_{\RACGENIteration-1}$ distributions, respectively (\cref{l:sample_primary_context,l:sample_aux_context}).
Then, rolling out policy $\CmdpPolicy$, it collects two sets of trajectories: primary $\PrimaryTrajectorySet_{\RACGENIteration}$ and auxiliary $\AuxiliaryTrajectorySet_{\RACGENIteration}$ (\cref{l:collect_primary_trajectories,l:collect_aux_trajectories}).
The union $\TrajectorySet_{\RACGENIteration}$ of these sets is used to update policy $\CmdpPolicy$ via an RL algorithm of choice (\cref{l:update_policy}). To generate the next primary context distribution $\ContextDistribution_{\RACGENIteration}$, $\RACGEN$ optimizes \cref{eq:sprl} with the primary trajectory set $\PrimaryTrajectorySet_{\RACGENIteration}$ (\cref{l:new_context}), which completes the primary curriculum update.
The auxiliary curriculum update begins with estimating a risk quantile $\AlphaQuantile[]$ (\cref{l:estimate_quantile}).
Following \citeauthor{greenberg2022efficient}'s approach \citep{greenberg2022efficient}, $\RACGEN$ uses a smooth quantile update (\cref{l:estimate_quantile}). 
Then, $\RACGEN$ computes IS weights of sampled context-trajectory pairs $(\context,\Trajectory)\in\TrajectorySet_{\RACGENIteration}$ (\cref{l:compute_is}).
Finally, it generates the next auxiliary context distribution~$\ContextDistributionCEM_{\RACGENIteration}$ using the estimated quantile $\AlphaQuantile[]$ (\cref{l:new_aux_context}). Note that this optimization problem has a closed-form solution for some probability distributions, such as a normal or a Cauchy distribution.

\textbf{Soft-risk scheduling.}
$\RACGEN$ uses soft-risk scheduling to linearly decrease $\CVaRCoeff_k$ from an initial risk level $\CVaRCoeff_0$ to a final risk level $\CVaRCoeff\le\CVaRCoeff_0$.
Originally, \citet{greenberg2022efficient} propose soft-risk scheduling for CVaR policy gradient algorithms to enable policies to learn in contexts with high returns.
In contrast, the soft-risk scheduling in $\RACGEN$ allows the generation of auxiliary context distributions that focus on contexts with high returns at first, which enables faster learning in the initial phase of training. Then, as $\CVaRCoeff_k$ decreases, risky and rare contexts become the focal point of the auxiliary curriculum. Our empirical results provide evidence that soft-risk scheduling facilitates not only a faster performance increase, but also higher returns at the end of training.

\section{Empirical Results}

We set up experiments in \textbf{two domains} to investigate the benefits of $\RACGEN$ under heavy-tailed target context distributions.
We demonstrate the evolution of the primary and auxiliary curricula.
Furthermore, we consider \textbf{two performance metrics}: %the progression of the expected discounted return with respect to the target context distribution, and the distribution of discounted returns for the final policies.
\begin{enumerate*}
\item the distribution of discounted returns ($\Return(\Trajectory)$) with respect to the target context distribution  and \item its expectation ($\CRLExpectation$).
\end{enumerate*}
We compare $\RACGEN$ with \textbf{six state-of-the-art algorithms} for automated curriculum generation: 
    $\CURROT$~\citep{klink2022curriculum}, 
    $\SPDL$~\citep{klink2021probabilistic},
    $\PLR$~\citep{jiang2021prioritized},
    $\VDS$~\citep{NEURIPS2020_566f0ea4},
    $\GOALGAN$~\citep{florensa2018automatic}, and
    $\ALPGMM$~\citep{portelas2020teacher}.
\cref{supp-app:algorithms}
provides more details about each algorithm.
Finally, we include \textbf{two baseline methods}: $\DEFAULT$ and $\DEFAULTCEM$. 
$\DEFAULT$ draws contexts from the target context distribution without generating a curriculum. $\DEFAULTCEM$ extends $\DEFAULT$ with an auxiliary curriculum generated by inputting the target context distribution to CEM (\cref{alg:cem}).
These baselines serve as ablation studies to understand whether generating a curriculum indeed boosts learning performance and speed, and whether targeting rare and risky contexts without a primary curriculum is sufficient, respectively.

\subsection{Point-mass Environment}

We begin with a point-mass environment (\cref{fig:pointmass1D}) that has a two-dimensional context space $\CmdpContextSpace=[-7,7]\times[0.5,14]$, where the context determines the position and the width of the door. \citet{klink2020self_SPCRL,NEURIPS2020_68a97503,klink2021probabilistic} study settings where the target context distribution is Gaussian and narrow, i.e., has a small variance, whereas \citet{klink2022curriculum} focus on a bi-modal target context distribution with small variance around each mode. In contrast, the target context distribution in our experimental setting is a Cauchy distribution with location $l=(3.5, 0.5)$ and scale $s=\textrm{diag}(0.7^2, 0.5^2)$.
The initial context distribution is a Cauchy distribution with location $l=(0, 4.25)$ and scale $s=\textrm{diag}(2^2, 1.875^2)$.
\cref{supp-app:hyperparameters} provides more details.
The code to reproduce the experiments is available online\footnote{ \url{https://github.com/cevahir-koprulu/risk-aware-curriculum-generation}}.


\paragraph{Curriculum generation.} 
\cref{fig:pm2d_ht_w_racgen_pri,fig:pm2d_ht_w_racgen_aux} show the progress of the primary and auxiliary context distributions during training, respectively.
The primary curriculum starts sampling easy contexts where the door is in the center of the room, and its width is high.
The auxiliary curriculum follows a similar pattern as $\CVaRCoeff_{0}=1$, which prevents it from identifying risky and rare contexts, as the objective is to allow the agent to learn the task as quickly as possible.
As the training continues, the primary curriculum generates context distributions $\ContextDistribution_{\RACGENIteration}$ that approach the target context distribution $\TargetContextDistribution$.
In comparison, as $\CVaRCoeff_{\RACGENIteration}$ decreases linearly, the auxiliary curriculum outputs context distributions $\ContextDistributionCEM_{\RACGENIteration}$ that identify the rare and risky contexts under the tails of their corresponding primary context distributions.
\cref{fig:pm2d_ht_w_racgen_aux} validates this argument as the last auxiliary contexts (darker shades) are centered around the context $\context\approx(5.4,0.9)$, approximately 3 median absolute deviations away from the median of the target context distribution along the $x$-axis~(door position).

\begin{figure}[tbp]
    \centering
    \includegraphics[width=.47\textwidth]{figures/pm2d_ht_w_expected_disc_return_progression.pdf}
    \caption{Expected discounted return with respect to the target context distribution in the point-mass environment.
    % We visualize the progression of the expected discounted return obtained by policies of different SOTA and baseline methods.
    The bold lines show the median and the shaded regions cover the first and third quartiles of 10 independent runs.}
    \label{fig:pm2d_ht_w_exp_disc_return}
\end{figure}

\paragraph{Performance progression.}
% \cref{fig:pm2d_ht_w_exp_disc_return} illustrates the progression of the performance measure by the \textit{expected discounted return with respect to the target context distribution}.
\cref{fig:pm2d_ht_w_exp_disc_return} shows the progression of the expected discounted return with respect to the target context distribution during training.
We introduce two algorithms in this experiment: $\RACGENN$ and $\SPDLN$, which generate normal context distributions only. We evaluate these algorithms because the original $\SPDL$ algorithms have a Gaussian assumption \citep{klink2020self_SPCRL,NEURIPS2020_68a97503,klink2021probabilistic}.
In contrast, $\RACGEN$ and $\SPDL$ assume the target distribution is Cauchy.
We observe that although $\DEFAULT$ and $\DEFAULTCEM$ achieve higher returns faster than other algorithms, they stop improving at the early phases of the training.
$\RACGEN$ attains the highest expected returns and even continues to improve toward the end of the training.
$\CURROT$ and $\RACGENN$ perform similarly, despite the fact that $\CURROT$ has no risk-aware mechanism.
$\SPDL$ and $\SPDLN$ also achieve similar expected discounted returns, though \cref{fig:pm2d_ht_w_return_dist} shows that $\SPDL$ performs slightly better due to having the correct assumption about the type of the context distributions.
$\ALPGMM$, $\PLR$, $\VDS$, and $\GOALGAN$ fail to learn policies that receive higher expected returns than $\DEFAULT$ and $\DEFAULTCEM$ in the median.

\paragraph{Final performance.}
\cref{fig:pm2d_ht_w_return_dist} shows that $\RACGEN$ outperforms all state-of-the-art algorithms, their variations, and the baselines in the experiment.
Furthermore, it achieves returns that are significantly higher than the returns of all algorithms according to a Welch's t-test with $p<0.0001$. 
    
\begin{figure}[tbp]
    \centering
    \includegraphics[width=\columnwidth]{figures/pm2d_ht_w_final_return_dist_without_pvals.pdf}
    \caption{Distribution of the discounted return with respect to contexts drawn from the target context distribution in the point-mass environment over 10 independent training runs.
    Box plots show the minimum, the first quartile, the median, the third quartile, and the maximum of all returns, from bottom-to-top, whereas the rhombus data samples correspond to outlier values.
    % The horizontal lines on top indicate that the discounted returns obtained by two algorithms are significantly different according to a Welch's t-test with $p<0.0001$.
    }
    \label{fig:pm2d_ht_w_return_dist}
\end{figure}

\subsection{Lunar Lander Environment}

In the lunar lander environment \citep{1606.01540}, the agent must land a pod on planets with varying gravity and wind disturbances.
We consider a context space $\CmdpContextSpace = [-12, -0.01] \times [0,10]$ that determines the gravity and wind power.
We use a Cauchy target context distribution with location $l=(-7,5)$ and scale $s=\textrm{diag}(1,1)$. The initial context distribution is a Cauchy distribution with location $l=(-3.7, 0.)$ and scale $s=\textrm{diag}(0.25^2, 0.25^2)$, where the median corresponds to a no-wind condition on Mars. More details are provided in \cref{supp-app:env}.
Our analysis focuses on the final performance (\cref{fig:ll2d_ht_w_return_dist,fig:ll2d_ht_w_fractions}), and \cref{supp-app:detailed_analysis} provides the training curves.

\paragraph{Final performance.} \cref{fig:ll2d_ht_w_return_dist} demonstrates the distribution of the discounted return obtained by the final policies in contexts drawn from the target context distribution.
By identifying rare and risky contexts via a CEM module, $\RACGEN$ and $\DEFAULTCEM$ achieve discounted return distributions significantly higher than $\DEFAULT$ and the rest with $p<0.01$ and $p<0.0001$, respectively, according to Welch's $t$-test.
$\RACGEN$ has a higher median and a tighter range than $\DEFAULTCEM$. In addition, low outlier values are not as spread out as in $\DEFAULTCEM$, which is an informative observation because outlier low returns particularly occur in rare and risky contexts. Therefore, $\RACGEN$ is advantageous over $\DEFAULTCEM$ by generating a primary curriculum in addition to an auxiliary curriculum.

\paragraph{Performance in risky contexts.}
We also note that $\RACGEN$ does not achieve the highest maximum discounted return. This is likely because $\RACGEN$ may overlook more trivial contexts by allocating a portion of its sample budget to auxiliary ones, which are non-trivial and yield low returns.
In addition, with an average success rate of 65\%, the lunar lander is a challenging domain under the given target context distribution, which likely causes policies trained via $\RACGEN$ to attend more to risky contexts.
Nevertheless, in terms of first and third quartiles, median, and minimum values, we conclude that $\RACGEN$ outperforms the state-of-the-art methods and the baselines in this environment under a heavy-tailed target context distribution.

\paragraph{Performance profiles.} \cref{fig:ll2d_ht_w_fractions} further demonstrates that \RACGEN achieves higher returns in high and medium-risk contexts than the remaining methods.
The figure shows the fraction of contexts ($y$-axis) where an algorithm learns a policy that achieves a return higher than the return $\mathbf{r}$ ($x$-axis).
The curves show the median over $5$ runs.
First, we notice that \RACGEN almost always achieves returns higher than $-46$, with \DEFAULT following closely and the rest achieving lower returns in high-risk contexts.
At $\mathbf{r}=-30$, \DEFAULT starts to perform worse than \RACGEN, which supports our previous argument that \RACGEN achieves the highest minimum returns.
The curve of \RACGEN stays on top until $\mathbf{r}=62$, which demonstrates that \RACGEN performs the best in most of the contexts. However, as we previously discussed for \cref{fig:ll2d_ht_w_return_dist}, \RACGEN does not yield the highest returns in low-risk contexts since its curve goes under the others in terms of the portion of contexts with high returns, more specifically for returns $\mathbf{r}\in[62,74]\cup[82,100]$.


\begin{figure}[tbp]
    \centering
    \includegraphics[width=\columnwidth]{figures/ll2d_ht_w_final_return_dist_without_pvals.pdf}
    \caption{Distribution of the discounted return with respect to contexts drawn from the target context distribution in the lunar lander environment over 5 independent training runs.
    % We visualize the progression of the expected discounted return obtained by policies of different SOTA and baseline methods.
    %Box plots show the minimum, the first quartile, the median, the third quartile, and the maximum of all returns, from bottom-to-top, whereas the rhombus data samples correspond to outlier values.
    }
    \label{fig:ll2d_ht_w_return_dist}
\end{figure}


\section{Conclusions}

In this paper, we investigate how to generate curricula in a multi-task setting where the task distribution has a heavy tail.
We propose the risk-aware curriculum generation method (\RACGEN) that oversamples rare and risky tasks, improving the agent's performance in such tasks.
Our empirical evaluation shows that, under a heavy-tailed task distribution, \RACGEN outperforms state-of-the-art curriculum generation methods that do not take heavy-tailed distributions into account.
Furthermore, \RACGEN has a fast convergence rate, comparable to the state-of-the-art curriculum generation methods, despite deliberately sampling risky tasks.

\paragraph{Limitations.} The algorithms that \RACGEN employs to generate primary and auxiliary curricula, \SPDL and CEM, respectively, search over a fixed parametric family of distributions. Therefore, \RACGEN is limited to producing primary and auxiliary context distributions of the same parametric type. Given an arbitrary target context distribution, \RACGEN needs to assume that it belongs to a certain parametric family to generate primary and auxiliary context distributions. Therefore, it is likely that the likelihood of some primary or auxiliary contexts would be over- or under-estimated. As a result, \RACGEN may return sub-optimal policies.

\paragraph{Future Work.} We are planning to extend \RACGEN to address arbitrary target context distributions. \CURROT addresses this limitation of \SPDL by replacing KL divergence with Wasserstein distance. Similarly, the generalized version of CEM \citep{botev2011generalized} extends CEM to arbitrary distributions. We can combine \CURROT and the generalized CEM to tackle the limitations of \RACGEN.

\begin{figure}[tbp]
    \centering
    \includegraphics[width=\columnwidth]{figures/ll2d_ht_w_fractions_final_returns.pdf}
    \caption{Performance profiles of evaluated algorithms in the lunar lander environment: the fraction of episodes where the final policies achieve discounted returns greater than $\mathbf{r}$.
    It presents the median over 5 independent training runs.
    }
    \label{fig:ll2d_ht_w_fractions}
\end{figure}

\begin{acknowledgements} % will be removed in pdf for initial submission,
						 % (without ‘accepted’ option in \documentclass)
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
This work is supported by
the Office of Naval Research (ONR) under grant number N00014-22-1-2254,
the National Science Foundation (NSF) under grant number 1646522,
the European Research Council (ERC) under the starting grant 101077178 (DEUCE),
and the Dutch Research Council (NWO) under the grant NWA.1160.18.238 (PrimaVera). 
\end{acknowledgements}

\nocite{stable-baselines3,Schulmanetal_ICLR2016}
% References
\bibliography{koprulu_731}

\end{document}
