% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
% \documentclass[underreview]{uai2023} % after submission
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}
\usepackage{balance}

%% Some suggested packages, as needed:
\usepackage[numbers]{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams


% NEW
\usepackage{subcaption}
% \usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{algorithm}
\usepackage[noend]{algorithmic}
\usepackage{xcolor}
\usepackage{xspace}
\usepackage{nicefrac}
% Label enumerate items as "1)", "2)", ...
% NOTE(review): \setlist is an enumitem command and enumitem is not loaded in
% this preamble -- presumably the uai2023 class provides it; confirm.
\setlist[enumerate]{label={\arabic*)}}

% Dark gray (80% black in CMYK) used for algorithm comments.
% Note: this redefines the color name `gray' that xcolor predefines.
\definecolor{gray}{cmyk}{0,0,0,0.8}
% Algorithm comments: pushed to the right, set small, gray and italic, and
% introduced by a triangle. The extra group confines \small to the comment
% itself so the size change cannot leak into the statements that follow.
\renewcommand{\algorithmiccomment}[1]{%
  \hfill{\small\textcolor{gray}{\textit{$\triangleright$ #1}}}%
}
% \usepackage{todos}
\usepackage[capitalise,nameinlink]{cleveref}
% \usepackage[capitalise,noabbrev,nameinlink]{cleveref}
% \creflabelformat{equation}{#2\textup{#1}#3} % changes Equation (2) to Equation 2. This way we can use (\cref{eq:2}) without double parenthesis



%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[sep]{a}{b} typesets "b sep a": the two mandatory arguments in reversed
% order, joined by the optional separator (default "-").
\newcommand{\swap}[3][-]{#3#1#2} % just an example

% Limit-style operators (starred form: subscripts go underneath in display math).
% "arg\,max" / "arg\,min" use a thin space, the conventional spacing for these
% operator names, rather than a full interword space.
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
% "s.t." padded with a non-breaking space on each side.
\DeclareMathOperator*{\st}{~s.t.~}


\title{Risk-Aware Curriculum Generation for Heavy-Tailed Task Distributions \\ (Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{
\href{mailto:<cevahir.koprulu@utexas.edu>?Subject=Your UAI 2023 paper}{Cevahir~Koprulu}
% Cevahir~Koprulu
}
\author[2]{Thiago~D.~Simão}
\author[2]{Nils~Jansen}
\author[1]{Ufuk~Topcu}
% Add affiliations after the authors
\affil[1]{%
    % Computer Science Dept.\\
    % Cranberry University\\
    % Pittsburgh, Pennsylvania, USA
    University of Texas at Austin
}
\affil[2]{%
Radboud University, Nijmegen
    % Second Affiliation\\
    % Address\\
    % …
}
  
% \input{nomenclature.tex}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%% NOMENCLATURE %%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Environments
% Theorem-like environment `definition', headed "Definition" and numbered with
% its own counter (no shared or parent counter is given).
\newtheorem{definition}{Definition}

% General mathematical notation.
% Parameterless symbols are declared with the starred \newcommand* form.

% -- Number sets
\newcommand*{\Reals}{\ensuremath{\mathbb{R}}}
\newcommand*{\PositiveIntegers}{\ensuremath{\mathbb{Z}^+}}

% -- Probability
% \newcommand{\Expectation}{\ensuremath{\mathbb{E}}}
\DeclareMathOperator*{\Expectation}{\mathbb{E}} % operator form; limits go underneath in display math
\newcommand*{\Probability}{\mathbb{P}}
\newcommand*{\CDF}{F}
\newcommand*{\Indicator}{\mathbf{1}}

% -- Normal distribution: symbol, mean, standard deviation
\newcommand*{\NormalDistribution}{\mathcal{N}}
\newcommand*{\NormalDistributionMean}{\mu}
\newcommand*{\NormalDistributionSTD}{\sigma}

% -- Cauchy distribution: density, location, scale, random variable
\newcommand*{\CauchyDistributionPDF}{f}
\newcommand*{\CauchyDistributionLocation}{l}
\newcommand*{\CauchyDistributionScale}{s}
\newcommand*{\CauchyDistributionRandomVariable}{X}

% -- Miscellaneous
\DeclarePairedDelimiter{\norm}{\lVert}{\rVert} % \norm{x}; \norm*{x} auto-sizes the bars
\newcommand*{\ProbSimplex}{\Delta}
\newcommand*{\RandomVariable}{X}
\newcommand*{\RandomVariableSample}{x}

% Contextual MDP notation.
% All symbols are wrapped in \ensuremath, so they work in text and math mode.

% -- The model and its mapping
\newcommand*{\context}{\ensuremath{\mathbf{c}}}
\newcommand*{\Cmdp}{\ensuremath{\mathcal{M}}}
\newcommand*{\CmdpMapping}{\ensuremath{\mathsf{M}}}

% -- Spaces (calligraphic)
\newcommand*{\CmdpStateSpace}{\ensuremath{\mathcal{S}}}
\newcommand*{\CmdpActionSpace}{\ensuremath{\mathcal{A}}}
\newcommand*{\CmdpContextSpace}{\ensuremath{\mathcal{C}}}

% -- Elements (bold state/action, scalar reward)
\newcommand*{\CmdpState}{\ensuremath{\mathbf{s}}}
\newcommand*{\CmdpAction}{\ensuremath{\mathbf{a}}}
\newcommand*{\CmdpReward}{\ensuremath{r}}

% -- Context-indexed functions
\newcommand*{\CmdpTransitionFunction}{\ensuremath{p_{\context}}}
% The optional argument sets the subscript; it defaults to \context.
\newcommand{\CmdpRewardFunction}[1][\context]{\ensuremath{r_{#1}}}
\newcommand*{\CmdpInitialDistribution}{\ensuremath{p_{0,\context}}}

% -- Policy and discount factor
\newcommand*{\CmdpPolicy}{\ensuremath{\pi}}
\newcommand*{\CmdpDiscount}{\ensuremath{\gamma}}


% Contextual RL notation.
\newcommand*{\ValueFunction}{\ensuremath{V_{\CmdpPolicy}}}     % V subscripted with the policy symbol
\newcommand*{\TargetContextDistribution}{\ensuremath{\varphi}}
% J(policy, distribution); the optional argument replaces the distribution,
% which defaults to \TargetContextDistribution.
\newcommand{\CRLExpectation}[1][\TargetContextDistribution]{\ensuremath{J(\CmdpPolicy,#1)}}
\newcommand*{\Trajectory}{\ensuremath{\boldsymbol{\tau}}}
\newcommand*{\Return}{\ensuremath{G}}
\newcommand*{\ReturnSample}{g}

% Self-paced RL notation (math mode only; no \ensuremath wrappers).

% -- Context distribution and its parameter
\newcommand*{\ContextDistribution}{\varrho}
\newcommand*{\ContextDistributionParameter}{\nu}

% -- KL divergence and the two thresholds
\newcommand*{\KLDivergence}{D_{\text{KL}}}
\newcommand*{\KLDivergenceBound}{\epsilon}      % KL divergence threshold
\newcommand*{\PerformanceConstraint}{\delta}    % performance constraint threshold

% -- Iterations, rollouts, collected trajectories
\newcommand*{\NumberOfIterations}{K}            % number of curriculum iterations
\newcommand*{\SPRLIteration}{k}
\newcommand*{\NumberOfRollouts}{M}              % number of rollouts per policy update
\newcommand*{\TrajectorySet}{\mathcal{D}}

% Risk notation.
\newcommand*{\CVaRCoeff}{\alpha}
% Both macros below take the subscript as an optional argument that defaults
% to \CVaRCoeff.
\newcommand{\CVaR}[1][\CVaRCoeff]{\text{CVaR}_{#1}}
\newcommand{\AlphaQuantile}[1][\CVaRCoeff]{q_{#1}}
\newcommand*{\FilterCVaR}{\ensuremath{\mathfrak{F}}} % fraktur F

% CEM notation.
\newcommand*{\CEMNumberOfIteration}{I}
\newcommand*{\BatchSize}{N}
\newcommand*{\ContextDistributionCEM}{\Tilde{\ContextDistribution}} % \ContextDistribution with a tilde accent
\newcommand*{\ImportanceWeight}{\omega}
\newcommand*{\SmoothenedCVaRCoeff}{\beta}                           % smoothing risk level
\newcommand*{\SetOfReturns}{\mathcal{\Return}}                      % calligraphic form of \Return

% RaCGEN notation.
% The method name is set in small caps; \xspace supplies the following space.
\newcommand*{\RACGEN}{\textsc{RaCGEN}\xspace}
\newcommand*{\RACGENIteration}{\SPRLIteration} % same symbol as the self-paced RL iteration

% -- Primary / auxiliary variants: the base symbol with an italic
%    "pri" or "aux" superscript
\newcommand*{\PrimaryTrajectorySet}{\mathcal{D}^{\textit{pri}}}
\newcommand*{\AuxiliaryTrajectorySet}{\mathcal{D}^{\textit{aux}}}
\newcommand*{\PrimaryContext}{\context^{\textit{pri}}}
\newcommand*{\AuxiliaryContext}{\context^{\textit{aux}}}
\newcommand*{\NumberOfPrimaryRollouts}{\NumberOfRollouts^{\textit{pri}}}
\newcommand*{\NumberOfAuxiliaryRollouts}{\NumberOfRollouts^{\textit{aux}}}
\newcommand*{\PrimaryImportanceWeight}{\ImportanceWeight^{\textit{pri}}}
\newcommand*{\AuxiliaryImportanceWeight}{\ImportanceWeight^{\textit{aux}}}

% -- Importance weights and quantiles
\newcommand*{\ImportanceWeightSet}{\Omega}
% Hatted quantile; the optional subscript defaults to \CVaRCoeff.
\newcommand{\EstimatedAlphaQuantile}[1][\CVaRCoeff]{\hat{q}_{#1}}
\newcommand*{\Quantile}{q}
% \newcommand{\SmoothenedCVaRCoeff}{\beta}
\newcommand*{\RiskLevelSchedule}{\rho}         % risk level scheduling factor

% State-of-the-art methods and baselines.

% -- Method names: small caps, with \xspace supplying the following space
\newcommand*{\GOALGAN}{\textsc{GoalGAN}\xspace}
\newcommand*{\ALPGMM}{\textsc{ALP-GMM}\xspace}
\newcommand*{\PLR}{\textsc{PLR}\xspace}
\newcommand*{\VDS}{\textsc{VDS}\xspace}
\newcommand*{\SPDL}{\textsc{SPDL}\xspace}
\newcommand*{\CURROT}{\textsc{CURROT}\xspace}
\newcommand*{\DEFAULT}{\textsc{Default}\xspace}
\newcommand*{\DEFAULTCEM}{\textsc{Default-CEM}\xspace}
\newcommand*{\SPDLN}{\textsc{SPDL-N}\xspace}
\newcommand*{\RACGENN}{\textsc{RaCGEN-N}\xspace}
\newcommand*{\PPO}{\textsc{PPO}\xspace}

% -- Distribution names in upright roman
\newcommand*{\Normal}{\textrm{Normal}\xspace}
\newcommand*{\Cauchy}{\textrm{Cauchy}\xspace}

% -- CURROT: performance constraint threshold, Wasserstein distance threshold
\newcommand*{\CURROTPerf}{\delta}
\newcommand*{\CURROTWass}{\epsilon_{\text{Wass}}}

% -- PLR: staleness coefficient, score temperature, replay probability
\newcommand*{\PLRStale}{\rho}
\newcommand*{\PLRTemp}{\beta}
\newcommand*{\PLRReplay}{p}

% -- VDS: learning rate, number of epochs, number of minibatches
\newcommand*{\VDSLR}{\text{LR}}
\newcommand*{\VDSEpoch}{n_{\text{ep}}}
\newcommand*{\VDSBatch}{n_{\text{batch}}}

% -- GoalGAN: context noise, rollouts between updates, success-buffer share
\newcommand*{\GOALGANNoise}{\delta_{\text{noise}}}
\newcommand*{\GOALGANRollout}{n_{\text{rollout}}^{\text{GG}}}
\newcommand*{\GOALGANSuccess}{p_{\text{success}}}

% -- ALP-GMM: random-sampling ratio, episodes between updates, buffer size
\newcommand*{\ALPGMMRandom}{p_{\text{rand}}}
\newcommand*{\ALPGMMRollout}{n_{\text{rollout}}^{\text{AG}}}
\newcommand*{\ALPGMMBuffer}{s_{\text{buffer}}}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%% NOMENCLATURE %%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{document}
\onecolumn
\appendix
\maketitle


\section{Automated Curriculum Generation Algorithms}
\label{app:algorithms}

This section summarizes the automated curriculum generation methods considered in the empirical evaluation.

\begin{description}
    \item 
    [\SPDL~\citep{klink2021probabilistic}:]
    We use \emph{Self-paced Deep Reinforcement Learning} to generate the primary curriculum. The main document provides more details.
    
    \item 
    [\CURROT~\citep{klink2022curriculum}:]  
    \emph{Curriculum RL via Constrained Optimal Transport} formulates the curriculum generation problem as constrained optimal transport by generating a context distribution that minimizes the Wasserstein distance to the target context distribution and satisfies two constraints: the discounted return should be higher than some pre-determined threshold in every context with non-zero probability and the Wasserstein distance to the previous context distribution should be lower than some distance threshold.
    
    \item 
    [\PLR~\citep{jiang2021prioritized}:]
    \emph{Prioritized Level Replay} addresses procedural context generation environments, where a \emph{level} is an algorithmically created unique environment instance. \PLR samples the next training level by prioritizing the ones with a higher average magnitude of the generalized advantage estimate~\citep{Schulmanetal_ICLR2016}, that is, the discounted sum of all temporal-difference errors occurring in the future.
    
    \item 
    [\VDS~\citep{NEURIPS2020_566f0ea4}:]
    \emph{Value Disagreement based Sampling} addresses the goal-conditioned setting and uses the epistemic uncertainty of the value function to sample goals. Intuitively, the value function confidently assigns low and high values to hard and easy goals, respectively, but it is uncertain about the values of the goals that are at the boundary of the current policy's ability. To generate a curriculum, $\VDS$ samples the goals for which the value function has high epistemic uncertainty.
    
    \item 
    [\GOALGAN~\citep{florensa2018automatic}:]
    \emph{Goal Generative Adversarial Network} also addresses the goal-conditioned setting. The proposed approach uses a goal discriminator to determine whether a goal is of intermediate difficulty for the current policy, and a goal generator that generates goals that are at such a level of difficulty.
    
    \item 
    [\ALPGMM~\citep{portelas2020teacher}:]
    \emph{Absolute Learning Progress with Gaussian Mixture Models} generates a Gaussian mixture model over the absolute learning progress of task parameters, e.g., contexts in our setting, and uses a bandit scheme to choose a Gaussian as an arm whose utility is the absolute learning progress. The chosen Gaussian distribution is used to draw the next task parameter. 
    
\end{description}

\section{Experimental Details}
\label{app:experimental_details}

This section discusses the hyperparameter selection process for the evaluated curriculum generation algorithms and additional details regarding the environments used in the experiments.

\subsection{Algorithm Hyperparameters}
\label{app:hyperparameters}

$\RACGEN$, $\RACGENN$, $\SPDL$, and $\SPDLN$ generate primary curricula via the self-paced RL framework, which has four parameters: performance constraint threshold $\PerformanceConstraint$, KL divergence threshold $\KLDivergenceBound$, number of curriculum iterations $\NumberOfIterations$, and number of rollouts per policy update $\NumberOfRollouts$. For every environment, we chose $\PerformanceConstraint$ to be around the midpoint between the minimum and maximum possible discounted return. To select the KL divergence threshold $\KLDivergenceBound$, we ran a grid search over $\{0.5,0.25, 0.1\}$ for the point-mass environment and $\{0.1,0.05,0.01\}$ for the lunar lander environment. We selected the values that yield the best-performing $\RACGEN$ curricula, and used the same values for $\RACGENN$, $\SPDL$, and $\SPDLN$. For the point-mass environment, we set $\NumberOfIterations=300$, based on the experiments of \citet{klink2021probabilistic} in the same environment, and set $\NumberOfRollouts=30$ after a search over $\{30,40\}$, similar to the previous grid search. For the lunar lander environment, we set $\NumberOfIterations=250$, and ran a grid search for $\NumberOfRollouts$ over $\{30, 40\}$ and chose 40 based on the best-performing $\RACGEN$ curriculum. \Cref{tab:sprl_parameters} lists the selected values of $\PerformanceConstraint$ and $\KLDivergenceBound$.

\begin{table}[tbp]
    \centering
    \caption{Self-paced RL parameter values used in $\RACGEN$, $\RACGENN$, $\SPDL$, and $\SPDLN$.}
    \begin{tabular}{lcc}
        \hline
        Environment  & $\PerformanceConstraint$ & $\KLDivergenceBound$ \\
        \hline
        PointMass-2D     & 4.0    & 0.25 \\
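        LunarLander-2D     & $-$100     & 0.025  \\
        % NOTE(review): 0.025 is not in the grid {0.1, 0.05, 0.01} that the text reports for the lunar lander environment -- confirm which is correct.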
        \hline
    \end{tabular}
    \label{tab:sprl_parameters}
\end{table}

\begin{table}[tbp]
    \centering
    \caption{Selected values for parameters of \CURROT, \PLR, and \VDS.}
    \begin{tabular}{lcccccccc}
        \hline
        Environment  & $\CURROTPerf$ & $\CURROTWass$ & $\PLRStale$ & $\PLRTemp$ & $\PLRReplay$ & $\VDSLR$ & $\VDSEpoch$ &  $\VDSBatch$\\
        \hline
        PointMass-2D     & 4.0 & 0.5 & 0.45 & 0.15 & 0.85 & 0.01 & 5 & 40\\
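        LunarLander-2D   & $-$50 & 0.5 & 0.45 & 0.15 & 0.85 & 0.01 & 3 & 40 \\
        % NOTE(review): the VDS learning rate 0.01 (both rows) is outside the grid {0.0001, 0.001} reported in the text -- confirm.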
        \hline
    \end{tabular}
    \label{tab:params1}
\end{table}

\begin{table}[tbp]
    \centering
    \caption{Selected values for parameters of \GOALGAN and \ALPGMM.}
    \begin{tabular}{lcccccc}
        \hline
        Environment  & $\GOALGANNoise$ & $\GOALGANRollout$ & $\GOALGANSuccess$ & $\ALPGMMRandom$ & $\ALPGMMRollout$ & $\ALPGMMBuffer$\\
        \hline
        PointMass-2D     & 0.05 & 200 & 0.1 & 0.1 & 50 & 500\\
        LunarLander-2D   & 0.1 & 200 & 0.2 & 0.1 & 100 & 500 \\
        \hline
    \end{tabular}
    \label{tab:params2}
\end{table}

$\RACGEN$, $\RACGENN$, and $\DEFAULTCEM$ use a CEM module to generate auxiliary curricula. There are four parameters that CEM utilizes: smoothing risk level $\SmoothenedCVaRCoeff$, final risk level $\CVaRCoeff$, initial risk level $\CVaRCoeff_0$, risk level scheduling factor $\RiskLevelSchedule$. We illustrate the pitfalls of heavy-tailed target context distributions based on a point-mass environment with 1D context space, where we set $\CVaRCoeff=0.2$. We use this value for both environments in the experiments. Based on the discussion in \citet{greenberg2022efficient}, we set $\SmoothenedCVaRCoeff=0.5$ to care for the mean of the lower 50\% of the samples. We set $\CVaRCoeff_0=1$ to start soft-risk scheduling by identifying contexts that have returns lower than the expectation of the returns. We compute $\RiskLevelSchedule$ to have a certain number of curriculum updates until $\CVaRCoeff$ is reached. We ran grid searches over $\{10,15,20\}$ and $\{40, 80, 120\}$ for point-mass and lunar-lander environments, respectively. Based on the best performing $\RACGEN$, we set $\RiskLevelSchedule$ so that it decays to $\CVaRCoeff$ from 1.0 in 20 and 80 curriculum updates in point-mass and lunar lander environments, respectively, for methods $\RACGEN$, $\RACGENN$, and $\DEFAULTCEM$. The algorithms using a CEM module also need $\NumberOfPrimaryRollouts$ and $\NumberOfAuxiliaryRollouts$ to be set. As $\NumberOfRollouts=\NumberOfPrimaryRollouts+\NumberOfAuxiliaryRollouts$, we set $\NumberOfAuxiliaryRollouts$ to 20 and 10 for point-mass and lunar-lander environments, respectively.

\CURROT has two main parameters: performance constraint threshold $\CURROTPerf$ and Wasserstein distance threshold $\CURROTWass$, similar to the self-paced RL algorithm we employ for \RACGEN. Following the logic described by the developers of \CURROT \citep{klink2022curriculum}, we set $\CURROTPerf$ to be half-way between the minimum and maximum discounted returns. In addition, $\CURROTWass$ is usually set to a high value such as $0.5$, so we keep the same approach. 
\PLR has three parameters: the staleness coefficient $\PLRStale$, the score temperature $\PLRTemp$, and the replay probability $\PLRReplay$. We ran a grid search over $(\PLRStale, \PLRTemp, \PLRReplay)\in\{0.15,0.45\}\times\{0.15,0.45\}\times\{0.7,0.85\}$.
\VDS has three parameters to set: the learning rate $\VDSLR$ for the Q-function ensemble, the number of epochs $\VDSEpoch$, and the number of minibatches $\VDSBatch$. We ran a grid search over $(\VDSLR, \VDSEpoch, \VDSBatch)\in\{0.0001,0.001\}\times\{3,5\}\times\{20,40\}$. \Cref{tab:params1} lists the final parameter values used in the point-mass and lunar-lander environments for \CURROT, \PLR, and \VDS.

\GOALGAN has three parameters: the random noise added to each context sample $\GOALGANNoise$, the number of rollouts between context distribution updates $\GOALGANRollout$, and the percentage of samples drawn from the success buffer $\GOALGANSuccess$. We ran a grid search over $(\GOALGANNoise, \GOALGANRollout, \GOALGANSuccess)\in\{0.05,0.1\}\times\{100,200\}\times\{0.1,0.2\}$. \ALPGMM has three parameters: the ratio of randomly sampled contexts $\ALPGMMRandom$, the number of completed learning episodes before updating the context distribution $\ALPGMMRollout$, and the size of the past trajectory buffer $\ALPGMMBuffer$. We ran a grid search over $(\ALPGMMRandom, \ALPGMMRollout, \ALPGMMBuffer)\in\{0.1,0.2\}\times\{50,100\}\times\{500,1000\}$.
\Cref{tab:params2} shows the final parameter values used for \GOALGAN and \ALPGMM.


\subsection{Environment Descriptions}
\label{app:env}

\paragraph{Point-mass environment.} \Cref{fig:pm2d_ht_w_fractions} further demonstrates that \RACGEN achieves higher returns in high and medium-risk contexts than the remaining methods.
The figure shows the fraction of contexts ($y$-axis) where an algorithm learns a policy that achieves a return higher than the return $\mathbf{r}$ ($x$-axis).
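The plot shows the median over $10$ runs.
% NOTE(review): changed from 5; the caption of this figure and the later discussion of the same figure both report 10 runs -- confirm.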
First, we notice that \RACGEN almost always achieves returns higher than $-46$, with \DEFAULT following closely and the rest achieving lower returns in high-risk contexts.
At $\mathbf{r}=-30$, \DEFAULT starts to perform worse than \RACGEN, which supports our previous argument that \RACGEN achieves the highest minimum returns, indeed.
The curve of \RACGEN stays on the top until $\mathbf{r}=62$, which demonstrates that \RACGEN performs the best in most of the contexts. However, as we discussed in Figure 6 of the main document, \RACGEN does not yield the highest returns in low-risk contexts since its curve goes under the others in terms of the portion of contexts with high returns, more specifically for returns $\mathbf{r}\in[62,74]\cup[82,100]$.

\begin{figure*}[tbp]
    \centering
    \hfill 
    \begin{subfigure}{0.35\textwidth}
        \centering
    \includegraphics[width=1\textwidth]{figures/point_mass_1d.pdf}
    \caption{Visualized point-mass environment.}
    \label{fig:pointmass2D_sup}
    \end{subfigure}\hfill
    \begin{subfigure}{0.45\textwidth}
        \centering
    \includegraphics[width=\columnwidth]{figures/pm2d_ht_w_fractions_final_returns.pdf}
    \caption{Performance profiles in the point-mass environment.}
    \label{fig:pm2d_ht_w_fractions}
    \end{subfigure}
    \hfill~
    \caption{Point-mass environment: (a) Visualization of the point-mass environment with a 2D context space: context $\context$ determines the position and the width of the door. (b) Performance profiles of evaluated algorithms in the point-mass environment: the fraction of episodes where the final policies achieve discounted returns greater than $\mathbf{r}$.
    The plot presents the median over 10 independent training runs.}
\end{figure*}

We use the environment studied by \citet{NEURIPS2020_68a97503,klink2021probabilistic,klink2022curriculum}, which has a two-dimensional context space. A context in the point-mass environment determines the position and the width of the door that the agent needs to pass through to eventually reach the goal position. As the algorithm of choice, we employ the stable-baselines3 \citep{stable-baselines3} implementation of \PPO with an MLP policy of 3 hidden layers, 128 neurons at each layer. We set the batch size to 128, Generalized Advantage Estimator factor to 0.99, and the number of steps in between updates to 6144. The discount factor of the point-mass environment is 0.95. We leave the values of the rest of the parameters as set in the stable-baselines3 implementation of \PPO. An illustration of the point-mass environment is in \cref{fig:pointmass2D_sup}.

\paragraph{Lunar-lander environment.}

The lunar-lander environment we use is the third version in OpenAI Gym \citep{1606.01540}, which has a two-dimensional context space. A context in the lunar-lander environment determines the gravity and the wind power of the planet that a pod needs to land on. We again utilize the stable-baselines3 \citep{stable-baselines3} implementation of \PPO with the default MLP policy. We set the number of epochs for surrogate loss optimization to 4, Generalized Advantage Estimator factor to 0.99, and the number of steps in between updates to 10240. The discount factor of the lunar-lander environment is 0.99. We leave the values of the rest of the parameters as set in the stable-baselines3 implementation of \PPO. An illustration of the lunar-lander environment is in \cref{fig:lunar_lander}. 

\begin{figure*}[tbp]
    \centering
    \hfill 
    \begin{subfigure}{0.35\textwidth}
        \centering
    \includegraphics[width=1\textwidth]{figures/lunar_lander.png}
    \caption{Visualized lunar-lander.}
    \label{fig:lunar_lander}
    \end{subfigure}\hfill
    \begin{subfigure}{0.5\textwidth}
        \centering
    \includegraphics[width=1\textwidth]{figures/ll2d_ht_w_expected_disc_return_progression.pdf}
    \caption{Expected discounted return progression in lunar lander.}
    \label{fig:ll2d_ht_w_expected_disc_return_progression}
    \end{subfigure}
    \hfill~
    \caption{Lunar-lander environment: (a) Visualization of the environment, where context $\context$ determines the wind and the gravity. (b) Expected discounted return with respect to the target context distribution in the lunar-lander environment.
    The bold lines are the median and the lightly shaded regions cover the first and third quartiles of 5 independent training runs.}
\end{figure*}

\subsection{Detailed Analysis of Results}
\label{app:detailed_analysis}

\Cref{fig:ll2d_ht_w_expected_disc_return_progression} shows the progression of the expected discounted return in contexts drawn from the target context distribution.

\paragraph{Point-mass environment.} \Cref{fig:pm2d_ht_w_fractions} demonstrates that \RACGEN achieves higher returns in 79.2\% of the contexts, whereas the remaining methods perform poorly in contrast.
The figure shows the fraction of contexts ($y$-axis) where an algorithm learns a policy that achieves a return higher than the return $\mathbf{r}$ ($x$-axis).
The curves correspond to the median over $10$ runs.
\DEFAULT and \DEFAULTCEM mostly achieve returns around $\mathbf{r}=2.5$, as the final policies learn to push the point mass to the middle section of the wall. Such behavior is suboptimal when the task is to pass doors away from the middle section. In comparison, \RACGEN and \RACGENN yield lower returns in 20\% of the contexts, as the learned policies sometimes approach the door but fail to pass it. The curve of \RACGEN stays on top of all evaluated methods for $\mathbf{r}\in[4.2,7.2]$, as \RACGEN generates Cauchy context distributions instead of Gaussian when the target context distribution is Cauchy. Although \GOALGAN achieves higher returns than \RACGEN in less than 10\% of the contexts, the figure for the distribution of returns in the main document indicates that such cases are outliers.

\paragraph{Lunar-lander environment.} \Cref{tab:lunarlander_detailed_all} lists the numerical values from the boxplots in Figure 6 of the main document.
As we indicate in Section 6.2 of the main document, \RACGEN outperforms all algorithms regarding the median, the first and third quartiles, and the minimum values. However, \DEFAULTCEM is the best performer considering the maximum return achieved.
As \RACGEN attends to risky contexts much more than easy ones, it may have overlooked high-return contexts more than the baselines.

In \cref{tab:lunarlander_detailed_all}, we also observe that \RACGEN has a tighter range and less spread-out low outliers than \DEFAULTCEM.
Even though both approaches identify and oversample rare and risky contexts using their CEM modules, \RACGEN performs better in such contexts.
Furthermore, \DEFAULT (without a curriculum and a CEM module) yields higher first quartile and minimum values than \DEFAULTCEM.
Thus, we argue that \RACGEN is more robust than the baselines.

\cref{tab:lunarlander_detailed_median} provides the median returns in contexts, namely, the median in a context across independent training runs.
As we focus on the median returns, we disregard training runs that yield unusually high or low returns in a context.
In this case, \RACGEN outperforms all algorithms in every statistic.
Therefore, \cref{tab:lunarlander_detailed_median} also supports our argument that \RACGEN is more advantageous than all state-of-the-art algorithms and baselines in the lunar lander experiment.

\begin{table}[tbp]
    \centering
    \caption{Statistics of distributions of discounted returns collected by policies trained under listed algorithms in the lunar-lander environment. We use the discounted returns collected in 100 contexts (drawn from the target context distribution) by policies from 5 independent runs (in total 500 discounted returns per algorithm).}
    \begin{tabular}{lccccc}
        \hline
        Algorithms & Maximum (Upper Whisker) & 3rd Quartile & Median & 1st Quartile & Minimum (Lower Whisker)\\
        \hline
        \RACGEN      & 88.57      & \textbf{48.89} & \textbf{31.74} & \textbf{15.77} & \textbf{\boldmath$-$30.25}\\
        \DEFAULTCEM          & \textbf{101.45} & 48.22     & 30.77     & 11.22     & $-$44.11 \\
        \DEFAULT              & 92.08      & 43.86     & 27.45     & 11.37     & $-$34.29 \\
        \SPDL                 & 90.13      & 44.22     & 25.02     & $-$1.45     & $-$67.89 \\
        \CURROT               & 79.79      & 36.67     & 22.29     & 7.94      & $-$33.52 \\
        \PLR                  & 74.91      & 30.76     & 15.29     & $-$0.81     & $-$47.93 \\
        \GOALGAN              & 93.51      & 35.20     & 15.19     & $-$7.67     & $-$68.93 \\
        \VDS                  & 90.00      & 27.87     & 8.06      & $-$17.44    & $-$85.32 \\
        \ALPGMM              & 90.07      & 24.68     & 5.51      & $-$20.32    & $-$80.70 \\
        \hline
    \end{tabular}
    \label{tab:lunarlander_detailed_all}
\end{table}


\begin{table}[tbp]
    \centering
    \caption{Statistics of distributions of discounted returns collected by policies trained under listed algorithms in the lunar-lander environment, where we focus on the median return across 5 independent runs in 100 contexts drawn from the target context distribution.}
    \begin{tabular}{lccccc}
        \hline
        Algorithms & Maximum (Upper Whisker) & 3rd Quartile & Median & 1st Quartile & Minimum (Lower Whisker)\\
        \hline
        \RACGEN      & \textbf{69.53}  & \textbf{42.79}  & \textbf{32.42}  & \textbf{23.39}  & \textbf{0.40} \\
        \DEFAULTCEM          & 66.03     & 39.86     & 29.89     & 18.78     & $-$5.74 \\
        \DEFAULT              & 56.14     & 35.03     & 27.72     & 20.49     & $-$0.48 \\
        \SPDL                 & 64.18     & 38.16     & 27.45     & 15.16     & $-$17.56 \\
        \CURROT               & 52.07     & 30.37     & 22.01     & 15.13     & $-$7.23 \\
        \PLR                  & 41.47     & 24.13     & 16.29     & 8.15      & $-$11.97 \\
        \GOALGAN              & 48.23     & 24.21     & 14.01     & 5.88      & $-$12.67 \\
        \VDS                  & 44.06     & 20.97     & 8.14      & $-$4.88     & $-$41.83 \\
        \ALPGMM              & 42.04     & 17.19     & 6.23      & $-$5.17     & $-$36.37 \\
        \hline
    \end{tabular}
    \label{tab:lunarlander_detailed_median}
\end{table}

\bibliography{koprulu_731}

\end{document}