% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}
\usepackage{balance}
%% Some suggested packages, as needed:
% \usepackage[round]{natbib} % has a nice set of citation styles and commands
% \usepackage[numbers]{natbib} % has a nice set of citation styles and commands
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usetikzlibrary{automata,positioning,arrows}
\tikzset{node distance=3.5cm, % Minimum distance between two nodes. Change if necessary.
every state/.style={ % Sets the properties for each state
semithick,
fill=gray!10},
initial text={}, % No label on start arrow
double distance=2pt, % Adjust appearance of accept states
every edge/.style={ % Sets the properties for each transition
                draw,
->,>=stealth, % Makes edges directed with bold arrowheads
auto,
semithick}}

% NEW
\usepackage{amsthm}
\usepackage{comment}
\usepackage{subcaption}
% \usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{xcolor}
\usepackage{multirow}

\definecolor{gray}{cmyk}{0,0,0,0.8}
  \renewcommand{\algorithmiccomment}[1]{\hfill \small\textcolor{gray}{\textit{$\triangleright$ #1}}}
% \usepackage{todos}
\usepackage[capitalise,noabbrev,nameinlink]{cleveref}
\creflabelformat{equation}{#2\textup{#1}#3} % changes Equation (2) to Equation 2. This way we can use (\cref{eq:2}) without double parenthesis



%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\DeclareMathOperator*{\argmax}{arg~max}
\DeclareMathOperator*{\argmin}{arg~min}
\DeclareMathOperator*{\st}{~s.t.~}


\title{Reward-Machine-Guided, Self-Paced Reinforcement Learning\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<cevahir.koprulu@utexas.edu>?Subject=Your UAI 2023 paper}{Cevahir~Koprulu}{}}
\author[1]{Ufuk~Topcu}
% Add affiliations after the authors
\affil[1]{%
    University of Texas at Austin\\
    USA
}

\usepackage{silence}
\WarningFilter{latex}{Marginpar on page}

\begin{document}
\onecolumn
\appendix
\maketitle

% \input{koprulu_587-nomenclature.tex}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%% NOMENCLATURE %%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Environments
\newtheorem{definition}{Definition}
\newtheorem{lemma}{Lemma}
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}{Corollary}
\newtheorem{assumption}[theorem]{Assumption}
\newenvironment{proofsketch}{%
\renewcommand{\proofname}{Proof Sketch}\proof}{\endproof}

% Common
\newcommand{\Reals}{\mathbb{R}}
\newcommand{\PositiveIntegers}{\mathbb{Z^+}}
\newcommand{\Expectation}{\mathbb{E}}
\newcommand{\KLDivergence}{D_{\text{KL}}}
\newcommand{\Naturals}{\mathbb{N}}
\newcommand{\RLAlgorithm}{\Psi}

% Common - MDPs
\newcommand{\Discount}{\gamma}
\newcommand{\Policy}{\pi}
\newcommand{\CommonAction}{a}
\newcommand{\CommonState}{s}

% Two-door Environment Labels
\newcommand{\PointMassLabelDoorOne}{d1}
\newcommand{\PointMassLabelBox}{b}
\newcommand{\PointMassLabelDoorTwo}{d2}
\newcommand{\PointMassLabelGoal}{g}
\newcommand{\PointMassLabelWall}{w}

% HalfCheetah-v3 Environment Labels
\newcommand{\CheetahFlagOne}{f_1}
\newcommand{\CheetahFlagTwo}{f_2}
\newcommand{\CheetahFlagThree}{f_3}

% Swimmer-v3 Environment Labels
\newcommand{\SwimmerFlagOne}{f_1}
\newcommand{\SwimmerFlagTwo}{f_2}
\newcommand{\SwimmerFlagThree}{f_3}

% Labeled MDP
\newcommand{\Lmdp}{\mathcal{M}}
\newcommand{\LmdpStates}{S}
\newcommand{\LmdpCommonState}{\CommonState}
\newcommand{\LmdpInit}{\phi}
\newcommand{\LmdpActions}{A}
\newcommand{\LmdpCommonAction}{\CommonAction}
\newcommand{\LmdpTransition}{p}
\newcommand{\LmdpRewardFunction}{R}
\newcommand{\LmdpReward}{r}
\newcommand{\LmdpRewardSequence}{\rho}
\newcommand{\LmdpDiscount}{\Discount}
\newcommand{\LmdpLabels}{\mathcal{P}}
\newcommand{\LmdpCommonLabel}{\ell}
\newcommand{\LmdpLabelingFunction}{L}
\newcommand{\LmdpLabelSequence}{\lambda}
\newcommand{\LmdpPolicy}{\Policy}

% Reward Machine
\newcommand{\RM}{\mathcal{R}}
\newcommand{\RMStates}{Q}
\newcommand{\RMCommonState}{\mathsf{q}}
\newcommand{\RMReward}{r}
\newcommand{\RMInit}{\RMCommonState_{I}}
\newcommand{\RMInputAlphabet}{2^{\LmdpLabels}}
\newcommand{\RMOutputAlphabet}{O}
\newcommand{\RMTransitionFunction}{\delta_{\RMCommonState}}
\newcommand{\RMOutputFunction}{\delta_{\LmdpReward}}
\newcommand{\RMCommonLabel}{\LmdpCommonLabel}
\newcommand{\LogicFormula}{\rho}

% Contextual MDP
\newcommand{\context}{c}
\newcommand{\Cmdp}{\mathcal{\Bar{M}}}
\newcommand{\CmdpStates}{S}
\newcommand{\CmdpActions}{A}
\newcommand{\CmdpContextSpace}{\mathcal{C}}
\newcommand{\CmdpMapping}{\mathsf{M}}
\newcommand{\CmdpCommonState}{\CommonState}
\newcommand{\CmdpCommonAction}{\CommonAction}
\newcommand{\CmdpTransition}{p_{\context}}
\newcommand{\CmdpRewardFunction}{R_{\context}}
\newcommand{\CmdpInitialDistribution}{\phi_{\context}}
\newcommand{\CmdpPolicy}{\Policy}
\newcommand{\CmdpDiscount}{\Discount}
\newcommand{\CmdpDimensions}{D}
\newcommand{\CmdpReward}{r}

% Contextual RL
\newcommand{\PolicyParameter}{\omega}
\newcommand{\ValueFunction}{V_{\PolicyParameter}}
\newcommand{\TargetContextDistribution}{\varphi}

% Self-paced Contextual RL
\newcommand{\KLCoefficient}{\alpha}
\newcommand{\ContextDistribution}{\varrho}
\newcommand{\ContextDistributionParameter}{\nu}
\newcommand{\RelativeEntropyBound}{\epsilon}
\newcommand{\SPRLTrajectorySet}{\mathcal{D}}
\newcommand{\EstimatedValueFunction}{\hat{V}_{\PolicyParameter}}
\newcommand{\KLPenaltyProportion}{\zeta}
\newcommand{\KLPenaltyOffset}{K_{\alpha}}
\newcommand{\NumberOfIterations}{K}
\newcommand{\NumberOfRollouts}{N}
\newcommand{\Trajectory}{\tau}
\newcommand{\ContextUpdateOffset}{K_{\text{OFFSET}}}
\newcommand{\NumberOfStepsBetweenUpdates}{n_{\text{STEP}}}
\newcommand{\STDLowerBound}{\sigma_{\text{LB}}}
\newcommand{\KLLowerBound}{D_{\text{KL}_{LB}}}
\newcommand{\SPRLInitMean}{\mu_{\text{INIT}}}
\newcommand{\SPRLInitVar}{\Sigma_{\text{INIT}}}
\newcommand{\SPRLTargetMean}{\mu_{\text{TARGET}}}
\newcommand{\SPRLTargetVar}{\Sigma_{\text{TARGET}}}


% Labeled Contextual MDP
\newcommand{\LCmdp}{\Cmdp^{\LmdpLabelingFunction}}
\newcommand{\LCmdpStates}{\CmdpStates}
\newcommand{\LCmdpActions}{\CmdpActions}
\newcommand{\LCmdpContextSpace}{\CmdpContextSpace}
\newcommand{\LCmdpMapping}{\CmdpMapping^{\LmdpLabelingFunction}}
\newcommand{\LCmdpCommonState}{\CommonState}
\newcommand{\LCmdpCommonAction}{\CommonAction}
\newcommand{\LCmdpTransition}{\CmdpTransition}
\newcommand{\LCmdpRewardFunction}{\CmdpRewardFunction^{\LmdpLabelingFunction}}
\newcommand{\LCmdpInitialDistribution}{\CmdpInitialDistribution}
\newcommand{\LCmdpPolicy}{\Policy}
\newcommand{\LCmdpDiscount}{\Discount}
\newcommand{\LCmdpLabels}{\LmdpLabels}
\newcommand{\LCmdpLabelingFunction}{\LmdpLabelingFunction_{\context}}
\newcommand{\LCmdpDimensions}{\CmdpDimensions}
\newcommand{\LCmdpCommonLabel}{\LmdpCommonLabel}
\newcommand{\LCmdpReward}{\LmdpReward}
\newcommand{\LCmdpHistory}{h}
\newcommand{\LCmdpNumberOfDimensions}{\Gamma}

% Product of LCMDP and RM
\newcommand{\Pmdp}{\Cmdp^{\LmdpLabelingFunction}_{\RM}}
\newcommand{\PmdpStates}{\bar{\CmdpStates}}
\newcommand{\PmdpActions}{\CmdpActions}
\newcommand{\PmdpContextSpace}{\CmdpContextSpace}
\newcommand{\PmdpMapping}{\bar{\CmdpMapping}^{\LmdpLabelingFunction}}
\newcommand{\PmdpCommonState}{\bar{\CommonState}}
\newcommand{\PmdpCommonAction}{\CommonAction}
\newcommand{\PmdpTransition}{\bar{p}_{\context}}
\newcommand{\PmdpRewardFunction}[1][\context]{\bar{R}^{\LmdpLabelingFunction}_{#1}}
\newcommand{\PmdpInitialDistribution}{\bar{\phi}_{\context}}
\newcommand{\PmdpPolicy}{\Policy}
\newcommand{\PmdpDiscount}{\Discount}
\newcommand{\PmdpLabels}{\LmdpLabels}
\newcommand{\PmdpLabelingFunction}{\LCmdpLabelingFunction}
\newcommand{\PmdpDimensions}{\LCmdpDimensions}
\newcommand{\PmdpCommonLabel}{\LmdpCommonLabel}
\newcommand{\PmdpReward}{\bar{\LCmdpReward}}
\newcommand{\PmdpTrajectory}{\bar{\Trajectory}}
\newcommand{\RMContextMapping}{\mathsf{F}}
\newcommand{\RMContextMappingCommonOutput}{\mathsf{f}}

\newcommand{\RMMDPContextSet}{\mathcal{G}}
\newcommand{\RMMDPContextMapping}{\mathsf{H}_{min}}
\newcommand{\RMMDPContextSetALL}{\Gamma}

% GoalGAN
\newcommand{\GoalGANdNoise}{\delta_{\text{NOISE}}}
\newcommand{\GoalGANnRollout}{n_{\text{ROLLOUT}}^{\text{GG}}}
\newcommand{\GoalGANpSuccess}{p_{\text{SUCCESS}}}

% ALP-GMM
\newcommand{\ALPGMMpRandom}{p_{\text{RAND}}}
\newcommand{\ALPGMMnRollout}{n_{\text{ROLLOUT}}^{\text{AG}}}
\newcommand{\ALPGMMsBuffer}{s_{\text{BUFFER}}}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%% NOMENCLATURE %%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Theory}

% Alternative Assumption!
\newtheorem{theoremalt}{Theorem}[theorem]

% New environment
\newenvironment{theoremp}[1]{
  \renewcommand\thetheoremalt{#1}
  \theoremalt
}{\endtheoremalt}


\begin{theoremp}{2}
Under Assumption 1, $\RMMDPContextSet_1$ and $\RMMDPContextSet_2$ are sets of sufficient context parameters on $(\RMCommonState,\LCmdpCommonState,\LCmdpCommonAction,\LCmdpCommonState')$ if and only if $\RMMDPContextSet_1\cap\RMMDPContextSet_2$ is a set of sufficient context parameters on  $(\RMCommonState,\LCmdpCommonState,\LCmdpCommonAction,\LCmdpCommonState')$.
\end{theoremp}
\begin{proof}
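Sufficiency (backward statement): Suppose that $\RMMDPContextSet_1 \cap \RMMDPContextSet_2$ is a set of sufficient context parameters on $(\RMCommonState,\LCmdpCommonState,\LCmdpCommonAction,\LCmdpCommonState')$. Since $\RMMDPContextSet_1 \cap \RMMDPContextSet_2 \subseteq \RMMDPContextSet_1$ and
$\RMMDPContextSet_1 \cap \RMMDPContextSet_2 \subseteq \RMMDPContextSet_2$, by Lemma 1, $\RMMDPContextSet_1$ and $\RMMDPContextSet_2$ are also sets of sufficient context parameters on $(\RMCommonState,\LCmdpCommonState,\LCmdpCommonAction,\LCmdpCommonState')$.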

Necessity (forward statement): Let $\context_{\RMMDPContextSet}=[\context[i]]_{i\in \RMMDPContextSet}$ for any $\RMMDPContextSet \subseteq 2^{\LCmdpDimensions}$. Then, for any $\context,\context'\in\LCmdpContextSpace$ that satisfy $\context_{\RMMDPContextSet_1\cap\RMMDPContextSet_2}=\context'_{\RMMDPContextSet_1\cap\RMMDPContextSet_2}$ we want to show that $\RMTransitionFunction(\RMCommonState,\LCmdpLabelingFunction(\LCmdpCommonState,\LCmdpCommonAction,\LCmdpCommonState'))=\RMTransitionFunction(\RMCommonState,\LmdpLabelingFunction_{\context'}(\LCmdpCommonState,\LCmdpCommonAction,\LCmdpCommonState'))$. Under Assumption 1, there exists $\context''\in\LCmdpContextSpace$ such that
\begin{align*}
    \context_{\RMMDPContextSet_1 \setminus \RMMDPContextSet_2}&=\context''_{\RMMDPContextSet_1 \setminus \RMMDPContextSet_2},\\
    \context'_{\RMMDPContextSet_2 \setminus \RMMDPContextSet_1}&=\context''_{\RMMDPContextSet_2 \setminus \RMMDPContextSet_1},\\
    \context_{\RMMDPContextSet_1 \cap \RMMDPContextSet_2}&=\context''_{\RMMDPContextSet_1 \cap \RMMDPContextSet_2}=\context'_{\RMMDPContextSet_1 \cap \RMMDPContextSet_2}.
\end{align*}
Such a choice is always possible since ${\RMMDPContextSet_1 \cap \RMMDPContextSet_2}$, ${\RMMDPContextSet_1 \setminus \RMMDPContextSet_2}$ and ${\RMMDPContextSet_2 \setminus \RMMDPContextSet_1}$ are disjoint sets. Then, 
\begin{align*}
    \context''_{\RMMDPContextSet_1}&=\context_{\RMMDPContextSet_1},\\ \context''_{\RMMDPContextSet_2}&=\context'_{\RMMDPContextSet_2}.
\end{align*}
Therefore, $\RMTransitionFunction(\RMCommonState,\LCmdpLabelingFunction(\LCmdpCommonState,\LCmdpCommonAction,\LCmdpCommonState'))= \RMTransitionFunction(\RMCommonState,\LmdpLabelingFunction_{\context''}(\LCmdpCommonState,\LCmdpCommonAction,\LCmdpCommonState'))= \RMTransitionFunction(\RMCommonState,\LmdpLabelingFunction_{\context'}(\LCmdpCommonState,\LCmdpCommonAction,\LCmdpCommonState'))$.
\end{proof}

\section{Experimental Details}

We provide details about the algorithms and the environments studied in the paper. These details include hyperparameters of the algorithms, how we select them, and the structure of the environments.

\subsection{Algorithms}
We compare two baseline algorithms and five automated curriculum generation methods:
\begin{enumerate}
    \item \textbf{Default}: We evaluate learning without a curriculum, i.e. sampling contexts from the target context distribution and training the agent in these contexts as a baseline.
    \item \textbf{Default*}: We extend \emph{Default} by running it on a product contextual MDP, thus we observe the effect of capturing temporal abstractions without learning a curriculum.
    \item \textbf{GoalGAN}: \citet{florensa2018automatic} develop \emph{Goal Generative Adversarial Network} for the goal-conditioned setting, which uses a goal discriminator to determine whether a goal is at an intermediate level of difficulty for the current policy, and a goal generator to generate goals that are at such a level of difficulty. Although GoalGAN takes an initial context distribution, similar to self-paced RL approaches, it does not allow for a target context distribution; instead, it generates a curriculum as if the target context distribution were a uniform distribution over the context space $\PmdpContextSpace$.
    \item \textbf{ALP-GMM}: \citet{florensa2018automatic} propose \emph{Absolute Learning Progress with Gaussian Mixture Models}, which generates a Gaussian mixture model over the absolute learning progress of task parameters, e.g. contexts in our setting. ALP-GMM uses a bandit scheme to choose a Gaussian as an arm whose utility is the absolute learning progress. The chosen Gaussian distribution is used to draw the next task parameter. 
    \item \textbf{SPDL}: \citet{NEURIPS2020_68a97503} propose \emph{Self-paced Deep RL} by exploiting deep learning methods under the self-paced RL framework. We build Intermediate SPRL and RM-guided SPRL on top of this algorithm.
    \item \textbf{Intermediate SPRL}: We present an intermediate self-paced RL algorithm, which runs on the product contextual MDP. Therefore, we assess how self-paced RL performs when the agent can capture the temporal abstractions. We provide the pseudocode in Algorithm \ref{alg:Intermediate_SPRL}.
    \item \textbf{RM-guided SPRL}: We develop a reward-machine-guided, self-paced RL algorithm, which uses reward machines to update the policy and value functions of an RL agent, as well as to direct the curriculum generation.
\end{enumerate}
In our experiments, all methods use the soft actor-critic algorithm~\citep{haarnoja2018soft} as the RL method of choice.
\begin{algorithm}[tbp]
\caption{Intermediate Self-Paced RL}
\label{alg:Intermediate_SPRL}
\textbf{Input}: Product MDP $\Pmdp$, target context distribution $\TargetContextDistribution$, initial context distribution $\ContextDistribution(\cdot|\ContextDistributionParameter_{0})$,\\
\textbf{Parameter}: KL penalty proportion $\KLPenaltyProportion$, relative entropy bound $\RelativeEntropyBound$, KL penalty offset $\KLPenaltyOffset$, number $\NumberOfIterations$ of iterations, number $\NumberOfRollouts$ of rollouts\\
\textbf{Output}: Final policy $\PmdpPolicy_{\PolicyParameter_{\NumberOfIterations}}$
\begin{algorithmic}[1] %[1] enables line numbers
    \STATE Initialize policy $\PmdpPolicy_{\PolicyParameter_{0}}$.
    \FOR{$k=1$ to $\NumberOfIterations$}
    % \STATE \textbf{Policy Update:}
    \STATE $\context_i\sim\ContextDistribution(\context|\ContextDistributionParameter_{k-1})$, $i\in[\NumberOfRollouts]$,
    \COMMENT{sample contexts}
    \STATE $\SPRLTrajectorySet_{k}\leftarrow\{(\context_i, \PmdpTrajectory_i) | \PmdpTrajectory_i = (\PmdpCommonState_{i,0},\PmdpCommonAction_{i,0},\PmdpReward_{i,1},\PmdpCommonState_{i,1}),\cdots,$ $(\PmdpCommonState_{i,T_i-1},\PmdpCommonAction_{i,T_i-1},\PmdpReward_{i,T_i},\PmdpCommonState_{i,T_i}),i\in[\NumberOfRollouts]\}$,
    \COMMENT{collect trajectories}
    \STATE $\PmdpPolicy_{\PolicyParameter_{k}}\leftarrow \Psi(\SPRLTrajectorySet_{k},\PmdpPolicy_{\PolicyParameter_{k-1}})$
    \COMMENT{update policy with RL algorithm $\Psi$}
    \STATE Compute next context distribution parameter $\ContextDistributionParameter_{k}$ by solving 
    \begin{align}
    \max_{\ContextDistributionParameter_{k}} \quad & \frac{1}{\NumberOfRollouts} \sum_{i=1}^{\NumberOfRollouts} \sum_{t=0}^{T_i-1} \CmdpDiscount^t \frac{\ContextDistribution(\context_i|\ContextDistributionParameter_{k})}{\ContextDistribution(\context_i|\ContextDistributionParameter_{k-1})} \PmdpReward_{i, t+1} \nonumber  - \KLCoefficient_{k} \KLDivergence(\ContextDistribution(\context|\ContextDistributionParameter_{k})\:||\:\TargetContextDistribution(\context)) \nonumber \\
    \textrm{s.t.} \quad &  \KLDivergence(\ContextDistribution(\context|\ContextDistributionParameter_{k})\:||\:\ContextDistribution(\context|\ContextDistributionParameter_{k-1})) \leq \RelativeEntropyBound,
    \label{eq:Intermediate_SPRL_objective}
    \end{align}
    where
    $\KLCoefficient_{k}=\begin{cases}
        0 & \text{if $k \leq \KLPenaltyOffset$}; \\
        \mathsf{B}(\ContextDistributionParameter_{k-1},\SPRLTrajectorySet_{k}) & \text{otherwise},
        \end{cases}$, and
    $\mathsf{B}(\ContextDistributionParameter_{k-1},\SPRLTrajectorySet_{k}) = \KLPenaltyProportion \frac{\max{(0, \frac{1}{\NumberOfRollouts}\sum_{i=1}^{\NumberOfRollouts}\sum_{t=0}^{T_i-1} \PmdpDiscount^t \PmdpReward_{i, t+1})}}{\KLDivergence(\ContextDistribution(\context|\ContextDistributionParameter_{k-1})\:||\:\TargetContextDistribution(\context))}$.
    \ENDFOR
    \STATE \textbf{return} $\PmdpPolicy_{\PolicyParameter_{\NumberOfIterations}}$
\end{algorithmic}
\end{algorithm}

Table \ref{tab:hyperparam_sprl} shows the hyperparameters of RM-guided SPRL, Intermediate SPRL, and SPDL algorithms in three case studies: 
\begin{itemize}
    \item \textbf{Case-1:} Two-door environment \& 2D context space with wide target distribution,
    \item \textbf{Case-2:} Customized Swimmer-v3 environment \& 2D context space with narrow target distribution,
    \item \textbf{Case-3:} Customized HalfCheetah-v3 environment \& 3D context space with narrow target distribution.
\end{itemize}

\begin{table*}[t]
\centering
%\resizebox{.95\columnwidth}{!}{
\begin{tabular}{clccccccc}
\toprule
&Algorithm&$\RelativeEntropyBound$&$\ContextUpdateOffset$&$\KLPenaltyProportion$&$\KLPenaltyOffset $&$\NumberOfStepsBetweenUpdates$&$\STDLowerBound$&$\KLLowerBound$ \\
\midrule
   \multirow{3}{*}{\rotatebox{90}{Case-1}} & RM-guided SPRL & 0.05 & 70 & 0.96 & 10 & 16384 & $(4\cdot 10^{-3}, 4\cdot 10^{-3})$ & 8000 \\
     & Intermediate SPRL & 0.05 & 70 & 1.2 & 10 & 16384 & $(4\cdot 10^{-3}, 4\cdot 10^{-3})$ & 8000 \\
     & SPDL & 0.05 &70 & 1.2 & 10 & 16384 & $(4\cdot 10^{-3}, 4\cdot 10^{-3})$ & 8000 \\
\midrule
        \multirow{3}{*}{\rotatebox{90}{Case-2}} & RM-guided SPRL & 0.1 & 10 & 1.0 & 5 & 16384 & $(4\cdot 10^{-3}, 4\cdot 10^{-3})$ & 8000 \\
    & Intermediate SPRL & 0.1 & 10 & 4.0 & 5 & 16384 & $(4\cdot 10^{-3}, 4\cdot 10^{-3})$ & 8000 \\
     & SPDL & 0.1 & 10 & 4.0 & 5 & 16384 & $(4\cdot 10^{-3}, 4\cdot 10^{-3})$ & 8000 \\
\midrule
        \multirow{3}{*}{\rotatebox{90}{Case-3}} & RM-guided SPRL & 0.05 & 80 & 1.0 & 0 & 16384 & $(4\cdot 10^{-3}, 4\cdot 10^{-3}, 4\cdot 10^{-3})$ & 8000 \\
    & Intermediate SPRL  & 0.05 & 80 & 4.0 & 0 & 16384 & $(4\cdot 10^{-3}, 4\cdot 10^{-3}, 4\cdot 10^{-3})$ & 8000 \\
     & SPDL & 0.05 & 80 & 4.0 & 0 & 16384 & $(4\cdot 10^{-3}, 4\cdot 10^{-3}, 4\cdot 10^{-3})$ & 8000 \\
     \bottomrule
\end{tabular}
\caption{Hyper-parameters for self-paced RL algorithms}
\label{tab:hyperparam_sprl}
\end{table*}

There are four parameters in Table~\ref{tab:hyperparam_sprl} that we do not provide in Algorithm~\ref{alg:Intermediate_SPRL}: $\ContextUpdateOffset$, $\NumberOfStepsBetweenUpdates$, $\STDLowerBound$, and $\KLLowerBound$. \citet{NEURIPS2020_68a97503} introduce these parameters as a part of SPDL. 
$\ContextUpdateOffset$ is the number of context distribution updates before they enable a self-paced RL algorithm to update the initial context distribution. This parameter allows learning a meaningful value function that estimates the expectation of the value of the initial states, which the objective function of the self-paced RL problem takes into account. 
$\NumberOfStepsBetweenUpdates$ is the number of environment interactions between two context distribution updates, and it replaces $\NumberOfRollouts$, which is the number of rollouts, i.e., trajectories between two context distribution updates.
$\STDLowerBound$ is the lower bound for the standard deviation of a context distribution, which is used to stabilize learning, particularly for narrow target distributions.
$\KLLowerBound$ is the threshold for the KL divergence to the target context distribution, and it determines when the algorithm stops clipping the standard deviation of a context distribution using $\STDLowerBound$. 
In all case studies, we run a grid search over 
\begin{align*}
    \KLPenaltyOffset & \in \{0, 5, 10\}, \\ 
    \NumberOfStepsBetweenUpdates & \in \{8192, 16384\}, \\ 
    \KLLowerBound & \in \{8000, 10000\},\\
    \RelativeEntropyBound & \in \{0.05, 0.1\}. 
\end{align*}
We set $\STDLowerBound$ with respect to the standard deviation of the narrow target context distribution (see Table \ref{tab:init_target_dist}).
For $\ContextUpdateOffset$, the grid search is over the sets $\{60, 70\}$, $\{5, 10\}$, and $\{70, 80\}$ for Case-1, Case-2 and Case-3, respectively. 
For $\KLPenaltyProportion$, the parameter value search in Case-1 is over $\{0.96,0.98,1.0,1.2\}$, whereas the search in Case-2 and Case-3 is over $\{1.0,2.0, 3.0, 4.0\}$.
We use the parameters that yield the fastest convergence to the optimal expected discounted return via Intermediate SPRL for SPDL (see Table \ref{tab:hyperparam_sprl}).
\begin{table*}[t]
\centering
%\resizebox{.95\columnwidth}{!}{
\begin{tabular}{lcccc}
\toprule
 Case&$\SPRLInitMean $& $\SPRLInitVar$ &$\SPRLTargetMean$ &$\SPRLTargetVar$ \\
\midrule
   1 & $(0, 0)$ & $\operatorname{diag}((0.25, 0.25))$ & $(2, 2)$ & $\operatorname{diag}((1, 1))$ \\
   2 & $(0, 1)$ & $\operatorname{diag}((0.1, 0.1))$ & $(-0.6, 1.6)$ & $\operatorname{diag}((1.6\cdot 10^{-7}, 1.6\cdot 10^{-7}))$\\
   3 & $(1, 4, 7)$ & $\operatorname{diag}((0.25, 0.25, 0.25))$ & $(4, 7, 10)$ & $\operatorname{diag}((1.6\cdot 10^{-7}, 1.6\cdot 10^{-7}, 1.6\cdot 10^{-7}))$ \\
     \bottomrule
\end{tabular}
\caption{Initial and target context distributions}
\label{tab:init_target_dist}
\end{table*}

We compare GoalGAN with self-paced RL algorithms in our case studies since it is a state-of-the-art automated curriculum generation method that can handle sparse rewards. We tune the random noise $\GoalGANdNoise$ that is applied to every sample, the number $\GoalGANnRollout$ of policy rollouts between context distribution updates, and the percentage $\GoalGANpSuccess$ of samples drawn from the success buffer. The tuning is done for every case via a grid search over
\begin{align*}
    \GoalGANdNoise & \in \{0.05, 0.1\}, \\ 
    \GoalGANnRollout & \in \{50, 100\}, \\ 
    \GoalGANpSuccess & \in \{0.2, 0.3\}.
\end{align*}
We also evaluate ALP-GMM as it is a state-of-the-art automated curriculum generation method that is competitive with SPDL \citep{klink2021probabilistic}. We tune the percentage of random context samples $\ALPGMMpRandom$, the number $\ALPGMMnRollout$ of policy rollouts between context distribution updates, and the size of the buffer of past trajectories $\ALPGMMsBuffer$. The tuning is done for every case via a grid-search over
\begin{align*}
    \ALPGMMpRandom & \in \{0.2, 0.3\}, \\ 
    \ALPGMMnRollout & \in \{100, 200\}, \\ 
    \ALPGMMsBuffer & \in \{1000, 2000\}.
\end{align*}
Because these methods fail to accomplish the task in every experiment, for all combinations of the available parameter values, we use a combination that we consider locally better than the other combinations in terms of the expected discounted return obtained in the evaluation runs. The hyperparameters of GoalGAN and ALP-GMM that we use are in Table~\ref{tab:hyperparam_goalgan_alpgmm}.


\begin{table}[h]
\centering
%\resizebox{.95\columnwidth}{!}{
\begin{tabular}{lcccccc}
\toprule
 Case &$\GoalGANdNoise$& $\GoalGANnRollout$ &$\GoalGANpSuccess$ & $\ALPGMMpRandom$ & $\ALPGMMnRollout$ & $\ALPGMMsBuffer$\\
\midrule
   1 & 0.1 & 100 & 0.3 & 0.2 & 200 & 1000\\
   2 & 0.1 & 100 & 0.3 & 0.3 & 200 & 1000 \\
   3 & 0.1 & 100 & 0.3 & 0.3 & 200 & 1000 \\
     \bottomrule
\end{tabular}
\caption{Hyper-parameters for GoalGAN and ALP-GMM}
\label{tab:hyperparam_goalgan_alpgmm}
\end{table}

In Case-1, we set the discount rate $\Discount$ to 0.98, whereas Case-2 and Case-3 use $\Discount=0.99$. Every curriculum learning method uses the stable-baselines3~\citep{stable-baselines3} implementation of the soft actor-critic (SAC) RL algorithm~\citep{haarnoja2018soft}. In Case-1, we provide a replay buffer of size 150,000 and a batch size of 64, start the learning after 500 environment interactions, update the policy every 5 interactions via the soft Q-updates, and use a multi-layer perceptron policy with 2 layers of 64 neurons and tanh activation function. In Case-2 and Case-3, SAC uses a replay buffer of size 500,000 and a batch size of 256, updates the policy every 8 steps, and starts learning after 10,000 interactions with the environment. SAC also uses a multi-layer perceptron policy with 2 layers of 256 neurons and ReLU activation function. In addition, we set the learning rate to 0.001. We keep the rest of the hyperparameters at the default values provided by the implementation.

We conduct all experiments on a laptop with an 11\textsuperscript{th} Gen Intel Core i7-11800H processor, an Nvidia GeForce RTX 3060 graphics card, and 16\,GB of RAM.

\subsection{Environments}

\subsubsection{Two-Door Environment}

Case-1 is based on a two-door environment (see Fig.~\ref{fig:TwoDoor}). The two-door environment is a 40-by-40 grid world, where the initial state is at the coordinates (20, 35), which correspond to the positions along the horizontal and vertical axes, respectively. The box is a 5-by-5 square, and its top-left corner is at the coordinates (15, 20). The goal is positioned at (20, 5). We set the vertical position of the doors to 30 and 10 for the first and second doors, respectively. Each door has a width of 5. In short, the labeled contextual MDP $\LCmdp$ of the two-door environment has a state space $\LCmdpStates=\{1,2,\dots,40\}^2$ and action space $\LCmdpActions=\{U, D, L, R\}$, which correspond to the coordinates and the four cardinal directions, i.e., up, down, left, right, respectively. The transitions in the two-door environment are deterministic. Unless the agent completes the task or dies by moving onto a wall or the second door before getting the key, we allow the agent to take at most 4800 steps. 
    
\begin{figure}[h]
\centering
\setlength{\fboxsep}{-2.5pt}%
\setlength{\fboxrule}{3pt}%
\fbox{\includegraphics[width=0.3\textwidth]{figures/two_door_discrete.pdf} }
\caption{Two-door Environment}
\label{fig:TwoDoor}
\end{figure}

Case study 1 has a context space $\LCmdpContextSpace_1=[-4,4]^2$, where the context parameters are the horizontal positions of the first and second doors, respectively. We design the reward-machine-context mapping $\RMContextMapping_1$ for case study 1 as
\begin{align*}
    \RMContextMapping_1(\RMCommonState_0,\RMCommonState_0)&=\varnothing, \RMContextMapping_1(\RMCommonState_0,\RMCommonState_1)=\{1\}, \RMContextMapping_1(\RMCommonState_0,\RMCommonState_5)=\{1\},\\
    \RMContextMapping_1(\RMCommonState_1,\RMCommonState_1)&=\{1\}, \RMContextMapping_1(\RMCommonState_1,\RMCommonState_2)=\varnothing, \RMContextMapping_1(\RMCommonState_1,\RMCommonState_5)=\{1\},\\
    \RMContextMapping_1(\RMCommonState_2,\RMCommonState_2)&=\{1\}, \RMContextMapping_1(\RMCommonState_2,\RMCommonState_3)=\{2\}, \RMContextMapping_1(\RMCommonState_2,\RMCommonState_5)=\{1, 2\},\\
    \RMContextMapping_1(\RMCommonState_3,\RMCommonState_3)&=\{1,2\}, \RMContextMapping_1(\RMCommonState_3,\RMCommonState_4)=\varnothing, \RMContextMapping_1(\RMCommonState_3,\RMCommonState_5)=\{1,2\}.
\end{align*}

An expert designs such a mapping by asking questions about the task structure. For instance, for the transition $(\mathsf{q}_1,\mathsf{q}_5)$ in the reward machine in Figure 3, the expert should ask: Is there a transition $(s,a,s')$ in the labeled contextual MDP $\bar{\mathcal{M}}^L$ such that it causes the agent to hit the wall, i.e., ($\mathsf{q}_1,\mathsf{q}_5$), for some context $c$ but lets the agent pass through the door, i.e., ($\mathsf{q}_1,\mathsf{q}_2$), for a different context $c'$? The idea is to find the context parameters $i\in\{1,\dots,\dim(\mathcal{C})\}$ for which a change of value, e.g., $c[i] \neq c'[i]$, prevents a transition $(\mathsf{q},\mathsf{q}')$ in the reward machine from happening. For ($\mathsf{q}_1,\mathsf{q}_5$), the mapping outputs the first context parameter, $\mathsf{F}(\mathsf{q}_1,\mathsf{q}_5)=\{1\}$, as the identifier, since it determines the position of the first door. In other words, when the agent is in the second room and can move into the first door/wall with an up action, then the position of the first door determines whether it moves into the door, or the wall. However, the position of the second door does not identify which transition will happen.

\cref{fig:twodoor_success} demonstrates the progression of the rate of successful task completion in contexts drawn from the target context distribution.

\begin{figure}[h]
\centering
\includegraphics[width=0.6\textwidth]{figures/two_door_discrete_2d_wide_RM_g_vs_Inter_vs_SPDL_vs_DefP_vs_Def_vs_Goal_vs_ALP_expected_success.pdf}
\caption{Two-door Environment: Progression of the successful episodes ratio in contexts drawn from the target context distribution over curriculum updates.}
\label{fig:twodoor_success}
\end{figure}

\subsubsection{Custom Swimmer-v3 Environment}

Case study 2 is based on a variation of the Swimmer-v3 environment from OpenAI gym~\citep{1606.01540}. The original environment consists of a robot that moves like a worm by applying force on 2 joints. The objective is to move towards the right of the initial position as fast as possible. The state and action spaces are 8 and 2-dimensional continuous spaces, respectively. We set the maximum number of steps in an episode to 10000. We design the reward-machine-context mapping $\RMContextMapping_2$ for case study 2 as:
\begin{align*}
    \RMContextMapping_2(\RMCommonState_0,\RMCommonState_0)&=\{2\}, \quad \RMContextMapping_2(\RMCommonState_0,\RMCommonState_1)=\{2\}, \\
    \RMContextMapping_2(\RMCommonState_1,\RMCommonState_1)&=\{1\}, \quad \RMContextMapping_2(\RMCommonState_1,\RMCommonState_2)=\{1\}, \\
    \RMContextMapping_2(\RMCommonState_2,\RMCommonState_2)&=\emptyset.
\end{align*}
\begin{figure}[h]
\centering
\includegraphics[width=0.6\textwidth]{figures/swimmer_2d_narrow_RM_g_vs_Inter_vs_SPDL_vs_DefP_vs_Def_vs_Goal_vs_ALP_expected_return.pdf}
\caption{Customized Swimmer-v3 Environment: Progression of the expected discounted return with respect to the target context distribution.}
\label{fig:swimmer_return}
\end{figure}

\Cref{fig:swimmer_return} demonstrates the progression of the expected discounted return with respect to the target context distribution. \Cref{fig:swimmer_curriculum} illustrates how self-paced RL algorithms update context distribution parameters, i.e., mean and variance of normal distributions, during the training.

\subsubsection{Custom HalfCheetah-v3 Environment}

\begin{figure}[h]
\centering
\includegraphics[width=0.6\textwidth]{figures/half_cheetah_3d_narrow_RM_g_vs_Inter_vs_SPDL_vs_DefP_vs_Def_vs_Goal_vs_ALP_expected_success.pdf}
\caption{Customized HalfCheetah-v3 Environment: Progression of the successful episodes ratio in contexts drawn from the target context distribution over curriculum updates.}
\label{fig:cheetah_success}
\end{figure}

Case study 3 is based on a variation of the HalfCheetah-v3 environment from OpenAI gym~\cite{1606.01540}. The original environment consists of a 2-dimensional robot, shaped like a cheetah. The objective is to make the cheetah run forward, which corresponds to the right of the scene, as fast as possible by applying torque on 6 joints. The state and action spaces are 18- and 6-dimensional continuous spaces, respectively. We set the maximum number of steps in an episode to 2000. We design the reward-machine-context mapping $\RMContextMapping_3$ for case study 3 as:
\begin{align*}
    \RMContextMapping_3(\RMCommonState_0,\RMCommonState_0)&=\{1\}, \RMContextMapping_3(\RMCommonState_0,\RMCommonState_1)=\{1\}, 
    \RMContextMapping_3(\RMCommonState_1,\RMCommonState_1)=\{2\}, \RMContextMapping_3(\RMCommonState_1,\RMCommonState_2)=\{2\}, \\
    \RMContextMapping_3(\RMCommonState_2,\RMCommonState_2)&=\{1\}, \RMContextMapping_3(\RMCommonState_2,\RMCommonState_3)=\{1\}, 
    \RMContextMapping_3(\RMCommonState_3,\RMCommonState_3)=\{3\}, \RMContextMapping_3(\RMCommonState_3,\RMCommonState_4)=\{3\}. 
\end{align*}

To support the discussion of results in customized HalfCheetah-v3, we provide two tables. \Cref{tab:halfcheetah_final_performance} shows the expected discounted return and expected success rate achieved by the final policies in every training run of RM-guided SPRL and Intermediate SPRL. \Cref{tab:halfcheetah_final_context_distribution} provides the means of Gaussian context distributions generated at the final iteration in every training run of RM-guided SPRL and Intermediate SPRL. In the last training run of Intermediate SPRL, the trained agent fails to learn a policy that can complete the task and the curriculum does not converge to the target context distribution.
\Cref{fig:cheetah_success} demonstrates the progression of the rate of successful task completion in contexts drawn from the target context distribution. \Cref{fig:cheetah_curriculum} illustrates how self-paced RL algorithms update context distribution parameters, i.e., mean and variance of normal distributions, during the training.

\begin{table}[t]
\centering
%\resizebox{.95\columnwidth}{!}{
\caption{Customized HalfCheetah-v3: Expected discounted returns and success rates achieved by policies from the final iteration of every training run.}
\begin{tabular}{ccccccccccc}\toprule
 Algorithm & Seed 1  & Seed 2 & Seed 3 & Seed 4 & Seed 5 & Seed 6 & Seed 7 & Seed 8 & Seed 9 & Seed 10 \\ \midrule
   \multirow{2}{*}{RM-guided SPRL}
   & 455.37  & 472.31  & 479.13  & 506.75  & 446.76 & 472.85  & 501.31  & 467.63  & 507.10  & 499.94  \\
   & 100\% & 100\% & 100\% & 100\% & 99\% & 100\% & 100\% & 100\% & 100\% & 100\%\\
   \midrule
  \multirow{2}{*}{Intermediate SPRL}
   & 475.57 & 488.74 & 467.48 & 475.58 & 492.25 & 509.89 & 485.06 & 493.68 & 492.13 & \textbf{-0.47}   \\
   & 100\% & 100\% & 100\% & 100\% & 100\% & 100\% & 100\% & 100\% & 100\% & \textbf{0\%}\\
 \bottomrule
\end{tabular}
\label{tab:halfcheetah_final_performance}
\end{table}

\begin{table}[t]
\centering
%\resizebox{.95\columnwidth}{!}{
\caption{Customized HalfCheetah-v3: Means of Gaussian context distributions generated at the final iteration of every training run. Note that the mean of the target context distribution is 
$(4, 7, 10)$.}
\begin{tabular}{ccccccccccc}\toprule
 Algorithm & Seed 1  & Seed 2 & Seed 3 & Seed 4 & Seed 5 & Seed 6 & Seed 7 & Seed 8 & Seed 9 & Seed 10 \\ \midrule
   \multirow{3}{*}{RM-guided SPRL}
   & 3.99  & 3.99  & 4.00  & 4.00  & 4.00 & 4.00  & 4.00  & 3.99 & 3.99 & 3.99 \\
   & 7.00 & 6.99 & 6.99 & 7.00 & 6.99 & 7.00 & 7.00 & 6.99 & 6.99 & 7.00 \\
  & 9.99  & 9.99 & 9.99 & 9.99 & 9.99 & 9.99 & 9.99 & 9.997 & 10.00 & 9.99\\
   \midrule
  \multirow{3}{*}{Intermediate SPRL}
   & 3.98 & 3.99 & 3.96 & 3.95 & 3.99 & 3.98 & 3.98 & 3.98 & 3.98 & \textbf{0.11} \\
   & 6.98 & 6.98 & 6.97 & 6.96 & 6.99 & 6.97 & 6.99 & 6.97 & 6.98 & \textbf{1.44} \\
  & 9.98 & 9.99 & 9.94 & 9.96 & 9.99 & 9.98 & 9.99 & 9.97 & 9.99 & \textbf{3.74}\\
 \bottomrule
\end{tabular}
\label{tab:halfcheetah_final_context_distribution}
\end{table}

\begin{figure}[t]
\centering
    \begin{subfigure}{.42\linewidth}
    \centering
    \includegraphics[width=\linewidth]{figures/swimmer_2d_narrow_RM_guided_SPRL_vs_Intermediate_SPRL_vs_SPDL_curriculum_progression.pdf}
    \caption{Customized Swimmer-v3 environment.}
    \label{fig:swimmer_curriculum}
    \end{subfigure}
~
    \begin{subfigure}{.42\linewidth}
    \centering
    \includegraphics[width=\linewidth]{figures/half_cheetah_3d_narrow_RM_guided_SPRL_vs_Intermediate_SPRL_vs_SPDL_curriculum_progression.pdf}
    \caption{Customized HalfCheetah-v3 environment.}
    \label{fig:cheetah_curriculum}
    \end{subfigure}
\caption{Progression of the statistics (mean and variance) of context distributions generated in the curriculum.}
\end{figure}
\clearpage
\bibliography{koprulu_587}

\end{document}