%% -----------------------------------------------------------------------------
\clearpage
\appendix
\onecolumn
\fontsize{10pt}{11pt}\selectfont

\etocsettocdepth{1}
\part{Appendix} % Start the appendix part
\begingroup
\parindent=0em
\etocsettocstyle{}{}
\localtableofcontents % Insert the document TOC
\endgroup

\newpage



\lsection{Suitcase Lock Implementation}
\label{appendix:suitcase-impl}
Each Suitcase Lock problem instance has $N$ dials, each with $M$ digits, and $2N$ actions, half of which increment a deterministic subset of the dials (modulo $M$), and half of which decrement the same dials (see Figure~\ref{fig:suitcase-results}).
Let $k_i$ denote the effect size of action $a_i$, and $\bar{k}$ denote the mean effect size across all actions.
Given a $\bar{k}$, we generate problem instances with different start states, goal states, and sets of actions such that they have mean effect size $\bar{k}$.

We always ensure that every state can be reached from every other state.
Note that if $\bar{k}=N$, or if all actions modify (for example) an even number of state variables, it is not possible to reach every state from every other state.
To circumvent this issue, we check that for a given problem instance, the increment and decrement action sets can each be reduced to an $N\times N$ binary matrix with full rank.
We repeatedly generate action sets with the desired mean effect size until we find one that satisfies this condition.
The resulting action sets are therefore different for each random seed, except when $\bar{k}=1$ where we always use the identity matrix $I$, and when $\bar{k}=(N-1)$ where we use $1-I$ with an extra 1 added to the first diagonal element to break symmetry.
The decrement actions are always the negation of the increment actions, and we ignore them for $M=2$.

\newpage

% \lsection{Macro-Learning Procedure}
% \label{appendix:algorithm}
% \begin{algorithm}[h!]
% \caption{Learn macro-actions with focused effects}
% \label{alg:algorithm}
% \textbf{Input}: Starting state $s_0$, number of macro-actions $N_M$, number of repetitions $R_M$, search budget $B_M$\\
% % \textbf{Parameter}: Optional list of parameters\\
% \textbf{Output}: List of macro-actions $L_M$\\
% \begin{algorithmic}[1] %[1] enables line numbers
% \STATE \textbf{Define} $g(m) := \mathrm{length}(m) $\\
% \STATE \textbf{Define} $h(s) := \left\{
% 	\begin{array}{ll}
% 	    |\mathrm{net\_effects}(s-s_0)| & \mbox{if } > 0,\\
% 		\infty  & \text{otherwise}
% 	\end{array}
% \right.$
% \STATE \textbf{Define} $f(s,m) := g(m) + h(s)$\\
% where $m$ is the macro (i.e. action sequence) from $s_0$ to $s$\\
% \hspace{10px}
% \STATE Let $L_M$ be an empty list of macro-actions
% \STATE Let $Q$ be a (max) priority queue of size $N_M/R_M$
% \FOR{repetition $r$ in $\{1,...,R_M\}$}
% \STATE Run best-first search (BFS) from $s_0$ with budget $B_M/R_M$, minimizing heuristic $f(s,m)$
% \FOR{each state $s_i$ and macro $m_i$ visited by BFS}
% \STATE Store $m_i$ in $Q$, with priority $h(s_i)$\\
% // When $Q$ becomes full, the action sequences\\
% // with largest $h$-score will get evicted first
% \ENDFOR
% \STATE Add each unique macro in $Q$ to $L_M$
% \STATE Clear $Q$
% \STATE $s_0 \gets$ new random state, such that none of the macros in $L_M$ can run
% \IF {$s_0$ is None}
% \STATE \textbf{break}
% \ENDIF
% \ENDFOR
% \STATE \textbf{return} $L_M$
% \end{algorithmic}
% \end{algorithm}

% Here we use $m_i$ to denote the macro consisting of the action sequence that BFS used to generate state $s_i$. Consider the following example with two primitive actions $a_1$ and $a_2$, where BFS starts at state $s_0$. Expand $s_0$: $a_1 \rightarrow$ generates $s_1$; $a_2 \rightarrow$ generates $s_2$. Expand $s_1$: $a_1 \rightarrow$ generates $s_3$; $a_2 \rightarrow$ generates $s_4$. Then macro $m_4$, corresponding to state $s_4$, would be the action-sequence $[a_1, a_2]$.

% For a planning problem with $|S|$ states and $|T|$ valid state transitions, this algorithm has time complexity $O(\min (B_M, |S|+|T|))$, and space complexity $O(N_M + \min(B_M, |S|))$.

% \clearpage

\lsection{Simulator Details}
\label{appendix:simulator-details}
\subsection{PDDLGym}
We use the PDDLGym library \cite{silver2020pddlgym} to automatically construct black-box simulators for PDDL planning problems.
State information is represented as a variable-length list of currently-true literals.
The planning agent has access to this state information, along with the goal (represented as a conjunction of literals), the action applicability function, and the simulator function.
We chose a representative set of PDDL problems and executed uniform random actions to generate 100 unique random starting states for each, keeping the goal fixed.
The associated \texttt{.pddl} files can be found in the code repository.

\subsection{15-Puzzle}
The 15-puzzle is a $4\times4$ grid of 15 numbered, sliding tiles and one blank space (see Figure \ref{fig:npuzzle}).
The puzzle begins in a scrambled configuration, and the objective is to slide the tiles until the numbers are arranged in increasing order.
There are approximately $10^{13}$ states and the worst-case shortest solution requires 80 actions \cite{brungger1999parallel}.
Our simulator uses a state representation with 16 variables (for the positions of each tile and of the blank space), and 48 primitive actions (that swap the blank space with one of the adjacent tiles), of which only 2--4 can be applied in each state.
Similarly, macro-actions can only run if they begin with the correct blank space location.

We set the macro-learning budget $B_M = \num{32000}$ simulator queries, the number of macros $N_M = 192$, and the number of repetitions $R_M = 16$.
The budget was chosen to approximately match the number of steps required to solve one problem instance with primitive actions.
This resulted in $12$ generated macros per repetition, and a per-repetition simulator budget of $\num{2000}$ state transitions.
We compared these macro-actions against $192$ ``random'' macro-actions of the same lengths, which were generated (for each random seed) by selecting actions uniformly at random from the valid actions at each state.

We then solved the 15-puzzle using greedy best-first search with the goal-count heuristic and a simulation budget of $B_S = \num{500000}$ state transitions.
We generated 100 unique starting states by scrambling the 15-puzzle with uniform random actions for either $225$ or $226$ steps, with equal probability (to ensure that we see all possible blank space locations).
The resulting puzzles can be found in the code repository.

\begin{note} \textbf{On Finding States Where Macro Preconditions Do Not Apply}
\label{note:finding-macro-start-states}

\emph{As mentioned in Section \ref{sec:macro-learning}, the macro-learning procedure includes an option to repeat the search $R_M$ times from new starting states where the previously-discovered macros are not applicable. In general, finding such a state can be as hard as planning, although it might be easier if there is no requirement for generating a plan, e.g. by resetting the simulator to generate a new starting state. Some environment implementations do not allow resetting to arbitrary states, and, in those cases, a plan must be generated. 15-puzzle is the only domain where we use $R_M > 1$, and for this domain, we found that either state generation strategy (random walk or simulator resets) was effective. For domains where the simulator cannot be reset, and where a random walk is insufficient, it is possible to make the search more informed, such as by incorporating state novelty into the heuristic.}
\end{note}

\subsection{Rubik's Cube}
The Rubik's cube is a $3\times3\times3$ cube with colored stickers on each outward-facing square (see Figure \ref{fig:cube}).
The puzzle begins in a scrambled configuration, and the objective is to rotate the faces of the cube until all stickers on each face are the same color.
There are approximately $4.3\times10^{19}$ states, and the worst-case shortest solution requires 26 actions \cite{rokicki2014gods}.
Our simulator fixes a canonical orientation of the cube, and uses a 48-state-variable representation (for the positions of each colored square, excluding the stationary center squares).
The problem has 12 primitive actions (i.e. rotating each of the 6 faces by a quarter-turn in either direction), and these actions are highly non-focused: each modifies 20 of the 48 state variables.

\begin{figure}[b]
\centering
\begin{subfigure}{.2\textwidth}
  \centering
  \includegraphics[width=.7\textwidth]{images/npuzzle}
  \caption{15-Puzzle}
  \label{fig:npuzzle}
\end{subfigure}%
\quad\quad
\begin{subfigure}{.2\textwidth}
  \centering
  \includegraphics[width=.61\textwidth]{images/cube}
  \caption{Rubik's Cube}
  \label{fig:cube}
\end{subfigure}
\caption{Visualizations of the planning domains that use domain-specific simulators}
\label{fig:domains}
\end{figure}

We set the number of learned macro-actions $N_M=576$ so that we could fairly compare the generated macro-actions against our set of expert macro-actions.
We learned macro-actions from a single starting state (i.e., $R_M=1$), and set a simulation budget of $B_M=\num{1000000}$ simulator queries.
We also compared against $576$ ``random'' macro-actions of the same lengths as the expert macros (six distinct macro-actions plus their corresponding variations), which were regenerated for each random seed.
We set the search budget $B_S = \num{2000000}$ simulator queries.

We obtained starting states for Rubik's cube from modified versions of the 100 hardest problems from \textcite{buchner2018abstraction}.
The problems were specified as random sequences of primitive actions to be applied to a solved Rubik's cube in order to generate the starting state, as well as a corresponding \textrm{SAS}$^+$ representation for each problem.
The original \citeauthor{buchner2018abstraction} problems incorporated 18 half-turn and quarter-turn action primitives, whereas our simulator uses only 12 quarter-turn action primitives.
Our modification removed the 6 half-turn actions from the \textrm{SAS}$^+$ representation and converted problem specifications involving half-turns to their equivalent quarter-turn-only specifications.
The resulting problems consisted of between 12 and 29 primitive actions, with an average of about 20.
(We also tried generating starting states by scrambling the cube with uniform random actions for $60$ steps, with similar results.) The problems we use, and the procedure we use to generate randomly scrambled starting states, can be found in the linked code repository.

\newpage

\lsection{Updating the Simulator with Macros}
\label{appendix:updating-sim}
\subsection*{PDDLGym}
For the PDDLGym simulators, we build new macro-operators for the saved primitive-action sequences by:
\begin{enumerate}
    \item Re-binding the original lifted parameters to new variables that capture any dependencies between subsequent actions.
    For example, the sequence [\texttt{PLACE\_ON(B,C)}, \texttt{PLACE\_ON(A,B)}] would result in two distinct parameters for objects \texttt{A} and \texttt{C}, plus a third, shared parameter for object \texttt{B} that is reused by both primitives.
    \item Combining the preconditions of subsequent primitive actions when they are not already met by the effects of previous primitive actions.
    For example, if \texttt{ACTION1} has precondition \texttt{(A and B)} and effect \texttt{C}, and \texttt{ACTION2} has precondition \texttt{(C and D)}, this would result in the combined precondition \texttt{(A and B and D)}.
    \item Combining and simplifying the effects of the primitive actions to remove unnecessary negations.
    For example, if the combined precondition so far is \texttt{A}, and if \texttt{ACTION1} has effect \texttt{(B and (not A))} and \texttt{ACTION2} has effect \texttt{(C and A)}, this would result in a combined effect of \texttt{(B and C)}, since \texttt{A} is already a precondition.
\end{enumerate}

\noindent We present pseudocode in Algorithm \ref{alg:update-sim}, and the implementation and the resulting macro-augmented PDDL files can be found in the code repository.
% at \textsl{pddlgym/pddlgym/generated-pddl/*/macros-gen.pddl}.

Note that while the desired number of macro-actions for all PDDLGym domains was set to $N_M=8$, we were only able to find four unique macros for the \texttt{doors} domain.

\setcounter{algorithm}{1}
\begin{algorithm}[ht]
\caption{Construct lifted macro for PDDLGym}
\label{alg:update-sim}
\textbf{Input}:

\indent \textsl{actions}, a sequence of grounded primitive actions\\
\indent \textsl{operators}, map from names to lifted primitive operators\\
\textbf{Output}:

\textsl{macro}, a newly-constructed, lifted macro-operator

\begin{algorithmic}[1] %[1] enables line numbers
\STATE \textsl{macro.params} := $\emptyset$
\STATE \textsl{macro.preconds} := $ \emptyset$
\STATE \textsl{macro.effects} := $ \emptyset$
\STATE \textsl{lifted} := map from grounded to lifted variable names
\FOR{\textsl{action} in \textsl{actions}}
    \STATE \textsl{op} := \textsl{operators}[\textsl{action.name}]
    \STATE \textsl{lifted}.update(\{ \textsl{v} $\mapsto$ new\_variable\_name(), for \textsl{v} in \textsl{action.variables} if \textsl{v} not in \textsl{lifted}\})
    % \STATE Add any new variables from \textsl{action} to \textsl{lifted} and give them new lifted names
    \STATE \textsl{binding} := \{\textsl{p} $\mapsto$ \textsl{lifted}[\textsl{v}], for (\textsl{p, v}) in zip(\textsl{op.params}, \textsl{action.variables})\}
    \hspace{\fill}
    \FOR{\textsl{p} in \textsl{op.params}}
        \STATE \textsl{macro.params}.add( \textsl{binding}[\textsl{p}] )
    \ENDFOR
    \hspace{\fill}
    \FOR{\textsl{literal} in bind\_literals(\textsl{op.preconds}, \textsl{binding})}
        \IF{\textsl{literal} not in \textsl{macro.effects} and \textsl{literal} not in \textsl{macro.preconds}}
            \STATE \textsl{macro.preconds}.add(\textsl{literal})
        \ENDIF
    \ENDFOR
    \hspace{\fill}
    \STATE cleanup\_contradictory\_effects(\textsl{op.effects})\\
    // Simplify any contradictory effects to just their positive part, e.g. \texttt{((not A) and A)} becomes \texttt{(A)}
    \hspace{\fill}
    \FOR{\textsl{literal} in bind\_literals(\textsl{op.effects}, \textsl{binding})}
        \IF{(\textsl{$\neg$literal}) in \textsl{macro.effects}}
            \STATE \textsl{macro.effects}.remove(\textsl{$\neg$literal})
        \ELSE
            \STATE \textsl{macro.effects}.add(\textsl{literal})
        \ENDIF
    \ENDFOR
\ENDFOR
\RETURN \textsl{macro}
\end{algorithmic}
\end{algorithm}

% For a macro with $|A|$ primitive actions consisting of $|P|$ total parameters, where each action has at most $|C|$ preconditions and $|E|$ effects, this algorithm has time (and space) complexity $O(|A|\cdot(|P|+|C|^2+|C||E|+|E|^2))$.

\subsection*{Domain-Specific Simulators}
For 15-puzzle and Rubik's cube, both simulators use a position-based representation (i.e. the positions of each numbered tile or blank space; the positions of each colored sticker excluding the stationary center stickers).
Primitive actions are expressed as permutation operations on the indices of the state variables.

To augment the simulator with macro-actions, we compute the overall permutation for each sequence of primitive actions, and store the result (along with its precondition, if any) as a new permutation operation that the simulator can apply using the same procedure it uses for primitive actions.

In the case of Rubik's cube, none of the primitive actions have preconditions, so the resulting macros do not have preconditions either.
However, for 15-puzzle, primitive-action preconditions depend on the position of the blank space.
Fortunately, since we only construct macros for valid action sequences and since actions deterministically modify the position of the blank space, as long as the initial precondition is satisfied, each action will automatically satisfy the precondition of the next action in the sequence.
Thus, when saving each 15-puzzle macro-action, we simply keep track of the blank-space location required to execute its first primitive action, along with its overall permutation.

The code to generate the overall permutation of a 15-puzzle or Rubik's cube macro-action can be found in the corresponding module in the code repository.

\newpage

\lsection{Expert Rubik's Cube Macros}
\label{appendix:expert-macros}
We use the following expert macro-actions (expressed in standard cube notation \cite{singmaster1981notes}):
\begin{itemize}
    \item 3-corner swap (see Figure \ref{fig:cube-expert}): $L'\,B\,L\,F'\,L'\,B'\,L\,F$
    \item 3-edge swap, middle: $L'\,R\,U\,U\,R'\,L\,F\,F$
    \item 3-edge swap, face: $R\,R\,U\,R\,U\,R'\,U'\,R'\,U'\,R'\,U\,R'$
    \item 2-corner rotate: $R\,B'\,R'\,U'\,B'\,U\,F\,U'\,B\,U\,R\,B\,R'\,F'$
    \item R-permutation: $F\,F\,R'\,F'\,U'\,F'\,U\,F\,R\,F'\,U\,U\,F\,U\,U\,F'\,U'$
    \item 2-edge flip: $L\,R'\,F\,L\,R'\,D\,L\,R'\,B\,L\,R'\,U\,U\,L\,R'\,F\,L\, R'\,D\,L\,R'\,B\,L\,R'$
\end{itemize}
To generate the full set of 576 expert macro-actions, we consider 96 variations of each of the above sequences, including all 24 possible orientations, along with their inverse and mirror-flipped versions.

The learned 3-pair-swap macro in Figure \ref{fig:cube-learned} was not included with the expert macro-actions. We also provide its action sequence for completeness.
\begin{itemize}
    \item 3-pair-swap (see Figure \ref{fig:cube-learned}): $F'\,L\,F'\,L'\,F\,F\,R\,U'\,R'\,F'\,U\,F$ 
\end{itemize}


% \newpage

% \lsection{Additional Figures and Tables}
% \label{appendix:figures-and-tables}



