% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}
%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{amsfonts}

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
% \externaldocument{main}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)
\usepackage{amsthm}
%% Self-defined macros
% Shorthand for running text. The expansion ends in a space, so it is meant
% to be followed directly by the next word (`\sota planner'); writing
% `\sota.' would leave a gap before the period.
\newcommand{\sota}{state-of-the-art }

% Theorem-like environments (amsthm). Each one has its own counter.
\newtheorem{theorem}{Theorem}
% With the counter preset to 3, the first theorem in this file is numbered 4.
% NOTE(review): presumably this continues the numbering of the main paper --
% confirm against the main text.
\setcounter{theorem}{3}
\newtheorem{corollary}{Corollary}
\newtheorem{lemma}{Lemma}
%\newtheorem*{proof}{Proof}
\newtheorem{definition}{Definition}
\newtheorem{proposition}{Proposition}

% Names of the baseline model. \textsc{...} is used instead of the obsolete
% {\sc ...} font switch.
\newcommand{\symnet}{\textsc{SymNet2.0}}
\newcommand{\symnetET}{{Symnet2.0}}
\newcommand{\symnetD}{{Symnet2.0-D}}
\newcommand{\symnetone}{{SymNet}}
\newcommand{\symnetNotZero}{\textsc{SymNet2}}

% Angle-bracketed tuple; the argument is typeset between \langle and \rangle,
% so the macro must be used in math mode.
\newcommand{\obtuple}[1]{{\langle #1 \rangle}}
% Short name of the proposed model (used in the result tables).
\newcommand{\ModelNetNotZero}{\textsc{SymNet3}}


% \newcommand{\obtuple}[1]{{\langle #1 \rangle}}

% Full name of the proposed model.
\newcommand{\ModelNet}{\textsc{SymNet3.0}}

% Inline author notes. Each macro prints its argument in square brackets,
% prefixed by a tag, in its own colour. The outer group keeps the colour
% change local to the note.
\newcommand{\vscom}[1]{%
  {\color{violet}{[VS: #1]}}%
}
\newcommand{\pscom}[1]{%
  {\color{red}{[PS: #1]}}%
}
\newcommand{\macom}[1]{%
  {\color{green}{[MA: #1]}}%
}
\newcommand{\dacom}[1]{%
  {\color{magenta}{[DA: #1]}}%
}
\newcommand{\todo}[1]{%
  {\color{yellow}{[ToDo: #1]}}%
}
\newcommand{\cam}[1]{%
  {\color{violet}{[Cam: #1]}}%
}
% \usepackage{paralist} % for inparaenum
%  END author1
% Template example: prints #3, then the optional separator #1 (default `-'),
% then #2.
\newcommand{\swap}[3][-]{%
  #3#1#2%
}

\title{SymNet 3.0: Exploiting Long-Range Influences in Learning\\Generalized Neural Policies for Relational MDPs \\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{Vishal Sharma\thanks{Equal Contribution}}
\author[1]{Daman Arora$^*$}
\author[1]{Mausam}
\author[1]{Parag Singla}
% Add affiliations after the authors
\affil[1]{%
    Indian Institute of Technology Delhi \{vishal.sharma, cs5180404, mausam, parags\}@cse.iitd.ac.in
}
  
  \begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle
% \section{Supplement}

\section{Proofs}
% \subsubsection{Proof for Theorem 3}

\begin{theorem}

\label{thm:sinet_more_policy}
% \ModelNet{} can represent certain policies that \symnet{} can not represent.
For a node $n$ in the influence graph, let $L(n, k)$ denote the multi-set of node features of nodes that are exactly $k$ hops away from node $n$ in the influence graph. In reference to Theorem~1, given the features of nodes $n_1$ and $n_2$, if there exists a $k > 0$ such that $L(n_1, k) \neq L(n_2, k)$, then, given a sufficiently powerful attention function
% and projection matrices of appropriate dimensions,
\ModelNet{} has the power to learn the parameters that break the symmetry induced between $s_1$ and $s_2$ which have the features of nodes $n_1$ and $n_2$ swapped.
% Recall from Theorem~\ref{thm:state_rep} that SymNet2.0 will not be able to break this symmetry among $s_1$ and $s_2$.
% \ModelNet{} can generate different embeddings for $n_1$ and $n_2$ when their features are swapped. 
\end{theorem}

\begin{proof}[Proof (Sketch)]
The high-level intuition of the proof is that there will exist certain ``key-nodes'' in the graph which have unique features. For example, in the Navigation domain, it will be the Goal location. For the Pizza domain, it would be the location of the Pizza and the Customer. If a node is using distance from these key-nodes, then it is possible to break symmetries induced by a fixed-depth GAT. The more the number of key-nodes, the easier it is to break the symmetry. The formal proof is as follows:


Let $C_{f,n,d}$ denote the number of times $f$ occurs in $L(n, d)$. As $L(n_1, k) \neq L(n_2, k)$,  $\exists f'$, s.t. $C_{f',n_1,k} \neq C_{f',n_2,k}$.
With a sufficiently powerful attention function, in the state $s_1$, node $n_1$ can focus attention on $f'$ to learn a node embedding different from that of $n_2$.
% Next, the only difference between $s_1$ and $s_2$ is the swapping of features $f_1$ and $f_2$, as $f' \notin L(n_2, k)$ so the node $n_2$ in $s_2$ will now learn a different node embedding than node $n_1$ in $s_1$. Thus breaking the symmetry.
% Next, in the state $s_2$, as $f' \notin L(n_2, k)$ and the only difference between $s_1$ and $s_2$ is the swapping of features $f_1$ and $f_2$, so the node $n_2$ in $s_2$ will now learn a different node embedding than node $n_1$ in $s_1$. Thus breaking the symmetry.

We construct one such set of parameters for \ModelNet{} that will break the symmetry among $s_1$ and $s_2$ with respect to nodes $n_1$ and $n_2$.
\begin{enumerate}
    \item $GAT_{pre}$ can learn an identity mapping for each node by focusing all attention on itself and those nodes in its neighbourhood that have exactly the same features as itself while ignoring all other neighbours.
    
    % \item Since $L(n_1, k) \cap L(n_2, k) \neq \phi$, we can assume, without loss of generality their exists $f'$, s.t. $f' \in L(n_1, k)$ and $f' \notin L(n_2, k)$.  Consider the following (un-normalized) attention function.
    % \item As $L(n_1, k) \neq L(n_2, k)$, without loss of generality, let their be a $f'$, s.t. $f' \in L(n_1, k)$ and $f' \notin L(n_2, k)$.
%     \[
%     e(f_i, f_j, d_{ij}) = \begin{cases} 
%       0 & d = 0 \\
%       0 & f_i=f_1, f_j=f', d_{ij}=k \\
%       -INF & \text{otherwise}
%    \end{cases}
% \]
    \item Next, consider the following (un-normalized) attention function in the influence layer.
    \[
    e(f_i, f_j, d_{ij}) = \begin{cases} 
      0 & d_{ij} = 0 \\
      0 & f_j=f', d_{ij}=k \\
      -INF & \text{otherwise}
   \end{cases}
\]
where $INF$ is a very large positive number. 

    % \item With a similar process as mentioned for $GAT_{pre}$, $GAT_{post}$ can also learn an identity mapping.
    \item Next, $GAT_{post}$ can also learn an identity mapping (similar to $GAT_{pre}$).
\end{enumerate}
The above parameters ensure that, in the influence layer, any given node gives a non-zero attention weight (after normalization) to itself and to any other node at a distance $k$ having features $f'$.
In state $s_1$, $n_1$'s attention is spread over $n_1$ and those nodes at a distance $k$ that have $f'$ as their features. Therefore the influence embedding for $n_1$ in $s_1$ will be $\frac{C_{f',n_1,k}}{C_{f',n_1,k}+1}*k$. Similarly for $n_2$ in $s_2$, it will be $\frac{C_{f',n_2,k}}{C_{f',n_2,k}+1}*k$. Since $C_{f',n_1,k} \neq C_{f',n_2,k}$, the embeddings will be different when the features are swapped, thus breaking the symmetry among $n_1$ in $s_1$ and $n_2$ in $s_2$.
\end{proof}

\textbf{An example of attention function in the influence layer that can break symmetry:}
Additionally, we also provide an explicit construction of attention weights of the influence layer, that is independent of $f'$. Assume that the features of nodes come from a finite ordered set $F$ and there exists a function $idx:F\rightarrow \mathbb{N}$ that returns the index of a feature in the ordered set $F$. Consider the un-normalized attention function for $m, n \geq 1$,
    \begin{equation}
    a_{m,n}(f_i, f_j, d_{ij}) = \begin{cases}
        0 & d_{ij} = 0\\
        0 & idx(f_j) = m  \text{ and } d_{ij}=n \\
        -INF & \text{otherwise}
    \end{cases}
\end{equation}


where $-INF$ is a very large negative number. 
Since the influence layer has multi-head attention, we can assign each head a different attention function. Specifically, we assign $a_{m,n}$ to the $(n|F|+m)^{th}$ attention head. Note that if a graph has $|G|$ nodes, this ensures there are at most $|G||F|$ attention heads.

Note that given these attention heads, it is possible to encode the multi-set of neighbours at a distance $k$ in the influence embedding! Let $C_{f, n, d}$ denote the number of times feature $f$ occurs in the $d$-hop neighbourhood of node $n$. If we're given that $L(n_1, k) \neq L(n_2, k)$, we can say that $\exists f'\in F$ such that $C_{f',n_1,k} \neq C_{f',n_2,k}$.

Consider the embedding of node $n_1$ in state $s_1$, specifically the $a_{idx(f'),k}^{th}$ attention head which would correspond to the $(k|F|+idx(f'))^{th}$ element of the influence embedding. Equal attention would be spread over $n_1$ and $C_{f',n_1,k}$ nodes. Therefore the aggregated distance would be $\frac{C_{f',n_1,k}}{1 + C_{f',n_1,k}}*k$. Correspondingly for $n_2$ in $s_2$, this element would be $\frac{C_{f',n_2,k}}{1 + C_{f',n_2,k}}*k$. Since $C_{f',n_1,k} \neq C_{f',n_2,k}$ the embedding for $n_1$ in $s_1$ would not equal the embedding for $n_2$ in $s_2$.
A similar argument can be made for $n_2$ in $s_1$ and $n_1$ in $s_2$. In practice this kind of a construction would require the dimension of the heads to scale with the size of the graph, however this is an exaggeration in the practical setting. In practical domains, there are only a fixed-small number of key features, and just considering the distance from them is sufficient for computing the policy. 
\section{RDDL Example}
The IPPC domain of Navigation is a 2D grid world where a robot has to reach a goal cell. Each cell in the grid has a death probability with which the robot can die. The agent receives a $+1$ reward for reaching the goal and $0$ otherwise.

\textbf{Object Types:} \texttt{x, y}

\textbf{Non-Fluents:} \texttt{north(y, y), south(y, y), east(x, x), west(x, x), min\_x(x), max\_x(x), prob(x, y), goal(x, y)}

\textbf{State-Fluents:} \texttt{robot\_at(x, y)}

\textbf{Actions:} \texttt{move\_north, move\_south, move\_east, move\_west}
\section{Experimental Details}
\begin{itemize}
    \item \textbf{Data generation:} For each domain, we generate $1000$ training, $100$ validation, and $200$ test instances with size increasing from train to val to test instances.
Similar to \symnet, we use the state-of-the-art online planner PROST and generate $30$ trajectories for each training instance using the default settings.
\item \textbf{Architectural Details}: For our experiments with \symnet, we use a GAT with depth 4, having shared weights across layers, each layer having 10 attention heads. For \ModelNet, both the pre-processing and post-processing GATs are of depth 2 and have 10 attention heads, with shared weights. For \ModelNet, the influence layer uses 10 attention heads. The final node embedding dimension for both models is 20, and action decoders used are MLPs with 1 hidden layer of dimension 20. 
\item \textbf{Training details}: We train all models for $48$ hours on a K40 GPU using imitation learning for LR domains and for $24$ hours for IPPC domains. Each checkpoint is evaluated on validation instances and we pick the one with the best average for testing.
\end{itemize}

% \subsection{Attention Maps}

\begin{figure*}[!ht]
    \centering
      \includegraphics[clip,scale=0.55]{images/nav_kl.png}%
      \includegraphics[clip,scale=0.55]{images/recon_kl.png}%
\caption{(left) Figure shows the attention map averaged across all heads for the robot's location computed by \ModelNet+KL for the DNav domain. We note that the attention heads focus on the corners of the grid helping in the localization of all nodes.
(right) Figure shows the attention map averaged across all heads for the robot's location computed by \ModelNet+KL for the SRecon domain. Here, 0 and 1 denote the object 0 and object 1. We note that the attention heads focus on one of the corners of the grid.}
\label{fig:results_coverage}
\end{figure*}
\begin{figure*}[!ht]
    \centering
      \includegraphics[clip,scale=0.55]{images/corridor_kl.png}%
      \includegraphics[clip,scale=0.55]{images/stochastic_wall_kl.png}
\caption{(left) Figure shows the attention map averaged across all heads for the robot's location computed by \ModelNet+KL for the StNav domain. Here, the probability of death of each cell is written on the cell. We note that the attention is focused on the entrance of the column which is safest.
(right) Figure shows the attention map averaged across all heads for the robot's location computed by \ModelNet+KL for the StWall domain. Here, the attention is focused on the goal and the cells near the safe passage cell.}
\label{fig:results_coverage_2}
\end{figure*}


% \begin{figure*}
%     \begin{minipage}[b]{.25\linewidth}
%     \centering
%     \includegraphics[scale=0.3]{images/nav_kl.png}
%     \caption
%       {%
%         The attention map of the influence-layer in the Pizza domain for the R node (see Results for details).
%       }
%   \end{minipage}\hfill
%   \begin{minipage}[b]{.25\linewidth}
%     \centering
%     \includegraphics[scale=0.3]{images/recon_kl.png}%
%     \caption
%       {%
%         % The attention map of the influence-layer in the Pizza domain (see Results for details).
%         The attention map of the influence-layer in the Pizza domain for the R node (see Results for details).
%         \label{fig:attention}%
%       }%
%   \end{minipage}
%   \begin{minipage}[b]{.25\linewidth}
%     \centering
%     \includegraphics[scale=0.3]{images/corridor_kl.png}%
%     \caption
%       {%
%         % The attention map of the influence-layer in the Pizza domain (see Results for details).
%         The attention map of the influence-layer in the Pizza domain for the R node (see Results for details).
%         \label{fig:attention}%
%       }%
%   \end{minipage}
%   \begin{minipage}[b]{.25\linewidth}
%     \centering
%     \includegraphics[scale=0.3]{images/stochastic_wall_kl.png}%
%     \caption
%       {%
%         % The attention map of the influence-layer in the Pizza domain (see Results for details).
%         The attention map of the influence-layer in the Pizza domain for the R node (see Results for details).
%         \label{fig:attention}%
%       }%
%   \end{minipage}
% %   \caption{Test}
% \end{figure*}
\newpage
\newpage


\section{Detailed Results and Attention Maps }
The detailed results of the experiments for each run of the various models are shown in Table~\ref{tab:full_LR_results} for the LR domains and in Table~\ref{tab:full_IPPC_results} for the IPPC domains.


\begin{table*}[!ht]
\centering
    \begin{tabular}{l|cccccc|c}
        \toprule
        \textbf{Model} & \textbf{SRecon} & \textbf{Pizza} & \textbf{DNav} & \textbf{StWall} & \textbf{EAcad} & \textbf{StNav} & \textbf{Mean} \\
        \midrule
        PROST           & 0.34            & 0.09           & 0.94          & 0.69            & 0.37           & 0              & 0.67          \\ \\
\symnetNotZero[1]       & 0.49            & 0.22           & 0.74          & 0.26            & 0.89           & 0              & 0.63          \\
\symnetNotZero[2]       & 0.47            & 0.35           & 0.57          & 0.23            & 0.89           & 0.01           & 0.56          \\
\symnetNotZero[3]       & 0.49            & 0.27           & 0.46          & 0.27            & 0.9            & 0              & 0.54          \\
\symnetNotZero[4]       & 0.43            & 0.11           & 0.43          & 0.31            & 0.9            & 0.13           & 0.55          \\
\symnetNotZero[5]       & 0.47            & 0.33           & 0.57          & 0.27            & 0.9            & 0.03           & 0.58          \\ \\
\ModelNetNotZero-KL[1]    & 0.63            & 0.65           & 0.86          & 0.47            & 0.95           & 0              & 0.76          \\
\ModelNetNotZero-KL[2]    & 0.63            & 0.43           & 0.83          & 0.31            & 0.72           & 0.2            & 0.62          \\
\ModelNetNotZero-KL[3]   & 0.73            & 0.69           & 0.87          & 0.42            & 0.81           & 0              & 0.7           \\
\ModelNetNotZero-KL[4]    & 0.7             & 0.64           & 0.8           & 0.24            & 0.9            & 0.08           & 0.65          \\
\ModelNetNotZero-KL[5]    & 0.72            & 0.66           & 0.86          & 0.24            & 0.95           & 0.09           & 0.68          \\ \\
\ModelNetNotZero+KL${_D}$[1] & 0.64            & 0.42           & 0.96          & 0.44            & 0.96           & 0.43           & 0.79          \\
\ModelNetNotZero+KL${_D}$[2]& 0.55            & 0.82           & 0.91          & 0.31            & 0.9            & 0.01           & 0.71          \\
\ModelNetNotZero+KL${_D}$[3]& 0.59            & 0.55           & 0.88          & 0.4             & 0.86           & 0.11           & 0.71          \\
\ModelNetNotZero+KL${_D}$[4]& 0.72            & 0.44           & 0.92          & 0.31            & 0.94           & 0.04           & 0.72          \\
\ModelNetNotZero+KL${_D}$[5] & 0.61            & 0.67           & 0.88          & 0.41            & 0.95           & 0.17           & 0.75          \\ \\
\ModelNetNotZero+KL[1]    & 0.46            & 0.09           & 0.92          & 0.43            & 0.92           & 0.05           & 0.76          \\
\ModelNetNotZero+KL[2]     & 0.65            & 0.31           & 0.93          & 0.33            & 0.91           & 0.02           & 0.72          \\
\ModelNetNotZero+KL[3]     & 0.67            & 0.14           & 0.97          & 0.27            & 0.94           & 0.03           & 0.73          \\
\ModelNetNotZero+KL[4]    & 0.66            & 0.18           & 0.93          & 0.36            & 0.95           & 0.15           & 0.75          \\
\ModelNetNotZero+KL[5]     & 0.61            & 0.18           & 0.98          & 0.36            & 0.83           & 0              & 0.72         \\ 
        % $r_{5}$ & \ModelNet & \textbf{0.56} & \textbf{0.48} & \textbf{0.95} & \textbf{0.5} & 0.92 & \textbf{0.17} & \textbf{0.6} \\
        \bottomrule
    \end{tabular}
    \caption{Performance of all runs of different models on $6$ LR domains. }
    \label{tab:full_LR_results}%
\end{table*}


\begin{table*}[!ht]
\centering
    % \begin{tabular}{l|cccccccccccc|c}
    \begin{tabular}{c|p{.5cm}cp{.5cm}p{.5cm}p{.5cm}p{.5cm}cp{.5cm}p{.6cm}p{.5cm}cc|c}
        \toprule
         \textbf{Model} & \textbf{Tam} & \textbf{Traffic} & \textbf{Sys} & \textbf{Skill} & \textbf{Nav} & \textbf{TT} & \textbf{Recon} & \textbf{Elev} & \textbf{Acad} & \textbf{CT} & \textbf{Wild} & \textbf{GoL} & \textbf{Mean} \\
        \midrule     
PROST           & 0.86         & 0.91          & 0.76         & 0.84           & 0            & 0.03        & 0.59           & 0.91          & 0.64          & 0.34        & 0.32         & 0.57          & 0.56          \\ \\
\symnetNotZero[1]       & 0.91         & 0.9           & 0.76         & 0.84           & 0.09         & 0.76        & 0.41           & 0.9           & 0.88          & 0.65        & 0.71         & 0.82          & 0.72          \\
\symnetNotZero[2]     & 0.92         & 0.88          & 0.72         & 0.78           & 0.69         & 0.79        & 0.3            & 0.91          & 0.85          & 0.91        & 0.51         & 0.83          & 0.76          \\
\symnetNotZero[3]     & 0.89         & 0.87          & 0.85         & 0.86           & 0.59         & 0.82        & 0.3            & 0.92          & 0.9           & 0.76        & 0.47         & 0.72          & 0.75          \\
\symnetNotZero[4]     & 0.89         & 0.9           & 0.81         & 0.83           & 0.45         & 0.76        & 0.35           & 0.95          & 0.64          & 0.89        & 0.63         & 0.8           & 0.74          \\
\symnetNotZero[5]   & 0.89         & 0.86          & 0.82         & 0.79           & 0.88         & 0.77        & 0.37           & 0.93          & 0.86          & 0.82        & 0.78         & 0.75          & 0.79          \\ \\
\ModelNetNotZero-KL[1]    & 0.92         & 0.85          & 0.87         & 0.85           & 0.78         & 0.77        & 0.21           & 0.92          & 0.89          & 0.76        & 0.88         & 0.82          & 0.79          \\
\ModelNetNotZero-KL[2]     & 0.91         & 0.89          & 0.82         & 0.84           & 0.29         & 0.59        & 0.41           & 0.88          & 0.68          & 0.84        & 0.8          & 0.75          & 0.73          \\
\ModelNetNotZero-KL[3]   & 0.93         & 0.83          & 0.75         & 0.84           & 0.2          & 0.8         & 0.61           & 0.88          & 0.62          & 0.81        & 0.64         & 0.78          & 0.72          \\
\ModelNetNotZero-KL[4]   & 0.91         & 0.84          & 0.79         & 0.77           & 0.88         & 0.53        & 0.48           & 0.89          & 0.65          & 0.77        & 0.74         & 0.8           & 0.75          \\
\ModelNetNotZero-KL[5]     & 0.89         & 0.82          & 0.83         & 0.73           & 0.5          & 0.79        & 0.41           & 0.76          & 0.79          & 0.8         & 0.73         & 0.82          & 0.74          \\ \\
\ModelNetNotZero+KL$_D$[1] & 0.9          & 0.85          & 0.81         & 0.76           & 0.87         & 0.77        & 0.23           & 0.94          & 0.92          & 0.77        & 0.66         & 0.79          & 0.77          \\
\ModelNetNotZero+KL$_D$[2] & 0.89         & 0.86          & 0.83         & 0.83           & 0.84         & 0.82        & 0.36           & 0.28          & 0.55          & 0.86        & 0.66         & 0.83          & 0.72          \\
\ModelNetNotZero+KL$_D$[3] & 0.9          & 0.83          & 0.85         & 0.75           & 0.86         & 0.76        & 0.19           & 0.81          & 0.78          & 0.81        & 0.58         & 0.69          & 0.73          \\
\ModelNetNotZero+KL$_D$[4] & 0.91         & 0.87          & 0.84         & 0.84           & 0.8          & 0.8         & 0.37           & 0.86          & 0.8           & 0.72        & 0.7          & 0.76          & 0.77          \\
\ModelNetNotZero+KL$_D$[5] & 0.9          & 0.86          & 0.8          & 0.67           & 0.87         & 0.22        & 0.31           & 0.66          & 0.86          & 0.76        & 0.47         & 0.77          & 0.68          \\ \\
\ModelNetNotZero+KL[1]    & 0.91         & 0.86          & 0.82         & 0.63           & 0.85         & 0.81        & 0.18           & 0.93          & 0.8           & 0.76        & 0.53         & 0.65          & 0.73          \\
\ModelNetNotZero+KL[2]   & 0.9          & 0.86          & 0.78         & 0.7            & 0.85         & 0.8         & 0.29           & 0.9           & 0.89          & 0.72        & 0.56         & -0.27         & 0.67          \\
\ModelNetNotZero+KL[3]  & 0.91         & 0.86          & 0.84         & 0.68           & 0.87         & 0.66        & 0.27           & 0.9           & 0.75          & 0.89        & 0.21         & 0.76          & 0.72          \\
\ModelNetNotZero+KL[4]    & 0.89         & 0.81          & 0.83         & 0.74           & 0.72         & 0.74        & 0.21           & 0.92          & 0.88          & 0.83        & 0.4          & -0.1          & 0.66          \\
\ModelNetNotZero+KL[5]    & 0.9          & 0.86          & 0.83         & 0.75           & 0.27         & 0.7         & 0.26           & 0.9           & 0.67          & 0.8         & 0.36         & -0.13         & 0.6          \\

        % $r_5$ & \ModelNet & \textbf{0.94} & \textbf{0.86} & \textbf{0.85} & 0.77 & 0.29 & 0.64 & \textbf{0.33} & 0.88 & 0.76 & \textbf{0.9} & \textbf{0.87} & 0.66 & 0.73 \\
        \bottomrule
    \end{tabular}
    \caption{Performance of all runs of different models on $12$ IPPC domains. }
    \label{tab:full_IPPC_results}%
\end{table*}

Tables~\ref{sup_tab:LR_results} and~\ref{sup_tab:IPPC_results} show the results when the best model among \ModelNet-KL, \ModelNet+KL$_D$ and \ModelNet+KL is chosen based on validation scores.

\begin{table*}[t]
\centering
    \begin{tabular}{l|cccccc|c}
        \toprule
        \textbf{Model}      & \textbf{SRecon} & \textbf{Pizza} & \textbf{DNav} & \textbf{StWall} & \textbf{EAcad} & \textbf{StNav} & \textbf{Mean} \\ \hline
PROST               & 0.34            & 0.09           & 0.94          & 0.69            & 0.37           & 0              & 0.41          \\
\symnetNotZero          & 0.47            & 0.26           & 0.55          & 0.27            & 0.9            & 0.03           & 0.41          \\
\ModelNetNotZero-KL        & \textbf{0.68}   & \textbf{0.62}  & 0.84          & 0.33            & 0.87           & 0.08           & 0.57          \\
\ModelNetNotZero+KL$_D$ & 0.62            & 0.58           & 0.91          & \textbf{0.38}   & \textbf{0.92}  & \textbf{0.15}  & 0.59          \\
\ModelNetNotZero+KL        & 0.61            & 0.18           & \textbf{0.95} & 0.35            & 0.91           & 0.05           & 0.51          \\ \hline
\ModelNetNotZero(best val) & 0.68            & 0.58           & \textbf{0.95} & 0.38            & 0.91           & 0.15           & \textbf{0.61}

 \\
 \bottomrule
    \end{tabular}
    \caption{Comparison of \ModelNet{} variants with the baselines on $6$ LR domains. The last row denotes the score of the best among \ModelNet+$KL$, \ModelNet-$KL$ and \ModelNet+$KL_D$ chosen on the basis of average validation reward.}
    \label{sup_tab:LR_results}%
\end{table*}


% \begin{table}[]
% \begin{tabular}{ccccccccccccc}
% \multicolumn{1}{c|}{\textbf{Model}}  & \textbf{Tam}         & \textbf{Traf}        & \textbf{Sys}         & \textbf{Skill}       & \textbf{Nav}         & \textbf{TT}          & \textbf{Recon}       & \textbf{Elev}        & \textbf{Acad}        & \textbf{CT}          & \textbf{GoL}         & \textbf{Wild}        \\ \hline
% \multicolumn{1}{c|}{PROST}           & 0.86                 & 0.91                 & 0.76                 & 0.84                 & 0                    & 0.03                 & 0.59                 & 0.91                 & 0.64                 & 0.34                 & 0.32                 & 0.57                 \\
% \multicolumn{1}{l}{}                 & \multicolumn{1}{l}{} & \multicolumn{1}{l}{} & \multicolumn{1}{l}{} & \multicolumn{1}{l}{} & \multicolumn{1}{l}{} & \multicolumn{1}{l}{} & \multicolumn{1}{l}{} & \multicolumn{1}{l}{} & \multicolumn{1}{l}{} & \multicolumn{1}{l}{} & \multicolumn{1}{l}{} & \multicolumn{1}{l}{} \\
% \multicolumn{1}{c|}{SymNet2 1}       & 0.91                 & 0.9                  & 0.76                 & 0.84                 & 0.09                 & 0.76                 & 0.41                 & 0.9                  & 0.88                 & 0.65                 & 0.71                 & 0.82                 \\
% \multicolumn{1}{c|}{SymNet2 2}       & 0.92                 & 0.88                 & 0.72                 & 0.78                 & 0.69                 & 0.79                 & 0.3                  & 0.91                 & 0.85                 & 0.91                 & 0.51                 & 0.83                 \\
% \multicolumn{1}{c|}{SymNet2 3}       & 0.89                 & 0.87                 & 0.85                 & 0.86                 & 0.59                 & 0.82                 & 0.3                  & 0.92                 & 0.9                  & 0.76                 & 0.47                 & 0.72                 \\
% \multicolumn{1}{c|}{SymNet2 4}       & 0.89                 & 0.9                  & 0.81                 & 0.83                 & 0.45                 & 0.76                 & 0.35                 & 0.95                 & 0.64                 & 0.89                 & 0.63                 & 0.8                  \\
% \multicolumn{1}{c|}{SymNet2 5}       & 0.89                 & 0.86                 & 0.82                 & 0.79                 & 0.88                 & 0.77                 & 0.37                 & 0.93                 & 0.86                 & 0.82                 & 0.78                 & 0.75                 \\
% \multicolumn{1}{l}{}                 & \multicolumn{1}{l}{} & \multicolumn{1}{l}{} & \multicolumn{1}{l}{} & \multicolumn{1}{l}{} & \multicolumn{1}{l}{} & \multicolumn{1}{l}{} & \multicolumn{1}{l}{} & \multicolumn{1}{l}{} & \multicolumn{1}{l}{} & \multicolumn{1}{l}{} & \multicolumn{1}{l}{} & \multicolumn{1}{l}{} \\
% \multicolumn{1}{c|}{SymNet3-KL 1}    & 0.92                 & 0.85                 & 0.87                 & 0.85                 & 0.78                 & 0.77                 & 0.21                 & 0.92                 & 0.89                 & 0.76                 & 0.88                 & 0.82                 \\
% \multicolumn{1}{c|}{SymNet3-KL 2}    & 0.91                 & 0.89                 & 0.82                 & 0.84                 & 0.29                 & 0.59                 & 0.41                 & 0.88                 & 0.68                 & 0.84                 & 0.8                  & 0.75                 \\
% \multicolumn{1}{c|}{SymNet3-KL 3}    & 0.93                 & 0.83                 & 0.75                 & 0.84                 & 0.2                  & 0.8                  & 0.61                 & 0.88                 & 0.62                 & 0.81                 & 0.64                 & 0.78                 \\
% \multicolumn{1}{c|}{SymNet3-KL 4}    & 0.91                 & 0.84                 & 0.79                 & 0.77                 & 0.88                 & 0.53                 & 0.48                 & 0.89                 & 0.65                 & 0.77                 & 0.74                 & 0.8                  \\
% \multicolumn{1}{c|}{SymNet3-KL 5}    & 0.89                 & 0.82                 & 0.83                 & 0.73                 & 0.5                  & 0.79                 & 0.41                 & 0.76                 & 0.79                 & 0.8                  & 0.73                 & 0.82                 \\
% \multicolumn{1}{l}{}                 & \multicolumn{1}{l}{} & \multicolumn{1}{l}{} & \multicolumn{1}{l}{} & \multicolumn{1}{l}{} & \multicolumn{1}{l}{} & \multicolumn{1}{l}{} & \multicolumn{1}{l}{} & \multicolumn{1}{l}{} & \multicolumn{1}{l}{} & \multicolumn{1}{l}{} & \multicolumn{1}{l}{} & \multicolumn{1}{l}{} \\
% \multicolumn{1}{c|}{SymNet3+KL\_D 1} & 0.9                  & 0.85                 & 0.81                 & 0.76                 & 0.87                 & 0.77                 & 0.23                 & 0.94                 & 0.92                 & 0.77                 & 0.66                 & 0.79                 \\
% \multicolumn{1}{c|}{SymNet3+KL\_D 2} & 0.89                 & 0.86                 & 0.83                 & 0.83                 & 0.84                 & 0.82                 & 0.36                 & 0.28                 & 0.55                 & 0.86                 & 0.66                 & 0.83                 \\
% \multicolumn{1}{c|}{SymNet3+KL\_D 3} & 0.9                  & 0.83                 & 0.85                 & 0.75                 & 0.86                 & 0.76                 & 0.19                 & 0.81                 & 0.78                 & 0.81                 & 0.58                 & 0.69                 \\
% \multicolumn{1}{c|}{SymNet3+KL\_D 4} & 0.91                 & 0.87                 & 0.84                 & 0.84                 & 0.8                  & 0.8                  & 0.37                 & 0.86                 & 0.8                  & 0.72                 & 0.7                  & 0.76                 \\
% \multicolumn{1}{c|}{SymNet3+KL\_D 5} & 0.9                  & 0.86                 & 0.8                  & 0.67                 & 0.87                 & 0.22                 & 0.31                 & 0.66                 & 0.86                 & 0.76                 & 0.47                 & 0.77                 \\
% \multicolumn{1}{l}{}                 & \multicolumn{1}{l}{} & \multicolumn{1}{l}{} & \multicolumn{1}{l}{} & \multicolumn{1}{l}{} & \multicolumn{1}{l}{} & \multicolumn{1}{l}{} & \multicolumn{1}{l}{} & \multicolumn{1}{l}{} & \multicolumn{1}{l}{} & \multicolumn{1}{l}{} & \multicolumn{1}{l}{} & \multicolumn{1}{l}{} \\
% \multicolumn{1}{c|}{SymNet3+KL 1}    & 0.91                 & 0.86                 & 0.82                 & 0.63                 & 0.85                 & 0.81                 & 0.18                 & 0.93                 & 0.8                  & 0.76                 & 0.53                 & 0.65                 \\
% \multicolumn{1}{c|}{SymNet3+KL 2}    & 0.9                  & 0.86                 & 0.78                 & 0.7                  & 0.85                 & 0.8                  & 0.29                 & 0.9                  & 0.89                 & 0.72                 & 0.56                 & -0.27                \\
% \multicolumn{1}{c|}{SymNet3+KL 3}    & 0.91                 & 0.86                 & 0.84                 & 0.68                 & 0.87                 & 0.66                 & 0.27                 & 0.9                  & 0.75                 & 0.89                 & 0.21                 & 0.76                 \\
% \multicolumn{1}{c|}{SymNet3+KL 4}    & 0.89                 & 0.81                 & 0.83                 & 0.74                 & 0.72                 & 0.74                 & 0.21                 & 0.92                 & 0.88                 & 0.83                 & 0.4                  & -0.1                 \\
% \multicolumn{1}{c|}{SymNet3+KL 5}    & 0.9                  & 0.86                 & 0.83                 & 0.75                 & 0.27                 & 0.7                  & 0.26                 & 0.9                  & 0.67                 & 0.8                  & 0.36                 & -0.13               
% \end{tabular}
% \end{table}

% Please add the following required packages to your document preamble:
% \usepackage[table,xcdraw]{xcolor}
% If you use beamer only pass "xcolor=table" option, i.e. \documentclass[xcolor=table]{beamer}
\begin{table}[t]
\centering
    \begin{tabular}{p{2.78cm}|cp{.7cm}p{.5cm}p{.5cm}p{.5cm}p{.5cm}cp{.5cm}cccc|c}
        \toprule
         \textbf{Model}               & \textbf{Tam}  & \textbf{Traffic} & \textbf{Sys}  & \textbf{Skill} & \textbf{Nav}  & \textbf{TT}   & \textbf{Recon} & \textbf{Elev} & \textbf{Acad} & \textbf{CT}   & \textbf{GoL}  & \textbf{Wild} & \textbf{Mean} \\ \hline
\textcolor{gray}{PROST}               & \textcolor{gray}{0.86}          & \textcolor{gray}{0.91}          & \textcolor{gray}{0.76}          & \textcolor{gray}{0.84}           &\textcolor{gray}{0}             & \textcolor{gray}{0.03}          & \textcolor{gray}{0.59}           & \textcolor{gray}{0.91}          & \textcolor{gray}{0.64}          & \textcolor{gray}{0.34}          & \textcolor{gray}{0.32}          & \textcolor{gray}{0.57}          & \textcolor{gray}{0.56}          \\
\symnetNotZero           & 0.9           & \textbf{0.88} & 0.79          & \textbf{0.82}  & 0.54          & \textbf{0.78} & 0.35           & \textbf{0.92} & \textbf{0.83} & \textbf{0.81} & 0.62          & 0.78          & \textbf{0.75} \\
\ModelNetNotZero-KL        & \textbf{0.91} & 0.85          & 0.81          & 0.81           & 0.53          & 0.7           & \textbf{0.42}  & 0.87          & 0.73          & 0.8           & \textbf{0.76} & \textbf{0.79} & 0.75          \\
\ModelNetNotZero+KL$_D$ & 0.9           & 0.85          & \textbf{0.83} & 0.77           & \textbf{0.85} & 0.67          & 0.29           & 0.71          & 0.78          & 0.78          & 0.61          & 0.77          & 0.73          \\
\ModelNetNotZero+KL        & 0.9           & 0.85          & 0.82          & 0.7            & 0.71          & 0.74          & 0.24           & 0.91          & 0.8           & 0.8           & 0.41          & 0.18          & 0.67          \\ \hline
\ModelNetNotZero(best val) & 0.91          & 0.85          & 0.83          & 0.81           & 0.85          & 0.67          & 0.42           & 0.91          & 0.78          & 0.8           & 0.76          & 0.79          & \textbf{0.78}\\
        \bottomrule
    \end{tabular}
    \caption{Comparison of \ModelNet{} variants with the baselines on $12$ IPPC domains. The last row denotes the score of the best among \ModelNet+$KL$, \ModelNet-$KL$ and \ModelNet+$KL_D$, chosen on the basis of average validation reward.}
    \label{sup_tab:IPPC_results}%
\end{table}
% Please add the following required packages to your document preamble:
% \usepackage[table,xcdraw]{xcolor}
% If you use beamer only pass "xcolor=table" option, i.e. \documentclass[xcolor=table]{beamer}
% \subsection{Training and Architecture Details:}
% In the spirit of domain-independent generalized planning, we use a single architecture on all domains. Both pre- and post-process GATs have $2$ layers with shared weights.
% For each domain, we generate $500$ training, $100$ validation, and $200$ test instances with size (\#state-fluents) increasing from train to validation to test instances.
% Similar to \symnet, we use state-of-the-art online planner PROST~\cite{keller&Eyerich12} and generate $30$ trajectories of each training instance. We train all models for $24$ hours on a $K40$ GPU using imitation learning.
% Each checkpoint is evaluated on validation instances and we pick the one with best average for testing.

% \bibliography{uai2023}



% \subsection{Results with extra data on IPPC domains}

%  We hypothesize that since \ModelNet\; has more parameters than \symnet, more training data is required for \ModelNet\; to surpass or match the performance of \symnet. Therefore, for these domains, we generate 100 more instances, and then train on these 200 instances, using the same validation and test instances. The results in the table \ref{tab:extra_data} below show that  \ModelNet\; gives similar or better performance on these domains when the training data is increased. 


% \begin{table*}[htp]
% \centering
% \begin{tabular}{c|cc|cc}
% \textbf{Model} & \textbf{Acad(100)} & \textbf{Acad(200)} & \textbf{Skill(100)} & \textbf{Skill(200)} \\ \hline
% PROST          & 0.65               & 0.64               & 0.9                 & 0.86                \\
% \symnet      & 0.85               & 0.61               & 0.88                & 0.89                \\
% \ModelNet-KL       & 0.81               & 0.82               & 0.77                & 0.89                \\
% \ModelNet+KL       & 0.76               & 0.87               & 0.5                 & 0.57               
% \end{tabular}
% \caption{Table shows the scores of various models trained with different amount of training data. We observe that on increasing the amount of training data, the relative performance of both \ModelNet-KL and \ModelNet+KL increases with respect to \symnet}
% \label{tab:extra_data}
% \end{table*}



\section{Sizes of instances}
In the spirit of transfer, instance sizes increase from the training set to the validation set to the test set. We measure the size of an instance by the number of state fluents it contains. We report the minimum and maximum instance sizes in the training, validation, and test sets for the LR domains in Table~\ref{tab:LR_count} and for the IPPC domains in Table~\ref{tab:IPPC_count}.

\begin{table*}[tbh]
\centering
\begin{tabular}{c|cc|cc|cc}
\textbf{Domain} & \textbf{Train(min)} & \textbf{Train(max)} & \textbf{Val(min)} & \textbf{Val(max)} & \textbf{Test(min)} & \textbf{Test(max)} \\ \hline
SRecon & 48         & 193        & 208      & 249      & 373       & 924       \\
Pizza  & 34         & 153        & 205      & 265      & 409       & 493       \\
EAcad  & 8          & 36         & 42       & 192      & 120       & 320       \\
StWall & 25         & 100        & 121      & 225      & 256       & 400       \\
DNav   & 81         & 196        & 225      & 324      & 400       & 625      
\end{tabular}

    \caption{Number of state fluents for LR domains for training, validation, and test sets.}
    \label{tab:LR_count}%
\end{table*}


\begin{table*}[tbh]
\centering
\begin{tabular}{c|c|c|c|c|c|c}
\textbf{Domain} & \textbf{Train(min)} & \textbf{Train(max)} & \textbf{Val(min)} & \textbf{Val(max)} & \textbf{Test(min)} & \textbf{Test(max)} \\ \hline
Acad            & 4                   & 50                  & 60                & 96                & 120                & 240                \\
CT              & 12                  & 84                  & 112               & 144               & 180                & 312                \\
GoL             & 4                   & 36                  & 42                & 64                & 81                 & 100                \\
Skill           & 6                   & 42                  & 12                & 42                & 30                 & 60                 \\
Recon           & 29                  & 81                  & 68                & 107               & 120                & 257                \\
TT              & 12                  & 75                  & 108               & 147               & 192                & 300                \\
Wild            & 10                  & 72                  & 72                & 128               & 128                & 288                \\
Tam             & 2                   & 48                  & 28                & 84                & 48                 & 140                \\
Elev            & 9                   & 32                  & 18                & 32                & 24                 & 60                 \\
Traffic         & 32                  & 80                  & 32                & 80                & 56                 & 104                \\
Sys             & 2                   & 15                  & 2                 & 14                & 15                 & 25                 \\
Nav             & 9                   & 49                  & 25                & 90                & 120                & 120               
\end{tabular}
\caption{Number of state fluents for IPPC domains for training, validation and test sets.}
\label{tab:IPPC_count}
\end{table*}


\section{Domains and Generators}

\begin{enumerate}
\item \textbf{Deterministic Navigation (DNav)}

Deterministic Navigation involves a Robot and a Goal cell located in a square grid. For each step that the Robot is not in the Goal cell, it receives a reward of $-1$. The optimal policy requires the Robot to reach the Goal in the minimum number of timesteps. To generate instances, first the grid size is sampled uniformly from $[D_{min}, D_{max}]$, and then the goal cell and the start cell of the robot are sampled uniformly from the grid cells. Parameters for generation:
\begin{enumerate}
    \item $D_{min}$: Minimum allowed grid dimension
    \item $D_{max}$: Maximum allowed grid dimension. 
\end{enumerate}

To generate the train, validation, and test sets, we use the parameters mentioned in Table~\ref{tab:dnav}.

\begin{table*}[htp]
\centering
\begin{tabular}{c|c|c|c}
\label{tab:size}
               & $D_{min}$ & $D_{max}$ & Horizon \\ \hline
\textbf{Train} & 9         & 14        & 40      \\
\textbf{Val}   & 15        & 18        & 60      \\
\textbf{Test}  & 20        & 25        & 60     
\end{tabular}
\caption{Parameters used to generate instances in the DNav domain.}
\label{tab:dnav}
\end{table*}

\begin{table*}[!htp]
\centering
\begin{tabular}{c|c|c|c|c|c|c}
\label{table:eacad}
               & $L_{min}$ & $L_{max}$ & $C_{min}$ & $C_{max}$ & $p$ & Horizon \\ \hline
\textbf{Train} & 2         & 8         & 2            & 8         & 1   & 40      \\
\textbf{Val}   & 7         & 12        & 3            & 8         & 1   & 100     \\
\textbf{Test}  & 12        & 20        & 5            & 8         & 1   & 200    
\end{tabular}
\caption{Parameters used to generate instances in the EAcad domain.}
\label{tab:eacad}
\end{table*}

\begin{table*}[!htp]
\centering
\begin{tabular}{c|c|c|c|c|c|c|c}
               & $D_{min}$ & $D_{max}$ & $O_{min}$ & $O_{max}$ & $p_{min}$ & $p_{max}$ & Horizon \\ \hline
\textbf{Train} & 6         & 13        & 2            & 4         & 0            & 0.5          & 40      \\
\textbf{Val}   & 13        & 14        & 2            & 4         & 0            & 0.5          & 80      \\
\textbf{Test}  & 18        & 19        & 2            & 4         & 0            & 0.5          & 80     
\end{tabular}
\caption{Parameters used to generate instances in the SRecon domain.}
\label{tab:srecon}
\end{table*}

\begin{table*}[!htp]
\centering
    \begin{tabular}{c|c|c|c|c|c|c|c}
                  & $w_{min}$ & $w_{max}$ & $h_{min}$ & $h_{max}$ & $d_{min}$ & $d_{max}$ & Horizon \\ \hline
    \textbf{Train} & 5         & 12        & 5            & 12        & 2            & 4            & 100     \\
    \textbf{Val}   & 14        & 16        & 14           & 16        & 2            & 4            & 150     \\
    \textbf{Test}  & 20        & 22        & 20           & 22        & 2            & 4            & 200    
    \end{tabular}
\caption{Parameters used to generate instances in the Pizza domain.}
\label{tab:pizza}
\end{table*}

\begin{table*}[!htp]
\centering
    \begin{tabular}{c|c|c|c}
                  & $n_{min}$ & $n_{max}$ & Horizon \\ \hline
    \textbf{Train} & 5         & 10 & 40     \\
    \textbf{Val}   & 11        & 15 & 40     \\
    \textbf{Test}  & 16        & 20             & 40    
    \end{tabular}
\caption{Parameters used to generate instances in the StWall domain.}
\label{tab:stwall}
\end{table*}

\begin{table*}[!htp]
\centering
    \begin{tabular}{c|c|c|c|c|c}
                  & $w_{min}$ & $w_{max}$ & $h_{min}$ & $h_{max}$ & Horizon \\ \hline
    \textbf{Train} & 5         & 10 & 5 & 10 & 40     \\
    \textbf{Val}   & 11        & 15 & 11 & 15& 40     \\
    \textbf{Test}  & 15        & 20 & 15 & 20             & 40    
    \end{tabular}
\caption{Parameters used to generate instances in the StNav domain.}
\label{tab:stnav}
\end{table*}


\item \textbf{Extreme Academic Advising (EAcad)}
Extreme Academic Advising consists of various courses which are arranged in a Directed Acyclic Graph (DAG). Certain courses are program requirements. For each time step, every program requirement that is not completed adds a negative reward to the total reward. In order to get a high reward, an agent must complete the program requirements in the shortest amount of time possible. If all the pre-requisites of a course have been completed, then the probability of completion of the course when attempted is 0.95. Otherwise, the probability of completion is 0.05. This incentivizes an agent to complete courses in the DAG order, specifically leaving out courses which are not ancestors of a requirement course. To generate the courses, we set $L$, the number of levels, and $C$, the number of courses per level. Additionally, each course has, on average, $p$ pre-requisites from the previous level. The number of course requirements is $R$, and they are sampled with a probability proportional to the square of their level. This is done so as to choose courses which require a lot of pre-requisites to be completed in order. Parameters of generation are:
\begin{enumerate}
    \item $L_{min}$: Minimum number of levels
    \item $L_{max}$: Maximum number of levels
    \item $C_{min}$: Minimum number of courses per level
    \item $C_{max}$: Maximum number of courses per level
    \item $p$: Average number of pre-requisites
\end{enumerate}

To generate train, val, and test sets, we use the parameters mentioned in Table~\ref{tab:eacad}.

\item \textbf{Safe Recon (SRecon)}
In Safe Recon, an agent has to traverse a rectangular grid and take pictures of objects where it detects life. Once the agent reaches an object, it must apply tools (``water'' and ``life'') in the correct order (first water, then life). Tools can fail with some probability. Once life has been detected, the agent can take pictures, which gives it a positive reward until the end of the episode. If a tool is damaged, the agent can go back to BASE to repair the tool or continue using the damaged tool. Using damaged tools is risky because a negative reward is given for each photo taken of an object which doesn't have life. This domain is identical to the one used for IPPC 2014, with the difference being that we do not use HAZARDS in our version. The parameters for instance generation are:
\begin{enumerate}
    \item $D_{min}$: Minimum grid size
    \item $D_{max}$: Maximum grid size
    \item $O_{min}$: Minimum number of objects
    \item $O_{max}$: Maximum number of objects
    \item $p_{min}$: Minimum threshold for tool damage probability
    \item $p_{max}$: Maximum threshold for tool damage probability
\end{enumerate}
To generate the train, validation, and test sets we use the parameters mentioned in Table~\ref{tab:srecon}.
% \begin{table*}[!htp]
% \centering
% \begin{tabular}{c|c|c|c|c|c|c|c}
%               & $D_{min}$ & $D_{max}$ & $O_{min}$ & $O_{max}$ & $p_{min}$ & $p_{max}$ & Horizon \\ \hline
% \textbf{Train} & 6         & 13        & 2            & 4         & 0            & 0.5          & 40      \\
% \textbf{Val}   & 13        & 14        & 2            & 4         & 0            & 0.5          & 80      \\
% \textbf{Test}  & 18        & 19        & 2            & 4         & 0            & 0.5          & 80     
% \end{tabular}
% \caption{Table shows the parameters used in to generate instances in the SRecon domain.}
% \label{tab:srecon}
% \end{table*}

\item \textbf{Pizza Delivery (Pizza)}
Pizza Delivery consists of a rectangular grid of width $w$ and height $h$. In addition, the grid contains $d$ pizza shops which are subgoals. The agent must collect the pizza from any pizza shop and deliver it to the customer in the minimum time possible. Additionally, a wind blows which can randomly push the agent to any neighbouring cell. To generate instances, a start location $s$, a customer location $c$, and a special pizza shop location $p'$ are sampled uniformly at random.
Next, to sample the remaining $d-1$ pizza shops, we remove all cells $p$ such that
\[
\operatorname{dist}(s, p) \geq \operatorname{dist}(s, p')
\]
or
\[
\operatorname{dist}(s, p) + \operatorname{dist}(c, p) \leq \operatorname{dist}(s, p') + \operatorname{dist}(c, p').
\]
This is done so that the planner doesn't go to the nearest pizza shop but learns to minimize the sum of both distances (distance to shop + distance to customer). Out of the candidate cells, we sample with probability proportional to the distance from $s$. Parameters for generation:
\begin{enumerate}
    \item $w_{min}$: Minimum grid width
    \item $w_{max}$: Maximum grid width
    \item $h_{min}$: Minimum grid height
    \item $h_{max}$: Maximum grid height
    \item $d_{min}$: Minimum number of pizza shops
    \item $d_{max}$: Maximum number of pizza shops


\end{enumerate}
To generate the train, validation, and test sets we use the parameters mentioned in Table~\ref{tab:pizza}.
% \begin{table*}
% \centering
%     \begin{tabular}{c|c|c|c|c|c|c|c}
%                   & $w_{min}$ & $w_{max}$ & $h_{min}$ & $h_{max}$ & $d_{min}$ & $d_{max}$ & Horizon \\ \hline
%     \textbf{Train} & 5         & 12        & 5            & 12        & 2            & 4            & 100     \\
%     \textbf{Val}   & 14        & 16        & 14           & 16        & 2            & 4            & 150     \\
%     \textbf{Test}  & 20        & 22        & 20           & 22        & 2            & 4            & 200    
%     \end{tabular}
% \end{table*}
% \caption{Table shows the parameters used in to generate instances in the Pizza domain.}
% \label{tab:pizza}

\item \textbf{Stochastic Wall (StWall)} Stochastic Wall consists of a square grid of dimension $n$. A row or a column is sampled uniformly at random to form a barrier. The start and goal locations are sampled such that they lie on opposite sides of the barrier. In the barrier, a cell is selected to be a safe passageway from one part of the grid to the other. Hitting the barrier can cause death with some probability sampled from $\mathcal{U}(0.8, 1.0)$. The agent must navigate from its initial position to the goal. The generation parameters are:

\begin{enumerate}
    \item $n_{min}$: Minimum size of the grid.
    \item $n_{max}$: Maximum size of the grid.
\end{enumerate}
The generation parameters for the train, val, and test splits are given in Table~\ref{tab:stwall}.

\item \textbf{Stochastic Navigation (StNav)} This domain consists of a grid of width $w$ and height $h$. The bottom and top rows are safe. The start state is sampled from the bottom row and the goal state is sampled from the top row. In the middle rows, the robot can die with some probability. However, there exists a single column which has a very low death probability, sampled from $\mathcal{U}(0.045, 0.055)$. For all other columns, the death probability in each cell is sampled from $\mathcal{U}(0.88, 0.92)$. This task has long-range dependencies because the agent has to decide which column to enter. The column could be very far away, and thus \symnet{} might not be able to decide which direction to take. The generation parameters are:
\begin{enumerate}
    
    \item $w_{min}$: Minimum width of the grid.
    \item $w_{max}$: Maximum width of the grid.
    \item $h_{min}$: Minimum height of the grid.
    \item $h_{max}$: Maximum height of the grid.
\end{enumerate}
The generation parameters for the train, val, and test splits are given in Table~\ref{tab:stnav}.
\end{enumerate}

% \bibliography{uai2023-template}

\end{document}
